|
@@ -212,7 +212,6 @@ def export_image(image: LTImage, path: str) -> str:
|
|
|
else:
|
|
|
return None
|
|
|
|
|
|
-
|
|
|
def _save_jpeg(image: LTImage, path: str) -> str:
|
|
|
"""Save a JPEG encoded image"""
|
|
|
raw_data = image.stream.get_rawdata()
|
|
@@ -355,6 +354,7 @@ class PdfExtractAttr(object):
|
|
|
self.file_path = file_path
|
|
|
self.details = []
|
|
|
self.tables = []
|
|
|
+ self.content = []
|
|
|
|
|
|
def parse_outline(self):
|
|
|
"""PDF大纲解析
|
|
@@ -388,6 +388,15 @@ class PdfExtractAttr(object):
|
|
|
|
|
|
print(results)
|
|
|
|
|
|
def extract_content(self) -> list:
    """Extract the text of every page of the PDF at ``self.file_path``.

    The result is stored on ``self.content`` and also returned. Each
    call rebuilds the list from scratch, so calling the method more
    than once does not duplicate page entries.

    Returns:
        A list with one dict per page, in document order, each holding
        ``'page_number'`` (as reported by pdfplumber) and ``'text'``
        (the extracted page text, ``''`` when nothing was extracted).
    """
    with pdfplumber.open(self.file_path) as pdf:
        pages = [
            {
                'page_number': page.page_number,
                # extract_text() may yield None for a page without any
                # text (observed on older pdfplumber releases); normalise
                # so consumers can always treat 'text' as a string.
                'text': page.extract_text() or '',
            }
            for page in pdf.pages
        ]
    # Rebind only after the whole document was read successfully, so a
    # failure midway never leaves a half-filled result behind.
    self.content = pages
    return self.content
|
|
|
+
|
|
|
def parse_text(self) -> None:
|
|
|
"""文本解析
|
|
|
"""
|
|
@@ -495,6 +504,7 @@ if __name__ == '__main__':
|
|
|
# tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_title='六、已标价工程量清单', end_title = '七、施工组织设计')
|
|
|
# tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_page_number=0, end_page_number=725)
|
|
|
agent = PdfExtractAttr(file_path=pdf_path)
|
|
|
+ print(agent.extract_content())
|
|
|
agent.parse_outline()
|
|
|
agent.parse_text()
|
|
|
agent.parse_table()
|