Jelajahi Sumber

add content

sprivacy 1 tahun lalu
induk
commit
3426a0f8de
1 file diubah dengan 11 tambahan dan 1 penghapusan
  1. 11 1
      get_info.py

+ 11 - 1
get_info.py

@@ -212,7 +212,6 @@ def export_image(image: LTImage, path: str) -> str:
     else:
         return None
 
-
 def _save_jpeg(image: LTImage, path: str) -> str:
     """Save a JPEG encoded image"""
     raw_data = image.stream.get_rawdata()
@@ -355,6 +354,7 @@ class PdfExtractAttr(object):
         self.file_path = file_path
         self.details = []
         self.tables = []
+        self.content = []
 
     def parse_outline(self):
         """PDF大纲解析
@@ -388,6 +388,15 @@ class PdfExtractAttr(object):
 
         print(results)
 
+    def extract_content(self) -> list:
+        with pdfplumber.open(self.file_path) as pdf:
+            for page in pdf.pages:
+                self.content.append({
+                    'page_number': page.page_number,
+                    'text': page.extract_text()
+                })
+        return self.content
+
     def parse_text(self) -> None:
         """文本解析
         """
@@ -495,6 +504,7 @@ if __name__ == '__main__':
     # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_title='六、已标价工程量清单', end_title = '七、施工组织设计')
     # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_page_number=0, end_page_number=725)
     agent = PdfExtractAttr(file_path=pdf_path)
+    print(agent.extract_content())
     agent.parse_outline()
     agent.parse_text()
     agent.parse_table()