Sfoglia il codice sorgente

fix no line table extract

sprivacy 1 anno fa
parent
commit
2079c7d22e
1 ha cambiato i file con 42 aggiunte e 9 eliminazioni
  1. 42 9
      get_info.py

+ 42 - 9
get_info.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-08-01 13:59:10
+# @Last Modified time: 2024-08-02 11:19:17
 
 # import os
 
@@ -100,6 +100,7 @@ from pdfminer.pdfparser import PDFParser, PDFSyntaxError
 from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
 from pdfminer.image import BMPWriter
 import pdfplumber
+import camelot
 
 # 自定义包导入
 from tools import RefPageNumberResolver
@@ -510,6 +511,37 @@ class PdfExtractAttr(object):
                     for table_index in range(1, len(tables)):
                         self.concat_table(tables[table_index].extract(), page_number=page_number, new=True)
 
+    def parse_table_pro(self) -> None:  # Per-page table extraction: pdfplumber detects tables, camelot (stream flavor) supplies the cell data passed to self.concat_table
+        with pdfplumber.open(self.file_path) as pdf:
+            for page_number, page_layout in enumerate(pdf.pages):
+                # Check whether the page contains any tables (pdfplumber detection)
+                tables = page_layout.find_tables()
+                # Extract the same page again with camelot; page_number is 0-based here, camelot's `pages` is given page_number+1
+                tables_pro = camelot.read_pdf(
+                    self.file_path,
+                    flavor='stream',
+                    pages=str(page_number+1),
+                    edge_tol=200,
+                )
+                # Exactly one table detected on this page (by pdfplumber): run the merge check on it
+                if len(tables) == 1:
+                    table = tables[0]
+                    table_pro = tables_pro[0].df.to_dict(orient='split')['data']  # NOTE(review): only len(tables) is checked above; presumably raises IndexError if camelot found no table - confirm
+                    x0, y0, x1, y1 = table.bbox  # NOTE(review): x0, y0, x1, y1 are not used below in this method
+                    table_title_df = self.detail_df.query(f''' page_number == {page_number} and is_table_name == True and alignment == "center" ''')  # centered table-name rows of self.detail_df for this page
+                    if table_title_df.empty:
+                        self.concat_table(table_pro, page_number=page_number)
+                    else:
+                        table_title_name = table_title_df.iloc[0]['text']  # use the first matching row's text as the table name
+                        self.concat_table(table_pro, page_number=page_number, table_name=table_title_name)
+                    table = tables[0]  # NOTE(review): redundant re-assignment; `table` is not read afterwards
+                # Multiple tables detected: run the merge check on the first one; the tables after it are treated as unrelated
+                elif len(tables_pro) > 1:  # NOTE(review): this branch tests camelot's count while the `if` tests pdfplumber's; pages where neither condition holds are skipped
+                    first_table = tables_pro[0]
+                    self.concat_table(first_table.df.to_dict(orient='split')['data'], page_number=page_number)
+                    for table_index in range(1, len(tables_pro)):
+                        self.concat_table(tables_pro[table_index].df.to_dict(orient='split')['data'], page_number=page_number, new=True)  # new=True presumably starts a separate table - verify in concat_table
+
     def output(self, table_path: str = 'all_tables.json'):
         """结果输出
         """
@@ -522,17 +554,18 @@ class PdfExtractAttr(object):
 if __name__ == '__main__':
     # pdf_path = './投标文件-修改版9-5-1-1.pdf'
     pdf_path = './南方电网数字研究院有限公司.pdf'
+    pdf_path = './2022年度工程类-公招采购资料/2022-2025年度三峡电站9台机组检修密封加工制作重新招标/2022-2025年度三峡电站9台机组检修密封加工制作重新招标招标文件印刷版.pdf'
     # title_path = './投标文件-修改版9-5-1-1.json'
-    title_path = './南方电网数字研究院有限公司.json'
-    image_dir = './extracted_images'
-    os.makedirs(image_dir, exist_ok=True)
-    main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
+    # title_path = './南方电网数字研究院有限公司.json'
+    # image_dir = './extracted_images'
+    # os.makedirs(image_dir, exist_ok=True)
+    # main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
     # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_title='六、已标价工程量清单', end_title = '七、施工组织设计')
     # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_page_number=0, end_page_number=725)
 
     agent = PdfExtractAttr(file_path=pdf_path)
-    print(agent.extract_content())
-    agent.parse_outline()
+    # print(agent.extract_content())
+    # agent.parse_outline()
     agent.parse_text()
-    agent.parse_table()
-    agent.output()
+    agent.parse_table_pro()
+    agent.output('all_tables_pro.json')