@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date: 2024-06-11 13:43:14
 # @Last Modified by: privacy
-# @Last Modified time: 2024-07-25 16:36:24
+# @Last Modified time: 2024-08-02 11:19:17

 # import os

@@ -100,6 +100,7 @@ from pdfminer.pdfparser import PDFParser, PDFSyntaxError
 from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
 from pdfminer.image import BMPWriter
 import pdfplumber
+import camelot

 # Custom module imports
 from tools import RefPageNumberResolver
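For context, the new `camelot` dependency detects tables independently of pdfplumber and exposes each one as a pandas DataFrame. A minimal sketch of the API this patch relies on (the sample path is hypothetical; camelot needs a text-based, non-scanned PDF):

```python
import camelot

# Hypothetical input file; camelot takes 1-indexed page numbers as strings.
tables = camelot.read_pdf('sample.pdf', pages='1')  # lattice flavor by default

print(len(tables))   # number of tables camelot found on page 1
df = tables[0].df    # each detected table is exposed as a pandas DataFrame
print(df.shape)      # (rows, columns) of the extracted cell grid
```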
@@ -451,18 +452,23 @@ class PdfExtractAttr(object):
                 })
         self.detail_df = pd.DataFrame(self.details)

-    def concat_table(self, table: list, page_number: int, table_name: str = None) -> None:
+    def concat_table(self, table: list, page_number: int, table_name: str = None, new: bool = False) -> None:
         """Try to add a table to the results. Two cases: either append it as a new table, or concatenate it onto the last table.
         @table
         """
         first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
+
+        if new:
+            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
+            return
+
         tail = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[-1]]
         if len(table) > 1:
             second = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[1]]
         else:
             second = None
         # pprint(first)
-        if len(HEADERS & set(first)) > 2:
+        if not self.tables or len(HEADERS & set(first)) > 2:
             # pprint("Found many header cells; treating this row as a standalone header and starting a new table!")
             self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
         elif second and (len(HEADERS & set(second)) > 2):
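For reference, the new `new=True` flag bypasses the header heuristics entirely: the rows are stored as an independent entry with `confidence` 1 rather than being considered for concatenation onto `self.tables[-1]`. A hedged sketch of the two call patterns (the constructor argument and cell values are made up, and the merge outcome depends on the `HEADERS` set and column-count checks):

```python
# Sketch only: rows are lists of cell strings, as pdfplumber/camelot produce them.
agent = PdfExtractAttr(file_path='sample.pdf')  # hypothetical input

page_1_rows = [['序号', '名称', '数量'], ['1', '密封圈', '9']]
page_2_rows = [['2', '螺栓', '36']]  # no header row: looks like a continuation

agent.concat_table(page_1_rows, page_number=1)  # self.tables is empty -> new entry
agent.concat_table(page_2_rows, page_number=2)  # no header -> merged into the last
                                                # entry or stored with confidence 0,
                                                # depending on the heuristics
agent.concat_table([['备注', '无']], page_number=2, new=True)  # forced standalone entry
```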
@@ -478,7 +484,7 @@ class PdfExtractAttr(object):
             self.tables[-1]['table'].extend(table)
         else:
             self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0, "table_name": table_name if table_name else ""})
-
+
     def parse_table(self) -> None:
         """Parse tables.
         """
@@ -500,7 +506,42 @@ class PdfExtractAttr(object):
                         #self.concat_table(table.extract(), table_title_name)
             # Multiple tables detected: run the merge check on the first one only; the rest cannot be continuations
             elif len(tables) > 1:
-                pass
+                first_table = tables[0]
+                self.concat_table(first_table.extract(), page_number=page_number)
+                for table_index in range(1, len(tables)):
+                    self.concat_table(tables[table_index].extract(), page_number=page_number, new=True)
+
+    def parse_table_pro(self) -> None:
+        """Parse tables, cross-checking pdfplumber's detection with camelot."""
+        with pdfplumber.open(self.file_path) as pdf:
+            for page_number, page_layout in enumerate(pdf.pages):
+                # Check whether the page contains any tables
+                tables = page_layout.find_tables()
+
+                tables_pro = camelot.read_pdf(
+                    self.file_path,
+                    # flavor='stream',
+                    pages=str(page_number + 1),
+                    # edge_tol=200,
+                    # row_tol=50,
+                )
+                # Exactly one table on this page: run the merge check on it
+                if len(tables) == 1:
+                    table = tables[0]
+                    table_pro = tables_pro[0].df.to_dict(orient='split')['data']
+                    x0, y0, x1, y1 = table.bbox
+                    table_title_df = self.detail_df.query(f''' page_number == {page_number} and is_table_name == True and alignment == "center" ''')
+                    if table_title_df.empty:
+                        self.concat_table(table_pro, page_number=page_number)
+                    else:
+                        table_title_name = table_title_df.iloc[0]['text']
+                        self.concat_table(table_pro, page_number=page_number, table_name=table_title_name)
+                # Multiple tables detected: run the merge check on the first one only; the rest cannot be continuations
+                elif len(tables_pro) > 1:
+                    first_table = tables_pro[0]
+                    self.concat_table(first_table.df.to_dict(orient='split')['data'], page_number=page_number)
+                    for table_index in range(1, len(tables_pro)):
+                        self.concat_table(tables_pro[table_index].df.to_dict(orient='split')['data'], page_number=page_number, new=True)

     def output(self, table_path: str = 'all_tables.json'):
         """Output the results
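The `df.to_dict(orient='split')['data']` idiom converts a camelot DataFrame into the same list-of-row-lists shape that pdfplumber's `Table.extract()` returns, so `concat_table` stays source-agnostic. A quick standalone illustration:

```python
import pandas as pd

df = pd.DataFrame([['名称', '数量'], ['密封圈', '9']])

# orient='split' yields {'index': [...], 'columns': [...], 'data': [...]};
# 'data' is the cell grid as a plain list of row lists.
rows = df.to_dict(orient='split')['data']
print(rows)  # [['名称', '数量'], ['密封圈', '9']]
```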
@@ -512,19 +553,21 @@


 if __name__ == '__main__':
-    # pdf_path = './投标文件-修改版9-5-1-1.pdf'
-    pdf_path = './南方电网数字研究院有限公司.pdf'
-    # title_path = './投标文件-修改版9-5-1-1.json'
-    title_path = './南方电网数字研究院有限公司.json'
-    image_dir = './extracted_images'
-    os.makedirs(image_dir, exist_ok=True)
-    main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
-    # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_title='六、已标价工程量清单', end_title = '七、施工组织设计')
-    # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_page_number=0, end_page_number=725)
-
-    agent = PdfExtractAttr(file_path=pdf_path)
-    print(agent.extract_content())
-    agent.parse_outline()
+    pdf_path = 'data/预审查数据/2022-2025年度三峡电站9台机组检修密封加工制作重新招标招标文件印刷版.pdf'
+    image_dir = 'data/预审查数据/extracted_images'
+    title_path = 'data/预审查数据/2022-2025年度三峡电站9台机组检修密封加工制作重新招标招标文件印刷版.json'
+
+    # os.makedirs(image_dir, exist_ok=True)
+    # main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
+
+    # table_path = 'data/预审查数据/all_tables_2022-2025年度三峡电站9台机组检修密封加工制作重新招标招标文件印刷版.json'
+    # content_path = 'data/预审查数据/contexts_2022-2025年度三峡电站9台机组检修密封加工制作重新招标招标文件印刷版.json'
+    agent = PdfExtractAttr_(file_path=pdf_path)
+
+    # agent.extract_content()
+    # contents = agent.output_()
+
     agent.parse_text()
-    agent.parse_table()
-    agent.output()
+    # agent.parse_table()
+    agent.parse_table_pro()
+    all_tables = agent.output()
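Each entry appended by `concat_table` carries `page_numbers`, `title_len`, `col_len`, `table`, `confidence`, and `table_name`, so the file written by `output()` can be post-processed along these lines (a sketch, assuming `output()` serializes `self.tables` as a JSON array to its `table_path` argument):

```python
import json

# Assumption: output() wrote self.tables to this path as a JSON array.
with open('all_tables.json', encoding='utf-8') as f:
    tables = json.load(f)

for entry in tables:
    print(entry.get('table_name') or '<unnamed>',
          'pages:', entry['page_numbers'],
          'rows:', len(entry['table']),
          'confidence:', entry['confidence'])
```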