@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date: 2024-06-11 13:43:14
 # @Last Modified by: privacy
-# @Last Modified time: 2024-07-25 16:36:24
+# @Last Modified time: 2024-08-02 11:19:17

 # import os

@@ -100,6 +100,7 @@ from pdfminer.pdfparser import PDFParser, PDFSyntaxError
 from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
 from pdfminer.image import BMPWriter
 import pdfplumber
+import camelot

 # Custom module imports
 from tools import RefPageNumberResolver
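For context, the new `camelot` dependency detects tables independently of pdfplumber and exposes each one as a pandas DataFrame. A minimal sketch of the API this patch relies on (the sample path is hypothetical; camelot needs a text-based, non-scanned PDF):

```python
import camelot

# Hypothetical input file; camelot takes 1-indexed page numbers as strings.
tables = camelot.read_pdf('sample.pdf', pages='1')  # lattice flavor by default

print(len(tables))   # number of tables camelot found on page 1
df = tables[0].df    # each detected table is exposed as a pandas DataFrame
print(df.shape)      # (rows, columns) of the extracted cell grid
```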
@@ -451,18 +452,23 @@ class PdfExtractAttr(object):
                 })
         self.detail_df = pd.DataFrame(self.details)

-    def concat_table(self, table: list, page_number: int, table_name: str = None) -> None:
+    def concat_table(self, table: list, page_number: int, table_name: str = None, new: bool = False) -> None:
         """Try to add a table to the results. Two cases: either append it as a new table, or concatenate it onto the last table.
         @table
         """
         first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
+
+        if new:
+            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
+            return
+
         tail = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[-1]]
         if len(table) > 1:
             second = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[1]]
         else:
             second = None
         # pprint(first)
-        if len(HEADERS & set(first)) > 2:
+        if not self.tables or len(HEADERS & set(first)) > 2:
             # pprint("Found many header cells; treating this row as a standalone header and starting a new table!")
             self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
         elif second and (len(HEADERS & set(second)) > 2):
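For reference, the new `new=True` flag bypasses the header heuristics entirely: the rows are stored as an independent entry with `confidence` 1 rather than being considered for concatenation onto `self.tables[-1]`. A hedged sketch of the two call patterns (the constructor argument and cell values are made up, and the merge outcome depends on the `HEADERS` set and column-count checks):

```python
# Sketch only: rows are lists of cell strings, as pdfplumber/camelot produce them.
agent = PdfExtractAttr(file_path='sample.pdf')  # hypothetical input

page_1_rows = [['序号', '名称', '数量'], ['1', '密封圈', '9']]
page_2_rows = [['2', '螺栓', '36']]  # no header row: looks like a continuation

agent.concat_table(page_1_rows, page_number=1)  # self.tables is empty -> new entry
agent.concat_table(page_2_rows, page_number=2)  # no header -> merged into the last
                                                # entry or stored with confidence 0,
                                                # depending on the heuristics
agent.concat_table([['备注', '无']], page_number=2, new=True)  # forced standalone entry
```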
@@ -478,7 +484,7 @@ class PdfExtractAttr(object):
             self.tables[-1]['table'].extend(table)
         else:
             self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0, "table_name": table_name if table_name else ""})
-
+
     def parse_table(self) -> None:
         """Parse tables.
         """
@@ -500,7 +506,42 @@ class PdfExtractAttr(object):
                         #self.concat_table(table.extract(), table_title_name)
             # Multiple tables detected: run the merge check on the first one only; the rest cannot be continuations
             elif len(tables) > 1:
-                pass
+                first_table = tables[0]
+                self.concat_table(first_table.extract(), page_number=page_number)
+                for table_index in range(1, len(tables)):
+                    self.concat_table(tables[table_index].extract(), page_number=page_number, new=True)
+
+    def parse_table_pro(self) -> None:
+        """Parse tables, cross-checking pdfplumber's detection with camelot."""
+        with pdfplumber.open(self.file_path) as pdf:
+            for page_number, page_layout in enumerate(pdf.pages):
+                # Check whether the page contains any tables
+                tables = page_layout.find_tables()
+
+                tables_pro = camelot.read_pdf(
+                    self.file_path,
+                    # flavor='stream',
+                    pages=str(page_number + 1),
+                    # edge_tol=200,
+                    # row_tol=50,
+                )
+                # Exactly one table on this page: run the merge check on it
+                if len(tables) == 1:
+                    table = tables[0]
+                    table_pro = tables_pro[0].df.to_dict(orient='split')['data']
+                    x0, y0, x1, y1 = table.bbox
+                    table_title_df = self.detail_df.query(f''' page_number == {page_number} and is_table_name == True and alignment == "center" ''')
+                    if table_title_df.empty:
+                        self.concat_table(table_pro, page_number=page_number)
+                    else:
+                        table_title_name = table_title_df.iloc[0]['text']
+                        self.concat_table(table_pro, page_number=page_number, table_name=table_title_name)
+                # Multiple tables detected: run the merge check on the first one only; the rest cannot be continuations
+                elif len(tables_pro) > 1:
+                    first_table = tables_pro[0]
+                    self.concat_table(first_table.df.to_dict(orient='split')['data'], page_number=page_number)
+                    for table_index in range(1, len(tables_pro)):
+                        self.concat_table(tables_pro[table_index].df.to_dict(orient='split')['data'], page_number=page_number, new=True)

     def output(self, table_path: str = 'all_tables.json'):
         """Output the results
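The `df.to_dict(orient='split')['data']` idiom converts a camelot DataFrame into the same list-of-row-lists shape that pdfplumber's `Table.extract()` returns, so `concat_table` stays source-agnostic. A quick standalone illustration:

```python
import pandas as pd

df = pd.DataFrame([['名称', '数量'], ['密封圈', '9']])

# orient='split' yields {'index': [...], 'columns': [...], 'data': [...]};
# 'data' is the cell grid as a plain list of row lists.
rows = df.to_dict(orient='split')['data']
print(rows)  # [['名称', '数量'], ['密封圈', '9']]
```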
@@ -512,19 +553,21 @@


 if __name__ == '__main__':
-    # pdf_path = './投标文件-修改版9-5-1-1.pdf'
-    pdf_path = './南方电网数字研究院有限公司.pdf'
-    # title_path = './投标文件-修改版9-5-1-1.json'
-    title_path = './南方电网数字研究院有限公司.json'
-    image_dir = './extracted_images'
-    os.makedirs(image_dir, exist_ok=True)
-    main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
-    # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_title='六、已标价工程量清单', end_title = '七、施工组织设计')
-    # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_page_number=0, end_page_number=725)
-
-    agent = PdfExtractAttr(file_path=pdf_path)
-    print(agent.extract_content())
-    agent.parse_outline()
+    pdf_path = 'data/预审查数据/2022-2025年度三峡电站9台机组检修密封加工制作重新招标招标文件印刷版.pdf'
+    image_dir = 'data/预审查数据/extracted_images'
+    title_path = 'data/预审查数据/2022-2025年度三峡电站9台机组检修密封加工制作重新招标招标文件印刷版.json'
+
+    # os.makedirs(image_dir, exist_ok=True)
+    # main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
+
+    # table_path = 'data/预审查数据/all_tables_2022-2025年度三峡电站9台机组检修密封加工制作重新招标招标文件印刷版.json'
+    # content_path = 'data/预审查数据/contexts_2022-2025年度三峡电站9台机组检修密封加工制作重新招标招标文件印刷版.json'
+    agent = PdfExtractAttr_(file_path=pdf_path)
+
+    # agent.extract_content()
+    # contents = agent.output_()
+
     agent.parse_text()
-    agent.parse_table()
-    agent.output()
+    # agent.parse_table()
+    agent.parse_table_pro()
+    all_tables = agent.output()
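Each entry appended by `concat_table` carries `page_numbers`, `title_len`, `col_len`, `table`, `confidence`, and `table_name`, so the file written by `output()` can be post-processed along these lines (a sketch, assuming `output()` serializes `self.tables` as a JSON array to its `table_path` argument):

```python
import json

# Assumption: output() wrote self.tables to this path as a JSON array.
with open('all_tables.json', encoding='utf-8') as f:
    tables = json.load(f)

for entry in tables:
    print(entry.get('table_name') or '<unnamed>',
          'pages:', entry['page_numbers'],
          'rows:', len(entry['table']),
          'confidence:', entry['confidence'])
```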