@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date: 2024-06-11 13:43:14
 # @Last Modified by: privacy
-# @Last Modified time: 2024-08-06 17:16:15
+# @Last Modified time: 2024-08-07 13:44:47

 # import os
@@ -406,7 +406,6 @@ class PdfExtractAttr(object):
                     title_index += 1
                     self.outlines.at[idx, 'text'] += '\n'
                     self.outlines.at[idx, 'text'] += text
-                    print(self.outlines.iloc[idx]['text'])

             # Body text
             elif not current or self.can_merge_lines(current, element):  # lines can be merged
@@ -430,15 +429,78 @@ class PdfExtractAttr(object):
                 image_index += 1

-        with open(title_path, 'w', encoding='utf-8') as fp:
-            json.dump(texts, fp, indent=4, ensure_ascii=False)
+        if title_path:
+            with open(title_path, 'w', encoding='utf-8') as fp:
+                json.dump(texts, fp, indent=4, ensure_ascii=False)

-        self.outlines.to_json(section_path, orient='records', lines=True, force_ascii=False)
+        if section_path:
+            self.outlines.to_json(section_path, orient='records', lines=True, force_ascii=False)
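Both dumps are now optional. A minimal call sketch (assuming title_path and section_path default to None, as the guards imply; file names hypothetical):

    agent = PdfExtractAttr(file_path='./bid.pdf')
    agent.main_parse()  # parse only, write nothing
    agent.main_parse(title_path='titles.json', section_path='sections.json')  # parse and dump JSON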

-    def parse_outline(self) -> list:
-        """PDF outline parsing.
+    def extract_toc(self) -> list:
+        """PDF outline parsing, driven by page content (TOC heuristics).
         """
         results = []
+
+        for page_number, page in enumerate(extract_pages(self.file_path)):
+
+            is_outline = False
+
+            # Only scan pages 2-21; a TOC is not expected elsewhere
+            if page_number < 1:
+                continue
+
+            if page_number > 20:
+                break
+
+            lines = [element.get_text().strip() for element in page if isinstance(element, LTTextBoxHorizontal)]
+
+            for line in lines:
+                # A TOC-like line starts with a digit or a CJK character,
+                # contains dots, and ends with a page number
+                if line and '.' in line and (line[0].isdigit() or '\u4e00' <= line[0] <= '\u9fff') and line[-1].isdigit():
+                    is_outline = True
+                    # Indent level (fixed at 1; nesting is not inferred)
+                    indent_level = 1
+                    # Entry title: leading numbering plus the CJK text run
+                    title = re.findall(r'^[\d.、]*[\u4e00-\u9fff、()\s]+', line).pop()
+                    # Trailing page number
+                    page_n = int(re.findall(r'\d+$', line).pop())
+                    results.append({
+                        "level": indent_level,
+                        "title": title,
+                        "page_number": page_n
+                    })
+
+            # Stop at the first scanned page with no TOC-like lines
+            if not is_outline:
+                break
+
+        return results
+
+    def extract_content(self, content_path: str = None) -> list:
+        with pdfplumber.open(self.file_path) as pdf:
+            for page in pdf.pages:
+                self.content.append({
+                    'page_number': page.page_number - 1,
+                    'text': page.extract_text()
+                })
+
+        if content_path:
+            with open(content_path, 'w', encoding='utf-8') as fp:
+                json.dump(self.content, fp, indent=4, ensure_ascii=False)
+
+        return self.content
+
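extract_content collects one record per page, with a zero-based page index; the returned shape (illustrative) is:

    [{'page_number': 0, 'text': '...'}, {'page_number': 1, 'text': '...'}]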
+    def parse_outline(self, outline_path: str = None) -> list:
+        """PDF outline parsing, driven by document metadata; falls back to
+        content-based parsing (extract_toc) when no outline metadata exists.
+        """
+        results = []
+
         with open(self.file_path, "rb") as fp:
             try:
                 parser = PDFParser(fp)
@@ -464,8 +526,12 @@ class PdfExtractAttr(object):
             finally:
                 parser.close()

-        with open('outlines.json', 'w', encoding='utf-8') as op:
-            json.dump(results, op, indent=4, ensure_ascii=False)
+        if not results:
+            results = self.extract_toc()
+
+        if outline_path:
+            with open(outline_path, 'w', encoding='utf-8') as op:
+                json.dump(results, op, indent=4, ensure_ascii=False)

         self.outlines = pd.DataFrame(results)

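As a quick illustration of the TOC heuristic in extract_toc (the input line is hypothetical; only the standard re module is needed):

    import re

    line = '1.2、投标人资格 .................... 12'
    # Gate: starts with a digit or CJK char, contains '.', ends with a digit
    assert line[0].isdigit() and '.' in line and line[-1].isdigit()
    title = re.findall(r'^[\d.、]*[\u4e00-\u9fff、()\s]+', line).pop()  # '1.2、投标人资格 '
    page_n = int(re.findall(r'\d+$', line).pop())                       # 12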
@@ -540,7 +606,7 @@ class PdfExtractAttr(object):
         else:
             self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0, "table_name": table_name if table_name else ""})

-    def parse_table_pro(self) -> None:
+    def parse_table_pro(self, table_path: str = 'all_tables.json') -> list:
         """Table parsing.
         """
         if self.detail_df is None:
@@ -551,14 +617,24 @@ class PdfExtractAttr(object):
             # Check whether any tables exist on this page
             tables = page_layout.find_tables()

+            if not tables:
+                continue
+
+            print(f"Parsing tables on PDF page {page_number}")
+
             tables_pro = camelot.read_pdf(
                 self.file_path,
                 # flavor='stream',
                 pages=str(page_number+1),
                 # edge_tol=200,
             )
+
+            if not tables_pro:
+                continue
+
             # Exactly one table detected on this page: decide whether to merge it
-            if len(tables) == 1:
+            if len(tables) == 1 and tables_pro:
                 table = tables[0]
                 table_pro = tables_pro[0].df.to_dict(orient='split')['data']
                 x0, y0, x1, y1 = table.bbox
@@ -576,24 +652,22 @@ class PdfExtractAttr(object):
             for table_index in range(1, len(tables_pro)):
                 self.concat_table(tables_pro[table_index].df.to_dict(orient='split')['data'], page_number=page_number, new=True)

-    def output(self, table_path: str = 'all_tables.json'):
-        """Dump results.
-        """
-        with open(table_path, 'w', encoding='utf-8') as fp:
-            json.dump(self.tables, fp, indent=4, ensure_ascii=False)
+        if table_path:
+            with open(table_path, 'w', encoding='utf-8') as fp:
+                json.dump(self.tables, fp, indent=4, ensure_ascii=False)

         return self.tables

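With output() folded into parse_table_pro, one call now parses and, optionally, dumps; a minimal sketch (file names hypothetical):

    agent = PdfExtractAttr(file_path='./bid.pdf')
    tables = agent.parse_table_pro(table_path='tables.json')  # parse and write JSON
    tables = agent.parse_table_pro(table_path=None)           # parse only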
 if __name__ == '__main__':
-    # pdf_path = './投标文件-修改版9-5-1-1.pdf'
-    pdf_path = './南方电网数字研究院有限公司.pdf'
+    pdf_path = './投标文件-修改版9-5-1-1.pdf'
+    # pdf_path = './南方电网数字研究院有限公司.pdf'
     # pdf_path = './2022年度工程类-公招采购资料/2022-2025年度三峡电站9台机组检修密封加工制作重新招标/2022-2025年度三峡电站9台机组检修密封加工制作重新招标招标文件印刷版.pdf'
     # title_path = './投标文件-修改版9-5-1-1.json'
     # title_path = './投标文件-修改版9-5-1-1-title.json'
-    title_path = './南方电网数字研究院有限公司.json'
+    # title_path = './南方电网数字研究院有限公司.json'
     # section_path = './投标文件-修改版9-5-1-1-section.json'
-    section_path = './南方电网数字研究院有限公司-section.json'
+    # section_path = './南方电网数字研究院有限公司-section.json'
     # image_dir = './extracted_images'
     # os.makedirs(image_dir, exist_ok=True)

@@ -601,7 +675,7 @@ if __name__ == '__main__':
     # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_page_number=0, end_page_number=725)

     agent = PdfExtractAttr(file_path=pdf_path)
-    agent.parse_outline()
-    agent.main_parse(title_path=title_path, section_path=section_path)
-    # agent.parse_table_pro()
-    # agent.output('all_tables_pro.json')
+    # agent.parse_outline()
+    # agent.main_parse(title_path=title_path, section_path=section_path)
+    agent.parse_table_pro(table_path='all_tables_pro.json')
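Taken together, every dump path in the class is now optional. A minimal end-to-end sketch of the refactored API (output file names hypothetical):

    agent = PdfExtractAttr(file_path='./投标文件-修改版9-5-1-1.pdf')
    outline = agent.parse_outline(outline_path='outlines.json')   # metadata first, TOC fallback
    content = agent.extract_content(content_path='content.json')  # per-page plain text
    tables = agent.parse_table_pro(table_path='all_tables_pro.json')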
|