|
@@ -95,6 +95,9 @@ from pdfminer.pdftypes import (
|
|
|
LITERALS_FLATE_DECODE,
|
|
|
)
|
|
|
|
|
|
+import pandas as pd
|
|
|
+import pdfplumber
|
|
|
+
|
|
|
|
|
|
def is_title(line: str) -> bool:
|
|
|
title_word = re.findall('^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|[一二三四五六七八九十]+[、要是]', line.strip())
|
|
@@ -205,16 +208,44 @@ def main_parse(pdf_path: str, title_path: str, image_dir: str) -> None:
|
|
|
image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
|
|
|
image_file = export_image(e_obj, image_file)
|
|
|
images.append(image_file)
|
|
|
- print(f'Image saved: {image_file}')
|
|
|
+ pprint(f'Image saved: {image_file}')
|
|
|
image_index += 1
|
|
|
|
|
|
- with open(title_path, 'a', encoding='utf-8') as fp:
|
|
|
+ with open(title_path, 'w', encoding='utf-8') as fp:
|
|
|
json.dump(texts, fp, indent=4, ensure_ascii=False)
|
|
|
|
|
|
|
|
|
# Header cell texts that mark the first row of a stand-alone table. Built once
# at import time instead of on every page.
_TABLE_HEADER_KEYWORDS = frozenset({
    '序号', '项目编码', '项目名称', '项目特征', '单位', '工程量',
    '全费用综合单价', '合价', '备注', '主材名称', '规格型号',
    '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项',
    '数量', '含税单价(元)', '含税合价(元)',
})


def table_parse(pdf_path: str, title_path: str, start_title: str = '六、已标价工程量清单', end_title: str = '七、施工组织设计', table_path: str = 'table.json') -> list:
    """Extract the tables located between two titles of a PDF.

    The title JSON at ``title_path`` is loaded with pandas; the rows whose
    ``text`` equals ``start_title`` / ``end_title`` supply the page range via
    their ``pageno`` column (the largest ``pageno`` wins when a title occurs
    more than once). Each page in ``range(start, end)`` is opened with
    pdfplumber and its table, if any, is either stored as a new table or
    appended to the table from the previous page.

    Args:
        pdf_path: Path of the PDF to read.
        title_path: Path of the JSON file holding ``text`` / ``pageno`` records.
        start_title: Title whose page number starts the range (inclusive).
        end_title: Title whose page number ends the range (exclusive).
        table_path: Path of the JSON file the result is written to.

    Returns:
        A list of dicts with the keys ``pagenos``, ``title_len``, ``col_len``,
        ``table`` and ``confidence`` (1 when the first row matched more than
        two known header keywords, otherwise 0).

    Raises:
        ValueError: If ``start_title`` or ``end_title`` is not present in the
            title file.
    """
    df = pd.read_json(title_path)
    start_page_number = df[df['text'] == start_title].pageno.max()
    end_page_number = df[df['text'] == end_title].pageno.max()

    # ``max()`` of an empty selection is NaN; fail with a clear message rather
    # than letting ``range`` raise an opaque TypeError later.
    for title, page_number in ((start_title, start_page_number), (end_title, end_page_number)):
        if pd.isna(page_number):
            raise ValueError(f'title {title!r} not found in {title_path!r}')

    tables = []
    # The context manager closes the PDF even if extraction fails part-way.
    with pdfplumber.open(pdf_path) as pdf:
        # NOTE(review): ``pageno`` is used directly as a zero-based index into
        # ``pdf.pages`` -- confirm the title file uses the same numbering.
        for i in range(int(start_page_number), int(end_page_number)):
            table = pdf.pages[i].extract_table()
            if not table:
                continue

            # Strip all whitespace inside each header cell; keep None/'' as is.
            first = [''.join(cell.split()) if cell else cell for cell in table[0]]

            if len(_TABLE_HEADER_KEYWORDS & set(first)) > 2:
                # Enough known header cells: treat as the start of a new table.
                tables.append({"pagenos": [i], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1})
            elif tables and ((i - 1) in tables[-1]['pagenos']) and (len(first) == tables[-1]['col_len']):
                # No header, same width, directly follows the previous table's
                # page: a continuation, so merge it. ``tables`` is checked first
                # because a header-less first table would otherwise index an
                # empty list.
                tables[-1]['pagenos'].append(i)
                tables[-1]['table'].extend(table)
            else:
                # Unrecognised header and not a continuation: keep it as its
                # own table but flag it as low confidence.
                tables.append({"pagenos": [i], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0})

    with open(table_path, 'w', encoding='utf-8') as fp:
        json.dump(tables, fp, indent=4, ensure_ascii=False)

    return tables
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Script entry point: parse titles/images first, then extract the tables
    # that sit between the two section titles.
    image_dir = './extracted_images'
    title_path = './投标文件-修改版9-5-1-1.json'
    pdf_path = './投标文件-修改版9-5-1-1.pdf'

    # The image directory must exist before main_parse exports into it.
    os.makedirs(image_dir, exist_ok=True)

    main_parse(
        pdf_path=pdf_path,
        title_path=title_path,
        image_dir=image_dir,
    )
    # table_parse reads the title JSON at title_path, which is handed to
    # main_parse above.
    tables = table_parse(
        pdf_path=pdf_path,
        title_path=title_path,
        start_title='六、已标价工程量清单',
        end_title='七、施工组织设计',
    )
|