import time
import json
from typing import List, Optional

from get_info import PdfExtractAttr

# Example keyword list (contract amount, contract price, employer name);
# used by the demo at the bottom of this file.
ins = ['合同金额', '合同价格', '发包人名称']


def batch_bool(instances: List[str], text: str) -> bool:
    """Return True if any keyword in ``instances`` is a substring of ``text``.

    Args:
        instances: Keywords to look for.
        text: Text to search in.

    Returns:
        True when at least one keyword occurs in ``text``; False otherwise
        (including when ``instances`` is empty).
    """
    return any(keyword in text for keyword in instances)


def _clean_cell(cell) -> str:
    """Normalise one table cell to text with newlines and spaces removed.

    ``None`` cells (e.g. JSON ``null``) become the empty string so that a
    sparse table does not abort the whole extraction.
    """
    if cell is None:
        return ''
    return str(cell).replace('\n', '').replace(' ', '')


def extract_project(instances: List[str],
                    table_dict: Optional[List[dict]] = None,
                    table_path: Optional[str] = None,
                    pdf_path: Optional[str] = None) -> list:
    """Extract project-performance tables from a collection of parsed tables.

    The tables come from the first truthy source among ``table_dict``,
    ``table_path`` and ``pdf_path`` (checked in that order). A table is kept
    when the first cell of any of its rows contains one of ``instances``.

    Args:
        instances: Keywords to search for in the first column of each table.
        table_dict: Already-loaded tables: an iterable of dicts, each with a
            ``'table'`` entry (rows of cells) and a ``'page_numbers'`` entry.
        table_path: Path to a UTF-8 JSON file holding the same structure.
        pdf_path: Path to the source PDF, parsed via
            ``PdfExtractAttr.parse_table_pro()``.

    Returns:
        A list of ``{"page_numbers": ..., "table": ...}`` dicts, one per
        matching table, where ``table`` holds the cleaned cell text.

    Raises:
        ValueError: If none of the three sources is provided (or all are
            empty/falsy).
    """
    if table_dict:
        tables = table_dict
    elif table_path:
        with open(table_path, 'r', encoding='utf-8') as jsonfile:
            tables = json.load(jsonfile)
    elif pdf_path:
        agent = PdfExtractAttr(file_path=pdf_path)
        tables = agent.parse_table_pro()
    else:
        raise ValueError("请输入需要解析的文件!")

    res = []
    for table in tables:
        tab = [[_clean_cell(cell) for cell in row] for row in table['table']]
        pages = table['page_numbers']
        # Only the first cell of each row is inspected; rows with no cells
        # are skipped instead of raising IndexError.
        if any(batch_bool(instances, row[0]) for row in tab if row):
            res.append({
                "page_numbers": pages,
                "table": tab
            })
    return res


if __name__ == '__main__':
    from pprint import pprint

    # Source PDF the table JSON below belongs to (not read by this demo).
    file = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司.pdf'
    table_path = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司-table.json'

    with open(table_path, 'r', encoding='utf-8') as jsonfile:
        tables = json.load(jsonfile)

    pprint(
        extract_project(
            instances=ins,
            table_dict=tables
        )
    )