"""Find tables in a PDF whose first column mentions given keywords."""

from typing import List

from get_info import PdfExtractAttr
# from scan_dir import scan_dir
import time

# Default keywords searched for in the first column of every table.
ins = ['合同金额', '合同价格', '发包人名称']


def batch_bool(instances: List[str], text: str) -> bool:
    """Return True if any string in *instances* is a substring of *text*.

    Args:
        instances: Candidate substrings to look for.
        text: The string that is searched.

    Returns:
        True as soon as one candidate is found in *text*; False when none
        is found or when *instances* is empty.
    """
    return any(keyword in text for keyword in instances)


def _clean_cell(cell) -> str:
    """Strip newlines and spaces from a table cell; map None to ''."""
    # NOTE(review): assumes cells are str or None -- a None cell used to
    # raise AttributeError on `.replace`; confirm against PdfExtractAttr.
    if cell is None:
        return ''
    return cell.replace('\n', '').replace(' ', '')


def extract_project(path: str, instances: List[str]) -> List[dict]:
    """Collect the tables of a PDF whose first column contains a keyword.

    Every table returned by ``PdfExtractAttr.parse_table_pro()`` is
    normalised (newlines and spaces removed from each cell) and kept when
    the first cell of at least one of its rows contains any of
    *instances*.

    Args:
        path: Path of the PDF file, passed to ``PdfExtractAttr`` as
            ``file_path``.
        instances: Keywords to look for in the first cell of each row.

    Returns:
        A list of dicts, one per matching table, with the keys
        ``"page_numbers"`` (taken unchanged from the parsed table) and
        ``"table"`` (the normalised rows).
    """
    agent = PdfExtractAttr(file_path=path)
    tables = agent.parse_table_pro()

    res = []
    for table in tables:
        tab = [[_clean_cell(cell) for cell in row] for row in table['table']]
        pages = table['page_numbers']
        # Only the first column is inspected. Empty rows are skipped here:
        # indexing `row[0]` on them used to raise IndexError.
        first_cells = (row[0] for row in tab if row)
        if any(batch_bool(instances, cell) for cell in first_cells):
            res.append({
                "page_numbers": pages,
                "table": tab
            })
    return res


if __name__ == '__main__':
    # Batch mode over a directory, kept for reference:
    # fs = scan_dir('/home/zzh/ocr/pdf', 'pdf')
    # start = time.time()
    # for f in (fs[:]):
    #     try:
    #         print(f)
    #         print(extract_project(f, ins))
    #         print('\n*********Runtime {} s *********\n'.format(time.time() - start))
    #     except BaseException as e:
    #         print('Something wrong')
    #         print(e)
    pdf_path = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司.pdf'
    # `ins` holds exactly the keyword list that was previously repeated
    # inline in this call.
    print(extract_project(pdf_path, ins))