1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253 |
- from typing import List
- from get_info import PdfExtractAttr
- # from scan_dir import scan_dir
- import time
# Default keywords to look for in the first column of a table:
# contract amount, contract price, and employer (contract-issuing party) name.
ins = [
    '合同金额',
    '合同价格',
    '发包人名称',
]
def batch_bool(instances: List[str], text: str) -> bool:
    """Return True when at least one of *instances* occurs as a substring of *text*.

    Args:
        instances: Candidate substrings to search for.
        text: The string that is searched.

    Returns:
        True as soon as one candidate is found in ``text``; False when none
        is found or when ``instances`` is empty.
    """
    return any(keyword in text for keyword in instances)
def extract_project(path: str, instances: List[str]):
    """Collect the tables of a PDF whose first column mentions any keyword.

    The file at ``path`` is handed to ``PdfExtractAttr`` and its
    ``parse_table_pro()`` result is iterated. For every item, the cells under
    the ``'table'`` key are stripped of newline and space characters; the
    item is kept when any first-column cell of the cleaned table contains at
    least one of ``instances`` (checked through ``batch_bool``).

    Args:
        path: Location of the PDF file, passed as ``file_path``.
        instances: Keywords searched for in the first cell of each row.

    Returns:
        A list of dicts, one per matching table, each holding
        ``"page_numbers"`` (copied from the parsed item) and ``"table"``
        (the cleaned rows).

    NOTE(review): every cell is assumed to be a ``str`` and every row to
    have at least one cell -- ``.replace`` and ``row[0]`` fail otherwise.
    Confirm against what ``parse_table_pro`` actually yields.
    """
    extractor = PdfExtractAttr(file_path=path)
    parsed_tables = extractor.parse_table_pro()
    matched = []
    for entry in parsed_tables:
        # Normalise every cell so keywords split by line breaks or padded
        # with spaces can still be found.
        cleaned = []
        for raw_row in entry['table']:
            cleaned.append(
                [cell.replace('\n', '').replace(' ', '') for cell in raw_row]
            )
        page_numbers = entry['page_numbers']

        # The first column is built eagerly before testing it.
        first_column = [row[0] for row in cleaned]
        if any(batch_bool(instances, head) for head in first_column):
            matched.append({
                "page_numbers": page_numbers,
                "table": cleaned,
            })
    return matched
if __name__ == '__main__':
    # Disabled batch mode: scan a directory for PDFs, run the extraction on
    # each file and print the elapsed time, reporting any failure.
    # fs = scan_dir('/home/zzh/ocr/pdf', 'pdf')
    # start = time.time()
    # for f in (fs[:]):
    #     try:
    #         print(f)
    #         print(extract_project(f, ins))
    #         print('\n*********Runtime {} s *********\n'.format(time.time() - start))
    #     except BaseException as e:
    #         print('Something wrong')
    #         print(e)

    # Single-file run against one sample document.
    sample_pdf = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司.pdf'
    keywords = ['合同金额', '合同价格', '发包人名称']
    matched_tables = extract_project(sample_pdf, keywords)
    print(matched_tables)
|