project_loc.py 1.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
  1. from typing import List
  2. from get_info import PdfExtractAttr
  3. # from scan_dir import scan_dir
  4. import time
  5. ins = ['合同金额', '合同价格', '发包人名称']
  6. def batch_bool(instances: List[str], text: str) -> bool:
  7. for i in instances:
  8. if i in text:
  9. return True
  10. return False
  11. def extract_project(path: str, instances: List[str]):
  12. agent = PdfExtractAttr(file_path=path)
  13. tables = agent.parse_table_pro()
  14. res = []
  15. for table in tables:
  16. tab = [[j.replace('\n', '').replace(' ', '') for j in i] for i in table['table']]
  17. pages = table['page_numbers']
  18. rows = [row[0] for row in tab]
  19. for i in rows:
  20. # if '合同金额' in i or '合同价格' in i or '发包人名称' in i:
  21. if batch_bool(instances, i):
  22. res.append({
  23. "page_numbers": pages,
  24. "table": tab
  25. })
  26. break
  27. return res
  28. if __name__ == '__main__':
  29. # fs = scan_dir('/home/zzh/ocr/pdf', 'pdf')
  30. # start = time.time()
  31. # for f in (fs[:]):
  32. # try:
  33. # print(f)
  34. # print(extract_project(f, ins))
  35. # print('\n*********Runtime {} s *********\n'.format(time.time() - start))
  36. # except BaseException as e:
  37. # print('Something wrong')
  38. # print(e)
  39. print(extract_project(r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司.pdf', ['合同金额', '合同价格', '发包人名称']))