# project_loc.py
  1. import time
  2. import json
  3. from typing import (
  4. List,
  5. Optional
  6. )
  7. from get_info import PdfExtractAttr
  8. ins = ['合同金额', '合同价格', '发包人名称']
  9. def batch_bool(instances: List[str], text: str) -> bool:
  10. for i in instances:
  11. if i in text:
  12. return True
  13. return False
  14. def extract_project(instances: List[str], table_dict: Optional[dict] = None, table_path: Optional[str] = None, pdf_path: Optional[str] = None) -> list:
  15. """
  16. 从表格中抽取项目业绩
  17. Args:
  18. instance: 抽取的字段
  19. table_dict: json表格
  20. table_path: 表格文件路径
  21. pdf_path: pdf源文件路径
  22. Returns:
  23. res 项目业绩表
  24. """
  25. if table_dict:
  26. tables = table_dict
  27. elif table_path:
  28. with open(table_path, 'r', encoding='utf-8') as jsonfile:
  29. tables = json.load(jsonfile)
  30. elif pdf_path:
  31. agent = PdfExtractAttr(file_path=pdf_path)
  32. tables = agent.parse_table_pro()
  33. else:
  34. raise ValueError("请输入需要解析的文件!")
  35. res = []
  36. for table in tables:
  37. tab = [[j.replace('\n', '').replace(' ', '') for j in i] for i in table['table']]
  38. pages = table['page_numbers']
  39. rows = [row[0] for row in tab]
  40. for i in rows:
  41. if batch_bool(instances, i):
  42. res.append({
  43. "page_numbers": pages,
  44. "table": tab
  45. })
  46. break
  47. return res
  48. if __name__ == '__main__':
  49. from pprint import pprint
  50. file = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司.pdf'
  51. table_path = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司-table.json'
  52. with open(table_path, 'r', encoding='utf-8') as jsonfile:
  53. tables = json.load(jsonfile)
  54. pprint(
  55. extract_project(
  56. instances=['合同金额', '合同价格', '发包人名称'],
  57. table_dict=tables
  58. )
  59. )