project_loc.py 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-06-11 13:43:14
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-09-03 16:03:06
  6. # description: 总测试次数:82 成功:61 失败:21 成功率:0.7439024390243902
from typing import List, Optional
  8. def batch_bool(instances: List[str], text: str) -> bool:
  9. """
  10. 从列表中判断元素是否存在
  11. Args:
  12. instances: 字段
  13. text: 文本
  14. Returns:
  15. 是否存在
  16. """
  17. for i in instances:
  18. if i in text:
  19. return True
  20. return False
  21. def extract_project(table_list: list, instances: List[str] = ['合同金额', '合同价格', '发包人名称', '项目规模', '合同时间']) -> list:
  22. """
  23. 从表格中抽取项目业绩
  24. Args:
  25. table_list: PDF表格
  26. instance: 抽取的字段
  27. Returns:
  28. res 项目业绩表
  29. """
  30. res = []
  31. for table in table_list:
  32. tab = [[j.replace('\n', '').replace(' ', '') for j in i] for i in table['table']]
  33. pages = table['page_numbers']
  34. rows = [row[0] for row in tab]
  35. for i in rows:
  36. if batch_bool(instances, i):
  37. res.append({
  38. "page_numbers": pages,
  39. "table": tab
  40. })
  41. break
  42. return res
  43. if __name__ == '__main__':
  44. import json
  45. from pprint import pprint
  46. table_path = r'D:\\desktop\\三峡水利\\data\\0预审查初审详审测试数据\\三峡左岸及地下电站地坪整治\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-table.json'
  47. with open(table_path, 'r', encoding='utf-8') as jsonfile:
  48. tables = json.load(jsonfile)
  49. pprint(
  50. extract_project(
  51. table_list=tables,
  52. instances=['合同金额', '合同价格', '发包人名称', '项目规模', '合同时间']
  53. )
  54. )