project_loc.py 1.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-06-11 13:43:14
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-09-03 16:03:06
from typing import List, Optional

from . import celery_app
  8. def batch_bool(instances: List[str], text: str) -> bool:
  9. """
  10. 从列表中判断元素是否存在
  11. Args:
  12. instances: 字段
  13. text: 文本
  14. Returns:
  15. 是否存在
  16. """
  17. for i in instances:
  18. if i in text:
  19. return True
  20. return False
  21. @celery_app.task
  22. def extract_project(table_list: list, instances: List[str] = ['合同金额', '合同价格', '发包人名称', '项目规模', '合同时间']) -> list:
  23. """
  24. 从表格中抽取项目业绩
  25. Args:
  26. table_list: PDF表格
  27. instance: 抽取的字段
  28. Returns:
  29. res 项目业绩表
  30. """
  31. res = []
  32. for table in table_list:
  33. tab = [[j.replace('\n', '').replace(' ', '') for j in i] for i in table['table']]
  34. pages = table['page_numbers']
  35. rows = [row[0] for row in tab]
  36. for i in rows:
  37. if batch_bool(instances, i):
  38. res.append({
  39. "page_numbers": pages,
  40. "table": tab
  41. })
  42. break
  43. return res
  44. if __name__ == '__main__':
  45. import json
  46. from pprint import pprint
  47. table_path = r'D:\\desktop\\三峡水利\\data\\projects\\三峡左岸及地下电站地坪整治\\投标\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-table.json'
  48. with open(table_path, 'r', encoding='utf-8') as jsonfile:
  49. tables = json.load(jsonfile)
  50. pprint(
  51. extract_project(
  52. table_list=tables,
  53. instances=['合同金额', '合同价格', '发包人名称', '项目规模', '合同时间']
  54. )
  55. )