extract_price.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-06-11 13:43:14
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-09-05 10:37:12
  6. from pprint import pprint
  7. from celery_tasks.text_extractor import get_instance
  8. from celery_tasks.tools import match_price_zhs, match_price_num, match_duration, match_quality
  9. if __name__ == '__main__':
  10. import json
  11. pdf_path = r'./data/projects/三峡左岸及地下电站地坪整治/投标/湖北建新建设工程有限公司_T221100130348%2F01整本文件/投标文件-修改版9-5-1-1.pdf'
  12. with open(pdf_path[:-4] + '-title.json', 'r', encoding='utf-8') as fp:
  13. titles = json.load(fp)
  14. with open(pdf_path[:-4] + '-text.json', 'r', encoding='utf-8') as fp:
  15. texts = json.load(fp)
  16. price_zhs = get_instance(
  17. title_instances=['投标函', '开标一览表'],
  18. content_instances=['人民币投标总报价'],
  19. titles_list=titles,
  20. texts_list=texts,
  21. extractor=match_price_zhs
  22. )
  23. price_num = get_instance(
  24. title_instances=['投标函', '开标一览表'],
  25. content_instances=['人民币投标总报价'],
  26. titles_list=titles,
  27. texts_list=texts,
  28. extractor=match_price_num
  29. )
  30. duration = get_instance(
  31. title_instances=['投标函', '开标一览表'],
  32. content_instances=['工期日历天'],
  33. titles_list=titles,
  34. texts_list=texts,
  35. extractor=match_duration
  36. )
  37. quality = get_instance(
  38. title_instances=['投标函', '开标一览表'],
  39. content_instances=['工程质量'],
  40. titles_list=titles,
  41. texts_list=texts,
  42. extractor=match_quality
  43. )
  44. # valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
  45. pprint({
  46. "price_zhs": price_zhs,
  47. "price_num": price_num,
  48. "duration": duration,
  49. "quality": quality,
  50. # "valid": valid
  51. })