# extract_price.py
  1. import os
  2. os.environ['TRANSFORMERS_OFFLINE'] = '1'
  3. os.environ['HF_DATASETS_OFFLINE'] = '1'
  4. from re import findall
  5. from typing import List
  6. from text_extractor import get_instance
  7. def rmb_to_digit(rmb_str):
  8. digit_map = {'零': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9}
  9. unit_map = {'分': 0.01, '角': 0.1, '元': 1, '拾': 10, '佰': 100, '仟': 1000, '万': 10000, '亿': 100000000}
  10. digit = 0
  11. total = 0
  12. tmp = 0
  13. for char in rmb_str:
  14. if char in digit_map:
  15. digit = digit_map[char]
  16. elif char in unit_map:
  17. if digit + tmp:
  18. total += (tmp + digit) * unit_map[char]
  19. tmp = digit = 0
  20. else:
  21. total *= unit_map[char]
  22. else:
  23. tmp = digit
  24. total += tmp + digit
  25. return '{:.2f}'.format(total)
  26. def match_price_zhs(text: str) -> List[str]:
  27. pattern = (r"[壹,贰,叁,肆,伍,陆,柒,捌,玖,拾,佰,仟][壹,贰,叁,肆,伍,陆,柒,捌,玖,拾,佰,仟,元,角,万,分,百,整,零]+"
  28. r"[壹,贰,叁,肆,伍,陆,柒,捌,玖,拾,佰,仟,元,角,万,分,百,整,零]")
  29. temp = findall(pattern, text)
  30. for i in range(len(temp)):
  31. if temp[i].endswith('整元') or temp[i].endswith('角元') or temp[i].endswith('分元') or temp[i].endswith('元元'):
  32. temp[i] = temp[i][:-1]
  33. return temp
  34. def match_price_num(text: str) -> List[str]:
  35. pattern = (r"(?:\b(?:[BS]/\.|R(?:D?\$|p))|\b(?:[TN]T|[CJZ])\$|Дин\.|\b(?:Bs|Ft|Gs|K[Mč]|Lek|B[Zr]|k[nr]|[PQLSR]|лв|"
  36. r"ден|RM|MT|lei|zł|USD|GBP|EUR|JPY|CHF|SEK|DKK|NOK|SGD|HKD|AUD|TWD|NZD|CNY|KRW|INR|CAD|VEF|EGP|THB|IDR|"
  37. r"PKR|MYR|PHP|MXN|VND|CZK|HUF|PLN|TRY|ZAR|ILS|ARS|CLP|BRL|RUB|QAR|AED|COP|PEN|CNH|KWD|SAR)|\$[Ub]|"
  38. r"[^\w\s])\s?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?(?!\.?\d)")
  39. return findall(pattern, text)
  40. def match_duration(text: str) -> List[str]:
  41. pattern = r"[1-9]+[\d]日历天"
  42. return findall(pattern, text)
  43. def match_quality(text: str) -> List[str]:
  44. pattern = r"工程质量.+"
  45. return findall(pattern, text)
  46. if __name__ == '__main__':
  47. from pprint import pprint
  48. pdf_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/投标文件/湖北建新建设工程有限公司_T221100130348%2F01整本文件/MainPdfFile/投标文件-修改版9-5-1-1.pdf'
  49. price_zhs = get_instance(
  50. title_instances=['投标函', '开标一览表'],
  51. content_instances=['人民币投标总报价'],
  52. pdf_path,
  53. match_price_zhs
  54. )
  55. price_num = get_instance(
  56. title_instances=['投标函', '开标一览表'],
  57. content_instances=['人民币投标总报价'],
  58. pdf_path,
  59. match_price_num
  60. )
  61. duration = get_instance(
  62. title_instances=['投标函', '开标一览表'],
  63. content_instances=['工期日历天'],
  64. pdf_path,
  65. match_duration
  66. )
  67. quality = get_instance(
  68. title_instances=['投标函', '开标一览表'],
  69. content_instances=['工程质量'],
  70. pdf_path,
  71. match_quality
  72. )
  73. valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
  74. pprint({
  75. "price_zhs": price_zhs,
  76. "price_num": price_num,
  77. "duration": duration,
  78. "quality": quality,
  79. "valid": valid
  80. })