extract_price.py 3.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. from re import findall
  2. from typing import List
  3. from text_extractor import get_instance
  4. def rmb_to_digit(rmb_str):
  5. digit_map = {'零': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9}
  6. unit_map = {'分': 0.01, '角': 0.1, '元': 1, '拾': 10, '佰': 100, '仟': 1000, '万': 10000, '亿': 100000000}
  7. digit = 0
  8. total = 0
  9. tmp = 0
  10. for char in rmb_str:
  11. if char in digit_map:
  12. digit = digit_map[char]
  13. elif char in unit_map:
  14. if digit + tmp:
  15. total += (tmp + digit) * unit_map[char]
  16. tmp = digit = 0
  17. else:
  18. total *= unit_map[char]
  19. else:
  20. tmp = digit
  21. total += tmp + digit
  22. return '{:.2f}'.format(total)
  23. def match_price_zhs(text: str) -> List[str]:
  24. pattern = (r"[壹,贰,叁,肆,伍,陆,柒,捌,玖,拾,佰,仟][壹,贰,叁,肆,伍,陆,柒,捌,玖,拾,佰,仟,元,角,万,分,百,整,零]+"
  25. r"[壹,贰,叁,肆,伍,陆,柒,捌,玖,拾,佰,仟,元,角,万,分,百,整,零]")
  26. temp = findall(pattern, text)
  27. for i in range(len(temp)):
  28. if temp[i].endswith('整元') or temp[i].endswith('角元') or temp[i].endswith('分元') or temp[i].endswith('元元'):
  29. temp[i] = temp[i][:-1]
  30. return temp
  31. def match_price_num(text: str) -> List[str]:
  32. pattern = (r"(?:\b(?:[BS]/\.|R(?:D?\$|p))|\b(?:[TN]T|[CJZ])\$|Дин\.|\b(?:Bs|Ft|Gs|K[Mč]|Lek|B[Zr]|k[nr]|[PQLSR]|лв|"
  33. r"ден|RM|MT|lei|zł|USD|GBP|EUR|JPY|CHF|SEK|DKK|NOK|SGD|HKD|AUD|TWD|NZD|CNY|KRW|INR|CAD|VEF|EGP|THB|IDR|"
  34. r"PKR|MYR|PHP|MXN|VND|CZK|HUF|PLN|TRY|ZAR|ILS|ARS|CLP|BRL|RUB|QAR|AED|COP|PEN|CNH|KWD|SAR)|\$[Ub]|"
  35. r"[^\w\s])\s?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?(?!\.?\d)")
  36. return findall(pattern, text)
  37. def match_duration(text: str) -> List[str]:
  38. pattern = r"[1-9]+[\d]日历天"
  39. return findall(pattern, text)
  40. def match_quality(text: str) -> List[str]:
  41. pattern = r"工程质量.+"
  42. return findall(pattern, text)
  43. if __name__ == '__main__':
  44. price_zhs = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
  45. '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
  46. match_price_zhs)
  47. price_num = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
  48. '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
  49. match_price_num)
  50. duration = get_instance(['投标函', '开标一览表'], ['工期日历天'],
  51. '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
  52. match_duration)
  53. quality = get_instance(['投标函', '开标一览表'], ['工程质量'],
  54. '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
  55. match_quality)
  56. valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]