12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970 |
- from re import findall
- from typing import List
- from text_extractor import get_instance
def rmb_to_digit(rmb_str):
    """Convert an amount written in Chinese financial numerals to a string.

    The amount is parsed group by group: digits combine with 拾/佰/仟 inside
    a four-digit group, and 万/亿 scale the whole group, so ``壹拾贰万元``
    is 120000 rather than 10 + 2 * 10000.  All arithmetic is done in whole
    分 (cents) so no floating-point rounding can leak into the result.

    Args:
        rmb_str: Text such as ``'壹万贰仟叁佰肆拾伍元陆角柒分'``.  Characters
            that are neither digits nor units (for example ``整``) are
            ignored.

    Returns:
        The amount as a decimal string with exactly two fractional digits,
        e.g. ``'12345.67'``.  An empty string yields ``'0.00'``.
    """
    digit_map = {'零': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4,
                 '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9}
    # Multipliers applied to a single digit inside one four-digit group.
    # 十/百/千 are accepted as plain-form spellings of 拾/佰/仟.
    small_units = {'拾': 10, '十': 10, '佰': 100, '百': 100,
                   '仟': 1000, '千': 1000}

    whole = 0       # yuan already closed off by 万 / 亿 / 元
    group = 0       # value of the group currently being read (< 10000)
    pending = None  # last digit read that has not met its unit yet
    cents = 0       # 角 / 分 part, expressed in 分

    for char in rmb_str:
        if char in digit_map:
            pending = digit_map[char]
        elif char in small_units:
            unit = small_units[char]
            if pending is None:
                # A bare 拾 is read as 壹拾; other bare units add nothing.
                pending = 1 if unit == 10 else 0
            group += pending * unit
            pending = None
        elif char == '万':
            if pending is None and group == 0:
                # A 万 directly after another large unit scales the running
                # total (e.g. 万万).
                whole *= 10000
            else:
                whole += (group + (pending or 0)) * 10000
            group, pending = 0, None
        elif char == '亿':
            # Everything read so far counts in units of 亿 (covers 万亿).
            whole = (whole + group + (pending or 0)) * 100000000
            group, pending = 0, None
        elif char in ('元', '圆'):
            whole += group + (pending or 0)
            group, pending = 0, None
        elif char == '角':
            cents += 10 * (pending or 0)
            pending = None
        elif char == '分':
            cents += pending or 0
            pending = None
        # Any other character carries no numeric value and is skipped.

    # Amounts written without a closing 元 still count what was read.
    whole += group + (pending or 0)
    total_cents = whole * 100 + cents
    return '{}.{:02d}'.format(total_cents // 100, total_cents % 100)
def match_price_zhs(text: str) -> List[str]:
    """Find amounts written in Chinese financial numerals inside *text*.

    A match starts with a non-zero digit or one of 拾/佰/仟 and continues
    with at least two more characters drawn from the digits, the units
    (元, 角, 分, 万, 亿, 百) and 整.

    Args:
        text: The text to search.

    Returns:
        The matched amounts in order of appearance.  When a match ends in
        ``整元``, ``角元``, ``分元`` or ``元元`` the final ``元`` is dropped.
    """
    # The classes are written without separators: inside [...] a comma is a
    # literal character, so listing "壹,贰,..." would also accept ",".
    leading = "壹贰叁肆伍陆柒捌玖拾佰仟"
    # 亿 is included so amounts of 100 million and above are not truncated.
    following = leading + "元角万亿分百整零"
    pattern = "[{0}][{1}]+[{1}]".format(leading, following)

    stray_endings = ('整元', '角元', '分元', '元元')
    return [
        amount[:-1] if amount.endswith(stray_endings) else amount
        for amount in findall(pattern, text)
    ]
def match_price_num(text: str) -> List[str]:
    """Find numeric prices that are preceded by a currency marker.

    Args:
        text: The text to search.

    Returns:
        Every match as a string, marker included, in order of appearance.
    """
    # Currency marker: a known symbol/abbreviation/ISO code, or failing that
    # any single character that is neither a word character nor whitespace.
    currency = (
        r"(?:\b(?:[BS]/\.|R(?:D?\$|p))|\b(?:[TN]T|[CJZ])\$|Дин\.|\b(?:Bs|Ft|Gs|K[Mč]|Lek|B[Zr]|k[nr]|[PQLSR]|лв|"
        r"ден|RM|MT|lei|zł|USD|GBP|EUR|JPY|CHF|SEK|DKK|NOK|SGD|HKD|AUD|TWD|NZD|CNY|KRW|INR|CAD|VEF|EGP|THB|IDR|"
        r"PKR|MYR|PHP|MXN|VND|CZK|HUF|PLN|TRY|ZAR|ILS|ARS|CLP|BRL|RUB|QAR|AED|COP|PEN|CNH|KWD|SAR)|\$[Ub]|"
        r"[^\w\s])"
    )
    # Amount: optional single whitespace, an integer part (comma-grouped or
    # plain), up to two decimals, and no further digits right after.
    amount = r"\s?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?(?!\.?\d)"
    return findall(currency + amount, text)
def match_duration(text: str) -> List[str]:
    """Find durations written as ``<number>日历天`` (calendar days) in *text*.

    Args:
        text: The text to search.

    Returns:
        Every match, e.g. ``['100日历天']``, in order of appearance.  The
        number never starts with a zero; leading zeros are left out of the
        match.
    """
    # One non-zero digit followed by any digits, so single-digit values and
    # values containing zeros (5, 100, 105) are matched as well.
    pattern = r"[1-9]\d*日历天"
    return findall(pattern, text)
def match_quality(text: str) -> List[str]:
    """Find every ``工程质量`` statement in *text*.

    Each match starts at ``工程质量`` and runs to the end of that line; at
    least one character must follow the keyword.

    Args:
        text: The text to search.

    Returns:
        The matched strings in order of appearance.
    """
    return findall(r"工程质量.+", text)
if __name__ == '__main__':
    # Ad-hoc check against one sample bid document: pull the total price (in
    # Chinese numerals and in digits), the duration and the quality statement
    # from the same sections of the same PDF.
    pdf_path = '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf'
    section_titles = ('投标函', '开标一览表')

    def _extract(keyword, matcher):
        # A fresh list is built per call, exactly as the literal lists did.
        return get_instance(list(section_titles), [keyword], pdf_path, matcher)

    price_zhs = _extract('人民币投标总报价', match_price_zhs)
    price_num = _extract('人民币投标总报价', match_price_num)
    duration = _extract('工期日历天', match_duration)
    quality = _extract('工程质量', match_quality)

    # The numeric match is compared without its first character.
    # NOTE(review): presumably that character is the currency symbol, and the
    # [0][0][0] indexing assumes get_instance returned at least one hit at
    # every nesting level; confirm against get_instance.
    converted = rmb_to_digit(price_zhs[0][0][0])
    valid = converted == price_num[0][0][0][1:]
|