"""Regex helpers that pull price, duration and quality fields out of bid text.

Also converts formal Chinese (financial) amounts such as ``壹万贰仟元整`` into
a plain decimal string so they can be compared with the numeric price.
"""
from re import findall
from typing import List

# Numerals used in formal (financial) Chinese amounts.
_RMB_DIGITS = {'零': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9}
# Multipliers that bind to a single numeral inside one section (i.e. below 万).
# '百' is accepted as an alias of '佰' because match_price_zhs can return it.
_RMB_SECTION_UNITS = {'拾': 10, '佰': 100, '百': 100, '仟': 1000}
# Multipliers that scale a whole section.
_RMB_LARGE_UNITS = {'万': 10 ** 4, '亿': 10 ** 8}
# Currency units expressed in fen, so all arithmetic stays in exact integers.
_RMB_CURRENCY_UNITS_FEN = {'分': 1, '角': 10, '元': 100}

# A formal numeral/unit followed by at least two more amount characters.
# No separators inside the classes: a comma there would match a literal ','.
_PRICE_ZHS_PATTERN = r"[壹贰叁肆伍陆柒捌玖拾佰仟][壹贰叁肆伍陆柒捌玖拾佰仟元角万亿分百整零]{2,}"
# When the amount is directly followed by another '元' in the text, the greedy
# match swallows it; these endings identify that case.
_PRICE_ZHS_EXTRA_YUAN = ('整元', '角元', '分元', '元元')

# Currency code/symbol (or any single punctuation-like character), an optional
# space, then a number with optional thousands separators and 1-2 decimals.
_PRICE_NUM_PATTERN = (
    r"(?:\b(?:[BS]/\.|R(?:D?\$|p))|\b(?:[TN]T|[CJZ])\$|Дин\.|\b(?:Bs|Ft|Gs|K[Mč]|Lek|B[Zr]|k[nr]|[PQLSR]|лв|"
    r"ден|RM|MT|lei|zł|USD|GBP|EUR|JPY|CHF|SEK|DKK|NOK|SGD|HKD|AUD|TWD|NZD|CNY|KRW|INR|CAD|VEF|EGP|THB|IDR|"
    r"PKR|MYR|PHP|MXN|VND|CZK|HUF|PLN|TRY|ZAR|ILS|ARS|CLP|BRL|RUB|QAR|AED|COP|PEN|CNH|KWD|SAR)|\$[Ub]|"
    r"[^\w\s])\s?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?(?!\.?\d)"
)

_DURATION_PATTERN = r"[1-9]\d*日历天"
_QUALITY_PATTERN = r"工程质量.+"


def rmb_to_digit(rmb_str: str) -> str:
    """Convert a formal Chinese RMB amount to a decimal string.

    The amount is parsed section by section: numerals combine with 拾/佰/仟
    into a section value, and 万/亿/元/角/分 close the section and add it to
    the total. Characters that are neither numerals nor units (for example
    ``整``) are ignored.

    Args:
        rmb_str: Amount written with formal numerals, e.g. ``'壹拾贰万元整'``.

    Returns:
        The amount in yuan with exactly two decimals, e.g. ``'120000.00'``.
        An empty string yields ``'0.00'``.
    """
    total_fen = 0   # closed sections, in fen
    section = 0     # value of the section being read, in its own unit
    digit = None    # numeral not yet bound to a unit

    for char in rmb_str:
        if char in _RMB_DIGITS:
            digit = _RMB_DIGITS[char]
        elif char in _RMB_SECTION_UNITS:
            if digit is None:
                # A bare '拾' (as in '拾元') stands for ten.
                digit = 1 if char == '拾' else 0
            section += digit * _RMB_SECTION_UNITS[char]
            digit = None
        elif char in _RMB_LARGE_UNITS:
            pending = section + (digit or 0)
            scale = _RMB_LARGE_UNITS[char]
            if char == '亿' or not pending:
                # 亿 scales everything read so far ('壹万亿'); a large unit
                # with nothing pending scales the running total ('壹万万').
                total_fen = (total_fen + pending * 100) * scale
            else:
                # 万 only scales its own section; earlier 亿 parts stay as is.
                total_fen += pending * 100 * scale
            section, digit = 0, None
        elif char in _RMB_CURRENCY_UNITS_FEN:
            total_fen += (section + (digit or 0)) * _RMB_CURRENCY_UNITS_FEN[char]
            section, digit = 0, None

    # Anything left without a closing unit is counted as yuan.
    total_fen += (section + (digit or 0)) * 100
    yuan, fen = divmod(total_fen, 100)
    return '{}.{:02d}'.format(yuan, fen)


def match_price_zhs(text: str) -> List[str]:
    """Find formal Chinese amounts in *text*.

    A match starts with a numeral or 拾/佰/仟 and continues for at least two
    more amount characters. If a match ends in one of the doubled endings
    (``整元``, ``角元``, ``分元``, ``元元``) the final ``元`` is dropped.

    Args:
        text: Text to search.

    Returns:
        The matched amounts in order of appearance; empty if none are found.
    """
    return [
        amount[:-1] if amount.endswith(_PRICE_ZHS_EXTRA_YUAN) else amount
        for amount in findall(_PRICE_ZHS_PATTERN, text)
    ]


def match_price_num(text: str) -> List[str]:
    """Find numeric prices (currency marker followed by a number) in *text*.

    Args:
        text: Text to search.

    Returns:
        Each match including its leading currency code or symbol, e.g.
        ``'¥1,230,000.00'``; empty if none are found.
    """
    return findall(_PRICE_NUM_PATTERN, text)


def normalize_price_num(price: str) -> str:
    """Reduce a numeric price to a plain two-decimal string.

    Takes the last number in *price*, drops thousands separators and leading
    zeros, and pads the fraction to two digits, so the result is directly
    comparable with the output of :func:`rmb_to_digit`.

    Args:
        price: A price such as ``'¥1,230,000.5'`` or ``'USD 100'``.

    Returns:
        For example ``'1230000.50'``; an empty string if *price* has no digits.
    """
    numbers = findall(r"\d[\d,]*(?:\.\d{1,2})?", price)
    if not numbers:
        return ''
    whole, _, fraction = numbers[-1].replace(',', '').partition('.')
    return '{}.{}'.format(int(whole), fraction.ljust(2, '0'))


def match_duration(text: str) -> List[str]:
    """Find durations written as a positive integer followed by ``日历天``.

    Args:
        text: Text to search.

    Returns:
        Matches such as ``'100日历天'``; empty if none are found.
    """
    return findall(_DURATION_PATTERN, text)


def match_quality(text: str) -> List[str]:
    """Find quality statements: ``工程质量`` plus the rest of that line.

    Args:
        text: Text to search.

    Returns:
        One entry per line containing ``工程质量`` followed by at least one
        more character; empty if none are found.
    """
    return findall(_QUALITY_PATTERN, text)


if __name__ == '__main__':
    # Imported here because only this driver needs the PDF extractor; the
    # helpers above stay importable without it.
    from text_extractor import get_instance

    pdf_path = '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf'
    section_titles = ['投标函', '开标一览表']

    # Fresh title lists per call in case get_instance mutates its arguments.
    price_zhs = get_instance(list(section_titles), ['人民币投标总报价'], pdf_path, match_price_zhs)
    price_num = get_instance(list(section_titles), ['人民币投标总报价'], pdf_path, match_price_num)
    duration = get_instance(list(section_titles), ['工期日历天'], pdf_path, match_duration)
    quality = get_instance(list(section_titles), ['工程质量'], pdf_path, match_quality)

    # NOTE(review): assumes get_instance returns a triply nested sequence whose
    # innermost items are the matcher's strings - confirm against text_extractor.
    # The numeric price is normalised so separators, a space after the symbol
    # or missing decimals do not cause a false mismatch.
    valid = rmb_to_digit(price_zhs[0][0][0]) == normalize_price_num(price_num[0][0][0])