# Regex helpers for pulling bid prices (Chinese capital numerals and numeric
# form), construction durations and quality clauses out of tender-document text.
- import os
- os.environ['TRANSFORMERS_OFFLINE'] = '1'
- os.environ['HF_DATASETS_OFFLINE'] = '1'
- from re import findall
- from typing import List
- from text_extractor import get_instance
def rmb_to_digit(rmb_str):
    """Convert an amount written in Chinese financial numerals to a decimal string.

    The text is read left to right. Digits (零-玖) are held until the unit that
    follows them is seen:

    * 拾/佰/仟 (and the plain forms 十/百/千) scale the pending digit and add
      it to the current group; a unit with no digit in front counts as one of
      that unit (``拾元`` is 10).
    * 万/亿 close the current group and scale the *whole* group, so
      ``壹拾贰万`` is 120000 rather than 10 + 2 * 10000.
    * 元/圆 closes the integer part; 角/分 add tenths/hundredths.

    Characters that are not numerals or units (整, spaces, punctuation, ...)
    are ignored. A trailing digit with no unit is added as whole yuan.

    Args:
        rmb_str: The amount text, e.g. ``'壹佰贰拾叁元肆角伍分'``.

    Returns:
        The amount formatted with exactly two decimals, e.g. ``'123.45'``.
        An empty or numeral-free string yields ``'0.00'``.
    """
    digit_map = {'零': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9}
    # Units that scale only the single digit directly in front of them.
    place_map = {'拾': 10, '十': 10, '佰': 100, '百': 100, '仟': 1000, '千': 1000}
    # Units that scale everything accumulated since the previous larger unit.
    section_map = {'万': 10 ** 4, '亿': 10 ** 8}
    # Fractional units, expressed in cents so the arithmetic stays integral.
    cent_map = {'角': 10, '分': 1}
    yuan_marks = ('元', '圆')

    yuan = 0       # value already closed off by a 万/亿 unit
    section = 0    # value of the group currently being read
    cents = 0      # fractional part, in hundredths
    digit = None   # digit waiting for its unit; None means "no digit seen"

    for char in rmb_str:
        if char in digit_map:
            digit = digit_map[char]
        elif char in place_map:
            section += (1 if digit is None else digit) * place_map[char]
            digit = None
        elif char in section_map:
            unit = section_map[char]
            group = section + (digit or 0)
            if group:
                # Fold any smaller closed part (e.g. the 壹万 in 壹万贰仟亿)
                # into the group before scaling it.
                carried = yuan % unit
                yuan += (carried + group) * unit - carried
            else:
                # Stacked units such as 万亿 scale what is already there.
                yuan *= unit
            section = 0
            digit = None
        elif char in yuan_marks:
            section += digit or 0
            digit = None
        elif char in cent_map:
            cents += (digit or 0) * cent_map[char]
            digit = None
        # Anything else carries no numeric meaning and is skipped.

    yuan += section + (digit or 0)
    whole, fraction = divmod(yuan * 100 + cents, 100)
    return '{}.{:02d}'.format(whole, fraction)
def match_price_zhs(text: str) -> List[str]:
    """Find amounts written in Chinese financial (capital) numerals.

    A match starts with a capital digit or one of 拾/佰/仟 and continues with
    at least two more characters drawn from the digits, the units
    (拾 佰 仟 百 万 亿 元 角 分) and the fillers 整/零, so every match is at
    least three characters long.

    The character classes deliberately contain no separators: an ASCII comma
    inside ``[...]`` is a literal member, which would let matches start with
    or run across commas.

    Args:
        text: Text to search.

    Returns:
        All non-overlapping matches in order of appearance. When a match ends
        in 整元, 角元, 分元 or 元元, the final 元 is dropped.
    """
    leading = "[壹贰叁肆伍陆柒捌玖拾佰仟]"
    following = "[壹贰叁肆伍陆柒捌玖拾佰仟元角万亿分百整零]"
    pattern = leading + following + "{2,}"

    # A 元 glued onto an already complete amount is not part of it.
    redundant_yuan_endings = ('整元', '角元', '分元', '元元')
    return [
        amount[:-1] if amount.endswith(redundant_yuan_endings) else amount
        for amount in findall(pattern, text)
    ]
def match_price_num(text: str) -> List[str]:
    """Find numeric prices that are introduced by a currency marker.

    The expression accepts, in order:

    * a currency marker: one of the listed symbols/ISO-style codes, or any
      single character that is neither a word character nor whitespace;
    * at most one whitespace character;
    * an integer, either grouped with thousands commas or as a plain run of
      digits;
    * an optional fraction of one or two digits.

    The trailing negative lookahead rejects a match that would stop directly
    in front of further digits.

    Args:
        text: Text to search.

    Returns:
        The matched substrings (marker included) in order of appearance.
    """
    fragments = (
        r"(?:\b(?:[BS]/\.|R(?:D?\$|p))|\b(?:[TN]T|[CJZ])\$|Дин\.|\b(?:Bs|Ft|Gs|K[Mč]|Lek|B[Zr]|k[nr]|[PQLSR]|лв|",
        r"ден|RM|MT|lei|zł|USD|GBP|EUR|JPY|CHF|SEK|DKK|NOK|SGD|HKD|AUD|TWD|NZD|CNY|KRW|INR|CAD|VEF|EGP|THB|IDR|",
        r"PKR|MYR|PHP|MXN|VND|CZK|HUF|PLN|TRY|ZAR|ILS|ARS|CLP|BRL|RUB|QAR|AED|COP|PEN|CNH|KWD|SAR)|\$[Ub]|",
        r"[^\w\s])\s?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?(?!\.?\d)",
    )
    price_regex = "".join(fragments)
    # The expression has no capturing groups, so findall yields whole matches.
    found = findall(price_regex, text)
    return found
def match_duration(text: str) -> List[str]:
    """Find construction durations of the form ``<number>日历天`` (calendar days).

    The number must not start with 0 but may otherwise contain any digits, so
    single-digit values and values containing zeros (``5``, ``100``, ``105``)
    are matched as a whole.

    Args:
        text: Text to search.

    Returns:
        The matched substrings, e.g. ``['100日历天']``, in order of appearance.
    """
    # One non-zero leading digit followed by any further digits.
    pattern = r"[1-9]\d*日历天"
    return findall(pattern, text)
def match_quality(text: str) -> List[str]:
    """Find quality clauses: ``工程质量`` plus the rest of its line.

    At least one character must follow the keyword, and because ``.`` does
    not match a newline each hit ends at the end of its line.

    Args:
        text: Text to search.

    Returns:
        The matched substrings in order of appearance.
    """
    quality_clause = r"工程质量.+"
    clauses = findall(quality_clause, text)
    return clauses
if __name__ == '__main__':
    # Manual smoke run: parse one sample bid PDF and print what each extractor
    # finds in the bid-letter / bid-opening-summary sections.
    from pprint import pprint

    from get_info import PdfExtractAttr

    pdf_path = r'./data/0预审查初审详审测试数据/三峡左岸及地下电站地坪整治/湖北建新建设工程有限公司_T221100130348%2F01整本文件/MainPdfFile/投标文件-修改版9-5-1-1.pdf'
    document = PdfExtractAttr(file_path=pdf_path)
    titles = document.parse_title()
    texts = document.parse_text()

    # Every lookup searches under the same section titles.
    section_titles = ['投标函', '开标一览表']
    # (result key, content keyword, extractor), in the order they are run.
    lookups = (
        ("price_zhs", '人民币投标总报价', match_price_zhs),
        ("price_num", '人民币投标总报价', match_price_num),
        ("duration", '工期日历天', match_duration),
        ("quality", '工程质量', match_quality),
    )

    report = {}
    for result_key, keyword, extractor in lookups:
        report[result_key] = get_instance(
            # Fresh list per call, as get_instance's handling of it is not visible here.
            title_instances=list(section_titles),
            content_instances=[keyword],
            titles_list=titles,
            texts_list=texts,
            extractor=extractor,
        )

    # NOTE: a disabled cross-check of the two price forms used to live here:
    #   valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
    # It stays disabled; the result would have been reported under "valid".
    pprint(report)
|