# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2024-06-11 13:43:14
# @Last Modified by:   privacy
# @Last Modified time: 2024-09-05 10:37:12
"""Ad-hoc driver that runs the bid-field extractors against one sample tender.

When executed as a script it loads the ``-title.json`` and ``-text.json``
files that sit next to a hard-coded PDF path, calls ``get_instance`` once per
field of interest (price in Chinese numerals, numeric price, duration,
quality) and pretty-prints the collected results.
"""
from pprint import pprint

from celery_tasks.text_extractor import get_instance
from celery_tasks.tools import match_price_zhs, match_price_num, match_duration, match_quality


if __name__ == '__main__':
    import json

    pdf_path = r'./data/projects/三峡左岸及地下电站地坪整治/投标/湖北建新建设工程有限公司_T221100130348%2F01整本文件/投标文件-修改版9-5-1-1.pdf'

    # Drop the trailing 4 characters (".pdf"); both JSON side files reuse
    # the remaining prefix.
    path_stem = pdf_path[:-4]

    with open(path_stem + '-title.json', 'r', encoding='utf-8') as title_fp:
        titles = json.load(title_fp)

    with open(path_stem + '-text.json', 'r', encoding='utf-8') as text_fp:
        texts = json.load(text_fp)

    # Every lookup is passed the same two title strings.
    section_titles = ('投标函', '开标一览表')

    # (result key, content keyword, extractor callable) -- evaluated in order.
    field_specs = (
        ("price_zhs", '人民币投标总报价', match_price_zhs),
        ("price_num", '人民币投标总报价', match_price_num),
        ("duration", '工期日历天', match_duration),
        ("quality", '工程质量', match_quality),
    )

    report = {}
    for result_key, content_keyword, extractor_fn in field_specs:
        report[result_key] = get_instance(
            # Fresh lists per call, exactly as separate literal lists would be.
            title_instances=list(section_titles),
            content_instances=[content_keyword],
            titles_list=titles,
            texts_list=texts,
            extractor=extractor_fn,
        )

    # Disabled cross-check between the two price representations, kept for
    # reference (``rmb_to_digit`` is not imported in this file):
    # valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
    # report["valid"] = valid

    pprint(report)