# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2024-06-11 13:43:14
# @Last Modified by: privacy
# @Last Modified time: 2024-12-25 14:29:44
# Locate and review the technical section of a bid document.
from typing import List, Optional

import pandas as pd

from celery_tasks.LLMAgent import get_proj
from celery_tasks.text_extractor import similar_match
from celery_tasks.tools import filter_content, comment_clean, TitleLevelJudge


def tech_loc(scrutinize_dict: dict, outline_dict: List[dict], content_list: List[dict], supplier: str, project: str = None, file_name: str = '投标文件.pdf') -> Optional[dict]:
    """Locate the technical section of a bid and review it per scoring factor.

    Args:
        scrutinize_dict: detailed review criteria, keyed by review category;
            the technical category is the first key containing '技术'.
        outline_dict: outline entries of the bid document; each item carries
            at least 'title' and 'page_number'.
        content_list: page-level content of the bid document.
        supplier: supplier name, echoed into the result.
        project: project name (unused here; kept for interface compatibility).
        file_name: source PDF name reported back in the result.

    Returns:
        A dict with 'writeName' and 'scoringCriteria' (one entry per scoring
        factor), or None when no technical review method is found.
    """
    df = pd.DataFrame(outline_dict)
    title_judge = TitleLevelJudge(df.title.to_list())

    # Find the technical scoring method among the review categories.
    part = None
    for key in scrutinize_dict.keys():
        if '技术' in key:
            part = key
            break
    # No technical review method found: nothing to do.
    if not part:
        return None

    # Split every scoring standard into short clauses (period / semicolon /
    # comma boundaries) so the LLM prompt gets compact criteria.
    targets = []
    for ins in scrutinize_dict[part]:  # fixed: use `part`, not the leaked loop variable `key`
        clauses = []
        for sentence in ins['评分标准'].split('。'):
            for segment in sentence.split(';'):
                clauses.extend(segment.split(','))
        targets.append({
            'title': ins['评分因素'],
            'text': clauses,
            'weight': ins['权重'],
        })

    result = {
        'writeName': '',
        'scoringCriteria': []
    }

    # Review each scoring factor in turn.
    for item in targets:
        # Fuzzy-match the scoring factor against the outline titles.
        title_sims = similar_match(outline_dict, [item['title']], key='title')
        pages = [{
            'fileName': file_name,
            'conformFlag': 1,
            'pageStart': str(sim['page_number']),
            'pageEnd': str(sim['page_number']),
            'pageKey': '',
            'text': sim['title'],
            'score': sim['相似度'],
        } for sim in title_sims]

        input_list = []
        for page in pages:
            start_page = int(page['pageStart']) if page['pageStart'] else 0
            # A section ends where the next title of the same level begins.
            next_title = title_judge.find_next_title(page['text'])
            try:
                end_page = int(df[df.title == next_title]['page_number'].values[0])
                print('当前标题为: ', page['text'], page['pageStart'], '下一标题为: ', next_title, end_page)
            except Exception:
                print('当前标题为: ', page['text'], '找不到下一标题')
                # Fall back to a one-page section. Fixed: the original kept
                # the raw pageStart *string* here, so the int comparison
                # below raised an uncaught TypeError.
                end_page = start_page
            if start_page > end_page:
                # Matched page lies after the computed end: swap the bounds.
                start_page, end_page = end_page, start_page
                page['pageStart'] = str(start_page)
            page['pageEnd'] = str(end_page)
            try:
                # Fixed: the original read page['page_number'], a key absent
                # from `pages`, so every call failed with KeyError and was
                # silently swallowed — the LLM always got an empty input.
                filtered_content = filter_content(content_list=content_list, start_page=start_page, end_page=end_page)
                input_list.append(filtered_content)
            except Exception:
                # Best effort: skip sections whose content cannot be filtered.
                pass

        # Summarize the matched content against the scoring standard via LLM.
        comment = comment_clean(get_proj(input_json=input_list, standard=''.join(item['text'])))

        result['scoringCriteria'].append({
            'scoringFactors': item['title'],
            'scoringStandard': ''.join(item['text']),
            'percentage': item['weight'],
            'expertAdvice': '',
            'writeName': '',
            'suppliers': [{
                'expertAdvice': '',
                'writeName': '',
                'name': supplier,
                'grade': 'B',
                'supplier': f'概括文字: {comment}',
                'pages': pages
            }]
        })
    return result


if __name__ == '__main__':
    import os
    import json
    from glob import glob
    from pprint import pprint

    with open('bidding_dataset.json', 'r', encoding='utf-8') as fp:
        scrutinizes = json.load(fp)

    for project in scrutinizes.keys():
        # 招标要素 — review criteria for this project.
        scrutinize_dict = scrutinizes[project]
        for file in glob(f'./data/0预审查初审详审测试数据/{project}/*/*-outline.json'):
            with open(file, 'r', encoding='utf-8') as fp:
                outline_dict = json.load(fp)
            with open(file.replace('outline', 'content'), 'r', encoding='utf-8') as fp:
                content_list = json.load(fp)
            # Delete outline files with no content so they are skipped next run.
            if outline_dict == []:
                os.remove(file)
                continue
            # Supplier name is the parent directory of the outline file.
            # NOTE(review): the '\\' split assumes Windows-style glob paths —
            # confirm, or switch to os.path.normpath(file).split(os.sep)[-2].
            supplier = file.split('\\')[-2]
            result = tech_loc(
                scrutinize_dict=scrutinize_dict,
                outline_dict=outline_dict,
                content_list=content_list,
                supplier=supplier,
                project=project,
                file_name=file
            )
            pprint(result)
            exit(0)