123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151 |
- # -*- coding: utf-8 -*-
- # @Author: privacy
- # @Date: 2024-06-11 13:43:14
- # @Last Modified by: privacy
- # @Last Modified time: 2024-12-25 14:29:44
- # 技术部分定位
- from typing import List, Optional
- import pandas as pd
- from celery_tasks.LLMAgent import get_proj
- from celery_tasks.text_extractor import similar_match
- from celery_tasks.tools import filter_content, comment_clean, TitleLevelJudge
def tech_loc(scrutinize_dict: dict, outline_dict: List[dict], content_list: List[dict], supplier: str, project: str = None, file_name: str = '投标文件.pdf') -> Optional[dict]:
    """Locate the technical section of a bid document and build its review result.

    Args:
        scrutinize_dict: detailed review criteria keyed by section name; the
            technical section is the first key containing '技术'.
        outline_dict: outline entries of the bid file; each item is expected to
            carry at least 'title' and 'page_number'.
        content_list: page-level content entries of the bid file.
        supplier: supplier name attached to each scoring entry.
        project: project name (accepted for interface compatibility; unused here).
        file_name: file name recorded in the page references of the result.

    Returns:
        A dict with 'writeName' and a 'scoringCriteria' list (one entry per
        scoring factor), or None when no technical review section exists.
    """
    df = pd.DataFrame(outline_dict)
    title_judge = TitleLevelJudge(df.title.to_list())

    # Find the technical scoring method among the review sections.
    part = next((k for k in scrutinize_dict if '技术' in k), None)
    # No technical review method found: nothing to do.
    if not part:
        return None

    # Break each scoring standard down into clause-level fragments.
    targets = []
    # FIX: iterate scrutinize_dict[part]; the original used the leaked loop
    # variable `key`, which only worked by accident of the break above.
    for ins in scrutinize_dict[part]:
        fragments = [
            clause
            for sentence in ins['评分标准'].split('。')
            for sub in sentence.split(';')
            for clause in sub.split(',')
        ]
        targets.append({
            'title': ins['评分因素'],
            'text': fragments,
            'weight': ins['权重'],
        })

    result = {
        'writeName': '',
        'scoringCriteria': []
    }
    # Walk every scoring factor and locate its page range in the outline.
    for item in targets:
        # Similarity lookup of the factor title against the outline titles.
        title_sims = similar_match(outline_dict, [item['title']], key='title')
        pages = [{
            'fileName': file_name,
            'conformFlag': 1,
            'pageStart': str(sim['page_number']),
            'pageEnd': str(sim['page_number']),
            'pageKey': '',
            'text': sim['title'],
            'score': sim['相似度'],
        } for sim in title_sims]

        input_list = []
        for page in pages:
            start_page = int(page['pageStart']) if page['pageStart'] else 0
            next_title = title_judge.find_next_title(page['text'])
            try:
                end_page = int(df[df.title == next_title]['page_number'].values[0])
                print('当前标题为: ', page['text'], page['pageStart'], '下一标题为: ', next_title, end_page)
            except Exception:
                print('当前标题为: ', page['text'], '找不到下一标题')
                # FIX: fall back to an int; the original kept the str value of
                # pageStart here, making the comparison below raise TypeError.
                end_page = start_page

            # Normalize the page range so start <= end.
            if start_page > end_page:
                start_page, end_page = end_page, start_page
                page['pageStart'] = str(start_page)
            page['pageEnd'] = str(end_page)

            try:
                # FIX: the original read pages[i]['page_number'], a key that
                # never exists in these dicts, so the except below silently
                # skipped every page and input_list was always empty.
                filtered = filter_content(content_list=content_list, start_page=start_page, end_page=end_page)
                input_list.append(filtered)
            except Exception:
                # Best effort: a range that cannot be filtered is skipped.
                pass

        # Summarize the located content against the scoring standard.
        comment = comment_clean(get_proj(input_json=input_list, standard=''.join(item['text'])))

        # Fill in one scoring entry for this factor.
        result['scoringCriteria'].append({
            'scoringFactors': item['title'],
            'scoringStandard': ''.join(item['text']),
            'percentage': item['weight'],
            'expertAdvice': '',
            'writeName': '',
            'suppliers': [{
                'expertAdvice': '',
                'writeName': '',
                'name': supplier,
                'grade': 'B',
                'supplier': f'概括文字: {comment}',
                'pages': pages,
            }]
        })
    return result
if __name__ == '__main__':
    import os
    import json
    from glob import glob
    from pprint import pprint

    # Load the detailed review criteria for every project.
    with open('bidding_dataset.json', 'r', encoding='utf-8') as fp:
        scrutinizes = json.load(fp)

    for project, scrutinize_dict in scrutinizes.items():
        for file in glob(f'./data/0预审查初审详审测试数据/{project}/*/*-outline.json'):
            with open(file, 'r', encoding='utf-8') as fp:
                outline_dict = json.load(fp)
            with open(file.replace('outline', 'content'), 'r', encoding='utf-8') as fp:
                content_list = json.load(fp)
            # Delete outline files that came out empty and move on.
            if not outline_dict:
                os.remove(file)
                continue
            # FIX: derive the supplier from the parent directory name portably;
            # the original split on '\\' and raised IndexError on POSIX paths.
            supplier = os.path.basename(os.path.dirname(file))
            result = tech_loc(
                scrutinize_dict=scrutinize_dict,
                outline_dict=outline_dict,
                content_list=content_list,
                supplier=supplier,
                project=project,
                file_name=file,
            )
            pprint(result)
            # Demo run: stop after the first processed file.
            exit(0)
|