# tech_instance.py
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2024-06-11 13:43:14
# @Last Modified by: privacy
# @Last Modified time: 2024-12-25 14:29:44
# Technical-section locator: maps the technical scoring criteria of a tender onto a bid document's outline.
  7. from typing import List, Optional
  8. import pandas as pd
  9. from celery_tasks.LLMAgent import get_proj
  10. from celery_tasks.text_extractor import similar_match
  11. from celery_tasks.tools import filter_content, TitleLevelJudge
  12. def tech_loc(scrutinize_dict: dict, outline_dict: List[dict], content_list: List[dict], supplier: str, project: str = None, file_name: str = '投标文件.pdf') -> Optional[List[dict]]:
  13. """
  14. 投标技术部分定位
  15. Args:
  16. scrutinize_dict: 详细评审
  17. outline_dict: 投标文件大纲
  18. Results:
  19. result: 技术部分详审结果
  20. """
  21. df = pd.DataFrame(outline_dict)
  22. title_judge = TitleLevelJudge(df.title.to_list())
  23. targets = []
  24. part = None
  25. # 从详审大纲中获取技术评分方法
  26. for key in scrutinize_dict.keys():
  27. if '技术' in key:
  28. part = key
  29. break
  30. # 没有找到技术评审方法则直接返回
  31. if not part:
  32. return None
  33. # 对技术评分中评分标准的拆解
  34. for ins in scrutinize_dict[key]:
  35. inst = ins['评分因素']
  36. text = ins['评分标准']
  37. weig = ins['权重']
  38. t = []
  39. for i in text.split('。'):
  40. for j in i.split(';'):
  41. for k in j.split(','):
  42. t.append(k)
  43. targets.append({
  44. 'title': inst,
  45. 'text': t,
  46. 'weight': weig
  47. })
  48. result = {
  49. 'writeName': '',
  50. 'scoringCriteria': []
  51. }
  52. # 遍历评审方法
  53. for item in targets:
  54. # 相似度查找
  55. title_sims = similar_match(outline_dict, [item['title']], key='title')
  56. # 页码查找
  57. pages = [{'fileName': file_name, 'conformFlag': 1, 'pageStart': str(sim['page_number']), 'pageEnd': str(sim['page_number']), 'pageKey': '', 'text': sim['title'], 'score': sim['相似度']} for sim in title_sims]
  58. input_list = []
  59. for i in range(len(pages)):
  60. next_title = title_judge.find_next_title(pages[i]['text'])
  61. try:
  62. end_page = int(df[df.title == next_title]['page_number'].values[0])
  63. print('当前标题为: ', pages[i]['text'], pages[i]['pageStart'], '下一标题为: ', next_title, end_page)
  64. except Exception:
  65. print('当前标题为: ', pages[i]['text'], '找不到下一标题')
  66. end_page = pages[i]['pageStart']
  67. pages[i]['pageEnd'] = str(end_page)
  68. try:
  69. filtered_content = filter_content(content_list=content_list, start_page=int(pages[i]['page_number']), end_page=end_page)
  70. input_list.append(filtered_content)
  71. except Exception:
  72. pass
  73. comment = get_proj(input_json=input_list, standard=''.join(item['text']))
  74. # 填充返回结果
  75. result['scoringCriteria'].append({
  76. 'scoringFactors': item['title'],
  77. 'scoringStandard': ''.join(item['text']),
  78. 'percentage': item['weight'],
  79. 'expertAdvice': '',
  80. 'writeName': '',
  81. 'suppliers': [{
  82. 'expertAdvice': '',
  83. 'writeName': '',
  84. 'name': supplier,
  85. 'grade': 'B',
  86. 'supplier': f'概括文字: {comment}',
  87. 'pages': pages
  88. }]
  89. })
  90. return result
  91. if __name__ == '__main__':
  92. import os
  93. import json
  94. from glob import glob
  95. from pprint import pprint
  96. with open('bidding_dataset.json', 'r', encoding='utf-8') as fp:
  97. scrutinizes = json.load(fp)
  98. for project in scrutinizes.keys():
  99. # 招标要素
  100. scrutinize_dict = scrutinizes[project]
  101. for file in glob(f'./data/0预审查初审详审测试数据/{project}/*/*-outline.json'):
  102. with open(file, 'r', encoding='utf-8') as fp:
  103. outline_dict = json.load(fp)
  104. with open(file.replace('outline', 'content'), 'r', encoding='utf-8') as fp:
  105. content_list = json.load(fp)
  106. # 没有大纲
  107. if outline_dict == []:
  108. os.remove(file)
  109. continue
  110. # 供应商
  111. supplier = file.split('\\')[-2]
  112. result = tech_loc(
  113. scrutinize_dict=scrutinize_dict,
  114. outline_dict=outline_dict,
  115. content_list=content_list,
  116. supplier=supplier,
  117. project=project,
  118. file_name=file
  119. )
  120. pprint(result)
  121. exit(0)