tech_instance.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-06-11 13:43:14
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-12-25 14:29:44
  6. # 技术部分定位
  7. from typing import List, Optional
  8. import pandas as pd
  9. from celery_tasks.LLMAgent import get_proj
  10. from celery_tasks.text_extractor import similar_match
  11. from celery_tasks.tools import filter_content, comment_clean, TitleLevelJudge
  12. def tech_loc(scrutinize_dict: dict, outline_dict: List[dict], content_list: List[dict], supplier: str, project: str = None, file_name: str = '投标文件.pdf') -> Optional[List[dict]]:
  13. """
  14. 投标技术部分定位
  15. Args:
  16. scrutinize_dict: 详细评审
  17. outline_dict: 投标文件大纲
  18. Results:
  19. result: 技术部分详审结果
  20. """
  21. df = pd.DataFrame(outline_dict)
  22. title_judge = TitleLevelJudge(df.title.to_list())
  23. targets = []
  24. part = None
  25. # 从详审大纲中获取技术评分方法
  26. for key in scrutinize_dict.keys():
  27. if '技术' in key:
  28. part = key
  29. break
  30. # 没有找到技术评审方法则直接返回
  31. if not part:
  32. return None
  33. # 对技术评分中评分标准的拆解
  34. for ins in scrutinize_dict[key]:
  35. inst = ins['评分因素']
  36. text = ins['评分标准']
  37. weig = ins['权重']
  38. t = []
  39. for i in text.split('。'):
  40. for j in i.split(';'):
  41. for k in j.split(','):
  42. t.append(k)
  43. targets.append({
  44. 'title': inst,
  45. 'text': t,
  46. 'weight': weig
  47. })
  48. result = {
  49. 'writeName': '',
  50. 'scoringCriteria': []
  51. }
  52. # 遍历评审方法
  53. for item in targets:
  54. # 相似度查找
  55. title_sims = similar_match(outline_dict, [item['title']], key='title')
  56. # 页码查找
  57. pages = [{'fileName': file_name, 'conformFlag': 1, 'pageStart': str(sim['page_number']), 'pageEnd': str(sim['page_number']), 'pageKey': '', 'text': sim['title'], 'score': sim['相似度']} for sim in title_sims]
  58. input_list = []
  59. for i in range(len(pages)):
  60. next_title = title_judge.find_next_title(pages[i]['text'])
  61. try:
  62. end_page = int(df[df.title == next_title]['page_number'].values[0])
  63. print('当前标题为: ', pages[i]['text'], pages[i]['pageStart'], '下一标题为: ', next_title, end_page)
  64. except Exception:
  65. print('当前标题为: ', pages[i]['text'], '找不到下一标题')
  66. end_page = pages[i]['pageStart']
  67. # 页码
  68. ps = pages[i]['pageStart']
  69. ps = int(ps) if ps else 0
  70. if ps > end_page:
  71. midden = end_page
  72. end_page = ps
  73. pages[i]['pageStart'] = str(midden)
  74. pages[i]['pageEnd'] = str(end_page)
  75. try:
  76. filtered_content = filter_content(content_list=content_list, start_page=int(pages[i]['page_number']), end_page=end_page)
  77. input_list.append(filtered_content)
  78. except Exception:
  79. pass
  80. comment = get_proj(input_json=input_list, standard=''.join(item['text']))
  81. comment = comment_clean(comment)
  82. # 填充返回结果
  83. result['scoringCriteria'].append({
  84. 'scoringFactors': item['title'],
  85. 'scoringStandard': ''.join(item['text']),
  86. 'percentage': item['weight'],
  87. 'expertAdvice': '',
  88. 'writeName': '',
  89. 'suppliers': [{
  90. 'expertAdvice': '',
  91. 'writeName': '',
  92. 'name': supplier,
  93. 'grade': 'B',
  94. 'supplier': f'概括文字: {comment}',
  95. 'pages': pages
  96. }]
  97. })
  98. return result
  99. if __name__ == '__main__':
  100. import os
  101. import json
  102. from glob import glob
  103. from pprint import pprint
  104. with open('bidding_dataset.json', 'r', encoding='utf-8') as fp:
  105. scrutinizes = json.load(fp)
  106. for project in scrutinizes.keys():
  107. # 招标要素
  108. scrutinize_dict = scrutinizes[project]
  109. for file in glob(f'./data/0预审查初审详审测试数据/{project}/*/*-outline.json'):
  110. with open(file, 'r', encoding='utf-8') as fp:
  111. outline_dict = json.load(fp)
  112. with open(file.replace('outline', 'content'), 'r', encoding='utf-8') as fp:
  113. content_list = json.load(fp)
  114. # 没有大纲
  115. if outline_dict == []:
  116. os.remove(file)
  117. continue
  118. # 供应商
  119. supplier = file.split('\\')[-2]
  120. result = tech_loc(
  121. scrutinize_dict=scrutinize_dict,
  122. outline_dict=outline_dict,
  123. content_list=content_list,
  124. supplier=supplier,
  125. project=project,
  126. file_name=file
  127. )
  128. pprint(result)
  129. exit(0)