document_.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. '''
  2. 招投标文件预审查
  3. 1. 解析bidding_document_extract中all_tables.json结果
  4. '''
  5. from tools import BaseMethods
  6. from pprint import pprint
  7. import re
  # Integer value of each single-character Chinese numeral, from '零' (0) up to
  # '十' (10).  The position of a character in the string is its value.
  chinese_num_map = dict(zip('零一二三四五六七八九十', range(11)))
  class DocumentPreReview():
      """Pre-review of a bidding (tender) document.

      The constructor loads the JSON files produced by the extraction step and
      keeps them on the instance; the remaining methods turn the extracted
      tables and page texts into plain dictionaries / lists that describe the
      review criteria.
      """

      # Section labels of the preliminary review part of the evaluation table.
      _PRELIMINARY_TAGS = ("形式评审标准", "资格评审标准", "响应性评审标准")
      # Section labels of the detailed (scored) review part.
      _SCORING_TAGS = ("商务部分评分标准", "技术部分评审标准", "投标报价评审标准",
                       "报价部分评审标准", "报价评分标准")
      # First run of CJK ideographs inside a label cell.
      _CJK_RUN = re.compile("[\u4e00-\u9fff]+")

      def __init__(self) -> None:
          """Read every input file through ``BaseMethods`` and cache the results."""
          self.bm = BaseMethods()
          self.bidding_tables = self.get_bidding_table()
          self.contexts = self.get_contexts()
          self.announcement = self.get_announcement()
          self.bidding_context = self.get_bidding_context()
          self.chinese_num_map = chinese_num_map

      def get_contexts(self, file_path: str = 'code/bidding_document_extract/contexts.json'):
          """Read the contexts-by-page JSON.

          Args:
              file_path: JSON file to read.

          Returns:
              Whatever ``self.bm.json_read(file_path)`` returns.
          """
          return self.bm.json_read(file_path)

      def get_bidding_table(self, file_path: str = "code/bidding_document_extract/all_tables.json"):
          """Read the extracted table data.

          Args:
              file_path: JSON file to read.  Defaults to the path that used to
                  be hard-coded, so existing callers are unaffected.

          Returns:
              Whatever ``self.bm.json_read(file_path)`` returns.
          """
          return self.bm.json_read(file_path)

      def get_bidding_context(
              self,
              file_path: str = "code/bidding_document_extract/基于物联网技术的三峡坝区智慧仓储研究与建设招标文件-发出.json"):
          """Read the JSON holding the tender document's context entries.

          Args:
              file_path: JSON file to read.  Defaults to the path that used to
                  be hard-coded, so existing callers are unaffected.

          Returns:
              Whatever ``self.bm.json_read(file_path)`` returns.
          """
          return self.bm.json_read(file_path)

      @staticmethod
      def _clean_cell(cell) -> str:
          """Return the cell text stripped and without newlines ('' for an empty cell)."""
          if not cell:
              return ''
          return cell.strip().replace("\n", "")

      @classmethod
      def _extract_tag(cls, cell) -> str:
          """Return the first run of CJK characters of a label cell, or ''."""
          found = cls._CJK_RUN.findall(cls._clean_cell(cell))
          return found[0] if found else ''

      @classmethod
      def _score_entry(cls, factor, criteria, weights) -> dict:
          """Build one scoring entry; the weight key is only present when a weight is given."""
          entry = {"评分因素": cls._clean_cell(factor),
                   "评分标准": cls._clean_cell(criteria)}
          if weights:
              entry["权重"] = cls._clean_cell(weights)
          return entry

      @classmethod
      def _record_scoring_row(cls, scrutinize_dict, tag_sign, tag_cell, row, columns):
          """Append one scoring row to the current section and return the section label.

          ``tag_cell`` switches the current section when it names one of the
          known scoring sections.  Rows seen before any section is known are
          skipped instead of raising ``KeyError``.
          """
          tag = cls._extract_tag(tag_cell)
          if tag in cls._SCORING_TAGS:
              tag_sign = tag
          if tag_sign in scrutinize_dict:
              factor_index, criteria_index, weights_index = columns
              scrutinize_dict[tag_sign].append(
                  cls._score_entry(row[factor_index], row[criteria_index], row[weights_index]))
          return tag_sign

      def get_table(self):
          """Parse ``self.bidding_tables`` into review criteria.

          Each element of ``self.bidding_tables`` is read through the keys
          ``'table_name'``, ``'page_numbers'``, ``'title_len'`` and ``'table'``
          (a list of rows, each a list of cells).

          Returns:
              A tuple ``(tag_dict, bidder_know, scrutinize_dict)``:

              * ``tag_dict`` maps each preliminary-review section label to a
                list of ``{"评审因素": ..., "评审标准": ...}`` entries.
              * ``bidder_know`` maps the first cell of each data-sheet row to a
                list of ``{"条款名称": ..., "编列内容": ...}`` entries.
              * ``scrutinize_dict`` maps scoring section labels to lists of
                ``{"评分因素": ..., "评分标准": ..., "权重": ...}`` entries
                (``"权重"`` is omitted when the cell is empty).  Empty sections
                are dropped once a price section has been reached.

          Raises:
              ValueError: a header row lacks one of the expected column titles.
              IndexError: a row is shorter than the columns being read.
          """
          clean = self._clean_cell
          tag_dict = {tag: [] for tag in self._PRELIMINARY_TAGS}
          scrutinize_dict = {tag: [] for tag in self._SCORING_TAGS}
          bidder_know = {}
          tag_sign = ''  # section that rows are currently appended to
          scrutinize_page = 0  # page of the scoring header; 0 means "not located yet"
          scrutinize_initial_position_marker = 0  # 1: scoring header sits inside the evaluation table
          record_page = 0
          regulation_number_index = 0
          evaluation_factor_index = 0
          evaluation_criteria_index = 0
          weights_index = 0

          for partial_form in self.bidding_tables:
              table_name = partial_form['table_name']
              page = partial_form['page_numbers'][0]
              title_len = partial_form['title_len']
              rows = partial_form["table"]

              # --- bidder-instructions data sheet -----------------------------
              if table_name == '投标人须知前附表':
                  record_page = page
              # NOTE(review): record_page starts at 0, so tables on pages 0-2 are
              # collected even before the data sheet is seen - confirm intended.
              if page < record_page + 3:
                  for row in rows[1:]:
                      if row[0]:
                          bidder_know.setdefault(row[0], []).append(
                              {"条款名称": row[1], "编列内容": row[2]})

              # --- preliminary review criteria --------------------------------
              # The name is normalised unconditionally.  The old guard tested
              # for '评标方法' while the comparison below uses '评标办法', so a
              # name containing a line break never matched.
              if clean(table_name) == "评标办法前附表":
                  header = rows[0]
                  regulation_number_index = header.index("条款号")
                  evaluation_factor_index = header.index("评审因素")
                  evaluation_criteria_index = header.index("评审标准")
                  for row in rows[1:]:
                      tag = clean(row[regulation_number_index + 1])
                      if tag in self._PRELIMINARY_TAGS:
                          tag_sign = tag
                      if tag_sign in tag_dict:
                          tag_dict[tag_sign].append({
                              "评审因素": clean(row[evaluation_factor_index]),
                              "评审标准": clean(row[evaluation_criteria_index]),
                          })
                      if '评分因素' in row or '评分标准' in row:
                          # The scoring header is embedded in this very table.
                          scrutinize_page = page
                          scrutinize_initial_position_marker = 1
                  if not scrutinize_page:
                      # No scoring header here: expect it on the following page.
                      scrutinize_page = page + 1

              # --- detailed (scored) review -----------------------------------
              if not scrutinize_page:
                  # The scoring table has not been located yet; nothing to parse.
                  continue

              if page == scrutinize_page:
                  regulation_number_index = evaluation_factor_index = 0
                  evaluation_criteria_index = weights_index = 0
                  # None (not 0) marks "no header", so a header in the first
                  # row is honoured and no index leaks in from an earlier table.
                  header_position = None
                  for position, row in enumerate(rows):
                      if '评分因素' in row and '评分标准' in row:
                          regulation_number_index = row.index("条款号")
                          evaluation_factor_index = row.index("评分因素")
                          evaluation_criteria_index = row.index("评分标准")
                          weights_index = row.index("权重")
                          tag_sign = ''
                          header_position = position
                  if header_position is None:
                      continue
                  columns = (evaluation_factor_index, evaluation_criteria_index, weights_index)
                  for row in rows[header_position + 1:]:
                      # The label sits right after the clause number, or one
                      # column further when that cell is empty.
                      tag_cell = row[regulation_number_index + 1] or row[regulation_number_index + 2]
                      tag_sign = self._record_scoring_row(
                          scrutinize_dict, tag_sign, tag_cell, row, columns)
                      if '报价' in tag_sign and '评审标准' in tag_sign:
                          scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
                          break
              elif (page in (scrutinize_page + 1, scrutinize_page + 2)
                      and title_len == 5 and '报价' not in tag_sign):
                  first_continuation = page == scrutinize_page + 1
                  if first_continuation and scrutinize_initial_position_marker:
                      # Presumably the continuation page has one leading column
                      # fewer than the embedded header - TODO confirm.
                      evaluation_factor_index -= 1
                      evaluation_criteria_index -= 1
                      weights_index -= 1
                  columns = (evaluation_factor_index, evaluation_criteria_index, weights_index)
                  for row in rows:
                      if not row[2]:
                          # A row without a third cell continues the previous
                          # entry's criteria text (taken from the fourth cell).
                          entries = scrutinize_dict.get(tag_sign)
                          if entries and row[3]:
                              entries[-1]['评分标准'] += row[3]
                          continue
                      tag_sign = self._record_scoring_row(
                          scrutinize_dict, tag_sign, row[regulation_number_index + 1], row, columns)
                      if '报价' in tag_sign and '评审标准' in tag_sign:
                          scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
                          if first_continuation:
                              scrutinize_initial_position_marker = 0
                          break
          return tag_dict, bidder_know, scrutinize_dict

      def get_announcement(self) -> str:
          """Concatenate the texts of chapter one found in ``self.contexts[2:8]``.

          Collection starts at the first entry whose text begins with "第一章"
          and stops before the first following entry whose text begins with
          "第二章" (or at the end of the slice).

          Returns:
              The concatenated text, or '' when no entry starts with "第一章".
          """
          window = self.contexts[2:8]
          for start, entry in enumerate(window):
              if not re.match("第一章", entry['text']):
                  continue
              pieces = []
              for following in window[start:]:
                  if re.match("第二章", following["text"]):
                      break
                  pieces.append(following["text"])
              return ''.join(pieces)
          return ''

      def _chapter_number(self, numeral):
          """Convert a Chinese chapter numeral ('八', '十二', '二十') to int, or None."""
          digits = self.chinese_num_map
          numeral = numeral.strip()
          if numeral in digits:
              return digits[numeral]
          if '十' not in numeral:
              return None
          tens, _, ones = numeral.partition('十')
          tens_value = digits.get(tens) if tens else 1
          ones_value = digits.get(ones) if ones else 0
          if tens_value is None or ones_value is None:
              return None
          return tens_value * 10 + ones_value

      def _collect_chapter(self, title):
          """Return the ``self.bidding_context`` entries that follow heading ``title``.

          Chapter names are first gathered from the table of contents (entries
          on pages below 4, after the entry whose text is "目录").  The entries
          after the heading are then returned up to, but excluding, the entry
          whose text equals the next chapter name.  An empty list is returned
          when the heading is not found.
          """
          numeral_pattern = re.compile("第(.*?)章")
          title_list = []
          format_index = -1
          sta_page = -1
          collecting = True
          for position, context in enumerate(self.bidding_context):
              text = context['text'].strip().replace(" ", "")
              if text == '目录':
                  sta_page = context['page_number']
              if sta_page != -1 and context['page_number'] < 4:
                  found = numeral_pattern.findall(context['text'])
                  if found and collecting:
                      previous_number = None
                      if title_list:
                          previous = numeral_pattern.findall(title_list[-1])
                          if previous:
                              previous_number = self._chapter_number(previous[0])
                      current_number = self._chapter_number(found[0]) or 0
                      if previous_number is not None and previous_number > current_number:
                          # Chapter numbers went backwards: the table of
                          # contents is over.
                          collecting = False
                      else:
                          title_list.append(context['text'].split(' ')[0])
              if text == title:
                  format_index = position
                  break
          if format_index == -1:
              return []

          title_next = ''
          if title in title_list:
              following = title_list.index(title) + 1
              if following < len(title_list):
                  title_next = title_list[following]

          section = []
          for context in self.bidding_context[format_index + 1:]:
              text = context['text'].strip().replace(" ", "")
              if title_next and title_next == text:
                  break
              section.append(context)
          return section

      def formal_criteria(self, review_criteria_list: list):
          """Analyse the formal review criteria (形式评审标准).

          Example input::

              [{'评审因素': '投标人名称', '评审标准': '与营业执照书一致'},
               {'评审因素': '投标文件格式', '评审标准': '符合第八章“投标文件格式”的要求'},
               {'评审因素': '联合体投标人(如有)', '评审标准': '不适用'},
               {'评审因素': '报价唯一', '评审标准': '只能有一个有效报价'}]

          Only the "投标文件格式" factor is processed so far: the chapter named
          in its criteria is located in ``self.bidding_context`` and its
          entries are collected.  The other factors are placeholders.

          Args:
              review_criteria_list: entries with the keys '评审因素' and '评审标准'.

          Returns:
              A dict mapping each referenced chapter title to the list of
              context entries of that chapter (empty when nothing applies).
              Earlier versions computed this and returned ``None``.
          """
          chapter_pattern = re.compile("(第.*?章)")
          quoted_pattern = re.compile("“(.*?)”")
          file_formats = {}
          for review_criteria in review_criteria_list:
              evaluation_factor = review_criteria['评审因素']
              evaluation_criteria = review_criteria['评审标准']
              if '投标人名称' in evaluation_factor or '供应商名称' in evaluation_factor:
                  # TODO: the bidder name in the bid must equal the name on its
                  # business licence (营业执照) or qualification certificate (资质证书).
                  continue
              if '报价函签字盖章' in evaluation_factor or '投标文件封面、投标函签字盖章' in evaluation_factor:
                  # TODO: require the signature of the legal representative or
                  # the authorised agent, or the company seal.
                  continue
              if '投标文件格式' not in evaluation_factor:
                  # TODO: "联合体投标人" needs no check when the criteria says
                  # "不适用"; "报价唯一" must compare the total price quoted in
                  # three places of the bid.
                  continue
              chapter = chapter_pattern.search(evaluation_criteria)
              quoted = quoted_pattern.search(evaluation_criteria)
              if not (chapter and quoted):
                  # The criteria does not reference a chapter; nothing to look up.
                  continue
              title = chapter.group(1) + quoted.group(1)
              # TODO: refine the extracted content, then compare it with the
              # bid: the bid only has to contain these entries.
              file_formats[title] = self._collect_chapter(title)
          return file_formats

      def qualification_criteria(self, review_criteria_list: list, bidder_know: dict):
          """Analyse the qualification review criteria (资格评审标准).

          For a factor containing "资质" whose criteria refers to a clause of
          chapter two, the clause is looked up in ``bidder_know``; when its
          content points to the tender announcement ("招标公告"), the text
          after the first ':' that follows "资质" in ``self.announcement`` is
          extracted.

          Args:
              review_criteria_list: entries with the keys '评审因素' and '评审标准'.
              bidder_know: clause number -> list of ``{"条款名称", "编列内容"}``
                  dicts as produced by ``get_table`` (a single dict is accepted
                  as well).

          Returns:
              A list of ``{"条款名称": ..., "资质要求": ...}`` dicts, one per
              requirement found.  Earlier versions returned ``None``.
          """
          chapter_pattern = re.compile('(第.*?章)')
          quoted_pattern = re.compile('“(.*?)”')
          # "[^第]" keeps the match from starting at the chapter reference
          # ("第二章...第1.4.1项规定"), which is what "第(.*?)项规定" captured.
          clause_pattern = re.compile('第([^第]*?)项规定')
          requirements = []
          for review_criteria in review_criteria_list:
              evaluation_factor = review_criteria['评审因素']
              evaluation_criteria = review_criteria['评审标准']
              if '营业执照' in evaluation_factor:
                  # TODO: read the validity period from the business licence in
                  # the bid; "long-term" is acceptable, a start date without an
                  # end date should produce a hint.
                  continue
              if '资质' not in evaluation_factor:
                  continue
              chapter = chapter_pattern.search(evaluation_criteria)
              quoted = quoted_pattern.search(evaluation_criteria)
              clause = clause_pattern.search(evaluation_criteria)
              if not (chapter and quoted and clause):
                  continue
              chapter_name = chapter.group(1) + quoted.group(1)
              stipulation = clause.group(1).strip()
              if '第二章' not in chapter_name:
                  continue
              bidder_data = bidder_know.get(stipulation)
              if not bidder_data:
                  continue
              # get_table stores a list of entries per clause number.
              entries = [bidder_data] if isinstance(bidder_data, dict) else bidder_data
              for entry in entries:
                  clause_name = (entry['条款名称'] or '').replace("\n", "")
                  list_content = entry['编列内容'] or ''
                  if '招标公告' not in list_content:
                      continue
                  # Assumes the wording "资质" of the announcement does not change.
                  cert_index = self.announcement.find('资质')
                  if cert_index == -1:
                      continue
                  found = re.findall(":(.*?)\\n", self.announcement[cert_index:cert_index + 500])
                  if not found:
                      continue
                  # TODO: hand the requirement to a large model; a suitable
                  # prompt still has to be designed and tried out.
                  requirements.append({"条款名称": clause_name, "资质要求": found[0]})
          return requirements

      def content_parsing(self):
          """Run the table parsing and return its results.

          Returns:
              The tuple ``(tag_dict, bidder_know, scrutinize_dict)`` produced
              by ``get_table``.
          """
          tag_dict, bidder_know, scrutinize_dict = self.get_table()
          # TODO: enable once the criteria analysers are complete:
          # self.formal_criteria(tag_dict['形式评审标准'])
          # self.qualification_criteria(tag_dict['资格评审标准'], bidder_know)
          return tag_dict, bidder_know, scrutinize_dict
  # Script entry point: build the reviewer (its constructor reads the JSON
  # inputs) and parse the extracted tables.  The parsed result is discarded.
  if __name__ == '__main__':
      dpr = DocumentPreReview()
      dpr.get_table()
      # Manual experiment kept for reference:
      # print(dpr.bidding_context)
      # formal_review_criteria = [
      #     {'评审因素': '投标文件格式', '评审标准': '符合第八章“投标文件格式”的要求'}
      # ]
      # dpr.formal_criteria(formal_review_criteria)