# document_.py (18 KB)
'''
Pre-review of bidding (tender) documents.

1. Parse the all_tables.json result produced by Bidding_document_extract.
'''
  5. from tools import BaseMethods
  6. from pprint import pprint
  7. import re
  8. import logging
  9. import requests
  10. # from bidding_document_extract.get_Bidding_info import PdfExtractAttr_
  11. # from bidding_document_extract.get_bidding_info import PdfExtractAttr
  12. chinese_num_map = {
  13. '零': 0,
  14. '一': 1,
  15. '二': 2,
  16. '三': 3,
  17. '四': 4,
  18. '五': 5,
  19. '六': 6,
  20. '七': 7,
  21. '八': 8,
  22. '九': 9,
  23. '十': 10
  24. }
  25. def create_logger(log_path):
  26. """
  27. 将日志输出到日志文件和控制台
  28. """
  29. logger = logging.getLogger()
  30. logger.setLevel(logging.INFO)
  31. formatter = logging.Formatter(
  32. '%(asctime)s - %(levelname)s - %(message)s')
  33. # 创建一个handler,用于写入日志文件
  34. file_handler = logging.FileHandler(
  35. filename=log_path, mode='w')
  36. file_handler.setFormatter(formatter)
  37. file_handler.setLevel(logging.INFO)
  38. logger.addHandler(file_handler)
  39. # 创建一个handler,用于将日志输出到控制台
  40. console = logging.StreamHandler()
  41. console.setLevel(logging.DEBUG)
  42. console.setFormatter(formatter)
  43. logger.addHandler(console)
  44. return logger
# Log file written by this module; the path is relative, so it is resolved
# against the current working directory.
log_path = "code/logs/logs.log"
# Module-level logger, configured once when the module is imported.
logger = create_logger(log_path=log_path)
  47. class DocumentPreReview():
  48. def __init__(self, file_path) -> None:
  49. self.bm = BaseMethods()
  50. self.Bidding_tables = self.get_Bidding_table(file_path)
  51. def get_Bidding_table(self, file_path:str):
  52. ''' get table data
  53. '''
  54. # file_path = "data/预审查数据/三峡左岸及电源电站中央空调系统管网及末端改造(发布稿)-table.json"
  55. # file_path = "data/预审查数据/2023年档案管理系统功能优化项目采购程序文件-table.json"
  56. all_tables = self.bm.json_read(file_path)
  57. return all_tables
  58. def _scrutinize_judge(self, tag:str, threshold_value:int=3):
  59. ''' Clause number content judgment
  60. 商务 技术 报价 评审 评分 标准
  61. '''
  62. scrutinize_tuple = ("商务","技术","报价","评审","评分","标准")
  63. hit_num = 0
  64. for scru in scrutinize_tuple:
  65. if scru in tag: hit_num+= 1
  66. if hit_num>=threshold_value: return True
  67. else: return False
  68. def check_table(self, all_tables):
  69. ''' check the form to assess quailty'''
  70. # all_tables = self.Bidding_tables
  71. tables_list = []
  72. previous_page_number = 0
  73. criteria_sign = False
  74. for partial_form in all_tables:
  75. record_num = 1
  76. table_name = partial_form['table_name']
  77. page_number = partial_form['page_numbers']
  78. title_len = partial_form['title_len']
  79. col_len = partial_form['col_len']
  80. tables = partial_form["table"]
  81. form_ = {'table_name':table_name, 'page_numbers':page_number, 'table':[],
  82. 'col_len':col_len, 'title_len':title_len}
  83. if '办法' in table_name and '前附表' in table_name:
  84. previous_page_number = page_number[0]
  85. regulation_number_index,evaluation_factor_index,evaluation_criteria_index = 0,0,0
  86. regulation_number_index_,score_factor_index,score_criteria_index = 0,0,0
  87. for table_index, table in enumerate(tables):
  88. if '评审因素' in table and '评审标准' in table:
  89. regulation_number_index = table.index("条款号")
  90. evaluation_factor_index = table.index("评审因素")
  91. evaluation_criteria_index = table.index("评审标准")
  92. form_['table'].append(table)
  93. continue
  94. elif not table[evaluation_factor_index] and table[evaluation_criteria_index]:
  95. form_['table'][table_index-1][evaluation_criteria_index] += table[evaluation_factor_index]
  96. else:
  97. if table not in form_['table']: form_['table'].append(table)
  98. if '评分因素' in table and '评分标准' in table:
  99. regulation_number_index_ = table.index("条款号")
  100. score_factor_index = table.index("评分因素")
  101. score_criteria_index = table.index("评分标准")
  102. weights_index = table.index("权重")
  103. form_['table'].append(table)
  104. criteria_sign = True
  105. continue
  106. elif criteria_sign and self._scrutinize_judge(table[regulation_number_index_+1],2) and not table[score_factor_index]:
  107. form_['table'][table_index-record_num][score_factor_index-1] += table[score_factor_index-1]
  108. form_['table'][table_index-record_num][score_criteria_index] += table[score_criteria_index]
  109. form_['table'][table_index-record_num][weights_index] += table[weights_index]
  110. record_num += 1
  111. else:
  112. if table not in form_['table']: form_['table'].append(table)
  113. tables_list.append(form_)
  114. elif previous_page_number and page_number[-1]<previous_page_number+3:
  115. for table_index, table in enumerate(tables):
  116. if '评分因素' in table and '评分标准' in table:
  117. regulation_number_index_ = table.index("条款号")
  118. score_factor_index = table.index("评分因素")
  119. score_criteria_index = table.index("评分标准")
  120. weights_index = table.index("权重")
  121. form_['table'].append(table)
  122. criteria_sign = True
  123. continue
  124. elif criteria_sign and self._scrutinize_judge(table[regulation_number_index_+1],2) and not table[score_factor_index]:
  125. form_['table'][table_index-record_num][score_factor_index-1] += table[score_factor_index-1]
  126. form_['table'][table_index-record_num][score_criteria_index] += table[score_criteria_index]
  127. form_['table'][table_index-record_num][weights_index] += table[weights_index]
  128. record_num += 1
  129. else: form_['table'].append(table)
  130. tables_list.append(form_)
  131. else:
  132. tables_list.append(partial_form)
  133. return tables_list
  134. def get_table(self):
  135. ''' parse the Bidding_tables.json file to get the table data from it.
  136. '''
  137. all_tables = self.check_table(self.Bidding_tables)
  138. # 招标文件内容中预审查
  139. tag_sign = ''
  140. tag_list = ("形式评审标准", "资格评审标准", "响应性评审标准")
  141. tag_dict = dict([(tag,[]) for tag in tag_list])
  142. scrutinize_dict = {}
  143. scrutinize_page = 0
  144. scrutinize_index = -1
  145. scrutinize_Initial_title_len = 0 # 详审位置标记
  146. scrutinize_sign = False
  147. record_page = 0
  148. bidder_know = {} # 投标人须知前附表
  149. for partial_form in all_tables:
  150. table_name = partial_form['table_name']
  151. page_number = partial_form['page_numbers']
  152. title_len = partial_form['title_len']
  153. tables = partial_form["table"]
  154. if '投标人须知前附表' == table_name:
  155. record_page = page_number[0]
  156. if page_number[0] < record_page + 3:
  157. for table in tables[1:]:
  158. if '条' in table: continue # 存在BUG
  159. try:
  160. if table[0] and table[0] not in bidder_know: bidder_know[table[0]] = []
  161. if table[0]: bidder_know[table[0]].append({"条款名称":table[1],"编列内容":table[2]})
  162. except:
  163. logger.error('该文件中的投标人须知前附表部分表格没有边框,只有中间部分表格存在边框,提取代码认为只有边框存在才被判定为表格内容')
  164. form_sign = re.findall('评\w+法前附表',table_name)
  165. if form_sign:
  166. table_page_num = page_number[-1]
  167. inital_data = tables[0]
  168. # confirm data location
  169. regulation_number_index = inital_data.index("条款号")
  170. evaluation_factor_index = inital_data.index("评审因素")
  171. evaluation_criteria_index = inital_data.index("评审标准")
  172. for table in tables[1:]:
  173. tag = table[regulation_number_index+1]
  174. if tag: tag = tag.strip().replace("\n","")
  175. if tag:
  176. tag_sign = tag
  177. evaluation_factor,evaluation_criteria = table[evaluation_factor_index],table[evaluation_criteria_index]
  178. if tag_sign in tag_dict:
  179. tag_dict[tag_sign].append({"评审因素":evaluation_factor.strip().replace("\n",""),
  180. "评审标准":evaluation_criteria.strip().replace("\n","")})
  181. if '评分因素' in table or '评分标准' in table:
  182. scrutinize_page = table_page_num
  183. scrutinize_Initial_title_len = title_len
  184. if not scrutinize_page: scrutinize_page = table_page_num+1
  185. ''' scrutinize '''
  186. if (scrutinize_page == page_number[0] and scrutinize_Initial_title_len) or scrutinize_page == page_number[0]:
  187. regulation_number_index_,evaluation_factor_index,evaluation_criteria_index,weights_index = 0,0,0,0
  188. scrutinize_sign = True
  189. if not scrutinize_Initial_title_len: scrutinize_Initial_title_len = title_len
  190. for table in tables:
  191. if '评分因素' in table and '评分标准' in table:
  192. regulation_number_index_ = table.index("条款号")
  193. evaluation_factor_index = table.index("评分因素")
  194. evaluation_criteria_index = table.index("评分标准")
  195. weights_index = table.index("权重")
  196. tag_sign_ = ''
  197. scrutinize_index = tables.index(table)
  198. break
  199. elif '评分因素' in table and '评分标准' not in table:
  200. scrutinize_index = tables.index(table)
  201. table_split = table[-1].replace(' ','').split()
  202. if '评分标准' in table_split and '权重' in table_split:
  203. table = table[:-1]
  204. table.extend(table_split)
  205. regulation_number_index_ = table.index("条款号")
  206. evaluation_factor_index = table.index("评分因素")
  207. evaluation_criteria_index = table.index("评分标准")
  208. weights_index = table.index("权重")
  209. tag_sign_ = ''
  210. break
  211. if scrutinize_index != -1:
  212. for table in tables[scrutinize_index+1:]:
  213. if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
  214. elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
  215. else: tag = table[regulation_number_index_]
  216. if tag:
  217. tag = tag.strip().replace("\n","")
  218. tag = ''.join(re.findall(r"[\u4e00-\u9fa5]+", tag))
  219. if tag and self._scrutinize_judge(tag):
  220. tag_sign_ = tag
  221. if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
  222. try:
  223. evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
  224. except:
  225. print()
  226. if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""),"评分标准":evaluation_criteria.strip().replace("\n","")}
  227. else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
  228. "评分标准":evaluation_criteria.strip().replace("\n",""),
  229. "权重":weights.strip().replace("\n","")}
  230. scrutinize_dict[tag_sign_].append(value)
  231. if table[regulation_number_index_]:
  232. if table[regulation_number_index_][0] == '3':
  233. scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
  234. scrutinize_Initial_title_len = 0
  235. break
  236. elif scrutinize_page+1 == page_number[0] and scrutinize_sign:
  237. difference_value = scrutinize_Initial_title_len - title_len
  238. if difference_value:
  239. table_length = len(table)
  240. evaluation_factor_index -= difference_value
  241. evaluation_criteria_index -= difference_value
  242. weights_index -= difference_value
  243. if weights_index >= table_length:
  244. evaluation_factor_index = table_length-3
  245. evaluation_criteria_index = table_length-2
  246. weights_index = table_length-1
  247. for table in tables:
  248. if not table[2]:
  249. scrutinize_dict[tag_sign_][-1]['评分标准'] += table[3]
  250. continue
  251. if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
  252. elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
  253. else: tag = table[regulation_number_index_]
  254. if tag:
  255. tag = tag.strip().replace("\n","")
  256. tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
  257. if tag and self._scrutinize_judge(tag):
  258. tag_sign_ = tag
  259. if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
  260. evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
  261. if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
  262. else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
  263. "评分标准":evaluation_criteria.strip().replace("\n",""),
  264. "权重":weights.strip().replace("\n","")}
  265. scrutinize_dict[tag_sign_].append(value)
  266. if table[regulation_number_index_]:
  267. if table[regulation_number_index_][0] == '3':
  268. scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
  269. scrutinize_Initial_title_len = 0
  270. break
  271. elif scrutinize_page+2 == page_number[0] and scrutinize_sign:
  272. difference_value = scrutinize_Initial_title_len - title_len
  273. if scrutinize_Initial_title_len:
  274. evaluation_factor_index -= difference_value
  275. evaluation_criteria_index -= difference_value
  276. weights_index -= difference_value
  277. for table in tables:
  278. if not table[2]:
  279. scrutinize_dict[tag_sign_][-1]['评分标准'] += table[3]
  280. continue
  281. if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
  282. elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
  283. else: tag = table[regulation_number_index_]
  284. if tag:
  285. tag = tag.strip().replace("\n","")
  286. tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
  287. if tag and self._scrutinize_judge(tag):
  288. tag_sign_ = tag
  289. if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
  290. evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
  291. if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
  292. else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
  293. "评分标准":evaluation_criteria.strip().replace("\n",""),
  294. "权重":weights.strip().replace("\n","")}
  295. scrutinize_dict[tag_sign_].append(value)
  296. if table[regulation_number_index_]:
  297. if table[regulation_number_index_][0] == '3':
  298. scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
  299. scrutinize_Initial_title_len = 0
  300. break
  301. pprint(scrutinize_dict)
  302. return scrutinize_dict
  303. from fastapi import FastAPI
  304. import uvicorn
  305. app = FastAPI()
  306. @app.post('get_pre_review')
  307. def get_pre_review():
  308. result = {
  309. "":""
  310. }
  311. return result
if __name__ == '__main__':
    # Paths of the table JSON files to review; the list is empty here, so the
    # loop below does nothing until paths are added.
    path_list = []
    for path_ in path_list:
        dpr = DocumentPreReview(path_)
        scrutinize_dict = dpr.get_table() # TODO scrutinize_dict is the result that is needed