document_.py 17 KB


  1. from tools import BaseMethods
  2. from pprint import pprint
  3. import re
  4. import logging
  5. import requests
  6. def create_logger(log_path):
  7. """
  8. 将日志输出到日志文件和控制台
  9. """
  10. logger = logging.getLogger()
  11. logger.setLevel(logging.INFO)
  12. formatter = logging.Formatter(
  13. '%(asctime)s - %(levelname)s - %(message)s')
  14. # 创建一个handler,用于写入日志文件
  15. file_handler = logging.FileHandler(
  16. filename=log_path, mode='w')
  17. file_handler.setFormatter(formatter)
  18. file_handler.setLevel(logging.INFO)
  19. logger.addHandler(file_handler)
  20. # 创建一个handler,用于将日志输出到控制台
  21. console = logging.StreamHandler()
  22. console.setLevel(logging.DEBUG)
  23. console.setFormatter(formatter)
  24. logger.addHandler(console)
  25. return logger
  26. log_path = "code/logs/logs.log"
  27. logger = create_logger(log_path=log_path)
  28. class DocumentPreReview():
  29. def __init__(self, file_path) -> None:
  30. self.bm = BaseMethods()
  31. self.Bidding_tables = self.get_Bidding_table(file_path)
  32. def get_Bidding_table(self, file_path:str):
  33. ''' get table data
  34. '''
  35. # file_path = "data/预审查数据/三峡左岸及电源电站中央空调系统管网及末端改造(发布稿)-table.json"
  36. # file_path = "data/预审查数据/2023年档案管理系统功能优化项目采购程序文件-table.json"
  37. all_tables = self.bm.json_read(file_path)
  38. return all_tables
  39. def _scrutinize_judge(self, tag:str, threshold_value:int=3):
  40. ''' Clause number content judgment
  41. 商务 技术 报价 评审 评分 标准
  42. '''
  43. scrutinize_tuple = ("商务","技术","报价","评审","评分","标准")
  44. hit_num = 0
  45. for scru in scrutinize_tuple:
  46. if scru in tag: hit_num+= 1
  47. if hit_num>=threshold_value: return True
  48. else: return False
    def check_table(self, all_tables):
        """Normalise table records by folding continuation rows into earlier rows.

        Each element of ``all_tables`` is read as a dict with the keys
        ``table_name``, ``page_numbers``, ``title_len``, ``col_len`` and
        ``table`` (a list of rows, each row a list of cells).

        * Records whose name contains both '办法' and '前附表' are rebuilt
          row by row, applying the review-row merge and the scoring-row merge.
        * Records that end fewer than three pages after the first page of such
          a record get the scoring-row merge only.
        * All other records are passed through unchanged.

        Returns:
            A list with one record per input record, in input order.

        NOTE(review): the nesting below was reconstructed from a flattened
        listing of this file — confirm it against the original source.
        """
        tables_list = []
        previous_page_number = 0
        # Becomes True once a scoring header row has been seen; never reset.
        criteria_sign = False
        for partial_form in all_tables:
            # Offset used to locate the row that scoring continuation rows are merged into.
            record_num = 1
            table_name = partial_form['table_name']
            page_number = partial_form['page_numbers']
            title_len = partial_form['title_len']
            col_len = partial_form['col_len']
            tables = partial_form["table"]
            # Copy of the record with an initially empty row list.
            form_ = {'table_name':table_name, 'page_numbers':page_number, 'table':[],
                     'col_len':col_len, 'title_len':title_len}
            if '办法' in table_name and '前附表' in table_name:
                previous_page_number = page_number[0]
                # Column positions default to 0 until a header row is found.
                regulation_number_index,evaluation_factor_index,evaluation_criteria_index = 0,0,0
                regulation_number_index_,score_factor_index,score_criteria_index = 0,0,0
                for table_index, table in enumerate(tables):
                    # Review header row: remember where its columns are.
                    if '评审因素' in table and '评审标准' in table:
                        regulation_number_index = table.index("条款号")
                        evaluation_factor_index = table.index("评审因素")
                        evaluation_criteria_index = table.index("评审标准")
                        form_['table'].append(table)
                        continue
                    # Row with an empty factor cell but a non-empty criteria cell.
                    elif not table[evaluation_factor_index] and table[evaluation_criteria_index]:
                        # NOTE(review): this appends the (empty) factor cell, not the
                        # criteria cell — looks unintended; confirm. The index
                        # table_index-1 also addresses the filtered list, not `tables`.
                        form_['table'][table_index-1][evaluation_criteria_index] += table[evaluation_factor_index]
                    else:
                        if table not in form_['table']: form_['table'].append(table)
                    # Scoring header row: remember where its columns are.
                    if '评分因素' in table and '评分标准' in table:
                        regulation_number_index_ = table.index("条款号")
                        score_factor_index = table.index("评分因素")
                        score_criteria_index = table.index("评分标准")
                        weights_index = table.index("权重")
                        form_['table'].append(table)
                        criteria_sign = True
                        continue
                    # Scoring continuation row: fold its cells into an earlier row.
                    elif criteria_sign and self._scrutinize_judge(table[regulation_number_index_+1],2) and not table[score_factor_index]:
                        form_['table'][table_index-record_num][score_factor_index-1] += table[score_factor_index-1]
                        form_['table'][table_index-record_num][score_criteria_index] += table[score_criteria_index]
                        form_['table'][table_index-record_num][weights_index] += table[weights_index]
                        record_num += 1
                    else:
                        if table not in form_['table']: form_['table'].append(table)
                tables_list.append(form_)
            # Record close to the last matched front table: presumably its continuation — TODO confirm.
            elif previous_page_number and page_number[-1]<previous_page_number+3:
                for table_index, table in enumerate(tables):
                    if '评分因素' in table and '评分标准' in table:
                        regulation_number_index_ = table.index("条款号")
                        score_factor_index = table.index("评分因素")
                        score_criteria_index = table.index("评分标准")
                        weights_index = table.index("权重")
                        form_['table'].append(table)
                        criteria_sign = True
                        continue
                    # Column indices and weights_index may come from an earlier record here.
                    elif criteria_sign and self._scrutinize_judge(table[regulation_number_index_+1],2) and not table[score_factor_index]:
                        form_['table'][table_index-record_num][score_factor_index-1] += table[score_factor_index-1]
                        form_['table'][table_index-record_num][score_criteria_index] += table[score_criteria_index]
                        form_['table'][table_index-record_num][weights_index] += table[weights_index]
                        record_num += 1
                    else: form_['table'].append(table)
                tables_list.append(form_)
            else:
                # Unrelated record: keep the original object as is.
                tables_list.append(partial_form)
        return tables_list
    def get_table(self):
        """Extract the scoring data from ``self.Bidding_tables``.

        The records are first normalised with ``check_table``. The loop then
        does three things per record:

        1. Rows of records near the '投标人须知前附表' table are collected into
           ``bidder_know``.
        2. Rows of a table whose name matches '评…法前附表' are grouped into
           ``tag_dict`` under three fixed review tags, and the page where the
           scoring section starts is recorded in ``scrutinize_page``.
        3. Scoring rows on ``scrutinize_page`` and on the two following pages
           are collected into ``scrutinize_dict``, keyed by section tag.

        Only ``scrutinize_dict`` is printed and returned; ``tag_dict`` and
        ``bidder_know`` are built but not returned.

        Returns:
            dict mapping a section tag to a list of dicts with the keys
            '评分因素', '评分标准' and, when present, '权重'.

        NOTE(review): the nesting below was reconstructed from a flattened
        listing of this file — confirm it against the original source.
        """
        all_tables = self.check_table(self.Bidding_tables)
        # Pre-review of the bidding document content.
        tag_sign = ''
        tag_list = ("形式评审标准", "资格评审标准", "响应性评审标准")
        tag_dict = dict([(tag,[]) for tag in tag_list])
        scrutinize_dict = {}
        scrutinize_page = 0
        scrutinize_index = -1
        scrutinize_Initial_title_len = 0 # marks where the detailed review starts
        scrutinize_sign = False
        record_page = 0
        bidder_know = {} # rows of the bidder-instructions front table
        for partial_form in all_tables:
            table_name = partial_form['table_name']
            page_number = partial_form['page_numbers']
            title_len = partial_form['title_len']
            tables = partial_form["table"]
            if '投标人须知前附表' == table_name:
                record_page = page_number[0]
            # NOTE(review): record_page starts at 0, so records on pages 0-2 also
            # pass this test before the front table has been seen — confirm intent.
            if page_number[0] < record_page + 3:
                for table in tables[1:]:
                    if '条' in table: continue # known bug (per original author's note)
                    try:
                        if table[0] and table[0] not in bidder_know: bidder_know[table[0]] = []
                        if table[0]: bidder_know[table[0]].append({"条款名称":table[1],"编列内容":table[2]})
                    # NOTE(review): bare except hides every error type; the logged
                    # message assumes the cause is a table without full borders.
                    except:
                        logger.error('该文件中的投标人须知前附表部分表格没有边框,只有中间部分表格存在边框,提取代码认为只有边框存在才被判定为表格内容')
            # NOTE(review): the pattern is not a raw string; `\w` relies on Python
            # passing unknown escapes through unchanged.
            form_sign = re.findall('评\w+法前附表',table_name)
            if form_sign:
                table_page_num = page_number[-1]
                inital_data = tables[0]
                # confirm data location
                regulation_number_index = inital_data.index("条款号")
                evaluation_factor_index = inital_data.index("评审因素")
                evaluation_criteria_index = inital_data.index("评审标准")
                for table in tables[1:]:
                    # The tag is read from the cell right after the clause-number column.
                    tag = table[regulation_number_index+1]
                    if tag: tag = tag.strip().replace("\n","")
                    if tag:
                        # A non-empty tag stays in effect for the following rows.
                        tag_sign = tag
                    evaluation_factor,evaluation_criteria = table[evaluation_factor_index],table[evaluation_criteria_index]
                    if tag_sign in tag_dict:
                        tag_dict[tag_sign].append({"评审因素":evaluation_factor.strip().replace("\n",""),
                                                   "评审标准":evaluation_criteria.strip().replace("\n","")})
                    # Scoring header found inside this table: the scoring section starts on this page.
                    if '评分因素' in table or '评分标准' in table:
                        scrutinize_page = table_page_num
                        scrutinize_Initial_title_len = title_len
                # No scoring header seen yet: expect the section on the next page.
                if not scrutinize_page: scrutinize_page = table_page_num+1
            # Scoring section, first page. The bare string below is a no-op statement.
            ''' scrutinize '''
            # NOTE(review): the first operand is subsumed by the second; the test
            # reduces to scrutinize_page == page_number[0].
            if (scrutinize_page == page_number[0] and scrutinize_Initial_title_len) or scrutinize_page == page_number[0]:
                regulation_number_index_,evaluation_factor_index,evaluation_criteria_index,weights_index = 0,0,0,0
                scrutinize_sign = True
                if not scrutinize_Initial_title_len: scrutinize_Initial_title_len = title_len
                # Locate the scoring header row and its column positions.
                for table in tables:
                    if '评分因素' in table and '评分标准' in table:
                        regulation_number_index_ = table.index("条款号")
                        evaluation_factor_index = table.index("评分因素")
                        evaluation_criteria_index = table.index("评分标准")
                        weights_index = table.index("权重")
                        tag_sign_ = ''
                        scrutinize_index = tables.index(table)
                        break
                    elif '评分因素' in table and '评分标准' not in table:
                        scrutinize_index = tables.index(table)
                        # The last cell may hold several header names; split it on whitespace.
                        table_split = table[-1].replace(' ','').split()
                        if '评分标准' in table_split and '权重' in table_split:
                            table = table[:-1]
                            table.extend(table_split)
                            regulation_number_index_ = table.index("条款号")
                            evaluation_factor_index = table.index("评分因素")
                            evaluation_criteria_index = table.index("评分标准")
                            weights_index = table.index("权重")
                            tag_sign_ = ''
                            break
                if scrutinize_index != -1:
                    for table in tables[scrutinize_index+1:]:
                        # Pick the cell that carries the section tag for this row.
                        if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
                        elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
                        else: tag = table[regulation_number_index_]
                        if tag:
                            tag = tag.strip().replace("\n","")
                            # Keep only the characters in the range U+4E00-U+9FA5.
                            tag = ''.join(re.findall(r"[\u4e00-\u9fa5]+", tag))
                        if tag and self._scrutinize_judge(tag):
                            tag_sign_ = tag
                        if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
                        try:
                            evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
                        # NOTE(review): bare except that only prints a blank line; the
                        # code below then uses values from a previous row (or unbound names).
                        except:
                            print()
                        if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""),"评分标准":evaluation_criteria.strip().replace("\n","")}
                        else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
                                       "评分标准":evaluation_criteria.strip().replace("\n",""),
                                       "权重":weights.strip().replace("\n","")}
                        scrutinize_dict[tag_sign_].append(value)
                        if table[regulation_number_index_]:
                            # A clause number starting with '3' ends the scoring section
                            # (presumably the next chapter — TODO confirm).
                            if table[regulation_number_index_][0] == '3':
                                scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
                                scrutinize_Initial_title_len = 0
                                break
            # Scoring section, one page after its start.
            elif scrutinize_page+1 == page_number[0] and scrutinize_sign:
                # Shift the column positions by the change in title_len between pages.
                difference_value = scrutinize_Initial_title_len - title_len
                if difference_value:
                    # NOTE(review): `table` here is left over from an earlier loop.
                    table_length = len(table)
                    evaluation_factor_index -= difference_value
                    evaluation_criteria_index -= difference_value
                    weights_index -= difference_value
                    # Fall back to the last three columns when the index runs past the row.
                    if weights_index >= table_length:
                        evaluation_factor_index = table_length-3
                        evaluation_criteria_index = table_length-2
                        weights_index = table_length-1
                for table in tables:
                    # Empty third cell: treat the row as a continuation of the previous criteria text.
                    if not table[2]:
                        scrutinize_dict[tag_sign_][-1]['评分标准'] += table[3]
                        continue
                    if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
                    elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
                    else: tag = table[regulation_number_index_]
                    if tag:
                        tag = tag.strip().replace("\n","")
                        # Takes only the first matching run; raises IndexError when there is none.
                        tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
                    if tag and self._scrutinize_judge(tag):
                        tag_sign_ = tag
                    if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
                    evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
                    if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
                    else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
                                   "评分标准":evaluation_criteria.strip().replace("\n",""),
                                   "权重":weights.strip().replace("\n","")}
                    scrutinize_dict[tag_sign_].append(value)
                    if table[regulation_number_index_]:
                        if table[regulation_number_index_][0] == '3':
                            scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
                            scrutinize_Initial_title_len = 0
                            break
            # Scoring section, two pages after its start.
            elif scrutinize_page+2 == page_number[0] and scrutinize_sign:
                difference_value = scrutinize_Initial_title_len - title_len
                # Only shift while the section is still open (title length not reset to 0).
                if scrutinize_Initial_title_len:
                    evaluation_factor_index -= difference_value
                    evaluation_criteria_index -= difference_value
                    weights_index -= difference_value
                for table in tables:
                    if not table[2]:
                        scrutinize_dict[tag_sign_][-1]['评分标准'] += table[3]
                        continue
                    if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
                    elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
                    else: tag = table[regulation_number_index_]
                    if tag:
                        tag = tag.strip().replace("\n","")
                        tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
                    if tag and self._scrutinize_judge(tag):
                        tag_sign_ = tag
                    if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
                    evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
                    if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
                    else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
                                   "评分标准":evaluation_criteria.strip().replace("\n",""),
                                   "权重":weights.strip().replace("\n","")}
                    scrutinize_dict[tag_sign_].append(value)
                    if table[regulation_number_index_]:
                        if table[regulation_number_index_][0] == '3':
                            scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
                            scrutinize_Initial_title_len = 0
                            break
        pprint(scrutinize_dict)
        return scrutinize_dict
  284. if __name__ == '__main__':
  285. path_list = []
  286. for path_ in path_list:
  287. dpr = DocumentPreReview(path_)
  288. scrutinize_dict = dpr.get_table() # TODO scrutinize_dict是需要的结果