''' Pre-review of bidding (tender) documents.

1. Parse the all_tables.json result produced by Bidding_document_extract.
'''
# NOTE(review): this file reached review with its line breaks and indentation
# collapsed. The layout below was reconstructed from syntax alone; wherever a
# statement could legally sit at more than one nesting level, the level chosen
# here is an assumption -- confirm against the original file.
import re
import json
from celery_tasks.tools import BaseMethods


class DocumentPreReview:
    def __init__(self, table_path: str):
        ''' Load the JSON file at table_path (UTF-8) into self.Bidding_tables. '''
        with open(table_path, 'r', encoding='utf-8') as fp:
            self.Bidding_tables = json.load(fp)

    def _scrutinize_judge(self, tag: str, threshold_value: int = 3):
        ''' Decide whether tag looks like a review/scoring section heading.

        Counts how many keywords of scrutinize_tuple occur as substrings of
        tag and returns True when that count is at least threshold_value,
        otherwise False.
        '''
        scrutinize_tuple = ("商务", "技术", "报价", "评审", "评分", "标准", "部分")
        hit_num = 0
        for scru in scrutinize_tuple:
            if scru in tag:
                hit_num += 1
        if hit_num >= threshold_value:
            return True
        else:
            return False

    def check_table(self, all_tables):
        ''' Check the extracted tables to assess their quality.

        Iterates over all_tables; each item is read as a mapping with the keys
        'table_name', 'page_numbers', 'title_len', 'col_len' and 'table'.
        The end of this method (including whatever it returns) is missing from
        the source under review -- see the NOTE at the damaged line below.
        '''
        # all_tables = self.Bidding_tables
        tables_list = []
        previous_page_number = 0
        # Set once a scoring header row (评分因素 / 评分标准) has been seen; it is
        # initialised outside the loop, so it carries over to later tables.
        criteria_sign = False
        for partial_form in all_tables:
            record_num = 1
            table_name = partial_form['table_name']
            page_number = partial_form['page_numbers']
            title_len = partial_form['title_len']
            col_len = partial_form['col_len']
            tables = partial_form["table"]
            # Copy of the table's metadata with an initially empty row list.
            form_ = {'table_name':table_name, 'page_numbers':page_number, 'table':[], 'col_len':col_len, 'title_len':title_len}
            # Tables whose name contains both "办法" and "前附表".
            if '办法' in table_name and '前附表' in table_name:
                previous_page_number = page_number[0]
                regulation_number_index,evaluation_factor_index,evaluation_criteria_index = 0,0,0
                regulation_number_index_,score_factor_index,score_criteria_index = 0,0,0
                for table_index, table in enumerate(tables):
                    # Review header row: remember its column positions.
                    # table.index raises ValueError if "条款号" is absent from the row.
                    if '评审因素' in table and '评审标准' in table:
                        regulation_number_index = table.index("条款号")
                        evaluation_factor_index = table.index("评审因素")
                        evaluation_criteria_index = table.index("评审标准")
                        form_['table'].append(table)
                        continue
                    elif not table[evaluation_factor_index] and table[evaluation_criteria_index]:
                        # NOTE(review): this adds table[evaluation_factor_index], which the
                        # condition above has just established to be falsy, so for string
                        # cells nothing is appended. Possibly evaluation_criteria_index was
                        # intended -- confirm. Also, table_index-1 indexes form_['table'],
                        # which can be shorter than tables -- confirm the index is valid.
                        form_['table'][table_index-1][evaluation_criteria_index] += table[evaluation_factor_index]
                    else:
                        if table not in form_['table'] and not criteria_sign:
                            form_['table'].append(table)
                    # Scoring header row: remember its column positions and switch
                    # to scoring mode.
                    if '评分因素' in table and '评分标准' in table:
                        regulation_number_index_ = table.index("条款号")
                        score_factor_index = table.index("评分因素")
                        score_criteria_index = table.index("评分标准")
                        weights_index = table.index("权重")
                        criteria_sign = True
                        continue
                    elif criteria_sign and self._scrutinize_judge(table[regulation_number_index_+1],2) and not table[score_factor_index]:
                        # Continuation row in scoring mode: fold its cells into the row
                        # record_num positions back in form_['table'].
                        form_['table'][table_index-record_num][score_factor_index-1] += table[score_factor_index-1]
                        form_['table'][table_index-record_num][score_criteria_index] += table[score_criteria_index]
                        form_['table'][table_index-record_num][weights_index] += table[weights_index]
                        record_num += 1
                    else:
                        if table not in form_['table'] and criteria_sign:
                            form_['table'].append(table)
                        continue
                tables_list.append(form_)
            # NOTE(review): the source text is damaged on the next line. Everything
            # between `page_number[-1]` and `= weights_index:` is missing -- presumably
            # a span starting at a `<` and ending at a `>` was stripped. The lost text
            # includes the rest of check_table and the header and first part of the
            # method that the __main__ block calls as get_table(). The fragment is kept
            # exactly as found and is NOT valid Python; restore it from the original.
            elif previous_page_number and page_number[-1]= weights_index:
                        # From here on the code belongs to the later method. weight_comp,
                        # scrutinize_dict, tag_sign_, scrutinize_sign, scrutinize_page and
                        # scrutinize_Initial_title_len are defined in the missing portion.
                        weighr_finder = weight_comp.findall(table[-1])
                        if weighr_finder:
                            table.append(weighr_finder[0])
                        else:
                            # Fallback weight when the pattern finds nothing in the last cell.
                            table.append('3%')
                    evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
                    # Build one entry; "权重" is only included when the weight cell is non-empty.
                    if not weights:
                        value = {"评分因素":evaluation_factor.strip().replace("\n",""),"评分标准":evaluation_criteria.strip().replace("\n","")}
                    else:
                        value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n",""), "权重":weights.strip().replace("\n","")}
                    scrutinize_dict[tag_sign_].append(value)
                    # A clause number starting with '3' ends the row loop: categories
                    # with no entries are dropped and the stored title length is reset.
                    if table[regulation_number_index_]:
                        if table[regulation_number_index_][0] == '3':
                            scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
                            scrutinize_Initial_title_len = 0
                            break
            # Table that includes the page right after scrutinize_page.
            elif scrutinize_page+1 in page_number and scrutinize_sign:
                scrutinize_second_title_len = title_len
                # Shift the column indexes by the change in title_len between tables.
                difference_value = scrutinize_Initial_title_len - title_len
                if difference_value:
                    # NOTE(review): `table` is read here before this branch's own
                    # `for table in tables` loop, so it is whatever row an earlier
                    # loop left behind -- confirm this is intended.
                    table_length = len(table)
                    evaluation_factor_index -= difference_value
                    evaluation_criteria_index -= difference_value
                    weights_index -= difference_value
                    if weights_index >= table_length:
                        # Fall back to the last three columns.
                        evaluation_factor_index = table_length-3
                        evaluation_criteria_index = table_length-2
                        weights_index = table_length-1
                for table in tables:
                    # A row with an empty criteria cell is a continuation: append its last
                    # cell (or the second to last when the last is empty) to the previous
                    # entry's "评分标准".
                    if not table[evaluation_criteria_index]:
                        scrutinize_dict[tag_sign_][-1]['评分标准'] += table[-1] if table[-1] else table[-2]
                        continue
                    # Stop at a row containing both "条款内容" and "编列内容".
                    if '条款内容' in table and '编列内容' in table:
                        break
                    # Pick the category tag: the cell after the clause number if non-empty,
                    # else the cell two after it when it passes _scrutinize_judge, else
                    # the clause-number cell itself.
                    if table[regulation_number_index_+1]:
                        tag = table[regulation_number_index_+1]
                    elif self._scrutinize_judge(table[regulation_number_index_+2]):
                        tag = table[regulation_number_index_+2]
                    else:
                        tag = table[regulation_number_index_]
                    if tag:
                        tag = tag.strip().replace("\n","")
                        # Keep the first run of CJK characters.
                        # NOTE(review): [0] raises IndexError when tag contains none.
                        tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
                    # A tag that passes _scrutinize_judge becomes the current category.
                    if tag and self._scrutinize_judge(tag):
                        tag_sign_ = tag
                        if tag_sign_ not in scrutinize_dict:
                            scrutinize_dict[tag_sign_] = []
                    # NOTE(review): when len(table) == weights_index the assignments below
                    # index one past the end of a list (IndexError); when it is larger they
                    # overwrite the existing weight cell -- confirm the intended comparison.
                    if len(table) >= weights_index:
                        weighr_finder = weight_comp.findall(table[-1])
                        if weighr_finder:
                            table[weights_index] = weighr_finder[0]
                        else:
                            table[weights_index] = '3%'
                    evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
                    if not weights:
                        value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
                    else:
                        value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n",""), "权重":weights.strip().replace("\n","")}
                    scrutinize_dict[tag_sign_].append(value)
                    if table[regulation_number_index_]:
                        if table[regulation_number_index_][0] == '3':
                            scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
                            scrutinize_Initial_title_len = 0
                            break
            # Table that includes the page two after scrutinize_page; the per-row
            # handling repeats the branch above.
            elif scrutinize_page+2 in page_number and scrutinize_sign:
                # NOTE(review): scrutinize_second_title_len is only assigned in the
                # scrutinize_page+1 branch within the visible code -- confirm it is
                # always set before this branch runs.
                difference_value = scrutinize_second_title_len - title_len
                if difference_value:
                    evaluation_factor_index -= difference_value
                    evaluation_criteria_index -= difference_value
                    weights_index -= difference_value
                for table in tables:
                    if not table[evaluation_criteria_index]:
                        scrutinize_dict[tag_sign_][-1]['评分标准'] += table[-1] if table[-1] else table[-2]
                        continue
                    if '条款内容' in table and '编列内容' in table:
                        break
                    if table[regulation_number_index_+1]:
                        tag = table[regulation_number_index_+1]
                    elif self._scrutinize_judge(table[regulation_number_index_+2]):
                        tag = table[regulation_number_index_+2]
                    else:
                        tag = table[regulation_number_index_]
                    if tag:
                        tag = tag.strip().replace("\n","")
                        # NOTE(review): [0] raises IndexError when tag has no CJK characters.
                        tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
                    if tag and self._scrutinize_judge(tag):
                        tag_sign_ = tag
                        if tag_sign_ not in scrutinize_dict:
                            scrutinize_dict[tag_sign_] = []
                    # NOTE(review): same boundary concern as in the branch above.
                    if len(table) >= weights_index:
                        weighr_finder = weight_comp.findall(table[-1])
                        if weighr_finder:
                            table[weights_index] = weighr_finder[0]
                        else:
                            table[weights_index] = '3%'
                    evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
                    if not weights:
                        value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
                    else:
                        value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n",""), "权重":weights.strip().replace("\n","")}
                    scrutinize_dict[tag_sign_].append(value)
                    if table[regulation_number_index_]:
                        if table[regulation_number_index_][0] == '3':
                            scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
                            scrutinize_Initial_title_len = 0
                            break
        # Mapping of category tag -> list of entry dicts built above.
        return scrutinize_dict


if __name__ == '__main__':
    # path_list is empty here, so the loop body does not run as written.
    path_list = []
    for path_ in path_list:
        dpr = DocumentPreReview(path_)
        # NOTE(review): the `def get_table` line is not present in the source under
        # review (it falls inside the damaged span) -- confirm its signature.
        scrutinize_dict = dpr.get_table()
        # TODO: scrutinize_dict is the result we need