瀏覽代碼

fix get_info bug

sprivacy 1 年之前
父節點
當前提交
4e8ff53f68
共有 2 個文件被更改,包括 373 次插入 和 5 次删除
  1. 361 0
      document_.py
  2. 12 5
      get_info.py

+ 361 - 0
document_.py

@@ -0,0 +1,361 @@
+'''
+招投标文件预审查
+
+1. 解析bidding_document_extract中all_tables.json结果
+'''
+from tools import BaseMethods
+from pprint import pprint
+import re
+
+# Value of each single Chinese numeral character from 零 (0) through 十 (10);
+# a character's position in the string below is its numeric value.
+chinese_num_map = {
+    numeral: value for value, numeral in enumerate('零一二三四五六七八九十')
+}
+
+
+
+class DocumentPreReview():
+    def __init__(self) -> None:
+        '''Run every loader once and keep the results on the instance.
+
+        Each ``get_*`` loader below is called immediately, so constructing an
+        instance triggers the ``self.bm.json_read`` calls they make.
+        '''
+        self.bm = BaseMethods()
+        self.bidding_tables = self.get_bidding_table()
+        self.contexts = self.get_contexts()
+        # get_announcement() reads self.contexts, so it has to come after the line above.
+        self.announcement = self.get_announcement()
+        self.bidding_context = self.get_bidding_context()
+        # Module-level numeral lookup, exposed on the instance for formal_criteria().
+        self.chinese_num_map = chinese_num_map
+
+    def get_contexts(self, file_path:str = 'data/contexts.json'):
+        '''Return whatever ``self.bm.json_read`` yields for ``file_path``.
+
+        The rest of the class slices the result and reads a ``'text'`` entry
+        from each element (see ``get_announcement``).
+        '''
+        return self.bm.json_read(file_path)
+    
+    def get_bidding_table(self, file_path:str = "data/all_tables_三峡左右岸.json"):
+        '''Load the extracted table data that ``get_table`` iterates over.
+
+        Args:
+            file_path: location handed to ``self.bm.json_read``. Defaults to the
+                path that used to be hard-coded here, so existing zero-argument
+                callers are unaffected.
+
+        Returns:
+            Whatever ``self.bm.json_read`` returns for ``file_path``.
+        '''
+        return self.bm.json_read(file_path)
+    
+    def get_bidding_context(self, file_path:str = "data/基于物联网技术的三峡坝区智慧仓储研究与建设招标文件-发出.json"):
+        '''Load the tender-document contexts used by ``formal_criteria``.
+
+        Args:
+            file_path: location handed to ``self.bm.json_read``. Defaults to the
+                path that used to be hard-coded here, so existing zero-argument
+                callers are unaffected.
+
+        Returns:
+            Whatever ``self.bm.json_read`` returns for ``file_path``.
+        '''
+        return self.bm.json_read(file_path)
+
+    def get_table(self):
+        '''Walk ``self.bidding_tables`` and sort their rows into three result dicts.
+
+        Each element of ``self.bidding_tables`` is read through the keys
+        'table_name', 'page_numbers', 'title_len' and 'table' (a list of rows).
+
+        Returns:
+            A 3-tuple ``(tag_dict, bidder_know, scrutinize_dict)``:
+            * ``tag_dict`` - rows of the "评标办法前附表" table grouped under the
+              section names in ``tag_list``;
+            * ``bidder_know`` - rows keyed by their first column, each value a
+              list of {"条款名称", "编列内容"} dicts;
+            * ``scrutinize_dict`` - scoring rows grouped under the section names
+              in ``scrutinize_tuple``.
+
+        Side effect: pretty-prints ``scrutinize_dict`` before returning.
+        '''
+        all_tables = self.bidding_tables
+
+        # Name of the section the rows currently being read belong to; it is
+        # carried across rows and across tables.
+        tag_sign = ''
+        tag_list = ("形式评审标准", "资格评审标准", "响应性评审标准")
+        tag_dict = dict([(tag,[]) for tag in tag_list])
+        
+        scrutinize_tuple = ("商务部分评分标准","技术部分评审标准","投标报价评审标准","报价部分评审标准","报价评分标准")
+        scrutinize_dict = dict([(scrutinize,[]) for scrutinize in scrutinize_tuple])
+        scrutinize_page = 0
+        scrutinize_index = 0
+        scrutinize_Initial_position_marker = 0  # detailed-review position marker
+
+        record_page = 0
+        bidder_know = {}   # rows of the "投标人须知前附表" (instructions-to-bidders schedule)
+        for partial_form in all_tables:
+            table_name = partial_form['table_name']
+            page_number = partial_form['page_numbers']
+            title_len = partial_form['title_len']
+            tables = partial_form["table"]
+            
+            # Remember the first page of the schedule table, then harvest rows
+            # (skipping each table's first row) from every table that starts
+            # fewer than 3 pages after it.
+            # NOTE(review): record_page starts at 0, so tables starting on
+            # pages 0-2 are harvested even before the schedule table is seen -
+            # confirm this is intended.
+            if '投标人须知前附表' == table_name:  
+                record_page = page_number[0]
+            if page_number[0] < record_page + 3: 
+                for table in tables[1:]:
+                    if table[0] and table[0] not in bidder_know: bidder_know[table[0]] = []
+                    if table[0]: bidder_know[table[0]].append({"条款名称":table[1],"编列内容":table[2]})
+                
+            # NOTE(review): this guard looks for '评标方法' while the comparison
+            # right below uses '评标办法'; a '评标办法前附表' name containing a
+            # newline would not be normalised here - confirm which is meant.
+            if '评标方法' in table_name:
+                table_name = table_name.strip().replace("\n","")
+            if table_name == "评标办法前附表":
+                table_page_num = page_number[0]
+                inital_data = tables[0]
+                # confirm data location
+                # list.index raises ValueError if a header cell is missing.
+                regulation_number_index = inital_data.index("条款号")
+                evaluation_factor_index = inital_data.index("评审因素")
+                evaluation_criteria_index = inital_data.index("评审标准")
+                
+                for table in tables[1:]:
+                    # The section name is read from the column right after "条款号".
+                    tag = table[regulation_number_index+1]
+                    if tag: tag = tag.strip().replace("\n","")
+                    if tag and (tag in tag_list):
+                        tag_sign = tag
+                    evaluation_factor,evaluation_criteria = table[evaluation_factor_index],table[evaluation_criteria_index]
+                    if tag_sign in tag_dict: 
+                        tag_dict[tag_sign].append({"评审因素":evaluation_factor.strip().replace("\n",""),
+                                                "评审标准":evaluation_criteria.strip().replace("\n","")})
+                    # A row holding the scoring header cells means the detailed
+                    # scoring part starts inside this same table.
+                    if '评分因素' in table or '评分标准' in table:
+                        scrutinize_page = table_page_num
+                        scrutinize_Initial_position_marker = 1
+                # Otherwise the scoring part is looked for on the following page.
+                if not scrutinize_page: scrutinize_page = table_page_num+1
+
+            ''' scrutinize '''
+            # NOTE(review): `(A and B) or A` reduces to `A`, so the marker has
+            # no effect on this condition.
+            if (scrutinize_page == page_number[0] and scrutinize_Initial_position_marker) or scrutinize_page == page_number[0]:
+                regulation_number_index,evaluation_factor_index,evaluation_criteria_index,weights_index = 0,0,0,0
+                for table in tables:
+                    # Locate the scoring header row and record its column positions.
+                    if '评分因素' in table and '评分标准' in table:
+                        regulation_number_index = table.index("条款号")
+                        evaluation_factor_index = table.index("评分因素")
+                        evaluation_criteria_index = table.index("评分标准")
+                        weights_index = table.index("权重")
+                        tag_sign = ''
+                        scrutinize_index = tables.index(table)
+                # NOTE(review): scrutinize_index is falsy when the header row is
+                # row 0, and it is never reset between tables - confirm.
+                if scrutinize_index:
+                    for table in tables[scrutinize_index+1:]:
+                        if table[regulation_number_index+1]: tag = table[regulation_number_index+1]
+                        else: tag = table[regulation_number_index+2]
+                        if tag: 
+                            tag = tag.strip().replace("\n","")
+                            # Keep only the first run of CJK characters; [0]
+                            # raises IndexError when the cell has none.
+                            tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
+                        if tag and (tag in scrutinize_tuple):
+                            tag_sign = tag
+                        evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
+                        if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""),"评分标准":evaluation_criteria.strip().replace("\n","")}
+                        else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
+                                        "评分标准":evaluation_criteria.strip().replace("\n",""),
+                                        "权重":weights.strip().replace("\n","")}
+                        # NOTE(review): raises KeyError while tag_sign is not a
+                        # scrutinize_dict key (e.g. still '').
+                        scrutinize_dict[tag_sign].append(value)
+                        # Once a row of a price section has been stored, drop
+                        # the sections that stayed empty and stop reading rows.
+                        if '报价' in tag_sign and '标准' in tag_sign:
+                            scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
+                            break
+            # Table starting one page after the scoring page, with title_len 5,
+            # while no price section has been reached: read it with the column
+            # positions left over from the previous table.
+            elif scrutinize_page+1 == page_number[0] and title_len == 5  and '报价' not in tag_sign:
+                # When the scoring part began inside the 评标办法前附表 table the
+                # column positions are shifted one to the left here.
+                if scrutinize_Initial_position_marker:
+                    evaluation_factor_index -= 1
+                    evaluation_criteria_index -= 1
+                    weights_index -= 1
+                for table in tables:
+                    # A row with an empty third cell is appended to the previous
+                    # entry's "评分标准" text instead of starting a new entry.
+                    if not table[2]:
+                        scrutinize_dict[tag_sign][-1]['评分标准'] += table[3]
+                        continue
+                    tag = table[regulation_number_index+1]
+                    if tag: 
+                        tag = tag.strip().replace("\n","")
+                        tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
+                    if tag and (tag in scrutinize_tuple):
+                        tag_sign = tag
+                    evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
+                    if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
+                    else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
+                                    "评分标准":evaluation_criteria.strip().replace("\n",""),
+                                    "权重":weights.strip().replace("\n","")}
+                    scrutinize_dict[tag_sign].append(value)
+                    if '报价' in tag_sign and '标准' in tag_sign:
+                        scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
+                        scrutinize_Initial_position_marker = 0
+                        break
+            # Same handling for a table starting two pages after the scoring page.
+            elif scrutinize_page+2 == page_number[0] and title_len == 5 and '报价' not in tag_sign:
+                for table in tables:
+                    if not table[2]:
+                        scrutinize_dict[tag_sign][-1]['评分标准'] += table[3]
+                        continue
+                    tag = table[regulation_number_index+1]
+                    if tag: 
+                        tag = tag.strip().replace("\n","")
+                        tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
+                    if tag and (tag in scrutinize_tuple):
+                        tag_sign = tag
+                    evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
+                    # NOTE(review): the bare except swallows every error and only
+                    # prints a blank line; the append below then stores whatever
+                    # `value` held from an earlier row (or fails if it was never
+                    # assigned). Looks unintended - confirm and narrow.
+                    try:
+                        if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
+                        else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
+                                    "评分标准":evaluation_criteria.strip().replace("\n",""),
+                                    "权重":weights.strip().replace("\n","")}
+                    except:
+                        print()
+                    scrutinize_dict[tag_sign].append(value)
+                    if '报价' in tag_sign and '标准' in tag_sign:
+                        scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
+                        break
+
+        # pprint(tag_dict)
+        pprint(scrutinize_dict)
+        # pprint(bidder_know)
+        return tag_dict,bidder_know,scrutinize_dict
+    
+    def get_announcement(self)->str:
+        '''Concatenate the chapter-one text found in ``self.contexts[2:8]``.
+
+        Only entries 2..7 of ``self.contexts`` are inspected. Collection starts
+        at the first entry whose 'text' begins with "第一章" and stops before the
+        first later entry whose 'text' begins with "第二章", or at the end of
+        that window. Returns '' when no entry begins with "第一章".
+        '''
+        window = self.contexts[2:8]
+        start = None
+        for position, page in enumerate(window):
+            if re.search("^第一章", page['text']):
+                start = position
+                break
+        if start is None:
+            return ''
+
+        pieces = []
+        for page in window[start:]:
+            if re.search("^第二章", page["text"]):
+                break
+            pieces.append(page["text"])
+        return ''.join(pieces)
+    
+
+
+    def formal_criteria(self, review_criteria_list:list):
+        ''' Walk the formal review criteria (形式评审标准).
+
+        Work in progress: most branches are placeholders and the method
+        returns nothing. Only the '投标文件格式' branch does real work: it
+        gathers the tender-document contexts that belong to the chapter named
+        in the criterion.
+
+        Args:
+            review_criteria_list: dicts with '评审因素' and '评审标准' keys, e.g.
+
+        [{'评审因素': '投标人名称', '评审标准': '与营业执照书一致'},
+            {'评审因素': '投标文件封面、投标函签字盖章',
+             '评审标准': '投标文件封面、投标函须有法定代表人(或其委托代理人)签字(或签章)并加盖单位章,由委托代理人签字的须具有有效的授权委托书'},
+            {'评审因素': '投标文件格式', '评审标准': '符合第八章“投标文件格式”的要求'},
+            {'评审因素': '联合体投标人(如有)', '评审标准': '不适用'},
+            {'评审因素': '报价唯一', '评审标准': '只能有一个有效报价'}]
+        '''
+        for review_criteria in review_criteria_list:
+            evaluation_factor = review_criteria['评审因素']
+            evaluation_criteria = review_criteria['评审标准']
+            if '投标人名称' in evaluation_factor or '供应商名称' in evaluation_factor:
+                # Placeholder. The two bare expressions below have no effect.
+                # The note says: the bidding company named in the bid must match
+                # the name on its business licence or qualification certificate.
+                ['营业执照','资质证书']
+                '''
+                要求投标文件中 投标公司 与 其提供的营业执照或资质证书中的名称相同
+                '''
+                pass
+            elif '报价函签字盖章' in evaluation_factor or '投标文件封面、投标函签字盖章' in evaluation_factor:
+                # Placeholder. The note says: the bid must carry the signature of
+                # the legal representative or delegate, or the company seal.
+                '''
+                要求投标文件中 投标公司的 法人或委托人签字或是 存在单位盖章
+                '''
+                pass
+            elif '投标文件格式' in evaluation_factor:
+                # Build the chapter title from the criterion text: the first
+                # "第…章" match followed by the first “…”-quoted name.
+                # Either [0] raises IndexError when its pattern does not match.
+                comp1 = re.compile("(第.*?章)")
+                comp2 = re.compile("“(.*?)”")
+                title = comp1.findall(evaluation_criteria)[0]+comp2.findall(evaluation_criteria)[0]
+                comp3 = re.compile("第(.*?)章")
+                title_list = []
+
+                format_index,sta_page = -1,-1
+                sign = True
+                title_next = ''
+                for context in self.bidding_context: # walk the tender-document contexts
+                    text = context['text'].strip().replace(" ","")
+
+                    if text == '目录': 
+                        sta_page = context['page_number']
+                    # After a '目录' entry has been seen, collect chapter headings
+                    # from contexts whose page_number is below 4, for as long as
+                    # the chapter numerals do not decrease.
+                    if sta_page != -1 and context['page_number'] < 4:
+                        finder = comp3.findall(context['text'])
+                        if finder and sign:
+                            if title_list:
+                                # NOTE(review): .get(..., None) yields None for a
+                                # numeral missing from chinese_num_map (the map
+                                # stops at 十), and `None > int` raises TypeError.
+                                chinese_num = self.chinese_num_map.get(comp3.findall(title_list[-1])[0],None)
+                                if chinese_num > self.chinese_num_map.get(finder[0],0):
+                                    sign = False
+                                else:
+                                    title_list.append(context['text'].split(' ')[0])
+                            else:
+                                title_list.append(context['text'].split(' ')[0])
+                        
+                    # Stop at the first context whose space-stripped text equals
+                    # the title; this also ends the heading collection above.
+                    if text == title and format_index == -1:
+                        format_index = self.bidding_context.index(context)
+                        break
+                    # The note says: the table of contents is not compared, only
+                    # the content; presence alone counts as compliant.
+                    '''
+                    不对比目录,只对比内容,只要存在即认定符合要求
+                    '''
+
+                # NOTE(review): list.index raises ValueError when title is not in
+                # title_list. Entries are `text.split(' ')[0]` while title is the
+                # chapter number joined with the quoted name - confirm they can
+                # actually be equal.
+                title_index = title_list.index(title)
+                if title_index != len(title_list)-1:
+                    title_next = title_list[title_index+1]
+
+                # Gather every context after the chapter heading up to (not
+                # including) the next chapter's heading.
+                # NOTE(review): if the heading was never found format_index is
+                # still -1 and the slice starts at element 0.
+                file_format = {title:[]}
+                for context in self.bidding_context[format_index+1:]:
+                    text = context['text'].strip().replace(" ","")
+                    if title_next and title_next == text:
+                        break
+                    file_format[title].append(context)
+                # Bare expression, no effect; the gathered result is discarded.
+                file_format   # extracted content still needs refining
+                # The note says: compare file_format with the bid document; it is
+                # enough that the bid contains the file_format content.
+                '''
+                招标文件 file_format 与投标文件内容对比,投标文件中只要存在file_format内容即可
+                '''
+            
+            elif '联合体投标人' in evaluation_factor:
+                if '不适用' in evaluation_criteria: continue
+                
+            elif '报价唯一' in evaluation_factor:
+                # Placeholder. The note says: the total price has to be extracted
+                # from three places in the bid document and compared.
+                '''
+                需要在投标文件中比对三个位置的报价总和值抽取
+                '''
+                pass
+    
+
+    def qualification_criteria(self, review_criteria_list:list, bidder_know:dict):
+        '''Walk the qualification review criteria (资格评审标准).
+
+        Work in progress: nothing is returned yet. For every "资质" criterion
+        that points at a clause of chapter two, the matching ``bidder_know``
+        entries are looked up; when an entry defers to the tender announcement
+        (招标公告) the requirement text is pulled out of ``self.announcement``.
+
+        Args:
+            review_criteria_list: dicts with '评审因素' and '评审标准' keys.
+            bidder_know: clause number -> list of {'条款名称', '编列内容'} dicts,
+                the shape ``get_table`` builds. A single dict value is accepted
+                as well.
+
+        Returns:
+            None.
+        '''
+        chapter_pattern = re.compile('(第.*?章)')
+        quoted_pattern = re.compile('“(.*?)”')
+        # [^第] keeps the clause match from starting at an earlier "第…章"
+        # reference and swallowing it into the clause number.
+        clause_pattern = re.compile('第([^第]*?)项规定')
+
+        for review_criteria in review_criteria_list:
+            evaluation_factor = review_criteria['评审因素']
+            evaluation_criteria = review_criteria['评审标准']
+
+            if '营业执照' in evaluation_factor:
+                # TODO: read the licence validity period from the bid document.
+                # A long-term licence is acceptable; a start date without an
+                # end date should produce a hint.
+                continue
+            if '资质' not in evaluation_factor:
+                continue
+
+            chapters = chapter_pattern.findall(evaluation_criteria)
+            quoted = quoted_pattern.findall(evaluation_criteria)
+            clauses = clause_pattern.findall(evaluation_criteria)
+            if not (chapters and quoted and clauses):
+                # The criterion is not worded as 第X章“…”第N项规定; nothing to look up.
+                continue
+
+            chapter_name = chapters[0] + quoted[0]
+            stipulation = clauses[0]
+            if '第二章' not in chapter_name:
+                continue
+
+            # get_table() stores a LIST of row dicts per clause number, so the
+            # rows are iterated instead of being indexed with a string key.
+            entries = bidder_know.get(stipulation) or []
+            if isinstance(entries, dict):
+                entries = [entries]
+            for entry in entries:
+                list_content = entry.get('编列内容') or ''
+                if '招标公告' not in list_content:
+                    continue
+
+                # Assumes the announcement keeps the "资质" wording.
+                cert_index = self.announcement.find('资质')
+                if cert_index == -1:
+                    continue
+                found = re.findall(":(.*?)\\n", self.announcement[cert_index:cert_index+500])
+                cert_required = found[0] if found else None
+
+                # TODO: feed cert_required to the large model. A prompt still
+                # has to be designed; try candidate prompts against the hosted
+                # glm4 to find one that works.
+
+                        
+                        
+                
+                    
+
+
+
+
+    def content_parsing(self):
+        '''Aggregate entry point for the review steps.
+
+        Calls ``get_table`` on this instance. The per-section review calls are
+        still commented out below, so nothing is returned yet.
+        '''
+        # get_table() returns three values; the scoring tables are not consumed here yet.
+        tag_dict, bidder_know, _scrutinize_dict = self.get_table()
+        # self.formal_criteria(tag_dict['形式评审标准'])
+
+        # self.qualification_criteria(tag_dict['资格评审标准'], bidder_know)
+
+                    
+
+
+if __name__ == '__main__':
+    # Script entry: constructing the reviewer runs every loader in __init__;
+    # get_table() then parses the tables and pretty-prints the scoring dict.
+    dpr = DocumentPreReview()
+    dpr.get_table()
+    # print(dpr.bidding_context)
+
+    # formal_review_criteria = [
+    #         {'评审因素': '投标文件格式', '评审标准': '符合第八章“投标文件格式”的要求'}
+    #         ]
+    # dpr.formal_criteria(formal_review_criteria)

+ 12 - 5
get_info.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-07-25 16:36:24
+# @Last Modified time: 2024-08-01 13:43:01
 
 # import os
 
@@ -451,10 +451,14 @@ class PdfExtractAttr(object):
                     })
         self.detail_df = pd.DataFrame(self.details)
 
-    def concat_table(self, table: list, page_number: int, table_name: str = None) -> None:
+    def concat_table(self, table: list, page_number: int, table_name: str = None, new: bool = False) -> None:
         """尝试将表添加到结果列中,有两种情况,直接添加一个新表;拼接最后一个表
         @table
         """
+        if new:  # forced new table: `first` is only built further down, so size the header from the raw first row
+            self.tables.append({"page_numbers": [page_number], "title_len": len(table[0]), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
+            return
+
         first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
         tail = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[-1]]
         if len(table) > 1:
@@ -462,7 +466,7 @@ class PdfExtractAttr(object):
         else:
             second = None
         # pprint(first)
-        if len(HEADERS & set(first)) > 2:
+        if not self.tables or len(HEADERS & set(first)) > 2:
             # pprint("找到大量表头元素,判断为独立表头,生成新表!")
             self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
         elif second and (len(HEADERS & set(second)) > 2):
@@ -478,7 +482,7 @@ class PdfExtractAttr(object):
             self.tables[-1]['table'].extend(table)
         else:
             self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0, "table_name": table_name if table_name else ""})
-    
+
     def parse_table(self) -> None:
         """表格解析
         """
@@ -500,7 +504,10 @@ class PdfExtractAttr(object):
                     #self.concat_table(table.extract(), table_title_name)
                 # 检测到存在多个表格,对第一个表格进行合并判断之后的表格一定不相干
                 elif len(tables) > 1:
-                    pass
+                    first_table = tables[0]
+                    self.concat_table(first_table.extract(), page_number=page_number)
+                    for table_index in range(1, len(tables)):
+                        self.concat_table(tables[table_index].extract(), page_number=page_number, new=True)
 
     def output(self, table_path: str = 'all_tables.json'):
         """结果输出