Procházet zdrojové kódy

添加评审因素段落定位

sprivacy před 1 rokem
rodič
revize
65241d1460

+ 7 - 0
README.md

@@ -0,0 +1,7 @@
+
+
+主要模块描述
+1、tools     大纲解析模块
+2、get_info  PDF信息抽取模块
+3、matcher   段落定位模块
+

+ 70 - 0
matcher.py

@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+# @Author: privacy
+# @Date:   2024-06-27 09:33:01
+# @Last Modified by:   privacy
+# @Last Modified time: 2024-06-27 14:44:43
+import torch
+import numpy as np
+import pandas as pd
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import AutoTokenizer, AutoModel
+
+
+class Matcher:
+    def __init__(self):
+        # Load model directly
+        # # # 加载预训练的text2vec模型和分词器
+        self.tokenizer = AutoTokenizer.from_pretrained("GanymedeNil/text2vec-base-chinese")
+        self.model = AutoModel.from_pretrained("GanymedeNil/text2vec-base-chinese")
+
+    def TopK1(self, title: str, keywords: list, query_embedding, option_embeddings: list) -> pd.Series:
+        # 计算相似度
+        similarities = [cosine_similarity([query_embedding], [embedding])[0][0] for embedding in option_embeddings]
+    
+        # 找到最相近的关键词
+        most_similar_keyword = keywords[similarities.index(max(similarities))]
+    
+        print(f"和 {title} 最相近的关键词是:{most_similar_keyword}")
+    
+        return pd.Series([most_similar_keyword, max(similarities)])
+
+    def get_embedding(self, text: str):
+        encoded_input = tokenizer(text, return_tensors='pt')
+        with torch.no_grad():
+            output = model(**encoded_input)
+        text_embedding = np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0)
+        return text_embedding
+    
+    def get_embeddings(self, text_list: list) -> list:
+        text_embeddings = []
+        for text in text_list:
+            encoded_input = tokenizer(text, return_tensors='pt')
+            with torch.no_grad():
+                output = model(**encoded_input)
+            text_embeddings.append(np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0))
+        return text_embeddings
+
+
+if __name__ == '__main__':
+    matcher = Matcher()
+
+    招标因素 = ['投标人名称', '投标文件封面、投标函签字盖章', '投标文件格式', '报价唯一', '营业执照', '安全生产许可证', '资质条件', '财务要求', '业绩要求', '人员要求', '信誉要求', '不得存在的情形', '其他要求', '投标报价', '投标内容', '工期', '工程质量', '投标有效期', '投标保证金', '权利义务', '己标价工程量清单', '技术标准和要求', '其他', '以往同类项目业绩、经验', '信用评价', '财务状况', '投标报价合理性', '施工组织设计', '无机磨石品牌及质量', '无机磨石地坪的施工工艺及质量控制', '投标关键技术、设备、部件及材料的来源及供应可靠性', '施工安全和文明施工', '组织机构及施工管理人员', '价格得分']
+
+    df = pd.read_json("投标文件-修改版9-5-1-1.json")
+    del df['bbox']
+
+    keyword_embeddings = matcher.get_embeddings(招标因素)
+
+    result = df['text'].apply(lambda x: matcher.TopK1(x, 招标因素, matcher.get_embedding(x), keyword_embeddings))
+
+    result.columns = ['因素', '相似度']
+
+    df['因素'] = result['因素']
+    df['相似度'] = result['相似度']
+
+    max_sim_idx = df.groupby('因素')['相似度'].idxmax()
+
+    max_sim_rows = df.loc[max_sim_idx]
+
+    max_sim_rows.to_json('相似度.json', orient='records', lines=True, force_ascii=False)
+

binární
三峡左岸及地下电站地坪整治招标文件(发售版).docx


+ 31 - 0
相似度.json

@@ -0,0 +1,31 @@
+{"index":0,"page_number":705,"text":"(五)不存在禁止投标情形的承诺书","因素":"不得存在的情形","相似度":0.5486750603}
+{"index":0,"page_number":55,"text":"三、奖惩办法","因素":"业绩要求","相似度":0.3549992144}
+{"index":0,"page_number":255,"text":"(二)主要人员简历表","因素":"人员要求","相似度":0.5141109228}
+{"index":0,"page_number":351,"text":"(三)近年类似项目业绩 1","因素":"以往同类项目业绩、经验","相似度":0.6570545435}
+{"index":0,"page_number":232,"text":"第九章 与发包人的配合","因素":"价格得分","相似度":0.3587560952}
+{"index":6,"page_number":706,"text":"6.在“信用中国”网站(www.creditchina.gov.cn)被列入失信被执行人名单;","因素":"信用评价","相似度":0.4579818249}
+{"index":0,"page_number":706,"text":"(六)企业信誉承诺书","因素":"信誉要求","相似度":0.5709712505}
+{"index":3,"page_number":114,"text":"8.4","因素":"其他","相似度":0.3379742801}
+{"index":1,"page_number":118,"text":"(一)、材料要求","因素":"其他要求","相似度":0.4477651715}
+{"index":3,"page_number":192,"text":"二、安全生产管理体系","因素":"安全生产许可证","相似度":0.7061559558}
+{"index":3,"page_number":55,"text":"一、总工期目标","因素":"工期","相似度":0.5227609277}
+{"index":3,"page_number":40,"text":"一、工程概况","因素":"工程质量","相似度":0.6530075073}
+{"index":6,"page_number":3,"text":"六、已标价工程量清单","因素":"己标价工程量清单","相似度":0.739189446}
+{"index":2,"page_number":120,"text":"(四)、检验标准和方法","因素":"技术标准和要求","相似度":0.5083931684}
+{"index":1,"page_number":321,"text":"(一)投标人基本情况表","因素":"投标人名称","相似度":0.772038877}
+{"index":5,"page_number":3,"text":"五、投标保证金","因素":"投标保证金","相似度":0.8615003228}
+{"index":11,"page_number":3,"text":"十一、投标关键技术、部件、设备及材料来源表","因素":"投标关键技术、设备、部件及材料的来源及供应可靠性","相似度":0.7936463952}
+{"index":0,"page_number":723,"text":"(十五)非联合体投标声明","因素":"投标报价","相似度":0.52962327}
+{"index":0,"page_number":9,"text":"四、联合体协议书","因素":"投标报价合理性","相似度":0.3283154368}
+{"index":1,"page_number":3,"text":"一、投标函及投标函附录","因素":"投标文件封面、投标函签字盖章","相似度":0.7844030857}
+{"index":2,"page_number":0,"text":"投标文件","因素":"投标文件格式","相似度":0.9003171921}
+{"index":9,"page_number":239,"text":"1.2 优良","因素":"报价唯一","相似度":0.338262856}
+{"index":0,"page_number":192,"text":"第五章 施工安全和文明施工","因素":"施工安全和文明施工","相似度":0.8497256041}
+{"index":0,"page_number":40,"text":"第一章 施工组织设计","因素":"施工组织设计","相似度":0.8328937292}
+{"index":0,"page_number":130,"text":"第二章 无机磨石品牌及质量","因素":"无机磨石品牌及质量","相似度":0.8734935522}
+{"index":0,"page_number":165,"text":"第三章 无机磨石地坪的施工工艺及质量控制","因素":"无机磨石地坪的施工工艺及质量控制","相似度":0.8576892614}
+{"index":0,"page_number":678,"text":"(五)近年发生的诉讼和仲裁情况","因素":"权利义务","相似度":0.3135215342}
+{"index":0,"page_number":216,"text":"第六章 组织机构及施工管理人员","因素":"组织机构及施工管理人员","相似度":0.8044318557}
+{"index":3,"page_number":13,"text":"5.17","因素":"营业执照","相似度":0.2897812426}
+{"index":0,"page_number":334,"text":"(二)近年财务状况表","因素":"财务状况","相似度":0.6810054779}
+{"index":10,"page_number":3,"text":"十、资格审查资料","因素":"资质条件","相似度":0.4772962928}