# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2024-06-27 09:33:01
# @Last Modified by: privacy
# @Last Modified time: 2024-12-23 15:07:08
import os

# Must be set before transformers is imported so it never hits the network.
os.environ['TRANSFORMERS_OFFLINE'] = '1'

from typing import List, Union

import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
class Matcher:
    """Semantic text matcher backed by the text2vec-base-chinese encoder.

    Provides single/batch text embedding and cosine-similarity-based
    nearest-keyword classification.
    """

    def __init__(self):
        # Load the pretrained text2vec model and tokenizer from the local
        # cache (TRANSFORMERS_OFFLINE is set at module import time).
        self.tokenizer = AutoTokenizer.from_pretrained("GanymedeNil/text2vec-base-chinese")
        self.model = AutoModel.from_pretrained("GanymedeNil/text2vec-base-chinese")

    def TopK1(self, title: str, keywords: list, query_embedding: np.ndarray, option_embeddings: List[np.ndarray]) -> pd.Series:
        """Return the candidate keyword most similar to the query embedding.

        Args:
            title: the text being classified (unused; kept for interface
                compatibility with existing callers)
            keywords: candidate category names, parallel to option_embeddings
            query_embedding: 1-D embedding of the text being classified
            option_embeddings: list of 1-D embeddings, one per candidate

        Returns:
            pd.Series([best_keyword, best_similarity])
        """
        # One vectorized numpy computation instead of a per-candidate sklearn
        # call; also avoids computing max(similarities) twice and the O(n)
        # list.index() lookup of the original.
        options = np.stack(option_embeddings)
        norms = np.linalg.norm(options, axis=1) * np.linalg.norm(query_embedding)
        # Zero-norm vectors get similarity 0 (matches sklearn's convention).
        norms = np.where(norms == 0.0, 1.0, norms)
        similarities = (options @ query_embedding) / norms
        best = int(np.argmax(similarities))
        return pd.Series([keywords[best], float(similarities[best])])

    def get_embedding(self, text: str) -> np.ndarray:
        """Encode one text into a fixed-size embedding.

        Args:
            text: input text (tokenized with truncation to 512 tokens)

        Returns:
            1-D numpy array: hidden states mean-pooled over the sequence
            dimension, then averaged over the (singleton) batch dimension.
        """
        encoded_input = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            output = self.model(**encoded_input)
        # NOTE: plain token mean, not attention-mask-aware pooling; see
        # mean_pooling/sentence_embeddings for the mask-aware variant.
        return np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0)

    def get_embeddings(self, text_list: list) -> List[np.ndarray]:
        """Encode a batch of texts, one embedding per text.

        Delegates to get_embedding so the single- and batch-encoding code
        paths cannot drift apart (the original duplicated the logic).
        """
        return [self.get_embedding(text) for text in text_list]

    @classmethod
    def mean_pooling(cls, token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Attention-mask-aware mean pooling over the token dimension.

        Args:
            token_embeddings: first element of the model output, containing
                all token embeddings (batch, seq, dim)
            attention_mask: (batch, seq) mask; padded positions contribute 0

        Returns:
            (batch, dim) tensor of sentence embeddings.
        """
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # Clamp avoids division by zero for fully-masked (empty) inputs.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def sentence_embeddings(self, sentence: Union[str, List[str]]) -> torch.Tensor:
        """Mask-aware mean-pooled embeddings, one row per input sentence."""
        encoded_input = self.tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        return self.mean_pooling(model_output[0], encoded_input['attention_mask'])

    def similarities(self, sentence: Union[str, List[str]], query: str, topk: int = 1) -> pd.DataFrame:
        """Cosine similarity of each sentence against the query.

        Args:
            sentence: sentence(s) to score
            query: query text
            topk: unused; kept for backward compatibility with callers

        Returns:
            DataFrame with a single 'similarity' column, one row per sentence.
        """
        sentence_matrix = self.sentence_embeddings(sentence)
        query_vector = self.sentence_embeddings(query)
        cosine_similarities = cosine_similarity(query_vector, sentence_matrix)
        return pd.DataFrame(cosine_similarities[0], columns=['similarity'])
if __name__ == '__main__':
    matcher = Matcher()
    # Candidate bid-evaluation factors (category labels for classification).
    招标因素 = ['投标人名称', '投标文件封面、投标函签字盖章', '投标文件格式', '报价唯一', '营业执照', '安全生产许可证', '资质条件', '财务要求', '业绩要求', '人员要求', '信誉要求', '不得存在的情形', '其他要求', '投标报价', '投标内容', '工期', '工程质量', '投标有效期', '投标保证金', '权利义务', '己标价工程量清单', '技术标准和要求', '其他', '以往同类项目业绩、经验', '信用评价', '财务状况', '投标报价合理性', '施工组织设计', '无机磨石品牌及质量', '无机磨石地坪的施工工艺及质量控制', '投标关键技术、设备、部件及材料的来源及供应可靠性', '施工安全和文明施工', '组织机构及施工管理人员', '价格得分']
    # Embed every candidate label once, up front.
    keyword_embeddings = matcher.get_embeddings(招标因素)
    # Load the extracted document titles; bounding boxes are not needed.
    df = pd.read_json("D:\\desktop\\三峡水利\\data\\projects\\三峡左岸及地下电站地坪整治\\投标\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-title.json")
    del df['bbox']
    # Classify every text line against the candidate factors.
    matched = df['text'].apply(
        lambda text: matcher.TopK1(text, 招标因素, matcher.get_embedding(text), keyword_embeddings)
    )
    matched.columns = ['因素', '相似度']
    df['因素'] = matched['因素']
    df['相似度'] = matched['相似度']
    # For each factor, keep only the single row with the highest similarity.
    best_rows = df.loc[df.groupby('因素')['相似度'].idxmax()]
    best_rows.to_json('相似度.json', orient='records', lines=True, force_ascii=False)
|