# -*- coding: utf-8 -*-
# matcher.py
# @Author: privacy
# @Date: 2024-06-27 09:33:01
# @Last Modified by: privacy
# @Last Modified time: 2024-06-27 14:44:43
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
  11. class Matcher:
  12. def __init__(self):
  13. # Load model directly
  14. # # # 加载预训练的text2vec模型和分词器
  15. base_dir = "/home/stf/.cache/huggingface/hub/models--GanymedeNil--text2vec-base-chinese/snapshots/f13ec0b6396814e1352f3d30fe80bb7079625777"
  16. self.tokenizer = AutoTokenizer.from_pretrained(base_dir)
  17. self.model = AutoModel.from_pretrained(base_dir)
  18. # 计算给定文本(title)与一组关键词(keywords)之间的相似度,并确定其中最相似的关键词
  19. # title:输入文本标题
  20. # keywords:关键词
  21. # query_embedding:表示标题文本的embedding vector
  22. # option_embedding:表示待比较关键词列表的embedding vector
  23. def TopK1(self, title: str, keywords: list, query_embedding, option_embeddings: list) -> pd.Series:
  24. # 计算相似度
  25. similarities = [cosine_similarity([query_embedding], [embedding])[0][0] for embedding in option_embeddings]
  26. # 找到最相近的关键词
  27. most_similar_keyword = keywords[similarities.index(max(similarities))]
  28. print(f"和 {title} 最相近的关键词是:{most_similar_keyword}")
  29. # 返pandas Series对象,包含与title最相近的关键词和其对应的最大相似度值
  30. return pd.Series([most_similar_keyword, max(similarities)])
  31. # 调用self.tokenizer和self.model获取text embedding
  32. def get_embedding(self, text: str):
  33. encoded_input = self.tokenizer(text, return_tensors='pt')
  34. with torch.no_grad():
  35. output = self.model(**encoded_input)
  36. text_embedding = np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0)
  37. return text_embedding
  38. # 获取text_list中每个text embedding
  39. def get_embeddings(self, text_list: list) -> list:
  40. text_embeddings = []
  41. for text in text_list:
  42. encoded_input = self.tokenizer(text, return_tensors='pt')
  43. with torch.no_grad():
  44. output = self.model(**encoded_input)
  45. text_embeddings.append(np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0))
  46. return text_embeddings
  47. if __name__ == '__main__':
  48. matcher = Matcher()
  49. 招标因素 = ['投标人名称', '投标文件封面、投标函签字盖章', '投标文件格式', '报价唯一', '营业执照', '安全生产许可证', '资质条件', '财务要求', '业绩要求', '人员要求', '信誉要求', '不得存在的情形', '其他要求', '投标报价', '投标内容', '工期', '工程质量', '投标有效期', '投标保证金', '权利义务', '己标价工程量清单', '技术标准和要求', '其他', '以往同类项目业绩、经验', '信用评价', '财务状况', '投标报价合理性', '施工组织设计', '无机磨石品牌及质量', '无机磨石地坪的施工工艺及质量控制', '投标关键技术、设备、部件及材料的来源及供应可靠性', '施工安全和文明施工', '组织机构及施工管理人员', '价格得分']
  50. df = pd.read_json("投标文件-修改版9-5-1-1.json")
  51. del df['bbox']
  52. keyword_embeddings = matcher.get_embeddings(招标因素)
  53. result = df['text'].apply(lambda x: matcher.TopK1(x, 招标因素, matcher.get_embedding(x), keyword_embeddings))
  54. result.columns = ['因素', '相似度']
  55. df['因素'] = result['因素']
  56. df['相似度'] = result['相似度']
  57. max_sim_idx = df.groupby('因素')['相似度'].idxmax()
  58. max_sim_rows = df.loc[max_sim_idx]
  59. max_sim_rows.to_json('相似度.json', orient='records', lines=True, force_ascii=False)