'''
Expert field recommendation service.
'''
import re
from typing import Optional

import numpy as np
import torch
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

from BaseTools import BaseMethods

# Local snapshot of the GanymedeNil/text2vec-base-chinese embedding model.
MODEL_PATH = 'code/model/models--GanymedeNil--text2vec-base-chinese/snapshots/f13ec0b6396814e1352f3d30fe80bb7079625777'


class ExpertFiledRecommendation(BaseMethods):
    def __init__(self) -> None:
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_PATH)
        self.model = AutoModel.from_pretrained(pretrained_model_name_or_path=MODEL_PATH)
        self.expert_datasets = self.base_data_load()
        self.export_filed_list = self.base_read('data/export_filed_commendation_datasets/export_fileds_dict.txt')
        self.filed_keywords = self.json_read('data/export_filed_commendation_datasets/filed_keywords.json')

    def base_data_load(self):
        '''Load the historical expert/project sheet and precompute embeddings per row.'''
        expert_filed = self.pandas_read_xls(
            file_path='data/export_filed_commendation_datasets/专家专业数据_主要及相关行业.xlsx',
            sheetname='sheet1',
        )
        expert_filed['招标项目名称'] = expert_filed.apply(lambda row: self.projectname_processing(row['招标项目名称']), axis=1)
        expert_filed['projectName_embedding'] = expert_filed.apply(lambda row: self.get_embedding(row['招标项目名称']), axis=1)
        expert_filed['projectContent_embedding'] = expert_filed.apply(lambda row: self.get_embedding(row['项目内容']), axis=1)
        return expert_filed

    def projectname_processing(self, project_name: str):
        '''Clean a tender project name: cut at "竞争性谈判", strip whitespace/dots, keep CJK only.'''
        project_name = project_name.split("竞争性谈判")[0]
        project_name = re.sub(r'[\s\n]+|\.', '', project_name)
        project_name = "".join(re.findall(r"[\u4e00-\u9fff]+", project_name))
        return project_name

    def get_embedding(self, text: str):
        '''Embed a single text: mean pooling over tokens of the last hidden state.'''
        # Truncate to 510 characters so the input fits the model's 512-position
        # limit once the special tokens are added.
        text = re.sub(r'[\s\n]+|\.', '', text)[:510]
        print("get_embedding_text: ", text)
        encoded_input = self.tokenizer(text, return_tensors='pt')
        with torch.no_grad():
            output = self.model(**encoded_input)
        # Mean over the token dimension, then over the (size-1) batch dimension.
        text_embedding = np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0)
        return text_embedding

    def get_embeddings(self, text_list: list) -> list:
        '''Batch variant of get_embedding (no cleanup or truncation).'''
        text_embeddings = []
        for text in text_list:
            encoded_input = self.tokenizer(text, return_tensors='pt')
            with torch.no_grad():
                output = self.model(**encoded_input)
            text_embeddings.append(np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0))
        return text_embeddings

    def Tok1(self, embedding1, embedding2):
        '''Cosine similarity between two embeddings.'''
        return cosine_similarity([embedding1], [embedding2])[0][0]

    def sim_average(self, sim1, sim2):
        '''Average of two similarity scores, rounded to two decimals.'''
        return round((sim1 + sim2) / 2, 2)
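    # Usage sketch (hypothetical inputs; assumes the local model snapshot and
    # data files exist so the class can be constructed):
    #   efr = ExpertFiledRecommendation()
    #   e1 = efr.get_embedding("某变电站继电保护设备采购")
    #   e2 = efr.get_embedding("变电站监控系统改造工程")
    #   efr.Tok1(e1, e2)             # cosine similarity in [-1, 1]
    #   efr.sim_average(0.71, 0.65)  # -> 0.68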
    def filed_recommendation(self, project_info: dict) -> dict:
        '''Expert field recommendation for one project.'''
        projectName = project_info['project_name']
        content = re.sub("\n| ", "", project_info['content'])
        projectName_embedding = self.get_embedding(projectName)
        projectContent_embedding = self.get_embedding(content)
        # Similarity against every historical project, by name and by content,
        # then averaged into a single score per row.
        self.expert_datasets['projectName_sim'] = self.expert_datasets.apply(
            lambda row: self.Tok1(row['projectName_embedding'], projectName_embedding), axis=1)
        self.expert_datasets['content_sim'] = self.expert_datasets.apply(
            lambda row: self.Tok1(row['projectContent_embedding'], projectContent_embedding), axis=1)
        self.expert_datasets['sim'] = self.expert_datasets.apply(
            lambda row: self.sim_average(row['projectName_sim'], row['content_sim']), axis=1)
        print(self.expert_datasets.head(2))
        max_row = self.expert_datasets.nlargest(1, 'sim')
        max_sim = max_row.iat[0, -1]  # 'sim' is the last column just appended
        if max_sim >= 0.65:
            # Confident match: reuse the specialities of the closest historical project.
            result = max_row.to_dict('records')[0]
            return {
                "firstSpeciality": result['主要专业'],
                "secondSpeciality": result['相关专业'],
                "financeSpeciality": "财务管理专业",
            }
        # Fallback 1: rule-based matching on the project name.
        result = []
        if '保护' in projectName:
            result.append("保护专业")
        if '监测' in projectName or '监控' in projectName:
            result.append("监控信息专业")
        if len(result) > 1:
            return {
                "firstSpeciality": result[0],
                "secondSpeciality": result[1],
                "financeSpeciality": "财务管理专业",
            }
        # Fallback 2: count keyword hits per field and keep the two best fields.
        values_hits = {}
        for k, v in self.filed_keywords.items():
            hited_value = 0
            for value in v:
                hited_value += content.count(value)
            values_hits[k] = hited_value
        values_hits = sorted(values_hits.items(), key=lambda x: x[1], reverse=True)[:2]
        if len(result) == 1:
            return {
                "firstSpeciality": result[0],
                "secondSpeciality": values_hits[0][0],  # the field name, not its hit count
                "financeSpeciality": "财务管理专业",
            }
        return {
            "firstSpeciality": values_hits[0][0],
            "secondSpeciality": values_hits[1][0],
            "financeSpeciality": "财务管理专业",
        }


app = FastAPI()


class ProjectInfo(BaseModel):
    projectName: str
    content: str


class DemoResponse(BaseModel):
    code: int = 0
    msg: Optional[str] = None
    data: Optional[dict] = None
    ok: bool = False


@app.post('/professional_recommendation')
def professional_recommendation(info: ProjectInfo):
    item_info = info.dict(exclude_unset=True)
    if not item_info:
        return {'code': 400, 'msg': '参数错误', 'data': None, 'ok': False}
    print(info)
    result = efr.filed_recommendation({'project_name': info.projectName, 'content': info.content})
    print(result)
    return {'code': 0, 'msg': '', 'data': result, 'ok': True}


efr = ExpertFiledRecommendation()

if __name__ == '__main__':
    uvicorn.run(app='export_filed_recommendation:app', host='0.0.0.0', port=18884)
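# Example request (sketch; assumes the service is running locally on port 18884
# and that `requests` is installed):
#   import requests
#   resp = requests.post(
#       'http://localhost:18884/professional_recommendation',
#       json={'projectName': '某变电站保护设备采购项目', 'content': '采购继电保护及监控装置'},
#   )
#   resp.json()
#   # -> {'code': 0, 'msg': '', 'data': {'firstSpeciality': ..., 'secondSpeciality': ...,
#   #     'financeSpeciality': '财务管理专业'}, 'ok': True}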