'''
Expert field (speciality) recommendation service.
'''
import re
from typing import Optional

import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

from BaseTools import BaseMethods
class ExpertFiledRecommendation(BaseMethods):
    """Expert speciality (field) recommendation.

    Loads a local text2vec sentence-embedding model plus an Excel sheet of
    historical tender projects (with pre-computed embeddings), then
    recommends expert specialities for a new project by cosine similarity,
    falling back to rule/keyword matching when similarity is low.
    """

    # Local snapshot of the GanymedeNil/text2vec-base-chinese model.
    _MODEL_PATH = ('code/model/models--GanymedeNil--text2vec-base-chinese/'
                   'snapshots/f13ec0b6396814e1352f3d30fe80bb7079625777')

    def __init__(self) -> None:
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=self._MODEL_PATH)
        self.model = AutoModel.from_pretrained(pretrained_model_name_or_path=self._MODEL_PATH)
        # Historical project dataset, embeddings pre-computed up front.
        self.expert_datasets = self.base_data_load()
        self.export_filed_list = self.base_read('data/export_filed_commendation_datasets/export_fileds_dict.txt')
        # Mapping: speciality name -> list of keywords (used as fallback).
        self.filed_keywords = self.json_read('data/export_filed_commendation_datasets/filed_keywords.json')

    def base_data_load(self):
        """Load the historical project sheet and pre-compute embeddings.

        Returns the DataFrame with normalized project names plus
        'projectName_embedding' / 'projectContent_embedding' columns.
        """
        expert_filed = self.pandas_read_xls(
            file_path='data/export_filed_commendation_datasets/专家专业数据_主要及相关行业.xlsx',
            sheetname='sheet1')
        expert_filed['招标项目名称'] = expert_filed.apply(
            lambda row: self.projectname_processing(row['招标项目名称']), axis=1)
        expert_filed['projectName_embedding'] = expert_filed.apply(
            lambda row: self.get_embedding(row['招标项目名称']), axis=1)
        expert_filed['projectContent_embedding'] = expert_filed.apply(
            lambda row: self.get_embedding(row['项目内容']), axis=1)
        return expert_filed

    def projectname_processing(self, project_name: str) -> str:
        """Normalize a tender project name.

        Drops everything from '竞争性谈判' onward, removes whitespace and
        dots, then keeps only CJK characters.
        """
        project_name = project_name.split("竞争性谈判")[0]
        # BUGFIX: original pattern ended with a stray '|' which made an
        # empty alternative matching at every position.
        project_name = re.sub(r'[\s\n]+|\.', '', project_name)
        return "".join(re.findall(r"[\u4e00-\u9fff]+", project_name))

    def get_embedding(self, text: str):
        """Embed *text* with the text2vec model; returns a 1-D np.ndarray.

        Whitespace and dots are stripped and the text truncated to 510
        chars to stay under the 512-token BERT limit ([CLS]/[SEP] slots).
        """
        text = re.sub(r'[\s\n]+|\.', '', text)[:510]
        encoded_input = self.tokenizer(text, return_tensors='pt')
        with torch.no_grad():
            output = self.model(**encoded_input)
        # Mean-pool over the sequence dimension, then over the batch.
        return np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0)

    def get_embeddings(self, text_list: list) -> list:
        """Embed each text in *text_list*; returns a list of np.ndarray.

        BUGFIX: now delegates to get_embedding so long texts are cleaned
        and truncated consistently (the original copy skipped the 510-char
        cap and could exceed the model's 512-token limit).
        """
        return [self.get_embedding(text) for text in text_list]

    def Tok1(self, embedding1, embedding2):
        """Cosine similarity between two embedding vectors (float)."""
        return cosine_similarity([embedding1], [embedding2])[0][0]

    def sim_average(self, sim1, sim2):
        """Mean of two similarity scores, rounded to 2 decimals."""
        return round((sim1 + sim2) / 2, 2)

    def filed_recommendation(self, project_info: dict) -> dict:
        """Recommend expert specialities for a project.

        project_info: {'project_name': str, 'content': str}.
        Returns a dict with keys 'firstSpeciality', 'secondSpeciality',
        'financeSpeciality'.
        """
        projectName = project_info['project_name']
        content = re.sub(r"\n| ", "", project_info['content'])
        projectName_embedding = self.get_embedding(projectName)
        projectContent_embedding = self.get_embedding(content)
        ds = self.expert_datasets
        ds['projectName_sim'] = ds.apply(
            lambda row: self.Tok1(row['projectName_embedding'], projectName_embedding), axis=1)
        ds['content_sim'] = ds.apply(
            lambda row: self.Tok1(row['projectContent_embedding'], projectContent_embedding), axis=1)
        ds['sim'] = ds.apply(
            lambda row: self.sim_average(row['projectName_sim'], row['content_sim']), axis=1)
        max_row = ds.nlargest(1, 'sim')
        # Index the 'sim' column by name instead of positional -1 (fragile
        # if the column order ever changes).
        max_sim = max_row['sim'].iat[0]
        if max_sim >= 0.65:
            result = max_row.to_dict('records')[0]
            return {
                "firstSpeciality": result['主要专业'],
                "secondSpeciality": result['相关专业'],
                "financeSpeciality": "财务管理专业"
            }
        # Low similarity: fall back to rule-based matching on the name.
        result = []
        if '保护' in projectName:
            result.append("保护专业")
        if '监测' in projectName or '监控' in projectName:
            result.append("监控信息专业")
        if len(result) > 1:
            return {
                "firstSpeciality": result[0],
                "secondSpeciality": result[1],
                "financeSpeciality": "财务管理专业"
            }
        # Count keyword hits per speciality in the content; keep top two.
        values_hits = {}
        for field, keywords in self.filed_keywords.items():
            values_hits[field] = sum(content.count(kw) for kw in keywords)
        values_hits = sorted(values_hits.items(), key=lambda x: x[1], reverse=True)[:2]
        if len(result) == 1:
            return {
                "firstSpeciality": result[0],
                # BUGFIX: [0][0] is the speciality NAME; the original
                # returned [0][1], the integer hit count.
                "secondSpeciality": values_hits[0][0],
                "financeSpeciality": "财务管理专业"
            }
        return {
            # BUGFIX: same as above — name, not hit count.
            "firstSpeciality": values_hits[0][0],
            "secondSpeciality": values_hits[1][0],
            "financeSpeciality": "财务管理专业"
        }
# Web layer: expose the recommender over HTTP via FastAPI.
from fastapi import FastAPI
import uvicorn
from pydantic import BaseModel

app = FastAPI()
class ProjectInfo(BaseModel):
    """Request body for POST /professional_recommendation."""
    projectName: str  # tender project name
    content: str      # project description / content text
class DemoResponse(BaseModel):
    """Generic response envelope (code/msg/data/ok).

    NOTE(review): not currently attached to the endpoint as a
    response_model — the handler returns plain dicts of this shape.
    """
    code: int = 0
    # BUGFIX: 'msg: str = None' pairs a non-optional annotation with a
    # None default (rejected by pydantic v2); declare it Optional.
    msg: Optional[str] = None
    data: dict
    ok: bool = False
@app.post('/professional_recommendation')
def professional_recommendation(info: ProjectInfo):
    """Recommend expert specialities for the posted project info.

    Returns a {'code', 'msg', 'data', 'ok'} envelope; 'data' holds the
    recommendation dict from ExpertFiledRecommendation on success.
    """
    item_info = info.dict(exclude_unset=True)
    if not item_info:
        return {'code': 400, 'msg': '参数错误', 'data': None, 'ok': False}
    result = efr.filed_recommendation({'project_name': info.projectName,
                                       'content': info.content})
    if result:
        return {'code': 0, 'msg': '', 'data': result, 'ok': True}
    # BUGFIX: the original fell off the end here, so a falsy result was
    # serialized as an HTTP body of `null`; return an explicit error.
    return {'code': 500, 'msg': '推荐失败', 'data': None, 'ok': False}
# Built at import time so the uvicorn worker importing this module
# shares one loaded model instance across requests.
efr = ExpertFiledRecommendation()

if __name__ == '__main__':
    # efr = ExpertFiledRecommendation()
    # Serve via the import string so uvicorn can manage reload/workers.
    uvicorn.run(app='export_filed_recommendation:app', host='0.0.0.0', port=18884) # 13888
|