export_filed_recommendation.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
"""Expert field recommendation."""
import re
from typing import Optional

import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

from BaseTools import BaseMethods
  10. class ExpertFiledRecommendation(BaseMethods):
  11. def __init__(self) -> None:
  12. super().__init__()
  13. self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='code/model/models--GanymedeNil--text2vec-base-chinese/snapshots/f13ec0b6396814e1352f3d30fe80bb7079625777')
  14. self.model = AutoModel.from_pretrained(pretrained_model_name_or_path='code/model/models--GanymedeNil--text2vec-base-chinese/snapshots/f13ec0b6396814e1352f3d30fe80bb7079625777')
  15. self.expert_datasets = self.base_data_load()
  16. # print(self.expert_datasets.head(1))
  17. self.export_filed_list = self.base_read('data/export_filed_commendation_datasets/export_fileds_dict.txt')
  18. self.filed_keywords = self.json_read('data/export_filed_commendation_datasets/filed_keywords.json')
  19. def base_data_load(self):
  20. expert_filed = self.pandas_read_xls(file_path='data/export_filed_commendation_datasets/专家专业数据_主要及相关行业.xlsx', sheetname='sheet1')
  21. expert_filed['招标项目名称'] = expert_filed.apply(lambda row: self.projectname_processing(row['招标项目名称']), axis=1)
  22. expert_filed['projectName_embedding'] = expert_filed.apply(lambda row: self.get_embedding(row['招标项目名称']), axis=1)
  23. expert_filed['projectContent_embedding'] = expert_filed.apply(lambda row: self.get_embedding(row['项目内容']), axis=1)
  24. return expert_filed
  25. def projectname_processing(self, project_name:str):
  26. ''' 数据处理 '''
  27. project_name = project_name.split("竞争性谈判")[0]
  28. project_name = re.sub(r'[\s\n]+|\.|','', project_name)
  29. project_name = "".join(re.findall(r"[\u4e00-\u9fff]+", project_name))
  30. return project_name
  31. def get_embedding(self, text: str):
  32. ''' 获取文本的embedding '''
  33. text = re.sub(r'[\s\n]+|\.','', text)[:510]
  34. print("get_embedding_text: ",text)
  35. encoded_input = self.tokenizer(text, return_tensors='pt')
  36. with torch.no_grad():
  37. output = self.model(**encoded_input)
  38. text_embedding = np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0)
  39. return text_embedding
  40. def get_embeddings(self, text_list: list) -> list:
  41. text_embeddings = []
  42. for text in text_list:
  43. encoded_input = self.tokenizer(text, return_tensors='pt')
  44. with torch.no_grad():
  45. output = self.model(**encoded_input)
  46. text_embeddings.append(np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0))
  47. return text_embeddings
  48. def Tok1(self, embedding1, embedding2):
  49. ''' 相似度计算 '''
  50. similarities = cosine_similarity([embedding1], [embedding2])[0][0]
  51. return similarities
  52. def sim_average(self, sim1, sim2):
  53. ''' 相似度求均值 '''
  54. sim = round((sim1 + sim2) / 2, 2)
  55. return sim
  56. def filed_recommendation(self, project_info: dict) -> list:
  57. ''' export field recommendation '''
  58. projectName = project_info['project_name']
  59. content = re.sub("\n| ","",project_info['content'])
  60. projectName_embedding = self.get_embedding(projectName)
  61. projectContent_embedding = self.get_embedding(content)
  62. self.expert_datasets['projectName_sim'] = self.expert_datasets.apply(lambda row: self.Tok1(row['projectName_embedding'], projectName_embedding), axis=1)
  63. # print(self.expert_datasets.head(1))
  64. self.expert_datasets['content_sim'] = self.expert_datasets.apply(lambda row: self.Tok1(row['projectContent_embedding'], projectContent_embedding), axis=1)
  65. # print(self.expert_datasets.head(1))
  66. self.expert_datasets['sim'] = self.expert_datasets.apply(lambda row: self.sim_average(row['projectName_sim'], row['content_sim']), axis=1)
  67. print(self.expert_datasets.head(2))
  68. max_row = self.expert_datasets.nlargest(1, 'sim')
  69. max_sim = max_row.iat[0,-1]
  70. if max_sim>=0.65:
  71. result = max_row.to_dict('records')[0]
  72. return {
  73. "firstSpeciality": result['主要专业'],
  74. "secondSpeciality": result['相关专业'],
  75. "financeSpeciality": "财务管理专业"
  76. }
  77. else:
  78. result = []
  79. if '保护' in projectName:
  80. result.append("保护专业")
  81. if '监测' in projectName or '监控' in projectName:
  82. result.append("监控信息专业")
  83. if len(result) > 1:
  84. return {
  85. "firstSpeciality": result[0],
  86. "secondSpeciality": result[1],
  87. "financeSpeciality": "财务管理专业"
  88. }
  89. values_hits = {}
  90. for k,v in self.filed_keywords.items():
  91. hited_value = 0
  92. for value in v:
  93. value_nums = content.count(value)
  94. hited_value += value_nums
  95. values_hits[k] = hited_value
  96. values_hits = sorted(values_hits.items(), key=lambda x:x[1], reverse=True)[:2]
  97. if len(result) == 1:
  98. return {
  99. "firstSpeciality": result[0],
  100. "secondSpeciality": values_hits[0][1],
  101. "financeSpeciality": "财务管理专业"
  102. }
  103. return {
  104. "firstSpeciality": values_hits[0][1],
  105. "secondSpeciality": values_hits[1][1],
  106. "financeSpeciality": "财务管理专业"
  107. }
  108. from fastapi import FastAPI
  109. import uvicorn
  110. from pydantic import BaseModel
  111. app = FastAPI()
  112. class ProjectInfo(BaseModel):
  113. projectName: str
  114. content: str
  115. class DemoResponse(BaseModel):
  116. code: int = 0
  117. msg: str=None
  118. data: dict
  119. ok: bool = False
  120. @app.post('/professional_recommendation')
  121. def professional_recommendation(info: ProjectInfo):
  122. item_info = info.dict(exclude_unset=True)
  123. if not item_info:
  124. return {'code': 400, 'msg': '参数错误', 'data': None, 'ok': False}
  125. print(info)
  126. projectName = info.projectName
  127. content = info.content
  128. result = efr.filed_recommendation({'project_name': projectName, 'content': content})
  129. if result:
  130. print(result)
  131. return {'code': 0, 'msg': '', 'data': result, 'ok': True}
  132. efr = ExpertFiledRecommendation()
  133. if __name__ == '__main__':
  134. # efr = ExpertFiledRecommendation()
  135. uvicorn.run(app='export_filed_recommendation:app', host='0.0.0.0', port=18884) # 13888