test.py

from translate import ApiCenterClient
from baidubce import bce_client_configuration
from baidubce.auth import bce_credentials
from paddlenlp import Taskflow
import re
import requests
from pymongo import MongoClient
import jieba.analyse
import langid
from tld import get_tld
import time

def keywords_textrank(text):
    # Extract up to six keywords with jieba's TextRank implementation.
    keywords = jieba.analyse.textrank(text, topK=6)
    return keywords
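# Illustrative only: on Chinese text this yields up to six TextRank-ranked
# keywords, e.g. keywords_textrank(article) -> ['经济', '政策', ...]
# (economy, policy, ...; the example values are hypothetical).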

def is_valid_date(date):
    """Return True for 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS' strings."""
    try:
        if ":" in date:
            time.strptime(date, "%Y-%m-%d %H:%M:%S")
        else:
            time.strptime(date, "%Y-%m-%d")
        return True
    except (ValueError, TypeError):
        return False
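# Examples:
#   is_valid_date('2023-05-01 12:30:00')  -> True
#   is_valid_date('2023-05-01')           -> True
#   is_valid_date('May 1, 2023')          -> False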

def information_extraction(document):
    title = document['title']
    url = document['url']
    highlight = document.get('highlight') or []
    author = document['author']
    hotkey = document['hotkey']
    if len(highlight) > 5:
        highlight = highlight[0:5]
    # for w in highlight_:
    #     highlight.append(trans_client.demo(w[0]).result.trans_result[0].dst)
    # highlight = document['highlight']
    if is_valid_date(document['publish_time']):
        publish_time = document['publish_time']
    else:
        publish_time = ''
    hot_event = document['hotkey']
    publisher = ''
    publisher_country = ''
    # langid.classify returns a (language_code, score) tuple.
    if langid.classify(title)[0] == 'zh':
        media_type = 0  # Chinese-language source
    else:
        media_type = 1
    try:
        tld_ = get_tld(url, as_object=True).fld
        platform = tld_.split('.')[0]
    except Exception:
        platform = ''
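    # Example: for url='https://news.example.co.uk/story/1', .fld is
    # 'example.co.uk', so platform becomes 'example'.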
    try:
        for kk in document['metadata'].keys():
            # Match 'publisher_country' before 'publisher': the shorter
            # pattern also matches the longer key.
            if re.search('publisher_country', kk.lower()):
                publisher_country = document['metadata'][kk]
            elif re.search('publisher', kk.lower()):
                publisher = document['metadata'][kk]
            elif re.search('platform', kk.lower()):
                platform = document['metadata'][kk]
            elif re.search('keywords', kk.lower()):
                highlight = document['metadata'][kk].split(',')
    except Exception:
        publisher = ''
        publisher_country = ''
        platform = ''
    if len(document['content'].split(' ')) > 2000:
        # Overlong articles: translate only the title and build a stub record.
        print(len(document['content'].split(' ')))
        title_trans_json = trans_client.demo(title)
        print(title_trans_json)
        information = {
            'title': title,
            'title_trans': title_trans_json.result.trans_result[0].dst,
            'title_emotion': '',
            'all_emotion': '',
            'show_emotion': '',
            'url': url,
            'publish_time': publish_time,
            'publisher': publisher,
            'publisher_country': publisher_country,
            'author': author,
            'read_um': '',
            'share_num': '',
            'platform': platform,
            'media_type': media_type,
            'highlights': highlight,
            'keywords': [],
            'hotwords': '',
            'hotevent': hot_event,
            'tags': [],
            'paragraphs': []
        }
        # return information
    else:
        content = document['content'].split('\n')
        paragraphs = []
        article = ''
        dict_ = {}  # entity surface form -> {'type': ..., 'probability': ...}
        print('content', len(content))
        for order, line in enumerate(content):
            p = {}
            if re.search('[a-zA-Z]', line):
                # print(line)
                res = trans_client.demo(line)
                # print(res.__dict__['raw_data'])
                try:
                    text = res.result.trans_result[0].dst
                except Exception as e:
                    print(e)
                    return None
                p['paragraph'] = res.result.trans_result[0].src
                p['paragraph_trans'] = text
                article = article + str(text)
                # senta() returns e.g. [{'text': ..., 'label': 'positive', 'score': 0.99}]
                emo = senta(text)
                if emo[0]['label'] == 'negative' and emo[0]['score'] >= 0.75:
                    p['paragraph_show_emotion'] = '负面'  # negative
                elif emo[0]['label'] == 'positive' and emo[0]['score'] >= 0.75:
                    p['paragraph_show_emotion'] = '正面'  # positive
                else:
                    p['paragraph_show_emotion'] = '中性'  # neutral
                p['paragraph_emotion'] = str(emo[0]['score'])
                p['order'] = order  # paragraph position within the article
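                # entity() (PaddleNLP UIE Taskflow) returns one dict per input,
                # mapping each schema label to its candidate spans, e.g.
                # (values illustrative):
                # [{'人物': [{'text': '...', 'start': 0, 'end': 5, 'probability': 0.92}]}]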
                entitys = entity(text)
                temp = []
                for k in entitys[0].keys():
                    for e in entitys[0][k]:
                        # te = [text, start, end, probability] + [schema label]
                        te = list(e.values())
                        te.append(k)
                        temp.append(te)
                # Keep the highest-probability type seen for each entity text.
                for ent in temp:
                    if ent[0] in dict_:
                        if ent[3] > dict_[ent[0]]['probability']:
                            dict_[ent[0]] = {'type': ent[4], 'probability': ent[3]}
                    else:
                        dict_[ent[0]] = {'type': ent[4], 'probability': ent[3]}
            if p:
                paragraphs.append(p)
        tags = []
        for k, v in dict_.items():
            tag = {}
            if v['type'] == '栏目':  # column/section
                tag['type'] = 1
            elif v['type'] == '人物':  # person
                tag['type'] = 2
            elif v['type'] == '机构':  # organization
                tag['type'] = 3
            elif v['type'] == '国家':  # country
                tag['type'] = 4
            name = trans_client.demo(k, 0).result.trans_result[0].dst
            tag['name'] = name
            tags.append(tag)
        try:
            d_emo = senta(article)
        except Exception:
            d_emo = []
        try:
            t_emo = senta(title)
        except Exception:
            t_emo = []
        try:
            title_emotion = str(t_emo[0]['score'])
        except (IndexError, KeyError):
            title_emotion = '0'
        try:
            all_emotion = str(d_emo[0]['score'])
        except (IndexError, KeyError):
            all_emotion = '0'
        if not d_emo:
            show_emotion = '中性'  # neutral
        elif d_emo[0]['label'] == 'negative' and d_emo[0]['score'] >= 0.75:
            show_emotion = '负面'  # negative
        elif d_emo[0]['label'] == 'positive' and d_emo[0]['score'] >= 0.75:
            show_emotion = '正面'  # positive
        else:
            show_emotion = '中性'  # neutral
        keywords = keywords_textrank(article)
        tcdr = trans_client.demo(title).result
        if tcdr:
            title_trans = tcdr.trans_result[0].dst
        else:
            title_trans = title
        information = {
            'title': title,
            'title_trans': title_trans,
            'title_emotion': title_emotion,
            'all_emotion': all_emotion,
            'show_emotion': show_emotion,
            'url': url,
            'publish_time': publish_time,
            'publisher': publisher,
            'publisher_country': publisher_country,
            'author': author,
            'read_um': '',
            'share_num': '',
            'platform': platform,
            'media_type': media_type,
            'highlights': highlight,
            'keywords': keywords,
            'hotwords': hotkey,
            'hotevent': hot_event,
            'tags': tags,
            'paragraphs': paragraphs
        }
    print(information)
    # r = requests.post(url_, json=information, timeout=10)
    # print(r.text)
    return information

if __name__ == '__main__':
    endpoint = 'https://aip.baidubce.com'
    url_ = 'http://120.48.174.46:9000/open/api/saveNews'
    ak = ''  # Baidu Cloud access key (blank in the source)
    sk = ''  # Baidu Cloud secret key (blank in the source)
    config = bce_client_configuration.BceClientConfiguration(
        credentials=bce_credentials.BceCredentials(ak, sk),
        endpoint=endpoint)
    trans_client = ApiCenterClient(config)
    senta = Taskflow("sentiment_analysis")
    schema = ['栏目', '人物', '机构', '国家']  # column, person, organization, country
    entity = Taskflow('information_extraction', schema=schema, model="uie-tiny")
    entity.set_schema(schema)  # redundant: the schema is already set in the constructor
    # NOTE: `doc` is not defined anywhere in this file; a news document must be
    # loaded from the crawler's store first (see the sketch below).
    information_extraction(doc)
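
# A minimal driver sketch, assuming the documents live in MongoDB (the
# connection string, database, and collection names below are hypothetical;
# the real source of `doc` is not shown in this file):
#
#     client = MongoClient('mongodb://localhost:27017')
#     for doc in client['news_db']['news'].find():
#         info = information_extraction(doc)
#         if info:
#             requests.post(url_, json=info, timeout=10)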