# social_process.py
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2022-11-01 14:40:52
# @Last Modified by: privacy
# @Last Modified time: 2022-11-03 14:41:07
import json
import re
import time
from pprint import pprint

import jieba.analyse
import langid
import pandas as pd
import requests
from baidubce import bce_client_configuration
from baidubce.auth import bce_credentials
from paddlenlp import Taskflow
from tld import get_tld

import logger
from translate import ApiCenterClient
  19. def keywords_textrank(text):
  20. keywords = jieba.analyse.textrank(text, topK=6)
  21. return keywords
  22. def isVaildDate(date):
  23. try:
  24. if ":" in date:
  25. time.strptime(date, "%Y-%m-%d %H:%M:%S")
  26. else:
  27. time.strptime(date, "%Y-%m-%d")
  28. return True
  29. except:
  30. return False
  31. def push_result(url, content, content_trans, publish_time, all_emotion, show_emotion, like_num, share_num, comment_num, author, author_link, highlights=[], keywords=[], hotword=""):
  32. psuh_url = "http://120.48.174.46:9000/open/api/saveSocial"
  33. body = {
  34. "url": url,
  35. "content": content,
  36. "content_trans": content_trans,
  37. "publish_time": publish_time,
  38. "all_emotion": all_emotion,
  39. "show_emotion": show_emotion,
  40. "like_num": like_num,
  41. "share_num": share_num,
  42. "comment_num": comment_num,
  43. "author": author,
  44. "author_link": author_link,
  45. "platform": "twitter",
  46. "media_type": 1,
  47. "highlights": highlights,
  48. "keywords": keywords,
  49. "hotword": hotword,
  50. "hotevent": "",
  51. "tags": [{"name":"", "type":""}]
  52. }
  53. pprint(body)
  54. # r = requests.post(url, json=body, timeout=10)
  55. # if r.status_code == 200:
  56. # print(r.text)
  57. # else:
  58. # exit(1)
  59. def information(document):
  60. global comparison_dic
  61. # url = document["url"]
  62. url = ""
  63. content = document['content']
  64. content_trans = ""
  65. for line in content.split('\n'):
  66. res = trans_client.demo(line)
  67. text = res["result"]["trans_result"][0]["dst"]
  68. content_trans += str(text)
  69. emo = senta(content_trans)
  70. all_emotion = emo[0]['score']
  71. if emo[0]['label'] == 'negative' and emo[0]['score'] >= 0.75:
  72. show_emotion = '负面'
  73. elif emo[0]['label'] == 'positive' and emo[0]['score'] >= 0.75:
  74. show_emotion = '正面'
  75. else:
  76. show_emotion = '中性'
  77. if isVaildDate(document['publish_time']):
  78. publish_time = document['publish_time']
  79. else:
  80. publish_time = ''
  81. like_num = document["like_num"]
  82. share_num = document["share_num"]
  83. comment_num = document["comment_num"]
  84. author = document["author"]
  85. # author_link = document["author_link"]
  86. author_link = ""
  87. entitys = entity(content_trans)
  88. temp = []
  89. dict_ = {}
  90. kk = entitys[0].keys()
  91. for k in kk:
  92. for e in entitys[0][k]:
  93. te = list(e.values())
  94. te.append(k)
  95. temp.append(te)
  96. for l in temp:
  97. if l[0] in dict_.keys():
  98. if l[3] > dict_[l[0]]['probability']:
  99. dict_[l[0]]['type'] = l[4]
  100. else:
  101. dict_[l[0]] = {'type': l[4], 'probability': l[3]}
  102. tags = []
  103. for k, v in dict_.items():
  104. tag = {}
  105. tag['name'] = k
  106. if v['type'] == '栏目':
  107. tag['type'] = 1
  108. elif v['type'] == '人物':
  109. tag['type'] = 2
  110. elif v['type'] == '机构':
  111. tag['type'] = 3
  112. elif v['type'] == '国家':
  113. tag['type'] = 4
  114. name = trans_client.demo(k,0)["result"]["trans_result"][0]["dst"]
  115. tag['name'] = name
  116. tags.append(tag)
  117. push_result(url, content, content_trans, publish_time, all_emotion, show_emotion, like_num, share_num, comment_num, author, author_link, highlights=[], keywords=[], hotword="")
  118. if __name__ == '__main__':
  119. global comparison_dic
  120. with open("comparison_dic.json",'r', encoding='gbk') as f:
  121. comparison_dic = json.load(f)
  122. endpoint = 'https://aip.baidubce.com'
  123. ak = ''
  124. sk = ''
  125. config = bce_client_configuration.BceClientConfiguration(credentials=bce_credentials.BceCredentials(ak, sk),
  126. endpoint=endpoint)
  127. trans_client = ApiCenterClient(config)
  128. senta = Taskflow("sentiment_analysis")
  129. schema = ['栏目', '人物', '机构', '国家']
  130. entity = Taskflow('information_extraction', schema=schema, model="uie-tiny")
  131. import pandas as pd
  132. df = pd.read_excel("./社交媒体.xlsx", converters={"like_num":int, "share_num":int, "comment_num":int})
  133. for i in range(len(df)):
  134. information(df.loc[i])
  135. exit