123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157 |
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2022-11-01 14:40:52
# @Last Modified by: privacy
# @Last Modified time: 2022-11-03 14:41:07
- from translate import ApiCenterClient
- from baidubce import bce_client_configuration
- from baidubce.auth import bce_credentials
- import re
- import json
- import requests
- import jieba.analyse
- from paddlenlp import Taskflow
- import langid
- from tld import get_tld
- import time
- import logger
- from pprint import pprint
def keywords_textrank(text):
    """Extract the top-ranked keywords from *text* via TextRank.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of at most 6 keyword strings, ordered by TextRank score.
    """
    return jieba.analyse.textrank(text, topK=6)
def isVaildDate(date):
    """Return True if *date* is a well-formed date string.

    Accepts either ``"YYYY-MM-DD HH:MM:SS"`` (when a colon is present)
    or plain ``"YYYY-MM-DD"``.

    NOTE(review): the misspelled name ("Vaild") is preserved on purpose —
    it is the public interface other code calls.

    Args:
        date: Candidate date string (non-string inputs are treated as
            invalid rather than raising, matching the original behavior).

    Returns:
        bool: True when the string parses with the expected format.
    """
    try:
        # Presence of a colon distinguishes the datetime form from the
        # date-only form.
        fmt = "%Y-%m-%d %H:%M:%S" if ":" in date else "%Y-%m-%d"
        time.strptime(date, fmt)
        return True
    except (ValueError, TypeError):
        # ValueError: string does not match the format.
        # TypeError: non-string input (e.g. None) — the original bare
        # `except:` silently accepted these as "invalid" too.
        return False
def push_result(url, content, content_trans, publish_time, all_emotion,
                show_emotion, like_num, share_num, comment_num, author,
                author_link, highlights=None, keywords=None, hotword=""):
    """Assemble one social-media record and push it to the save API.

    Currently the HTTP upload is disabled; the body is only pretty-printed
    for inspection.

    Args:
        url: Link to the original post.
        content: Original post text.
        content_trans: Translated post text.
        publish_time: Publish timestamp string ("" when unknown).
        all_emotion: Raw sentiment score.
        show_emotion: Human-readable sentiment label.
        like_num / share_num / comment_num: Engagement counters.
        author: Author display name.
        author_link: Link to the author's profile.
        highlights: Optional list of highlight snippets.
        keywords: Optional list of keyword strings.
        hotword: Associated hot word, if any.

    Returns:
        None.
    """
    # Avoid the mutable-default-argument pitfall: bind fresh lists per call.
    highlights = [] if highlights is None else highlights
    keywords = [] if keywords is None else keywords
    push_url = "http://120.48.174.46:9000/open/api/saveSocial"
    body = {
        "url": url,
        "content": content,
        "content_trans": content_trans,
        "publish_time": publish_time,
        "all_emotion": all_emotion,
        "show_emotion": show_emotion,
        "like_num": like_num,
        "share_num": share_num,
        "comment_num": comment_num,
        "author": author,
        "author_link": author_link,
        "platform": "twitter",
        "media_type": 1,
        "highlights": highlights,
        "keywords": keywords,
        "hotword": hotword,
        "hotevent": "",
        "tags": [{"name": "", "type": ""}]
    }
    pprint(body)
    # NOTE(review): the upload below is disabled.  The original disabled
    # code posted to *url* (the record's own link) instead of push_url,
    # which looks like a bug — use push_url when re-enabling.
    # r = requests.post(push_url, json=body, timeout=10)
    # if r.status_code == 200:
    #     print(r.text)
    # else:
    #     exit(1)
def information(document):
    """Process one social-media record: translate, score sentiment,
    extract entities, and hand the result to push_result().

    Relies on module-level globals set up in the __main__ block:
    trans_client (translation API client), senta (sentiment Taskflow),
    entity (information-extraction Taskflow), comparison_dic.

    Args:
        document: One record — presumably a pandas row / mapping with keys
            'content', 'publish_time', 'like_num', 'share_num',
            'comment_num', 'author' — TODO confirm against caller.
    """
    global comparison_dic  # NOTE(review): declared but never read here — confirm it is needed.
    # url = document["url"]
    url = ""  # URL intentionally blanked (source field commented out above).
    content = document['content']
    content_trans = ""
    # Translate line by line and concatenate the translations.
    for line in content.split('\n'):
        res = trans_client.demo(line)
        text = res["result"]["trans_result"][0]["dst"]
        content_trans += str(text)
    # Sentiment on the translated text; score >= 0.75 is treated as
    # confidently positive/negative, anything else as neutral.
    emo = senta(content_trans)
    all_emotion = emo[0]['score']
    if emo[0]['label'] == 'negative' and emo[0]['score'] >= 0.75:
        show_emotion = '负面'
    elif emo[0]['label'] == 'positive' and emo[0]['score'] >= 0.75:
        show_emotion = '正面'
    else:
        show_emotion = '中性'
    # Only keep the publish time when it parses as a valid date string.
    if isVaildDate(document['publish_time']):
        publish_time = document['publish_time']
    else:
        publish_time = ''
    like_num = document["like_num"]
    share_num = document["share_num"]
    comment_num = document["comment_num"]
    author = document["author"]
    # author_link = document["author_link"]
    author_link = ""  # author link intentionally blanked (source field commented out above).
    # Entity extraction: flatten {type: [ {text, start, end, probability} ]}
    # into rows of [text, start, end, probability, type].
    # NOTE(review): assumes each entity dict has values in that order — confirm.
    entitys = entity(content_trans)
    temp = []
    dict_ = {}
    kk = entitys[0].keys()
    for k in kk:
        for e in entitys[0][k]:
            te = list(e.values())
            te.append(k)
            temp.append(te)
    # Deduplicate by entity text, keeping the type with the highest probability.
    for l in temp:
        if l[0] in dict_.keys():
            if l[3] > dict_[l[0]]['probability']:
                dict_[l[0]]['type'] = l[4]
        else:
            dict_[l[0]] = {'type': l[4], 'probability': l[3]}
    # Map schema labels to numeric tag types and translate each tag name.
    tags = []
    for k, v in dict_.items():
        tag = {}
        tag['name'] = k
        if v['type'] == '栏目':
            tag['type'] = 1
        elif v['type'] == '人物':
            tag['type'] = 2
        elif v['type'] == '机构':
            tag['type'] = 3
        elif v['type'] == '国家':
            tag['type'] = 4
        name = trans_client.demo(k,0)["result"]["trans_result"][0]["dst"]
        tag['name'] = name
        tags.append(tag)
    # NOTE(review): `tags` is computed above but never passed along —
    # push_result hardcodes an empty tag; confirm whether this is intended.
    push_result(url, content, content_trans, publish_time, all_emotion, show_emotion, like_num, share_num, comment_num, author, author_link, highlights=[], keywords=[], hotword="")
if __name__ == '__main__':
    # Load the comparison dictionary used by the processing pipeline.
    # NOTE(review): file is GBK-encoded — confirm this matches how it is written.
    with open("comparison_dic.json", 'r', encoding='gbk') as f:
        comparison_dic = json.load(f)

    # Baidu API client for translation.  Credentials are intentionally
    # blank here — fill in before running.
    endpoint = 'https://aip.baidubce.com'
    ak = ''
    sk = ''
    config = bce_client_configuration.BceClientConfiguration(
        credentials=bce_credentials.BceCredentials(ak, sk),
        endpoint=endpoint)
    trans_client = ApiCenterClient(config)

    # PaddleNLP pipelines: sentiment analysis plus schema-driven
    # information extraction (column / person / organization / country).
    senta = Taskflow("sentiment_analysis")
    schema = ['栏目', '人物', '机构', '国家']
    entity = Taskflow('information_extraction', schema=schema, model="uie-tiny")

    import pandas as pd
    df = pd.read_excel("./社交媒体.xlsx", converters={"like_num": int, "share_num": int, "comment_num": int})
    # Process every row of the spreadsheet.
    for i in range(len(df)):
        information(df.loc[i])
    # Removed the original trailing bare `exit` — referencing the builtin
    # without calling it was a no-op.  Also dropped the module-level
    # `global comparison_dic` statement, which has no effect at top level.
|