# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2022-11-01 14:40:52
# @Last Modified by:   privacy
# @Last Modified time: 2022-11-03 14:41:07
"""Enrich social-media posts and print the payload for the save-social API.

For every row of the input spreadsheet the script translates the post,
scores its sentiment, extracts named entities as tags and hands the
result to :func:`push_result`.

The names ``trans_client``, ``senta`` and ``entity`` used by
:func:`information` are module globals created in the ``__main__``
section at the bottom of the file.
"""
import json
import re
import time
from pprint import pprint

import jieba.analyse
import langid
import pandas as pd
import requests
from baidubce import bce_client_configuration
from baidubce.auth import bce_credentials
from paddlenlp import Taskflow
from tld import get_tld

import logger
from translate import ApiCenterClient

# Endpoint the payload is meant for; the upload itself is currently disabled.
PUSH_URL = "http://120.48.174.46:9000/open/api/saveSocial"

# Minimum model score for a post to be labelled positive/negative.
EMOTION_THRESHOLD = 0.75

# Numeric tag type for each entity category of the extraction schema
# ('栏目' column/section, '人物' person, '机构' organisation, '国家' country).
TAG_TYPE_IDS = {'栏目': 1, '人物': 2, '机构': 3, '国家': 4}


def keywords_textrank(text):
    """Return up to six keywords of *text* ranked by jieba's TextRank."""
    return jieba.analyse.textrank(text, topK=6)


def isVaildDate(date):
    """Return True if *date* is a ``YYYY-MM-DD[ HH:MM:SS]`` string.

    A value containing ``":"`` is parsed as a date-time, anything else
    as a plain date.  Non-string input (e.g. ``None`` or a number)
    yields ``False`` instead of raising.
    """
    try:
        fmt = "%Y-%m-%d %H:%M:%S" if ":" in date else "%Y-%m-%d"
        time.strptime(date, fmt)
    except (TypeError, ValueError):
        # TypeError: *date* is not a string; ValueError: wrong format.
        return False
    return True


def push_result(url, content, content_trans, publish_time, all_emotion,
                show_emotion, like_num, share_num, comment_num, author,
                author_link, highlights=None, keywords=None, hotword="",
                tags=None):
    """Assemble the save-social payload for one post and pretty-print it.

    Args:
        url: Link of the post.
        content: Original text of the post.
        content_trans: Translated text of the post.
        publish_time: Publication time string, or ``''`` when unknown.
        all_emotion: Raw sentiment score.
        show_emotion: Display label for the sentiment.
        like_num, share_num, comment_num: Engagement counters.
        author: Author name.
        author_link: Link to the author's profile.
        highlights: Optional list; defaults to an empty list.
        keywords: Optional list; defaults to an empty list.
        hotword: Optional hot word; defaults to ``""``.
        tags: Optional list of ``{"name": ..., "type": ...}`` dicts.  When
            omitted or empty, a single blank placeholder tag is used.

    The platform is fixed to ``"twitter"`` and the media type to ``1``.
    """
    body = {
        "url": url,
        "content": content,
        "content_trans": content_trans,
        "publish_time": publish_time,
        "all_emotion": all_emotion,
        "show_emotion": show_emotion,
        "like_num": like_num,
        "share_num": share_num,
        "comment_num": comment_num,
        "author": author,
        "author_link": author_link,
        "platform": "twitter",
        "media_type": 1,
        # Fresh lists per call instead of shared mutable default arguments.
        "highlights": [] if highlights is None else highlights,
        "keywords": [] if keywords is None else keywords,
        "hotword": hotword,
        "hotevent": "",
        "tags": tags if tags else [{"name": "", "type": ""}],
    }
    pprint(body)
    # The HTTP upload is disabled; the payload is only printed.
    # NOTE(review): the disabled code posted to `url` (the post's own link);
    # presumably PUSH_URL was intended -- confirm before re-enabling:
    # r = requests.post(PUSH_URL, json=body, timeout=10)
    # if r.status_code == 200:
    #     print(r.text)
    # else:
    #     exit(1)


def _translate_lines(text):
    """Translate *text* line by line and concatenate the results.

    The translated lines are joined without any separator.
    """
    return "".join(
        str(trans_client.demo(line)["result"]["trans_result"][0]["dst"])
        for line in text.split('\n')
    )


def _emotion_label(sentiment):
    """Map one sentiment result (``label``/``score``) to a display label."""
    if sentiment['score'] >= EMOTION_THRESHOLD:
        if sentiment['label'] == 'negative':
            return '负面'  # negative
        if sentiment['label'] == 'positive':
            return '正面'  # positive
    return '中性'  # neutral


def _best_entity_types(extraction):
    """Return ``{entity text: {'type', 'probability'}}`` for one extraction.

    When the same text was extracted under several categories, the
    category with the highest probability wins.
    """
    best = {}
    for entity_type, mentions in extraction.items():
        for mention in mentions:
            text = mention['text']
            probability = mention['probability']
            known = best.get(text)
            if known is None or probability > known['probability']:
                # Store type and probability together so that later
                # comparisons are made against the current winner.
                best[text] = {'type': entity_type, 'probability': probability}
    return best


def _build_tags(best_entities):
    """Turn de-duplicated entities into tag dicts with translated names."""
    tags = []
    for text, info in best_entities.items():
        # NOTE(review): meaning of the second argument (0) is defined by
        # ApiCenterClient.demo, which is not visible here.
        translated = trans_client.demo(text, 0)
        tag = {'name': translated["result"]["trans_result"][0]["dst"]}
        if info['type'] in TAG_TYPE_IDS:
            tag['type'] = TAG_TYPE_IDS[info['type']]
        tags.append(tag)
    return tags


def information(document):
    """Process one post and forward the enriched record to push_result.

    Args:
        document: Mapping-like row providing ``content``, ``publish_time``,
            ``like_num``, ``share_num``, ``comment_num`` and ``author``.

    The post URL and author link are currently sent as empty strings.
    Relies on the module globals ``trans_client``, ``senta`` and
    ``entity``.
    """
    # url = document["url"]
    url = ""
    # author_link = document["author_link"]
    author_link = ""

    content = document['content']
    content_trans = _translate_lines(content)

    sentiment = senta(content_trans)[0]
    all_emotion = sentiment['score']
    show_emotion = _emotion_label(sentiment)

    raw_time = document['publish_time']
    publish_time = raw_time if isVaildDate(raw_time) else ''

    extraction = entity(content_trans)[0]
    tags = _build_tags(_best_entity_types(extraction))

    push_result(
        url,
        content,
        content_trans,
        publish_time,
        all_emotion,
        show_emotion,
        document["like_num"],
        document["share_num"],
        document["comment_num"],
        document["author"],
        author_link,
        highlights=[],
        keywords=[],
        hotword="",
        tags=tags,
    )


if __name__ == '__main__':
    # Loaded as in the original script; not referenced by the code in this file.
    with open("comparison_dic.json", 'r', encoding='gbk') as f:
        comparison_dic = json.load(f)

    endpoint = 'https://aip.baidubce.com'
    ak = ''
    sk = ''
    config = bce_client_configuration.BceClientConfiguration(
        credentials=bce_credentials.BceCredentials(ak, sk),
        endpoint=endpoint,
    )
    trans_client = ApiCenterClient(config)

    senta = Taskflow("sentiment_analysis")
    schema = ['栏目', '人物', '机构', '国家']
    entity = Taskflow('information_extraction', schema=schema, model="uie-tiny")

    df = pd.read_excel(
        "./社交媒体.xlsx",
        converters={"like_num": int, "share_num": int, "comment_num": int},
    )
    for _, row in df.iterrows():
        information(row)