123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157 |
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2022-11-01 14:40:52
# @Last Modified by: privacy
# @Last Modified time: 2022-11-03 14:41:07
- from translate import ApiCenterClient
- from baidubce import bce_client_configuration
- from baidubce.auth import bce_credentials
- import re
- import json
- import requests
- import jieba.analyse
- from paddlenlp import Taskflow
- import langid
- from tld import get_tld
- import time
- import logger
- from pprint import pprint
def keywords_textrank(text):
    """Extract the top-ranked keywords from *text* via TextRank.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of at most 6 keyword strings, ordered by TextRank score.
    """
    return jieba.analyse.textrank(text, topK=6)
def isVaildDate(date):
    """Return True if *date* is a well-formed date string.

    Accepts either ``"YYYY-MM-DD HH:MM:SS"`` (when a colon is present)
    or plain ``"YYYY-MM-DD"``.

    NOTE(review): the misspelled name ("Vaild") is preserved on purpose —
    it is the public interface other code calls.

    Args:
        date: Candidate date string (non-string inputs are treated as
            invalid rather than raising, matching the original behavior).

    Returns:
        bool: True when the string parses with the expected format.
    """
    try:
        # Presence of a colon distinguishes the datetime form from the
        # date-only form.
        fmt = "%Y-%m-%d %H:%M:%S" if ":" in date else "%Y-%m-%d"
        time.strptime(date, fmt)
        return True
    except (ValueError, TypeError):
        # ValueError: string does not match the format.
        # TypeError: non-string input (e.g. None) — the original bare
        # `except:` silently accepted these as "invalid" too.
        return False
def push_result(url, content, content_trans, publish_time, all_emotion,
                show_emotion, like_num, share_num, comment_num, author,
                author_link, highlights=None, keywords=None, hotword=""):
    """Assemble one social-media record and push it to the save API.

    Currently the HTTP upload is disabled; the body is only pretty-printed
    for inspection.

    Args:
        url: Link to the original post.
        content: Original post text.
        content_trans: Translated post text.
        publish_time: Publish timestamp string ("" when unknown).
        all_emotion: Raw sentiment score.
        show_emotion: Human-readable sentiment label.
        like_num / share_num / comment_num: Engagement counters.
        author: Author display name.
        author_link: Link to the author's profile.
        highlights: Optional list of highlight snippets.
        keywords: Optional list of keyword strings.
        hotword: Associated hot word, if any.

    Returns:
        None.
    """
    # Avoid the mutable-default-argument pitfall: bind fresh lists per call.
    highlights = [] if highlights is None else highlights
    keywords = [] if keywords is None else keywords
    push_url = "http://120.48.174.46:9000/open/api/saveSocial"
    body = {
        "url": url,
        "content": content,
        "content_trans": content_trans,
        "publish_time": publish_time,
        "all_emotion": all_emotion,
        "show_emotion": show_emotion,
        "like_num": like_num,
        "share_num": share_num,
        "comment_num": comment_num,
        "author": author,
        "author_link": author_link,
        "platform": "twitter",
        "media_type": 1,
        "highlights": highlights,
        "keywords": keywords,
        "hotword": hotword,
        "hotevent": "",
        "tags": [{"name": "", "type": ""}]
    }
    pprint(body)
    # NOTE(review): the upload below is disabled.  The original disabled
    # code posted to *url* (the record's own link) instead of push_url,
    # which looks like a bug — use push_url when re-enabling.
    # r = requests.post(push_url, json=body, timeout=10)
    # if r.status_code == 200:
    #     print(r.text)
    # else:
    #     exit(1)
def information(document):
    """Process one social-media record: translate, score sentiment,
    extract entities, and hand the result to push_result().

    Relies on module-level globals set up in the __main__ block:
    trans_client (translation API client), senta (sentiment Taskflow),
    entity (information-extraction Taskflow), comparison_dic.

    Args:
        document: One record — presumably a pandas row / mapping with keys
            'content', 'publish_time', 'like_num', 'share_num',
            'comment_num', 'author' — TODO confirm against caller.
    """
    global comparison_dic  # NOTE(review): declared but never read here — confirm it is needed.
    # url = document["url"]
    url = ""  # URL intentionally blanked (source field commented out above).
    content = document['content']
    content_trans = ""
    # Translate line by line and concatenate the translations.
    for line in content.split('\n'):
        res = trans_client.demo(line)
        text = res["result"]["trans_result"][0]["dst"]
        content_trans += str(text)
    # Sentiment on the translated text; score >= 0.75 is treated as
    # confidently positive/negative, anything else as neutral.
    emo = senta(content_trans)
    all_emotion = emo[0]['score']
    if emo[0]['label'] == 'negative' and emo[0]['score'] >= 0.75:
        show_emotion = '负面'
    elif emo[0]['label'] == 'positive' and emo[0]['score'] >= 0.75:
        show_emotion = '正面'
    else:
        show_emotion = '中性'
    # Only keep the publish time when it parses as a valid date string.
    if isVaildDate(document['publish_time']):
        publish_time = document['publish_time']
    else:
        publish_time = ''
    like_num = document["like_num"]
    share_num = document["share_num"]
    comment_num = document["comment_num"]
    author = document["author"]
    # author_link = document["author_link"]
    author_link = ""  # author link intentionally blanked (source field commented out above).
    # Entity extraction: flatten {type: [ {text, start, end, probability} ]}
    # into rows of [text, start, end, probability, type].
    # NOTE(review): assumes each entity dict has values in that order — confirm.
    entitys = entity(content_trans)
    temp = []
    dict_ = {}
    kk = entitys[0].keys()
    for k in kk:
        for e in entitys[0][k]:
            te = list(e.values())
            te.append(k)
            temp.append(te)
    # Deduplicate by entity text, keeping the type with the highest probability.
    for l in temp:
        if l[0] in dict_.keys():
            if l[3] > dict_[l[0]]['probability']:
                dict_[l[0]]['type'] = l[4]
        else:
            dict_[l[0]] = {'type': l[4], 'probability': l[3]}
    # Map schema labels to numeric tag types and translate each tag name.
    tags = []
    for k, v in dict_.items():
        tag = {}
        tag['name'] = k
        if v['type'] == '栏目':
            tag['type'] = 1
        elif v['type'] == '人物':
            tag['type'] = 2
        elif v['type'] == '机构':
            tag['type'] = 3
        elif v['type'] == '国家':
            tag['type'] = 4
        name = trans_client.demo(k,0)["result"]["trans_result"][0]["dst"]
        tag['name'] = name
        tags.append(tag)
    # NOTE(review): `tags` is computed above but never passed along —
    # push_result hardcodes an empty tag; confirm whether this is intended.
    push_result(url, content, content_trans, publish_time, all_emotion, show_emotion, like_num, share_num, comment_num, author, author_link, highlights=[], keywords=[], hotword="")
if __name__ == '__main__':
    # Load the comparison dictionary used by the processing pipeline.
    # NOTE(review): file is GBK-encoded — confirm this matches how it is written.
    with open("comparison_dic.json", 'r', encoding='gbk') as f:
        comparison_dic = json.load(f)

    # Baidu API client for translation.  Credentials are intentionally
    # blank here — fill in before running.
    endpoint = 'https://aip.baidubce.com'
    ak = ''
    sk = ''
    config = bce_client_configuration.BceClientConfiguration(
        credentials=bce_credentials.BceCredentials(ak, sk),
        endpoint=endpoint)
    trans_client = ApiCenterClient(config)

    # PaddleNLP pipelines: sentiment analysis plus schema-driven
    # information extraction (column / person / organization / country).
    senta = Taskflow("sentiment_analysis")
    schema = ['栏目', '人物', '机构', '国家']
    entity = Taskflow('information_extraction', schema=schema, model="uie-tiny")

    import pandas as pd
    df = pd.read_excel("./社交媒体.xlsx", converters={"like_num": int, "share_num": int, "comment_num": int})
    # Process every row of the spreadsheet.
    for i in range(len(df)):
        information(df.loc[i])
    # Removed the original trailing bare `exit` — referencing the builtin
    # without calling it was a no-op.  Also dropped the module-level
    # `global comparison_dic` statement, which has no effect at top level.
|