"""Translate and annotate crawled news documents.

Reads a crawled news document (a dict), translates its paragraphs through the
Baidu translation client, runs sentiment analysis and UIE entity extraction,
and returns a flat information record for the downstream save-news API.

NOTE(review): this is a script, not a library — ``information_extraction``
and its helpers rely on the module-level globals ``trans_client``, ``senta``
and ``entity`` created in the ``__main__`` block below.
"""
from translate import ApiCenterClient
from baidubce import bce_client_configuration
from baidubce.auth import bce_credentials
from paddlenlp import Taskflow
import re
import requests
from pymongo import MongoClient
import jieba.analyse
import langid
from tld import get_tld
import time
import logger

# Mapping from UIE schema label to the numeric tag type used by the API.
_TAG_TYPE_IDS = {'栏目': 1, '人物': 2, '机构': 3, '国家': 4}


def keywords_textrank(text):
    """Return the top-6 TextRank keywords of *text*."""
    return jieba.analyse.textrank(text, topK=6)


def isVaildDate(date):
    """Return True when *date* parses as '%Y-%m-%d %H:%M:%S' or '%Y-%m-%d'.

    (Misspelled name kept intentionally — callers use this spelling.)
    """
    fmt = "%Y-%m-%d %H:%M:%S" if ":" in date else "%Y-%m-%d"
    try:
        time.strptime(date, fmt)
    except (ValueError, TypeError):  # narrow: only parse failures, not everything
        return False
    return True


def _show_emotion_label(emo):
    """Map a senta result list to a display label.

    Returns '负面' / '正面' only for confident (score >= 0.75) results,
    otherwise '中性'. An empty result list is neutral.
    """
    if emo and emo[0]['score'] >= 0.75:
        if emo[0]['label'] == 'negative':
            return '负面'
        if emo[0]['label'] == 'positive':
            return '正面'
    return '中性'


def _apply_metadata_overrides(document, publisher, publisher_country, platform, highlight):
    """Override fields from document['metadata'] when matching keys exist.

    Returns the (possibly updated) tuple
    (publisher, publisher_country, platform, highlight).

    Fix: 'publisher_country' is checked before 'publisher' — the original
    order made the country branch unreachable because re.search('publisher')
    also matches 'publisher_country' keys. Missing/odd metadata is skipped
    without clobbering the platform already derived from the URL.
    """
    metadata = document.get('metadata') or {}
    for kk in metadata:
        key = kk.lower()
        if re.search('publisher_country', key):
            publisher_country = metadata[kk]
        elif re.search('publisher', key):
            publisher = metadata[kk]
        elif re.search('platform', key):
            platform = metadata[kk]
        elif re.search('keywords', key):
            highlight = metadata[kk].split(',')
    return publisher, publisher_country, platform, highlight


def _merge_entities(text, best):
    """Run UIE over *text*, keeping the highest-probability type per entity.

    Mutates *best* ({entity string: {'type': ..., 'probability': ...}}).

    Fix: when a higher probability is seen, both 'type' AND 'probability'
    are updated — the original left a stale probability behind.

    NOTE(review): relies on UIE match dicts ordering values as
    (text, start, end, probability) — confirm against the paddlenlp version.
    """
    results = entity(text)
    for etype, matches in results[0].items():
        for match in matches:
            vals = list(match.values())
            name, prob = vals[0], vals[3]
            cur = best.get(name)
            if cur is None:
                best[name] = {'type': etype, 'probability': prob}
            elif prob > cur['probability']:
                cur['type'] = etype
                cur['probability'] = prob


def _build_tags(best):
    """Turn merged entities into API tag dicts with translated names.

    The 'type' key is present only for labels in the known schema (matching
    the original behavior, where unknown labels produced a type-less tag).
    """
    tags = []
    for name, info in best.items():
        tag = {}
        if info['type'] in _TAG_TYPE_IDS:
            tag['type'] = _TAG_TYPE_IDS[info['type']]
        tag['name'] = trans_client.demo(name, 0).result.trans_result[0].dst
        tags.append(tag)
    return tags


def information_extraction(document):
    """Translate and annotate one crawled news *document*.

    Returns the flat information dict, or None when a paragraph translation
    fails. Documents longer than 2000 whitespace-separated tokens get a
    shallow record (title translation only, no paragraphs/tags/keywords).
    """
    title = document['title']
    url = document['url']
    highlight = document['highlight'] if document['highlight'] else []
    author = document['author']
    hotkey = document['hotkey']
    if len(highlight) > 5:
        highlight = highlight[:5]
    if isVaildDate(document['publish_time']):
        publish_time = document['publish_time']
    else:
        publish_time = ''
    hot_event = document['hotkey']
    publisher = ''
    publisher_country = ''
    # Fix: langid.classify returns a (lang, score) tuple; the original
    # compared the whole tuple to 'zh', so media_type was always 1.
    media_type = 0 if langid.classify(title)[0] == 'zh' else 1
    try:
        platform = get_tld(url, as_object=True).fld.split('.')[0]
    except Exception:
        platform = ''
    publisher, publisher_country, platform, highlight = _apply_metadata_overrides(
        document, publisher, publisher_country, platform, highlight)

    word_count = len(document['content'].split(' '))
    if word_count > 2000:
        # Long document: translate the title only and return a shallow record.
        print(word_count)
        title_trans_json = trans_client.demo(title)
        print(title_trans_json)
        # Fix: the original built this dict but its return was commented
        # out, so long documents fell through; returning the stub matches
        # the evident intent.
        return {
            'title': title,
            'title_trans': title_trans_json.result.trans_result[0].dst,
            'title_emotion': '',
            'all_emotion': '',
            'show_emotion': '',
            'url': url,
            # NOTE(review): long docs drop publish_time ('' here while the
            # normal path keeps it) — confirm this is intended.
            'publish_time': '',
            'publisher': publisher,
            'publisher_country': publisher_country,
            'author': author,
            'read_um': '',
            'share_num': '',
            'platform': platform,
            'media_type': media_type,
            'highlights': highlight,
            'keywords': [],
            'hotwords': '',
            'hotevent': hot_event,
            'tags': [],
            'paragraphs': [],
        }

    content = document['content'].split('\n')
    paragraphs = []
    article = ''
    best_entities = {}
    print('content', len(content))
    # Fix: enumerate replaces content.index(line), which was O(n) per
    # paragraph and wrong for duplicate lines; the inner entity loop also
    # no longer shadows the outer loop variable.
    for order, line in enumerate(content):
        if not re.search('[a-zA-Z]', line):
            continue  # skip lines with no Latin text (nothing to translate)
        res = trans_client.demo(line)
        try:
            text = res.result.trans_result[0].dst
        except Exception as e:
            print(e)
            return None
        article = article + str(text)
        emo = senta(text)
        paragraphs.append({
            'paragraph': res.result.trans_result[0].src,
            'paragraph_trans': text,
            'paragraph_show_emotion': _show_emotion_label(emo),
            'paragraph_emotion': str(emo[0]['score']),
            'order': order,
        })
        _merge_entities(text, best_entities)
    tags = _build_tags(best_entities)

    try:
        d_emo = senta(article)
    except Exception:
        d_emo = []
    try:
        t_emo = senta(title)
    except Exception:
        t_emo = []
    title_emotion = str(t_emo[0]['score']) if t_emo else '0'
    all_emotion = str(d_emo[0]['score']) if d_emo else '0'
    show_emotion = _show_emotion_label(d_emo)

    keywords = keywords_textrank(article)
    tcdr = trans_client.demo(title).result
    title_trans = tcdr.trans_result[0].dst if tcdr else title

    information = {
        'title': title,
        'title_trans': title_trans,
        'title_emotion': title_emotion,
        'all_emotion': all_emotion,
        'show_emotion': show_emotion,
        'url': url,
        'publish_time': publish_time,
        'publisher': publisher,
        'publisher_country': publisher_country,
        'author': author,
        'read_um': '',
        'share_num': '',
        'platform': platform,
        'media_type': media_type,
        'highlights': highlight,
        'keywords': keywords,
        'hotwords': hotkey,
        'hotevent': hot_event,
        'tags': tags,
        'paragraphs': paragraphs,
    }
    print(information)
    #r = requests.post(url_, json=information, timeout=10)
    #print(r.text)
    return information


if __name__ == '__main__':
    endpoint = 'https://aip.baidubce.com'
    url_ = 'http://120.48.174.46:9000/open/api/saveNews'
    ak = ''
    sk = ''
    config = bce_client_configuration.BceClientConfiguration(
        credentials=bce_credentials.BceCredentials(ak, sk), endpoint=endpoint)
    trans_client = ApiCenterClient(config)
    senta = Taskflow("sentiment_analysis")
    schema = ['栏目', '人物', '机构', '国家']
    entity = Taskflow('information_extraction', schema=schema, model="uie-tiny")
    entity.set_schema(schema)
    # FIXME: `doc` is undefined here (NameError at runtime) — the document
    # to process must be loaded first, presumably from MongoDB via the
    # imported MongoClient; confirm the intended data source.
    information_extraction(doc)