# Standard library
import re
import time

# Third-party
import jieba.analyse
import langid
import requests
from baidubce import bce_client_configuration
from baidubce.auth import bce_credentials
from paddlenlp import Taskflow
from pymongo import MongoClient
from tld import get_tld
from translate import ApiCenterClient

# NOTE(review): `logger` is not the stdlib `logging` module — confirm this is
# a real local module and not a typo for `import logging`.
import logger
def keywords_textrank(text, top_k=6):
    """Extract keywords from *text* with jieba's TextRank algorithm.

    Generalized: the original hard-coded ``topK=6``; it is now a
    keyword parameter whose default preserves the old behaviour.

    Args:
        text: Document text (jieba expects Chinese-language input).
        top_k: Maximum number of keywords to return (default 6).

    Returns:
        list[str]: Keywords ordered by TextRank score, highest first.
    """
    return jieba.analyse.textrank(text, topK=top_k)
def isVaildDate(date):
    """Return True if *date* is a valid ``YYYY-MM-DD`` or
    ``YYYY-MM-DD HH:MM:SS`` timestamp string, else False.

    The format is chosen by the presence of ``:`` in the input.
    (Function name keeps its historical "Vaild" typo — callers use it.)

    Args:
        date: Candidate date string.

    Returns:
        bool: Whether ``time.strptime`` accepts the string.
    """
    fmt = "%Y-%m-%d %H:%M:%S" if ":" in date else "%Y-%m-%d"
    try:
        time.strptime(date, fmt)
    except (ValueError, TypeError):
        # Narrowed from a bare `except:`: strptime raises ValueError for a
        # malformed date and TypeError for a non-string argument; anything
        # else (e.g. KeyboardInterrupt) should propagate.
        return False
    return True
def information_extraction(document):
    """Build a normalized news record from a raw crawled *document*.

    Relies on module-level globals created in ``__main__``:
    ``trans_client`` (Baidu translate client), ``senta`` (sentiment
    Taskflow) and ``entity`` (UIE information-extraction Taskflow).

    Args:
        document: dict with keys ``title``, ``url``, ``highlight``,
            ``author``, ``hotkey``, ``publish_time``, ``metadata`` and
            ``content``.

    Returns:
        dict shaped for the downstream ``/saveNews`` API, or ``None``
        when a paragraph translation fails.
    """
    title = document['title']
    url = document['url']
    highlight = document['highlight'] if document['highlight'] else []
    author = document['author']
    hotkey = document['hotkey']
    # Keep at most the first five highlight snippets.
    if len(highlight) > 5:
        highlight = highlight[0:5]
    if isVaildDate(document['publish_time']):
        publish_time = document['publish_time']
    else:
        publish_time = ''
    hot_event = document['hotkey']  # duplicate of hotkey, kept for the output schema
    publisher = ''
    publisher_country = ''
    # BUG FIX: langid.classify() returns a (lang, score) tuple, so the
    # original `langid.classify(title) == 'zh'` was always False and every
    # document got media_type=1.  Compare the language code itself.
    if langid.classify(title)[0] == 'zh':
        media_type = 0  # appears to mean Chinese-language media — confirm
    else:
        media_type = 1
    try:
        # Default platform = first label of the registered domain,
        # e.g. "twitter" for "twitter.com".
        tld_ = get_tld(url, as_object=True).fld
        platform = tld_.split('.')[0]
    except Exception:
        platform = ''
    try:
        # Metadata keys may override publisher / country / platform /
        # highlights.
        for kk in document['metadata'].keys():
            key = kk.lower()
            # BUG FIX: the original tested 'publisher' before
            # 'publisher_country'; since the former substring-matches the
            # latter, publisher_country could never be set.  Test the more
            # specific pattern first.
            if re.search('publisher_country', key):
                publisher_country = document['metadata'][kk]
            elif re.search('publisher', key):
                publisher = document['metadata'][kk]
            elif re.search('platform', key):
                platform = document['metadata'][kk]
            elif re.search('keywords', key):
                highlight = document['metadata'][kk].split(',')
    except Exception:
        # Best-effort: malformed metadata resets all three fields,
        # matching the original behaviour.
        publisher = ''
        publisher_country = ''
        platform = ''
    if len(document['content'].split(' ')) > 2000:
        # Overly long document: translate only the title and return a
        # skeleton record with no per-paragraph analysis.
        print(len(document['content'].split(' ')))
        title_trans_json = trans_client.demo(title)
        print(title_trans_json)
        information = {
            'title': title,
            'title_trans': title_trans_json.result.trans_result[0].dst,
            'title_emotion': '',
            'all_emotion': '',
            'show_emotion': '',
            'url': url,
            # NOTE(review): the computed publish_time is deliberately NOT
            # used here in the original — confirm whether '' is intended.
            'publish_time': '',
            'publisher': publisher,
            'publisher_country': publisher_country,
            'author': author,
            'read_um': '',
            'share_num': '',
            'platform': platform,
            'media_type': media_type,
            'highlights': highlight,
            'keywords': [],
            'hotwords': '',
            'hotevent': hot_event,
            'tags': [],
            'paragraphs': []
        }
    else:
        content = document['content'].split('\n')
        paragraphs = []
        article = ''
        entity_index = {}  # entity text -> {'type': ..., 'probability': ...}
        print('content', len(content))
        for order, line in enumerate(content):
            p = {}
            # Only lines containing Latin letters are translated/analysed.
            if re.search('[a-zA-Z]', line):
                res = trans_client.demo(line)
                try:
                    text = res.result.trans_result[0].dst
                except Exception as e:
                    # Translation failure aborts the whole document.
                    print(e)
                    return None
                p['paragraph'] = res.result.trans_result[0].src
                p['paragraph_trans'] = text
                article = article + str(text)
                emo = senta(text)
                # Label only when the model is confident (score >= 0.75).
                if emo[0]['label'] == 'negative' and emo[0]['score'] >= 0.75:
                    p['paragraph_show_emotion'] = '负面'
                elif emo[0]['label'] == 'positive' and emo[0]['score'] >= 0.75:
                    p['paragraph_show_emotion'] = '正面'
                else:
                    p['paragraph_show_emotion'] = '中性'
                p['paragraph_emotion'] = str(emo[0]['score'])
                # BUG FIX: the original used content.index(l), which is O(n)
                # per paragraph and returns the position of the FIRST
                # duplicate line; enumerate gives the true position.
                p['order'] = order
                entitys = entity(text)
                # Flatten UIE output into [text, start, end, probability, type]
                # rows (value order matches the original l[0]/l[3]/l[4] usage).
                rows = []
                for etype in entitys[0].keys():
                    for e in entitys[0][etype]:
                        row = list(e.values())
                        row.append(etype)
                        rows.append(row)
                for row in rows:
                    name = row[0]
                    if name in entity_index:
                        # BUG FIX: on a higher-probability re-occurrence the
                        # original updated only 'type' and kept the stale
                        # lower 'probability'; update both so later
                        # comparisons use the right baseline.
                        if row[3] > entity_index[name]['probability']:
                            entity_index[name] = {'type': row[4],
                                                  'probability': row[3]}
                    else:
                        entity_index[name] = {'type': row[4],
                                              'probability': row[3]}
            if p:
                paragraphs.append(p)
        tags = []
        for ent_name, v in entity_index.items():
            tag = {'name': ent_name}
            if v['type'] == '栏目':
                tag['type'] = 1
            elif v['type'] == '人物':
                tag['type'] = 2
            elif v['type'] == '机构':
                tag['type'] = 3
            elif v['type'] == '国家':
                tag['type'] = 4
            # Replace the raw entity text with its translation.
            tag['name'] = trans_client.demo(ent_name, 0).result.trans_result[0].dst
            tags.append(tag)
        try:
            d_emo = senta(article)
        except Exception:
            d_emo = []
        try:
            t_emo = senta(title)
        except Exception:
            t_emo = []
        try:
            title_emotion = str(t_emo[0]['score'])
        except Exception:
            title_emotion = str(0)
        try:
            all_emotion = str(d_emo[0]['score'])
        except Exception:
            all_emotion = '0'
        if not d_emo:
            show_emotion = '中性'
        elif d_emo[0]['label'] == 'negative' and d_emo[0]['score'] >= 0.75:
            show_emotion = '负面'
        elif d_emo[0]['label'] == 'positive' and d_emo[0]['score'] >= 0.75:
            show_emotion = '正面'
        else:
            show_emotion = '中性'
        keywords = keywords_textrank(article)
        tcdr = trans_client.demo(title).result
        title_trans = tcdr.trans_result[0].dst if tcdr else title
        information = {
            'title': title,
            'title_trans': title_trans,
            'title_emotion': title_emotion,
            'all_emotion': all_emotion,
            'show_emotion': show_emotion,
            'url': url,
            'publish_time': publish_time,
            'publisher': publisher,
            'publisher_country': publisher_country,
            'author': author,
            'read_um': '',
            'share_num': '',
            'platform': platform,
            'media_type': media_type,
            'highlights': highlight,
            'keywords': keywords,
            'hotwords': hotkey,
            'hotevent': hot_event,
            'tags': tags,
            'paragraphs': paragraphs
        }
    # NOTE(review): indentation was reconstructed from a mangled paste; the
    # final print/return are placed at function level so BOTH branches return
    # their record (consistent with the commented-out `return information`
    # in the long-document branch of the original).  Confirm with the author.
    print(information)
    # r = requests.post(url_, json=information, timeout=10)
    # print(r.text)
    return information
if __name__ == '__main__':
    # Baidu API endpoint and the downstream save-news service URL.
    endpoint = 'https://aip.baidubce.com'
    url_ = 'http://120.48.174.46:9000/open/api/saveNews'
    ak = ''
    sk = ''
    credentials = bce_credentials.BceCredentials(ak, sk)
    config = bce_client_configuration.BceClientConfiguration(
        credentials=credentials,
        endpoint=endpoint)
    # Module-level globals consumed by information_extraction().
    trans_client = ApiCenterClient(config)
    senta = Taskflow("sentiment_analysis")
    schema = ['栏目', '人物', '机构', '国家']
    entity = Taskflow('information_extraction', schema=schema, model="uie-tiny")
    entity.set_schema(schema)
    # NOTE(review): `doc` is not defined anywhere in this file, so this call
    # raises NameError as written — presumably a document fetched via
    # MongoClient was meant to be passed here.  Confirm with the author.
    information_extraction(doc)