# Standard library
import re
import time

# Third-party
import jieba.analyse
import langid
import requests
from baidubce import bce_client_configuration
from baidubce.auth import bce_credentials
from paddlenlp import Taskflow
from pymongo import MongoClient
from tld import get_tld
from translate import ApiCenterClient

# NOTE(review): `logger` is not the stdlib `logging` module — confirm this is
# a real local module and not a typo for `import logging`.
import logger
def keywords_textrank(text, top_k=6):
    """Extract keywords from *text* with jieba's TextRank algorithm.

    Generalized: the original hard-coded ``topK=6``; it is now a
    keyword parameter whose default preserves the old behaviour.

    Args:
        text: Document text (jieba expects Chinese-language input).
        top_k: Maximum number of keywords to return (default 6).

    Returns:
        list[str]: Keywords ordered by TextRank score, highest first.
    """
    return jieba.analyse.textrank(text, topK=top_k)
def isVaildDate(date):
    """Return True if *date* is a valid ``YYYY-MM-DD`` or
    ``YYYY-MM-DD HH:MM:SS`` timestamp string, else False.

    The format is chosen by the presence of ``:`` in the input.
    (Function name keeps its historical "Vaild" typo — callers use it.)

    Args:
        date: Candidate date string.

    Returns:
        bool: Whether ``time.strptime`` accepts the string.
    """
    fmt = "%Y-%m-%d %H:%M:%S" if ":" in date else "%Y-%m-%d"
    try:
        time.strptime(date, fmt)
    except (ValueError, TypeError):
        # Narrowed from a bare `except:`: strptime raises ValueError for a
        # malformed date and TypeError for a non-string argument; anything
        # else (e.g. KeyboardInterrupt) should propagate.
        return False
    return True
def information_extraction(document):
    """Build a normalized news record from a raw crawled *document*.

    Relies on module-level globals created in ``__main__``:
    ``trans_client`` (Baidu translate client), ``senta`` (sentiment
    Taskflow) and ``entity`` (UIE information-extraction Taskflow).

    Args:
        document: dict with keys ``title``, ``url``, ``highlight``,
            ``author``, ``hotkey``, ``publish_time``, ``metadata`` and
            ``content``.

    Returns:
        dict shaped for the downstream ``/saveNews`` API, or ``None``
        when a paragraph translation fails.
    """
    title = document['title']
    url = document['url']
    highlight = document['highlight'] if document['highlight'] else []
    author = document['author']
    hotkey = document['hotkey']
    # Keep at most the first five highlight snippets.
    if len(highlight) > 5:
        highlight = highlight[0:5]
    if isVaildDate(document['publish_time']):
        publish_time = document['publish_time']
    else:
        publish_time = ''
    hot_event = document['hotkey']  # duplicate of hotkey, kept for the output schema
    publisher = ''
    publisher_country = ''
    # BUG FIX: langid.classify() returns a (lang, score) tuple, so the
    # original `langid.classify(title) == 'zh'` was always False and every
    # document got media_type=1.  Compare the language code itself.
    if langid.classify(title)[0] == 'zh':
        media_type = 0  # appears to mean Chinese-language media — confirm
    else:
        media_type = 1
    try:
        # Default platform = first label of the registered domain,
        # e.g. "twitter" for "twitter.com".
        tld_ = get_tld(url, as_object=True).fld
        platform = tld_.split('.')[0]
    except Exception:
        platform = ''
    try:
        # Metadata keys may override publisher / country / platform /
        # highlights.
        for kk in document['metadata'].keys():
            key = kk.lower()
            # BUG FIX: the original tested 'publisher' before
            # 'publisher_country'; since the former substring-matches the
            # latter, publisher_country could never be set.  Test the more
            # specific pattern first.
            if re.search('publisher_country', key):
                publisher_country = document['metadata'][kk]
            elif re.search('publisher', key):
                publisher = document['metadata'][kk]
            elif re.search('platform', key):
                platform = document['metadata'][kk]
            elif re.search('keywords', key):
                highlight = document['metadata'][kk].split(',')
    except Exception:
        # Best-effort: malformed metadata resets all three fields,
        # matching the original behaviour.
        publisher = ''
        publisher_country = ''
        platform = ''
    if len(document['content'].split(' ')) > 2000:
        # Overly long document: translate only the title and return a
        # skeleton record with no per-paragraph analysis.
        print(len(document['content'].split(' ')))
        title_trans_json = trans_client.demo(title)
        print(title_trans_json)
        information = {
            'title': title,
            'title_trans': title_trans_json.result.trans_result[0].dst,
            'title_emotion': '',
            'all_emotion': '',
            'show_emotion': '',
            'url': url,
            # NOTE(review): the computed publish_time is deliberately NOT
            # used here in the original — confirm whether '' is intended.
            'publish_time': '',
            'publisher': publisher,
            'publisher_country': publisher_country,
            'author': author,
            'read_um': '',
            'share_num': '',
            'platform': platform,
            'media_type': media_type,
            'highlights': highlight,
            'keywords': [],
            'hotwords': '',
            'hotevent': hot_event,
            'tags': [],
            'paragraphs': []
        }
    else:
        content = document['content'].split('\n')
        paragraphs = []
        article = ''
        entity_index = {}  # entity text -> {'type': ..., 'probability': ...}
        print('content', len(content))
        for order, line in enumerate(content):
            p = {}
            # Only lines containing Latin letters are translated/analysed.
            if re.search('[a-zA-Z]', line):
                res = trans_client.demo(line)
                try:
                    text = res.result.trans_result[0].dst
                except Exception as e:
                    # Translation failure aborts the whole document.
                    print(e)
                    return None
                p['paragraph'] = res.result.trans_result[0].src
                p['paragraph_trans'] = text
                article = article + str(text)
                emo = senta(text)
                # Label only when the model is confident (score >= 0.75).
                if emo[0]['label'] == 'negative' and emo[0]['score'] >= 0.75:
                    p['paragraph_show_emotion'] = '负面'
                elif emo[0]['label'] == 'positive' and emo[0]['score'] >= 0.75:
                    p['paragraph_show_emotion'] = '正面'
                else:
                    p['paragraph_show_emotion'] = '中性'
                p['paragraph_emotion'] = str(emo[0]['score'])
                # BUG FIX: the original used content.index(l), which is O(n)
                # per paragraph and returns the position of the FIRST
                # duplicate line; enumerate gives the true position.
                p['order'] = order
                entitys = entity(text)
                # Flatten UIE output into [text, start, end, probability, type]
                # rows (value order matches the original l[0]/l[3]/l[4] usage).
                rows = []
                for etype in entitys[0].keys():
                    for e in entitys[0][etype]:
                        row = list(e.values())
                        row.append(etype)
                        rows.append(row)
                for row in rows:
                    name = row[0]
                    if name in entity_index:
                        # BUG FIX: on a higher-probability re-occurrence the
                        # original updated only 'type' and kept the stale
                        # lower 'probability'; update both so later
                        # comparisons use the right baseline.
                        if row[3] > entity_index[name]['probability']:
                            entity_index[name] = {'type': row[4],
                                                  'probability': row[3]}
                    else:
                        entity_index[name] = {'type': row[4],
                                              'probability': row[3]}
            if p:
                paragraphs.append(p)
        tags = []
        for ent_name, v in entity_index.items():
            tag = {'name': ent_name}
            if v['type'] == '栏目':
                tag['type'] = 1
            elif v['type'] == '人物':
                tag['type'] = 2
            elif v['type'] == '机构':
                tag['type'] = 3
            elif v['type'] == '国家':
                tag['type'] = 4
            # Replace the raw entity text with its translation.
            tag['name'] = trans_client.demo(ent_name, 0).result.trans_result[0].dst
            tags.append(tag)
        try:
            d_emo = senta(article)
        except Exception:
            d_emo = []
        try:
            t_emo = senta(title)
        except Exception:
            t_emo = []
        try:
            title_emotion = str(t_emo[0]['score'])
        except Exception:
            title_emotion = str(0)
        try:
            all_emotion = str(d_emo[0]['score'])
        except Exception:
            all_emotion = '0'
        if not d_emo:
            show_emotion = '中性'
        elif d_emo[0]['label'] == 'negative' and d_emo[0]['score'] >= 0.75:
            show_emotion = '负面'
        elif d_emo[0]['label'] == 'positive' and d_emo[0]['score'] >= 0.75:
            show_emotion = '正面'
        else:
            show_emotion = '中性'
        keywords = keywords_textrank(article)
        tcdr = trans_client.demo(title).result
        title_trans = tcdr.trans_result[0].dst if tcdr else title
        information = {
            'title': title,
            'title_trans': title_trans,
            'title_emotion': title_emotion,
            'all_emotion': all_emotion,
            'show_emotion': show_emotion,
            'url': url,
            'publish_time': publish_time,
            'publisher': publisher,
            'publisher_country': publisher_country,
            'author': author,
            'read_um': '',
            'share_num': '',
            'platform': platform,
            'media_type': media_type,
            'highlights': highlight,
            'keywords': keywords,
            'hotwords': hotkey,
            'hotevent': hot_event,
            'tags': tags,
            'paragraphs': paragraphs
        }
    # NOTE(review): indentation was reconstructed from a mangled paste; the
    # final print/return are placed at function level so BOTH branches return
    # their record (consistent with the commented-out `return information`
    # in the long-document branch of the original).  Confirm with the author.
    print(information)
    # r = requests.post(url_, json=information, timeout=10)
    # print(r.text)
    return information
if __name__ == '__main__':
    # Baidu API endpoint and the downstream save-news service URL.
    endpoint = 'https://aip.baidubce.com'
    url_ = 'http://120.48.174.46:9000/open/api/saveNews'
    ak = ''
    sk = ''
    credentials = bce_credentials.BceCredentials(ak, sk)
    config = bce_client_configuration.BceClientConfiguration(
        credentials=credentials,
        endpoint=endpoint)
    # Module-level globals consumed by information_extraction().
    trans_client = ApiCenterClient(config)
    senta = Taskflow("sentiment_analysis")
    schema = ['栏目', '人物', '机构', '国家']
    entity = Taskflow('information_extraction', schema=schema, model="uie-tiny")
    entity.set_schema(schema)
    # NOTE(review): `doc` is not defined anywhere in this file, so this call
    # raises NameError as written — presumably a document fetched via
    # MongoClient was meant to be passed here.  Confirm with the author.
    information_extraction(doc)