- #!/usr/bin/env python
- # coding: utf-8
- # General-purpose resume extraction
- import os
- import re
- import json
- import time
- import platform
- import subprocess
- import rarfile
- import py7zr
- import tarfile
- from zipfile import ZipFile
- import requests
- from requests.adapters import HTTPAdapter
- from docx import Document
- from docx.shared import Inches
- from pdfminer.pdfpage import PDFPage
- from pdfminer.pdfparser import PDFParser
- from pdfminer.pdfdocument import PDFDocument
- from pdfminer.high_level import extract_pages
- from pdfminer.converter import PDFPageAggregator
- from pdfminer.layout import LTTextContainer, LTChar, LTLine, LAParams, LTTextBox, LTFigure, LTImage, LTText, LTAnno, LTTextLine, LTTextLineHorizontal
- from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
- import pdfplumber
- from paddlenlp import Taskflow
- from logger import Logger
- logger = Logger("resume_parse")
- logger.set_file_handler(filename='journal.log')
- from rich.console import Console
- console = Console()
- global ner, ner_tag, base_info_ie, prize_ie, cet_ie, pro_ie, block, block_rev, translate
- if not locals().get("ner"):
- ner = Taskflow("ner", mode='fast')
- if not locals().get("ner_tag"):
- ner_tag = Taskflow("ner")
- if not locals().get("base_info_ie"):
- base_info_ie = Taskflow('information_extraction', schema=["姓名","性别","电子邮箱","政治面貌","手机号码","籍贯","出生日期","现任职务","参加工作时间","当前单位","所在城市"], model="uie-tiny")
- if not locals().get("prize_ie"):
- prize_ie = Taskflow('information_extraction', schema=["时间", "奖项"], model="uie-nano")
- if not locals().get("cet_ie"):
- cet_ie = Taskflow('information_extraction', schema=["时间","证书"], model="uie-nano")
- if not locals().get("pro_ie"):
- pro_ie = Taskflow("information_extraction", schema=["时间","项目名称","机构","职位"], task_path='./resources/model_best')
- if not locals().get("block"):
- with open("resources/SegmentName.json", "r", encoding="utf-8") as fp:
- block = json.load(fp)
- if not locals().get("block_rev"):
- block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经历", 5:"项目经历", 6:"专业技能", 7:"intro", 8:"兴趣爱好", 9:"语言能力", 10:"证书", 11:"获奖情况", 12:"培训经历", 13:"家庭成员", "other":"其他"}
- if not locals().get("translate"):
- with open("./resources/translate.json", "r", encoding="utf-8") as ft:
- translate = json.load(ft)
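- # Note (assumed from how the results are indexed below, e.g. base_info_ie(data)[0]["姓名"][0]["text"]):
- # each Taskflow information_extraction call returns a list with one dict per input text, mapping every
- # matched schema key to a list of spans such as {"text": "...", ...}; keys with no match are absent,
- # which is why the extractors below guard with rst.get(...).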
- if not os.path.exists("./uploads"):
- os.mkdir("./uploads")
- if not os.path.exists("./pdf"):
- os.mkdir("./pdf")
- if not os.path.exists("./cache"):
- os.mkdir("./cache")
- if not os.path.exists("./result"):
- os.mkdir("./result")
- import uvicorn
- from fastapi import BackgroundTasks, FastAPI, File, UploadFile
- app = FastAPI()
- from functools import wraps
- def time_this_function(func):
- @wraps(func)
- def wrapper(*args,**kwargs):
- start=time.time()
- result=func(*args, **kwargs)
- end=time.time()
- console.print("函数:",func.__name__,"运行时间:", round(end - start, 4),"s")
- return result
- return wrapper
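- # Every parser below is wrapped with @time_this_function, so each call prints its wall-clock duration
- # to the rich console, e.g. "函数: get_base_info 运行时间: 0.1234 s" (illustrative output).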
- # Basic information (legacy version)
- @time_this_function
- def get_base_info_old(lines):
- logger.info(lines)
- datas = "".join(lines)
- ner_list = ner(datas)
- concat_list = []
- for w, t in ner_list:
- if concat_list and (t == concat_list[-1][-1]):
- concat_list[-1][0] += w
- else:
- concat_list.append([w, t])
- schema = {
- '姓名': None,
- }
- for line in [' '.join(' '.join(lines).split('\n'))]:
- line = re.sub(r'[ ]{5,}', '\n', line)
- w = re.sub(r'[\W]+(\w[::])[\W]{0,}\w', r'\1', line)
- for i in w.split():
- if ':' in i:
- try:
- key, val = i.split(':')
- schema[key] = val
- except Exception as e:
- logger.error(e)
- if not schema.get('姓名'):
- schema['姓名'] = re.search(r'[姓名::]{3,}(\w{2,4})', w).group(1) if re.search(r'[姓名::]{3,}(\w{2,4})', w) else None
- if not schema.get('姓名'):
- for word, tag in ner_tag(w):
- if tag == "人物类_实体":
- schema['姓名'] = word
- if not schema.get('性别'):
- schema['性别'] = re.search(r'[男女]', w).group() if re.search(r'[男女]', w) else None
- # if not schema.get('婚姻状况'):
- # schema['婚姻状况'] = re.search(r'[已未]婚', w).group() if re.search(r'[已未]婚', w) else None
- if not schema.get('邮箱地址'):
- schema['邮箱地址'] = re.search(r'([.\w]+@[.\w]+)', w).group() if re.search(r'([.\w]+@[.\w]+)', w) else None
- if not schema.get('政治面貌'):
- schema['政治面貌'] = re.search(r'[预备中共党团员群众无派人士]{2,6}', w).group() if re.search(r'[预备中共党团员群众无派人士]{2,6}', w) else None
- if not schema.get('手机号码'):
- schema['手机号码'] = re.search(r'\W(1[\d]{10})\W', w).group(1) if re.search(r'\W(1[\d]{10})\W', w) else None
- if not schema.get('出生年月'):
- schema['出生年月'] = re.search(r'\d{4}[./年\-]\d{1,2}[月]', w).group() if re.search(r'\d{4}[./年\-]\d{1,2}[月]', w) else None
- # if not schema.get('当前职位'):
- # schema['当前职位'] = re.search(r'[当前职位: ]{3,}(\w)+', w).group() if re.search(r'[当前职位: ]{3,}(\w)+', w) else None
- # if not schema.get('参加工作时间'):
- # schema['参加工作时间'] = re.search(r'[参加工作时间:]{3,}(\d{4}[./年\-]\d{1,2}[月])', w).group(1) if re.search(r'[参加工作时间:]{3,}(\d{4}[./年\-]\d{1,2}[月])', w) else None
- for idx, (w, t) in enumerate(concat_list):
- if t == "LOC":
- if not schema.get("籍贯"):
- if re.search(r'[出生籍贯户]+', concat_list[idx-1][0]) or re.search(r'[出生籍贯户]+', concat_list[idx-2][0]):
- schema["籍贯"] = w
- if t == "TIME":
- if not schema.get("出生年月"):
- if re.search(r'[出生日期年月]+', concat_list[idx-1][0]) or re.search(r'[出生日期年月]+', concat_list[idx-2][0]):
- schema["出生年月"] = w
- if not schema.get("参加工作时间"):
- if re.search(r'[参加工作时间]+', concat_list[idx-1][0]) or re.search(r'[参加工作时间]+', concat_list[idx-2][0]):
- schema["参加工作时间"] = w
- return {key:value for key, value in schema.items() if value}
- # Basic information (UIE, done)
- @time_this_function
- def get_base_info(lines):
- if not lines:
- return
- logger.info(lines)
- data = " ".join(lines)
- rst = base_info_ie(data)[0]
- if rst.get("出生日期"):
- dates = re.findall(r'\d+' ,rst["出生日期"][0]["text"])
- if len(dates) == 1:
- if len(dates[0]) > 4:
- rst["出生日期"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0][:4]), int(dates[0][4:6]))
- else:
- rst["出生日期"][0]["text"] = "{:4d}-01-01".format(int(dates[0][:4]))
- elif len(dates) == 2:
- rst["出生日期"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
- elif len(dates) == 3:
- rst["出生日期"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
- if rst.get("参加工作时间"):
- dates = re.findall(r'\d+' ,rst["参加工作时间"][0]["text"])
- if len(dates) == 1:
- if len(dates[0]) > 4:
- rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0][:4]), int(dates[0][4:6]))
- else:
- rst["参加工作时间"][0]["text"] = "{:4d}-01-01".format(int(dates[0]))
- elif len(dates) == 2:
- rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
- elif len(dates) == 3:
- rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
- return {key:rst[key][0]["text"] for key in rst.keys()}
- # Job intention (done)
- @time_this_function
- def get_job_intention(lines):
- logger.info(lines)
- schema = {}
- for line in lines:
- regex = re.compile(r'\W{0,3}[::]\s+')
- line = regex.sub(':', line)
- for i in line.split():
- if ":" in i:
- try:
- key, val = i.split(":")
- schema[key] = val
- except Exception as e:
- logger.error(e)
- return schema
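- # Hypothetical example: a line such as "期望职位: 算法工程师 期望城市: 北京" is first normalized to
- # "期望职位:算法工程师 期望城市:北京" and then split into {"期望职位": "算法工程师", "期望城市": "北京"}.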
- # Education history (deprecated)
- # NER + word segmentation to detect school, time and degree; the major needs separate handling.
- def get_edu_list_old(lines):
- logger.info(lines)
- job_list = []
- job_dict = {'edu_time_beg':'', 'edu_time_end':'', 'edu_name':'','edu_leval':'','edu_domain':'', 'edu_statue':0}
- re_txt = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今|^\d{4,4}.\d{1,2}|19\d{2,2}.|20\d{2,2}.'
- re_txt_1 = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今'
- nums = []
- for i in range(len(lines)):
- if re.findall(re_txt, lines[i]):
- nums.append(i)
- nums.append(len(lines))
- edu_level = {'本科':18, "大专":17, "博士研究生":20, "学士":18, "博士":20, "硕士":19, "研究生":19, "博后":21, '博士后':21}
- year_dict = {18:4, 17:3,20:3,19:3,21:2}
- edu_dict = {18:'本科', 17:'大专',20:'博士研究生',19:'硕士',21:'博士后'}
- edu_list = []
- for i in range(1, len(nums[:])):
- job_dict = {'edu_time_beg':'', 'edu_time_end':'', 'edu_name':'','edu_leval':'','edu_domain':''}
- data_list = lines[nums[i-1]:nums[i]]
- if len(data_list) > 1 and data_list[1] and data_list[1][-1] == '|' and data_list[0][-1] != '|':
- data_list[0] = data_list[0] + data_list[1]
- data_list[1] = ''
- if len(data_list) > 2 and data_list[2] and data_list[2][-1] == '|' and data_list[0][-1] != '|' and '|' in str(data_list[0]) and data_list[1] and data_list[1][-1] != '|':
- data_list[0] = data_list[0] + data_list[1] + data_list[2]
- data_list[1] = ''
- data_list[2] = ''
- if '' in data_list:
- data_list.remove('')
- data_line = ' '.join(data_list)
- data_line = re.sub('[\|]', ' ', data_line)
- data_line = re.sub('-{3,}', '', data_line)
- ner_data = ner(''.join(data_list[:2]))
- org = ''
- time_list = []
- for jj in range(1, len(ner_data)):
- if ner_data[jj][1] == ner_data[jj-1][1]:
- ner_data[jj] = list(ner_data[jj])
- ner_data[jj][0] = ner_data[jj-1][0] + ner_data[jj][0]
- ner_data[jj-1] = ('','')
- for _ in ner_data:
- if _[1] == 'ORG' and not org:
- org = _[0].strip()
- elif _[1] == 'TIME' and len(_[0]) >= 4:
- time_list.append(_[0])
- #TIME
- # print(data_line)
- _list_data = re.split('\040+',data_line)
- top_level = 18
- remove_list = []
- logger.info(_list_data)
- logger.info(time_list)
- for ii in range(len(_list_data)):
- for t in time_list:
- if t in _list_data[ii]:
- _list_data[ii] = ''
- break
- for i in range(len(_list_data)):
- #if org in _list_data[i]:
- # _list_data[i] = ''
- if re.findall('^\d{4,4}', _list_data[i]):
- _list_data[i] = ''
- _data = re.findall('本科|学士|硕士|博士研究生|博士后|博后|博士|研究生|大专', _list_data[i])
- if not _data:
- continue
- top_level = edu_level[_data[0]]
- _list_data[i] = ''
- break
- #remove_list.append(i)
- logger.info(_list_data)
- job_time = re.findall(re_txt_1, data_list[0])
- if job_time:
- job_dict['edu_time'] = job_time[0]
- else:
- job_dict['edu_time'] = ''
-
- _nums = re.findall('\d+', job_dict['edu_time'])
- if len(_nums) >= 4:
- job_dict['edu_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
- job_dict['edu_time_end'] = '%s-%02d'%(_nums[2], int(_nums[3]))
- job_dict['edu_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
- elif len(_nums) == 2:
- job_dict['edu_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
- job_dict['edu_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
- job_dict['edu_time_end'] = '%s'%('至今')
- elif len(time_list) == 2:
- nums_1 = re.findall('\d+', time_list[0])
- nums_2 = re.findall('\d+', time_list[1])
- nums_1.append('09')
- nums_2.append('07')
- job_dict['edu_time_beg'] = '%s-%02d'%(nums_1[0], int(nums_1[1]))
- try:
- job_dict['edu_time_end'] = '%s-%02d'%(nums_2[0], int(nums_2[1]))
- except:
- job_dict['edu_time_end'] = None
- try:
- job_dict['edu_time'] = '%s-%02d~%s-%02d'%(nums_1[0], int(nums_1[1]), nums_2[0], int(nums_2[1]))
- except:
- job_dict['edu_time'] = '%s-%02d~今'%(nums_1[0], int(nums_1[1]))
- elif len(time_list) == 1:
- _nums = re.findall('\d+', time_list[0])
- if '毕业' in data_list[0]:
- _nums.append('06')
- _nums.insert(0, '09')
- _nums.insert(0, str(int(_nums[1]) - year_dict[top_level]))
- job_dict['edu_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
- job_dict['edu_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
- job_dict['edu_time_end'] = '%s-%02d'%(_nums[2], int(_nums[3]))
- else:
- _nums.append('09')
- job_dict['edu_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
- job_dict['edu_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
- job_dict['edu_time_end'] = '%s'%('至今')
- job_dict['edu_leval'] = edu_dict[top_level]
- if org:
- job_dict['edu_name'] = org
- else:
- job_dict['edu_name'] = ''
- edu_domain = ''
- for i in range(len(_list_data)):
- if org in _list_data[i]:
- continue
- if _list_data[i] and '专业' in _list_data[i]:
- edu_domain = _list_data[i]
- if not edu_domain:
- for i in range(len(_list_data)):
- if org in _list_data[i]:
- continue
- if _list_data[i] and len(_list_data[i]) >= 3:
- edu_domain = _list_data[i]
- break
- if not edu_domain:
- for i in range(len(_list_data)):
- if org in _list_data[i]:
- for j in range(i+1, len(_list_data)):
- if _list_data[i] and len(_list_data[j]) >= 2:
- edu_domain = _list_data[j]
- break
- break
- job_dict['edu_domain'] = edu_domain
- if len(job_list) ==0:
- job_list.append(job_dict)
- else:
- if job_dict in job_list:
- continue
- if not job_dict['edu_time']:
- continue
- if int(job_dict['edu_time'][:4]) > int(job_list[-1]['edu_time'][:4]):
- job_list = [job_dict] + job_list
- else:
- job_list.append(job_dict)
- continue
- data_list[0] = re.sub(job_time[0], '', data_list[0])
- _list = re.split('\|\040+', data_list[0])
- #print(_list)
- if len(_list) == 1:
- __list = re.split('\040+', data_list[0])
- job_dict['edu_name'] = __list[1].strip()
- job_dict['edu_domain'] = __list[2].strip()
- job_dict['edu_leval'] = __list[3].strip()
- else:
- #if job_dict['edu_leval'] not in
- if len(_list) > 3:
- job_dict['edu_name'] = _list[2].strip()
- job_dict['edu_domain'] = _list[3].strip()
- job_dict['edu_leval'] = _list[1].strip()
- else:
- job_dict['edu_leval'] = _list[0].strip()
- job_dict['edu_name'] = _list[1].strip()
- job_dict['edu_domain'] = _list[2].strip()
- if '硕士' in _list[0] or '研究生' in _list[0]:
- job_dict['edu_leval'] = '硕士'
- elif '博士' in _list[0]:
- job_dict['edu_leval'] = '博士'
- elif '本科' in _list[0]:
- job_dict['edu_leval'] = '本科'
- elif '学士' in _list[0]:
- job_dict['edu_leval'] = '本科'
- # print(job_dict)
- if len(job_list) ==0:
- job_list.append(job_dict)
- else:
- if job_dict in job_list:
- continue
- if int(job_dict['edu_time'][:4]) > int(job_list[-1]['edu_time'][:4]):
- job_list = [job_dict] + job_list
- else:
- job_list.append(job_dict)
- #edu_list.append(job_dict['edu_time'] + job_dict['edu_name'] + job_dict['edu_domain'] + job_dict['edu_leval'])
- #if job_list[0]['edu_leval'] not in ['硕士', '博士', '本科', '博后'] and len(job_list[0]['edu_leval']) > 5:
- # job_list[0]['edu_leval'] = '本科'
- return job_list
- # Education history, revised (done)
- @time_this_function
- def get_edu_list(lines):
- logger.info(lines)
- edu_list = [{"Time":None, "start_time":None, "end_time":None, "school_name":None, "major":None, "degree":None}]
- regex_time = re.compile(r'((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?[\d]{0,2})[至到\W]+((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?)?([今])?|(\d{4})[至\W]+([\d今]{4})')
- regex_end = re.compile(r'毕业时间[\w\W]{0,5}(\d{4})[\W年]?(\d{0,2})[月\W]?')
- regex_level = re.compile(r'[大本专科硕博士研究生后]{2,}')
- regex_domain = re.compile(u'[\u4E00-\u9FA5]{2,10}', re.UNICODE)
- count = 0
- for line in lines:
- line = line.replace("学士","本科").replace("专业","").replace("学位","")
- for cell in re.split(r'[·\|\t]', line):
- if not cell.strip():
- continue
- flags = 0
- edu_time = regex_time.search(cell)
- edu_end_time = regex_end.search(cell)
- edu_level = regex_level.search(cell)
- edu_domain = regex_domain.search(cell)
- # Standard time-range format
- if edu_time:
- # Commit the previous entry
- if edu_list[count].get("Time") and edu_list[count].get("school_name"):
- edu_list.append({"Time":None, "start_time":None, "end_time":None, "school_name":None, "major":None, "degree":None})
- count += 1
- edu_list[count]["start_time"] = '{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)))
- # Year-month end date
- if edu_time.group(5) != None:
- edu_list[count]["end_time"] = '{:4d}-{:02d}'.format(int(edu_time.group(5)),int(edu_time.group(6)))
- edu_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)),int(edu_time.group(5)),int(edu_time.group(6)))
- # Year only
- elif edu_time.group(8) != None:
- edu_list[count]["Time"] = '{:4d}~{:4d}'.format(int(edu_time.group(8)),int(edu_time.group(9)))
- edu_list[count]["start_time"] = '{:4d}'.format(int(edu_time.group(8)))
- edu_list[count]["end_time"] = '{:4d}'.format(int(edu_time.group(9)))
- # "Until now" style
- elif edu_time.group(7):
- edu_list[count]["end_time"] = edu_time.group(7)
- edu_list[count]['Time'] = '{:4d}-{:02d}~{}'.format(int(edu_time.group(2)),int(edu_time.group(3)),edu_time.group(7))
- flags = 1
- # Only a graduation time
- elif edu_end_time:
- # Commit the previous entry
- if edu_list[count].get("end_time") and edu_list[count].get("school_name"):
- edu_list.append({"Time":None, "start_time":None, "end_time":None, "school_name":None, "major":None, "degree":None})
- count += 1
- # Year and month
- if edu_end_time.group(2):
- edu_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(edu_end_time.group(1))-3,int(edu_end_time.group(2)),int(edu_end_time.group(1)),int(edu_end_time.group(2)))
- edu_list[count]["end_time"] = '{:4d}-{:02d}'.format(int(edu_end_time.group(1)),int(edu_end_time.group(2)))
- # Year only
- elif edu_end_time.group(1):
- edu_list[count]["Time"] = '{:4d}~{:4d}'.format(int(edu_end_time.group(1))-3,int(edu_end_time.group(1)))
- edu_list[count]["end_time"] = '{:4d}'.format(int(edu_end_time.group(1)))
- # Degree
- if (not edu_list[count].get("degree")) and edu_level:
- edu_list[count]["degree"] = edu_level.group(0)
- # WordTag recognition for school / major
- for word, tag in ner_tag(cell):
- if (not edu_list[count].get("school_name")) and (tag == "组织机构类_教育组织机构"):
- edu_list[count]["school_name"] = word.strip()
- flags = 1
- elif (not edu_list[count].get("major")) and (tag in "_术语类型"):
- edu_list[count]["major"] = word.strip()
- elif edu_list[count].get("school_name") and edu_list[count].get("major"):
- break
- # LAC recognition for school
- else:
- for word, tag in ner(cell):
- if (tag == "ORG"):
- edu_list[count]["school_name"] = word
- flags = 1
- break
- # Fill in the major when nothing else was recognized
- if (not (edu_level or flags or edu_list[count].get("major"))) and edu_domain:
- edu_list[count]["major"] = edu_domain.group(0)
- # Drop the trailing entry if it has no time or no school
- if (not edu_list[-1].get("Time")) or (not edu_list[-1].get("school_name")):
- edu_list.pop()
- return edu_list
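- # Each surviving entry has the shape (values illustrative):
- # {"Time": "2015-09~2019-06", "start_time": "2015-09", "end_time": "2019-06",
- #  "school_name": "某某大学", "major": "计算机科学与技术", "degree": "本科"}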
- # Work history (done)
- # NER + word segmentation: organisation info, person-role info, time; the job description is separated out.
- # The time expression is one of the main markers for deciding where the next job entry starts;
- # character counts, time-like tokens and quantifiers serve as auxiliary signals.
- @time_this_function
- def get_job_list(lines):
- logger.info(lines)
- job_list = []
- re_txt = '\d{4,4}\040{0,2}.\d+\040{0,2}.?\040{0,2}[\-–至-\—~]{1,2}\040{0,2}\d{4,4}\040{0,2}.\040{0,2}\d+.?|\d{4,4}.\d+.?\040{0,2}[\-–-—]{0,2}\040{0,2}至?今|\d{4,4}.\d+.?\040{0,2}[\-–-]{1,2}\040{0,2}现在|\d{4,4}年\d+月\-\d{4,4}年\d+月|\d{4,4}年\d+月\-\~|\d{4,4}年\d+月[\-\~-]至今|\d{4,4}-\d+\040{0,2}[-\~至]\040{0,2}\d{4,4}-\d+|\d{4,4}-\d+\~|\d{4,4}-\d+\[~-]至今|\d{4,4}-\d+\040{0,2}至今'
- nums = []
- for i in range(len(lines)):
- #print(lines[i])
- #print(lines[i], re.findall(re_txt, lines[i]), re.findall('\||\040{1,}', lines[i]))
- if re.findall(re_txt, lines[i].replace(' ', '')) and re.findall('\||\040{1,}', lines[i]):
- nums.append(i)
- continue
- if re.findall(re_txt, lines[i].replace(' ', '')[:20]):
- nums.append(i)
- continue
- if len(lines[i].strip().replace(' ', '')) > 50:
- continue
-
- year_list = re.findall('19\d{2,2}.\d{1,2}|20\d{2,2}.\d{1,2}', lines[i])
- if len(year_list) >= 2:
- nums.append(i)
- elif len(year_list) == 1 and '至今' in lines[i]:
- nums.append(i)
- nums.append(len(lines))
- # logger.info(nums)
- logger.info('get_job_list :{}'.format(nums))
- for i in range(1, len(nums[:])):
- job_dict = {'job_time':'', 'job_leval':'','job_company':'','job_content':''}
- data_list = lines[nums[i-1]:nums[i]]
- if '' in data_list:
- data_list.remove('')
- org = ''
- person_professor_list = []
- org_index = -1
- end_index = 3
- job_time = re.findall(re_txt, data_list[0])
- if not job_time:
- year_list = re.findall('19\d{2,2}.\d{1,2}|20\d{2,2}.\d{1,2}', data_list[0])
- if len(year_list) >= 2:
- job_time = ['-'.join(year_list)]
- elif len(year_list) == 1 and '至今' in data_list[0]:
- job_time = [year_list[0] + '~' + '至今']
- if not job_time:
- regex = re.compile(r'((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)[至到\W]+((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)?([今])?')
- job_time = [re.search(regex, data_list[0]).group(0)]
- job_dict['job_time'] = job_time[0]
- _nums = re.findall('\d{1,4}', job_dict['job_time'])
- #print(_nums)
- if len(_nums) >= 4:
- job_dict['job_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
- elif len(_nums) == 2:
- job_dict['job_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
- if re.findall('\d', job_time[0]):
- job_time[0] = job_time[0][:-1]
- data_list[0] = re.sub(job_time[0], '', data_list[0])
- data_list[0] = data_list[0].strip()
- ner_list = []
- for ii in range(len(data_list[:3])):
- if '工作' in data_list[ii][:4] and (re.findall(':|\:', data_list[ii])):
- end_index = ii
- break
- #print(re.findall('\040|\||/', data_list[ii].strip()), org)
- if not re.findall('\040|\||/', data_list[ii].strip()) and org:
- end_index = ii
- break
- if len(data_list[ii]) > 80:
- end_index = ii
- break
- if data_list[ii]:
- ner_data = ner_tag(data_list[ii].strip())
- #print('\n\nnerdata:\t',ner_data)
- else:
- continue
- ner_list.append(ner_data)
- for x in ner_data:
- if x[1] == '人物类_概念' and len(x[0]) > 2:
- person_professor_list.append(x[0].strip())
-
- elif x[1] == '组织机构类_企事业单位' or x[1] == '组织机构类_教育组织机构' or x[1] == '组织机构类_国家机关':
- if not org and len(x[0]) >= 3:
- org = re.split('\040|\|/', x[0].strip())[0]
- org_index = ii
- if not org:
- for ii in range(len(ner_list)):
- if org:
- break
- ner_data = ner_list[ii]
- for x in ner_data:
- if x[1][:5] == '组织机构类':
- org = re.split('\040|\|/', x[0].strip())[0]
- break
- #print(person_professor_list)
- if not person_professor_list:
- for ii in range(len(ner_list)):
- ner_data = ner_list[ii]
- for x in ner_data:
- if x[1] == '人物类_概念':
- person_professor_list = [re.split('\040|\|/', x[0].strip())[0]]
- break
- data_line = ' '.join(data_list[:end_index])
- data_line = re.sub('\||/', ' ', data_line)
- _list_data = re.split('\040+', data_line)
- if len(_list_data) == 1 and len(data_list) == 1:
- end_index = 0
- #print(_list_data)
- if not person_professor_list:
- for x in range(len(_list_data)):
- if re.findall('经理|工程师|会计|董事长|总监|秘书|主管|处长|局长|主任|讲师|教授', _list_data[x][-4:]):
- person_professor_list.append(_list_data[x])
- if not org:
- for x in range(len(_list_data)):
- if len(_list_data[x]) < 4:
- _list_data[x] = ''
- elif person_professor_list and re.findall('|'.join(person_professor_list), _list_data[x]):
- _list_data[x] = ''
- elif '经理' == _list_data[x][-2:]:
- _list_data[x] = ''
- for x in range(len(_list_data)):
- if _list_data[x]:
- org = _list_data[x]
- break
- if not person_professor_list:
- for x in range(len(_list_data)):
- if org in _list_data[x]:
- for j in range(x+1, len(_list_data)):
- if _list_data[j]:
- person_professor_list = [_list_data[j]]
- break
- break
- #print(org, person_professor_list, job_time)
- job_dict['job_company'] = org
- job_dict['job_leval'] = ' '.join(person_professor_list)
- if not data_list[end_index:] and end_index == 3:
- end_index = 2
- if not data_list[end_index:] and end_index == 2:
- end_index = 1
- job_dict['job_content'] = re.sub('工工作作内内容容::|工工作作内内容容::|工工作作内内容容', '工作内容:', ''.join(data_list[end_index:]))
- job_dict['job_content'] = re.sub('/', '-', job_dict['job_content'])
- job_dict['start_time'] = job_dict['job_time'].split('~')[0]
- job_dict['end_time'] = job_dict['job_time'].split('~')[-1]
- normal = {"job_company":"company_name","job_content":"job_desc","job_leval":"job_name"}
- for key in normal.keys():
- if job_dict.get(key):
- job_dict[normal[key]] = job_dict[key]
- job_dict.pop(key)
- job_list.append(job_dict)
- # continue
- # if len(data_list) > 1 and data_list[1] and data_list[1][-1] == '|':# and data_list[0] and data_list[0][-1] != '|':
- # data_list[0] = data_list[0] + data_list[1]
- # data_list[1] = ''
- # elif len(data_list) > 2 and data_list[2] and data_list[2][-1] == '|' and data_list[0][-1] != '|' and '|' in str(data_list[0]) and data_list[1] and data_list[1][-1] != '|':
- # data_list[0] = data_list[0] + data_list[1] + data_list[2]
- # data_list[1] = ''
- # data_list[2] = ''
- # elif len(data_list) > 1 and data_list[1] and '工作职责:' in data_list[2]:
- # data_list[0] = data_list[0] + data_list[1]
- # data_list[1] = ''
- # elif len(data_list) > 1 and '工作职责:' in data_list[3]:
- # data_list[0] = data_list[0] + data_list[1] + data_list[2]
- # data_list[1] = ''
- # data_list[2] = ''
- # job_time = re.findall(re_txt, data_list[0])
- # job_dict['job_time'] = job_time[0]
- # _nums = re.findall('\d+', job_dict['job_time'])
- # #print(_nums)
- # if len(_nums) >= 4:
- # job_dict['job_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
- # elif len(_nums) == 2:
- # job_dict['job_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
- # data_list[0] = re.sub(job_time[0], '', data_list[0])
- # data_list[0] = data_list[0].strip()
- # data_list[0] = re.sub('历任:', ' ', data_list[0])
- # _list = data_list[0].split('|')
- # if len(_list) == 1:
- # __list = re.split('\040{2,}', data_list[0])
- # #print(__list)
- # job_dict['job_leval'] = __list[1].strip()
- # job_dict['job_company'] = __list[0].strip()
- # else:
- # job_dict['job_leval'] = _list[0].strip()
- # job_dict['job_company'] = _list[1].strip()
- # if '职级:' in data_list[1:]:
- # data_list.remove('职级:')
- # job_dict['job_content'] = re.sub('工工作作内内容容::|工工作作内内容容::|工工作作内内容容', '工作内容:', ''.join(data_list[1:]))
- # job_dict['job_content'] = re.sub('/', '-', job_dict['job_content'])
- # #print(job_dict)
- # job_list.append(job_dict)
- return job_list
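- # Each entry ends up with the keys company_name, job_name, job_desc (renamed from job_company,
- # job_leval, job_content above) plus job_time, start_time and end_time, e.g. (illustrative):
- # {"job_time": "2018-03~2021-06", "start_time": "2018-03", "end_time": "2021-06",
- #  "company_name": "某某科技有限公司", "job_name": "后端工程师", "job_desc": "工作内容:..."}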
- # Project experience (deprecated)
- # Project name is unknown here
- def get_pro_list_old(lines):
- logger.info(lines)
- pro_list = [{"Time":None,"startTime":None,"endTime":None,"pro_name":None,"job_leval":None,"job_company":None,"content":None,},]
- regex = re.compile(r'((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)[至到\W]+((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)?([今])?')
- re_con = re.compile(r'负责内容(.*?)')
- re_na = re.compile(r'\W(.*?项目)\W')
- count = 0
- for line in lines:
- regex_time = regex.search(line)
- regex_content = re_con.search(line)
- regex_name = re_na.search(line)
- if regex_time:
- if pro_list[count].get("Time"):
- pro_list.append({"Time":None,"startTime":None,"endTime":None,"pro_name":None,"job_leval":None,"job_company":None,"content":None,})
- count += 1
- pro_list[count]["startTime"] = '{:4d}-{:02d}'.format(int(regex_time.group(2)),int(regex_time.group(3)))
- if regex_time.group(5) != None:
- pro_list[count]["endTime"] = '{:4d}-{:02d}'.format(int(regex_time.group(5)),int(regex_time.group(6)))
- pro_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(regex_time.group(2)),int(regex_time.group(3)),int(regex_time.group(5)),int(regex_time.group(6)))
- else:
- pro_list[count]["endTime"] = regex_time.group(7)
- pro_list[count]['Time'] = '{:4d}-{:02d}~{}'.format(int(regex_time.group(2)),int(regex_time.group(3)),regex_time.group(7))
- elif regex_name and (not pro_list[count].get("job_name")):
- pro_list[count]["pro_name"] = regex_name.group()
- elif pro_list[count].get("content"):
- pro_list[count]["content"] += line
- else:
- try:
- for word, tag in ner_tag(line):
- if (not pro_list[count].get("job_leval")) and (tag == "人物类_概念"):
- pro_list[count]["job_leval"] = word
- if (not pro_list[count].get("job_company")) and (tag in "组织机构类_企事业单位"):
- pro_list[count]["job_company"] = word
- except Exception as e:
- logger.error(e)
- pro_list[count]["content"] = line
- return pro_list
- # Project experience (UIE, done)
- @time_this_function
- def get_pro_list(lines):
- logger.info(lines)
- starts = []
- # Find lines containing a 4-digit year
- for index, line in enumerate(lines):
- if re.search(r'\d{4}', line):
- starts.append(index)
- # Simple filtering of the start markers
- count = len(starts)
- if count == 0:
- return []
- c = (starts[-1] - starts[0])/count
- for i in range(count-1):
- if (starts[i+1]-starts[i] < c/2):
- starts[i+1] = starts[i]
- # Merge lines into project blocks
- pro_list = []
- pros = {}
- index = 0
- for i in range(len(lines)):
- if i in starts:
- index = i
- pros[index] = [lines[i], []]
- pros[index][1].append(lines[i])
- elif not pros:
- continue
- else:
- pros[index][0] += lines[i]
- pros[index][1].append(lines[i])
- # Extract
- for key in pros.keys():
- info = pro_ie(pros[key][0])
- src = pros[key][1]
- for rst in info:
- if not rst.get("时间") or not rst.get("项目名称"):
- continue
- rst["工作内容"] = [{"text":""}]
- logger.info(rst)
- for l in src:
- if rst["时间"][0]["text"] in l:
- continue
- else:
- rst["工作内容"][0]["text"] += l
- for key in rst.keys():
- if key == "时间":
- time_list = [None, None, None, None, None, None]
- tim_list = re.findall(r'\d+', rst["时间"][0]["text"])
- i = 0
- for t in tim_list:
- if (len(t) == 4) and (i != 0):
- i = 3
- time_list[i] = t
- else:
- time_list[i] = t
- i += 1
- else:
- continue
- if time_list[3] is not None:
- if time_list[4] is not None:
- rst["时间"][0]["text"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(time_list[0]),int(time_list[1]),int(time_list[3]),int(time_list[4]))
- else:
- rst["时间"][0]["text"] = "{:4d}~{:4d}".format(int(time_list[0]),int(time_list[3]))
- else:
- if time_list[1] is not None:
- rst["时间"][0]["text"] = "{:4d}-{:02d}~至今".format(int(time_list[0]),int(time_list[1]))
- else:
- rst["时间"][0]["text"] = "{:4d}~至今".format(int(time_list[0]))
- pro_list.extend([{key:rst[key][0]["text"] for key in rst.keys()} for rst in info])
- return pro_list
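- # Each entry carries the text of whichever UIE fields 时间 / 项目名称 / 机构 / 职位 were found, plus the
- # accumulated 工作内容, e.g. (illustrative): {"时间": "2020-01~2020-12", "项目名称": "xx系统", "工作内容": "..."}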
- # Training experience (done)
- # NER + word segmentation: organisation name, training programme, time
- @time_this_function
- def get_cultivate_list(lines):
- logger.info(lines)
- job_list = []
- re_txt = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今|^\d{4,4}.\d{1,2}|\d{4,4}.'
- re_txt_1 = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今'
- nums = []
- for i in range(len(lines)):
- if re.findall(re_txt, lines[i].replace(' ', '')) and re.findall('\||\040{1,}', lines[i]):
- nums.append(i)
- continue
- if re.findall(re_txt, lines[i].replace(' ', '')[:20]):
- nums.append(i)
- if len(lines[i].strip().replace(' ', '')) > 50:
- continue
- nums.append(len(lines))
- year_dict = {18:4, 17:3,20:3,19:3,21:2,22:1}
- for i in range(1, len(nums[:])):
- job_dict = {'cultivate_time':'', 'cultivate_time_beg':'', 'cultivate_time_end':'', 'cultivate_name':'','cultivate_leval':'','cultivate_content':''}
- data_list = lines[nums[i-1]:nums[i]]
- data_line = ' '.join(data_list)
- data_line = re.sub('[\|\t]', ' ', data_line)
- data_line = re.sub('-{3,}', '', data_line)
- ner_data = ner(''.join(data_list[:2]))
- org = ''
- time_list = []
- for _ in ner_data:
- if _[1] == 'ORG' and not org:
- org = _[0].strip()
- elif _[1] == 'TIME' and len(_[0]) >= 4:
- time_list.append(_[0])
- #TIME
- logger.info(data_line)
- _list_data = re.split('\040+', data_line)
- top_level = 22
- end_index = 0
- remove_list = []
- if len(_list_data) <= 2:
- end_index = 0
- #continue
- job_time = re.findall(re_txt_1, data_list[0])
- if job_time:
- job_dict['cultivate_time'] = job_time[0]
- data_list[0] = re.sub(job_time[0], '', data_list[0])
- else:
- job_dict['cultivate_time'] = ''
-
- for t in time_list:
- data_list[0] = re.sub(t, '', data_list[0])
- _list = data_list[0].split('|')
- if len(_list) >= 2:
- job_dict['cultivate_name'] = _list[0].strip()
- job_dict['cultivate_leval'] = _list[1].strip()
- end_index = 1
- _nums = re.findall('\d+', job_dict['cultivate_time'])
- if len(_nums) >= 4:
- job_dict['cultivate_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
- job_dict['cultivate_time_end'] = '%s-%02d'%(_nums[2], int(_nums[3]))
- job_dict['cultivate_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
- elif len(_nums) == 2:
- job_dict['cultivate_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
- job_dict['cultivate_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
- job_dict['cultivate_time_end'] = '%s'%('至今')
- elif len(time_list) == 2:
- nums_1 = re.findall('\d+', time_list[0])
- nums_2 = re.findall('\d+', time_list[1])
- nums_1.append('09')
- nums_2.append('07')
- job_dict['cultivate_time_beg'] = '%s-%02d'%(nums_1[0], int(nums_1[1]))
- job_dict['cultivate_time_end'] = '%s-%02d'%(nums_2[0], int(nums_2[1]))
- job_dict['cultivate_time'] = '%s-%02d~%s-%02d'%(nums_1[0], int(nums_1[1]), nums_2[0], int(nums_2[1]))
-
- elif len(time_list) == 1:
- _nums = re.findall('\d+', time_list[0])
- if '获得' in data_list[0]:
- _nums.append('01')
- _nums.insert(0, '01')
- _nums.insert(0, str(int(_nums[1]) - year_dict[top_level]))
- job_dict['cultivate_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
- job_dict['cultivate_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
- job_dict['cultivate_time_end'] = '%s-%02d'%(_nums[2], int(_nums[3]))
- else:
- _nums.append('01')
- job_dict['cultivate_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
- job_dict['cultivate_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
- job_dict['cultivate_time_end'] = '%s'%('至今')
-
- job_dict['cultivate_content'] = re.sub('培培训训内内容容::|培培训训内内容容::|培培训训内内容容', '培训内容:', ''.join(data_list[end_index:]))
- if not job_dict['cultivate_name']:
- job_dict['cultivate_name'] = org
- logger.info(job_dict)
- job_list.append(job_dict)
- continue
- '''
- #print(nums)
- for i in range(1, len(nums[:])):
- job_dict = {'cultivate_time':'', 'cultivate_name':'','cultivate_leval':'','cultivate_content':''}
- data_list = lines[nums[i-1]:nums[i]]
- if '' in data_list:
- data_list.remove('')
- if len(data_list) > 1 and data_list[1] and data_list[1][-1] == '|' and data_list[0][-1] != '|':
- data_list[0] = data_list[0] + data_list[1]
- data_list[1] = ''
- job_time = re.findall(re_txt_1, data_list[0])
- job_dict['cultivate_time'] = job_time[0]
- _nums = re.findall('\d+', job_dict['cultivate_time'])
- if len(_nums) >= 4:
- job_dict['cultivate_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
- elif len(_nums) == 2:
- job_dict['cultivate_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
- data_list[0] = re.sub(job_time[0], '', data_list[0])
- _list = data_list[0].split('|')
- if len(_list) >= 2:
- job_dict['cultivate_name'] = _list[0].strip()
- job_dict['cultivate_leval'] = _list[1].strip()
- job_dict['cultivate_content'] = re.sub('培培训训内内容容|培培训训内内容容::|培培训训内内容容::', '培训内容:', ''.join(data_list[1:]))
- else:
- job_dict['cultivate_content'] = re.sub('培培训训内内容容|培培训训内内容容::|培培训训内内容容::', '培训内容:', ''.join(data_list[0:]))
- #print(job_dict)
- '''
- return job_list
- # Language ability (done)
- @time_this_function
- def get_lag_list(lines):
- logger.info(lines)
- lan_list = []
- re_lan = re.compile(r'(\w+[语话])')
- re_lev = re.compile(r'([公共级四专八]+)')
- lag_dict = {'语言':'', '熟练度':""}
- for l in lines:
- if not l.strip():
- continue
- lan_name = re.search(re_lan, l)
- lag_lev = re.search(re_lev, l)
- if lag_lev and lag_lev.group(1):
- lag_dict["熟练度"] = lag_lev.group(1)
- if lan_name and lan_name.group(1):
- if lag_dict["语言"]:
- lan_list.append(lag_dict)
- lag_dict = {'语言':'', '熟练度':""}
- lag_dict['语言'] = lan_name.group(1)
- if lag_dict['语言']:
- lan_list.append(lag_dict)
- return lan_list
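- # Hypothetical example: get_lag_list(["英语 公共四级"]) should yield roughly
- # [{"语言": "英语", "熟练度": "公共四级"}].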
- # Family information (deprecated)
- def get_fam_list(lines):
- job_list = []
- fam_dict = {}
- for l in lines:
- if not l.strip():
- continue
- ls = l.split('|')
- if len(ls) == 1:
- continue
- fam_dict = {'fam_name':"",'fam_company':"",'fam_lable':"","fam_status":"", 'fam_job':""}
- fam_dict["fam_lable"] = ls[0].strip()
- fam_dict["fam_name"] = ls[1].strip()
- flag = 0
- if re.findall('\d岁|\d{4,5}', ls[2]):
- flag = 1
- fam_dict["fam_company"] = ls[flag+2].strip()
- fam_dict["fam_job"] = ls[flag+3].strip()
- fam_dict["fam_status"] = ls[flag+4].strip()
- #print(fam_dict)
- job_list.append(fam_dict)
- return job_list
- # Certificates: time + certificate name (legacy version)
- @time_this_function
- def get_cet_list_(lines):
- logger.info(lines)
- job_list = []
- re_txt = '\d+年\d+月|\d+-\d+|\d+\.\d+'
- lines_word = ' '.join(lines)
- lines = re.findall('\d+年\d+月|\d+-\d+|\d+\.\d+', lines_word)
- nums = []
- for x in range(len(lines) - 1):
- _index = lines_word.index(lines[x])
- _end_index = lines_word.index(lines[x+1])
- l = lines_word[_index : _end_index]
- if not l.strip():
- continue
- lines_word = lines_word[_end_index:]
- job_time = re.findall(re_txt, l)
- cet_dict = {'证书':'','获得时间':""}
- if job_time:
- cet_dict['获得时间'] = job_time[0]
- l = re.sub(job_time[0], '', l)
- else:
- continue
- ls = re.split('\||\040+|\t+', l)
- logger.info(ls)
- for l in ls:
- if len(l) <= 3:
- continue
- cet_dict['证书'] = l.strip()
- break
- job_list.append(cet_dict)
- return job_list
- # Certificates: time + certificate name (UIE, done)
- @time_this_function
- def get_cet_list(lines):
- logger.info(lines)
- cet_list = []
- for line in lines:
- info = cet_ie(line)
- cet_list.extend([{key:rst[key][0]["text"] for key in rst.keys()} for rst in info if rst.get("证书")])
- return cet_list
- # Awards: time + award name (legacy version)
- def get_prize_list_old(lines):
- logger.info(lines)
- job_list = []
- re_txt = '\d+年\d+月|\d+-\d+|\d{4,4}.\d{1,2}'
- lines_word = ' '.join(lines)
- lines = re.findall('\d+年\d+月|\d{4,4}-\d+|\d{4,4}.\d{1,2}', lines_word)
- nums = []
- for x in range(len(lines) - 1):
- _index = lines_word.index(lines[x])
- _end_index = lines_word.index(lines[x+1])
- l = lines_word[_index : _end_index]
- if not l.strip():
- continue
- lines_word = lines_word[_end_index:]
- job_time = re.findall(re_txt, l)
- cet_dict = {'prize_name':'','prize_time':""}
- if job_time:
- cet_dict['prize_time'] = job_time[0]
- l = re.sub(job_time[0], '', l)
- else:
- continue
- ls = re.split('\||\040+|\t+', l)
- logger.info(ls)
- for l in ls:
- if len(l) <= 3:
- continue
- cet_dict['prize_name'] = l.strip()
- break
- logger.info(cet_dict)
- job_list.append(cet_dict)
- return job_list
- # Awards: time + award name (UIE, done)
- @time_this_function
- def get_prize_list(lines):
- logger.info(lines)
- prize_list = []
- for line in lines:
- info = prize_ie(line)
- prize_list.extend([{key:rst[key][0]["text"] for key in rst.keys()} for rst in info if rst.get("奖项")])
- return prize_list
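- # Hypothetical example: a line like "2019年6月 获得校级一等奖学金" would typically yield an entry such as
- # {"时间": "2019年6月", "奖项": "校级一等奖学金"}, depending on what the uie-nano model extracts.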
- # Return the remaining information as-is
- def get_other_list(lines):
- other_list = []
- other_list.append("\n".join(lines))
- return other_list
- # Handling .doc files on Linux
- def doc2pdf_linux(docPath, pdfPath):
- """
- 允许的文档格式:doc,docx
- 仅在linux平台下可以
- 需要在linux中下载好libreoffice
- """
- # 注意cmd中的libreoffice要和linux中安装的一致
- cmd = 'libreoffice6.3 --headless --convert-to pdf'.split() + [docPath] + ['--outdir'] + [pdfPath]
- # cmd = 'libreoffice6.2 --headless --convert-to pdf'.split() + [docPath]
- p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
- p.wait(timeout=30)  # wait up to 30 seconds for the conversion
- stdout, stderr = p.communicate()
- if stderr:
- raise subprocess.SubprocessError(stderr)
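- # Assumption: the host provides a binary literally named libreoffice6.3 on PATH; on many installs the
- # command is just "libreoffice" or "soffice", in which case the cmd above needs to be adjusted.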
- # Handling .doc files on Windows
- def doc2pdf_win(docPath, pdfPath):
- # console.print(pdfPath+'/'+os.path.splitext(os.path.split(docPath)[-1])[0] + '.pdf')
- import win32com
- from win32com.client import DispatchEx, constants
- word = DispatchEx("Word.Application") #内部方法
- word.Visible = 1 # 后台运行,不显示
- word.DisplayAlerts = 0 # 不警告
- doc = word.Documents.Open(docPath) #转换源文件
- doc.SaveAs(pdfPath+'/'+os.path.splitext(os.path.split(docPath)[-1])[0] + '.pdf', FileFormat=17) #txt=4,html=10,docx=16,pdf=17 #新文件
- doc.Close() #关闭
- word.Quit() #退出
- # Dispatch .doc conversion by operating system
- def doc2pdf(docPath, pdfPath, system):
- """
- 注意使用绝对路径
- pdf的生成只写路径,不写名字
- """
- docPathTrue = os.path.abspath(docPath) # bugfix - searching files in windows/system32
- if system == "Linux":
- return doc2pdf_linux(docPathTrue, pdfPath)
- if system == "Windows":
- return doc2pdf_win(docPathTrue, pdfPath)
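- # Usage as called below (file name illustrative):
- # doc2pdf(docPath='./uploads/xxx.doc', pdfPath='./pdf', system=platform.system())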
- # Plain-text txt parsing (done)
- @time_this_function
- def parse_txt(path, save_dir):
- with open(path, 'r', encoding='utf-8') as fp:
- data = fp.read()
- global block, block_rev
- chun = 1
- page = {1: []}
- if len(data.split("\n")) <= 2:
- for line in data.split("\n"):
- line = line.replace("\xa0", "").replace("【","").replace("】","").replace("教育/培训","教育经历").replace("·","").strip()
- for word in line.split():
- if word in block.keys():
- chun = block[word]
- page[chun] = []
- elif word:
- page[chun].append(word)
- else:
- for line in data.split("\n"):
- line = line.replace("\xa0", "").replace("【","").replace("】","").replace("教育/培训","教育经历").replace("·","")
- regex = re.compile(u'[\u3000]+',re.UNICODE)
- line = regex.sub('', line.strip())
- if line in block.keys():
- chun = block[line]
- page[chun] = []
- elif line:
- page[chun].append(line)
- result_data = dict()
- for key in page.keys():
- for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
- if key == index:
- result_data[block_rev[index]] = func(page[index])
- filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
- with open(os.path.join('./result/' + save_dir, filename), 'w', encoding="utf-8") as fp:
- json.dump(result_data, fp, indent=4, ensure_ascii=False)
- # Plain-text word parsing
- @time_this_function
- def read_from_word(doc, path, save_dir):
- para_text = []
- for para in doc.paragraphs:
- para_text.append(para.text)
- global block, block_rev
- chun = 1
- page = {1: []}
- for line in para_text:
- regex = re.compile(u'[\uF000-\uF0FF]+',re.UNICODE)
- line = regex.sub('', line)
- if line in block.keys():
- chun = block[line]
- page[chun] = []
- elif line:
- page[chun].append(line)
- result_data = dict()
- for key in page.keys():
- for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
- if key == index:
- result_data[block_rev[index]] = func(page[index])
- filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
- with open(os.path.join('./result/' + save_dir, filename), 'w', encoding="utf-8") as fp:
- json.dump(result_data, fp, indent=4, ensure_ascii=False)
- # Extract word tables (done)
- @time_this_function
- def check_word(path, save_dir):
- doc = Document(path)
- tables = doc.tables
- if not tables:
- logger.info("this is raw text")
- read_from_word(doc, path, save_dir=save_dir)
- return
- logger.info("this is a Table")
- global block
- with open("resources/keys.json", "r", encoding="utf-8") as fp:
- prk = json.load(fp)
- chun = 1
- page = {1: []}
- regex = re.compile(r'(\(\w{2,8}\))?((\w{2,8}))?')
- for table in tables:
- lo = {}  # stores the de-duplicated cells of each row
- for row in range(0, len(table.rows)):
- row_list = []
- for col in range(0, len(table.row_cells(row))):  # collect every column of this row
- if len(''.join(table.cell(row, col).text)) <= 20:
- row_list.append(re.sub(r'(\w)\n', r'\1', table.cell(row, col).text))
- else:
- row_list.append(regex.sub("", table.cell(row, col).text.replace(" ","").replace(":", ":").replace("学历\n学位","学历学位"))) # 去除字符串中的特殊字符,并添加到临时列表中
- lo[row] = (sorted(set(row_list), key=row_list.index))  # de-duplicate the list while preserving its order
- # Remove empty entries
- for key in list(lo.keys()):
- if "" in lo[key]:
- lo[key].remove("")
- if not lo[key]:
- lo.pop(key)
- for _, line in lo.items():
- if (line[0] in block.keys()) or (line[0] in prk.keys()):
- # The row starts with a top-level section name
- if line[0] in block.keys():
- # Switch to that section
- chun = block[line[0]]
- if not page.get(chun):
- page[chun] = []
- # Drop the section name itself
- line = '\n'.join(line[1:])
- # The row starts with a sub-category key
- elif line[0] in prk.keys():
- # Switch to that section
- chun = prk[line[0]]
- if not page.get(chun):
- page[chun] = []
- # Keep the key in the text
- line = '\n'.join(line)
- else:
- line = '\n'.join(line)
- # Normalize sub-category keys
- for k in prk.keys():
- line = line.replace(k+"\n", k+":")
- page[chun].extend(line.split())
- result_data = dict()
- for key in page.keys():
- for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
- if key == index:
- result_data[block_rev[index]] = func(page[index])
- filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
- with open(os.path.join('./result/' + save_dir, filename), 'w', encoding="utf-8") as fp:
- json.dump(result_data, fp, indent=4, ensure_ascii=False)
- # pdf line-level parsing (done)
- @time_this_function
- def parse_line_layout(layout, b):
- texts = []
- """解析页面内容,一行一行的解析"""
- # bbox:
- # x0:从页面左侧到框左边缘的距离。
- # y0:从页面底部到框的下边缘的距离。
- # x1:从页面左侧到方框右边缘的距离。
- # y1:从页面底部到框的上边缘的距离
- for textbox in layout:
- if isinstance(textbox, LTTextBox) or isinstance(textbox, LTTextLine):
- for char in textbox:
- if isinstance(char, LTTextLineHorizontal):
- texts.append([char.bbox[0], char.bbox[3], char.get_text().strip()])
- # Sort by line, top of the page first
- texts.sort(key=lambda x:-x[1])
- global block, block_rev
- chun = b
- page = {chun: []}
- for _, _, line in texts:
- regex = re.compile(u'[\u007F|\u25A0|\u00B7|\uF000-\uF0FF]+',re.UNICODE)
- line = regex.sub('', line)
- regex_tips = re.compile(r'(\(.*?\))?((.*?))?')
- # line = regex_tips.sub('', line)
- line = line.strip()
- if regex_tips.sub('', line).strip() in block.keys():
- chun = block[regex_tips.sub('', line).strip()]
- page[chun] = []
- elif line:
- page[chun].append(line)
- return page, chun
- # pdf layout parsing (done)
- @time_this_function
- def read_from_pdf(path, save_dir):
- result = {}
- global block_rev
- with open(path, 'rb') as in_file:
- parser = PDFParser(in_file)  # create a PDF parser from the file object
- doc: PDFDocument = PDFDocument(parser)  # create the PDF document
- rsrcmgr = PDFResourceManager()  # create a PDF resource manager so resources can be shared
- # create a PDF device object
- laparams = LAParams()
- device = PDFPageAggregator(rsrcmgr, laparams=laparams)
- # create a PDF interpreter object
- interpreter = PDFPageInterpreter(rsrcmgr, device)
- # iterate over the pages of the document (PDFPage.create_pages) and process each one
- b = 1
- for page in PDFPage.create_pages(doc):
- logger.debug('================ 新页面 ================')
- interpreter.process_page(page)
- layout = device.get_result()
- r, b = parse_line_layout(layout, b)
- for key in r.keys():
- if result.get(key):
- result[key].extend(r[key])
- else:
- result[key] = r[key]
- result_data = dict()
- for key in result.keys():
- for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
- if key == index:
- result_data[block_rev[index]] = func(result[index])
- filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
- with open(os.path.join('./result/' + save_dir, filename), 'w', encoding="utf-8") as fp:
- json.dump(result_data, fp, indent=4, ensure_ascii=False)
- # pdf table parsing (done)
- @time_this_function
- def parse_table_from_pdf(path, save_dir):
- global block, block_rev
- lo = {}
- with pdfplumber.open(path) as pdf:
- for page in pdf.pages:
- for table in page.extract_tables():
- for line in table:
- row_list = []
- for word in line:
- row_list.append(word)
- lo[len(lo.keys())] = row_list
- # Load the sub-category keys and initialise the section buffer (the pdfplumber loop above uses the
- # name `page` as its loop variable, so the dict is created after it)
- with open("resources/keys.json", "r", encoding="utf-8") as fp:
- prk = json.load(fp)
- chun = 1
- page = {1: []}
- # Remove empty entries
- for key in list(lo.keys()):
- if "" in lo[key]:
- lo[key].remove("")
- if not lo[key]:
- lo.pop(key)
- for _, line in lo.items():
- if (line[0] in block.keys()) or (line[0] in prk.keys()):
- # The row starts with a top-level section name
- if line[0] in block.keys():
- # Switch to that section
- chun = block[line[0]]
- if not page.get(chun):
- page[chun] = []
- # Drop the section name itself
- line = '\n'.join(line[1:])
- # The row starts with a sub-category key
- elif line[0] in prk.keys():
- # Switch to that section
- chun = prk[line[0]]
- if not page.get(chun):
- page[chun] = []
- # Keep the key in the text
- line = '\n'.join(line)
- else:
- line = '\n'.join(line)
- # Normalize sub-category keys
- for k in prk.keys():
- line = line.replace(k+"\n", k+":")
- page[chun].extend(line.split())
- result_data = dict()
- for key in page.keys():
- for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
- if key == index:
- result_data[block_rev[index]] = func(page[index])
- filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
- with open(os.path.join('./result/' + save_dir, filename), 'w', encoding="utf-8") as fp:
- json.dump(result_data, fp, indent=4, ensure_ascii=False)
- # Detect pdf content type (done)
- @time_this_function
- def check_pdf(path):
- """
- # 输入:
- # pdf 文件路径
- # 输出:
- # 文件包含元素 [Word, Table]
- """
- rst = []
- for page_layout in extract_pages(path):
- for element in page_layout:
- if isinstance(element, LTFigure):
- for cell in element:
- if isinstance(cell, LTChar):
- rst.append("Table")
- break
- elif isinstance(element, LTTextContainer):
- rst.append("Word")
- return set(rst)
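- # Typical results: {"Word"} for text-layer PDFs, {"Table"} or {"Word", "Table"} when figure elements
- # containing characters are present; detection_type() uses this set to choose the parsing path.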
- # Encoding detection (done)
- def decode_path(path):
- '''zipfile extraction can garble member names; re-encode the garbled path to UTF-8'''
- try:
- path_name = path.decode('utf-8')
- except:
- path_name = path.encode('437').decode('gbk')
- path_name = path_name.encode('utf-8').decode('utf-8')
- return path_name
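- # Background: zipfile decodes non-UTF-8 member names with cp437, so Chinese (GBK) names show up
- # garbled; re-encoding the garbled string as cp437 and decoding it as GBK recovers the original name.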
- # Normalize field names for the backend
- def formatter(result, json_obj):
- normal = json_obj["base"]
- itenormal = json_obj["base"]
- edunormal = json_obj["tal_his_edu"]
- jobnormal = json_obj["tal_his_job"]
- tranornal = json_obj["tal_training_experience"]
- cetnormal = json_obj["tal_vocational_qualification_certificate"]
- rewnormal = json_obj["tal_reward_punishment"]
- family = json_obj["tal_family_social_relation"]
- # for key in normal.keys():
- # if result.get(key):
- # result[normal[key]] = result[key]
- # result.pop(key)
- for key in json_obj["base"].keys():
- if result.get("基本信息"):
- if result["基本信息"].get(key):
- result[json_obj["base"][key]] = result["基本信息"][key]
- del result["基本信息"][key]
- if result.get("求职意向"):
- if result["求职意向"].get(key):
- result[json_obj["base"][key]] = result["求职意向"][key]
- del result["求职意向"][key]
- del result["基本信息"]
- del result["求职意向"]
- if result.get("教育经历"):
- for idx in range(len(result['教育经历'])):
- for key in edunormal.keys():
- if result['教育经历'][idx].get(key):
- result['教育经历'][idx][edunormal[key]] = result['教育经历'][idx][key]
- result['教育经历'][idx].pop(key)
- if result.get("工作经历"):
- for idx in range(len(result['工作经历'])):
- for key in jobnormal.keys():
- if result['工作经历'][idx].get(key):
- result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
- result['工作经历'][idx].pop(key)
- if result.get("项目经历"):
- for key in json_obj["tal_his_project"].keys():
- for idx in range(len(result["项目经历"])):
- if result["项目经历"][idx].get(key):
- result["项目经历"][idx][json_obj["tal_his_project"][key]] = result["项目经历"][idx][key]
- del result["项目经历"][idx][key]
- if result.get("培训经历"):
- for idx in range(len(result['培训经历'])):
- for key in tranornal.keys():
- if result['培训经历'][idx].get(key):
- result['培训经历'][idx][tranornal[key]] = result['培训经历'][idx][key]
- result['培训经历'][idx].pop(key)
- if result.get("语言能力"):
- for key in json_obj["tal_language"].keys():
- for idx in range(len(result["语言能力"])):
- if result["语言能力"][idx].get(key):
- result["语言能力"][idx][json_obj["tal_language"][key]] = result["语言能力"][idx][key]
- del result["语言能力"][idx][key]
- if result.get("证书"):
- for idx in range(len(result['证书'])):
- for key in cetnormal.keys():
- if result['证书'][idx].get(key):
- result['证书'][idx][cetnormal[key]] = result['证书'][idx][key]
- result['证书'][idx].pop(key)
- if result.get("获奖情况"):
- for idx in range(len(result['获奖情况'])):
- for key in rewnormal.keys():
- if result['获奖情况'][idx].get(key):
- result['获奖情况'][idx][rewnormal[key]] = result['获奖情况'][idx][key]
- result['获奖情况'][idx].pop(key)
- if result.get("家庭成员"):
- for idx in range(len(result['家庭成员'])):
- for key in family.keys():
- if result['家庭成员'][idx].get(key):
- result['家庭成员'][idx][family[key]] = result['家庭成员'][idx][key]
- result['家庭成员'][idx].pop(key)
- tit = {
- "基本信息":"base",
- "求职意向":"intent_job",
- "教育经历":"tal_his_edu",
- "工作经历":"tal_his_job",
- "项目经历":"tal_his_project",
- "培训经历":"tal_training_experience",
- "获奖情况":"tal_reward_punishment",
- "语言能力":"tal_language",
- "证书":"tal_vocational_qualification_certificate",
- "专业技能":"tal_professional_tech_certificate",
- "家庭成员":"tal_family_social_relation",
- "其他情况说明":"intro"
- }
- for key in tit.keys():
- if result.get(key):
- result[tit[key]] = result[key]
- result.pop(key)
- return result
- # Push the results back
- def push_back(tempdir):
- for file in os.listdir('./result/' + tempdir):
- filename = os.path.join('./result/' + tempdir, file)
- with open(filename, "r", encoding="utf-8") as ff:
- rst = json.load(ff)
- rst = formatter(rst, translate)
- url = "http://192.168.1.110:9999/talent/getResumeData"
- session = requests.Session()
- session.mount('http://', HTTPAdapter(max_retries = 3))
- try:
- headers = {
- 'contentType':'Application/json'
- }
- response = session.post(url=url, headers=headers, json={"filename":file, "ResumeData":rst}, timeout=10)
- except Exception as e:
- print(e)
- logger.info({"filename":file, "ResumeData":rst})
- # Detect the incoming file type (done)
- def detection_type(path, system):
- tempdir = time.strftime("%Y_%m_%dT%H_%M_%S")
- os.mkdir('./result/' + tempdir)
- # Incoming rar archive
- if os.path.isfile(path) and path.endswith('.rar'):
- rar = rarfile.RarFile(path)
- rar.extractall('./cache/' + tempdir)
- path = "./cache/" + tempdir
- # Incoming tar.gz archive
- if os.path.isfile(path) and path.endswith('.tar.gz'):
- tf = tarfile.open(path)
- tf.extractall('./cache/' + tempdir)
- tf.close()
- path = "./cache/" + tempdir
- # Incoming .zip / .7z archive
- try:
- if os.path.isfile(path) and path.endswith('.zip'):
- ## Extraction approach 1: member names get garbled
- # f = zipfile.ZipFile(file, mode='r')
- # f.extractall(target_dir)
- ## Extraction approach 2: avoids garbled names
- with ZipFile(path, allowZip64=True) as zf:
- # Skip directory entries
- print("zf.filelist", zf.filelist)
- file_iter = (filename for filename in zf.filelist if not filename.is_dir())
- for filename in file_iter:
- # Re-encode the member name as UTF-8
- filename.filename = decode_path(filename.filename)  # guards against garbled names
- zf.extract(filename, "./cache/" + tempdir)
- path = "./cache/" + tempdir
- elif os.path.isfile(path) and path.endswith('.7z'):  # extract .7z archives
- zf = py7zr.SevenZipFile(path, mode='r')
- zf.extractall("./cache/" + tempdir)
- path = "./cache/" + tempdir
- except Exception as e:
- logger.error(e)
- # Incoming doc
- if os.path.isfile(path) and path.endswith('.doc'):
- doc2pdf(docPath = path, pdfPath = './pdf', system=system)
- newfile = './pdf/' + os.path.splitext(os.path.split(path)[-1])[0] + '.pdf'
- if os.path.exists(newfile):
- rst = check_pdf(newfile)
- if "Table" in rst:
- parse_table_from_pdf(newfile, save_dir=tempdir)
- pass
- if "Word" in rst:
- read_from_pdf(newfile, save_dir=tempdir)
- # Incoming docx
- elif os.path.isfile(path) and path.endswith('.docx'):
- check_word(path, save_dir=tempdir)
- # Incoming pdf
- elif os.path.isfile(path) and path.endswith('.pdf'):
- rst = check_pdf(path)
- if "Table" in rst:
- parse_table_from_pdf(path, save_dir=tempdir)
- if "Word" in rst:
- read_from_pdf(path, save_dir=tempdir)
- # Incoming txt
- elif os.path.isfile(path) and path.endswith('.txt'):
- parse_txt(path, save_dir=tempdir)
- # Incoming directory
- elif os.path.isdir(path):
- for filename in os.listdir(path):
- filename = os.path.join(path, filename)
- # doc file
- logger.info(filename)
- if filename.endswith('.doc') and not filename.startswith('.~'):
- doc2pdf(docPath = filename, pdfPath = './pdf', system=system)
- newfile = './pdf/' + os.path.splitext(os.path.split(filename)[-1])[0] + '.pdf'
- if os.path.exists(newfile):
- rst = check_pdf(newfile)
- if "Table" in rst:
- parse_table_from_pdf(newfile, save_dir=tempdir)
- pass
- if "Word" in rst:
- read_from_pdf(newfile, save_dir=tempdir)
- # docx file
- elif os.path.isfile(filename) and filename.endswith('.docx'):
- check_word(filename, save_dir=tempdir)
- # pdf file
- if os.path.isfile(filename) and filename.endswith('.pdf'):
- rst = check_pdf(filename)
- if "Table" in rst:
- parse_table_from_pdf(filename, save_dir=tempdir)
- pass
- if "Word" in rst:
- read_from_pdf(filename, save_dir=tempdir)
- # txt file
- elif os.path.isfile(filename) and filename.endswith('.txt'):
- parse_txt(filename, save_dir=tempdir)
- # Push to the backend
- push_back(tempdir)
- @app.post("/resume_parse")
- async def file_upload(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
- """
- Resume upload
- Accepted formats: pdf, docx, doc, txt, tar.gz, zip, 7z, rar
- """
- res = await file.read()
- with open('./uploads/' + file.filename, "wb") as f:
- f.write(res)
- background_tasks.add_task(detection_type, './uploads/' + file.filename, platform.system())
- return {"errno": 0, "msg": "{} Upload Success".format(file.filename)}
- if __name__ == '__main__':
- uvicorn.run(app=app, host="0.0.0.0", port=8320)