# resume_parse.py — generic resume extraction service.
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. # 通用简历抽取
  4. import os
  5. import re
  6. import json
  7. import time
  8. import platform
  9. import subprocess
  10. import rarfile
  11. import py7zr
  12. import tarfile
  13. from zipfile import ZipFile
  14. import requests
  15. from requests.adapters import HTTPAdapter
  16. from docx import Document
  17. from docx.shared import Inches
  18. from pdfminer.pdfpage import PDFPage
  19. from pdfminer.pdfparser import PDFParser
  20. from pdfminer.pdfdocument import PDFDocument
  21. from pdfminer.high_level import extract_pages
  22. from pdfminer.converter import PDFPageAggregator
  23. from pdfminer.layout import LTTextContainer, LTChar, LTLine, LAParams, LTTextBox, LTFigure, LTImage, LTText, LTAnno, LTTextLine, LTTextLineHorizontal
  24. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  25. import pdfplumber
  26. from paddlenlp import Taskflow
  27. from logger import Logger
  28. logger = Logger("resume_parse")
  29. logger.set_file_handler(filename='journal.log')
  30. from rich.console import Console
  31. console = Console()
  32. global ner, ner_tag, base_info_ie, prize_ie, cet_ie, pro_ie, block, block_rev, translate
  33. if not locals().get("ner"):
  34. ner = Taskflow("ner", mode='fast')
  35. if not locals().get("ner_tag"):
  36. ner_tag = Taskflow("ner")
  37. if not locals().get("base_info_ie"):
  38. base_info_ie = Taskflow('information_extraction', schema=["姓名","性别","电子邮箱","政治面貌","手机号码","籍贯","出生日期","现任职务","参加工作时间","当前单位","所在城市"], model="uie-tiny")
  39. if not locals().get("prize_ie"):
  40. prize_ie = Taskflow('information_extraction', schema=["时间", "奖项"], model="uie-nano")
  41. if not locals().get("cet_ie"):
  42. cet_ie = Taskflow('information_extraction', schema=["时间","证书"], model="uie-nano")
  43. if not locals().get("pro_ie"):
  44. pro_ie = Taskflow("information_extraction", schema=["时间","项目名称","机构","职位"], task_path='./resources/model_best')
  45. if not locals().get("block"):
  46. with open("resources/SegmentName.json", "r", encoding="utf-8") as fp:
  47. block = json.load(fp)
  48. if not locals().get("block_rev"):
  49. block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经历", 5:"项目经历", 6:"专业技能", 7:"intro", 8:"兴趣爱好", 9:"语言能力", 10:"证书", 11:"获奖情况", 12:"培训经历", 13:"家庭成员", "other":"其他"}
  50. if not locals().get("translate"):
  51. with open("./resources/translate.json", "r", encoding="utf-8") as ft:
  52. translate = json.load(ft)
  53. if not os.path.exists("./uploads"):
  54. os.mkdir("./uploads")
  55. if not os.path.exists("./pdf"):
  56. os.mkdir("./pdf")
  57. if not os.path.exists("./cache"):
  58. os.mkdir("./cache")
  59. if not os.path.exists("./result"):
  60. os.mkdir("./result")
  61. import uvicorn
  62. from fastapi import BackgroundTasks, FastAPI, File, UploadFile
  63. app = FastAPI()
  64. from functools import wraps
  65. def time_this_function(func):
  66. @wraps(func)
  67. def wrapper(*args,**kwargs):
  68. start=time.time()
  69. result=func(*args, **kwargs)
  70. end=time.time()
  71. console.print("函数:",func.__name__,"运行时间:", round(end - start, 4),"s")
  72. return result
  73. return wrapper
  74. # 基本信息(旧版)
  75. @time_this_function
  76. def get_base_info_old(lines):
  77. logger.info(lines)
  78. datas = "".join(lines)
  79. ner_list = ner(datas)
  80. concat_list = []
  81. for w, t in ner_list:
  82. if concat_list and (t == concat_list[-1][-1]):
  83. concat_list[-1][0] += w
  84. else:
  85. concat_list.append([w, t])
  86. schema = {
  87. '姓名': None,
  88. }
  89. for line in [' '.join(' '.join(lines).split('\n'))]:
  90. line = line.replace(r'[ ]{5,}','\n')
  91. w = re.sub(r'[\W]+(\w[::])[\W]{0,}\w', r'\1', line)
  92. for i in w.split():
  93. if ':' in i:
  94. try:
  95. key, val = i.split(':')
  96. schema[key] = val
  97. except Exception as e:
  98. logger.error(e)
  99. if not schema.get('姓名'):
  100. schema['姓名'] = re.search(r'[姓名::]{3,}(\w{2,4})', w).group(1) if re.search(r'[姓名::]{3,}(\w{2,4})', w) else None
  101. if not schema.get('姓名'):
  102. for word, tag in ner_tag(w):
  103. if tag == "人物类_实体":
  104. schema['姓名'] = word
  105. if not schema.get('性别'):
  106. schema['性别'] = re.search(r'[男女]', w).group() if re.search(r'[男女]', w) else None
  107. # if not schema.get('婚姻状况'):
  108. # schema['婚姻状况'] = re.search(r'[已未]婚', w).group() if re.search(r'[已未]婚', w) else None
  109. if not schema.get('邮箱地址'):
  110. schema['邮箱地址'] = re.search(r'([.\w]+@[.\w]+)', w).group() if re.search(r'([.\w]+@[.\w]+)', w) else None
  111. if not schema.get('政治面貌'):
  112. schema['政治面貌'] = re.search(r'[预备中共党团员群众无派人士]{2,6}', w).group() if re.search(r'[预备中共党团员群众无派人士]{2,6}', w) else None
  113. if not schema.get('手机号码'):
  114. schema['手机号码'] = re.search(r'\W(1[\d]{10})\W', w).group(1) if re.search(r'\W(1[\d]{10})\W', w) else None
  115. if not schema.get('出生年月'):
  116. schema['出生年月'] = re.search(r'\d{4}[./年\-]\d{1,2}[月]', w).group() if re.search(r'\d{4}[./年\-]\d{1,2}[月]', w) else None
  117. # if not schema.get('当前职位'):
  118. # schema['当前职位'] = re.search(r'[当前职位: ]{3,}(\w)+', w).group() if re.search(r'[当前职位: ]{3,}(\w)+', w) else None
  119. # if not schema.get('参加工作时间'):
  120. # schema['参加工作时间'] = re.search(r'[参加工作时间:]{3,}(\d{4}[./年\-]\d{1,2}[月])', w).group(1) if re.search(r'[参加工作时间:]{3,}(\d{4}[./年\-]\d{1,2}[月])', w) else None
  121. for idx, (w, t) in enumerate(concat_list):
  122. if t == "LOC":
  123. if not schema.get("籍贯"):
  124. if re.search(r'[出生籍贯户]+', concat_list[idx-1][0]) or re.search(r'[出生籍贯户]+', concat_list[idx-2][0]):
  125. schema["籍贯"] = w
  126. if t == "TIME":
  127. if not schema.get("出生年月"):
  128. if re.search(r'[出生日期年月]+', concat_list[idx-1][0]) or re.search(r'[出生日期年月]+', concat_list[idx-2][0]):
  129. schema["出生年月"] = w
  130. if not schema.get("参加工作时间"):
  131. if re.search(r'[参加工作时间]+', concat_list[idx-1][0]) or re.search(r'[参加工作时间]+', concat_list[idx-2][0]):
  132. schema["参加工作时间"] = w
  133. return {key:value for key, value in schema.items() if value}
  134. # 基本信息(OIE 已完成)
  135. @time_this_function
  136. def get_base_info(lines):
  137. if not lines:
  138. return
  139. logger.info(lines)
  140. data = " ".join(lines)
  141. rst = base_info_ie(data)[0]
  142. if rst.get("出生日期"):
  143. dates = re.findall(r'\d+' ,rst["出生日期"][0]["text"])
  144. if len(dates) == 1:
  145. if len(dates[0]) > 4:
  146. rst["出生日期"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0][:4]), int(dates[0][4:6]))
  147. else:
  148. rst["出生日期"][0]["text"] = "{:4d}-01-01".format(int(dates[0][:4]))
  149. elif len(dates) == 2:
  150. rst["出生日期"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  151. elif len(dates) == 3:
  152. rst["出生日期"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  153. if rst.get("参加工作时间"):
  154. dates = re.findall(r'\d+' ,rst["参加工作时间"][0]["text"])
  155. if len(dates) == 1:
  156. if len(dates[0]) > 4:
  157. rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0][:4]), int(dates[0][4:6]))
  158. else:
  159. rst["参加工作时间"][0]["text"] = "{:4d}-01-01".format(int(dates[0]))
  160. elif len(dates) == 2:
  161. rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  162. elif len(dates) == 3:
  163. rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  164. return {key:rst[key][0]["text"] for key in rst.keys()}
  165. # 求职意向(已完成)
  166. @time_this_function
  167. def get_job_intention(lines):
  168. logger.info(lines)
  169. schema = {}
  170. for line in lines:
  171. regex = re.compile(r'\W{0,3}[::]\s+')
  172. line = regex.sub(':', line)
  173. for i in line.split():
  174. if ":" in i:
  175. try:
  176. key, val = i.split(":")
  177. schema[key] = val
  178. except Exception as e:
  179. logger.error(e)
  180. return schema
  181. # 教育经历 (已停用)
  182. # ner + 分词 (判断学校,时间,学历) 专业需要单独处理。
  183. def get_edu_list_old(lines):
  184. logger.info(lines)
  185. job_list = []
  186. job_dict = {'edu_time_beg':'', 'edu_time_end':'', 'edu_name':'','edu_leval':'','edu_domain':'', 'edu_statue':0}
  187. re_txt = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今|^\d{4,4}.\d{1,2}|19\d{2,2}.|20\d{2,2}.'
  188. re_txt_1 = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今'
  189. nums = []
  190. for i in range(len(lines)):
  191. if re.findall(re_txt, lines[i]):
  192. nums.append(i)
  193. nums.append(len(lines))
  194. edu_level = {'本科':18, "大专":17, "博士研究生":20, "学士":18, "博士":20, "硕士":19, "研究生":19, "博后":21, '博士后':21}
  195. year_dict = {18:4, 17:3,20:3,19:3,21:2}
  196. edu_dict = {18:'本科', 17:'大专',20:'博士研究生',19:'硕士',21:'博士后'}
  197. edu_list = []
  198. for i in range(1, len(nums[:])):
  199. job_dict = {'edu_time_beg':'', 'edu_time_end':'', 'edu_name':'','edu_leval':'','edu_domain':''}
  200. data_list = lines[nums[i-1]:nums[i]]
  201. if len(data_list) > 1 and data_list[1] and data_list[1][-1] == '|' and data_list[0][-1] != '|':
  202. data_list[0] = data_list[0] + data_list[1]
  203. data_list[1] = ''
  204. if len(data_list) > 2 and data_list[2] and data_list[2][-1] == '|' and data_list[0][-1] != '|' and '|' in str(data_list[0]) and data_list[1] and data_list[1][-1] != '|':
  205. data_list[0] = data_list[0] + data_list[1] + data_list[2]
  206. data_list[1] = ''
  207. data_list[2] = ''
  208. if '' in data_list:
  209. data_list.remove('')
  210. data_line = ' '.join(data_list)
  211. data_line = re.sub('[\|]', ' ', data_line)
  212. data_line = re.sub('-{3,}', '', data_line)
  213. ner_data = ner(''.join(data_list[:2]))
  214. org = ''
  215. time_list = []
  216. for jj in range(1, len(ner_data)):
  217. if ner_data[jj][1] == ner_data[jj-1][1]:
  218. ner_data[jj] = list(ner_data[jj])
  219. ner_data[jj][0] = ner_data[jj-1][0] + ner_data[jj][0]
  220. ner_data[jj-1] = ('','')
  221. for _ in ner_data:
  222. if _[1] == 'ORG' and not org:
  223. org = _[0].strip()
  224. elif _[1] == 'TIME' and len(_[1]) >= 4:
  225. time_list.append(_[0])
  226. #TIME
  227. # print(data_line)
  228. _list_data = re.split('\040+',data_line)
  229. top_level = 18
  230. remove_list = []
  231. logger.info(_list_data)
  232. logger.info(time_list)
  233. for ii in range(len(_list_data)):
  234. for t in time_list:
  235. if t in _list_data[ii]:
  236. _list_data[ii] = ''
  237. break
  238. for i in range(len(_list_data)):
  239. #if org in _list_data[i]:
  240. # _list_data[i] = ''
  241. if re.findall('^\d{4,4}', _list_data[i]):
  242. _list_data[i] = ''
  243. _data = re.findall('本科|学士|硕士|博士研究生|博士后|博后|博士|研究生|大专', _list_data[i])
  244. if not _data:
  245. continue
  246. top_level = edu_level[_data[0]]
  247. _list_data[i] = ''
  248. break
  249. #remove_list.append(i)
  250. logger.info(_list_data)
  251. job_time = re.findall(re_txt_1, data_list[0])
  252. if job_time:
  253. job_dict['edu_time'] = job_time[0]
  254. else:
  255. job_dict['edu_time'] = ''
  256. _nums = re.findall('\d+', job_dict['edu_time'])
  257. if len(_nums) >= 4:
  258. job_dict['edu_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
  259. job_dict['edu_time_end'] = '%s-%02d'%(_nums[2], int(_nums[3]))
  260. job_dict['edu_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
  261. elif len(_nums) == 2:
  262. job_dict['edu_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
  263. job_dict['edu_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
  264. job_dict['edu_time_end'] = '%s'%('至今')
  265. elif len(time_list) == 2:
  266. nums_1 = re.findall('\d+', time_list[0])
  267. nums_2 = re.findall('\d+', time_list[1])
  268. nums_1.append('09')
  269. nums_2.append('07')
  270. job_dict['edu_time_beg'] = '%s-%02d'%(nums_1[0], int(nums_1[1]))
  271. try:
  272. job_dict['edu_time_end'] = '%s-%02d'%(nums_2[0], int(nums_2[1]))
  273. except:
  274. job_dict['edu_time_end'] = None
  275. try:
  276. job_dict['edu_time'] = '%s-%02d~%s-%02d'%(nums_1[0], int(nums_1[1]), nums_2[0], int(nums_2[1]))
  277. except:
  278. job_dict['edu_time'] = '%s-%02d~今'%(nums_1[0], int(nums_1[1]))
  279. elif len(time_list) == 1:
  280. _nums = re.findall('\d+', time_list[0])
  281. if '毕业' in data_list[0]:
  282. _nums.append('06')
  283. _nums.insert(0, '09')
  284. _nums.insert(0, str(int(_nums[1]) - year_dict[top_level]))
  285. job_dict['edu_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
  286. job_dict['edu_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
  287. job_dict['edu_time_end'] = '%s-%02d'%(_nums[2], int(_nums[3]))
  288. else:
  289. _nums.append('09')
  290. job_dict['edu_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
  291. job_dict['edu_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
  292. job_dict['edu_time_end'] = '%s'%('至今')
  293. job_dict['edu_leval'] = edu_dict[top_level]
  294. if org:
  295. job_dict['edu_name'] = org
  296. else:
  297. job_dict['edu_name'] = ''
  298. edu_domain = ''
  299. for i in range(len(_list_data)):
  300. if org in _list_data[i]:
  301. continue
  302. if not _list_data[i] and '专业' in _list_data[i]:
  303. edu_domain = _list_data[i]
  304. if not edu_domain:
  305. for i in range(len(_list_data)):
  306. if org in _list_data[i]:
  307. continue
  308. if _list_data[i] and len(_list_data[i]) >= 3:
  309. edu_domain = _list_data[i]
  310. break
  311. if not edu_domain:
  312. for i in range(len(_list_data)):
  313. if org in _list_data[i]:
  314. for j in range(i+1, len(_list_data)):
  315. if _list_data[i] and len(_list_data[j]) >= 2:
  316. edu_domain = _list_data[j]
  317. break
  318. break
  319. job_dict['edu_domain'] = edu_domain
  320. if len(job_list) ==0:
  321. job_list.append(job_dict)
  322. else:
  323. if job_dict in job_list:
  324. continue
  325. if not job_dict['edu_time']:
  326. continue
  327. if int(job_dict['edu_time'][:4]) > int(job_list[-1]['edu_time'][:4]):
  328. job_list = [job_dict] + job_list
  329. else:
  330. job_list.append(job_dict)
  331. continue
  332. data_list[0] = re.sub(job_time[0], '', data_list[0])
  333. _list = re.split('\|\040+', data_list[0])
  334. #print(_list)
  335. if len(_list) == 1:
  336. __list = re.split('\040+', data_list[0])
  337. job_dict['edu_name'] = __list[1].strip()
  338. job_dict['edu_domain'] = __list[2].strip()
  339. job_dict['edu_leval'] = __list[3].strip()
  340. else:
  341. #if job_dict['edu_leval'] not in
  342. if len(_list) > 3:
  343. job_dict['edu_name'] = _list[2].strip()
  344. job_dict['edu_domain'] = _list[3].strip()
  345. job_dict['edu_leval'] = _list[1].strip()
  346. else:
  347. job_dict['edu_leval'] = _list[0].strip()
  348. job_dict['edu_name'] = _list[1].strip()
  349. job_dict['edu_domain'] = _list[2].strip()
  350. if '硕士' in _list[0] or '研究生' in _list[0]:
  351. job_dict['edu_leval'] = '硕士'
  352. elif '博士' in _list[0]:
  353. job_dict['edu_leval'] = '博士'
  354. elif '本科' in _list[0]:
  355. job_dict['edu_leval'] = '本科'
  356. elif '学士' in _list[0]:
  357. job_dict['edu_leval'] = '本科'
  358. # print(job_dict)
  359. if len(job_list) ==0:
  360. job_list.append(job_dict)
  361. else:
  362. if job_dict in job_list:
  363. continue
  364. if int(job_dict['edu_time'][:4]) > int(job_list[-1]['edu_time'][:4]):
  365. job_list = [job_dict] + job_list
  366. else:
  367. job_list.append(job_dict)
  368. #edu_list.append(job_dict['edu_time'] + job_dict['edu_name'] + job_dict['edu_domain'] + job_dict['edu_leval'])
  369. #if job_list[0]['edu_leval'] not in ['硕士', '博士', '本科', '博后'] and len(job_list[0]['edu_leval']) > 5:
  370. # job_list[0]['edu_leval'] = '本科'
  371. return job_list
  372. # 教育经历改 (已完成)
  373. @time_this_function
  374. def get_edu_list(lines):
  375. logger.info(lines)
  376. edu_list = [{"Time":None, "start_time":None, "end_time":None, "school_name":None, "major":None, "degree":None}]
  377. regex_time = re.compile(r'((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?[\d]{0,2})[至到\W]+((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?)?([今])?|(\d{4})[至\W]+([\d今]{4})')
  378. regex_end = re.compile(r'毕业时间[\w\W]{0,5}(\d{4})[\W年]?(\d{0,2})[月\W]?')
  379. regex_level = re.compile(r'[大本专科硕博士研究生后]{2,}')
  380. regex_domain = re.compile(u'[\u4E00-\u9FA5]{2,10}', re.UNICODE)
  381. count = 0
  382. for line in lines:
  383. line = line.replace("学士","本科").replace("专业","").replace("学位","")
  384. for cell in re.split(r'[·\|\t]', line):
  385. if not cell.strip():
  386. continue
  387. flags = 0
  388. edu_time = regex_time.search(cell)
  389. edu_end_time = regex_end.search(cell)
  390. edu_level = regex_level.search(cell)
  391. edu_domain = regex_domain.search(cell)
  392. # 标准时间格式
  393. if edu_time:
  394. # 提交信息
  395. if edu_list[count].get("Time") and edu_list[count].get("school_name"):
  396. edu_list.append({"Time":None, "start_time":None, "end_time":None, "school_name":None, "major":None, "degree":None})
  397. count += 1
  398. edu_list[count]["start_time"] = '{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)))
  399. # 年月日
  400. if edu_time.group(5) != None:
  401. edu_list[count]["end_time"] = '{:4d}-{:02d}'.format(int(edu_time.group(5)),int(edu_time.group(6)))
  402. edu_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)),int(edu_time.group(5)),int(edu_time.group(6)))
  403. # 只有年
  404. elif edu_time.group(8) != None:
  405. edu_list[count]["Time"] = '{:4d}~{:4d}'.format(int(edu_time.group(8)),int(edu_time.group(9)))
  406. edu_list[count]["start_time"] = '{:4d}'.format(int(edu_time.group(8)))
  407. edu_list[count]["end_time"] = '{:4d}'.format(int(edu_time.group(9)))
  408. # 至今类
  409. elif edu_time.group(7):
  410. edu_list[count]["end_time"] = edu_time.group(7)
  411. edu_list[count]['Time'] = '{:4d}-{:02d}~{}'.format(int(edu_time.group(2)),int(edu_time.group(3)),edu_time.group(7))
  412. flags = 1
  413. # 只有毕业时间
  414. elif edu_end_time:
  415. # 提交信息
  416. if edu_list[count].get("end_time") and edu_list[count].get("school_name"):
  417. edu_list.append({"Time":None, "start_time":None, "end_time":None, "school_name":None, "major":None, "degree":None})
  418. count += 1
  419. # 年月
  420. if edu_end_time.group(2):
  421. edu_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(edu_end_time.group(1))-3,int(edu_end_time.group(2)),int(edu_end_time.group(1)),int(edu_end_time.group(2)))
  422. edu_list[count]["end_time"] = '{:4d}-{:02d}'.format(int(edu_end_time.group(1)),int(edu_end_time.group(2)))
  423. # 只有年
  424. elif edu_end_time.group(1):
  425. edu_list[count]["Time"] = '{:4d}~{:4d}'.format(int(edu_end_time.group(1))-3,int(edu_end_time.group(1)))
  426. edu_list[count]["end_time"] = '{:4d}'.format(int(edu_end_time.group(1)))
  427. # 学历
  428. if (not edu_list[count].get("degree")) and edu_level:
  429. edu_list[count]["degree"] = edu_level.group(0)
  430. # WordTag 识别 学校/专业
  431. for word, tag in ner_tag(cell):
  432. if (not edu_list[count].get("school_name")) and (tag == "组织机构类_教育组织机构"):
  433. edu_list[count]["school_name"] = word.strip()
  434. flags = 1
  435. elif (not edu_list[count].get("major")) and (tag in "_术语类型"):
  436. edu_list[count]["major"] = word.strip()
  437. elif edu_list[count].get("school_name") and edu_list[count].get("major"):
  438. break
  439. # LAC 识别 学校
  440. else:
  441. for word, tag in ner(cell):
  442. if (tag == "ORG"):
  443. edu_list[count]["school_name"] = word
  444. flags = 1
  445. break
  446. # 未识别成功时填充专业
  447. if (not (edu_level or flags or edu_list[count].get("major"))) and edu_domain:
  448. edu_list[count]["major"] = edu_domain.group(0)
  449. # 剔除时间不存在、学校不存在的列
  450. if (not edu_list[-1].get("Time")) or (not edu_list[-1].get("school_name")):
  451. edu_list.pop()
  452. return edu_list
  453. # 工作经历 (已完成)
  454. # ner + 分词 机构信息,人物身份信息,时间 工作内容区分判断
  455. # 其中,时间是判断是否下一份工作情况的主要标识符之一。字符数量
  456. # 时间类 数量词
  457. @time_this_function
  458. def get_job_list(lines):
  459. logger.info(lines)
  460. job_list = []
  461. re_txt = '\d{4,4}\040{0,2}.\d+\040{0,2}.?\040{0,2}[\-–至-\—~]{1,2}\040{0,2}\d{4,4}\040{0,2}.\040{0,2}\d+.?|\d{4,4}.\d+.?\040{0,2}[\-–-—]{0,2}\040{0,2}至?今|\d{4,4}.\d+.?\040{0,2}[\-–-]{1,2}\040{0,2}现在|\d{4,4}年\d+月\-\d{4,4}年\d+月|\d{4,4}年\d+月\-\~|\d{4,4}年\d+月[\-\~-]至今|\d{4,4}-\d+\040{0,2}[-\~至]\040{0,2}\d{4,4}-\d+|\d{4,4}-\d+\~|\d{4,4}-\d+\[~-]至今|\d{4,4}-\d+\040{0,2}至今'
  462. nums = []
  463. for i in range(len(lines)):
  464. #print(lines[i])
  465. #print(lines[i], re.findall(re_txt, lines[i]), re.findall('\||\040{1,}', lines[i]))
  466. if re.findall(re_txt, lines[i].replace(' ', '')) and re.findall('\||\040{1,}', lines[i]):
  467. nums.append(i)
  468. continue
  469. if re.findall(re_txt, lines[i].replace(' ', '')[:20]):
  470. nums.append(i)
  471. continue
  472. if len(lines[i].strip().replace(' ', '')) > 50:
  473. continue
  474. year_list = re.findall('19\d{2,2}.\d{1,2}|20\d{2,2}.\d{1,2}', lines[i])
  475. if len(year_list) >= 2:
  476. nums.append(i)
  477. elif len(year_list) == 1 and '至今' in lines[i]:
  478. nums.append(i)
  479. nums.append(len(lines))
  480. # logger.info(nums)
  481. logger.info('get_job_list :{}'.format(nums))
  482. for i in range(1, len(nums[:])):
  483. job_dict = {'job_time':'', 'job_leval':'','job_company':'','job_content':''}
  484. data_list = lines[nums[i-1]:nums[i]]
  485. if '' in data_list:
  486. data_list.remove('')
  487. org = ''
  488. person_professor_list = []
  489. org_index = -1
  490. end_index = 3
  491. job_time = re.findall(re_txt, data_list[0])
  492. if not job_time:
  493. year_list = re.findall('19\d{2,2}.\d{1,2}|20\d{2,2}.\d{1,2}', data_list[0])
  494. if len(year_list) >= 2:
  495. job_time = ['-'.join(year_list)]
  496. elif len(year_list) == 1 and '至今' in lines[i]:
  497. job_time = [year_list[0] + '~' + '至今']
  498. if not job_time:
  499. regex = re.compile(r'((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)[至到\W]+((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)?([今])?')
  500. job_time = [re.search(regex, data_list[0]).group(0)]
  501. job_dict['job_time'] = job_time[0]
  502. _nums = re.findall('\d{1,4}', job_dict['job_time'])
  503. #print(_nums)
  504. if len(_nums) >= 4:
  505. job_dict['job_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
  506. elif len(_nums) == 2:
  507. job_dict['job_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
  508. if re.findall('\d', job_time[0]):
  509. job_time[0] = job_time[0][:-1]
  510. data_list[0] = re.sub(job_time[0], '', data_list[0])
  511. data_list[0] = data_list[0].strip()
  512. ner_list = []
  513. for ii in range(len(data_list[:3])):
  514. if '工作' in data_list[ii][:4] and (re.findall(':|\:', data_list[ii])):
  515. end_index = ii
  516. break
  517. #print(re.findall('\040|\||/', data_list[ii].strip()), org)
  518. if not re.findall('\040|\||/', data_list[ii].strip()) and org:
  519. end_index = ii
  520. break
  521. if len(data_list[ii]) > 80:
  522. end_index = ii
  523. break
  524. if data_list[ii]:
  525. ner_data = ner_tag(data_list[ii].strip())
  526. #print('\n\nnerdata:\t',ner_data)
  527. else:
  528. continue
  529. ner_list.append(ner_data)
  530. for x in ner_data:
  531. if x[1] == '人物类_概念' and len(x[0]) > 2:
  532. person_professor_list.append(x[0].strip())
  533. elif x[1] == '组织机构类_企事业单位' or x[1] == '组织机构类_教育组织机构' or x[1] == '组织机构类_国家机关':
  534. if not org and len(x[0]) >= 3:
  535. org = re.split('\040|\|/', x[0].strip())[0]
  536. org_index = ii
  537. if not org:
  538. for ii in range(len(ner_list)):
  539. if org:
  540. break
  541. ner_data = ner_list[ii]
  542. for x in ner_data:
  543. if x[1][:5] == '组织机构类':
  544. org = re.split('\040|\|/', x[0].strip())[0]
  545. break
  546. #print(person_professor_list)
  547. if not person_professor_list:
  548. for ii in range(len(ner_list)):
  549. ner_data = ner_list[ii]
  550. for x in ner_data:
  551. if x[1] == '人物类_概念':
  552. person_professor_list = [re.split('\040|\|/', x[0].strip())[0]]
  553. break
  554. data_line = ' '.join(data_list[:end_index])
  555. data_line = re.sub('\||/', ' ', data_line)
  556. _list_data = re.split('\040+', data_line)
  557. if len(_list_data) == 1 and len(data_list) == 1:
  558. end_index = 0
  559. #print(_list_data)
  560. if not person_professor_list:
  561. for x in range(len(_list_data)):
  562. if re.findall('经理|工程师|会计|董事长|总监|秘书|主管|处长|局长|主任|讲师|教授', _list_data[x][-4:]):
  563. person_professor_list.append(_list_data[x])
  564. if not org:
  565. for x in range(len(_list_data)):
  566. if len(_list_data[x]) < 4:
  567. _list_data[x] = ''
  568. elif person_professor_list and re.findall('|'.join(person_professor_list), _list_data[x]):
  569. _list_data[x] = ''
  570. elif '经理' == _list_data[x][-2:]:
  571. _list_data[x] = ''
  572. for x in range(len(_list_data)):
  573. if _list_data[x]:
  574. org = _list_data[x]
  575. break
  576. if not person_professor_list:
  577. for x in range(len(_list_data)):
  578. if org in _list_data[x]:
  579. for j in range(x+1, len(_list_data)):
  580. if _list_data[j]:
  581. person_professor_list = [_list_data[j]]
  582. break
  583. break
  584. #print(org, person_professor_list, job_time)
  585. job_dict['job_company'] = org
  586. job_dict['job_leval'] = ' '.join(person_professor_list)
  587. if not data_list[end_index:] and end_index == 3:
  588. end_index = 2
  589. if not data_list[end_index:] and end_index == 2:
  590. end_index = 1
  591. job_dict['job_content'] = re.sub('工工作作内内容容::|工工作作内内容容::|工工作作内内容容', '工作内容:', ''.join(data_list[end_index:]))
  592. job_dict['job_content'] = re.sub('/', '-', job_dict['job_content'])
  593. job_dict['start_time'] = job_dict['job_time'].split('~')[0]
  594. job_dict['end_time'] = job_dict['job_time'].split('~')[-1]
  595. normal = {"job_company":"company_name","job_content":"job_desc","job_leval":"job_name"}
  596. for key in normal.keys():
  597. if job_dict.get(key):
  598. job_dict[normal[key]] = job_dict[key]
  599. job_dict.pop(key)
  600. job_list.append(job_dict)
  601. # continue
  602. # if len(data_list) > 1 and data_list[1] and data_list[1][-1] == '|':# and data_list[0] and data_list[0][-1] != '|':
  603. # data_list[0] = data_list[0] + data_list[1]
  604. # data_list[1] = ''
  605. # elif len(data_list) > 2 and data_list[2] and data_list[2][-1] == '|' and data_list[0][-1] != '|' and '|' in str(data_list[0]) and data_list[1] and data_list[1][-1] != '|':
  606. # data_list[0] = data_list[0] + data_list[1] + data_list[2]
  607. # data_list[1] = ''
  608. # data_list[2] = ''
  609. # elif len(data_list) > 1 and data_list[1] and '工作职责:' in data_list[2]:
  610. # data_list[0] = data_list[0] + data_list[1]
  611. # data_list[1] = ''
  612. # elif len(data_list) > 1 and '工作职责:' in data_list[3]:
  613. # data_list[0] = data_list[0] + data_list[1] + data_list[2]
  614. # data_list[1] = ''
  615. # data_list[2] = ''
  616. # job_time = re.findall(re_txt, data_list[0])
  617. # job_dict['job_time'] = job_time[0]
  618. # _nums = re.findall('\d+', job_dict['job_time'])
  619. # #print(_nums)
  620. # if len(_nums) >= 4:
  621. # job_dict['job_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
  622. # elif len(_nums) == 2:
  623. # job_dict['job_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
  624. # data_list[0] = re.sub(job_time[0], '', data_list[0])
  625. # data_list[0] = data_list[0].strip()
  626. # data_list[0] = re.sub('历任:', ' ', data_list[0])
  627. # _list = data_list[0].split('|')
  628. # if len(_list) == 1:
  629. # __list = re.split('\040{2,}', data_list[0])
  630. # #print(__list)
  631. # job_dict['job_leval'] = __list[1].strip()
  632. # job_dict['job_company'] = __list[0].strip()
  633. # else:
  634. # job_dict['job_leval'] = _list[0].strip()
  635. # job_dict['job_company'] = _list[1].strip()
  636. # if '职级:' in data_list[1:]:
  637. # data_list.remove('职级:')
  638. # job_dict['job_content'] = re.sub('工工作作内内容容::|工工作作内内容容::|工工作作内内容容', '工作内容:', ''.join(data_list[1:]))
  639. # job_dict['job_content'] = re.sub('/', '-', job_dict['job_content'])
  640. # #print(job_dict)
  641. # job_list.append(job_dict)
  642. return job_list
  643. # 项目经历 (已弃用)
  644. # 项目名称未知
  645. def get_pro_list_old(lines):
  646. logger.info(lines)
  647. pro_list = [{"Time":None,"startTime":None,"endTime":None,"pro_name":None,"job_leval":None,"job_company":None,"content":None,},]
  648. regex = re.compile(r'((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)[至到\W]+((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)?([今])?')
  649. re_con = re.compile(r'负责内容(.*?)')
  650. re_na = re.compile(r'\W(.*?项目)\W')
  651. count = 0
  652. for line in lines:
  653. regex_time = regex.search(line)
  654. regex_content = re_con.search(line)
  655. regex_name = re_na.search(line)
  656. if regex_time:
  657. if pro_list[count].get("Time"):
  658. pro_list.append({"Time":None,"startTime":None,"endTime":None,"pro_name":None,"job_leval":None,"job_company":None,"content":None,})
  659. count += 1
  660. pro_list[count]["startTime"] = '{:4d}-{:02d}'.format(int(regex_time.group(2)),int(regex_time.group(3)))
  661. if regex_time.group(5) != None:
  662. pro_list[count]["endTime"] = '{:4d}-{:02d}'.format(int(regex_time.group(5)),int(regex_time.group(6)))
  663. pro_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(regex_time.group(2)),int(regex_time.group(3)),int(regex_time.group(5)),int(regex_time.group(6)))
  664. else:
  665. pro_list[count]["endTime"] = regex_time.group(7)
  666. pro_list[count]['Time'] = '{:4d}-{:02d}~{}'.format(int(regex_time.group(2)),int(regex_time.group(3)),regex_time.group(7))
  667. elif regex_name and (not pro_list[count].get("job_name")):
  668. pro_list[count]["pro_name"] = regex_name.group()
  669. elif pro_list[count].get("content"):
  670. pro_list[count]["content"] += line
  671. else:
  672. try:
  673. for word, tag in ner_tag(line):
  674. if (not pro_list[count].get("job_leval")) and (tag == "人物类_概念"):
  675. pro_list[count]["job_leval"] = word
  676. if (not pro_list[count].get("job_company")) and (tag in "组织机构类_企事业单位"):
  677. pro_list[count]["job_company"] = word
  678. except Exception as e:
  679. logger.error(e)
  680. pro_list[count]["content"] = line
  681. return pro_list
# Project experience (UIE version, finished)
@time_this_function
def get_pro_list(lines):
    """Segment project-experience lines at year markers, merge each segment,
    and run the UIE extractor `pro_ie` over it to pull the time span and
    project name, attaching the remaining lines as the work-content text.

    Returns a list of dicts keyed by the UIE field names (时间, 项目名称,
    工作内容, ...) mapped to the first extracted text span.
    """
    logger.info(lines)
    starts = []
    # Candidate segment starts: any line containing a 4-digit year.
    for index, line in enumerate(lines):
        if re.search(r'\d{4}', line):
            starts.append(index)
    # Simple filtering: fold starts closer than half the average gap into the
    # previous start.
    # NOTE(review): raises IndexError / ZeroDivisionError when no line contains
    # a year — confirm callers guarantee at least one dated line.
    count = len(starts)
    c = (starts[-1] - starts[0])/count
    for i in range(count-1):
        if (starts[i+1]-starts[i] < c/2):
            starts[i+1] = starts[i]
    # Merge: pros maps segment-start index -> [concatenated text, original lines].
    pro_list = []
    pros = {}
    index = 0
    for i in range(len(lines)):
        if i in starts:
            index = i
            pros[index] = [lines[i], []]
            pros[index][1].append(lines[i])
        elif not pros:
            # Lines before the first segment start are dropped.
            continue
        else:
            pros[index][0] += lines[i]
            pros[index][1].append(lines[i])
    # Extraction.
    for key in pros.keys():
        info = pro_ie(pros[key][0])
        src = pros[key][1]
        for rst in info:
            if not rst.get("时间") or not rst.get("项目名称"):
                continue
            rst["工作内容"] = [{"text":""}]
            logger.info(rst)
            # Work content = every source line that does not contain the time span.
            for l in src:
                if rst["时间"][0]["text"] in l:
                    continue
                else:
                    rst["工作内容"][0]["text"] += l
            # Normalize the 时间 field to "YYYY-MM~YYYY-MM" / "YYYY~至今" forms.
            for key in rst.keys():
                if key == "时间":
                    # slots: [start-year, start-month, ?, end-year, end-month, ?]
                    time_list = [None, None, None, None, None, None]
                    tim_list = re.findall(r'\d+', rst["时间"][0]["text"])
                    i = 0
                    for t in tim_list:
                        # A later 4-digit number marks the end date -> jump to slot 3.
                        if (len(t) == 4) and (i != 0):
                            i = 3
                            time_list[i] = t
                        else:
                            time_list[i] = t
                        i += 1
                else:
                    continue
                if time_list[3] is not None:
                    if time_list[4] is not None:
                        rst["时间"][0]["text"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(time_list[0]),int(time_list[1]),int(time_list[3]),int(time_list[4]))
                    else:
                        rst["时间"][0]["text"] = "{:4d}~{:4d}".format(int(time_list[0]),int(time_list[3]))
                else:
                    if time_list[1] is not None:
                        rst["时间"][0]["text"] = "{:4d}-{:02d}~至今".format(int(time_list[0]),int(time_list[1]))
                    else:
                        rst["时间"][0]["text"] = "{:4d}~至今".format(int(time_list[0]))
        # NOTE(review): extends with every rst in info, including entries skipped
        # above (missing 时间/项目名称) that never received 工作内容 — confirm intended.
        pro_list.extend([{key:rst[key][0]["text"] for key in rst.keys()} for rst in info])
    return pro_list
# Training experience (finished)
# ner + word segmentation (organization name), training program, time
@time_this_function
def get_cultivate_list(lines):
    """Split the training-experience section into entries at date-bearing
    lines, then extract time range / organization (via `ner`) / program name
    for each entry.

    Returns a list of dicts with cultivate_time(_beg/_end), cultivate_name,
    cultivate_leval and cultivate_content keys.
    """
    logger.info(lines)
    job_list = []
    # Date-range patterns; re_txt additionally matches a bare leading date.
    re_txt = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今|^\d{4,4}.\d{1,2}|\d{4,4}.'
    re_txt_1 = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今'
    nums = []
    # Collect entry-start line indexes.
    for i in range(len(lines)):
        if re.findall(re_txt, lines[i].replace(' ', '')) and re.findall('\||\040{1,}', lines[i]):
            nums.append(i)
            continue
        if re.findall(re_txt, lines[i].replace(' ', '')[:20]):
            nums.append(i)
            if len(lines[i].strip().replace(' ', '')) > 50:
                continue
    nums.append(len(lines))
    # Fallback course-duration (years) per assumed education level; 22 is used below.
    year_dict = {18:4, 17:3,20:3,19:3,21:2,22:1}
    for i in range(1, len(nums[:])):
        job_dict = {'cultivate_time':'', 'cultivate_time_beg':'', 'cultivate_time_end':'', 'cultivate_name':'','cultivate_leval':'','cultivate_content':''}
        data_list = lines[nums[i-1]:nums[i]]
        data_line = ' '.join(data_list)
        data_line = re.sub('[\|\t]', ' ', data_line)
        data_line = re.sub('-{3,}', '', data_line)
        # Run NER on the first two lines to find organization / time entities.
        ner_data = ner(''.join(data_list[:2]))
        org = ''
        time_list = []
        for _ in ner_data:
            if _[1] == 'ORG' and not org:
                org = _[0].strip()
            elif _[1] == 'TIME' and len(_[1]) >= 4:
                time_list.append(_[0])
        #TIME
        logger.info(data_line)
        _list_data = re.split('\040+', data_line)
        top_level = 22
        end_index = 0
        remove_list = []
        if len(_list_data) <= 2:
            end_index = 0
            #continue
        job_time = re.findall(re_txt_1, data_list[0])
        if job_time:
            job_dict['cultivate_time'] = job_time[0]
            data_list[0] = re.sub(job_time[0], '', data_list[0])
        else:
            job_dict['cultivate_time'] = ''
            # No regex hit: strip the NER-found time strings instead.
            for t in time_list:
                data_list[0] = re.sub(t, '', data_list[0])
        _list = data_list[0].split('|')
        if len(_list) >= 2:
            job_dict['cultivate_name'] = _list[0].strip()
            job_dict['cultivate_leval'] = _list[1].strip()
            end_index = 1
        # Normalize the time range; fall back to NER times when regex found none.
        _nums = re.findall('\d+', job_dict['cultivate_time'])
        if len(_nums) >= 4:
            job_dict['cultivate_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
            job_dict['cultivate_time_end'] = '%s-%02d'%(_nums[2], int(_nums[3]))
            job_dict['cultivate_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
        elif len(_nums) == 2:
            job_dict['cultivate_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
            job_dict['cultivate_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
            job_dict['cultivate_time_end'] = '%s'%('至今')
        elif len(time_list) == 2:
            # Two bare years: assume an academic year September..July.
            nums_1 = re.findall('\d+', time_list[0])
            nums_2 = re.findall('\d+', time_list[1])
            nums_1.append('09')
            nums_2.append('07')
            job_dict['cultivate_time_beg'] = '%s-%02d'%(nums_1[0], int(nums_1[1]))
            job_dict['cultivate_time_end'] = '%s-%02d'%(nums_2[0], int(nums_2[1]))
            job_dict['cultivate_time'] = '%s-%02d~%s-%02d'%(nums_1[0], int(nums_1[1]), nums_2[0], int(nums_2[1]))
        elif len(time_list) == 1:
            _nums = re.findall('\d+', time_list[0])
            if '获得' in data_list[0]:
                # "obtained in <year>": back-date the start by year_dict[top_level] years.
                _nums.append('01')
                _nums.insert(0, '01')
                _nums.insert(0, str(int(_nums[1]) - year_dict[top_level]))
                job_dict['cultivate_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
                job_dict['cultivate_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
                job_dict['cultivate_time_end'] = '%s-%02d'%(_nums[2], int(_nums[3]))
            else:
                _nums.append('01')
                job_dict['cultivate_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
                job_dict['cultivate_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
                job_dict['cultivate_time_end'] = '%s'%('至今')
        # Fix doubled characters caused by PDF extraction (工工作作... style).
        job_dict['cultivate_content'] = re.sub('培培训训内内容容::|培培训训内内容容::|培培训训内内容容', '培训内容:', ''.join(data_list[end_index:]))
        if not job_dict['cultivate_name']:
            job_dict['cultivate_name'] = org
        logger.info(job_dict)
        job_list.append(job_dict)
        continue
    '''
    #print(nums)
    for i in range(1, len(nums[:])):
        job_dict = {'cultivate_time':'', 'cultivate_name':'','cultivate_leval':'','cultivate_content':''}
        data_list = lines[nums[i-1]:nums[i]]
        if '' in data_list:
            data_list.remove('')
        if len(data_list) > 1 and data_list[1] and data_list[1][-1] == '|' and data_list[0][-1] != '|':
            data_list[0] = data_list[0] + data_list[1]
            data_list[1] = ''
        job_time = re.findall(re_txt_1, data_list[0])
        job_dict['cultivate_time'] = job_time[0]
        _nums = re.findall('\d+', job_dict['cultivate_time'])
        if len(_nums) >= 4:
            job_dict['cultivate_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
        elif len(_nums) == 2:
            job_dict['cultivate_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
        data_list[0] = re.sub(job_time[0], '', data_list[0])
        _list = data_list[0].split('|')
        if len(_list) >= 2:
            job_dict['cultivate_name'] = _list[0].strip()
            job_dict['cultivate_leval'] = _list[1].strip()
            job_dict['cultivate_content'] = re.sub('培培训训内内容容|培培训训内内容容::|培培训训内内容容::', '培训内容:', ''.join(data_list[1:]))
        else:
            job_dict['cultivate_content'] = re.sub('培培训训内内容容|培培训训内内容容::|培培训训内内容容::', '培训内容:', ''.join(data_list[0:]))
        #print(job_dict)
    '''
    return job_list
  871. # 语言能力(已完成)
  872. @time_this_function
  873. def get_lag_list(lines):
  874. logger.info(lines)
  875. lan_list = []
  876. re_lan = re.compile(r'(\w+[语话])')
  877. re_lev = re.compile(r'([公共级四专八]+)')
  878. lag_dict = {'语言':'', '熟练度':""}
  879. for l in lines:
  880. if not l.strip():
  881. continue
  882. lan_name = re.search(re_lan, l)
  883. lag_lev = re.search(re_lev, l)
  884. if lag_lev and lag_lev.group(1):
  885. lag_dict["熟练度"] = lag_lev.group(1)
  886. if lan_name and lan_name.group(1):
  887. if lag_dict["语言"]:
  888. lan_list.append(lag_dict)
  889. lag_dict = {'语言':'', '熟练度':""}
  890. lag_dict['语言'] = lan_name.group(1)
  891. return lan_list
  892. # 家庭情况(已弃用)
  893. def get_fam_list(lines):
  894. job_list = []
  895. fam_dict = {}
  896. for l in lines:
  897. if not l.strip():
  898. continue
  899. ls = l.split('|')
  900. if len(ls) == 1:
  901. continue
  902. fam_dict = {'fam_name':"",'fam_company':"",'fam_lable':"","fam_status":"", 'fam_job':""}
  903. fam_dict["fam_lable"] = ls[0].strip()
  904. fam_dict["fam_name"] = ls[1].strip()
  905. flag = 0
  906. if re.findall('\d岁|\d{4,5}', ls[2]):
  907. flag = 1
  908. fam_dict["fam_company"] = ls[flag+2].strip()
  909. fam_dict["fam_job"] = ls[flag+3].strip()
  910. fam_dict["fam_status"] = ls[flag+4].strip()
  911. #print(fam_dict)
  912. job_list.append(fam_dict)
  913. return job_list
  914. # 证书情况 时间+证书名称 (旧版)
  915. @time_this_function
  916. def get_cet_list_(lines):
  917. logger.info(lines)
  918. job_list = []
  919. re_txt = '\d+年\d+月|\d+-\d+|\d+\.\d+'
  920. lines_word = ' '.join(lines)
  921. lines = re.findall('\d+年\d+月|\d+-\d+|\d+\.\d+', lines_word)
  922. nums = []
  923. for x in range(len(lines) - 1):
  924. _index = lines_word.index(lines[x])
  925. _end_index = lines_word.index(lines[x+1])
  926. l = lines_word[_index : _end_index]
  927. if not l.strip():
  928. continue
  929. lines_word = lines_word[_end_index:]
  930. job_time = re.findall(re_txt, l)
  931. cet_dict = {'证书':'','获得时间':""}
  932. if job_time:
  933. cet_dict['证书'] = job_time[0]
  934. l = re.sub(job_time[0], '', l)
  935. else:
  936. continue
  937. ls = re.split('\||\040+|\t+', l)
  938. logger.info(ls)
  939. for l in ls:
  940. if len(l) <= 3:
  941. continue
  942. cet_dict['证书'] = l.strip()
  943. break
  944. job_list.append(cet_dict)
  945. return job_list
  946. # 证书情况 时间+证书名称 (UIE已完成)
  947. @time_this_function
  948. def get_cet_list(lines):
  949. logger.info(lines)
  950. cet_list = []
  951. for line in lines:
  952. info = cet_ie(line)
  953. cet_list.extend([{key:rst[key][0]["text"] for key in rst.keys()} for rst in info if rst.get("证书")])
  954. return cet_list
# Awards: time + award name (legacy version)
def get_prize_list_old(lines):
    """Legacy award extractor: split the joined section text at date tokens
    and take the first field longer than 3 characters after each date as the
    award name.

    Returns a list of {'prize_name': ..., 'prize_time': ...} dicts.
    """
    logger.info(lines)
    job_list = []
    re_txt = '\d+年\d+月|\d+-\d+|\d{4,4}.\d{1,2}'
    lines_word = ' '.join(lines)
    # NOTE(review): this findall pattern ('\d{4,4}-\d+') differs from re_txt
    # ('\d+-\d+') — confirm which is intended.
    lines = re.findall('\d+年\d+月|\d{4,4}-\d+|\d{4,4}.\d{1,2}', lines_word)
    nums = []  # unused
    # The trailing segment after the last date token is never processed.
    for x in range(len(lines) - 1):
        _index = lines_word.index(lines[x])
        _end_index = lines_word.index(lines[x+1])
        l = lines_word[_index : _end_index]
        if not l.strip():
            continue
        # Drop the consumed prefix so later .index() calls search forward only.
        lines_word = lines_word[_end_index:]
        job_time = re.findall(re_txt, l)
        cet_dict = {'prize_name':'','prize_time':""}
        if job_time:
            cet_dict['prize_time'] = job_time[0]
            l = re.sub(job_time[0], '', l)
        else:
            continue
        ls = re.split('\||\040+|\t+', l)
        logger.info(ls)
        # First sufficiently long field is taken as the award name.
        for l in ls:
            if len(l) <= 3:
                continue
            cet_dict['prize_name'] = l.strip()
            break
        logger.info(cet_dict)
        job_list.append(cet_dict)
    return job_list
  987. # 获奖情况 时间+获奖名称 (UIE已完成)
  988. @time_this_function
  989. def get_prize_list(lines):
  990. logger.info(lines)
  991. prize_list = []
  992. for line in lines:
  993. info = prize_ie(line)
  994. prize_list.extend([{key:rst[key][0]["text"] for key in rst.keys()} for rst in info if rst.get("奖项")])
  995. return prize_list
  996. # 返回其他信息
  997. def get_other_list(lines):
  998. other_list = []
  999. other_list.append("\n".join(lines))
  1000. return other_list
  1001. # Linux doc 文件处理
  1002. def doc2pdf_linux(docPath, pdfPath):
  1003. """
  1004. 允许的文档格式:doc,docx
  1005. 仅在linux平台下可以
  1006. 需要在linux中下载好libreoffice
  1007. """
  1008. # 注意cmd中的libreoffice要和linux中安装的一致
  1009. cmd = 'libreoffice6.3 --headless --convert-to pdf'.split() + [docPath] + ['--outdir'] + [pdfPath]
  1010. # cmd = 'libreoffice6.2 --headless --convert-to pdf'.split() + [docPath]
  1011. p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
  1012. p.wait(timeout=30) # 停顿30秒等待转化
  1013. stdout, stderr = p.communicate()
  1014. if stderr:
  1015. raise subprocess.SubprocessError(stderr)
# Windows doc-file handling (MS Word COM automation; Windows only)
def doc2pdf_win(docPath, pdfPath):
    """Convert a .doc/.docx to PDF through the Word COM API.

    The output file is written to pdfPath/<basename>.pdf.
    """
    # console.print(pdfPath+'/'+os.path.splitext(os.path.split(docPath)[-1])[0] + '.pdf')
    import win32com
    from win32com.client import DispatchEx, constants
    word = DispatchEx("Word.Application") # launch a dedicated Word instance
    word.Visible = 1 # NOTE(review): 1 makes Word visible; the original comment claimed background mode — confirm intended value (0 hides it)
    word.DisplayAlerts = 0 # suppress dialog boxes
    doc = word.Documents.Open(docPath) # source document
    doc.SaveAs(pdfPath+'/'+os.path.splitext(os.path.split(docPath)[-1])[0] + '.pdf', FileFormat=17) #txt=4,html=10,docx=16,pdf=17
    doc.Close() # close the document
    word.Quit() # quit Word
  1028. # doc 文件处理
  1029. def doc2pdf(docPath, pdfPath, system):
  1030. """
  1031. 注意使用绝对路径
  1032. pdf的生成只写路径,不写名字
  1033. """
  1034. docPathTrue = os.path.abspath(docPath) # bugfix - searching files in windows/system32
  1035. if system == "Linux":
  1036. return doc2pdf_linux(docPathTrue, pdfPath)
  1037. if system == "Windows":
  1038. return doc2pdf_win(docPathTrue, pdfPath)
# Plain-text (txt) resume parsing (finished)
@time_this_function
def parse_txt(path, save_dir):
    """Read a UTF-8 text resume, bucket its lines into sections using the
    global `block` heading map, run each section's extractor, and write the
    merged result to ./result/<save_dir>/<name>.json.
    """
    with open(path, 'r', encoding='utf-8') as fp:
        data = fp.read()
    global block, block_rev
    chun = 1                 # current section id; 1 = base info by default
    page = {1: []}
    if len(data.split("\n")) <= 2:
        # Whole resume collapsed onto one or two lines: split on whitespace
        # and treat every token as a potential section heading.
        for line in data.split("\n"):
            line = line.replace("\xa0", "").replace("【","").replace("】","").replace("教育/培训","教育经历").replace("·","").strip()
            for word in line.split():
                if word in block.keys():
                    chun = block[word]
                    page[chun] = []
                elif word:
                    page[chun].append(word)
    else:
        for line in data.split("\n"):
            line = line.replace("\xa0", "").replace("【","").replace("】","").replace("教育/培训","教育经历").replace("·","")
            regex = re.compile(u'[\u3000]+',re.UNICODE)   # strip ideographic spaces
            line = regex.sub('', line.strip())
            if line in block.keys():
                chun = block[line]
                page[chun] = []
            elif line:
                page[chun].append(line)
    result_data = dict()
    # Dispatch each collected section (1=base info ... 12=training) to its extractor.
    for key in page.keys():
        for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
            if key == index:
                result_data[block_rev[index]] = func(page[index])
    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
    with open(os.path.join('./result/' + save_dir, filename), 'w', encoding="utf-8") as fp:
        json.dump(result_data, fp, indent=4, ensure_ascii=False)
# Raw-text word parsing
@time_this_function
def read_from_word(doc, path, save_dir):
    """Parse a table-less .docx: bucket paragraph lines into sections via the
    global `block` heading map, run each section's extractor, and write the
    result to ./result/<save_dir>/<name>.json.
    """
    para_text = []
    for para in doc.paragraphs:
        para_text.append(para.text)
    global block, block_rev
    chun = 1            # current section id; 1 = base info by default
    page = {1: []}
    for line in para_text:
        regex = re.compile(u'[\uF000-\uF0FF]+',re.UNICODE)   # strip private-use glyphs (Word bullets)
        line = regex.sub('', line)
        if line in block.keys():
            chun = block[line]
            page[chun] = []
        elif line:
            page[chun].append(line)
    result_data = dict()
    # Dispatch sections 1..12 to their extractors.
    for key in page.keys():
        for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
            if key == index:
                result_data[block_rev[index]] = func(page[index])
    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
    with open(os.path.join('./result/' + save_dir, filename), 'w', encoding="utf-8") as fp:
        json.dump(result_data, fp, indent=4, ensure_ascii=False)
# Extract word tables (finished)
@time_this_function
def check_word(path, save_dir):
    """Parse a .docx resume. If it has no tables, fall back to the raw-text
    path (read_from_word). Table rows are deduplicated, bucketed into
    sections via the global `block` map plus the sub-heading map loaded from
    resources/keys.json, then dispatched to the per-section extractors and
    written to ./result/<save_dir>/<name>.json.
    """
    doc = Document(path)
    tables = doc.tables
    if not tables:
        logger.info("this is raw text")
        read_from_word(doc, path, save_dir=save_dir)
        # NOTE(review): no return here — the (empty) table pipeline below still
        # runs and overwrites the JSON written by read_from_word; confirm.
    logger.info("this is a Table")
    global block
    with open("resources/keys.json", "r", encoding="utf-8") as fp:
        prk = json.load(fp)   # sub-heading -> section id map
    chun = 1
    page = {1: []}
    regex = re.compile(r'(\(\w{2,8}\))?((\w{2,8}))?')   # parenthesized hints (both widths) to strip
    for table in tables:
        lo = {} # per-row deduplicated cell texts
        for row in range(0, len(table.rows)):
            row_list = []
            for col in range(0, len(table.row_cells(row))): # collect all columns of this row
                if len(''.join(table.cell(row, col).text)) <= 20:
                    row_list.append(re.sub(r'(\w)\n', r'\1', table.cell(row, col).text))
                else:
                    row_list.append(regex.sub("", table.cell(row, col).text.replace(" ","").replace(":", ":").replace("学历\n学位","学历学位"))) # strip special chars, normalize colons
            lo[row] = (sorted(set(row_list), key=row_list.index)) # de-duplicate while preserving order
        # Drop empty cells / empty rows.
        for key in list(lo.keys()):
            if "" in lo[key]:
                lo[key].remove("")
            if not lo[key]:
                lo.pop(key)
        for _, line in lo.items():
            if (line[0] in block.keys()) or (line[0] in prk.keys()):
                # Row starts with a top-level section heading.
                if line[0] in block.keys():
                    # Switch current section.
                    chun = block[line[0]]
                    if not page.get(chun):
                        page[chun] = []
                    # Drop the heading itself.
                    line = '\n'.join(line[1:])
                # Row starts with a sub-heading.
                elif line[0] in prk.keys():
                    # Switch current section.
                    chun = prk[line[0]]
                    if not page.get(chun):
                        page[chun] = []
                    # Keep the sub-heading text.
                    line = '\n'.join(line)
            else:
                line = '\n'.join(line)
            # Normalize "sub-heading\nvalue" to "sub-heading:value".
            for k in prk.keys():
                line = line.replace(k+"\n", k+":")
            page[chun].extend(line.split())
    result_data = dict()
    # Dispatch sections 1..12 to their extractors.
    for key in page.keys():
        for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
            if key == index:
                result_data[block_rev[index]] = func(page[index])
    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
    with open(os.path.join('./result/' + save_dir, filename), 'w', encoding="utf-8") as fp:
        json.dump(result_data, fp, indent=4, ensure_ascii=False)
# pdf sentence/line parsing (finished)
@time_this_function
def parse_line_layout(layout, b):
    """Flatten a pdfminer page layout into text lines (top-to-bottom), then
    bucket them into sections with the global `block` heading map.

    *b* is the section id carried over from the previous page; returns
    (dict of section id -> lines, last section id seen).
    """
    texts = []
    """解析页面内容,一行一行的解析"""
    # bbox:
    #   x0: distance from the left of the page to the left edge of the box.
    #   y0: distance from the bottom of the page to the lower edge of the box.
    #   x1: distance from the left of the page to the right edge of the box.
    #   y1: distance from the bottom of the page to the upper edge of the box.
    for textbox in layout:
        if isinstance(textbox, LTTextBox) or isinstance(textbox, LTTextLine):
            for char in textbox:
                if isinstance(char, LTTextLineHorizontal):
                    texts.append([char.bbox[0], char.bbox[3], char.get_text().strip()])
    # Sort rows top-to-bottom (descending y coordinate).
    texts.sort(key=lambda x:-x[1])
    global block, block_rev
    chun = b
    page = {chun: []}
    for _, _, line in texts:
        regex = re.compile(u'[\u007F|\u25A0|\u00B7|\uF000-\uF0FF]+',re.UNICODE)  # strip bullets / private-use glyphs
        line = regex.sub('', line)
        regex_tips = re.compile(r'(\(.*?\))?((.*?))?')  # parenthesized hints (both widths)
        # line = regex_tips.sub('', line)
        line = line.strip()
        # Heading check is done on the hint-stripped line; stored lines keep the hint.
        if regex_tips.sub('', line).strip() in block.keys():
            chun = block[regex_tips.sub('', line).strip()]
            page[chun] = []
        elif line:
            page[chun].append(line)
    return page, chun
# pdf style parsing (finished)
@time_this_function
def read_from_pdf(path, save_dir):
    """Parse a text-style PDF resume with pdfminer page by page, carrying the
    current section id across pages, then dispatch each section to its
    extractor and write ./result/<save_dir>/<name>.json.
    """
    result = {}
    global block_rev
    with open(path, 'rb') as in_file:
        parser = PDFParser(in_file) # pdf document analyzer over the file object
        doc: PDFDocument = PDFDocument(parser) # the pdf document
        rsrcmgr = PDFResourceManager() # shared PDF resource manager
        # PDF device object
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Iterate the page list, one page at a time.
        # doc.get_pages() gets the page list.
        interpreter = PDFPageInterpreter(rsrcmgr, device)  # NOTE(review): duplicate construction; the first instance is discarded
        # Process every page of the document.
        b = 1   # current section id, carried across pages
        for page in PDFPage.create_pages(doc):
            logger.debug('================ 新页面 ================')
            interpreter.process_page(page)
            layout = device.get_result()
            r, b = parse_line_layout(layout, b)
            # Merge this page's section buckets into the document-level result.
            for key in r.keys():
                if result.get(key):
                    result[key].extend(r[key])
                else:
                    result[key] = r[key]
    result_data = dict()
    # Dispatch sections 1..12 to their extractors.
    for key in result.keys():
        for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
            if key == index:
                result_data[block_rev[index]] = func(result[index])
    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
    with open(os.path.join('./result/' + save_dir, filename), 'w', encoding="utf-8") as fp:
        json.dump(result_data, fp, indent=4, ensure_ascii=False)
# pdf table parsing (finished)
@time_this_function
def parse_table_from_pdf(path, save_dir):
    """Extract table rows from a PDF with pdfplumber, bucket them into
    sections, run the extractors, and write ./result/<save_dir>/<name>.json.
    """
    global block, block_rev
    lo = {}   # running row index -> list of cell texts, across all pages/tables
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                for line in table:
                    row_list = []
                    for word in line:
                        row_list.append(word)
                    lo[len(lo.keys())] = row_list
    # Drop empty cells / empty rows.
    for key in list(lo.keys()):
        if "" in lo[key]:
            lo[key].remove("")
        if not lo[key]:
            lo.pop(key)
    # NOTE(review): the loop below references `prk` (loaded only inside
    # check_word), reuses `page` (still a pdfplumber Page after the loop above)
    # as the section dict, and `chun` may be unbound before the first heading —
    # this section looks copied from check_word and likely fails at runtime;
    # confirm before relying on this code path.
    for _, line in lo.items():
        if (line[0] in block.keys()) or (line[0] in prk.keys()):
            # Row starts with a top-level section heading.
            if line[0] in block.keys():
                # Switch current section.
                chun = block[line[0]]
                if not page.get(chun):
                    page[chun] = []
                # Drop the heading itself.
                line = '\n'.join(line[1:])
            # Row starts with a sub-heading.
            elif line[0] in prk.keys():
                # Switch current section.
                chun = prk[line[0]]
                if not page.get(chun):
                    page[chun] = []
                # Keep the sub-heading text.
                line = '\n'.join(line)
        else:
            line = '\n'.join(line)
        # Normalize "sub-heading\nvalue" to "sub-heading:value".
        for k in prk.keys():
            line = line.replace(k+"\n", k+":")
        page[chun].extend(line.split())
    result_data = dict()
    # Dispatch sections 1..12 to their extractors.
    for key in page.keys():
        for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
            if key == index:
                result_data[block_rev[index]] = func(page[index])
    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
    with open(os.path.join('./result/' + save_dir, filename), 'w', encoding="utf-8") as fp:
        json.dump(result_data, fp, indent=4, ensure_ascii=False)
  1283. # 检测 pdf 格式 (已完成)
  1284. @time_this_function
  1285. def check_pdf(path):
  1286. """
  1287. # 输入:
  1288. # pdf 文件路径
  1289. # 输出:
  1290. # 文件包含元素 [Word, Table]
  1291. """
  1292. rst = []
  1293. for page_layout in extract_pages(path):
  1294. for element in page_layout:
  1295. if isinstance(element, LTFigure):
  1296. for cell in element:
  1297. if isinstance(cell, LTChar):
  1298. rst.append("Table")
  1299. break
  1300. elif isinstance(element, LTTextContainer):
  1301. rst.append("Word")
  1302. return set(rst)
  1303. # 检测编码(已完成)
  1304. def decode_path(path):
  1305. '''zipfile解压出现乱码,将乱码的路径编码为UTF8'''
  1306. try:
  1307. path_name = path.decode('utf-8')
  1308. except:
  1309. path_name = path.encode('437').decode('gbk')
  1310. path_name = path_name.encode('utf-8').decode('utf-8')
  1311. return path_name
# Normalize field names to the downstream schema
def formatter(result, json_obj):
    """Rename the Chinese-keyed parse result into the DB-facing field names
    given by the translation table *json_obj*: 基本信息/求职意向 fields are
    lifted to the top level, and every per-entry key in the list-valued
    sections is renamed in place.

    Mutates and returns *result*.
    """
    normal = json_obj["base"]
    itenormal = json_obj["base"]          # NOTE(review): duplicates `normal`; neither is used below
    edunormal = json_obj["tal_his_edu"]
    jobnormal = json_obj["tal_his_job"]
    tranornal = json_obj["tal_training_experience"]
    cetnormal = json_obj["tal_vocational_qualification_certificate"]
    rewnormal = json_obj["tal_reward_punishment"]
    family = json_obj["tal_family_social_relation"]
    # for key in normal.keys():
    #     if result.get(key):
    #         result[normal[key]] = result[key]
    #         result.pop(key)
    # Lift base-info and job-intention fields to the top level under their
    # translated names.
    for key in json_obj["base"].keys():
        if result.get("基本信息"):
            if result["基本信息"].get(key):
                result[json_obj["base"][key]] = result["基本信息"][key]
                del result["基本信息"][key]
        if result.get("求职意向"):
            if result["求职意向"].get(key):
                result[json_obj["base"][key]] = result["求职意向"][key]
                del result["求职意向"][key]
    # NOTE(review): unconditional deletes — raises KeyError when either section
    # is absent from the parse result; confirm upstream always provides both.
    del result["基本信息"]
    del result["求职意向"]
    if result.get("教育经历"):
        for idx in range(len(result['教育经历'])):
            for key in edunormal.keys():
                if result['教育经历'][idx].get(key):
                    result['教育经历'][idx][edunormal[key]] = result['教育经历'][idx][key]
                    result['教育经历'][idx].pop(key)
    if result.get("工作经历"):
        for idx in range(len(result['工作经历'])):
            for key in jobnormal.keys():
                if result['工作经历'][idx].get(key):
                    result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
                    result['工作经历'][idx].pop(key)
    if result.get("项目经历"):
        for key in json_obj["tal_his_project"].keys():
            for idx in range(len(result["项目经历"])):
                if result["项目经历"][idx].get(key):
                    result["项目经历"][idx][json_obj["tal_his_project"][key]] = result["项目经历"][idx][key]
                    del result["项目经历"][idx][key]
    if result.get("培训经历"):
        for idx in range(len(result['培训经历'])):
            for key in tranornal.keys():
                if result['培训经历'][idx].get(key):
                    result['培训经历'][idx][tranornal[key]] = result['培训经历'][idx][key]
                    result['培训经历'][idx].pop(key)
    if result.get("语言能力"):
        for key in json_obj["tal_language"].keys():
            for idx in range(len(result["语言能力"])):
                if result["语言能力"][idx].get(key):
                    result["语言能力"][idx][json_obj["tal_language"][key]] = result["语言能力"][idx][key]
                    del result["语言能力"][idx][key]
    if result.get("证书"):
        for idx in range(len(result['证书'])):
            for key in cetnormal.keys():
                if result['证书'][idx].get(key):
                    result['证书'][idx][cetnormal[key]] = result['证书'][idx][key]
                    result['证书'][idx].pop(key)
    if result.get("获奖情况"):
        for idx in range(len(result['获奖情况'])):
            for key in rewnormal.keys():
                if result['获奖情况'][idx].get(key):
                    result['获奖情况'][idx][rewnormal[key]] = result['获奖情况'][idx][key]
                    result['获奖情况'][idx].pop(key)
    if result.get("家庭成员"):
        for idx in range(len(result['家庭成员'])):
            for key in family.keys():
                if result['家庭成员'][idx].get(key):
                    result['家庭成员'][idx][family[key]] = result['家庭成员'][idx][key]
                    result['家庭成员'][idx].pop(key)
    # Finally rename the section titles themselves to the DB table names.
    tit = {
        "基本信息":"base",
        "求职意向":"intent_job",
        "教育经历":"tal_his_edu",
        "工作经历":"tal_his_job",
        "项目经历":"tal_his_project",
        "培训经历":"tal_training_experience",
        "获奖情况":"tal_reward_punishment",
        "语言能力":"tal_language",
        "证书":"tal_vocational_qualification_certificate",
        "专业技能":"tal_professional_tech_certificate",
        "家庭成员":"tal_family_social_relation",
        "其他情况说明":"intro"
    }
    for key in tit.keys():
        if result.get(key):
            result[tit[key]] = result[key]
            result.pop(key)
    return result
  1404. # 结果返回
  1405. def push_back(tempdir):
  1406. for file in os.listdir('./result/' + tempdir):
  1407. filename = os.path.join('./result/' + tempdir, file)
  1408. with open(filename, "r", encoding="utf-8") as ff:
  1409. rst = json.load(ff)
  1410. rst = formatter(rst, translate)
  1411. url = "http://192.168.1.110:9999/talent/getResumeData"
  1412. session = requests.Session()
  1413. session.mount('http://', HTTPAdapter(max_retries = 3))
  1414. try:
  1415. headers = {
  1416. 'contentType':'Application/json'
  1417. }
  1418. response = session.post(url=url, headers=headers, json={"filename":file, "ResumeData":rst}, timeout=10)
  1419. except Exception as e:
  1420. print(e)
  1421. logger.info({"filename":file, "ResumeData":rst})
  1422. # 检测传入格式(已完成)
  1423. def detection_type(path, system):
  1424. tempdir = time.strftime("%Y_%m_%dT%H_%M_%S")
  1425. os.mkdir('./result/' + tempdir)
  1426. # 传入 rar 压缩文件
  1427. if os.path.isfile(path) and path.endswith('.rar'):
  1428. rar = rarfile.RarFile(path)
  1429. rar.extractall('./cache/' + tempdir)
  1430. path = "./cache/" + tempdir
  1431. # 传入 tar.gz 压缩文件
  1432. if os.path.isfile(path) and path.endswith('.tar.gz'):
  1433. tf = tarfile.open(path)
  1434. tf.extractall('./cache/' + tempdir)
  1435. tf.close()
  1436. path = "./cache/" + tempdir
  1437. # 传入 .zip .7z 压缩文件
  1438. try:
  1439. if os.path.isfile(path) and path.endswith('.zip'):
  1440. ## 解压方式1:存在乱码
  1441. # f = zipfile.ZipFile(file, mode='r')
  1442. # f.extractall(target_dir)
  1443. ## 解压方式2:防止乱码
  1444. with ZipFile(path, allowZip64=True) as zf:
  1445. # 排除目录文件
  1446. print("zf.filelist", zf.filelist)
  1447. file_iter = (filename for filename in zf.filelist if os.path.isfile(path))
  1448. for filename in file_iter:
  1449. # 编码文件名称为 utf 格式
  1450. filename.filename = decode_path(filename.filename) # 防止乱码的操作
  1451. zf.extract(filename, "./cache/" + tempdir)
  1452. path = "./cache/" + tempdir
  1453. elif os.path.isfile(path) and path.endswith('.7z'): # .7z格式文件解压
  1454. zf = py7zr.SevenZipFile(path, mode='r')
  1455. zf.extractall("./cache/" + tempdir)
  1456. path = "./cache/" + tempdir
  1457. except Exception as e:
  1458. logger.error(e)
  1459. # 传入为 doc
  1460. if os.path.isfile(path) and path.endswith('.doc'):
  1461. doc2pdf(docPath = path, pdfPath = './pdf', system=system)
  1462. newfile = './pdf/' + os.path.splitext(os.path.split(path)[-1])[0] + '.pdf'
  1463. if os.path.exists(newfile):
  1464. rst = check_pdf(newfile)
  1465. if "Table" in rst:
  1466. parse_table_from_pdf(newfile, save_dir=tempdir)
  1467. pass
  1468. if "Word" in rst:
  1469. read_from_pdf(newfile, save_dir=tempdir)
  1470. # 传入为 docx
  1471. elif os.path.isfile(path) and path.endswith('.docx'):
  1472. check_word(path, save_dir=tempdir)
  1473. # 传入为 pdf
  1474. elif os.path.isfile(path) and path.endswith('.pdf'):
  1475. rst = check_pdf(path)
  1476. if "Table" in rst:
  1477. parse_table_from_pdf(path, save_dir=tempdir)
  1478. if "Word" in rst:
  1479. read_from_pdf(path, save_dir=tempdir)
  1480. # 传入为 txt
  1481. elif os.path.isfile(path) and path.endswith('.txt'):
  1482. parse_txt(path, save_dir=tempdir)
  1483. # 传入目录
  1484. elif os.path.isdir(path):
  1485. for filename in os.listdir(path):
  1486. filename = os.path.join(path, filename)
  1487. # 传入为 doc
  1488. logger.info(filename)
  1489. if filename.endswith('.doc') and not filename.startswith('.~'):
  1490. doc2pdf(docPath = filename, pdfPath = './pdf', system=system)
  1491. newfile = './pdf/' + os.path.splitext(os.path.split(filename)[-1])[0] + '.pdf'
  1492. if os.path.exists(newfile):
  1493. rst = check_pdf(newfile)
  1494. if "Table" in rst:
  1495. parse_table_from_pdf(newfile, save_dir=tempdir)
  1496. pass
  1497. if "Word" in rst:
  1498. read_from_pdf(newfile, save_dir=tempdir)
  1499. # 传入为 docx
  1500. elif os.path.isfile(filename) and filename.endswith('.docx'):
  1501. check_word(filename, save_dir=tempdir)
  1502. # 传入为 pdf
  1503. if os.path.isfile(filename) and filename.endswith('.pdf'):
  1504. rst = check_pdf(filename)
  1505. if "Table" in rst:
  1506. parse_table_from_pdf(filename, save_dir=tempdir)
  1507. pass
  1508. if "Word" in rst:
  1509. read_from_pdf(filename, save_dir=tempdir)
  1510. # 传入为 txt
  1511. elif os.path.isfile(filename) and filename.endswith('.txt'):
  1512. parse_txt(filename, save_dir=tempdir)
  1513. # 推送后端
  1514. push_back(tempdir)
  1515. @app.post("/resume_parse")
  1516. async def file_upload(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
  1517. """
  1518. 简历上传
  1519. 格式:pdf,docx,doc,txt,tar.gz,zip,7z, rar
  1520. """
  1521. res = await file.read()
  1522. with open('./uploads/' + file.filename, "wb") as f:
  1523. f.write(res)
  1524. background_tasks.add_task(detection_type, './uploads/' + file.filename, platform.system())
  1525. return {"errno": 0, "msg": "{} Upload Success".format(file.filename)}
  1526. if __name__ == '__main__':
  1527. uvicorn.run(app=app, host="0.0.0.0", port=8320)