resume_parse.py

  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. import os
  4. import sys
  5. import re
  6. import subprocess
  7. from pprint import pprint
  8. import logging
  9. logging.basicConfig(format='%(asctime)s: %(name)s: %(levelname)s: %(filename)s: %(funcName)s: %(lineno)d: %(message)s', level=logging.INFO)
  10. import pandas as pd
  11. from docx import Document
  12. from docx.shared import Inches
  13. from pdfminer.high_level import extract_pages
  14. from pdfminer.layout import LTTextContainer, LTChar, LTLine, LAParams, LTTextBox, LTFigure, LTImage, LTText, LTAnno, LTTextLine, LTTextLineHorizontal
  15. from pdfminer.pdfdocument import PDFDocument
  16. from pdfminer.pdfpage import PDFPage
  17. from pdfminer.pdfparser import PDFParser
  18. from pdfminer.converter import PDFPageAggregator
  19. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  20. import pdfplumber
  21. from paddlenlp import Taskflow
  22. from rich.console import Console
  23. console = Console()
  24. # import uvicorn
  25. # from fastapi import FastAPI
  26. # app = FastAPI()
  27. ner = Taskflow("ner", mode='fast')
  28. ner_tag = Taskflow("ner")
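# Note on the two Taskflow instances (an assumption about PaddleNLP's behaviour, not stated
# in this file): mode='fast' returns coarse LAC-style tags such as 'ORG'/'TIME', which the
# code reads from `ner`, while the default mode returns fine-grained WordTag labels such as
# '组织机构类_教育组织机构', which the code reads from `ner_tag`.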
  29. global block, block_rev
  30. block = {
  31. "个人信息":1, "基本信息":1, "个人简历":1, "基基本本信信息息":1, "基本信息基本信息":1, "基本信息文本内容":1,
  32. "求职意向":2, "求职意向求职意向":2, "期望工作文本内容":2,
  33. "教育背景":3, "教育经历":3, "教教育育经经历历":3, "教育经历教育经历":3, "教育经历文本内容":3, "学历学位":3,
  34. "工作经验":4, "主要工作内容与职责":4, "工作方面":4, "实习经历":4, "工作经历":4, "工工作作经经历历":4, "工作经历工作经历":4, "工作经历文本内容":4,
  35. "项目经历":5, "项目经验":5, "科研项目经历":5, "项项目目经经历历":5, "项目经历项目经历":5, "研究生参与代表性项目":5, "项目经历文本内容":5,
  36. "专业技能":6, "个人技能":6, "专业/外语技能":6, "技能素质":6, "个人技能文本内容":6,
  37. "自我评价":7, "个人简介":7, "个人评价":7, "自我描述":7, "自自我我评评价价":7, "自我评价自我评价":7, "自我评价文本内容":7,
  38. "兴趣爱好":8, "兴趣爱好文本内容":8,
  39. "语言及方言":9, "语言能力":9, "英语能力":9, "语语言言能能力力":9, "语言能力语言能力":9, "语言技能文本内容":9,
  40. "证书":10, "所获证书文本内容":10,
  41. "获得奖励":11, "获奖经历":11, "获奖情况":11, "获获奖奖经经历历":11, "获奖经历获奖经历":11, "获奖情况及社会活动":11, "校内奖励":11, "校内活动&奖励":11, "所获奖励文本内容":11,"奖惩情况":11,
  42. "培训":12, "培训经历":12, "培培训训经经历历":12, "培训经历文本内容":12,
  43. "家庭成员":13, "家家庭庭成成员员":13, "家庭成员家庭成员":13, "主要家庭成员及社会关系":13,
  44. "社会活动":"other", "实践经验":"other", "社会活动及社会实践":"other", "近三年年度考核结果":"other", "其他意愿":"other",
  45. }
  46. block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经历", 5:"项目经历", 6:"专业技能", 7:"自我评价", 8:"兴趣爱好", 9:"语言能力", 10:"证书", 11:"获奖情况", 12:"培训经历", 13:"家庭成员", "other":"其他"}
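# Example: a heading line such as "教育背景" maps through `block` to section id 3, and
# `block_rev[3]` renders it back as "教育经历" when the parsed sections are printed.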
47. # Basic info (done)
  48. def get_base_info(lines):
  49. logging.info(lines)
  50. schema = {
  51. '姓名': None,
  52. }
  53. for line in [' '.join(' '.join(lines).split('\n'))]:
54. line = re.sub(r'[ ]{5,}', '\n', line)
  55. w = re.sub(r'[\W]+(\w[::])[\W]{0,}\w', r'\1', line)
  56. for i in w.split():
  57. if ':' in i:
  58. try:
  59. key, val = i.split(':')
  60. schema[key] = val
  61. except Exception as e:
  62. logging.error(e)
  63. if not schema.get('姓名'):
  64. schema['姓名'] = re.search(r'[姓名::]{3,}(\w{2,4})', w).group(1) if re.search(r'[姓名::]{3,}(\w{2,4})', w) else None
  65. if not schema.get('姓名'):
  66. for word, tag in ner_tag(w):
  67. if tag == "人物类_实体":
  68. schema['姓名'] = word
  69. if not schema.get('性别'):
  70. schema['性别'] = re.search(r'[男女]', w).group() if re.search(r'[男女]', w) else None
  71. if not schema.get('婚姻状况'):
  72. schema['婚姻状况'] = re.search(r'[已未]婚', w).group() if re.search(r'[已未]婚', w) else None
  73. if not schema.get('电子邮箱'):
  74. schema['电子邮箱'] = re.search(r'([.\w]+@[.\w]+)', w).group() if re.search(r'([.\w]+@[.\w]+)', w) else None
  75. if not schema.get('政治面貌'):
  76. schema['政治面貌'] = re.search(r'[预备中共党团员群众无派人士]{2,6}', w).group() if re.search(r'[预备中共党团员群众无派人士]{2,6}', w) else None
  77. if not schema.get('手机号码'):
  78. schema['手机号码'] = re.search(r'\W(1[\d]{10})\W', w).group(1) if re.search(r'\W(1[\d]{10})\W', w) else None
  79. # if not schema.get('籍贯'):
  80. # schema['籍贯'] = re.search(r'[籍贯::]{3,}(\w{2,5})', w).group(1) if re.search(r'[籍贯::]{3,}(\w{2,})', w) else None
  81. # if not schema.get('出生年月'):
  82. # schema['出生年月'] = re.search(r'\d{4}[./年\-]\d{1,2}[月]', w).group() if re.search(r'\d{4}[./年\-]\d{1,2}[月]', w) else None
  83. # if not schema.get('当前职位'):
  84. # schema['当前职位'] = re.search(r'[当前职位: ]{3,}(\w)+', w).group() if re.search(r'[当前职位: ]{3,}(\w)+', w) else None
  85. # if not schema.get('参加工作时间'):
  86. # schema['参加工作时间'] = re.search(r'[参加工作事件:]{3,}(\d{4}[./年\-]\d{1,2}[月])', w).group(1) if re.search(r'[参加工作事件:]{3,}(\d{4}[./年\-]\d{1,2}[月])', w) else None
  87. return {key:value for key, value in schema.items() if value}
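# Rough usage sketch (hypothetical input): the section lines are joined into one string and
# mined with "key:value" splits plus regex / NER fallbacks, e.g.
#   get_base_info(['姓名:张三 手机:13800000000 性别:男'])
# should yield a dict with keys such as '姓名', '性别' and '手机号码'; only non-empty fields
# are returned.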
88. # Job intention (done)
  89. def get_job_intention(lines):
  90. logging.info(lines)
  91. schema = {}
  92. for line in lines:
  93. regex = re.compile(r'\W{0,3}[::]\s+')
  94. line = regex.sub(':', line)
  95. for i in line.split():
  96. if ":" in i:
  97. try:
  98. key, val = i.split(":")
  99. schema[key] = val
  100. except Exception as e:
  101. logging.error(e)
  102. return schema
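# Sketch (hypothetical input): a line like '期望职位: 数据分析师 期望城市: 北京' is normalized
# to half-width colons, split on whitespace and then on ':', giving roughly
# {'期望职位': '数据分析师', '期望城市': '北京'}.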
103. # Education history (deprecated)
104. # NER + word segmentation (detects school, time and degree); the major has to be handled separately.
  105. def get_edu_list_old(lines):
  106. logging.info(lines)
  107. job_list = []
  108. job_dict = {'edu_time_beg':'', 'edu_time_end':'', 'edu_name':'','edu_leval':'','edu_domain':'', 'edu_statue':0}
  109. re_txt = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今|^\d{4,4}.\d{1,2}|19\d{2,2}.|20\d{2,2}.'
  110. re_txt_1 = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今'
  111. nums = []
  112. for i in range(len(lines)):
  113. if re.findall(re_txt, lines[i]):
  114. nums.append(i)
  115. nums.append(len(lines))
  116. edu_level = {'本科':18, "大专":17, "博士研究生":20, "学士":18, "博士":20, "硕士":19, "研究生":19, "博后":21, '博士后':21}
  117. year_dict = {18:4, 17:3,20:3,19:3,21:2}
  118. edu_dict = {18:'本科', 17:'大专',20:'博士研究生',19:'硕士',21:'博士后'}
  119. edu_list = []
  120. for i in range(1, len(nums[:])):
  121. job_dict = {'edu_time_beg':'', 'edu_time_end':'', 'edu_name':'','edu_leval':'','edu_domain':''}
  122. data_list = lines[nums[i-1]:nums[i]]
  123. if len(data_list) > 1 and data_list[1] and data_list[1][-1] == '|' and data_list[0][-1] != '|':
  124. data_list[0] = data_list[0] + data_list[1]
  125. data_list[1] = ''
  126. if len(data_list) > 2 and data_list[2] and data_list[2][-1] == '|' and data_list[0][-1] != '|' and '|' in str(data_list[0]) and data_list[1] and data_list[1][-1] != '|':
  127. data_list[0] = data_list[0] + data_list[1] + data_list[2]
  128. data_list[1] = ''
  129. data_list[2] = ''
  130. if '' in data_list:
  131. data_list.remove('')
  132. data_line = ' '.join(data_list)
  133. data_line = re.sub('[\|]', ' ', data_line)
  134. data_line = re.sub('-{3,}', '', data_line)
  135. ner_data = ner(''.join(data_list[:2]))
  136. org = ''
  137. time_list = []
  138. for jj in range(1, len(ner_data)):
  139. if ner_data[jj][1] == ner_data[jj-1][1]:
  140. ner_data[jj] = list(ner_data[jj])
  141. ner_data[jj][0] = ner_data[jj-1][0] + ner_data[jj][0]
  142. ner_data[jj-1] = ('','')
  143. for _ in ner_data:
  144. if _[1] == 'ORG' and not org:
  145. org = _[0].strip()
146. elif _[1] == 'TIME' and len(_[0]) >= 4:
  147. time_list.append(_[0])
  148. #TIME
  149. # print(data_line)
  150. _list_data = re.split('\040+',data_line)
  151. top_level = 18
  152. remove_list = []
  153. logging.info(_list_data)
  154. logging.info(time_list)
  155. for ii in range(len(_list_data)):
  156. for t in time_list:
  157. if t in _list_data[ii]:
  158. _list_data[ii] = ''
  159. break
  160. for i in range(len(_list_data)):
  161. #if org in _list_data[i]:
  162. # _list_data[i] = ''
  163. if re.findall('^\d{4,4}', _list_data[i]):
  164. _list_data[i] = ''
  165. _data = re.findall('本科|学士|硕士|博士研究生|博士后|博后|博士|研究生|大专', _list_data[i])
  166. if not _data:
  167. continue
  168. top_level = edu_level[_data[0]]
  169. _list_data[i] = ''
  170. break
  171. #remove_list.append(i)
  172. logging.info(_list_data)
  173. job_time = re.findall(re_txt_1, data_list[0])
  174. if job_time:
  175. job_dict['edu_time'] = job_time[0]
  176. else:
  177. job_dict['edu_time'] = ''
  178. _nums = re.findall('\d+', job_dict['edu_time'])
  179. if len(_nums) >= 4:
  180. job_dict['edu_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
  181. job_dict['edu_time_end'] = '%s-%02d'%(_nums[2], int(_nums[3]))
  182. job_dict['edu_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
  183. elif len(_nums) == 2:
  184. job_dict['edu_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
  185. job_dict['edu_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
  186. job_dict['edu_time_end'] = '%s'%('至今')
  187. elif len(time_list) == 2:
  188. nums_1 = re.findall('\d+', time_list[0])
  189. nums_2 = re.findall('\d+', time_list[1])
  190. nums_1.append('09')
  191. nums_2.append('07')
  192. job_dict['edu_time_beg'] = '%s-%02d'%(nums_1[0], int(nums_1[1]))
  193. try:
  194. job_dict['edu_time_end'] = '%s-%02d'%(nums_2[0], int(nums_2[1]))
  195. except:
  196. job_dict['edu_time_end'] = None
  197. try:
  198. job_dict['edu_time'] = '%s-%02d~%s-%02d'%(nums_1[0], int(nums_1[1]), nums_2[0], int(nums_2[1]))
  199. except:
  200. job_dict['edu_time'] = '%s-%02d~今'%(nums_1[0], int(nums_1[1]))
  201. elif len(time_list) == 1:
  202. _nums = re.findall('\d+', time_list[0])
  203. if '毕业' in data_list[0]:
  204. _nums.append('06')
  205. _nums.insert(0, '09')
  206. _nums.insert(0, str(int(_nums[1]) - year_dict[top_level]))
  207. job_dict['edu_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
  208. job_dict['edu_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
  209. job_dict['edu_time_end'] = '%s-%02d'%(_nums[2], int(_nums[3]))
  210. else:
  211. _nums.append('09')
  212. job_dict['edu_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
  213. job_dict['edu_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
  214. job_dict['edu_time_end'] = '%s'%('至今')
  215. job_dict['edu_leval'] = edu_dict[top_level]
  216. if org:
  217. job_dict['edu_name'] = org
  218. else:
  219. job_dict['edu_name'] = ''
  220. edu_domain = ''
  221. for i in range(len(_list_data)):
  222. if org in _list_data[i]:
  223. continue
224. if _list_data[i] and '专业' in _list_data[i]:
  225. edu_domain = _list_data[i]
  226. if not edu_domain:
  227. for i in range(len(_list_data)):
  228. if org in _list_data[i]:
  229. continue
  230. if _list_data[i] and len(_list_data[i]) >= 3:
  231. edu_domain = _list_data[i]
  232. break
  233. if not edu_domain:
  234. for i in range(len(_list_data)):
  235. if org in _list_data[i]:
  236. for j in range(i+1, len(_list_data)):
  237. if _list_data[i] and len(_list_data[j]) >= 2:
  238. edu_domain = _list_data[j]
  239. break
  240. break
  241. job_dict['edu_domain'] = edu_domain
  242. if len(job_list) ==0:
  243. job_list.append(job_dict)
  244. else:
  245. if job_dict in job_list:
  246. continue
  247. if not job_dict['edu_time']:
  248. continue
  249. if int(job_dict['edu_time'][:4]) > int(job_list[-1]['edu_time'][:4]):
  250. job_list = [job_dict] + job_list
  251. else:
  252. job_list.append(job_dict)
  253. continue
  254. data_list[0] = re.sub(job_time[0], '', data_list[0])
  255. _list = re.split('\|\040+', data_list[0])
  256. #print(_list)
  257. if len(_list) == 1:
  258. __list = re.split('\040+', data_list[0])
  259. job_dict['edu_name'] = __list[1].strip()
  260. job_dict['edu_domain'] = __list[2].strip()
  261. job_dict['edu_leval'] = __list[3].strip()
  262. else:
  263. #if job_dict['edu_leval'] not in
  264. if len(_list) > 3:
  265. job_dict['edu_name'] = _list[2].strip()
  266. job_dict['edu_domain'] = _list[3].strip()
  267. job_dict['edu_leval'] = _list[1].strip()
  268. else:
  269. job_dict['edu_leval'] = _list[0].strip()
  270. job_dict['edu_name'] = _list[1].strip()
  271. job_dict['edu_domain'] = _list[2].strip()
  272. if '硕士' in _list[0] or '研究生' in _list[0]:
  273. job_dict['edu_leval'] = '硕士'
  274. elif '博士' in _list[0]:
  275. job_dict['edu_leval'] = '博士'
  276. elif '本科' in _list[0]:
  277. job_dict['edu_leval'] = '本科'
  278. elif '学士' in _list[0]:
  279. job_dict['edu_leval'] = '本科'
  280. # print(job_dict)
  281. if len(job_list) ==0:
  282. job_list.append(job_dict)
  283. else:
  284. if job_dict in job_list:
  285. continue
  286. if int(job_dict['edu_time'][:4]) > int(job_list[-1]['edu_time'][:4]):
  287. job_list = [job_dict] + job_list
  288. else:
  289. job_list.append(job_dict)
  290. #edu_list.append(job_dict['edu_time'] + job_dict['edu_name'] + job_dict['edu_domain'] + job_dict['edu_leval'])
  291. #if job_list[0]['edu_leval'] not in ['硕士', '博士', '本科', '博后'] and len(job_list[0]['edu_leval']) > 5:
  292. # job_list[0]['edu_leval'] = '本科'
  293. return job_list
294. # Education history, revised (done)
  295. def get_edu_list(lines):
  296. logging.info(lines)
  297. edu_list = [{"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None}]
  298. regex_time = re.compile(r'((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?[\d]{0,2})[至到\W]+((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?)?([今])?|(\d{4})[至\W]+([\d今]{4})')
  299. regex_end = re.compile(r'毕业时间[\w\W]{0,5}(\d{4})[\W年]?(\d{0,2})[月\W]?')
  300. regex_level = re.compile(r'[大本专科硕博士研究生后]{2,}')
  301. regex_domain = re.compile(u'[\u4E00-\u9FA5]{2,10}', re.UNICODE)
  302. count = 0
  303. for line in lines:
  304. line = line.replace("学士","本科").replace("专业","").replace("学位","")
  305. for cell in re.split(r'[·\|\t]', line):
  306. if not cell.strip():
  307. continue
  308. flags = 0
  309. edu_time = regex_time.search(cell)
  310. edu_end_time = regex_end.search(cell)
  311. edu_level = regex_level.search(cell)
  312. edu_domain = regex_domain.search(cell)
313. # Standard "start ~ end" time format
314. if edu_time:
315. # commit the previous entry and open a new one
  316. if edu_list[count].get("Time") and edu_list[count].get("edu_name"):
  317. edu_list.append({"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None})
  318. count += 1
  319. edu_list[count]["startTime"] = '{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)))
320. # start and end given as year-month
  321. if edu_time.group(5) != None:
  322. edu_list[count]["endTime"] = '{:4d}-{:02d}'.format(int(edu_time.group(5)),int(edu_time.group(6)))
  323. edu_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)),int(edu_time.group(5)),int(edu_time.group(6)))
324. # year only
  325. elif edu_time.group(8) != None:
  326. edu_list[count]["Time"] = '{:4d}~{:4d}'.format(int(edu_time.group(8)),int(edu_time.group(9)))
  327. edu_list[count]["startTime"] = '{:4d}'.format(int(edu_time.group(8)))
  328. edu_list[count]["endTime"] = '{:4d}'.format(int(edu_time.group(9)))
329. # open-ended ("至今", i.e. to present)
  330. else:
  331. edu_list[count]["endTime"] = edu_time.group(7)
  332. edu_list[count]['Time'] = '{:4d}-{:02d}~{}'.format(int(edu_time.group(2)),int(edu_time.group(3)),edu_time.group(7))
  333. flags = 1
334. # Only a graduation date is given
335. elif edu_end_time:
336. # commit the previous entry and open a new one
  337. if edu_list[count].get("endTime") and edu_list[count].get("edu_name"):
  338. edu_list.append({"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None})
  339. count += 1
340. # year and month given: back-fill the start by assuming a 3-year programme
341. if edu_end_time.group(2):
342. edu_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(edu_end_time.group(1))-3,int(edu_end_time.group(2)),int(edu_end_time.group(1)),int(edu_end_time.group(2)))
343. edu_list[count]["endTime"] = '{:4d}-{:02d}'.format(int(edu_end_time.group(1)),int(edu_end_time.group(2)))
344. # year only
345. elif edu_end_time.group(1):
346. edu_list[count]["Time"] = '{:4d}~{:4d}'.format(int(edu_end_time.group(1))-3,int(edu_end_time.group(1)))
347. edu_list[count]["endTime"] = '{:4d}'.format(int(edu_end_time.group(1)))
348. # Degree
  349. if (not edu_list[count].get("edu_level")) and edu_level:
  350. edu_list[count]["edu_level"] = edu_level.group(0)
351. # WordTag recognition for school / major
  352. for word, tag in ner_tag(cell):
  353. if (not edu_list[count].get("edu_name")) and (tag == "组织机构类_教育组织机构"):
  354. edu_list[count]["edu_name"] = word.strip()
  355. flags = 1
  356. elif (not edu_list[count].get("edu_domain")) and (tag in "_术语类型"):
  357. edu_list[count]["edu_domain"] = word.strip()
  358. elif edu_list[count].get("edu_name") and edu_list[count].get("edu_domain"):
  359. break
360. # LAC recognition for the school
  361. else:
  362. for word, tag in ner(cell):
  363. if (tag == "ORG"):
  364. edu_list[count]["edu_name"] = word
  365. flags = 1
  366. break
367. # Fill the major from the plain regex when nothing else was recognized
  368. if (not (edu_level or flags or edu_list[count].get("edu_domain"))) and edu_domain:
  369. edu_list[count]["edu_domain"] = edu_domain.group(0)
370. # Drop the last entry when it has no time or no school
  371. if (not edu_list[-1].get("Time")) or (not edu_list[-1].get("edu_name")):
  372. edu_list.pop()
  373. return edu_list
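# Sketch (hypothetical line): '2015.09-2019.06 | 某某大学 | 计算机科学与技术 | 本科' is split on
# '|'; regex_time picks up the date range, the WordTag label '组织机构类_教育组织机构' supplies
# edu_name, regex_level supplies edu_level, and the entry comes out roughly as
# {'Time': '2015-09~2019-06', 'edu_name': '某某大学', 'edu_level': '本科', ...}.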
374. # Work experience (done)
375. # NER + segmentation: organization, person-role and time are used to tell the header apart from the job content.
376. # A time expression is one of the main markers that a new job entry starts; line length is also used.
377. # (time-like expressions and quantity words)
  378. def get_job_list(lines):
  379. logging.info(lines)
  380. job_list = []
  381. re_txt = '\d{4,4}\040{0,2}.\d+\040{0,2}.?\040{0,2}[\-–至-\—~]{1,2}\040{0,2}\d{4,4}\040{0,2}.\040{0,2}\d+.?|\d{4,4}.\d+.?\040{0,2}[\-–-—]{0,2}\040{0,2}至?今|\d{4,4}.\d+.?\040{0,2}[\-–-]{1,2}\040{0,2}现在|\d{4,4}年\d+月\-\d{4,4}年\d+月|\d{4,4}年\d+月\-\~|\d{4,4}年\d+月[\-\~-]至今|\d{4,4}-\d+\040{0,2}[-\~至]\040{0,2}\d{4,4}-\d+|\d{4,4}-\d+\~|\d{4,4}-\d+\[~-]至今|\d{4,4}-\d+\040{0,2}至今'
  382. nums = []
  383. for i in range(len(lines)):
  384. #print(lines[i])
  385. #print(lines[i], re.findall(re_txt, lines[i]), re.findall('\||\040{1,}', lines[i]))
  386. if re.findall(re_txt, lines[i].replace(' ', '')) and re.findall('\||\040{1,}', lines[i]):
  387. nums.append(i)
  388. continue
  389. if re.findall(re_txt, lines[i].replace(' ', '')[:20]):
  390. nums.append(i)
  391. continue
  392. if len(lines[i].strip().replace(' ', '')) > 50:
  393. continue
  394. year_list = re.findall('19\d{2,2}.\d{1,2}|20\d{2,2}.\d{1,2}', lines[i])
  395. if len(year_list) >= 2:
  396. nums.append(i)
  397. elif len(year_list) == 1 and '至今' in lines[i]:
  398. nums.append(i)
  399. nums.append(len(lines))
  400. # logging.info(nums)
  401. logging.info('get_job_list :{}'.format(nums))
  402. for i in range(1, len(nums[:])):
  403. job_dict = {'job_time':'', 'job_leval':'','job_company':'','job_content':''}
  404. data_list = lines[nums[i-1]:nums[i]]
  405. if '' in data_list:
  406. data_list.remove('')
  407. org = ''
  408. person_professor_list = []
  409. org_index = -1
  410. end_index = 3
  411. job_time = re.findall(re_txt, data_list[0])
  412. if not job_time:
  413. year_list = re.findall('19\d{2,2}.\d{1,2}|20\d{2,2}.\d{1,2}', data_list[0])
  414. if len(year_list) >= 2:
  415. job_time = ['-'.join(year_list)]
  416. elif len(year_list) == 1 and '至今' in lines[i]:
  417. job_time = [year_list[0] + '-' + '至今']
  418. if not job_time:
  419. regex = re.compile(r'((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)[至到\W]+((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)?([今])?')
  420. job_time = [re.search(regex, data_list[0]).group(0)]
  421. job_dict['job_time'] = job_time[0]
  422. _nums = re.findall('\d+', job_dict['job_time'])
  423. #print(_nums)
  424. if len(_nums) >= 4:
  425. job_dict['job_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
  426. elif len(_nums) == 2:
  427. job_dict['job_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
  428. data_list[0] = re.sub(job_time[0], '', data_list[0])
  429. data_list[0] = data_list[0].strip()
  430. ner_list = []
  431. for i in range(len(data_list[:3])):
  432. if '工作' in data_list[i][:4] and (re.findall(':|\:', data_list[i])):
  433. end_index = i
  434. break
  435. if not re.findall('\040|\||/', data_list[i]) and org:
  436. end_index = i
  437. break
  438. if len(data_list[i]) > 80:
  439. end_index = i
  440. break
  441. if data_list[i]:
  442. ner_data = ner_tag(data_list[i].strip())
  443. else:
  444. continue
  445. ner_list.append(ner_data)
  446. for x in ner_data:
  447. if x[1] == '人物类_概念' and len(x[0]) > 2:
  448. person_professor_list.append(x[0].strip())
  449. elif x[1] == '组织机构类_企事业单位' or x[1] == '组织机构类_教育组织机构':
  450. if not org:
  451. org = re.split('\040|\|/', x[0].strip())[0]
  452. org_index = i
  453. if not org:
  454. for i in range(len(ner_list)):
  455. ner_data = ner_list[i]
  456. for x in ner_data:
  457. if x[1] == '组织机构类':
  458. org = re.split('\040|\|/', x[0].strip())[0]
  459. break
  460. if not person_professor_list:
  461. for i in range(len(ner_list)):
  462. ner_data = ner_list[i]
  463. for x in ner_data:
  464. if x[1] == '人物类_概念':
  465. person_professor_list = [re.split('\040|\|/', x[0].strip())[0]]
  466. break
  467. data_line = ' '.join(data_list[:end_index])
  468. data_line = re.sub('\||/', ' ', data_line)
  469. _list_data = re.split('\040+',data_line)
  470. if len(_list_data) == 1:
  471. end_index = 0
  472. if not person_professor_list:
  473. for x in range(len(_list_data)):
  474. if re.findall('经理|工程师|会计|董事长|总监|秘书|主管|处长|局长|主任|讲师|教授', _list_data[x][-4:]):
  475. person_professor_list.append(_list_data[x])
  476. if not org:
  477. for x in range(len(_list_data)):
  478. if len(_list_data[x]) < 4:
  479. _list_data[x] = ''
  480. elif person_professor_list and re.findall('|'.join(person_professor_list), _list_data[x]):
  481. _list_data[x] = ''
  482. elif '经理' == _list_data[x][-2:]:
  483. _list_data[x] = ''
  484. for x in range(len(_list_data)):
  485. if _list_data[x]:
  486. org = _list_data[x]
  487. break
  488. if not person_professor_list:
  489. for x in range(len(_list_data)):
  490. if org in _list_data[x]:
  491. for j in range(x+1, len(_list_data)):
  492. if _list_data[j]:
  493. person_professor_list = [_list_data[j]]
  494. break
  495. break
  496. #print(org, person_professor_list, job_time)
  497. job_dict['job_company'] = org
  498. job_dict['job_leval'] = ' '.join(person_professor_list)
  499. job_dict['job_content'] = re.sub('工工作作内内容容::|工工作作内内容容::|工工作作内内容容', '工作内容:', ''.join(data_list[end_index:]))
  500. job_dict['job_content'] = re.sub('/', '-', job_dict['job_content'])
  501. job_list.append(job_dict)
  502. continue
  503. if len(data_list) > 1 and data_list[1] and data_list[1][-1] == '|':# and data_list[0] and data_list[0][-1] != '|':
  504. data_list[0] = data_list[0] + data_list[1]
  505. data_list[1] = ''
  506. elif len(data_list) > 2 and data_list[2] and data_list[2][-1] == '|' and data_list[0][-1] != '|' and '|' in str(data_list[0]) and data_list[1] and data_list[1][-1] != '|':
  507. data_list[0] = data_list[0] + data_list[1] + data_list[2]
  508. data_list[1] = ''
  509. data_list[2] = ''
510. elif len(data_list) > 2 and data_list[1] and '工作职责:' in data_list[2]:
  511. data_list[0] = data_list[0] + data_list[1]
  512. data_list[1] = ''
513. elif len(data_list) > 3 and '工作职责:' in data_list[3]:
  514. data_list[0] = data_list[0] + data_list[1] + data_list[2]
  515. data_list[1] = ''
  516. data_list[2] = ''
  517. job_time = re.findall(re_txt, data_list[0])
  518. job_dict['job_time'] = job_time[0]
  519. _nums = re.findall('\d+', job_dict['job_time'])
  520. #print(_nums)
  521. if len(_nums) >= 4:
  522. job_dict['job_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
  523. elif len(_nums) == 2:
  524. job_dict['job_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
  525. data_list[0] = re.sub(job_time[0], '', data_list[0])
  526. data_list[0] = data_list[0].strip()
  527. data_list[0] = re.sub('历任:', ' ', data_list[0])
  528. _list = data_list[0].split('|')
  529. if len(_list) == 1:
  530. __list = re.split('\040{2,}', data_list[0])
  531. #print(__list)
  532. job_dict['job_leval'] = __list[1].strip()
  533. job_dict['job_company'] = __list[0].strip()
  534. else:
  535. job_dict['job_leval'] = _list[0].strip()
  536. job_dict['job_company'] = _list[1].strip()
  537. if '职级:' in data_list[1:]:
  538. data_list.remove('职级:')
  539. job_dict['job_content'] = re.sub('工工作作内内容容::|工工作作内内容容::|工工作作内内容容', '工作内容:', ''.join(data_list[1:]))
  540. job_dict['job_content'] = re.sub('/', '-', job_dict['job_content'])
  541. #print(job_dict)
  542. job_list.append(job_dict)
  543. return job_list
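# Sketch: lines are first grouped into one job per date-range line (re_txt plus the year
# heuristics), the first lines of each group are NER-tagged to pull the company
# ('组织机构类_企事业单位' / '教育组织机构') and the title ('人物类_概念'), and everything from
# end_index onwards is kept as job_content.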
544. # Project experience (done)
545. # The project name may be missing
  546. def get_pro_list(lines):
  547. logging.info(lines)
  548. pro_list = [{"Time":None,"startTime":None,"endTime":None,"pro_name":None,"job_leval":None,"job_company":None,"content":None,},]
  549. regex = re.compile(r'((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)[至到\W]+((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)?([今])?')
  550. re_con = re.compile(r'负责内容(.*?)')
  551. re_na = re.compile(r'\W(.*?项目)\W')
  552. count = 0
  553. for line in lines:
  554. regex_time = regex.search(line)
  555. regex_content = re_con.search(line)
  556. regex_name = re_na.search(line)
  557. if regex_time:
  558. if pro_list[count].get("Time"):
  559. pro_list.append({"Time":None,"startTime":None,"endTime":None,"pro_name":None,"job_leval":None,"job_company":None,"content":None,})
  560. count += 1
  561. pro_list[count]["startTime"] = '{:4d}-{:02d}'.format(int(regex_time.group(2)),int(regex_time.group(3)))
  562. if regex_time.group(5) != None:
  563. pro_list[count]["endTime"] = '{:4d}-{:02d}'.format(int(regex_time.group(5)),int(regex_time.group(6)))
  564. pro_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(regex_time.group(2)),int(regex_time.group(3)),int(regex_time.group(5)),int(regex_time.group(6)))
  565. else:
  566. pro_list[count]["endTime"] = regex_time.group(7)
  567. pro_list[count]['Time'] = '{:4d}-{:02d}~{}'.format(int(regex_time.group(2)),int(regex_time.group(3)),regex_time.group(7))
568. elif regex_name and (not pro_list[count].get("pro_name")):
  569. pro_list[count]["pro_name"] = regex_name.group()
  570. elif pro_list[count].get("content"):
  571. pro_list[count]["content"] += line
  572. else:
  573. try:
  574. for word, tag in ner_tag(line):
  575. if (not pro_list[count].get("job_leval")) and (tag == "人物类_概念"):
  576. pro_list[count]["job_leval"] = word
  577. if (not pro_list[count].get("job_company")) and (tag in "组织机构类_企事业单位"):
  578. pro_list[count]["job_company"] = word
  579. except Exception as e:
  580. logging.error(e)
  581. pro_list[count]["content"] = line
  582. return pro_list
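# Sketch: a new project entry opens at each date-range line; a '……项目' mention surrounded by
# punctuation becomes pro_name, WordTag fills job_leval / job_company, and remaining lines are
# appended to content.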
583. # Training history (done)
584. # NER + segmentation (organization name), training program, time
  585. def get_cultivate_list(lines):
  586. logging.info(lines)
  587. job_list = []
  588. re_txt = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今|^\d{4,4}.\d{1,2}|\d{4,4}.'
  589. re_txt_1 = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今'
  590. nums = []
  591. for i in range(len(lines)):
  592. if re.findall(re_txt, lines[i].replace(' ', '')) and re.findall('\||\040{1,}', lines[i]):
  593. nums.append(i)
  594. continue
  595. if re.findall(re_txt, lines[i].replace(' ', '')[:20]):
  596. nums.append(i)
  597. if len(lines[i].strip().replace(' ', '')) > 50:
  598. continue
  599. nums.append(len(lines))
  600. year_dict = {18:4, 17:3,20:3,19:3,21:2,22:1}
  601. for i in range(1, len(nums[:])):
  602. job_dict = {'cultivate_time':'', 'cultivate_time_beg':'', 'cultivate_time_end':'', 'cultivate_name':'','cultivate_leval':'','cultivate_content':''}
  603. data_list = lines[nums[i-1]:nums[i]]
  604. data_line = ' '.join(data_list)
  605. data_line = re.sub('[\|\t]', ' ', data_line)
  606. data_line = re.sub('-{3,}', '', data_line)
  607. ner_data = ner(''.join(data_list[:2]))
  608. org = ''
  609. time_list = []
  610. for _ in ner_data:
  611. if _[1] == 'ORG' and not org:
  612. org = _[0].strip()
613. elif _[1] == 'TIME' and len(_[0]) >= 4:
  614. time_list.append(_[0])
  615. #TIME
  616. logging.info(data_line)
  617. _list_data = re.split('\040+', data_line)
  618. top_level = 22
  619. end_index = 0
  620. remove_list = []
  621. if len(_list_data) <= 2:
  622. end_index = 0
  623. #continue
  624. job_time = re.findall(re_txt_1, data_list[0])
  625. if job_time:
  626. job_dict['cultivate_time'] = job_time[0]
  627. data_list[0] = re.sub(job_time[0], '', data_list[0])
  628. else:
  629. job_dict['cultivate_time'] = ''
  630. for t in time_list:
  631. data_list[0] = re.sub(t, '', data_list[0])
  632. _list = data_list[0].split('|')
  633. if len(_list) >= 2:
  634. job_dict['cultivate_name'] = _list[0].strip()
  635. job_dict['cultivate_leval'] = _list[1].strip()
  636. end_index = 1
  637. _nums = re.findall('\d+', job_dict['cultivate_time'])
  638. if len(_nums) >= 4:
  639. job_dict['cultivate_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
  640. job_dict['cultivate_time_end'] = '%s-%02d'%(_nums[2], int(_nums[3]))
  641. job_dict['cultivate_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
  642. elif len(_nums) == 2:
  643. job_dict['cultivate_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
  644. job_dict['cultivate_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
  645. job_dict['cultivate_time_end'] = '%s'%('至今')
  646. elif len(time_list) == 2:
  647. nums_1 = re.findall('\d+', time_list[0])
  648. nums_2 = re.findall('\d+', time_list[1])
  649. nums_1.append('09')
  650. nums_2.append('07')
  651. job_dict['cultivate_time_beg'] = '%s-%02d'%(nums_1[0], int(nums_1[1]))
  652. job_dict['cultivate_time_end'] = '%s-%02d'%(nums_2[0], int(nums_2[1]))
  653. job_dict['cultivate_time'] = '%s-%02d~%s-%02d'%(nums_1[0], int(nums_1[1]), nums_2[0], int(nums_2[1]))
  654. elif len(time_list) == 1:
  655. _nums = re.findall('\d+', time_list[0])
  656. if '获得' in data_list[0]:
  657. _nums.append('01')
  658. _nums.insert(0, '01')
  659. _nums.insert(0, str(int(_nums[1]) - year_dict[top_level]))
  660. job_dict['cultivate_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
  661. job_dict['cultivate_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
  662. job_dict['cultivate_time_end'] = '%s-%02d'%(_nums[2], int(_nums[3]))
  663. else:
  664. _nums.append('01')
  665. job_dict['cultivate_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
  666. job_dict['cultivate_time_beg'] = '%s-%02d'%(_nums[0], int(_nums[1]))
  667. job_dict['cultivate_time_end'] = '%s'%('至今')
  668. job_dict['cultivate_content'] = re.sub('培培训训内内容容::|培培训训内内容容::|培培训训内内容容', '培训内容:', ''.join(data_list[end_index:]))
  669. if not job_dict['cultivate_name']:
  670. job_dict['cultivate_name'] = org
  671. logging.info(job_dict)
  672. job_list.append(job_dict)
  673. continue
  674. '''
  675. #print(nums)
  676. for i in range(1, len(nums[:])):
  677. job_dict = {'cultivate_time':'', 'cultivate_name':'','cultivate_leval':'','cultivate_content':''}
  678. data_list = lines[nums[i-1]:nums[i]]
  679. if '' in data_list:
  680. data_list.remove('')
  681. if len(data_list) > 1 and data_list[1] and data_list[1][-1] == '|' and data_list[0][-1] != '|':
  682. data_list[0] = data_list[0] + data_list[1]
  683. data_list[1] = ''
  684. job_time = re.findall(re_txt_1, data_list[0])
  685. job_dict['cultivate_time'] = job_time[0]
  686. _nums = re.findall('\d+', job_dict['cultivate_time'])
  687. if len(_nums) >= 4:
  688. job_dict['cultivate_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
  689. elif len(_nums) == 2:
  690. job_dict['cultivate_time'] = '%s-%02d~%s'%(_nums[0], int(_nums[1]), '至今')
  691. data_list[0] = re.sub(job_time[0], '', data_list[0])
  692. _list = data_list[0].split('|')
  693. if len(_list) >= 2:
  694. job_dict['cultivate_name'] = _list[0].strip()
  695. job_dict['cultivate_leval'] = _list[1].strip()
  696. job_dict['cultivate_content'] = re.sub('培培训训内内容容|培培训训内内容容::|培培训训内内容容::', '培训内容:', ''.join(data_list[1:]))
  697. else:
  698. job_dict['cultivate_content'] = re.sub('培培训训内内容容|培培训训内内容容::|培培训训内内容容::', '培训内容:', ''.join(data_list[0:]))
  699. #print(job_dict)
  700. '''
  701. return job_list
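# Sketch: the same date-range grouping approach as get_job_list; a '机构 | 课程' style first
# line fills cultivate_name / cultivate_leval, otherwise the LAC 'ORG' entity is used as the name.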
702. # Language skills
  703. def get_lag_list(lines):
  704. logging.info(lines)
  705. job_list = []
  706. re_lan = re.compile(r'(\w+[语话])')
707. # append one fresh dict per detected language; reusing a single dict object
708. # would leave every appended entry pointing at the same (last) value
709. for l in lines:
710. if not l.strip():
711. continue
712. lag_name = re.search(re_lan, l)
713. if lag_name and lag_name.group(1):
714. # the proficiency level (lag_leval) is not extracted yet
715. job_list.append({'lag_name': lag_name.group(1), 'lag_leval': ''})
716. return job_list
717. # Family members
  718. def get_fam_list(lines):
  719. job_list = []
  720. fam_dict = {}
  721. for l in lines:
  722. if not l.strip():
  723. continue
  724. ls = l.split('|')
  725. if len(ls) == 1:
  726. continue
  727. fam_dict = {'fam_name':"",'fam_company':"",'fam_lable':"","fam_status":"", 'fam_job':""}
  728. fam_dict["fam_lable"] = ls[0].strip()
  729. fam_dict["fam_name"] = ls[1].strip()
  730. flag = 0
  731. if re.findall('\d岁|\d{4,5}', ls[2]):
  732. flag = 1
  733. fam_dict["fam_company"] = ls[flag+2].strip()
  734. fam_dict["fam_job"] = ls[flag+3].strip()
  735. fam_dict["fam_status"] = ls[flag+4].strip()
  736. #print(fam_dict)
  737. job_list.append(fam_dict)
  738. return job_list
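# Sketch (hypothetical row): '父亲 | 张某 | 55岁 | 某某公司 | 职员 | 健在' fills fam_lable and
# fam_name, and because the third cell looks like an age the company / job / status are read
# from the following cells.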
739. # Certificates: time + certificate name (done)
  740. def get_cet_list(lines):
  741. logging.info(lines)
  742. job_list = []
  743. re_txt = '\d+年\d+月|\d+-\d+|\d+\.\d+'
  744. lines_word = ' '.join(lines)
  745. lines = re.findall('\d+年\d+月|\d+-\d+|\d+\.\d+', lines_word)
  746. nums = []
  747. for x in range(len(lines) - 1):
  748. _index = lines_word.index(lines[x])
  749. _end_index = lines_word.index(lines[x+1])
  750. l = lines_word[_index : _end_index]
  751. if not l.strip():
  752. continue
  753. lines_word = lines_word[_end_index:]
  754. job_time = re.findall(re_txt, l)
  755. cet_dict = {'cet_name':'','cet_time':""}
  756. if job_time:
757. cet_dict['cet_time'] = job_time[0]
  758. l = re.sub(job_time[0], '', l)
  759. else:
  760. continue
  761. ls = re.split('\||\040+|\t+', l)
  762. logging.info(ls)
  763. for l in ls:
  764. if len(l) <= 3:
  765. continue
766. cet_dict['cet_name'] = l.strip()
  767. break
  768. job_list.append(cet_dict)
  769. return job_list
770. # Awards: time + award name (done)
  771. def get_prize_list(lines):
  772. logging.info(lines)
  773. job_list = []
  774. re_txt = '\d+年\d+月|\d+-\d+|\d{4,4}.\d{1,2}'
  775. lines_word = ' '.join(lines)
  776. lines = re.findall('\d+年\d+月|\d{4,4}-\d+|\d{4,4}.\d{1,2}', lines_word)
  777. nums = []
  778. for x in range(len(lines) - 1):
  779. _index = lines_word.index(lines[x])
  780. _end_index = lines_word.index(lines[x+1])
  781. l = lines_word[_index : _end_index]
  782. if not l.strip():
  783. continue
  784. lines_word = lines_word[_end_index:]
  785. job_time = re.findall(re_txt, l)
  786. cet_dict = {'prize_name':'','prize_time':""}
  787. if job_time:
  788. cet_dict['prize_time'] = job_time[0]
  789. l = re.sub(job_time[0], '', l)
  790. else:
  791. continue
  792. ls = re.split('\||\040+|\t+', l)
  793. logging.info(ls)
  794. for l in ls:
  795. if len(l) <= 3:
  796. continue
  797. cet_dict['prize_name'] = l.strip()
  798. break
  799. logging.info(cet_dict)
  800. job_list.append(cet_dict)
  801. return job_list
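# Sketch: get_cet_list and get_prize_list both slice the joined text between consecutive date
# matches and keep the first token longer than 3 characters as the certificate / award name.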
802. # .doc handling on Linux
803. def doc2pdf_linux(docPath, pdfPath):
804. """
805. Accepted document formats: doc, docx.
806. Works only on Linux.
807. LibreOffice must already be installed.
808. """
809. # the libreoffice binary name in cmd has to match the one installed on the system
810. cmd = 'libreoffice --headless --convert-to pdf'.split() + [docPath] + ['--outdir'] + [pdfPath]
811. # cmd = 'libreoffice6.2 --headless --convert-to pdf'.split() + [docPath]
812. p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
813. # wait up to 30 seconds for the conversion to finish
814. stdout, stderr = p.communicate(timeout=30)
  815. if stderr:
  816. raise subprocess.SubprocessError(stderr)
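# Example (hypothetical paths): doc2pdf_linux('/data/resume.doc', '/data/out') runs
#   libreoffice --headless --convert-to pdf /data/resume.doc --outdir /data/out
# and raises SubprocessError if LibreOffice writes anything to stderr.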
817. # .doc handling on Win32
818. def doc2pdf(docPath, pdfPath, system):
819. """
820. Use absolute paths.
821. pdfPath is the output directory only, without a file name.
822. """
  823. docPathTrue = os.path.abspath(docPath) # bugfix - searching files in windows/system32
  824. if system == "Linux":
  825. return doc2pdf_linux(docPathTrue, pdfPath)
826. # Plain-text (.txt) parsing (done)
  827. def parse_txt(path):
  828. with open(path, 'r', encoding='utf-8') as fp:
  829. data = fp.read()
  830. global block, block_rev
  831. chun = 1
  832. page = {1: []}
  833. if len(data.split("\n")) <= 2:
  834. for line in data.split("\n"):
  835. line = line.replace("\xa0", "").replace("【","").replace("】","").replace("教育/培训","教育经历")
  836. for word in line.split():
  837. if word in block.keys():
  838. chun = block[word]
  839. page[chun] = []
  840. elif word:
  841. page[chun].append(word)
  842. else:
  843. for line in data.split("\n"):
  844. line = line.replace("\xa0", "").replace("【","").replace("】","").replace("教育/培训","教育经历")
  845. regex = re.compile(u'[\u3000]+',re.UNICODE)
  846. line = regex.sub('', line)
  847. if line in block.keys():
  848. chun = block[line]
  849. page[chun] = []
  850. elif line:
  851. page[chun].append(line)
  852. result_data = []
  853. for key in page.keys():
  854. for index, func in zip([1, 2, 3, 4, 5, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
  855. if key == index:
  856. result_data.append({block_rev[index]:func(page[index])})
  857. console.print(result_data)
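# Sketch: parse_txt('resume.txt') splits the file into sections on the heading words in
# `block`, then dispatches each collected section to its extractor and prints the results.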
858. # Plain-text Word parsing
  859. def read_from_word(doc):
  860. para_text = []
  861. for para in doc.paragraphs:
  862. para_text.append(para.text)
  863. global block, block_rev
  864. chun = 1
  865. page = {1: []}
  866. for line in para_text:
  867. regex = re.compile(u'[\uF000-\uF0FF]+',re.UNICODE)
  868. line = regex.sub('', line)
  869. if line in block.keys():
  870. chun = block[line]
  871. page[chun] = []
  872. elif line:
  873. page[chun].append(line)
  874. result_data = []
  875. for key in page.keys():
  876. for index, func in zip([1, 2, 3, 4, 5, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_cet_list, get_prize_list, get_cultivate_list]):
  877. if key == index:
  878. result_data.append({block_rev[index]:func(page[index])})
  879. console.print(result_data)
880. # Extract Word tables (done)
  881. def check_word(path):
  882. doc = Document(path)
  883. tables = doc.tables
  884. if not tables:
  885. logging.info("this is raw text")
886. return read_from_word(doc)
  887. logging.info("this is a Table")
  888. prk = {"姓名":1, "性别":1, "出生年月":1, "民族":1, "籍贯":1, "户籍地":1, "政治面貌":1, "参加工作时间":1, "健康状况":1, "专业技术资格":1, "外语水平":9, "熟悉专业有何专长":8, "学历学位":1, "工作单位":1, "现任职务":1, "任职时间":1, "提职时间":1, "联系电话":1, "邮箱地址":1, "称谓":13, "工作单位及职务":1, "毕业时间、院校及专业":3,}
  889. block = {
  890. "个人信息":1, "基本信息":1, "个人简历":1, "基基本本信信息息":1, "基本信息基本信息":1, "基本信息文本内容":1,
  891. "求职意向":2, "求职意向求职意向":2, "期望工作文本内容":2,
  892. "教育背景":3, "教育经历":3, "教教育育经经历历":3, "教育经历教育经历":3, "教育经历文本内容":3, "学历学位":3,
  893. "工作经验":4, "主要工作内容与职责":4, "工作方面":4, "实习经历":4, "工作经历":4, "工工作作经经历历":4, "工作经历工作经历":4, "工作经历文本内容":4,
  894. "项目经历":5, "项目经验":5, "科研项目经历":5, "项项目目经经历历":5, "项目经历项目经历":5, "研究生参与代表性项目":5, "项目经历文本内容":5,
  895. "专业技能":6, "个人技能":6, "专业/外语技能":6, "技能素质":6, "个人技能文本内容":6,
  896. "自我评价":7, "个人简介":7, "个人评价":7, "自我描述":7, "自自我我评评价价":7, "自我评价自我评价":7, "自我评价文本内容":7,
  897. "兴趣爱好":8, "兴趣爱好文本内容":8,
  898. "语言及方言":9, "语言能力":9, "英语能力":9, "语语言言能能力力":9, "语言能力语言能力":9, "语言技能文本内容":9,
  899. "证书":10, "所获证书文本内容":10,
  900. "获得奖励":11, "获奖经历":11, "获奖情况":11, "获获奖奖经经历历":11, "获奖经历获奖经历":11, "获奖情况及社会活动":11, "校内奖励":11, "校内活动&奖励":11, "所获奖励文本内容":11,"奖惩情况":11,
  901. "培训":12, "培训经历":12, "培培训训经经历历":12, "培训经历文本内容":12,
  902. "家庭成员":13, "家家庭庭成成员员":13, "家庭成员家庭成员":13, "主要家庭成员及社会关系":13,
  903. "社会活动":"other", "实践经验":"other", "社会活动及社会实践":"other", "近三年年度考核结果":"other", "其他意愿":"other",
  904. }
  905. chun = 1
  906. page = {1: []}
  907. regex = re.compile(r'(\(\w{2,8}\))?((\w{2,8}))?')
  908. for table in tables:
909. lo = {} # holds each row's de-duplicated cell texts
  910. for row in range(0, len(table.rows)):
  911. row_list = []
912. for col in range(0, len(table.row_cells(row))): # pull every column cell of this row
913. row_list.append(regex.sub("", table.cell(row,col).text.replace(" ","").replace(":", ":").replace("学历\n学位","学历学位"))) # strip special characters and append to the temporary list
914. lo[row] = (sorted(set(row_list), key=row_list.index)) # de-duplicate the list while keeping the original order
915. # drop empty cells
  916. for key in lo.keys():
  917. if "" in lo[key]:
  918. lo[key].remove("")
  919. for _, line in lo.items():
  920. if (line[0] in block.keys()) or (line[0] in prk.keys()):
921. # the row starts with a top-level section heading
  922. if line[0] in block.keys():
923. # switch to that section
  924. chun = block[line[0]]
  925. if not page.get(chun):
  926. page[chun] = []
927. # drop the heading itself
  928. line = '\n'.join(line[1:])
929. # the row starts with a field-level key
  930. elif line[0] in prk.keys():
931. # switch to that section
  932. chun = prk[line[0]]
  933. if not page.get(chun):
  934. page[chun] = []
935. # keep the key in the text
  936. line = '\n'.join(line)
  937. else:
  938. line = '\n'.join(line)
939. # normalize field-level keys into "key:" form
  940. for k in prk.keys():
  941. line = line.replace(k+"\n", k+":")
  942. page[chun].extend(line.split())
  943. result_data = []
  944. for key in page.keys():
  945. for index, func in zip([1, 2, 3, 4, 5, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_cet_list, get_prize_list, get_cultivate_list]):
  946. if key == index:
  947. result_data.append({block_rev[index]:func(page[index])})
  948. console.print(result_data)
949. # PDF sentence parsing (done)
  950. def parse_line_layout(layout):
951. texts = []
952. """Parse the page content line by line."""
953. # bbox:
954. # x0: distance from the left of the page to the left edge of the box
955. # y0: distance from the bottom of the page to the lower edge of the box
956. # x1: distance from the left of the page to the right edge of the box
957. # y1: distance from the bottom of the page to the upper edge of the box
  958. for textbox in layout:
  959. if isinstance(textbox, LTTextBox) or isinstance(textbox, LTTextLine):
  960. for char in textbox:
  961. if isinstance(char, LTTextLineHorizontal):
  962. texts.append([char.bbox[0], char.bbox[3], char.get_text().strip()])
963. # sort the lines top-to-bottom
  964. texts.sort(key=lambda x:-x[1])
  965. global block, block_rev
  966. chun = 1
  967. page = {1: []}
  968. for _, _, line in texts:
  969. regex = re.compile(u'[\uF000-\uF0FF]+',re.UNICODE)
  970. line = regex.sub('', line)
  971. if line in block.keys():
  972. chun = block[line]
  973. page[chun] = []
  974. elif line:
  975. page[chun].append(line)
  976. return page
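# Sketch: parse_line_layout returns {section_id: [line, ...]} for a single LTPage, with the
# lines ordered top-to-bottom by their bbox y coordinate before being assigned to sections.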
977. # PDF text-layout parsing (done)
  978. def read_from_pdf(path):
  979. result = {}
  980. with open(path, 'rb') as in_file:
981. parser = PDFParser(in_file) # create a PDF parser from the file object
982. doc: PDFDocument = PDFDocument(parser) # create the PDF document
983. rsrcmgr = PDFResourceManager() # PDF resource manager, shares resources
984. # create a PDF device object
985. laparams = LAParams()
986. device = PDFPageAggregator(rsrcmgr, laparams=laparams)
987. # create a PDF interpreter object
988. interpreter = PDFPageInterpreter(rsrcmgr, device)
989. # walk the document page by page; PDFPage.create_pages(doc) yields the pages,
990. # and each page is run through the interpreter
991. # the layout returned by the aggregator is an LTPage holding the objects
992. # parsed from that page, typically LTTextBox, LTFigure, LTImage,
993. # LTTextBoxHorizontal and so on; the text is read from each object's text attribute
994. # each layout is then grouped into sections by parse_line_layout
  996. for page in PDFPage.create_pages(doc):
997. logging.info('================ new page ================')
  998. interpreter.process_page(page)
  999. layout = device.get_result()
  1000. r = parse_line_layout(layout)
  1001. for key in r.keys():
  1002. if result.get(key):
  1003. result[key].extend(r[key])
  1004. else:
  1005. result[key] = r[key]
  1006. block_rev = {1:"基本信息",2:"求职意向",3:"教育经历",4:"工作经历",5:"项目经历",6:"专业技能",7:"自我评价",8:"兴趣爱好",9:"语言能力",10:"证书",11:"获奖情况",12:"培训经历",13:"家庭成员","other":"其他"}
  1007. result_data = []
  1008. for key in result.keys():
  1009. for index, func in zip([1, 2, 3, 4, 5, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
  1010. if key == index:
  1011. result_data.append({block_rev[index]: func(result[index])})
  1012. console.print(result_data)
1013. # PDF table parsing
  1014. def parse_table_from_pdf(path):
  1015. global block, block_rev
  1016. result = {}
  1017. with pdfplumber.open(path) as pdf:
  1018. for page in pdf.pages:
  1019. key = None
  1020. for table in page.extract_tables():
  1021. for line in table:
  1022. for word in line:
  1023. if not key:
  1024. key = word
  1025. else:
  1026. result[key] = word
  1027. key = None
  1028. for key in block.keys():
  1029. if result.get(key):
  1030. logging.info({key: result[key]})
  1031. console.print(result)
  1032. # for key in result.keys():
  1033. # for index, func in zip([1, 2, 3, 4, 5, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_cet_list, get_prize_list, get_cultivate_list]):
  1034. # if (key in block.keys()) and (block[key] == index):
  1035. # console.print(block_rev[index])
  1036. # try:
  1037. # console.print(func(result[index]), justify="left")
  1038. # except Exception as e:
  1039. # logging.error(e)
  1040. # break
  1041. # else:
  1042. # console.print({key: result[key]})
  1043. # break
  1044. return None
1045. # Detect PDF content types (done)
  1046. def check_pdf(path):
  1047. """
  1048. # 输入:
  1049. # pdf 文件路径
  1050. # 输出:
  1051. # 文件包含元素 [Word, Table]
  1052. """
  1053. rst = []
  1054. for page_layout in extract_pages(path):
  1055. for element in page_layout:
  1056. if isinstance(element, LTFigure):
  1057. for cell in element:
  1058. if isinstance(cell, LTChar):
  1059. rst.append("Table")
  1060. break
  1061. elif isinstance(element, LTTextContainer):
  1062. rst.append("Word")
  1063. return set(rst)
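# Example (hypothetical path): check_pdf('resume.pdf') returns {'Word'} for a plain text
# resume, or {'Word', 'Table'} when characters are found inside LTFigure elements (treated
# here as table-like content).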
1064. # Detect the input format (done)
  1065. def detection_type(path, system):
1066. # a directory was passed in
  1067. if os.path.isdir(path):
  1068. for filename in os.listdir(path):
  1069. filename = os.path.join(path, filename)
1070. # .doc file
  1071. logging.info(filename)
  1072. if filename.endswith('.doc') and not filename.startswith('.~'):
  1073. doc2pdf(docPath = filename, pdfPath = './', system=system)
1074. # .docx file
  1075. elif os.path.isfile(filename) and filename.endswith('.docx'):
  1076. check_word(filename)
1077. # .pdf file
  1078. if os.path.isfile(filename) and filename.endswith('.pdf'):
  1079. rst = check_pdf(filename)
  1080. if "Table" in rst:
  1081. parse_table_from_pdf(filename)
  1082. pass
  1083. if "Word" in rst:
  1084. read_from_pdf(filename)
1085. # .txt file
  1086. elif os.path.isfile(filename) and filename.endswith('.txt'):
  1087. parse_txt(filename)
1088. # .doc file
  1089. elif os.path.isfile(path) and path.endswith('.doc'):
  1090. doc2pdf(docPath = path, pdfPath = './', system=system)
1091. # .docx file
  1092. elif os.path.isfile(path) and path.endswith('.docx'):
  1093. check_word(path)
1094. # .pdf file
  1095. elif os.path.isfile(path) and path.endswith('.pdf'):
  1096. rst = check_pdf(path)
  1097. if "Table" in rst:
  1098. parse_table_from_pdf(path)
  1099. if "Word" in rst:
  1100. read_from_pdf(path)
1101. # .txt file
  1102. elif os.path.isfile(path) and path.endswith('.txt'):
  1103. parse_txt(path)
  1104. return None
  1105. if __name__ == '__main__':
  1106. import platform
  1107. system = platform.system()
  1108. if (system == "Windows"):
  1109. logging.info("Windows")
  1110. elif (system == "Linux"):
  1111. logging.info("Linux")
  1112. else:
1113. logging.error("Unsupported system")
  1114. # try:
  1115. # detection_type(sys.argv[1], system)
  1116. # except Exception as e:
  1117. # logging.error(e)
  1118. detection_type(sys.argv[1], system)
  1119. # detection_type('w1.pdf', system)