# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2022-07-07 13:12:17
# @Last Modified by: privacy
# @Last Modified time: 2022-07-18 14:57:29
# 内部人才市场简历模板 (internal talent-market résumé template)
  7. import re
  8. import json
  9. import requests
  10. from requests.adapters import HTTPAdapter
  11. import pdfplumber
  12. from docx import Document
# Alternative sample input (Word version of the same template), kept for manual runs:
# path = "d:\\desktop\\内部人才市场简历模板.docx"
# Sample résumé file that is handed to Inner.predict() when this module runs as a script.
path = "d:\\desktop\\内部人才市场简历模板.pdf"
  15. class Inner(object):
  16. """docstring for Inner"""
  17. def __init__(self):
  18. super(Inner, self).__init__()
  19. self.keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格(取得时间)", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]
  20. self.json_obj = self.get_translate()
  21. def get_translate(self):
  22. # 转译数据库字段名
  23. with open("./resources/translate.json", "r", encoding="utf-8") as ff:
  24. json_obj = json.load(ff)
  25. return json_obj
  26. def parse_line(self, line):
  27. result = []
  28. key = None
  29. for cell in line:
  30. if cell and ''.join(cell.split()) in self.keywords:
  31. key = ''.join(cell.split())
  32. elif cell and key:
  33. schema = {key:cell}
  34. result.append(schema)
  35. key = None
  36. return result
  37. # 解析word
  38. def parse_word_layout(self, path):
  39. result = []
  40. doc = Document(path)
  41. lo = {}
  42. tables = doc.tables
  43. for _table in tables[:]:
  44. for i, row in enumerate(_table.rows[:]):
  45. row_content = []
  46. for cell in row.cells[:]:
  47. c = cell.text
  48. row_content.append(c)
  49. lo[len(lo.keys())] = row_content
  50. kwln = -1
  51. kwline = None
  52. for key in lo.keys():
  53. # pdb.set_trace()
  54. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  55. if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  56. # pdb.set_trace()
  57. for c in lo[key]:
  58. # pdb.set_trace()
  59. if c and ''.join(c.split()) in self.keywords:# 非关键词行元素
  60. result.extend(self.parse_line(lo[key]))
  61. break
  62. else:# 关键词行元素
  63. schema = dict()
  64. for key, val in zip(kwline, lo[key]):
  65. if key:
  66. schema[key] = val
  67. if "学校/培训机构" in schema.keys():
  68. schema["学习经历"] = "学习经历"
  69. elif "与本人关系" in schema.keys():
  70. schema["家庭成员"] = "家庭成员"
  71. elif "意向地区" in schema.keys():
  72. schema["职业发展管理"] = "职业发展管理"
  73. elif "职业证书" in schema.keys():
  74. schema["职业资格证书"] = "职业资格证书"
  75. result.append(schema)
  76. break
  77. break
  78. else:
  79. # print("此行为关键词行")
  80. kwline = [''.join(cell.split()) for cell in lo[key]]
  81. kwln = len(lo[key])
  82. job = {"工作经历":"工作经历"}
  83. flag = None
  84. for p in doc.paragraphs:
  85. text = p.text.replace(":", ":")
  86. if ":" in text:
  87. text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)
  88. for line in text.split("\n"):
  89. if line.strip():
  90. i = line.split(":")
  91. if job.get(i[0].strip()):
  92. result.append(job)
  93. job = {"工作经历":"工作经历"}
  94. job[i[0].strip()] = i[1].strip()
  95. flag = i[0].strip()
  96. elif flag == "工作描述":
  97. job["工作描述"] += '\n' + text.strip()
  98. else:
  99. result.append(job)
  100. return result
  101. # 解析pdf
  102. def parse_pdf_layout(self, path):
  103. result = []
  104. lo = {}
  105. with pdfplumber.open(path) as pdf:
  106. for page in pdf.pages:
  107. for table in page.extract_tables():
  108. for line in table:
  109. # lo[len(lo.keys())] = [cell for cell in line if cell]
  110. lo[len(lo.keys())] = line
  111. kwln = -1
  112. kwline = None
  113. for key in lo.keys():
  114. # pdb.set_trace()
  115. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  116. if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  117. # pdb.set_trace()
  118. for c in lo[key]:
  119. # pdb.set_trace()
  120. if c and ''.join(c.split()) in self.keywords:# 非关键词行元素
  121. result.extend(self.parse_line(lo[key]))
  122. break
  123. if c == "对报名岗位\n认 识及工作":
  124. print(''.join(c.split()))
  125. break
  126. else:# 关键词行元素
  127. schema = dict()
  128. for key, val in zip(kwline, lo[key]):
  129. if key:
  130. schema[key] = val
  131. if "学校/培训机构" in schema.keys():
  132. schema["学习经历"] = "学习经历"
  133. elif "与本人关系" in schema.keys():
  134. schema["家庭成员"] = "家庭成员"
  135. elif "意向地区" in schema.keys():
  136. schema["职业发展管理"] = "职业发展管理"
  137. elif "职业证书" in schema.keys():
  138. schema["职业资格证书"] = "职业资格证书"
  139. result.append(schema)
  140. break
  141. break
  142. else:
  143. # print("此行为关键词行")
  144. kwline = [''.join(cell.split()) for cell in lo[key]]
  145. kwln = len(lo[key])
  146. job = {"工作经历":"工作经历"}
  147. flag = None
  148. with pdfplumber.open(path) as pdf:
  149. for page in pdf.pages:
  150. for predict in page.extract_words():
  151. # print(predict['text'])
  152. text = predict['text'].replace(":", ":")
  153. if ":" in text:
  154. text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)
  155. for line in text.split("\n"):
  156. if line.strip():
  157. i = line.split(":")
  158. if job.get(i[0].strip()):
  159. result.append(job)
  160. job = {"工作经历":"工作经历"}
  161. job[i[0].strip()] = i[1].strip()
  162. flag = i[0].strip()
  163. elif flag == "工作描述":
  164. job["工作描述"] += '\n' + text.strip()
  165. else:
  166. result.append(job)
  167. return result
  168. # 格式化数据
  169. def formatter(self, datalist):
  170. result = dict()
  171. for d in datalist:
  172. if len(d) == 1:
  173. for key in d.keys():
  174. result[key] = d[key]
  175. else:
  176. for k in list(d.keys()):
  177. if k == "".join(d[k].split()):
  178. d.pop(k)
  179. if result.get(k):
  180. result[k].append(d)
  181. else:
  182. result[k] = [d]
  183. if result.get("外语水平"):
  184. data = re.findall(r'(\w+[语话])', result["外语水平"])
  185. if dates:
  186. result["外语水平"] = data
  187. if result.get("专业技术资格(取得时间)"):
  188. dates = re.findall(r'\d+', result["专业技术资格(取得时间)"])
  189. for i in dates:
  190. result["专业技术资格(取得时间)"] = result["专业技术资格(取得时间)"].replace(i, "")
  191. names = re.findall(r'\w+', result["专业技术资格(取得时间)"])
  192. if len(dates) == 1:
  193. result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-01-01".format(int(dates[0])),"专业技术资格":names}]
  194. elif len(dates) == 2:
  195. result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1])),"专业技术资格":names}]
  196. elif len(dates) == 3:
  197. result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2])),"专业技术资格":names}]
  198. ### 时间格式化
  199. if result.get("出生年月"):
  200. dates = re.findall(r'\d+' , result["出生年月"])
  201. if len(dates) == 1:
  202. result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))
  203. elif len(dates) == 2:
  204. result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  205. elif len(dates) == 3:
  206. result["出生年月"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  207. if result.get("任职时间"):
  208. dates = re.findall(r'\d+' , result["任职时间"])
  209. if len(dates) == 1:
  210. result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))
  211. elif len(dates) == 2:
  212. result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  213. elif len(dates) == 3:
  214. result["任职时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  215. if result.get("参加工作时间"):
  216. dates = re.findall(r'\d+' , result["参加工作时间"])
  217. if len(dates) == 1:
  218. result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))
  219. elif len(dates) == 2:
  220. result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  221. elif len(dates) == 3:
  222. result["参加工作时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  223. if result.get("最高学历毕业院校及毕业时间"):
  224. dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])
  225. ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])
  226. if len(ws) > 0:
  227. result["最高学历毕业院校"] = ws[0]
  228. if len(dates) == 1:
  229. result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  230. elif len(dates) == 2:
  231. result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  232. elif len(dates) == 3:
  233. result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  234. result.pop("最高学历毕业院校及毕业时间")
  235. if result.get("初始学历毕业院校及毕业时间"):
  236. dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])
  237. ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])
  238. if len(ws) > 0:
  239. result["初始学历毕业院校"] = ws[0]
  240. if len(dates) == 1:
  241. result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  242. elif len(dates) == 2:
  243. result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  244. elif len(dates) == 3:
  245. result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  246. result.pop("初始学历毕业院校及毕业时间")
  247. if result.get("学习经历"):
  248. for idx, edu in enumerate(result["学习经历"]):
  249. if edu.get("起止时间"):
  250. dates = re.findall(r'\d+' , edu["起止时间"])
  251. if len(dates) == 4:
  252. result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  253. if result.get("培训经历"):
  254. for idx, edu in enumerate(result["培训经历"]):
  255. if edu.get("起止时间"):
  256. dates = re.findall(r'\d+' , edu["起止时间"])
  257. if len(dates) == 4:
  258. result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  259. if result.get("工作经历"):
  260. for idx, edu in enumerate(result["工作经历"]):
  261. if edu.get("起止时间"):
  262. dates = re.findall(r'\d+' , edu["起止时间"])
  263. if len(dates) == 4:
  264. result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  265. if result.get("项目经历"):
  266. for idx, edu in enumerate(result["项目经历"]):
  267. if edu.get("起止时间"):
  268. dates = re.findall(r'\d+' , edu["起止时间"])
  269. if len(dates) == 4:
  270. result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  271. if result.get("获得职业资格证书情况"):
  272. for idx, edu in enumerate(result["获得职业资格证书情况"]):
  273. if edu.get("获得日期"):
  274. dates = re.findall(r'\d+' , edu["获得日期"])
  275. if len(dates) == 2:
  276. result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  277. if result.get("奖惩情况"):
  278. for idx, edu in enumerate(result["奖惩情况"]):
  279. if edu.get("时间"):
  280. dates = re.findall(r'\d+' , edu["时间"])
  281. if len(dates) == 2:
  282. result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  283. if result.get("主要家庭成员及社会关系"):
  284. for idx, fam in enumerate(result["主要家庭成员及社会关系"]):
  285. if fam.get("出生年月"):
  286. dates = re.findall(r'\d+' , fam["出生年月"])
  287. if len(dates) == 2:
  288. result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  289. normal = self.json_obj["base"]
  290. itenormal = self.json_obj["base"]
  291. edunormal = self.json_obj["tal_training_experience"]
  292. jobnormal = self.json_obj["tal_his_job"]
  293. cetnormal = self.json_obj["tal_vocational_qualification_certificate"]
  294. family = self.json_obj["tal_family_social_relation"]
  295. for key in normal.keys():
  296. if result.get(key):
  297. result[normal[key]] = result[key]
  298. result.pop(key)
  299. for idx in range(len(result['职业发展管理'])):
  300. for key in itenormal.keys():
  301. if result['职业发展管理'][idx].get(key):
  302. result['职业发展管理'][idx][itenormal[key]] = result['职业发展管理'][idx][key]
  303. result['职业发展管理'][idx].pop(key)
  304. for idx in range(len(result['学习经历'])):
  305. for key in edunormal.keys():
  306. if result['学习经历'][idx].get(key):
  307. result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
  308. result['学习经历'][idx].pop(key)
  309. for idx in range(len(result['工作经历'])):
  310. for key in jobnormal.keys():
  311. if result['工作经历'][idx].get(key):
  312. result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
  313. result['工作经历'][idx].pop(key)
  314. for idx in range(len(result['职业资格证书'])):
  315. for key in cetnormal.keys():
  316. if result['职业资格证书'][idx].get(key):
  317. result['职业资格证书'][idx][cetnormal[key]] = result['职业资格证书'][idx][key]
  318. result['职业资格证书'][idx].pop(key)
  319. for idx in range(len(result['家庭成员'])):
  320. for key in family.keys():
  321. if result['家庭成员'][idx].get(key):
  322. result['家庭成员'][idx][family[key]] = result['家庭成员'][idx][key]
  323. result['家庭成员'][idx].pop(key)
  324. tit = {
  325. "基本信息":"base",
  326. "职业发展管理":"intent_job",
  327. "学习经历":"tal_training_experience",
  328. "工作经历":"tal_his_job",
  329. "项目经历":"tal_his_project",
  330. "培训经历":"tal_training_experience",
  331. "获奖情况":"tal_reward_punishment",
  332. "语言能力":"tal_language",
  333. "职业资格证书":"tal_vocational_qualification_certificate",
  334. "专业技能":"tal_professional_tech_certificate",
  335. "家庭成员":"tal_family_social_relation"
  336. }
  337. for key in tit.keys():
  338. if result.get(key):
  339. result[tit[key]] = result[key]
  340. result.pop(key)
  341. return result
  342. # 推送后端
  343. def push_back(self, result):
  344. url = "http://192.168.1.110:9999/talent/getResumeData"
  345. session = requests.Session()
  346. session.mount('http://', HTTPAdapter(max_retries = 3))
  347. try:
  348. headers = {
  349. 'contentType':'Application/json'
  350. }
  351. response = session.post(url=url, headers=headers, json={"ResumeData": result}, timeout=10)
  352. print(response.text)
  353. except Exception as e:
  354. print(e)
  355. def predict(self, path):
  356. if path.endswith(".docx"):
  357. result = self.formatter(self.parse_word_layout(path))
  358. self.push_back(result)
  359. print(self.formatter(self.parse_word_layout(path)))
  360. elif path.endswith(".pdf"):
  361. result = self.formatter(self.parse_pdf_layout(path))
  362. self.push_back(result)
  363. print(self.formatter(self.parse_pdf_layout(path)))
  364. if __name__ == "__main__":
  365. i = Inner()
  366. i.predict(path)