custom.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2022-07-11 09:21:24
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2022-07-18 14:54:53
  6. # 自定义模板
  7. import re
  8. import json
  9. import requests
  10. from requests.adapters import HTTPAdapter
  11. import pdfplumber
  12. from docx import Document
# Default input document handed to Custom.predict when this file is run as a
# script (see the __main__ guard at the bottom of the file).
path = "d:\\desktop\\自定义.docx"
# Alternative PDF input, kept so the two formats can be switched by hand:
# path = "d:\\desktop\\自定义.pdf"
  15. class Custom(object):
  16. """docstring for Custom"""
  17. def __init__(self):
  18. super(Custom, self).__init__()
  19. self.keywords = [
  20. "姓名",
  21. "性别",
  22. "出生年月",
  23. "出生日期",
  24. "民族",
  25. "籍贯",
  26. "户籍地",
  27. "健康状况",
  28. "政治面貌(加入时间)",
  29. "政治面貌(加入时间)",
  30. "参加工作时间",
  31. "健康状况",
  32. "外语水平",
  33. "专业技术资格(取得时间)",
  34. "专业技术资格(取得时间)",
  35. "职业技能等级(取得时间)",
  36. "职业技能等级(取得时间)",
  37. "熟悉专业有何专长",
  38. "学历院校",
  39. "初始学历、专业",
  40. "初始学历毕业院校及毕业时间",
  41. "最高学历、专业",
  42. "最高学历毕业院校及毕业时间",
  43. "工作单位",
  44. "现任职务",
  45. "任职时间",
  46. "提职时间",
  47. "联系电话",
  48. "邮箱地址",
  49. "对报名岗位认识及工作设想",
  50. "意向地区",
  51. "意向岗位",
  52. "其他意向岗位",
  53. "意向单位",
  54. "意向专业",
  55. "学习经历",
  56. "起止时间",
  57. "学校","专业","学历","学位","研究方向","是否全日制",
  58. "培训经历",
  59. "培训类型","机构","内容","成绩","证书名称",
  60. "工作经历",
  61. "工作单位","职务","部门","证明人","备注",
  62. "项目经历",
  63. "项目名称","项目职务","项目描述","项目职责","项目成果",
  64. "获得职业资格证书情况",
  65. "获得日期","名称","证书编码/文号","授予单位",
  66. "奖惩情况",
  67. "项目","时间","项目单位","证明材料",
  68. "主要工作业绩(500字以内)",
  69. "主要工作业绩(500字以内)",
  70. "自我评价",
  71. "近三年年度考核结果",
  72. "主要家庭成员及社会关系",
  73. "称谓",
  74. "其他情况说明",
  75. "工作单位及职务",
  76. "政治面貌",
  77. "职业证书",
  78. "资格等级",
  79. "取得日期",
  80. "学校/培训机构",
  81. "专业",
  82. "起始时间",
  83. "毕业时间",
  84. "职业",
  85. "与本人关系",
  86. "计算机水平"
  87. ]
  88. self.json_obj = self.get_translate()
  89. def get_translate(self):
  90. # 转译数据库字段名
  91. with open("./resources/translate.json", "r", encoding="utf-8") as ff:
  92. json_obj = json.load(ff)
  93. return json_obj
  94. # 解析行内元素
  95. def parse_line(self, line):
  96. result = []
  97. key = None
  98. for cell in line:
  99. if cell and ''.join(cell.split()) in self.keywords:
  100. key = ''.join(cell.split())
  101. elif cell and key:
  102. schema = {key:cell}
  103. result.append(schema)
  104. key = None
  105. return result
  106. # 解析word
  107. def parse_word_layout(self, path):
  108. result = []
  109. doc = Document(path)
  110. lo = {}
  111. for _table in doc.tables[:]:
  112. for i, row in enumerate(_table.rows[:]):
  113. row_content = []
  114. for cell in row.cells[:]:
  115. c = cell.text
  116. if c not in row_content:
  117. row_content.append(c)
  118. lo[len(lo.keys())] = row_content
  119. kwln = -1# 关键词行长度
  120. kwline = None# 关键词行
  121. for key in lo.keys():
  122. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  123. if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  124. perc = 0# 行内关键词数量
  125. for c in lo[key]:
  126. if c and (''.join(c.split()) in self.keywords):# 找到此行有关键词
  127. perc += 1
  128. if c and (''.join(c.split()) in self.keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3,判断此行非关键词行元素
  129. perc = 0# 清空行内关键词数
  130. result.extend(self.parse_line(lo[key]))# 添加并解析普通行级元素
  131. break
  132. else:# 关键词行元素
  133. schema = dict()
  134. for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素
  135. if key:
  136. schema[key] = val
  137. result.append(schema)
  138. break
  139. break
  140. else:
  141. # print("{}:此行为关键词行!".format(lo[key]))
  142. try:
  143. kwline = [''.join(cell.split()) for cell in lo[key]]
  144. except Exception as e:
  145. kwline = lo[key]
  146. kwln = len(lo[key])
  147. return result
  148. # 解析pdf
  149. def parse_pdf_layout(self, path):
  150. result = []
  151. lo = {}
  152. with pdfplumber.open(path) as pdf:
  153. for page in pdf.pages:
  154. for table in page.extract_tables():
  155. for line in table:
  156. lo[len(lo.keys())] = line
  157. kwln = -1
  158. kwline = None
  159. for key in lo.keys():
  160. # pdb.set_trace()
  161. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  162. if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  163. # pdb.set_trace()
  164. for c in lo[key] or len(lo[key])!=kwln:
  165. # pdb.set_trace()
  166. if c and ''.join(c.split()) in self.keywords:# 非关键词行元素
  167. result.extend(self.parse_line(lo[key]))
  168. break
  169. else:# 关键词行元素
  170. schema = dict()
  171. for key, val in zip(kwline, lo[key]):
  172. if key:
  173. schema[key] = val if val else key
  174. result.append(schema)
  175. break
  176. break
  177. else:
  178. kwline = []
  179. for cell in lo[key]:
  180. if cell:
  181. kwline.append(''.join(cell.split()))
  182. else:
  183. kwline.append(cell)
  184. kwln = len(lo[key])
  185. return result
  186. # 格式化数据
  187. def formatter(self, datalist):
  188. result = dict()
  189. for d in datalist:
  190. if len(d) == 1:# 普通键值对
  191. for key in d.keys():
  192. result[key] = d[key]
  193. else:# 行级元素
  194. for k in list(d.keys()):
  195. if k == "".join(d[k].split()):# 行名
  196. d.pop(k)
  197. if result.get(k):# 多行元素合并
  198. result[k].append(d)
  199. else:
  200. result[k] = [d]
  201. if result.get("外语水平"):
  202. data = re.findall(r'(\w+[语话])', result["外语水平"])
  203. if data:
  204. result["外语水平"] = data
  205. if result.get("专业技术资格(取得时间)"):
  206. dates = re.findall(r'\d+', result["专业技术资格(取得时间)"])
  207. for i in dates:
  208. result["专业技术资格(取得时间)"] = result["专业技术资格(取得时间)"].replace(i, "")
  209. names = re.findall(r'\w+', result["专业技术资格(取得时间)"])
  210. if len(dates) == 1:
  211. result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-01-01".format(int(dates[0])),"专业技术资格":names}]
  212. elif len(dates) == 2:
  213. result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1])),"专业技术资格":names}]
  214. elif len(dates) == 3:
  215. result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2])),"专业技术资格":names}]
  216. if result.get("职业技能等级(取得时间)"):
  217. dates = re.findall(r'\d+', result["职业技能等级(取得时间)"])
  218. for i in dates:
  219. result["职业技能等级(取得时间)"] = result["职业技能等级(取得时间)"].replace(i, "")
  220. names = re.findall(r'\w+', result["职业技能等级(取得时间)"])
  221. if len(dates) == 1:
  222. result["职业技能等级(取得时间)"] = [{"时间": "{:4d}-01-01".format(int(dates[0])),"职业技能等级":names}]
  223. elif len(dates) == 2:
  224. result["职业技能等级(取得时间)"] = [{"时间": "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1])),"职业技能等级":names}]
  225. elif len(dates) == 3:
  226. result["职业技能等级(取得时间)"] = [{"时间": "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2])),"职业技能等级":names}]
  227. ### 时间格式化
  228. if result.get("出生年月"):
  229. dates = re.findall(r'\d+' , result["出生年月"])
  230. if len(dates) == 1:
  231. result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))
  232. elif len(dates) == 2:
  233. result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  234. elif len(dates) == 3:
  235. result["出生年月"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  236. if result.get("任职时间"):
  237. dates = re.findall(r'\d+' , result["任职时间"])
  238. if len(dates) == 1:
  239. result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))
  240. elif len(dates) == 2:
  241. result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  242. elif len(dates) == 3:
  243. result["任职时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  244. if result.get("参加工作时间"):
  245. dates = re.findall(r'\d+' , result["参加工作时间"])
  246. if len(dates) == 1:
  247. result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))
  248. elif len(dates) == 2:
  249. result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  250. elif len(dates) == 3:
  251. result["参加工作时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  252. if result.get("最高学历毕业院校及毕业时间"):
  253. dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])
  254. ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])
  255. if len(ws) > 0:
  256. result["最高学历毕业院校"] = ws[0]
  257. if len(dates) == 1:
  258. result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  259. elif len(dates) == 2:
  260. result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  261. elif len(dates) == 3:
  262. result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  263. result.pop("最高学历毕业院校及毕业时间")
  264. if result.get("初始学历毕业院校及毕业时间"):
  265. dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])
  266. ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])
  267. if len(ws) > 0:
  268. result["初始学历毕业院校"] = ws[0]
  269. if len(dates) == 1:
  270. result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  271. elif len(dates) == 2:
  272. result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  273. elif len(dates) == 3:
  274. result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  275. result.pop("初始学历毕业院校及毕业时间")
  276. if result.get("学习经历"):
  277. for idx, edu in enumerate(result["学习经历"]):
  278. if edu.get("起止时间"):
  279. dates = re.findall(r'\d+' , edu["起止时间"])
  280. if len(dates) == 4:
  281. result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  282. if result.get("培训经历"):
  283. for idx, edu in enumerate(result["培训经历"]):
  284. if edu.get("起止时间"):
  285. dates = re.findall(r'\d+' , edu["起止时间"])
  286. if len(dates) == 4:
  287. result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  288. if result.get("工作经历"):
  289. for idx, edu in enumerate(result["工作经历"]):
  290. if edu.get("起止时间"):
  291. dates = re.findall(r'\d+' , edu["起止时间"])
  292. if len(dates) == 4:
  293. result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  294. if result.get("项目经历"):
  295. for idx, edu in enumerate(result["项目经历"]):
  296. if edu.get("起止时间"):
  297. dates = re.findall(r'\d+' , edu["起止时间"])
  298. if len(dates) == 4:
  299. result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  300. if result.get("获得职业资格证书情况"):
  301. for idx, edu in enumerate(result["获得职业资格证书情况"]):
  302. if edu.get("获得日期"):
  303. dates = re.findall(r'\d+' , edu["获得日期"])
  304. if len(dates) == 2:
  305. result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  306. if result.get("奖惩情况"):
  307. for idx, edu in enumerate(result["奖惩情况"]):
  308. if edu.get("时间"):
  309. dates = re.findall(r'\d+' , edu["时间"])
  310. if len(dates) == 2:
  311. result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  312. if result.get("主要家庭成员及社会关系"):
  313. for idx, fam in enumerate(result["主要家庭成员及社会关系"]):
  314. if fam.get("出生年月"):
  315. dates = re.findall(r'\d+' , fam["出生年月"])
  316. if len(dates) == 2:
  317. result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  318. normal = self.json_obj["base"]
  319. edunormal = self.json_obj["tal_his_edu"]
  320. family = self.json_obj["tal_family_social_relation"]
  321. for key in normal.keys():
  322. if result.get(key):
  323. result[normal[key]] = result[key]
  324. result.pop(key)
  325. for idx in range(len(result['学习经历'])):
  326. result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]
  327. result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]
  328. for key in edunormal.keys():
  329. if result['学习经历'][idx].get(key):
  330. result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
  331. result['学习经历'][idx].pop(key)
  332. for idx in range(len(result['主要家庭成员及社会关系'])):
  333. for key in family.keys():
  334. if result['主要家庭成员及社会关系'][idx].get(key):
  335. result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
  336. result['主要家庭成员及社会关系'][idx].pop(key)
  337. tit = {
  338. "基本信息":"base",
  339. "求职意向":"intent_job",
  340. "学习经历":"tal_his_edu",
  341. "工作经历":"tal_his_job",
  342. "项目经历":"tal_his_project",
  343. "培训经历":"tal_training_experience",
  344. "获奖情况":"tal_reward_punishment",
  345. "语言能力":"tal_language",
  346. "证书":"tal_vocational_qualification_certificate",
  347. "专业技能":"tal_professional_tech_certificate",
  348. "主要家庭成员及社会关系":"tal_family_social_relation"
  349. }
  350. for key in tit.keys():
  351. if result.get(key):
  352. result[tit[key]] = result[key]
  353. result.pop(key)
  354. return result
  355. # 推送后端
  356. def push_back(self, result):
  357. url = "http://192.168.1.110:9999/talent/getResumeData"
  358. session = requests.Session()
  359. session.mount('http://', HTTPAdapter(max_retries = 3))
  360. try:
  361. headers = {
  362. 'contentType':'Application/json'
  363. }
  364. response = session.post(url=url, headers=headers, json={"ResumeData": result}, timeout=10)
  365. print(response.text)
  366. except Exception as e:
  367. print(e)
  368. def predict(self, path):
  369. if path.endswith(".docx"):
  370. result = self.formatter(self.parse_word_layout(path))
  371. self.push_back(result)
  372. print(self.formatter(self.parse_word_layout(path)))
  373. elif path.endswith(".pdf"):
  374. result = self.formatter(self.parse_pdf_layout(path))
  375. self.push_back(result)
  376. print(self.formatter(self.parse_pdf_layout(path)))
  377. if __name__ == '__main__':
  378. c = Custom()
  379. c.predict(path)