# custom.py
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2022-07-11 09:21:24
# @Last Modified by: privacy
# @Last Modified time: 2022-07-18 13:50:34
# Custom résumé template parser (自定义模板)
import json
import re
import sys

import pdfplumber
import requests
from docx import Document
from requests.adapters import HTTPAdapter
  13. path = "d:\\desktop\\自定义.docx"
  14. # path = "d:\\desktop\\自定义.pdf"
  15. class Custom(object):
  16. """docstring for Custom"""
  17. def __init__(self):
  18. super(Custom, self).__init__()
  19. self.keywords = [
  20. "姓名",
  21. "性别",
  22. "出生年月",
  23. "出生日期",
  24. "民族",
  25. "籍贯",
  26. "户籍地",
  27. "健康状况",
  28. "政治面貌(加入时间)",
  29. "政治面貌(加入时间)",
  30. "参加工作时间",
  31. "健康状况",
  32. "外语水平",
  33. "专业技术资格(取得时间)",
  34. "专业技术资格(取得时间)",
  35. "职业技能等级(取得时间)",
  36. "职业技能等级(取得时间)",
  37. "熟悉专业有何专长",
  38. "学历院校",
  39. "初始学历、专业",
  40. "初始学历毕业院校及毕业时间",
  41. "最高学历、专业",
  42. "最高学历毕业院校及毕业时间",
  43. "工作单位",
  44. "现任职务",
  45. "任职时间",
  46. "提职时间",
  47. "联系电话",
  48. "邮箱地址",
  49. "对报名岗位认识及工作设想",
  50. "意向地区",
  51. "意向岗位",
  52. "其他意向岗位",
  53. "意向单位",
  54. "意向专业",
  55. "学习经历",
  56. "起止时间",
  57. "学校","专业","学历","学位","研究方向","是否全日制",
  58. "培训经历",
  59. "培训类型","机构","内容","成绩","证书名称",
  60. "工作经历",
  61. "工作单位","职务","部门","证明人","备注",
  62. "项目经历",
  63. "项目名称","项目职务","项目描述","项目职责","项目成果",
  64. "获得职业资格证书情况",
  65. "获得日期","名称","证书编码/文号","授予单位",
  66. "奖惩情况",
  67. "项目","时间","项目单位","证明材料",
  68. "主要工作业绩(500字以内)",
  69. "主要工作业绩(500字以内)",
  70. "自我评价",
  71. "近三年年度考核结果",
  72. "主要家庭成员及社会关系",
  73. "称谓",
  74. "其他情况说明",
  75. "工作单位及职务",
  76. "政治面貌",
  77. "职业证书",
  78. "资格等级",
  79. "取得日期",
  80. "学校/培训机构",
  81. "专业",
  82. "起始时间",
  83. "毕业时间",
  84. "职业",
  85. "与本人关系",
  86. "计算机水平"
  87. ]
  88. self.json_obj = self.get_translate()
  89. def get_translate(self):
  90. # 转译数据库字段名
  91. with open("./resources/translate.json", "r", encoding="utf-8") as ff:
  92. json_obj = json.load(ff)
  93. return json_obj
  94. # 解析行内元素
  95. def parse_line(self, line):
  96. result = []
  97. key = None
  98. for cell in line:
  99. if cell and ''.join(cell.split()) in self.keywords:
  100. key = ''.join(cell.split())
  101. elif cell and key:
  102. schema = {key:cell}
  103. result.append(schema)
  104. key = None
  105. return result
  106. # 解析word
  107. def parse_word_layout(self, path):
  108. result = []
  109. doc = Document(path)
  110. lo = {}
  111. for _table in doc.tables[:]:
  112. for i, row in enumerate(_table.rows[:]):
  113. row_content = []
  114. for cell in row.cells[:]:
  115. c = cell.text
  116. if c not in row_content:
  117. row_content.append(c)
  118. lo[len(lo.keys())] = row_content
  119. kwln = -1# 关键词行长度
  120. kwline = None# 关键词行
  121. for key in lo.keys():
  122. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  123. if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  124. perc = 0# 行内关键词数量
  125. for c in lo[key]:
  126. if c and (''.join(c.split()) in self.keywords):# 找到此行有关键词
  127. perc += 1
  128. if c and (''.join(c.split()) in self.keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3,判断此行非关键词行元素
  129. perc = 0# 清空行内关键词数
  130. result.extend(self.parse_line(lo[key]))# 添加并解析普通行级元素
  131. break
  132. else:# 关键词行元素
  133. schema = dict()
  134. for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素
  135. if key:
  136. schema[key] = val
  137. result.append(schema)
  138. break
  139. break
  140. else:
  141. # print("{}:此行为关键词行!".format(lo[key]))
  142. try:
  143. kwline = [''.join(cell.split()) for cell in lo[key]]
  144. except Exception as e:
  145. kwline = lo[key]
  146. kwln = len(lo[key])
  147. return result
  148. # 解析pdf
  149. def parse_pdf_layout(self, path):
  150. result = []
  151. lo = {}
  152. with pdfplumber.open(path) as pdf:
  153. for page in pdf.pages:
  154. for table in page.extract_tables():
  155. for line in table:
  156. lo[len(lo.keys())] = line
  157. kwln = -1
  158. kwline = None
  159. for key in lo.keys():
  160. # pdb.set_trace()
  161. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  162. if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  163. # pdb.set_trace()
  164. for c in lo[key] or len(lo[key])!=kwln:
  165. # pdb.set_trace()
  166. if c and ''.join(c.split()) in self.keywords:# 非关键词行元素
  167. result.extend(self.parse_line(lo[key]))
  168. break
  169. else:# 关键词行元素
  170. schema = dict()
  171. for key, val in zip(kwline, lo[key]):
  172. if key:
  173. schema[key] = val if val else key
  174. result.append(schema)
  175. break
  176. break
  177. else:
  178. kwline = []
  179. for cell in lo[key]:
  180. if cell:
  181. kwline.append(''.join(cell.split()))
  182. else:
  183. kwline.append(cell)
  184. kwln = len(lo[key])
  185. return result
  186. # 格式化数据
  187. def formatter(self, datalist):
  188. result = dict()
  189. for d in datalist:
  190. if len(d) == 1:# 普通键值对
  191. for key in d.keys():
  192. result[key] = d[key]
  193. else:# 行级元素
  194. for k in list(d.keys()):
  195. if k == "".join(d[k].split()):# 行名
  196. d.pop(k)
  197. if result.get(k):# 多行元素合并
  198. result[k].append(d)
  199. else:
  200. result[k] = [d]
  201. ### 时间格式化
  202. if result.get("出生年月"):
  203. dates = re.findall(r'\d+' , result["出生年月"])
  204. if len(dates) == 1:
  205. result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))
  206. elif len(dates) == 2:
  207. result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  208. elif len(dates) == 3:
  209. result["出生年月"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  210. if result.get("任职时间"):
  211. dates = re.findall(r'\d+' , result["任职时间"])
  212. if len(dates) == 1:
  213. result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))
  214. elif len(dates) == 2:
  215. result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  216. elif len(dates) == 3:
  217. result["任职时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  218. if result.get("参加工作时间"):
  219. dates = re.findall(r'\d+' , result["参加工作时间"])
  220. if len(dates) == 1:
  221. result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))
  222. elif len(dates) == 2:
  223. result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  224. elif len(dates) == 3:
  225. result["参加工作时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  226. if result.get("最高学历毕业院校及毕业时间"):
  227. dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])
  228. ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])
  229. if len(ws) > 0:
  230. result["最高学历毕业院校"] = ws[0]
  231. if len(dates) == 1:
  232. result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  233. elif len(dates) == 2:
  234. result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  235. elif len(dates) == 3:
  236. result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  237. result.pop("最高学历毕业院校及毕业时间")
  238. if result.get("初始学历毕业院校及毕业时间"):
  239. dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])
  240. ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])
  241. if len(ws) > 0:
  242. result["初始学历毕业院校"] = ws[0]
  243. if len(dates) == 1:
  244. result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  245. elif len(dates) == 2:
  246. result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  247. elif len(dates) == 3:
  248. result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  249. result.pop("初始学历毕业院校及毕业时间")
  250. if result.get("学习经历"):
  251. for idx, edu in enumerate(result["学习经历"]):
  252. if edu.get("起止时间"):
  253. dates = re.findall(r'\d+' , edu["起止时间"])
  254. if len(dates) == 4:
  255. result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  256. if result.get("培训经历"):
  257. for idx, edu in enumerate(result["培训经历"]):
  258. if edu.get("起止时间"):
  259. dates = re.findall(r'\d+' , edu["起止时间"])
  260. if len(dates) == 4:
  261. result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  262. if result.get("工作经历"):
  263. for idx, edu in enumerate(result["工作经历"]):
  264. if edu.get("起止时间"):
  265. dates = re.findall(r'\d+' , edu["起止时间"])
  266. if len(dates) == 4:
  267. result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  268. if result.get("项目经历"):
  269. for idx, edu in enumerate(result["项目经历"]):
  270. if edu.get("起止时间"):
  271. dates = re.findall(r'\d+' , edu["起止时间"])
  272. if len(dates) == 4:
  273. result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  274. if result.get("获得职业资格证书情况"):
  275. for idx, edu in enumerate(result["获得职业资格证书情况"]):
  276. if edu.get("获得日期"):
  277. dates = re.findall(r'\d+' , edu["获得日期"])
  278. if len(dates) == 2:
  279. result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  280. if result.get("奖惩情况"):
  281. for idx, edu in enumerate(result["奖惩情况"]):
  282. if edu.get("时间"):
  283. dates = re.findall(r'\d+' , edu["时间"])
  284. if len(dates) == 2:
  285. result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  286. if result.get("主要家庭成员及社会关系"):
  287. for idx, fam in enumerate(result["主要家庭成员及社会关系"]):
  288. if fam.get("出生年月"):
  289. dates = re.findall(r'\d+' , fam["出生年月"])
  290. if len(dates) == 2:
  291. result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  292. normal = self.json_obj["base"]
  293. edunormal = self.json_obj["tal_his_edu"]
  294. family = self.json_obj["tal_family_social_relation"]
  295. for key in normal.keys():
  296. if result.get(key):
  297. result[normal[key]] = result[key]
  298. result.pop(key)
  299. for idx in range(len(result['学习经历'])):
  300. result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]
  301. result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]
  302. for key in edunormal.keys():
  303. if result['学习经历'][idx].get(key):
  304. result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
  305. result['学习经历'][idx].pop(key)
  306. for idx in range(len(result['主要家庭成员及社会关系'])):
  307. for key in family.keys():
  308. if result['主要家庭成员及社会关系'][idx].get(key):
  309. result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
  310. result['主要家庭成员及社会关系'][idx].pop(key)
  311. tit = {
  312. "基本信息":"base",
  313. "求职意向":"intent_job",
  314. "学习经历":"tal_his_edu",
  315. "工作经历":"tal_his_job",
  316. "项目经历":"tal_his_project",
  317. "培训经历":"tal_training_experience",
  318. "获奖情况":"tal_reward_punishment",
  319. "语言能力":"tal_language",
  320. "证书":"tal_vocational_qualification_certificate",
  321. "专业技能":"tal_professional_tech_certificate",
  322. "主要家庭成员及社会关系":"tal_family_social_relation"
  323. }
  324. for key in tit.keys():
  325. if result.get(key):
  326. result[tit[key]] = result[key]
  327. result.pop(key)
  328. return result
  329. # 推送后端
  330. def push_back(self, result):
  331. url = "http://192.168.1.110:9999/talent/getResumeData"
  332. session = requests.Session()
  333. session.mount('http://', HTTPAdapter(max_retries = 3))
  334. try:
  335. headers = {
  336. 'contentType':'Application/json'
  337. }
  338. response = session.post(url=url, headers=headers, json={"ResumeData": result}, timeout=10)
  339. print(response.text)
  340. except Exception as e:
  341. print(e)
  342. def predict(self, path):
  343. if path.endswith(".docx"):
  344. result = self.formatter(self.parse_word_layout(path))
  345. self.push_back(result)
  346. print(self.formatter(self.parse_word_layout(path)))
  347. elif path.endswith(".pdf"):
  348. result = self.formatter(self.parse_pdf_layout(path))
  349. self.push_back(result)
  350. print(self.formatter(self.parse_pdf_layout(path)))
  351. if __name__ == '__main__':
  352. c = Custom()
  353. c.predict(path)