# custom.py (12 KB)


# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2022-07-11 09:21:24
# @Last Modified by: privacy
# @Last Modified time: 2022-07-13 15:31:50
# Custom template (自定义模板)
import logging
import re
from pprint import pprint

import requests
from docx import Document
from docx.shared import Inches
from requests.adapters import HTTPAdapter
  14. path = "d:\\desktop\\自定义.docx"
  15. # path = "d:\\desktop\\内部人才市场简历模板.docx"
  16. keywords = [
  17. "姓名",
  18. "性别",
  19. "出生年月",
  20. "出生日期",
  21. "民族",
  22. "籍贯",
  23. "户籍地",
  24. "健康状况",
  25. "政治面貌(加入时间)",
  26. "政治面貌(加入时间)",
  27. "参加工作时间",
  28. "健康状况",
  29. "外语水平",
  30. "专业技术资格(取得时间)",
  31. "专业技术资格(取得时间)",
  32. "职业技能等级(取得时间)",
  33. "职业技能等级(取得时间)",
  34. "熟悉专业有何专长",
  35. "学历院校",
  36. "初始学历、专业",
  37. "初始学历毕业院校及毕业时间",
  38. "最高学历、专业",
  39. "最高学历毕业院校及毕业时间",
  40. "工作单位",
  41. "现任职务",
  42. "任职时间",
  43. "提职时间",
  44. "联系电话",
  45. "邮箱地址",
  46. "对报名岗位认识及工作设想",
  47. "意向地区",
  48. "意向岗位",
  49. "其他意向岗位",
  50. "意向单位",
  51. "意向专业",
  52. "学习经历",
  53. "起止时间",
  54. "学校","专业","学历","学位","研究方向","是否全日制",
  55. "培训经历",
  56. "培训类型","机构","内容","成绩","证书名称",
  57. "工作经历",
  58. "工作单位","职务","部门","证明人","备注",
  59. "项目经历",
  60. "项目名称","项目职务","项目描述","项目职责","项目成果",
  61. "获得职业资格证书情况",
  62. "获得日期","名称","证书编码/文号","授予单位",
  63. "奖惩情况",
  64. "项目","时间","项目单位","证明材料",
  65. "主要工作业绩(500字以内)",
  66. "主要工作业绩(500字以内)",
  67. "自我评价",
  68. "近三年年度考核结果",
  69. "主要家庭成员及社会关系",
  70. "称谓",
  71. "其他情况说明",
  72. "工作单位及职务",
  73. "政治面貌",
  74. "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系", "计算机水平"
  75. ]
  76. def parse_line(line):
  77. result = []
  78. key = None
  79. for cell in line:
  80. if cell and ''.join(cell.split()) in keywords:
  81. key = ''.join(cell.split())
  82. elif cell and key:
  83. schema = {key:cell}
  84. result.append(schema)
  85. key = None
  86. return result
  87. def parse_layout(path):
  88. result = []
  89. doc = Document(path)
  90. lo = {}
  91. tables = doc.tables
  92. for _table in tables[:]:
  93. for i, row in enumerate(_table.rows[:]):
  94. row_content = []
  95. for cell in row.cells[:]:
  96. c = cell.text
  97. # row_content.append(c)
  98. if c not in row_content:
  99. row_content.append(c)
  100. lo[len(lo.keys())] = row_content
  101. kwln = -1
  102. kwline = None
  103. for key in lo.keys():
  104. # pdb.set_trace()
  105. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  106. if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  107. # pdb.set_trace()
  108. perc = 0
  109. for c in lo[key]:
  110. # pdb.set_trace()
  111. if c and (''.join(c.split()) in keywords):
  112. perc += 1
  113. if c and (''.join(c.split()) in keywords) and (perc > len(lo[key])/3):# 非关键词行元素
  114. # print(c)
  115. # print(perc)
  116. # print(lo[key])
  117. perc = 0
  118. result.extend(parse_line(lo[key]))
  119. break
  120. else:# 关键词行元素
  121. schema = dict()
  122. for key, val in zip(kwline, lo[key]):
  123. if key:
  124. schema[key] = val
  125. result.append(schema)
  126. break
  127. break
  128. else:
  129. # print("{}\t\t此行为关键词行".format(lo[key]))
  130. try:
  131. kwline = [''.join(cell.split()) for cell in lo[key]]
  132. except Exception as e:
  133. kwline = lo[key]
  134. kwln = len(lo[key])
  135. return result
  136. # 格式化数据
  137. def formatter(datalist):
  138. result = dict()
  139. for d in datalist:
  140. if len(d) == 1:
  141. for key in d.keys():
  142. result[key] = d[key]
  143. else:
  144. for k in list(d.keys()):
  145. if k == "".join(d[k].split()):
  146. d.pop(k)
  147. if result.get(k):
  148. result[k].append(d)
  149. else:
  150. result[k] = [d]
  151. if result.get("出生年月"):
  152. dates = re.findall(r'\d+' , result["出生年月"])
  153. if len(dates) == 1:
  154. result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))
  155. elif len(dates) == 2:
  156. result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  157. elif len(dates) == 3:
  158. result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  159. if result.get("任职时间"):
  160. dates = re.findall(r'\d+' , result["任职时间"])
  161. if len(dates) == 1:
  162. result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))
  163. elif len(dates) == 2:
  164. result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  165. elif len(dates) == 3:
  166. result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  167. if result.get("参加工作时间"):
  168. dates = re.findall(r'\d+' , result["参加工作时间"])
  169. if len(dates) == 1:
  170. result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))
  171. elif len(dates) == 2:
  172. result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  173. elif len(dates) == 3:
  174. result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  175. if result.get("最高学历毕业院校及毕业时间"):
  176. dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])
  177. ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])
  178. if len(ws) > 0:
  179. result["最高学历毕业院校"] = ws[0]
  180. if len(dates) == 1:
  181. result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  182. elif len(dates) == 2:
  183. result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  184. elif len(dates) == 3:
  185. result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  186. result.pop("最高学历毕业院校及毕业时间")
  187. if result.get("初始学历毕业院校及毕业时间"):
  188. dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])
  189. ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])
  190. if len(ws) > 0:
  191. result["初始学历毕业院校"] = ws[0]
  192. if len(dates) == 1:
  193. result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  194. elif len(dates) == 2:
  195. result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  196. elif len(dates) == 3:
  197. result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  198. result.pop("初始学历毕业院校及毕业时间")
  199. if result.get("学习经历"):
  200. for idx, edu in enumerate(result["学习经历"]):
  201. if edu.get("起止时间"):
  202. dates = re.findall(r'\d+' , edu["起止时间"])
  203. if len(dates) == 4:
  204. result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  205. if result.get("培训经历"):
  206. for idx, edu in enumerate(result["培训经历"]):
  207. if edu.get("起止时间"):
  208. dates = re.findall(r'\d+' , edu["起止时间"])
  209. if len(dates) == 4:
  210. result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  211. if result.get("工作经历"):
  212. for idx, edu in enumerate(result["工作经历"]):
  213. if edu.get("起止时间"):
  214. dates = re.findall(r'\d+' , edu["起止时间"])
  215. if len(dates) == 4:
  216. result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  217. if result.get("项目经历"):
  218. for idx, edu in enumerate(result["项目经历"]):
  219. if edu.get("起止时间"):
  220. dates = re.findall(r'\d+' , edu["起止时间"])
  221. if len(dates) == 4:
  222. result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  223. if result.get("获得职业资格证书情况"):
  224. for idx, edu in enumerate(result["获得职业资格证书情况"]):
  225. if edu.get("获得日期"):
  226. dates = re.findall(r'\d+' , edu["获得日期"])
  227. if len(dates) == 2:
  228. result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  229. if result.get("奖惩情况"):
  230. for idx, edu in enumerate(result["奖惩情况"]):
  231. if edu.get("时间"):
  232. dates = re.findall(r'\d+' , edu["时间"])
  233. if len(dates) == 2:
  234. result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  235. if result.get("主要家庭成员及社会关系"):
  236. for idx, fam in enumerate(result["主要家庭成员及社会关系"]):
  237. if fam.get("出生年月"):
  238. dates = re.findall(r'\d+' , fam["出生年月"])
  239. if len(dates) == 2:
  240. result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  241. normal = {
  242. "姓名":"name",
  243. "性别":"gender",
  244. "邮箱地址":"email",
  245. "政治面貌(加入时间)":"politics",
  246. "联系电话":"mobile",
  247. "籍贯":"birthplace",
  248. "出生年月":"birth_time",
  249. "现任职务":"current_job",
  250. "所在城市":"living_city",
  251. "参加工作时间":"work_begin_time",
  252. "意向岗位":"intent_job",
  253. "熟悉专业有何专长":"skills",
  254. }
  255. edunormal = {
  256. "学校":"school_name",
  257. "专业":"major",
  258. "学历":"degree",
  259. "是否全日制":"degree_type",
  260. }
  261. for key in normal.keys():
  262. if result.get(key):
  263. result[normal[key]] = result[key]
  264. result.pop(key)
  265. for idx in range(len(result['学习经历'])):
  266. result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]
  267. result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]
  268. for key in edunormal.keys():
  269. if result['学习经历'][idx].get(key):
  270. result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
  271. result['学习经历'][idx].pop(key)
  272. url = "http://192.168.1.110:9999/talent/getResumeData"
  273. session = requests.Session()
  274. session.mount('http://', HTTPAdapter(max_retries = 3))
  275. try:
  276. headers = {
  277. 'contentType':'Application/json'
  278. }
  279. response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)
  280. print(response.text)
  281. except Exception as e:
  282. print(e)
  283. return result
  284. if __name__ == '__main__':
  285. pprint(formatter(parse_layout(path)))