custom.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2022-07-11 09:21:24
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2022-07-15 17:22:00
  6. # 自定义模板
  7. import re
  8. import json
  9. import logging
  10. from pprint import pprint
  11. import requests
  12. from requests.adapters import HTTPAdapter
  13. from docx import Document
  14. from docx.shared import Inches
  15. path = "d:\\desktop\\自定义.docx"
  16. # 关键词字典
  17. keywords = [
  18. "姓名",
  19. "性别",
  20. "出生年月",
  21. "出生日期",
  22. "民族",
  23. "籍贯",
  24. "户籍地",
  25. "健康状况",
  26. "政治面貌(加入时间)",
  27. "政治面貌(加入时间)",
  28. "参加工作时间",
  29. "健康状况",
  30. "外语水平",
  31. "专业技术资格(取得时间)",
  32. "专业技术资格(取得时间)",
  33. "职业技能等级(取得时间)",
  34. "职业技能等级(取得时间)",
  35. "熟悉专业有何专长",
  36. "学历院校",
  37. "初始学历、专业",
  38. "初始学历毕业院校及毕业时间",
  39. "最高学历、专业",
  40. "最高学历毕业院校及毕业时间",
  41. "工作单位",
  42. "现任职务",
  43. "任职时间",
  44. "提职时间",
  45. "联系电话",
  46. "邮箱地址",
  47. "对报名岗位认识及工作设想",
  48. "意向地区",
  49. "意向岗位",
  50. "其他意向岗位",
  51. "意向单位",
  52. "意向专业",
  53. "学习经历",
  54. "起止时间",
  55. "学校","专业","学历","学位","研究方向","是否全日制",
  56. "培训经历",
  57. "培训类型","机构","内容","成绩","证书名称",
  58. "工作经历",
  59. "工作单位","职务","部门","证明人","备注",
  60. "项目经历",
  61. "项目名称","项目职务","项目描述","项目职责","项目成果",
  62. "获得职业资格证书情况",
  63. "获得日期","名称","证书编码/文号","授予单位",
  64. "奖惩情况",
  65. "项目","时间","项目单位","证明材料",
  66. "主要工作业绩(500字以内)",
  67. "主要工作业绩(500字以内)",
  68. "自我评价",
  69. "近三年年度考核结果",
  70. "主要家庭成员及社会关系",
  71. "称谓",
  72. "其他情况说明",
  73. "工作单位及职务",
  74. "政治面貌",
  75. "职业证书",
  76. "资格等级",
  77. "取得日期",
  78. "学校/培训机构",
  79. "专业",
  80. "起始时间",
  81. "毕业时间",
  82. "职业",
  83. "与本人关系",
  84. "计算机水平"
  85. ]
  86. # 解析行内元素
  87. def parse_line(line):
  88. result = []
  89. key = None
  90. for cell in line:
  91. if cell and ''.join(cell.split()) in keywords:
  92. key = ''.join(cell.split())
  93. elif cell and key:
  94. schema = {key:cell}
  95. result.append(schema)
  96. key = None
  97. return result
  98. # 解析文档布局
  99. def parse_layout(path):
  100. result = []
  101. doc = Document(path)
  102. lo = {}
  103. for _table in doc.tables[:]:
  104. for i, row in enumerate(_table.rows[:]):
  105. row_content = []
  106. for cell in row.cells[:]:
  107. c = cell.text
  108. if c not in row_content:
  109. row_content.append(c)
  110. lo[len(lo.keys())] = row_content
  111. kwln = -1# 关键词行长度
  112. kwline = None# 关键词行
  113. for key in lo.keys():
  114. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  115. if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  116. perc = 0# 行内关键词数量
  117. for c in lo[key]:
  118. if c and (''.join(c.split()) in keywords):# 找到此行有关键词
  119. perc += 1
  120. if c and (''.join(c.split()) in keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3,判断此行非关键词行元素
  121. perc = 0# 清空行内关键词数
  122. result.extend(parse_line(lo[key]))# 添加并解析普通行级元素
  123. break
  124. else:# 关键词行元素
  125. schema = dict()
  126. for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素
  127. if key:
  128. schema[key] = val
  129. result.append(schema)
  130. break
  131. break
  132. else:
  133. # print("{}:此行为关键词行!".format(lo[key]))
  134. try:
  135. kwline = [''.join(cell.split()) for cell in lo[key]]
  136. except Exception as e:
  137. kwline = lo[key]
  138. kwln = len(lo[key])
  139. return result
  140. # 格式化数据
  141. def formatter(datalist):
  142. result = dict()
  143. for d in datalist:
  144. if len(d) == 1:# 普通键值对
  145. for key in d.keys():
  146. result[key] = d[key]
  147. else:# 行级元素
  148. for k in list(d.keys()):
  149. if k == "".join(d[k].split()):# 行名
  150. d.pop(k)
  151. if result.get(k):# 多行元素合并
  152. result[k].append(d)
  153. else:
  154. result[k] = [d]
  155. ### 时间格式化
  156. if result.get("出生年月"):
  157. dates = re.findall(r'\d+' , result["出生年月"])
  158. if len(dates) == 1:
  159. result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))
  160. elif len(dates) == 2:
  161. result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  162. elif len(dates) == 3:
  163. result["出生年月"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  164. if result.get("任职时间"):
  165. dates = re.findall(r'\d+' , result["任职时间"])
  166. if len(dates) == 1:
  167. result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))
  168. elif len(dates) == 2:
  169. result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  170. elif len(dates) == 3:
  171. result["任职时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  172. if result.get("参加工作时间"):
  173. dates = re.findall(r'\d+' , result["参加工作时间"])
  174. if len(dates) == 1:
  175. result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))
  176. elif len(dates) == 2:
  177. result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  178. elif len(dates) == 3:
  179. result["参加工作时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  180. if result.get("最高学历毕业院校及毕业时间"):
  181. dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])
  182. ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])
  183. if len(ws) > 0:
  184. result["最高学历毕业院校"] = ws[0]
  185. if len(dates) == 1:
  186. result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  187. elif len(dates) == 2:
  188. result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  189. elif len(dates) == 3:
  190. result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  191. result.pop("最高学历毕业院校及毕业时间")
  192. if result.get("初始学历毕业院校及毕业时间"):
  193. dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])
  194. ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])
  195. if len(ws) > 0:
  196. result["初始学历毕业院校"] = ws[0]
  197. if len(dates) == 1:
  198. result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  199. elif len(dates) == 2:
  200. result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  201. elif len(dates) == 3:
  202. result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  203. result.pop("初始学历毕业院校及毕业时间")
  204. if result.get("学习经历"):
  205. for idx, edu in enumerate(result["学习经历"]):
  206. if edu.get("起止时间"):
  207. dates = re.findall(r'\d+' , edu["起止时间"])
  208. if len(dates) == 4:
  209. result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  210. if result.get("培训经历"):
  211. for idx, edu in enumerate(result["培训经历"]):
  212. if edu.get("起止时间"):
  213. dates = re.findall(r'\d+' , edu["起止时间"])
  214. if len(dates) == 4:
  215. result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  216. if result.get("工作经历"):
  217. for idx, edu in enumerate(result["工作经历"]):
  218. if edu.get("起止时间"):
  219. dates = re.findall(r'\d+' , edu["起止时间"])
  220. if len(dates) == 4:
  221. result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  222. if result.get("项目经历"):
  223. for idx, edu in enumerate(result["项目经历"]):
  224. if edu.get("起止时间"):
  225. dates = re.findall(r'\d+' , edu["起止时间"])
  226. if len(dates) == 4:
  227. result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  228. if result.get("获得职业资格证书情况"):
  229. for idx, edu in enumerate(result["获得职业资格证书情况"]):
  230. if edu.get("获得日期"):
  231. dates = re.findall(r'\d+' , edu["获得日期"])
  232. if len(dates) == 2:
  233. result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  234. if result.get("奖惩情况"):
  235. for idx, edu in enumerate(result["奖惩情况"]):
  236. if edu.get("时间"):
  237. dates = re.findall(r'\d+' , edu["时间"])
  238. if len(dates) == 2:
  239. result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  240. if result.get("主要家庭成员及社会关系"):
  241. for idx, fam in enumerate(result["主要家庭成员及社会关系"]):
  242. if fam.get("出生年月"):
  243. dates = re.findall(r'\d+' , fam["出生年月"])
  244. if len(dates) == 2:
  245. result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  246. # 转译数据库字段名
  247. with open("./resources/translate.json", "r", encoding="utf-8") as ff:
  248. json_obj = json.load(ff)
  249. normal = json_obj["base"]
  250. edunormal = json_obj["tal_his_edu"]
  251. family = json_obj["tal_family_social_relations"]
  252. for key in normal.keys():
  253. if result.get(key):
  254. result[normal[key]] = result[key]
  255. result.pop(key)
  256. for idx in range(len(result['学习经历'])):
  257. result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]
  258. result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]
  259. for key in edunormal.keys():
  260. if result['学习经历'][idx].get(key):
  261. result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
  262. result['学习经历'][idx].pop(key)
  263. for idx in range(len(result['主要家庭成员及社会关系'])):
  264. for key in family.keys():
  265. if result['主要家庭成员及社会关系'][idx].get(key):
  266. result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
  267. result['主要家庭成员及社会关系'][idx].pop(key)
  268. tit = {
  269. "基本信息":"base",
  270. "求职意向":"intent_job",
  271. "学习经历":"tal_his_edu",
  272. "工作经历":"tal_his_job",
  273. "项目经历":"tal_his_project",
  274. "培训经历":"tal_training_institutions",
  275. "获奖情况":"tal_rewards_punishments",
  276. "语言能力":"tal_language",
  277. "证书":"tal_vocational_qualification_certificate",
  278. "专业技能":"tal_professional_tech_certificate",
  279. "主要家庭成员及社会关系":"tal_family_social_relations"
  280. }
  281. for key in tit.keys():
  282. if result.get(key):
  283. result[tit[key]] = result[key]
  284. result.pop(key)
  285. # url = "http://192.168.1.110:9999/talent/getResumeData"
  286. # session = requests.Session()
  287. # session.mount('http://', HTTPAdapter(max_retries = 3))
  288. # try:
  289. # headers = {
  290. # 'contentType':'Application/json'
  291. # }
  292. # response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)
  293. # print(response.text)
  294. # except Exception as e:
  295. # print(e)
  296. return result
  297. if __name__ == '__main__':
  298. pprint(formatter(parse_layout(path)))