# custom.py (11 KB)
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2022-07-11 09:21:24
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2022-07-12 16:30:08
  6. import re
  7. import logging
  8. from pprint import pprint
  9. from docx import Document
  10. from docx.shared import Inches
  11. path = "d:\\desktop\\自定义.docx"
  12. # path = "d:\\desktop\\内部人才市场简历模板.docx"
  13. keywords = [
  14. "姓名",
  15. "性别",
  16. "出生年月",
  17. "出生日期",
  18. "民族",
  19. "籍贯",
  20. "户籍地",
  21. "健康状况",
  22. "政治面貌(加入时间)",
  23. "政治面貌(加入时间)",
  24. "参加工作时间",
  25. "健康状况",
  26. "外语水平",
  27. "专业技术资格(取得时间)",
  28. "专业技术资格(取得时间)",
  29. "职业技能等级(取得时间)",
  30. "职业技能等级(取得时间)",
  31. "熟悉专业有何专长",
  32. "学历院校",
  33. "初始学历、专业",
  34. "初始学历毕业院校及毕业时间",
  35. "最高学历、专业",
  36. "最高学历毕业院校及毕业时间",
  37. "工作单位",
  38. "现任职务",
  39. "任职时间",
  40. "提职时间",
  41. "联系电话",
  42. "邮箱地址",
  43. "对报名岗位认识及工作设想",
  44. "意向地区",
  45. "意向岗位",
  46. "其他意向岗位",
  47. "意向单位",
  48. "意向专业",
  49. "学习经历",
  50. "起止时间",
  51. "学校","专业","学历","学位","研究方向","是否全日制",
  52. "培训经历",
  53. "培训类型","机构","内容","成绩","证书名称",
  54. "工作经历",
  55. "工作单位","职务","部门","证明人","备注",
  56. "项目经历",
  57. "项目名称","项目职务","项目描述","项目职责","项目成果",
  58. "获得职业资格证书情况",
  59. "获得日期","名称","证书编码/文号","授予单位",
  60. "奖惩情况",
  61. "项目","时间","项目单位","证明材料",
  62. "主要工作业绩(500字以内)",
  63. "主要工作业绩(500字以内)",
  64. "自我评价",
  65. "近三年年度考核结果",
  66. "主要家庭成员及社会关系",
  67. "称谓",
  68. "其他情况说明",
  69. "工作单位及职务",
  70. "政治面貌",
  71. "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系", "计算机水平"
  72. ]
  73. def parse_line(line):
  74. result = []
  75. key = None
  76. for cell in line:
  77. if cell and ''.join(cell.split()) in keywords:
  78. key = ''.join(cell.split())
  79. elif cell and key:
  80. schema = {key:cell}
  81. result.append(schema)
  82. key = None
  83. return result
  84. def parse_layout(path):
  85. result = []
  86. doc = Document(path)
  87. lo = {}
  88. tables = doc.tables
  89. for _table in tables[:]:
  90. for i, row in enumerate(_table.rows[:]):
  91. row_content = []
  92. for cell in row.cells[:]:
  93. c = cell.text
  94. # row_content.append(c)
  95. if c not in row_content:
  96. row_content.append(c)
  97. lo[len(lo.keys())] = row_content
  98. kwln = -1
  99. kwline = None
  100. for key in lo.keys():
  101. # pdb.set_trace()
  102. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  103. if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  104. # pdb.set_trace()
  105. perc = 0
  106. for c in lo[key]:
  107. # pdb.set_trace()
  108. if c and (''.join(c.split()) in keywords):
  109. perc += 1
  110. if c and (''.join(c.split()) in keywords) and (perc > len(lo[key])/3):# 非关键词行元素
  111. # print(c)
  112. # print(perc)
  113. # print(lo[key])
  114. perc = 0
  115. result.extend(parse_line(lo[key]))
  116. break
  117. else:# 关键词行元素
  118. schema = dict()
  119. for key, val in zip(kwline, lo[key]):
  120. if key:
  121. schema[key] = val
  122. result.append(schema)
  123. break
  124. break
  125. else:
  126. # print("{}\t\t此行为关键词行".format(lo[key]))
  127. try:
  128. kwline = [''.join(cell.split()) for cell in lo[key]]
  129. except Exception as e:
  130. kwline = lo[key]
  131. kwln = len(lo[key])
  132. return result
  133. # 格式化数据
  134. def formatter(datalist):
  135. result = dict()
  136. for d in datalist:
  137. if len(d) == 1:
  138. for key in d.keys():
  139. result[key] = d[key]
  140. else:
  141. for k in list(d.keys()):
  142. if k == "".join(d[k].split()):
  143. d.pop(k)
  144. if result.get(k):
  145. result[k].append(d)
  146. else:
  147. result[k] = [d]
  148. if result.get("出生年月"):
  149. dates = re.findall(r'\d+' , result["出生年月"])
  150. if len(dates) == 1:
  151. result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))
  152. elif len(dates) == 2:
  153. result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  154. elif len(dates) == 3:
  155. result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  156. if result.get("任职时间"):
  157. dates = re.findall(r'\d+' , result["任职时间"])
  158. if len(dates) == 1:
  159. result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))
  160. elif len(dates) == 2:
  161. result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  162. elif len(dates) == 3:
  163. result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  164. if result.get("参加工作时间"):
  165. dates = re.findall(r'\d+' , result["参加工作时间"])
  166. if len(dates) == 1:
  167. result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))
  168. elif len(dates) == 2:
  169. result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  170. elif len(dates) == 3:
  171. result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  172. if result.get("最高学历毕业院校及毕业时间"):
  173. dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])
  174. ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])
  175. if len(ws) > 0:
  176. result["最高学历毕业院校"] = ws[0]
  177. if len(dates) == 1:
  178. result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  179. elif len(dates) == 2:
  180. result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  181. elif len(dates) == 3:
  182. result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  183. result.pop("最高学历毕业院校及毕业时间")
  184. if result.get("初始学历毕业院校及毕业时间"):
  185. dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])
  186. ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])
  187. if len(ws) > 0:
  188. result["初始学历毕业院校"] = ws[0]
  189. if len(dates) == 1:
  190. result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  191. elif len(dates) == 2:
  192. result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  193. elif len(dates) == 3:
  194. result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  195. result.pop("初始学历毕业院校及毕业时间")
  196. if result.get("学习经历"):
  197. for idx, edu in enumerate(result["学习经历"]):
  198. if edu.get("起止时间"):
  199. dates = re.findall(r'\d+' , edu["起止时间"])
  200. if len(dates) == 4:
  201. result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  202. if result.get("培训经历"):
  203. for idx, edu in enumerate(result["培训经历"]):
  204. if edu.get("起止时间"):
  205. dates = re.findall(r'\d+' , edu["起止时间"])
  206. if len(dates) == 4:
  207. result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  208. if result.get("工作经历"):
  209. for idx, edu in enumerate(result["工作经历"]):
  210. if edu.get("起止时间"):
  211. dates = re.findall(r'\d+' , edu["起止时间"])
  212. if len(dates) == 4:
  213. result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  214. if result.get("项目经历"):
  215. for idx, edu in enumerate(result["项目经历"]):
  216. if edu.get("起止时间"):
  217. dates = re.findall(r'\d+' , edu["起止时间"])
  218. if len(dates) == 4:
  219. result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  220. if result.get("获得职业资格证书情况"):
  221. for idx, edu in enumerate(result["获得职业资格证书情况"]):
  222. if edu.get("获得日期"):
  223. dates = re.findall(r'\d+' , edu["获得日期"])
  224. if len(dates) == 2:
  225. result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  226. if result.get("奖惩情况"):
  227. for idx, edu in enumerate(result["奖惩情况"]):
  228. if edu.get("时间"):
  229. dates = re.findall(r'\d+' , edu["时间"])
  230. if len(dates) == 2:
  231. result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  232. if result.get("主要家庭成员及社会关系"):
  233. for idx, fam in enumerate(result["主要家庭成员及社会关系"]):
  234. if fam.get("出生年月"):
  235. dates = re.findall(r'\d+' , fam["出生年月"])
  236. if len(dates) == 2:
  237. result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  238. return result
  239. if __name__ == '__main__':
  240. pprint(formatter(parse_layout(path)))