|
@@ -0,0 +1,269 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# @Author: privacy
|
|
|
+# @Date: 2022-07-11 09:21:24
|
|
|
+# @Last Modified by: privacy
|
|
|
+# @Last Modified time: 2022-07-12 16:30:08
|
|
|
+import re
|
|
|
+import logging
|
|
|
+from pprint import pprint
|
|
|
+
|
|
|
+from docx import Document
|
|
|
+from docx.shared import Inches
|
|
|
+
|
|
|
+
|
|
|
# Sample document parsed when this file is run as a script.
path = "d:\\desktop\\自定义.docx"
# path = "d:\\desktop\\内部人才市场简历模板.docx"

# Labels recognised in resume tables.  Cell text is compared against this
# list after all whitespace has been removed from the cell.
# Each label is listed once.  Labels that contain brackets are listed with
# both ASCII "()" and fullwidth "\uff08\uff09" brackets so that a template
# typed with either form is matched.
keywords = [
    # --- basic information ---
    "姓名",
    "性别",
    "出生年月",
    "出生日期",
    "民族",
    "籍贯",
    "户籍地",
    "健康状况",
    "政治面貌(加入时间)",
    "政治面貌\uff08加入时间\uff09",
    "参加工作时间",
    "外语水平",
    "专业技术资格(取得时间)",
    "专业技术资格\uff08取得时间\uff09",
    "职业技能等级(取得时间)",
    "职业技能等级\uff08取得时间\uff09",
    "熟悉专业有何专长",
    "学历院校",
    "初始学历、专业",
    "初始学历毕业院校及毕业时间",
    "最高学历、专业",
    "最高学历毕业院校及毕业时间",
    "工作单位",
    "现任职务",
    "任职时间",
    "提职时间",
    "联系电话",
    "邮箱地址",
    "对报名岗位认识及工作设想",
    # --- job intention ---
    "意向地区",
    "意向岗位",
    "其他意向岗位",
    "意向单位",
    "意向专业",
    # --- education ---
    "学习经历",
    "起止时间",
    "学校", "专业", "学历", "学位", "研究方向", "是否全日制",
    # --- training ---
    "培训经历",
    "培训类型", "机构", "内容", "成绩", "证书名称",
    # --- work experience ---
    "工作经历",
    "职务", "部门", "证明人", "备注",
    # --- projects ---
    "项目经历",
    "项目名称", "项目职务", "项目描述", "项目职责", "项目成果",
    # --- certificates ---
    "获得职业资格证书情况",
    "获得日期", "名称", "证书编码/文号", "授予单位",
    # --- rewards and penalties ---
    "奖惩情况",
    "项目", "时间", "项目单位", "证明材料",
    # --- free-text sections ---
    "主要工作业绩(500字以内)",
    "主要工作业绩\uff08500字以内\uff09",
    "自我评价",
    "近三年年度考核结果",
    # --- family and miscellaneous ---
    "主要家庭成员及社会关系",
    "称谓",
    "其他情况说明",
    "工作单位及职务",
    "政治面貌",
    "职业证书", "资格等级", "取得日期", "学校/培训机构",
    "起始时间", "毕业时间", "职业", "与本人关系", "计算机水平",
]
|
|
|
+
|
|
|
def parse_line(line, vocabulary=None):
    """Pair each label cell of a table row with the value cell that follows.

    Cells are scanned left to right.  A cell whose text, with all whitespace
    removed, is a known label becomes the pending key.  The next cell that
    has visible text and is not itself a label is stored as that key's
    value, after which the pending key is cleared.

    Args:
        line: Iterable of cell texts (strings) from one table row.
        vocabulary: Optional container of recognised labels.  When omitted,
            the module-level ``keywords`` list is used.

    Returns:
        A list of single-entry dicts ``{label: value}`` in row order.  The
        label has its whitespace removed; the value is the cell text
        unchanged.  Values with no preceding label, and labels followed
        directly by another label, produce no entry.
    """
    known = keywords if vocabulary is None else vocabulary

    result = []
    key = None
    for cell in line:
        if not cell:
            continue
        compact = ''.join(cell.split())
        if not compact:
            # A whitespace-only cell is neither a label nor a value, so it
            # must not use up the pending key.
            continue
        if compact in known:
            key = compact
        elif key is not None:
            result.append({key: cell})
            key = None
    return result
|
|
|
+
|
|
|
+
|
|
|
def parse_layout(path):
    """Extract label/value records from every table of a .docx document.

    Each table row is classified by how many of its cells are known labels
    (cell text with whitespace removed, looked up in ``keywords``):

    * Header row: every non-empty cell is a label.  It is remembered as the
      column header for the rows that follow and yields no record.
    * Inline row: it has at least one non-label cell and more than a third
      of its cells are labels.  It is passed to ``parse_line`` and yields
      one single-entry dict per label/value pair.
    * Data row: any other row.  Its cells are zipped with the most recent
      header row into one multi-entry dict.

    Args:
        path: Location of the document, passed straight to
            ``docx.Document``.

    Returns:
        A list of dicts in document order, suitable for ``formatter``.
    """
    doc = Document(path)

    # Flatten all tables into one list of rows.  Within a row each distinct
    # cell text is kept once, in order of first appearance.
    # NOTE(review): presumably this collapses merged cells, which python-docx
    # reports once per grid column.  It also collapses separate cells that
    # happen to hold the same text, including repeated empty cells, which
    # can shift columns relative to the header.  Confirm against real
    # templates.
    rows = []
    for table in doc.tables:
        for row in table.rows:
            texts = []
            for cell in row.cells:
                text = cell.text
                if text not in texts:
                    texts.append(text)
            rows.append(texts)

    result = []
    header = None  # whitespace-free texts of the latest all-label row
    for texts in rows:
        compact = [''.join(text.split()) for text in texts]
        is_label = [bool(text) and name in keywords
                    for text, name in zip(texts, compact)]

        has_value = any(text and not label
                        for text, label in zip(texts, is_label))
        if not has_value:
            header = compact
            continue

        if sum(is_label) > len(texts) / 3:
            # Enough labels to treat the row as label/value pairs.
            result.extend(parse_line(texts))
        elif header is None:
            # A data row before any header row (e.g. a title line) has no
            # column names to map to, so it is skipped rather than crashing.
            logging.debug("row skipped, no header row seen yet: %r", texts)
        else:
            # Empty header cells carry no column name and are dropped.
            result.append({name: text
                           for name, text in zip(header, texts) if name})
    return result
|
|
|
+
|
|
|
+
|
|
|
def _date_numbers(text):
    """Return every run of digits in *text* as ints; [] for non-strings."""
    if not isinstance(text, str):
        return []
    return [int(chunk) for chunk in re.findall(r'\d+', text)]


def _as_date(text, keep_day=False):
    """Render the one to three numbers found in *text* as a date string.

    One number gives ``YYYY-01-01`` and two give ``YYYY-MM-01``.  Three give
    ``YYYY-MM-DD`` when *keep_day* is true, otherwise ``YYYY-MM-01``.
    Returns None for any other count of numbers.
    """
    numbers = _date_numbers(text)
    if len(numbers) == 1:
        return "{:4d}-01-01".format(numbers[0])
    if len(numbers) == 2 or (len(numbers) == 3 and not keep_day):
        return "{:4d}-{:02d}-01".format(numbers[0], numbers[1])
    if len(numbers) == 3:
        return "{:4d}-{:02d}-{:02d}".format(*numbers)
    return None


def _section_rows(result, section):
    """Return the dict rows stored under *section*, or [] if it is no list."""
    rows = result.get(section)
    if not isinstance(rows, list):
        return []
    return [row for row in rows if isinstance(row, dict)]


# Normalise the parsed records into one result dict.
def formatter(datalist):
    """Merge parsed records into one dict and normalise the date fields.

    Single-entry dicts become scalar fields of the result.  A multi-entry
    dict is a table row: every key whose value equals the key itself (after
    whitespace removal) names the section the row belongs to.  The row,
    without those section keys, is appended to the list stored under each
    such section name.  Multi-entry dicts with no such key are ignored.

    Date normalisation then runs as follows:

    * "出生年月", "任职时间", "参加工作时间" become ``YYYY-MM-01``.
    * "最高学历毕业院校及毕业时间" and "初始学历毕业院校及毕业时间" are split
      into a school field (the first ``\\w+`` run) and a date field
      (``YYYY-MM-DD``), and the combined key is removed.
    * "起止时间" in the education, training, work and project sections
      becomes ``YYYY-MM~YYYY-MM`` when it holds exactly four numbers.
    * "获得日期", "时间" and "出生年月" in the certificate, reward and
      family sections become ``YYYY-MM-01`` when they hold exactly two
      numbers.

    Values that do not match the expected number count are left unchanged.

    Args:
        datalist: Iterable of dicts as produced by ``parse_layout``.  The
            dicts are not modified.

    Returns:
        The merged and normalised dict.
    """
    result = dict()

    for entry in datalist:
        if len(entry) == 1:
            result.update(entry)
            continue

        sections = [key for key, value in entry.items()
                    if isinstance(value, str) and key == "".join(value.split())]
        if not sections:
            continue
        # Work on a copy so the caller's dicts are left untouched.
        record = {key: value for key, value in entry.items()
                  if key not in sections}
        for name in sections:
            existing = result.get(name)
            if isinstance(existing, list) and existing:
                existing.append(record)
            else:
                result[name] = [record]

    # Scalar dates.  The day is deliberately not kept for these fields.
    for field in ("出生年月", "任职时间", "参加工作时间"):
        formatted = _as_date(result.get(field))
        if formatted is not None:
            result[field] = formatted

    # Combined "school and graduation date" fields.
    # NOTE(review): ``\w`` also matches digits, so a school name written
    # directly against the date ("XX大学2012") captures the digits as well.
    for combined, school_key, date_key in (
        ("最高学历毕业院校及毕业时间", "最高学历毕业院校", "最高学历毕业时间"),
        ("初始学历毕业院校及毕业时间", "初始学历毕业院校", "初始学历毕业时间"),
    ):
        text = result.get(combined)
        if not text or not isinstance(text, str):
            continue
        words = re.findall(r'\w+', text)
        if words:
            result[school_key] = words[0]
        graduated = _as_date(text, keep_day=True)
        if graduated is not None:
            result[date_key] = graduated
        result.pop(combined)

    # Start/end periods inside list sections.
    for section in ("学习经历", "培训经历", "工作经历", "项目经历"):
        for row in _section_rows(result, section):
            numbers = _date_numbers(row.get("起止时间"))
            if len(numbers) == 4:
                row["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(*numbers)

    # Year-month fields inside list sections.
    for section, field in (
        ("获得职业资格证书情况", "获得日期"),
        ("奖惩情况", "时间"),
        ("主要家庭成员及社会关系", "出生年月"),
    ):
        for row in _section_rows(result, section):
            numbers = _date_numbers(row.get(field))
            if len(numbers) == 2:
                row[field] = "{:4d}-{:02d}-01".format(*numbers)

    return result
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Script entry point: parse the document configured in ``path`` and
    # pretty-print the normalised result.
    parsed_rows = parse_layout(path)
    pprint(formatter(parsed_rows))
|
|
|
+
|
|
|
+
|