123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315 |
- # -*- coding: utf-8 -*-
- # @Author: privacy
- # @Date: 2022-07-11 09:21:24
- # @Last Modified by: privacy
- # @Last Modified time: 2022-07-14 11:00:31
- # 自定义模板
- import re
- import json
- import logging
- from pprint import pprint
- import requests
- from requests.adapters import HTTPAdapter
- from docx import Document
- from docx.shared import Inches
- path = "d:\\desktop\\自定义.docx"
- # 关键词字典
- keywords = [
- "姓名",
- "性别",
- "出生年月",
- "出生日期",
- "民族",
- "籍贯",
- "户籍地",
- "健康状况",
- "政治面貌(加入时间)",
- "政治面貌(加入时间)",
- "参加工作时间",
- "健康状况",
- "外语水平",
- "专业技术资格(取得时间)",
- "专业技术资格(取得时间)",
- "职业技能等级(取得时间)",
- "职业技能等级(取得时间)",
- "熟悉专业有何专长",
- "学历院校",
- "初始学历、专业",
- "初始学历毕业院校及毕业时间",
- "最高学历、专业",
- "最高学历毕业院校及毕业时间",
- "工作单位",
- "现任职务",
- "任职时间",
- "提职时间",
- "联系电话",
- "邮箱地址",
- "对报名岗位认识及工作设想",
- "意向地区",
- "意向岗位",
- "其他意向岗位",
- "意向单位",
- "意向专业",
- "学习经历",
- "起止时间",
- "学校","专业","学历","学位","研究方向","是否全日制",
- "培训经历",
- "培训类型","机构","内容","成绩","证书名称",
- "工作经历",
- "工作单位","职务","部门","证明人","备注",
- "项目经历",
- "项目名称","项目职务","项目描述","项目职责","项目成果",
- "获得职业资格证书情况",
- "获得日期","名称","证书编码/文号","授予单位",
- "奖惩情况",
- "项目","时间","项目单位","证明材料",
- "主要工作业绩(500字以内)",
- "主要工作业绩(500字以内)",
- "自我评价",
- "近三年年度考核结果",
- "主要家庭成员及社会关系",
- "称谓",
- "其他情况说明",
- "工作单位及职务",
- "政治面貌",
- "职业证书",
- "资格等级",
- "取得日期",
- "学校/培训机构",
- "专业",
- "起始时间",
- "毕业时间",
- "职业",
- "与本人关系",
- "计算机水平"
- ]
def parse_line(line, known_keywords=None):
    """Pair each keyword cell in a row with the value cell that follows it.

    Cells are scanned left to right. A non-empty cell whose text, with all
    whitespace removed, is a known keyword becomes the pending key (replacing
    any key still pending). The next non-empty cell that is not a keyword is
    taken as that key's value, and the pending key is cleared. Empty cells are
    skipped, and values with no pending key are ignored.

    Args:
        line: Iterable of cell texts for one table row.
        known_keywords: Optional container of recognised labels. When omitted,
            the module-level ``keywords`` list is used.

    Returns:
        A list of single-entry dicts ``{keyword: raw_cell_text}`` in row order.
        The key is the whitespace-stripped label; the value is the cell text
        exactly as given.
    """
    if known_keywords is None:
        known_keywords = keywords

    result = []
    key = None
    for cell in line:
        if not cell:
            continue
        # Labels in templates are often padded with spaces ("姓 名"), so
        # compare with every whitespace character removed.
        label = ''.join(cell.split())
        if label in known_keywords:
            key = label
        elif key:
            result.append({key: cell})
            key = None
    return result
def parse_layout(path):
    """Extract keyword/value records from every table of a .docx file.

    Each table row is reduced to the list of its distinct cell texts and then
    classified:

    * header row -- every non-empty cell is a keyword (a row with no
      non-empty cell also falls in this category). It is remembered, with
      whitespace stripped from each label, as the current column header.
    * inline row -- the row contains a non-keyword value and more than one
      third of its cells are keywords. It is handed to ``parse_line`` and
      contributes one single-entry dict per keyword/value pair.
    * data row -- any other row. Its cells are zipped with the most recent
      header row into one multi-entry dict (columns whose header label is
      empty are dropped).

    Args:
        path: Path of the .docx document to read.

    Returns:
        A list of dicts in document order, suitable for ``formatter``.
    """
    result = []
    doc = Document(path)

    # Collect all rows of all tables as lists of cell text.
    rows = []
    for table in doc.tables:
        for row in table.rows:
            row_content = []
            for cell in row.cells:
                text = cell.text
                # Keep only the first occurrence of each text, presumably to
                # collapse merged cells that are reported once per grid column.
                # NOTE(review): this also collapses distinct cells that share
                # the same text (e.g. several empty cells), which can shift
                # data columns relative to the header -- confirm acceptable.
                if text not in row_content:
                    row_content.append(text)
            rows.append(row_content)

    header = None  # most recent header row, labels stripped of whitespace
    for row in rows:
        is_keyword = [
            bool(text) and ''.join(text.split()) in keywords for text in row
        ]
        has_value = any(
            text and not flag for text, flag in zip(row, is_keyword)
        )

        if not has_value:
            header = [''.join(text.split()) for text in row]
            continue

        # Counting all keywords up front is equivalent to the original
        # incremental check: the running count only grows, so it exceeds the
        # threshold at some point exactly when the final count does.
        keyword_count = sum(is_keyword)
        if keyword_count > len(row) / 3 or header is None:
            # Without a header there is nothing to zip against; fall back to
            # inline parsing instead of failing on zip(None, ...).
            result.extend(parse_line(row))
        else:
            schema = {
                label: text for label, text in zip(header, row) if label
            }
            result.append(schema)
    return result
def _date_numbers(text):
    """Return every run of digits in *text* as a list of ints."""
    return [int(chunk) for chunk in re.findall(r'\d+', text)]


def _to_full_date(text):
    """Return *text* as ``YYYY-MM-DD`` built from its 1-3 numbers, else None.

    A missing month or day defaults to ``01``.
    """
    numbers = _date_numbers(text)
    if len(numbers) == 1:
        return "{:4d}-01-01".format(numbers[0])
    if len(numbers) == 2:
        return "{:4d}-{:02d}-01".format(numbers[0], numbers[1])
    if len(numbers) == 3:
        return "{:4d}-{:02d}-{:02d}".format(numbers[0], numbers[1], numbers[2])
    return None


def _dict_entries(section):
    """Return the dict items of *section* if it is a list, else an empty list."""
    if not isinstance(section, list):
        return []
    return [entry for entry in section if isinstance(entry, dict)]


def _reformat_entry_field(result, section, field, count, template):
    """Rewrite ``entry[field]`` in a section when it holds exactly *count* numbers."""
    for entry in _dict_entries(result.get(section)):
        raw = entry.get(field)
        if not raw or not isinstance(raw, str):
            continue
        numbers = _date_numbers(raw)
        if len(numbers) == count:
            entry[field] = template.format(*numbers)


def _rename_keys(record, mapping):
    """Rename keys of *record* in place per *mapping*; falsy values are left alone."""
    for old_name, new_name in mapping.items():
        if record.get(old_name):
            record[new_name] = record.pop(old_name)


def formatter(datalist, translate_path="./resources/translate.json"):
    """Merge parsed records into one dict, normalise dates, translate keys.

    Steps, in order:

    1. Single-entry dicts are copied into the result as plain fields.
       Multi-entry dicts are treated as table rows: an entry whose value
       (whitespace removed) equals its own key names the section; that entry
       is removed and the remaining dict is appended to ``result[section]``.
       Rows without such an entry are ignored. The input dicts are mutated.
    2. Date-like fields are rewritten: single dates become ``YYYY-MM-DD``,
       "start - end" periods with four numbers become ``YYYY-MM~YYYY-MM``,
       and year/month columns become ``YYYY-MM-01``. Values whose number
       count does not match are left untouched.
    3. The combined "school and graduation time" fields are split into a
       school field (first ``\\w+`` run) and a graduation-date field.
    4. Keys are renamed using the JSON file at *translate_path*, which must
       provide the objects ``base``, ``tal_his_edu`` and
       ``tal_family_social_relations``. Education entries also receive
       ``start_time`` / ``end_time`` taken from either side of ``~``.

    Sections or fields that are absent are skipped rather than raising.

    Args:
        datalist: List of dicts as produced by ``parse_layout``.
        translate_path: Location of the key-translation JSON file.

    Returns:
        The merged, normalised and translated dict.

    Raises:
        OSError: If *translate_path* cannot be opened.
        KeyError: If the translation file lacks one of the three objects.
    """
    result = dict()
    for record in datalist:
        if len(record) == 1:  # plain key/value pair
            for key in record.keys():
                result[key] = record[key]
        else:  # table row belonging to a named section
            for k in list(record.keys()):
                if k == "".join(record[k].split()):  # this entry names the section
                    record.pop(k)
                    if result.get(k):  # further rows of the same section
                        result[k].append(record)
                    else:
                        result[k] = [record]

    # --- date normalisation: top-level fields ---
    for field in ("出生年月", "任职时间", "参加工作时间"):
        raw = result.get(field)
        if raw and isinstance(raw, str):
            formatted = _to_full_date(raw)
            if formatted is not None:
                result[field] = formatted

    # --- split "school and graduation time" into two fields ---
    combined_fields = (
        ("最高学历毕业院校及毕业时间", "最高学历毕业院校", "最高学历毕业时间"),
        ("初始学历毕业院校及毕业时间", "初始学历毕业院校", "初始学历毕业时间"),
    )
    for combined_key, school_key, time_key in combined_fields:
        raw = result.get(combined_key)
        if raw and isinstance(raw, str):
            words = re.findall(r'\w+', raw)
            if len(words) > 0:
                result[school_key] = words[0]
            formatted = _to_full_date(raw)
            if formatted is not None:
                result[time_key] = formatted
            result.pop(combined_key)

    # --- date normalisation: per-row fields ---
    period_template = "{:4d}-{:02d}~{:4d}-{:02d}"
    for section in ("学习经历", "培训经历", "工作经历", "项目经历"):
        _reformat_entry_field(result, section, "起止时间", 4, period_template)

    month_template = "{:4d}-{:02d}-01"
    _reformat_entry_field(result, "获得职业资格证书情况", "获得日期", 2, month_template)
    _reformat_entry_field(result, "奖惩情况", "时间", 2, month_template)
    _reformat_entry_field(result, "主要家庭成员及社会关系", "出生年月", 2, month_template)

    # --- translate keys to database field names ---
    with open(translate_path, "r", encoding="utf-8") as ff:
        json_obj = json.load(ff)
    normal = json_obj["base"]
    edunormal = json_obj["tal_his_edu"]
    family = json_obj["tal_family_social_relations"]

    _rename_keys(result, normal)

    for entry in _dict_entries(result.get('学习经历')):
        # start/end are derived before renaming so the source key is still present.
        if "起止时间" in entry:
            period = entry["起止时间"]
            entry['start_time'] = period.split("~")[0]
            entry['end_time'] = period.split("~")[-1]
        _rename_keys(entry, edunormal)

    for entry in _dict_entries(result.get('主要家庭成员及社会关系')):
        _rename_keys(entry, family)

    # NOTE: the source carried a commented-out HTTP POST of `result` to a
    # remote endpoint; no upload is performed here.
    return result
if __name__ == '__main__':
    # Script entry point: parse the default template, normalise the records
    # and pretty-print the resulting dict.
    parsed_records = parse_layout(path)
    formatted = formatter(parsed_records)
    pprint(formatted)
|