# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2022-07-11 09:21:24
# @Last Modified by:   privacy
# @Last Modified time: 2022-07-13 15:31:50
"""Parse a resume written in the custom Word table template.

The tables of a ``.docx`` file are read row by row, turned into a list of
small dictionaries (``parse_layout``), then merged and normalised into a
single resume dictionary that is posted to the talent service
(``formatter``).
"""

import re
import logging
from pprint import pprint

import requests
from requests.adapters import HTTPAdapter

from docx import Document
from docx.shared import Inches


path = "d:\\desktop\\自定义.docx"
# path = "d:\\desktop\\内部人才市场简历模板.docx"

# Every label the template may use, either as a field name or as a column
# header.  Cells are compared against this list after all whitespace has been
# removed.  The repeated entries are kept exactly as they were; they do not
# affect membership tests.
keywords = [
    "姓名",
    "性别",
    "出生年月",
    "出生日期",
    "民族",
    "籍贯",
    "户籍地",
    "健康状况",
    "政治面貌(加入时间)",
    "政治面貌(加入时间)",
    "参加工作时间",
    "健康状况",
    "外语水平",
    "专业技术资格(取得时间)",
    "专业技术资格(取得时间)",
    "职业技能等级(取得时间)",
    "职业技能等级(取得时间)",
    "熟悉专业有何专长",
    "学历院校",
    "初始学历、专业",
    "初始学历毕业院校及毕业时间",
    "最高学历、专业",
    "最高学历毕业院校及毕业时间",
    "工作单位",
    "现任职务",
    "任职时间",
    "提职时间",
    "联系电话",
    "邮箱地址",
    "对报名岗位认识及工作设想",
    "意向地区",
    "意向岗位",
    "其他意向岗位",
    "意向单位",
    "意向专业",
    "学习经历",
    "起止时间",
    "学校", "专业", "学历", "学位", "研究方向", "是否全日制",
    "培训经历",
    "培训类型", "机构", "内容", "成绩", "证书名称",
    "工作经历",
    "工作单位", "职务", "部门", "证明人", "备注",
    "项目经历",
    "项目名称", "项目职务", "项目描述", "项目职责", "项目成果",
    "获得职业资格证书情况",
    "获得日期", "名称", "证书编码/文号", "授予单位",
    "奖惩情况",
    "项目", "时间", "项目单位", "证明材料",
    "主要工作业绩(500字以内)",
    "主要工作业绩(500字以内)",
    "自我评价",
    "近三年年度考核结果",
    "主要家庭成员及社会关系",
    "称谓",
    "其他情况说明",
    "工作单位及职务",
    "政治面貌",
    "职业证书",
    "资格等级",
    "取得日期",
    "学校/培训机构",
    "专业",
    "起始时间",
    "毕业时间",
    "姓名",
    "职业",
    "与本人关系",
    "计算机水平",
]

# Scalar fields whose value is rewritten as "YYYY-MM-01" (day always 01).
_MONTH_PRECISION_FIELDS = ("出生年月", "任职时间", "参加工作时间")

# (combined field, school field, graduation-date field)
_GRADUATION_FIELDS = (
    ("最高学历毕业院校及毕业时间", "最高学历毕业院校", "最高学历毕业时间"),
    ("初始学历毕业院校及毕业时间", "初始学历毕业院校", "初始学历毕业时间"),
)

# Sections whose "起止时间" column is rewritten as "YYYY-MM~YYYY-MM".
_PERIOD_SECTIONS = ("学习经历", "培训经历", "工作经历", "项目经历")

# (section, column) pairs whose value is rewritten as "YYYY-MM-01" when it
# contains exactly two numbers.
_SECTION_DATE_COLUMNS = (
    ("获得职业资格证书情况", "获得日期"),
    ("奖惩情况", "时间"),
    ("主要家庭成员及社会关系", "出生年月"),
)

# Template label -> key used in the posted payload.
_FIELD_RENAMES = {
    "姓名": "name",
    "性别": "gender",
    "邮箱地址": "email",
    "政治面貌(加入时间)": "politics",
    "联系电话": "mobile",
    "籍贯": "birthplace",
    "出生年月": "birth_time",
    "现任职务": "current_job",
    "所在城市": "living_city",
    "参加工作时间": "work_begin_time",
    "意向岗位": "intent_job",
    "熟悉专业有何专长": "skills",
}

# Column label -> key, applied to every entry of the education section.
_EDUCATION_RENAMES = {
    "学校": "school_name",
    "专业": "major",
    "学历": "degree",
    "是否全日制": "degree_type",
}


def _squash(text):
    """Return *text* with every whitespace character removed."""
    return ''.join(text.split())


def parse_line(line):
    """Extract ``{keyword: value}`` pairs from a row that mixes both.

    Cells are scanned left to right.  A non-empty cell that is a keyword
    (after whitespace removal) becomes the pending key; the next non-empty
    cell that is not a keyword becomes its value.  Empty cells are skipped
    and a keyword that is followed directly by another keyword is dropped.

    Args:
        line: iterable of cell texts (strings).

    Returns:
        A list of single-entry dictionaries, one per key/value pair found.
    """
    result = []
    key = None
    for cell in line:
        if not cell:
            continue
        squashed = _squash(cell)
        if squashed in keywords:
            key = squashed
        elif key:
            result.append({key: cell})
            key = None
    return result


def _read_table_rows(path):
    """Return the text of every table row in the document at *path*.

    Within a row a cell text is kept only the first time it occurs.
    NOTE(review): this is presumably meant to collapse merged cells, which
    python-docx reports once per grid column, but it also collapses distinct
    cells that happen to hold the same text (e.g. several empty cells), which
    can shift values relative to the header row - confirm.
    """
    rows = []
    for table in Document(path).tables:
        for row in table.rows:
            texts = []
            for cell in row.cells:
                text = cell.text
                if text not in texts:
                    texts.append(text)
            rows.append(texts)
    return rows


def _parse_rows(rows):
    """Classify each row and turn it into dictionaries.

    * Header row: every non-empty cell is a keyword (a row without any
      non-empty cell also counts).  It is remembered and produces no output.
    * Key/value row: more than a third of its cells are keywords; it is
      handed to ``parse_line``.
    * Data row: anything else; its cells are paired positionally with the
      most recent header row, skipping columns whose header is empty.
    """
    result = []
    header = None
    for row in rows:
        squashed = [_squash(cell) for cell in row]
        populated = sum(1 for cell in row if cell)
        hits = sum(
            1 for cell, name in zip(row, squashed) if cell and name in keywords
        )
        if hits == populated:
            header = squashed
            continue
        # Without a header there is nothing to pair a data row with (this
        # used to raise TypeError on zip(None, ...)); salvage whatever
        # key/value pairs the row contains instead.
        if header is None or hits > len(row) / 3:
            result.extend(parse_line(row))
        else:
            result.append(
                {name: cell for name, cell in zip(header, row) if name}
            )
    return result


def parse_layout(path):
    """Read the tables of a ``.docx`` resume and return its raw fields.

    Args:
        path: location of the Word document.

    Returns:
        A list of dictionaries: single-entry ``{keyword: value}`` items for
        key/value rows and multi-entry ``{column: value}`` items for rows of
        a tabular section.
    """
    return _parse_rows(_read_table_rows(path))


def _numbers(text):
    """Return every run of digits in *text* as an int, in order."""
    return [int(token) for token in re.findall(r'\d+', text)]


def _format_date(text, accepted=(1, 2, 3), keep_day=False):
    """Rewrite the numbers found in *text* as ``YYYY-MM-DD``.

    Args:
        text: free-form date such as ``"1990年5月"``.
        accepted: how many numbers *text* may contain for it to be rewritten.
        keep_day: use the third number as the day; otherwise the day is 01.

    Returns:
        The formatted date, or ``None`` when *text* is not a string or holds
        a number count outside *accepted* (the caller then keeps the
        original value).
    """
    if not isinstance(text, str):
        return None
    numbers = _numbers(text)
    if len(numbers) not in accepted:
        return None
    month = numbers[1] if len(numbers) > 1 else 1
    day = numbers[2] if keep_day and len(numbers) > 2 else 1
    return "{:4d}-{:02d}-{:02d}".format(numbers[0], month, day)


def _format_period(text):
    """Rewrite four numbers in *text* as ``YYYY-MM~YYYY-MM``, else ``None``."""
    if not isinstance(text, str):
        return None
    numbers = _numbers(text)
    if len(numbers) != 4:
        return None
    return "{:4d}-{:02d}~{:4d}-{:02d}".format(*numbers)


def _section_entries(result, section):
    """Return the dict entries of a tabular section, or ``[]`` if absent.

    A section name can also end up holding a plain string (a one-cell row);
    such a value has no entries to normalise.
    """
    entries = result.get(section)
    if not isinstance(entries, list):
        return []
    return [entry for entry in entries if isinstance(entry, dict)]


def _merge_rows(datalist):
    """Fold the parsed rows into one dictionary.

    Single-entry items become scalar fields.  In a multi-entry item, a column
    whose value equals its own name identifies the section the row belongs
    to; that column is removed and the rest of the row is appended to the
    section's list.  Rows are copied first so *datalist* is left untouched.
    """
    result = dict()
    for item in datalist:
        if len(item) == 1:
            result.update(item)
            continue
        row = dict(item)
        for name in list(row):
            if name != _squash(row[name]):
                continue
            row.pop(name)
            if isinstance(result.get(name), list):
                result[name].append(row)
            else:
                # Also replaces a scalar stored earlier under the same name,
                # on which .append() would fail.
                result[name] = [row]
    return result


def _normalize_dates(result):
    """Rewrite the known date fields of *result* in place."""
    # NOTE(review): a day given for these three fields is discarded (always
    # "-01"), unlike the graduation dates below - kept as in the original.
    for field in _MONTH_PRECISION_FIELDS:
        formatted = _format_date(result.get(field))
        if formatted is not None:
            result[field] = formatted

    for combined_key, school_key, date_key in _GRADUATION_FIELDS:
        combined = result.get(combined_key)
        if not combined or not isinstance(combined, str):
            continue
        words = re.findall(r'\w+', combined)
        if words:
            result[school_key] = words[0]
        graduated = _format_date(combined, keep_day=True)
        if graduated is not None:
            result[date_key] = graduated
        result.pop(combined_key)

    for section in _PERIOD_SECTIONS:
        for entry in _section_entries(result, section):
            period = _format_period(entry.get("起止时间"))
            if period is not None:
                entry["起止时间"] = period

    for section, column in _SECTION_DATE_COLUMNS:
        for entry in _section_entries(result, section):
            formatted = _format_date(entry.get(column), accepted=(2,))
            if formatted is not None:
                entry[column] = formatted


def _rename_fields(result):
    """Rename template labels to payload keys in place."""
    for source, target in _FIELD_RENAMES.items():
        if result.get(source):
            result[target] = result.pop(source)

    for entry in _section_entries(result, "学习经历"):
        period = entry.get("起止时间")
        if isinstance(period, str):
            parts = period.split("~")
            entry["start_time"] = parts[0]
            entry["end_time"] = parts[-1]
        for source, target in _EDUCATION_RENAMES.items():
            if entry.get(source):
                entry[target] = entry.pop(source)


def _post_resume(result):
    """Send *result* to the talent service; failures are printed, not raised."""
    url = "http://192.168.1.110:9999/talent/getResumeData"
    # NOTE(review): "contentType" is not the standard header name; it is sent
    # unchanged in case the service reads it.  The ``json=`` argument makes
    # requests add the real Content-Type header.
    headers = {
        'contentType': 'Application/json'
    }
    with requests.Session() as session:
        session.mount('http://', HTTPAdapter(max_retries=3))
        try:
            response = session.post(
                url=url,
                headers=headers,
                json={"ResumeData": result},
                timeout=10,
            )
            print(response.text)
        except (requests.RequestException, TypeError, ValueError) as e:
            # Best effort: the formatted data is still returned to the caller.
            print(e)


# Format the data
def formatter(datalist):
    """Merge parsed rows into a resume dictionary, normalise it and post it.

    Args:
        datalist: list of dictionaries as returned by ``parse_layout``.

    Returns:
        The normalised resume dictionary.  Sections that are missing from
        the document are simply absent from the result.

    Side effects:
        Posts the result to the talent service and prints the response text
        (or the error when the request fails).
    """
    result = _merge_rows(datalist)
    _normalize_dates(result)
    _rename_fields(result)
    _post_resume(result)
    return result


if __name__ == '__main__':
    pprint(formatter(parse_layout(path)))