# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2022-07-07 13:12:17
# @Last Modified by:   privacy
# @Last Modified time: 2022-07-14 09:39:42
"""Parser for the "internal talent market" resume template (.docx).

``parse_layout`` reads the tables and paragraphs of the document and returns
a flat list of dicts; ``formatter`` folds that list into a single dict and
renames the well-known fields to English keys.
"""

import re
from pprint import pprint

try:
    import docx
    from docx import Document
    from docx.shared import Inches
except ImportError:
    # python-docx is only needed to open a document.  Keeping the module
    # importable without it lets parse_line() and formatter() be used on
    # already-extracted data; parse_layout() raises ImportError on use.
    docx = Document = Inches = None

path = "d:\\desktop\\内部人才市场简历模板.docx"

# Cell texts (whitespace removed) that are treated as field names.
keywords = [
    "姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌",
    "参加工作时间", "外语水平", "专业技术资格(取得时间)", "计算机水平",
    "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话",
    "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位",
    "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业",
    "起始时间", "毕业时间", "姓名", "职业", "与本人关系",
]

# (header column, section name) pairs.  The first column found in a data row
# decides which section marker is added to that row.
_SECTION_MARKERS = (
    ("学校/培训机构", "学习经历"),
    ("与本人关系", "家庭成员"),
    ("意向地区", "职业发展管理"),
    ("职业证书", "职业资格证书"),
)

_JOB_SECTION = "工作经历"
_JOB_DESCRIPTION = "工作描述"

# A word followed by at most two non-word characters and a colon: "key:".
_FIELD_RE = re.compile(r'(\w+)\W{0,2}:')


def _squash(text):
    """Return *text* with all whitespace removed."""
    return ''.join(text.split())


def parse_line(line):
    """Pair keyword cells with the value cell that follows them.

    Args:
        line: Iterable of cell texts from one table row.

    Returns:
        A list of single-entry dicts ``{keyword: value}``.  A keyword is a
        non-empty cell whose whitespace-stripped text is in ``keywords``;
        the value is the next non-empty, non-keyword cell.  Values that are
        not preceded by a keyword are ignored.
    """
    pairs = []
    pending = None
    for cell in line:
        if not cell:
            continue
        name = _squash(cell)
        if name in keywords:
            pending = name
        elif pending:
            pairs.append({pending: cell})
            pending = None
    return pairs


def _read_table_rows(doc):
    """Return the cell texts of every table row in *doc*, in document order."""
    return [
        [cell.text for cell in row.cells]
        for table in doc.tables
        for row in table.rows
    ]


def _parse_table_rows(rows):
    """Turn raw table rows into field dicts.

    Each row falls into one of three classes:

    * header row - every non-empty cell is a keyword; it is remembered as
      the column names for the data rows that follow;
    * inline row - mixes keywords and values; handled by ``parse_line``;
    * data row - contains no keyword; zipped with the last header row and
      tagged with a section marker.
    """
    records = []
    header = None
    for row in rows:
        names = [_squash(cell) for cell in row]
        is_keyword = [
            bool(cell) and name in keywords for cell, name in zip(row, names)
        ]
        has_value = any(
            cell and not keyword for cell, keyword in zip(row, is_keyword)
        )

        if not has_value:
            header = names
            continue
        if any(is_keyword):
            records.extend(parse_line(row))
            continue
        if header is None:
            # Data row before any header row: there are no column names to
            # attach the values to, so the row is skipped.
            continue

        record = {name: cell for name, cell in zip(header, row) if name}
        for column, section in _SECTION_MARKERS:
            if column in record:
                record[section] = section
                break
        records.append(record)
    return records


def _parse_paragraphs(texts):
    """Collect "key: value" paragraphs into work-experience dicts.

    A new dict is started whenever a key that already holds a non-empty
    value is seen again.  Text without a colon is appended to the work
    description when that was the last field read, otherwise it is dropped.
    The dict being built is always appended at the end, even if it only
    holds the section marker.
    """
    records = []
    job = {_JOB_SECTION: _JOB_SECTION}
    last_field = None

    def continue_description(fragment):
        if last_field == _JOB_DESCRIPTION:
            job[_JOB_DESCRIPTION] += '\n' + fragment.strip()

    for raw in texts:
        # Normalise the full-width colon (U+FF1A) to the ASCII one.
        text = raw.replace("\uff1a", ":")
        if ":" not in text:
            continue_description(text)
            continue

        # Put every "key:" on its own line so that several fields written in
        # one paragraph are split apart.
        text = _FIELD_RE.sub(r'\n\1:', text)
        for line in text.split("\n"):
            if not line.strip():
                continue
            name, colon, value = line.partition(":")
            if not colon:
                # Leading fragment in front of the first "key:".
                continue_description(line)
                continue
            name = name.strip()
            if job.get(name):
                records.append(job)
                job = {_JOB_SECTION: _JOB_SECTION}
            job[name] = value.strip()
            last_field = name

    records.append(job)
    return records


def parse_layout(path):
    """Extract the fields of a resume document.

    Args:
        path: Location of the .docx file, passed to ``docx.Document``.

    Returns:
        A list of dicts: the table fields first (see ``_parse_table_rows``),
        followed by the work-experience dicts read from the paragraphs.

    Raises:
        ImportError: If python-docx is not installed.
    """
    if Document is None:
        raise ImportError("python-docx is required to parse .docx files")
    doc = Document(path)
    result = _parse_table_rows(_read_table_rows(doc))
    result.extend(_parse_paragraphs(p.text for p in doc.paragraphs))
    return result


# Format the data
def formatter(datalist):
    """Fold the dicts produced by ``parse_layout`` into one dict.

    Single-entry dicts are merged as plain ``key: value`` fields.  A dict
    with several entries is grouped under every key whose value equals the
    key itself (the section marker), with that marker removed; dicts without
    a marker are dropped.  Well-known top-level fields and the fields of the
    education entries are then renamed to English keys.

    Args:
        datalist: Iterable of dicts with string values.  The dicts are not
            modified.

    Returns:
        The merged dict.
    """
    normal = {
        "姓名": "name",
        "性别": "gender",
        "邮箱地址": "email",
        "政治面貌": "politics",
        "联系电话": "mobile",
        "籍贯": "birthplace",
        "出生日期": "birth_time",
        "现任职务": "current_job",
        "所在城市": "living_city",
        "参加工作时间": "work_begin_time",
        "意向岗位": "intent_job",
        "熟悉专业有何专长": "skills",
    }
    edunormal = {
        "学校/培训机构": "school_name",
        "专业": "major",
        "起始时间": "start_time",
        "毕业时间": "end_time",
    }

    result = {}
    for item in datalist:
        if len(item) == 1:
            result.update(item)
            continue
        entry = dict(item)  # work on a copy; the caller's dict stays intact
        for key, value in item.items():
            if not isinstance(value, str) or key != _squash(value):
                continue
            entry.pop(key)
            bucket = result.get(key)
            if isinstance(bucket, list):
                bucket.append(entry)
            else:
                result[key] = [entry]

    for key, english in normal.items():
        if result.get(key):
            result[english] = result.pop(key)

    # The education section is optional.
    education = result.get("学习经历")
    if isinstance(education, list):
        for entry in education:
            for key, english in edunormal.items():
                if entry.get(key):
                    entry[english] = entry.pop(key)

    return result


if __name__ == "__main__":
    pprint(formatter(parse_layout(path)))