# -*- coding: utf-8 -*-
# xzc/resume-parse
# @Author: privacy
# @Date:   2022-07-11 09:21:24
# @Last Modified by:   privacy
# @Last Modified time: 2022-07-15 17:22:00

# 自定义模板

import re
import json
import logging
from pprint import pprint
import requests
from requests.adapters import HTTPAdapter
from docx import Document
from docx.shared import Inches


path = "d:\\desktop\\自定义.docx"

# 关键词字典
keywords = [
	"姓名",
	"性别",
	"出生年月",
	"出生日期",
	"民族",
	"籍贯",
	"户籍地",
	"健康状况",
	"政治面貌（加入时间）",
	"政治面貌(加入时间)",
	"参加工作时间",
	"健康状况",
	"外语水平",
	"专业技术资格（取得时间）",
	"专业技术资格(取得时间)",
	"职业技能等级（取得时间）",
	"职业技能等级(取得时间)",
	"熟悉专业有何专长",
	"学历院校",
	"初始学历、专业",
	"初始学历毕业院校及毕业时间",
	"最高学历、专业",
	"最高学历毕业院校及毕业时间",
	"工作单位",
	"现任职务",
	"任职时间",
	"提职时间",
	"联系电话",
	"邮箱地址",
	"对报名岗位认识及工作设想",
	"意向地区",
	"意向岗位",
	"其他意向岗位",
	"意向单位",
	"意向专业",
	"学习经历",
	"起止时间",
	"学校","专业","学历","学位","研究方向","是否全日制",
	"培训经历",
	"培训类型","机构","内容","成绩","证书名称",
	"工作经历",
	"工作单位","职务","部门","证明人","备注",
	"项目经历",
	"项目名称","项目职务","项目描述","项目职责","项目成果",
	"获得职业资格证书情况",
	"获得日期","名称","证书编码/文号","授予单位",
	"奖惩情况",
	"项目","时间","项目单位","证明材料",
	"主要工作业绩（500字以内）",
	"主要工作业绩(500字以内)",
	"自我评价",
	"近三年年度考核结果",
	"主要家庭成员及社会关系",
	"称谓",
	"其他情况说明",
	"工作单位及职务",
	"政治面貌",
	"职业证书",
    "资格等级",
    "取得日期",
    "学校/培训机构",
    "专业",
    "起始时间",
    "毕业时间",
    "职业",
    "与本人关系",
    "计算机水平"
]

# 解析行内元素
def parse_line(line):
    result = []
    key = None
    for cell in line:
        if cell and ''.join(cell.split()) in keywords:
            key = ''.join(cell.split())
        elif cell and key:
            schema = {key:cell}
            result.append(schema)
            key = None
    return result


# 解析文档布局
def parse_layout(path):
    result = []
    doc = Document(path)
    lo = {}
    for _table in doc.tables[:]:
        for i, row in enumerate(_table.rows[:]):
            row_content = []
            for cell in row.cells[:]:
                c = cell.text
                if c not in row_content:
                	row_content.append(c)
            lo[len(lo.keys())] = row_content

    kwln = -1# 关键词行长度
    kwline = None# 关键词行
    for key in lo.keys():
        for val in lo[key]:# 通过全关键词，判断此行是否为关键词行
            if val and ''.join(val.split()) not in keywords:# 有非关键字元素，非关键词行，判断是否为关键词行元素
                perc = 0# 行内关键词数量
                for c in lo[key]:
                    if c and (''.join(c.split()) in keywords):# 找到此行有关键词
                        perc += 1
                    if c and (''.join(c.split()) in keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3，判断此行非关键词行元素
                        perc = 0# 清空行内关键词数
                        result.extend(parse_line(lo[key]))# 添加并解析普通行级元素
                        break
                else:# 关键词行元素
                    schema = dict()
                    for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素
                        if key:
                            schema[key] = val
                    result.append(schema)
                    break
                break
        else:
            # print("{}：此行为关键词行！".format(lo[key]))
            try:
                kwline = [''.join(cell.split()) for cell in lo[key]]
            except Exception as e:
                kwline = lo[key]
            kwln = len(lo[key])
    return result


# 格式化数据
def formatter(datalist):
    result = dict()
    for d in datalist:
        if len(d) == 1:# 普通键值对
            for key in d.keys():
                result[key] = d[key]
        else:# 行级元素
            for k in list(d.keys()):
                if k == "".join(d[k].split()):# 行名
                    d.pop(k)
                    if result.get(k):# 多行元素合并
                        result[k].append(d)
                    else:
                        result[k] = [d]

    ### 时间格式化
    if result.get("出生年月"):
        dates = re.findall(r'\d+' , result["出生年月"])
        if len(dates) == 1:
            result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))
        elif len(dates) == 2:
            result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
        elif len(dates) == 3:
            result["出生年月"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

    if result.get("任职时间"):
        dates = re.findall(r'\d+' , result["任职时间"])
        if len(dates) == 1:
            result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))
        elif len(dates) == 2:
            result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
        elif len(dates) == 3:
            result["任职时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

    if result.get("参加工作时间"):
        dates = re.findall(r'\d+' , result["参加工作时间"])
        if len(dates) == 1:
            result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))
        elif len(dates) == 2:
            result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
        elif len(dates) == 3:
            result["参加工作时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

    if result.get("最高学历毕业院校及毕业时间"):
        dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])
        ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])
        if len(ws) > 0:
            result["最高学历毕业院校"] = ws[0]
        if len(dates) == 1:
            result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
        elif len(dates) == 2:
            result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
        elif len(dates) == 3:
            result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
        result.pop("最高学历毕业院校及毕业时间")

    if result.get("初始学历毕业院校及毕业时间"):
        dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])
        ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])
        if len(ws) > 0:
            result["初始学历毕业院校"] = ws[0]
        if len(dates) == 1:
            result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
        elif len(dates) == 2:
            result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
        elif len(dates) == 3:
            result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
        result.pop("初始学历毕业院校及毕业时间")

    if result.get("学习经历"):
        for idx, edu in enumerate(result["学习经历"]):
            if edu.get("起止时间"):
                dates = re.findall(r'\d+' , edu["起止时间"])
                if len(dates) == 4:
                    result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

    if result.get("培训经历"):
        for idx, edu in enumerate(result["培训经历"]):
            if edu.get("起止时间"):
                dates = re.findall(r'\d+' , edu["起止时间"])
                if len(dates) == 4:
                    result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

    if result.get("工作经历"):
        for idx, edu in enumerate(result["工作经历"]):
            if edu.get("起止时间"):
                dates = re.findall(r'\d+' , edu["起止时间"])
                if len(dates) == 4:
                    result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

    if result.get("项目经历"):
        for idx, edu in enumerate(result["项目经历"]):
            if edu.get("起止时间"):
                dates = re.findall(r'\d+' , edu["起止时间"])
                if len(dates) == 4:
                    result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

    if result.get("获得职业资格证书情况"):
        for idx, edu in enumerate(result["获得职业资格证书情况"]):
            if edu.get("获得日期"):
                dates = re.findall(r'\d+' , edu["获得日期"])
                if len(dates) == 2:
                    result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

    if result.get("奖惩情况"):
        for idx, edu in enumerate(result["奖惩情况"]):
            if edu.get("时间"):
                dates = re.findall(r'\d+' , edu["时间"])
                if len(dates) == 2:
                    result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

    if result.get("主要家庭成员及社会关系"):
        for idx, fam in enumerate(result["主要家庭成员及社会关系"]):
            if fam.get("出生年月"):
                dates = re.findall(r'\d+' , fam["出生年月"])
                if len(dates) == 2:
                    result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

    # 转译数据库字段名
    with open("./resources/translate.json", "r", encoding="utf-8") as ff:
        json_obj = json.load(ff)

    normal = json_obj["base"]
    edunormal = json_obj["tal_his_edu"]
    family = json_obj["tal_family_social_relations"]

    for key in normal.keys():
        if result.get(key):
            result[normal[key]] = result[key]
            result.pop(key)

    for idx in range(len(result['学习经历'])):
        result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]
        result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]
        for key in edunormal.keys():
            if result['学习经历'][idx].get(key):
                result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
                result['学习经历'][idx].pop(key)

    for idx in range(len(result['主要家庭成员及社会关系'])):
        for key in family.keys():
            if result['主要家庭成员及社会关系'][idx].get(key):
                result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
                result['主要家庭成员及社会关系'][idx].pop(key)

    tit = {
        "基本信息":"base",
        "求职意向":"intent_job",
        "学习经历":"tal_his_edu",
        "工作经历":"tal_his_job",
        "项目经历":"tal_his_project",
        "培训经历":"tal_training_institutions",
        "获奖情况":"tal_rewards_punishments",
        "语言能力":"tal_language",
        "证书":"tal_vocational_qualification_certificate",
        "专业技能":"tal_professional_tech_certificate",
        "主要家庭成员及社会关系":"tal_family_social_relations"
    }

    for key in tit.keys():
        if result.get(key):
            result[tit[key]] = result[key]
            result.pop(key)

    # url = "http://192.168.1.110:9999/talent/getResumeData"
    # session = requests.Session()
    # session.mount('http://', HTTPAdapter(max_retries = 3))
    # try:
    #     headers = {
    #         'contentType':'Application/json'
    #     }
    #     response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)
    #     print(response.text)
    # except Exception as e:
    #     print(e)
    return result


if __name__ == '__main__':
    pprint(formatter(parse_layout(path)))