# -*- coding: utf-8 -*-
# xzc / resume-parse
# @Author: privacy
# @Date:   2022-07-07 12:59:42
# @Last Modified by:   privacy
# @Last Modified time: 2022-07-18 14:57:59
# import pdb

import re
import json
import requests
from requests.adapters import HTTPAdapter

import pdfplumber
from docx import Document


# Resume file handed to ``Social.predict`` when this module is run as a script.
path = r"d:\desktop\社招简历模板.docx"

class Social(object):
    """docstring for Social"""
    def __init__(self):
        super(Social, self).__init__()
        self.keywords = [
            '姓名',
            '性别',
            '出生日期',
            '一寸照片',
            '民族',
            '出生地',
            '政治面貌（加入时间）',
            '参加工作时间',
            '健康状况',
            '外语水平',
            '初始学历、专业',
            '最高学历、专业',
            '初始学历毕业院校及毕业时间',
            '最高学历毕业院校及毕业时间',
            '专业技术资格（取得时间）',
            '职业技能等级（取得时间）',
            '熟悉专业有何专长',
            '工作单位',
            '现任职务',
            '任职时间',
            '提职时间',
            '意向岗位',
            '联系电话',
            '学习经历',
            '起止时间',
            '学校',
            '专业',
            '学历',
            '学位',
            '研究方向',
            '是否全日制',
            '培训经历',
            '培训类型',
            '机构',
            '内容',
            '成绩',
            '证书名称',
            '工作经历',
            '职务',
            '部门',
            '证明人',
            '备注',
            '对报名岗位认识及工作设想',
            '自我评价及主要工作业绩',
            '获得职业资格证书情况',
            '获得日期',
            '名称',
            '证书编码/文号',
            '授予单位',
            '奖惩情况',
            '项目',
            '时间',
            '项目单位',
            '证明材料',
            '主要家庭成员及社会关系',
            '称谓',
            '出生年月',
            '政治面貌',
            '工作单位及职务',
            '其他情况说明',
            '诚信承诺',
            '社会招聘工作办公室资格审查意见'
        ]
        self.json_obj = self.get_translate()

    def get_translate(self):
        # 转译数据库字段名
        with open("./resources/translate.json", "r", encoding="utf-8") as ff:
            json_obj = json.load(ff)
        return json_obj

    def parse_line(self, line):
        result = []
        key = None
        for cell in line:
            if cell and ''.join(cell.split()) in self.keywords:
                key = ''.join(cell.split())
            elif cell and key:
                schema = {key:cell}
                result.append(schema)
                key = None
        return result
    
    # 解析word
    def parse_word_layout(self, path):
        """Extract keyword/value records from every table of a ``.docx`` file.

        Rows are read table by table and classified in document order:

        * header row: every non-empty cell is a known keyword.  When it has
          more than two cells it becomes the current column header.
        * inline row: holds at least one non-keyword cell and more than one
          third of its cells are keywords.  It is split into
          ``{keyword: value}`` pairs by ``parse_line``.
        * data row: anything else.  It is zipped with the current header
          into one dict, provided a header exists and has the same number
          of cells; otherwise the row is skipped.

        Args:
            path: Path of the ``.docx`` document.

        Returns:
            list: Record dicts in document order.
        """
        doc = Document(path)

        rows = []
        for table in doc.tables:
            for row in table.rows:
                cells = []
                for cell in row.cells:
                    text = cell.text
                    # Keep only the first occurrence of a text within a row.
                    # python-docx reports a merged cell once per spanned
                    # column; genuine duplicates are collapsed as well.
                    if text not in cells:
                        cells.append(text)
                rows.append(cells)

        records = []
        header = None  # normalised cells of the latest header row
        for cells in rows:
            labels = [''.join(text.split()) if text else text for text in cells]
            is_keyword = [
                bool(text) and label in self.keywords
                for text, label in zip(cells, labels)
            ]
            has_foreign_cell = any(
                text and not known for text, known in zip(cells, is_keyword)
            )

            if not has_foreign_cell:
                # Header row; rows of one or two cells never become a header.
                if len(cells) > 2:
                    header = labels
                continue

            if sum(is_keyword) > len(cells) / 3:
                # Keywords and values interleaved on one line.
                records.extend(self.parse_line(cells))
            elif header is not None and len(header) == len(cells):
                # Plain data row under the current header.  A data row seen
                # before any header used to crash on ``len(None)``.
                records.append({k: v for k, v in zip(header, cells) if k})
        return records
    
    # 解析pdf
    def parse_pdf_layout(self, path):
        """Extract keyword/value records from every table of a PDF file.

        Rows are collected from ``extract_tables()`` of each page and
        classified in document order:

        * header row: every non-empty cell is a known keyword (a row with
          no non-empty cell also counts).  It becomes the current column
          header.
        * inline row: holds a non-keyword cell and at least one keyword.  It
          is split into ``{keyword: value}`` pairs by ``parse_line``.
        * data row: no keyword at all.  It is zipped with the current header
          into one dict; an empty value is replaced by its header text.
          Rows seen before any header are skipped.

        Args:
            path: Path of the PDF document.

        Returns:
            list: Record dicts in document order.
        """
        rows = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                for table in page.extract_tables():
                    rows.extend(table)

        records = []
        header = None  # normalised cells of the latest header row
        for cells in rows:
            # Cells may be None; those are kept as-is.
            labels = [''.join(cell.split()) if cell else cell for cell in cells]
            is_keyword = [
                bool(cell) and label in self.keywords
                for cell, label in zip(cells, labels)
            ]
            has_foreign_cell = any(
                cell and not known for cell, known in zip(cells, is_keyword)
            )

            if not has_foreign_cell:
                # NOTE(review): an all-empty row also lands here and replaces
                # the header with empty labels - confirm this is intended.
                header = labels
                continue

            if any(is_keyword):
                records.extend(self.parse_line(cells))
            elif header is not None:
                # Data row under the current header; ``zip`` truncates when
                # the lengths differ.  A data row seen before any header
                # used to crash on ``zip(None, ...)``.
                records.append(
                    {k: (v if v else k) for k, v in zip(header, cells) if k}
                )
        return records
    
    # 格式化数据
    def formatter(self, datalist):
        result = dict()
        for d in datalist:
            if len(d) == 1:
                for key in d.keys():
                    result[key] = d[key]
            else:
                for k in list(d.keys()):
                    if k == "".join(d[k].split()):
                        d.pop(k)
                        if result.get(k):
                            result[k].append(d)
                        else:
                            result[k] = [d]

        if result.get("外语水平"):
            data = re.findall(r'(\w+[语话])', result["外语水平"])
            if data:
                result["外语水平"] = data

        if result.get("专业技术资格(取得时间)"):
            dates = re.findall(r'\d+', result["专业技术资格(取得时间)"])
            for i in dates:
                result["专业技术资格(取得时间)"] = result["专业技术资格(取得时间)"].replace(i, "")
            names = re.findall(r'\w+', result["专业技术资格(取得时间)"])
            if len(dates) == 1:
                result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-01-01".format(int(dates[0])),"专业技术资格":names}]
            elif len(dates) == 2:
                result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1])),"专业技术资格":names}]
            elif len(dates) == 3:
                result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2])),"专业技术资格":names}]

        if result.get("职业技能等级（取得时间）"):
            dates = re.findall(r'\d+', result["职业技能等级（取得时间）"])
            for i in dates:
                result["职业技能等级（取得时间）"] = result["职业技能等级（取得时间）"].replace(i, "")
            names = re.findall(r'\w+', result["职业技能等级（取得时间）"])
            if len(dates) == 1:
                result["职业技能等级（取得时间）"] = [{"时间": "{:4d}-01-01".format(int(dates[0])),"职业技能等级":names}]
            elif len(dates) == 2:
                result["职业技能等级（取得时间）"] = [{"时间": "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1])),"职业技能等级":names}]
            elif len(dates) == 3:
                result["职业技能等级（取得时间）"] = [{"时间": "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2])),"职业技能等级":names}]

        ### 时间格式化
        if result.get("出生年月"):
            dates = re.findall(r'\d+' , result["出生年月"])
            if len(dates) == 1:
                result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))
            elif len(dates) == 2:
                result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
            elif len(dates) == 3:
                result["出生年月"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

        if result.get("任职时间"):
            dates = re.findall(r'\d+' , result["任职时间"])
            if len(dates) == 1:
                result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))
            elif len(dates) == 2:
                result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
            elif len(dates) == 3:
                result["任职时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

        if result.get("参加工作时间"):
            dates = re.findall(r'\d+' , result["参加工作时间"])
            if len(dates) == 1:
                result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))
            elif len(dates) == 2:
                result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
            elif len(dates) == 3:
                result["参加工作时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

        if result.get("最高学历毕业院校及毕业时间"):
            dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])
            ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])
            if len(ws) > 0:
                result["最高学历毕业院校"] = ws[0]
            if len(dates) == 1:
                result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
            elif len(dates) == 2:
                result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
            elif len(dates) == 3:
                result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
            result.pop("最高学历毕业院校及毕业时间")

        if result.get("初始学历毕业院校及毕业时间"):
            dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])
            ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])
            if len(ws) > 0:
                result["初始学历毕业院校"] = ws[0]
            if len(dates) == 1:
                result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
            elif len(dates) == 2:
                result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
            elif len(dates) == 3:
                result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
            result.pop("初始学历毕业院校及毕业时间")

        if result.get("学习经历"):
            for idx, edu in enumerate(result["学习经历"]):
                if edu.get("起止时间"):
                    dates = re.findall(r'\d+' , edu["起止时间"])
                    if len(dates) == 4:
                        result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

        if result.get("培训经历"):
            for idx, edu in enumerate(result["培训经历"]):
                if edu.get("起止时间"):
                    dates = re.findall(r'\d+' , edu["起止时间"])
                    if len(dates) == 4:
                        result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

        if result.get("工作经历"):
            for idx, edu in enumerate(result["工作经历"]):
                if edu.get("起止时间"):
                    dates = re.findall(r'\d+' , edu["起止时间"])
                    if len(dates) == 4:
                        result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

        if result.get("项目经历"):
            for idx, edu in enumerate(result["项目经历"]):
                if edu.get("起止时间"):
                    dates = re.findall(r'\d+' , edu["起止时间"])
                    if len(dates) == 4:
                        result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

        if result.get("获得职业资格证书情况"):
            for idx, edu in enumerate(result["获得职业资格证书情况"]):
                if edu.get("获得日期"):
                    dates = re.findall(r'\d+' , edu["获得日期"])
                    if len(dates) == 2:
                        result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

        if result.get("奖惩情况"):
            for idx, edu in enumerate(result["奖惩情况"]):
                if edu.get("时间"):
                    dates = re.findall(r'\d+' , edu["时间"])
                    if len(dates) == 2:
                        result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

        if result.get("主要家庭成员及社会关系"):
            for idx, fam in enumerate(result["主要家庭成员及社会关系"]):
                if fam.get("出生年月"):
                    dates = re.findall(r'\d+' , fam["出生年月"])
                    if len(dates) == 2:
                        result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

        normal = self.json_obj["base"]
        itenormal = self.json_obj["base"]
        edunormal = self.json_obj["tal_his_edu"]
        jobnormal = self.json_obj["tal_his_job"]
        tranornal = self.json_obj["tal_training_experience"]
        cetnormal = self.json_obj["tal_vocational_qualification_certificate"]
        rewnormal = self.json_obj["tal_reward_punishment"]
        family = self.json_obj["tal_family_social_relation"]

        for key in normal.keys():
            if result.get(key):
                result[normal[key]] = result[key]
                result.pop(key)

        for idx in range(len(result['学习经历'])):
            for key in edunormal.keys():
                if result['学习经历'][idx].get(key):
                    result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
                    result['学习经历'][idx].pop(key)

        for idx in range(len(result['工作经历'])):
            for key in jobnormal.keys():
                if result['工作经历'][idx].get(key):
                    result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
                    result['工作经历'][idx].pop(key)

        for idx in range(len(result['培训经历'])):
            for key in tranornal.keys():
                if result['培训经历'][idx].get(key):
                    result['培训经历'][idx][tranornal[key]] = result['培训经历'][idx][key]
                    result['培训经历'][idx].pop(key)

        for idx in range(len(result['获得职业资格证书情况'])):
            for key in cetnormal.keys():
                if result['获得职业资格证书情况'][idx].get(key):
                    result['获得职业资格证书情况'][idx][cetnormal[key]] = result['获得职业资格证书情况'][idx][key]
                    result['获得职业资格证书情况'][idx].pop(key)

        for idx in range(len(result['奖惩情况'])):
            for key in rewnormal.keys():
                if result['奖惩情况'][idx].get(key):
                    result['奖惩情况'][idx][rewnormal[key]] = result['奖惩情况'][idx][key]
                    result['奖惩情况'][idx].pop(key)

        for idx in range(len(result['主要家庭成员及社会关系'])):
            for key in family.keys():
                if result['主要家庭成员及社会关系'][idx].get(key):
                    result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
                    result['主要家庭成员及社会关系'][idx].pop(key)

        tit = {
            "基本信息":"base",
            "职业发展管理":"intent_job",
            "学习经历":"tal_his_edu",
            "工作经历":"tal_his_job",
            "项目经历":"tal_his_project",
            "培训经历":"tal_training_experience",
            "奖惩情况":"tal_reward_punishment",
            "语言能力":"tal_language",
            "获得职业资格证书情况":"tal_vocational_qualification_certificate",
            "专业技能":"tal_professional_tech_certificate",
            "主要家庭成员及社会关系":"tal_family_social_relation",
            "其他情况说明":"intro"
        }

        for key in tit.keys():
            if result.get(key):
                result[tit[key]] = result[key]
                result.pop(key)
        return result
    
    # 推送后端
    def push_back(self, result, url="http://192.168.1.110:9999/talent/getResumeData"):
        """POST the formatted resume to the backend (best effort).

        The payload is sent as JSON under the ``ResumeData`` key with a
        10-second timeout and up to three connection retries for ``http://``
        URLs.  The response text is printed; any failure is printed instead
        of being raised, so a backend outage never aborts parsing.

        Args:
            result: JSON-serialisable resume dict.
            url: Endpoint to post to; defaults to the original backend address.
        """
        headers = {'Content-Type': 'application/json'}
        try:
            # The context manager closes the session's pooled connections.
            with requests.Session() as session:
                session.mount('http://', HTTPAdapter(max_retries=3))
                response = session.post(
                    url=url,
                    headers=headers,
                    json={"ResumeData": result},
                    timeout=10,
                )
                print(response.text)
        except Exception as e:
            # Deliberately broad: pushing is best effort, report and carry on.
            print(e)

    def predict(self, path):
        if path.endswith(".docx"):
            result = self.formatter(self.parse_word_layout(path))
            self.push_back(result)
            print(self.formatter(self.parse_word_layout(path)))
        elif path.endswith(".pdf"):
            result = self.formatter(self.parse_pdf_layout(path))
            self.push_back(result)
            print(self.formatter(self.parse_pdf_layout(path)))


if __name__ == '__main__':
    # Script entry point: run the full pipeline on the file named by ``path``.
    resume_parser = Social()
    resume_parser.predict(path)