# srafa.py — parse résumé tables from a .docx/.pdf template into structured dicts
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2022-07-07 12:59:42
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2022-07-16 11:41:09
  6. # import pdb
  7. from pprint import pprint
  8. import json
  9. import pandas as pd
  10. import pdfplumber
  11. import docx
  12. from docx import Document
  13. from docx.shared import Inches
  14. path = "d:\\desktop\\社招简历模板.docx"
  15. # path = "d:\\desktop\\社招简历模板.pdf"
  16. keywords = ['姓名',
  17. '性别',
  18. '出生日期',
  19. '一寸照片',
  20. '民族',
  21. '出生地',
  22. '政治面貌(加入时间)',
  23. '参加工作时间',
  24. '健康状况',
  25. '外语水平',
  26. '初始学历、专业',
  27. '最高学历、专业',
  28. '初始学历毕业院校及毕业时间',
  29. '最高学历毕业院校及毕业时间',
  30. '专业技术资格(取得时间)',
  31. '职业技能等级(取得时间)',
  32. '熟悉专业有何专长',
  33. '工作单位',
  34. '现任职务',
  35. '任职时间',
  36. '提职时间',
  37. '意向岗位',
  38. '联系电话',
  39. '学习经历',
  40. '起止时间',
  41. '学校',
  42. '专业',
  43. '学历',
  44. '学位',
  45. '研究方向',
  46. '是否全日制',
  47. '培训',
  48. '起止时间',
  49. '培训类型',
  50. '机构',
  51. '内容',
  52. '成绩',
  53. '证书名称',
  54. '经历',
  55. '工作经历',
  56. '起止时间',
  57. '工作单位',
  58. '职务',
  59. '部门',
  60. '证明人',
  61. '备注',
  62. '对报名岗位认识及工作设想',
  63. '自我评价及主要工作业绩',
  64. '获得职业资格证书情况',
  65. '获得日期',
  66. '名称',
  67. '证书编码/文号',
  68. '授予单位',
  69. '备注',
  70. '奖惩',
  71. '项目',
  72. '时间',
  73. '项目单位',
  74. '证明材料',
  75. '情况',
  76. '主要家庭成员及社会关系',
  77. '称谓',
  78. '出生年月',
  79. '政治面貌',
  80. '工作单位及职务',
  81. '其他情况说明',
  82. '诚信承诺',
  83. '本人承诺,以上信息均与事实相符,若有虚假,愿承担一切后果并自愿取消应聘资格。'
  84. '承诺人:'
  85. '社会招聘工作办公室资格审查意见']
  86. def parse_line(line):
  87. result = []
  88. key = None
  89. for cell in line:
  90. if cell and ''.join(cell.split()) in keywords:
  91. key = ''.join(cell.split())
  92. elif cell and key:
  93. schema = {key:cell}
  94. result.append(schema)
  95. key = None
  96. return result
  97. def parse_word_layout(path):
  98. result = []
  99. doc = Document(path)
  100. lo = {}
  101. for _table in doc.tables[:]:
  102. for i, row in enumerate(_table.rows[:]):
  103. row_content = []
  104. for cell in row.cells[:]:
  105. c = cell.text
  106. if c not in row_content:
  107. row_content.append(c)
  108. lo[len(lo.keys())] = row_content
  109. kwln = -1# 关键词行长度
  110. kwline = None# 关键词行
  111. for key in lo.keys():
  112. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  113. if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  114. perc = 0# 行内关键词数量
  115. for c in lo[key]:
  116. if c and (''.join(c.split()) in keywords):# 找到此行有关键词
  117. perc += 1
  118. if c and (''.join(c.split()) in keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3,判断此行非关键词行元素
  119. perc = 0# 清空行内关键词数
  120. result.extend(parse_line(lo[key]))# 添加并解析普通行级元素
  121. break
  122. else:# 关键词行元素
  123. if len(kwline) != len(lo[key]):
  124. break
  125. schema = dict()
  126. for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素
  127. if key:
  128. schema[key] = val
  129. result.append(schema)
  130. break
  131. break
  132. else:
  133. # print("{}:此行为关键词行!".format(lo[key]))
  134. if len(lo[key])>2:
  135. try:
  136. kwline = [''.join(cell.split()) for cell in lo[key]]
  137. except Exception as e:
  138. kwline = lo[key]
  139. kwln = len(lo[key])
  140. return result
  141. def parse_pdf_layout(path):
  142. result = []
  143. lo = {}
  144. with pdfplumber.open(path) as pdf:
  145. for page in pdf.pages:
  146. for table in page.extract_tables():
  147. for line in table:
  148. # lo[len(lo.keys())] = [cell for cell in line if cell]
  149. lo[len(lo.keys())] = line
  150. kwln = -1
  151. kwline = None
  152. for key in lo.keys():
  153. # pdb.set_trace()
  154. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  155. if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  156. # pdb.set_trace()
  157. for c in lo[key] or len(lo[key])!=kwln:
  158. # pdb.set_trace()
  159. if c and ''.join(c.split()) in keywords:# 非关键词行元素
  160. result.extend(parse_line(lo[key]))
  161. break
  162. else:# 关键词行元素
  163. schema = dict()
  164. for key, val in zip(kwline, lo[key]):
  165. if key:
  166. schema[key] = val if val else key
  167. result.append(schema)
  168. break
  169. break
  170. else:
  171. # print("此行为关键词行")
  172. # kwline = lo[key]
  173. kwline = []
  174. for cell in lo[key]:
  175. if cell:
  176. kwline.append(''.join(cell.split()))
  177. else:
  178. kwline.append(cell)
  179. kwln = len(lo[key])
  180. return result
  181. # 格式化数据
  182. def formatter(datalist):
  183. result = dict()
  184. for d in datalist:
  185. if len(d) == 1:
  186. for key in d.keys():
  187. result[key] = d[key]
  188. else:
  189. for k in list(d.keys()):
  190. if k == "".join(d[k].split()):
  191. d.pop(k)
  192. if result.get(k):
  193. result[k].append(d)
  194. else:
  195. result[k] = [d]
  196. # 转译数据库字段名
  197. with open("./resources/translate.json", "r", encoding="utf-8") as ff:
  198. json_obj = json.load(ff)
  199. normal = json_obj["base"]
  200. itenormal = json_obj["base"]
  201. edunormal = json_obj["tal_his_edu"]
  202. jobnormal = json_obj["tal_his_job"]
  203. cetnormal = json_obj["tal_vocational_qualification_certificate"]
  204. family = json_obj["tal_family_social_relations"]
  205. for key in normal.keys():
  206. if result.get(key):
  207. result[normal[key]] = result[key]
  208. result.pop(key)
  209. for idx in range(len(result['学习经历'])):
  210. for key in edunormal.keys():
  211. if result['学习经历'][idx].get(key):
  212. result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
  213. result['学习经历'][idx].pop(key)
  214. for idx in range(len(result['工作经历'])):
  215. for key in jobnormal.keys():
  216. if result['工作经历'][idx].get(key):
  217. result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
  218. result['工作经历'][idx].pop(key)
  219. for idx in range(len(result['获得职业资格证书情况'])):
  220. for key in cetnormal.keys():
  221. if result['获得职业资格证书情况'][idx].get(key):
  222. result['获得职业资格证书情况'][idx][cetnormal[key]] = result['获得职业资格证书情况'][idx][key]
  223. result['获得职业资格证书情况'][idx].pop(key)
  224. for idx in range(len(result['主要家庭成员及社会关系'])):
  225. for key in family.keys():
  226. if result['主要家庭成员及社会关系'][idx].get(key):
  227. result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
  228. result['主要家庭成员及社会关系'][idx].pop(key)
  229. tit = {
  230. "基本信息":"base",
  231. "职业发展管理":"intent_job",
  232. "学习经历":"tal_his_edu",
  233. "工作经历":"tal_his_job",
  234. "项目经历":"tal_his_project",
  235. "培训经历":"tal_training_institutions",
  236. "获奖情况":"tal_rewards_punishments",
  237. "语言能力":"tal_language",
  238. "获得职业资格证书情况":"tal_vocational_qualification_certificate",
  239. "专业技能":"tal_professional_tech_certificate",
  240. "主要家庭成员及社会关系":"tal_family_social_relations",
  241. "其他情况说明":"intro"
  242. }
  243. for key in tit.keys():
  244. if result.get(key):
  245. result[tit[key]] = result[key]
  246. result.pop(key)
  247. # url = "http://192.168.1.110:9999/talent/getResumeData"
  248. # session = requests.Session()
  249. # session.mount('http://', HTTPAdapter(max_retries = 3))
  250. # try:
  251. # headers = {
  252. # 'contentType':'Application/json'
  253. # }
  254. # response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)
  255. # print(response.text)
  256. # except Exception as e:
  257. # print(e)
  258. return result
  259. if __name__ == '__main__':
  260. if path.endswith(".pdf"):
  261. pprint(formatter(parse_pdf_layout(path)))
  262. else:
  263. pprint(formatter(parse_word_layout(path)))