# xzc / resume-parse
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2022-07-07 12:59:42
# @Last Modified by:   privacy
# @Last Modified time: 2022-07-13 15:22:48
# import pdb
from pprint import pprint
import pandas as pd
import pdfplumber

# Location of the PDF that the script entry point parses (a Windows path
# hard-coded for the author's machine).
path = "d:\\desktop\\社招简历模板.pdf"

# Field labels recognised in the résumé tables.  Cells are compared against
# this list after all whitespace has been removed from them, so every entry
# is written without spaces.  Some labels appear more than once because they
# occur in several sections of the form; the duplicates are harmless for
# membership tests.
#
# NOTE: every entry must be followed by a comma.  Adjacent string literals
# without a comma are silently concatenated by Python into one string.
keywords = [
    '姓名',
    '性别',
    '出生日期',
    '一寸照片',
    '民族',
    '出生地',
    '政治面貌（加入时间）',
    '参加工作时间',
    '健康状况',
    '外语水平',
    '初始学历、专业',
    '最高学历、专业',
    '初始学历毕业院校及毕业时间',
    '最高学历毕业院校及毕业时间',
    '专业技术资格（取得时间）',
    '职业技能等级（取得时间）',
    '熟悉专业有何专长',
    '工作单位',
    '现任职务',
    '任职时间',
    '提职时间',
    '意向岗位',
    '联系电话',
    '学习经历',
    '起止时间',
    '学校',
    '专业',
    '学历',
    '学位',
    '研究方向',
    '是否全日制',
    '培训',
    '起止时间',
    '培训类型',
    '机构',
    '内容',
    '成绩',
    '证书名称',
    '经历',
    '工作经历',
    '起止时间',
    '工作单位',
    '职务',
    '部门',
    '证明人',
    '备注',
    '对报名岗位认识及工作设想',
    '自我评价及主要工作业绩',
    '获得职业资格证书情况',
    '获得日期',
    '名称',
    '证书编码/文号',
    '授予单位',
    '备注',
    '奖惩',
    '项目',
    '时间',
    '项目单位',
    '证明材料',
    '情况',
    '主要家庭成员及社会关系',
    '称谓',
    '出生年月',
    '政治面貌',
    '工作单位及职务',
    '其他情况说明',
    '诚信承诺',
    '本人承诺，以上信息均与事实相符，若有虚假，愿承担一切后果并自愿取消应聘资格。',
    '承诺人：',
    '社会招聘工作办公室资格审查意见',
]

def parse_line(line):
    """Pair each keyword cell of a table row with the value cell after it.

    Cells are scanned left to right.  A non-empty cell whose text, with all
    whitespace removed, is listed in ``keywords`` becomes the pending label.
    The next non-empty cell that is not a keyword is taken as that label's
    value and emitted as a one-entry dict; the pending label is then cleared.
    Empty cells are ignored, and a keyword directly following another
    keyword replaces the pending label.

    Args:
        line: A sequence of cells (strings, or empty/``None`` for blanks).

    Returns:
        A list of ``{label: value}`` dicts in the order they were found.
    """
    pairs = []
    pending = None
    for cell in line:
        if not cell:
            continue
        compact = ''.join(cell.split())
        if compact in keywords:
            pending = compact
        elif pending:
            pairs.append({pending: cell})
            pending = None
    return pairs


def _is_keyword(cell):
    """Return True if *cell* is non-empty and, ignoring whitespace, a keyword."""
    return bool(cell) and ''.join(cell.split()) in keywords


def parse_layout(path):
    """Extract the table rows of a PDF and turn them into record dicts.

    Every row of every table on every page is collected, then classified:

    * Header row - every non-empty cell is a keyword (a row with no
      non-empty cell also lands here).  It is remembered, with whitespace
      stripped from its cells, as the column names for following data rows.
    * Mixed row - contains at least one keyword and at least one non-empty
      non-keyword cell.  It is handed to ``parse_line`` and contributes one
      single-entry dict per label/value pair.
    * Data row - contains no keyword at all.  Its cells are zipped with the
      most recent header row; columns with an empty header name are skipped
      and an empty cell is filled with its column name, so later stages can
      recognise it as a placeholder.

    A data row that appears before any header row has no column names to map
    onto and is skipped.

    Args:
        path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list of dicts in the order the rows were encountered.
    """
    rows = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                rows.extend(table)

    result = []
    header = None
    for row in rows:
        filled = [cell for cell in row if cell]

        if all(_is_keyword(cell) for cell in filled):
            # Header row: remember the normalised column names.
            header = [''.join(cell.split()) if cell else cell for cell in row]
            continue

        if any(_is_keyword(cell) for cell in filled):
            # Labels and values share the row: pair them up.
            result.extend(parse_line(row))
            continue

        if header is None:
            # No header seen yet, so there is nothing to map this row onto.
            continue

        # zip() stops at the shorter sequence, so a row whose length differs
        # from the header is mapped only over the columns they share.
        record = {}
        for name, cell in zip(header, row):
            if name:
                record[name] = cell if cell else name
        result.append(record)
    return result

# Format the parsed data
def formatter(datalist):
    """Merge a list of parsed records into one dictionary.

    * A record with exactly one entry is copied into the result as-is; a
      later record with the same key overwrites the earlier value.
    * In a record with several entries, a key whose value (with whitespace
      removed) equals the key itself is treated as a group marker.  The
      remaining entries of the record - without any marker keys - are
      appended to the list stored under each marker.
    * Multi-entry records without a marker are ignored.

    The input records are not modified.  If a marker key already holds a
    value that is not a list (for example one written by a single-entry
    record), that value is replaced by a new list, matching the
    last-writer-wins behaviour of single-entry records.

    Args:
        datalist: Iterable of dicts.

    Returns:
        A dict mapping keys to plain values or to lists of record dicts.
    """
    result = {}

    for record in datalist:
        if len(record) == 1:
            result.update(record)
            continue

        # Only string values can be markers; anything else is ordinary data.
        markers = [
            key for key, value in record.items()
            if isinstance(value, str) and key == "".join(value.split())
        ]
        if not markers:
            continue

        # Build the payload from a copy so the caller's dict stays intact.
        payload = {k: v for k, v in record.items() if k not in markers}
        for marker in markers:
            bucket = result.get(marker)
            if isinstance(bucket, list):
                bucket.append(payload)
            else:
                result[marker] = [payload]

    return result

if __name__ == '__main__':
    # Script entry point: parse the PDF at `path`, then pretty-print the
    # merged dictionary produced by `formatter`.
    parsed_records = parse_layout(path)
    pprint(formatter(parsed_records))