# -*- coding: utf-8 -*-
# Source: xzc/resume-parse
# @Author: privacy
# @Date:   2022-07-07 13:12:17
# @Last Modified by:   privacy
# @Last Modified time: 2022-07-16 15:05:03

# 内部人才市场简历模板
from pprint import pprint
import re
import json
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
import pdfplumber
import docx
from docx import Document
from docx.shared import Inches


# Input document handed to the parser when this file is run as a script.
# Alternative .docx input: r"d:\desktop\内部人才市场简历模板.docx"
path = r"d:\desktop\内部人才市场简历模板.pdf"

# Table labels recognised as field names. Cells are compared against this
# list after all whitespace has been removed from them.
keywords = [
    "姓名",
    "性别",
    "出生日期",
    "民族",
    "籍贯",
    "健康状况",
    "政治面貌",
    "参加工作时间",
    "外语水平",
    "专业技术资格（取得时间）",
    "计算机水平",
    "熟悉专业有何专长",
    "工作单位",
    "现任职务",
    "任职时间",
    "联系电话",
    "对报名岗位认识及工作",
    "对报名岗位认识及工作设想",
    "意向地区",
    "意向岗位",
    "意向单位",
    "意向专业",
    "职业证书",
    "资格等级",
    "取得日期",
    "学校/培训机构",
    "专业",
    "起始时间",
    "毕业时间",
    "姓名",
    "职业",
    "与本人关系",
]

def parse_line(line):
    """Pair each keyword cell of a table row with the value cell after it.

    Args:
        line: sequence of cell texts; empty or ``None`` cells are ignored.

    Returns:
        A list of single-entry dicts ``{keyword: value}``. The keyword has
        all whitespace removed; the value is the first non-empty,
        non-keyword cell that follows it, kept verbatim.
    """
    pairs = []
    pending = None  # keyword still waiting for its value cell
    for cell in line:
        if not cell:
            continue
        compact = ''.join(cell.split())
        if compact in keywords:
            # A newer keyword replaces one that never received a value.
            pending = compact
        elif pending:
            pairs.append({pending: cell})
            pending = None
    return pairs


# Matches a run of word characters followed by at most two non-word characters
# and a colon, i.e. the "key" part of a "key: value" fragment.
_KEY_BEFORE_COLON = re.compile(r'(\w+)\W{0,2}:')

# (column name, section marker) pairs: a data row whose header contains the
# column is tagged with the marker so that formatter() can group it.
# Order matters: the first matching column wins.
_SECTION_BY_COLUMN = (
    ("学校/培训机构", "学习经历"),
    ("与本人关系", "家庭成员"),
    ("意向地区", "职业发展管理"),
    ("职业证书", "职业资格证书"),
)


def _compact(cell):
    """Return *cell* without any whitespace; empty or None cells become ''."""
    return ''.join(cell.split()) if cell else ''


def _parse_table_rows(rows):
    """Turn raw table rows into field dicts.

    Every row falls into one of three categories:

    * header row - every non-empty cell is a keyword; it is remembered and
      used to label the data rows that follow;
    * mixed row - keywords and values side by side; handled by parse_line();
    * data row - values only; zipped with the most recent header row.
    """
    records = []
    header = None
    for row in rows:
        has_data = any(cell and _compact(cell) not in keywords for cell in row)
        has_keyword = any(_compact(cell) in keywords for cell in row)

        if not has_data:
            # Cells may be None in the PDF path, hence _compact().
            header = [_compact(cell) for cell in row]
        elif has_keyword:
            records.extend(parse_line(row))
        elif header is not None:
            # A repeated header name keeps the value of its last column.
            record = {}
            for name, value in zip(header, row):
                if name:
                    record[name] = value
            for column, section in _SECTION_BY_COLUMN:
                if column in record:
                    record[section] = section
                    break
            records.append(record)
        # else: a data row before any header row cannot be labelled; it is
        # skipped (zipping it with a missing header used to raise TypeError).
    return records


def _collect_jobs(texts):
    """Group "key: value" fragments from *texts* into work-experience dicts.

    A new dict is started whenever a key is seen that the current dict
    already holds. Colon-free text that follows a "工作描述" key is appended
    to that description on a new line. Each returned dict carries the
    marker entry "工作经历" that formatter() relies on.
    """
    jobs = []
    job = {"工作经历": "工作经历"}
    last_key = None
    for raw in texts:
        text = raw.replace("：", ":")
        if ":" in text:
            # Put every key on its own line so that one text may hold
            # several "key: value" pairs.
            text = _KEY_BEFORE_COLON.sub(r'\n\1:', text)
            for line in text.split("\n"):
                if not line.strip():
                    continue
                field, colon, value = line.partition(":")
                if not colon:
                    # Text in front of the first key; indexing the missing
                    # value used to raise IndexError.
                    continue
                field = field.strip()
                if job.get(field):
                    jobs.append(job)
                    job = {"工作经历": "工作经历"}
                # partition keeps everything after the first colon, so a
                # value containing a further colon is no longer truncated.
                job[field] = value.strip()
                last_key = field
        elif last_key == "工作描述":
            job["工作描述"] += '\n' + text.strip()
    # A dict holding only the marker carries no data; appending it would make
    # formatter() store the section as a plain string.
    if len(job) > 1:
        jobs.append(job)
    return jobs


def parse_layout(path):
    """Extract résumé fields from a .docx file.

    Args:
        path: location of the document, passed to ``Document``.

    Returns:
        A list of dicts: single-entry ``{keyword: value}`` dicts from mixed
        table rows, one dict per data row (tagged with a section marker when
        a known column is present), and one dict per work experience found
        in the paragraphs.
    """
    doc = Document(path)
    rows = [
        [cell.text for cell in row.cells]
        for table in doc.tables
        for row in table.rows
    ]
    result = _parse_table_rows(rows)
    result.extend(_collect_jobs(p.text for p in doc.paragraphs))
    return result


def parse_pdf_layout(path):
    """Extract résumé fields from a PDF file.

    Args:
        path: location of the document, passed to ``pdfplumber.open``.

    Returns:
        The same list-of-dicts structure as parse_layout(). Table rows come
        from ``page.extract_tables()`` and the "key: value" text from
        ``page.extract_words()``.
    """
    rows = []
    words = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                rows.extend(table)
            words.extend(word['text'] for word in page.extract_words())

    result = _parse_table_rows(rows)
    # The words of all pages are processed as one stream, so the job still in
    # progress is appended once at the end rather than once per page.
    result.extend(_collect_jobs(words))
    return result

def _rename_keys(record, mapping):
    """Rename, in place, each key of *record* listed in *mapping*.

    Only keys whose current value is truthy are renamed.
    """
    for old, new in mapping.items():
        if record.get(old):
            # Pop before assigning so an identity mapping keeps its value.
            record[new] = record.pop(old)


# Format the parsed data
def formatter(datalist, translate_path="./resources/translate.json"):
    """Merge parsed records into one dict and translate keys to field names.

    Args:
        datalist: list of dicts as returned by the layout parsers. A
            single-entry dict becomes a scalar field. A larger dict is
            grouped under its section marker, i.e. a key whose value equals
            the key once whitespace is removed. Dicts with several entries
            and no marker are ignored. The dicts are modified in place.
        translate_path: JSON file holding the key-translation tables
            ("base", "tal_training_institutions", "tal_his_job",
            "tal_vocational_qualification_certificate",
            "tal_family_social_relations"). The default is relative to the
            current working directory.

    Returns:
        dict mapping translated field names to scalar values and translated
        section names to lists of translated records. Sections that are
        absent from *datalist* are simply not present in the result.

    Raises:
        OSError: if *translate_path* cannot be opened.
        KeyError: if one of the translation tables is missing from the file.
    """
    result = {}

    for record in datalist:
        if len(record) == 1:
            (key, value), = record.items()
            if isinstance(value, str) and key == "".join(value.split()):
                # Bare section marker without any data: storing it would turn
                # the section into a string instead of a list of records.
                continue
            result[key] = value
            continue
        for key in list(record):
            value = record[key]
            # Values may be None (empty PDF cells); only strings can be markers.
            if isinstance(value, str) and key == "".join(value.split()):
                record.pop(key)
                existing = result.get(key)
                if existing:
                    existing.append(record)
                else:
                    result[key] = [record]

    # Translate keys to database field names.
    with open(translate_path, "r", encoding="utf-8") as ff:
        json_obj = json.load(ff)

    # NOTE(review): "职业发展管理" records are translated with the "base" table
    # while every other section has a table of its own - confirm that this is
    # intended and not a copy/paste slip.
    section_tables = (
        ("职业发展管理", json_obj["base"]),
        ("学习经历", json_obj["tal_training_institutions"]),
        ("工作经历", json_obj["tal_his_job"]),
        ("职业资格证书", json_obj["tal_vocational_qualification_certificate"]),
        ("家庭成员", json_obj["tal_family_social_relations"]),
    )

    _rename_keys(result, json_obj["base"])

    for section, table in section_tables:
        entries = result.get(section)
        if not isinstance(entries, list):
            # Section missing, or overwritten by a scalar field of that name.
            continue
        for entry in entries:
            _rename_keys(entry, table)

    # Section titles -> table names. "学习经历" and "培训经历" share one target,
    # so if both are present the later one replaces the earlier.
    tit = {
        "基本信息": "base",
        "职业发展管理": "intent_job",
        "学习经历": "tal_training_institutions",
        "工作经历": "tal_his_job",
        "项目经历": "tal_his_project",
        "培训经历": "tal_training_institutions",
        "获奖情况": "tal_rewards_punishments",
        "语言能力": "tal_language",
        "职业资格证书": "tal_vocational_qualification_certificate",
        "专业技能": "tal_professional_tech_certificate",
        "家庭成员": "tal_family_social_relations",
    }
    _rename_keys(result, tit)

    return result

if __name__ == "__main__":
    # Word documents go through the python-docx parser; every other path is
    # treated as a PDF.
    parser = parse_layout if path.endswith(".docx") else parse_pdf_layout
    pprint(formatter(parser(path)))