# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2022-07-07 12:59:42
# @Last Modified by:   privacy
# @Last Modified time: 2022-07-16 11:41:09
"""Extract key/value data from a table-based resume template (.docx or .pdf).

Every table row is classified as one of:

* a *header row*   - every non-empty cell is a known keyword; it names the
  columns of the data rows that follow it;
* an *inline row*  - keyword cells and value cells are interleaved
  (``keyword, value, keyword, value, ...``);
* a *data row*     - values only, matched positionally against the most
  recent header row.

``formatter`` then groups the extracted records and renames the template's
field names using a JSON translation file.
"""
import json
from pprint import pprint

# The third-party packages are optional at import time: python-docx is only
# needed for .docx input and pdfplumber only for .pdf input, so a missing
# package must not prevent using the other parser.  pandas is not used in
# this module; the name is kept for backward compatibility.
try:
    import pandas as pd  # noqa: F401
except ImportError:
    pd = None

try:
    import docx  # noqa: F401
    from docx import Document
    from docx.shared import Inches  # noqa: F401
except ImportError:
    docx = Document = Inches = None

try:
    import pdfplumber
except ImportError:
    pdfplumber = None


path = "d:\\desktop\\社招简历模板.docx"
# path = "d:\\desktop\\社招简历模板.pdf"

# Field labels that may appear in the template.  Cells are compared against
# this list after all whitespace has been removed from them.
keywords = [
    '姓名',
    '性别',
    '出生日期',
    '一寸照片',
    '民族',
    '出生地',
    '政治面貌(加入时间)',
    '参加工作时间',
    '健康状况',
    '外语水平',
    '初始学历、专业',
    '最高学历、专业',
    '初始学历毕业院校及毕业时间',
    '最高学历毕业院校及毕业时间',
    '专业技术资格(取得时间)',
    '职业技能等级(取得时间)',
    '熟悉专业有何专长',
    '工作单位',
    '现任职务',
    '任职时间',
    '提职时间',
    '意向岗位',
    '联系电话',
    '学习经历',
    '起止时间',
    '学校',
    '专业',
    '学历',
    '学位',
    '研究方向',
    '是否全日制',
    '培训',
    '起止时间',
    '培训类型',
    '机构',
    '内容',
    '成绩',
    '证书名称',
    '经历',
    '工作经历',
    '起止时间',
    '工作单位',
    '职务',
    '部门',
    '证明人',
    '备注',
    '对报名岗位认识及工作设想',
    '自我评价及主要工作业绩',
    '获得职业资格证书情况',
    '获得日期',
    '名称',
    '证书编码/文号',
    '授予单位',
    '备注',
    '奖惩',
    '项目',
    '时间',
    '项目单位',
    '证明材料',
    '情况',
    '主要家庭成员及社会关系',
    '称谓',
    '出生年月',
    '政治面貌',
    '工作单位及职务',
    '其他情况说明',
    '诚信承诺',
    # The three entries below used to be separated by whitespace only, so
    # Python concatenated them into a single (never matching) string.
    '本人承诺,以上信息均与事实相符,若有虚假,愿承担一切后果并自愿取消应聘资格。',
    '承诺人:',
    '社会招聘工作办公室资格审查意见',
]


def _squash(text):
    """Return *text* with every whitespace character removed."""
    return ''.join(text.split())


def _is_keyword(cell):
    """Return True when *cell* is non-empty and, whitespace removed, a keyword."""
    return bool(cell) and _squash(cell) in keywords


def _is_header_row(row):
    """Return True when every non-empty cell of *row* is a keyword.

    A row without any non-empty cell also counts as a header row.
    """
    return all(not cell or _squash(cell) in keywords for cell in row)


def _rename_keys(record, mapping):
    """Rename, in place, the keys of *record* according to *mapping*.

    Only keys whose current value is truthy are renamed.  ``pop`` happens
    before the assignment so that an identity mapping (old == new) keeps
    the entry instead of deleting it.
    """
    for old, new in mapping.items():
        if record.get(old):
            record[new] = record.pop(old)


def parse_line(line):
    """Parse an inline row of interleaved keyword and value cells.

    Each keyword cell is paired with the next non-empty, non-keyword cell.
    Empty cells (``''`` or ``None``) are skipped; a value cell that is not
    preceded by a keyword is ignored.

    Args:
        line: iterable of cell texts (``str`` or ``None``).

    Returns:
        A list of single-entry dicts ``{keyword: value}``.  The keyword has
        its whitespace removed; the value is kept as found.
    """
    result = []
    key = None
    for cell in line:
        if not cell:
            continue
        squashed = _squash(cell)
        if squashed in keywords:
            key = squashed
        elif key:
            result.append({key: cell})
            key = None
    return result


def parse_word_layout(path):
    """Extract records from all tables of a .docx document.

    Args:
        path: location of the .docx file.

    Returns:
        A list of dicts: single-entry dicts from inline rows and multi-entry
        dicts (header cell -> value) from data rows.

    Raises:
        ImportError: python-docx is not installed.
    """
    if Document is None:
        raise ImportError("python-docx is required to parse .docx files")

    rows = []
    for table in Document(path).tables:
        for row in table.rows:
            cells = []
            for cell in row.cells:
                text = cell.text
                # Keep each distinct text once per row - presumably to
                # collapse merged cells, which python-docx reports once per
                # spanned column.  NOTE(review): this also drops genuinely
                # repeated values within a row.
                if text not in cells:
                    cells.append(text)
            rows.append(cells)

    result = []
    header = None  # most recent header row, whitespace removed
    for row in rows:
        if _is_header_row(row):
            # Very short keyword-only rows are not treated as column headers.
            if len(row) > 2:
                header = [_squash(cell) for cell in row]
            continue

        keyword_count = sum(1 for cell in row if _is_keyword(cell))
        if keyword_count > len(row) / 3:
            # More than a third of the cells are keywords: inline row.
            result.extend(parse_line(row))
        elif header is not None and len(header) == len(row):
            # Data row: pair the values with the header cells by position.
            # Rows seen before any header, or of a different width, are
            # skipped (the former used to raise TypeError).
            result.append(
                {name: value for name, value in zip(header, row) if name}
            )
    return result


def parse_pdf_layout(path):
    """Extract records from all tables of a PDF document.

    Args:
        path: location of the PDF file.

    Returns:
        A list of dicts: single-entry dicts from inline rows and multi-entry
        dicts (header cell -> value) from data rows.  In a data row an empty
        cell takes the header text itself as its value, which is how
        ``formatter`` later recognises the label column of a section.

    Raises:
        ImportError: pdfplumber is not installed.
    """
    if pdfplumber is None:
        raise ImportError("pdfplumber is required to parse .pdf files")

    rows = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                rows.extend(table)

    result = []
    header = None  # most recent header row, whitespace removed
    for row in rows:
        if _is_header_row(row):
            header = [_squash(cell) if cell else cell for cell in row]
            continue

        if any(_is_keyword(cell) for cell in row):
            # At least one keyword next to free text: inline row.
            result.extend(parse_line(row))
        elif header is not None:
            # Data row.  Rows seen before any header row are skipped; they
            # used to raise TypeError from zip(None, ...).
            result.append(
                {
                    name: (value if value else name)
                    for name, value in zip(header, row)
                    if name
                }
            )
    return result


# Format the extracted data
def formatter(datalist, translate_path="./resources/translate.json"):
    """Group parsed records and translate field names.

    Single-entry dicts become top-level fields.  In a multi-entry dict every
    key whose value equals the key itself (ignoring whitespace) is treated as
    a section label: the remaining entries form one record, which is appended
    to the list stored under that label.  Multi-entry dicts without such a
    label are dropped.  The input dicts are not modified.

    Args:
        datalist: list of dicts as returned by ``parse_word_layout`` or
            ``parse_pdf_layout``.
        translate_path: JSON file holding one ``{source name: target name}``
            object under each of the keys ``base``, ``tal_his_edu``,
            ``tal_his_job``, ``tal_vocational_qualification_certificate``
            and ``tal_family_social_relations``.

    Returns:
        A dict with translated top-level field names and, for every section
        that was found, a list of records with translated field names.

    Raises:
        OSError: the translation file cannot be opened.
        KeyError: the translation file lacks one of the keys listed above.
    """
    result = {}
    for entry in datalist:
        if len(entry) == 1:
            result.update(entry)
            continue
        labels = [
            name
            for name, value in entry.items()
            if isinstance(value, str) and name == _squash(value)
        ]
        if not labels:
            continue
        record = {
            name: value for name, value in entry.items() if name not in labels
        }
        for label in labels:
            existing = result.get(label)
            if isinstance(existing, list):
                existing.append(record)
            else:
                result[label] = [record]

    # Translate to the database field names
    with open(translate_path, "r", encoding="utf-8") as ff:
        translations = json.load(ff)
    base_names = translations["base"]
    section_names = (
        ('学习经历', translations["tal_his_edu"]),
        ('工作经历', translations["tal_his_job"]),
        (
            '获得职业资格证书情况',
            translations["tal_vocational_qualification_certificate"],
        ),
        ('主要家庭成员及社会关系', translations["tal_family_social_relations"]),
    )

    _rename_keys(result, base_names)

    for section, mapping in section_names:
        records = result.get(section)
        # A section may be missing from the document, or may hold plain text
        # when it was captured from an inline row; neither has records.
        if not isinstance(records, list):
            continue
        for record in records:
            if isinstance(record, dict):
                _rename_keys(record, mapping)

    tit = {
        "基本信息": "base",
        "职业发展管理": "intent_job",
        "学习经历": "tal_his_edu",
        "工作经历": "tal_his_job",
        "项目经历": "tal_his_project",
        "培训经历": "tal_training_institutions",
        "获奖情况": "tal_rewards_punishments",
        "语言能力": "tal_language",
        "获得职业资格证书情况": "tal_vocational_qualification_certificate",
        "专业技能": "tal_professional_tech_certificate",
        "主要家庭成员及社会关系": "tal_family_social_relations",
        "其他情况说明": "intro",
    }
    _rename_keys(result, tit)

    # Disabled upload of the result over HTTP (kept for reference):
    # url = "http://192.168.1.110:9999/talent/getResumeData"
    # session = requests.Session()
    # session.mount('http://', HTTPAdapter(max_retries = 3))
    # try:
    #     headers = {
    #         'contentType':'Application/json'
    #     }
    #     response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)
    #     print(response.text)
    # except Exception as e:
    #     print(e)
    return result


if __name__ == '__main__':
    # Choose the parser from the file extension (case-insensitive).
    if path.lower().endswith(".pdf"):
        parsed = parse_pdf_layout(path)
    else:
        parsed = parse_word_layout(path)
    pprint(formatter(parsed))