123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162 |
- # -*- coding: utf-8 -*-
- # @Author: privacy
- # @Date: 2022-07-07 12:59:42
- # @Last Modified by: privacy
- # @Last Modified time: 2022-07-13 15:22:48
- # import pdb
- from pprint import pprint
- import pandas as pd
- import pdfplumber
# Location of the PDF that is parsed when this file is run as a script.
path = "d:\\desktop\\社招简历模板.pdf"

# Labels recognised as field names / column headers in the extracted tables.
# Cell text is compared against this list after all whitespace has been
# removed from the cell, so entries must not contain whitespace themselves.
# Some labels are listed more than once; that is harmless for the
# membership tests this list is used for.
keywords = [
    '姓名',
    '性别',
    '出生日期',
    '一寸照片',
    '民族',
    '出生地',
    '政治面貌(加入时间)',
    '参加工作时间',
    '健康状况',
    '外语水平',
    '初始学历、专业',
    '最高学历、专业',
    '初始学历毕业院校及毕业时间',
    '最高学历毕业院校及毕业时间',
    '专业技术资格(取得时间)',
    '职业技能等级(取得时间)',
    '熟悉专业有何专长',
    '工作单位',
    '现任职务',
    '任职时间',
    '提职时间',
    '意向岗位',
    '联系电话',
    '学习经历',
    '起止时间',
    '学校',
    '专业',
    '学历',
    '学位',
    '研究方向',
    '是否全日制',
    '培训',
    '起止时间',
    '培训类型',
    '机构',
    '内容',
    '成绩',
    '证书名称',
    '经历',
    '工作经历',
    '起止时间',
    '工作单位',
    '职务',
    '部门',
    '证明人',
    '备注',
    '对报名岗位认识及工作设想',
    '自我评价及主要工作业绩',
    '获得职业资格证书情况',
    '获得日期',
    '名称',
    '证书编码/文号',
    '授予单位',
    '备注',
    '奖惩',
    '项目',
    '时间',
    '项目单位',
    '证明材料',
    '情况',
    '主要家庭成员及社会关系',
    '称谓',
    '出生年月',
    '政治面貌',
    '工作单位及职务',
    '其他情况说明',
    '诚信承诺',
    # The next three entries were previously written without separating
    # commas, which made Python concatenate them into one single string that
    # could never match a cell. Each one is now its own list element.
    '本人承诺,以上信息均与事实相符,若有虚假,愿承担一切后果并自愿取消应聘资格。',
    '承诺人:',
    '社会招聘工作办公室资格审查意见',
]
def parse_line(line):
    """Pair every keyword cell in a row with the next non-empty cell.

    The cells of ``line`` are scanned left to right. A non-empty cell whose
    whitespace-stripped text is listed in the module-level ``keywords``
    becomes the pending label (replacing any label still pending). The next
    non-empty cell that is not a keyword is taken as that label's value and
    the pending label is cleared. Empty cells are skipped, and value cells
    seen while no label is pending are ignored.

    Args:
        line: Iterable of table cells; empty cells may be ``None`` or ``''``.

    Returns:
        A list of single-entry dicts ``{label: value}`` in row order. The
        value is the cell text exactly as found (whitespace preserved).
    """
    pairs = []
    pending = None  # label still waiting for its value cell
    for cell in line:
        if not cell:
            continue
        compact = ''.join(cell.split())
        if compact in keywords:
            pending = compact
        elif pending:
            pairs.append({pending: cell})
            pending = None
    return pairs
def parse_layout(path):
    """Extract all table rows from a PDF and convert them to schema dicts.

    Every row of every table on every page is classified, in document order,
    by comparing the whitespace-stripped text of its cells with the
    module-level ``keywords``:

    * Header row - no cell holds non-keyword text (cells are empty or
      keywords). The row is remembered as the current header and produces no
      output.
    * Label/value row - the row holds non-keyword text and at least one
      keyword. It is delegated to ``parse_line``.
    * Data row - the row holds non-keyword text and no keyword. Its cells
      are paired positionally with the current header. Columns whose header
      cell is empty are dropped; an empty data cell is stored as the header
      label itself, which lets later processing recognise it as a
      placeholder. If the row and header differ in length the extra cells
      are ignored.

    A data row that appears before any header row has nothing to be paired
    with and is skipped (previously this raised ``TypeError``).

    Args:
        path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list of dicts in document order: single-entry dicts from
        label/value rows and one dict per data row.
    """
    # Flatten all tables into one ordered list of rows.
    # NOTE(review): cells are assumed to be str or None - confirm against
    # the pdfplumber version in use.
    rows = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                rows.extend(table)

    result = []
    header = None  # normalized cells of the most recent header row
    for row in rows:
        # Strip all whitespace from non-empty cells; keep empty cells as-is.
        normalized = [''.join(cell.split()) if cell else cell for cell in row]

        # A non-empty cell whose text is not a keyword marks a value row.
        # The raw cell is tested for emptiness so that a whitespace-only
        # cell still counts as a value, as it always has.
        has_value = any(
            cell and text not in keywords
            for cell, text in zip(row, normalized)
        )
        if not has_value:
            header = normalized
            continue

        if any(cell and text in keywords for cell, text in zip(row, normalized)):
            # Labels and values share this row.
            result.extend(parse_line(row))
            continue

        if header is None:
            # No header seen yet, so the cells cannot be named.
            continue

        schema = {}
        for label, cell in zip(header, row):
            if label:
                schema[label] = cell if cell else label
        result.append(schema)
    return result
# Format the data
def formatter(datalist):
    """Merge a list of row dicts into one result dict.

    * A dict with exactly one entry is copied into the result as-is
      (``result[key] = value``); a later entry with the same key overwrites
      an earlier one.
    * In a dict with any other number of entries, every key whose string
      value equals the key once whitespace is removed from the value is
      treated as a placeholder ("marker"). The remaining entries form one
      record, and that record is appended to the list stored under each
      marker key. Dicts without any marker contribute nothing.

    The input dicts are never modified (the previous implementation popped
    the marker keys out of them). If a marker key already holds a non-list
    value in the result, that value is replaced by a new list containing the
    record instead of raising ``AttributeError``. Non-string values are
    never considered markers.

    Args:
        datalist: Iterable of dicts.

    Returns:
        A dict mapping keys to scalar values or to lists of record dicts.
    """
    result = {}
    for entry in datalist:
        if len(entry) == 1:
            result.update(entry)
            continue

        markers = [
            key for key, value in entry.items()
            if isinstance(value, str) and key == "".join(value.split())
        ]
        if not markers:
            continue

        # Build the record on a copy so the caller's dict stays intact. The
        # same record object is shared by every marker it is filed under.
        record = {
            key: value for key, value in entry.items() if key not in markers
        }
        for marker in markers:
            bucket = result.get(marker)
            if bucket and isinstance(bucket, list):
                bucket.append(record)
            else:
                result[marker] = [record]
    return result
if __name__ == '__main__':
    # Script entry point: parse the PDF at the module-level `path`, regroup
    # the extracted rows with formatter() and pretty-print the result.
    extracted = parse_layout(path)
    pprint(formatter(extracted))
|