123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199 |
- # -*- coding: utf-8 -*-
- # @Author: privacy
- # @Date: 2022-07-07 13:12:17
- # @Last Modified by: privacy
- # @Last Modified time: 2022-07-16 09:08:32
- # 内部人才市场简历模板
- from pprint import pprint
- import re
- import json
- import docx
- from docx import Document
- from docx.shared import Inches
# Default location of the résumé template that the script entry point parses.
path = "d:\\desktop\\内部人才市场简历模板.docx"

# Cell labels recognised as field names when reading the template's tables.
# Cell text is compared against this list after all whitespace is removed.
# "姓名" appears twice; the list is kept exactly as originally written.
keywords = [
    "姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌",
    "参加工作时间", "外语水平", "专业技术资格(取得时间)", "计算机水平",
    "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话",
    "对报名岗位认识及工作设想",
    "意向地区", "意向岗位", "意向单位", "意向专业",
    "职业证书", "资格等级", "取得日期",
    "学校/培训机构", "专业", "起始时间", "毕业时间",
    "姓名", "职业", "与本人关系",
]
def parse_line(line, known_keywords=None):
    """Pair each keyword cell of a table row with the value cell after it.

    Cells are scanned left to right. A non-empty cell whose text, with all
    whitespace removed, is a known keyword becomes the pending field name
    (replacing any earlier pending name). The next non-empty cell that is
    not a keyword is stored as that field's value, untouched, and the
    pending name is cleared. Empty cells are ignored, and value cells with
    no pending field name are dropped.

    Args:
        line: Iterable of cell texts for one table row.
        known_keywords: Optional container of recognised field names.
            Defaults to the module-level ``keywords`` list.

    Returns:
        A list of single-entry dicts ``{field_name: value}`` in row order.
    """
    vocabulary = keywords if known_keywords is None else known_keywords
    pairs = []
    pending = None
    for cell in line:
        if not cell:
            continue
        # Labels in the template may contain padding spaces ("性 别").
        label = ''.join(cell.split())
        if label in vocabulary:
            pending = label
        elif pending:
            pairs.append({pending: cell})
            pending = None
    return pairs
# Column name -> section marker added to a tabular record. Checked in this
# order; only the first matching column tags the record.
_SECTION_BY_COLUMN = (
    ("学校/培训机构", "学习经历"),
    ("与本人关系", "家庭成员"),
    ("意向地区", "职业发展管理"),
    ("职业证书", "职业资格证书"),
)

_JOB_MARKER = "工作经历"
_JOB_DESCRIPTION = "工作描述"


def _squash(text):
    """Return *text* with every whitespace character removed."""
    return ''.join(text.split())


def _parse_table_rows(rows):
    """Convert table rows (lists of cell texts) into record dicts.

    Each row is classified as one of:
      * header row  - no non-empty cell holds anything but a keyword (an
        all-empty row also lands here); it becomes the current header;
      * mixed row   - holds both keywords and other text; it is handed to
        ``parse_line`` to extract keyword/value pairs;
      * data row    - holds no keyword; its cells are zipped with the
        current header into one record, which is tagged with a section
        marker chosen from the header columns present.
    """
    records = []
    header = None
    for row in rows:
        has_value = any(cell and _squash(cell) not in keywords for cell in row)
        if not has_value:
            header = [_squash(cell) for cell in row]
            continue

        has_keyword = any(cell and _squash(cell) in keywords for cell in row)
        if has_keyword:
            records.extend(parse_line(row))
            continue

        if header is None:
            # A data row before any header row cannot be labelled. The
            # original code raised TypeError on zip(None, ...) here.
            continue

        record = {}
        for column, value in zip(header, row):
            if column:
                record[column] = value
        for column, section in _SECTION_BY_COLUMN:
            if column in record:
                record[section] = section
                break
        records.append(record)
    return records


def _parse_paragraph_texts(texts):
    """Collect ``field:value`` paragraphs into work-experience records.

    A record is closed and a new one started whenever a field that already
    has a non-empty value in the current record shows up again. Paragraphs
    without a colon are appended to the current record's "工作描述" value
    when that was the most recently read field; otherwise they are ignored.
    """
    jobs = []
    job = {_JOB_MARKER: _JOB_MARKER}
    last_field = None
    for raw in texts:
        # Normalise the full-width colon so both spellings are recognised.
        # The original call replaced ":" with ":" and therefore did nothing.
        text = raw.replace("：", ":")
        if ":" in text:
            # Put every "name:" on its own line so that several fields
            # written in one paragraph are split apart.
            text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)
            for line in text.split("\n"):
                if not line.strip():
                    continue
                parts = line.split(":")
                if len(parts) < 2:
                    # Text ahead of the first "name:" has no colon; the
                    # original indexed parts[1] and raised IndexError.
                    continue
                field = parts[0].strip()
                if job.get(field):
                    jobs.append(job)
                    job = {_JOB_MARKER: _JOB_MARKER}
                job[field] = parts[1].strip()
                last_field = field
        elif last_field == _JOB_DESCRIPTION:
            job[_JOB_DESCRIPTION] += '\n' + text.strip()
    # Flush the last record, unless it holds nothing but the marker.
    if len(job) > 1:
        jobs.append(job)
    return jobs


def parse_layout(path):
    """Parse the résumé template at *path* into a flat list of dicts.

    Rows of every table in the document are read in document order and
    treated as one sequence, so a header row keeps applying until the next
    header row, even across tables. Paragraph text is then scanned for
    ``field:value`` lines describing work experience.

    Args:
        path: Location of the .docx file, passed to ``Document``.

    Returns:
        A list of dicts: single-entry ``{field: value}`` dicts from mixed
        rows, multi-entry records from data rows (tagged with a
        ``{section: section}`` marker entry when a known column is
        present), followed by work-experience records that each carry
        the marker entry ``{"工作经历": "工作经历"}``.
    """
    doc = Document(path)
    rows = [
        [cell.text for cell in row.cells]
        for table in doc.tables
        for row in table.rows
    ]
    result = _parse_table_rows(rows)
    result.extend(_parse_paragraph_texts(p.text for p in doc.paragraphs))
    return result
# Section name in the parsed data -> name of the table in the translation
# JSON whose mapping renames the fields of that section's records.
# NOTE(review): "职业发展管理" uses the "base" table, exactly as the original
# code did, although it is exported as "intent_job" - confirm this is intended.
_SECTION_FIELD_TABLES = (
    ("职业发展管理", "base"),
    ("学习经历", "tal_training_institutions"),
    ("工作经历", "tal_his_job"),
    ("职业资格证书", "tal_vocational_qualification_certificate"),
    ("家庭成员", "tal_family_social_relations"),
)

# Section name in the parsed data -> key used in the returned dict.
_SECTION_OUTPUT_NAMES = {
    "基本信息": "base",
    "职业发展管理": "intent_job",
    "学习经历": "tal_training_institutions",
    "工作经历": "tal_his_job",
    "项目经历": "tal_his_project",
    "培训经历": "tal_training_institutions",
    "获奖情况": "tal_rewards_punishments",
    "语言能力": "tal_language",
    "职业资格证书": "tal_vocational_qualification_certificate",
    "专业技能": "tal_professional_tech_certificate",
    "家庭成员": "tal_family_social_relations",
}


def _rename_keys(record, mapping):
    """Rename keys of *record* in place following *mapping* (old -> new).

    Only keys whose current value is truthy are renamed, so empty fields
    keep their original name.
    """
    for old, new in mapping.items():
        if record.get(old):
            record[new] = record.pop(old)


def formatter(datalist, translate_path="./resources/translate.json"):
    """Group parsed résumé entries and translate their keys.

    Single-entry dicts are merged into the result as top-level fields.
    In any other dict, an entry whose key equals its whitespace-stripped
    value is treated as a section marker: the marker is removed and the
    dict is appended to that section's list. Field names are then renamed
    using the mappings loaded from *translate_path*, and section names are
    renamed to their output names.

    The dicts in *datalist* are modified in place and reused in the result.

    Args:
        datalist: List of dicts, such as the list ``parse_layout`` returns.
        translate_path: JSON file holding the field-name mappings. It must
            contain the tables "base", "tal_training_institutions",
            "tal_his_job", "tal_vocational_qualification_certificate" and
            "tal_family_social_relations".

    Returns:
        A dict of translated top-level fields plus one list of translated
        records per section that is present in *datalist*.

    Raises:
        KeyError: If a required table is missing from the JSON file.
    """
    result = {}
    for entry in datalist:
        if len(entry) == 1:
            result.update(entry)
            continue
        for name in list(entry):
            value = entry[name]
            if not (isinstance(value, str) and name == "".join(value.split())):
                continue
            entry.pop(name)
            bucket = result.get(name)
            if isinstance(bucket, list) and bucket:
                bucket.append(entry)
            else:
                result[name] = [entry]

    # Translate field names to database column names.
    with open(translate_path, "r", encoding="utf-8") as handle:
        tables = json.load(handle)
    # Resolve every table up front so a malformed file fails immediately.
    base_mapping = tables["base"]
    section_mappings = [
        (section, tables[table]) for section, table in _SECTION_FIELD_TABLES
    ]

    _rename_keys(result, base_mapping)
    for section, mapping in section_mappings:
        records = result.get(section)
        # A résumé need not contain every section; the original indexed
        # result[section] unconditionally and raised KeyError when absent.
        if not isinstance(records, list):
            continue
        for record in records:
            _rename_keys(record, mapping)

    _rename_keys(result, _SECTION_OUTPUT_NAMES)
    # A commented-out step that POSTed {"ResumeData": result} to an HTTP
    # endpoint used to sit here; it was never executed and has been dropped.
    return result
if __name__ == "__main__":
    # Script entry point: parse the template at the module-level default
    # path, format the parsed entries, and pretty-print the result.
    parsed_entries = parse_layout(path)
    formatted = formatter(parsed_entries)
    pprint(formatted)
|