|
@@ -2,333 +2,418 @@
|
|
|
# @Author: privacy
|
|
|
# @Date: 2022-07-11 09:21:24
|
|
|
# @Last Modified by: privacy
|
|
|
-# @Last Modified time: 2022-07-15 17:22:00
|
|
|
+# @Last Modified time: 2022-07-18 14:54:53
|
|
|
|
|
|
# 自定义模板
|
|
|
|
|
|
import re
|
|
|
import json
|
|
|
-import logging
|
|
|
-from pprint import pprint
|
|
|
+
|
|
|
import requests
|
|
|
from requests.adapters import HTTPAdapter
|
|
|
+
|
|
|
+import pdfplumber
|
|
|
from docx import Document
|
|
|
-from docx.shared import Inches
|
|
|
|
|
|
|
|
|
# Default input document; this is the path handed to the parser when the
# module is executed as a script (see the ``__main__`` guard at the bottom).
path = "d:\\desktop\\自定义.docx"
|
|
|
-
|
|
|
-# 关键词字典
|
|
|
-keywords = [
|
|
|
- "姓名",
|
|
|
- "性别",
|
|
|
- "出生年月",
|
|
|
- "出生日期",
|
|
|
- "民族",
|
|
|
- "籍贯",
|
|
|
- "户籍地",
|
|
|
- "健康状况",
|
|
|
- "政治面貌(加入时间)",
|
|
|
- "政治面貌(加入时间)",
|
|
|
- "参加工作时间",
|
|
|
- "健康状况",
|
|
|
- "外语水平",
|
|
|
- "专业技术资格(取得时间)",
|
|
|
- "专业技术资格(取得时间)",
|
|
|
- "职业技能等级(取得时间)",
|
|
|
- "职业技能等级(取得时间)",
|
|
|
- "熟悉专业有何专长",
|
|
|
- "学历院校",
|
|
|
- "初始学历、专业",
|
|
|
- "初始学历毕业院校及毕业时间",
|
|
|
- "最高学历、专业",
|
|
|
- "最高学历毕业院校及毕业时间",
|
|
|
- "工作单位",
|
|
|
- "现任职务",
|
|
|
- "任职时间",
|
|
|
- "提职时间",
|
|
|
- "联系电话",
|
|
|
- "邮箱地址",
|
|
|
- "对报名岗位认识及工作设想",
|
|
|
- "意向地区",
|
|
|
- "意向岗位",
|
|
|
- "其他意向岗位",
|
|
|
- "意向单位",
|
|
|
- "意向专业",
|
|
|
- "学习经历",
|
|
|
- "起止时间",
|
|
|
- "学校","专业","学历","学位","研究方向","是否全日制",
|
|
|
- "培训经历",
|
|
|
- "培训类型","机构","内容","成绩","证书名称",
|
|
|
- "工作经历",
|
|
|
- "工作单位","职务","部门","证明人","备注",
|
|
|
- "项目经历",
|
|
|
- "项目名称","项目职务","项目描述","项目职责","项目成果",
|
|
|
- "获得职业资格证书情况",
|
|
|
- "获得日期","名称","证书编码/文号","授予单位",
|
|
|
- "奖惩情况",
|
|
|
- "项目","时间","项目单位","证明材料",
|
|
|
- "主要工作业绩(500字以内)",
|
|
|
- "主要工作业绩(500字以内)",
|
|
|
- "自我评价",
|
|
|
- "近三年年度考核结果",
|
|
|
- "主要家庭成员及社会关系",
|
|
|
- "称谓",
|
|
|
- "其他情况说明",
|
|
|
- "工作单位及职务",
|
|
|
- "政治面貌",
|
|
|
- "职业证书",
|
|
|
- "资格等级",
|
|
|
- "取得日期",
|
|
|
- "学校/培训机构",
|
|
|
- "专业",
|
|
|
- "起始时间",
|
|
|
- "毕业时间",
|
|
|
- "职业",
|
|
|
- "与本人关系",
|
|
|
- "计算机水平"
|
|
|
-]
|
|
|
-
|
|
|
-# 解析行内元素
|
|
|
-def parse_line(line):
|
|
|
- result = []
|
|
|
- key = None
|
|
|
- for cell in line:
|
|
|
- if cell and ''.join(cell.split()) in keywords:
|
|
|
- key = ''.join(cell.split())
|
|
|
- elif cell and key:
|
|
|
- schema = {key:cell}
|
|
|
- result.append(schema)
|
|
|
- key = None
|
|
|
- return result
|
|
|
-
|
|
|
-
|
|
|
-# 解析文档布局
|
|
|
-def parse_layout(path):
|
|
|
- result = []
|
|
|
- doc = Document(path)
|
|
|
- lo = {}
|
|
|
- for _table in doc.tables[:]:
|
|
|
- for i, row in enumerate(_table.rows[:]):
|
|
|
- row_content = []
|
|
|
- for cell in row.cells[:]:
|
|
|
- c = cell.text
|
|
|
- if c not in row_content:
|
|
|
- row_content.append(c)
|
|
|
- lo[len(lo.keys())] = row_content
|
|
|
-
|
|
|
- kwln = -1# 关键词行长度
|
|
|
- kwline = None# 关键词行
|
|
|
- for key in lo.keys():
|
|
|
- for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
|
|
|
- if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
|
|
|
- perc = 0# 行内关键词数量
|
|
|
- for c in lo[key]:
|
|
|
- if c and (''.join(c.split()) in keywords):# 找到此行有关键词
|
|
|
- perc += 1
|
|
|
- if c and (''.join(c.split()) in keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3,判断此行非关键词行元素
|
|
|
- perc = 0# 清空行内关键词数
|
|
|
- result.extend(parse_line(lo[key]))# 添加并解析普通行级元素
|
|
|
+# path = "d:\\desktop\\自定义.pdf"
|
|
|
+
|
|
|
class Custom(object):
    """Parse table-based resume templates (.docx / .pdf) into one record.

    Table cells are compared against ``self.keywords`` to tell field labels
    from field values.  ``formatter`` then normalises the date fields and
    renames the Chinese labels with the mapping loaded from
    ``./resources/translate.json``.
    """

    # Endpoint that receives the formatted record in ``push_back``.
    BACKEND_URL = "http://192.168.1.110:9999/talent/getResumeData"

    # Section label -> key used for that section in the formatted record.
    SECTION_NAMES = {
        "基本信息": "base",
        "求职意向": "intent_job",
        "学习经历": "tal_his_edu",
        "工作经历": "tal_his_job",
        "项目经历": "tal_his_project",
        "培训经历": "tal_training_experience",
        "获奖情况": "tal_reward_punishment",
        "语言能力": "tal_language",
        "证书": "tal_vocational_qualification_certificate",
        "专业技能": "tal_professional_tech_certificate",
        "主要家庭成员及社会关系": "tal_family_social_relation",
    }

    # Top-level fields that hold a single date.
    _DATE_FIELDS = ("出生年月", "任职时间", "参加工作时间")

    # (combined field, school key, graduation-date key)
    _GRADUATION_FIELDS = (
        ("最高学历毕业院校及毕业时间", "最高学历毕业院校", "最高学历毕业时间"),
        ("初始学历毕业院校及毕业时间", "初始学历毕业院校", "初始学历毕业时间"),
    )

    # (combined "name + date" field, key used for the extracted names)
    _QUALIFICATION_FIELDS = (
        ("专业技术资格(取得时间)", "专业技术资格"),
        ("职业技能等级(取得时间)", "职业技能等级"),
    )

    # Sections whose entries carry a "起止时间" start/end period.
    _PERIOD_SECTIONS = ("学习经历", "培训经历", "工作经历", "项目经历")

    # (section, field) pairs whose value is a year + month.
    _MONTH_FIELDS = (
        ("获得职业资格证书情况", "获得日期"),
        ("奖惩情况", "时间"),
        ("主要家庭成员及社会关系", "出生年月"),
    )

    def __init__(self):
        """Set up the label vocabulary and load the field-name translations."""
        super(Custom, self).__init__()
        # Labels recognised in table cells (compared after removing all
        # whitespace).  Several labels are listed twice, once with full-width
        # and once with ASCII parentheses, so that both spellings match.
        self.keywords = [
            "姓名",
            "性别",
            "出生年月",
            "出生日期",
            "民族",
            "籍贯",
            "户籍地",
            "健康状况",
            "政治面貌(加入时间)",
            "政治面貌(加入时间)",
            "参加工作时间",
            "健康状况",
            "外语水平",
            "专业技术资格(取得时间)",
            "专业技术资格(取得时间)",
            "职业技能等级(取得时间)",
            "职业技能等级(取得时间)",
            "熟悉专业有何专长",
            "学历院校",
            "初始学历、专业",
            "初始学历毕业院校及毕业时间",
            "最高学历、专业",
            "最高学历毕业院校及毕业时间",
            "工作单位",
            "现任职务",
            "任职时间",
            "提职时间",
            "联系电话",
            "邮箱地址",
            "对报名岗位认识及工作设想",
            "意向地区",
            "意向岗位",
            "其他意向岗位",
            "意向单位",
            "意向专业",
            "学习经历",
            "起止时间",
            "学校", "专业", "学历", "学位", "研究方向", "是否全日制",
            "培训经历",
            "培训类型", "机构", "内容", "成绩", "证书名称",
            "工作经历",
            "工作单位", "职务", "部门", "证明人", "备注",
            "项目经历",
            "项目名称", "项目职务", "项目描述", "项目职责", "项目成果",
            "获得职业资格证书情况",
            "获得日期", "名称", "证书编码/文号", "授予单位",
            "奖惩情况",
            "项目", "时间", "项目单位", "证明材料",
            "主要工作业绩(500字以内)",
            "主要工作业绩(500字以内)",
            "自我评价",
            "近三年年度考核结果",
            "主要家庭成员及社会关系",
            "称谓",
            "其他情况说明",
            "工作单位及职务",
            "政治面貌",
            "职业证书",
            "资格等级",
            "取得日期",
            "学校/培训机构",
            "专业",
            "起始时间",
            "毕业时间",
            "职业",
            "与本人关系",
            "计算机水平"
        ]
        self.json_obj = self.get_translate()

    def get_translate(self):
        """Load and return the label translation tables.

        The file is read from ``./resources/translate.json`` relative to the
        current working directory.  ``formatter`` reads its ``base``,
        ``tal_his_edu`` and ``tal_family_social_relation`` entries.
        """
        with open("./resources/translate.json", "r", encoding="utf-8") as ff:
            return json.load(ff)

    # ------------------------------------------------------------------
    # Row classification helpers
    # ------------------------------------------------------------------
    def _is_keyword(self, cell):
        """Return True when *cell*, with all whitespace removed, is a known label."""
        return bool(cell) and ''.join(cell.split()) in self.keywords

    def _collect_records(self, rows, is_inline_row, fill_empty_with_label):
        """Turn table rows into a list of record dicts.

        Each row falls into one of three cases:

        * every non-empty cell is a label: the row becomes the current
          header row;
        * ``is_inline_row(row)`` is true: the row holds label/value pairs
          and is handed to ``parse_line``;
        * otherwise the row is a data row and is zipped with the current
          header row.

        With ``fill_empty_with_label`` set, an empty data cell takes the
        header label as its value.
        """
        records = []
        header = None
        for row in rows:
            if all(not cell or self._is_keyword(cell) for cell in row):
                header = [''.join(cell.split()) if cell else cell for cell in row]
                continue
            if is_inline_row(row):
                records.extend(self.parse_line(row))
                continue
            if header is None:
                # A data row before any header row cannot be labelled; the
                # previous code raised TypeError here (zip over None).
                continue
            record = dict()
            for label, value in zip(header, row):
                if label:
                    record[label] = (value or label) if fill_empty_with_label else value
            records.append(record)
        return records

    def parse_line(self, line):
        """Pair each label cell in *line* with the next non-empty value cell.

        Returns a list of single-entry dicts ``{label: value}``.  Empty cells
        are skipped, and a value with no pending label is ignored.
        """
        result = []
        key = None
        for cell in line:
            if self._is_keyword(cell):
                key = ''.join(cell.split())
            elif cell and key:
                result.append({key: cell})
                key = None
        return result

    def parse_word_layout(self, path):
        """Extract records from every table of the .docx file at *path*."""
        rows = []
        for table in Document(path).tables:
            for row in table.rows:
                cells = []
                for cell in row.cells:
                    text = cell.text
                    # Skip a text that already occurred in this row
                    # (presumably repeats from merged cells -- confirm).
                    if text not in cells:
                        cells.append(text)
                rows.append(cells)

        def is_inline_row(row):
            # More than a third of the cells are labels: label/value pairs.
            labelled = sum(1 for cell in row if self._is_keyword(cell))
            return labelled > len(row) / 3

        return self._collect_records(rows, is_inline_row, False)

    def parse_pdf_layout(self, path):
        """Extract records from every table of the .pdf file at *path*."""
        rows = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                for table in page.extract_tables():
                    rows.extend(table)

        def is_inline_row(row):
            # Any label in a row with values means label/value pairs.  The old
            # "or len(row) != kwln" clause never took effect and was dropped.
            return any(self._is_keyword(cell) for cell in row)

        return self._collect_records(rows, is_inline_row, True)

    # ------------------------------------------------------------------
    # Formatting helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _date_from_numbers(numbers):
        """Format 1-3 digit strings as ``YYYY-MM-DD``; return None otherwise.

        A missing month or day defaults to ``01``.
        """
        if not 1 <= len(numbers) <= 3:
            return None
        year, month, day = (list(numbers) + ["1", "1"])[:3]
        return "{:4d}-{:02d}-{:02d}".format(int(year), int(month), int(day))

    @staticmethod
    def _entries(result, section):
        """Return the dict entries stored under *section*, or an empty list."""
        rows = result.get(section)
        if not isinstance(rows, list):
            return []
        return [row for row in rows if isinstance(row, dict)]

    @staticmethod
    def _rename_keys(record, mapping):
        """Rename, in place, every truthy field of *record* listed in *mapping*."""
        for old, new in mapping.items():
            if record.get(old):
                # pop-then-assign keeps the value even when old == new.
                record[new] = record.pop(old)

    def formatter(self, datalist):
        """Merge parsed records into one dict and normalise its fields.

        *datalist* is the output of ``parse_word_layout`` or
        ``parse_pdf_layout``.  Single-entry dicts become plain fields.  In a
        multi-entry dict, a key whose value equals the key itself names a
        section, and the rest of that dict is appended to the section's list.
        Dates are then rewritten as ``YYYY-MM-DD``, periods as
        ``YYYY-MM~YYYY-MM``, and labels are renamed with the translation
        tables.  Sections absent from the document are skipped.
        """
        result = dict()
        for d in datalist:
            if len(d) == 1:  # plain key/value pair
                result.update(d)
                continue
            for k in list(d.keys()):
                value = d[k]
                if isinstance(value, str) and k == "".join(value.split()):
                    d.pop(k)
                    existing = result.get(k)
                    if isinstance(existing, list):
                        existing.append(d)
                    elif not existing:
                        result[k] = [d]
                    # else: a scalar field with this name already exists;
                    # keep it (the old code crashed on str.append here).

        languages = result.get("外语水平")
        if languages and isinstance(languages, str):
            found = re.findall(r'(\w+[语话])', languages)
            if found:
                result["外语水平"] = found

        for field, label in self._QUALIFICATION_FIELDS:
            text = result.get(field)
            if not (text and isinstance(text, str)):
                continue
            numbers = re.findall(r'\d+', text)
            for number in numbers:
                text = text.replace(number, "")
            stamp = self._date_from_numbers(numbers)
            if stamp is None:
                # Same as before: the field stays a string, digits removed.
                result[field] = text
            else:
                result[field] = [{"时间": stamp, label: re.findall(r'\w+', text)}]

        for field in self._DATE_FIELDS:
            text = result.get(field)
            if text and isinstance(text, str):
                stamp = self._date_from_numbers(re.findall(r'\d+', text))
                if stamp is not None:
                    result[field] = stamp

        for field, school_key, date_key in self._GRADUATION_FIELDS:
            text = result.get(field)
            if not (text and isinstance(text, str)):
                continue
            words = re.findall(r'\w+', text)
            if words:
                result[school_key] = words[0]
            stamp = self._date_from_numbers(re.findall(r'\d+', text))
            if stamp is not None:
                result[date_key] = stamp
            result.pop(field)

        for section in self._PERIOD_SECTIONS:
            for entry in self._entries(result, section):
                period = entry.get("起止时间")
                if period and isinstance(period, str):
                    numbers = re.findall(r'\d+', period)
                    if len(numbers) == 4:
                        entry["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(
                            *(int(number) for number in numbers))

        for section, field in self._MONTH_FIELDS:
            for entry in self._entries(result, section):
                text = entry.get(field)
                if text and isinstance(text, str):
                    numbers = re.findall(r'\d+', text)
                    if len(numbers) == 2:
                        entry[field] = "{:4d}-{:02d}-01".format(
                            int(numbers[0]), int(numbers[1]))

        normal = self.json_obj["base"]
        edunormal = self.json_obj["tal_his_edu"]
        family = self.json_obj["tal_family_social_relation"]

        self._rename_keys(result, normal)

        for entry in self._entries(result, '学习经历'):
            period = entry.get("起止时间")
            if isinstance(period, str):
                entry['start_time'] = period.split("~")[0]
                entry['end_time'] = period.split("~")[-1]
            self._rename_keys(entry, edunormal)

        for entry in self._entries(result, '主要家庭成员及社会关系'):
            self._rename_keys(entry, family)

        self._rename_keys(result, self.SECTION_NAMES)

        return result

    def push_back(self, result):
        """POST *result* to the backend as ``{"ResumeData": result}``.

        Best effort: any exception is printed, not raised.  Nothing is
        returned.
        """
        # The context manager closes the session (and its connections) on
        # exit; the previous code never closed it.
        with requests.Session() as session:
            session.mount('http://', HTTPAdapter(max_retries=3))
            try:
                # NOTE(review): 'contentType' is not the standard
                # 'Content-Type' header name; kept as-is in case the
                # backend expects it.
                headers = {
                    'contentType': 'Application/json'
                }
                response = session.post(url=self.BACKEND_URL, headers=headers,
                                        json={"ResumeData": result}, timeout=10)
                print(response.text)
            except Exception as e:
                print(e)

    def predict(self, path):
        """Parse the document at *path*, push the record and print it.

        ``.docx`` and ``.pdf`` are handled by suffix.  Any other path is
        ignored and ``None`` is returned.  Otherwise the formatted record is
        returned.
        """
        if path.endswith(".docx"):
            records = self.parse_word_layout(path)
        elif path.endswith(".pdf"):
            records = self.parse_pdf_layout(path)
        else:
            return None
        # Parse and format once; the old code repeated both just to print.
        result = self.formatter(records)
        self.push_back(result)
        print(result)
        return result
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: build the parser and process the default document
    # (parse, format, push to the backend and print).
    Custom().predict(path)
|