|
@@ -2,20 +2,24 @@
|
|
|
# @Author: privacy
|
|
|
# @Date: 2022-07-07 13:12:17
|
|
|
# @Last Modified by: privacy
|
|
|
-# @Last Modified time: 2022-07-16 09:08:32
|
|
|
+# @Last Modified time: 2022-07-16 15:05:03
|
|
|
|
|
|
# 内部人才市场简历模板
|
|
|
from pprint import pprint
|
|
|
import re
|
|
|
import json
|
|
|
+from pdfminer.high_level import extract_pages
|
|
|
+from pdfminer.layout import LTTextContainer
|
|
|
+import pdfplumber
|
|
|
import docx
|
|
|
from docx import Document
|
|
|
from docx.shared import Inches
|
|
|
|
|
|
|
|
|
# Template file to parse. A .docx variant of the same template exists at
# "d:\\desktop\\内部人才市场简历模板.docx"; the file extension selects the parser
# in the __main__ block.
path = "d:\\desktop\\内部人才市场简历模板.pdf"

# Field labels that may appear in the template's tables. A table cell whose
# whitespace-stripped text is in this list is treated as a label, not a value.
keywords = [
    # personal details
    "姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌",
    "参加工作时间", "外语水平", "专业技术资格(取得时间)", "计算机水平",
    "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话",
    # the same label, both truncated and complete
    "对报名岗位认识及工作", "对报名岗位认识及工作设想",
    # career intentions
    "意向地区", "意向岗位", "意向单位", "意向专业",
    # certificates
    "职业证书", "资格等级", "取得日期",
    # education
    "学校/培训机构", "专业", "起始时间", "毕业时间",
    # family members
    "姓名", "职业", "与本人关系",
]
|
|
|
|
|
|
def parse_line(line):
|
|
|
result = []
|
|
@@ -98,6 +102,76 @@ def parse_layout(path):
|
|
|
return result
|
|
|
|
|
|
|
|
|
# (header label, section tag) pairs: when a data row's header contains the
# label, the row is tagged with the section. Only the first match applies.
_PDF_SECTION_MARKERS = (
    ("学校/培训机构", "学习经历"),
    ("与本人关系", "家庭成员"),
    ("意向地区", "职业发展管理"),
    ("职业证书", "职业资格证书"),
)

# Matches "key:" (allowing up to two non-word characters before the colon) so
# a line break can be inserted before every key found inside one word.
_PDF_KEY_VALUE_RE = re.compile(r'(\w+)\W{0,2}:')


def _compact(cell):
    """Return *cell* with all whitespace removed; ``None``/empty gives ``""``."""
    return ''.join(cell.split()) if cell else ''


def _row_to_schema(header, row):
    """Pair header labels with the cells of *row* and tag the row's section."""
    # Columns whose header label is empty carry no field name and are dropped.
    schema = {label: cell for label, cell in zip(header, row) if label}
    for marker, section in _PDF_SECTION_MARKERS:
        if marker in schema:
            schema[section] = section
            break
    return schema


def _parse_job_words(words):
    """Group "key:value" words into work-experience records.

    A new record starts whenever a key that already has a non-empty value in
    the current record shows up again. Words without a colon are appended to
    "工作描述" when that was the most recently seen key, and ignored otherwise.
    The record under construction is always appended at the end, even if it
    only holds the "工作经历" marker.
    """
    jobs = []
    job = {"工作经历": "工作经历"}
    last_key = None
    for word in words:
        # Normalise the full-width colon (U+FF1A) to ASCII before matching.
        text = word.replace("\uff1a", ":")
        if ":" not in text:
            if last_key == "工作描述":
                job["工作描述"] += '\n' + text.strip()
            continue
        for line in _PDF_KEY_VALUE_RE.sub(r'\n\1:', text).split("\n"):
            if not line.strip():
                continue
            parts = line.split(":")
            if len(parts) < 2:
                # Text in front of the first "key:" of this word has no colon;
                # keep it only when it continues a job description.
                if last_key == "工作描述":
                    job["工作描述"] += '\n' + line.strip()
                continue
            name = parts[0].strip()
            if job.get(name):
                jobs.append(job)
                job = {"工作经历": "工作经历"}
            job[name] = parts[1].strip()
            last_key = name
    jobs.append(job)
    return jobs


def parse_pdf_layout(path):
    """Extract résumé fields from the PDF template at *path*.

    Every table row on every page is classified by comparing its cells, with
    whitespace removed, against the module-level ``keywords``:

    * all non-empty cells are keywords: the row is remembered as the header
      for the data rows that follow;
    * some, but not all, non-empty cells are keywords: the row is handed to
      ``parse_line`` and its results are added to the output;
    * no cell is a keyword: the row is zipped with the last header into a
      dict and tagged with its section. Such rows are skipped if no header
      has been seen yet.

    After the tables, the words of every page are scanned for "key:value"
    text and grouped into work-experience dicts (see ``_parse_job_words``).

    Args:
        path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list holding the items produced by ``parse_line``, the data-row
        dicts and the work-experience dicts, in that order of discovery.
    """
    rows = []
    words = []
    # One pass over the document collects both table rows and words.
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                rows.extend(table)
            words.extend(word['text'] for word in page.extract_words())

    known = set(keywords)
    result = []
    header = None
    for row in rows:
        # Empty and None cells take no part in the classification.
        present = [_compact(cell) for cell in row if cell]
        if all(cell in known for cell in present):
            header = [_compact(cell) for cell in row]
        elif any(cell in known for cell in present):
            result.extend(parse_line(row))
        elif header is not None:
            result.append(_row_to_schema(header, row))

    result.extend(_parse_job_words(words))
    return result
|
|
|
+
|
|
|
# 格式化数据
|
|
|
def formatter(datalist):
|
|
|
result = dict()
|
|
@@ -195,5 +269,8 @@ def formatter(datalist):
|
|
|
return result
|
|
|
|
|
|
if __name__ == "__main__":
    # Pick the parser from the file extension. The comparison ignores case
    # because the configured path is a Windows path, where "X.DOCX" and
    # "x.docx" name the same file. Anything that is not .docx is parsed as PDF.
    if path.lower().endswith(".docx"):
        pprint(formatter(parse_layout(path)))
    else:
        pprint(formatter(parse_pdf_layout(path)))
|
|
|
|