浏览代码

new file: custom.py
new file: irafa.py
modified: resume_parse.py
new file: srafa.py

sprivacy 3 年之前
父节点
当前提交
3e5f25e488
共有 4 个文件被更改,包括 470 次插入10 次删除
  1. 269 0
      tools/custom.py
  2. 61 0
      tools/irafa.py
  3. 10 10
      tools/resume_parse.py
  4. 130 0
      tools/srafa.py

+ 269 - 0
tools/custom.py

@@ -0,0 +1,269 @@
+# -*- coding: utf-8 -*-
+# @Author: privacy
+# @Date:   2022-07-11 09:21:24
+# @Last Modified by:   privacy
+# @Last Modified time: 2022-07-12 16:30:08
+import re
+import logging
+from pprint import pprint
+
+from docx import Document
+from docx.shared import Inches
+
+
+# Default input document, used when this module is run as a script.
+path = "d:\\desktop\\自定义.docx"
+# Alternative input kept for reference (commented out):
+# path = "d:\\desktop\\内部人才市场简历模板.docx"
+
+# Field labels recognised in resume tables.  parse_line() and parse_layout()
+# compare cell text against this list after removing all whitespace from it.
+# NOTE(review): several entries occur more than once (for example the
+# parenthesised labels and "健康状况").  That is harmless for membership
+# tests; the repeated parenthesised entries may originally have differed in
+# full-width vs. half-width brackets -- confirm against the real file.
+keywords = [
+	"姓名",
+	"性别",
+	"出生年月",
+	"出生日期",
+	"民族",
+	"籍贯",
+	"户籍地",
+	"健康状况",
+	"政治面貌(加入时间)",
+	"政治面貌(加入时间)",
+	"参加工作时间",
+	"健康状况",
+	"外语水平",
+	"专业技术资格(取得时间)",
+	"专业技术资格(取得时间)",
+	"职业技能等级(取得时间)",
+	"职业技能等级(取得时间)",
+	"熟悉专业有何专长",
+	"学历院校",
+	"初始学历、专业",
+	"初始学历毕业院校及毕业时间",
+	"最高学历、专业",
+	"最高学历毕业院校及毕业时间",
+	"工作单位",
+	"现任职务",
+	"任职时间",
+	"提职时间",
+	"联系电话",
+	"邮箱地址",
+	"对报名岗位认识及工作设想",
+	"意向地区",
+	"意向岗位",
+	"其他意向岗位",
+	"意向单位",
+	"意向专业",
+	"学习经历",
+	"起止时间",
+	"学校","专业","学历","学位","研究方向","是否全日制",
+	"培训经历",
+	"培训类型","机构","内容","成绩","证书名称",
+	"工作经历",
+	"工作单位","职务","部门","证明人","备注",
+	"项目经历",
+	"项目名称","项目职务","项目描述","项目职责","项目成果",
+	"获得职业资格证书情况",
+	"获得日期","名称","证书编码/文号","授予单位",
+	"奖惩情况",
+	"项目","时间","项目单位","证明材料",
+	"主要工作业绩(500字以内)",
+	"主要工作业绩(500字以内)",
+	"自我评价",
+	"近三年年度考核结果",
+	"主要家庭成员及社会关系",
+	"称谓",
+	"其他情况说明",
+	"工作单位及职务",
+	"政治面貌",
+	"职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系", "计算机水平"
+]
+
+def parse_line(line):
+    """Pair each keyword cell of a row with the value cell that follows it.
+
+    Cells are scanned left to right.  A non-empty cell whose text, with all
+    whitespace removed, is in the module-level ``keywords`` list becomes the
+    pending key.  The next non-empty cell that is not itself a keyword is
+    stored as that key's value and the pending key is cleared.  A keyword
+    followed by another keyword before any value is dropped.
+
+    Args:
+        line: Iterable of cell texts for one table row; empty cells are
+            skipped.
+
+    Returns:
+        A list of single-entry dicts ``{keyword: raw_cell_text}`` in the
+        order the pairs were found.
+    """
+    pairs = []
+    pending = None
+    for text in line:
+        if not text:
+            continue
+        squeezed = ''.join(text.split())
+        if squeezed in keywords:
+            pending = squeezed
+        elif pending:
+            pairs.append({pending: text})
+            pending = None
+    return pairs
+
+
+def _squeeze(text):
+    """Return *text* with every whitespace character removed."""
+    return ''.join(text.split())
+
+
+def _read_table_rows(path):
+    """Collect the cell texts of every table row in the .docx file at *path*.
+
+    Within a row a cell text is kept only the first time it occurs, so
+    repeated texts collapse into a single entry (presumably to fold merged
+    cells, which python-docx reports once per spanned column -- TODO confirm).
+
+    Returns:
+        A list of rows, each a list of cell-text strings, in document order.
+    """
+    rows = []
+    for table in Document(path).tables:
+        for row in table.rows:
+            texts = []
+            for cell in row.cells:
+                text = cell.text
+                if text not in texts:
+                    texts.append(text)
+            rows.append(texts)
+    return rows
+
+
+def parse_layout(path):
+    """Extract keyword/value records from the tables of a .docx resume.
+
+    Every table row falls into one of three classes:
+
+    * keyword row -- every non-empty cell is a keyword.  The row is remembered
+      (whitespace-stripped) as the header for the data rows that follow.
+    * inline row -- contains values, and more than a third of its cells are
+      keywords.  It is handed to ``parse_line`` to pair each keyword with the
+      value next to it.
+    * data row -- contains values and few or no keywords.  Its cells are
+      zipped with the most recent keyword row into a single dict.
+
+    Args:
+        path: Path of the .docx file to read.
+
+    Returns:
+        A list of dicts: single-entry dicts from inline rows and multi-entry
+        dicts from data rows, in document order.
+    """
+    result = []
+    header = None  # whitespace-stripped texts of the most recent keyword row
+    for row in _read_table_rows(path):
+        has_value = any(c and _squeeze(c) not in keywords for c in row)
+        if not has_value:
+            header = [_squeeze(c) for c in row]
+            continue
+
+        keyword_count = sum(1 for c in row if c and _squeeze(c) in keywords)
+        if keyword_count > len(row) / 3:
+            result.extend(parse_line(row))
+        elif header is None:
+            # A data row (e.g. a title line) ahead of any keyword row has no
+            # header to be zipped with; previously this raised TypeError.
+            logging.getLogger(__name__).warning(
+                "skipping row with no preceding keyword row: %r", row)
+        else:
+            result.append({k: v for k, v in zip(header, row) if k})
+    return result
+
+
+# Format the parsed data
+
+# Top-level fields rewritten as an ISO-style date whose day is always 01.
+_DATE_FIELDS = ("出生年月", "任职时间", "参加工作时间")
+
+# (combined field, school field, graduation-date field): the combined value
+# is split into the two separate fields and then removed.
+_SCHOOL_AND_DATE_FIELDS = (
+    ("最高学历毕业院校及毕业时间", "最高学历毕业院校", "最高学历毕业时间"),
+    ("初始学历毕业院校及毕业时间", "初始学历毕业院校", "初始学历毕业时间"),
+)
+
+# Sections whose entries carry a "起止时间" start/end period.
+_PERIOD_SECTIONS = ("学习经历", "培训经历", "工作经历", "项目经历")
+
+# (section, field) pairs whose entries carry a single year-month value.
+_MONTH_FIELDS = (
+    ("获得职业资格证书情况", "获得日期"),
+    ("奖惩情况", "时间"),
+    ("主要家庭成员及社会关系", "出生年月"),
+)
+
+
+def _to_date(text, keep_day=False):
+    """Build 'YYYY-MM-DD' from the 1-3 digit groups in *text*, else None.
+
+    Missing month/day default to 01.  A third group is used as the day only
+    when *keep_day* is true; otherwise the day is always 01.
+    """
+    nums = [int(n) for n in re.findall(r'\d+', text)]
+    if not 1 <= len(nums) <= 3:
+        return None
+    month = nums[1] if len(nums) > 1 else 1
+    day = nums[2] if keep_day and len(nums) == 3 else 1
+    return "{:4d}-{:02d}-{:02d}".format(nums[0], month, day)
+
+
+def _to_period(text):
+    """Build 'YYYY-MM~YYYY-MM' when *text* has exactly four digit groups."""
+    nums = [int(n) for n in re.findall(r'\d+', text)]
+    if len(nums) != 4:
+        return None
+    return "{:4d}-{:02d}~{:4d}-{:02d}".format(*nums)
+
+
+def _to_month_start(text):
+    """Build 'YYYY-MM-01' when *text* has exactly two digit groups."""
+    nums = [int(n) for n in re.findall(r'\d+', text)]
+    if len(nums) != 2:
+        return None
+    return "{:4d}-{:02d}-01".format(*nums)
+
+
+def _reformat_entries(result, section, field, convert):
+    """Apply *convert* to ``entry[field]`` for every dict in ``result[section]``.
+
+    Does nothing when the section is not a list (for example when the section
+    keyword was parsed with free text as its value), and leaves a value alone
+    when *convert* returns None.
+    """
+    entries = result.get(section)
+    if not isinstance(entries, list):
+        return
+    for entry in entries:
+        if not isinstance(entry, dict):
+            continue
+        raw = entry.get(field)
+        if raw and isinstance(raw, str):
+            formatted = convert(raw)
+            if formatted is not None:
+                entry[field] = formatted
+
+
+def formatter(datalist):
+    """Merge parsed rows into one record and normalise its date fields.
+
+    Single-entry dicts are copied into the result as plain fields.  A
+    multi-entry dict is treated as one row of a section: any key whose value,
+    with whitespace removed, equals the key itself names the section.  The
+    remaining entries are appended to that section's list.  Multi-entry dicts
+    with no such key are ignored.  The dicts in *datalist* are not modified.
+
+    Args:
+        datalist: List of dicts, as returned by ``parse_layout``.
+
+    Returns:
+        A dict mapping field names to strings and section names to lists of
+        dicts, with recognised date values rewritten in dashed numeric form.
+        Values whose digit groups do not fit the expected pattern are left
+        unchanged.
+    """
+    result = dict()
+
+    for d in datalist:
+        if len(d) == 1:
+            result.update(d)
+            continue
+        sections = [k for k, v in d.items()
+                    if isinstance(v, str) and k == "".join(v.split())]
+        if not sections:
+            continue
+        # Work on a copy so the caller's dict keeps its section label.
+        entry = {k: v for k, v in d.items() if k not in sections}
+        for k in sections:
+            if result.get(k):
+                result[k].append(entry)
+            else:
+                result[k] = [entry]
+
+    for field in _DATE_FIELDS:
+        value = result.get(field)
+        if value and isinstance(value, str):
+            formatted = _to_date(value)
+            if formatted is not None:
+                result[field] = formatted
+
+    for combined, school_field, date_field in _SCHOOL_AND_DATE_FIELDS:
+        value = result.get(combined)
+        if value and isinstance(value, str):
+            words = re.findall(r'\w+', value)
+            if words:
+                # The first run of word characters is taken as the school.
+                result[school_field] = words[0]
+            formatted = _to_date(value, keep_day=True)
+            if formatted is not None:
+                result[date_field] = formatted
+            result.pop(combined)
+
+    for section in _PERIOD_SECTIONS:
+        _reformat_entries(result, section, "起止时间", _to_period)
+
+    for section, field in _MONTH_FIELDS:
+        _reformat_entries(result, section, field, _to_month_start)
+
+    return result
+
+
+
+# Script entry point: parse the document at the module-level `path` and
+# pretty-print the formatted record.
+if __name__ == '__main__':
+    pprint(formatter(parse_layout(path)))
+
+

+ 61 - 0
tools/irafa.py

@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+# @Author: privacy
+# @Date:   2022-07-07 13:12:17
+# @Last Modified by:   privacy
+# @Last Modified time: 2022-07-08 17:52:09
+
+
+from docx import Document
+from docx.shared import Inches
+
+# Input document read by this script.
+path = "d:\\desktop\\内部人才市场简历模板.docx"
+
+# Field labels recognised in the document's tables; cell text is compared
+# against this list after all whitespace has been removed.  "姓名" is listed
+# twice, which is harmless for membership tests.
+keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格(取得时间)", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]
+
+def parse_line(line):
+    """Turn one table row into a list of ``{keyword: value}`` dicts.
+
+    Walks the cells in order.  A non-empty cell that is a keyword once its
+    whitespace is removed (see the module-level ``keywords``) is remembered
+    as the current label.  The next non-empty, non-keyword cell becomes that
+    label's value, after which the label is forgotten.
+
+    Args:
+        line: Iterable of cell texts; empty cells are ignored.
+
+    Returns:
+        List of single-entry dicts, one per keyword/value pair found.
+    """
+    found = []
+    label = None
+    for content in line:
+        if not content:
+            continue
+        compact = ''.join(content.split())
+        if compact in keywords:
+            label = compact
+            continue
+        if label:
+            found.append({label: content})
+            label = None
+    return found
+
+# Read every table row of the document into `lo`, keyed by a running index.
+doc = Document(path)
+tables = doc.tables
+lo = {}
+for _table in tables:
+    for row in _table.rows:
+        lo[len(lo)] = [cell.text for cell in row.cells]
+
+# Walk the rows in order and print what each one yields:
+#   * keyword row (every non-empty cell is a keyword): remembered as header;
+#   * row mixing keywords and values: printed as keyword/value pairs;
+#   * row with values only: zipped with the last header and printed.
+kwln = -1       # length of the most recent keyword row; -1 until one is seen
+kwline = None   # cell texts of the most recent keyword row
+for cells in lo.values():
+    is_keyword = [bool(c) and ''.join(c.split()) in keywords for c in cells]
+    has_value = any(c and not kw for c, kw in zip(cells, is_keyword))
+    if not has_value:
+        kwline = cells
+        kwln = len(cells)
+    elif any(is_keyword):
+        print(parse_line(cells))
+    elif kwline is not None:
+        # Rows that precede any keyword row have no header to pair with and
+        # are skipped; zipping them with None used to raise TypeError.
+        print({k: v for k, v in zip(kwline, cells) if k})

+ 10 - 10
tools/resume_parse.py

@@ -86,7 +86,7 @@ block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经
 
 
 
-# 基本信息(已完成)
+# 基本信息(旧版)
 def get_base_info_old(lines):
     logger.info(lines)
     schema = {
@@ -122,8 +122,8 @@ def get_base_info_old(lines):
             schema['手机号码'] = re.search(r'\W(1[\d]{10})\W', w).group(1) if re.search(r'\W(1[\d]{10})\W', w) else None
         # if not schema.get('籍贯'):
         #     schema['籍贯'] = re.search(r'[籍贯::]{3,}(\w{2,5})', w).group(1) if re.search(r'[籍贯::]{3,}(\w{2,})', w) else None
-        # if not schema.get('出生年月'):
-        #     schema['出生年月'] = re.search(r'\d{4}[./年\-]\d{1,2}[月]', w).group() if re.search(r'\d{4}[./年\-]\d{1,2}[月]', w) else None
+        if not schema.get('出生年月'):
+            schema['出生年月'] = re.search(r'\d{4}[./年\-]\d{1,2}[月]', w).group() if re.search(r'\d{4}[./年\-]\d{1,2}[月]', w) else None
         # if not schema.get('当前职位'):
         #     schema['当前职位'] = re.search(r'[当前职位: ]{3,}(\w)+', w).group() if re.search(r'[当前职位: ]{3,}(\w)+', w) else None
         # if not schema.get('参加工作时间'):
@@ -131,7 +131,7 @@ def get_base_info_old(lines):
     return {key:value for key, value in schema.items() if value}
 
 
-# 基本信息(OIE已完成)
+# 基本信息(OIE 已完成)
 def get_base_info(lines):
     if not lines:
         return
@@ -664,7 +664,7 @@ def get_job_list(lines):
     return job_list
 
 
-# 项目经历 (已完成)(弃用)
+# 项目经历 (已弃用)
 # 项目名称未知
 def get_pro_list_old(lines):
     logger.info(lines)
@@ -706,7 +706,7 @@ def get_pro_list_old(lines):
     return pro_list
 
 
-# 项目经历 (UIE)
+# 项目经历 (UIE 已完成)
 def get_pro_list(lines):
     logger.info(lines)
 
@@ -925,7 +925,7 @@ def get_lag_list(lines):
     return job_list
 
 
-# 家庭情况(弃用)
+# 家庭情况(弃用)
 def get_fam_list(lines):
     job_list = []
     fam_dict = {}
@@ -949,7 +949,7 @@ def get_fam_list(lines):
     return job_list
 
 
-# 证书情况  时间+证书名称 (已完成
+# 证书情况  时间+证书名称 (旧版
 def get_cet_list_old(lines):
     logger.info(lines)
 
@@ -994,7 +994,7 @@ def get_cet_list(lines):
     return cet_list
 
 
-# 获奖情况  时间+获奖名称 (已完成
+# 获奖情况  时间+获奖名称 (旧版
 def get_prize_list_old(lines):
     logger.info(lines)
 
@@ -1310,7 +1310,7 @@ def parse_table_from_pdf(path, save_dir):
                     row_list = []
                     for word in line:
                         row_list.append(word)
-                    lo[len(row.keys())] = row_list
+                    lo[len(lo.keys())] = row_list
     # 去除空项
     for key in list(lo.keys()):
         if "" in lo[key]:

+ 130 - 0
tools/srafa.py

@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+# @Author: privacy
+# @Date:   2022-07-07 12:59:42
+# @Last Modified by:   privacy
+# @Last Modified time: 2022-07-08 17:49:57
+# import pdb
+from pprint import pprint
+import pandas as pd
+import pdfplumber
+
+# Input PDF read by this script.
+path = "d:\\desktop\\社招简历模板.pdf"
+
+# Labels recognised in the PDF's tables; cell text is compared against this
+# list after all whitespace has been removed.  Some labels repeat because
+# they occur in several sections; that is harmless for membership tests.
+# Every entry must end with a comma: adjacent string literals with no comma
+# between them are silently concatenated into one string.
+keywords = ['姓名',
+    '性别',
+    '出生日期',
+    '一寸照片',
+    '民族',
+    '出生地',
+    '政治面貌(加入时间)',
+    '参加工作时间',
+    '健康状况',
+    '外语水平',
+    '初始学历、专业',
+    '最高学历、专业',
+    '初始学历毕业院校及毕业时间',
+    '最高学历毕业院校及毕业时间',
+    '专业技术资格(取得时间)',
+    '职业技能等级(取得时间)',
+    '熟悉专业有何专长',
+    '工作单位',
+    '现任职务',
+    '任职时间',
+    '提职时间',
+    '意向岗位',
+    '联系电话',
+    '学习经历',
+    '起止时间',
+    '学校',
+    '专业',
+    '学历',
+    '学位',
+    '研究方向',
+    '是否全日制',
+    '培训',
+    '起止时间',
+    '培训类型',
+    '机构',
+    '内容',
+    '成绩',
+    '证书名称',
+    '经历',
+    '工作经历',
+    '起止时间',
+    '工作单位',
+    '职务',
+    '部门',
+    '证明人',
+    '备注',
+    '对报名岗位认识及工作设想',
+    '自我评价及主要工作业绩',
+    '获得职业资格证书情况',
+    '获得日期',
+    '名称',
+    '证书编码/文号',
+    '授予单位',
+    '备注',
+    '奖惩',
+    '项目',
+    '时间',
+    '项目单位',
+    '证明材料',
+    '情况',
+    '主要家庭成员及社会关系',
+    '称谓',
+    '出生年月',
+    '政治面貌',
+    '工作单位及职务',
+    '其他情况说明',
+    '诚信承诺',
+    '本人承诺,以上信息均与事实相符,若有虚假,愿承担一切后果并自愿取消应聘资格。',
+    '承诺人:',
+    '社会招聘工作办公室资格审查意见']
+
+def parse_line(line):
+    """Extract ``{keyword: value}`` pairs from the cells of one table row.
+
+    A non-empty cell that matches an entry of the module-level ``keywords``
+    after its whitespace is removed opens a pair.  The first non-empty,
+    non-keyword cell after it closes the pair with its raw text.  Empty and
+    ``None`` cells are skipped.
+
+    Args:
+        line: Iterable of cell values for one row.
+
+    Returns:
+        List of single-entry dicts in the order the pairs were completed.
+    """
+    collected = []
+    open_key = None
+    for item in line:
+        if not item:
+            continue
+        stripped = ''.join(item.split())
+        if stripped in keywords:
+            open_key = stripped
+        elif open_key:
+            collected.append({open_key: item})
+            open_key = None
+    return collected
+
+
+# Read every row of every table on every page into `lo`, keyed by a running
+# index.  Cells may be None as well as strings, so all checks below test
+# truthiness before touching the text.
+lo = {}
+with pdfplumber.open(path) as pdf:
+    for page in pdf.pages:
+        for table in page.extract_tables():
+            for line in table:
+                lo[len(lo)] = line
+
+# Walk the rows in order and print what each one yields:
+#   * keyword row (every non-empty cell is a keyword): remembered as header;
+#   * row mixing keywords and values: printed as keyword/value pairs;
+#   * row with values only: zipped with the last header and printed.
+kwln = -1       # length of the most recent keyword row; -1 until one is seen
+kwline = None   # cells of the most recent keyword row
+for cells in lo.values():
+    is_keyword = [bool(c) and ''.join(c.split()) in keywords for c in cells]
+    has_value = any(c and not kw for c, kw in zip(cells, is_keyword))
+    if not has_value:
+        kwline = cells
+        kwln = len(cells)
+    elif any(is_keyword):
+        print(parse_line(cells))
+    elif kwline is not None:
+        # Rows that precede any keyword row have no header to pair with and
+        # are skipped; zipping them with None used to raise TypeError.
+        print({k: v for k, v in zip(kwline, cells) if k})
+
+
+