3 年之前 · 3e5f25e488
--- a/tools/custom.py
+++ b/tools/custom.py
@@ -0,0 +1,269 @@
 
				+# -*- coding: utf-8 -*-

			
 
				+# @Author: privacy

			
 
				+# @Date:   2022-07-11 09:21:24

			
 
				+# @Last Modified by:   privacy

			
 
				+# @Last Modified time: 2022-07-12 16:30:08

			
 
				+import re

			
 
				+import logging

			
 
				+from pprint import pprint

			
 
				+

			
 
				+from docx import Document

			
 
				+from docx.shared import Inches

			
 
				+

			
 
				+

			
 
				+path = "d:\\desktop\\自定义.docx"

			
 
				+# path = "d:\\desktop\\内部人才市场简历模板.docx"

			
 
				+

			
 
				+keywords = [

			
 
				+	"姓名",

			
 
				+	"性别",

			
 
				+	"出生年月",

			
 
				+	"出生日期",

			
 
				+	"民族",

			
 
				+	"籍贯",

			
 
				+	"户籍地",

			
 
				+	"健康状况",

			
 
				+	"政治面貌（加入时间）",

			
 
				+	"政治面貌(加入时间)",

			
 
				+	"参加工作时间",

			
 
				+	"健康状况",

			
 
				+	"外语水平",

			
 
				+	"专业技术资格（取得时间）",

			
 
				+	"专业技术资格(取得时间)",

			
 
				+	"职业技能等级（取得时间）",

			
 
				+	"职业技能等级(取得时间)",

			
 
				+	"熟悉专业有何专长",

			
 
				+	"学历院校",

			
 
				+	"初始学历、专业",

			
 
				+	"初始学历毕业院校及毕业时间",

			
 
				+	"最高学历、专业",

			
 
				+	"最高学历毕业院校及毕业时间",

			
 
				+	"工作单位",

			
 
				+	"现任职务",

			
 
				+	"任职时间",

			
 
				+	"提职时间",

			
 
				+	"联系电话",

			
 
				+	"邮箱地址",

			
 
				+	"对报名岗位认识及工作设想",

			
 
				+	"意向地区",

			
 
				+	"意向岗位",

			
 
				+	"其他意向岗位",

			
 
				+	"意向单位",

			
 
				+	"意向专业",

			
 
				+	"学习经历",

			
 
				+	"起止时间",

			
 
				+	"学校","专业","学历","学位","研究方向","是否全日制",

			
 
				+	"培训经历",

			
 
				+	"培训类型","机构","内容","成绩","证书名称",

			
 
				+	"工作经历",

			
 
				+	"工作单位","职务","部门","证明人","备注",

			
 
				+	"项目经历",

			
 
				+	"项目名称","项目职务","项目描述","项目职责","项目成果",

			
 
				+	"获得职业资格证书情况",

			
 
				+	"获得日期","名称","证书编码/文号","授予单位",

			
 
				+	"奖惩情况",

			
 
				+	"项目","时间","项目单位","证明材料",

			
 
				+	"主要工作业绩（500字以内）",

			
 
				+	"主要工作业绩(500字以内)",

			
 
				+	"自我评价",

			
 
				+	"近三年年度考核结果",

			
 
				+	"主要家庭成员及社会关系",

			
 
				+	"称谓",

			
 
				+	"其他情况说明",

			
 
				+	"工作单位及职务",

			
 
				+	"政治面貌",

			
 
				+	"职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系", "计算机水平"

			
 
				+]

			
 
				+

			
 
				+def parse_line(line):

			
 
				+    result = []

			
 
				+    key = None

			
 
				+    for cell in line:

			
 
				+        if cell and ''.join(cell.split()) in keywords:

			
 
				+            key = ''.join(cell.split())

			
 
				+        elif cell and key:

			
 
				+            schema = {key:cell}

			
 
				+            result.append(schema)

			
 
				+            key = None

			
 
				+    return result

			
 
				+

			
 
				+

			
 
				+def parse_layout(path):

			
 
				+    result = []

			
 
				+    doc = Document(path)

			
 
				+    lo = {}

			
 
				+    tables = doc.tables

			
 
				+    for _table in tables[:]:

			
 
				+        for i, row in enumerate(_table.rows[:]):

			
 
				+            row_content = []

			
 
				+            for cell in row.cells[:]:

			
 
				+                c = cell.text

			
 
				+                # row_content.append(c)

			
 
				+                if c not in row_content:

			
 
				+                	row_content.append(c)

			
 
				+            lo[len(lo.keys())] = row_content

			
 
				+

			
 
				+    kwln = -1

			
 
				+    kwline = None

			
 
				+    for key in lo.keys():

			
 
				+        # pdb.set_trace()

			
 
				+        for val in lo[key]:# 通过全关键词，判断此行是否为关键词行

			
 
				+            if val and ''.join(val.split()) not in keywords:# 有非关键字元素，非关键词行，判断是否为关键词行元素

			
 
				+                # pdb.set_trace()

			
 
				+                perc = 0

			
 
				+                for c in lo[key]:

			
 
				+                    # pdb.set_trace()

			
 
				+                    if c and (''.join(c.split()) in keywords):

			
 
				+                        perc += 1

			
 
				+                    if c and (''.join(c.split()) in keywords) and (perc > len(lo[key])/3):# 非关键词行元素

			
 
				+                        # print(c)

			
 
				+                        # print(perc)

			
 
				+                        # print(lo[key])

			
 
				+                        perc = 0

			
 
				+                        result.extend(parse_line(lo[key]))

			
 
				+                        break

			
 
				+                else:# 关键词行元素

			
 
				+                    schema = dict()

			
 
				+                    for key, val in zip(kwline, lo[key]):

			
 
				+                        if key:

			
 
				+                            schema[key] = val

			
 
				+                    result.append(schema)

			
 
				+                    break

			
 
				+                break

			
 
				+        else:

			
 
				+            # print("{}\t\t此行为关键词行".format(lo[key]))

			
 
				+            try:

			
 
				+                kwline = [''.join(cell.split()) for cell in lo[key]]

			
 
				+            except Exception as e:

			
 
				+                kwline = lo[key]

			
 
				+            kwln = len(lo[key])

			
 
				+    return result

			
 
				+

			
 
				+

			
 
				+# 格式化数据

			
 
				+def formatter(datalist):

			
 
				+    result = dict()

			
 
				+

			
 
				+    for d in datalist:

			
 
				+        if len(d) == 1:

			
 
				+            for key in d.keys():

			
 
				+                result[key] = d[key]

			
 
				+        else:

			
 
				+            for k in list(d.keys()):

			
 
				+                if k == "".join(d[k].split()):

			
 
				+                    d.pop(k)

			
 
				+                    if result.get(k):

			
 
				+                        result[k].append(d)

			
 
				+                    else:

			
 
				+                        result[k] = [d]

			
 
				+

			
 
				+    if result.get("出生年月"):

			
 
				+        dates = re.findall(r'\d+' , result["出生年月"])

			
 
				+        if len(dates) == 1:

			
 
				+            result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				+        elif len(dates) == 2:

			
 
				+            result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+        elif len(dates) == 3:

			
 
				+            result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+

			
 
				+    if result.get("任职时间"):

			
 
				+        dates = re.findall(r'\d+' , result["任职时间"])

			
 
				+        if len(dates) == 1:

			
 
				+            result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				+        elif len(dates) == 2:

			
 
				+            result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+        elif len(dates) == 3:

			
 
				+            result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+

			
 
				+    if result.get("参加工作时间"):

			
 
				+        dates = re.findall(r'\d+' , result["参加工作时间"])

			
 
				+        if len(dates) == 1:

			
 
				+            result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				+        elif len(dates) == 2:

			
 
				+            result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+        elif len(dates) == 3:

			
 
				+            result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+

			
 
				+    if result.get("最高学历毕业院校及毕业时间"):

			
 
				+        dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])

			
 
				+        ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])

			
 
				+        if len(ws) > 0:

			
 
				+            result["最高学历毕业院校"] = ws[0]

			
 
				+        if len(dates) == 1:

			
 
				+            result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				+        elif len(dates) == 2:

			
 
				+            result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+        elif len(dates) == 3:

			
 
				+            result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

			
 
				+        result.pop("最高学历毕业院校及毕业时间")

			
 
				+

			
 
				+    if result.get("初始学历毕业院校及毕业时间"):

			
 
				+        dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])

			
 
				+        ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])

			
 
				+        if len(ws) > 0:

			
 
				+            result["初始学历毕业院校"] = ws[0]

			
 
				+        if len(dates) == 1:

			
 
				+            result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				+        elif len(dates) == 2:

			
 
				+            result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+        elif len(dates) == 3:

			
 
				+            result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

			
 
				+        result.pop("初始学历毕业院校及毕业时间")

			
 
				+

			
 
				+    if result.get("学习经历"):

			
 
				+        for idx, edu in enumerate(result["学习经历"]):

			
 
				+            if edu.get("起止时间"):

			
 
				+                dates = re.findall(r'\d+' , edu["起止时间"])

			
 
				+                if len(dates) == 4:

			
 
				+                    result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

			
 
				+

			
 
				+    if result.get("培训经历"):

			
 
				+        for idx, edu in enumerate(result["培训经历"]):

			
 
				+            if edu.get("起止时间"):

			
 
				+                dates = re.findall(r'\d+' , edu["起止时间"])

			
 
				+                if len(dates) == 4:

			
 
				+                    result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

			
 
				+

			
 
				+    if result.get("工作经历"):

			
 
				+        for idx, edu in enumerate(result["工作经历"]):

			
 
				+            if edu.get("起止时间"):

			
 
				+                dates = re.findall(r'\d+' , edu["起止时间"])

			
 
				+                if len(dates) == 4:

			
 
				+                    result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

			
 
				+

			
 
				+    if result.get("项目经历"):

			
 
				+        for idx, edu in enumerate(result["项目经历"]):

			
 
				+            if edu.get("起止时间"):

			
 
				+                dates = re.findall(r'\d+' , edu["起止时间"])

			
 
				+                if len(dates) == 4:

			
 
				+                    result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

			
 
				+

			
 
				+    if result.get("获得职业资格证书情况"):

			
 
				+        for idx, edu in enumerate(result["获得职业资格证书情况"]):

			
 
				+            if edu.get("获得日期"):

			
 
				+                dates = re.findall(r'\d+' , edu["获得日期"])

			
 
				+                if len(dates) == 2:

			
 
				+                    result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+

			
 
				+    if result.get("奖惩情况"):

			
 
				+        for idx, edu in enumerate(result["奖惩情况"]):

			
 
				+            if edu.get("时间"):

			
 
				+                dates = re.findall(r'\d+' , edu["时间"])

			
 
				+                if len(dates) == 2:

			
 
				+                    result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+

			
 
				+    if result.get("主要家庭成员及社会关系"):

			
 
				+        for idx, fam in enumerate(result["主要家庭成员及社会关系"]):

			
 
				+            if fam.get("出生年月"):

			
 
				+                dates = re.findall(r'\d+' , fam["出生年月"])

			
 
				+                if len(dates) == 2:

			
 
				+                    result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+

			
 
				+    return result

			
 
				+

			
 
				+

			
 
				+

			
 
# Script entry point: parse the default document at `path` and pretty-print
# the formatted result.
if __name__ == '__main__':
    pprint(formatter(parse_layout(path)))

			
 
				+

			
 
				+

			
--- a/tools/irafa.py
+++ b/tools/irafa.py
@@ -0,0 +1,61 @@
 
				+# -*- coding: utf-8 -*-

			
 
				+# @Author: privacy

			
 
				+# @Date:   2022-07-07 13:12:17

			
 
				+# @Last Modified by:   privacy

			
 
				+# @Last Modified time: 2022-07-08 17:52:09

			
 
				+

			
 
				+

			
 
				+from docx import Document

			
 
				+from docx.shared import Inches

			
 
				+

			
 
				+path = "d:\\desktop\\内部人才市场简历模板.docx"

			
 
				+

			
 
				+keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格（取得时间）", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]

			
 
				+

			
 
				+def parse_line(line):

			
 
				+    result = []

			
 
				+    key = None

			
 
				+    for cell in line:

			
 
				+        if cell and ''.join(cell.split()) in keywords:

			
 
				+            key = ''.join(cell.split())

			
 
				+        elif cell and key:

			
 
				+            schema = {key:cell}

			
 
				+            result.append(schema)

			
 
				+            key = None

			
 
				+    return result

			
 
				+

			
 
				+doc = Document(path)

			
 
				+lo = {}

			
 
				+tables = doc.tables

			
 
				+for _table in tables[:]:

			
 
				+    for i, row in enumerate(_table.rows[:]):

			
 
				+        row_content = []

			
 
				+        for cell in row.cells[:]:

			
 
				+            c = cell.text

			
 
				+            row_content.append(c)

			
 
				+        lo[len(lo.keys())] = row_content

			
 
				+

			
 
				+kwln = -1

			
 
				+kwline = None

			
 
				+for key in lo.keys():

			
 
				+    # pdb.set_trace()

			
 
				+    for val in lo[key]:# 通过全关键词，判断此行是否为关键词行

			
 
				+        if val and ''.join(val.split()) not in keywords:# 有非关键字元素，非关键词行，判断是否为关键词行元素

			
 
				+            # pdb.set_trace()

			
 
				+            for c in lo[key]:

			
 
				+                # pdb.set_trace()

			
 
				+                if c and ''.join(c.split()) in keywords:# 非关键词行元素

			
 
				+                    print(parse_line(lo[key]))

			
 
				+                    break

			
 
				+            else:# 关键词行元素

			
 
				+                schema = dict()

			
 
				+                for key, val in zip(kwline, lo[key]):

			
 
				+                    if key:

			
 
				+                        schema[key] = val

			
 
				+                print(schema)

			
 
				+                break

			
 
				+            break

			
 
				+    else:

			
 
				+        # print("此行为关键词行")

			
 
				+        kwline = lo[key]

			
 
				+        kwln = len(lo[key])

			
--- a/tools/resume_parse.py
+++ b/tools/resume_parse.py
@@ -86,7 +86,7 @@ block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经
 
				 
			
 
				 
			
 
				 
			
 
				-# 基本信息(已完成)
			
 
				+# 基本信息(旧版)
			
 
				 def get_base_info_old(lines):
			
 
				     logger.info(lines)
			
 
				     schema = {
			
@@ -122,8 +122,8 @@ def get_base_info_old(lines):
 
				             schema['手机号码'] = re.search(r'\W(1[\d]{10})\W', w).group(1) if re.search(r'\W(1[\d]{10})\W', w) else None
			
 
				         # if not schema.get('籍贯'):
			
 
				         #     schema['籍贯'] = re.search(r'[籍贯:：]{3,}(\w{2,5})', w).group(1) if re.search(r'[籍贯:：]{3,}(\w{2,})', w) else None
			
 
				-        # if not schema.get('出生年月'):
			
 
				-        #     schema['出生年月'] = re.search(r'\d{4}[./年\-]\d{1,2}[月]', w).group() if re.search(r'\d{4}[./年\-]\d{1,2}[月]', w) else None
			
 
				+        if not schema.get('出生年月'):
			
 
				+            schema['出生年月'] = re.search(r'\d{4}[./年\-]\d{1,2}[月]', w).group() if re.search(r'\d{4}[./年\-]\d{1,2}[月]', w) else None
			
 
				         # if not schema.get('当前职位'):
			
 
				         #     schema['当前职位'] = re.search(r'[当前职位： ]{3，}(\w)+', w).group() if re.search(r'[当前职位： ]{3，}(\w)+', w) else None
			
 
				         # if not schema.get('参加工作时间'):
			
@@ -131,7 +131,7 @@ def get_base_info_old(lines):
 
				     return {key:value for key, value in schema.items() if value}
			
 
				 
			
 
				 
			
 
				-# 基本信息(OIE已完成)
			
 
				+# 基本信息(OIE 已完成)
			
 
				 def get_base_info(lines):
			
 
				     if not lines:
			
 
				         return
			
@@ -664,7 +664,7 @@ def get_job_list(lines):
 
				     return job_list
			
 
				 
			
 
				 
			
 
				-# 项目经历 (已完成)(弃用)
			
 
				+# 项目经历 (已弃用)
			
 
				 # 项目名称未知
			
 
				 def get_pro_list_old(lines):
			
 
				     logger.info(lines)
			
@@ -706,7 +706,7 @@ def get_pro_list_old(lines):
 
				     return pro_list
			
 
				 
			
 
				 
			
 
				-# 项目经历 (UIE)
			
 
				+# 项目经历 (UIE 已完成)
			
 
				 def get_pro_list(lines):
			
 
				     logger.info(lines)
			
 
				 
			
@@ -925,7 +925,7 @@ def get_lag_list(lines):
 
				     return job_list
			
 
				 
			
 
				 
			
 
				-# 家庭情况（弃用）
			
 
				+# 家庭情况（已弃用）
			
 
				 def get_fam_list(lines):
			
 
				     job_list = []
			
 
				     fam_dict = {}
			
@@ -949,7 +949,7 @@ def get_fam_list(lines):
 
				     return job_list
			
 
				 
			
 
				 
			
 
				-# 证书情况  时间+证书名称 （已完成）
			
 
				+# 证书情况  时间+证书名称 （旧版）
			
 
				 def get_cet_list_old(lines):
			
 
				     logger.info(lines)
			
 
				 
			
@@ -994,7 +994,7 @@ def get_cet_list(lines):
 
				     return cet_list
			
 
				 
			
 
				 
			
 
				-# 获奖情况  时间+获奖名称 （已完成）
			
 
				+# 获奖情况  时间+获奖名称 （旧版）
			
 
				 def get_prize_list_old(lines):
			
 
				     logger.info(lines)
			
 
				 
			
@@ -1310,7 +1310,7 @@ def parse_table_from_pdf(path, save_dir):
 
				                     row_list = []
			
 
				                     for word in line:
			
 
				                         row_list.append(word)
			
 
				-                    lo[len(row.keys())] = row_list
			
 
				+                    lo[len(lo.keys())] = row_list
			
 
				     # 去除空项
			
 
				     for key in list(lo.keys()):
			
 
				         if "" in lo[key]:
			
--- a/tools/srafa.py
+++ b/tools/srafa.py
@@ -0,0 +1,130 @@
 
				+# -*- coding: utf-8 -*-

			
 
				+# @Author: privacy

			
 
				+# @Date:   2022-07-07 12:59:42

			
 
				+# @Last Modified by:   privacy

			
 
				+# @Last Modified time: 2022-07-08 17:49:57

			
 
				+# import pdb

			
 
				+from pprint import pprint

			
 
				+import pandas as pd

			
 
				+import pdfplumber

			
 
				+

			
 
				+path = "d:\\desktop\\社招简历模板.pdf"

			
 
				+

			
 
				+keywords = ['姓名',

			
 
				+    '性别',

			
 
				+    '出生日期',

			
 
				+    '一寸照片',

			
 
				+    '民族',

			
 
				+    '出生地',

			
 
				+    '政治面貌（加入时间）',

			
 
				+    '参加工作时间',

			
 
				+    '健康状况',

			
 
				+    '外语水平',

			
 
				+    '初始学历、专业',

			
 
				+    '最高学历、专业',

			
 
				+    '初始学历毕业院校及毕业时间',

			
 
				+    '最高学历毕业院校及毕业时间',

			
 
				+    '专业技术资格（取得时间）',

			
 
				+    '职业技能等级（取得时间）',

			
 
				+    '熟悉专业有何专长',

			
 
				+    '工作单位',

			
 
				+    '现任职务',

			
 
				+    '任职时间',

			
 
				+    '提职时间',

			
 
				+    '意向岗位',

			
 
				+    '联系电话',

			
 
				+    '学习经历',

			
 
				+    '起止时间',

			
 
				+    '学校',

			
 
				+    '专业',

			
 
				+    '学历',

			
 
				+    '学位',

			
 
				+    '研究方向',

			
 
				+    '是否全日制',

			
 
				+    '培训',

			
 
				+    '起止时间',

			
 
				+    '培训类型',

			
 
				+    '机构',

			
 
				+    '内容',

			
 
				+    '成绩',

			
 
				+    '证书名称',

			
 
				+    '经历',

			
 
				+    '工作经历',

			
 
				+    '起止时间',

			
 
				+    '工作单位',

			
 
				+    '职务',

			
 
				+    '部门',

			
 
				+    '证明人',

			
 
				+    '备注',

			
 
				+    '对报名岗位认识及工作设想',

			
 
				+    '自我评价及主要工作业绩',

			
 
				+    '获得职业资格证书情况',

			
 
				+    '获得日期',

			
 
				+    '名称',

			
 
				+    '证书编码/文号',

			
 
				+    '授予单位',

			
 
				+    '备注',

			
 
				+    '奖惩',

			
 
				+    '项目',

			
 
				+    '时间',

			
 
				+    '项目单位',

			
 
				+    '证明材料',

			
 
				+    '情况',

			
 
				+    '主要家庭成员及社会关系',

			
 
				+    '称谓',

			
 
				+    '出生年月',

			
 
				+    '政治面貌',

			
 
				+    '工作单位及职务',

			
 
				+    '其他情况说明',

			
 
				+    '诚信承诺',

			
 
				+    '本人承诺，以上信息均与事实相符，若有虚假，愿承担一切后果并自愿取消应聘资格。'

			
 
				+    '承诺人：'

			
 
				+    '社会招聘工作办公室资格审查意见']

			
 
				+

			
 
				+def parse_line(line):

			
 
				+    result = []

			
 
				+    key = None

			
 
				+    for cell in line:

			
 
				+        if cell and ''.join(cell.split()) in keywords:

			
 
				+            key = ''.join(cell.split())

			
 
				+        elif cell and key:

			
 
				+            schema = {key:cell}

			
 
				+            result.append(schema)

			
 
				+            key = None

			
 
				+    return result

			
 
				+

			
 
				+

			
 
				+lo = {}

			
 
				+with pdfplumber.open(path) as pdf:

			
 
				+        for page in pdf.pages:

			
 
				+            for table in page.extract_tables():

			
 
				+                for line in table:

			
 
				+                    lo[len(lo.keys())] = line

			
 
				+

			
 
				+kwln = -1

			
 
				+kwline = None

			
 
				+for key in lo.keys():

			
 
				+    # pdb.set_trace()

			
 
				+    for val in lo[key]:# 通过全关键词，判断此行是否为关键词行

			
 
				+        if val and ''.join(val.split()) not in keywords:# 有非关键字元素，非关键词行，判断是否为关键词行元素

			
 
				+            # pdb.set_trace()

			
 
				+            for c in lo[key]:

			
 
				+                # pdb.set_trace()

			
 
				+                if c and ''.join(c.split()) in keywords:# 非关键词行元素

			
 
				+                    print(parse_line(lo[key]))

			
 
				+                    break

			
 
				+            else:# 关键词行元素

			
 
				+                schema = dict()

			
 
				+                for key, val in zip(kwline, lo[key]):

			
 
				+                    if key:

			
 
				+                        schema[key] = val

			
 
				+                print(schema)

			
 
				+                break

			
 
				+            break

			
 
				+    else:

			
 
				+        # print("此行为关键词行")

			
 
				+        kwline = lo[key]

			
 
				+        kwln = len(lo[key])

			
 
				+

			
 
				+

			
 
				+