3 år sedan · e8bf2d77b9
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,6 +14,8 @@ RUN tar xvf Python-3.8.12.tgz && cd Python-3.8.12 && ./configure --prefix=/usr/l
 
				 
			
 
				 RUN /usr/local/python-3.8.12/bin/python3.8 -m venv venv && source venv/bin/activate && pip install --upgrade pip -i https://mirror.baidu.com/pypi/simple && pip install --upgrade paddlepaddle-2.3.0-cp38-cp38-linux_x86_64.whl paddlenlp -i https://mirror.baidu.com/pypi/simple && pip install -r requirements.txt -i https://mirror.baidu.com/pypi/simple
			
 
				 
			
 
				+RUN rm -f /usr/bin/python && ln -s /workspace/venv/bin/python /usr/bin/python
			
 
				+
			
 
				 EXPOSE 8320
			
 
				 
			
 
				 ENTRYPOINT ["/bin/sh"]
			
--- a/README.md
+++ b/README.md
@@ -1 +1,29 @@
 
				+### 安装 rar 解压工具

			
 
				+```bash

			
 
				 apt install unrar-free

			
 
				+```

			
 
				+

			
 
				+

			
 
				+

			
 
				+### 通用模板抽取

			
 
				+resume_parse.py

			
 
				+支持 doc, docx, pdf, txt, tar.gz, 7z, rar, zip

			
 
				+

			
 
				+### 社招简历模板

			
 
				+srafa.py

			
 
				+支持 docx, pdf

			
 
				+

			
 
				+### 内部人才简历模板

			
 
				+irafa.py

			
 
				+支持 docx, pdf

			
 
				+

			
 
				+### 自定义模板抽取

			
 
				+custom.py

			
 
				+支持 docx, pdf

			
 
				+

			
 
				+### 模型

			
 
				+##### model_100

			
 
				+使用base模型训练，效果较好，预测时间较长

			
 
				+

			
 
				+##### model_best

			
 
				+使用mini模型训练，速度快
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 
				+chardet==4.0.0

			
 
				 uvicorn

			
 
				 fastapi

			
 
				 py7zr

			
 
				-

			
 
				 rarfile

			
 
				 tarfile

			
 
				 zipfile

			
@@ -9,4 +9,6 @@ requests
 
				 python-docx

			
 
				 pdfminer

			
 
				 pdfplumber

			
 
				-paddlenlp
			
 
				+xlrd==1.2.0

			
 
				+paddlenlp==2.3.4

			
 
				+paddlepaddle==2.2.2
			
--- a/tools/custom.py
+++ b/tools/custom.py
@@ -2,333 +2,418 @@
 
				 # @Author: privacy

			
 
				 # @Date:   2022-07-11 09:21:24

			
 
				 # @Last Modified by:   privacy

			
 
				-# @Last Modified time: 2022-07-15 17:22:00

			
 
				+# @Last Modified time: 2022-07-18 14:54:53

			
 
				 

			
 
				 # 自定义模板

			
 
				 

			
 
				 import re

			
 
				 import json

			
 
				-import logging

			
 
				-from pprint import pprint

			
 
				+

			
 
				 import requests

			
 
				 from requests.adapters import HTTPAdapter

			
 
				+

			
 
				+import pdfplumber

			
 
				 from docx import Document

			
 
				-from docx.shared import Inches

			
 
				 

			
 
				 

			
 
				 path = "d:\\desktop\\自定义.docx"

			
 
				-

			
 
				-# 关键词字典

			
 
				-keywords = [

			
 
				-	"姓名",

			
 
				-	"性别",

			
 
				-	"出生年月",

			
 
				-	"出生日期",

			
 
				-	"民族",

			
 
				-	"籍贯",

			
 
				-	"户籍地",

			
 
				-	"健康状况",

			
 
				-	"政治面貌（加入时间）",

			
 
				-	"政治面貌(加入时间)",

			
 
				-	"参加工作时间",

			
 
				-	"健康状况",

			
 
				-	"外语水平",

			
 
				-	"专业技术资格（取得时间）",

			
 
				-	"专业技术资格(取得时间)",

			
 
				-	"职业技能等级（取得时间）",

			
 
				-	"职业技能等级(取得时间)",

			
 
				-	"熟悉专业有何专长",

			
 
				-	"学历院校",

			
 
				-	"初始学历、专业",

			
 
				-	"初始学历毕业院校及毕业时间",

			
 
				-	"最高学历、专业",

			
 
				-	"最高学历毕业院校及毕业时间",

			
 
				-	"工作单位",

			
 
				-	"现任职务",

			
 
				-	"任职时间",

			
 
				-	"提职时间",

			
 
				-	"联系电话",

			
 
				-	"邮箱地址",

			
 
				-	"对报名岗位认识及工作设想",

			
 
				-	"意向地区",

			
 
				-	"意向岗位",

			
 
				-	"其他意向岗位",

			
 
				-	"意向单位",

			
 
				-	"意向专业",

			
 
				-	"学习经历",

			
 
				-	"起止时间",

			
 
				-	"学校","专业","学历","学位","研究方向","是否全日制",

			
 
				-	"培训经历",

			
 
				-	"培训类型","机构","内容","成绩","证书名称",

			
 
				-	"工作经历",

			
 
				-	"工作单位","职务","部门","证明人","备注",

			
 
				-	"项目经历",

			
 
				-	"项目名称","项目职务","项目描述","项目职责","项目成果",

			
 
				-	"获得职业资格证书情况",

			
 
				-	"获得日期","名称","证书编码/文号","授予单位",

			
 
				-	"奖惩情况",

			
 
				-	"项目","时间","项目单位","证明材料",

			
 
				-	"主要工作业绩（500字以内）",

			
 
				-	"主要工作业绩(500字以内)",

			
 
				-	"自我评价",

			
 
				-	"近三年年度考核结果",

			
 
				-	"主要家庭成员及社会关系",

			
 
				-	"称谓",

			
 
				-	"其他情况说明",

			
 
				-	"工作单位及职务",

			
 
				-	"政治面貌",

			
 
				-	"职业证书",

			
 
				-    "资格等级",

			
 
				-    "取得日期",

			
 
				-    "学校/培训机构",

			
 
				-    "专业",

			
 
				-    "起始时间",

			
 
				-    "毕业时间",

			
 
				-    "职业",

			
 
				-    "与本人关系",

			
 
				-    "计算机水平"

			
 
				-]

			
 
				-

			
 
				-# 解析行内元素

			
 
				-def parse_line(line):

			
 
				-    result = []

			
 
				-    key = None

			
 
				-    for cell in line:

			
 
				-        if cell and ''.join(cell.split()) in keywords:

			
 
				-            key = ''.join(cell.split())

			
 
				-        elif cell and key:

			
 
				-            schema = {key:cell}

			
 
				-            result.append(schema)

			
 
				-            key = None

			
 
				-    return result

			
 
				-

			
 
				-

			
 
				-# 解析文档布局

			
 
				-def parse_layout(path):

			
 
				-    result = []

			
 
				-    doc = Document(path)

			
 
				-    lo = {}

			
 
				-    for _table in doc.tables[:]:

			
 
				-        for i, row in enumerate(_table.rows[:]):

			
 
				-            row_content = []

			
 
				-            for cell in row.cells[:]:

			
 
				-                c = cell.text

			
 
				-                if c not in row_content:

			
 
				-                	row_content.append(c)

			
 
				-            lo[len(lo.keys())] = row_content

			
 
				-

			
 
				-    kwln = -1# 关键词行长度

			
 
				-    kwline = None# 关键词行

			
 
				-    for key in lo.keys():

			
 
				-        for val in lo[key]:# 通过全关键词，判断此行是否为关键词行

			
 
				-            if val and ''.join(val.split()) not in keywords:# 有非关键字元素，非关键词行，判断是否为关键词行元素

			
 
				-                perc = 0# 行内关键词数量

			
 
				-                for c in lo[key]:

			
 
				-                    if c and (''.join(c.split()) in keywords):# 找到此行有关键词

			
 
				-                        perc += 1

			
 
				-                    if c and (''.join(c.split()) in keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3，判断此行非关键词行元素

			
 
				-                        perc = 0# 清空行内关键词数

			
 
				-                        result.extend(parse_line(lo[key]))# 添加并解析普通行级元素

			
 
				+# path = "d:\\desktop\\自定义.pdf"

			
 
				+

			
 
				+class Custom(object):

			
 
				+    """docstring for Custom"""

			
 
				+    def __init__(self):

			
 
				+        super(Custom, self).__init__()

			
 
				+        self.keywords = [

			
 
				+	       "姓名",

			
 
				+	       "性别",

			
 
				+	       "出生年月",

			
 
				+	       "出生日期",

			
 
				+	       "民族",

			
 
				+	       "籍贯",

			
 
				+	       "户籍地",

			
 
				+	       "健康状况",

			
 
				+	       "政治面貌（加入时间）",

			
 
				+	       "政治面貌(加入时间)",

			
 
				+	       "参加工作时间",

			
 
				+	       "健康状况",

			
 
				+	       "外语水平",

			
 
				+	       "专业技术资格（取得时间）",

			
 
				+	       "专业技术资格(取得时间)",

			
 
				+	       "职业技能等级（取得时间）",

			
 
				+	       "职业技能等级(取得时间)",

			
 
				+	       "熟悉专业有何专长",

			
 
				+	       "学历院校",

			
 
				+	       "初始学历、专业",

			
 
				+	       "初始学历毕业院校及毕业时间",

			
 
				+	       "最高学历、专业",

			
 
				+	       "最高学历毕业院校及毕业时间",

			
 
				+	       "工作单位",

			
 
				+	       "现任职务",

			
 
				+	       "任职时间",

			
 
				+	       "提职时间",

			
 
				+	       "联系电话",

			
 
				+	       "邮箱地址",

			
 
				+	       "对报名岗位认识及工作设想",

			
 
				+	       "意向地区",

			
 
				+	       "意向岗位",

			
 
				+	       "其他意向岗位",

			
 
				+	       "意向单位",

			
 
				+	       "意向专业",

			
 
				+	       "学习经历",

			
 
				+	       "起止时间",

			
 
				+	       "学校","专业","学历","学位","研究方向","是否全日制",

			
 
				+	       "培训经历",

			
 
				+	       "培训类型","机构","内容","成绩","证书名称",

			
 
				+	       "工作经历",

			
 
				+	       "工作单位","职务","部门","证明人","备注",

			
 
				+	       "项目经历",

			
 
				+	       "项目名称","项目职务","项目描述","项目职责","项目成果",

			
 
				+	       "获得职业资格证书情况",

			
 
				+	       "获得日期","名称","证书编码/文号","授予单位",

			
 
				+	       "奖惩情况",

			
 
				+	       "项目","时间","项目单位","证明材料",

			
 
				+	       "主要工作业绩（500字以内）",

			
 
				+	       "主要工作业绩(500字以内)",

			
 
				+	       "自我评价",

			
 
				+	       "近三年年度考核结果",

			
 
				+	       "主要家庭成员及社会关系",

			
 
				+	       "称谓",

			
 
				+	       "其他情况说明",

			
 
				+	       "工作单位及职务",

			
 
				+	       "政治面貌",

			
 
				+	       "职业证书",

			
 
				+            "资格等级",

			
 
				+            "取得日期",

			
 
				+            "学校/培训机构",

			
 
				+            "专业",

			
 
				+            "起始时间",

			
 
				+            "毕业时间",

			
 
				+            "职业",

			
 
				+            "与本人关系",

			
 
				+            "计算机水平"

			
 
				+        ]

			
 
				+        self.json_obj = self.get_translate()

			
 
				+

			
 
				+    def get_translate(self):

			
 
				+        # 转译数据库字段名

			
 
				+        with open("./resources/translate.json", "r", encoding="utf-8") as ff:

			
 
				+            json_obj = json.load(ff)

			
 
				+        return json_obj

			
 
				+

			
 
				+    # 解析行内元素

			
 
				+    def parse_line(self, line):

			
 
				+        result = []

			
 
				+        key = None

			
 
				+        for cell in line:

			
 
				+            if cell and ''.join(cell.split()) in self.keywords:

			
 
				+                key = ''.join(cell.split())

			
 
				+            elif cell and key:

			
 
				+                schema = {key:cell}

			
 
				+                result.append(schema)

			
 
				+                key = None

			
 
				+        return result

			
 
				+

			
 
				+    # 解析word

			
 
				+    def parse_word_layout(self, path):

			
 
				+        result = []

			
 
				+        doc = Document(path)

			
 
				+        lo = {}

			
 
				+        for _table in doc.tables[:]:

			
 
				+            for i, row in enumerate(_table.rows[:]):

			
 
				+                row_content = []

			
 
				+                for cell in row.cells[:]:

			
 
				+                    c = cell.text

			
 
				+                    if c not in row_content:

			
 
				+                        row_content.append(c)

			
 
				+                lo[len(lo.keys())] = row_content

			
 
				+

			
 
				+        kwln = -1# 关键词行长度

			
 
				+        kwline = None# 关键词行

			
 
				+        for key in lo.keys():

			
 
				+            for val in lo[key]:# 通过全关键词，判断此行是否为关键词行

			
 
				+                if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素，非关键词行，判断是否为关键词行元素

			
 
				+                    perc = 0# 行内关键词数量

			
 
				+                    for c in lo[key]:

			
 
				+                        if c and (''.join(c.split()) in self.keywords):# 找到此行有关键词

			
 
				+                            perc += 1

			
 
				+                        if c and (''.join(c.split()) in self.keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3，判断此行非关键词行元素

			
 
				+                            perc = 0# 清空行内关键词数

			
 
				+                            result.extend(self.parse_line(lo[key]))# 添加并解析普通行级元素

			
 
				+                            break

			
 
				+                    else:# 关键词行元素

			
 
				+                        schema = dict()

			
 
				+                        for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素

			
 
				+                            if key:

			
 
				+                                schema[key] = val

			
 
				+                        result.append(schema)

			
 
				+                        break

			
 
				+                    break

			
 
				+            else:

			
 
				+                # print("{}：此行为关键词行！".format(lo[key]))

			
 
				+                try:

			
 
				+                    kwline = [''.join(cell.split()) for cell in lo[key]]

			
 
				+                except Exception as e:

			
 
				+                    kwline = lo[key]

			
 
				+                kwln = len(lo[key])

			
 
				+        return result

			
 
				+

			
 
				+    # 解析pdf

			
 
				+    def parse_pdf_layout(self, path):

			
 
				+        result = []

			
 
				+        lo = {}

			
 
				+        with pdfplumber.open(path) as pdf:

			
 
				+                for page in pdf.pages:

			
 
				+                    for table in page.extract_tables():

			
 
				+                        for line in table:

			
 
				+                            lo[len(lo.keys())] = line

			
 
				+

			
 
				+        kwln = -1

			
 
				+        kwline = None

			
 
				+        for key in lo.keys():

			
 
				+            # pdb.set_trace()

			
 
				+            for val in lo[key]:# 通过全关键词，判断此行是否为关键词行

			
 
				+                if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素，非关键词行，判断是否为关键词行元素

			
 
				+                    # pdb.set_trace()

			
 
				+                    for c in lo[key] or len(lo[key])!=kwln:

			
 
				+                        # pdb.set_trace()

			
 
				+                        if c and ''.join(c.split()) in self.keywords:# 非关键词行元素

			
 
				+                            result.extend(self.parse_line(lo[key]))

			
 
				+                            break

			
 
				+                    else:# 关键词行元素

			
 
				+                        schema = dict()

			
 
				+                        for key, val in zip(kwline, lo[key]):

			
 
				+                            if key:

			
 
				+                                schema[key] = val if val else key

			
 
				+                        result.append(schema)

			
 
				                         break

			
 
				-                else:# 关键词行元素

			
 
				-                    schema = dict()

			
 
				-                    for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素

			
 
				-                        if key:

			
 
				-                            schema[key] = val

			
 
				-                    result.append(schema)

			
 
				                     break

			
 
				-                break

			
 
				-        else:

			
 
				-            # print("{}：此行为关键词行！".format(lo[key]))

			
 
				-            try:

			
 
				-                kwline = [''.join(cell.split()) for cell in lo[key]]

			
 
				-            except Exception as e:

			
 
				-                kwline = lo[key]

			
 
				-            kwln = len(lo[key])

			
 
				-    return result

			
 
				-

			
 
				-

			
 
				-# 格式化数据

			
 
				-def formatter(datalist):

			
 
				-    result = dict()

			
 
				-    for d in datalist:

			
 
				-        if len(d) == 1:# 普通键值对

			
 
				-            for key in d.keys():

			
 
				-                result[key] = d[key]

			
 
				-        else:# 行级元素

			
 
				-            for k in list(d.keys()):

			
 
				-                if k == "".join(d[k].split()):# 行名

			
 
				-                    d.pop(k)

			
 
				-                    if result.get(k):# 多行元素合并

			
 
				-                        result[k].append(d)

			
 
				+            else:

			
 
				+                kwline = []

			
 
				+                for cell in lo[key]:

			
 
				+                    if cell:

			
 
				+                        kwline.append(''.join(cell.split()))

			
 
				                     else:

			
 
				-                        result[k] = [d]

			
 
				-

			
 
				-    ### 时间格式化

			
 
				-    if result.get("出生年月"):

			
 
				-        dates = re.findall(r'\d+' , result["出生年月"])

			
 
				-        if len(dates) == 1:

			
 
				-            result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				-        elif len(dates) == 2:

			
 
				-            result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				-        elif len(dates) == 3:

			
 
				-            result["出生年月"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

			
 
				-

			
 
				-    if result.get("任职时间"):

			
 
				-        dates = re.findall(r'\d+' , result["任职时间"])

			
 
				-        if len(dates) == 1:

			
 
				-            result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				-        elif len(dates) == 2:

			
 
				-            result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				-        elif len(dates) == 3:

			
 
				-            result["任职时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

			
 
				-

			
 
				-    if result.get("参加工作时间"):

			
 
				-        dates = re.findall(r'\d+' , result["参加工作时间"])

			
 
				-        if len(dates) == 1:

			
 
				-            result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				-        elif len(dates) == 2:

			
 
				-            result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				-        elif len(dates) == 3:

			
 
				-            result["参加工作时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

			
 
				-

			
 
				-    if result.get("最高学历毕业院校及毕业时间"):

			
 
				-        dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])

			
 
				-        ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])

			
 
				-        if len(ws) > 0:

			
 
				-            result["最高学历毕业院校"] = ws[0]

			
 
				-        if len(dates) == 1:

			
 
				-            result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				-        elif len(dates) == 2:

			
 
				-            result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				-        elif len(dates) == 3:

			
 
				-            result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

			
 
				-        result.pop("最高学历毕业院校及毕业时间")

			
 
				-

			
 
				-    if result.get("初始学历毕业院校及毕业时间"):

			
 
				-        dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])

			
 
				-        ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])

			
 
				-        if len(ws) > 0:

			
 
				-            result["初始学历毕业院校"] = ws[0]

			
 
				-        if len(dates) == 1:

			
 
				-            result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				-        elif len(dates) == 2:

			
 
				-            result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				-        elif len(dates) == 3:

			
 
				-            result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

			
 
				-        result.pop("初始学历毕业院校及毕业时间")

			
 
				-

			
 
				-    if result.get("学习经历"):

			
 
				-        for idx, edu in enumerate(result["学习经历"]):

			
 
				-            if edu.get("起止时间"):

			
 
				-                dates = re.findall(r'\d+' , edu["起止时间"])

			
 
				-                if len(dates) == 4:

			
 
				-                    result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

			
 
				-

			
 
				-    if result.get("培训经历"):

			
 
				-        for idx, edu in enumerate(result["培训经历"]):

			
 
				-            if edu.get("起止时间"):

			
 
				-                dates = re.findall(r'\d+' , edu["起止时间"])

			
 
				-                if len(dates) == 4:

			
 
				-                    result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

			
 
				-

			
 
				-    if result.get("工作经历"):

			
 
				-        for idx, edu in enumerate(result["工作经历"]):

			
 
				-            if edu.get("起止时间"):

			
 
				-                dates = re.findall(r'\d+' , edu["起止时间"])

			
 
				-                if len(dates) == 4:

			
 
				-                    result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

			
 
				-

			
 
				-    if result.get("项目经历"):

			
 
				-        for idx, edu in enumerate(result["项目经历"]):

			
 
				-            if edu.get("起止时间"):

			
 
				-                dates = re.findall(r'\d+' , edu["起止时间"])

			
 
				-                if len(dates) == 4:

			
 
				-                    result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

			
 
				-

			
 
				-    if result.get("获得职业资格证书情况"):

			
 
				-        for idx, edu in enumerate(result["获得职业资格证书情况"]):

			
 
				-            if edu.get("获得日期"):

			
 
				-                dates = re.findall(r'\d+' , edu["获得日期"])

			
 
				-                if len(dates) == 2:

			
 
				-                    result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				-

			
 
				-    if result.get("奖惩情况"):

			
 
				-        for idx, edu in enumerate(result["奖惩情况"]):

			
 
				-            if edu.get("时间"):

			
 
				-                dates = re.findall(r'\d+' , edu["时间"])

			
 
				-                if len(dates) == 2:

			
 
				-                    result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				-

			
 
				-    if result.get("主要家庭成员及社会关系"):

			
 
				-        for idx, fam in enumerate(result["主要家庭成员及社会关系"]):

			
 
				-            if fam.get("出生年月"):

			
 
				-                dates = re.findall(r'\d+' , fam["出生年月"])

			
 
				-                if len(dates) == 2:

			
 
				-                    result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				-

			
 
				-    # 转译数据库字段名

			
 
				-    with open("./resources/translate.json", "r", encoding="utf-8") as ff:

			
 
				-        json_obj = json.load(ff)

			
 
				-

			
 
				-    normal = json_obj["base"]

			
 
				-    edunormal = json_obj["tal_his_edu"]

			
 
				-    family = json_obj["tal_family_social_relations"]

			
 
				-

			
 
				-    for key in normal.keys():

			
 
				-        if result.get(key):

			
 
				-            result[normal[key]] = result[key]

			
 
				-            result.pop(key)

			
 
				-

			
 
				-    for idx in range(len(result['学习经历'])):

			
 
				-        result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]

			
 
				-        result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]

			
 
				-        for key in edunormal.keys():

			
 
				-            if result['学习经历'][idx].get(key):

			
 
				-                result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]

			
 
				-                result['学习经历'][idx].pop(key)

			
 
				-

			
 
				-    for idx in range(len(result['主要家庭成员及社会关系'])):

			
 
				-        for key in family.keys():

			
 
				-            if result['主要家庭成员及社会关系'][idx].get(key):

			
 
				-                result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]

			
 
				-                result['主要家庭成员及社会关系'][idx].pop(key)

			
 
				-

			
 
				-    tit = {

			
 
				-        "基本信息":"base",

			
 
				-        "求职意向":"intent_job",

			
 
				-        "学习经历":"tal_his_edu",

			
 
				-        "工作经历":"tal_his_job",

			
 
				-        "项目经历":"tal_his_project",

			
 
				-        "培训经历":"tal_training_institutions",

			
 
				-        "获奖情况":"tal_rewards_punishments",

			
 
				-        "语言能力":"tal_language",

			
 
				-        "证书":"tal_vocational_qualification_certificate",

			
 
				-        "专业技能":"tal_professional_tech_certificate",

			
 
				-        "主要家庭成员及社会关系":"tal_family_social_relations"

			
 
				-    }

			
 
				-

			
 
				-    for key in tit.keys():

			
 
				-        if result.get(key):

			
 
				-            result[tit[key]] = result[key]

			
 
				-            result.pop(key)

			
 
				-

			
 
				-    # url = "http://192.168.1.110:9999/talent/getResumeData"

			
 
				-    # session = requests.Session()

			
 
				-    # session.mount('http://', HTTPAdapter(max_retries = 3))

			
 
				-    # try:

			
 
				-    #     headers = {

			
 
				-    #         'contentType':'Application/json'

			
 
				-    #     }

			
 
				-    #     response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)

			
 
				-    #     print(response.text)

			
 
				-    # except Exception as e:

			
 
				-    #     print(e)

			
 
				-    return result

			
 
				-

			
 
				+                        kwline.append(cell)

			
 
				+                kwln = len(lo[key])

			
 
				+        return result

			
 
				+

			
 
				+    # 格式化数据

			
 
				+    def formatter(self, datalist):

			
 
				+        result = dict()

			
 
				+        for d in datalist:

			
 
				+            if len(d) == 1:# 普通键值对

			
 
				+                for key in d.keys():

			
 
				+                    result[key] = d[key]

			
 
				+            else:# 行级元素

			
 
				+                for k in list(d.keys()):

			
 
				+                    if k == "".join(d[k].split()):# 行名

			
 
				+                        d.pop(k)

			
 
				+                        if result.get(k):# 多行元素合并

			
 
				+                            result[k].append(d)

			
 
				+                        else:

			
 
				+                            result[k] = [d]

			
 
				+

			
 
				+        if result.get("外语水平"):

			
 
				+            data = re.findall(r'(\w+[语话])', result["外语水平"])

			
 
				+            if data:

			
 
				+                result["外语水平"] = data

			
 
				+

			
 
				+        if result.get("专业技术资格(取得时间)"):

			
 
				+            dates = re.findall(r'\d+', result["专业技术资格(取得时间)"])

			
 
				+            for i in dates:

			
 
				+                result["专业技术资格(取得时间)"] = result["专业技术资格(取得时间)"].replace(i, "")

			
 
				+            names = re.findall(r'\w+', result["专业技术资格(取得时间)"])

			
 
				+            if len(dates) == 1:

			
 
				+                result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-01-01".format(int(dates[0])),"专业技术资格":names}]

			
 
				+            elif len(dates) == 2:

			
 
				+                result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1])),"专业技术资格":names}]

			
 
				+            elif len(dates) == 3:

			
 
				+                result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2])),"专业技术资格":names}]

			
 
				+

			
 
				+        if result.get("职业技能等级（取得时间）"):

			
 
				+            dates = re.findall(r'\d+', result["职业技能等级（取得时间）"])

			
 
				+            for i in dates:

			
 
				+                result["职业技能等级（取得时间）"] = result["职业技能等级（取得时间）"].replace(i, "")

			
 
				+            names = re.findall(r'\w+', result["职业技能等级（取得时间）"])

			
 
				+            if len(dates) == 1:

			
 
				+                result["职业技能等级（取得时间）"] = [{"时间": "{:4d}-01-01".format(int(dates[0])),"职业技能等级":names}]

			
 
				+            elif len(dates) == 2:

			
 
				+                result["职业技能等级（取得时间）"] = [{"时间": "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1])),"职业技能等级":names}]

			
 
				+            elif len(dates) == 3:

			
 
				+                result["职业技能等级（取得时间）"] = [{"时间": "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2])),"职业技能等级":names}]

			
 
				+

			
 
				+        ### 时间格式化

			
 
				+        if result.get("出生年月"):

			
 
				+            dates = re.findall(r'\d+' , result["出生年月"])

			
 
				+            if len(dates) == 1:

			
 
				+                result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				+            elif len(dates) == 2:

			
 
				+                result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+            elif len(dates) == 3:

			
 
				+                result["出生年月"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

			
 
				+

			
 
				+        if result.get("任职时间"):

			
 
				+            dates = re.findall(r'\d+' , result["任职时间"])

			
 
				+            if len(dates) == 1:

			
 
				+                result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				+            elif len(dates) == 2:

			
 
				+                result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+            elif len(dates) == 3:

			
 
				+                result["任职时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

			
 
				+

			
 
				+        if result.get("参加工作时间"):

			
 
				+            dates = re.findall(r'\d+' , result["参加工作时间"])

			
 
				+            if len(dates) == 1:

			
 
				+                result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				+            elif len(dates) == 2:

			
 
				+                result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+            elif len(dates) == 3:

			
 
				+                result["参加工作时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

			
 
				+

			
 
				+        if result.get("最高学历毕业院校及毕业时间"):

			
 
				+            dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])

			
 
				+            ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])

			
 
				+            if len(ws) > 0:

			
 
				+                result["最高学历毕业院校"] = ws[0]

			
 
				+            if len(dates) == 1:

			
 
				+                result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				+            elif len(dates) == 2:

			
 
				+                result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+            elif len(dates) == 3:

			
 
				+                result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

			
 
				+            result.pop("最高学历毕业院校及毕业时间")

			
 
				+

			
 
				+        if result.get("初始学历毕业院校及毕业时间"):

			
 
				+            dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])

			
 
				+            ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])

			
 
				+            if len(ws) > 0:

			
 
				+                result["初始学历毕业院校"] = ws[0]

			
 
				+            if len(dates) == 1:

			
 
				+                result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))

			
 
				+            elif len(dates) == 2:

			
 
				+                result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+            elif len(dates) == 3:

			
 
				+                result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))

			
 
				+            result.pop("初始学历毕业院校及毕业时间")

			
 
				+

			
 
				+        if result.get("学习经历"):

			
 
				+            for idx, edu in enumerate(result["学习经历"]):

			
 
				+                if edu.get("起止时间"):

			
 
				+                    dates = re.findall(r'\d+' , edu["起止时间"])

			
 
				+                    if len(dates) == 4:

			
 
				+                        result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

			
 
				+

			
 
				+        if result.get("培训经历"):

			
 
				+            for idx, edu in enumerate(result["培训经历"]):

			
 
				+                if edu.get("起止时间"):

			
 
				+                    dates = re.findall(r'\d+' , edu["起止时间"])

			
 
				+                    if len(dates) == 4:

			
 
				+                        result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

			
 
				+

			
 
				+        if result.get("工作经历"):

			
 
				+            for idx, edu in enumerate(result["工作经历"]):

			
 
				+                if edu.get("起止时间"):

			
 
				+                    dates = re.findall(r'\d+' , edu["起止时间"])

			
 
				+                    if len(dates) == 4:

			
 
				+                        result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

			
 
				+

			
 
				+        if result.get("项目经历"):

			
 
				+            for idx, edu in enumerate(result["项目经历"]):

			
 
				+                if edu.get("起止时间"):

			
 
				+                    dates = re.findall(r'\d+' , edu["起止时间"])

			
 
				+                    if len(dates) == 4:

			
 
				+                        result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

			
 
				+

			
 
				+        if result.get("获得职业资格证书情况"):

			
 
				+            for idx, edu in enumerate(result["获得职业资格证书情况"]):

			
 
				+                if edu.get("获得日期"):

			
 
				+                    dates = re.findall(r'\d+' , edu["获得日期"])

			
 
				+                    if len(dates) == 2:

			
 
				+                        result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+

			
 
				+        if result.get("奖惩情况"):

			
 
				+            for idx, edu in enumerate(result["奖惩情况"]):

			
 
				+                if edu.get("时间"):

			
 
				+                    dates = re.findall(r'\d+' , edu["时间"])

			
 
				+                    if len(dates) == 2:

			
 
				+                        result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+

			
 
				+        if result.get("主要家庭成员及社会关系"):

			
 
				+            for idx, fam in enumerate(result["主要家庭成员及社会关系"]):

			
 
				+                if fam.get("出生年月"):

			
 
				+                    dates = re.findall(r'\d+' , fam["出生年月"])

			
 
				+                    if len(dates) == 2:

			
 
				+                        result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				+

			
 
				+        normal = self.json_obj["base"]

			
 
				+        edunormal = self.json_obj["tal_his_edu"]

			
 
				+        family = self.json_obj["tal_family_social_relation"]

			
 
				+

			
 
				+        for key in normal.keys():

			
 
				+            if result.get(key):

			
 
				+                result[normal[key]] = result[key]

			
 
				+                result.pop(key)

			
 
				+

			
 
				+        for idx in range(len(result['学习经历'])):

			
 
				+            result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]

			
 
				+            result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]

			
 
				+            for key in edunormal.keys():

			
 
				+                if result['学习经历'][idx].get(key):

			
 
				+                    result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]

			
 
				+                    result['学习经历'][idx].pop(key)

			
 
				+

			
 
				+        for idx in range(len(result['主要家庭成员及社会关系'])):

			
 
				+            for key in family.keys():

			
 
				+                if result['主要家庭成员及社会关系'][idx].get(key):

			
 
				+                    result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]

			
 
				+                    result['主要家庭成员及社会关系'][idx].pop(key)

			
 
				+

			
 
				+        tit = {

			
 
				+            "基本信息":"base",

			
 
				+            "求职意向":"intent_job",

			
 
				+            "学习经历":"tal_his_edu",

			
 
				+            "工作经历":"tal_his_job",

			
 
				+            "项目经历":"tal_his_project",

			
 
				+            "培训经历":"tal_training_experience",

			
 
				+            "获奖情况":"tal_reward_punishment",

			
 
				+            "语言能力":"tal_language",

			
 
				+            "证书":"tal_vocational_qualification_certificate",

			
 
				+            "专业技能":"tal_professional_tech_certificate",

			
 
				+            "主要家庭成员及社会关系":"tal_family_social_relation"

			
 
				+        }

			
 
				+

			
 
				+        for key in tit.keys():

			
 
				+            if result.get(key):

			
 
				+                result[tit[key]] = result[key]

			
 
				+                result.pop(key)

			
 
				+

			
 
				+        return result

			
 
				+

			
 
				+    # 推送后端

			
 
    def push_back(self, result):
        """Post the formatted resume to the talent backend (best effort).

        The payload is sent as JSON under the ``ResumeData`` key with a
        10 second timeout.  The response body is printed; if the request
        raises, the exception is printed instead of being propagated.

        Args:
            result: JSON-serialisable object, normally the dict returned by
                ``formatter``.
        """
        url = "http://192.168.1.110:9999/talent/getResumeData"
        # NOTE(review): 'contentType' is not the standard ``Content-Type`` header
        # name.  The body is passed through ``json=``, so requests is expected to
        # set ``Content-Type: application/json`` itself; the extra header is kept
        # unchanged so the request on the wire stays the same.
        headers = {
            'contentType':'Application/json'
        }
        # The context manager closes the session (and its pooled connections)
        # on every exit path, including when the request fails.
        with requests.Session() as session:
            session.mount('http://', HTTPAdapter(max_retries=3))
            try:
                response = session.post(url=url, headers=headers, json={"ResumeData": result}, timeout=10)
                print(response.text)
            except Exception as e:
                # Deliberately best-effort: report the failure and carry on.
                print(e)

			
 
				+

			
 
    def predict(self, path):
        """Parse the resume at ``path``, push the result to the backend and print it.

        ``.docx`` files are read with ``parse_word_layout`` and ``.pdf`` files
        with ``parse_pdf_layout``; a path with any other suffix is ignored.
        The document is parsed and formatted exactly once, and that same
        result is both pushed and printed.

        Args:
            path: File-system path of the resume.
        """
        if path.endswith(".docx"):
            layout = self.parse_word_layout(path)
        elif path.endswith(".pdf"):
            layout = self.parse_pdf_layout(path)
        else:
            return
        result = self.formatter(layout)
        self.push_back(result)
        print(result)

			
 
				 

			
 
				 

			
 
				 if __name__ == '__main__':

			
 
				-    pprint(formatter(parse_layout(path)))

			
 
				-

			
 
				-

			
 
				+    c = Custom()

			
 
				+    c.predict(path)

			
--- a/tools/irafa.py
+++ b/tools/irafa.py
@@ -2,198 +2,411 @@
 
				 # @Author: privacy

			
 
				 # @Date:   2022-07-07 13:12:17

			
 
				 # @Last Modified by:   privacy

			
 
				-# @Last Modified time: 2022-07-16 09:08:32

			
 
				+# @Last Modified time: 2022-07-18 14:57:29

			
 
				 

			
 
				 # 内部人才市场简历模板

			
 
				-from pprint import pprint

			
 
				+

			
 
				 import re

			
 
				 import json

			
 
				-import docx

			
 
				+

			
 
				+import requests

			
 
				+from requests.adapters import HTTPAdapter

			
 
				+

			
 
				+import pdfplumber

			
 
				 from docx import Document

			
 
				-from docx.shared import Inches

			
 
				-

			
 
				-

			
 
				-path = "d:\\desktop\\内部人才市场简历模板.docx"

			
 
				-

			
 
				-keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格（取得时间）", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]

			
 
				-

			
 
				-def parse_line(line):

			
 
				-    result = []

			
 
				-    key = None

			
 
				-    for cell in line:

			
 
				-        if cell and ''.join(cell.split()) in keywords:

			
 
				-            key = ''.join(cell.split())

			
 
				-        elif cell and key:

			
 
				-            schema = {key:cell}

			
 
				-            result.append(schema)

			
 
				-            key = None

			
 
				-    return result

			
 
				-

			
 
				-

			
 
				-def parse_layout(path):

			
 
				-    result = []

			
 
				-    doc = Document(path)

			
 
				-

			
 
				-    lo = {}

			
 
				-    tables = doc.tables

			
 
				-    for _table in tables[:]:

			
 
				-        for i, row in enumerate(_table.rows[:]):

			
 
				-            row_content = []

			
 
				-            for cell in row.cells[:]:

			
 
				-                c = cell.text

			
 
				-                row_content.append(c)

			
 
				-            lo[len(lo.keys())] = row_content

			
 
				-    

			
 
				-    kwln = -1

			
 
				-    kwline = None

			
 
				-    for key in lo.keys():

			
 
				-        # pdb.set_trace()

			
 
				-        for val in lo[key]:# 通过全关键词，判断此行是否为关键词行

			
 
				-            if val and ''.join(val.split()) not in keywords:# 有非关键字元素，非关键词行，判断是否为关键词行元素

			
 
				-                # pdb.set_trace()

			
 
				-                for c in lo[key]:

			
 
				+

			
 
				+# path = "d:\\desktop\\内部人才市场简历模板.docx"

			
 
				+path = "d:\\desktop\\内部人才市场简历模板.pdf"

			
 
				+

			
 
				+

			
 
				+class Inner(object):

			
 
				+    """docstring for Inner"""

			
 
    def __init__(self):
        """Set up the label keyword list and load the field translation table."""
        super(Inner, self).__init__()
        # Cell texts (compared after whitespace removal) treated as field labels.
        self.keywords = [
            "姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌",
            "参加工作时间", "外语水平", "专业技术资格（取得时间）", "计算机水平",
            "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话",
            "对报名岗位认识及工作", "对报名岗位认识及工作设想",
            "意向地区", "意向岗位", "意向单位", "意向专业",
            "职业证书", "资格等级", "取得日期",
            "学校/培训机构", "专业", "起始时间", "毕业时间",
            "姓名", "职业", "与本人关系",
        ]
        # Translation tables returned by get_translate().
        self.json_obj = self.get_translate()

			
 
				+

			
 
    def get_translate(self):
        """Load the table that translates template labels to database field names.

        Returns:
            The object decoded from ``./resources/translate.json``; the path is
            resolved against the current working directory.
        """
        translate_path = "./resources/translate.json"
        with open(translate_path, mode="r", encoding="utf-8") as handle:
            return json.load(handle)

			
 
				+

			
 
    def parse_line(self, line):
        """Pair each label cell in a table row with the value cell that follows it.

        A non-empty cell whose text, with all whitespace removed, is in
        ``self.keywords`` becomes the pending label.  The next non-empty cell
        that is not itself a label is emitted as ``{label: cell}`` and clears
        the pending label.  Empty cells are skipped.

        Args:
            line: Iterable of cell texts (empty or ``None`` cells allowed).

        Returns:
            list[dict]: one single-entry dict per label/value pair, in row order.
        """
        pairs = []
        pending = None
        for cell in line:
            if not cell:
                continue
            compact = ''.join(cell.split())
            if compact in self.keywords:
                pending = compact
            elif pending:
                pairs.append({pending: cell})
                pending = None
        return pairs

			
 
				+

			
 
				+    # 解析word

			
 
				+    def parse_word_layout(self, path):

			
 
				+        result = []

			
 
				+        doc = Document(path)

			
 
				+        lo = {}

			
 
				+        tables = doc.tables

			
 
				+        for _table in tables[:]:

			
 
				+            for i, row in enumerate(_table.rows[:]):

			
 
				+                row_content = []

			
 
				+                for cell in row.cells[:]:

			
 
				+                    c = cell.text

			
 
				+                    row_content.append(c)

			
 
				+                lo[len(lo.keys())] = row_content

			
 
				+        

			
 
				+        kwln = -1

			
 
				+        kwline = None

			
 
				+        for key in lo.keys():

			
 
				+            # pdb.set_trace()

			
 
				+            for val in lo[key]:# 通过全关键词，判断此行是否为关键词行

			
 
				+                if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素，非关键词行，判断是否为关键词行元素

			
 
				                     # pdb.set_trace()

			
 
				-                    if c and ''.join(c.split()) in keywords:# 非关键词行元素

			
 
				-                        result.extend(parse_line(lo[key]))

			
 
				+                    for c in lo[key]:

			
 
				+                        # pdb.set_trace()

			
 
				+                        if c and ''.join(c.split()) in self.keywords:# 非关键词行元素

			
 
				+                            result.extend(self.parse_line(lo[key]))

			
 
				+                            break

			
 
				+                    else:# 关键词行元素

			
 
				+                        schema = dict()

			
 
				+                        for key, val in zip(kwline, lo[key]):

			
 
				+                            if key:

			
 
				+                                schema[key] = val

			
 
				+                        if "学校/培训机构" in schema.keys():

			
 
				+                            schema["学习经历"] = "学习经历"

			
 
				+                        elif "与本人关系" in schema.keys():

			
 
				+                            schema["家庭成员"] = "家庭成员"

			
 
				+                        elif "意向地区" in schema.keys():

			
 
				+                            schema["职业发展管理"] = "职业发展管理"

			
 
				+                        elif "职业证书" in schema.keys():

			
 
				+                            schema["职业资格证书"] = "职业资格证书"

			
 
				+                        result.append(schema)

			
 
				                         break

			
 
				-                else:# 关键词行元素

			
 
				-                    schema = dict()

			
 
				-                    for key, val in zip(kwline, lo[key]):

			
 
				-                        if key:

			
 
				-                            schema[key] = val

			
 
				-                    if "学校/培训机构" in schema.keys():

			
 
				-                        schema["学习经历"] = "学习经历"

			
 
				-                    elif "与本人关系" in schema.keys():

			
 
				-                        schema["家庭成员"] = "家庭成员"

			
 
				-                    elif "意向地区" in schema.keys():

			
 
				-                        schema["职业发展管理"] = "职业发展管理"

			
 
				-                    elif "职业证书" in schema.keys():

			
 
				-                        schema["职业资格证书"] = "职业资格证书"

			
 
				-                    result.append(schema)

			
 
				                     break

			
 
				-                break

			
 
				-        else:

			
 
				-            # print("此行为关键词行")

			
 
				-            kwline = [''.join(cell.split()) for cell in lo[key]]

			
 
				-            kwln = len(lo[key])

			
 
				-

			
 
				-    job = {"工作经历":"工作经历"}

			
 
				-    flag = None

			
 
				-    for p in doc.paragraphs:

			
 
				-        text = p.text.replace("：", ":")

			
 
				-        if ":" in text:

			
 
				-            text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)

			
 
				-            for line in text.split("\n"):

			
 
				-                if line.strip():

			
 
				-                    i = line.split(":")

			
 
				-                    if job.get(i[0].strip()):

			
 
				-                        result.append(job)

			
 
				-                        job = {"工作经历":"工作经历"}

			
 
				-                    job[i[0].strip()] = i[1].strip()

			
 
				-                    flag = i[0].strip()

			
 
				-        elif flag == "工作描述":

			
 
				-            job["工作描述"] += '\n' + text.strip()

			
 
				-    else:

			
 
				-        result.append(job)

			
 
				-    return result

			
 
				-

			
 
				-

			
 
				-# 格式化数据

			
 
				-def formatter(datalist):

			
 
				-    result = dict()

			
 
				-

			
 
				-    for d in datalist:

			
 
				-        if len(d) == 1:

			
 
				-            for key in d.keys():

			
 
				-                result[key] = d[key]

			
 
				+            else:

			
 
				+                # print("此行为关键词行")

			
 
				+                kwline = [''.join(cell.split()) for cell in lo[key]]

			
 
				+                kwln = len(lo[key])

			
 
				+

			
 
				+        job = {"工作经历":"工作经历"}

			
 
				+        flag = None

			
 
				+        for p in doc.paragraphs:

			
 
				+            text = p.text.replace("：", ":")

			
 
				+            if ":" in text:

			
 
				+                text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)

			
 
				+                for line in text.split("\n"):

			
 
				+                    if line.strip():

			
 
				+                        i = line.split(":")

			
 
				+                        if job.get(i[0].strip()):

			
 
				+                            result.append(job)

			
 
				+                            job = {"工作经历":"工作经历"}

			
 
				+                        job[i[0].strip()] = i[1].strip()

			
 
				+                        flag = i[0].strip()

			
 
				+            elif flag == "工作描述":

			
 
				+                job["工作描述"] += '\n' + text.strip()

			
 
				         else:

			
 
				-            for k in list(d.keys()):

			
 
				-                if k == "".join(d[k].split()):

			
 
				-                    d.pop(k)

			
 
				-                    if result.get(k):

			
 
				-                        result[k].append(d)

			
 
				-                    else:

			
 
				-                        result[k] = [d]

			
 
				-

			
 
				-    # 转译数据库字段名

			
 
				-    with open("./resources/translate.json", "r", encoding="utf-8") as ff:

			
 
				-        json_obj = json.load(ff)

			
 
				-

			
 
				-    normal = json_obj["base"]

			
 
				-    itenormal = json_obj["base"]

			
 
				-    edunormal = json_obj["tal_training_institutions"]

			
 
				-    jobnormal = json_obj["tal_his_job"]

			
 
				-    cetnormal = json_obj["tal_vocational_qualification_certificate"]

			
 
				-    family = json_obj["tal_family_social_relations"]

			
 
				-

			
 
				-    for key in normal.keys():

			
 
				-        if result.get(key):

			
 
				-            result[normal[key]] = result[key]

			
 
				-            result.pop(key)

			
 
				-

			
 
				-    for idx in range(len(result['职业发展管理'])):

			
 
				-        for key in itenormal.keys():

			
 
				-            if result['职业发展管理'][idx].get(key):

			
 
				-                result['职业发展管理'][idx][itenormal[key]] = result['职业发展管理'][idx][key]

			
 
				-                result['职业发展管理'][idx].pop(key)

			
 
				-

			
 
				-    for idx in range(len(result['学习经历'])):

			
 
				-        for key in edunormal.keys():

			
 
				-            if result['学习经历'][idx].get(key):

			
 
				-                result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]

			
 
				-                result['学习经历'][idx].pop(key)

			
 
				-

			
 
				-    for idx in range(len(result['工作经历'])):

			
 
				-        for key in jobnormal.keys():

			
 
				-            if result['工作经历'][idx].get(key):

			
 
				-                result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]

			
 
				-                result['工作经历'][idx].pop(key)

			
 
				-

			
 
				-    for idx in range(len(result['职业资格证书'])):

			
 
				-        for key in cetnormal.keys():

			
 
				-            if result['职业资格证书'][idx].get(key):

			
 
				-                result['职业资格证书'][idx][cetnormal[key]] = result['职业资格证书'][idx][key]

			
 
				-                result['职业资格证书'][idx].pop(key)

			
 
				-

			
 
				-    for idx in range(len(result['家庭成员'])):

			
 
				-        for key in family.keys():

			
 
				-            if result['家庭成员'][idx].get(key):

			
 
				-                result['家庭成员'][idx][family[key]] = result['家庭成员'][idx][key]

			
 
				-                result['家庭成员'][idx].pop(key)

			
 
				-

			
 
				-    tit = {

			
 
				-        "基本信息":"base",

			
 
				-        "职业发展管理":"intent_job",

			
 
				-        "学习经历":"tal_training_institutions",

			
 
				-        "工作经历":"tal_his_job",

			
 
				-        "项目经历":"tal_his_project",

			
 
				-        "培训经历":"tal_training_institutions",

			
 
				-        "获奖情况":"tal_rewards_punishments",

			
 
				-        "语言能力":"tal_language",

			
 
				-        "职业资格证书":"tal_vocational_qualification_certificate",

			
 
				-        "专业技能":"tal_professional_tech_certificate",

			
 
				-        "家庭成员":"tal_family_social_relations"

			
 
				-    }

			
 
				-

			
 
				-    for key in tit.keys():

			
 
				-        if result.get(key):

			
 
				-            result[tit[key]] = result[key]

			
 
				-            result.pop(key)

			
 
				-

			
 
				-    # url = "http://192.168.1.110:9999/talent/getResumeData"

			
 
				-    # session = requests.Session()

			
 
				-    # session.mount('http://', HTTPAdapter(max_retries = 3))

			
 
				-    # try:

			
 
				-    #     headers = {

			
 
				-    #         'contentType':'Application/json'

			
 
				-    #     }

			
 
				-    #     response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)

			
 
				-    #     print(response.text)

			
 
				-    # except Exception as e:

			
 
				-    #     print(e)

			
 
				-

			
 
				-    return result

			
 
				+            result.append(job)

			
 
				+        return result

			
 
				 

			
 
				-if __name__ == "__main__":

			
 
				-    pprint(formatter(parse_layout(path)))

			
 
				+    # 解析pdf

			
 
    def parse_pdf_layout(self, path):
        """Extract labelled records from a PDF resume that follows the internal template.

        Table rows are classified in document order:

        * a row whose non-empty cells are all in ``self.keywords`` is a header
          row and is remembered for the rows that follow;
        * a row that mixes labels and values is split into ``{label: value}``
          pairs by ``parse_line``;
        * a row without any label is zipped with the latest header row into one
          record and tagged with a section marker (for example
          ``{"学习经历": "学习经历"}``) chosen from the header labels.  Such a
          row is skipped when no header row has been seen yet.

        The words of every page are then scanned for ``label:value`` fragments,
        which are grouped into records tagged ``工作经历``.  The last (possibly
        marker-only) record is appended exactly once, after all pages.

        Args:
            path: Path of the PDF file.

        Returns:
            list[dict]: the extracted records, in the order described above.
        """
        result = []
        rows = []
        words = []
        # One pass over the document collects both the table rows and the words.
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                for table in page.extract_tables():
                    rows.extend(table)
                words.extend(word['text'] for word in page.extract_words())

        # Header label -> section marker, checked in this order.
        section_markers = (
            ("学校/培训机构", "学习经历"),
            ("与本人关系", "家庭成员"),
            ("意向地区", "职业发展管理"),
            ("职业证书", "职业资格证书"),
        )

        header = None
        for row in rows:
            # Cells may be None or empty; treat both as blank text.
            compact = [''.join(cell.split()) if cell else '' for cell in row]
            is_label = [bool(cell) and text in self.keywords for cell, text in zip(row, compact)]
            has_value = any(cell and not known for cell, known in zip(row, is_label))

            if not has_value:
                # Only labels (or blanks): this row names the columns below it.
                header = compact
                continue
            if any(is_label):
                # Labels and values side by side on one row.
                result.extend(self.parse_line(row))
                continue
            if header is None:
                # A value row with no header above it cannot be labelled.
                continue

            schema = dict()
            for label, value in zip(header, row):
                if label:
                    schema[label] = value
            for label, section in section_markers:
                if label in schema:
                    schema[section] = section
                    break
            result.append(schema)

        job = {"工作经历":"工作经历"}
        flag = None
        for text in words:
            text = text.replace("：", ":")
            if ":" in text:
                # Start a new line before every "label:" so each fragment
                # carries one label.
                text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)
                for line in text.split("\n"):
                    if not line.strip():
                        continue
                    pieces = line.split(":")
                    if len(pieces) < 2:
                        # Leading fragment without a colon: no value to store.
                        continue
                    label = pieces[0].strip()
                    if job.get(label):
                        # A repeated label starts the next work-experience record.
                        result.append(job)
                        job = {"工作经历":"工作经历"}
                    job[label] = pieces[1].strip()
                    flag = label
            elif flag == "工作描述":
                # Free text after a job description continues that description.
                job["工作描述"] += '\n' + text.strip()
        result.append(job)
        return result

			
 
				+

			
 
				+    # 格式化数据

			
 
    def formatter(self, datalist):
        """Merge parsed records into one dict, normalise dates and translate keys.

        Steps, in order:

        1. Single-entry dicts become scalar fields.  Larger dicts that carry a
           section marker (an entry whose key equals its whitespace-stripped
           value) are grouped into a list under that marker; the marker entry
           is removed from the record in place.  A dict consisting only of a
           marker is an empty section and is dropped.
        2. ``外语水平`` is split into a list of language names when any match.
        3. Date-like fields are rewritten as ``YYYY-MM-DD`` (missing month/day
           become ``01``) and ``起止时间`` periods as ``YYYY-MM~YYYY-MM``.
           Values with an unexpected number of numeric parts are left as is.
        4. Keys are renamed with the tables in ``self.json_obj`` and section
           names with the fixed section table.  Sections that are absent are
           simply skipped.

        Args:
            datalist: Records from ``parse_word_layout`` / ``parse_pdf_layout``.

        Returns:
            dict: the merged, normalised and translated resume data.

        Raises:
            KeyError: if ``self.json_obj`` lacks one of the translation tables.
        """
        def squash(value):
            """Return ``value`` without whitespace, or None when it is not a string."""
            return "".join(value.split()) if isinstance(value, str) else None

        def format_date(parts):
            """Format 1-3 numeric strings as a date; return None for other counts."""
            values = [int(part) for part in parts]
            if len(values) == 1:
                return "{:4d}-01-01".format(*values)
            if len(values) == 2:
                return "{:4d}-{:02d}-01".format(*values)
            if len(values) == 3:
                return "{:4d}-{:02d}-{:02d}".format(*values)
            return None

        def records(section):
            """Return the record list stored under ``section`` (empty if absent)."""
            value = result.get(section)
            return value if isinstance(value, list) else []

        def rename_keys(target, mapping):
            """Move every truthy ``target[old]`` to ``target[mapping[old]]``."""
            for old in mapping.keys():
                if target.get(old):
                    target[mapping[old]] = target[old]
                    target.pop(old)

        result = dict()
        for d in datalist:
            if len(d) == 1:
                for key in d.keys():
                    if key == squash(d[key]):
                        # Marker without any data: an empty section.
                        continue
                    result[key] = d[key]
            else:
                for k in list(d.keys()):
                    if k == squash(d[k]):
                        d.pop(k)
                        bucket = result.get(k)
                        if isinstance(bucket, list) and bucket:
                            bucket.append(d)
                        else:
                            result[k] = [d]

        if result.get("外语水平"):
            languages = re.findall(r'(\w+[语话])', result["外语水平"])
            if languages:
                result["外语水平"] = languages

        # NOTE(review): this key uses ASCII parentheses while the label in
        # self.keywords uses full-width ones - confirm which spelling reaches here.
        title_key = "专业技术资格(取得时间)"
        if result.get(title_key):
            dates = re.findall(r'\d+', result[title_key])
            for i in dates:
                result[title_key] = result[title_key].replace(i, "")
            names = re.findall(r'\w+', result[title_key])
            obtained = format_date(dates)
            if obtained is not None:
                result[title_key] = [{"时间": obtained, "专业技术资格": names}]

        # Scalar date fields.
        for field in ("出生年月", "任职时间", "参加工作时间"):
            if result.get(field):
                formatted = format_date(re.findall(r'\d+', result[field]))
                if formatted is not None:
                    result[field] = formatted

        # "<school> <graduation date>" fields are split into two fields.
        for prefix in ("最高学历", "初始学历"):
            combined = prefix + "毕业院校及毕业时间"
            if result.get(combined):
                text = result[combined]
                ws = re.findall(r'\w+', text)
                if len(ws) > 0:
                    result[prefix + "毕业院校"] = ws[0]
                formatted = format_date(re.findall(r'\d+', text))
                if formatted is not None:
                    result[prefix + "毕业时间"] = formatted
                result.pop(combined)

        # Start/end periods inside section records.
        for section in ("学习经历", "培训经历", "工作经历", "项目经历"):
            for record in records(section):
                if record.get("起止时间"):
                    dates = re.findall(r'\d+', record["起止时间"])
                    if len(dates) == 4:
                        record["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))

        # Year-month fields inside section records.
        month_fields = (
            ("获得职业资格证书情况", "获得日期"),
            ("奖惩情况", "时间"),
            ("主要家庭成员及社会关系", "出生年月"),
        )
        for section, field in month_fields:
            for record in records(section):
                if record.get(field):
                    dates = re.findall(r'\d+', record[field])
                    if len(dates) == 2:
                        record[field] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

        # Look every table up before renaming anything, so a missing table
        # fails before the result is touched.
        normal = self.json_obj["base"]
        section_tables = [
            ('职业发展管理', self.json_obj["base"]),
            ('学习经历', self.json_obj["tal_training_experience"]),
            ('工作经历', self.json_obj["tal_his_job"]),
            ('职业资格证书', self.json_obj["tal_vocational_qualification_certificate"]),
            ('家庭成员', self.json_obj["tal_family_social_relation"]),
        ]

        rename_keys(result, normal)
        for section, mapping in section_tables:
            for record in records(section):
                rename_keys(record, mapping)

        tit = {
            "基本信息":"base",
            "职业发展管理":"intent_job",
            "学习经历":"tal_training_experience",
            "工作经历":"tal_his_job",
            "项目经历":"tal_his_project",
            "培训经历":"tal_training_experience",
            "获奖情况":"tal_reward_punishment",
            "语言能力":"tal_language",
            "职业资格证书":"tal_vocational_qualification_certificate",
            "专业技能":"tal_professional_tech_certificate",
            "家庭成员":"tal_family_social_relation"
        }
        rename_keys(result, tit)

        return result

			
 
				+

			
 
				+    # 推送后端

			
 
    def push_back(self, result):
        """Post the formatted resume to the talent backend (best effort).

        The payload is sent as JSON under the ``ResumeData`` key with a
        10 second timeout.  The response body is printed; if the request
        raises, the exception is printed instead of being propagated.

        Args:
            result: JSON-serialisable object, normally the dict returned by
                ``formatter``.
        """
        url = "http://192.168.1.110:9999/talent/getResumeData"
        # NOTE(review): 'contentType' is not the standard ``Content-Type`` header
        # name.  The body is passed through ``json=``, so requests is expected to
        # set ``Content-Type: application/json`` itself; the extra header is kept
        # unchanged so the request on the wire stays the same.
        headers = {
            'contentType':'Application/json'
        }
        # The context manager closes the session (and its pooled connections)
        # on every exit path, including when the request fails.
        with requests.Session() as session:
            session.mount('http://', HTTPAdapter(max_retries=3))
            try:
                response = session.post(url=url, headers=headers, json={"ResumeData": result}, timeout=10)
                print(response.text)
            except Exception as e:
                # Deliberately best-effort: report the failure and carry on.
                print(e)

			
 
				+

			
 
    def predict(self, path):
        """Parse the resume at ``path``, push the result to the backend and print it.

        ``.docx`` files are read with ``parse_word_layout`` and ``.pdf`` files
        with ``parse_pdf_layout``; a path with any other suffix is ignored.
        The document is parsed and formatted exactly once, and that same
        result is both pushed and printed.

        Args:
            path: File-system path of the resume.
        """
        if path.endswith(".docx"):
            layout = self.parse_word_layout(path)
        elif path.endswith(".pdf"):
            layout = self.parse_pdf_layout(path)
        else:
            return
        result = self.formatter(layout)
        self.push_back(result)
        print(result)

			
 
				+

			
 
				+

			
 
				+

			
 
				+if __name__ == "__main__":

			
 
				+    i = Inner()

			
 
				+    i.predict(path)

			
--- a/tools/resources/translate.json
+++ b/tools/resources/translate.json
@@ -7,6 +7,7 @@
 
				         "民族":"national",

			
 
				         "籍贯":"birthplace",

			
 
				         "户籍地":"household_register_address",

			
 
				+        "出生地":"household_register_address",

			
 
				         "参加工作时间":"work_begin_time",

			
 
				         "联系电话":"mobile",

			
 
				         "手机号码":"mobile",

			
@@ -20,8 +21,9 @@
 
				         "意向岗位":"intent_job",

			
 
				         "期望职业":"intent_job",

			
 
				         "目前年薪":"current_salary_yearl",

			
 
				-        "政治面貌(加入时间)":"politics",

			
 
				         "政治面貌":"politics",

			
 
				+        "政治面貌(加入时间)":"politics",

			
 
				+        "政治面貌（加入时间）":"politics",

			
 
				         "熟悉专业有何专长":"skills"

			
 
				     },

			
 
				     "tal_his_edu":{

			
@@ -45,6 +47,7 @@
 
				     },

			
 
				     "tal_his_project":{

			
 
				         "项目名":"project_name",

			
 
				+        "项目名称":"project_name",

			
 
				         "公司名":"company_name",

			
 
				         "公司名称":"company_name",

			
 
				         "职位":"project_office",

			
@@ -58,42 +61,45 @@
 
				         "熟练度":"proficiency"

			
 
				     },

			
 
				     "tal_vocational_qualification_certificate":{

			
 
				-        "证书名称":"vocational_qualification_certificate_name",

			
 
				-        "名称":"vocational_qualification_certificate_name",

			
 
				-        "证书":"vocational_qualification_certificate_name",

			
 
				-        "职业证书":"vocational_qualification_certificate_name",

			
 
				-        "获得时间":"vocational_certificate_obtaining_time",

			
 
				-        "获得日期":"vocational_certificate_obtaining_time",

			
 
				-        "取得日期":"vocational_certificate_obtaining_time"

			
 
				+        "证书名称":"certificate_name",

			
 
				+        "名称":"certificate_name",

			
 
				+        "证书":"certificate_name",

			
 
				+        "职业证书":"certificate_name",

			
 
				+        "获得时间":"obtain_time",

			
 
				+        "获得日期":"obtain_time",

			
 
				+        "取得日期":"obtain_time"

			
 
				     },

			
 
				     "tal_professional_tech_certificate":{

			
 
				-        "技术资格证明":"professional_tech_certificate_name",

			
 
				-        "获得时间":"professional_certificate_obtaining_time"

			
 
				+        "技术资格证明":"certificate_name",

			
 
				+        "获得时间":"obtain_time"

			
 
				     },

			
 
				-    "tal_training_institutions":{

			
 
				-        "学校/培训机构":"school_training_institutions",

			
 
				+    "tal_training_experience":{

			
 
				+        "学校/培训机构":"institution_name",

			
 
				+        "机构":"institution_name",

			
 
				+        "cultivate_name":"institution_name",

			
 
				+        "内容":"institution_name",

			
 
				         "cultivate_time_beg":"start_time",

			
 
				         "cultivate_time_end":"end_time",

			
 
				-        "cultivate_name":"school_training_institutions",

			
 
				         "专业":"major",

			
 
				+        "培训类型":"major",

			
 
				         "开始时间":"start_time",

			
 
				         "起始时间":"start_time",

			
 
				         "结束时间":"end_time",

			
 
				         "毕业时间":"end_time"

			
 
				     },

			
 
				-    "tal_rewards_punishments":{

			
 
				+    "tal_reward_punishment":{

			
 
				         "项目名称":"name",

			
 
				         "奖项":"name",

			
 
				-        "项目单位":"rewards_punishments_unit",

			
 
				-        "时间":"rewards_punishments_time"

			
 
				+        "项目单位":"unit",

			
 
				+        "时间":"obtain_time"

			
 
				     },

			
 
				-    "tal_family_social_relations":{

			
 
				+    "tal_family_social_relation":{

			
 
				         "称谓":"appellation",

			
 
				         "与本人关系":"appellation",

			
 
				         "姓名":"name",

			
 
				-        "出生年月":"birth_time",

			
 
				+        "出生年月":"birth_date",

			
 
				         "政治面貌":"politics",

			
 
				-        "工作单位":"work_units",

			
 
				+        "工作单位":"work_unit",

			
 
				         "职务":"position",

			
 
				         "职业":"position",

			
 
				         "工作单位及职务":"position"

			
--- a/tools/resume_parse.py
+++ b/tools/resume_parse.py
@@ -38,7 +38,7 @@ from rich.console import Console
 
				 console = Console()
			
 
				 
			
 
				 
			
 
				-global ner, ner_tag, base_info_ie, prize_ie, cet_ie, pro_ie, block, block_rev
			
 
				+global ner, ner_tag, base_info_ie, prize_ie, cet_ie, pro_ie, block, block_rev, translate
			
 
				 
			
 
				 
			
 
				 if not locals().get("ner"):
			
@@ -52,7 +52,7 @@ if not locals().get("prize_ie"):
 
				 if not locals().get("cet_ie"):
			
 
				     cet_ie = Taskflow('information_extraction', schema=["时间","证书"], model="uie-nano")
			
 
				 if not locals().get("pro_ie"):
			
 
				-    pro_ie = Taskflow("information_extraction", schema=["时间","项目名称","机构","职位"], task_path='./resources/model_100')
			
 
				+    pro_ie = Taskflow("information_extraction", schema=["时间","项目名称","机构","职位"], task_path='./resources/model_best')
			
 
				 
			
 
				 if not locals().get("block"):
			
 
				     with open("resources/SegmentName.json", "r", encoding="utf-8") as fp:
			
@@ -61,6 +61,10 @@ if not locals().get("block"):
 
				 if not locals().get("block_rev"):
			
 
				     block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经历", 5:"项目经历", 6:"专业技能", 7:"intro", 8:"兴趣爱好", 9:"语言能力", 10:"证书", 11:"获奖情况", 12:"培训经历", 13:"家庭成员", "other":"其他"}
			
 
				 
			
 
				+if not locals().get("translate"):
			
 
				+    with open("./resources/translate.json", "r", encoding="utf-8") as ft:
			
 
				+        translate = json.load(ft)
			
 
				+
			
 
				 
			
 
				 if not os.path.exists("./uploads"):
			
 
				     os.mkdir("./uploads")
			
@@ -1448,6 +1452,112 @@ def decode_path(path):
 
				     return path_name
			
 
				 
			
 
				 
			
 
				+# 格式化字段
			
 
				+def formatter(result, json_obj):
			
 
				+    normal = json_obj["base"]
			
 
				+    itenormal = json_obj["base"]
			
 
				+    edunormal = json_obj["tal_his_edu"]
			
 
				+    jobnormal = json_obj["tal_his_job"]
			
 
				+    tranornal = json_obj["tal_training_experience"]
			
 
				+    cetnormal = json_obj["tal_vocational_qualification_certificate"]
			
 
				+    rewnormal = json_obj["tal_reward_punishment"]
			
 
				+    family = json_obj["tal_family_social_relation"]
			
 
				+
			
 
				+    # for key in normal.keys():
			
 
				+    #     if result.get(key):
			
 
				+    #         result[normal[key]] = result[key]
			
 
				+    #         result.pop(key)
			
 
				+
			
 
				+    for key in json_obj["base"].keys():
			
 
				+        if result.get("基本信息"):
			
 
				+            if result["基本信息"].get(key):
			
 
				+                result[json_obj["base"][key]] = result["基本信息"][key]
			
 
				+                del result["基本信息"][key]
			
 
				+        if result.get("求职意向"):
			
 
				+            if result["求职意向"].get(key):
			
 
				+                result[json_obj["base"][key]] = result["求职意向"][key]
			
 
				+                del result["求职意向"][key]
			
 
				+    del result["基本信息"]
			
 
				+    del result["求职意向"]
			
 
				+
			
 
				+    if result.get("教育经历"):
			
 
				+        for idx in range(len(result['教育经历'])):
			
 
				+            for key in edunormal.keys():
			
 
				+                if result['教育经历'][idx].get(key):
			
 
				+                    result['教育经历'][idx][edunormal[key]] = result['教育经历'][idx][key]
			
 
				+                    result['教育经历'][idx].pop(key)
			
 
				+
			
 
				+    if result.get("工作经历"):
			
 
				+        for idx in range(len(result['工作经历'])):
			
 
				+            for key in jobnormal.keys():
			
 
				+                if result['工作经历'][idx].get(key):
			
 
				+                    result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
			
 
				+                    result['工作经历'][idx].pop(key)
			
 
				+
			
 
				+    if result.get("项目经历"):
			
 
				+        for key in json_obj["tal_his_project"].keys():
			
 
				+            for idx in range(len(result["项目经历"])):
			
 
				+                if result["项目经历"][idx].get(key):
			
 
				+                    result["项目经历"][idx][json_obj["tal_his_project"][key]] = result["项目经历"][idx][key]
			
 
				+                    del result["项目经历"][idx][key]
			
 
				+
			
 
				+    if result.get("培训经历"):
			
 
				+        for idx in range(len(result['培训经历'])):
			
 
				+            for key in tranornal.keys():
			
 
				+                if result['培训经历'][idx].get(key):
			
 
				+                    result['培训经历'][idx][tranornal[key]] = result['培训经历'][idx][key]
			
 
				+                    result['培训经历'][idx].pop(key)
			
 
				+
			
 
				+    if result.get("语言能力"):
			
 
				+        for key in json_obj["tal_language"].keys():
			
 
				+            for idx in range(len(result["语言能力"])):
			
 
				+                if result["语言能力"][idx].get(key):
			
 
				+                    result["语言能力"][idx][json_obj["tal_language"][key]] = result["语言能力"][idx][key]
			
 
				+                    del result["语言能力"][idx][key]
			
 
				+
			
 
				+    if result.get("证书"):
			
 
				+        for idx in range(len(result['证书'])):
			
 
				+            for key in cetnormal.keys():
			
 
				+                if result['证书'][idx].get(key):
			
 
				+                    result['证书'][idx][cetnormal[key]] = result['证书'][idx][key]
			
 
				+                    result['证书'][idx].pop(key)
			
 
				+
			
 
				+    if result.get("获奖情况"):
			
 
				+        for idx in range(len(result['获奖情况'])):
			
 
				+            for key in rewnormal.keys():
			
 
				+                if result['获奖情况'][idx].get(key):
			
 
				+                    result['获奖情况'][idx][rewnormal[key]] = result['获奖情况'][idx][key]
			
 
				+                    result['获奖情况'][idx].pop(key)
			
 
				+
			
 
				+    if result.get("家庭成员"):
			
 
				+        for idx in range(len(result['家庭成员'])):
			
 
				+            for key in family.keys():
			
 
				+                if result['家庭成员'][idx].get(key):
			
 
				+                    result['家庭成员'][idx][family[key]] = result['家庭成员'][idx][key]
			
 
				+                    result['家庭成员'][idx].pop(key)
			
 
				+
			
 
				+    tit = {
			
 
				+        "基本信息":"base",
			
 
				+        "求职意向":"intent_job",
			
 
				+        "教育经历":"tal_his_edu",
			
 
				+        "工作经历":"tal_his_job",
			
 
				+        "项目经历":"tal_his_project",
			
 
				+        "培训经历":"tal_training_experience",
			
 
				+        "获奖情况":"tal_reward_punishment",
			
 
				+        "语言能力":"tal_language",
			
 
				+        "证书":"tal_vocational_qualification_certificate",
			
 
				+        "专业技能":"tal_professional_tech_certificate",
			
 
				+        "家庭成员":"tal_family_social_relation",
			
 
				+        "其他情况说明":"intro"
			
 
				+    }
			
 
				+
			
 
				+    for key in tit.keys():
			
 
				+        if result.get(key):
			
 
				+            result[tit[key]] = result[key]
			
 
				+            result.pop(key)
			
 
				+
			
 
				+    return result
			
 
				+
			
 
				 # 结果返回
			
 
				 def push_back(tempdir):
			
 
				     for file in os.listdir('./result/' + tempdir):
			
@@ -1455,75 +1565,7 @@ def push_back(tempdir):
 
				         with open(filename, "r", encoding="utf-8") as ff:
			
 
				             rst = json.load(ff)
			
 
				 
			
 
				-        with open("./resources/translate.json", "r", encoding="utf-8") as ft:
			
 
				-            json_obj = json.load(ft)
			
 
				-
			
 
				-        for key in json_obj["base"].keys():
			
 
				-            if rst.get("基本信息"):
			
 
				-                if rst["基本信息"].get(key):
			
 
				-                    rst[json_obj["base"][key]] = rst["基本信息"][key]
			
 
				-                    del rst["基本信息"][key]
			
 
				-            if rst.get("求职意向"):
			
 
				-                if rst["求职意向"].get(key):
			
 
				-                    rst[json_obj["base"][key]] = rst["求职意向"][key]
			
 
				-                    del rst["求职意向"][key]
			
 
				-
			
 
				-        del rst["基本信息"]
			
 
				-        del rst["求职意向"]
			
 
				-        
			
 
				-        for key in json_obj["tal_his_project"].keys():
			
 
				-            if rst.get("项目经历"):
			
 
				-                for idx in range(len(rst["项目经历"])):
			
 
				-                    if rst["项目经历"][idx].get(key):
			
 
				-                        rst["项目经历"][idx][json_obj["tal_his_project"][key]] = rst["项目经历"][idx][key]
			
 
				-                        del rst["项目经历"][idx][key]
			
 
				-
			
 
				-        for key in json_obj["tal_training_institutions"].keys():
			
 
				-            if rst.get("培训经历"):
			
 
				-                for idx in range(len(rst["培训经历"])):
			
 
				-                    if rst["培训经历"][idx].get(key):
			
 
				-                        rst["培训经历"][idx][json_obj["tal_training_institutions"][key]] = rst["培训经历"][idx][key]
			
 
				-                        del rst["培训经历"][idx][key]
			
 
				-
			
 
				-        for key in json_obj["tal_vocational_qualification_certificate"].keys():
			
 
				-            if rst.get("证书"):
			
 
				-                for idx in range(len(rst["证书"])):
			
 
				-                    if rst["证书"][idx].get(key):
			
 
				-                        rst["证书"][idx][json_obj["tal_vocational_qualification_certificate"][key]] = rst["证书"][idx][key]
			
 
				-                        del rst["证书"][idx][key]
			
 
				-        
			
 
				-        for key in json_obj["tal_language"].keys():
			
 
				-            if rst.get("语言能力"):
			
 
				-                for idx in range(len(rst["语言能力"])):
			
 
				-                    if rst["语言能力"][idx].get(key):
			
 
				-                        rst["语言能力"][idx][json_obj["tal_language"][key]] = rst["语言能力"][idx][key]
			
 
				-                        del rst["语言能力"][idx][key]
			
 
				-
			
 
				-        for key in json_obj["tal_rewards_punishments"].keys():
			
 
				-            if rst.get("获奖情况"):
			
 
				-                for idx in range(len(rst["获奖情况"])):
			
 
				-                    if rst["获奖情况"][idx].get(key):
			
 
				-                        rst["获奖情况"][idx][json_obj["tal_rewards_punishments"][key]] = rst["获奖情况"][idx][key]
			
 
				-                        del rst["获奖情况"][idx][key]
			
 
				-
			
 
				-        tit = {
			
 
				-            "基本信息":"base",
			
 
				-            "求职意向":"intent_job",
			
 
				-            "教育经历":"tal_his_edu",
			
 
				-            "工作经历":"tal_his_job",
			
 
				-            "项目经历":"tal_his_project",
			
 
				-            "培训经历":"tal_training_institutions",
			
 
				-            "获奖情况":"tal_rewards_punishments",
			
 
				-            "语言能力":"tal_language",
			
 
				-            "证书":"tal_vocational_qualification_certificate",
			
 
				-            "专业技能":"tal_professional_tech_certificate",
			
 
				-            "家庭成员":"tal_family_social_relations"
			
 
				-        }
			
 
				-
			
 
				-        for key in tit.keys():
			
 
				-            if rst.get(key):
			
 
				-                rst[tit[key]] = rst[key]
			
 
				-                rst.pop(key)
			
 
				+        rst = formatter(rst, translate)
			
 
				 
			
 
				         url = "http://192.168.1.110:9999/talent/getResumeData"
			
 
				         session = requests.Session()
			
@@ -1638,7 +1680,7 @@ def detection_type(path, system):
 
				 async def file_upload(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
			
 
				     """
			
 
				     简历上传
			
 
				-    格式：pdf，docx，doc，txt，tar，zip，7z
			
 
				+    格式：pdf，docx，doc，txt，tar.gz，zip，7z, rar
			
 
				     """
			
 
				     res = await file.read()
			
 
				     with open('./uploads/' + file.filename, "wb") as f:
			
--- a/tools/srafa.py
+++ b/tools/srafa.py
@@ -2,232 +2,443 @@
 
				 # @Author: privacy

			
 
				 # @Date:   2022-07-07 12:59:42

			
 
				 # @Last Modified by:   privacy

			
 
				-# @Last Modified time: 2022-07-16 09:08:43

			
 
				+# @Last Modified time: 2022-07-18 14:57:59

			
 
				 # import pdb

			
 
				-from pprint import pprint

			
 
				+

			
 
				+import re

			
 
				 import json

			
 
				-import pandas as pd

			
 
				+import requests

			
 
				+from requests.adapters import HTTPAdapter

			
 
				+

			
 
				 import pdfplumber

			
 
				+from docx import Document

			
 
				+

			
 
				+

			
 
				+path = "d:\\desktop\\社招简历模板.docx"

			
 
				+

			
 
				+class Social(object):

			
 
				+    """docstring for Social"""

			
 
    def __init__(self):
        """Set up the template's cell labels and load the translation table."""
        super().__init__()

        # Cell labels of the social-recruitment resume template. Rows are
        # classified by testing whitespace-stripped cell text against this list.
        personal = [
            '姓名', '性别', '出生日期', '一寸照片', '民族', '出生地',
            '政治面貌（加入时间）', '参加工作时间', '健康状况', '外语水平',
            '初始学历、专业', '最高学历、专业',
            '初始学历毕业院校及毕业时间', '最高学历毕业院校及毕业时间',
            '专业技术资格（取得时间）', '职业技能等级（取得时间）',
            '熟悉专业有何专长', '工作单位', '现任职务', '任职时间', '提职时间',
            '意向岗位', '联系电话',
        ]
        education = [
            '学习经历', '起止时间', '学校', '专业', '学历', '学位',
            '研究方向', '是否全日制',
        ]
        training = ['培训经历', '培训类型', '机构', '内容', '成绩', '证书名称']
        work = [
            '工作经历', '职务', '部门', '证明人', '备注',
            '对报名岗位认识及工作设想', '自我评价及主要工作业绩',
        ]
        certificates = [
            '获得职业资格证书情况', '获得日期', '名称', '证书编码/文号', '授予单位',
        ]
        rewards = ['奖惩情况', '项目', '时间', '项目单位', '证明材料']
        family = [
            '主要家庭成员及社会关系', '称谓', '出生年月', '政治面貌',
            '工作单位及职务',
        ]
        closing = ['其他情况说明', '诚信承诺', '社会招聘工作办公室资格审查意见']

        self.keywords = (
            personal + education + training + work
            + certificates + rewards + family + closing
        )
        self.json_obj = self.get_translate()

			
 
				+

			
 
    def get_translate(self):
        """Load the label-to-database-field translation table.

        Returns:
            The parsed content of ``./resources/translate.json`` (path is
            relative to the current working directory).
        """
        with open("./resources/translate.json", "r", encoding="utf-8") as handle:
            return json.load(handle)

			
 
				+

			
 
    def parse_line(self, line):
        """Pair each keyword cell of a row with the value cell that follows it.

        Cells are scanned left to right. A non-empty cell whose text, with all
        whitespace removed, is in ``self.keywords`` becomes the pending label
        (replacing any earlier unmatched one). The next non-empty cell that is
        not a keyword is emitted as that label's value and the label is cleared.

        Args:
            line: Sequence of cell texts; empty or ``None`` cells are skipped.

        Returns:
            A list of single-entry dicts ``{label: value}`` in row order.
        """
        pairs = []
        pending = None
        for cell in line:
            if not cell:
                continue
            label = ''.join(cell.split())
            if label in self.keywords:
                pending = label
            elif pending:
                pairs.append({pending: cell})
                pending = None
        return pairs

			
 
				+    

			
 
				+    # 解析word

			
 
    def parse_word_layout(self, path):
        """Extract label/value records from the tables of a .docx resume.

        Every table row is reduced to its distinct cell texts and classified:

        * Header row: every non-empty cell is a keyword. If it has more than
          two cells it becomes the current header for the rows below.
        * Key/value row: contains at least one non-keyword cell and more than
          a third of its cells are keywords. It is handed to ``parse_line``.
        * Value row: any other row. It is zipped with the current header into
          one dict, provided a header exists and has the same length.

        Args:
            path: Path of the .docx file.

        Returns:
            A list of dicts: single-entry ``{label: value}`` dicts from
            key/value rows and multi-entry dicts from value rows.
        """
        def squeeze(text):
            # Labels are compared with all whitespace removed.
            return ''.join(text.split())

        # One list per table row. Repeated texts are dropped, which folds the
        # repeated entries that merged cells produce into a single one.
        rows = []
        for table in Document(path).tables:
            for row in table.rows:
                texts = []
                for cell in row.cells:
                    text = cell.text
                    if text not in texts:
                        texts.append(text)
                rows.append(texts)

        result = []
        header = None  # whitespace-stripped labels of the latest header row
        for row in rows:
            is_keyword = [bool(text) and squeeze(text) in self.keywords for text in row]
            has_value = any(text and not kw for text, kw in zip(row, is_keyword))

            if not has_value:
                if len(row) > 2:
                    header = [squeeze(text) for text in row]
                continue

            if sum(is_keyword) > len(row) / 3:
                result.extend(self.parse_line(row))
                continue

            # A value row seen before any header row, or one that does not
            # line up with the header, cannot be labelled and is skipped.
            if header is None or len(header) != len(row):
                continue
            result.append({label: text for label, text in zip(header, row) if label})

        return result

			
 
				+    

			
 
				+    # 解析pdf

			
 
				+    def parse_pdf_layout(self, path):

			
 
				+        result = []

			
 
				+        lo = {}

			
 
				+        with pdfplumber.open(path) as pdf:

			
 
				+                for page in pdf.pages:

			
 
				+                    for table in page.extract_tables():

			
 
				+                        for line in table:

			
 
				+                            # lo[len(lo.keys())] = [cell for cell in line if cell]

			
 
				+                            lo[len(lo.keys())] = line

			
 
				 

			
 
				-path = "d:\\desktop\\社招简历模板.pdf"

			
 
				-

			
 
				-keywords = ['姓名',

			
 
				-    '性别',

			
 
				-    '出生日期',

			
 
				-    '一寸照片',

			
 
				-    '民族',

			
 
				-    '出生地',

			
 
				-    '政治面貌（加入时间）',

			
 
				-    '参加工作时间',

			
 
				-    '健康状况',

			
 
				-    '外语水平',

			
 
				-    '初始学历、专业',

			
 
				-    '最高学历、专业',

			
 
				-    '初始学历毕业院校及毕业时间',

			
 
				-    '最高学历毕业院校及毕业时间',

			
 
				-    '专业技术资格（取得时间）',

			
 
				-    '职业技能等级（取得时间）',

			
 
				-    '熟悉专业有何专长',

			
 
				-    '工作单位',

			
 
				-    '现任职务',

			
 
				-    '任职时间',

			
 
				-    '提职时间',

			
 
				-    '意向岗位',

			
 
				-    '联系电话',

			
 
				-    '学习经历',

			
 
				-    '起止时间',

			
 
				-    '学校',

			
 
				-    '专业',

			
 
				-    '学历',

			
 
				-    '学位',

			
 
				-    '研究方向',

			
 
				-    '是否全日制',

			
 
				-    '培训',

			
 
				-    '起止时间',

			
 
				-    '培训类型',

			
 
				-    '机构',

			
 
				-    '内容',

			
 
				-    '成绩',

			
 
				-    '证书名称',

			
 
				-    '经历',

			
 
				-    '工作经历',

			
 
				-    '起止时间',

			
 
				-    '工作单位',

			
 
				-    '职务',

			
 
				-    '部门',

			
 
				-    '证明人',

			
 
				-    '备注',

			
 
				-    '对报名岗位认识及工作设想',

			
 
				-    '自我评价及主要工作业绩',

			
 
				-    '获得职业资格证书情况',

			
 
				-    '获得日期',

			
 
				-    '名称',

			
 
				-    '证书编码/文号',

			
 
				-    '授予单位',

			
 
				-    '备注',

			
 
				-    '奖惩',

			
 
				-    '项目',

			
 
				-    '时间',

			
 
				-    '项目单位',

			
 
				-    '证明材料',

			
 
				-    '情况',

			
 
				-    '主要家庭成员及社会关系',

			
 
				-    '称谓',

			
 
				-    '出生年月',

			
 
				-    '政治面貌',

			
 
				-    '工作单位及职务',

			
 
				-    '其他情况说明',

			
 
				-    '诚信承诺',

			
 
				-    '本人承诺，以上信息均与事实相符，若有虚假，愿承担一切后果并自愿取消应聘资格。'

			
 
				-    '承诺人：'

			
 
				-    '社会招聘工作办公室资格审查意见']

			
 
				-

			
 
				-def parse_line(line):

			
 
				-    result = []

			
 
				-    key = None

			
 
				-    for cell in line:

			
 
				-        if cell and ''.join(cell.split()) in keywords:

			
 
				-            key = ''.join(cell.split())

			
 
				-        elif cell and key:

			
 
				-            schema = {key:cell}

			
 
				-            result.append(schema)

			
 
				-            key = None

			
 
				-    return result

			
 
				-

			
 
				-

			
 
				-def parse_layout(path):

			
 
				-    result = []

			
 
				-    lo = {}

			
 
				-    with pdfplumber.open(path) as pdf:

			
 
				-            for page in pdf.pages:

			
 
				-                for table in page.extract_tables():

			
 
				-                    for line in table:

			
 
				-                        # lo[len(lo.keys())] = [cell for cell in line if cell]

			
 
				-                        lo[len(lo.keys())] = line

			
 
				-

			
 
				-    kwln = -1

			
 
				-    kwline = None

			
 
				-    for key in lo.keys():

			
 
				-        # pdb.set_trace()

			
 
				-        for val in lo[key]:# 通过全关键词，判断此行是否为关键词行

			
 
				-            if val and ''.join(val.split()) not in keywords:# 有非关键字元素，非关键词行，判断是否为关键词行元素

			
 
				-                # pdb.set_trace()

			
 
				-                for c in lo[key] or len(lo[key])!=kwln:

			
 
				+        kwln = -1

			
 
				+        kwline = None

			
 
				+        for key in lo.keys():

			
 
				+            # pdb.set_trace()

			
 
				+            for val in lo[key]:# 通过全关键词，判断此行是否为关键词行

			
 
				+                if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素，非关键词行，判断是否为关键词行元素

			
 
				                     # pdb.set_trace()

			
 
				-                    if c and ''.join(c.split()) in keywords:# 非关键词行元素

			
 
				-                        result.extend(parse_line(lo[key]))

			
 
				+                    for c in lo[key] or len(lo[key])!=kwln:

			
 
				+                        # pdb.set_trace()

			
 
				+                        if c and ''.join(c.split()) in self.keywords:# 非关键词行元素

			
 
				+                            result.extend(self.parse_line(lo[key]))

			
 
				+                            break

			
 
				+                    else:# 关键词行元素

			
 
				+                        schema = dict()

			
 
				+                        for key, val in zip(kwline, lo[key]):

			
 
				+                            if key:

			
 
				+                                schema[key] = val if val else key

			
 
				+                        result.append(schema)

			
 
				                         break

			
 
				-                else:# 关键词行元素

			
 
				-                    schema = dict()

			
 
				-                    for key, val in zip(kwline, lo[key]):

			
 
				-                        if key:

			
 
				-                            schema[key] = val if val else key

			
 
				-                    result.append(schema)

			
 
				                     break

			
 
				-                break

			
 
				-        else:

			
 
				-            # print("此行为关键词行")

			
 
				-            # kwline = lo[key]

			
 
				-            kwline = []

			
 
				-            for cell in lo[key]:

			
 
				-                if cell:

			
 
				-                    kwline.append(''.join(cell.split()))

			
 
				-                else:

			
 
				-                    kwline.append(cell)

			
 
				-            kwln = len(lo[key])

			
 
				-    return result

			
 
				-

			
 
				-# 格式化数据

			
 
				-def formatter(datalist):

			
 
				-    result = dict()

			
 
				-

			
 
				-    for d in datalist:

			
 
				-        if len(d) == 1:

			
 
				-            for key in d.keys():

			
 
				-                result[key] = d[key]

			
 
				-        else:

			
 
				-            for k in list(d.keys()):

			
 
				-                if k == "".join(d[k].split()):

			
 
				-                    d.pop(k)

			
 
				-                    if result.get(k):

			
 
				-                        result[k].append(d)

			
 
				+            else:

			
 
				+                kwline = []

			
 
				+                for cell in lo[key]:

			
 
				+                    if cell:

			
 
				+                        kwline.append(''.join(cell.split()))

			
 
				                     else:

			
 
				-                        result[k] = [d]

			
 
				-

			
 
				-    # 转译数据库字段名

			
 
				-    with open("./resources/translate.json", "r", encoding="utf-8") as ff:

			
 
				-        json_obj = json.load(ff)

			
 
				-

			
 
				-    normal = json_obj["base"]

			
 
				-    itenormal = json_obj["base"]

			
 
				-    edunormal = json_obj["tal_his_edu"]

			
 
				-    jobnormal = json_obj["tal_his_job"]

			
 
				-    cetnormal = json_obj["tal_vocational_qualification_certificate"]

			
 
				-    family = json_obj["tal_family_social_relations"]

			
 
				-

			
 
				-    for key in normal.keys():

			
 
				-        if result.get(key):

			
 
				-            result[normal[key]] = result[key]

			
 
				-            result.pop(key)

			
 
				-

			
 
				-    for idx in range(len(result['学习经历'])):

			
 
				-        for key in edunormal.keys():

			
 
				-            if result['学习经历'][idx].get(key):

			
 
				-                result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]

			
 
				-                result['学习经历'][idx].pop(key)

			
 
				-

			
 
				-    for idx in range(len(result['工作经历'])):

			
 
				-        for key in jobnormal.keys():

			
 
				-            if result['工作经历'][idx].get(key):

			
 
				-                result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]

			
 
				-                result['工作经历'][idx].pop(key)

			
 
				-

			
 
				-    for idx in range(len(result['获得职业资格证书情况'])):

			
 
				-        for key in cetnormal.keys():

			
 
				-            if result['获得职业资格证书情况'][idx].get(key):

			
 
				-                result['获得职业资格证书情况'][idx][cetnormal[key]] = result['获得职业资格证书情况'][idx][key]

			
 
				-                result['获得职业资格证书情况'][idx].pop(key)

			
 
				-

			
 
				-    for idx in range(len(result['主要家庭成员及社会关系'])):

			
 
				-        for key in family.keys():

			
 
				-            if result['主要家庭成员及社会关系'][idx].get(key):

			
 
				-                result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]

			
 
				-                result['主要家庭成员及社会关系'][idx].pop(key)

			
 
				-

			
 
				-    tit = {

			
 
				-        "基本信息":"base",

			
 
				-        "职业发展管理":"intent_job",

			
 
				-        "学习经历":"tal_his_edu",

			
 
				-        "工作经历":"tal_his_job",

			
 
				-        "项目经历":"tal_his_project",

			
 
				-        "培训经历":"tal_training_institutions",

			
 
				-        "获奖情况":"tal_rewards_punishments",

			
 
				-        "语言能力":"tal_language",

			
 
				-        "获得职业资格证书情况":"tal_vocational_qualification_certificate",

			
 
				-        "专业技能":"tal_professional_tech_certificate",

			
 
				-        "主要家庭成员及社会关系":"tal_family_social_relations"

			
 
				-    }

			
 
				-

			
 
				-    for key in tit.keys():

			
 
				-        if result.get(key):

			
 
				-            result[tit[key]] = result[key]

			
 
				-            result.pop(key)

			
 
				-

			
 
				-    # url = "http://192.168.1.110:9999/talent/getResumeData"

			
 
				-    # session = requests.Session()

			
 
				-    # session.mount('http://', HTTPAdapter(max_retries = 3))

			
 
				-    # try:

			
 
				-    #     headers = {

			
 
				-    #         'contentType':'Application/json'

			
 
				-    #     }

			
 
				-    #     response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)

			
 
				-    #     print(response.text)

			
 
				-    # except Exception as e:

			
 
				-    #     print(e)

			
 
				-

			
 
				-    return result

			
 
				+                        kwline.append(cell)

			
 
				+                kwln = len(lo[key])

			
 
				+        return result

			
 
				+    

			
 
				+    # 格式化数据

			
 
    def formatter(self, datalist):
        """Merge parsed layout rows into one record and normalize it.

        Steps, in order:

        1. Merge ``datalist`` into a single dict.  A one-key dict is copied
           as a scalar field.  Any other dict is treated as a table row: a
           key whose whitespace-stripped value equals the key itself marks
           the section the row belongs to; that marker is removed and the
           remaining row is appended to ``result[section]``.
        2. Normalize language, qualification and date fields (dates become
           ``YYYY-MM-DD``, periods become ``YYYY-MM~YYYY-MM``).
        3. Rename fields using the mappings stored in ``self.json_obj`` and
           the fixed section-title table at the end.

        Args:
            datalist: iterable of dicts with string values (the row dicts
                are mutated in place).

        Returns:
            dict: the merged, normalized and renamed record.

        Raises:
            KeyError: if ``self.json_obj`` lacks one of the mapping tables
                read below.

        Sections that are absent from the parsed data are skipped instead of
        raising ``KeyError`` during the rename step.
        """
        def _numbers(text):
            # Every run of digits, in order of appearance.
            return re.findall(r'\d+', text)

        def _as_date(parts):
            # 1-3 numeric tokens -> year[-month[-day]], missing parts default
            # to 01.  Any other token count is left unformatted (None).
            if len(parts) == 1:
                return "{:4d}-01-01".format(int(parts[0]))
            if len(parts) == 2:
                return "{:4d}-{:02d}-01".format(int(parts[0]), int(parts[1]))
            if len(parts) == 3:
                return "{:4d}-{:02d}-{:02d}".format(int(parts[0]), int(parts[1]), int(parts[2]))
            return None

        def _rename(record, mapping):
            # Move every truthy field to its mapped name.  pop-then-assign
            # keeps the value even if a mapping maps a key onto itself.
            for old, new in mapping.items():
                if record.get(old):
                    record[new] = record.pop(old)

        # ---- 1. merge rows -------------------------------------------------
        result = {}
        for row in datalist:
            if len(row) == 1:
                result.update(row)
                continue
            for name in list(row):
                if name == "".join(row[name].split()):
                    row.pop(name)
                    if result.get(name):
                        result[name].append(row)
                    else:
                        result[name] = [row]

        # ---- 2. field normalization ----------------------------------------
        if result.get("外语水平"):
            languages = re.findall(r'(\w+[语话])', result["外语水平"])
            if languages:
                result["外语水平"] = languages

        # "<qualification> (<date>)" fields: strip the digits, keep the
        # remaining words as names and attach the formatted date.
        for field, label in (
            ("专业技术资格(取得时间)", "专业技术资格"),
            ("职业技能等级（取得时间）", "职业技能等级"),
        ):
            if not result.get(field):
                continue
            text = result[field]
            parts = _numbers(text)
            for part in parts:
                text = text.replace(part, "")
            # With 0 or more than 3 numeric tokens the digit-stripped text
            # stays as the field value.
            result[field] = text
            when = _as_date(parts)
            if when is not None:
                result[field] = [{"时间": when, label: re.findall(r'\w+', text)}]

        # Plain date fields.
        for field in ("出生年月", "任职时间", "参加工作时间"):
            if result.get(field):
                when = _as_date(_numbers(result[field]))
                if when is not None:
                    result[field] = when

        # "<school> <graduation date>" fields are split into two fields and
        # the combined field is dropped.
        for combined, school_key, time_key in (
            ("最高学历毕业院校及毕业时间", "最高学历毕业院校", "最高学历毕业时间"),
            ("初始学历毕业院校及毕业时间", "初始学历毕业院校", "初始学历毕业时间"),
        ):
            if not result.get(combined):
                continue
            text = result[combined]
            words = re.findall(r'\w+', text)
            if words:
                # NOTE(review): \w also matches digits, so a school name
                # written directly next to the year keeps those digits.
                result[school_key] = words[0]
            when = _as_date(_numbers(text))
            if when is not None:
                result[time_key] = when
            result.pop(combined)

        # Start/end periods inside list sections (exactly 4 numeric tokens).
        for section in ("学习经历", "培训经历", "工作经历", "项目经历"):
            for entry in result.get(section) or []:
                if entry.get("起止时间"):
                    parts = _numbers(entry["起止时间"])
                    if len(parts) == 4:
                        entry["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(
                            int(parts[0]), int(parts[1]), int(parts[2]), int(parts[3]))

        # Year-month fields inside list sections (exactly 2 numeric tokens).
        for section, field in (
            ("获得职业资格证书情况", "获得日期"),
            ("奖惩情况", "时间"),
            ("主要家庭成员及社会关系", "出生年月"),
        ):
            for entry in result.get(section) or []:
                if entry.get(field):
                    parts = _numbers(entry[field])
                    if len(parts) == 2:
                        entry[field] = "{:4d}-{:02d}-01".format(int(parts[0]), int(parts[1]))

        # ---- 3. renaming ---------------------------------------------------
        # All mapping tables are looked up before anything is renamed so a
        # missing table fails early.
        base_map = self.json_obj["base"]
        section_maps = (
            ("学习经历", self.json_obj["tal_his_edu"]),
            ("工作经历", self.json_obj["tal_his_job"]),
            ("培训经历", self.json_obj["tal_training_experience"]),
            ("获得职业资格证书情况", self.json_obj["tal_vocational_qualification_certificate"]),
            ("奖惩情况", self.json_obj["tal_reward_punishment"]),
            ("主要家庭成员及社会关系", self.json_obj["tal_family_social_relation"]),
        )

        _rename(result, base_map)

        for section, mapping in section_maps:
            # A resume may lack any of these sections; skip instead of
            # raising KeyError.
            for entry in result.get(section) or []:
                _rename(entry, mapping)

        tit = {
            "基本信息":"base",
            "职业发展管理":"intent_job",
            "学习经历":"tal_his_edu",
            "工作经历":"tal_his_job",
            "项目经历":"tal_his_project",
            "培训经历":"tal_training_experience",
            "奖惩情况":"tal_reward_punishment",
            "语言能力":"tal_language",
            "获得职业资格证书情况":"tal_vocational_qualification_certificate",
            "专业技能":"tal_professional_tech_certificate",
            "主要家庭成员及社会关系":"tal_family_social_relation",
            "其他情况说明":"intro"
        }
        _rename(result, tit)
        return result

			
 
				+    

			
 
				+    # 推送后端

			
 
    def push_back(self, result, url="http://192.168.1.110:9999/talent/getResumeData"):
        """POST the extracted record to the backend service (best effort).

        The record is sent as JSON under the ``"ResumeData"`` key.  The
        response body, or any exception raised while sending, is printed;
        failures are not propagated to the caller.

        Args:
            result: JSON-serializable record to send.
            url: endpoint to post to; defaults to the previously hard-coded
                backend address.
        """
        # The session is used as a context manager so its connection pool is
        # released even when the request fails.
        with requests.Session() as session:
            # Retry failed connections up to 3 times for http:// URLs.
            session.mount('http://', HTTPAdapter(max_retries = 3))
            try:
                # NOTE(review): 'contentType' is not the standard
                # 'Content-Type' header name; requests already sets
                # Content-Type itself when `json=` is used.  Kept unchanged
                # in case the backend inspects it - confirm and remove.
                headers = {
                    'contentType':'Application/json'
                }
                response = session.post(url=url, headers=headers, json={"ResumeData": result}, timeout=10)
                print(response.text)
            except Exception as e:
                # Deliberately best-effort: report the failure and continue.
                print(e)

			
 
				+

			
 
    def predict(self, path):
        """Extract one resume file, push the record to the backend, print it.

        ``.docx`` files go through ``self.parse_word_layout`` and ``.pdf``
        files through ``self.parse_pdf_layout``; the parsed layout is passed
        to ``self.formatter``, the record is sent with ``self.push_back`` and
        then printed.  Any other extension is ignored.

        The file is parsed and formatted only once; the record that was
        pushed is the one that gets printed.

        Args:
            path: path of the file to process (suffix match is
                case-sensitive).
        """
        if path.endswith(".docx"):
            layout = self.parse_word_layout(path)
        elif path.endswith(".pdf"):
            layout = self.parse_pdf_layout(path)
        else:
            # Unsupported extension: nothing to do.
            return

        result = self.formatter(layout)
        self.push_back(result)
        print(result)

			
 
				+

			
 
				+

			
 
				+

			
 
				+if __name__ == '__main__':

			
 
				+    s = Social()

			
 
				+    s.predict(path)

			
 
				+