ソースを参照

modified: custom.py
modified: irafa.py
modified: resources/translate.json
modified: resume_parse.py
modified: srafa.py

sprivacy 3年前
コミット
8c9457deb5
5 ファイル変更、955 行追加、861 行削除
  1. 296 289
      tools/custom.py
  2. 246 230
      tools/irafa.py
  3. 24 19
      tools/resources/translate.json
  4. 113 71
      tools/resume_parse.py
  5. 276 252
      tools/srafa.py

+ 296 - 289
tools/custom.py

@@ -2,26 +2,28 @@
 # @Author: privacy
 # @Date:   2022-07-11 09:21:24
 # @Last Modified by:   privacy
-# @Last Modified time: 2022-07-16 14:26:21
+# @Last Modified time: 2022-07-18 13:50:34
 
 # 自定义模板
 
 import re
 import json
-import logging
-from pprint import pprint
+
 import requests
 from requests.adapters import HTTPAdapter
+
 import pdfplumber
 from docx import Document
-from docx.shared import Inches
 
 
 path = "d:\\desktop\\自定义.docx"
 # path = "d:\\desktop\\自定义.pdf"
 
-# 关键词字典
-keywords = [
+class Custom(object):
+    """docstring for Custom"""
+    def __init__(self):
+        super(Custom, self).__init__()
+        self.keywords = [
 	"姓名",
 	"性别",
 	"出生年月",
@@ -89,295 +91,300 @@ keywords = [
     "职业",
     "与本人关系",
     "计算机水平"
-]
-
-# 解析行内元素
-def parse_line(line):
-    result = []
-    key = None
-    for cell in line:
-        if cell and ''.join(cell.split()) in keywords:
-            key = ''.join(cell.split())
-        elif cell and key:
-            schema = {key:cell}
-            result.append(schema)
-            key = None
-    return result
-
-
-# 解析文档布局
-def parse_layout(path):
-    result = []
-    doc = Document(path)
-    lo = {}
-    for _table in doc.tables[:]:
-        for i, row in enumerate(_table.rows[:]):
-            row_content = []
-            for cell in row.cells[:]:
-                c = cell.text
-                if c not in row_content:
-                    row_content.append(c)
-            lo[len(lo.keys())] = row_content
-
-    kwln = -1# 关键词行长度
-    kwline = None# 关键词行
-    for key in lo.keys():
-        for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
-            if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
-                perc = 0# 行内关键词数量
-                for c in lo[key]:
-                    if c and (''.join(c.split()) in keywords):# 找到此行有关键词
-                        perc += 1
-                    if c and (''.join(c.split()) in keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3,判断此行非关键词行元素
-                        perc = 0# 清空行内关键词数
-                        result.extend(parse_line(lo[key]))# 添加并解析普通行级元素
+        ]
+        self.json_obj = self.get_translate()
+
+    def get_translate(self):
+        # 转译数据库字段名
+        with open("./resources/translate.json", "r", encoding="utf-8") as ff:
+            json_obj = json.load(ff)
+        return json_obj
+
+    # 解析行内元素
+    def parse_line(self, line):
+        result = []
+        key = None
+        for cell in line:
+            if cell and ''.join(cell.split()) in self.keywords:
+                key = ''.join(cell.split())
+            elif cell and key:
+                schema = {key:cell}
+                result.append(schema)
+                key = None
+        return result
+
+    # 解析word
+    def parse_word_layout(self, path):
+        result = []
+        doc = Document(path)
+        lo = {}
+        for _table in doc.tables[:]:
+            for i, row in enumerate(_table.rows[:]):
+                row_content = []
+                for cell in row.cells[:]:
+                    c = cell.text
+                    if c not in row_content:
+                        row_content.append(c)
+                lo[len(lo.keys())] = row_content
+
+        kwln = -1# 关键词行长度
+        kwline = None# 关键词行
+        for key in lo.keys():
+            for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
+                if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
+                    perc = 0# 行内关键词数量
+                    for c in lo[key]:
+                        if c and (''.join(c.split()) in self.keywords):# 找到此行有关键词
+                            perc += 1
+                        if c and (''.join(c.split()) in self.keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3,判断此行非关键词行元素
+                            perc = 0# 清空行内关键词数
+                            result.extend(self.parse_line(lo[key]))# 添加并解析普通行级元素
+                            break
+                    else:# 关键词行元素
+                        schema = dict()
+                        for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素
+                            if key:
+                                schema[key] = val
+                        result.append(schema)
                         break
-                else:# 关键词行元素
-                    schema = dict()
-                    for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素
-                        if key:
-                            schema[key] = val
-                    result.append(schema)
                     break
-                break
-        else:
-            # print("{}:此行为关键词行!".format(lo[key]))
-            try:
-                kwline = [''.join(cell.split()) for cell in lo[key]]
-            except Exception as e:
-                kwline = lo[key]
-            kwln = len(lo[key])
-    return result
-
-
-def parse_pdf_layout(path):
-    result = []
-    lo = {}
-    with pdfplumber.open(path) as pdf:
-            for page in pdf.pages:
-                for table in page.extract_tables():
-                    for line in table:
-                        # lo[len(lo.keys())] = [cell for cell in line if cell]
-                        lo[len(lo.keys())] = line
-    print(lo)
-
-    kwln = -1
-    kwline = None
-    for key in lo.keys():
-        # pdb.set_trace()
-        for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
-            if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
-                # pdb.set_trace()
-                for c in lo[key] or len(lo[key])!=kwln:
+            else:
+                # print("{}:此行为关键词行!".format(lo[key]))
+                try:
+                    kwline = [''.join(cell.split()) for cell in lo[key]]
+                except Exception as e:
+                    kwline = lo[key]
+                kwln = len(lo[key])
+        return result
+
+    # 解析pdf
+    def parse_pdf_layout(self, path):
+        result = []
+        lo = {}
+        with pdfplumber.open(path) as pdf:
+                for page in pdf.pages:
+                    for table in page.extract_tables():
+                        for line in table:
+                            lo[len(lo.keys())] = line
+
+        kwln = -1
+        kwline = None
+        for key in lo.keys():
+            # pdb.set_trace()
+            for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
+                if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
                     # pdb.set_trace()
-                    if c and ''.join(c.split()) in keywords:# 非关键词行元素
-                        result.extend(parse_line(lo[key]))
+                    for c in lo[key] or len(lo[key])!=kwln:
+                        # pdb.set_trace()
+                        if c and ''.join(c.split()) in self.keywords:# 非关键词行元素
+                            result.extend(self.parse_line(lo[key]))
+                            break
+                    else:# 关键词行元素
+                        schema = dict()
+                        for key, val in zip(kwline, lo[key]):
+                            if key:
+                                schema[key] = val if val else key
+                        result.append(schema)
                         break
-                else:# 关键词行元素
-                    schema = dict()
-                    for key, val in zip(kwline, lo[key]):
-                        if key:
-                            schema[key] = val if val else key
-                    result.append(schema)
                     break
-                break
-        else:
-            # print("此行为关键词行")
-            # kwline = lo[key]
-            kwline = []
-            for cell in lo[key]:
-                if cell:
-                    kwline.append(''.join(cell.split()))
-                else:
-                    kwline.append(cell)
-            kwln = len(lo[key])
-    return result
-
-# 格式化数据
-def formatter(datalist):
-    result = dict()
-    for d in datalist:
-        if len(d) == 1:# 普通键值对
-            for key in d.keys():
-                result[key] = d[key]
-        else:# 行级元素
-            for k in list(d.keys()):
-                if k == "".join(d[k].split()):# 行名
-                    d.pop(k)
-                    if result.get(k):# 多行元素合并
-                        result[k].append(d)
+            else:
+                kwline = []
+                for cell in lo[key]:
+                    if cell:
+                        kwline.append(''.join(cell.split()))
                     else:
-                        result[k] = [d]
-
-    ### 时间格式化
-    if result.get("出生年月"):
-        dates = re.findall(r'\d+' , result["出生年月"])
-        if len(dates) == 1:
-            result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))
-        elif len(dates) == 2:
-            result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
-        elif len(dates) == 3:
-            result["出生年月"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
-
-    if result.get("任职时间"):
-        dates = re.findall(r'\d+' , result["任职时间"])
-        if len(dates) == 1:
-            result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))
-        elif len(dates) == 2:
-            result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
-        elif len(dates) == 3:
-            result["任职时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
-
-    if result.get("参加工作时间"):
-        dates = re.findall(r'\d+' , result["参加工作时间"])
-        if len(dates) == 1:
-            result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))
-        elif len(dates) == 2:
-            result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
-        elif len(dates) == 3:
-            result["参加工作时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
-
-    if result.get("最高学历毕业院校及毕业时间"):
-        dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])
-        ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])
-        if len(ws) > 0:
-            result["最高学历毕业院校"] = ws[0]
-        if len(dates) == 1:
-            result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
-        elif len(dates) == 2:
-            result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
-        elif len(dates) == 3:
-            result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
-        result.pop("最高学历毕业院校及毕业时间")
-
-    if result.get("初始学历毕业院校及毕业时间"):
-        dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])
-        ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])
-        if len(ws) > 0:
-            result["初始学历毕业院校"] = ws[0]
-        if len(dates) == 1:
-            result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
-        elif len(dates) == 2:
-            result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
-        elif len(dates) == 3:
-            result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
-        result.pop("初始学历毕业院校及毕业时间")
-
-    if result.get("学习经历"):
-        for idx, edu in enumerate(result["学习经历"]):
-            if edu.get("起止时间"):
-                dates = re.findall(r'\d+' , edu["起止时间"])
-                if len(dates) == 4:
-                    result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
-
-    if result.get("培训经历"):
-        for idx, edu in enumerate(result["培训经历"]):
-            if edu.get("起止时间"):
-                dates = re.findall(r'\d+' , edu["起止时间"])
-                if len(dates) == 4:
-                    result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
-
-    if result.get("工作经历"):
-        for idx, edu in enumerate(result["工作经历"]):
-            if edu.get("起止时间"):
-                dates = re.findall(r'\d+' , edu["起止时间"])
-                if len(dates) == 4:
-                    result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
-
-    if result.get("项目经历"):
-        for idx, edu in enumerate(result["项目经历"]):
-            if edu.get("起止时间"):
-                dates = re.findall(r'\d+' , edu["起止时间"])
-                if len(dates) == 4:
-                    result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
-
-    if result.get("获得职业资格证书情况"):
-        for idx, edu in enumerate(result["获得职业资格证书情况"]):
-            if edu.get("获得日期"):
-                dates = re.findall(r'\d+' , edu["获得日期"])
-                if len(dates) == 2:
-                    result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
-
-    if result.get("奖惩情况"):
-        for idx, edu in enumerate(result["奖惩情况"]):
-            if edu.get("时间"):
-                dates = re.findall(r'\d+' , edu["时间"])
-                if len(dates) == 2:
-                    result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
-
-    if result.get("主要家庭成员及社会关系"):
-        for idx, fam in enumerate(result["主要家庭成员及社会关系"]):
-            if fam.get("出生年月"):
-                dates = re.findall(r'\d+' , fam["出生年月"])
-                if len(dates) == 2:
-                    result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
-
-    # 转译数据库字段名
-    with open("./resources/translate.json", "r", encoding="utf-8") as ff:
-        json_obj = json.load(ff)
-
-    normal = json_obj["base"]
-    edunormal = json_obj["tal_his_edu"]
-    family = json_obj["tal_family_social_relations"]
-
-    for key in normal.keys():
-        if result.get(key):
-            result[normal[key]] = result[key]
-            result.pop(key)
-
-    for idx in range(len(result['学习经历'])):
-        result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]
-        result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]
-        for key in edunormal.keys():
-            if result['学习经历'][idx].get(key):
-                result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
-                result['学习经历'][idx].pop(key)
-
-    for idx in range(len(result['主要家庭成员及社会关系'])):
-        for key in family.keys():
-            if result['主要家庭成员及社会关系'][idx].get(key):
-                result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
-                result['主要家庭成员及社会关系'][idx].pop(key)
-
-    tit = {
-        "基本信息":"base",
-        "求职意向":"intent_job",
-        "学习经历":"tal_his_edu",
-        "工作经历":"tal_his_job",
-        "项目经历":"tal_his_project",
-        "培训经历":"tal_training_institutions",
-        "获奖情况":"tal_rewards_punishments",
-        "语言能力":"tal_language",
-        "证书":"tal_vocational_qualification_certificate",
-        "专业技能":"tal_professional_tech_certificate",
-        "主要家庭成员及社会关系":"tal_family_social_relations"
-    }
-
-    for key in tit.keys():
-        if result.get(key):
-            result[tit[key]] = result[key]
-            result.pop(key)
-
-    # url = "http://192.168.1.110:9999/talent/getResumeData"
-    # session = requests.Session()
-    # session.mount('http://', HTTPAdapter(max_retries = 3))
-    # try:
-    #     headers = {
-    #         'contentType':'Application/json'
-    #     }
-    #     response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)
-    #     print(response.text)
-    # except Exception as e:
-    #     print(e)
-    return result
-
+                        kwline.append(cell)
+                kwln = len(lo[key])
+        return result
+
+    # 格式化数据
+    def formatter(self, datalist):
+        result = dict()
+        for d in datalist:
+            if len(d) == 1:# 普通键值对
+                for key in d.keys():
+                    result[key] = d[key]
+            else:# 行级元素
+                for k in list(d.keys()):
+                    if k == "".join(d[k].split()):# 行名
+                        d.pop(k)
+                        if result.get(k):# 多行元素合并
+                            result[k].append(d)
+                        else:
+                            result[k] = [d]
+
+        ### 时间格式化
+        if result.get("出生年月"):
+            dates = re.findall(r'\d+' , result["出生年月"])
+            if len(dates) == 1:
+                result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))
+            elif len(dates) == 2:
+                result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
+            elif len(dates) == 3:
+                result["出生年月"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
+
+        if result.get("任职时间"):
+            dates = re.findall(r'\d+' , result["任职时间"])
+            if len(dates) == 1:
+                result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))
+            elif len(dates) == 2:
+                result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
+            elif len(dates) == 3:
+                result["任职时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
+
+        if result.get("参加工作时间"):
+            dates = re.findall(r'\d+' , result["参加工作时间"])
+            if len(dates) == 1:
+                result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))
+            elif len(dates) == 2:
+                result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
+            elif len(dates) == 3:
+                result["参加工作时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
+
+        if result.get("最高学历毕业院校及毕业时间"):
+            dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])
+            ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])
+            if len(ws) > 0:
+                result["最高学历毕业院校"] = ws[0]
+            if len(dates) == 1:
+                result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
+            elif len(dates) == 2:
+                result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
+            elif len(dates) == 3:
+                result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
+            result.pop("最高学历毕业院校及毕业时间")
+
+        if result.get("初始学历毕业院校及毕业时间"):
+            dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])
+            ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])
+            if len(ws) > 0:
+                result["初始学历毕业院校"] = ws[0]
+            if len(dates) == 1:
+                result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
+            elif len(dates) == 2:
+                result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
+            elif len(dates) == 3:
+                result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
+            result.pop("初始学历毕业院校及毕业时间")
+
+        if result.get("学习经历"):
+            for idx, edu in enumerate(result["学习经历"]):
+                if edu.get("起止时间"):
+                    dates = re.findall(r'\d+' , edu["起止时间"])
+                    if len(dates) == 4:
+                        result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
+
+        if result.get("培训经历"):
+            for idx, edu in enumerate(result["培训经历"]):
+                if edu.get("起止时间"):
+                    dates = re.findall(r'\d+' , edu["起止时间"])
+                    if len(dates) == 4:
+                        result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
+
+        if result.get("工作经历"):
+            for idx, edu in enumerate(result["工作经历"]):
+                if edu.get("起止时间"):
+                    dates = re.findall(r'\d+' , edu["起止时间"])
+                    if len(dates) == 4:
+                        result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
+
+        if result.get("项目经历"):
+            for idx, edu in enumerate(result["项目经历"]):
+                if edu.get("起止时间"):
+                    dates = re.findall(r'\d+' , edu["起止时间"])
+                    if len(dates) == 4:
+                        result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
+
+        if result.get("获得职业资格证书情况"):
+            for idx, edu in enumerate(result["获得职业资格证书情况"]):
+                if edu.get("获得日期"):
+                    dates = re.findall(r'\d+' , edu["获得日期"])
+                    if len(dates) == 2:
+                        result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
+
+        if result.get("奖惩情况"):
+            for idx, edu in enumerate(result["奖惩情况"]):
+                if edu.get("时间"):
+                    dates = re.findall(r'\d+' , edu["时间"])
+                    if len(dates) == 2:
+                        result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
+
+        if result.get("主要家庭成员及社会关系"):
+            for idx, fam in enumerate(result["主要家庭成员及社会关系"]):
+                if fam.get("出生年月"):
+                    dates = re.findall(r'\d+' , fam["出生年月"])
+                    if len(dates) == 2:
+                        result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
+
+        normal = self.json_obj["base"]
+        edunormal = self.json_obj["tal_his_edu"]
+        family = self.json_obj["tal_family_social_relation"]
+
+        for key in normal.keys():
+            if result.get(key):
+                result[normal[key]] = result[key]
+                result.pop(key)
+
+        for idx in range(len(result['学习经历'])):
+            result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]
+            result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]
+            for key in edunormal.keys():
+                if result['学习经历'][idx].get(key):
+                    result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
+                    result['学习经历'][idx].pop(key)
+
+        for idx in range(len(result['主要家庭成员及社会关系'])):
+            for key in family.keys():
+                if result['主要家庭成员及社会关系'][idx].get(key):
+                    result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
+                    result['主要家庭成员及社会关系'][idx].pop(key)
+
+        tit = {
+            "基本信息":"base",
+            "求职意向":"intent_job",
+            "学习经历":"tal_his_edu",
+            "工作经历":"tal_his_job",
+            "项目经历":"tal_his_project",
+            "培训经历":"tal_training_experience",
+            "获奖情况":"tal_reward_punishment",
+            "语言能力":"tal_language",
+            "证书":"tal_vocational_qualification_certificate",
+            "专业技能":"tal_professional_tech_certificate",
+            "主要家庭成员及社会关系":"tal_family_social_relation"
+        }
+
+        for key in tit.keys():
+            if result.get(key):
+                result[tit[key]] = result[key]
+                result.pop(key)
+
+        return result
+
+    # 推送后端
+    def push_back(self, result):
+        url = "http://192.168.1.110:9999/talent/getResumeData"
+        session = requests.Session()
+        session.mount('http://', HTTPAdapter(max_retries = 3))
+        try:
+            headers = {
+                'contentType':'Application/json'
+            }
+            response = session.post(url=url, headers=headers, json={"ResumeData": result}, timeout=10)
+            print(response.text)
+        except Exception as e:
+            print(e)
+
+    def predict(self, path):
+        if path.endswith(".docx"):
+            result = self.formatter(self.parse_word_layout(path))
+            self.push_back(result)
+            print(self.formatter(self.parse_word_layout(path)))
+        elif path.endswith(".pdf"):
+            result = self.formatter(self.parse_pdf_layout(path))
+            self.push_back(result)
+            print(self.formatter(self.parse_pdf_layout(path)))
 
 
 if __name__ == '__main__':
-    if path.endswith(".docx"):
-        pprint(formatter(parse_layout(path)))
-    else:
-        pprint(parse_pdf_layout(path))
-        pprint(formatter(parse_pdf_layout(path)))
-
-
+    c = Custom()
+    c.predict(path)

+ 246 - 230
tools/irafa.py

@@ -2,275 +2,291 @@
 # @Author: privacy
 # @Date:   2022-07-07 13:12:17
 # @Last Modified by:   privacy
-# @Last Modified time: 2022-07-16 15:05:03
+# @Last Modified time: 2022-07-18 13:50:39
 
 # 内部人才市场简历模板
-from pprint import pprint
+
 import re
 import json
-from pdfminer.high_level import extract_pages
-from pdfminer.layout import LTTextContainer
+
+import requests
+from requests.adapters import HTTPAdapter
+
 import pdfplumber
-import docx
 from docx import Document
-from docx.shared import Inches
-
 
 # path = "d:\\desktop\\内部人才市场简历模板.docx"
 path = "d:\\desktop\\内部人才市场简历模板.pdf"
 
-keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格(取得时间)", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]
 
-def parse_line(line):
-    result = []
-    key = None
-    for cell in line:
-        if cell and ''.join(cell.split()) in keywords:
-            key = ''.join(cell.split())
-        elif cell and key:
-            schema = {key:cell}
-            result.append(schema)
-            key = None
-    return result
+class Inner(object):
+    """docstring for Inner"""
+    def __init__(self):
+        super(Inner, self).__init__()
+        self.keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格(取得时间)", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]
+        self.json_obj = self.get_translate()
 
+    def get_translate(self):
+        # 转译数据库字段名
+        with open("./resources/translate.json", "r", encoding="utf-8") as ff:
+            json_obj = json.load(ff)
+        return json_obj
 
-def parse_layout(path):
-    result = []
-    doc = Document(path)
+    def parse_line(self, line):
+        result = []
+        key = None
+        for cell in line:
+            if cell and ''.join(cell.split()) in self.keywords:
+                key = ''.join(cell.split())
+            elif cell and key:
+                schema = {key:cell}
+                result.append(schema)
+                key = None
+        return result
 
-    lo = {}
-    tables = doc.tables
-    for _table in tables[:]:
-        for i, row in enumerate(_table.rows[:]):
-            row_content = []
-            for cell in row.cells[:]:
-                c = cell.text
-                row_content.append(c)
-            lo[len(lo.keys())] = row_content
-    
-    kwln = -1
-    kwline = None
-    for key in lo.keys():
-        # pdb.set_trace()
-        for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
-            if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
-                # pdb.set_trace()
-                for c in lo[key]:
+    # 解析word
+    def parse_word_layout(self, path):
+        result = []
+        doc = Document(path)
+        lo = {}
+        tables = doc.tables
+        for _table in tables[:]:
+            for i, row in enumerate(_table.rows[:]):
+                row_content = []
+                for cell in row.cells[:]:
+                    c = cell.text
+                    row_content.append(c)
+                lo[len(lo.keys())] = row_content
+        
+        kwln = -1
+        kwline = None
+        for key in lo.keys():
+            # pdb.set_trace()
+            for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
+                if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
                     # pdb.set_trace()
-                    if c and ''.join(c.split()) in keywords:# 非关键词行元素
-                        result.extend(parse_line(lo[key]))
+                    for c in lo[key]:
+                        # pdb.set_trace()
+                        if c and ''.join(c.split()) in self.keywords:# 非关键词行元素
+                            result.extend(self.parse_line(lo[key]))
+                            break
+                    else:# 关键词行元素
+                        schema = dict()
+                        for key, val in zip(kwline, lo[key]):
+                            if key:
+                                schema[key] = val
+                        if "学校/培训机构" in schema.keys():
+                            schema["学习经历"] = "学习经历"
+                        elif "与本人关系" in schema.keys():
+                            schema["家庭成员"] = "家庭成员"
+                        elif "意向地区" in schema.keys():
+                            schema["职业发展管理"] = "职业发展管理"
+                        elif "职业证书" in schema.keys():
+                            schema["职业资格证书"] = "职业资格证书"
+                        result.append(schema)
                         break
-                else:# 关键词行元素
-                    schema = dict()
-                    for key, val in zip(kwline, lo[key]):
-                        if key:
-                            schema[key] = val
-                    if "学校/培训机构" in schema.keys():
-                        schema["学习经历"] = "学习经历"
-                    elif "与本人关系" in schema.keys():
-                        schema["家庭成员"] = "家庭成员"
-                    elif "意向地区" in schema.keys():
-                        schema["职业发展管理"] = "职业发展管理"
-                    elif "职业证书" in schema.keys():
-                        schema["职业资格证书"] = "职业资格证书"
-                    result.append(schema)
                     break
-                break
-        else:
-            # print("此行为关键词行")
-            kwline = [''.join(cell.split()) for cell in lo[key]]
-            kwln = len(lo[key])
-
-    job = {"工作经历":"工作经历"}
-    flag = None
-    for p in doc.paragraphs:
-        text = p.text.replace(":", ":")
-        if ":" in text:
-            text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)
-            for line in text.split("\n"):
-                if line.strip():
-                    i = line.split(":")
-                    if job.get(i[0].strip()):
-                        result.append(job)
-                        job = {"工作经历":"工作经历"}
-                    job[i[0].strip()] = i[1].strip()
-                    flag = i[0].strip()
-        elif flag == "工作描述":
-            job["工作描述"] += '\n' + text.strip()
-    else:
-        result.append(job)
-    return result
+            else:
+                # print("此行为关键词行")
+                kwline = [''.join(cell.split()) for cell in lo[key]]
+                kwln = len(lo[key])
 
+        job = {"工作经历":"工作经历"}
+        flag = None
+        for p in doc.paragraphs:
+            text = p.text.replace(":", ":")
+            if ":" in text:
+                text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)
+                for line in text.split("\n"):
+                    if line.strip():
+                        i = line.split(":")
+                        if job.get(i[0].strip()):
+                            result.append(job)
+                            job = {"工作经历":"工作经历"}
+                        job[i[0].strip()] = i[1].strip()
+                        flag = i[0].strip()
+            elif flag == "工作描述":
+                job["工作描述"] += '\n' + text.strip()
+        else:
+            result.append(job)
+        return result
 
-def parse_pdf_layout(path):
-    result = []
-    lo = {}
-    with pdfplumber.open(path) as pdf:
-            for page in pdf.pages:
-                for table in page.extract_tables():
-                    for line in table:
-                        # lo[len(lo.keys())] = [cell for cell in line if cell]
-                        lo[len(lo.keys())] = line
+    # Parse a PDF resume
+    def parse_pdf_layout(self, path):
+        """Extract resume fields from the tables and words of a PDF file.
+
+        Table rows of every page are collected first. A row whose non-empty
+        cells are all keywords is remembered as the current header row; a row
+        that mixes keywords with other text is handed to ``self.parse_line``;
+        any other row is zipped against the remembered header row into one
+        dict. Afterwards the words of each page are scanned for ``key:value``
+        text to build work-experience dicts.
+
+        :param path: path of the PDF file, passed to ``pdfplumber.open``.
+        :return: list of dicts, one per parsed field or record.
+        """
+        result = []
+        lo = {}  # running row index -> list of cells of that table row
+        with pdfplumber.open(path) as pdf:
+                for page in pdf.pages:
+                    for table in page.extract_tables():
+                        for line in table:
+                            # lo[len(lo.keys())] = [cell for cell in line if cell]
+                            lo[len(lo.keys())] = line
 
-    kwln = -1
-    kwline = None
-    for key in lo.keys():
-        # pdb.set_trace()
-        for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
-            if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
-                # pdb.set_trace()
-                for c in lo[key]:
+        kwln = -1  # length of the last header row (assigned but not read in this method)
+        kwline = None  # cells of the last header row, whitespace removed
+        for key in lo.keys():
+            # pdb.set_trace()
+            for val in lo[key]:# a row whose cells are all keywords is a header row
+                if val and ''.join(val.split()) not in self.keywords:# a non-keyword cell exists, so this is not a header row; decide whether it is a data row of the header row
                     # pdb.set_trace()
-                    if c and ''.join(c.split()) in keywords:# 非关键词行元素
-                        result.extend(parse_line(lo[key]))
-                        break
-                    if c == "对报名岗位\n认 识及工作":
-                        print(''.join(c.split()))
+                    for c in lo[key]:
+                        # pdb.set_trace()
+                        if c and ''.join(c.split()) in self.keywords:# the row contains a keyword, so it is not a data row of the header row
+                            result.extend(self.parse_line(lo[key]))
+                            break
+                        # Special case: this exact cell text is only printed and the row is dropped.
+                        if c == "对报名岗位\n认 识及工作":
+                            print(''.join(c.split()))
+                            break
+                    else:# data row of the header row
+                        schema = dict()
+                        # NOTE(review): kwline is still None if no header row was seen before
+                        # this row, in which case zip() raises TypeError.
+                        for key, val in zip(kwline, lo[key]):
+                            if key:
+                                schema[key] = val
+                        # Tag the record with its section name; formatter() looks for an
+                        # entry whose key equals its value to group records by section.
+                        if "学校/培训机构" in schema.keys():
+                            schema["学习经历"] = "学习经历"
+                        elif "与本人关系" in schema.keys():
+                            schema["家庭成员"] = "家庭成员"
+                        elif "意向地区" in schema.keys():
+                            schema["职业发展管理"] = "职业发展管理"
+                        elif "职业证书" in schema.keys():
+                            schema["职业资格证书"] = "职业资格证书"
+                        result.append(schema)
                         break
-                else:# 关键词行元素
-                    schema = dict()
-                    for key, val in zip(kwline, lo[key]):
-                        if key:
-                            schema[key] = val
-                    if "学校/培训机构" in schema.keys():
-                        schema["学习经历"] = "学习经历"
-                    elif "与本人关系" in schema.keys():
-                        schema["家庭成员"] = "家庭成员"
-                    elif "意向地区" in schema.keys():
-                        schema["职业发展管理"] = "职业发展管理"
-                    elif "职业证书" in schema.keys():
-                        schema["职业资格证书"] = "职业资格证书"
-                    result.append(schema)
                     break
-                break
-        else:
-            # print("此行为关键词行")
-            kwline = [''.join(cell.split()) for cell in lo[key]]
-            kwln = len(lo[key])
+            else:
+                # This row is a keyword (header) row.
+                # NOTE(review): cell.split() raises AttributeError if a cell is None;
+                # presumably empty PDF table cells can be None - confirm.
+                kwline = [''.join(cell.split()) for cell in lo[key]]
+                kwln = len(lo[key])
 
-    job = {"工作经历":"工作经历"}
-    flag = None
+        # Work-experience record being built; the first entry is its section tag.
+        job = {"工作经历":"工作经历"}
+        flag = None  # key of the most recently parsed "key:value" pair
 
-    with pdfplumber.open(path) as pdf:
-        for page in pdf.pages:
-            for predict in page.extract_words():
-                # print(predict['text'])
-                text = predict['text'].replace(":", ":")
-                if ":" in text:
-                    text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)
-                    for line in text.split("\n"):
-                        if line.strip():
-                            i = line.split(":")
-                            if job.get(i[0].strip()):
-                                result.append(job)
-                                job = {"工作经历":"工作经历"}
-                            job[i[0].strip()] = i[1].strip()
-                            flag = i[0].strip()
-                elif flag == "工作描述":
-                    job["工作描述"] += '\n' + text.strip()
+        with pdfplumber.open(path) as pdf:
+            for page in pdf.pages:
+                for predict in page.extract_words():
+                    # print(predict['text'])
+                    text = predict['text'].replace(":", ":")
+                    if ":" in text:
+                        # Put every "key:" on its own line so each line holds one pair.
+                        text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)
+                        for line in text.split("\n"):
+                            if line.strip():
+                                i = line.split(":")
+                                # A key that is already filled starts a new job record.
+                                if job.get(i[0].strip()):
+                                    result.append(job)
+                                    job = {"工作经历":"工作经历"}
+                                job[i[0].strip()] = i[1].strip()
+                                flag = i[0].strip()
+                    elif flag == "工作描述":
+                        # Text without a colon continues the job description.
+                        job["工作描述"] += '\n' + text.strip()
+                else:
+                    # NOTE(review): this else belongs to the per-page word loop, so the
+                    # current job dict is appended once per page; on multi-page PDFs the
+                    # same dict object can be appended more than once - confirm intent.
+                    result.append(job)
+        return result
+
+    # Format the parsed data
+    def formatter(self, datalist):
+        """Group parsed dicts by section and rename keys via ``self.json_obj``.
+
+        Single-entry dicts are copied into the result as plain fields. In a
+        multi-entry dict, an entry whose key equals its whitespace-stripped
+        value is treated as the section tag: it is removed and the remaining
+        dict is appended to the list stored under that section name. Field
+        names and section names are then replaced using the tables in
+        ``self.json_obj`` and the ``tit`` table below.
+
+        :param datalist: list of dicts, e.g. the output of the layout parsers.
+        :return: dict with renamed fields and sections.
+        """
+        result = dict()
+        for d in datalist:
+            if len(d) == 1:
+                for key in d.keys():
+                    result[key] = d[key]
             else:
-                result.append(job)
-    return result
+                # NOTE(review): d[k].split() requires every value to be a str.
+                for k in list(d.keys()):
+                    if k == "".join(d[k].split()):
+                        d.pop(k)
+                        if result.get(k):
+                            result[k].append(d)
+                        else:
+                            result[k] = [d]
 
-# 格式化数据
-def formatter(datalist):
-    result = dict()
+        # Rename tables; normal and itenormal both refer to the "base" table.
+        normal = self.json_obj["base"]
+        itenormal = self.json_obj["base"]
+        edunormal = self.json_obj["tal_training_experience"]
+        jobnormal = self.json_obj["tal_his_job"]
+        cetnormal = self.json_obj["tal_vocational_qualification_certificate"]
+        family = self.json_obj["tal_family_social_relation"]
 
-    for d in datalist:
-        if len(d) == 1:
-            for key in d.keys():
-                result[key] = d[key]
-        else:
-            for k in list(d.keys()):
-                if k == "".join(d[k].split()):
-                    d.pop(k)
-                    if result.get(k):
-                        result[k].append(d)
-                    else:
-                        result[k] = [d]
+        # Rename top-level fields that have a truthy value.
+        for key in normal.keys():
+            if result.get(key):
+                result[normal[key]] = result[key]
+                result.pop(key)
 
-    # 转译数据库字段名
-    with open("./resources/translate.json", "r", encoding="utf-8") as ff:
-        json_obj = json.load(ff)
+        # NOTE(review): the section lists below are subscripted directly, so a
+        # missing section raises KeyError.
+        for idx in range(len(result['职业发展管理'])):
+            for key in itenormal.keys():
+                if result['职业发展管理'][idx].get(key):
+                    result['职业发展管理'][idx][itenormal[key]] = result['职业发展管理'][idx][key]
+                    result['职业发展管理'][idx].pop(key)
 
-    normal = json_obj["base"]
-    itenormal = json_obj["base"]
-    edunormal = json_obj["tal_training_institutions"]
-    jobnormal = json_obj["tal_his_job"]
-    cetnormal = json_obj["tal_vocational_qualification_certificate"]
-    family = json_obj["tal_family_social_relations"]
+        for idx in range(len(result['学习经历'])):
+            for key in edunormal.keys():
+                if result['学习经历'][idx].get(key):
+                    result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
+                    result['学习经历'][idx].pop(key)
 
-    for key in normal.keys():
-        if result.get(key):
-            result[normal[key]] = result[key]
-            result.pop(key)
+        for idx in range(len(result['工作经历'])):
+            for key in jobnormal.keys():
+                if result['工作经历'][idx].get(key):
+                    result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
+                    result['工作经历'][idx].pop(key)
 
-    for idx in range(len(result['职业发展管理'])):
-        for key in itenormal.keys():
-            if result['职业发展管理'][idx].get(key):
-                result['职业发展管理'][idx][itenormal[key]] = result['职业发展管理'][idx][key]
-                result['职业发展管理'][idx].pop(key)
+        for idx in range(len(result['职业资格证书'])):
+            for key in cetnormal.keys():
+                if result['职业资格证书'][idx].get(key):
+                    result['职业资格证书'][idx][cetnormal[key]] = result['职业资格证书'][idx][key]
+                    result['职业资格证书'][idx].pop(key)
 
-    for idx in range(len(result['学习经历'])):
-        for key in edunormal.keys():
-            if result['学习经历'][idx].get(key):
-                result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
-                result['学习经历'][idx].pop(key)
+        for idx in range(len(result['家庭成员'])):
+            for key in family.keys():
+                if result['家庭成员'][idx].get(key):
+                    result['家庭成员'][idx][family[key]] = result['家庭成员'][idx][key]
+                    result['家庭成员'][idx].pop(key)
 
-    for idx in range(len(result['工作经历'])):
-        for key in jobnormal.keys():
-            if result['工作经历'][idx].get(key):
-                result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
-                result['工作经历'][idx].pop(key)
+        # Section name -> name used in the returned dict.
+        tit = {
+            "基本信息":"base",
+            "职业发展管理":"intent_job",
+            "学习经历":"tal_training_experience",
+            "工作经历":"tal_his_job",
+            "项目经历":"tal_his_project",
+            "培训经历":"tal_training_experience",
+            "获奖情况":"tal_reward_punishment",
+            "语言能力":"tal_language",
+            "职业资格证书":"tal_vocational_qualification_certificate",
+            "专业技能":"tal_professional_tech_certificate",
+            "家庭成员":"tal_family_social_relation"
+        }
 
-    for idx in range(len(result['职业资格证书'])):
-        for key in cetnormal.keys():
-            if result['职业资格证书'][idx].get(key):
-                result['职业资格证书'][idx][cetnormal[key]] = result['职业资格证书'][idx][key]
-                result['职业资格证书'][idx].pop(key)
+        # Only sections with a truthy value are renamed.
+        for key in tit.keys():
+            if result.get(key):
+                result[tit[key]] = result[key]
+                result.pop(key)
 
-    for idx in range(len(result['家庭成员'])):
-        for key in family.keys():
-            if result['家庭成员'][idx].get(key):
-                result['家庭成员'][idx][family[key]] = result['家庭成员'][idx][key]
-                result['家庭成员'][idx].pop(key)
+        return result
 
-    tit = {
-        "基本信息":"base",
-        "职业发展管理":"intent_job",
-        "学习经历":"tal_training_institutions",
-        "工作经历":"tal_his_job",
-        "项目经历":"tal_his_project",
-        "培训经历":"tal_training_institutions",
-        "获奖情况":"tal_rewards_punishments",
-        "语言能力":"tal_language",
-        "职业资格证书":"tal_vocational_qualification_certificate",
-        "专业技能":"tal_professional_tech_certificate",
-        "家庭成员":"tal_family_social_relations"
-    }
+    # Push the result to the backend
+    def push_back(self, result):
+        """POST ``result`` as JSON to the resume endpoint and print the reply.
+
+        The payload is ``{"ResumeData": result}``. Any exception raised while
+        posting is caught and printed, so this method does not propagate
+        request errors to the caller.
+
+        :param result: JSON-serialisable data to send.
+        """
+        url = "http://192.168.1.110:9999/talent/getResumeData"
+        session = requests.Session()
+        # Retry failed http:// connections up to 3 times.
+        session.mount('http://', HTTPAdapter(max_retries = 3))
+        try:
+            # NOTE(review): 'contentType' is sent as a custom header name; it is not
+            # the standard 'Content-Type' header - confirm the backend expects it.
+            headers = {
+                'contentType':'Application/json'
+            }
+            response = session.post(url=url, headers=headers, json={"ResumeData": result}, timeout=10)
+            print(response.text)
+        except Exception as e:
+            print(e)
 
-    for key in tit.keys():
-        if result.get(key):
-            result[tit[key]] = result[key]
-            result.pop(key)
+    def predict(self, path):
+        """Parse a resume file, push the formatted result and print it.
+
+        ``.docx`` files go through ``parse_word_layout`` and ``.pdf`` files
+        through ``parse_pdf_layout``; any other extension is ignored. The
+        file is parsed and formatted once, and that same result is both
+        pushed and printed.
+
+        :param path: path of the resume file.
+        """
+        if path.endswith(".docx"):
+            parsed = self.parse_word_layout(path)
+        elif path.endswith(".pdf"):
+            parsed = self.parse_pdf_layout(path)
+        else:
+            return
+        result = self.formatter(parsed)
+        self.push_back(result)
+        print(result)
 
-    # url = "http://192.168.1.110:9999/talent/getResumeData"
-    # session = requests.Session()
-    # session.mount('http://', HTTPAdapter(max_retries = 3))
-    # try:
-    #     headers = {
-    #         'contentType':'Application/json'
-    #     }
-    #     response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)
-    #     print(response.text)
-    # except Exception as e:
-    #     print(e)
 
-    return result
 
 if __name__ == "__main__":
-    if path.endswith(".docx"):
-        pprint(formatter(parse_layout(path)))
-    else:
-        pprint(formatter(parse_pdf_layout(path)))
-
+    # Instantiate the parser class defined in this module (Custom).
+    i = Custom()
+    i.predict(path)

+ 24 - 19
tools/resources/translate.json

@@ -21,8 +21,9 @@
         "意向岗位":"intent_job",
         "期望职业":"intent_job",
         "目前年薪":"current_salary_yearl",
-        "政治面貌(加入时间)":"politics",
         "政治面貌":"politics",
+        "政治面貌(加入时间)":"politics",
+        "政治面貌(加入时间)":"politics",
         "熟悉专业有何专长":"skills"
     },
     "tal_his_edu":{
@@ -46,6 +47,7 @@
     },
     "tal_his_project":{
         "项目名":"project_name",
+        "项目名称":"project_name",
         "公司名":"company_name",
         "公司名称":"company_name",
         "职位":"project_office",
@@ -59,42 +61,45 @@
         "熟练度":"proficiency"
     },
     "tal_vocational_qualification_certificate":{
-        "证书名称":"vocational_qualification_certificate_name",
-        "名称":"vocational_qualification_certificate_name",
-        "证书":"vocational_qualification_certificate_name",
-        "职业证书":"vocational_qualification_certificate_name",
-        "获得时间":"vocational_certificate_obtaining_time",
-        "获得日期":"vocational_certificate_obtaining_time",
-        "取得日期":"vocational_certificate_obtaining_time"
+        "证书名称":"certificate_name",
+        "名称":"certificate_name",
+        "证书":"certificate_name",
+        "职业证书":"certificate_name",
+        "获得时间":"obtain_time",
+        "获得日期":"obtain_time",
+        "取得日期":"obtain_time"
     },
     "tal_professional_tech_certificate":{
-        "技术资格证明":"professional_tech_certificate_name",
-        "获得时间":"professional_certificate_obtaining_time"
+        "技术资格证明":"certificate_name",
+        "获得时间":"obtain_time"
     },
-    "tal_training_institutions":{
-        "学校/培训机构":"school_training_institutions",
+    "tal_training_experience":{
+        "学校/培训机构":"institution_name",
+        "机构":"institution_name",
+        "cultivate_name":"institution_name",
+        "内容":"institution_name",
         "cultivate_time_beg":"start_time",
         "cultivate_time_end":"end_time",
-        "cultivate_name":"school_training_institutions",
         "专业":"major",
+        "培训类型":"major",
         "开始时间":"start_time",
         "起始时间":"start_time",
         "结束时间":"end_time",
         "毕业时间":"end_time"
     },
-    "tal_rewards_punishments":{
+    "tal_reward_punishment":{
         "项目名称":"name",
         "奖项":"name",
-        "项目单位":"rewards_punishments_unit",
-        "时间":"rewards_punishments_time"
+        "项目单位":"unit",
+        "时间":"obtain_time"
     },
-    "tal_family_social_relations":{
+    "tal_family_social_relation":{
         "称谓":"appellation",
         "与本人关系":"appellation",
         "姓名":"name",
-        "出生年月":"birth_time",
+        "出生年月":"birth_date",
         "政治面貌":"politics",
-        "工作单位":"work_units",
+        "工作单位":"work_unit",
         "职务":"position",
         "职业":"position",
         "工作单位及职务":"position"

+ 113 - 71
tools/resume_parse.py

@@ -38,7 +38,7 @@ from rich.console import Console
 console = Console()
 
 
-global ner, ner_tag, base_info_ie, prize_ie, cet_ie, pro_ie, block, block_rev
+global ner, ner_tag, base_info_ie, prize_ie, cet_ie, pro_ie, block, block_rev, translate
 
 
 if not locals().get("ner"):
@@ -52,7 +52,7 @@ if not locals().get("prize_ie"):
 if not locals().get("cet_ie"):
     cet_ie = Taskflow('information_extraction', schema=["时间","证书"], model="uie-nano")
 if not locals().get("pro_ie"):
-    pro_ie = Taskflow("information_extraction", schema=["时间","项目名称","机构","职位"], task_path='./resources/model_100')
+    pro_ie = Taskflow("information_extraction", schema=["时间","项目名称","机构","职位"], task_path='./resources/model_best')
 
 if not locals().get("block"):
     with open("resources/SegmentName.json", "r", encoding="utf-8") as fp:
@@ -61,6 +61,10 @@ if not locals().get("block"):
 if not locals().get("block_rev"):
     block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经历", 5:"项目经历", 6:"专业技能", 7:"intro", 8:"兴趣爱好", 9:"语言能力", 10:"证书", 11:"获奖情况", 12:"培训经历", 13:"家庭成员", "other":"其他"}
 
+if not locals().get("translate"):
+    with open("./resources/translate.json", "r", encoding="utf-8") as ft:
+        translate = json.load(ft)
+
 
 if not os.path.exists("./uploads"):
     os.mkdir("./uploads")
@@ -1448,6 +1452,112 @@ def decode_path(path):
     return path_name
 
 
+# Rename parsed-resume fields and sections
+def _rename_fields(record, mapping):
+    """Rename the keys of ``record`` in place according to ``mapping``.
+
+    Only keys whose current value is truthy are renamed; keys with empty
+    values are left under their original name.
+    """
+    for source, target in mapping.items():
+        if record.get(source):
+            record[target] = record[source]
+            del record[source]
+
+
+def formatter(result, json_obj):
+    """Rename the fields and sections of a parsed resume.
+
+    Fields found in the "基本信息" and "求职意向" sections are lifted to the
+    top level under the names given by ``json_obj["base"]``, and both
+    sections are then removed. Every record of the list-valued sections has
+    its keys renamed using the matching table of ``json_obj``, and finally
+    the non-empty sections themselves are renamed.
+
+    :param result: parsed resume dict; it is modified in place.
+    :param json_obj: rename tables, keyed by table name.
+    :return: the same ``result`` dict.
+    """
+    base = json_obj["base"]
+    for key, target in base.items():
+        # Same visiting order as before: for one key, "求职意向" is read after
+        # "基本信息", so its value wins when both sections carry the key.
+        for section in ("基本信息", "求职意向"):
+            fields = result.get(section)
+            if fields and fields.get(key):
+                result[target] = fields[key]
+                del fields[key]
+    # A resume may lack either section; pop() avoids the KeyError that an
+    # unconditional del raises in that case.
+    result.pop("基本信息", None)
+    result.pop("求职意向", None)
+
+    # Section name -> name of its rename table in json_obj.
+    list_sections = (
+        ("教育经历", "tal_his_edu"),
+        ("工作经历", "tal_his_job"),
+        ("项目经历", "tal_his_project"),
+        ("培训经历", "tal_training_experience"),
+        ("语言能力", "tal_language"),
+        ("证书", "tal_vocational_qualification_certificate"),
+        ("获奖情况", "tal_reward_punishment"),
+        ("家庭成员", "tal_family_social_relation"),
+    )
+    for section, table in list_sections:
+        records = result.get(section)
+        if records:
+            mapping = json_obj[table]
+            for record in records:
+                _rename_fields(record, mapping)
+
+    # Section name -> name used in the returned dict. "基本信息" and
+    # "求职意向" were removed above, so their entries never match here.
+    tit = {
+        "基本信息":"base",
+        "求职意向":"intent_job",
+        "教育经历":"tal_his_edu",
+        "工作经历":"tal_his_job",
+        "项目经历":"tal_his_project",
+        "培训经历":"tal_training_experience",
+        "获奖情况":"tal_reward_punishment",
+        "语言能力":"tal_language",
+        "证书":"tal_vocational_qualification_certificate",
+        "专业技能":"tal_professional_tech_certificate",
+        "家庭成员":"tal_family_social_relation",
+        "其他情况说明":"intro"
+    }
+
+    # Only sections with a truthy value are renamed.
+    _rename_fields(result, tit)
+
+    return result
+
 # 结果返回
 def push_back(tempdir):
     for file in os.listdir('./result/' + tempdir):
@@ -1455,75 +1565,7 @@ def push_back(tempdir):
         with open(filename, "r", encoding="utf-8") as ff:
             rst = json.load(ff)
 
-        with open("./resources/translate.json", "r", encoding="utf-8") as ft:
-            json_obj = json.load(ft)
-
-        for key in json_obj["base"].keys():
-            if rst.get("基本信息"):
-                if rst["基本信息"].get(key):
-                    rst[json_obj["base"][key]] = rst["基本信息"][key]
-                    del rst["基本信息"][key]
-            if rst.get("求职意向"):
-                if rst["求职意向"].get(key):
-                    rst[json_obj["base"][key]] = rst["求职意向"][key]
-                    del rst["求职意向"][key]
-
-        del rst["基本信息"]
-        del rst["求职意向"]
-        
-        for key in json_obj["tal_his_project"].keys():
-            if rst.get("项目经历"):
-                for idx in range(len(rst["项目经历"])):
-                    if rst["项目经历"][idx].get(key):
-                        rst["项目经历"][idx][json_obj["tal_his_project"][key]] = rst["项目经历"][idx][key]
-                        del rst["项目经历"][idx][key]
-
-        for key in json_obj["tal_training_institutions"].keys():
-            if rst.get("培训经历"):
-                for idx in range(len(rst["培训经历"])):
-                    if rst["培训经历"][idx].get(key):
-                        rst["培训经历"][idx][json_obj["tal_training_institutions"][key]] = rst["培训经历"][idx][key]
-                        del rst["培训经历"][idx][key]
-
-        for key in json_obj["tal_vocational_qualification_certificate"].keys():
-            if rst.get("证书"):
-                for idx in range(len(rst["证书"])):
-                    if rst["证书"][idx].get(key):
-                        rst["证书"][idx][json_obj["tal_vocational_qualification_certificate"][key]] = rst["证书"][idx][key]
-                        del rst["证书"][idx][key]
-        
-        for key in json_obj["tal_language"].keys():
-            if rst.get("语言能力"):
-                for idx in range(len(rst["语言能力"])):
-                    if rst["语言能力"][idx].get(key):
-                        rst["语言能力"][idx][json_obj["tal_language"][key]] = rst["语言能力"][idx][key]
-                        del rst["语言能力"][idx][key]
-
-        for key in json_obj["tal_rewards_punishments"].keys():
-            if rst.get("获奖情况"):
-                for idx in range(len(rst["获奖情况"])):
-                    if rst["获奖情况"][idx].get(key):
-                        rst["获奖情况"][idx][json_obj["tal_rewards_punishments"][key]] = rst["获奖情况"][idx][key]
-                        del rst["获奖情况"][idx][key]
-
-        tit = {
-            "基本信息":"base",
-            "求职意向":"intent_job",
-            "教育经历":"tal_his_edu",
-            "工作经历":"tal_his_job",
-            "项目经历":"tal_his_project",
-            "培训经历":"tal_training_institutions",
-            "获奖情况":"tal_rewards_punishments",
-            "语言能力":"tal_language",
-            "证书":"tal_vocational_qualification_certificate",
-            "专业技能":"tal_professional_tech_certificate",
-            "家庭成员":"tal_family_social_relations"
-        }
-
-        for key in tit.keys():
-            if rst.get(key):
-                rst[tit[key]] = rst[key]
-                rst.pop(key)
+        rst = formatter(rst, translate)
 
         url = "http://192.168.1.110:9999/talent/getResumeData"
         session = requests.Session()

+ 276 - 252
tools/srafa.py

@@ -2,287 +2,311 @@
 # @Author: privacy
 # @Date:   2022-07-07 12:59:42
 # @Last Modified by:   privacy
-# @Last Modified time: 2022-07-16 11:41:09
+# @Last Modified time: 2022-07-18 13:50:02
 # import pdb
-from pprint import pprint
+
 import json
-import pandas as pd
+
+import requests
+from requests.adapters import HTTPAdapter
+
 import pdfplumber
-import docx
 from docx import Document
-from docx.shared import Inches
 
-path = "d:\\desktop\\社招简历模板.docx"
-# path = "d:\\desktop\\社招简历模板.pdf"
 
-keywords = ['姓名',
-    '性别',
-    '出生日期',
-    '一寸照片',
-    '民族',
-    '出生地',
-    '政治面貌(加入时间)',
-    '参加工作时间',
-    '健康状况',
-    '外语水平',
-    '初始学历、专业',
-    '最高学历、专业',
-    '初始学历毕业院校及毕业时间',
-    '最高学历毕业院校及毕业时间',
-    '专业技术资格(取得时间)',
-    '职业技能等级(取得时间)',
-    '熟悉专业有何专长',
-    '工作单位',
-    '现任职务',
-    '任职时间',
-    '提职时间',
-    '意向岗位',
-    '联系电话',
-    '学习经历',
-    '起止时间',
-    '学校',
-    '专业',
-    '学历',
-    '学位',
-    '研究方向',
-    '是否全日制',
-    '培训',
-    '起止时间',
-    '培训类型',
-    '机构',
-    '内容',
-    '成绩',
-    '证书名称',
-    '经历',
-    '工作经历',
-    '起止时间',
-    '工作单位',
-    '职务',
-    '部门',
-    '证明人',
-    '备注',
-    '对报名岗位认识及工作设想',
-    '自我评价及主要工作业绩',
-    '获得职业资格证书情况',
-    '获得日期',
-    '名称',
-    '证书编码/文号',
-    '授予单位',
-    '备注',
-    '奖惩',
-    '项目',
-    '时间',
-    '项目单位',
-    '证明材料',
-    '情况',
-    '主要家庭成员及社会关系',
-    '称谓',
-    '出生年月',
-    '政治面貌',
-    '工作单位及职务',
-    '其他情况说明',
-    '诚信承诺',
-    '本人承诺,以上信息均与事实相符,若有虚假,愿承担一切后果并自愿取消应聘资格。'
-    '承诺人:'
-    '社会招聘工作办公室资格审查意见']
+path = "d:\\desktop\\社招简历模板.docx"
 
-def parse_line(line):
-    result = []
-    key = None
-    for cell in line:
-        if cell and ''.join(cell.split()) in keywords:
-            key = ''.join(cell.split())
-        elif cell and key:
-            schema = {key:cell}
-            result.append(schema)
-            key = None
-    return result
+class Social(object):
+    """docstring for Social"""
+    def __init__(self):
+        """Set up the keyword list and load the field rename tables."""
+        super(Social, self).__init__()
+        # Cell texts treated as field labels. Cells are compared against this
+        # list after all whitespace has been removed from them.
+        self.keywords = [
+            '姓名',
+            '性别',
+            '出生日期',
+            '一寸照片',
+            '民族',
+            '出生地',
+            '政治面貌(加入时间)',
+            '参加工作时间',
+            '健康状况',
+            '外语水平',
+            '初始学历、专业',
+            '最高学历、专业',
+            '初始学历毕业院校及毕业时间',
+            '最高学历毕业院校及毕业时间',
+            '专业技术资格(取得时间)',
+            '职业技能等级(取得时间)',
+            '熟悉专业有何专长',
+            '工作单位',
+            '现任职务',
+            '任职时间',
+            '提职时间',
+            '意向岗位',
+            '联系电话',
+            '学习经历',
+            '起止时间',
+            '学校',
+            '专业',
+            '学历',
+            '学位',
+            '研究方向',
+            '是否全日制',
+            '培训经历',
+            '培训类型',
+            '机构',
+            '内容',
+            '成绩',
+            '证书名称',
+            '工作经历',
+            '职务',
+            '部门',
+            '证明人',
+            '备注',
+            '对报名岗位认识及工作设想',
+            '自我评价及主要工作业绩',
+            '获得职业资格证书情况',
+            '获得日期',
+            '名称',
+            '证书编码/文号',
+            '授予单位',
+            '奖惩情况',
+            '项目',
+            '时间',
+            '项目单位',
+            '证明材料',
+            '主要家庭成员及社会关系',
+            '称谓',
+            '出生年月',
+            '政治面貌',
+            '工作单位及职务',
+            '其他情况说明',
+            '诚信承诺',
+            '社会招聘工作办公室资格审查意见'
+        ]
+        # Rename tables read from ./resources/translate.json.
+        self.json_obj = self.get_translate()
 
+    def get_translate(self):
+        """Load and return the database field-name rename tables.
+
+        The tables are read from ``./resources/translate.json`` (UTF-8),
+        resolved against the current working directory.
+        """
+        with open("./resources/translate.json", "r", encoding="utf-8") as handle:
+            return json.load(handle)
 
-def parse_word_layout(path):
-    result = []
-    doc = Document(path)
-    lo = {}
-    for _table in doc.tables[:]:
-        for i, row in enumerate(_table.rows[:]):
-            row_content = []
-            for cell in row.cells[:]:
-                c = cell.text
-                if c not in row_content:
-                    row_content.append(c)
-            lo[len(lo.keys())] = row_content
+    def parse_line(self, line):
+        """Pair each keyword cell of a row with the next non-empty value cell.
+
+        A cell counts as a keyword when its text, with all whitespace
+        removed, is in ``self.keywords``. Empty cells are skipped, and a
+        value cell that is not preceded by a keyword is ignored.
+
+        :param line: list of cell texts (empty cells may be falsy).
+        :return: list of single-entry dicts ``{keyword: value}``.
+        """
+        pairs = []
+        pending = None  # keyword still waiting for its value
+        for cell in line:
+            if not cell:
+                continue
+            label = ''.join(cell.split())
+            if label in self.keywords:
+                pending = label
+            elif pending:
+                pairs.append({pending: cell})
+                pending = None
+        return pairs
+    
+    # Parse a Word resume
+    def parse_word_layout(self, path):
+        """Extract resume fields from the tables of a .docx file.
+
+        Each table row is reduced to its distinct cell texts. A row whose
+        non-empty cells are all keywords is remembered as the header row if
+        it has more than two cells; a row in which more than a third of the
+        cells are keywords is handed to ``self.parse_line``; any other row
+        is zipped against the remembered header row when both have the same
+        length, and dropped otherwise.
+
+        :param path: path of the .docx file, passed to ``Document``.
+        :return: list of dicts, one per parsed field or record.
+        """
+        result = []
+        doc = Document(path)
+        lo = {}  # running row index -> list of distinct cell texts of that row
+        for _table in doc.tables[:]:
+            for i, row in enumerate(_table.rows[:]):
+                row_content = []
+                for cell in row.cells[:]:
+                    c = cell.text
+                    # Keep each text once per row (presumably to collapse merged
+                    # cells, which repeat their text - confirm).
+                    if c not in row_content:
+                        row_content.append(c)
+                lo[len(lo.keys())] = row_content
 
-    kwln = -1# 关键词行长度
-    kwline = None# 关键词行
-    for key in lo.keys():
-        for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
-            if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
-                perc = 0# 行内关键词数量
-                for c in lo[key]:
-                    if c and (''.join(c.split()) in keywords):# 找到此行有关键词
-                        perc += 1
-                    if c and (''.join(c.split()) in keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3,判断此行非关键词行元素
-                        perc = 0# 清空行内关键词数
-                        result.extend(parse_line(lo[key]))# 添加并解析普通行级元素
-                        break
-                else:# 关键词行元素
-                    if len(kwline) != len(lo[key]):
+        kwln = -1# length of the keyword (header) row
+        kwline = None# the keyword (header) row
+        for key in lo.keys():
+            for val in lo[key]:# a row whose cells are all keywords is a header row
+                if val and ''.join(val.split()) not in self.keywords:# a non-keyword cell exists, so this is not a header row; decide whether it is a data row of the header row
+                    perc = 0# number of keywords in this row
+                    for c in lo[key]:
+                        if c and (''.join(c.split()) in self.keywords):# this cell is a keyword
+                            perc += 1
+                        if c and (''.join(c.split()) in self.keywords) and (perc > len(lo[key])/3):# more than 1/3 of the cells are keywords, so this is not a data row of the header row
+                            perc = 0# reset the keyword count
+                            result.extend(self.parse_line(lo[key]))# parse as an ordinary inline key/value row and add it
+                            break
+                    else:# data row of the header row
+                        # NOTE(review): kwline is still None if no header row was seen
+                        # before this row, in which case len(kwline) raises TypeError.
+                        if len(kwline) != len(lo[key]):
+                            break
+                        schema = dict()
+                        for key, val in zip(kwline, lo[key]):# merge the header row with the row cells
+                            if key:
+                                schema[key] = val
+                        result.append(schema)
                         break
-                    schema = dict()
-                    for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素
-                        if key:
-                            schema[key] = val
-                    result.append(schema)
                     break
-                break
-        else:
-            # print("{}:此行为关键词行!".format(lo[key]))
-            if len(lo[key])>2:
-                try:
-                    kwline = [''.join(cell.split()) for cell in lo[key]]
-                except Exception as e:
-                    kwline = lo[key]
-                kwln = len(lo[key])
-    return result
+            else:
+                # print("{}: this row is a keyword row!".format(lo[key]))
+                # Only rows with more than two cells replace the header row.
+                if len(lo[key])>2:
+                    try:
+                        kwline = [''.join(cell.split()) for cell in lo[key]]
+                    except Exception as e:
+                        # Fall back to the raw cells if whitespace removal fails.
+                        kwline = lo[key]
+                    kwln = len(lo[key])
+        return result
+    
+    # 解析pdf
+    def parse_pdf_layout(self, path):
+        result = []
+        lo = {}
+        with pdfplumber.open(path) as pdf:
+                for page in pdf.pages:
+                    for table in page.extract_tables():
+                        for line in table:
+                            # lo[len(lo.keys())] = [cell for cell in line if cell]
+                            lo[len(lo.keys())] = line
 
-def parse_pdf_layout(path):
-    result = []
-    lo = {}
-    with pdfplumber.open(path) as pdf:
-            for page in pdf.pages:
-                for table in page.extract_tables():
-                    for line in table:
-                        # lo[len(lo.keys())] = [cell for cell in line if cell]
-                        lo[len(lo.keys())] = line
-
-    kwln = -1
-    kwline = None
-    for key in lo.keys():
-        # pdb.set_trace()
-        for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
-            if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
-                # pdb.set_trace()
-                for c in lo[key] or len(lo[key])!=kwln:
+        kwln = -1
+        kwline = None
+        for key in lo.keys():
+            # pdb.set_trace()
+            for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
+                if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
                     # pdb.set_trace()
-                    if c and ''.join(c.split()) in keywords:# 非关键词行元素
-                        result.extend(parse_line(lo[key]))
+                    for c in lo[key] or len(lo[key])!=kwln:
+                        # pdb.set_trace()
+                        if c and ''.join(c.split()) in self.keywords:# 非关键词行元素
+                            result.extend(self.parse_line(lo[key]))
+                            break
+                    else:# 关键词行元素
+                        schema = dict()
+                        for key, val in zip(kwline, lo[key]):
+                            if key:
+                                schema[key] = val if val else key
+                        result.append(schema)
                         break
-                else:# 关键词行元素
-                    schema = dict()
-                    for key, val in zip(kwline, lo[key]):
-                        if key:
-                            schema[key] = val if val else key
-                    result.append(schema)
                     break
-                break
-        else:
-            # print("此行为关键词行")
-            # kwline = lo[key]
-            kwline = []
-            for cell in lo[key]:
-                if cell:
-                    kwline.append(''.join(cell.split()))
-                else:
-                    kwline.append(cell)
-            kwln = len(lo[key])
-    return result
-
-# 格式化数据
-def formatter(datalist):
-    result = dict()
-
-    for d in datalist:
-        if len(d) == 1:
-            for key in d.keys():
-                result[key] = d[key]
-        else:
-            for k in list(d.keys()):
-                if k == "".join(d[k].split()):
-                    d.pop(k)
-                    if result.get(k):
-                        result[k].append(d)
+            else:
+                kwline = []
+                for cell in lo[key]:
+                    if cell:
+                        kwline.append(''.join(cell.split()))
                     else:
-                        result[k] = [d]
+                        kwline.append(cell)
+                kwln = len(lo[key])
+        return result
+    
+    # 格式化数据
+    def formatter(self, datalist):
+        result = dict()
+        for d in datalist:
+            if len(d) == 1:
+                for key in d.keys():
+                    result[key] = d[key]
+            else:
+                for k in list(d.keys()):
+                    if k == "".join(d[k].split()):
+                        d.pop(k)
+                        if result.get(k):
+                            result[k].append(d)
+                        else:
+                            result[k] = [d]
 
+        normal = self.json_obj["base"]
+        itenormal = self.json_obj["base"]
+        edunormal = self.json_obj["tal_his_edu"]
+        jobnormal = self.json_obj["tal_his_job"]
+        tranornal = self.json_obj["tal_training_experience"]
+        cetnormal = self.json_obj["tal_vocational_qualification_certificate"]
+        rewnormal = self.json_obj["tal_reward_punishment"]
+        family = self.json_obj["tal_family_social_relation"]
 
-    # 转译数据库字段名
-    with open("./resources/translate.json", "r", encoding="utf-8") as ff:
-        json_obj = json.load(ff)
+        for key in normal.keys():
+            if result.get(key):
+                result[normal[key]] = result[key]
+                result.pop(key)
 
-    normal = json_obj["base"]
-    itenormal = json_obj["base"]
-    edunormal = json_obj["tal_his_edu"]
-    jobnormal = json_obj["tal_his_job"]
-    cetnormal = json_obj["tal_vocational_qualification_certificate"]
-    family = json_obj["tal_family_social_relations"]
+        for idx in range(len(result['学习经历'])):
+            for key in edunormal.keys():
+                if result['学习经历'][idx].get(key):
+                    result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
+                    result['学习经历'][idx].pop(key)
 
-    for key in normal.keys():
-        if result.get(key):
-            result[normal[key]] = result[key]
-            result.pop(key)
+        for idx in range(len(result['工作经历'])):
+            for key in jobnormal.keys():
+                if result['工作经历'][idx].get(key):
+                    result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
+                    result['工作经历'][idx].pop(key)
 
-    for idx in range(len(result['学习经历'])):
-        for key in edunormal.keys():
-            if result['学习经历'][idx].get(key):
-                result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
-                result['学习经历'][idx].pop(key)
+        for idx in range(len(result['培训经历'])):
+            for key in tranornal.keys():
+                if result['培训经历'][idx].get(key):
+                    result['培训经历'][idx][tranornal[key]] = result['培训经历'][idx][key]
+                    result['培训经历'][idx].pop(key)
 
-    for idx in range(len(result['工作经历'])):
-        for key in jobnormal.keys():
-            if result['工作经历'][idx].get(key):
-                result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
-                result['工作经历'][idx].pop(key)
+        for idx in range(len(result['获得职业资格证书情况'])):
+            for key in cetnormal.keys():
+                if result['获得职业资格证书情况'][idx].get(key):
+                    result['获得职业资格证书情况'][idx][cetnormal[key]] = result['获得职业资格证书情况'][idx][key]
+                    result['获得职业资格证书情况'][idx].pop(key)
 
-    for idx in range(len(result['获得职业资格证书情况'])):
-        for key in cetnormal.keys():
-            if result['获得职业资格证书情况'][idx].get(key):
-                result['获得职业资格证书情况'][idx][cetnormal[key]] = result['获得职业资格证书情况'][idx][key]
-                result['获得职业资格证书情况'][idx].pop(key)
+        for idx in range(len(result['奖惩情况'])):
+            for key in rewnormal.keys():
+                if result['奖惩情况'][idx].get(key):
+                    result['奖惩情况'][idx][rewnormal[key]] = result['奖惩情况'][idx][key]
+                    result['奖惩情况'][idx].pop(key)
 
-    for idx in range(len(result['主要家庭成员及社会关系'])):
-        for key in family.keys():
-            if result['主要家庭成员及社会关系'][idx].get(key):
-                result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
-                result['主要家庭成员及社会关系'][idx].pop(key)
+        for idx in range(len(result['主要家庭成员及社会关系'])):
+            for key in family.keys():
+                if result['主要家庭成员及社会关系'][idx].get(key):
+                    result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
+                    result['主要家庭成员及社会关系'][idx].pop(key)
 
-    tit = {
-        "基本信息":"base",
-        "职业发展管理":"intent_job",
-        "学习经历":"tal_his_edu",
-        "工作经历":"tal_his_job",
-        "项目经历":"tal_his_project",
-        "培训经历":"tal_training_institutions",
-        "奖情况":"tal_rewards_punishments",
-        "语言能力":"tal_language",
-        "获得职业资格证书情况":"tal_vocational_qualification_certificate",
-        "专业技能":"tal_professional_tech_certificate",
-        "主要家庭成员及社会关系":"tal_family_social_relations",
-        "其他情况说明":"intro"
-    }
+        tit = {
+            "基本信息":"base",
+            "职业发展管理":"intent_job",
+            "学习经历":"tal_his_edu",
+            "工作经历":"tal_his_job",
+            "项目经历":"tal_his_project",
+            "培训经历":"tal_training_experience",
+            "奖情况":"tal_reward_punishment",
+            "语言能力":"tal_language",
+            "获得职业资格证书情况":"tal_vocational_qualification_certificate",
+            "专业技能":"tal_professional_tech_certificate",
+            "主要家庭成员及社会关系":"tal_family_social_relation",
+            "其他情况说明":"intro"
+        }
 
-    for key in tit.keys():
-        if result.get(key):
-            result[tit[key]] = result[key]
-            result.pop(key)
+        for key in tit.keys():
+            if result.get(key):
+                result[tit[key]] = result[key]
+                result.pop(key)
+        return result
+    
+    # Push the parsed resume to the backend service
+    def push_back(self, result):
+        """Send the parsed resume to the talent backend, best effort.
+
+        The payload is posted as JSON under the ``ResumeData`` key. The
+        response body is printed on success; any error raised while posting
+        is printed instead of propagated, so a failed push never aborts the
+        caller.
+
+        Args:
+            result: JSON-serializable resume data to upload.
+        """
+        url = "http://192.168.1.110:9999/talent/getResumeData"
+        # ``json=`` already makes requests send ``Content-Type: application/json``;
+        # the header is spelled correctly here because ``contentType`` is not an HTTP header.
+        headers = {"Content-Type": "application/json"}
+        # The context manager closes the session (and its pooled connections) on exit.
+        with requests.Session() as session:
+            # Allow up to three retries for plain-http connections.
+            session.mount('http://', HTTPAdapter(max_retries=3))
+            try:
+                response = session.post(url=url, headers=headers, json={"ResumeData": result}, timeout=10)
+                print(response.text)
+            except Exception as e:
+                # Deliberate best-effort boundary: report the failure and carry on.
+                print(e)
 
-    # url = "http://192.168.1.110:9999/talent/getResumeData"
-    # session = requests.Session()
-    # session.mount('http://', HTTPAdapter(max_retries = 3))
-    # try:
-    #     headers = {
-    #         'contentType':'Application/json'
-    #     }
-    #     response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)
-    #     print(response.text)
-    # except Exception as e:
-    #     print(e)
+    def predict(self, path):
+        """Parse one resume file, push the result to the backend and print it.
+
+        ``.docx`` files are read with ``parse_word_layout`` and ``.pdf`` files
+        with ``parse_pdf_layout``; any other extension is ignored.
+
+        Args:
+            path: Path of the resume file to process.
+        """
+        if path.endswith(".docx"):
+            rows = self.parse_word_layout(path)
+        elif path.endswith(".pdf"):
+            rows = self.parse_pdf_layout(path)
+        else:
+            return
+        # Parse and format once; the file used to be processed a second time only to print it.
+        result = self.formatter(rows)
+        self.push_back(result)
+        print(result)
 
-    return result
 
-if __name__ == '__main__':
-    if path.endswith(".pdf"):
-        pprint(formatter(parse_pdf_layout(path)))
-    else:
-        pprint(formatter(parse_word_layout(path)))
 
+if __name__ == '__main__':
+    # Script entry point: run the whole pipeline on the module-level ``path``.
+    Social().predict(path)
+