@@ -8,6 +8,7 @@ import sys
import re
import json
import time
+import platform
from os import walk
import subprocess
import rarfile
@@ -33,62 +34,50 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
import pdfplumber
from paddlenlp import Taskflow

-class Logger:
-    def __init__(self, name: str, console_handler_level: str = logging.INFO, fmt: str = '%(asctime)s: %(name)s: %(levelname)s: %(filename)s: %(funcName)s: %(message)s'):
-        self.logger = logging.getLogger(name)
-        self.logger.setLevel(logging.INFO)
-        self.fmt = logging.Formatter(fmt)
-        self.set_console_handler(console_handler_level)
-
-    def set_console_handler(self, console_handler_level: str = logging.INFO) -> None:
-        ch = logging.StreamHandler()
-        ch.setLevel(console_handler_level)
-        ch.setFormatter(self.fmt)
-        self.logger.addHandler(ch)
+from logger import Logger
+logger = Logger("resume_parse")
+logger.set_file_handler(filename='journal.log')

-    def set_file_handler(self, filename: str, mode: str = "a", file_handler_level: str = logging.WARNING) -> None:
-        fh = logging.FileHandler(filename, mode=mode, encoding='utf-8')
-        fh.setLevel(file_handler_level)
-        fh.setFormatter(self.fmt)
-        self.logger.addHandler(fh)
-
-    def debug(self, msg):
-        self.logger.debug(msg)
+from rich.console import Console
+console = Console()

-    def info(self, msg):
-        self.logger.info(msg)
-
-    def warning(self, msg):
-        self.logger.warning(msg)
+global ner, ner_tag, base_info_ie, prize_ie, cet_ie, pro_ie, block, block_rev

-    def error(self, msg):
-        self.logger.error(msg)
-
-    def critical(self, msg):
-        self.logger.critical(msg)
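+# NOTE: building a Taskflow pipeline is expensive, so each one is created only
+# if its name is not already bound in this interpreter (e.g. across reloads).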
+if not locals().get("ner"):
+    ner = Taskflow("ner", mode='fast')
+if not locals().get("ner_tag"):
+    ner_tag = Taskflow("ner")
+if not locals().get("base_info_ie"):
+    base_info_ie = Taskflow('information_extraction', schema=["姓名","性别","婚姻状况","邮箱地址","政治面貌","手机号码","籍贯","出生日期","现任职务","参加工作时间","英语水平","计算机水平","工作年限","当前单位","所在城市","职业资格"])
+if not locals().get("prize_ie"):
+    prize_ie = Taskflow('information_extraction', schema=["时间", "奖项"])
+if not locals().get("cet_ie"):
+    cet_ie = Taskflow('information_extraction', schema=["时间","证书"])
+if not locals().get("pro_ie"):
+    pro_ie = Taskflow("information_extraction", schema=["时间","项目名称","机构","职位"], task_path='./model_100')

-logger = Logger("resume_parse")
-logger.set_file_handler(filename='data.log')

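+# SegmentName.json maps the section headings found in resumes to the numeric
+# block ids that block_rev translates back to canonical section names.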
+if not locals().get("block"):
|
|
|
|
+ with open("resources/SegmentName.json", "r", encoding="utf-8") as fp:
|
|
|
|
+ block = json.load(fp)
|
|
|
|
|
|
-from rich.console import Console
-console = Console()

+if not locals().get("block_rev"):
|
|
|
|
+ block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经历", 5:"项目经历", 6:"专业技能", 7:"intro", 8:"兴趣爱好", 9:"语言能力", 10:"证书", 11:"获奖情况", 12:"培训经历", 13:"家庭成员", "other":"其他"}
|
|
|
|
|
|
import uvicorn
from fastapi import BackgroundTasks, FastAPI, File, UploadFile
app = FastAPI()

-ner = Taskflow("ner", mode='fast')
-ner_tag = Taskflow("ner")
-base_info_ie = Taskflow('information_extraction', schema=["姓名","性别","婚姻状况","电子邮箱","政治面貌","手机号码","籍贯","出生日期","现任职务","参加工作时间","英语水平","计算机水平","工作年限","当前单位","所在城市","职业资格"])
-prize_ie = Taskflow('information_extraction', schema=["时间", "奖项"])
-cet_ie = Taskflow('information_extraction', schema=["时间","证书"])
-pro_ie = Taskflow("information_extraction", schema=["时间","项目名称","机构","职位"], task_path='./model_100')
-global block, block_rev
-
-with open("resources/SegmentName.json", "r", encoding="utf-8") as fp:
-    block = json.load(fp)
-block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经历", 5:"项目经历", 6:"专业技能", 7:"intro", 8:"兴趣爱好", 9:"语言能力", 10:"证书", 11:"获奖情况", 12:"培训经历", 13:"家庭成员", "other":"其他"}

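+# Create the working directories at import time so the upload endpoint and
+# background parsing tasks can rely on them.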
+if not os.path.exists("./uploads"):
+    os.mkdir("./uploads")
+if not os.path.exists("./pdf"):
+    os.mkdir("./pdf")
+if not os.path.exists("./cache"):
+    os.mkdir("./cache")

# Basic information (legacy)
@@ -165,22 +154,6 @@ def get_base_info(lines):
        rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
    elif len(dates) == 3:
        rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
-    normal = {
-        "姓名":"name",
-        "性别":"gender",
-        "电子邮箱":"email",
-        "政治面貌":"politics",
-        "手机号码":"mobile",
-        "籍贯":"birthplace",
-        "出生日期":"birth_time",
-        "现任职务":"current_job",
-        "所在城市":"living_city",
-        "参加工作时间":"work_begin_time",
-    }
-    for key in normal.keys():
-        if rst.get(key):
-            rst[normal[key]] = rst[key]
-            del rst[key]
return {key:rst[key][0]["text"] for key in rst.keys()}
|
|
return {key:rst[key][0]["text"] for key in rst.keys()}
|
|
|
|
|
|
|
|
|
|
@@ -961,19 +934,19 @@ def get_lag_list(lines):
    lan_list = []
    re_lan = re.compile(r'(\w+[语话])')
    re_lev = re.compile(r'([公共级四专八]+)')
-    lag_dict = {'lan_name':'', 'level':""}
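+    # Keys are now the Chinese field names; push_back() later maps them onto
+    # the downstream schema via the "tal_language" table in translate.json.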
+    lag_dict = {'语言':'', '熟练度':""}
    for l in lines:
        if not l.strip():
            continue
        lan_name = re.search(re_lan, l)
        lag_lev = re.search(re_lev, l)
        if lag_lev and lag_lev.group(1):
-            lag_dict["level"] = lag_lev.group(1)
+            lag_dict["熟练度"] = lag_lev.group(1)
        if lan_name and lan_name.group(1):
-            if lag_dict["lan_name"]:
+            if lag_dict["语言"]:
                lan_list.append(lag_dict)
-                lag_dict = {'lan_name':'', 'level':""}
-            lag_dict['lan_name'] = lan_name.group(1)
+                lag_dict = {'语言':'', '熟练度':""}
+            lag_dict['语言'] = lan_name.group(1)
    return lan_list

@@ -1151,7 +1124,7 @@ def parse_txt(path, save_dir):
    page = {1: []}
    if len(data.split("\n")) <= 2:
        for line in data.split("\n"):
-            line = line.replace("\xa0", "").replace("【","").replace("】","").replace("教育/培训","教育经历").strip()
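+            # additionally drop "·" bullet characters before splitting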
+            line = line.replace("\xa0", "").replace("【","").replace("】","").replace("教育/培训","教育经历").replace("·","").strip()
            for word in line.split():
                if word in block.keys():
                    chun = block[word]
@@ -1160,7 +1133,7 @@ def parse_txt(path, save_dir):
                    page[chun].append(word)
    else:
        for line in data.split("\n"):
-            line = line.replace("\xa0", "").replace("【","").replace("】","").replace("教育/培训","教育经历")
+            line = line.replace("\xa0", "").replace("【","").replace("】","").replace("教育/培训","教育经历").replace("·","")
            regex = re.compile(u'[\u3000]+',re.UNICODE)
            line = regex.sub('', line.strip())
            if line in block.keys():
@@ -1169,14 +1142,14 @@ def parse_txt(path, save_dir):
            elif line:
                page[chun].append(line)

-    result_data = []
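+    # Collect results keyed by section name rather than as a list of single-key
+    # dicts, so the saved JSON is a flat object that push_back() can index directly.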
+    result_data = dict()
    for key in page.keys():
        for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
            if key == index:
-                result_data.append({block_rev[index]:func(page[index])})
+                result_data[block_rev[index]] = func(page[index])
    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
    with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
-        json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
+        json.dump(result_data, fp, indent=4, ensure_ascii=False)

# Plain-text Word parsing
@@ -1198,14 +1171,14 @@ def read_from_word(doc, path, save_dir):
            elif line:
                page[chun].append(line)

-    result_data = []
+    result_data = dict()
    for key in page.keys():
        for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
            if key == index:
-                result_data.append({block_rev[index]:func(page[index])})
+                result_data[block_rev[index]] = func(page[index])
    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
    with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
-        json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
+        json.dump(result_data, fp, indent=4, ensure_ascii=False)

# Extract Word tables (done)
@@ -1267,14 +1240,14 @@ def check_word(path, save_dir):
                line = line.replace(k+"\n", k+":")
            page[chun].extend(line.split())

-    result_data = []
+    result_data = dict()
    for key in page.keys():
        for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
            if key == index:
-                result_data.append({block_rev[index]:func(page[index])})
+                result_data[block_rev[index]] = func(page[index])
    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
    with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
-        json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
+        json.dump(result_data, fp, indent=4, ensure_ascii=False)

# Parse PDF sentences (done)
@@ -1340,15 +1313,15 @@ def read_from_pdf(path, save_dir):
                result[key].extend(r[key])
            else:
                result[key] = r[key]
-    result_data = []
+    result_data = dict()
    for key in result.keys():
        for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
            if key == index:
-                result_data.append({block_rev[index]:func(result[index])})
+                result_data[block_rev[index]] = func(result[index])

    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
    with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
-        json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
+        json.dump(result_data, fp, indent=4, ensure_ascii=False)

# PDF table parsing (done)
@@ -1395,14 +1368,14 @@ def parse_table_from_pdf(path, save_dir):
                line = line.replace(k+"\n", k+":")
            page[chun].extend(line.split())

-    result_data = []
+    result_data = dict()
    for key in page.keys():
        for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
            if key == index:
-                result_data.append({block_rev[index]:func(page[index])})
+                result_data[block_rev[index]] = func(page[index])
    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
    with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
-        json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
+        json.dump(result_data, fp, indent=4, ensure_ascii=False)

# Detect PDF format (done)
@@ -1437,6 +1410,54 @@ def decode_path(path):
    return path_name


+# Push the parsed results back
+def push_back(tempdir):
+    for file in os.listdir(tempdir):
+        filename = os.path.join(tempdir, file)
+        with open(filename, "r", encoding="utf-8") as ff:
+            rst = json.load(ff)
+
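+        # translate.json maps the parsed (Chinese) field names onto the
+        # downstream schema's keys, grouped per target table.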
+        with open("./resources/translate.json", "r", encoding="utf-8") as ft:
+            json_obj = json.load(ft)
+
+        for key in json_obj["base"].keys():
+            if rst.get("基本信息"):
+                if rst["基本信息"].get(key):
+                    rst["基本信息"][json_obj["base"][key]] = rst["基本信息"][key]
+                    del rst["基本信息"][key]
+            if rst.get("求职意向"):
+                if rst["求职意向"].get(key):
+                    rst["求职意向"][json_obj["base"][key]] = rst["求职意向"][key]
+                    del rst["求职意向"][key]
+
+        for key in json_obj["tal_vocational_qualification_certificate"].keys():
+            if rst.get("证书"):
+                for idx in range(len(rst["证书"])):
+                    if rst["证书"][idx].get(key):
+                        rst["证书"][idx][json_obj["tal_vocational_qualification_certificate"][key]] = rst["证书"][idx][key]
+                        del rst["证书"][idx][key]
+
+        for key in json_obj["tal_language"].keys():
+            if rst.get("语言能力"):
+                for idx in range(len(rst["语言能力"])):
+                    if rst["语言能力"][idx].get(key):
+                        rst["语言能力"][idx][json_obj["tal_language"][key]] = rst["语言能力"][idx][key]
+                        del rst["语言能力"][idx][key]
+
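+        # Delivery to the talent service is disabled for now; the translated
+        # result is only echoed to the console below.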
+        # url = "http://192.168.1.110:9999/talent/getResumeData"
+        # session = requests.Session()
+        # session.mount('http://', HTTPAdapter(max_retries = 3))
+        # try:
+        #     headers = {
+        #         'contentType':'Application/json'
+        #     }
+        #     response = session.post(url=url, headers=headers, json={"ResumeData":rst}, timeout=10)
+        #     print(response.text)
+        # except Exception as e:
+        #     print(e)
+        console.print(rst, style="red", justify="left")
+
+

# Detect the input format (done)
def detection_type(path, system):
    tempdir = time.strftime("%Y_%m_%dT%H_%M_%S")
@@ -1528,23 +1549,9 @@ def detection_type(path, system):
    # input is a .txt file
    elif os.path.isfile(filename) and filename.endswith('.txt'):
        parse_txt(filename, save_dir=tempdir)
-    # Push the results back
-    for file in os.listdir(tempdir):
-        filename = os.path.join(tempdir, file)
-        with open(filename, "r", encoding="utf-8") as ff:
-            rst = json.load(ff)
-        url = "http://192.168.1.110:9999/talent/getResumeData"
-        session = requests.Session()
-        session.mount('http://', HTTPAdapter(max_retries = 3))
-        try:
-            headers = {
-                'contentType':'Application/json'
-            }
-            response = session.post(url=url, headers=headers, json={"ResumeData":rst}, timeout=10)
-            print(response.text)
-        except Exception as e:
-            print(e)
-        console.print(rst, style="red", justify="left")
+
+    push_back(tempdir)
+

@app.post("/resume_parse")
|
|
@app.post("/resume_parse")
|
|
@@ -1556,24 +1563,9 @@ async def file_upload(background_tasks: BackgroundTasks, file: UploadFile = File
|
|
res = await file.read()
|
|
res = await file.read()
|
|
with open('./uploads/' + file.filename, "wb") as f:
|
|
with open('./uploads/' + file.filename, "wb") as f:
|
|
f.write(res)
|
|
f.write(res)
|
|
-    background_tasks.add_task(detection_type, './uploads/' + file.filename, system)
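+    # Determine the host OS per request; previously `system` was only set
+    # when the module ran under __main__.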
+    background_tasks.add_task(detection_type, './uploads/' + file.filename, platform.system())
    return {"errno": 0, "msg": "Upload Success"}


if __name__ == '__main__':
-    import platform
-    system = platform.system()
-    if (system == "Windows"):
-        logger.info("Windows")
-    elif (system == "Linux"):
-        logger.info("Linux")
-    else:
-        logger.error("Unnot support this system")
-    if not os.path.exists("./uploads"):
-        os.mkdir("./uploads")
-    if not os.path.exists("./pdf"):
-        os.mkdir("./pdf")
-    if not os.path.exists("./cache"):
-        os.mkdir("./cache")
-
-    uvicorn.run(app=app, host="0.0.0.0", port=8320)
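+    # reload=True requires passing the app as an import string ("module:attr")
+    # rather than the app object itself.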
+    uvicorn.run(app="resume_parse:app", host="0.0.0.0", port=8320, reload=True, log_level="info")