소스 검색

modified: custom.py
modified: irafa.py
new file: logger.py
new file: resources/translate.json
modified: resume_parse.py

sprivacy 3 년 전
부모
커밋
a0afd8f21c
5개의 변경된 파일에 324개의 추가 그리고 182개의 삭제
  1. 61 60
      tools/custom.py
  2. 33 9
      tools/irafa.py
  3. 40 0
      tools/logger.py
  4. 85 0
      tools/resources/translate.json
  5. 105 113
      tools/resume_parse.py

+ 61 - 60
tools/custom.py

@@ -2,11 +2,12 @@
 # @Author: privacy
 # @Date:   2022-07-11 09:21:24
 # @Last Modified by:   privacy
-# @Last Modified time: 2022-07-13 15:31:50
+# @Last Modified time: 2022-07-14 11:00:31
 
 # 自定义模板
 
 import re
+import json
 import logging
 from pprint import pprint
 import requests
@@ -16,8 +17,8 @@ from docx.shared import Inches
 
 
 path = "d:\\desktop\\自定义.docx"
-# path = "d:\\desktop\\内部人才市场简历模板.docx"
 
+# 关键词字典
 keywords = [
 	"姓名",
 	"性别",
@@ -76,9 +77,19 @@ keywords = [
 	"其他情况说明",
 	"工作单位及职务",
 	"政治面貌",
-	"职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系", "计算机水平"
+	"职业证书",
+    "资格等级",
+    "取得日期",
+    "学校/培训机构",
+    "专业",
+    "起始时间",
+    "毕业时间",
+    "职业",
+    "与本人关系",
+    "计算机水平"
 ]
 
+# 解析行内元素
 def parse_line(line):
     result = []
     key = None
@@ -92,50 +103,43 @@ def parse_line(line):
     return result
 
 
+# 解析文档布局
 def parse_layout(path):
     result = []
     doc = Document(path)
     lo = {}
-    tables = doc.tables
-    for _table in tables[:]:
+    for _table in doc.tables[:]:
         for i, row in enumerate(_table.rows[:]):
             row_content = []
             for cell in row.cells[:]:
                 c = cell.text
-                # row_content.append(c)
                 if c not in row_content:
                 	row_content.append(c)
             lo[len(lo.keys())] = row_content
 
-    kwln = -1
-    kwline = None
+    kwln = -1# 关键词行长度
+    kwline = None# 关键词行
     for key in lo.keys():
-        # pdb.set_trace()
         for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
             if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
-                # pdb.set_trace()
-                perc = 0
+                perc = 0# 行内关键词数量
                 for c in lo[key]:
-                    # pdb.set_trace()
-                    if c and (''.join(c.split()) in keywords):
+                    if c and (''.join(c.split()) in keywords):# 找到此行有关键词
                         perc += 1
-                    if c and (''.join(c.split()) in keywords) and (perc > len(lo[key])/3):# 非关键词行元素
-                        # print(c)
-                        # print(perc)
-                        # print(lo[key])
-                        perc = 0
-                        result.extend(parse_line(lo[key]))
+                    if c and (''.join(c.split()) in keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3,判断此行非关键词行元素
+                        perc = 0# 清空行内关键词数
+                        result.extend(parse_line(lo[key]))# 添加并解析普通行级元素
                         break
                 else:# 关键词行元素
                     schema = dict()
-                    for key, val in zip(kwline, lo[key]):
+                    for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素
                         if key:
                             schema[key] = val
                     result.append(schema)
                     break
                 break
         else:
-            # print("{}\t\t此行为关键词行".format(lo[key]))
+            # print("{}:此行为关键词行!".format(lo[key]))
             try:
                 kwline = [''.join(cell.split()) for cell in lo[key]]
             except Exception as e:
@@ -147,20 +151,20 @@ def parse_layout(path):
 # 格式化数据
 def formatter(datalist):
     result = dict()
-
     for d in datalist:
-        if len(d) == 1:
+        if len(d) == 1:# 普通键值对
             for key in d.keys():
                 result[key] = d[key]
-        else:
+        else:# 行级元素
             for k in list(d.keys()):
-                if k == "".join(d[k].split()):
+                if k == "".join(d[k].split()):# 行名
                     d.pop(k)
-                    if result.get(k):
+                    if result.get(k):# 多行元素合并
                         result[k].append(d)
                     else:
                         result[k] = [d]
 
+    ### 时间格式化
     if result.get("出生年月"):
         dates = re.findall(r'\d+' , result["出生年月"])
         if len(dates) == 1:
@@ -168,7 +172,7 @@ def formatter(datalist):
         elif len(dates) == 2:
             result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
         elif len(dates) == 3:
-            result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
+            result["出生年月"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
 
     if result.get("任职时间"):
         dates = re.findall(r'\d+' , result["任职时间"])
@@ -177,7 +181,7 @@ def formatter(datalist):
         elif len(dates) == 2:
             result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
         elif len(dates) == 3:
-            result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
+            result["任职时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
 
     if result.get("参加工作时间"):
         dates = re.findall(r'\d+' , result["参加工作时间"])
@@ -186,7 +190,7 @@ def formatter(datalist):
         elif len(dates) == 2:
             result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
         elif len(dates) == 3:
-            result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
+            result["参加工作时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
 
     if result.get("最高学历毕业院校及毕业时间"):
         dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])
@@ -262,30 +266,20 @@ def formatter(datalist):
                 dates = re.findall(r'\d+' , fam["出生年月"])
                 if len(dates) == 2:
                     result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
-    normal = {
-        "姓名":"name",
-        "性别":"gender",
-        "邮箱地址":"email",
-        "政治面貌(加入时间)":"politics",
-        "联系电话":"mobile",
-        "籍贯":"birthplace",
-        "出生年月":"birth_time",
-        "现任职务":"current_job",
-        "所在城市":"living_city",
-        "参加工作时间":"work_begin_time",
-        "意向岗位":"intent_job",
-        "熟悉专业有何专长":"skills",
-    }
-    edunormal = {
-        "学校":"school_name",
-        "专业":"major",
-        "学历":"degree",
-        "是否全日制":"degree_type",
-    }
+
+    # 转译数据库字段名
+    with open("./resources/translate.json", "r", encoding="utf-8") as ff:
+        json_obj = json.load(ff)
+
+    normal = json_obj["base"]
+    edunormal = json_obj["tal_his_edu"]
+    family = json_obj["tal_family_social_relations"]
+
     for key in normal.keys():
         if result.get(key):
             result[normal[key]] = result[key]
             result.pop(key)
+
     for idx in range(len(result['学习经历'])):
         result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]
         result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]
@@ -293,17 +287,24 @@ def formatter(datalist):
             if result['学习经历'][idx].get(key):
                 result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
                 result['学习经历'][idx].pop(key)
-    url = "http://192.168.1.110:9999/talent/getResumeData"
-    session = requests.Session()
-    session.mount('http://', HTTPAdapter(max_retries = 3))
-    try:
-        headers = {
-            'contentType':'Application/json'
-        }
-        response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)
-        print(response.text)
-    except Exception as e:
-        print(e)
+
+    for idx in range(len(result['主要家庭成员及社会关系'])):
+        for key in family.keys():
+            if result['主要家庭成员及社会关系'][idx].get(key):
+                result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
+                result['主要家庭成员及社会关系'][idx].pop(key)
+
+    # url = "http://192.168.1.110:9999/talent/getResumeData"
+    # session = requests.Session()
+    # session.mount('http://', HTTPAdapter(max_retries = 3))
+    # try:
+    #     headers = {
+    #         'contentType':'Application/json'
+    #     }
+    #     response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)
+    #     print(response.text)
+    # except Exception as e:
+    #     print(e)
     return result
 
 

+ 33 - 9
tools/irafa.py

@@ -2,11 +2,11 @@
 # @Author: privacy
 # @Date:   2022-07-07 13:12:17
 # @Last Modified by:   privacy
-# @Last Modified time: 2022-07-13 16:46:02
+# @Last Modified time: 2022-07-14 09:39:42
 
 # 内部人才市场简历模板
 from pprint import pprint
-
+import re
 import docx
 from docx import Document
 from docx.shared import Inches
@@ -75,6 +75,25 @@ def parse_layout(path):
             # print("此行为关键词行")
             kwline = [''.join(cell.split()) for cell in lo[key]]
             kwln = len(lo[key])
+
+    job = {"工作经历":"工作经历"}
+    flag = None
+    for p in doc.paragraphs:
+        text = p.text.replace(":", ":")
+        if ":" in text:
+            text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)
+            for line in text.split("\n"):
+                if line.strip():
+                    i = line.split(":")
+                    if job.get(i[0].strip()):
+                        result.append(job)
+                        job = {"工作经历":"工作经历"}
+                    job[i[0].strip()] = i[1].strip()
+                    flag = i[0].strip()
+        elif flag == "工作描述":
+            job["工作描述"] += '\n' + text.strip()
+    else:
+        result.append(job)
     return result
 
 
@@ -119,13 +138,18 @@ def formatter(datalist):
         if result.get(key):
             result[normal[key]] = result[key]
             result.pop(key)
-    # for idx in range(len(result['学习经历'])):
-    #     result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]
-    #     result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]
-    #     for key in edunormal.keys():
-    #         if result['学习经历'][idx].get(key):
-    #             result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
-    #             result['学习经历'][idx].pop(key)
+
+    edunormal = {
+        "学校/培训机构":"school_name",
+        "专业":"major",
+        "起始时间":"start_time",
+        "毕业时间":"end_time"
+    }
+    for idx in range(len(result['学习经历'])):
+        for key in edunormal.keys():
+            if result['学习经历'][idx].get(key):
+                result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
+                result['学习经历'][idx].pop(key)
     return result
 
 if __name__ == "__main__":

+ 40 - 0
tools/logger.py

@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+# @Author: privacy
+# @Date:   2022-07-14 13:26:15
+# @Last Modified by:   privacy
+# @Last Modified time: 2022-07-14 13:27:46
+import logging
+
+class Logger:
+    def __init__(self, name: str, console_handler_level: str = logging.INFO, fmt: str = '%(asctime)s: %(name)s: %(levelname)s: %(filename)s: %(lineno)d: %(funcName)s: %(message)s'):
+        self.logger = logging.getLogger(name)
+        self.logger.setLevel(logging.INFO)
+        self.fmt = logging.Formatter(fmt)
+        self.set_console_handler(console_handler_level)
+
+    def set_console_handler(self, console_handler_level: str = logging.INFO) -> None:
+        ch = logging.StreamHandler()
+        ch.setLevel(console_handler_level)
+        ch.setFormatter(self.fmt)
+        self.logger.addHandler(ch)
+
+    def set_file_handler(self, filename: str, mode: str = "a", file_handler_level: str = logging.WARNING) -> None:
+        fh = logging.FileHandler(filename, mode=mode, encoding='utf-8')
+        fh.setLevel(file_handler_level)
+        fh.setFormatter(self.fmt)
+        self.logger.addHandler(fh)
+
+    def debug(self, msg):
+        self.logger.debug(msg)
+
+    def info(self, msg):
+        self.logger.info(msg)
+
+    def warning(self, msg):
+        self.logger.warning(msg)
+
+    def error(self, msg):
+        self.logger.error(msg)
+
+    def critical(self, msg):
+        self.logger.critical(msg)

+ 85 - 0
tools/resources/translate.json

@@ -0,0 +1,85 @@
+{
+    "base":{
+        "姓名":"name",
+        "性别":"gender",
+        "出生年月":"birth_time",
+        "出生日期":"birth_time",
+        "民族":"national",
+        "籍贯":"birthplace",
+        "户籍地":"household_register_address",
+        "参加工作时间":"work_begin_time",
+        "联系电话":"mobile",
+        "手机号码":"mobile",
+        "邮箱地址":"email",
+        "现任职务":"current_job",
+        "提职时间":"promotion_time",
+        "所在城市":"living_city",
+        "意向城市":"意向城市",
+        "意向岗位":"intent_job",
+        "期望职业":"intent_job",
+        "目前年薪":"current_salary_yearl",
+        "政治面貌(加入时间)":"politics",
+        "政治面貌":"politics",
+        "熟悉专业有何专长":"skills"
+    },
+    "tal_his_edu":{
+        "开始时间":"start_time",
+        "毕业时间":"end_time",
+        "学校":"school_name",
+        "专业":"major",
+        "学历":"degree",
+        "学位":"degree_in",
+        "研究方向":"research_direction",
+        "是否全日制":"is_full_time"
+    },
+    "tal_his_job":{
+        "工作单位":"company_name",
+        "职位":"job_name",
+        "开始时间":"start_time",
+        "结束时间":"end_time",
+        "工作描述":"job_desc"
+    },
+    "tal_his_project":{
+        "项目名":"project_name",
+        "公司名":"company_name",
+        "职位":"project_office",
+        "开始时间":"start_time",
+        "结束时间":"end_time",
+        "项目职责":"project_duty",
+        "业绩":"project_performance"
+    },
+    "tal_language":{
+        "语言":"lan_name",
+        "熟练度":"proficiency"
+    },
+    "tal_vocational_qualification_certificate":{
+        "证书名称":"vocational_qualification_certificate_name",
+        "证书":"vocational_qualification_certificate_name",
+        "获得时间":"vocational_certificate_obtaining_time"
+    },
+    "tal_professional_tech_certificate":{
+        "技术资格证明":"professional_tech_certificate_name",
+        "获得时间":"professional_certificate_obtaining_time"
+    },
+    "tal_training_institutions":{
+        "学校/培训机构":"school_training_institutions",
+        "专业":"major",
+        "开始时间":"start_time",
+        "结束时间":"end_time"
+    },
+    "tal_rewards_punishments":{
+        "项目名称":"name",
+        "项目单位":"rewards_punishments_unit",
+        "时间":"rewards_punishments_time"
+    },
+    "tal_family_social_relations":{
+        "称谓":"appellation",
+        "姓名":"name",
+        "出生年月":"birth_time",
+        "政治面貌":"politics",
+        "工作单位":"work_units",
+        "职务":"position",
+        "工作单位及职务":"position"
+    },
+    "其他":"intro"
+}

+ 105 - 113
tools/resume_parse.py

@@ -8,6 +8,7 @@ import sys
 import re
 import json
 import time
+import platform
 from os import walk
 import subprocess
 import rarfile
@@ -33,62 +34,50 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 import pdfplumber
 from paddlenlp import Taskflow
 
-class Logger:
-    def __init__(self, name: str, console_handler_level: str = logging.INFO, fmt: str = '%(asctime)s: %(name)s: %(levelname)s: %(filename)s: %(funcName)s: %(message)s'):
-        self.logger = logging.getLogger(name)
-        self.logger.setLevel(logging.INFO)
-        self.fmt = logging.Formatter(fmt)
-        self.set_console_handler(console_handler_level)
 
-    def set_console_handler(self, console_handler_level: str = logging.INFO) -> None:
-        ch = logging.StreamHandler()
-        ch.setLevel(console_handler_level)
-        ch.setFormatter(self.fmt)
-        self.logger.addHandler(ch)
+from logger import Logger
+logger = Logger("resume_parse")
+logger.set_file_handler(filename='journal.log')
 
-    def set_file_handler(self, filename: str, mode: str = "a", file_handler_level: str = logging.WARNING) -> None:
-        fh = logging.FileHandler(filename, mode=mode, encoding='utf-8')
-        fh.setLevel(file_handler_level)
-        fh.setFormatter(self.fmt)
-        self.logger.addHandler(fh)
 
-    def debug(self, msg):
-        self.logger.debug(msg)
+from rich.console import Console
+console = Console()
 
-    def info(self, msg):
-        self.logger.info(msg)
 
-    def warning(self, msg):
-        self.logger.warning(msg)
+global ner, ner_tag, base_info_ie, prize_ie, cet_ie, pro_ie, block, block_rev
 
-    def error(self, msg):
-        self.logger.error(msg)
 
-    def critical(self, msg):
-        self.logger.critical(msg)
+if not locals().get("ner"):
+    ner = Taskflow("ner", mode='fast')
+if not locals().get("ner_tag"):
+    ner_tag = Taskflow("ner")
+if not locals().get("base_info_ie"):
+    base_info_ie = Taskflow('information_extraction', schema=["姓名","性别","婚姻状况","邮箱地址","政治面貌","手机号码","籍贯","出生日期","现任职务","参加工作时间","英语水平","计算机水平","工作年限","当前单位","所在城市","职业资格"])
+if not locals().get("prize_ie"):
+    prize_ie = Taskflow('information_extraction', schema=["时间", "奖项"])
+if not locals().get("cet_ie"):
+    cet_ie = Taskflow('information_extraction', schema=["时间","证书"])
+if not locals().get("pro_ie"):
+    pro_ie = Taskflow("information_extraction", schema=["时间","项目名称","机构","职位"], task_path='./model_100')
 
-logger = Logger("resume_parse")
-logger.set_file_handler(filename='data.log')
+if not locals().get("block"):
+    with open("resources/SegmentName.json", "r", encoding="utf-8") as fp:
+        block = json.load(fp)
 
-from rich.console import Console
-console = Console()
+if not locals().get("block_rev"):
+    block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经历", 5:"项目经历", 6:"专业技能", 7:"intro", 8:"兴趣爱好", 9:"语言能力", 10:"证书", 11:"获奖情况", 12:"培训经历", 13:"家庭成员", "other":"其他"}
 
 import uvicorn
 from fastapi import BackgroundTasks, FastAPI, File, UploadFile
 app = FastAPI()
 
-ner = Taskflow("ner", mode='fast')
-ner_tag = Taskflow("ner")
-base_info_ie = Taskflow('information_extraction', schema=["姓名","性别","婚姻状况","电子邮箱","政治面貌","手机号码","籍贯","出生日期","现任职务","参加工作时间","英语水平","计算机水平","工作年限","当前单位","所在城市","职业资格"])
-prize_ie = Taskflow('information_extraction', schema=["时间", "奖项"])
-cet_ie = Taskflow('information_extraction', schema=["时间","证书"])
-pro_ie = Taskflow("information_extraction", schema=["时间","项目名称","机构","职位"], task_path='./model_100')
-global block, block_rev
-
-with open("resources/SegmentName.json", "r", encoding="utf-8") as fp:
-    block = json.load(fp)
-block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经历", 5:"项目经历", 6:"专业技能", 7:"intro", 8:"兴趣爱好", 9:"语言能力", 10:"证书", 11:"获奖情况", 12:"培训经历", 13:"家庭成员", "other":"其他"}
 
+if not os.path.exists("./uploads"):
+    os.mkdir("./uploads")
+if not os.path.exists("./pdf"):
+    os.mkdir("./pdf")
+if not os.path.exists("./cache"):
+    os.mkdir("./cache")
 
 
 # 基本信息(旧版)
@@ -165,22 +154,6 @@ def get_base_info(lines):
             rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
         elif len(dates) == 3:
             rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
-    normal = {
-        "姓名":"name",
-        "性别":"gender",
-        "电子邮箱":"email",
-        "政治面貌":"politics",
-        "手机号码":"mobile",
-        "籍贯":"birthplace",
-        "出生日期":"birth_time",
-        "现任职务":"current_job",
-        "所在城市":"living_city",
-        "参加工作时间":"work_begin_time",
-    }
-    for key in normal.keys():
-        if rst.get(key):
-            rst[normal[key]] = rst[key]
-            del rst[key]
     return {key:rst[key][0]["text"] for key in rst.keys()}
 
 
@@ -961,19 +934,19 @@ def get_lag_list(lines):
     lan_list = []
     re_lan = re.compile(r'(\w+[语话])')
     re_lev = re.compile(r'([公共级四专八]+)')
-    lag_dict = {'lan_name':'', 'level':""}
+    lag_dict = {'语言':'', '熟练度':""}
     for l in lines:
         if not l.strip():
             continue
         lan_name = re.search(re_lan, l)
         lag_lev = re.search(re_lev, l)
         if lag_lev and lag_lev.group(1):
-            lag_dict["level"] = lag_lev.group(1)
+            lag_dict["熟练度"] = lag_lev.group(1)
         if lan_name and lan_name.group(1):
-            if lag_dict["lan_name"]:
+            if lag_dict["语言"]:
                 lan_list.append(lag_dict)
-                lag_dict = {'lan_name':'', 'level':""}
-            lag_dict['lan_name'] = lan_name.group(1)
+                lag_dict = {'语言':'', '熟练度':""}
+            lag_dict['语言'] = lan_name.group(1)
     return lan_list
 
 
@@ -1151,7 +1124,7 @@ def parse_txt(path, save_dir):
     page = {1: []}
     if len(data.split("\n")) <= 2:
         for line in data.split("\n"):
-            line = line.replace("\xa0", "").replace("【","").replace("】","").replace("教育/培训","教育经历").strip()
+            line = line.replace("\xa0", "").replace("【","").replace("】","").replace("教育/培训","教育经历").replace("·","").strip()
             for word in line.split():
                 if word in block.keys():
                     chun = block[word]
@@ -1160,7 +1133,7 @@ def parse_txt(path, save_dir):
                     page[chun].append(word)
     else:
         for line in data.split("\n"):
-            line = line.replace("\xa0", "").replace("【","").replace("】","").replace("教育/培训","教育经历")
+            line = line.replace("\xa0", "").replace("【","").replace("】","").replace("教育/培训","教育经历").replace("·","")
             regex = re.compile(u'[\u3000]+',re.UNICODE)
             line = regex.sub('', line.strip())
             if line in block.keys():
@@ -1169,14 +1142,14 @@ def parse_txt(path, save_dir):
             elif line:
                 page[chun].append(line)
 
-    result_data = []
+    result_data = dict()
     for key in page.keys():
         for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
             if key == index:
-                result_data.append({block_rev[index]:func(page[index])})
+                result_data[block_rev[index]] = func(page[index])
     filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
     with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
-            json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
+            json.dump(result_data, fp, indent=4, ensure_ascii=False)
 
 
 # 纯文本 word 解析
@@ -1198,14 +1171,14 @@ def read_from_word(doc, path, save_dir):
         elif line:
             page[chun].append(line)
 
-    result_data = []
+    result_data = dict()
     for key in page.keys():
         for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
             if key == index:
-                result_data.append({block_rev[index]:func(page[index])})
+                result_data[block_rev[index]] = func(page[index])
     filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
     with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
-            json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
+            json.dump(result_data, fp, indent=4, ensure_ascii=False)
 
 
 # 提取 word 表格(已完成)
@@ -1267,14 +1240,14 @@ def check_word(path, save_dir):
                 line = line.replace(k+"\n", k+":")
             page[chun].extend(line.split())
 
-    result_data = []
+    result_data = dict()
     for key in page.keys():
         for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
             if key == index:
-                result_data.append({block_rev[index]:func(page[index])})
+                result_data[block_rev[index]] = func(page[index])
     filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
     with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
-            json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
+            json.dump(result_data, fp, indent=4, ensure_ascii=False)
 
 
 # pdf 解析句子(已完成)
@@ -1340,15 +1313,15 @@ def read_from_pdf(path, save_dir):
                     result[key].extend(r[key])
                 else:
                     result[key] = r[key]
-        result_data = []
+        result_data = dict()
         for key in result.keys():
             for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
                 if key == index:
-                    result_data.append({block_rev[index]:func(result[index])})
+                    result_data[block_rev[index]] = func(result[index])
 
         filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
         with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
-            json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
+            json.dump(result_data, fp, indent=4, ensure_ascii=False)
 
 
 # pdf 表格解析 (已完成)
@@ -1395,14 +1368,14 @@ def parse_table_from_pdf(path, save_dir):
             line = line.replace(k+"\n", k+":")
         page[chun].extend(line.split())
 
-    result_data = []
+    result_data = dict()
     for key in page.keys():
         for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
             if key == index:
-                result_data.append({block_rev[index]:func(page[index])})
+                result_data[block_rev[index]] = func(page[index])
     filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
     with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
-            json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
+            json.dump(result_data, fp, indent=4, ensure_ascii=False)
 
 
 # 检测 pdf 格式 (已完成)
@@ -1437,6 +1410,54 @@ def decode_path(path):
     return path_name
 
 
+# 结果返回
+def push_back(tempdir):
+    for file in os.listdir(tempdir):
+        filename = os.path.join(tempdir, file)
+        with open(filename, "r", encoding="utf-8") as ff:
+            rst = json.load(ff)
+
+        with open("./resources/translate.json", "r", encoding="utf-8") as ft:
+            json_obj = json.load(ft)
+
+        for key in json_obj["base"].keys():
+            if rst["result"].get("基本信息"):
+                if rst["result"]["基本信息"].get(key):
+                    rst["result"]["基本信息"][json_obj["base"][key]] = rst["result"]["基本信息"][key]
+                    del rst["result"]["基本信息"][key]
+            if rst["result"].get("求职意向"):
+                if rst["result"]["求职意向"].get(key):
+                    rst["result"]["求职意向"][json_obj["base"][key]] = rst["result"]["求职意向"][key]
+                    del rst["result"]["求职意向"][key]
+        
+        for key in json_obj["tal_vocational_qualification_certificate"].keys():
+            if rst["result"].get("证书"):
+                for idx in range(len(rst["result"]["证书"])):
+                    if rst["result"]["证书"][idx].get(key):
+                        rst["result"]["证书"][idx][json_obj["tal_vocational_qualification_certificate"][key]] = rst["result"]["证书"][idx][key]
+                        del rst["result"]["证书"][idx][key]
+        
+        for key in json_obj["tal_language"].keys():
+            if rst["result"].get("语言能力"):
+                for idx in range(len(rst["result"]["语言能力"])):
+                    if rst["result"]["语言能力"][idx].get(key):
+                        rst["result"]["语言能力"][idx][json_obj["tal_language"][key]] = rst["result"]["语言能力"][idx][key]
+                        del rst["result"]["语言能力"][idx][key]
+
+        # url = "http://192.168.1.110:9999/talent/getResumeData"
+        # session = requests.Session()
+        # session.mount('http://', HTTPAdapter(max_retries = 3))
+        # try:
+        #     headers = {
+        #         'contentType':'Application/json'
+        #     }
+        #     response = session.post(url=url, headers=headers, json={"ResumeData":rst}, timeout=10)
+        #     print(response.text)
+        # except Exception as e:
+        #     print(e)
+        console.print(rst, style="red", justify="left")
+
+
 # 检测传入格式(已完成)
 def detection_type(path, system):
     tempdir = time.strftime("%Y_%m_%dT%H_%M_%S")
@@ -1528,23 +1549,9 @@ def detection_type(path, system):
             # 传入为 txt
             elif os.path.isfile(filename) and filename.endswith('.txt'):
                 parse_txt(filename, save_dir=tempdir)
-    # 结果返回
-    for file in os.listdir(tempdir):
-        filename = os.path.join(tempdir, file)
-        with open(filename, "r", encoding="utf-8") as ff:
-            rst = json.load(ff)
-        url = "http://192.168.1.110:9999/talent/getResumeData"
-        session = requests.Session()
-        session.mount('http://', HTTPAdapter(max_retries = 3))
-        try:
-            headers = {
-                'contentType':'Application/json'
-            }
-            response = session.post(url=url, headers=headers, json={"ResumeData":rst}, timeout=10)
-            print(response.text)
-        except Exception as e:
-            print(e)
-        console.print(rst, style="red", justify="left")
+
+        push_back(tempdir)
+
 
 
 @app.post("/resume_parse")
@@ -1556,24 +1563,9 @@ async def file_upload(background_tasks: BackgroundTasks, file: UploadFile = File
     res = await file.read()
     with open('./uploads/' + file.filename, "wb") as f:
         f.write(res)
-    background_tasks.add_task(detection_type, './uploads/' + file.filename, system)
+    background_tasks.add_task(detection_type, './uploads/' + file.filename, platform.system())
     return {"errno": 0, "msg": "Upload Success"}
 
 
 if __name__ == '__main__':
-    import platform
-    system = platform.system()
-    if (system == "Windows"):
-        logger.info("Windows")
-    elif (system == "Linux"):
-        logger.info("Linux")
-    else:
-        logger.error("Unnot support this system")
-    if not os.path.exists("./uploads"):
-        os.mkdir("./uploads")
-    if not os.path.exists("./pdf"):
-        os.mkdir("./pdf")
-    if not os.path.exists("./cache"):
-        os.mkdir("./cache")
- 
-    uvicorn.run(app=app, host="0.0.0.0", port=8320)
+    uvicorn.run(app="resume_parse:app", host="0.0.0.0", port=8320, reload=True, log_level="info")