3 tahun lalu · 106635f3d2
--- a/tools/custom.py
+++ b/tools/custom.py
@@ -2,11 +2,15 @@
 
				 # @Author: privacy

			
 
				 # @Date:   2022-07-11 09:21:24

			
 
				 # @Last Modified by:   privacy

			
 
				-# @Last Modified time: 2022-07-12 16:30:08

			
 
				+# @Last Modified time: 2022-07-13 15:31:50

			
 
				+

			
 
				+# 自定义模板

			
 
				+

			
 
				 import re

			
 
				 import logging

			
 
				 from pprint import pprint

			
 
				-

			
 
				+import requests

			
 
				+from requests.adapters import HTTPAdapter

			
 
				 from docx import Document

			
 
				 from docx.shared import Inches

			
 
				 

			
@@ -258,7 +262,48 @@ def formatter(datalist):
 
				                 dates = re.findall(r'\d+' , fam["出生年月"])

			
 
				                 if len(dates) == 2:

			
 
				                     result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))

			
 
				-

			
 
				+    normal = {

			
 
				+        "姓名":"name",

			
 
				+        "性别":"gender",

			
 
				+        "邮箱地址":"email",

			
 
				+        "政治面貌(加入时间)":"politics",

			
 
				+        "联系电话":"mobile",

			
 
				+        "籍贯":"birthplace",

			
 
				+        "出生年月":"birth_time",

			
 
				+        "现任职务":"current_job",

			
 
				+        "所在城市":"living_city",

			
 
				+        "参加工作时间":"work_begin_time",

			
 
				+        "意向岗位":"intent_job",

			
 
				+        "熟悉专业有何专长":"skills",

			
 
				+    }

			
 
				+    edunormal = {

			
 
				+        "学校":"school_name",

			
 
				+        "专业":"major",

			
 
				+        "学历":"degree",

			
 
				+        "是否全日制":"degree_type",

			
 
				+    }

			
 
				+    for key in normal.keys():

			
 
				+        if result.get(key):

			
 
				+            result[normal[key]] = result[key]

			
 
				+            result.pop(key)

			
 
				+    for idx in range(len(result['学习经历'])):

			
 
				+        result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]

			
 
				+        result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]

			
 
				+        for key in edunormal.keys():

			
 
				+            if result['学习经历'][idx].get(key):

			
 
				+                result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]

			
 
				+                result['学习经历'][idx].pop(key)

			
 
				+    url = "http://192.168.1.110:9999/talent/getResumeData"

			
 
				+    session = requests.Session()

			
 
				+    session.mount('http://', HTTPAdapter(max_retries = 3))

			
 
				+    try:

			
 
				+        headers = {

			
 
				+            'contentType':'Application/json'

			
 
				+        }

			
 
				+        response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)

			
 
				+        print(response.text)

			
 
				+    except Exception as e:

			
 
				+        print(e)

			
 
				     return result

			
 
				 

			
 
				 

			
--- a/tools/irafa.py
+++ b/tools/irafa.py
@@ -2,12 +2,16 @@
 
				 # @Author: privacy

			
 
				 # @Date:   2022-07-07 13:12:17

			
 
				 # @Last Modified by:   privacy

			
 
				-# @Last Modified time: 2022-07-12 18:02:21

			
 
				+# @Last Modified time: 2022-07-13 16:46:02

			
 
				 

			
 
				+# 内部人才市场简历模板

			
 
				+from pprint import pprint

			
 
				 

			
 
				+import docx

			
 
				 from docx import Document

			
 
				 from docx.shared import Inches

			
 
				 

			
 
				+

			
 
				 path = "d:\\desktop\\内部人才市场简历模板.docx"

			
 
				 

			
 
				 keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格（取得时间）", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]

			
@@ -24,38 +28,106 @@ def parse_line(line):
 
				             key = None

			
 
				     return result

			
 
				 

			
 
				-doc = Document(path)

			
 
				-lo = {}

			
 
				-tables = doc.tables

			
 
				-for _table in tables[:]:

			
 
				-    for i, row in enumerate(_table.rows[:]):

			
 
				-        row_content = []

			
 
				-        for cell in row.cells[:]:

			
 
				-            c = cell.text

			
 
				-            row_content.append(c)

			
 
				-        lo[len(lo.keys())] = row_content

			
 
				-

			
 
				-kwln = -1

			
 
				-kwline = None

			
 
				-for key in lo.keys():

			
 
				-    # pdb.set_trace()

			
 
				-    for val in lo[key]:# 通过全关键词，判断此行是否为关键词行

			
 
				-        if val and ''.join(val.split()) not in keywords:# 有非关键字元素，非关键词行，判断是否为关键词行元素

			
 
				-            # pdb.set_trace()

			
 
				-            for c in lo[key]:

			
 
				+

			
 
				+def parse_layout(path):

			
 
				+    result = []

			
 
				+    doc = Document(path)

			
 
				+

			
 
				+    lo = {}

			
 
				+    tables = doc.tables

			
 
				+    for _table in tables[:]:

			
 
				+        for i, row in enumerate(_table.rows[:]):

			
 
				+            row_content = []

			
 
				+            for cell in row.cells[:]:

			
 
				+                c = cell.text

			
 
				+                row_content.append(c)

			
 
				+            lo[len(lo.keys())] = row_content

			
 
				+    

			
 
				+    kwln = -1

			
 
				+    kwline = None

			
 
				+    for key in lo.keys():

			
 
				+        # pdb.set_trace()

			
 
				+        for val in lo[key]:# 通过全关键词，判断此行是否为关键词行

			
 
				+            if val and ''.join(val.split()) not in keywords:# 有非关键字元素，非关键词行，判断是否为关键词行元素

			
 
				                 # pdb.set_trace()

			
 
				-                if c and ''.join(c.split()) in keywords:# 非关键词行元素

			
 
				-                    print(parse_line(lo[key]))

			
 
				+                for c in lo[key]:

			
 
				+                    # pdb.set_trace()

			
 
				+                    if c and ''.join(c.split()) in keywords:# 非关键词行元素

			
 
				+                        result.extend(parse_line(lo[key]))

			
 
				+                        break

			
 
				+                else:# 关键词行元素

			
 
				+                    schema = dict()

			
 
				+                    for key, val in zip(kwline, lo[key]):

			
 
				+                        if key:

			
 
				+                            schema[key] = val

			
 
				+                    if "学校/培训机构" in schema.keys():

			
 
				+                        schema["学习经历"] = "学习经历"

			
 
				+                    elif "与本人关系" in schema.keys():

			
 
				+                        schema["家庭成员"] = "家庭成员"

			
 
				+                    elif "意向地区" in schema.keys():

			
 
				+                        schema["职业发展管理"] = "职业发展管理"

			
 
				+                    elif "职业证书" in schema.keys():

			
 
				+                        schema["职业资格证书"] = "职业资格证书"

			
 
				+                    result.append(schema)

			
 
				                     break

			
 
				-            else:# 关键词行元素

			
 
				-                schema = dict()

			
 
				-                for key, val in zip(kwline, lo[key]):

			
 
				-                    if key:

			
 
				-                        schema[key] = val

			
 
				-                print(schema)

			
 
				                 break

			
 
				-            break

			
 
				-    else:

			
 
				-        # print("此行为关键词行")

			
 
				-        kwline = [''.join(cell.split()) for cell in lo[key]]

			
 
				-        kwln = len(lo[key])

			
 
				+        else:

			
 
				+            # print("此行为关键词行")

			
 
				+            kwline = [''.join(cell.split()) for cell in lo[key]]

			
 
				+            kwln = len(lo[key])

			
 
				+    return result

			
 
				+

			
 
				+

			
 
				+# 格式化数据

			
 
				+def formatter(datalist):

			
 
				+    result = dict()

			
 
				+

			
 
				+    for d in datalist:

			
 
				+        if len(d) == 1:

			
 
				+            for key in d.keys():

			
 
				+                result[key] = d[key]

			
 
				+        else:

			
 
				+            for k in list(d.keys()):

			
 
				+                if k == "".join(d[k].split()):

			
 
				+                    d.pop(k)

			
 
				+                    if result.get(k):

			
 
				+                        result[k].append(d)

			
 
				+                    else:

			
 
				+                        result[k] = [d]

			
 
				+

			
 
				+    normal = {

			
 
				+        "姓名":"name",

			
 
				+        "性别":"gender",

			
 
				+        "邮箱地址":"email",

			
 
				+        "政治面貌":"politics",

			
 
				+        "联系电话":"mobile",

			
 
				+        "籍贯":"birthplace",

			
 
				+        "出生日期":"birth_time",

			
 
				+        "现任职务":"current_job",

			
 
				+        "所在城市":"living_city",

			
 
				+        "参加工作时间":"work_begin_time",

			
 
				+        "意向岗位":"intent_job",

			
 
				+        "熟悉专业有何专长":"skills",

			
 
				+    }

			
 
				+    edunormal = {

			
 
				+        "学校":"school_name",

			
 
				+        "专业":"major",

			
 
				+        "学历":"degree",

			
 
				+        "是否全日制":"degree_type",

			
 
				+    }

			
 
				+    for key in normal.keys():

			
 
				+        if result.get(key):

			
 
				+            result[normal[key]] = result[key]

			
 
				+            result.pop(key)

			
 
				+    # for idx in range(len(result['学习经历'])):

			
 
				+    #     result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]

			
 
				+    #     result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]

			
 
				+    #     for key in edunormal.keys():

			
 
				+    #         if result['学习经历'][idx].get(key):

			
 
				+    #             result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]

			
 
				+    #             result['学习经历'][idx].pop(key)

			
 
				+    return result

			
 
				+

			
 
				+if __name__ == "__main__":

			
 
				+    pprint(formatter(parse_layout(path)))

			
 
				+

			
--- a/tools/resume_parse.py
+++ b/tools/resume_parse.py
@@ -1,5 +1,8 @@
 
				 #!/usr/bin/env python
			
 
				 # coding: utf-8
			
 
				+# 
			
 
				+# 通用简历抽取
			
 
				+
			
 
				 import os
			
 
				 import sys
			
 
				 import re
			
@@ -651,7 +654,7 @@ def get_job_list(lines):
 
				         job_dict['job_content'] = re.sub('工工作作内内容容::|工工作作内内容容：：|工工作作内内容容', '工作内容：', ''.join(data_list[end_index:]))
			
 
				         job_dict['job_content'] = re.sub('/', '-', job_dict['job_content'])
			
 
				         job_dict['start_time'] = job_dict['job_time'].split('~')[0]
			
 
				-        job_dict['end_time'] = job_dict['job_time'].split('~')[1]
			
 
				+        job_dict['end_time'] = job_dict['job_time'].split('~')[-1]
			
 
				 
			
 
				         normal = {"job_company":"company_name","job_content":"job_desc","job_leval":"job_name"}
			
 
				         for key in normal.keys():
			
--- a/tools/srafa.py
+++ b/tools/srafa.py
@@ -2,7 +2,7 @@
 
				 # @Author: privacy

			
 
				 # @Date:   2022-07-07 12:59:42

			
 
				 # @Last Modified by:   privacy

			
 
				-# @Last Modified time: 2022-07-12 18:05:48

			
 
				+# @Last Modified time: 2022-07-13 15:22:48

			
 
				 # import pdb

			
 
				 from pprint import pprint

			
 
				 import pandas as pd

			
@@ -101,8 +101,9 @@ def parse_layout(path):
 
				             for page in pdf.pages:

			
 
				                 for table in page.extract_tables():

			
 
				                     for line in table:

			
 
				+                        # lo[len(lo.keys())] = [cell for cell in line if cell]

			
 
				                         lo[len(lo.keys())] = line

			
 
				-    print(lo)

			
 
				+

			
 
				     kwln = -1

			
 
				     kwline = None

			
 
				     for key in lo.keys():

			
@@ -110,7 +111,7 @@ def parse_layout(path):
 
				         for val in lo[key]:# 通过全关键词，判断此行是否为关键词行

			
 
				             if val and ''.join(val.split()) not in keywords:# 有非关键字元素，非关键词行，判断是否为关键词行元素

			
 
				                 # pdb.set_trace()

			
 
				-                for c in lo[key]:

			
 
				+                for c in lo[key] or len(lo[key])!=kwln:

			
 
				                     # pdb.set_trace()

			
 
				                     if c and ''.join(c.split()) in keywords:# 非关键词行元素

			
 
				                         result.extend(parse_line(lo[key]))

			
@@ -119,17 +120,43 @@ def parse_layout(path):
 
				                     schema = dict()

			
 
				                     for key, val in zip(kwline, lo[key]):

			
 
				                         if key:

			
 
				-                            schema[key] = val

			
 
				+                            schema[key] = val if val else key

			
 
				                     result.append(schema)

			
 
				                     break

			
 
				                 break

			
 
				         else:

			
 
				             # print("此行为关键词行")

			
 
				-            kwline = [''.join(cell.split()) for cell in lo[key]]

			
 
				+            # kwline = lo[key]

			
 
				+            kwline = []

			
 
				+            for cell in lo[key]:

			
 
				+                if cell:

			
 
				+                    kwline.append(''.join(cell.split()))

			
 
				+                else:

			
 
				+                    kwline.append(cell)

			
 
				             kwln = len(lo[key])

			
 
				     return result

			
 
				 

			
 
				+# 格式化数据

			
 
				+def formatter(datalist):

			
 
				+    result = dict()

			
 
				+

			
 
				+    for d in datalist:

			
 
				+        if len(d) == 1:

			
 
				+            for key in d.keys():

			
 
				+                result[key] = d[key]

			
 
				+        else:

			
 
				+            for k in list(d.keys()):

			
 
				+                if k == "".join(d[k].split()):

			
 
				+                    d.pop(k)

			
 
				+                    if result.get(k):

			
 
				+                        result[k].append(d)

			
 
				+                    else:

			
 
				+                        result[k] = [d]

			
 
				+

			
 
				+    return result

			
 
				+

			
 
				 if __name__ == '__main__':

			
 
				-    print(parse_layout(path))

			
 
				+    # pprint(parse_layout(path))

			
 
				+    pprint(formatter(parse_layout(path)))