Jelajahi Sumber

modified: custom.py
modified: irafa.py
modified: resume_parse.py
modified: srafa.py

sprivacy 3 tahun lalu
induk
komit
106635f3d2
4 file diubah dengan 190 penambahan dan 43 penghapusan
  1. + 48 - 3
      tools/custom.py
  2. + 105 - 33
      tools/irafa.py
  3. + 4 - 1
      tools/resume_parse.py
  4. + 33 - 6
      tools/srafa.py

+ 48 - 3
tools/custom.py

@@ -2,11 +2,15 @@
 # @Author: privacy
 # @Date:   2022-07-11 09:21:24
 # @Last Modified by:   privacy
-# @Last Modified time: 2022-07-12 16:30:08
+# @Last Modified time: 2022-07-13 15:31:50
+
+# 自定义模板
+
 import re
 import logging
 from pprint import pprint
-
+import requests
+from requests.adapters import HTTPAdapter
 from docx import Document
 from docx.shared import Inches
 
@@ -258,7 +262,48 @@ def formatter(datalist):
                 dates = re.findall(r'\d+' , fam["出生年月"])
                 if len(dates) == 2:
                     result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
-
+    normal = {
+        "姓名":"name",
+        "性别":"gender",
+        "邮箱地址":"email",
+        "政治面貌(加入时间)":"politics",
+        "联系电话":"mobile",
+        "籍贯":"birthplace",
+        "出生年月":"birth_time",
+        "现任职务":"current_job",
+        "所在城市":"living_city",
+        "参加工作时间":"work_begin_time",
+        "意向岗位":"intent_job",
+        "熟悉专业有何专长":"skills",
+    }
+    edunormal = {
+        "学校":"school_name",
+        "专业":"major",
+        "学历":"degree",
+        "是否全日制":"degree_type",
+    }
+    for key in normal.keys():
+        if result.get(key):
+            result[normal[key]] = result[key]
+            result.pop(key)
+    for idx in range(len(result['学习经历'])):
+        result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]
+        result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]
+        for key in edunormal.keys():
+            if result['学习经历'][idx].get(key):
+                result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
+                result['学习经历'][idx].pop(key)
+    url = "http://192.168.1.110:9999/talent/getResumeData"
+    session = requests.Session()
+    session.mount('http://', HTTPAdapter(max_retries = 3))
+    try:
+        headers = {
+            'contentType':'Application/json'
+        }
+        response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)
+        print(response.text)
+    except Exception as e:
+        print(e)
     return result
 
 

+ 105 - 33
tools/irafa.py

@@ -2,12 +2,16 @@
 # @Author: privacy
 # @Date:   2022-07-07 13:12:17
 # @Last Modified by:   privacy
-# @Last Modified time: 2022-07-12 18:02:21
+# @Last Modified time: 2022-07-13 16:46:02
 
+# 内部人才市场简历模板
+from pprint import pprint
 
+import docx
 from docx import Document
 from docx.shared import Inches
 
+
 path = "d:\\desktop\\内部人才市场简历模板.docx"
 
 keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格(取得时间)", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]
@@ -24,38 +28,106 @@ def parse_line(line):
             key = None
     return result
 
-doc = Document(path)
-lo = {}
-tables = doc.tables
-for _table in tables[:]:
-    for i, row in enumerate(_table.rows[:]):
-        row_content = []
-        for cell in row.cells[:]:
-            c = cell.text
-            row_content.append(c)
-        lo[len(lo.keys())] = row_content
-
-kwln = -1
-kwline = None
-for key in lo.keys():
-    # pdb.set_trace()
-    for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
-        if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
-            # pdb.set_trace()
-            for c in lo[key]:
+
+def parse_layout(path):
+    result = []
+    doc = Document(path)
+
+    lo = {}
+    tables = doc.tables
+    for _table in tables[:]:
+        for i, row in enumerate(_table.rows[:]):
+            row_content = []
+            for cell in row.cells[:]:
+                c = cell.text
+                row_content.append(c)
+            lo[len(lo.keys())] = row_content
+    
+    kwln = -1
+    kwline = None
+    for key in lo.keys():
+        # pdb.set_trace()
+        for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
+            if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
                 # pdb.set_trace()
-                if c and ''.join(c.split()) in keywords:# 非关键词行元素
-                    print(parse_line(lo[key]))
+                for c in lo[key]:
+                    # pdb.set_trace()
+                    if c and ''.join(c.split()) in keywords:# 非关键词行元素
+                        result.extend(parse_line(lo[key]))
+                        break
+                else:# 关键词行元素
+                    schema = dict()
+                    for key, val in zip(kwline, lo[key]):
+                        if key:
+                            schema[key] = val
+                    if "学校/培训机构" in schema.keys():
+                        schema["学习经历"] = "学习经历"
+                    elif "与本人关系" in schema.keys():
+                        schema["家庭成员"] = "家庭成员"
+                    elif "意向地区" in schema.keys():
+                        schema["职业发展管理"] = "职业发展管理"
+                    elif "职业证书" in schema.keys():
+                        schema["职业资格证书"] = "职业资格证书"
+                    result.append(schema)
                     break
-            else:# 关键词行元素
-                schema = dict()
-                for key, val in zip(kwline, lo[key]):
-                    if key:
-                        schema[key] = val
-                print(schema)
                 break
-            break
-    else:
-        # print("此行为关键词行")
-        kwline = [''.join(cell.split()) for cell in lo[key]]
-        kwln = len(lo[key])
+        else:
+            # print("此行为关键词行")
+            kwline = [''.join(cell.split()) for cell in lo[key]]
+            kwln = len(lo[key])
+    return result
+
+
+# 格式化数据
+def formatter(datalist):
+    result = dict()
+
+    for d in datalist:
+        if len(d) == 1:
+            for key in d.keys():
+                result[key] = d[key]
+        else:
+            for k in list(d.keys()):
+                if k == "".join(d[k].split()):
+                    d.pop(k)
+                    if result.get(k):
+                        result[k].append(d)
+                    else:
+                        result[k] = [d]
+
+    normal = {
+        "姓名":"name",
+        "性别":"gender",
+        "邮箱地址":"email",
+        "政治面貌":"politics",
+        "联系电话":"mobile",
+        "籍贯":"birthplace",
+        "出生日期":"birth_time",
+        "现任职务":"current_job",
+        "所在城市":"living_city",
+        "参加工作时间":"work_begin_time",
+        "意向岗位":"intent_job",
+        "熟悉专业有何专长":"skills",
+    }
+    edunormal = {
+        "学校":"school_name",
+        "专业":"major",
+        "学历":"degree",
+        "是否全日制":"degree_type",
+    }
+    for key in normal.keys():
+        if result.get(key):
+            result[normal[key]] = result[key]
+            result.pop(key)
+    # for idx in range(len(result['学习经历'])):
+    #     result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]
+    #     result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]
+    #     for key in edunormal.keys():
+    #         if result['学习经历'][idx].get(key):
+    #             result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
+    #             result['学习经历'][idx].pop(key)
+    return result
+
+if __name__ == "__main__":
+    pprint(formatter(parse_layout(path)))
+

+ 4 - 1
tools/resume_parse.py

@@ -1,5 +1,8 @@
 #!/usr/bin/env python
 # coding: utf-8
+# 
+# 通用简历抽取
+
 import os
 import sys
 import re
@@ -651,7 +654,7 @@ def get_job_list(lines):
         job_dict['job_content'] = re.sub('工工作作内内容容::|工工作作内内容容::|工工作作内内容容', '工作内容:', ''.join(data_list[end_index:]))
         job_dict['job_content'] = re.sub('/', '-', job_dict['job_content'])
         job_dict['start_time'] = job_dict['job_time'].split('~')[0]
-        job_dict['end_time'] = job_dict['job_time'].split('~')[1]
+        job_dict['end_time'] = job_dict['job_time'].split('~')[-1]
 
         normal = {"job_company":"company_name","job_content":"job_desc","job_leval":"job_name"}
         for key in normal.keys():

+ 33 - 6
tools/srafa.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2022-07-07 12:59:42
 # @Last Modified by:   privacy
-# @Last Modified time: 2022-07-12 18:05:48
+# @Last Modified time: 2022-07-13 15:22:48
 # import pdb
 from pprint import pprint
 import pandas as pd
@@ -101,8 +101,9 @@ def parse_layout(path):
             for page in pdf.pages:
                 for table in page.extract_tables():
                     for line in table:
+                        # lo[len(lo.keys())] = [cell for cell in line if cell]
                         lo[len(lo.keys())] = line
-    print(lo)
+
     kwln = -1
     kwline = None
     for key in lo.keys():
@@ -110,7 +111,7 @@ def parse_layout(path):
         for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
             if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
                 # pdb.set_trace()
-                for c in lo[key]:
+                for c in lo[key] or len(lo[key])!=kwln:
                     # pdb.set_trace()
                     if c and ''.join(c.split()) in keywords:# 非关键词行元素
                         result.extend(parse_line(lo[key]))
@@ -119,17 +120,43 @@ def parse_layout(path):
                     schema = dict()
                     for key, val in zip(kwline, lo[key]):
                         if key:
-                            schema[key] = val
+                            schema[key] = val if val else key
                     result.append(schema)
                     break
                 break
         else:
             # print("此行为关键词行")
-            kwline = [''.join(cell.split()) for cell in lo[key]]
+            # kwline = lo[key]
+            kwline = []
+            for cell in lo[key]:
+                if cell:
+                    kwline.append(''.join(cell.split()))
+                else:
+                    kwline.append(cell)
             kwln = len(lo[key])
     return result
 
+# 格式化数据
+def formatter(datalist):
+    result = dict()
+
+    for d in datalist:
+        if len(d) == 1:
+            for key in d.keys():
+                result[key] = d[key]
+        else:
+            for k in list(d.keys()):
+                if k == "".join(d[k].split()):
+                    d.pop(k)
+                    if result.get(k):
+                        result[k].append(d)
+                    else:
+                        result[k] = [d]
+
+    return result
+
 if __name__ == '__main__':
-    print(parse_layout(path))
+    # pprint(parse_layout(path))
+    pprint(formatter(parse_layout(path)))