Переглянути джерело

modified: custom.py
modified: irafa.py
modified: resources/translate.json
modified: srafa.py

sprivacy 3 роки тому
батько
коміт
c6af61bebe
4 змінених файлів з 194 додано та 12 видалено
  1. 52 3
      tools/custom.py
  2. 81 4
      tools/irafa.py
  3. 1 0
      tools/resources/translate.json
  4. 60 5
      tools/srafa.py

+ 52 - 3
tools/custom.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2022-07-11 09:21:24
 # @Last Modified by:   privacy
-# @Last Modified time: 2022-07-15 17:22:00
+# @Last Modified time: 2022-07-16 14:26:21
 
 # 自定义模板
 
@@ -12,11 +12,13 @@ import logging
 from pprint import pprint
 import requests
 from requests.adapters import HTTPAdapter
+import pdfplumber
 from docx import Document
 from docx.shared import Inches
 
 
 path = "d:\\desktop\\自定义.docx"
+# path = "d:\\desktop\\自定义.pdf"
 
 # 关键词字典
 keywords = [
@@ -114,7 +116,7 @@ def parse_layout(path):
             for cell in row.cells[:]:
                 c = cell.text
                 if c not in row_content:
-                	row_content.append(c)
+                    row_content.append(c)
             lo[len(lo.keys())] = row_content
 
     kwln = -1# 关键词行长度
@@ -148,6 +150,49 @@ def parse_layout(path):
     return result
 
 
+def parse_pdf_layout(path):
+    """Extract resume fields from the tables of a PDF file.
+
+    Every table row of every page is classified as one of:
+
+    * header row - each non-empty cell is a keyword; the row is remembered
+      and used to label the value rows that follow it;
+    * mixed row - contains at least one keyword next to other text; it is
+      handed to ``parse_line`` and the returned items are collected;
+    * value row - contains no keyword; its cells are paired, column by
+      column, with the cells of the most recent header row.
+
+    A value row that appears before any header row has nothing to be
+    labelled with and is skipped (it used to raise ``TypeError``).
+
+    :param path: path of the PDF file to parse.
+    :return: list with the items produced by ``parse_line`` and one dict
+        per value row, in table order.
+    """
+    rows = []
+    with pdfplumber.open(path) as pdf:
+        for page in pdf.pages:
+            for table in page.extract_tables():
+                rows.extend(table)
+
+    def squash(text):
+        # Remove all whitespace so text wrapped inside a cell still matches.
+        return ''.join(text.split())
+
+    result = []
+    kwline = None  # squashed cells of the most recent header row
+    for row in rows:
+        # Empty cells (falsy values) never take part in the classification.
+        filled = [cell for cell in row if cell]
+        if all(squash(cell) in keywords for cell in filled):
+            # Header row; empty cells stay in place to keep columns aligned.
+            kwline = [squash(cell) if cell else cell for cell in row]
+        elif any(squash(cell) in keywords for cell in filled):
+            result.extend(parse_line(row))
+        elif kwline is not None:
+            schema = dict()
+            for header, value in zip(kwline, row):
+                if header:
+                    # An empty value falls back to the header text itself.
+                    schema[header] = value if value else header
+            result.append(schema)
+    return result
+
 # 格式化数据
 def formatter(datalist):
     result = dict()
@@ -329,6 +374,10 @@ def formatter(datalist):
 
 
 if __name__ == '__main__':
-    pprint(formatter(parse_layout(path)))
+    if path.endswith(".docx"):
+        pprint(formatter(parse_layout(path)))
+    else:
+        pprint(parse_pdf_layout(path))
+        pprint(formatter(parse_pdf_layout(path)))
 
 

+ 81 - 4
tools/irafa.py

@@ -2,20 +2,24 @@
 # @Author: privacy
 # @Date:   2022-07-07 13:12:17
 # @Last Modified by:   privacy
-# @Last Modified time: 2022-07-16 09:08:32
+# @Last Modified time: 2022-07-16 15:05:03
 
 # 内部人才市场简历模板
 from pprint import pprint
 import re
 import json
+from pdfminer.high_level import extract_pages
+from pdfminer.layout import LTTextContainer
+import pdfplumber
 import docx
 from docx import Document
 from docx.shared import Inches
 
 
-path = "d:\\desktop\\内部人才市场简历模板.docx"
+# path = "d:\\desktop\\内部人才市场简历模板.docx"
+path = "d:\\desktop\\内部人才市场简历模板.pdf"
 
-keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格(取得时间)", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]
+keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格(取得时间)", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]
 
 def parse_line(line):
     result = []
@@ -98,6 +102,76 @@ def parse_layout(path):
     return result
 
 
+def parse_pdf_layout(path):
+    """Extract resume fields from the tables and the free text of a PDF.
+
+    Table rows are classified as one of:
+
+    * header row - each non-empty cell is a keyword; the row is remembered
+      and used to label the value rows that follow it;
+    * mixed row - contains at least one keyword next to other text; it is
+      handed to ``parse_line`` and the returned items are collected;
+    * value row - contains no keyword; its cells are paired, column by
+      column, with the most recent header row, and a section marker is
+      added when the header row belongs to a known section.
+
+    Afterwards the words of every page are scanned for ``key:value`` pairs
+    which are grouped into work-experience records.  A record is closed
+    when a key it already holds shows up again; the record still open at
+    the end is appended exactly once.
+
+    :param path: path of the PDF file to parse.
+    :return: list with the table items first, then the work-experience
+        records.
+    """
+    rows = []
+    words = []
+    # Read tables and words in one pass so the file is opened only once.
+    with pdfplumber.open(path) as pdf:
+        for page in pdf.pages:
+            for table in page.extract_tables():
+                rows.extend(table)
+            words.extend(word['text'] for word in page.extract_words())
+
+    def squash(text):
+        # Remove all whitespace so text wrapped inside a cell still matches.
+        return ''.join(text.split())
+
+    # (header keyword, section marker); the first header found wins.
+    sections = (
+        ("学校/培训机构", "学习经历"),
+        ("与本人关系", "家庭成员"),
+        ("意向地区", "职业发展管理"),
+        ("职业证书", "职业资格证书"),
+    )
+
+    result = []
+    kwline = None  # squashed cells of the most recent header row
+    for row in rows:
+        # Empty cells (falsy values) never take part in the classification.
+        filled = [cell for cell in row if cell]
+        if all(squash(cell) in keywords for cell in filled):
+            # Header row; an empty cell is kept as is instead of being
+            # squashed, which failed when the cell was None.
+            kwline = [squash(cell) if cell else cell for cell in row]
+        elif any(squash(cell) in keywords for cell in filled):
+            result.extend(parse_line(row))
+        elif kwline is not None:
+            # Value row; one seen before any header row is skipped.
+            schema = dict()
+            for header, value in zip(kwline, row):
+                if header:
+                    schema[header] = value
+            for header, section in sections:
+                if header in schema:
+                    schema[section] = section
+                    break
+            result.append(schema)
+
+    job = {"工作经历": "工作经历"}
+    flag = None  # key of the pair stored last
+    for raw in words:
+        # NOTE(review): the original replace() had identical arguments;
+        # presumably it was meant to turn the full-width colon into ":".
+        text = raw.replace("\uff1a", ":")
+        if ":" in text:
+            # Put every "key:" on its own line so several pairs glued into
+            # one word are handled separately.
+            text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)
+            for line in text.split("\n"):
+                line = line.strip()
+                if not line:
+                    continue
+                if ":" not in line:
+                    # Fragment in front of the first pair: it has no value
+                    # part, so treat it like plain text.
+                    if flag == "工作描述":
+                        job["工作描述"] += '\n' + line
+                    continue
+                # Split on the first colon only so the value keeps any
+                # colon of its own.
+                name, _, value = line.partition(":")
+                name = name.strip()
+                if job.get(name):
+                    # The key repeats: the current record is complete.
+                    result.append(job)
+                    job = {"工作经历": "工作经历"}
+                job[name] = value.strip()
+                flag = name
+        elif flag == "工作描述":
+            # Plain text after a description is a continuation line.
+            job["工作描述"] += '\n' + text.strip()
+    # Append once after all pages; appending per page stored the same dict
+    # several times for multi-page files.
+    result.append(job)
+    return result
+
 # 格式化数据
 def formatter(datalist):
     result = dict()
@@ -195,5 +269,8 @@ def formatter(datalist):
     return result
 
 if __name__ == "__main__":
-    pprint(formatter(parse_layout(path)))
+    if path.endswith(".docx"):
+        pprint(formatter(parse_layout(path)))
+    else:
+        pprint(formatter(parse_pdf_layout(path)))
 

+ 1 - 0
tools/resources/translate.json

@@ -7,6 +7,7 @@
         "民族":"national",
         "籍贯":"birthplace",
         "户籍地":"household_register_address",
+        "出生地":"household_register_address",
         "参加工作时间":"work_begin_time",
         "联系电话":"mobile",
         "手机号码":"mobile",

+ 60 - 5
tools/srafa.py

@@ -2,14 +2,18 @@
 # @Author: privacy
 # @Date:   2022-07-07 12:59:42
 # @Last Modified by:   privacy
-# @Last Modified time: 2022-07-16 09:08:43
+# @Last Modified time: 2022-07-16 11:41:09
 # import pdb
 from pprint import pprint
 import json
 import pandas as pd
 import pdfplumber
+import docx
+from docx import Document
+from docx.shared import Inches
 
-path = "d:\\desktop\\社招简历模板.pdf"
+path = "d:\\desktop\\社招简历模板.docx"
+# path = "d:\\desktop\\社招简历模板.pdf"
 
 keywords = ['姓名',
     '性别',
@@ -95,7 +99,53 @@ def parse_line(line):
     return result
 
 
-def parse_layout(path):
+def parse_word_layout(path):
+    """Extract resume fields from the tables of a Word (.docx) file.
+
+    Table rows are classified as one of:
+
+    * header row - each non-empty cell is a keyword; it is remembered when
+      it has more than two cells and labels the value rows that follow;
+    * mixed row - more than a third of its cells are keywords; it is
+      handed to ``parse_line`` and the returned items are collected;
+    * value row - anything else; its cells are paired, column by column,
+      with the most recent header row.  The row is skipped when no header
+      row is known yet or when the two rows differ in length.
+
+    :param path: path of the .docx file to parse.
+    :return: list with the items produced by ``parse_line`` and one dict
+        per value row, in table order.
+    """
+    doc = Document(path)
+    rows = []
+    for table in doc.tables:
+        for row in table.rows:
+            cells = []
+            for cell in row.cells:
+                text = cell.text
+                # Keep each distinct text once per row (presumably to fold
+                # merged cells; it also folds equal neighbouring texts).
+                if text not in cells:
+                    cells.append(text)
+            rows.append(cells)
+
+    def squash(text):
+        # Remove all whitespace so text wrapped inside a cell still matches.
+        return ''.join(text.split())
+
+    result = []
+    kwline = None  # squashed cells of the most recent header row
+    for row in rows:
+        filled = [cell for cell in row if cell]
+        hits = sum(1 for cell in filled if squash(cell) in keywords)
+        if hits == len(filled):
+            # Header row; rows of one or two cells never become the header.
+            if len(row) > 2:
+                kwline = [squash(cell) for cell in row]
+        elif hits > len(row) / 3:
+            result.extend(parse_line(row))
+        elif kwline is not None and len(kwline) == len(row):
+            schema = dict()
+            for header, value in zip(kwline, row):
+                if header:
+                    schema[header] = value
+            result.append(schema)
+    return result
+
+def parse_pdf_layout(path):
     result = []
     lo = {}
     with pdfplumber.open(path) as pdf:
@@ -154,6 +204,7 @@ def formatter(datalist):
                     else:
                         result[k] = [d]
 
+
     # 转译数据库字段名
     with open("./resources/translate.json", "r", encoding="utf-8") as ff:
         json_obj = json.load(ff)
@@ -205,7 +256,8 @@ def formatter(datalist):
         "语言能力":"tal_language",
         "获得职业资格证书情况":"tal_vocational_qualification_certificate",
         "专业技能":"tal_professional_tech_certificate",
-        "主要家庭成员及社会关系":"tal_family_social_relations"
+        "主要家庭成员及社会关系":"tal_family_social_relations",
+        "其他情况说明":"intro"
     }
 
     for key in tit.keys():
@@ -228,6 +280,9 @@ def formatter(datalist):
     return result
 
 if __name__ == '__main__':
-    pprint(formatter(parse_layout(path)))
+    if path.endswith(".pdf"):
+        pprint(formatter(parse_pdf_layout(path)))
+    else:
+        pprint(formatter(parse_word_layout(path)))