3 роки тому · c6af61bebe
--- a/tools/custom.py
+++ b/tools/custom.py
@@ -2,7 +2,7 @@
 
				 # @Author: privacy

			
 
				 # @Date:   2022-07-11 09:21:24

			
 
				 # @Last Modified by:   privacy

			
 
				-# @Last Modified time: 2022-07-15 17:22:00

			
 
				+# @Last Modified time: 2022-07-16 14:26:21

			
 
				 

			
 
				 # 自定义模板

			
 
				 

			
@@ -12,11 +12,13 @@ import logging
 
				 from pprint import pprint

			
 
				 import requests

			
 
				 from requests.adapters import HTTPAdapter

			
 
				+import pdfplumber

			
 
				 from docx import Document

			
 
				 from docx.shared import Inches

			
 
				 

			
 
				 

			
 
				 path = "d:\\desktop\\自定义.docx"

			
 
				+# path = "d:\\desktop\\自定义.pdf"

			
 
				 

			
 
				 # 关键词字典

			
 
				 keywords = [

			
@@ -114,7 +116,7 @@ def parse_layout(path):
 
				             for cell in row.cells[:]:

			
 
				                 c = cell.text

			
 
				                 if c not in row_content:

			
 
				-                	row_content.append(c)

			
 
				+                    row_content.append(c)

			
 
				             lo[len(lo.keys())] = row_content

			
 
				 

			
 
				     kwln = -1# 关键词行长度

			
@@ -148,6 +150,49 @@ def parse_layout(path):
 
				     return result

			
 
				 

			
 
				 

			
 
				def parse_pdf_layout(path):
				    """Turn every table row of a PDF into structured records.

				    Rows are classified by comparing their cells, with all whitespace
				    removed, against the module-level ``keywords`` list:

				    * header row - every non-empty cell is a keyword; it becomes the
				      column header for the rows that follow;
				    * mixed row - at least one cell is a keyword and at least one is not;
				      the row is handed to ``parse_line``;
				    * data row - no cell is a keyword; its cells are paired with the most
				      recent header row.

				    Args:
				        path: Location of the PDF file, passed to ``pdfplumber.open``.

				    Returns:
				        A list holding the items produced by ``parse_line`` for mixed rows
				        and one ``{header: value}`` dict per data row, in document order.
				        An empty data cell is replaced by its header text.
				    """

				    def squash(cell):
				        # Whitespace inside a cell is ignored when matching keywords.
				        return ''.join(cell.split())

				    rows = []
				    with pdfplumber.open(path) as pdf:
				        for page in pdf.pages:
				            for table in page.extract_tables():
				                rows.extend(table)

				    result = []
				    header = None  # most recent header row, whitespace-normalized
				    for row in rows:
				        filled = [squash(cell) for cell in row if cell]
				        if all(text in keywords for text in filled):
				            # Empty cells are kept as-is so the columns stay aligned.
				            header = [squash(cell) if cell else cell for cell in row]
				        elif any(text in keywords for text in filled):
				            result.extend(parse_line(row))
				        elif header is not None:
				            # A data row that shows up before any header row cannot be
				            # labelled; it is skipped instead of crashing in zip().
				            result.append({
				                name: value if value else name
				                for name, value in zip(header, row)
				                if name
				            })
				    return result

			
 
				+

			
 
				 # 格式化数据

			
 
				 def formatter(datalist):

			
 
				     result = dict()

			
@@ -329,6 +374,10 @@ def formatter(datalist):
 
				 

			
 
				 

			
 
				 if __name__ == '__main__':
				     # Script entry point: pick the parser from the extension of ``path``.
				     if path.endswith(".docx"):
				         pprint(formatter(parse_layout(path)))
				     else:
				         # Parse the PDF once, then show the raw records followed by the
				         # formatted ones (the file used to be parsed twice).
				         parsed = parse_pdf_layout(path)
				         pprint(parsed)
				         pprint(formatter(parsed))

			
 
				 

			
 
				 

			
--- a/tools/irafa.py
+++ b/tools/irafa.py
@@ -2,20 +2,24 @@
 
				 # @Author: privacy

			
 
				 # @Date:   2022-07-07 13:12:17

			
 
				 # @Last Modified by:   privacy

			
 
				-# @Last Modified time: 2022-07-16 09:08:32

			
 
				+# @Last Modified time: 2022-07-16 15:05:03

			
 
				 

			
 
				 # 内部人才市场简历模板

			
 
				 from pprint import pprint

			
 
				 import re

			
 
				 import json

			
 
				+from pdfminer.high_level import extract_pages

			
 
				+from pdfminer.layout import LTTextContainer

			
 
				+import pdfplumber

			
 
				 import docx

			
 
				 from docx import Document

			
 
				 from docx.shared import Inches

			
 
				 

			
 
				 

			
 
				-path = "d:\\desktop\\内部人才市场简历模板.docx"

			
 
				+# path = "d:\\desktop\\内部人才市场简历模板.docx"

			
 
				+path = "d:\\desktop\\内部人才市场简历模板.pdf"

			
 
				 

			
 
				-keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格（取得时间）", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]

			
 
				+keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格（取得时间）", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]

			
 
				 

			
 
				 def parse_line(line):

			
 
				     result = []

			
@@ -98,6 +102,76 @@ def parse_layout(path):
 
				     return result

			
 
				 

			
 
				 

			
 
				def parse_pdf_layout(path):
				    """Extract table records and work-experience entries from a PDF resume.

				    Table rows are classified by comparing their cells, with all
				    whitespace removed, against the module-level ``keywords`` list:

				    * header row - every non-empty cell is a keyword; it becomes the
				      column header for the rows that follow;
				    * mixed row - at least one cell is a keyword and at least one is not;
				      the row is handed to ``parse_line``;
				    * data row - no cell is a keyword; its cells are paired with the most
				      recent header row and the record is tagged with a section label
				      chosen from its header names.

				    After the tables, the words of every page are scanned for
				    ``label:value`` pairs (full-width colons count too). The pairs are
				    gathered into dicts that each carry a ``"工作经历"`` marker; a label
				    that already has a value in the current dict starts a new one. Words
				    without a colon are appended to ``"工作描述"`` while that is the label
				    seen most recently.

				    Args:
				        path: Location of the PDF file, passed to ``pdfplumber.open``.

				    Returns:
				        A list with the table records first, followed by the
				        work-experience dicts.
				    """

				    def squash(cell):
				        # Whitespace inside a cell is ignored when matching keywords.
				        return ''.join(cell.split())

				    # (header name that identifies a section, label stored on the record);
				    # only the first matching pair is applied.
				    sections = (
				        ("学校/培训机构", "学习经历"),
				        ("与本人关系", "家庭成员"),
				        ("意向地区", "职业发展管理"),
				        ("职业证书", "职业资格证书"),
				    )

				    # Read the file once and keep both the table rows and the page words.
				    rows = []
				    words = []
				    with pdfplumber.open(path) as pdf:
				        page_total = len(pdf.pages)
				        for page in pdf.pages:
				            for table in page.extract_tables():
				                rows.extend(table)
				            words.extend(word['text'] for word in page.extract_words())

				    result = []
				    header = None  # most recent header row, whitespace-normalized
				    for row in rows:
				        filled = [squash(cell) for cell in row if cell]
				        if all(text in keywords for text in filled):
				            # Empty cells are kept as-is: calling split() on them would
				            # fail, and dropping them would shift the columns.
				            header = [squash(cell) if cell else cell for cell in row]
				        elif any(text in keywords for text in filled):
				            result.extend(parse_line(row))
				        elif header is not None:
				            # A data row that shows up before any header row cannot be
				            # labelled; it is skipped instead of crashing in zip().
				            schema = {name: value for name, value in zip(header, row) if name}
				            for marker, label in sections:
				                if marker in schema:
				                    schema[label] = label
				                    break
				            result.append(schema)

				    job = {"工作经历": "工作经历"}
				    last_label = None  # label assigned most recently
				    for raw in words:
				        text = raw.replace("：", ":")
				        if ":" in text:
				            # Put every "label:" on its own line so that one word holding
				            # several pairs is split into separate entries.
				            text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)
				            for line in text.split("\n"):
				                if not line.strip():
				                    continue
				                parts = line.split(":")
				                if len(parts) < 2:
				                    # Fragment left in front of the first label.
				                    continue
				                label = parts[0].strip()
				                if job.get(label):
				                    # The label repeats, so a new entry begins here.
				                    result.append(job)
				                    job = {"工作经历": "工作经历"}
				                job[label] = parts[1].strip()
				                last_label = label
				        elif last_label == "工作描述":
				            job["工作描述"] += '\n' + text.strip()

				    # Flush the entry still being built exactly once; doing it after
				    # every page appended the same dict repeatedly for multi-page files.
				    if page_total:
				        result.append(job)
				    return result

			
 
				+

			
 
				 # 格式化数据

			
 
				 def formatter(datalist):

			
 
				     result = dict()

			
@@ -195,5 +269,8 @@ def formatter(datalist):
 
				     return result

			
 
				 

			
 
				 if __name__ == "__main__":
				     # Word templates go through parse_layout; any other path is read as
				     # a PDF. The formatted result is pretty-printed.
				     parser = parse_layout if path.endswith(".docx") else parse_pdf_layout
				     pprint(formatter(parser(path)))

			
 
				 

			
--- a/tools/resources/translate.json
+++ b/tools/resources/translate.json
@@ -7,6 +7,7 @@
 
				         "民族":"national",

			
 
				         "籍贯":"birthplace",

			
 
				         "户籍地":"household_register_address",

			
 
				+        "出生地":"household_register_address",

			
 
				         "参加工作时间":"work_begin_time",

			
 
				         "联系电话":"mobile",

			
 
				         "手机号码":"mobile",

			
--- a/tools/srafa.py
+++ b/tools/srafa.py
@@ -2,14 +2,18 @@
 
				 # @Author: privacy

			
 
				 # @Date:   2022-07-07 12:59:42

			
 
				 # @Last Modified by:   privacy

			
 
				-# @Last Modified time: 2022-07-16 09:08:43

			
 
				+# @Last Modified time: 2022-07-16 11:41:09

			
 
				 # import pdb

			
 
				 from pprint import pprint

			
 
				 import json

			
 
				 import pandas as pd

			
 
				 import pdfplumber

			
 
				+import docx

			
 
				+from docx import Document

			
 
				+from docx.shared import Inches

			
 
				 

			
 
				-path = "d:\\desktop\\社招简历模板.pdf"

			
 
				+path = "d:\\desktop\\社招简历模板.docx"

			
 
				+# path = "d:\\desktop\\社招简历模板.pdf"

			
 
				 

			
 
				 keywords = ['姓名',

			
 
				     '性别',

			
@@ -95,7 +99,53 @@ def parse_line(line):
 
				     return result

			
 
				 

			
 
				 

			
 
				-def parse_layout(path):

			
 
				def parse_word_layout(path):
				    """Turn every table row of a Word document into structured records.

				    Cell texts are collected row by row; a text that already occurred in
				    the same row is dropped. Rows are then classified by comparing their
				    cells, with all whitespace removed, against the module-level
				    ``keywords`` list:

				    * header row - every non-empty cell is a keyword; when it has more
				      than two cells it becomes the column header for later rows;
				    * mixed row - more than a third of its cells are keywords; the row is
				      handed to ``parse_line``;
				    * data row - anything else; its cells are paired with the most recent
				      header row, provided both have the same number of cells.

				    Args:
				        path: Location of the .docx file, passed to ``Document``.

				    Returns:
				        A list holding the items produced by ``parse_line`` for mixed rows
				        and one ``{header: value}`` dict per data row, in document order.
				    """

				    def squash(text):
				        # Whitespace inside a cell is ignored when matching keywords.
				        return ''.join(text.split())

				    rows = []
				    for table in Document(path).tables:
				        for row in table.rows:
				            texts = []
				            for cell in row.cells:
				                if cell.text not in texts:
				                    texts.append(cell.text)
				            rows.append(texts)

				    result = []
				    header = None  # most recent header row, whitespace-normalized
				    for row in rows:
				        filled = [squash(text) for text in row if text]
				        hits = sum(1 for text in filled if text in keywords)
				        if hits == len(filled):
				            # Header rows with one or two cells are ignored altogether.
				            if len(row) > 2:
				                header = [squash(text) for text in row]
				        elif hits > len(row) / 3:
				            result.extend(parse_line(row))
				        elif header is not None and len(header) == len(row):
				            # Rows without a usable header (none seen yet, or a different
				            # width) are skipped; the None case used to raise in len().
				            result.append({name: value for name, value in zip(header, row) if name})
				    return result

			
 
				+

			
 
				+def parse_pdf_layout(path):

			
 
				     result = []

			
 
				     lo = {}

			
 
				     with pdfplumber.open(path) as pdf:

			
@@ -154,6 +204,7 @@ def formatter(datalist):
 
				                     else:

			
 
				                         result[k] = [d]

			
 
				 

			
 
				+

			
 
				     # 转译数据库字段名

			
 
				     with open("./resources/translate.json", "r", encoding="utf-8") as ff:

			
 
				         json_obj = json.load(ff)

			
@@ -205,7 +256,8 @@ def formatter(datalist):
 
				         "语言能力":"tal_language",

			
 
				         "获得职业资格证书情况":"tal_vocational_qualification_certificate",

			
 
				         "专业技能":"tal_professional_tech_certificate",

			
 
				-        "主要家庭成员及社会关系":"tal_family_social_relations"

			
 
				+        "主要家庭成员及社会关系":"tal_family_social_relations",

			
 
				+        "其他情况说明":"intro"

			
 
				     }

			
 
				 

			
 
				     for key in tit.keys():

			
@@ -228,6 +280,9 @@ def formatter(datalist):
 
				     return result

			
 
				 

			
 
				 if __name__ == '__main__':
				     # PDF files go through parse_pdf_layout; any other path is read as a
				     # Word document. The formatted result is pretty-printed.
				     parser = parse_pdf_layout if path.endswith(".pdf") else parse_word_layout
				     pprint(formatter(parser(path)))