3 年前 · 746a4d1e8c
--- a/tools/resume_parse.py
+++ b/tools/resume_parse.py
@@ -4,6 +4,7 @@ import os
 
				 import sys
			
 
				 import re
			
 
				 import json
			
 
				+import time
			
 
				 from os import walk
			
 
				 import subprocess
			
 
				 import py7zr
			
@@ -1024,7 +1025,7 @@ def doc2pdf_win(docPath, pdfPath):
 
				     word.Quit() #退出
			
 
				 
			
 
				 
			
 
				-# Win32 doc 文件处理
			
 
				+# doc 文件处理
			
 
				 def doc2pdf(docPath, pdfPath, system):
			
 
				     """
			
 
				     注意使用绝对路径
			
@@ -1038,7 +1039,7 @@ def doc2pdf(docPath, pdfPath, system):
 
				 
			
 
				 
			
 
				 # txt 纯文本解析(已完成)
			
 
				-def parse_txt(path):
			
 
				+def parse_txt(path, save_dir):
			
 
				     with open(path, 'r', encoding='utf-8') as fp:
			
 
				         data = fp.read()
			
 
				     global block, block_rev
			
@@ -1069,12 +1070,13 @@ def parse_txt(path):
 
				         for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
			
 
				             if key == index:
			
 
				                 result_data.append({block_rev[index]:func(page[index])})
			
 
				-    with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
			
 
				+    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
			
 
				+    with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
			
 
				             json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
			
 
				 
			
 
				 
			
 
				 # 纯文本 word 解析
			
 
				-def read_from_word(doc, path):
			
 
				+def read_from_word(doc, path, save_dir):
			
 
				     para_text = []
			
 
				     for para in doc.paragraphs:
			
 
				         para_text.append(para.text)
			
@@ -1097,18 +1099,19 @@ def read_from_word(doc, path):
 
				         for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
			
 
				             if key == index:
			
 
				                 result_data.append({block_rev[index]:func(page[index])})
			
 
				-    with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
			
 
				+    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
			
 
				+    with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
			
 
				             json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
			
 
				 
			
 
				 
			
 
				 # 提取 word 表格(已完成)
			
 
				-def check_word(path):
			
 
				+def check_word(path, save_dir):
			
 
				     doc = Document(path)
			
 
				     tables = doc.tables
			
 
				 
			
 
				     if not tables:
			
 
				         logging.info("this is raw text")
			
 
				-        read_from_word(doc, path)
			
 
				+        read_from_word(doc, path, save_dir=save_dir)
			
 
				     logging.info("this is a Table")
			
 
				 
			
 
				     global block
			
@@ -1165,7 +1168,8 @@ def check_word(path):
 
				         for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
			
 
				             if key == index:
			
 
				                 result_data.append({block_rev[index]:func(page[index])})
			
 
				-    with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
			
 
				+    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
			
 
				+    with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
			
 
				             json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
			
 
				 
			
 
				 
			
@@ -1204,7 +1208,7 @@ def parse_line_layout(layout, b):
 
				 
			
 
				 
			
 
				 # pdf 样式解析(已完成)
			
 
				-def read_from_pdf(path):
			
 
				+def read_from_pdf(path, save_dir):
			
 
				     result = {}
			
 
				     global block_rev
			
 
				     with open(path, 'rb') as in_file:
			
@@ -1237,13 +1241,14 @@ def read_from_pdf(path):
 
				             for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
			
 
				                 if key == index:
			
 
				                     result_data.append({block_rev[index]:func(result[index])})
			
 
				-        console.print(result_data)
			
 
				-        with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
			
 
				+
			
 
				+        filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
			
 
				+        with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
			
 
				             json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
			
 
				 
			
 
				 
			
 
				-# pdf 表格解析 ()
			
 
				-def parse_table_from_pdf(path):
			
 
				+# pdf 表格解析 (已完成)
			
 
				+def parse_table_from_pdf(path, save_dir):
			
 
				     global block, block_rev
			
 
				     lo = {}
			
 
				     with pdfplumber.open(path) as pdf:
			
@@ -1291,7 +1296,8 @@ def parse_table_from_pdf(path):
 
				         for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
			
 
				             if key == index:
			
 
				                 result_data.append({block_rev[index]:func(page[index])})
			
 
				-    with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
			
 
				+    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
			
 
				+    with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
			
 
				             json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
			
 
				 
			
 
				 
			
@@ -1329,12 +1335,15 @@ def decode_path(path):
 
				 
			
 
				 # 检测传入格式(已完成)
			
 
				 def detection_type(path, system):
			
 
				+    tempdir = time.strftime("%Y_%m_%dT%H_%M_%S")
			
 
				+    os.mkdir(tempdir)
			
 
				     # 传入 tar.gz 压缩文件
			
 
				     if os.path.isfile(path) and path.endswith('.tar.gz'):
			
 
				         tf = tarfile.open(path)
			
 
				-        tf.extractall('./cache')
			
 
				+        tf.extractall('./cache/' + tempdir)
			
 
				         tf.close()
			
 
				-        path = "./cache"
			
 
				+        path = "./cache/" + tempdir
			
 
				+    # 传入 .zip .7z 压缩文件
			
 
				     try:
			
 
				         if os.path.isfile(path) and path.endswith('.zip'):
			
 
				             ## 解压方式1：存在乱码
			
@@ -1348,14 +1357,14 @@ def detection_type(path, system):
 
				                 for filename in file_iter:
			
 
				                     # 编码文件名称为 utf 格式
			
 
				                     filename.filename = decode_path(filename.filename)  # 防止乱码的操作
			
 
				-                    zf.extract(filename, "./cache")
			
 
				+                    zf.extract(filename, "./cache/" + tempdir)
			
 
				+            path = "./cache/" + tempdir
			
 
				         elif os.path.isfile(path) and path.endswith('.7z'):  # .7z格式文件解压
			
 
				             zf = py7zr.SevenZipFile(path, mode='r')
			
 
				-            zf.extractall("./cache")
			
 
				+            zf.extractall("./cache/" + tempdir)
			
 
				+            path = "./cache/" + tempdir
			
 
				     except Exception as e:
			
 
				         logging.error(e)
			
 
				-    else:
			
 
				-        path = "./cache"
			
 
				     # 传入为 doc
			
 
				     if os.path.isfile(path) and path.endswith('.doc'):
			
 
				         doc2pdf(docPath = path, pdfPath = './pdf', system=system)
			
@@ -1363,23 +1372,23 @@ def detection_type(path, system):
 
				         if os.path.exists(newfile):
			
 
				             rst = check_pdf(newfile)
			
 
				             if "Table" in rst:
			
 
				-                parse_table_from_pdf(newfile)
			
 
				+                parse_table_from_pdf(newfile, save_dir=tempdir)
			
 
				                 pass
			
 
				             if "Word" in rst:
			
 
				-                read_from_pdf(newfile)
			
 
				+                read_from_pdf(newfile, save_dir=tempdir)
			
 
				     # 传入为 docx
			
 
				     elif os.path.isfile(path) and path.endswith('.docx'):
			
 
				-        check_word(path)
			
 
				+        check_word(path, save_dir=tempdir)
			
 
				     # 传入为 pdf
			
 
				     elif os.path.isfile(path) and path.endswith('.pdf'):
			
 
				         rst = check_pdf(path)
			
 
				         if "Table" in rst:
			
 
				-            parse_table_from_pdf(path)
			
 
				+            parse_table_from_pdf(path, save_dir=tempdir)
			
 
				         if "Word" in rst:
			
 
				-            read_from_pdf(path)
			
 
				+            read_from_pdf(path, save_dir=tempdir)
			
 
				     # 传入为 txt
			
 
				     elif os.path.isfile(path) and path.endswith('.txt'):
			
 
				-        parse_txt(path)
			
 
				+        parse_txt(path, save_dir=tempdir)
			
 
				     # 传入目录
			
 
				     elif os.path.isdir(path):
			
 
				         for filename in os.listdir(path):
			
@@ -1392,27 +1401,27 @@ def detection_type(path, system):
 
				                 if os.path.exists(newfile):
			
 
				                     rst = check_pdf(newfile)
			
 
				                     if "Table" in rst:
			
 
				-                        parse_table_from_pdf(newfile)
			
 
				+                        parse_table_from_pdf(newfile, save_dir=tempdir)
			
 
				                         pass
			
 
				                     if "Word" in rst:
			
 
				-                        read_from_pdf(newfile)
			
 
				+                        read_from_pdf(newfile, save_dir=tempdir)
			
 
				             # 传入为 docx
			
 
				             elif os.path.isfile(filename) and filename.endswith('.docx'):
			
 
				-                check_word(filename)
			
 
				+                check_word(filename, save_dir=tempdir)
			
 
				             # 传入为 pdf
			
 
				             if os.path.isfile(filename) and filename.endswith('.pdf'):
			
 
				                 rst = check_pdf(filename)
			
 
				                 if "Table" in rst:
			
 
				-                    parse_table_from_pdf(filename)
			
 
				+                    parse_table_from_pdf(filename, save_dir=tempdir)
			
 
				                     pass
			
 
				                 if "Word" in rst:
			
 
				-                    read_from_pdf(filename)
			
 
				+                    read_from_pdf(filename, save_dir=tempdir)
			
 
				             # 传入为 txt
			
 
				             elif os.path.isfile(filename) and filename.endswith('.txt'):
			
 
				-                parse_txt(filename)
			
 
				+                parse_txt(filename, save_dir=tempdir)
			
 
				     # 结果返回
			
 
				-    for file in os.listdir("results"):
			
 
				-        filename = os.path.join("./results", file)
			
 
				+    for file in os.listdir(tempdir):
			
 
				+        filename = os.path.join(tempdir, file)
			
 
				         with open(filename, "r", encoding="utf-8") as ff:
			
 
				             rst = json.load(ff)
			
 
				         console.print(rst, style="red", justify="left")