Forráskód Böngészése

modified: resume_parse.py

sprivacy 3 éve
szülő
commit
746a4d1e8c
1 módosított fájl, 43 hozzáadás és 34 törlés
  1. tools/resume_parse.py (+43 −34)

tools/resume_parse.py (+43 −34)

@@ -4,6 +4,7 @@ import os
 import sys
 import re
 import json
+import time
 from os import walk
 import subprocess
 import py7zr
@@ -1024,7 +1025,7 @@ def doc2pdf_win(docPath, pdfPath):
     word.Quit() #退出
 
 
-# Win32 doc 文件处理
+# doc 文件处理
 def doc2pdf(docPath, pdfPath, system):
     """
     注意使用绝对路径
@@ -1038,7 +1039,7 @@ def doc2pdf(docPath, pdfPath, system):
 
 
 # txt 纯文本解析(已完成)
-def parse_txt(path):
+def parse_txt(path, save_dir):
     with open(path, 'r', encoding='utf-8') as fp:
         data = fp.read()
     global block, block_rev
@@ -1069,12 +1070,13 @@ def parse_txt(path):
         for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
             if key == index:
                 result_data.append({block_rev[index]:func(page[index])})
-    with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
+    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
+    with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
             json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
 
 
 # 纯文本 word 解析
-def read_from_word(doc, path):
+def read_from_word(doc, path, save_dir):
     para_text = []
     for para in doc.paragraphs:
         para_text.append(para.text)
@@ -1097,18 +1099,19 @@ def read_from_word(doc, path):
         for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
             if key == index:
                 result_data.append({block_rev[index]:func(page[index])})
-    with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
+    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
+    with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
             json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
 
 
 # 提取 word 表格(已完成)
-def check_word(path):
+def check_word(path, save_dir):
     doc = Document(path)
     tables = doc.tables
 
     if not tables:
         logging.info("this is raw text")
-        read_from_word(doc, path)
+        read_from_word(doc, path, save_dir=save_dir)
     logging.info("this is a Table")
 
     global block
@@ -1165,7 +1168,8 @@ def check_word(path):
         for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
             if key == index:
                 result_data.append({block_rev[index]:func(page[index])})
-    with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
+    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
+    with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
             json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
 
 
@@ -1204,7 +1208,7 @@ def parse_line_layout(layout, b):
 
 
 # pdf 样式解析(已完成)
-def read_from_pdf(path):
+def read_from_pdf(path, save_dir):
     result = {}
     global block_rev
     with open(path, 'rb') as in_file:
@@ -1237,13 +1241,14 @@ def read_from_pdf(path):
             for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
                 if key == index:
                     result_data.append({block_rev[index]:func(result[index])})
-        console.print(result_data)
-        with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
+
+        filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
+        with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
             json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
 
 
-# pdf 表格解析 ()
-def parse_table_from_pdf(path):
+# pdf 表格解析 (已完成)
+def parse_table_from_pdf(path, save_dir):
     global block, block_rev
     lo = {}
     with pdfplumber.open(path) as pdf:
@@ -1291,7 +1296,8 @@ def parse_table_from_pdf(path):
         for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
             if key == index:
                 result_data.append({block_rev[index]:func(page[index])})
-    with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
+    filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
+    with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
             json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
 
 
@@ -1329,12 +1335,15 @@ def decode_path(path):
 
 # 检测传入格式(已完成)
 def detection_type(path, system):
+    tempdir = time.strftime("%Y_%m_%dT%H_%M_%S")
+    os.mkdir(tempdir)
     # 传入 tar.gz 压缩文件
     if os.path.isfile(path) and path.endswith('.tar.gz'):
         tf = tarfile.open(path)
-        tf.extractall('./cache')
+        tf.extractall('./cache/' + tempdir)
         tf.close()
-        path = "./cache"
+        path = "./cache/" + tempdir
+    # 传入 .zip .7z 压缩文件
     try:
         if os.path.isfile(path) and path.endswith('.zip'):
             ## 解压方式1:存在乱码
@@ -1348,14 +1357,14 @@ def detection_type(path, system):
                 for filename in file_iter:
                     # 编码文件名称为 utf 格式
                     filename.filename = decode_path(filename.filename)  # 防止乱码的操作
-                    zf.extract(filename, "./cache")
+                    zf.extract(filename, "./cache/" + tempdir)
+            path = "./cache/" + tempdir
         elif os.path.isfile(path) and path.endswith('.7z'):  # .7z格式文件解压
             zf = py7zr.SevenZipFile(path, mode='r')
-            zf.extractall("./cache")
+            zf.extractall("./cache/" + tempdir)
+            path = "./cache/" + tempdir
     except Exception as e:
         logging.error(e)
-    else:
-        path = "./cache"
     # 传入为 doc
     if os.path.isfile(path) and path.endswith('.doc'):
         doc2pdf(docPath = path, pdfPath = './pdf', system=system)
@@ -1363,23 +1372,23 @@ def detection_type(path, system):
         if os.path.exists(newfile):
             rst = check_pdf(newfile)
             if "Table" in rst:
-                parse_table_from_pdf(newfile)
+                parse_table_from_pdf(newfile, save_dir=tempdir)
                 pass
             if "Word" in rst:
-                read_from_pdf(newfile)
+                read_from_pdf(newfile, save_dir=tempdir)
     # 传入为 docx
     elif os.path.isfile(path) and path.endswith('.docx'):
-        check_word(path)
+        check_word(path, save_dir=tempdir)
     # 传入为 pdf
     elif os.path.isfile(path) and path.endswith('.pdf'):
         rst = check_pdf(path)
         if "Table" in rst:
-            parse_table_from_pdf(path)
+            parse_table_from_pdf(path, save_dir=tempdir)
         if "Word" in rst:
-            read_from_pdf(path)
+            read_from_pdf(path, save_dir=tempdir)
     # 传入为 txt
     elif os.path.isfile(path) and path.endswith('.txt'):
-        parse_txt(path)
+        parse_txt(path, save_dir=tempdir)
     # 传入目录
     elif os.path.isdir(path):
         for filename in os.listdir(path):
@@ -1392,27 +1401,27 @@ def detection_type(path, system):
                 if os.path.exists(newfile):
                     rst = check_pdf(newfile)
                     if "Table" in rst:
-                        parse_table_from_pdf(newfile)
+                        parse_table_from_pdf(newfile, save_dir=tempdir)
                         pass
                     if "Word" in rst:
-                        read_from_pdf(newfile)
+                        read_from_pdf(newfile, save_dir=tempdir)
             # 传入为 docx
             elif os.path.isfile(filename) and filename.endswith('.docx'):
-                check_word(filename)
+                check_word(filename, save_dir=tempdir)
             # 传入为 pdf
             if os.path.isfile(filename) and filename.endswith('.pdf'):
                 rst = check_pdf(filename)
                 if "Table" in rst:
-                    parse_table_from_pdf(filename)
+                    parse_table_from_pdf(filename, save_dir=tempdir)
                     pass
                 if "Word" in rst:
-                    read_from_pdf(filename)
+                    read_from_pdf(filename, save_dir=tempdir)
             # 传入为 txt
             elif os.path.isfile(filename) and filename.endswith('.txt'):
-                parse_txt(filename)
+                parse_txt(filename, save_dir=tempdir)
     # 结果返回
-    for file in os.listdir("results"):
-        filename = os.path.join("./results", file)
+    for file in os.listdir(tempdir):
+        filename = os.path.join(tempdir, file)
         with open(filename, "r", encoding="utf-8") as ff:
             rst = json.load(ff)
         console.print(rst, style="red", justify="left")