|
@@ -4,6 +4,7 @@ import os
|
|
|
import sys
|
|
|
import re
|
|
|
import json
|
|
|
+import time
|
|
|
from os import walk
|
|
|
import subprocess
|
|
|
import py7zr
|
|
@@ -1024,7 +1025,7 @@ def doc2pdf_win(docPath, pdfPath):
|
|
|
word.Quit() #退出
|
|
|
|
|
|
|
|
|
-# Win32 doc 文件处理
|
|
|
+# doc 文件处理
|
|
|
def doc2pdf(docPath, pdfPath, system):
|
|
|
"""
|
|
|
注意使用绝对路径
|
|
@@ -1038,7 +1039,7 @@ def doc2pdf(docPath, pdfPath, system):
|
|
|
|
|
|
|
|
|
# txt 纯文本解析(已完成)
|
|
|
-def parse_txt(path):
|
|
|
+def parse_txt(path, save_dir):
|
|
|
with open(path, 'r', encoding='utf-8') as fp:
|
|
|
data = fp.read()
|
|
|
global block, block_rev
|
|
@@ -1069,12 +1070,13 @@ def parse_txt(path):
|
|
|
for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
|
|
|
if key == index:
|
|
|
result_data.append({block_rev[index]:func(page[index])})
|
|
|
- with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
|
|
|
+ filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
|
|
|
+ with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
|
|
|
json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
|
|
|
|
|
|
|
|
|
# 纯文本 word 解析
|
|
|
-def read_from_word(doc, path):
|
|
|
+def read_from_word(doc, path, save_dir):
|
|
|
para_text = []
|
|
|
for para in doc.paragraphs:
|
|
|
para_text.append(para.text)
|
|
@@ -1097,18 +1099,19 @@ def read_from_word(doc, path):
|
|
|
for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
|
|
|
if key == index:
|
|
|
result_data.append({block_rev[index]:func(page[index])})
|
|
|
- with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
|
|
|
+ filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
|
|
|
+ with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
|
|
|
json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
|
|
|
|
|
|
|
|
|
# 提取 word 表格(已完成)
|
|
|
-def check_word(path):
|
|
|
+def check_word(path, save_dir):
|
|
|
doc = Document(path)
|
|
|
tables = doc.tables
|
|
|
|
|
|
if not tables:
|
|
|
logging.info("this is raw text")
|
|
|
- read_from_word(doc, path)
|
|
|
+ read_from_word(doc, path, save_dir=save_dir)
|
|
|
logging.info("this is a Table")
|
|
|
|
|
|
global block
|
|
@@ -1165,7 +1168,8 @@ def check_word(path):
|
|
|
for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
|
|
|
if key == index:
|
|
|
result_data.append({block_rev[index]:func(page[index])})
|
|
|
- with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
|
|
|
+ filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
|
|
|
+ with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
|
|
|
json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
|
|
|
|
|
|
|
|
@@ -1204,7 +1208,7 @@ def parse_line_layout(layout, b):
|
|
|
|
|
|
|
|
|
# pdf 样式解析(已完成)
|
|
|
-def read_from_pdf(path):
|
|
|
+def read_from_pdf(path, save_dir):
|
|
|
result = {}
|
|
|
global block_rev
|
|
|
with open(path, 'rb') as in_file:
|
|
@@ -1237,13 +1241,14 @@ def read_from_pdf(path):
|
|
|
for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
|
|
|
if key == index:
|
|
|
result_data.append({block_rev[index]:func(result[index])})
|
|
|
- console.print(result_data)
|
|
|
- with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
|
|
|
+
|
|
|
+ filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
|
|
|
+ with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
|
|
|
json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
|
|
|
|
|
|
|
|
|
-# pdf 表格解析 ()
|
|
|
-def parse_table_from_pdf(path):
|
|
|
+# pdf 表格解析 (已完成)
|
|
|
+def parse_table_from_pdf(path, save_dir):
|
|
|
global block, block_rev
|
|
|
lo = {}
|
|
|
with pdfplumber.open(path) as pdf:
|
|
@@ -1291,7 +1296,8 @@ def parse_table_from_pdf(path):
|
|
|
for index, func in zip([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_other_list, get_other_list, get_other_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
|
|
|
if key == index:
|
|
|
result_data.append({block_rev[index]:func(page[index])})
|
|
|
- with open("./results/"+os.path.splitext(os.path.split(path)[-1])[0]+'.json', 'w', encoding="utf-8") as fp:
|
|
|
+ filename = os.path.splitext(os.path.split(path)[-1])[0]+'.json'
|
|
|
+ with open(os.path.join(save_dir, filename), 'w', encoding="utf-8") as fp:
|
|
|
json.dump({"result":result_data}, fp, indent=4, ensure_ascii=False)
|
|
|
|
|
|
|
|
@@ -1329,12 +1335,15 @@ def decode_path(path):
|
|
|
|
|
|
# 检测传入格式(已完成)
|
|
|
def detection_type(path, system):
|
|
|
+ tempdir = time.strftime("%Y_%m_%dT%H_%M_%S")
|
|
|
+ os.mkdir(tempdir)
|
|
|
# 传入 tar.gz 压缩文件
|
|
|
if os.path.isfile(path) and path.endswith('.tar.gz'):
|
|
|
tf = tarfile.open(path)
|
|
|
- tf.extractall('./cache')
|
|
|
+ tf.extractall('./cache/' + tempdir)
|
|
|
tf.close()
|
|
|
- path = "./cache"
|
|
|
+ path = "./cache/" + tempdir
|
|
|
+ # 传入 .zip .7z 压缩文件
|
|
|
try:
|
|
|
if os.path.isfile(path) and path.endswith('.zip'):
|
|
|
## 解压方式1:存在乱码
|
|
@@ -1348,14 +1357,14 @@ def detection_type(path, system):
|
|
|
for filename in file_iter:
|
|
|
# 编码文件名称为 utf 格式
|
|
|
filename.filename = decode_path(filename.filename) # 防止乱码的操作
|
|
|
- zf.extract(filename, "./cache")
|
|
|
+ zf.extract(filename, "./cache/" + tempdir)
|
|
|
+ path = "./cache/" + tempdir
|
|
|
elif os.path.isfile(path) and path.endswith('.7z'): # .7z格式文件解压
|
|
|
zf = py7zr.SevenZipFile(path, mode='r')
|
|
|
- zf.extractall("./cache")
|
|
|
+ zf.extractall("./cache/" + tempdir)
|
|
|
+ path = "./cache/" + tempdir
|
|
|
except Exception as e:
|
|
|
logging.error(e)
|
|
|
- else:
|
|
|
- path = "./cache"
|
|
|
# 传入为 doc
|
|
|
if os.path.isfile(path) and path.endswith('.doc'):
|
|
|
doc2pdf(docPath = path, pdfPath = './pdf', system=system)
|
|
@@ -1363,23 +1372,23 @@ def detection_type(path, system):
|
|
|
if os.path.exists(newfile):
|
|
|
rst = check_pdf(newfile)
|
|
|
if "Table" in rst:
|
|
|
- parse_table_from_pdf(newfile)
|
|
|
+ parse_table_from_pdf(newfile, save_dir=tempdir)
|
|
|
pass
|
|
|
if "Word" in rst:
|
|
|
- read_from_pdf(newfile)
|
|
|
+ read_from_pdf(newfile, save_dir=tempdir)
|
|
|
# 传入为 docx
|
|
|
elif os.path.isfile(path) and path.endswith('.docx'):
|
|
|
- check_word(path)
|
|
|
+ check_word(path, save_dir=tempdir)
|
|
|
# 传入为 pdf
|
|
|
elif os.path.isfile(path) and path.endswith('.pdf'):
|
|
|
rst = check_pdf(path)
|
|
|
if "Table" in rst:
|
|
|
- parse_table_from_pdf(path)
|
|
|
+ parse_table_from_pdf(path, save_dir=tempdir)
|
|
|
if "Word" in rst:
|
|
|
- read_from_pdf(path)
|
|
|
+ read_from_pdf(path, save_dir=tempdir)
|
|
|
# 传入为 txt
|
|
|
elif os.path.isfile(path) and path.endswith('.txt'):
|
|
|
- parse_txt(path)
|
|
|
+ parse_txt(path, save_dir=tempdir)
|
|
|
# 传入目录
|
|
|
elif os.path.isdir(path):
|
|
|
for filename in os.listdir(path):
|
|
@@ -1392,27 +1401,27 @@ def detection_type(path, system):
|
|
|
if os.path.exists(newfile):
|
|
|
rst = check_pdf(newfile)
|
|
|
if "Table" in rst:
|
|
|
- parse_table_from_pdf(newfile)
|
|
|
+ parse_table_from_pdf(newfile, save_dir=tempdir)
|
|
|
pass
|
|
|
if "Word" in rst:
|
|
|
- read_from_pdf(newfile)
|
|
|
+ read_from_pdf(newfile, save_dir=tempdir)
|
|
|
# 传入为 docx
|
|
|
elif os.path.isfile(filename) and filename.endswith('.docx'):
|
|
|
- check_word(filename)
|
|
|
+ check_word(filename, save_dir=tempdir)
|
|
|
# 传入为 pdf
|
|
|
if os.path.isfile(filename) and filename.endswith('.pdf'):
|
|
|
rst = check_pdf(filename)
|
|
|
if "Table" in rst:
|
|
|
- parse_table_from_pdf(filename)
|
|
|
+ parse_table_from_pdf(filename, save_dir=tempdir)
|
|
|
pass
|
|
|
if "Word" in rst:
|
|
|
- read_from_pdf(filename)
|
|
|
+ read_from_pdf(filename, save_dir=tempdir)
|
|
|
# 传入为 txt
|
|
|
elif os.path.isfile(filename) and filename.endswith('.txt'):
|
|
|
- parse_txt(filename)
|
|
|
+ parse_txt(filename, save_dir=tempdir)
|
|
|
# 结果返回
|
|
|
- for file in os.listdir("results"):
|
|
|
- filename = os.path.join("./results", file)
|
|
|
+ for file in os.listdir(tempdir):
|
|
|
+ filename = os.path.join(tempdir, file)
|
|
|
with open(filename, "r", encoding="utf-8") as ff:
|
|
|
rst = json.load(ff)
|
|
|
console.print(rst, style="red", justify="left")
|