Browse Source

modified: resume_parse.py

sprivacy, cách đây 3 năm
mục cha
commit
5676abee61
1 tập tin đã thay đổi với 18 bổ sung và 2 xóa
  1. 18 2
      tools/resume_parse.py

+ 18 - 2
tools/resume_parse.py

@@ -1190,7 +1190,15 @@ def detection_type(path, system):
             # 传入为 doc
             # 传入为 doc
             logging.info(filename)
             logging.info(filename)
             if filename.endswith('.doc') and not filename.startswith('.~'):
             if filename.endswith('.doc') and not filename.startswith('.~'):
-                doc2pdf(docPath = filename, pdfPath = './', system=system)
+                doc2pdf(docPath = filename, pdfPath = './pdf', system=system)
+                newfile = './pdf/' + os.path.splitext(os.path.split(newfile)[-1])[0] + '.pdf'
+                if os.path.exists(newfile):
+                    rst = check_pdf(filename)
+                    if "Table" in rst:
+                        parse_table_from_pdf(filename)
+                        pass
+                    if "Word" in rst:
+                        read_from_pdf(filename)
             # 传入为 docx
             # 传入为 docx
             elif os.path.isfile(filename) and filename.endswith('.docx'):
             elif os.path.isfile(filename) and filename.endswith('.docx'):
                 check_word(filename)
                 check_word(filename)
@@ -1207,7 +1215,15 @@ def detection_type(path, system):
                 parse_txt(filename)
                 parse_txt(filename)
     # 传入为 doc
     # 传入为 doc
     elif os.path.isfile(path) and path.endswith('.doc'):
     elif os.path.isfile(path) and path.endswith('.doc'):
-        doc2pdf(docPath = path, pdfPath = './', system=system)
+        doc2pdf(docPath = path, pdfPath = './pdf', system=system)
+        newfile = './pdf/' + os.path.splitext(os.path.split(newfile)[-1])[0] + '.pdf'
+        if os.path.exists(newfile):
+            rst = check_pdf(filename)
+            if "Table" in rst:
+                parse_table_from_pdf(filename)
+                pass
+            if "Word" in rst:
+                read_from_pdf(filename)
     # 传入为 docx
     # 传入为 docx
     elif os.path.isfile(path) and path.endswith('.docx'):
     elif os.path.isfile(path) and path.endswith('.docx'):
         check_word(path)
         check_word(path)