sprivacy пре 3 година
родитељ
комит
9527e426d5
1 измењених фајлова са 45 додато и 64 уклоњено
  1. 45 64
      tools/resume_parse.py

+ 45 - 64
tools/resume_parse.py

@@ -9,13 +9,10 @@ from pprint import pprint
 import logging
 import logging
 logging.basicConfig(format='%(asctime)s: %(name)s: %(levelname)s: %(filename)s: %(funcName)s: %(message)s', level=logging.INFO)
 logging.basicConfig(format='%(asctime)s: %(name)s: %(levelname)s: %(filename)s: %(funcName)s: %(message)s', level=logging.INFO)
 
 
-import jieba
 import pandas as pd
 import pandas as pd
 from docx import Document
 from docx import Document
 from docx.shared import Inches
 from docx.shared import Inches
 
 
-from numpy import mean, median, bincount, argmax
-
 from pdfminer.high_level import extract_pages
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer, LTChar, LTLine, LAParams, LTTextBox, LTFigure, LTImage, LTText, LTAnno, LTTextLine, LTTextLineHorizontal
 from pdfminer.layout import LTTextContainer, LTChar, LTLine, LAParams, LTTextBox, LTFigure, LTImage, LTText, LTAnno, LTTextLine, LTTextLineHorizontal
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfdocument import PDFDocument
@@ -61,7 +58,7 @@ block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经
 
 
 # 基本信息(已完成)
 # 基本信息(已完成)
 def get_base_info(lines):
 def get_base_info(lines):
-    pprint(lines)
+    logging.debug(lines)
     schema = {
     schema = {
         '姓名': None,
         '姓名': None,
     }
     }
@@ -69,8 +66,6 @@ def get_base_info(lines):
         line = line.replace(r'[ ]{5,}','\n')
         line = line.replace(r'[ ]{5,}','\n')
         w = re.sub(r'[\W]+(\w[::])[\W]{0,}\w', r'\1', line)
         w = re.sub(r'[\W]+(\w[::])[\W]{0,}\w', r'\1', line)
 
 
-        pprint(w)
-
         for i in w.split():
         for i in w.split():
             if ':' in i:
             if ':' in i:
                 try:
                 try:
@@ -78,12 +73,6 @@ def get_base_info(lines):
                     schema[key] = val
                     schema[key] = val
                 except Exception as e:
                 except Exception as e:
                     logging.error(e)
                     logging.error(e)
-            if ':' in i:
-                try:
-                    key, val = i.split(':')
-                    schema[key] = val
-                except Exception as e:
-                    logging.error(e)
 
 
         if not schema.get('姓名'):
         if not schema.get('姓名'):
             schema['姓名'] = re.search(r'[姓名::]{3,}(\w{2,4})', w).group(1) if re.search(r'[姓名::]{3,}(\w{2,4})', w) else None
             schema['姓名'] = re.search(r'[姓名::]{3,}(\w{2,4})', w).group(1) if re.search(r'[姓名::]{3,}(\w{2,4})', w) else None
@@ -95,16 +84,14 @@ def get_base_info(lines):
             schema['性别'] = re.search(r'[男女]', w).group() if re.search(r'[男女]', w) else None
             schema['性别'] = re.search(r'[男女]', w).group() if re.search(r'[男女]', w) else None
         if not schema.get('婚姻状况'):
         if not schema.get('婚姻状况'):
             schema['婚姻状况'] = re.search(r'[已未]婚', w).group() if re.search(r'[已未]婚', w) else None
             schema['婚姻状况'] = re.search(r'[已未]婚', w).group() if re.search(r'[已未]婚', w) else None
-        # if not schema.get('籍贯'):
-        #     schema['籍贯'] = re.search(r'[籍贯::]{3,}(\w{2,5})', w).group(1) if re.search(r'[籍贯::]{3,}(\w{2,})', w) else None
-        # if not schema.get('学历'):
-        #     schema['学历'] = re.search(r'[学历::]{3,}(\w{2,4})', w).group(1) if re.search(r'[学历::]{3,}(\w{2,4})', w) else None
         if not schema.get('电子邮箱'):
         if not schema.get('电子邮箱'):
             schema['电子邮箱'] = re.search(r'([.\w]+@[.\w]+)', w).group() if re.search(r'([.\w]+@[.\w]+)', w) else None
             schema['电子邮箱'] = re.search(r'([.\w]+@[.\w]+)', w).group() if re.search(r'([.\w]+@[.\w]+)', w) else None
         if not schema.get('政治面貌'):
         if not schema.get('政治面貌'):
             schema['政治面貌'] =  re.search(r'[预备中共党团员群众无派人士]{2,6}', w).group() if re.search(r'[预备中共党团员群众无派人士]{2,6}', w) else None
             schema['政治面貌'] =  re.search(r'[预备中共党团员群众无派人士]{2,6}', w).group() if re.search(r'[预备中共党团员群众无派人士]{2,6}', w) else None
         if not schema.get('手机号码'):
         if not schema.get('手机号码'):
             schema['手机号码'] = re.search(r'\W(1[\d]{10})\W', w).group(1) if re.search(r'\W(1[\d]{10})\W', w) else None
             schema['手机号码'] = re.search(r'\W(1[\d]{10})\W', w).group(1) if re.search(r'\W(1[\d]{10})\W', w) else None
+        # if not schema.get('籍贯'):
+        #     schema['籍贯'] = re.search(r'[籍贯::]{3,}(\w{2,5})', w).group(1) if re.search(r'[籍贯::]{3,}(\w{2,})', w) else None
         # if not schema.get('出生年月'):
         # if not schema.get('出生年月'):
         #     schema['出生年月'] = re.search(r'\d{4}[./年\-]\d{1,2}[月]', w).group() if re.search(r'\d{4}[./年\-]\d{1,2}[月]', w) else None
         #     schema['出生年月'] = re.search(r'\d{4}[./年\-]\d{1,2}[月]', w).group() if re.search(r'\d{4}[./年\-]\d{1,2}[月]', w) else None
         # if not schema.get('当前职位'):
         # if not schema.get('当前职位'):
@@ -116,7 +103,7 @@ def get_base_info(lines):
 
 
 # 求职意向(已完成)
 # 求职意向(已完成)
 def get_job_intention(lines):
 def get_job_intention(lines):
-    pprint(lines)
+    logging.debug(lines)
     schema = {}
     schema = {}
     for line in lines:
     for line in lines:
         regex = re.compile(r'\W{0,3}[::]\s+')
         regex = re.compile(r'\W{0,3}[::]\s+')
@@ -131,10 +118,10 @@ def get_job_intention(lines):
     return schema
     return schema
 
 
 
 
-# 教育经历 (已完成)
+# 教育经历 (已停用)
 # ner + 分词 (判断学校,时间,学历)  专业需要单独处理。
 # ner + 分词 (判断学校,时间,学历)  专业需要单独处理。
 def get_edu_list_old(lines):
 def get_edu_list_old(lines):
-    pprint(lines)
+    logging.debug(lines)
 
 
     job_list = []
     job_list = []
     job_dict = {'edu_time_beg':'', 'edu_time_end':'', 'edu_name':'','edu_leval':'','edu_domain':'', 'edu_statue':0}
     job_dict = {'edu_time_beg':'', 'edu_time_end':'', 'edu_name':'','edu_leval':'','edu_domain':'', 'edu_statue':0}
@@ -294,8 +281,6 @@ def get_edu_list_old(lines):
                     break
                     break
 
 
         job_dict['edu_domain'] = edu_domain
         job_dict['edu_domain'] = edu_domain
-        # print(job_dict)
-        # print(_list_data)
 
 
         if len(job_list) ==0:
         if len(job_list) ==0:
             job_list.append(job_dict)
             job_list.append(job_dict)
@@ -355,7 +340,7 @@ def get_edu_list_old(lines):
 
 
 # 教育经历改 (已完成)
 # 教育经历改 (已完成)
 def get_edu_list(lines):
 def get_edu_list(lines):
-    pprint(lines)
+    logging.debug(lines)
 
 
     edu_list = [{"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None}]
     edu_list = [{"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None}]
     regex_time = re.compile(r'((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?[\d]{0,2})[至到\W]+((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?)?([今])?|(\d{4})[至\W]+([\d今]{4})')
     regex_time = re.compile(r'((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?[\d]{0,2})[至到\W]+((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?)?([今])?|(\d{4})[至\W]+([\d今]{4})')
@@ -380,29 +365,38 @@ def get_edu_list(lines):
                     edu_list.append({"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None})
                     edu_list.append({"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None})
                     count += 1
                     count += 1
                 edu_list[count]["startTime"] = '{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)))
                 edu_list[count]["startTime"] = '{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)))
+                # 年月日
                 if edu_time.group(5) != None:
                 if edu_time.group(5) != None:
                     edu_list[count]["endTime"] = '{:4d}-{:02d}'.format(int(edu_time.group(5)),int(edu_time.group(6)))
                     edu_list[count]["endTime"] = '{:4d}-{:02d}'.format(int(edu_time.group(5)),int(edu_time.group(6)))
                     edu_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)),int(edu_time.group(5)),int(edu_time.group(6)))
                     edu_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)),int(edu_time.group(5)),int(edu_time.group(6)))
+                # 只有年
                 elif edu_time.group(8) != None:
                 elif edu_time.group(8) != None:
                     edu_list[count]["Time"] = '{:4d}~{:4d}'.format(int(edu_time.group(8)),int(edu_time.group(9)))
                     edu_list[count]["Time"] = '{:4d}~{:4d}'.format(int(edu_time.group(8)),int(edu_time.group(9)))
                     edu_list[count]["startTime"] = '{:4d}'.format(int(edu_time.group(8)))
                     edu_list[count]["startTime"] = '{:4d}'.format(int(edu_time.group(8)))
                     edu_list[count]["endTime"] = '{:4d}'.format(int(edu_time.group(9)))
                     edu_list[count]["endTime"] = '{:4d}'.format(int(edu_time.group(9)))
+                # 至今类
                 else:
                 else:
                     edu_list[count]["endTime"] = edu_time.group(7)
                     edu_list[count]["endTime"] = edu_time.group(7)
                     edu_list[count]['Time'] = '{:4d}-{:02d}~{}'.format(int(edu_time.group(2)),int(edu_time.group(3)),edu_time.group(7))
                     edu_list[count]['Time'] = '{:4d}-{:02d}~{}'.format(int(edu_time.group(2)),int(edu_time.group(3)),edu_time.group(7))
                 flags = 1
                 flags = 1
+            # 只有毕业时间
             elif edu_end_time:
             elif edu_end_time:
+                # 提交信息
                 if edu_list[count].get("endTime") and edu_list[count].get("edu_name"):
                 if edu_list[count].get("endTime") and edu_list[count].get("edu_name"):
                     edu_list.append({"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None})
                     edu_list.append({"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None})
                     count += 1
                     count += 1
+                # 年月
                 if edu_end_time.group(2):
                 if edu_end_time.group(2):
                     edu_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(edu_end_time.group(1)),int(edu_end_time.group(2)),int(edu_end_time.group(1))-3,int(edu_end_time.group(2)))
                     edu_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(edu_end_time.group(1)),int(edu_end_time.group(2)),int(edu_end_time.group(1))-3,int(edu_end_time.group(2)))
                     edu_list[count]["endTime"] = '{:4d}-{:02d}'.format(int(edu_end_time.group(1)),int(edu_end_time.group(2)))
                     edu_list[count]["endTime"] = '{:4d}-{:02d}'.format(int(edu_end_time.group(1)),int(edu_end_time.group(2)))
+                # 只有年
                 elif edu_end_time.group(1):
                 elif edu_end_time.group(1):
                     edu_list[count]["Time"] = '{:4d}~{:4d}'.format(int(edu_end_time.group(1)),int(edu_end_time.group(1))-3)
                     edu_list[count]["Time"] = '{:4d}~{:4d}'.format(int(edu_end_time.group(1)),int(edu_end_time.group(1))-3)
                     edu_list[count]["endTime"] = '{:4d}'.format(int(edu_end_time.group(1)))
                     edu_list[count]["endTime"] = '{:4d}'.format(int(edu_end_time.group(1)))
+            # 学历
             if (not edu_list[count].get("edu_level")) and edu_level:
             if (not edu_list[count].get("edu_level")) and edu_level:
                 edu_list[count]["edu_level"] = edu_level.group(0)
                 edu_list[count]["edu_level"] = edu_level.group(0)
+            # WordTag 识别 学校/专业
             for word, tag in ner_tag(cell):
             for word, tag in ner_tag(cell):
                 if (not edu_list[count].get("edu_name")) and (tag == "组织机构类_教育组织机构"):
                 if (not edu_list[count].get("edu_name")) and (tag == "组织机构类_教育组织机构"):
                     edu_list[count]["edu_name"] = word.strip()
                     edu_list[count]["edu_name"] = word.strip()
@@ -411,14 +405,17 @@ def get_edu_list(lines):
                     edu_list[count]["edu_domain"] = word.strip()
                     edu_list[count]["edu_domain"] = word.strip()
                 elif edu_list[count].get("edu_name") and edu_list[count].get("edu_domain"):
                 elif edu_list[count].get("edu_name") and edu_list[count].get("edu_domain"):
                     break
                     break
+            # LAC 识别 学校
             else:
             else:
                 for word, tag in ner(cell):
                 for word, tag in ner(cell):
                     if (tag == "ORG"):
                     if (tag == "ORG"):
                         edu_list[count]["edu_name"] = word
                         edu_list[count]["edu_name"] = word
                         flags = 1
                         flags = 1
                         break
                         break
+            # 未识别成功时填充专业
             if (not (edu_level or flags or edu_list[count].get("edu_domain"))) and edu_domain:
             if (not (edu_level or flags or edu_list[count].get("edu_domain"))) and edu_domain:
                 edu_list[count]["edu_domain"] = edu_domain.group(0)
                 edu_list[count]["edu_domain"] = edu_domain.group(0)
+    # 剔除时间不存在、学校不存在的列
     if (not edu_list[-1].get("Time")) or (not edu_list[-1].get("edu_name")):
     if (not edu_list[-1].get("Time")) or (not edu_list[-1].get("edu_name")):
         edu_list.pop()
         edu_list.pop()
     return edu_list
     return edu_list
@@ -429,7 +426,7 @@ def get_edu_list(lines):
 # 其中,时间是判断是否下一份工作情况的主要标识符之一。字符数量
 # 其中,时间是判断是否下一份工作情况的主要标识符之一。字符数量
 # 时间类 数量词
 # 时间类 数量词
 def get_job_list(lines):
 def get_job_list(lines):
-    pprint(lines)
+    logging.debug(lines)
 
 
     job_list = []
     job_list = []
     re_txt = '\d{4,4}\040{0,2}.\d+\040{0,2}.?\040{0,2}[\-–至-\—~]{1,2}\040{0,2}\d{4,4}\040{0,2}.\040{0,2}\d+.?|\d{4,4}.\d+.?\040{0,2}[\-–-—]{0,2}\040{0,2}至?今|\d{4,4}.\d+.?\040{0,2}[\-–-]{1,2}\040{0,2}现在|\d{4,4}年\d+月\-\d{4,4}年\d+月|\d{4,4}年\d+月\-\~|\d{4,4}年\d+月[\-\~-]至今|\d{4,4}-\d+\040{0,2}[-\~至]\040{0,2}\d{4,4}-\d+|\d{4,4}-\d+\~|\d{4,4}-\d+\[~-]至今|\d{4,4}-\d+\040{0,2}至今'
     re_txt = '\d{4,4}\040{0,2}.\d+\040{0,2}.?\040{0,2}[\-–至-\—~]{1,2}\040{0,2}\d{4,4}\040{0,2}.\040{0,2}\d+.?|\d{4,4}.\d+.?\040{0,2}[\-–-—]{0,2}\040{0,2}至?今|\d{4,4}.\d+.?\040{0,2}[\-–-]{1,2}\040{0,2}现在|\d{4,4}年\d+月\-\d{4,4}年\d+月|\d{4,4}年\d+月\-\~|\d{4,4}年\d+月[\-\~-]至今|\d{4,4}-\d+\040{0,2}[-\~至]\040{0,2}\d{4,4}-\d+|\d{4,4}-\d+\~|\d{4,4}-\d+\[~-]至今|\d{4,4}-\d+\040{0,2}至今'
@@ -453,7 +450,7 @@ def get_job_list(lines):
         elif len(year_list) == 1 and '至今' in lines[i]:
         elif len(year_list) == 1 and '至今' in lines[i]:
             nums.append(i)
             nums.append(i)
     nums.append(len(lines))
     nums.append(len(lines))
-    # pprint(nums)
+    # logging.debug(nums)
     logging.debug('get_job_list :{}'.format(nums))
     logging.debug('get_job_list :{}'.format(nums))
     for i in range(1, len(nums[:])):
     for i in range(1, len(nums[:])):
         job_dict = {'job_time':'', 'job_leval':'','job_company':'','job_content':''}
         job_dict = {'job_time':'', 'job_leval':'','job_company':'','job_content':''}
@@ -471,11 +468,11 @@ def get_job_list(lines):
                 job_time = ['-'.join(year_list)]
                 job_time = ['-'.join(year_list)]
             elif len(year_list) == 1 and '至今' in lines[i]:
             elif len(year_list) == 1 and '至今' in lines[i]:
                 job_time = [year_list[0] + '-' + '至今']
                 job_time = [year_list[0] + '-' + '至今']
-            pprint("342{}".format(job_time))
+
         if not job_time:
         if not job_time:
             regex = re.compile(r'((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)[至到\W]+((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)?([今])?')
             regex = re.compile(r'((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)[至到\W]+((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)?([今])?')
             job_time = [re.search(regex, data_list[0]).group(0)]
             job_time = [re.search(regex, data_list[0]).group(0)]
-            pprint("346{}".format(job_time))
+
         job_dict['job_time'] = job_time[0]
         job_dict['job_time'] = job_time[0]
         _nums = re.findall('\d+', job_dict['job_time'])
         _nums = re.findall('\d+', job_dict['job_time'])
         #print(_nums)
         #print(_nums)
@@ -612,7 +609,7 @@ def get_job_list(lines):
 # 项目经历 (已完成)
 # 项目经历 (已完成)
 # 项目名称未知
 # 项目名称未知
 def get_pro_list(lines):
 def get_pro_list(lines):
-    pprint(lines)
+    logging.debug(lines)
 
 
     pro_list = [{"Time":None,"startTime":None,"endTime":None,"pro_name":None,"job_leval":None,"job_company":None,"content":None,},]
     pro_list = [{"Time":None,"startTime":None,"endTime":None,"pro_name":None,"job_leval":None,"job_company":None,"content":None,},]
     regex = re.compile(r'((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)[至到\W]+((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)?([今])?')
     regex = re.compile(r'((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)[至到\W]+((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)?([今])?')
@@ -624,13 +621,6 @@ def get_pro_list(lines):
         regex_content = re_con.search(line)
         regex_content = re_con.search(line)
         regex_name = re_na.search(line)
         regex_name = re_na.search(line)
         if regex_time:
         if regex_time:
-
-            # data_list = line.split()
-            # # for i in regex_time.groups():
-            # #     data_list.remove(i)
-            # console.print(regex_time.groups(), style='red', justify='left')
-            # console.print(data_list, style='red', justify='left')
-
             if pro_list[count].get("Time"):
             if pro_list[count].get("Time"):
                 pro_list.append({"Time":None,"startTime":None,"endTime":None,"pro_name":None,"job_leval":None,"job_company":None,"content":None,})
                 pro_list.append({"Time":None,"startTime":None,"endTime":None,"pro_name":None,"job_leval":None,"job_company":None,"content":None,})
                 count += 1
                 count += 1
@@ -661,7 +651,7 @@ def get_pro_list(lines):
 # 培训经历 (已完成)
 # 培训经历 (已完成)
 # ner + 分词 (机构名) 培训项目  时间
 # ner + 分词 (机构名) 培训项目  时间
 def get_cultivate_list(lines):
 def get_cultivate_list(lines):
-    pprint(lines)
+    logging.debug(lines)
 
 
     job_list = []
     job_list = []
     re_txt = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今|^\d{4,4}.\d{1,2}|\d{4,4}.'
     re_txt = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今|^\d{4,4}.\d{1,2}|\d{4,4}.'
@@ -783,15 +773,13 @@ def get_cultivate_list(lines):
         else:
         else:
             job_dict['cultivate_content'] = re.sub('培培训训内内容容|培培训训内内容容::|培培训训内内容容::', '培训内容:', ''.join(data_list[0:]))
             job_dict['cultivate_content'] = re.sub('培培训训内内容容|培培训训内内容容::|培培训训内内容容::', '培训内容:', ''.join(data_list[0:]))
         #print(job_dict)
         #print(job_dict)
-    
-        
     '''
     '''
     return job_list
     return job_list
 
 
 
 
 # 语言能力
 # 语言能力
 def get_lag_list(lines):
 def get_lag_list(lines):
-    pprint(lines)
+    logging.debug(lines)
 
 
     job_list = []
     job_list = []
     re_lan = re.compile(r'(\w+[语话])')
     re_lan = re.compile(r'(\w+[语话])')
@@ -810,7 +798,6 @@ def get_lag_list(lines):
 # 家庭情况
 # 家庭情况
 def get_fam_list(lines):
 def get_fam_list(lines):
     job_list = []
     job_list = []
-    #re_txt = '\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\~\d+-\d+|\d+-\d+\~|\d+-\d+\~至今'
     fam_dict = {}
     fam_dict = {}
     for l in lines:
     for l in lines:
         if not l.strip():
         if not l.strip():
@@ -834,7 +821,7 @@ def get_fam_list(lines):
 
 
 # 证书情况  时间+证书名称 (已完成)
 # 证书情况  时间+证书名称 (已完成)
 def get_cet_list(lines):
 def get_cet_list(lines):
-    pprint(lines)
+    logging.debug(lines)
 
 
     job_list = []
     job_list = []
     re_txt = '\d+年\d+月|\d+-\d+|\d+\.\d+'
     re_txt = '\d+年\d+月|\d+-\d+|\d+\.\d+'
@@ -863,14 +850,13 @@ def get_cet_list(lines):
                 continue
                 continue
             cet_dict['prize_name'] = l.strip()
             cet_dict['prize_name'] = l.strip()
             break
             break
-        #print(cet_dict)
         job_list.append(cet_dict)
         job_list.append(cet_dict)
     return job_list
     return job_list
 
 
 
 
 # 获奖情况  时间+获奖名称 (已完成)
 # 获奖情况  时间+获奖名称 (已完成)
 def get_prize_list(lines):
 def get_prize_list(lines):
-    pprint(lines)
+    logging.debug(lines)
 
 
     job_list = []
     job_list = []
     re_txt = '\d+年\d+月|\d+-\d+|\d{4,4}.\d{1,2}'
     re_txt = '\d+年\d+月|\d+-\d+|\d{4,4}.\d{1,2}'
@@ -957,11 +943,13 @@ def parse_txt(path):
                 page[chun] = []
                 page[chun] = []
             elif line:
             elif line:
                 page[chun].append(line)
                 page[chun].append(line)
+
+    result_data = []
     for key in page.keys():
     for key in page.keys():
         for index, func in zip([1, 2, 3, 4, 5, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
         for index, func in zip([1, 2, 3, 4, 5, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
             if key == index:
             if key == index:
-                console.print(block_rev[index], style="yellow", justify="left")
-                console.print(func(page[index]), style="green", justify="left")
+                result_data.append({block_rev[index]:func(page[index])})
+    console.print(result_data)
 
 
 
 
 # 纯文本 word 解析
 # 纯文本 word 解析
@@ -983,15 +971,12 @@ def read_from_word(doc):
         elif line:
         elif line:
             page[chun].append(line)
             page[chun].append(line)
 
 
+    result_data = []
     for key in page.keys():
     for key in page.keys():
         for index, func in zip([1, 2, 3, 4, 5, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_cet_list, get_prize_list, get_cultivate_list]):
         for index, func in zip([1, 2, 3, 4, 5, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_cet_list, get_prize_list, get_cultivate_list]):
             if key == index:
             if key == index:
-                console.print(block_rev[index])
-                try:
-                    console.print(func(page[index]), justify="left")
-                except Exception as e:
-                    logging.error(e)
-    return ''
+                result_data.append({block_rev[index]:func(page[index])})
+    console.print(result_data)
 
 
 
 
 # 提取 word 表格(已完成)
 # 提取 word 表格(已完成)
@@ -1000,9 +985,9 @@ def check_word(path):
     tables = doc.tables
     tables = doc.tables
 
 
     if not tables:
     if not tables:
-        logging.info("this is raw text")
+        logging.debug("this is raw text")
         read_from_word(doc)
         read_from_word(doc)
-    logging.info("this is a Table")
+    logging.debug("this is a Table")
 
 
     prk = {"姓名":1, "性别":1, "出生年月":1, "民族":1, "籍贯":1, "户籍地":1, "政治面貌":1, "参加工作时间":1, "健康状况":1, "专业技术资格":1, "外语水平":9, "熟悉专业有何专长":8, "学历学位":1, "工作单位":1, "现任职务":1, "任职时间":1, "提职时间":1, "联系电话":1, "邮箱地址":1, "称谓":13, "工作单位及职务":1, "毕业时间、院校及专业":3,}
     prk = {"姓名":1, "性别":1, "出生年月":1, "民族":1, "籍贯":1, "户籍地":1, "政治面貌":1, "参加工作时间":1, "健康状况":1, "专业技术资格":1, "外语水平":9, "熟悉专业有何专长":8, "学历学位":1, "工作单位":1, "现任职务":1, "任职时间":1, "提职时间":1, "联系电话":1, "邮箱地址":1, "称谓":13, "工作单位及职务":1, "毕业时间、院校及专业":3,}
     block = {
     block = {
@@ -1021,10 +1006,10 @@ def check_word(path):
         "家庭成员":13, "家家庭庭成成员员":13, "家庭成员家庭成员":13, "主要家庭成员及社会关系":13,
         "家庭成员":13, "家家庭庭成成员员":13, "家庭成员家庭成员":13, "主要家庭成员及社会关系":13,
         "社会活动":"other", "实践经验":"other", "社会活动及社会实践":"other", "近三年年度考核结果":"other", "其他意愿":"other",
         "社会活动":"other", "实践经验":"other", "社会活动及社会实践":"other", "近三年年度考核结果":"other", "其他意愿":"other",
     }
     }
-    regex = re.compile(r'(\(\w{2,8}\))?((\w{2,8}))?')
 
 
     chun = 1
     chun = 1
     page = {1: []}
     page = {1: []}
+    regex = re.compile(r'(\(\w{2,8}\))?((\w{2,8}))?')
     for table in tables:
     for table in tables:
         lo = {} # 存储每一行去重后的数据
         lo = {} # 存储每一行去重后的数据
         for row in range(0, len(table.rows)):
         for row in range(0, len(table.rows)):
@@ -1062,12 +1047,12 @@ def check_word(path):
                 line = line.replace(k+"\n", k+":")
                 line = line.replace(k+"\n", k+":")
             page[chun].extend(line.split())
             page[chun].extend(line.split())
 
 
+    result_data = []
     for key in page.keys():
     for key in page.keys():
         for index, func in zip([1, 2, 3, 4, 5, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_cet_list, get_prize_list, get_cultivate_list]):
         for index, func in zip([1, 2, 3, 4, 5, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_cet_list, get_prize_list, get_cultivate_list]):
             if key == index:
             if key == index:
-                console.print(block_rev[index])
-                console.print(func(page[index]), justify="left")
-    return ''
+                result_data.append({block_rev[index]:func(page[index])})
+    console.print(result_data)
 
 
 
 
 # pdf 解析句子(已完成)
 # pdf 解析句子(已完成)
@@ -1086,7 +1071,6 @@ def parse_line_layout(layout):
                     texts.append([char.bbox[0], char.bbox[3], char.get_text().strip()])
                     texts.append([char.bbox[0], char.bbox[3], char.get_text().strip()])
     # 按行排序
     # 按行排序
     texts.sort(key=lambda x:-x[1])
     texts.sort(key=lambda x:-x[1])
-    # print(texts)
     global block, block_rev
     global block, block_rev
 
 
     chun = 1
     chun = 1
@@ -1134,15 +1118,12 @@ def read_from_pdf(path):
                     result[key] = r[key]
                     result[key] = r[key]
         block_rev = {1:"基本信息",2:"求职意向",3:"教育经历",4:"工作经历",5:"项目经历",6:"专业技能",7:"自我评价",8:"兴趣爱好",9:"语言能力",10:"证书",11:"获奖情况",12:"培训经历",13:"家庭成员","other":"其他"}
         block_rev = {1:"基本信息",2:"求职意向",3:"教育经历",4:"工作经历",5:"项目经历",6:"专业技能",7:"自我评价",8:"兴趣爱好",9:"语言能力",10:"证书",11:"获奖情况",12:"培训经历",13:"家庭成员","other":"其他"}
 
 
+        result_data = []
         for key in result.keys():
         for key in result.keys():
             for index, func in zip([1, 2, 3, 4, 5, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
             for index, func in zip([1, 2, 3, 4, 5, 9, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_lag_list, get_cet_list, get_prize_list, get_cultivate_list]):
                 if key == index:
                 if key == index:
-                    console.print(block_rev[index])
-                    console.print(func(result[index]), justify="left")
-                    # try:
-                    #     console.print(func(result[index]), justify="left")
-                    # except Exception as e:
-                    #     logging.error(e)
+                    result_data.append({block_rev[index]: func(result[index])})
+        console.print(result_data)
 
 
 
 
 # pdf 表格解析 ()
 # pdf 表格解析 ()
@@ -1162,7 +1143,7 @@ def parse_table_from_pdf(path):
                             key = None
                             key = None
     for key in block.keys():
     for key in block.keys():
         if result.get(key):
         if result.get(key):
-            pprint({key: result[key]})
+            logging.debug({key: result[key]})
     console.print(result)
     console.print(result)
     # for key in result.keys():
     # for key in result.keys():
     #     for index, func in zip([1, 2, 3, 4, 5, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_cet_list, get_prize_list, get_cultivate_list]):
     #     for index, func in zip([1, 2, 3, 4, 5, 10, 11, 12], [get_base_info, get_job_intention, get_edu_list, get_job_list, get_pro_list, get_cet_list, get_prize_list, get_cultivate_list]):