|
@@ -27,7 +27,42 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
|
|
import pdfplumber
|
|
|
from paddlenlp import Taskflow
|
|
|
|
|
|
-logging.basicConfig(format='%(asctime)s: %(name)s: %(levelname)s: %(filename)s: %(funcName)s: %(lineno)d: %(message)s', level=logging.INFO)
|
|
|
+class Logger:  # thin wrapper that wires console + optional file handlers onto one named logger
|
|
|
+    def __init__(self, name: str, console_handler_level: int = logging.INFO, fmt: str = '%(asctime)s: %(name)s: %(levelname)s: %(filename)s: %(funcName)s: %(message)s'):
|
|
|
+        self.logger = logging.getLogger(name)
|
|
|
+        self.logger.setLevel(logging.DEBUG)  # pass everything through; each handler filters by its own level
|
|
|
+        self.fmt = logging.Formatter(fmt)
|
|
|
+        self.set_console_handler(console_handler_level)
|
|
|
+
|
|
|
+    def set_console_handler(self, console_handler_level: int = logging.INFO) -> None:
|
|
|
+        ch = logging.StreamHandler()
|
|
|
+        ch.setLevel(console_handler_level)
|
|
|
+        ch.setFormatter(self.fmt)
|
|
|
+        self.logger.addHandler(ch)
|
|
|
+
|
|
|
+    def set_file_handler(self, filename: str, mode: str = "a", file_handler_level: int = logging.WARNING) -> None:
|
|
|
+        fh = logging.FileHandler(filename, mode=mode, encoding='utf-8')
|
|
|
+        fh.setLevel(file_handler_level)
|
|
|
+        fh.setFormatter(self.fmt)
|
|
|
+        self.logger.addHandler(fh)
|
|
|
+
|
|
|
+    def debug(self, msg):
|
|
|
+        self.logger.debug(msg, stacklevel=2)  # stacklevel=2: report the caller, not this wrapper (Py>=3.8)
|
|
|
+
|
|
|
+    def info(self, msg):
|
|
|
+        self.logger.info(msg, stacklevel=2)
|
|
|
+
|
|
|
+    def warning(self, msg):
|
|
|
+        self.logger.warning(msg, stacklevel=2)
|
|
|
+
|
|
|
+    def error(self, msg):
|
|
|
+        self.logger.error(msg, stacklevel=2)
|
|
|
+
|
|
|
+    def critical(self, msg):
|
|
|
+        self.logger.critical(msg, stacklevel=2)
|
|
|
+
|
|
|
+logger = Logger("resume_parse")
|
|
|
+logger.set_file_handler(filename='data.log')
|
|
|
|
|
|
from rich.console import Console
|
|
|
console = Console()
|
|
@@ -52,7 +87,7 @@ block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经
|
|
|
|
|
|
# 基本信息(已完成)
|
|
|
def get_base_info_old(lines):
|
|
|
- logging.info(lines)
|
|
|
+ logger.info(lines)
|
|
|
schema = {
|
|
|
'姓名': None,
|
|
|
}
|
|
@@ -66,7 +101,7 @@ def get_base_info_old(lines):
|
|
|
key, val = i.split(':')
|
|
|
schema[key] = val
|
|
|
except Exception as e:
|
|
|
- logging.error(e)
|
|
|
+ logger.error(e)
|
|
|
|
|
|
if not schema.get('姓名'):
|
|
|
schema['姓名'] = re.search(r'[姓名::]{3,}(\w{2,4})', w).group(1) if re.search(r'[姓名::]{3,}(\w{2,4})', w) else None
|
|
@@ -99,7 +134,7 @@ def get_base_info_old(lines):
|
|
|
def get_base_info(lines):
|
|
|
if not lines:
|
|
|
return
|
|
|
- logging.info(lines)
|
|
|
+ logger.info(lines)
|
|
|
data = " ".join(lines)
|
|
|
rst = base_info_ie(data)[0]
|
|
|
if rst.get("出生日期"):
|
|
@@ -123,7 +158,7 @@ def get_base_info(lines):
|
|
|
|
|
|
# 求职意向(已完成)
|
|
|
def get_job_intention(lines):
|
|
|
- logging.info(lines)
|
|
|
+ logger.info(lines)
|
|
|
schema = {}
|
|
|
for line in lines:
|
|
|
regex = re.compile(r'\W{0,3}[::]\s+')
|
|
@@ -134,14 +169,14 @@ def get_job_intention(lines):
|
|
|
key, val = i.split(":")
|
|
|
schema[key] = val
|
|
|
except Exception as e:
|
|
|
- logging.error(e)
|
|
|
+ logger.error(e)
|
|
|
return schema
|
|
|
|
|
|
|
|
|
# 教育经历 (已停用)
|
|
|
# ner + 分词 (判断学校,时间,学历) 专业需要单独处理。
|
|
|
def get_edu_list_old(lines):
|
|
|
- logging.info(lines)
|
|
|
+ logger.info(lines)
|
|
|
|
|
|
job_list = []
|
|
|
job_dict = {'edu_time_beg':'', 'edu_time_end':'', 'edu_name':'','edu_leval':'','edu_domain':'', 'edu_statue':0}
|
|
@@ -199,8 +234,8 @@ def get_edu_list_old(lines):
|
|
|
top_level = 18
|
|
|
remove_list = []
|
|
|
|
|
|
- logging.info(_list_data)
|
|
|
- logging.info(time_list)
|
|
|
+ logger.info(_list_data)
|
|
|
+ logger.info(time_list)
|
|
|
|
|
|
for ii in range(len(_list_data)):
|
|
|
for t in time_list:
|
|
@@ -221,7 +256,7 @@ def get_edu_list_old(lines):
|
|
|
break
|
|
|
#remove_list.append(i)
|
|
|
|
|
|
- logging.info(_list_data)
|
|
|
+ logger.info(_list_data)
|
|
|
|
|
|
job_time = re.findall(re_txt_1, data_list[0])
|
|
|
if job_time:
|
|
@@ -360,7 +395,7 @@ def get_edu_list_old(lines):
|
|
|
|
|
|
# 教育经历改 (已完成)
|
|
|
def get_edu_list(lines):
|
|
|
- logging.info(lines)
|
|
|
+ logger.info(lines)
|
|
|
|
|
|
edu_list = [{"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None}]
|
|
|
regex_time = re.compile(r'((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?[\d]{0,2})[至到\W]+((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?)?([今])?|(\d{4})[至\W]+([\d今]{4})')
|
|
@@ -446,7 +481,7 @@ def get_edu_list(lines):
|
|
|
# 其中,时间是判断是否下一份工作情况的主要标识符之一。字符数量
|
|
|
# 时间类 数量词
|
|
|
def get_job_list(lines):
|
|
|
- logging.info(lines)
|
|
|
+ logger.info(lines)
|
|
|
|
|
|
job_list = []
|
|
|
re_txt = '\d{4,4}\040{0,2}.\d+\040{0,2}.?\040{0,2}[\-–至-\—~]{1,2}\040{0,2}\d{4,4}\040{0,2}.\040{0,2}\d+.?|\d{4,4}.\d+.?\040{0,2}[\-–-—]{0,2}\040{0,2}至?今|\d{4,4}.\d+.?\040{0,2}[\-–-]{1,2}\040{0,2}现在|\d{4,4}年\d+月\-\d{4,4}年\d+月|\d{4,4}年\d+月\-\~|\d{4,4}年\d+月[\-\~-]至今|\d{4,4}-\d+\040{0,2}[-\~至]\040{0,2}\d{4,4}-\d+|\d{4,4}-\d+\~|\d{4,4}-\d+\[~-]至今|\d{4,4}-\d+\040{0,2}至今'
|
|
@@ -470,8 +505,8 @@ def get_job_list(lines):
|
|
|
elif len(year_list) == 1 and '至今' in lines[i]:
|
|
|
nums.append(i)
|
|
|
nums.append(len(lines))
|
|
|
- # logging.info(nums)
|
|
|
- logging.info('get_job_list :{}'.format(nums))
|
|
|
+ # logger.info(nums)
|
|
|
+ logger.info('get_job_list :{}'.format(nums))
|
|
|
for i in range(1, len(nums[:])):
|
|
|
job_dict = {'job_time':'', 'job_leval':'','job_company':'','job_content':''}
|
|
|
data_list = lines[nums[i-1]:nums[i]]
|
|
@@ -631,7 +666,7 @@ def get_job_list(lines):
|
|
|
# 项目经历 (已完成)(弃用)
|
|
|
# 项目名称未知
|
|
|
def get_pro_list_old(lines):
|
|
|
- logging.info(lines)
|
|
|
+ logger.info(lines)
|
|
|
|
|
|
pro_list = [{"Time":None,"startTime":None,"endTime":None,"pro_name":None,"job_leval":None,"job_company":None,"content":None,},]
|
|
|
regex = re.compile(r'((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)[至到\W]+((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)?([今])?')
|
|
@@ -665,14 +700,14 @@ def get_pro_list_old(lines):
|
|
|
if (not pro_list[count].get("job_company")) and (tag in "组织机构类_企事业单位"):
|
|
|
pro_list[count]["job_company"] = word
|
|
|
except Exception as e:
|
|
|
- logging.error(e)
|
|
|
+ logger.error(e)
|
|
|
pro_list[count]["content"] = line
|
|
|
return pro_list
|
|
|
|
|
|
|
|
|
# 项目经历 (UIE)
|
|
|
def get_pro_list(lines):
|
|
|
- logging.info(lines)
|
|
|
+ logger.info(lines)
|
|
|
|
|
|
starts = []
|
|
|
# 时间查找
|
|
@@ -708,7 +743,7 @@ def get_pro_list(lines):
|
|
|
if not rst.get("时间") or not rst.get("项目名称"):
|
|
|
continue
|
|
|
rst["工作内容"] = [{"text":""}]
|
|
|
- logging.info(rst)
|
|
|
+ logger.info(rst)
|
|
|
for l in src:
|
|
|
if rst["时间"][0]["text"] in l:
|
|
|
continue
|
|
@@ -745,7 +780,7 @@ def get_pro_list(lines):
|
|
|
# 培训经历 (已完成)
|
|
|
# ner + 分词 (机构名) 培训项目 时间
|
|
|
def get_cultivate_list(lines):
|
|
|
- logging.info(lines)
|
|
|
+ logger.info(lines)
|
|
|
|
|
|
job_list = []
|
|
|
re_txt = '\d{4,4}.\d{1,2}.?\040{0,2}[\-–至-\—~]\040{0,2}\d{4,4}.\d{1,2}[月]?|\d+\.\d+\-至今|\d+年\d+月\-\d+年\d+月|\d+年\d+月\-\~|\d+年\d+月[\-\~]至今|\d+-\d+\040{0,2}[\~至]\040{0,2}\d+-\d+|\d+-\d+\~|\d+-\d+\~至今|\d+-\d+\040{0,2}至今|^\d{4,4}.\d{1,2}|\d{4,4}.'
|
|
@@ -776,7 +811,7 @@ def get_cultivate_list(lines):
|
|
|
elif _[1] == 'TIME' and len(_[1]) >= 4:
|
|
|
time_list.append(_[0])
|
|
|
#TIME
|
|
|
- logging.info(data_line)
|
|
|
+ logger.info(data_line)
|
|
|
_list_data = re.split('\040+', data_line)
|
|
|
top_level = 22
|
|
|
end_index = 0
|
|
@@ -838,7 +873,7 @@ def get_cultivate_list(lines):
|
|
|
job_dict['cultivate_content'] = re.sub('培培训训内内容容::|培培训训内内容容::|培培训训内内容容', '培训内容:', ''.join(data_list[end_index:]))
|
|
|
if not job_dict['cultivate_name']:
|
|
|
job_dict['cultivate_name'] = org
|
|
|
- logging.info(job_dict)
|
|
|
+ logger.info(job_dict)
|
|
|
job_list.append(job_dict)
|
|
|
continue
|
|
|
'''
|
|
@@ -873,7 +908,7 @@ def get_cultivate_list(lines):
|
|
|
|
|
|
# 语言能力
|
|
|
def get_lag_list(lines):
|
|
|
- logging.info(lines)
|
|
|
+ logger.info(lines)
|
|
|
|
|
|
job_list = []
|
|
|
re_lan = re.compile(r'(\w+[语话])')
|
|
@@ -915,7 +950,7 @@ def get_fam_list(lines):
|
|
|
|
|
|
# 证书情况 时间+证书名称 (已完成)
|
|
|
def get_cet_list_old(lines):
|
|
|
- logging.info(lines)
|
|
|
+ logger.info(lines)
|
|
|
|
|
|
job_list = []
|
|
|
re_txt = '\d+年\d+月|\d+-\d+|\d+\.\d+'
|
|
@@ -938,7 +973,7 @@ def get_cet_list_old(lines):
|
|
|
else:
|
|
|
continue
|
|
|
ls = re.split('\||\040+|\t+', l)
|
|
|
- logging.info(ls)
|
|
|
+ logger.info(ls)
|
|
|
for l in ls:
|
|
|
if len(l) <= 3:
|
|
|
continue
|
|
@@ -950,7 +985,7 @@ def get_cet_list_old(lines):
|
|
|
|
|
|
# 证书情况 时间+证书名称 (UIE已完成)
|
|
|
def get_cet_list(lines):
|
|
|
- logging.info(lines)
|
|
|
+ logger.info(lines)
|
|
|
cet_list = []
|
|
|
for line in lines:
|
|
|
info = cet_ie(line)
|
|
@@ -960,7 +995,7 @@ def get_cet_list(lines):
|
|
|
|
|
|
# 获奖情况 时间+获奖名称 (已完成)
|
|
|
def get_prize_list_old(lines):
|
|
|
- logging.info(lines)
|
|
|
+ logger.info(lines)
|
|
|
|
|
|
job_list = []
|
|
|
re_txt = '\d+年\d+月|\d+-\d+|\d{4,4}.\d{1,2}'
|
|
@@ -982,20 +1017,20 @@ def get_prize_list_old(lines):
|
|
|
else:
|
|
|
continue
|
|
|
ls = re.split('\||\040+|\t+', l)
|
|
|
- logging.info(ls)
|
|
|
+ logger.info(ls)
|
|
|
for l in ls:
|
|
|
if len(l) <= 3:
|
|
|
continue
|
|
|
cet_dict['prize_name'] = l.strip()
|
|
|
break
|
|
|
- logging.info(cet_dict)
|
|
|
+ logger.info(cet_dict)
|
|
|
job_list.append(cet_dict)
|
|
|
return job_list
|
|
|
|
|
|
|
|
|
# 获奖情况 时间+获奖名称 (UIE已完成)
|
|
|
def get_prize_list(lines):
|
|
|
- logging.info(lines)
|
|
|
+ logger.info(lines)
|
|
|
prize_list = []
|
|
|
for line in lines:
|
|
|
info = prize_ie(line)
|
|
@@ -1126,9 +1161,9 @@ def check_word(path, save_dir):
|
|
|
tables = doc.tables
|
|
|
|
|
|
if not tables:
|
|
|
- logging.info("this is raw text")
|
|
|
+ logger.info("this is raw text")
|
|
|
read_from_word(doc, path, save_dir=save_dir)
|
|
|
- logging.info("this is a Table")
|
|
|
+ logger.info("this is a Table")
|
|
|
|
|
|
global block
|
|
|
with open("resources/keys.json", "r", encoding="utf-8") as fp:
|
|
@@ -1243,7 +1278,7 @@ def read_from_pdf(path, save_dir):
|
|
|
# 循环遍历列表,每次处理一个page的内容
|
|
|
b = 1
|
|
|
for page in PDFPage.create_pages(doc):
|
|
|
- logging.debug('================ 新页面 ================')
|
|
|
+ logger.debug('================ 新页面 ================')
|
|
|
interpreter.process_page(page)
|
|
|
layout = device.get_result()
|
|
|
r, b = parse_line_layout(layout, b)
|
|
@@ -1380,7 +1415,7 @@ def detection_type(path, system):
|
|
|
zf.extractall("./cache/" + tempdir)
|
|
|
path = "./cache/" + tempdir
|
|
|
except Exception as e:
|
|
|
- logging.error(e)
|
|
|
+ logger.error(e)
|
|
|
# 传入为 doc
|
|
|
if os.path.isfile(path) and path.endswith('.doc'):
|
|
|
doc2pdf(docPath = path, pdfPath = './pdf', system=system)
|
|
@@ -1410,7 +1445,7 @@ def detection_type(path, system):
|
|
|
for filename in os.listdir(path):
|
|
|
filename = os.path.join(path, filename)
|
|
|
# 传入为 doc
|
|
|
- logging.info(filename)
|
|
|
+ logger.info(filename)
|
|
|
if filename.endswith('.doc') and not filename.startswith('.~'):
|
|
|
doc2pdf(docPath = filename, pdfPath = './pdf', system=system)
|
|
|
newfile = './pdf/' + os.path.splitext(os.path.split(filename)[-1])[0] + '.pdf'
|
|
@@ -1460,10 +1495,16 @@ if __name__ == '__main__':
|
|
|
import platform
|
|
|
system = platform.system()
|
|
|
if (system == "Windows"):
|
|
|
- logging.info("Windows")
|
|
|
+ logger.info("Windows")
|
|
|
elif (system == "Linux"):
|
|
|
- logging.info("Linux")
|
|
|
+ logger.info("Linux")
|
|
|
else:
|
|
|
- logging.error("Unnot support this system")
|
|
|
+ logger.error("Unnot support this system")
|
|
|
+ if not os.path.exists("./uploads"):
|
|
|
+ os.mkdir("./uploads")
|
|
|
+ if not os.path.exists("./pdf"):
|
|
|
+ os.mkdir("./pdf")
|
|
|
+ if not os.path.exists("./cache"):
|
|
|
+ os.mkdir("./cache")
|
|
|
|
|
|
uvicorn.run(app=app, host="0.0.0.0", port=8320)
|