|
@@ -84,7 +84,7 @@ global block, block_rev
|
|
|
|
|
|
with open("resources/SegmentName.json", "r", encoding="utf-8") as fp:
|
|
|
block = json.load(fp)
|
|
|
-block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经历", 5:"项目经历", 6:"专业技能", 7:"自我评价", 8:"兴趣爱好", 9:"语言能力", 10:"证书", 11:"获奖情况", 12:"培训经历", 13:"家庭成员", "other":"其他"}
|
|
|
+block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经历", 5:"项目经历", 6:"专业技能", 7:"intro", 8:"兴趣爱好", 9:"语言能力", 10:"证书", 11:"获奖情况", 12:"培训经历", 13:"家庭成员", "other":"其他"}
|
|
|
|
|
|
|
|
|
|
|
@@ -143,7 +143,10 @@ def get_base_info(lines):
|
|
|
if rst.get("出生日期"):
|
|
|
dates = re.findall(r'\d+' ,rst["出生日期"][0]["text"])
|
|
|
if len(dates) == 1:
|
|
|
- rst["出生日期"][0]["text"] = "{:4d}-01-01".format(int(dates[0]))
|
|
|
+ if len(dates[0]) > 4:
|
|
|
+ rst["出生日期"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0][:4]), int(dates[0][4:6]))
|
|
|
+ else:
|
|
|
+ rst["出生日期"][0]["text"] = "{:4d}-01-01".format(int(dates[0][:4]))
|
|
|
elif len(dates) == 2:
|
|
|
rst["出生日期"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
|
|
|
elif len(dates) == 3:
|
|
@@ -151,11 +154,30 @@ def get_base_info(lines):
|
|
|
if rst.get("参加工作时间"):
|
|
|
dates = re.findall(r'\d+' ,rst["参加工作时间"][0]["text"])
|
|
|
if len(dates) == 1:
|
|
|
- rst["参加工作时间"][0]["text"] = "{:4d}-01-01".format(int(dates[0]))
|
|
|
+ if len(dates[0]) > 4:
|
|
|
+ rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0][:4]), int(dates[0][4:6]))
|
|
|
+ else:
|
|
|
+ rst["参加工作时间"][0]["text"] = "{:4d}-01-01".format(int(dates[0]))
|
|
|
elif len(dates) == 2:
|
|
|
rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
|
|
|
elif len(dates) == 3:
|
|
|
rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
|
|
|
+ normal = {
|
|
|
+ "姓名":"name",
|
|
|
+ "性别":"gender",
|
|
|
+ "电子邮箱":"email",
|
|
|
+ "政治面貌":"politics",
|
|
|
+ "手机号码":"mobile",
|
|
|
+ "籍贯":"birthplace",
|
|
|
+ "出生日期":"birth_time",
|
|
|
+ "现任职务":"current_job",
|
|
|
+ "所在城市":"living_city",
|
|
|
+ "参加工作时间":"work_begin_time",
|
|
|
+ }
|
|
|
+ for key in normal.keys():
|
|
|
+ if rst.get(key):
|
|
|
+ rst[normal[key]] = rst[key]
|
|
|
+ del rst[key]
|
|
|
return {key:rst[key][0]["text"] for key in rst.keys()}
|
|
|
|
|
|
|
|
@@ -400,7 +422,7 @@ def get_edu_list_old(lines):
|
|
|
def get_edu_list(lines):
|
|
|
logger.info(lines)
|
|
|
|
|
|
- edu_list = [{"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None}]
|
|
|
+ edu_list = [{"Time":None, "start_time":None, "end_time":None, "school_name":None, "major":None, "degree":None}]
|
|
|
regex_time = re.compile(r'((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?[\d]{0,2})[至到\W]+((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?)?([今])?|(\d{4})[至\W]+([\d今]{4})')
|
|
|
regex_end = re.compile(r'毕业时间[\w\W]{0,5}(\d{4})[\W年]?(\d{0,2})[月\W]?')
|
|
|
regex_level = re.compile(r'[大本专科硕博士研究生后]{2,}')
|
|
@@ -419,62 +441,62 @@ def get_edu_list(lines):
|
|
|
# 标准时间格式
|
|
|
if edu_time:
|
|
|
# 提交信息
|
|
|
- if edu_list[count].get("Time") and edu_list[count].get("edu_name"):
|
|
|
- edu_list.append({"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None})
|
|
|
+ if edu_list[count].get("Time") and edu_list[count].get("school_name"):
|
|
|
+ edu_list.append({"Time":None, "start_time":None, "end_time":None, "school_name":None, "major":None, "degree":None})
|
|
|
count += 1
|
|
|
- edu_list[count]["startTime"] = '{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)))
|
|
|
+ edu_list[count]["start_time"] = '{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)))
|
|
|
# 年月日
|
|
|
if edu_time.group(5) != None:
|
|
|
- edu_list[count]["endTime"] = '{:4d}-{:02d}'.format(int(edu_time.group(5)),int(edu_time.group(6)))
|
|
|
+ edu_list[count]["end_time"] = '{:4d}-{:02d}'.format(int(edu_time.group(5)),int(edu_time.group(6)))
|
|
|
edu_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)),int(edu_time.group(5)),int(edu_time.group(6)))
|
|
|
# 只有年
|
|
|
elif edu_time.group(8) != None:
|
|
|
edu_list[count]["Time"] = '{:4d}~{:4d}'.format(int(edu_time.group(8)),int(edu_time.group(9)))
|
|
|
- edu_list[count]["startTime"] = '{:4d}'.format(int(edu_time.group(8)))
|
|
|
- edu_list[count]["endTime"] = '{:4d}'.format(int(edu_time.group(9)))
|
|
|
+ edu_list[count]["start_time"] = '{:4d}'.format(int(edu_time.group(8)))
|
|
|
+ edu_list[count]["end_time"] = '{:4d}'.format(int(edu_time.group(9)))
|
|
|
# 至今类
|
|
|
- else:
|
|
|
- edu_list[count]["endTime"] = edu_time.group(7)
|
|
|
+ elif edu_time.group(7):
|
|
|
+ edu_list[count]["end_time"] = edu_time.group(7)
|
|
|
edu_list[count]['Time'] = '{:4d}-{:02d}~{}'.format(int(edu_time.group(2)),int(edu_time.group(3)),edu_time.group(7))
|
|
|
flags = 1
|
|
|
# 只有毕业时间
|
|
|
elif edu_end_time:
|
|
|
# 提交信息
|
|
|
- if edu_list[count].get("endTime") and edu_list[count].get("edu_name"):
|
|
|
- edu_list.append({"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None})
|
|
|
+ if edu_list[count].get("end_time") and edu_list[count].get("school_name"):
|
|
|
+ edu_list.append({"Time":None, "start_time":None, "end_time":None, "school_name":None, "major":None, "degree":None})
|
|
|
count += 1
|
|
|
# 年月
|
|
|
if edu_end_time.group(2):
|
|
|
edu_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(edu_end_time.group(1))-3,int(edu_end_time.group(2)),int(edu_end_time.group(1)),int(edu_end_time.group(2)))
|
|
|
- edu_list[count]["endTime"] = '{:4d}-{:02d}'.format(int(edu_end_time.group(1)),int(edu_end_time.group(2)))
|
|
|
+ edu_list[count]["end_time"] = '{:4d}-{:02d}'.format(int(edu_end_time.group(1)),int(edu_end_time.group(2)))
|
|
|
# 只有年
|
|
|
elif edu_end_time.group(1):
|
|
|
edu_list[count]["Time"] = '{:4d}~{:4d}'.format(int(edu_end_time.group(1))-3,int(edu_end_time.group(1)))
|
|
|
- edu_list[count]["endTime"] = '{:4d}'.format(int(edu_end_time.group(1)))
|
|
|
+ edu_list[count]["end_time"] = '{:4d}'.format(int(edu_end_time.group(1)))
|
|
|
# 学历
|
|
|
- if (not edu_list[count].get("edu_level")) and edu_level:
|
|
|
- edu_list[count]["edu_level"] = edu_level.group(0)
|
|
|
+ if (not edu_list[count].get("degree")) and edu_level:
|
|
|
+ edu_list[count]["degree"] = edu_level.group(0)
|
|
|
# WordTag 识别 学校/专业
|
|
|
for word, tag in ner_tag(cell):
|
|
|
- if (not edu_list[count].get("edu_name")) and (tag == "组织机构类_教育组织机构"):
|
|
|
- edu_list[count]["edu_name"] = word.strip()
|
|
|
+ if (not edu_list[count].get("school_name")) and (tag == "组织机构类_教育组织机构"):
|
|
|
+ edu_list[count]["school_name"] = word.strip()
|
|
|
flags = 1
|
|
|
- elif (not edu_list[count].get("edu_domain")) and (tag in "_术语类型"):
|
|
|
- edu_list[count]["edu_domain"] = word.strip()
|
|
|
- elif edu_list[count].get("edu_name") and edu_list[count].get("edu_domain"):
|
|
|
+ elif (not edu_list[count].get("major")) and (tag in "_术语类型"):
|
|
|
+ edu_list[count]["major"] = word.strip()
|
|
|
+ elif edu_list[count].get("school_name") and edu_list[count].get("major"):
|
|
|
break
|
|
|
# LAC 识别 学校
|
|
|
else:
|
|
|
for word, tag in ner(cell):
|
|
|
if (tag == "ORG"):
|
|
|
- edu_list[count]["edu_name"] = word
|
|
|
+ edu_list[count]["school_name"] = word
|
|
|
flags = 1
|
|
|
break
|
|
|
# 未识别成功时填充专业
|
|
|
- if (not (edu_level or flags or edu_list[count].get("edu_domain"))) and edu_domain:
|
|
|
- edu_list[count]["edu_domain"] = edu_domain.group(0)
|
|
|
+ if (not (edu_level or flags or edu_list[count].get("major"))) and edu_domain:
|
|
|
+ edu_list[count]["major"] = edu_domain.group(0)
|
|
|
# 剔除时间不存在、学校不存在的列
|
|
|
- if (not edu_list[-1].get("Time")) or (not edu_list[-1].get("edu_name")):
|
|
|
+ if (not edu_list[-1].get("Time")) or (not edu_list[-1].get("school_name")):
|
|
|
edu_list.pop()
|
|
|
return edu_list
|
|
|
|
|
@@ -525,14 +547,14 @@ def get_job_list(lines):
|
|
|
if len(year_list) >= 2:
|
|
|
job_time = ['-'.join(year_list)]
|
|
|
elif len(year_list) == 1 and '至今' in lines[i]:
|
|
|
- job_time = [year_list[0] + '-' + '至今']
|
|
|
+ job_time = [year_list[0] + '~' + '至今']
|
|
|
|
|
|
if not job_time:
|
|
|
regex = re.compile(r'((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)[至到\W]+((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)?([今])?')
|
|
|
job_time = [re.search(regex, data_list[0]).group(0)]
|
|
|
|
|
|
job_dict['job_time'] = job_time[0]
|
|
|
- _nums = re.findall('\d+', job_dict['job_time'])
|
|
|
+ _nums = re.findall('\d{1,4}', job_dict['job_time'])
|
|
|
#print(_nums)
|
|
|
if len(_nums) >= 4:
|
|
|
job_dict['job_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
|
|
@@ -543,18 +565,20 @@ def get_job_list(lines):
|
|
|
data_list[0] = re.sub(job_time[0], '', data_list[0])
|
|
|
data_list[0] = data_list[0].strip()
|
|
|
ner_list = []
|
|
|
- for i in range(len(data_list[:3])):
|
|
|
- if '工作' in data_list[i][:4] and (re.findall(':|\:', data_list[i])):
|
|
|
- end_index = i
|
|
|
+ for ii in range(len(data_list[:3])):
|
|
|
+ if '工作' in data_list[ii][:4] and (re.findall(':|\:', data_list[ii])):
|
|
|
+ end_index = ii
|
|
|
break
|
|
|
- if not re.findall('\040|\||/', data_list[i]) and org:
|
|
|
- end_index = i
|
|
|
+ #print(re.findall('\040|\||/', data_list[ii].strip()), org)
|
|
|
+ if not re.findall('\040|\||/', data_list[ii].strip()) and org:
|
|
|
+ end_index = ii
|
|
|
break
|
|
|
- if len(data_list[i]) > 80:
|
|
|
- end_index = i
|
|
|
+ if len(data_list[ii]) > 80:
|
|
|
+ end_index = ii
|
|
|
break
|
|
|
- if data_list[i]:
|
|
|
- ner_data = ner_tag(data_list[i].strip())
|
|
|
+ if data_list[ii]:
|
|
|
+ ner_data = ner_tag(data_list[ii].strip())
|
|
|
+ #print('\n\nnerdata:\t',ner_data)
|
|
|
else:
|
|
|
continue
|
|
|
|
|
@@ -563,29 +587,33 @@ def get_job_list(lines):
|
|
|
if x[1] == '人物类_概念' and len(x[0]) > 2:
|
|
|
person_professor_list.append(x[0].strip())
|
|
|
|
|
|
- elif x[1] == '组织机构类_企事业单位' or x[1] == '组织机构类_教育组织机构':
|
|
|
- if not org:
|
|
|
+ elif x[1] == '组织机构类_企事业单位' or x[1] == '组织机构类_教育组织机构' or x[1] == '组织机构类_国家机关':
|
|
|
+ if not org and len(x[0]) >= 3:
|
|
|
org = re.split('\040|\|/', x[0].strip())[0]
|
|
|
- org_index = i
|
|
|
+ org_index = ii
|
|
|
if not org:
|
|
|
- for i in range(len(ner_list)):
|
|
|
- ner_data = ner_list[i]
|
|
|
+ for ii in range(len(ner_list)):
|
|
|
+ if org:
|
|
|
+ break
|
|
|
+ ner_data = ner_list[ii]
|
|
|
for x in ner_data:
|
|
|
- if x[1] == '组织机构类':
|
|
|
+ if x[1][:5] == '组织机构类':
|
|
|
org = re.split('\040|\|/', x[0].strip())[0]
|
|
|
break
|
|
|
+ #print(person_professor_list)
|
|
|
if not person_professor_list:
|
|
|
- for i in range(len(ner_list)):
|
|
|
- ner_data = ner_list[i]
|
|
|
+ for ii in range(len(ner_list)):
|
|
|
+ ner_data = ner_list[ii]
|
|
|
for x in ner_data:
|
|
|
if x[1] == '人物类_概念':
|
|
|
person_professor_list = [re.split('\040|\|/', x[0].strip())[0]]
|
|
|
break
|
|
|
data_line = ' '.join(data_list[:end_index])
|
|
|
data_line = re.sub('\||/', ' ', data_line)
|
|
|
- _list_data = re.split('\040+',data_line)
|
|
|
- if len(_list_data) == 1:
|
|
|
+ _list_data = re.split('\040+', data_line)
|
|
|
+ if len(_list_data) == 1 and len(data_list) == 1:
|
|
|
end_index = 0
|
|
|
+ #print(_list_data)
|
|
|
if not person_professor_list:
|
|
|
for x in range(len(_list_data)):
|
|
|
if re.findall('经理|工程师|会计|董事长|总监|秘书|主管|处长|局长|主任|讲师|教授', _list_data[x][-4:]):
|
|
@@ -616,10 +644,20 @@ def get_job_list(lines):
|
|
|
#print(org, person_professor_list, job_time)
|
|
|
job_dict['job_company'] = org
|
|
|
job_dict['job_leval'] = ' '.join(person_professor_list)
|
|
|
+ if not data_list[end_index:] and end_index == 3:
|
|
|
+ end_index = 2
|
|
|
+ if not data_list[end_index:] and end_index == 2:
|
|
|
+ end_index = 1
|
|
|
job_dict['job_content'] = re.sub('工工作作内内容容::|工工作作内内容容::|工工作作内内容容', '工作内容:', ''.join(data_list[end_index:]))
|
|
|
job_dict['job_content'] = re.sub('/', '-', job_dict['job_content'])
|
|
|
+ job_dict['start_time'] = job_dict['job_time'].split('~')[0]
|
|
|
+ job_dict['end_time'] = job_dict['job_time'].split('~')[1]
|
|
|
|
|
|
-
|
|
|
+ normal = {"job_company":"company_name","job_content":"job_desc","job_leval":"job_name"}
|
|
|
+ for key in normal.keys():
|
|
|
+ if job_dict.get(key):
|
|
|
+ job_dict[normal[key]] = job_dict[key]
|
|
|
+ job_dict.pop(key)
|
|
|
|
|
|
job_list.append(job_dict)
|
|
|
continue
|
|
@@ -776,6 +814,11 @@ def get_pro_list(lines):
|
|
|
rst["时间"][0]["text"] = "{:4d}-{:02d}~至今".format(int(time_list[0]),int(time_list[1]))
|
|
|
else:
|
|
|
rst["时间"][0]["text"] = "{:4d}~至今".format(int(time_list[0]))
|
|
|
+ normal = {"时间":"Time","项目名称":"project_name","机构":"company_name","职位":"project_duty","工作内容":"project_desc"}
|
|
|
+ for key in normal.keys():
|
|
|
+ if rst.get(key):
|
|
|
+ rst[normal[key]] = rst[key]
|
|
|
+ rst.pop(key)
|
|
|
pro_list.extend([{key:rst[key][0]["text"] for key in rst.keys()} for rst in info])
|
|
|
return pro_list
|
|
|
|
|
@@ -909,22 +952,26 @@ def get_cultivate_list(lines):
|
|
|
return job_list
|
|
|
|
|
|
|
|
|
-# 语言能力
|
|
|
+# 语言能力(已完成)
|
|
|
def get_lag_list(lines):
    """Extract language skills from the lines of a language-ability section.

    Every non-blank line is searched for a language name (word characters
    ending in "语" or "话") and for a proficiency level (a run of the
    characters "公共级四专八"). A line containing a language name starts a
    new entry; a level found on that line, or on following lines that carry
    no language name, is attached to that entry. A level seen before any
    language name is kept and applied to the first language that follows.

    Args:
        lines: Iterable of text lines belonging to the section.

    Returns:
        A list of dicts, one per detected language, each with the keys
        'lan_name' and 'level' ('level' is '' when none was found).
    """
    logger.info(lines)

    re_lan = re.compile(r'(\w+[语话])')
    # NOTE(review): this character class has no digits other than 四/八, so
    # e.g. "六级" only yields "级". Left unchanged here; confirm whether the
    # class should be widened.
    re_lev = re.compile(r'([公共级四专八]+)')

    lan_list = []
    lag_dict = {'lan_name': '', 'level': ''}
    for line in lines:
        if not line.strip():
            continue
        lan_name = re_lan.search(line)
        lag_lev = re_lev.search(line)
        # Handle the language name first so that a level on the same line is
        # attached to this language rather than to the previous entry.
        if lan_name:
            if lag_dict['lan_name']:
                lan_list.append(lag_dict)
                lag_dict = {'lan_name': '', 'level': ''}
            lag_dict['lan_name'] = lan_name.group(1)
        if lag_lev:
            lag_dict['level'] = lag_lev.group(1)

    # Flush the entry still being built; without this the last language of
    # the section is silently dropped.
    if lag_dict['lan_name']:
        lan_list.append(lag_dict)
    return lan_list
|
|
|
|
|
|
|
|
|
# 家庭情况(已弃用)
|