Procházet zdrojové kódy

new file: ../requirements.txt
modified: irafa.py
modified: resources/SegmentName.json
modified: resume_parse.py
modified: srafa.py

sprivacy před 3 roky
rodič
revize
88c2f32b13
Změněno 5 souborů: 158 přidání a 91 odebrání
  1. requirements.txt (+13 -0)
  2. tools/irafa.py (+2 -2)
  3. tools/resources/SegmentName.json (+3 -1)
  4. tools/resume_parse.py (+106 -59)
  5. tools/srafa.py (+34 -29)

+ 13 - 0
requirements.txt

@@ -0,0 +1,13 @@
+uvicorn
+fastapi
+py7zr
+
+rarfile
+tarfile
+zipfile
+requests
+python-docx
+pdfminer
+pdfplumber
+paddlenlp
+rich

+ 2 - 2
tools/irafa.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2022-07-07 13:12:17
 # @Last Modified by:   privacy
-# @Last Modified time: 2022-07-08 17:52:09
+# @Last Modified time: 2022-07-12 18:02:21
 
 
 from docx import Document
@@ -57,5 +57,5 @@ for key in lo.keys():
             break
     else:
         # print("此行为关键词行")
-        kwline = lo[key]
+        kwline = [''.join(cell.split()) for cell in lo[key]]
         kwln = len(lo[key])

+ 3 - 1
tools/resources/SegmentName.json

@@ -122,10 +122,12 @@
     "其它情况": "other",
     "其他技能": "other",
     "主要工作业绩": "other",
+	"主要工作业绩(500字以内)":"other",
+	"主要工作业绩(500字以内)":"other",
     "专业情况介绍": "other",
     "项目&社会实践": "other",
     "其他说明和补充": "other",
     "近三年年度考核结果": "other",
     "社会活动及社会实践": "other",
     "对报名岗位认识及工作设想": "other"
-}
+}

+ 106 - 59
tools/resume_parse.py

@@ -84,7 +84,7 @@ global block, block_rev
 
 with open("resources/SegmentName.json", "r", encoding="utf-8") as fp:
     block = json.load(fp)
-block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经历", 5:"项目经历", 6:"专业技能", 7:"自我评价", 8:"兴趣爱好", 9:"语言能力", 10:"证书", 11:"获奖情况", 12:"培训经历", 13:"家庭成员", "other":"其他"}
+block_rev = {1:"基本信息", 2:"求职意向", 3:"教育经历", 4:"工作经历", 5:"项目经历", 6:"专业技能", 7:"intro", 8:"兴趣爱好", 9:"语言能力", 10:"证书", 11:"获奖情况", 12:"培训经历", 13:"家庭成员", "other":"其他"}
 
 
 
@@ -143,7 +143,10 @@ def get_base_info(lines):
     if rst.get("出生日期"):
         dates = re.findall(r'\d+' ,rst["出生日期"][0]["text"])
         if len(dates) == 1:
-            rst["出生日期"][0]["text"] = "{:4d}-01-01".format(int(dates[0]))
+            if len(dates[0]) > 4:
+                rst["出生日期"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0][:4]), int(dates[0][4:6]))
+            else:
+                rst["出生日期"][0]["text"] = "{:4d}-01-01".format(int(dates[0][:4]))
         elif len(dates) == 2:
             rst["出生日期"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
         elif len(dates) == 3:
@@ -151,11 +154,30 @@ def get_base_info(lines):
     if rst.get("参加工作时间"):
         dates = re.findall(r'\d+' ,rst["参加工作时间"][0]["text"])
         if len(dates) == 1:
-            rst["参加工作时间"][0]["text"] = "{:4d}-01-01".format(int(dates[0]))
+            if len(dates[0]) > 4:
+                rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0][:4]), int(dates[0][4:6]))
+            else:
+                rst["参加工作时间"][0]["text"] = "{:4d}-01-01".format(int(dates[0]))
         elif len(dates) == 2:
             rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
         elif len(dates) == 3:
             rst["参加工作时间"][0]["text"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
+    normal = {
+        "姓名":"name",
+        "性别":"gender",
+        "电子邮箱":"email",
+        "政治面貌":"politics",
+        "手机号码":"mobile",
+        "籍贯":"birthplace",
+        "出生日期":"birth_time",
+        "现任职务":"current_job",
+        "所在城市":"living_city",
+        "参加工作时间":"work_begin_time",
+    }
+    for key in normal.keys():
+        if rst.get(key):
+            rst[normal[key]] = rst[key]
+            del rst[key]
     return {key:rst[key][0]["text"] for key in rst.keys()}
 
 
@@ -400,7 +422,7 @@ def get_edu_list_old(lines):
 def get_edu_list(lines):
     logger.info(lines)
 
-    edu_list = [{"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None}]
+    edu_list = [{"Time":None, "start_time":None, "end_time":None, "school_name":None, "major":None, "degree":None}]
     regex_time = re.compile(r'((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?[\d]{0,2})[至到\W]+((\d{4})[年\W]{1,2}(\d{1,2})[月\W]?)?([今])?|(\d{4})[至\W]+([\d今]{4})')
     regex_end = re.compile(r'毕业时间[\w\W]{0,5}(\d{4})[\W年]?(\d{0,2})[月\W]?')
     regex_level = re.compile(r'[大本专科硕博士研究生后]{2,}')
@@ -419,62 +441,62 @@ def get_edu_list(lines):
             # 标准时间格式
             if edu_time:
                 # 提交信息
-                if edu_list[count].get("Time") and edu_list[count].get("edu_name"):
-                    edu_list.append({"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None})
+                if edu_list[count].get("Time") and edu_list[count].get("school_name"):
+                    edu_list.append({"Time":None, "start_time":None, "end_time":None, "school_name":None, "major":None, "degree":None})
                     count += 1
-                edu_list[count]["startTime"] = '{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)))
+                edu_list[count]["start_time"] = '{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)))
                 # 年月日
                 if edu_time.group(5) != None:
-                    edu_list[count]["endTime"] = '{:4d}-{:02d}'.format(int(edu_time.group(5)),int(edu_time.group(6)))
+                    edu_list[count]["end_time"] = '{:4d}-{:02d}'.format(int(edu_time.group(5)),int(edu_time.group(6)))
                     edu_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(edu_time.group(2)),int(edu_time.group(3)),int(edu_time.group(5)),int(edu_time.group(6)))
                 # 只有年
                 elif edu_time.group(8) != None:
                     edu_list[count]["Time"] = '{:4d}~{:4d}'.format(int(edu_time.group(8)),int(edu_time.group(9)))
-                    edu_list[count]["startTime"] = '{:4d}'.format(int(edu_time.group(8)))
-                    edu_list[count]["endTime"] = '{:4d}'.format(int(edu_time.group(9)))
+                    edu_list[count]["start_time"] = '{:4d}'.format(int(edu_time.group(8)))
+                    edu_list[count]["end_time"] = '{:4d}'.format(int(edu_time.group(9)))
                 # 至今类
-                else:
-                    edu_list[count]["endTime"] = edu_time.group(7)
+                elif edu_time.group(7):
+                    edu_list[count]["end_time"] = edu_time.group(7)
                     edu_list[count]['Time'] = '{:4d}-{:02d}~{}'.format(int(edu_time.group(2)),int(edu_time.group(3)),edu_time.group(7))
                 flags = 1
             # 只有毕业时间
             elif edu_end_time:
                 # 提交信息
-                if edu_list[count].get("endTime") and edu_list[count].get("edu_name"):
-                    edu_list.append({"Time":None, "startTime":None, "endTime":None, "edu_name":None, "edu_domain":None, "edu_level":None})
+                if edu_list[count].get("end_time") and edu_list[count].get("school_name"):
+                    edu_list.append({"Time":None, "start_time":None, "end_time":None, "school_name":None, "major":None, "degree":None})
                     count += 1
                 # 年月
                 if edu_end_time.group(2):
                     edu_list[count]["Time"] = '{:4d}-{:02d}~{:4d}-{:02d}'.format(int(edu_end_time.group(1))-3,int(edu_end_time.group(2)),int(edu_end_time.group(1)),int(edu_end_time.group(2)))
-                    edu_list[count]["endTime"] = '{:4d}-{:02d}'.format(int(edu_end_time.group(1)),int(edu_end_time.group(2)))
+                    edu_list[count]["end_time"] = '{:4d}-{:02d}'.format(int(edu_end_time.group(1)),int(edu_end_time.group(2)))
                 # 只有年
                 elif edu_end_time.group(1):
                     edu_list[count]["Time"] = '{:4d}~{:4d}'.format(int(edu_end_time.group(1))-3,int(edu_end_time.group(1)))
-                    edu_list[count]["endTime"] = '{:4d}'.format(int(edu_end_time.group(1)))
+                    edu_list[count]["end_time"] = '{:4d}'.format(int(edu_end_time.group(1)))
             # 学历
-            if (not edu_list[count].get("edu_level")) and edu_level:
-                edu_list[count]["edu_level"] = edu_level.group(0)
+            if (not edu_list[count].get("degree")) and edu_level:
+                edu_list[count]["degree"] = edu_level.group(0)
             # WordTag 识别 学校/专业
             for word, tag in ner_tag(cell):
-                if (not edu_list[count].get("edu_name")) and (tag == "组织机构类_教育组织机构"):
-                    edu_list[count]["edu_name"] = word.strip()
+                if (not edu_list[count].get("school_name")) and (tag == "组织机构类_教育组织机构"):
+                    edu_list[count]["school_name"] = word.strip()
                     flags = 1
-                elif (not edu_list[count].get("edu_domain")) and (tag in "_术语类型"):
-                    edu_list[count]["edu_domain"] = word.strip()
-                elif edu_list[count].get("edu_name") and edu_list[count].get("edu_domain"):
+                elif (not edu_list[count].get("major")) and (tag in "_术语类型"):
+                    edu_list[count]["major"] = word.strip()
+                elif edu_list[count].get("school_name") and edu_list[count].get("major"):
                     break
             # LAC 识别 学校
             else:
                 for word, tag in ner(cell):
                     if (tag == "ORG"):
-                        edu_list[count]["edu_name"] = word
+                        edu_list[count]["school_name"] = word
                         flags = 1
                         break
             # 未识别成功时填充专业
-            if (not (edu_level or flags or edu_list[count].get("edu_domain"))) and edu_domain:
-                edu_list[count]["edu_domain"] = edu_domain.group(0)
+            if (not (edu_level or flags or edu_list[count].get("major"))) and edu_domain:
+                edu_list[count]["major"] = edu_domain.group(0)
     # 剔除时间不存在、学校不存在的列
-    if (not edu_list[-1].get("Time")) or (not edu_list[-1].get("edu_name")):
+    if (not edu_list[-1].get("Time")) or (not edu_list[-1].get("school_name")):
         edu_list.pop()
     return edu_list
 
@@ -525,14 +547,14 @@ def get_job_list(lines):
             if len(year_list) >= 2:
                 job_time = ['-'.join(year_list)]
             elif len(year_list) == 1 and '至今' in lines[i]:
-                job_time = [year_list[0] + '-' + '至今']
+                job_time = [year_list[0] + '~' + '至今']
 
         if not job_time:
             regex = re.compile(r'((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)[至到\W]+((\d{4})[年\W]+(\d{1,2})[\W]?[\w]?)?([今])?')
             job_time = [re.search(regex, data_list[0]).group(0)]
 
         job_dict['job_time'] = job_time[0]
-        _nums = re.findall('\d+', job_dict['job_time'])
+        _nums = re.findall('\d{1,4}', job_dict['job_time'])
         #print(_nums)
         if len(_nums) >= 4:
             job_dict['job_time'] = '%s-%02d~%s-%02d'%(_nums[0], int(_nums[1]), _nums[2], int(_nums[3]))
@@ -543,18 +565,20 @@ def get_job_list(lines):
         data_list[0] = re.sub(job_time[0], '', data_list[0])
         data_list[0] = data_list[0].strip()
         ner_list = []
-        for i in range(len(data_list[:3])):
-            if '工作' in data_list[i][:4] and (re.findall(':|\:', data_list[i])):
-                end_index = i
+        for ii in range(len(data_list[:3])):
+            if '工作' in data_list[ii][:4] and (re.findall(':|\:', data_list[ii])):
+                end_index = ii
                 break
-            if not re.findall('\040|\||/', data_list[i]) and org:
-                end_index = i
+            #print(re.findall('\040|\||/', data_list[ii].strip()),  org)
+            if not re.findall('\040|\||/', data_list[ii].strip()) and org:
+                end_index = ii
                 break
-            if len(data_list[i]) > 80:
-                end_index = i
+            if len(data_list[ii]) > 80:
+                end_index = ii
                 break
-            if data_list[i]:
-                ner_data = ner_tag(data_list[i].strip())
+            if data_list[ii]:
+                ner_data = ner_tag(data_list[ii].strip())
+                #print('\n\nnerdata:\t',ner_data)
             else:
                 continue
 
@@ -563,29 +587,33 @@ def get_job_list(lines):
                 if x[1] == '人物类_概念' and len(x[0]) > 2:
                     person_professor_list.append(x[0].strip())
                     
-                elif x[1] == '组织机构类_企事业单位' or x[1] == '组织机构类_教育组织机构':
-                    if not org:
+                elif x[1] == '组织机构类_企事业单位' or x[1] == '组织机构类_教育组织机构' or x[1] == '组织机构类_国家机关':
+                    if not org and len(x[0]) >= 3:
                         org = re.split('\040|\|/', x[0].strip())[0]
-                        org_index = i
+                        org_index = ii
         if not org:
-            for i in range(len(ner_list)):
-                ner_data = ner_list[i]
+            for ii in range(len(ner_list)):
+                if org:
+                    break
+                ner_data = ner_list[ii]
                 for x in ner_data:
-                    if x[1] == '组织机构类':
+                    if x[1][:5] == '组织机构类':
                         org = re.split('\040|\|/', x[0].strip())[0]
                         break
+        #print(person_professor_list)
         if not person_professor_list:
-            for i in range(len(ner_list)):
-                ner_data = ner_list[i]
+            for ii in range(len(ner_list)):
+                ner_data = ner_list[ii]
                 for x in ner_data:
                     if x[1] == '人物类_概念':
                         person_professor_list = [re.split('\040|\|/', x[0].strip())[0]]
                         break
         data_line = ' '.join(data_list[:end_index])
         data_line = re.sub('\||/', ' ', data_line)
-        _list_data = re.split('\040+',data_line)
-        if len(_list_data) == 1:
+        _list_data = re.split('\040+', data_line)
+        if len(_list_data) == 1 and len(data_list) == 1:
             end_index = 0
+        #print(_list_data)
         if not person_professor_list:
             for x in range(len(_list_data)):
                 if re.findall('经理|工程师|会计|董事长|总监|秘书|主管|处长|局长|主任|讲师|教授', _list_data[x][-4:]):
@@ -616,10 +644,20 @@ def get_job_list(lines):
         #print(org, person_professor_list, job_time)
         job_dict['job_company'] = org
         job_dict['job_leval'] = ' '.join(person_professor_list)
+        if not data_list[end_index:] and end_index == 3:
+            end_index = 2
+        if not data_list[end_index:] and end_index == 2:
+            end_index = 1
         job_dict['job_content'] = re.sub('工工作作内内容容::|工工作作内内容容::|工工作作内内容容', '工作内容:', ''.join(data_list[end_index:]))
         job_dict['job_content'] = re.sub('/', '-', job_dict['job_content'])
+        job_dict['start_time'] = job_dict['job_time'].split('~')[0]
+        job_dict['end_time'] = job_dict['job_time'].split('~')[1]
 
-        
+        normal = {"job_company":"company_name","job_content":"job_desc","job_leval":"job_name"}
+        for key in normal.keys():
+            if job_dict.get(key):
+                job_dict[normal[key]] = job_dict[key]
+                job_dict.pop(key)
 
         job_list.append(job_dict)
         continue
@@ -776,6 +814,11 @@ def get_pro_list(lines):
                         rst["时间"][0]["text"] = "{:4d}-{:02d}~至今".format(int(time_list[0]),int(time_list[1]))
                     else:
                         rst["时间"][0]["text"] = "{:4d}~至今".format(int(time_list[0]))
+            normal = {"时间":"Time","项目名称":"project_name","机构":"company_name","职位":"project_duty","工作内容":"project_desc"}
+            for key in normal.keys():
+                if rst.get(key):
+                    rst[normal[key]] = rst[key]
+                    rst.pop(key)
             pro_list.extend([{key:rst[key][0]["text"] for key in rst.keys()} for rst in info])
     return pro_list
 
@@ -909,22 +952,26 @@ def get_cultivate_list(lines):
     return job_list
 
 
-# 语言能力
+# 语言能力(已完成)
 def get_lag_list(lines):
     logger.info(lines)
-
-    job_list = []
+    lan_list = []
     re_lan = re.compile(r'(\w+[语话])')
-    lag_dict = {'lag_name':'', 'lag_leval':""}
+    re_lev = re.compile(r'([公共级四专八]+)')
+    lag_dict = {'lan_name':'', 'level':""}
     for l in lines:
         if not l.strip():
             continue
-        lag_name = re.search(re_lan, l)
-        if lag_name and lag_name.group(1):
-            if lag_dict['lag_name']:
-                job_list.append(lag_dict)
-            lag_dict['lag_name'] = lag_name.group(1)
-    return job_list
+        lan_name = re.search(re_lan, l)
+        lag_lev = re.search(re_lev, l)
+        if lag_lev and lag_lev.group(1):
+            lag_dict["level"] = lag_lev.group(1)
+        if lan_name and lan_name.group(1):
+            if lag_dict["lan_name"]:
+                lan_list.append(lag_dict)
+                lag_dict = {'lan_name':'', 'level':""}
+            lag_dict['lan_name'] = lan_name.group(1)
+    return lan_list
 
 
 # 家庭情况(已弃用)

+ 34 - 29
tools/srafa.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2022-07-07 12:59:42
 # @Last Modified by:   privacy
-# @Last Modified time: 2022-07-08 17:49:57
+# @Last Modified time: 2022-07-12 18:05:48
 # import pdb
 from pprint import pprint
 import pandas as pd
@@ -94,37 +94,42 @@ def parse_line(line):
     return result
 
 
-lo = {}
-with pdfplumber.open(path) as pdf:
-        for page in pdf.pages:
-            for table in page.extract_tables():
-                for line in table:
-                    lo[len(lo.keys())] = line
-
-kwln = -1
-kwline = None
-for key in lo.keys():
-    # pdb.set_trace()
-    for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
-        if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
-            # pdb.set_trace()
-            for c in lo[key]:
+def parse_layout(path):
+    result = []
+    lo = {}
+    with pdfplumber.open(path) as pdf:
+            for page in pdf.pages:
+                for table in page.extract_tables():
+                    for line in table:
+                        lo[len(lo.keys())] = line
+    print(lo)
+    kwln = -1
+    kwline = None
+    for key in lo.keys():
+        # pdb.set_trace()
+        for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
+            if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
                 # pdb.set_trace()
-                if c and ''.join(c.split()) in keywords:# 非关键词行元素
-                    print(parse_line(lo[key]))
+                for c in lo[key]:
+                    # pdb.set_trace()
+                    if c and ''.join(c.split()) in keywords:# 非关键词行元素
+                        result.extend(parse_line(lo[key]))
+                        break
+                else:# 关键词行元素
+                    schema = dict()
+                    for key, val in zip(kwline, lo[key]):
+                        if key:
+                            schema[key] = val
+                    result.append(schema)
                     break
-            else:# 关键词行元素
-                schema = dict()
-                for key, val in zip(kwline, lo[key]):
-                    if key:
-                        schema[key] = val
-                print(schema)
                 break
-            break
-    else:
-        # print("此行为关键词行")
-        kwline = lo[key]
-        kwln = len(lo[key])
+        else:
+            # print("此行为关键词行")
+            kwline = [''.join(cell.split()) for cell in lo[key]]
+            kwln = len(lo[key])
+    return result
 
+if __name__ == '__main__':
+    print(parse_layout(path))