# srafa.py
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2022-07-07 12:59:42
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2022-07-16 09:08:43
  6. # import pdb
  7. from pprint import pprint
  8. import json
  9. import pandas as pd
  10. import pdfplumber
  11. path = "d:\\desktop\\社招简历模板.pdf"
  12. keywords = ['姓名',
  13. '性别',
  14. '出生日期',
  15. '一寸照片',
  16. '民族',
  17. '出生地',
  18. '政治面貌(加入时间)',
  19. '参加工作时间',
  20. '健康状况',
  21. '外语水平',
  22. '初始学历、专业',
  23. '最高学历、专业',
  24. '初始学历毕业院校及毕业时间',
  25. '最高学历毕业院校及毕业时间',
  26. '专业技术资格(取得时间)',
  27. '职业技能等级(取得时间)',
  28. '熟悉专业有何专长',
  29. '工作单位',
  30. '现任职务',
  31. '任职时间',
  32. '提职时间',
  33. '意向岗位',
  34. '联系电话',
  35. '学习经历',
  36. '起止时间',
  37. '学校',
  38. '专业',
  39. '学历',
  40. '学位',
  41. '研究方向',
  42. '是否全日制',
  43. '培训',
  44. '起止时间',
  45. '培训类型',
  46. '机构',
  47. '内容',
  48. '成绩',
  49. '证书名称',
  50. '经历',
  51. '工作经历',
  52. '起止时间',
  53. '工作单位',
  54. '职务',
  55. '部门',
  56. '证明人',
  57. '备注',
  58. '对报名岗位认识及工作设想',
  59. '自我评价及主要工作业绩',
  60. '获得职业资格证书情况',
  61. '获得日期',
  62. '名称',
  63. '证书编码/文号',
  64. '授予单位',
  65. '备注',
  66. '奖惩',
  67. '项目',
  68. '时间',
  69. '项目单位',
  70. '证明材料',
  71. '情况',
  72. '主要家庭成员及社会关系',
  73. '称谓',
  74. '出生年月',
  75. '政治面貌',
  76. '工作单位及职务',
  77. '其他情况说明',
  78. '诚信承诺',
  79. '本人承诺,以上信息均与事实相符,若有虚假,愿承担一切后果并自愿取消应聘资格。'
  80. '承诺人:'
  81. '社会招聘工作办公室资格审查意见']
  82. def parse_line(line):
  83. result = []
  84. key = None
  85. for cell in line:
  86. if cell and ''.join(cell.split()) in keywords:
  87. key = ''.join(cell.split())
  88. elif cell and key:
  89. schema = {key:cell}
  90. result.append(schema)
  91. key = None
  92. return result
  93. def parse_layout(path):
  94. result = []
  95. lo = {}
  96. with pdfplumber.open(path) as pdf:
  97. for page in pdf.pages:
  98. for table in page.extract_tables():
  99. for line in table:
  100. # lo[len(lo.keys())] = [cell for cell in line if cell]
  101. lo[len(lo.keys())] = line
  102. kwln = -1
  103. kwline = None
  104. for key in lo.keys():
  105. # pdb.set_trace()
  106. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  107. if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  108. # pdb.set_trace()
  109. for c in lo[key] or len(lo[key])!=kwln:
  110. # pdb.set_trace()
  111. if c and ''.join(c.split()) in keywords:# 非关键词行元素
  112. result.extend(parse_line(lo[key]))
  113. break
  114. else:# 关键词行元素
  115. schema = dict()
  116. for key, val in zip(kwline, lo[key]):
  117. if key:
  118. schema[key] = val if val else key
  119. result.append(schema)
  120. break
  121. break
  122. else:
  123. # print("此行为关键词行")
  124. # kwline = lo[key]
  125. kwline = []
  126. for cell in lo[key]:
  127. if cell:
  128. kwline.append(''.join(cell.split()))
  129. else:
  130. kwline.append(cell)
  131. kwln = len(lo[key])
  132. return result
  133. # 格式化数据
  134. def formatter(datalist):
  135. result = dict()
  136. for d in datalist:
  137. if len(d) == 1:
  138. for key in d.keys():
  139. result[key] = d[key]
  140. else:
  141. for k in list(d.keys()):
  142. if k == "".join(d[k].split()):
  143. d.pop(k)
  144. if result.get(k):
  145. result[k].append(d)
  146. else:
  147. result[k] = [d]
  148. # 转译数据库字段名
  149. with open("./resources/translate.json", "r", encoding="utf-8") as ff:
  150. json_obj = json.load(ff)
  151. normal = json_obj["base"]
  152. itenormal = json_obj["base"]
  153. edunormal = json_obj["tal_his_edu"]
  154. jobnormal = json_obj["tal_his_job"]
  155. cetnormal = json_obj["tal_vocational_qualification_certificate"]
  156. family = json_obj["tal_family_social_relations"]
  157. for key in normal.keys():
  158. if result.get(key):
  159. result[normal[key]] = result[key]
  160. result.pop(key)
  161. for idx in range(len(result['学习经历'])):
  162. for key in edunormal.keys():
  163. if result['学习经历'][idx].get(key):
  164. result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
  165. result['学习经历'][idx].pop(key)
  166. for idx in range(len(result['工作经历'])):
  167. for key in jobnormal.keys():
  168. if result['工作经历'][idx].get(key):
  169. result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
  170. result['工作经历'][idx].pop(key)
  171. for idx in range(len(result['获得职业资格证书情况'])):
  172. for key in cetnormal.keys():
  173. if result['获得职业资格证书情况'][idx].get(key):
  174. result['获得职业资格证书情况'][idx][cetnormal[key]] = result['获得职业资格证书情况'][idx][key]
  175. result['获得职业资格证书情况'][idx].pop(key)
  176. for idx in range(len(result['主要家庭成员及社会关系'])):
  177. for key in family.keys():
  178. if result['主要家庭成员及社会关系'][idx].get(key):
  179. result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
  180. result['主要家庭成员及社会关系'][idx].pop(key)
  181. tit = {
  182. "基本信息":"base",
  183. "职业发展管理":"intent_job",
  184. "学习经历":"tal_his_edu",
  185. "工作经历":"tal_his_job",
  186. "项目经历":"tal_his_project",
  187. "培训经历":"tal_training_institutions",
  188. "获奖情况":"tal_rewards_punishments",
  189. "语言能力":"tal_language",
  190. "获得职业资格证书情况":"tal_vocational_qualification_certificate",
  191. "专业技能":"tal_professional_tech_certificate",
  192. "主要家庭成员及社会关系":"tal_family_social_relations"
  193. }
  194. for key in tit.keys():
  195. if result.get(key):
  196. result[tit[key]] = result[key]
  197. result.pop(key)
  198. # url = "http://192.168.1.110:9999/talent/getResumeData"
  199. # session = requests.Session()
  200. # session.mount('http://', HTTPAdapter(max_retries = 3))
  201. # try:
  202. # headers = {
  203. # 'contentType':'Application/json'
  204. # }
  205. # response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)
  206. # print(response.text)
  207. # except Exception as e:
  208. # print(e)
  209. return result
  210. if __name__ == '__main__':
  211. pprint(formatter(parse_layout(path)))