# srafa.py — parse résumé tables from a .docx/.pdf template into structured dicts
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2022-07-07 12:59:42
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2022-07-16 11:41:09
  6. # import pdb
  7. from pprint import pprint
  8. import json
  9. import pandas as pd
  10. import pdfplumber
  11. import docx
  12. from docx import Document
  13. from docx.shared import Inches
  14. path = "d:\\desktop\\社招简历模板.docx"
  15. # path = "d:\\desktop\\社招简历模板.pdf"
  16. keywords = ['姓名',
  17. '性别',
  18. '出生日期',
  19. '一寸照片',
  20. '民族',
  21. '出生地',
  22. '政治面貌(加入时间)',
  23. '参加工作时间',
  24. '健康状况',
  25. '外语水平',
  26. '初始学历、专业',
  27. '最高学历、专业',
  28. '初始学历毕业院校及毕业时间',
  29. '最高学历毕业院校及毕业时间',
  30. '专业技术资格(取得时间)',
  31. '职业技能等级(取得时间)',
  32. '熟悉专业有何专长',
  33. '工作单位',
  34. '现任职务',
  35. '任职时间',
  36. '提职时间',
  37. '意向岗位',
  38. '联系电话',
  39. '学习经历',
  40. '起止时间',
  41. '学校',
  42. '专业',
  43. '学历',
  44. '学位',
  45. '研究方向',
  46. '是否全日制',
  47. '培训',
  48. '起止时间',
  49. '培训类型',
  50. '机构',
  51. '内容',
  52. '成绩',
  53. '证书名称',
  54. '经历',
  55. '工作经历',
  56. '起止时间',
  57. '工作单位',
  58. '职务',
  59. '部门',
  60. '证明人',
  61. '备注',
  62. '对报名岗位认识及工作设想',
  63. '自我评价及主要工作业绩',
  64. '获得职业资格证书情况',
  65. '获得日期',
  66. '名称',
  67. '证书编码/文号',
  68. '授予单位',
  69. '备注',
  70. '奖惩',
  71. '项目',
  72. '时间',
  73. '项目单位',
  74. '证明材料',
  75. '情况',
  76. '主要家庭成员及社会关系',
  77. '称谓',
  78. '出生年月',
  79. '政治面貌',
  80. '工作单位及职务',
  81. '其他情况说明',
  82. '诚信承诺',
  83. '本人承诺,以上信息均与事实相符,若有虚假,愿承担一切后果并自愿取消应聘资格。'
  84. '承诺人:'
  85. '社会招聘工作办公室资格审查意见']
  86. def parse_line(line):
  87. result = []
  88. key = None
  89. for cell in line:
  90. if cell and ''.join(cell.split()) in keywords:
  91. key = ''.join(cell.split())
  92. elif cell and key:
  93. schema = {key:cell}
  94. result.append(schema)
  95. key = None
  96. return result
  97. def parse_word_layout(path):
  98. result = []
  99. doc = Document(path)
  100. lo = {}
  101. for _table in doc.tables[:]:
  102. for i, row in enumerate(_table.rows[:]):
  103. row_content = []
  104. for cell in row.cells[:]:
  105. c = cell.text
  106. if c not in row_content:
  107. row_content.append(c)
  108. lo[len(lo.keys())] = row_content
  109. kwln = -1# 关键词行长度
  110. kwline = None# 关键词行
  111. for key in lo.keys():
  112. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  113. if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  114. perc = 0# 行内关键词数量
  115. for c in lo[key]:
  116. if c and (''.join(c.split()) in keywords):# 找到此行有关键词
  117. perc += 1
  118. if c and (''.join(c.split()) in keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3,判断此行非关键词行元素
  119. perc = 0# 清空行内关键词数
  120. result.extend(parse_line(lo[key]))# 添加并解析普通行级元素
  121. break
  122. else:# 关键词行元素
  123. if len(kwline) != len(lo[key]):
  124. break
  125. schema = dict()
  126. for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素
  127. if key:
  128. schema[key] = val
  129. result.append(schema)
  130. break
  131. break
  132. else:
  133. # print("{}:此行为关键词行!".format(lo[key]))
  134. if len(lo[key])>2:
  135. try:
  136. kwline = [''.join(cell.split()) for cell in lo[key]]
  137. except Exception as e:
  138. kwline = lo[key]
  139. kwln = len(lo[key])
  140. return result
  141. def parse_pdf_layout(path):
  142. result = []
  143. lo = {}
  144. with pdfplumber.open(path) as pdf:
  145. for page in pdf.pages:
  146. for table in page.extract_tables():
  147. for line in table:
  148. # lo[len(lo.keys())] = [cell for cell in line if cell]
  149. lo[len(lo.keys())] = line
  150. kwln = -1
  151. kwline = None
  152. for key in lo.keys():
  153. # pdb.set_trace()
  154. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  155. if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  156. # pdb.set_trace()
  157. for c in lo[key] or len(lo[key])!=kwln:
  158. # pdb.set_trace()
  159. if c and ''.join(c.split()) in keywords:# 非关键词行元素
  160. result.extend(parse_line(lo[key]))
  161. break
  162. else:# 关键词行元素
  163. schema = dict()
  164. for key, val in zip(kwline, lo[key]):
  165. if key:
  166. schema[key] = val if val else key
  167. result.append(schema)
  168. break
  169. break
  170. else:
  171. # print("此行为关键词行")
  172. # kwline = lo[key]
  173. kwline = []
  174. for cell in lo[key]:
  175. if cell:
  176. kwline.append(''.join(cell.split()))
  177. else:
  178. kwline.append(cell)
  179. kwln = len(lo[key])
  180. return result
  181. # 格式化数据
  182. def formatter(datalist):
  183. result = dict()
  184. for d in datalist:
  185. if len(d) == 1:
  186. for key in d.keys():
  187. result[key] = d[key]
  188. else:
  189. for k in list(d.keys()):
  190. if k == "".join(d[k].split()):
  191. d.pop(k)
  192. if result.get(k):
  193. result[k].append(d)
  194. else:
  195. result[k] = [d]
  196. # 转译数据库字段名
  197. with open("./resources/translate.json", "r", encoding="utf-8") as ff:
  198. json_obj = json.load(ff)
  199. normal = json_obj["base"]
  200. itenormal = json_obj["base"]
  201. edunormal = json_obj["tal_his_edu"]
  202. jobnormal = json_obj["tal_his_job"]
  203. cetnormal = json_obj["tal_vocational_qualification_certificate"]
  204. family = json_obj["tal_family_social_relations"]
  205. for key in normal.keys():
  206. if result.get(key):
  207. result[normal[key]] = result[key]
  208. result.pop(key)
  209. for idx in range(len(result['学习经历'])):
  210. for key in edunormal.keys():
  211. if result['学习经历'][idx].get(key):
  212. result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
  213. result['学习经历'][idx].pop(key)
  214. for idx in range(len(result['工作经历'])):
  215. for key in jobnormal.keys():
  216. if result['工作经历'][idx].get(key):
  217. result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
  218. result['工作经历'][idx].pop(key)
  219. for idx in range(len(result['获得职业资格证书情况'])):
  220. for key in cetnormal.keys():
  221. if result['获得职业资格证书情况'][idx].get(key):
  222. result['获得职业资格证书情况'][idx][cetnormal[key]] = result['获得职业资格证书情况'][idx][key]
  223. result['获得职业资格证书情况'][idx].pop(key)
  224. for idx in range(len(result['主要家庭成员及社会关系'])):
  225. for key in family.keys():
  226. if result['主要家庭成员及社会关系'][idx].get(key):
  227. result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
  228. result['主要家庭成员及社会关系'][idx].pop(key)
  229. tit = {
  230. "基本信息":"base",
  231. "职业发展管理":"intent_job",
  232. "学习经历":"tal_his_edu",
  233. "工作经历":"tal_his_job",
  234. "项目经历":"tal_his_project",
  235. "培训经历":"tal_training_institutions",
  236. "获奖情况":"tal_rewards_punishments",
  237. "语言能力":"tal_language",
  238. "获得职业资格证书情况":"tal_vocational_qualification_certificate",
  239. "专业技能":"tal_professional_tech_certificate",
  240. "主要家庭成员及社会关系":"tal_family_social_relations",
  241. "其他情况说明":"intro"
  242. }
  243. for key in tit.keys():
  244. if result.get(key):
  245. result[tit[key]] = result[key]
  246. result.pop(key)
  247. # url = "http://192.168.1.110:9999/talent/getResumeData"
  248. # session = requests.Session()
  249. # session.mount('http://', HTTPAdapter(max_retries = 3))
  250. # try:
  251. # headers = {
  252. # 'contentType':'Application/json'
  253. # }
  254. # response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)
  255. # print(response.text)
  256. # except Exception as e:
  257. # print(e)
  258. return result
  259. if __name__ == '__main__':
  260. if path.endswith(".pdf"):
  261. pprint(formatter(parse_pdf_layout(path)))
  262. else:
  263. pprint(formatter(parse_word_layout(path)))