# srafa.py 20 KB


# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2022-07-07 12:59:42
# @Last Modified by: privacy
# @Last Modified time: 2022-07-18 14:57:59
# import pdb
import json
import re

import pdfplumber
import requests
from docx import Document
from requests.adapters import HTTPAdapter
  13. path = "d:\\desktop\\社招简历模板.docx"
  14. class Social(object):
  15. """docstring for Social"""
  16. def __init__(self):
  17. super(Social, self).__init__()
  18. self.keywords = [
  19. '姓名',
  20. '性别',
  21. '出生日期',
  22. '一寸照片',
  23. '民族',
  24. '出生地',
  25. '政治面貌(加入时间)',
  26. '参加工作时间',
  27. '健康状况',
  28. '外语水平',
  29. '初始学历、专业',
  30. '最高学历、专业',
  31. '初始学历毕业院校及毕业时间',
  32. '最高学历毕业院校及毕业时间',
  33. '专业技术资格(取得时间)',
  34. '职业技能等级(取得时间)',
  35. '熟悉专业有何专长',
  36. '工作单位',
  37. '现任职务',
  38. '任职时间',
  39. '提职时间',
  40. '意向岗位',
  41. '联系电话',
  42. '学习经历',
  43. '起止时间',
  44. '学校',
  45. '专业',
  46. '学历',
  47. '学位',
  48. '研究方向',
  49. '是否全日制',
  50. '培训经历',
  51. '培训类型',
  52. '机构',
  53. '内容',
  54. '成绩',
  55. '证书名称',
  56. '工作经历',
  57. '职务',
  58. '部门',
  59. '证明人',
  60. '备注',
  61. '对报名岗位认识及工作设想',
  62. '自我评价及主要工作业绩',
  63. '获得职业资格证书情况',
  64. '获得日期',
  65. '名称',
  66. '证书编码/文号',
  67. '授予单位',
  68. '奖惩情况',
  69. '项目',
  70. '时间',
  71. '项目单位',
  72. '证明材料',
  73. '主要家庭成员及社会关系',
  74. '称谓',
  75. '出生年月',
  76. '政治面貌',
  77. '工作单位及职务',
  78. '其他情况说明',
  79. '诚信承诺',
  80. '社会招聘工作办公室资格审查意见'
  81. ]
  82. self.json_obj = self.get_translate()
  83. def get_translate(self):
  84. # 转译数据库字段名
  85. with open("./resources/translate.json", "r", encoding="utf-8") as ff:
  86. json_obj = json.load(ff)
  87. return json_obj
  88. def parse_line(self, line):
  89. result = []
  90. key = None
  91. for cell in line:
  92. if cell and ''.join(cell.split()) in self.keywords:
  93. key = ''.join(cell.split())
  94. elif cell and key:
  95. schema = {key:cell}
  96. result.append(schema)
  97. key = None
  98. return result
  99. # 解析word
  100. def parse_word_layout(self, path):
  101. result = []
  102. doc = Document(path)
  103. lo = {}
  104. for _table in doc.tables[:]:
  105. for i, row in enumerate(_table.rows[:]):
  106. row_content = []
  107. for cell in row.cells[:]:
  108. c = cell.text
  109. if c not in row_content:
  110. row_content.append(c)
  111. lo[len(lo.keys())] = row_content
  112. kwln = -1# 关键词行长度
  113. kwline = None# 关键词行
  114. for key in lo.keys():
  115. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  116. if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  117. perc = 0# 行内关键词数量
  118. for c in lo[key]:
  119. if c and (''.join(c.split()) in self.keywords):# 找到此行有关键词
  120. perc += 1
  121. if c and (''.join(c.split()) in self.keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3,判断此行非关键词行元素
  122. perc = 0# 清空行内关键词数
  123. result.extend(self.parse_line(lo[key]))# 添加并解析普通行级元素
  124. break
  125. else:# 关键词行元素
  126. if len(kwline) != len(lo[key]):
  127. break
  128. schema = dict()
  129. for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素
  130. if key:
  131. schema[key] = val
  132. result.append(schema)
  133. break
  134. break
  135. else:
  136. # print("{}:此行为关键词行!".format(lo[key]))
  137. if len(lo[key])>2:
  138. try:
  139. kwline = [''.join(cell.split()) for cell in lo[key]]
  140. except Exception as e:
  141. kwline = lo[key]
  142. kwln = len(lo[key])
  143. return result
  144. # 解析pdf
  145. def parse_pdf_layout(self, path):
  146. result = []
  147. lo = {}
  148. with pdfplumber.open(path) as pdf:
  149. for page in pdf.pages:
  150. for table in page.extract_tables():
  151. for line in table:
  152. # lo[len(lo.keys())] = [cell for cell in line if cell]
  153. lo[len(lo.keys())] = line
  154. kwln = -1
  155. kwline = None
  156. for key in lo.keys():
  157. # pdb.set_trace()
  158. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  159. if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  160. # pdb.set_trace()
  161. for c in lo[key] or len(lo[key])!=kwln:
  162. # pdb.set_trace()
  163. if c and ''.join(c.split()) in self.keywords:# 非关键词行元素
  164. result.extend(self.parse_line(lo[key]))
  165. break
  166. else:# 关键词行元素
  167. schema = dict()
  168. for key, val in zip(kwline, lo[key]):
  169. if key:
  170. schema[key] = val if val else key
  171. result.append(schema)
  172. break
  173. break
  174. else:
  175. kwline = []
  176. for cell in lo[key]:
  177. if cell:
  178. kwline.append(''.join(cell.split()))
  179. else:
  180. kwline.append(cell)
  181. kwln = len(lo[key])
  182. return result
  183. # 格式化数据
  184. def formatter(self, datalist):
  185. result = dict()
  186. for d in datalist:
  187. if len(d) == 1:
  188. for key in d.keys():
  189. result[key] = d[key]
  190. else:
  191. for k in list(d.keys()):
  192. if k == "".join(d[k].split()):
  193. d.pop(k)
  194. if result.get(k):
  195. result[k].append(d)
  196. else:
  197. result[k] = [d]
  198. if result.get("外语水平"):
  199. data = re.findall(r'(\w+[语话])', result["外语水平"])
  200. if data:
  201. result["外语水平"] = data
  202. if result.get("专业技术资格(取得时间)"):
  203. dates = re.findall(r'\d+', result["专业技术资格(取得时间)"])
  204. for i in dates:
  205. result["专业技术资格(取得时间)"] = result["专业技术资格(取得时间)"].replace(i, "")
  206. names = re.findall(r'\w+', result["专业技术资格(取得时间)"])
  207. if len(dates) == 1:
  208. result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-01-01".format(int(dates[0])),"专业技术资格":names}]
  209. elif len(dates) == 2:
  210. result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1])),"专业技术资格":names}]
  211. elif len(dates) == 3:
  212. result["专业技术资格(取得时间)"] = [{"时间": "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2])),"专业技术资格":names}]
  213. if result.get("职业技能等级(取得时间)"):
  214. dates = re.findall(r'\d+', result["职业技能等级(取得时间)"])
  215. for i in dates:
  216. result["职业技能等级(取得时间)"] = result["职业技能等级(取得时间)"].replace(i, "")
  217. names = re.findall(r'\w+', result["职业技能等级(取得时间)"])
  218. if len(dates) == 1:
  219. result["职业技能等级(取得时间)"] = [{"时间": "{:4d}-01-01".format(int(dates[0])),"职业技能等级":names}]
  220. elif len(dates) == 2:
  221. result["职业技能等级(取得时间)"] = [{"时间": "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1])),"职业技能等级":names}]
  222. elif len(dates) == 3:
  223. result["职业技能等级(取得时间)"] = [{"时间": "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2])),"职业技能等级":names}]
  224. ### 时间格式化
  225. if result.get("出生年月"):
  226. dates = re.findall(r'\d+' , result["出生年月"])
  227. if len(dates) == 1:
  228. result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))
  229. elif len(dates) == 2:
  230. result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  231. elif len(dates) == 3:
  232. result["出生年月"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  233. if result.get("任职时间"):
  234. dates = re.findall(r'\d+' , result["任职时间"])
  235. if len(dates) == 1:
  236. result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))
  237. elif len(dates) == 2:
  238. result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  239. elif len(dates) == 3:
  240. result["任职时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  241. if result.get("参加工作时间"):
  242. dates = re.findall(r'\d+' , result["参加工作时间"])
  243. if len(dates) == 1:
  244. result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))
  245. elif len(dates) == 2:
  246. result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  247. elif len(dates) == 3:
  248. result["参加工作时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  249. if result.get("最高学历毕业院校及毕业时间"):
  250. dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])
  251. ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])
  252. if len(ws) > 0:
  253. result["最高学历毕业院校"] = ws[0]
  254. if len(dates) == 1:
  255. result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  256. elif len(dates) == 2:
  257. result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  258. elif len(dates) == 3:
  259. result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  260. result.pop("最高学历毕业院校及毕业时间")
  261. if result.get("初始学历毕业院校及毕业时间"):
  262. dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])
  263. ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])
  264. if len(ws) > 0:
  265. result["初始学历毕业院校"] = ws[0]
  266. if len(dates) == 1:
  267. result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  268. elif len(dates) == 2:
  269. result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  270. elif len(dates) == 3:
  271. result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  272. result.pop("初始学历毕业院校及毕业时间")
  273. if result.get("学习经历"):
  274. for idx, edu in enumerate(result["学习经历"]):
  275. if edu.get("起止时间"):
  276. dates = re.findall(r'\d+' , edu["起止时间"])
  277. if len(dates) == 4:
  278. result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  279. if result.get("培训经历"):
  280. for idx, edu in enumerate(result["培训经历"]):
  281. if edu.get("起止时间"):
  282. dates = re.findall(r'\d+' , edu["起止时间"])
  283. if len(dates) == 4:
  284. result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  285. if result.get("工作经历"):
  286. for idx, edu in enumerate(result["工作经历"]):
  287. if edu.get("起止时间"):
  288. dates = re.findall(r'\d+' , edu["起止时间"])
  289. if len(dates) == 4:
  290. result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  291. if result.get("项目经历"):
  292. for idx, edu in enumerate(result["项目经历"]):
  293. if edu.get("起止时间"):
  294. dates = re.findall(r'\d+' , edu["起止时间"])
  295. if len(dates) == 4:
  296. result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  297. if result.get("获得职业资格证书情况"):
  298. for idx, edu in enumerate(result["获得职业资格证书情况"]):
  299. if edu.get("获得日期"):
  300. dates = re.findall(r'\d+' , edu["获得日期"])
  301. if len(dates) == 2:
  302. result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  303. if result.get("奖惩情况"):
  304. for idx, edu in enumerate(result["奖惩情况"]):
  305. if edu.get("时间"):
  306. dates = re.findall(r'\d+' , edu["时间"])
  307. if len(dates) == 2:
  308. result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  309. if result.get("主要家庭成员及社会关系"):
  310. for idx, fam in enumerate(result["主要家庭成员及社会关系"]):
  311. if fam.get("出生年月"):
  312. dates = re.findall(r'\d+' , fam["出生年月"])
  313. if len(dates) == 2:
  314. result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  315. normal = self.json_obj["base"]
  316. itenormal = self.json_obj["base"]
  317. edunormal = self.json_obj["tal_his_edu"]
  318. jobnormal = self.json_obj["tal_his_job"]
  319. tranornal = self.json_obj["tal_training_experience"]
  320. cetnormal = self.json_obj["tal_vocational_qualification_certificate"]
  321. rewnormal = self.json_obj["tal_reward_punishment"]
  322. family = self.json_obj["tal_family_social_relation"]
  323. for key in normal.keys():
  324. if result.get(key):
  325. result[normal[key]] = result[key]
  326. result.pop(key)
  327. for idx in range(len(result['学习经历'])):
  328. for key in edunormal.keys():
  329. if result['学习经历'][idx].get(key):
  330. result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
  331. result['学习经历'][idx].pop(key)
  332. for idx in range(len(result['工作经历'])):
  333. for key in jobnormal.keys():
  334. if result['工作经历'][idx].get(key):
  335. result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
  336. result['工作经历'][idx].pop(key)
  337. for idx in range(len(result['培训经历'])):
  338. for key in tranornal.keys():
  339. if result['培训经历'][idx].get(key):
  340. result['培训经历'][idx][tranornal[key]] = result['培训经历'][idx][key]
  341. result['培训经历'][idx].pop(key)
  342. for idx in range(len(result['获得职业资格证书情况'])):
  343. for key in cetnormal.keys():
  344. if result['获得职业资格证书情况'][idx].get(key):
  345. result['获得职业资格证书情况'][idx][cetnormal[key]] = result['获得职业资格证书情况'][idx][key]
  346. result['获得职业资格证书情况'][idx].pop(key)
  347. for idx in range(len(result['奖惩情况'])):
  348. for key in rewnormal.keys():
  349. if result['奖惩情况'][idx].get(key):
  350. result['奖惩情况'][idx][rewnormal[key]] = result['奖惩情况'][idx][key]
  351. result['奖惩情况'][idx].pop(key)
  352. for idx in range(len(result['主要家庭成员及社会关系'])):
  353. for key in family.keys():
  354. if result['主要家庭成员及社会关系'][idx].get(key):
  355. result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
  356. result['主要家庭成员及社会关系'][idx].pop(key)
  357. tit = {
  358. "基本信息":"base",
  359. "职业发展管理":"intent_job",
  360. "学习经历":"tal_his_edu",
  361. "工作经历":"tal_his_job",
  362. "项目经历":"tal_his_project",
  363. "培训经历":"tal_training_experience",
  364. "奖惩情况":"tal_reward_punishment",
  365. "语言能力":"tal_language",
  366. "获得职业资格证书情况":"tal_vocational_qualification_certificate",
  367. "专业技能":"tal_professional_tech_certificate",
  368. "主要家庭成员及社会关系":"tal_family_social_relation",
  369. "其他情况说明":"intro"
  370. }
  371. for key in tit.keys():
  372. if result.get(key):
  373. result[tit[key]] = result[key]
  374. result.pop(key)
  375. return result
  376. # 推送后端
  377. def push_back(self, result):
  378. url = "http://192.168.1.110:9999/talent/getResumeData"
  379. session = requests.Session()
  380. session.mount('http://', HTTPAdapter(max_retries = 3))
  381. try:
  382. headers = {
  383. 'contentType':'Application/json'
  384. }
  385. response = session.post(url=url, headers=headers, json={"ResumeData": result}, timeout=10)
  386. print(response.text)
  387. except Exception as e:
  388. print(e)
  389. def predict(self, path):
  390. if path.endswith(".docx"):
  391. result = self.formatter(self.parse_word_layout(path))
  392. self.push_back(result)
  393. print(self.formatter(self.parse_word_layout(path)))
  394. elif path.endswith(".pdf"):
  395. result = self.formatter(self.parse_pdf_layout(path))
  396. self.push_back(result)
  397. print(self.formatter(self.parse_pdf_layout(path)))
  398. if __name__ == '__main__':
  399. s = Social()
  400. s.predict(path)