# srafa.py
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2022-07-07 12:59:42
# @Last Modified by: privacy
# @Last Modified time: 2022-07-18 14:09:38
# import pdb
import json
import re

import pdfplumber
import requests
from docx import Document
from requests.adapters import HTTPAdapter
  12. path = "d:\\desktop\\社招简历模板.docx"
  13. class Social(object):
  14. """docstring for Social"""
  15. def __init__(self):
  16. super(Social, self).__init__()
  17. self.keywords = [
  18. '姓名',
  19. '性别',
  20. '出生日期',
  21. '一寸照片',
  22. '民族',
  23. '出生地',
  24. '政治面貌(加入时间)',
  25. '参加工作时间',
  26. '健康状况',
  27. '外语水平',
  28. '初始学历、专业',
  29. '最高学历、专业',
  30. '初始学历毕业院校及毕业时间',
  31. '最高学历毕业院校及毕业时间',
  32. '专业技术资格(取得时间)',
  33. '职业技能等级(取得时间)',
  34. '熟悉专业有何专长',
  35. '工作单位',
  36. '现任职务',
  37. '任职时间',
  38. '提职时间',
  39. '意向岗位',
  40. '联系电话',
  41. '学习经历',
  42. '起止时间',
  43. '学校',
  44. '专业',
  45. '学历',
  46. '学位',
  47. '研究方向',
  48. '是否全日制',
  49. '培训经历',
  50. '培训类型',
  51. '机构',
  52. '内容',
  53. '成绩',
  54. '证书名称',
  55. '工作经历',
  56. '职务',
  57. '部门',
  58. '证明人',
  59. '备注',
  60. '对报名岗位认识及工作设想',
  61. '自我评价及主要工作业绩',
  62. '获得职业资格证书情况',
  63. '获得日期',
  64. '名称',
  65. '证书编码/文号',
  66. '授予单位',
  67. '奖惩情况',
  68. '项目',
  69. '时间',
  70. '项目单位',
  71. '证明材料',
  72. '主要家庭成员及社会关系',
  73. '称谓',
  74. '出生年月',
  75. '政治面貌',
  76. '工作单位及职务',
  77. '其他情况说明',
  78. '诚信承诺',
  79. '社会招聘工作办公室资格审查意见'
  80. ]
  81. self.json_obj = self.get_translate()
  82. def get_translate(self):
  83. # 转译数据库字段名
  84. with open("./resources/translate.json", "r", encoding="utf-8") as ff:
  85. json_obj = json.load(ff)
  86. return json_obj
  87. def parse_line(self, line):
  88. result = []
  89. key = None
  90. for cell in line:
  91. if cell and ''.join(cell.split()) in self.keywords:
  92. key = ''.join(cell.split())
  93. elif cell and key:
  94. schema = {key:cell}
  95. result.append(schema)
  96. key = None
  97. return result
  98. # 解析word
  99. def parse_word_layout(self, path):
  100. result = []
  101. doc = Document(path)
  102. lo = {}
  103. for _table in doc.tables[:]:
  104. for i, row in enumerate(_table.rows[:]):
  105. row_content = []
  106. for cell in row.cells[:]:
  107. c = cell.text
  108. if c not in row_content:
  109. row_content.append(c)
  110. lo[len(lo.keys())] = row_content
  111. kwln = -1# 关键词行长度
  112. kwline = None# 关键词行
  113. for key in lo.keys():
  114. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  115. if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  116. perc = 0# 行内关键词数量
  117. for c in lo[key]:
  118. if c and (''.join(c.split()) in self.keywords):# 找到此行有关键词
  119. perc += 1
  120. if c and (''.join(c.split()) in self.keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3,判断此行非关键词行元素
  121. perc = 0# 清空行内关键词数
  122. result.extend(self.parse_line(lo[key]))# 添加并解析普通行级元素
  123. break
  124. else:# 关键词行元素
  125. if len(kwline) != len(lo[key]):
  126. break
  127. schema = dict()
  128. for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素
  129. if key:
  130. schema[key] = val
  131. result.append(schema)
  132. break
  133. break
  134. else:
  135. # print("{}:此行为关键词行!".format(lo[key]))
  136. if len(lo[key])>2:
  137. try:
  138. kwline = [''.join(cell.split()) for cell in lo[key]]
  139. except Exception as e:
  140. kwline = lo[key]
  141. kwln = len(lo[key])
  142. return result
  143. # 解析pdf
  144. def parse_pdf_layout(self, path):
  145. result = []
  146. lo = {}
  147. with pdfplumber.open(path) as pdf:
  148. for page in pdf.pages:
  149. for table in page.extract_tables():
  150. for line in table:
  151. # lo[len(lo.keys())] = [cell for cell in line if cell]
  152. lo[len(lo.keys())] = line
  153. kwln = -1
  154. kwline = None
  155. for key in lo.keys():
  156. # pdb.set_trace()
  157. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  158. if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  159. # pdb.set_trace()
  160. for c in lo[key] or len(lo[key])!=kwln:
  161. # pdb.set_trace()
  162. if c and ''.join(c.split()) in self.keywords:# 非关键词行元素
  163. result.extend(self.parse_line(lo[key]))
  164. break
  165. else:# 关键词行元素
  166. schema = dict()
  167. for key, val in zip(kwline, lo[key]):
  168. if key:
  169. schema[key] = val if val else key
  170. result.append(schema)
  171. break
  172. break
  173. else:
  174. kwline = []
  175. for cell in lo[key]:
  176. if cell:
  177. kwline.append(''.join(cell.split()))
  178. else:
  179. kwline.append(cell)
  180. kwln = len(lo[key])
  181. return result
  182. # 格式化数据
  183. def formatter(self, datalist):
  184. result = dict()
  185. for d in datalist:
  186. if len(d) == 1:
  187. for key in d.keys():
  188. result[key] = d[key]
  189. else:
  190. for k in list(d.keys()):
  191. if k == "".join(d[k].split()):
  192. d.pop(k)
  193. if result.get(k):
  194. result[k].append(d)
  195. else:
  196. result[k] = [d]
  197. ### 时间格式化
  198. if result.get("出生年月"):
  199. dates = re.findall(r'\d+' , result["出生年月"])
  200. if len(dates) == 1:
  201. result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))
  202. elif len(dates) == 2:
  203. result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  204. elif len(dates) == 3:
  205. result["出生年月"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  206. if result.get("任职时间"):
  207. dates = re.findall(r'\d+' , result["任职时间"])
  208. if len(dates) == 1:
  209. result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))
  210. elif len(dates) == 2:
  211. result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  212. elif len(dates) == 3:
  213. result["任职时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  214. if result.get("参加工作时间"):
  215. dates = re.findall(r'\d+' , result["参加工作时间"])
  216. if len(dates) == 1:
  217. result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))
  218. elif len(dates) == 2:
  219. result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  220. elif len(dates) == 3:
  221. result["参加工作时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  222. if result.get("最高学历毕业院校及毕业时间"):
  223. dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])
  224. ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])
  225. if len(ws) > 0:
  226. result["最高学历毕业院校"] = ws[0]
  227. if len(dates) == 1:
  228. result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  229. elif len(dates) == 2:
  230. result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  231. elif len(dates) == 3:
  232. result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  233. result.pop("最高学历毕业院校及毕业时间")
  234. if result.get("初始学历毕业院校及毕业时间"):
  235. dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])
  236. ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])
  237. if len(ws) > 0:
  238. result["初始学历毕业院校"] = ws[0]
  239. if len(dates) == 1:
  240. result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  241. elif len(dates) == 2:
  242. result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  243. elif len(dates) == 3:
  244. result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  245. result.pop("初始学历毕业院校及毕业时间")
  246. if result.get("学习经历"):
  247. for idx, edu in enumerate(result["学习经历"]):
  248. if edu.get("起止时间"):
  249. dates = re.findall(r'\d+' , edu["起止时间"])
  250. if len(dates) == 4:
  251. result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  252. if result.get("培训经历"):
  253. for idx, edu in enumerate(result["培训经历"]):
  254. if edu.get("起止时间"):
  255. dates = re.findall(r'\d+' , edu["起止时间"])
  256. if len(dates) == 4:
  257. result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  258. if result.get("工作经历"):
  259. for idx, edu in enumerate(result["工作经历"]):
  260. if edu.get("起止时间"):
  261. dates = re.findall(r'\d+' , edu["起止时间"])
  262. if len(dates) == 4:
  263. result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  264. if result.get("项目经历"):
  265. for idx, edu in enumerate(result["项目经历"]):
  266. if edu.get("起止时间"):
  267. dates = re.findall(r'\d+' , edu["起止时间"])
  268. if len(dates) == 4:
  269. result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  270. if result.get("获得职业资格证书情况"):
  271. for idx, edu in enumerate(result["获得职业资格证书情况"]):
  272. if edu.get("获得日期"):
  273. dates = re.findall(r'\d+' , edu["获得日期"])
  274. if len(dates) == 2:
  275. result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  276. if result.get("奖惩情况"):
  277. for idx, edu in enumerate(result["奖惩情况"]):
  278. if edu.get("时间"):
  279. dates = re.findall(r'\d+' , edu["时间"])
  280. if len(dates) == 2:
  281. result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  282. if result.get("主要家庭成员及社会关系"):
  283. for idx, fam in enumerate(result["主要家庭成员及社会关系"]):
  284. if fam.get("出生年月"):
  285. dates = re.findall(r'\d+' , fam["出生年月"])
  286. if len(dates) == 2:
  287. result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  288. normal = self.json_obj["base"]
  289. itenormal = self.json_obj["base"]
  290. edunormal = self.json_obj["tal_his_edu"]
  291. jobnormal = self.json_obj["tal_his_job"]
  292. tranornal = self.json_obj["tal_training_experience"]
  293. cetnormal = self.json_obj["tal_vocational_qualification_certificate"]
  294. rewnormal = self.json_obj["tal_reward_punishment"]
  295. family = self.json_obj["tal_family_social_relation"]
  296. for key in normal.keys():
  297. if result.get(key):
  298. result[normal[key]] = result[key]
  299. result.pop(key)
  300. for idx in range(len(result['学习经历'])):
  301. for key in edunormal.keys():
  302. if result['学习经历'][idx].get(key):
  303. result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
  304. result['学习经历'][idx].pop(key)
  305. for idx in range(len(result['工作经历'])):
  306. for key in jobnormal.keys():
  307. if result['工作经历'][idx].get(key):
  308. result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
  309. result['工作经历'][idx].pop(key)
  310. for idx in range(len(result['培训经历'])):
  311. for key in tranornal.keys():
  312. if result['培训经历'][idx].get(key):
  313. result['培训经历'][idx][tranornal[key]] = result['培训经历'][idx][key]
  314. result['培训经历'][idx].pop(key)
  315. for idx in range(len(result['获得职业资格证书情况'])):
  316. for key in cetnormal.keys():
  317. if result['获得职业资格证书情况'][idx].get(key):
  318. result['获得职业资格证书情况'][idx][cetnormal[key]] = result['获得职业资格证书情况'][idx][key]
  319. result['获得职业资格证书情况'][idx].pop(key)
  320. for idx in range(len(result['奖惩情况'])):
  321. for key in rewnormal.keys():
  322. if result['奖惩情况'][idx].get(key):
  323. result['奖惩情况'][idx][rewnormal[key]] = result['奖惩情况'][idx][key]
  324. result['奖惩情况'][idx].pop(key)
  325. for idx in range(len(result['主要家庭成员及社会关系'])):
  326. for key in family.keys():
  327. if result['主要家庭成员及社会关系'][idx].get(key):
  328. result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
  329. result['主要家庭成员及社会关系'][idx].pop(key)
  330. tit = {
  331. "基本信息":"base",
  332. "职业发展管理":"intent_job",
  333. "学习经历":"tal_his_edu",
  334. "工作经历":"tal_his_job",
  335. "项目经历":"tal_his_project",
  336. "培训经历":"tal_training_experience",
  337. "奖惩情况":"tal_reward_punishment",
  338. "语言能力":"tal_language",
  339. "获得职业资格证书情况":"tal_vocational_qualification_certificate",
  340. "专业技能":"tal_professional_tech_certificate",
  341. "主要家庭成员及社会关系":"tal_family_social_relation",
  342. "其他情况说明":"intro"
  343. }
  344. for key in tit.keys():
  345. if result.get(key):
  346. result[tit[key]] = result[key]
  347. result.pop(key)
  348. return result
  349. # 推送后端
  350. def push_back(self, result):
  351. url = "http://192.168.1.110:9999/talent/getResumeData"
  352. session = requests.Session()
  353. session.mount('http://', HTTPAdapter(max_retries = 3))
  354. try:
  355. headers = {
  356. 'contentType':'Application/json'
  357. }
  358. response = session.post(url=url, headers=headers, json={"ResumeData": result}, timeout=10)
  359. print(response.text)
  360. except Exception as e:
  361. print(e)
  362. def predict(self, path):
  363. if path.endswith(".docx"):
  364. result = self.formatter(self.parse_word_layout(path))
  365. self.push_back(result)
  366. print(self.formatter(self.parse_word_layout(path)))
  367. elif path.endswith(".pdf"):
  368. result = self.formatter(self.parse_pdf_layout(path))
  369. self.push_back(result)
  370. print(self.formatter(self.parse_pdf_layout(path)))
  371. if __name__ == '__main__':
  372. s = Social()
  373. s.predict(path)