srafa.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2022-07-07 12:59:42
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2022-07-18 14:10:56
  6. # import pdb
  7. import re
  8. import json
  9. import requests
  10. from requests.adapters import HTTPAdapter
  11. import pdfplumber
  12. from docx import Document
# Default document handed to Social.predict() when this file is run as a script.
path = "d:\\desktop\\社招简历模板.docx"
  14. class Social(object):
  15. """docstring for Social"""
  16. def __init__(self):
  17. super(Social, self).__init__()
  18. self.keywords = [
  19. '姓名',
  20. '性别',
  21. '出生日期',
  22. '一寸照片',
  23. '民族',
  24. '出生地',
  25. '政治面貌(加入时间)',
  26. '参加工作时间',
  27. '健康状况',
  28. '外语水平',
  29. '初始学历、专业',
  30. '最高学历、专业',
  31. '初始学历毕业院校及毕业时间',
  32. '最高学历毕业院校及毕业时间',
  33. '专业技术资格(取得时间)',
  34. '职业技能等级(取得时间)',
  35. '熟悉专业有何专长',
  36. '工作单位',
  37. '现任职务',
  38. '任职时间',
  39. '提职时间',
  40. '意向岗位',
  41. '联系电话',
  42. '学习经历',
  43. '起止时间',
  44. '学校',
  45. '专业',
  46. '学历',
  47. '学位',
  48. '研究方向',
  49. '是否全日制',
  50. '培训经历',
  51. '培训类型',
  52. '机构',
  53. '内容',
  54. '成绩',
  55. '证书名称',
  56. '工作经历',
  57. '职务',
  58. '部门',
  59. '证明人',
  60. '备注',
  61. '对报名岗位认识及工作设想',
  62. '自我评价及主要工作业绩',
  63. '获得职业资格证书情况',
  64. '获得日期',
  65. '名称',
  66. '证书编码/文号',
  67. '授予单位',
  68. '奖惩情况',
  69. '项目',
  70. '时间',
  71. '项目单位',
  72. '证明材料',
  73. '主要家庭成员及社会关系',
  74. '称谓',
  75. '出生年月',
  76. '政治面貌',
  77. '工作单位及职务',
  78. '其他情况说明',
  79. '诚信承诺',
  80. '社会招聘工作办公室资格审查意见'
  81. ]
  82. self.json_obj = self.get_translate()
  83. def get_translate(self):
  84. # 转译数据库字段名
  85. with open("./resources/translate.json", "r", encoding="utf-8") as ff:
  86. json_obj = json.load(ff)
  87. return json_obj
  88. def parse_line(self, line):
  89. result = []
  90. key = None
  91. for cell in line:
  92. if cell and ''.join(cell.split()) in self.keywords:
  93. key = ''.join(cell.split())
  94. elif cell and key:
  95. schema = {key:cell}
  96. result.append(schema)
  97. key = None
  98. return result
  99. # 解析word
  100. def parse_word_layout(self, path):
  101. result = []
  102. doc = Document(path)
  103. lo = {}
  104. for _table in doc.tables[:]:
  105. for i, row in enumerate(_table.rows[:]):
  106. row_content = []
  107. for cell in row.cells[:]:
  108. c = cell.text
  109. if c not in row_content:
  110. row_content.append(c)
  111. lo[len(lo.keys())] = row_content
  112. kwln = -1# 关键词行长度
  113. kwline = None# 关键词行
  114. for key in lo.keys():
  115. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  116. if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  117. perc = 0# 行内关键词数量
  118. for c in lo[key]:
  119. if c and (''.join(c.split()) in self.keywords):# 找到此行有关键词
  120. perc += 1
  121. if c and (''.join(c.split()) in self.keywords) and (perc > len(lo[key])/3):# 关键词数量超过1/3,判断此行非关键词行元素
  122. perc = 0# 清空行内关键词数
  123. result.extend(self.parse_line(lo[key]))# 添加并解析普通行级元素
  124. break
  125. else:# 关键词行元素
  126. if len(kwline) != len(lo[key]):
  127. break
  128. schema = dict()
  129. for key, val in zip(kwline, lo[key]):# 合并关键词行和行元素
  130. if key:
  131. schema[key] = val
  132. result.append(schema)
  133. break
  134. break
  135. else:
  136. # print("{}:此行为关键词行!".format(lo[key]))
  137. if len(lo[key])>2:
  138. try:
  139. kwline = [''.join(cell.split()) for cell in lo[key]]
  140. except Exception as e:
  141. kwline = lo[key]
  142. kwln = len(lo[key])
  143. return result
  144. # 解析pdf
  145. def parse_pdf_layout(self, path):
  146. result = []
  147. lo = {}
  148. with pdfplumber.open(path) as pdf:
  149. for page in pdf.pages:
  150. for table in page.extract_tables():
  151. for line in table:
  152. # lo[len(lo.keys())] = [cell for cell in line if cell]
  153. lo[len(lo.keys())] = line
  154. kwln = -1
  155. kwline = None
  156. for key in lo.keys():
  157. # pdb.set_trace()
  158. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  159. if val and ''.join(val.split()) not in self.keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  160. # pdb.set_trace()
  161. for c in lo[key] or len(lo[key])!=kwln:
  162. # pdb.set_trace()
  163. if c and ''.join(c.split()) in self.keywords:# 非关键词行元素
  164. result.extend(self.parse_line(lo[key]))
  165. break
  166. else:# 关键词行元素
  167. schema = dict()
  168. for key, val in zip(kwline, lo[key]):
  169. if key:
  170. schema[key] = val if val else key
  171. result.append(schema)
  172. break
  173. break
  174. else:
  175. kwline = []
  176. for cell in lo[key]:
  177. if cell:
  178. kwline.append(''.join(cell.split()))
  179. else:
  180. kwline.append(cell)
  181. kwln = len(lo[key])
  182. return result
  183. # 格式化数据
  184. def formatter(self, datalist):
  185. result = dict()
  186. for d in datalist:
  187. if len(d) == 1:
  188. for key in d.keys():
  189. result[key] = d[key]
  190. else:
  191. for k in list(d.keys()):
  192. if k == "".join(d[k].split()):
  193. d.pop(k)
  194. if result.get(k):
  195. result[k].append(d)
  196. else:
  197. result[k] = [d]
  198. ### 时间格式化
  199. if result.get("出生年月"):
  200. dates = re.findall(r'\d+' , result["出生年月"])
  201. if len(dates) == 1:
  202. result["出生年月"] = "{:4d}-01-01".format(int(dates[0]))
  203. elif len(dates) == 2:
  204. result["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  205. elif len(dates) == 3:
  206. result["出生年月"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  207. if result.get("任职时间"):
  208. dates = re.findall(r'\d+' , result["任职时间"])
  209. if len(dates) == 1:
  210. result["任职时间"] = "{:4d}-01-01".format(int(dates[0]))
  211. elif len(dates) == 2:
  212. result["任职时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  213. elif len(dates) == 3:
  214. result["任职时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  215. if result.get("参加工作时间"):
  216. dates = re.findall(r'\d+' , result["参加工作时间"])
  217. if len(dates) == 1:
  218. result["参加工作时间"] = "{:4d}-01-01".format(int(dates[0]))
  219. elif len(dates) == 2:
  220. result["参加工作时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  221. elif len(dates) == 3:
  222. result["参加工作时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  223. if result.get("最高学历毕业院校及毕业时间"):
  224. dates = re.findall(r'\d+' , result["最高学历毕业院校及毕业时间"])
  225. ws = re.findall(r'\w+' , result["最高学历毕业院校及毕业时间"])
  226. if len(ws) > 0:
  227. result["最高学历毕业院校"] = ws[0]
  228. if len(dates) == 1:
  229. result["最高学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  230. elif len(dates) == 2:
  231. result["最高学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  232. elif len(dates) == 3:
  233. result["最高学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  234. result.pop("最高学历毕业院校及毕业时间")
  235. if result.get("初始学历毕业院校及毕业时间"):
  236. dates = re.findall(r'\d+' , result["初始学历毕业院校及毕业时间"])
  237. ws = re.findall(r'\w+' , result["初始学历毕业院校及毕业时间"])
  238. if len(ws) > 0:
  239. result["初始学历毕业院校"] = ws[0]
  240. if len(dates) == 1:
  241. result["初始学历毕业时间"] = "{:4d}-01-01".format(int(dates[0]))
  242. elif len(dates) == 2:
  243. result["初始学历毕业时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  244. elif len(dates) == 3:
  245. result["初始学历毕业时间"] = "{:4d}-{:02d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]))
  246. result.pop("初始学历毕业院校及毕业时间")
  247. if result.get("学习经历"):
  248. for idx, edu in enumerate(result["学习经历"]):
  249. if edu.get("起止时间"):
  250. dates = re.findall(r'\d+' , edu["起止时间"])
  251. if len(dates) == 4:
  252. result["学习经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  253. if result.get("培训经历"):
  254. for idx, edu in enumerate(result["培训经历"]):
  255. if edu.get("起止时间"):
  256. dates = re.findall(r'\d+' , edu["起止时间"])
  257. if len(dates) == 4:
  258. result["培训经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  259. if result.get("工作经历"):
  260. for idx, edu in enumerate(result["工作经历"]):
  261. if edu.get("起止时间"):
  262. dates = re.findall(r'\d+' , edu["起止时间"])
  263. if len(dates) == 4:
  264. result["工作经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  265. if result.get("项目经历"):
  266. for idx, edu in enumerate(result["项目经历"]):
  267. if edu.get("起止时间"):
  268. dates = re.findall(r'\d+' , edu["起止时间"])
  269. if len(dates) == 4:
  270. result["项目经历"][idx]["起止时间"] = "{:4d}-{:02d}~{:4d}-{:02d}".format(int(dates[0]), int(dates[1]), int(dates[2]), int(dates[3]))
  271. if result.get("获得职业资格证书情况"):
  272. for idx, edu in enumerate(result["获得职业资格证书情况"]):
  273. if edu.get("获得日期"):
  274. dates = re.findall(r'\d+' , edu["获得日期"])
  275. if len(dates) == 2:
  276. result["获得职业资格证书情况"][idx]["获得日期"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  277. if result.get("奖惩情况"):
  278. for idx, edu in enumerate(result["奖惩情况"]):
  279. if edu.get("时间"):
  280. dates = re.findall(r'\d+' , edu["时间"])
  281. if len(dates) == 2:
  282. result["奖惩情况"][idx]["时间"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  283. if result.get("主要家庭成员及社会关系"):
  284. for idx, fam in enumerate(result["主要家庭成员及社会关系"]):
  285. if fam.get("出生年月"):
  286. dates = re.findall(r'\d+' , fam["出生年月"])
  287. if len(dates) == 2:
  288. result["主要家庭成员及社会关系"][idx]["出生年月"] = "{:4d}-{:02d}-01".format(int(dates[0]), int(dates[1]))
  289. normal = self.json_obj["base"]
  290. itenormal = self.json_obj["base"]
  291. edunormal = self.json_obj["tal_his_edu"]
  292. jobnormal = self.json_obj["tal_his_job"]
  293. tranornal = self.json_obj["tal_training_experience"]
  294. cetnormal = self.json_obj["tal_vocational_qualification_certificate"]
  295. rewnormal = self.json_obj["tal_reward_punishment"]
  296. family = self.json_obj["tal_family_social_relation"]
  297. for key in normal.keys():
  298. if result.get(key):
  299. result[normal[key]] = result[key]
  300. result.pop(key)
  301. for idx in range(len(result['学习经历'])):
  302. for key in edunormal.keys():
  303. if result['学习经历'][idx].get(key):
  304. result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
  305. result['学习经历'][idx].pop(key)
  306. for idx in range(len(result['工作经历'])):
  307. for key in jobnormal.keys():
  308. if result['工作经历'][idx].get(key):
  309. result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
  310. result['工作经历'][idx].pop(key)
  311. for idx in range(len(result['培训经历'])):
  312. for key in tranornal.keys():
  313. if result['培训经历'][idx].get(key):
  314. result['培训经历'][idx][tranornal[key]] = result['培训经历'][idx][key]
  315. result['培训经历'][idx].pop(key)
  316. for idx in range(len(result['获得职业资格证书情况'])):
  317. for key in cetnormal.keys():
  318. if result['获得职业资格证书情况'][idx].get(key):
  319. result['获得职业资格证书情况'][idx][cetnormal[key]] = result['获得职业资格证书情况'][idx][key]
  320. result['获得职业资格证书情况'][idx].pop(key)
  321. for idx in range(len(result['奖惩情况'])):
  322. for key in rewnormal.keys():
  323. if result['奖惩情况'][idx].get(key):
  324. result['奖惩情况'][idx][rewnormal[key]] = result['奖惩情况'][idx][key]
  325. result['奖惩情况'][idx].pop(key)
  326. for idx in range(len(result['主要家庭成员及社会关系'])):
  327. for key in family.keys():
  328. if result['主要家庭成员及社会关系'][idx].get(key):
  329. result['主要家庭成员及社会关系'][idx][family[key]] = result['主要家庭成员及社会关系'][idx][key]
  330. result['主要家庭成员及社会关系'][idx].pop(key)
  331. tit = {
  332. "基本信息":"base",
  333. "职业发展管理":"intent_job",
  334. "学习经历":"tal_his_edu",
  335. "工作经历":"tal_his_job",
  336. "项目经历":"tal_his_project",
  337. "培训经历":"tal_training_experience",
  338. "奖惩情况":"tal_reward_punishment",
  339. "语言能力":"tal_language",
  340. "获得职业资格证书情况":"tal_vocational_qualification_certificate",
  341. "专业技能":"tal_professional_tech_certificate",
  342. "主要家庭成员及社会关系":"tal_family_social_relation",
  343. "其他情况说明":"intro"
  344. }
  345. for key in tit.keys():
  346. if result.get(key):
  347. result[tit[key]] = result[key]
  348. result.pop(key)
  349. return result
  350. # 推送后端
  351. def push_back(self, result):
  352. url = "http://192.168.1.110:9999/talent/getResumeData"
  353. session = requests.Session()
  354. session.mount('http://', HTTPAdapter(max_retries = 3))
  355. try:
  356. headers = {
  357. 'contentType':'Application/json'
  358. }
  359. response = session.post(url=url, headers=headers, json={"ResumeData": result}, timeout=10)
  360. print(response.text)
  361. except Exception as e:
  362. print(e)
  363. def predict(self, path):
  364. if path.endswith(".docx"):
  365. result = self.formatter(self.parse_word_layout(path))
  366. self.push_back(result)
  367. print(self.formatter(self.parse_word_layout(path)))
  368. elif path.endswith(".pdf"):
  369. result = self.formatter(self.parse_pdf_layout(path))
  370. self.push_back(result)
  371. print(self.formatter(self.parse_pdf_layout(path)))
  372. if __name__ == '__main__':
  373. s = Social()
  374. s.predict(path)