# irafa.py
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2022-07-07 13:12:17
# @Last Modified by: privacy
# @Last Modified time: 2022-07-16 15:05:03
# Parser for the internal talent-market résumé template (内部人才市场简历模板)
import json
import re
from pprint import pprint

import docx
import pdfplumber
from docx import Document
from docx.shared import Inches
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
  16. # path = "d:\\desktop\\内部人才市场简历模板.docx"
  17. path = "d:\\desktop\\内部人才市场简历模板.pdf"
  18. keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格(取得时间)", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]
  19. def parse_line(line):
  20. result = []
  21. key = None
  22. for cell in line:
  23. if cell and ''.join(cell.split()) in keywords:
  24. key = ''.join(cell.split())
  25. elif cell and key:
  26. schema = {key:cell}
  27. result.append(schema)
  28. key = None
  29. return result
  30. def parse_layout(path):
  31. result = []
  32. doc = Document(path)
  33. lo = {}
  34. tables = doc.tables
  35. for _table in tables[:]:
  36. for i, row in enumerate(_table.rows[:]):
  37. row_content = []
  38. for cell in row.cells[:]:
  39. c = cell.text
  40. row_content.append(c)
  41. lo[len(lo.keys())] = row_content
  42. kwln = -1
  43. kwline = None
  44. for key in lo.keys():
  45. # pdb.set_trace()
  46. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  47. if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  48. # pdb.set_trace()
  49. for c in lo[key]:
  50. # pdb.set_trace()
  51. if c and ''.join(c.split()) in keywords:# 非关键词行元素
  52. result.extend(parse_line(lo[key]))
  53. break
  54. else:# 关键词行元素
  55. schema = dict()
  56. for key, val in zip(kwline, lo[key]):
  57. if key:
  58. schema[key] = val
  59. if "学校/培训机构" in schema.keys():
  60. schema["学习经历"] = "学习经历"
  61. elif "与本人关系" in schema.keys():
  62. schema["家庭成员"] = "家庭成员"
  63. elif "意向地区" in schema.keys():
  64. schema["职业发展管理"] = "职业发展管理"
  65. elif "职业证书" in schema.keys():
  66. schema["职业资格证书"] = "职业资格证书"
  67. result.append(schema)
  68. break
  69. break
  70. else:
  71. # print("此行为关键词行")
  72. kwline = [''.join(cell.split()) for cell in lo[key]]
  73. kwln = len(lo[key])
  74. job = {"工作经历":"工作经历"}
  75. flag = None
  76. for p in doc.paragraphs:
  77. text = p.text.replace(":", ":")
  78. if ":" in text:
  79. text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)
  80. for line in text.split("\n"):
  81. if line.strip():
  82. i = line.split(":")
  83. if job.get(i[0].strip()):
  84. result.append(job)
  85. job = {"工作经历":"工作经历"}
  86. job[i[0].strip()] = i[1].strip()
  87. flag = i[0].strip()
  88. elif flag == "工作描述":
  89. job["工作描述"] += '\n' + text.strip()
  90. else:
  91. result.append(job)
  92. return result
  93. def parse_pdf_layout(path):
  94. result = []
  95. lo = {}
  96. with pdfplumber.open(path) as pdf:
  97. for page in pdf.pages:
  98. for table in page.extract_tables():
  99. for line in table:
  100. # lo[len(lo.keys())] = [cell for cell in line if cell]
  101. lo[len(lo.keys())] = line
  102. kwln = -1
  103. kwline = None
  104. for key in lo.keys():
  105. # pdb.set_trace()
  106. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  107. if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  108. # pdb.set_trace()
  109. for c in lo[key]:
  110. # pdb.set_trace()
  111. if c and ''.join(c.split()) in keywords:# 非关键词行元素
  112. result.extend(parse_line(lo[key]))
  113. break
  114. if c == "对报名岗位\n认 识及工作":
  115. print(''.join(c.split()))
  116. break
  117. else:# 关键词行元素
  118. schema = dict()
  119. for key, val in zip(kwline, lo[key]):
  120. if key:
  121. schema[key] = val
  122. if "学校/培训机构" in schema.keys():
  123. schema["学习经历"] = "学习经历"
  124. elif "与本人关系" in schema.keys():
  125. schema["家庭成员"] = "家庭成员"
  126. elif "意向地区" in schema.keys():
  127. schema["职业发展管理"] = "职业发展管理"
  128. elif "职业证书" in schema.keys():
  129. schema["职业资格证书"] = "职业资格证书"
  130. result.append(schema)
  131. break
  132. break
  133. else:
  134. # print("此行为关键词行")
  135. kwline = [''.join(cell.split()) for cell in lo[key]]
  136. kwln = len(lo[key])
  137. job = {"工作经历":"工作经历"}
  138. flag = None
  139. with pdfplumber.open(path) as pdf:
  140. for page in pdf.pages:
  141. for predict in page.extract_words():
  142. # print(predict['text'])
  143. text = predict['text'].replace(":", ":")
  144. if ":" in text:
  145. text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)
  146. for line in text.split("\n"):
  147. if line.strip():
  148. i = line.split(":")
  149. if job.get(i[0].strip()):
  150. result.append(job)
  151. job = {"工作经历":"工作经历"}
  152. job[i[0].strip()] = i[1].strip()
  153. flag = i[0].strip()
  154. elif flag == "工作描述":
  155. job["工作描述"] += '\n' + text.strip()
  156. else:
  157. result.append(job)
  158. return result
  159. # 格式化数据
  160. def formatter(datalist):
  161. result = dict()
  162. for d in datalist:
  163. if len(d) == 1:
  164. for key in d.keys():
  165. result[key] = d[key]
  166. else:
  167. for k in list(d.keys()):
  168. if k == "".join(d[k].split()):
  169. d.pop(k)
  170. if result.get(k):
  171. result[k].append(d)
  172. else:
  173. result[k] = [d]
  174. # 转译数据库字段名
  175. with open("./resources/translate.json", "r", encoding="utf-8") as ff:
  176. json_obj = json.load(ff)
  177. normal = json_obj["base"]
  178. itenormal = json_obj["base"]
  179. edunormal = json_obj["tal_training_institutions"]
  180. jobnormal = json_obj["tal_his_job"]
  181. cetnormal = json_obj["tal_vocational_qualification_certificate"]
  182. family = json_obj["tal_family_social_relations"]
  183. for key in normal.keys():
  184. if result.get(key):
  185. result[normal[key]] = result[key]
  186. result.pop(key)
  187. for idx in range(len(result['职业发展管理'])):
  188. for key in itenormal.keys():
  189. if result['职业发展管理'][idx].get(key):
  190. result['职业发展管理'][idx][itenormal[key]] = result['职业发展管理'][idx][key]
  191. result['职业发展管理'][idx].pop(key)
  192. for idx in range(len(result['学习经历'])):
  193. for key in edunormal.keys():
  194. if result['学习经历'][idx].get(key):
  195. result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
  196. result['学习经历'][idx].pop(key)
  197. for idx in range(len(result['工作经历'])):
  198. for key in jobnormal.keys():
  199. if result['工作经历'][idx].get(key):
  200. result['工作经历'][idx][jobnormal[key]] = result['工作经历'][idx][key]
  201. result['工作经历'][idx].pop(key)
  202. for idx in range(len(result['职业资格证书'])):
  203. for key in cetnormal.keys():
  204. if result['职业资格证书'][idx].get(key):
  205. result['职业资格证书'][idx][cetnormal[key]] = result['职业资格证书'][idx][key]
  206. result['职业资格证书'][idx].pop(key)
  207. for idx in range(len(result['家庭成员'])):
  208. for key in family.keys():
  209. if result['家庭成员'][idx].get(key):
  210. result['家庭成员'][idx][family[key]] = result['家庭成员'][idx][key]
  211. result['家庭成员'][idx].pop(key)
  212. tit = {
  213. "基本信息":"base",
  214. "职业发展管理":"intent_job",
  215. "学习经历":"tal_training_institutions",
  216. "工作经历":"tal_his_job",
  217. "项目经历":"tal_his_project",
  218. "培训经历":"tal_training_institutions",
  219. "获奖情况":"tal_rewards_punishments",
  220. "语言能力":"tal_language",
  221. "职业资格证书":"tal_vocational_qualification_certificate",
  222. "专业技能":"tal_professional_tech_certificate",
  223. "家庭成员":"tal_family_social_relations"
  224. }
  225. for key in tit.keys():
  226. if result.get(key):
  227. result[tit[key]] = result[key]
  228. result.pop(key)
  229. # url = "http://192.168.1.110:9999/talent/getResumeData"
  230. # session = requests.Session()
  231. # session.mount('http://', HTTPAdapter(max_retries = 3))
  232. # try:
  233. # headers = {
  234. # 'contentType':'Application/json'
  235. # }
  236. # response = session.post(url=url, headers=headers, json={"ResumeData":result}, timeout=10)
  237. # print(response.text)
  238. # except Exception as e:
  239. # print(e)
  240. return result
  241. if __name__ == "__main__":
  242. if path.endswith(".docx"):
  243. pprint(formatter(parse_layout(path)))
  244. else:
  245. pprint(formatter(parse_pdf_layout(path)))