# -*- coding: utf-8 -*-
# File: irafa.py
# @Author: privacy
# @Date: 2022-07-07 13:12:17
# @Last Modified by: privacy
# @Last Modified time: 2022-07-14 09:39:42
# Internal talent-market résumé template (内部人才市场简历模板)
import re
from pprint import pprint

import docx
from docx import Document
from docx.shared import Inches
  12. path = "d:\\desktop\\内部人才市场简历模板.docx"
  13. keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格(取得时间)", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]
  14. def parse_line(line):
  15. result = []
  16. key = None
  17. for cell in line:
  18. if cell and ''.join(cell.split()) in keywords:
  19. key = ''.join(cell.split())
  20. elif cell and key:
  21. schema = {key:cell}
  22. result.append(schema)
  23. key = None
  24. return result
  25. def parse_layout(path):
  26. result = []
  27. doc = Document(path)
  28. lo = {}
  29. tables = doc.tables
  30. for _table in tables[:]:
  31. for i, row in enumerate(_table.rows[:]):
  32. row_content = []
  33. for cell in row.cells[:]:
  34. c = cell.text
  35. row_content.append(c)
  36. lo[len(lo.keys())] = row_content
  37. kwln = -1
  38. kwline = None
  39. for key in lo.keys():
  40. # pdb.set_trace()
  41. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  42. if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  43. # pdb.set_trace()
  44. for c in lo[key]:
  45. # pdb.set_trace()
  46. if c and ''.join(c.split()) in keywords:# 非关键词行元素
  47. result.extend(parse_line(lo[key]))
  48. break
  49. else:# 关键词行元素
  50. schema = dict()
  51. for key, val in zip(kwline, lo[key]):
  52. if key:
  53. schema[key] = val
  54. if "学校/培训机构" in schema.keys():
  55. schema["学习经历"] = "学习经历"
  56. elif "与本人关系" in schema.keys():
  57. schema["家庭成员"] = "家庭成员"
  58. elif "意向地区" in schema.keys():
  59. schema["职业发展管理"] = "职业发展管理"
  60. elif "职业证书" in schema.keys():
  61. schema["职业资格证书"] = "职业资格证书"
  62. result.append(schema)
  63. break
  64. break
  65. else:
  66. # print("此行为关键词行")
  67. kwline = [''.join(cell.split()) for cell in lo[key]]
  68. kwln = len(lo[key])
  69. job = {"工作经历":"工作经历"}
  70. flag = None
  71. for p in doc.paragraphs:
  72. text = p.text.replace(":", ":")
  73. if ":" in text:
  74. text = re.sub(r'(\w+)\W{0,2}:', r'\n\1:', text)
  75. for line in text.split("\n"):
  76. if line.strip():
  77. i = line.split(":")
  78. if job.get(i[0].strip()):
  79. result.append(job)
  80. job = {"工作经历":"工作经历"}
  81. job[i[0].strip()] = i[1].strip()
  82. flag = i[0].strip()
  83. elif flag == "工作描述":
  84. job["工作描述"] += '\n' + text.strip()
  85. else:
  86. result.append(job)
  87. return result
  88. # 格式化数据
  89. def formatter(datalist):
  90. result = dict()
  91. for d in datalist:
  92. if len(d) == 1:
  93. for key in d.keys():
  94. result[key] = d[key]
  95. else:
  96. for k in list(d.keys()):
  97. if k == "".join(d[k].split()):
  98. d.pop(k)
  99. if result.get(k):
  100. result[k].append(d)
  101. else:
  102. result[k] = [d]
  103. normal = {
  104. "姓名":"name",
  105. "性别":"gender",
  106. "邮箱地址":"email",
  107. "政治面貌":"politics",
  108. "联系电话":"mobile",
  109. "籍贯":"birthplace",
  110. "出生日期":"birth_time",
  111. "现任职务":"current_job",
  112. "所在城市":"living_city",
  113. "参加工作时间":"work_begin_time",
  114. "意向岗位":"intent_job",
  115. "熟悉专业有何专长":"skills",
  116. }
  117. edunormal = {
  118. "学校":"school_name",
  119. "专业":"major",
  120. "学历":"degree",
  121. "是否全日制":"degree_type",
  122. }
  123. for key in normal.keys():
  124. if result.get(key):
  125. result[normal[key]] = result[key]
  126. result.pop(key)
  127. edunormal = {
  128. "学校/培训机构":"school_name",
  129. "专业":"major",
  130. "起始时间":"start_time",
  131. "毕业时间":"end_time"
  132. }
  133. for idx in range(len(result['学习经历'])):
  134. for key in edunormal.keys():
  135. if result['学习经历'][idx].get(key):
  136. result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
  137. result['学习经历'][idx].pop(key)
  138. return result
  139. if __name__ == "__main__":
  140. pprint(formatter(parse_layout(path)))