# -*- coding: utf-8 -*-
# irafa.py
# @Author: privacy
# @Date: 2022-07-07 13:12:17
# @Last Modified by: privacy
# @Last Modified time: 2022-07-13 16:46:02
# Parser for the internal talent market resume template (内部人才市场简历模板)
  7. from pprint import pprint
  8. import docx
  9. from docx import Document
  10. from docx.shared import Inches
  11. path = "d:\\desktop\\内部人才市场简历模板.docx"
  12. keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格(取得时间)", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]
  13. def parse_line(line):
  14. result = []
  15. key = None
  16. for cell in line:
  17. if cell and ''.join(cell.split()) in keywords:
  18. key = ''.join(cell.split())
  19. elif cell and key:
  20. schema = {key:cell}
  21. result.append(schema)
  22. key = None
  23. return result
  24. def parse_layout(path):
  25. result = []
  26. doc = Document(path)
  27. lo = {}
  28. tables = doc.tables
  29. for _table in tables[:]:
  30. for i, row in enumerate(_table.rows[:]):
  31. row_content = []
  32. for cell in row.cells[:]:
  33. c = cell.text
  34. row_content.append(c)
  35. lo[len(lo.keys())] = row_content
  36. kwln = -1
  37. kwline = None
  38. for key in lo.keys():
  39. # pdb.set_trace()
  40. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  41. if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  42. # pdb.set_trace()
  43. for c in lo[key]:
  44. # pdb.set_trace()
  45. if c and ''.join(c.split()) in keywords:# 非关键词行元素
  46. result.extend(parse_line(lo[key]))
  47. break
  48. else:# 关键词行元素
  49. schema = dict()
  50. for key, val in zip(kwline, lo[key]):
  51. if key:
  52. schema[key] = val
  53. if "学校/培训机构" in schema.keys():
  54. schema["学习经历"] = "学习经历"
  55. elif "与本人关系" in schema.keys():
  56. schema["家庭成员"] = "家庭成员"
  57. elif "意向地区" in schema.keys():
  58. schema["职业发展管理"] = "职业发展管理"
  59. elif "职业证书" in schema.keys():
  60. schema["职业资格证书"] = "职业资格证书"
  61. result.append(schema)
  62. break
  63. break
  64. else:
  65. # print("此行为关键词行")
  66. kwline = [''.join(cell.split()) for cell in lo[key]]
  67. kwln = len(lo[key])
  68. return result
  69. # 格式化数据
  70. def formatter(datalist):
  71. result = dict()
  72. for d in datalist:
  73. if len(d) == 1:
  74. for key in d.keys():
  75. result[key] = d[key]
  76. else:
  77. for k in list(d.keys()):
  78. if k == "".join(d[k].split()):
  79. d.pop(k)
  80. if result.get(k):
  81. result[k].append(d)
  82. else:
  83. result[k] = [d]
  84. normal = {
  85. "姓名":"name",
  86. "性别":"gender",
  87. "邮箱地址":"email",
  88. "政治面貌":"politics",
  89. "联系电话":"mobile",
  90. "籍贯":"birthplace",
  91. "出生日期":"birth_time",
  92. "现任职务":"current_job",
  93. "所在城市":"living_city",
  94. "参加工作时间":"work_begin_time",
  95. "意向岗位":"intent_job",
  96. "熟悉专业有何专长":"skills",
  97. }
  98. edunormal = {
  99. "学校":"school_name",
  100. "专业":"major",
  101. "学历":"degree",
  102. "是否全日制":"degree_type",
  103. }
  104. for key in normal.keys():
  105. if result.get(key):
  106. result[normal[key]] = result[key]
  107. result.pop(key)
  108. # for idx in range(len(result['学习经历'])):
  109. # result['学习经历'][idx]['start_time'] = result['学习经历'][idx]["起止时间"].split("~")[0]
  110. # result['学习经历'][idx]['end_time'] = result['学习经历'][idx]["起止时间"].split("~")[-1]
  111. # for key in edunormal.keys():
  112. # if result['学习经历'][idx].get(key):
  113. # result['学习经历'][idx][edunormal[key]] = result['学习经历'][idx][key]
  114. # result['学习经历'][idx].pop(key)
  115. return result
  116. if __name__ == "__main__":
  117. pprint(formatter(parse_layout(path)))