# irafa.py (2.3 KB)
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2022-07-07 13:12:17
# @Last Modified by: privacy
# @Last Modified time: 2022-07-08 17:52:09
from typing import Dict, Iterable, List, Optional

from docx import Document
from docx.shared import Inches
  8. path = "d:\\desktop\\内部人才市场简历模板.docx"
  9. keywords = ["姓名", "性别", "出生日期", "民族", "籍贯", "健康状况", "政治面貌", "参加工作时间", "外语水平", "专业技术资格(取得时间)", "计算机水平", "熟悉专业有何专长", "工作单位", "现任职务", "任职时间", "联系电话", "对报名岗位认识及工作设想", "意向地区", "意向岗位", "意向单位", "意向专业", "职业证书", "资格等级", "取得日期", "学校/培训机构", "专业", "起始时间", "毕业时间", "姓名", "职业", "与本人关系"]
  10. def parse_line(line):
  11. result = []
  12. key = None
  13. for cell in line:
  14. if cell and ''.join(cell.split()) in keywords:
  15. key = ''.join(cell.split())
  16. elif cell and key:
  17. schema = {key:cell}
  18. result.append(schema)
  19. key = None
  20. return result
  21. doc = Document(path)
  22. lo = {}
  23. tables = doc.tables
  24. for _table in tables[:]:
  25. for i, row in enumerate(_table.rows[:]):
  26. row_content = []
  27. for cell in row.cells[:]:
  28. c = cell.text
  29. row_content.append(c)
  30. lo[len(lo.keys())] = row_content
  31. kwln = -1
  32. kwline = None
  33. for key in lo.keys():
  34. # pdb.set_trace()
  35. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  36. if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  37. # pdb.set_trace()
  38. for c in lo[key]:
  39. # pdb.set_trace()
  40. if c and ''.join(c.split()) in keywords:# 非关键词行元素
  41. print(parse_line(lo[key]))
  42. break
  43. else:# 关键词行元素
  44. schema = dict()
  45. for key, val in zip(kwline, lo[key]):
  46. if key:
  47. schema[key] = val
  48. print(schema)
  49. break
  50. break
  51. else:
  52. # print("此行为关键词行")
  53. kwline = lo[key]
  54. kwln = len(lo[key])