srafa.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2022-07-07 12:59:42
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2022-07-13 15:22:48
  6. # import pdb
  7. from pprint import pprint
  8. import pandas as pd
  9. import pdfplumber
  10. path = "d:\\desktop\\社招简历模板.pdf"
  11. keywords = ['姓名',
  12. '性别',
  13. '出生日期',
  14. '一寸照片',
  15. '民族',
  16. '出生地',
  17. '政治面貌(加入时间)',
  18. '参加工作时间',
  19. '健康状况',
  20. '外语水平',
  21. '初始学历、专业',
  22. '最高学历、专业',
  23. '初始学历毕业院校及毕业时间',
  24. '最高学历毕业院校及毕业时间',
  25. '专业技术资格(取得时间)',
  26. '职业技能等级(取得时间)',
  27. '熟悉专业有何专长',
  28. '工作单位',
  29. '现任职务',
  30. '任职时间',
  31. '提职时间',
  32. '意向岗位',
  33. '联系电话',
  34. '学习经历',
  35. '起止时间',
  36. '学校',
  37. '专业',
  38. '学历',
  39. '学位',
  40. '研究方向',
  41. '是否全日制',
  42. '培训',
  43. '起止时间',
  44. '培训类型',
  45. '机构',
  46. '内容',
  47. '成绩',
  48. '证书名称',
  49. '经历',
  50. '工作经历',
  51. '起止时间',
  52. '工作单位',
  53. '职务',
  54. '部门',
  55. '证明人',
  56. '备注',
  57. '对报名岗位认识及工作设想',
  58. '自我评价及主要工作业绩',
  59. '获得职业资格证书情况',
  60. '获得日期',
  61. '名称',
  62. '证书编码/文号',
  63. '授予单位',
  64. '备注',
  65. '奖惩',
  66. '项目',
  67. '时间',
  68. '项目单位',
  69. '证明材料',
  70. '情况',
  71. '主要家庭成员及社会关系',
  72. '称谓',
  73. '出生年月',
  74. '政治面貌',
  75. '工作单位及职务',
  76. '其他情况说明',
  77. '诚信承诺',
  78. '本人承诺,以上信息均与事实相符,若有虚假,愿承担一切后果并自愿取消应聘资格。'
  79. '承诺人:'
  80. '社会招聘工作办公室资格审查意见']
  81. def parse_line(line):
  82. result = []
  83. key = None
  84. for cell in line:
  85. if cell and ''.join(cell.split()) in keywords:
  86. key = ''.join(cell.split())
  87. elif cell and key:
  88. schema = {key:cell}
  89. result.append(schema)
  90. key = None
  91. return result
  92. def parse_layout(path):
  93. result = []
  94. lo = {}
  95. with pdfplumber.open(path) as pdf:
  96. for page in pdf.pages:
  97. for table in page.extract_tables():
  98. for line in table:
  99. # lo[len(lo.keys())] = [cell for cell in line if cell]
  100. lo[len(lo.keys())] = line
  101. kwln = -1
  102. kwline = None
  103. for key in lo.keys():
  104. # pdb.set_trace()
  105. for val in lo[key]:# 通过全关键词,判断此行是否为关键词行
  106. if val and ''.join(val.split()) not in keywords:# 有非关键字元素,非关键词行,判断是否为关键词行元素
  107. # pdb.set_trace()
  108. for c in lo[key] or len(lo[key])!=kwln:
  109. # pdb.set_trace()
  110. if c and ''.join(c.split()) in keywords:# 非关键词行元素
  111. result.extend(parse_line(lo[key]))
  112. break
  113. else:# 关键词行元素
  114. schema = dict()
  115. for key, val in zip(kwline, lo[key]):
  116. if key:
  117. schema[key] = val if val else key
  118. result.append(schema)
  119. break
  120. break
  121. else:
  122. # print("此行为关键词行")
  123. # kwline = lo[key]
  124. kwline = []
  125. for cell in lo[key]:
  126. if cell:
  127. kwline.append(''.join(cell.split()))
  128. else:
  129. kwline.append(cell)
  130. kwln = len(lo[key])
  131. return result
  132. # 格式化数据
  133. def formatter(datalist):
  134. result = dict()
  135. for d in datalist:
  136. if len(d) == 1:
  137. for key in d.keys():
  138. result[key] = d[key]
  139. else:
  140. for k in list(d.keys()):
  141. if k == "".join(d[k].split()):
  142. d.pop(k)
  143. if result.get(k):
  144. result[k].append(d)
  145. else:
  146. result[k] = [d]
  147. return result
  148. if __name__ == '__main__':
  149. # pprint(parse_layout(path))
  150. pprint(formatter(parse_layout(path)))