# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2022-07-07 13:12:17
# @Last Modified by:   privacy
# @Last Modified time: 2022-07-08 17:52:09


from docx import Document
from docx.shared import Inches

# Location of the Word document that this script opens and scans.
path = "d:\\desktop\\内部人才市场简历模板.docx"

# Table labels that are recognised as field names. Cell text is compared
# against these entries after all whitespace has been stripped out of it.
# NOTE: "姓名" appears twice in the list; the duplicate is kept so that the
# list's contents and length stay exactly as they were.
keywords = [
    "姓名",
    "性别",
    "出生日期",
    "民族",
    "籍贯",
    "健康状况",
    "政治面貌",
    "参加工作时间",
    "外语水平",
    "专业技术资格（取得时间）",
    "计算机水平",
    "熟悉专业有何专长",
    "工作单位",
    "现任职务",
    "任职时间",
    "联系电话",
    "对报名岗位认识及工作设想",
    "意向地区",
    "意向岗位",
    "意向单位",
    "意向专业",
    "职业证书",
    "资格等级",
    "取得日期",
    "学校/培训机构",
    "专业",
    "起始时间",
    "毕业时间",
    "姓名",
    "职业",
    "与本人关系",
]

def parse_line(line, known_keywords=None):
    """Pair each keyword cell in a row with the value cell that follows it.

    The row is scanned left to right. A non-empty cell whose text, with all
    whitespace removed, is a known keyword becomes the pending key (replacing
    any earlier keyword that has not received a value yet). The next
    non-empty, non-keyword cell is recorded as that key's value, after which
    the pending key is cleared. Values with no pending key are ignored.

    Args:
        line: Iterable of cell texts (strings) for one table row.
        known_keywords: Optional container of keyword labels, written without
            whitespace. When omitted, the module-level ``keywords`` list is
            used.

    Returns:
        A list of single-entry dicts ``{keyword: value}`` in row order. The
        keyword is the whitespace-stripped label; the value is the raw cell
        text.
    """
    labels = keywords if known_keywords is None else known_keywords
    result = []
    pending = None
    for cell in line:
        if not cell:
            # Empty cells neither set nor consume the pending keyword.
            continue
        # Labels may be padded or split by spaces/newlines inside the cell,
        # so compare on the text with every whitespace character removed.
        compact = ''.join(cell.split())
        if compact in labels:
            pending = compact
        elif pending:
            # Whitespace-only cells are truthy and not keywords, so they are
            # accepted as a value here just like any other text.
            result.append({pending: cell})
            pending = None
    return result

# Open the document and flatten every row of every table into ``lo``:
# a dict mapping a running row number (0, 1, 2, ...) to the list of cell
# texts of that row, in document order.
doc = Document(path)
tables = doc.tables
lo = {}
for _table in tables:
    for table_row in _table.rows:
        # The next free index equals the number of rows collected so far.
        lo[len(lo)] = [cell.text for cell in table_row.cells]

# Walk the collected rows and classify each one:
#   * header row  - every non-empty cell is a keyword (a row with no
#                   non-empty cells also lands here); it is remembered as
#                   the current header and nothing is printed;
#   * mixed row   - keywords and values side by side; printed via parse_line;
#   * value row   - no keywords at all; its cells are matched positionally
#                   against the most recent header row and printed as a dict.
kwln = -1      # length of the most recent header row (not read in this script)
kwline = None  # cell texts of the most recent header row, None until one is seen
for row_cells in lo.values():
    # Compare on whitespace-stripped text, skipping empty cells, exactly as
    # parse_line does.
    compact_cells = [''.join(text.split()) for text in row_cells if text]
    has_value = any(text not in keywords for text in compact_cells)
    has_keyword = any(text in keywords for text in compact_cells)

    if not has_value:
        # Header row: remember it for the value rows that follow.
        kwline = row_cells
        kwln = len(row_cells)
    elif has_keyword:
        # Keywords and values on the same row.
        print(parse_line(row_cells))
    else:
        # Value row. If no header row has been seen yet there is nothing to
        # match against; use an empty header instead of passing None to zip(),
        # which would raise TypeError. The row then prints as an empty dict.
        header = kwline if kwline is not None else []
        schema = dict()
        for label, text in zip(header, row_cells):
            if label:
                schema[label] = text
        print(schema)