# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2024-06-11 13:43:14
# @Last Modified by:   privacy
# @Last Modified time: 2024-11-22 14:14:09
import json
import re
import sys

import pandas as pd

# Schema notes for the OCR payload consumed below. Fields whose value set is
# not recorded here are left blank on purpose.
"""
[Node]
    node_id: int
    text: str
    node_type:
    parent: int
    children: list
    para_type:
    [position]
        pageno: int
        layout_index: int
        box: list
"""

"""
[pages]
    page_id: str
    page_num: int
    text: str
    [layouts]
        layout_id: str
        text: str
        position: list
        type:
        sub_type:
        parent: str
        children: list
    [images]
    [tables]
        layout_id: str
        markdown: str
        table_title_id:
        position: list
        [cells]
            layout_id: str
            text: str
            position: list
            type:
            sub_type: <>
            parent: str
            children: null
    meta.page_width: int
    meta.page_height: int
    meta.is_scan: bool
    meta.page_angle: int
    meta.page_type:
"""

# Matches every run of digits in a ``para_type`` label.
_DIGITS = re.compile(r'\d+')

# Level assigned to titles whose ``para_type`` carries no number.
_DEFAULT_LEVEL = 99

# Columns of the frame returned for a document without any non-root node,
# taken from the [Node] schema note above plus the flattened position fields.
_NODE_COLUMNS = [
    'node_id', 'text', 'node_type', 'parent', 'children', 'para_type',
    'pageno', 'layout_index', 'box',
]

# Input used by the script entry point when no path is given on the command line.
_DEFAULT_INPUT = 'D:/Users/sprivacy/Documents/WeChat Files/wxid_uqa5354ji3ag22/FileStorage/File/2024-08/三峡左岸地坪商务标_合并_ocr.txt'


def parse_table(text):
    """Split pipe-delimited table text into a list of rows of cell strings.

    Each non-blank line becomes one row. At most one leading and one trailing
    ``|`` is removed per line, so empty first/last cells are preserved and
    columns stay aligned. Cell contents are returned untouched (inner
    whitespace is kept).

    Args:
        text: Table text with rows separated by ``\\n`` and cells by ``|``.

    Returns:
        A list of rows, each row a list of cell strings. Blank lines
        (including the one produced by a trailing newline) yield no row.
    """
    table = []
    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue
        # Drop only the outer delimiters; ``str.strip('|')`` would also eat
        # the pipes that delimit empty border cells.
        if line.startswith('|'):
            line = line[1:]
        if line.endswith('|'):
            line = line[:-1]
        table.append(line.split('|'))
    return table


def _title_level(para_type):
    """Return the last number found in *para_type*, or 99 when there is none."""
    if not isinstance(para_type, str):
        return _DEFAULT_LEVEL
    numbers = _DIGITS.findall(para_type)
    return int(numbers[-1]) if numbers else _DEFAULT_LEVEL


def _select_nodes(df, node_type):
    """Return the rows of *df* with the given node type, re-indexed from 0."""
    return df[df['node_type'] == node_type].reset_index(drop=True)


def _build_titles(df):
    """Return ``(title_records, outline_records)`` built from the title rows."""
    titles = _select_nodes(df, 'title')
    if titles.empty:
        return [], []
    titles['title'] = titles['text']
    titles['page_number'] = titles['pageno']
    titles['level'] = titles['para_type'].apply(_title_level)
    # The outline is a snapshot taken before ``seq_num`` is added.
    outline = titles.to_dict('records')
    titles['seq_num'] = titles.index
    return titles.to_dict('records'), outline


def _build_contents(df):
    """Return one record per page with that page's text rows joined by newlines."""
    texts = _select_nodes(df, 'text')
    if texts.empty:
        return []
    merged = (
        texts.groupby('pageno')['text']
        # Non-string values (e.g. missing text) cannot be joined; skip them.
        .apply(lambda parts: '\n'.join(p for p in parts if isinstance(p, str)))
        .reset_index()
    )
    merged['page_number'] = merged['pageno']
    return merged.to_dict('records')


def _build_tables(df):
    """Return one record per table row, with the parsed table and its page list."""
    tables = _select_nodes(df, 'table')
    if tables.empty:
        return []
    tables['table'] = tables['text']
    tables['table_name'] = ''
    tables['page_numbers'] = tables['pageno'].apply(lambda page: [page])
    return tables.to_dict('records')


def get_ocr(raw: dict, pretty: bool = False):
    """Flatten the ``para_nodes`` of an OCR result into a table of nodes.

    Root nodes are dropped. For every remaining node the first entry of its
    ``position`` list is flattened into ``pageno``, ``layout_index`` and
    ``box`` columns (the ``position`` column itself is removed), and the text
    of ``table`` nodes is parsed with :func:`parse_table`.

    Args:
        raw: OCR result; must contain a ``para_nodes`` list of node dicts,
            each with at least ``node_type``, ``text`` and a non-empty
            ``position`` list (root nodes excepted).
        pretty: When false (default) return the node DataFrame. When true
            return a dict of record lists.

    Returns:
        ``pretty=False``: a ``pandas.DataFrame`` with one row per non-root
        node (empty, with the known node columns, when there are none).

        ``pretty=True``: a dict with keys ``title`` (title rows plus
        ``title``, ``page_number``, ``level``, ``seq_num``), ``outline``
        (the same rows without ``seq_num``), ``contents`` (text joined per
        page), ``tables`` (table rows plus ``table``, ``table_name``,
        ``page_numbers``) and ``images`` (always empty). A section with no
        matching nodes is an empty list.

    Raises:
        KeyError: If ``para_nodes`` or a required node field is missing.
        IndexError: If a non-root node has an empty ``position`` list.
    """
    nodes = [node for node in raw['para_nodes'] if node['node_type'] != 'root']

    if not nodes:
        # An empty frame has no ``position`` column to flatten; short-circuit.
        if pretty:
            return {"title": [], "outline": [], "contents": [], "tables": [], "images": []}
        return pd.DataFrame(columns=_NODE_COLUMNS)

    df = pd.DataFrame(nodes)
    # Only the first position entry of each node is used.
    for field in ('pageno', 'layout_index', 'box'):
        df[field] = df['position'].apply(lambda pos, field=field: pos[0][field])
    del df['position']
    df['text'] = df.apply(
        lambda row: parse_table(row['text']) if row['node_type'] == 'table' else row['text'],
        axis=1,
    )

    if not pretty:
        return df

    title, outline = _build_titles(df)
    return {
        "title": title,
        "outline": outline,
        "contents": _build_contents(df),
        "tables": _build_tables(df),
        "images": [],
    }


if __name__ == '__main__':
    # Optional first argument overrides the default sample file.
    input_path = sys.argv[1] if len(sys.argv) > 1 else _DEFAULT_INPUT
    with open(input_path, 'r', encoding='utf-8') as fp:
        raw = json.load(fp)
    ocr_df = get_ocr(raw)

    # Exploration notes for the ``file_content`` section of the raw payload
    # (disabled; kept as a record of the observed structure).
    # for content in raw['file_content']:
    #     print(content.keys())
    #     print(content['page_num'] == 0)
    #     print(content['page_size']['width'] == 595)
    #     print(content['page_size']['height'] == 841)
    #     print(content['page_angle'] == 0)
    #     print(content['is_scan'] == False)
    #     print(content['page_content']['sheetname'] == '')
    #     print(content['page_content']['type'] == 'others')
    #     for layout in content['page_content']['layout']:
    #         print(layout['box'])
    #         print(layout['type'])
    #         print(layout['text'])
    #         print(layout['children'])
    #         print(layout['matrix'])
    #         print(layout['merge_table'])
    #         print(layout['node_id'])
    #     break