# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2024-06-11 13:43:14
# @Last Modified by:   privacy
# @Last Modified time: 2024-09-27 14:08:30
"""Flatten OCR paragraph nodes into a DataFrame or a structured summary.

Each entry of ``raw['para_nodes']`` is read with the following layout:

[Node]
    node_id: int
    text: str
    node_type:
    parent: int
    children: list
    para_type:
    [position]
        pageno: int
        layout_index: int
        box: list
"""
import re
import json

import pandas as pd

# Level assigned to a title whose ``para_type`` carries no number.
_DEFAULT_TITLE_LEVEL = 99

# Columns of the flattened frame, used when there are no nodes to infer from.
_FLAT_COLUMNS = [
    'node_id', 'text', 'node_type', 'parent', 'children', 'para_type',
    'pageno', 'layout_index', 'box',
]


def parse_table(text):
    """Split pipe-delimited table text into a list of rows of cell strings.

    Rows are separated by ``'\\n'`` and cells by ``'|'``. At most one border
    pipe is removed from each end of a row, so an empty first or last cell
    (``'||b|'``) is kept as ``''`` instead of being swallowed.

    Args:
        text: Table text, one row per line.

    Returns:
        A list with one list of cell strings per line of ``text``.
    """
    table = []
    for line in text.split('\n'):
        # str.strip('|') would eat *all* edge pipes and drop empty edge cells.
        if line.startswith('|'):
            line = line[1:]
        if line.endswith('|'):
            line = line[:-1]
        table.append(line.split('|'))
    return table


def _title_level(para_type):
    """Return the last integer found in ``para_type``, or the default level."""
    if not isinstance(para_type, str):
        # Missing values (e.g. NaN) cannot be searched; treat as unnumbered.
        return _DEFAULT_TITLE_LEVEL
    numbers = re.findall(r'\d+', para_type)
    return int(numbers[-1]) if numbers else _DEFAULT_TITLE_LEVEL


def _nodes_of_type(df, node_type):
    """Return an independent, 0-indexed copy of the rows with ``node_type``."""
    return df.loc[df['node_type'] == node_type].copy().reset_index(drop=True)


def _build_titles(df):
    """Return ``(title_records, outline_records)`` for the title nodes."""
    titles = _nodes_of_type(df, 'title')
    if titles.empty:
        return [], []
    titles = titles.assign(
        title=titles['text'],
        page_number=titles['pageno'],
        level=titles['para_type'].apply(_title_level),
    )
    # Result output: the outline is taken before ``seq_num`` is added.
    outline = titles.to_dict('records')
    # Result output: title records additionally carry their running index.
    title_records = titles.assign(seq_num=titles.index).to_dict('records')
    return title_records, outline


def _build_contents(df):
    """Return one record per page with its text nodes joined by newlines."""
    texts = _nodes_of_type(df, 'text')
    if texts.empty:
        return []
    pages = (
        texts.groupby('pageno')['text']
        .apply(lambda parts: '\n'.join(parts))
        .reset_index()
    )
    pages['page_number'] = pages['pageno']
    # Result output.
    return pages.to_dict('records')


def _build_tables(df):
    """Return one record per table node, with its parsed rows under 'table'."""
    tables = _nodes_of_type(df, 'table')
    if tables.empty:
        return []
    tables = tables.assign(
        table=tables['text'],
        table_name='',
        page_numbers=tables['pageno'].apply(lambda page: [page]),
    )
    # Result output.
    return tables.to_dict('records')


def get_ocr(raw: dict, pretty: bool = False):
    """Flatten ``raw['para_nodes']`` and optionally group it by node type.

    Nodes whose ``node_type`` is ``'root'`` are skipped. For every other node
    the first ``position`` entry supplies ``pageno``, ``layout_index`` and
    ``box``; the ``position`` column itself is dropped. The text of ``'table'``
    nodes is replaced by the rows returned from :func:`parse_table`.

    Args:
        raw: Mapping with a ``'para_nodes'`` list of node dicts.
        pretty: When false, return the flattened DataFrame. When true, return
            a dict of record lists.

    Returns:
        A ``pandas.DataFrame`` (``pretty=False``) or a dict with the keys
        ``title``, ``outline``, ``contents``, ``tables`` and ``images``
        (``images`` is always an empty list). Node types that do not occur
        yield empty lists instead of raising.
    """
    nodes = [node for node in raw['para_nodes'] if node['node_type'] != 'root']

    if not nodes:
        # Nothing to flatten: an empty frame would lack the 'position' column.
        if pretty:
            return {"title": [], "outline": [], "contents": [], "tables": [],
                    "images": []}
        return pd.DataFrame(columns=_FLAT_COLUMNS)

    df = pd.DataFrame(nodes)
    for key in ('pageno', 'layout_index', 'box'):
        # Only the first position entry of each node is used.
        df[key] = df['position'].apply(lambda pos, key=key: pos[0][key])
    df = df.drop(columns='position')

    # Table nodes get their text parsed into rows; other nodes keep the text.
    df['text'] = pd.Series(
        [
            parse_table(text) if node_type == 'table' else text
            for text, node_type in zip(df['text'], df['node_type'])
        ],
        index=df.index,
        dtype=object,
    )

    if not pretty:
        return df

    title, outline = _build_titles(df)
    contents = _build_contents(df)
    tables = _build_tables(df)

    return {"title": title, "outline": outline, "contents": contents,
            "tables": tables, "images": []}


if __name__ == '__main__':
    # NOTE(review): developer-local sample path; adjust before running.
    with open('D:/Users/sprivacy/Documents/WeChat Files/wxid_uqa5354ji3ag22/FileStorage/File/2024-08/三峡左岸地坪商务标_合并_ocr.txt', 'r', encoding='utf-8') as fp:
        raw = json.load(fp)
    raw = get_ocr(raw)

    # Exploration snippet kept for reference; it walks keys ('file_content',
    # 'page_content', 'layout') that get_ocr itself does not read.
    # for content in raw['file_content']:
    #     print(content.keys())
    #     print(content['page_num'] == 0)
    #     print(content['page_size']['width'] == 595)
    #     print(content['page_size']['height'] == 841)
    #     print(content['page_angle'] == 0)
    #     print(content['is_scan'] == False)
    #     print(content['page_content']['sheetname'] == '')
    #     print(content['page_content']['type'] == 'others')
    #     for layout in content['page_content']['layout']:
    #         print(layout['box'])
    #         print(layout['type'])
    #         print(layout['text'])
    #         print(layout['children'])
    #         print(layout['matrix'])
    #         print(layout['merge_table'])
    #         print(layout['node_id'])
    #     break