123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139 |
- # -*- coding: utf-8 -*-
- # @Author: privacy
- # @Date: 2024-06-11 13:43:14
- # @Last Modified by: privacy
- # @Last Modified time: 2024-11-22 14:14:09
- import re
- import json
- import pandas as pd
- """
- [Node]
- node_id: int
- text: str
- node_type: <text|title|contents|head_tail|table|image>
- parent: int
- children: list
- para_type: <text|title_1|title_2|title_3|title_4|title_5|title_6|contents|head_tail|table|image>
- [position]
- pageno: int
- layout_index: int
- box: list
- """
- """
- [pages]
- page_id: str
- page_num: int
- text: str
- [layouts]
- layout_id: str
- text: str
- position: list
- type: <text|head_tail|seal>
- sub_type: <table_title>
- parent: str
- children: list
- [images]
- [tables]
- layout_id: str
- markdown: str
- table_title_id:
- position: list
- [cells]
- layout_id: str
- text: str
- position: list
- type: <text>
- sub_type: <>
- parent: str
- children: null
- meta.page_width: int
- meta.page_height: int
- meta.is_scan: bool
- meta.page_angle: int
- meta.page_type: <others|appendix>
- """
def parse_table(text):
    """Split pipe-delimited table text into a list of rows of cell strings.

    ``text`` is split on newlines and every resulting line becomes one row,
    including separator rows such as ``|---|---|`` and blank lines (a blank
    line yields ``['']``).  Cell text is returned as-is, without trimming
    whitespace.

    Args:
        text: Table text with one row per line and cells separated by ``|``.

    Returns:
        A list with one entry per line; each entry is the list of cell
        strings on that line.
    """
    table = []
    for line in text.split('\n'):
        # Drop only the single delimiter pipe at each end.  ``str.strip('|')``
        # would swallow every leading/trailing pipe and thereby discard empty
        # first/last cells (e.g. ``||c|``), shifting the remaining columns.
        if line.startswith('|'):
            line = line[1:]
        if line.endswith('|'):
            line = line[:-1]
        table.append(line.split('|'))
    return table
def _title_level(para_type):
    """Return the heading level encoded in a ``para_type`` string.

    The last run of digits is used (``'title_3'`` -> ``3``); ``99`` is
    returned when the string contains no digits.
    """
    digits = re.findall(r'\d+', para_type)
    return int(digits[-1]) if digits else 99


def _rows_of_type(df, node_type):
    """Return the rows of ``df`` whose ``node_type`` matches, re-indexed from 0.

    Unlike rebuilding a frame from ``to_dict('records')``, the result keeps
    all columns of ``df`` even when no row matches.
    """
    return df.loc[df['node_type'] == node_type].reset_index(drop=True)


def get_ocr(raw: dict, pretty: bool = False):
    """Flatten the ``para_nodes`` of an OCR result into tabular form.

    Nodes whose ``node_type`` is ``'root'`` are skipped.  For every remaining
    node the first entry of its ``position`` list is unpacked into the
    ``pageno``, ``layout_index`` and ``box`` columns and the ``position``
    column is removed.  The ``text`` of ``'table'`` nodes is replaced by the
    result of ``parse_table``.

    Args:
        raw: Mapping with a ``'para_nodes'`` list of node dicts.
        pretty: When false (the default) the flattened DataFrame is returned.
            When true a dict of record lists is returned instead.

    Returns:
        ``pretty=False``: a ``pandas.DataFrame`` with one row per non-root
        node (an empty frame when there are none).

        ``pretty=True``: a dict with the keys ``"title"``, ``"outline"``,
        ``"contents"``, ``"tables"`` and ``"images"``:

        * ``outline`` - title rows with added ``title``, ``page_number`` and
          ``level`` fields;
        * ``title`` - the same rows plus ``seq_num`` (0-based row position);
        * ``contents`` - one record per page, with the page's ``'text'``
          nodes joined by newlines;
        * ``tables`` - table rows with added ``table``, ``table_name``
          (always ``''``) and ``page_numbers`` fields;
        * ``images`` - always an empty list.

        Any category without matching nodes is an empty list.
    """
    nodes = [node for node in raw['para_nodes'] if node['node_type'] != 'root']

    if not nodes:
        # Nothing to flatten: an empty frame has no 'position' column, so
        # return empty results instead of failing on the lookups below.
        if not pretty:
            return pd.DataFrame(nodes)
        return {"title": [], "outline": [], "contents": [], "tables": [], "images": []}

    df = pd.DataFrame(nodes)
    # Only the first position entry of each node is used.
    for key in ('pageno', 'layout_index', 'box'):
        df[key] = df['position'].apply(lambda pos, key=key: pos[0][key])
    df = df.drop(columns='position')

    df['text'] = df.apply(
        lambda row: parse_table(row['text']) if row['node_type'] == 'table' else row['text'],
        axis=1,
    )

    if not pretty:
        return df

    # Titles / outline.
    title = _rows_of_type(df, 'title')
    title['title'] = title['text']
    title['page_number'] = title['pageno']
    title['level'] = title['para_type'].apply(_title_level)
    # The outline is captured before 'seq_num' is added, so it lacks that field.
    outline = title.to_dict('records')
    title['seq_num'] = title.index
    titles = title.to_dict('records')

    # Page contents: text nodes joined per page.
    text_df = _rows_of_type(df, 'text')
    if text_df.empty:
        contents = []
    else:
        content_data = (
            text_df.groupby('pageno')['text']
            .apply(lambda parts: '\n'.join(parts))
            .reset_index()
        )
        content_data['page_number'] = content_data['pageno']
        contents = content_data.to_dict('records')

    # Tables.
    table_data = _rows_of_type(df, 'table')
    table_data['table'] = table_data['text']
    table_data['table_name'] = ''
    table_data['page_numbers'] = table_data['pageno'].apply(lambda pageno: [pageno])
    tables = table_data.to_dict('records')

    return {"title": titles, "outline": outline, "contents": contents, "tables": tables, "images": []}
if __name__ == '__main__':
    # Manual run: load a locally stored OCR dump and flatten it with get_ocr.
    sample_path = 'D:/Users/sprivacy/Documents/WeChat Files/wxid_uqa5354ji3ag22/FileStorage/File/2024-08/三峡左岸地坪商务标_合并_ocr.txt'
    with open(sample_path, mode='r', encoding='utf-8') as handle:
        raw = json.load(handle)
    raw = get_ocr(raw)

    # Disabled scratch code for inspecting a page/layout structure.
    # NOTE(review): it indexes raw['file_content'], which looks like a
    # different input layout than the 'para_nodes' consumed by get_ocr, and
    # the nesting below is reconstructed -- confirm before re-enabling.
    # for content in raw['file_content']:
    #     print(content.keys())
    #     print(content['page_num'] == 0)
    #     print(content['page_size']['width'] == 595)
    #     print(content['page_size']['height'] == 841)
    #     print(content['page_angle'] == 0)
    #     print(content['is_scan'] == False)
    #     print(content['page_content']['sheetname'] == '')
    #     print(content['page_content']['type'] == 'others')
    #     for layout in content['page_content']['layout']:
    #         print(layout['box'])
    #         print(layout['type'])
    #         print(layout['text'])
    #         print(layout['children'])
    #         print(layout['matrix'])
    #         print(layout['merge_table'])
    #         print(layout['node_id'])
    #     break
|