# ocr_info.py (3.4 KB)
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-06-11 13:43:14
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-09-27 14:08:30
  6. import re
  7. import json
  8. import pandas as pd
"""
[Node]
node_id: int
text: str
node_type: <text|title|contents|head_tail|table|image>
parent: int
children: list
para_type: <text|title_1|title_2|title_3|title_4|title_5|title_6|contents|head_tail|table|image>
[position]
pageno: int
layout_index: int
box: list
"""
  22. def parse_table(text):
  23. table = []
  24. lines = text.split('\n')
  25. for line in lines:
  26. table.append(line.strip('|').split('|'))
  27. return table
  28. def get_ocr(raw: dict, pretty: bool = False):
  29. nodes = []
  30. for node in raw['para_nodes']:
  31. if node['node_type'] == 'root':
  32. continue
  33. nodes.append(node)
  34. df = pd.DataFrame(nodes)
  35. df['pageno'] = df['position'].apply(lambda x: x[0]['pageno'])
  36. df['layout_index'] = df['position'].apply(lambda x: x[0]['layout_index'])
  37. df['box'] = df['position'].apply(lambda x: x[0]['box'])
  38. del df['position']
  39. df.text = df.apply(lambda row: parse_table(row['text']) if row['node_type'] == 'table' else row['text'], axis=1)
  40. if not pretty:
  41. return df
  42. title = pd.DataFrame(df.query(''' node_type == 'title' ''').to_dict('records'))
  43. title['title'] = title['text']
  44. title['page_number'] = title['pageno']
  45. title['level'] = title['para_type'].apply(lambda x: int(re.findall(r'\d+', x).pop()) if re.findall(r'\d+', x) else 99)
  46. # 结果输出
  47. outline = title.to_dict('records')
  48. title['seq_num'] = title.index
  49. # 结果输出
  50. title = title.to_dict('records')
  51. text_df = pd.DataFrame(df.query(''' node_type == 'text' ''').to_dict('records'))
  52. content_data = text_df.groupby('pageno')['text'].apply(lambda x: '\n'.join(x)).reset_index()
  53. content_data['page_number'] = content_data['pageno']
  54. # 结果输出
  55. contents = content_data.to_dict('records')
  56. table_data = pd.DataFrame(df.query(''' node_type == 'table' ''').to_dict('records'))
  57. table_data['table'] = table_data['text']
  58. table_data['table_name'] = ''
  59. table_data['page_numbers'] = table_data['pageno'].apply(lambda x: [x])
  60. # 结果输出
  61. tables = table_data.to_dict('records')
  62. return {"title": title, "outline": outline, "contents": contents, "tables": tables, "images": []}
  63. if __name__ == '__main__':
  64. with open('D:/Users/sprivacy/Documents/WeChat Files/wxid_uqa5354ji3ag22/FileStorage/File/2024-08/三峡左岸地坪商务标_合并_ocr.txt', 'r', encoding='utf-8') as fp:
  65. raw = json.load(fp)
  66. raw = get_ocr(raw)
  67. # for content in raw['file_content']:
  68. # print(content.keys())
  69. # print(content['page_num'] == 0)
  70. # print(content['page_size']['width'] == 595)
  71. # print(content['page_size']['height'] == 841)
  72. # print(content['page_angle'] == 0)
  73. # print(content['is_scan'] == False)
  74. # print(content['page_content']['sheetname'] == '')
  75. # print(content['page_content']['type'] == 'others')
  76. # for layout in content['page_content']['layout']:
  77. # print(layout['box'])
  78. # print(layout['type'])
  79. # print(layout['text'])
  80. # print(layout['children'])
  81. # print(layout['matrix'])
  82. # print(layout['merge_table'])
  83. # print(layout['node_id'])
  84. # break