ocr_info.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-06-11 13:43:14
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-11-22 14:14:09
  6. import re
  7. import json
  8. import pandas as pd
  9. """
  10. [Node]
  11. node_id: int
  12. text: str
  13. node_type: <text|title|contents|head_tail|table|image>
  14. parent: int
  15. children: list
  16. para_type: <text|title_1|title_2|title_3|title_4|title_5|title_6|contents|head_tail|table|image>
  17. [position]
  18. pageno: int
  19. layout_index: int
  20. box: list
  21. """
  22. """
  23. [pages]
  24. page_id: str
  25. page_num: int
  26. text: str
  27. [layouts]
  28. layout_id: str
  29. text: str
  30. position: list
  31. type: <text|head_tail|seal>
  32. sub_type: <table_title>
  33. parent: str
  34. children: list
  35. [images]
  36. [tables]
  37. layout_id: str
  38. markdown: str
  39. table_title_id:
  40. position: list
  41. [cells]
  42. layout_id: str
  43. text: str
  44. position: list
  45. type: <text>
  46. sub_type: <>
  47. parent: str
  48. children: null
  49. meta.page_width: int
  50. meta.page_height: int
  51. meta.is_scan: bool
  52. meta.page_angle: int
  53. meta.page_type: <others|appendix>
  54. """
  55. def parse_table(text):
  56. table = []
  57. lines = text.split('\n')
  58. for line in lines:
  59. table.append(line.strip('|').split('|'))
  60. return table
  61. def get_ocr(raw: dict, pretty: bool = False):
  62. nodes = []
  63. for node in raw['para_nodes']:
  64. if node['node_type'] == 'root':
  65. continue
  66. nodes.append(node)
  67. df = pd.DataFrame(nodes)
  68. df['pageno'] = df['position'].apply(lambda x: x[0]['pageno'])
  69. df['layout_index'] = df['position'].apply(lambda x: x[0]['layout_index'])
  70. df['box'] = df['position'].apply(lambda x: x[0]['box'])
  71. del df['position']
  72. df.text = df.apply(lambda row: parse_table(row['text']) if row['node_type'] == 'table' else row['text'], axis=1)
  73. if not pretty:
  74. return df
  75. title = pd.DataFrame(df.query(''' node_type == 'title' ''').to_dict('records'))
  76. title['title'] = title['text']
  77. title['page_number'] = title['pageno']
  78. title['level'] = title['para_type'].apply(lambda x: int(re.findall(r'\d+', x).pop()) if re.findall(r'\d+', x) else 99)
  79. # 结果输出
  80. outline = title.to_dict('records')
  81. title['seq_num'] = title.index
  82. # 结果输出
  83. title = title.to_dict('records')
  84. text_df = pd.DataFrame(df.query(''' node_type == 'text' ''').to_dict('records'))
  85. content_data = text_df.groupby('pageno')['text'].apply(lambda x: '\n'.join(x)).reset_index()
  86. content_data['page_number'] = content_data['pageno']
  87. # 结果输出
  88. contents = content_data.to_dict('records')
  89. table_data = pd.DataFrame(df.query(''' node_type == 'table' ''').to_dict('records'))
  90. table_data['table'] = table_data['text']
  91. table_data['table_name'] = ''
  92. table_data['page_numbers'] = table_data['pageno'].apply(lambda x: [x])
  93. # 结果输出
  94. tables = table_data.to_dict('records')
  95. return {"title": title, "outline": outline, "contents": contents, "tables": tables, "images": []}
  96. if __name__ == '__main__':
  97. with open('D:/Users/sprivacy/Documents/WeChat Files/wxid_uqa5354ji3ag22/FileStorage/File/2024-08/三峡左岸地坪商务标_合并_ocr.txt', 'r', encoding='utf-8') as fp:
  98. raw = json.load(fp)
  99. raw = get_ocr(raw)
  100. # for content in raw['file_content']:
  101. # print(content.keys())
  102. # print(content['page_num'] == 0)
  103. # print(content['page_size']['width'] == 595)
  104. # print(content['page_size']['height'] == 841)
  105. # print(content['page_angle'] == 0)
  106. # print(content['is_scan'] == False)
  107. # print(content['page_content']['sheetname'] == '')
  108. # print(content['page_content']['type'] == 'others')
  109. # for layout in content['page_content']['layout']:
  110. # print(layout['box'])
  111. # print(layout['type'])
  112. # print(layout['text'])
  113. # print(layout['children'])
  114. # print(layout['matrix'])
  115. # print(layout['merge_table'])
  116. # print(layout['node_id'])
  117. # break