parse_textmind_result.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-06-11 13:43:14
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-12-03 10:50:09
  6. import re
  7. import json
  8. import pandas as pd
  9. import os
  10. """
  11. textmind 结果解析
  12. [Node]
  13. node_id: int
  14. text: str
  15. node_type: <text|title|contents|head_tail|table|image>
  16. parent: int
  17. children: list
  18. para_type: <text|title_1|title_2|title_3|title_4|title_5|title_6|contents|head_tail|table|image>
  19. [position]
  20. pageno: int
  21. layout_index: int
  22. box: list
  23. """
  24. def json2json(path):
  25. _lines = open(path, 'r', encoding='utf-8').read()
  26. json_line = json.loads(_lines)
  27. return json_line
  28. def paese_content(layouts: list):
  29. ''' '''
  30. if not layouts:
  31. return pd.NA
  32. contents = []
  33. for layout in layouts:
  34. if layout['sub_type'] != 'table' or layout['sub_type'] != 'image' or layout['sub_type'] != 'seal':
  35. contents.append(layout['text'])
  36. return "".join(contents).replace('\n\n', '\n').replace(' ', '')
  37. def parse_table_name(tables: list, images: list, layouts: list):
  38. ''' '''
  39. if not tables:
  40. return pd.NA
  41. table_names = []
  42. for layout in layouts:
  43. if layout['sub_type'] == 'table_title' or layout['sub_type'] == 'head_tail':
  44. table_names.append(re.sub("\n| ", "", layout['text']))
  45. for image in images:
  46. for content_layouts in image['content_layouts']:
  47. if content_layouts['sub_type'] == 'table_title' or layout['sub_type'] == 'head_tail':
  48. table_names.append(re.sub("\n| ", "", content_layouts['text']))
  49. return ";".join(table_names)
  50. def parse_title(layouts: list):
  51. ''' 解析标题 '''
  52. if not layouts:
  53. return pd.NA
  54. for layout in layouts:
  55. if layout['type'] == 'title':
  56. return re.sub("\n", "", layout['text'])
  57. for layout in layouts:
  58. if layout['text']:
  59. return re.sub("\n", "", layouts[0]['text']) if 0 < len(layouts[0]['text']) < 15 else pd.NA
  60. def parse_table(markdown: str):
  61. table = []
  62. lines = markdown.split('\n')
  63. for line in lines:
  64. line = re.sub(r"\\n| ", "", line)
  65. table.append(line.strip('|').split('|'))
  66. return table
  67. def get_ocr_new(raw: dict, pretty: bool = False):
  68. '''解析textmind结果'''
  69. nodes = []
  70. for node in raw['pages']:
  71. del node['page_id']
  72. if not node['text']:
  73. continue
  74. nodes.append(node)
  75. df = pd.DataFrame(nodes)
  76. if not pretty:
  77. return df
  78. content_df = df.loc[:, ['page_num']]
  79. content_df['text'] = df['layouts'].apply(lambda x: paese_content(x))
  80. content_df = content_df.rename(columns={'page_num': 'page_number'})
  81. content_df.dropna(inplace=True)
  82. content = content_df.to_dict('records')
  83. title_df = df.loc[:, ['page_num']]
  84. title_df = title_df.rename(columns={'page_num': 'page_number'})
  85. title_df['title'] = df['layouts'].apply(lambda x: parse_title(x))
  86. title_df['parent'] = df['layouts'].apply(lambda x: x[0]['parent'] if x else pd.NA)
  87. title_df['node_type'] = df['layouts'].apply(lambda x: x[0]['type'] if x else pd.NA)
  88. title_df['para_type'] = df['layouts'].apply(lambda x: x[0]['sub_type'] if x else pd.NA)
  89. title_df['text'] = title_df['title']
  90. title_df.dropna(inplace=True)
  91. outline = title_df.to_dict('records')
  92. title_df['seq_num'] = title_df.index
  93. title = title_df.to_dict('records')
  94. table_df = df.loc[:, ['page_num']]
  95. table_df['page_num'] = table_df['page_num'].apply(lambda x: [x])
  96. table_df = table_df.rename(columns={'page_num': 'page_numbers'})
  97. table_df['table'] = df['tables'].apply(lambda x: parse_table(x[0]['markdown']) if x else pd.NA)
  98. table_df['table_name'] = df.apply(lambda x: parse_table_name(x['tables'], x['images'], x['layouts']), axis=1)
  99. table_df.dropna(inplace=True)
  100. table = table_df.to_dict('records')
  101. return {"title": title, "outline": outline, "contents": content, "tables": table, "images": []}
  102. if __name__ == '__main__':
  103. with open('D:\\desktop\\三峡水利\\data\\0预审查初审详审测试数据\\textmind_result\\安徽德通智联科技有限公司_textmind.txt', 'r', encoding='utf-8') as fp:
  104. raw = json.load(fp)
  105. data = get_ocr_new(raw=raw, pretty=True)
  106. print(data['tables'])
  107. # basepath = '/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/30份数据整理'
  108. # for save_file in os.listdir(basepath):
  109. # save_file_path = os.path.join(basepath, save_file)
  110. # for save_file_name in os.listdir(save_file_path):
  111. # if '投标文件' == save_file_name:
  112. # save_file_name_path = os.path.join(save_file_path,save_file_name)
  113. # textmind_save_dir = os.path.join(save_file_name_path,'textmind')
  114. # for bidder_name in os.listdir(textmind_save_dir):
  115. # if bidder_name[-13:] != 'textmind.json': continue
  116. # textmind_result_path = os.path.join(textmind_save_dir, bidder_name)
  117. # with open(textmind_result_path, 'r', encoding='utf-8') as fp:
  118. # raw = json.load(fp)
  119. # try:
  120. # raw = get_ocr_new(raw=raw, pretty=True)
  121. # for k, v in raw.items():
  122. # if k == 'title':
  123. # with open(f'{textmind_save_dir}/{bidder_name[:-5]}_title.json', 'w', encoding='utf-8') as fo:
  124. # json.dump(v, fo, ensure_ascii=False)
  125. # elif k == 'outline':
  126. # with open(f'{textmind_save_dir}/{bidder_name[:-5]}_outlines.json', 'w', encoding='utf-8') as fo:
  127. # json.dump(v, fo, ensure_ascii=False)
  128. # elif k == 'contents':
  129. # with open(f'{textmind_save_dir}/{bidder_name[:-5]}_content.json', 'w', encoding='utf-8') as fo:
  130. # json.dump(v, fo, ensure_ascii=False)
  131. # elif k == 'tables':
  132. # with open(f'{textmind_save_dir}/{bidder_name[:-5]}_tables.json', 'w', encoding='utf-8') as fo:
  133. # json.dump(v, fo, ensure_ascii=False)
  134. # except Exception:
  135. # print(textmind_result_path)
  136. # raise ValueError("stop")