parse_textmind_result.py

# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2024-06-11 13:43:14
# @Last Modified by: privacy
# @Last Modified time: 2024-09-27 14:08:30
import os
import re
import json

import pandas as pd
  10. """
  11. textmind 结果解析
  12. [Node]
  13. node_id: int
  14. text: str
  15. node_type: <text|title|contents|head_tail|table|image>
  16. parent: int
  17. children: list
  18. para_type: <text|title_1|title_2|title_3|title_4|title_5|title_6|contents|head_tail|table|image>
  19. [position]
  20. pageno: int
  21. layout_index: int
  22. box: list
  23. """


def json2json(path):
    '''Load a textmind JSON result file and return the parsed object.'''
    with open(path, 'r', encoding='utf-8') as fp:
        return json.loads(fp.read())


def paese_content(layouts: list):
    '''Join the text of a page's layouts, skipping tables, images, seals and headers/footers.'''
    if not layouts:
        return pd.NA
    contents = []
    for layout in layouts:
        # Keep only textual layouts.
        if layout['type'] not in ('table', 'image', 'seal', 'head_tail'):
            if not layout['text']:
                continue
            contents.append(layout['text'])
    return "\n".join(contents).replace('\n\n', '\n').replace(' ', '')
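
# Hypothetical example: for layouts like
#   [{'type': 'text', 'text': '第一章\n\n总则 '}, {'type': 'table', 'text': '...'}]
# this would return '第一章\n总则' (the table is skipped, blank runs and spaces removed).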


def parse_table_name(tables: list, layouts: list):
    '''Infer names for a page's tables from the surrounding layout texts.'''
    if not tables:
        return pd.NA
    node_dict = {}
    for layout in layouts:
        if not layout['children']:
            continue
        node_dict[layout['text']] = layout['children']  # one-to-one mapping from layout text to its children ids
    table_ids = []
    for table in tables:
        table_ids.append({'layout_id': table['layout_id']})
    table_names = []
    for table_id in table_ids:
        layout_id = table_id['layout_id']
        for text, children in node_dict.items():
            if layout_id in children:
                table_names.append(text)
    if not table_names:
        # Fallback: look among the layouts preceding the table for one whose text contains '表'.
        layout_ids = []
        for layout in layouts:
            layout_ids.append({layout['layout_id']: layout['text']})
        table_layout_ids = []
        for table in tables:
            table_layout_ids.append({'layout_id': table['layout_id']})
        index_ = 0
        for table_layout_id in table_layout_ids:
            for layout_id in layout_ids:
                if table_layout_id['layout_id'] in layout_id:
                    index_ = layout_ids.index(layout_id)
                    break
        for ids in layout_ids[:index_]:
            for value in ids.values():
                if '表' in value:
                    table_names.append(value)
        if not table_names and index_ > 0:
            table_names.append(list(layout_ids[index_ - 1].values())[0])
    return ";".join(table_names)


def parse_title(layouts: list):
    '''Parse the title of a page: prefer title layouts, else fall back to a short text layout.'''
    if not layouts:
        return pd.NA
    for layout in layouts:
        if (layout['type'] == 'title' or 'title' in layout['sub_type']) and layout['text'] and layout['type'] != 'head_tail':
            text = re.sub("\n", "", layout['text'])
            if not text:
                continue
            return text
    for layout in layouts:
        if not (layout['type'] == 'text' and layout['text']):
            continue
        text = re.sub("\n", "", layout['text'])
        if text and len(text) < 30:
            return re.sub("\n", "", text)
    return pd.NA


def parse_table(markdown: str):
    '''Split a markdown table string into a list of row cell lists.'''
    table = []
    lines = markdown.split('\n')
    for line in lines:
        line = re.sub(r"\\n| ", "", line)  # drop literal "\n" sequences and spaces inside cells
        table.append(line.strip('|').split('|'))
    return table
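
# Hypothetical example: parse_table("|名称|数量|\n|水泥|10|")
# would return [['名称', '数量'], ['水泥', '10']].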


def get_ocr_new(raw: dict, pretty: bool = False):
    '''Parse a textmind result; return a DataFrame, or structured records when pretty=True.'''
    nodes = []
    for node in raw['pages']:
        del node['page_id']
        if not node['text']:
            continue
        nodes.append(node)
    df = pd.DataFrame(nodes)
    if not pretty:
        return df
    # Per-page plain-text contents.
    content_df = df.loc[:, ['page_num']]
    content_df['text'] = df['layouts'].apply(lambda x: paese_content(x))
    content_df = content_df.rename(columns={'page_num': 'page_number'})
    content_df.dropna(inplace=True)
    content = content_df.to_dict('records')
    # Per-page titles: outline records first, then the same records with a sequence number.
    title_df = df.loc[:, ['page_num']]
    title_df = title_df.rename(columns={'page_num': 'page_number'})
    title_df['title'] = df['layouts'].apply(lambda x: parse_title(x))
    title_df['box'] = df['layouts'].apply(lambda x: x[0]['position'] if x else pd.NA)
    # title_df['box'] = df[df['layouts'].apply(lambda x: x[0]['position'] if x else False)]
    title_df['node_type'] = df['layouts'].apply(lambda x: x[0]['type'] if x else pd.NA)
    title_df['para_type'] = df['layouts'].apply(lambda x: x[0]['sub_type'] if x else pd.NA)
    title_df['text'] = title_df['title']
    title_df.dropna(inplace=True)
    outline = title_df.to_dict('records')
    # print(outline[:2])
    title_df['seq_num'] = title_df.index
    title = title_df.to_dict('records')
    # print(title[:2])
    # Per-page tables and their inferred names.
    table_df = df.loc[:, ['page_num']]
    table_df['page_num'] = table_df['page_num'].apply(lambda x: [x])
    table_df = table_df.rename(columns={'page_num': 'page_numbers'})
    table_df['table'] = df['tables'].apply(lambda x: parse_table(x[0]['markdown']) if x else pd.NA)
    table_df['table_name'] = df.apply(lambda x: parse_table_name(x['tables'], x['layouts']), axis=1)
    table_df.dropna(inplace=True)
    table = table_df.to_dict('records')
    # print(table[:2])
    return {"title": title, "outline": outline, "contents": content, "tables": table, "images": []}


def run():
    basepath = '/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/30份数据整理'
    for save_file in os.listdir(basepath):
        save_file_path = os.path.join(basepath, save_file)
        for save_file_name in os.listdir(save_file_path):
            if '投标文件' == save_file_name:
                save_file_name_path = os.path.join(save_file_path, save_file_name)
                textmind_save_dir = os.path.join(save_file_name_path, 'textmind')
                if not os.path.exists(textmind_save_dir):
                    continue
                for bidder_name in os.listdir(textmind_save_dir):
                    if 'textmind.json' not in bidder_name[-13:]:
                        continue
                    textmind_result_path = os.path.join(textmind_save_dir, bidder_name)
                    print("textmind_result_path ", textmind_result_path)
                    with open(textmind_result_path, 'r', encoding='utf-8') as fp:
                        raw = json.load(fp)
                    try:
                        raw = get_ocr_new(raw=raw, pretty=True)
                        for k, v in raw.items():
                            if k == 'title':
                                with open(f'{textmind_save_dir}/{bidder_name[:-5]}_title.json', 'w', encoding='utf-8') as fo:
                                    json.dump(v, fo, ensure_ascii=False)
                            elif k == 'outline':
                                with open(f'{textmind_save_dir}/{bidder_name[:-5]}_outlines.json', 'w', encoding='utf-8') as fo:
                                    json.dump(v, fo, ensure_ascii=False)
                            elif k == 'contents':
                                with open(f'{textmind_save_dir}/{bidder_name[:-5]}_content.json', 'w', encoding='utf-8') as fo:
                                    json.dump(v, fo, ensure_ascii=False)
                            elif k == 'tables':
                                with open(f'{textmind_save_dir}/{bidder_name[:-5]}_tables.json', 'w', encoding='utf-8') as fo:
                                    json.dump(v, fo, ensure_ascii=False)
                    except Exception:
                        print(textmind_result_path)
                        raise ValueError("stop")


def parse_datasets():
    base_dir = '/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4'
    # pre_parse_datasets = []
    for base_folders in os.listdir(base_dir):
        base_folder = os.path.join(base_dir, base_folders)
        folder_info = {}
        for folders in os.listdir(base_folder):
            folder = os.path.join(base_folder, folders)
            if folders == "招标文件":
                for file in os.listdir(folder):
                    if file.endswith(".pdf"):
                        projectName = file.split(".")[0]  # filename with the extension removed
                        tender_file = os.path.join(folder, file)
                        # folder_info["projectName"] = projectName
                        # folder_info["buyFile"] = tender_file
            elif folders == '投标文件':
                # folder_info["bidder_info"] = []
                print("folder:", folder)
                for file in os.listdir(folder):
                    # if file.endswith(".pdf"):
                    #     bidderUnit = file.split(".")[0]  # filename with the extension removed
                    #     bidder_file = os.path.join(folder, file)
                    #     folder_info["bidder_info"].append({"bidderUnit": bidderUnit, "bidderFile": bidder_file})
                    if file == 'textmind':
                        textmind_result_path = os.path.join(folder, file)
                        for textmind_json in os.listdir(textmind_result_path):
                            if '_textmind' not in textmind_json:
                                continue
                            bidderUnit = textmind_json.split("_")[0]  # from <bidder>_textmind.json
                            textmind_file_path = os.path.join(textmind_result_path, textmind_json)
                            with open(textmind_file_path, 'r', encoding='utf-8') as fp:
                                raw = json.load(fp)
                            try:
                                raw = get_ocr_new(raw=raw, pretty=True)
                                for k, v in raw.items():
                                    if k == 'title':
                                        with open(f'{textmind_result_path}/{bidderUnit}_bidding_title.json', 'w', encoding='utf-8') as fo:
                                            json.dump(v, fo, ensure_ascii=False)
                                    elif k == 'outline':
                                        with open(f'{textmind_result_path}/{bidderUnit}_bidding_outlines.json', 'w', encoding='utf-8') as fo:
                                            json.dump(v, fo, ensure_ascii=False)
                                    elif k == 'contents':
                                        with open(f'{textmind_result_path}/{bidderUnit}_bidding_content.json', 'w', encoding='utf-8') as fo:
                                            json.dump(v, fo, ensure_ascii=False)
                                    elif k == 'tables':
                                        with open(f'{textmind_result_path}/{bidderUnit}_bidding_tables.json', 'w', encoding='utf-8') as fo:
                                            json.dump(v, fo, ensure_ascii=False)
                            except Exception:
                                print(textmind_result_path)
                                raise ValueError("stop")
        # pre_parse_datasets.append(folder_info)
        # iterate ahead of time to build the save-folder contents
        # pre_parse_datasets


if __name__ == '__main__':
    run()
    # parse_datasets()