Przeglądaj źródła

parse_textmind_result.py textmind解析、textmind_ocr.py textmind ocr请求

lfygithub01 9 miesięcy temu
rodzic
commit
c1f439184a
2 zmienionych plików z 288 dodań i 0 usunięć
  1. 155 0
      parse_textmind_result.py
  2. 133 0
      textmind_ocr.py

+ 155 - 0
parse_textmind_result.py

@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+# @Author: privacy
+# @Date:   2024-06-11 13:43:14
+# @Last Modified by:   privacy
+# @Last Modified time: 2024-09-27 14:08:30
+import re
+import json
+import pandas as pd
+import os
+
+"""
+textmind 结果解析
+[Node]
+    node_id: int
+    text: str
+    node_type: <text|title|contents|head_tail|table|image>
+    parent: int
+    children: list
+    para_type: <text|title_1|title_2|title_3|title_4|title_5|title_6|contents|head_tail|table|image>
+    [position]
+        pageno: int
+        layout_index: int
+        box: list
+"""
+
+
+
def json2json(path):
    """Load and return the JSON document stored at *path*.

    Args:
        path: filesystem path to a UTF-8 encoded JSON file.

    Returns:
        The parsed JSON value (dict/list/...).
    """
    # Use a context manager so the file handle is closed promptly
    # (the original left the handle open until garbage collection).
    with open(path, 'r', encoding='utf-8') as fp:
        return json.load(fp)
+
def paese_content(layouts: list):
    """Concatenate the text of all non-table/image/seal layouts.

    Args:
        layouts: list of textmind layout dicts carrying 'sub_type' and 'text'.

    Returns:
        The joined text with blank lines collapsed and spaces removed,
        or ``pd.NA`` when *layouts* is empty.
    """
    if not layouts:
        return pd.NA
    contents = []
    for layout in layouts:
        # BUG FIX: the original chained `!=` comparisons with `or`, which
        # is always True, so table/image/seal text was never skipped.
        if layout['sub_type'] not in ('table', 'image', 'seal'):
            contents.append(layout['text'])
    return "".join(contents).replace('\n\n', '\n').replace(' ', '')
+
def parse_table_name(tables: list, images: list, layouts: list):
    """Collect candidate table names for a page.

    Gathers the text of every 'table_title' / 'head_tail' layout, both
    from the page's direct layouts and from layouts nested inside images.

    Args:
        tables: the page's table list; used only as an existence gate.
        images: image dicts, each with a 'content_layouts' list.
        layouts: the page's layout dicts.

    Returns:
        Names joined with ';' (newlines/spaces stripped), or ``pd.NA``
        when the page has no tables.
    """
    if not tables:
        return pd.NA
    table_names = []
    for layout in layouts:
        if layout['sub_type'] in ('table_title', 'head_tail'):
            table_names.append(re.sub("\n| ", "", layout['text']))
    for image in images:
        for content_layout in image['content_layouts']:
            # BUG FIX: the original tested the stale `layout` variable
            # from the loop above here (and raised NameError whenever
            # `layouts` was empty); test the nested layout instead.
            if content_layout['sub_type'] in ('table_title', 'head_tail'):
                table_names.append(re.sub("\n| ", "", content_layout['text']))
    return ";".join(table_names)
+                
def parse_title(layouts: list):
    """Extract a page title from textmind layouts.

    Prefers the first layout of type 'title'; otherwise falls back to the
    first layout with non-empty text, accepted only when it is short
    (< 15 characters), since longer text is unlikely to be a title.

    Returns:
        The title with newlines removed, or ``pd.NA`` when none is found.
    """
    if not layouts:
        return pd.NA
    for layout in layouts:
        if layout['type'] == 'title':
            return re.sub("\n", "", layout['text'])
    for layout in layouts:
        if layout['text']:
            # BUG FIX: the original inspected layouts[0] here instead of
            # the matching layout, so a page whose first layout had empty
            # text returned '' rather than the actual candidate title.
            return re.sub("\n", "", layout['text']) if len(layout['text']) < 15 else pd.NA
    # No layout carried any text at all.
    return pd.NA
def parse_table(markdown: str):
    """Convert a markdown table string into a list of cell rows.

    Each line becomes one row: literal ``\\n`` escape sequences and
    spaces are removed, the outermost pipes are stripped, and the
    remaining text is split on ``|`` into cells.
    """
    cleanup = re.compile(r"\\n| ")
    return [
        cleanup.sub("", row).strip('|').split('|')
        for row in markdown.split('\n')
    ]
def get_ocr_new(raw:dict, pretty: bool = False):
    '''Parse a textmind OCR result.

    Args:
        raw: decoded textmind JSON. ``raw['pages']`` is a list of page
            dicts; usage below shows keys 'page_id', 'page_num', 'text',
            'layouts', 'tables' and 'images' — confirm against the
            textmind response schema for anything further.
        pretty: when False, return the filtered pages as a DataFrame;
            when True, post-process into per-section record lists.

    Returns:
        A ``pd.DataFrame`` of non-empty pages (pretty=False), or a dict
        with keys 'title', 'outline', 'contents', 'tables', 'images'
        (pretty=True; 'images' is always an empty list).
    '''
    nodes = []
    for node in raw['pages']:
        del node['page_id']  # NOTE: mutates the caller's page dicts in place
        if not node['text']: continue  # drop pages with no recognized text
        nodes.append(node)

    df = pd.DataFrame(nodes)

    if not pretty:
        return df

    # Contents: concatenated layout text per page (see paese_content).
    content_df = df.loc[:,['page_num']]
    content_df['text'] = df['layouts'].apply(lambda x: paese_content(x))
    content_df = content_df.rename(columns={'page_num':'page_number'})
    content_df.dropna(inplace=True)

    content = content_df.to_dict('records')

    # Titles: title text plus position/type metadata taken from the
    # page's first layout; rows missing any field are dropped.
    title_df = df.loc[:,['page_num']]
    title_df = title_df.rename(columns={'page_num':'page_number'})
    title_df['title'] = df['layouts'].apply(lambda x: parse_title(x))
    title_df['box'] = df['layouts'].apply(lambda x: x[0]['position'] if x else pd.NA)
    title_df['node_type'] = df['layouts'].apply(lambda x: x[0]['type'] if x else pd.NA)
    title_df['para_type'] = df['layouts'].apply(lambda x: x[0]['sub_type'] if x else pd.NA)
    title_df['text'] = title_df['title']

    title_df.dropna(inplace=True)

    # 'outline' is snapshotted before seq_num is added, so outline
    # records have no 'seq_num' while 'title' records do.
    outline = title_df.to_dict('records')
    # print(outline[:2])

    title_df['seq_num'] = title_df.index
    title = title_df.to_dict('records')
    # print(title[:2])

    # Tables: only the FIRST table's markdown per page is parsed.
    table_df = df.loc[:,['page_num']]
    table_df['page_num'] = table_df['page_num'].apply(lambda x: [x])  # wrap as a one-element list
    table_df = table_df.rename(columns={'page_num':'page_numbers'})
    table_df['table'] = df['tables'].apply(lambda x: parse_table(x[0]['markdown']) if x else pd.NA)
    table_df['table_name'] = df.apply(lambda x: parse_table_name(x['tables'], x['images'], x['layouts']), axis=1)
    table_df.dropna(inplace=True)

    table = table_df.to_dict('records')
    # print(table[:2])


    return {"title": title, "outline": outline, "contents": content, "tables": table, "images": []}
+ 
+
if __name__ == '__main__':
    # Walk <basepath>/<project>/投标文件/textmind/*textmind.json and write
    # the parsed sections next to each source file.
    basepath = '/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/30份数据整理'
    # Result-section key -> output filename suffix ('images' is always
    # empty and is deliberately not written, matching the original).
    suffix_map = {
        'title': 'title',
        'outline': 'outlines',
        'contents': 'content',
        'tables': 'tables',
    }
    for save_file in os.listdir(basepath):
        save_file_path = os.path.join(basepath, save_file)
        for save_file_name in os.listdir(save_file_path):
            if '投标文件' == save_file_name:
                save_file_name_path = os.path.join(save_file_path, save_file_name)
                textmind_save_dir = os.path.join(save_file_name_path, 'textmind')
                for bidder_name in os.listdir(textmind_save_dir):
                    # Only raw textmind result files are processed.
                    if not bidder_name.endswith('textmind.json'):
                        continue
                    textmind_result_path = os.path.join(textmind_save_dir, bidder_name)
                    with open(textmind_result_path, 'r', encoding='utf-8') as fp:
                        raw = json.load(fp)
                    try:
                        parsed = get_ocr_new(raw=raw, pretty=True)
                        for k, v in parsed.items():
                            if k not in suffix_map:
                                continue
                            out_path = f'{textmind_save_dir}/{bidder_name[:-5]}_{suffix_map[k]}.json'
                            with open(out_path, 'w', encoding='utf-8') as fo:
                                json.dump(v, fo, ensure_ascii=False)
                    except Exception:
                        # BUG FIX: the original used a bare `except:` and then
                        # raised ValueError("stop"), discarding the real
                        # traceback; log the offending file and re-raise.
                        print(textmind_result_path)
                        raise

+ 133 - 0
textmind_ocr.py

@@ -0,0 +1,133 @@
+
+import requests, os, time, json, base64
+
def create_task(url, file_path, file_url):
    """Submit a document-parse task via a multipart file upload.

    Args:
        url: string, service endpoint.
        file_path: local path of the file to upload.
        file_url: unused by this variant; kept for signature parity
            with ``create_task_1``.

    Returns:
        The decoded JSON response.
    """
    # Read inside a context manager so the handle is closed promptly
    # (the original leaked the open file until garbage collection).
    with open(file_path, 'rb') as fh:
        file = fh.read()

    # multipart file part
    body = {
        "file": (os.path.basename(file_path), file, "multipart/form-data"),
    }

    data = {
        "file_name": os.path.basename(file_path),
        "return_para_nodes": True
    }

    response = requests.post(url, data=data, files=body)
    return response.json()
+
def create_task_1(url, file_path, file_url):
    """Submit a document-parse task as a form-encoded POST.

    Args:
        url: string, service endpoint.
        file_path: local file whose bytes are sent base64-encoded.
        file_url: remote file link forwarded to the service.

    Returns:
        The decoded JSON response.
    """
    # Base64-encode the file payload for the form body.
    with open(file_path, "rb") as f:
        file_data = base64.b64encode(f.read())

    payload = {
        "file_data": file_data,
        "file_url": file_url,
        "file_name": os.path.basename(file_path)
    }

    # Optional document-chunking switch (not required by the API):
    # payload["return_doc_chunks"] = json.dumps({"switch": True, "chunk_size": -1})

    resp = requests.post(
        url,
        headers={'Content-Type': 'application/x-www-form-urlencoded'},
        data=payload,
    )
    return resp.json()
+
+
def query_task(url, task_id):
    """Poll the (v1) parser service for a task's status/result.

    Args:
        url: string, query endpoint.
        task_id: string, task id returned by ``create_task``.

    Returns:
        The decoded JSON response.
    """
    data = {
        "task_id": task_id
    }

    # NOTE(review): passing the same dict as both `data` and `files`
    # forces a multipart/form-data body; presumably what this endpoint
    # expects — confirm against the service's API documentation.
    response = requests.post(url, data=data, files=data)
    return response.json()
+
def query_task_1(url, task_id):
    """Poll the (v2) parser service for a task's status/result.

    Args:
        url: string, query endpoint.
        task_id: string, task id returned by ``create_task_1``.

    Returns:
        The decoded JSON response.
    """
    resp = requests.post(
        url,
        headers={'Content-Type': 'application/x-www-form-urlencoded'},
        data={"task_id": task_id},
    )
    return resp.json()
+
def request1(bidderFile, nums: int = 1):
    """Create a parse task for *bidderFile*, retrying every 10s on failure.

    Args:
        bidderFile: local path of the document to submit.
        nums: current attempt number; retries stop after 100 attempts.

    Returns:
        The task id, or None once the retry budget is exhausted.
    """
    # Iterative retry loop (the original recursed on itself).
    while True:
        try:
            response = create_task_1(request_host, bidderFile, "")
            print('res1  :', response)
            task_id = response['result']['task_id']
            if not task_id:
                raise ValueError('task_id is None')
            return task_id
        except Exception as e:
            print("request1 :", e)
            time.sleep(10)
            nums += 1
            if nums > 100:
                return None
+    
def request2(task_id, nums: int = 1):
    """Fetch the parse result for *task_id*, retrying every 20s on failure.

    Args:
        task_id: id returned by ``request1``.
        nums: current attempt number; retries stop after 500 attempts.

    Returns:
        The HTTP response holding the parse result, or None once the
        retry budget is exhausted.
    """
    # Iterative retry loop (the original recursed on itself).
    while True:
        try:
            resp = query_task_1(request_query_host, task_id)
            print('res2  :', resp)
            result_url = resp['result']['parse_result_url']
            response = requests.get(result_url)
            response.encoding = 'utf-8'
            response.json()  # validate that the payload parses as JSON
            return response
        except Exception as e:
            print("request2 :", e)
            time.sleep(20)
            nums += 1
            if nums > 500:
                return None
+
+
# WARNING(review): hard-coded access token checked into source control —
# rotate it and load from an environment variable or config instead.
token = "24.87693e5dd8c2d7d7accf260bb2d265d2.2592000.1733970962.282335-86574608"
# v1 endpoint kept for reference:
# request_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task?access_token={token}"
request_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task?access_token={token}"  # updated to v2
# v1 query endpoint kept for reference:
# request_query_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task/query?access_token={token}"
request_query_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task/query?access_token={token}"  # updated to v2
# Test PDF (earlier sample kept for reference):
# file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/结果测试数据/1-监控系统自主可控大型PLC适配研究_中控技术产品_采购程序文件_2/投标文件/北京中天华拓工程技术有限公司-投标文件.pdf"
file_path = "data/zhaocai_datasets/30份数据整理/1-2021_2022年三峡电站左岸厂房中央空调系统主机设备改造/投标文件/广东申菱环境系统股份有限公司.pdf"

# time.sleep(5)
# Submit the document, then poll for the parse result.
task_id = request1(file_path)
print('1   :',task_id)

time.sleep(10)
# NOTE(review): request2 returns None once its retry budget is exhausted,
# in which case the .json() calls below raise AttributeError — confirm
# whether that crash-on-failure behavior is intended.
response = request2(task_id)
print('2 file_name :',response.json()['file_name'])

# Persist the textmind parse result.
with open('data/预审查数据/textmind_result/2021_2022年三峡电站左岸厂房中央空调系统主机设备改造_广东申菱环境系统股份有限公司.json', 'w', encoding='utf-8') as fp:
    json.dump(response.json(), fp, indent=4, ensure_ascii=False)