# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2024-06-11 13:43:14
# @Last Modified by:   privacy
# @Last Modified time: 2024-12-02 17:02:16
"""Small client script for a document-parser HTTP service.

The helpers below create a parsing task for a local file, poll the task
status, and download the parse result once the task reports success.
The statements at the bottom of the file run immediately when the module is
executed or imported.
"""
import os
import time
import json
import base64

import requests


def create_task(url, file_path, file_url):
    """Create a parsing task by uploading the file as a multipart request.

    Args:
        url: string, service request URL.
        file_path: path of the local file to upload.
        file_url: link to the file; accepted for interface compatibility but
            not used by this function.

    Returns:
        The decoded JSON body of the response.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(file_path, 'rb') as f:
        file_bytes = f.read()

    file_name = os.path.basename(file_path)

    # File part of the request: (filename, content, content type of the part).
    body = {
        "file": (file_name, file_bytes, "multipart/form-data"),
    }

    data = {
        "file_name": file_name,
        "return_para_nodes": True
    }

    response = requests.post(url, data=data, files=body)
    return response.json()


def create_task_1(url, file_path, file_url):
    """Create a parsing task by sending the file base64-encoded in a form body.

    Args:
        url: string, service request URL.
        file_path: path of the local file to send.
        file_url: link to the file; accepted for interface compatibility but
            not used by this function.

    Returns:
        The decoded JSON body of the response.
    """
    # File content, base64-encoded.
    with open(file_path, "rb") as f:
        file_data = base64.b64encode(f.read())

    data = {
        "file_data": file_data,
        # "file_url": file_url,
        "file_name": os.path.basename(file_path)
    }

    # Document chunking parameters, optional.
    # return_doc_chunks = json.dumps({"switch": True, "chunk_size": -1})
    # data["return_doc_chunks"] = return_doc_chunks

    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    response = requests.post(url, headers=headers, data=data)
    return response.json()


def query_task(url, task_id):
    """Query a task's status, sending the task id as form data and as files.

    Args:
        url: string, request URL.
        task_id: string, task id.

    Returns:
        The decoded JSON body of the response.
    """
    data = {
        "task_id": task_id
    }
    # NOTE(review): the same dict is passed as `files`, presumably to make
    # requests encode the body as multipart/form-data -- confirm this is
    # what the endpoint expects.
    response = requests.post(url, data=data, files=data)
    return response.json()


def query_task_1(url, task_id):
    """Query a task's status with a URL-encoded form body.

    Args:
        url: string, request URL.
        task_id: string, task id.

    Returns:
        The decoded JSON body of the response.
    """
    data = {
        "task_id": task_id
    }
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    response = requests.post(url, headers=headers, data=data)
    return response.json()


def request1(bidderFile, nums: int = 1):
    """Create a parsing task for ``bidderFile``, retrying on any failure.

    Every failed attempt (an exception, or an empty ``task_id`` in the
    response) is printed, followed by a 10 second sleep. ``nums`` is then
    incremented, and once it exceeds 100 the function gives up.

    Args:
        bidderFile: path of the local file to submit.
        nums: attempt counter to start from.

    Returns:
        The task id taken from ``response['result']['task_id']``, or ``None``
        when the retries are exhausted.
    """
    # Iterative retry: same attempts, prints and sleeps as the former
    # recursive version, without nesting one stack frame per retry.
    while True:
        try:
            response = create_task_1(request_host, bidderFile, "")
            print('res1 :', response)
            task_id = response['result']['task_id']
            if not task_id:
                raise ValueError('task_id is None')
            return task_id
        except Exception as e:
            print("request1 :", e)
            time.sleep(10)
            nums += 1
            if nums > 100:
                return None


def request2(task_id, nums: int = 1, max_nums: int = 50):
    """Poll a task until it succeeds, then download its parse result.

    A poll whose status is not ``'success'`` advances the counter by 1; a
    poll (or download) that raises advances it by 10, so repeated errors
    exhaust the budget faster.

    Args:
        task_id: string, id of the task to poll.
        nums: poll counter to start from. It is honoured as passed; it used
            to be reset to 1 unconditionally, which ignored the argument.
        max_nums: polling stops once the counter reaches this value.

    Returns:
        The ``requests`` response for the parse-result URL, with its encoding
        set to ``'utf-8'``, or ``None`` if the counter reaches ``max_nums``
        before the task succeeds.
    """
    # NOTE(review): there is no delay between polls; consider sleeping
    # between iterations if the service needs time to finish the task.
    while nums < max_nums:
        try:
            resp = query_task_1(request_query_host, task_id)
            print('res2 :', resp)
            result = resp['result']
            if result['status'] == 'success':
                response = requests.get(result['parse_result_url'])
                response.encoding = 'utf-8'
                # response.json()
                return response
        except Exception as e:
            # Report the failure instead of swallowing it silently.
            print("request2 :", e)
            nums += 10
        else:
            nums += 1
    return None


# NOTE(review): the access token is hard-coded in the source; consider
# loading it from configuration or the environment instead.
token = "24.87693e5dd8c2d7d7accf260bb2d265d2.2592000.1733970962.282335-86574608"

# request_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task?access_token={token}"
request_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task?access_token={token}"  # updated

# request_query_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task/query?access_token={token}"
request_query_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task/query?access_token={token}"  # updated

# PDF file used for testing
# file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/结果测试数据/1-监控系统自主可控大型PLC适配研究_中控技术产品_采购程序文件_2/投标文件/北京中天华拓工程技术有限公司-投标文件.pdf"
# file_path = "data/zhaocai_datasets/30份数据整理/1-2021_2022年三峡电站左岸厂房中央空调系统主机设备改造/投标文件/广东申菱环境系统股份有限公司.pdf"
# file_path = r'..\data\0预审查初审详审测试数据\三峡左右岸电站厂房清洁水系统改造\湖北海光安全技术工程有限公司_T221100130656\海光 投标文件-PDF.pdf'
file_path = r'..\data\浙江国迈建设集团有限公司技术文件.pdf'

# time.sleep(5)
# task_id = request1(file_path)
# print('1 :', task_id)

task_id = 'task-T0SnDf9Be4QuKemA8apzkgbq5jK7J9fG'

time.sleep(10)
# NOTE(review): request2 returns None when polling is exhausted, in which
# case the .json() call below raises AttributeError.
response = request2(task_id)
print('2 file_name :', response.json()['file_name'])

# Save the textmind parse result
# with open(r'..\data\0预审查初审详审测试数据\textmind_result\三峡左右岸电站厂房清洁水系统改造\湖北海光安全技术工程有限公司.json', 'w', encoding='utf-8') as fp:
#     json.dump(response.json(), fp, indent=4, ensure_ascii=False)