"""Submit a local document to the Baidu document-parser service and save the result.

Flow: create a parse task for a local file, poll the query endpoint until a
result URL is available, download the JSON result and write it to disk.
"""
import base64
import json
import os
import time

import requests


def create_task(url, file_path, file_url):
    """Create a parse task by uploading the file as multipart/form-data.

    Args:
        url: Service request URL.
        file_path: Path of the local file to upload.
        file_url: Remote file link. Unused by this variant; kept so the
            signature matches ``create_task_1``.

    Returns:
        The decoded JSON body of the service response.
    """
    # Read inside a context manager so the handle is closed deterministically.
    with open(file_path, 'rb') as f:
        file_content = f.read()

    file_name = os.path.basename(file_path)
    body = {
        "file": (file_name, file_content, "multipart/form-data"),
    }
    data = {
        "file_name": file_name,
        "return_para_nodes": True
    }

    response = requests.post(url, data=data, files=body)
    return response.json()


def create_task_1(url, file_path, file_url):
    """Create a parse task by sending the file base64-encoded in a form body.

    Args:
        url: Service request URL.
        file_path: Path of the local file; its content is base64-encoded.
        file_url: Remote file link, sent as the ``file_url`` form field.

    Returns:
        The decoded JSON body of the service response.
    """
    with open(file_path, "rb") as f:
        file_data = base64.b64encode(f.read())

    data = {
        "file_data": file_data,
        "file_url": file_url,
        "file_name": os.path.basename(file_path)
    }
    # Optional document-chunking parameter (not required), e.g.:
    #   data["return_doc_chunks"] = json.dumps({"switch": True, "chunk_size": -1})

    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    response = requests.post(url, headers=headers, data=data)
    return response.json()


def query_task(url, task_id):
    """Query a parse task, sending ``task_id`` in a multipart request.

    Args:
        url: Query request URL.
        task_id: Identifier of the task to query.

    Returns:
        The decoded JSON body of the service response.
    """
    data = {
        "task_id": task_id
    }
    # NOTE(review): the same dict is passed as both ``data`` and ``files``;
    # presumably this is to force a multipart/form-data body - confirm.
    response = requests.post(url, data=data, files=data)
    return response.json()


def query_task_1(url, task_id):
    """Query a parse task, sending ``task_id`` as a urlencoded form field.

    Args:
        url: Query request URL.
        task_id: Identifier of the task to query.

    Returns:
        The decoded JSON body of the service response.
    """
    data = {
        "task_id": task_id
    }
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    response = requests.post(url, headers=headers, data=data)
    return response.json()


def request1(bidderFile, nums: int = 1):
    """Create a parse task for ``bidderFile``, retrying on any failure.

    Each failed attempt is printed, followed by a 10 second pause. The
    attempt counter starts at ``nums`` and retrying stops once it exceeds 100.

    Args:
        bidderFile: Path of the local file to submit.
        nums: Starting value of the attempt counter.

    Returns:
        The task id from ``response['result']['task_id']``, or ``None`` when
        all attempts failed.
    """
    # A loop replaces the original recursive retry: same attempts, sleeps and
    # output, but no growth of the call stack or exception chain.
    while True:
        try:
            response = create_task_1(request_host, bidderFile, "")
            print('res1 :', response)
            task_id = response['result']['task_id']
            if not task_id:
                raise ValueError('task_id is None')
            return task_id
        except Exception as e:
            # Deliberately broad: network errors, malformed JSON and missing
            # keys are all treated as "try again".
            print("request1 :", e)
            time.sleep(10)
            nums += 1
            if nums > 100:
                return None


def request2(task_id, nums: int = 1):
    """Poll the query endpoint until the parse result can be downloaded.

    Each failed attempt is printed, followed by a 20 second pause. The
    attempt counter starts at ``nums`` and polling stops once it exceeds 500.

    Args:
        task_id: Identifier returned when the task was created.
        nums: Starting value of the attempt counter.

    Returns:
        The ``requests.Response`` of the result download (its body has been
        checked to decode as JSON), or ``None`` when all attempts failed.
    """
    while True:
        try:
            resp = query_task_1(request_query_host, task_id)
            print('res2 :', resp)
            url = resp['result']['parse_result_url']
            response = requests.get(url)
            response.encoding = 'utf-8'
            # Decoded only as a check: a non-JSON body raises and triggers a retry.
            response.json()
            return response
        except Exception as e:
            # Deliberately broad: any failure (no result URL yet, network
            # error, non-JSON body) leads to another poll.
            print("request2 :", e)
            time.sleep(20)
            nums += 1
            if nums > 500:
                return None


# NOTE(review): an access token is committed in source. The literal is kept as
# the fallback so existing runs behave the same, but it should be rotated and
# supplied through the BAIDU_ACCESS_TOKEN environment variable instead.
token = os.environ.get(
    "BAIDU_ACCESS_TOKEN",
    "24.87693e5dd8c2d7d7accf260bb2d265d2.2592000.1733970962.282335-86574608",
)
# Updated v2 endpoints. The previous v1 endpoints used the paths
# /file/2.0/brain/online/v1/parser/task and .../parser/task/query.
request_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task?access_token={token}"
request_query_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task/query?access_token={token}"

# PDF file used for the test run.
file_path = "data/zhaocai_datasets/30份数据整理/1-2021_2022年三峡电站左岸厂房中央空调系统主机设备改造/投标文件/广东申菱环境系统股份有限公司.pdf"

task_id = request1(file_path)
print('1 :', task_id)
if task_id is None:
    # Fail fast instead of polling the query endpoint with no task id.
    raise RuntimeError("task creation failed: no task_id obtained")

time.sleep(10)
response = request2(task_id)
if response is None:
    raise RuntimeError("parse result could not be fetched for task_id %s" % task_id)

# Decode once and reuse for both the log line and the saved file.
parse_result = response.json()
print('2 file_name :', parse_result['file_name'])

# Save the textmind parse result.
with open('data/预审查数据/textmind_result/2021_2022年三峡电站左岸厂房中央空调系统主机设备改造_广东申菱环境系统股份有限公司.json', 'w', encoding='utf-8') as fp:
    json.dump(parse_result, fp, indent=4, ensure_ascii=False)