"""TextMind OCR pipeline.

Uploads PDFs/images to the Baidu TextMind document-parsing service (large
files are first staged on Baidu BOS so the service can fetch them by URL),
polls for completion, and saves the parsed results locally.

NOTE(review): this module contains hard-coded credentials (BOS access keys
and an API access token). They should be moved to environment variables or
a secrets store, and the committed values rotated.
"""
import base64
import json
import os
import re
import sys
import time

import numpy as np
import requests
import tqdm

# Baidu BOS (object storage) SDK.
from baidubce import exception
from baidubce.auth.bce_credentials import BceCredentials
from baidubce.bce_client_configuration import BceClientConfiguration
from baidubce.services import bos
from baidubce.services.bos import bos_handler
from baidubce.services.bos import canned_acl
from baidubce.services.bos import storage_class
from baidubce.services.bos.bos_client import BosClient

# ---------------------------------------------------------------------------
# BOS client configuration: endpoint, Access Key ID and Secret Access Key.
# (The original file assigned three key pairs in a row; only the last one was
# ever used, so the dead assignments have been removed.)
# ---------------------------------------------------------------------------
bos_host = "bj.bcebos.com"
access_key_id = "ALTAKEq9L0oxxxDi5jUc3e12gu"
secret_access_key = "9336a04f88e845e284bab26bd5fd8182"

config = BceClientConfiguration(
    credentials=BceCredentials(access_key_id, secret_access_key),
    endpoint=bos_host)
bos_client = BosClient(config)


def get_objects(prefix, max_keys=10):
    """List up to ``max_keys`` object entries under ``prefix`` in the
    'ocrtrips' bucket (via the BOS ListObjects API)."""
    objects = bos_client.list_objects('ocrtrips', max_keys=max_keys, prefix=prefix)
    return objects.contents


def put_bos(object_key, file_name, bucket_name='ctrimgs'):
    """Upload a local file to BOS and return its public download URL."""
    bos_client.put_object_from_file(bucket_name, object_key, file_name)
    return 'https://ctrimgs.bj.bcebos.com/' + object_key


def delete_bos(object_key, bucket_name='ctrimgs'):
    """Delete an object from BOS. Returns '' (callers ignore the result)."""
    bos_client.delete_object(bucket_name, object_key)
    return ''


def get_bos(bucket_name, object_key, file_name):
    """Download a BOS object to the local path ``file_name``."""
    bos_client.get_object_to_file(bucket_name, object_key, file_name)


def get_object_lists(buckent_name, prefix, max_keys=10):
    """List up to ``max_keys`` object entries under ``prefix`` in the given
    bucket (generic version of :func:`get_objects`)."""
    objects = bos_client.list_objects(buckent_name, max_keys=max_keys, prefix=prefix)
    return objects.contents


def get_multipart(bucket_name, object_key, file_name):
    """Multipart-upload a large local file to BOS (needed for files > 5 GB)."""
    upload_id = bos_client.initiate_multipart_upload(bucket_name, object_key).upload_id
    left_size = os.path.getsize(file_name)
    offset = 0        # byte offset of the next part within the file
    part_number = 1
    part_list = []
    while left_size > 0:
        # Each part is 5 MB, except the final (possibly smaller) remainder.
        part_size = 5 * 1024 * 1024
        if left_size < part_size:
            part_size = left_size
        response = bos_client.upload_part_from_file(
            bucket_name, object_key, upload_id, part_number,
            part_size, file_name, offset)
        left_size -= part_size
        offset += part_size
        part_list.append({
            "partNumber": part_number,
            "eTag": response.metadata.etag
        })
        part_number += 1
    bos_client.complete_multipart_upload(bucket_name, object_key, upload_id, part_list)


def create_task_1(url, file_path, file_url):
    """Submit a document-parse task to the TextMind service.

    Args:
        url: service endpoint URL.
        file_path: local file path (always used for the file name; its
            base64-encoded contents are sent inline when ``file_url`` is empty).
        file_url: optional public URL of the file; when truthy the service
            fetches the file itself instead of receiving it inline.

    Returns:
        Decoded JSON response (dict).
    """
    with open(file_path, "rb") as f:
        file_data = base64.b64encode(f.read())
    if file_url:
        data = {
            "file_url": file_url,
            "file_name": os.path.basename(file_path)
        }
    else:
        data = {
            "file_data": file_data,
            "file_url": file_url,
            "file_name": os.path.basename(file_path)
        }
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    response = requests.post(url, headers=headers, data=data)
    return response.json()


def query_task_1(url, task_id):
    """Query the status/result of a TextMind parse task.

    Args:
        url: query endpoint URL.
        task_id: id returned by :func:`create_task_1`.

    Returns:
        Decoded JSON response (dict).
    """
    data = {
        "task_id": task_id
    }
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    response = requests.post(url, headers=headers, data=data)
    return response.json()


def request1(bidderFile, bidderUrl: str = '', nums: int = 1, max_nums: int = 50):
    """Create a parse task, retrying on failure.

    Returns the task id on success, or None once ``max_nums`` attempts are
    exhausted (callers must tolerate None).
    """
    while nums < max_nums:
        try:
            response = create_task_1(request_host, bidderFile, bidderUrl)
            print('res1 :', response)
            task_id = response['result'].get('task_id', None)
            if not task_id:
                raise ValueError('task_id is None')
            return task_id
        except Exception as e:
            print("request1 :", e)
            nums += 1
            time.sleep(10)


def request2(task_id, nums: int = 1, max_nums: int = 500):
    """Poll a task until it succeeds, then fetch its parse result.

    Returns the ``requests.Response`` for ``parse_result_url`` (UTF-8
    encoded, verified to be JSON), or None once ``max_nums`` polls are
    exhausted.
    """
    while nums < max_nums:
        try:
            resp = query_task_1(request_query_host, task_id)
            print('res2 :', resp)
            if resp['result']['status'] == 'success':
                url = resp['result']['parse_result_url']
                response = requests.get(url)
                response.encoding = 'utf-8'
                response.json()  # raises if the payload is not valid JSON
                return response
            # BUG FIX: the original only counted/slept inside ``except``, so a
            # clean "not finished yet" poll looped forever with no delay,
            # hammering the API. Count the attempt and wait here as well.
            nums += 1
            time.sleep(20)
        except Exception:
            nums += 1
            time.sleep(20)


# NOTE(review): hard-coded, expiring access token — should be refreshed
# programmatically rather than committed.
token = "24.8dc8595999193e140449656989204d61.2592000.1736062425.282335-86574608"
request_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task?access_token={token}"
request_query_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task/query?access_token={token}"


def test():
    """Smoke test: parse one hard-coded PDF and save the result as text."""
    file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究/投标文件/北京华科同安监控技术有限公司.pdf"
    task_id = request1(file_path)
    print('1 :', task_id)
    time.sleep(10)
    response = request2(task_id)
    with open('data/预审查数据/textmind_result/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究_北京华科同安监控技术有限公司.md', 'w', encoding='utf-8') as fp:
        # BUG FIX: ``requests.Response.text`` is a property, not a method —
        # the original ``response.text()`` raised TypeError.
        fp.write(response.text)


# test()


def parse_pdf():
    """Walk a directory tree of tender/bid PDFs, parse every bid PDF with
    TextMind, and save each JSON result.

    Expected layout: base_dir/<project>/{招标文件,投标文件}/*.pdf
    Returns the collected per-project metadata list.
    """
    base_dir = r'/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4'
    save_dir = 'data/预审查数据/20241122-4/ocr_result'
    os.makedirs(save_dir, exist_ok=True)
    pre_parse_datasets = []
    # Collect tender file + bidder files for every project folder.
    for base_folders in os.listdir(base_dir):
        base_folder = os.path.join(base_dir, base_folders)
        folder_info = {}
        for folders in os.listdir(base_folder):
            folder = os.path.join(base_folder, folders)
            if folders == "招标文件":  # tender documents
                for file in os.listdir(folder):
                    if file.endswith(".pdf"):
                        projectName = file.split(".")[0]  # name without extension
                        tender_file = os.path.join(folder, file)
                        folder_info["projectName"] = projectName
                        folder_info["buyFile"] = tender_file
            elif folders == '投标文件':  # bid documents
                folder_info["bidder_info"] = []
                for file in os.listdir(folder):
                    if file.endswith(".pdf"):
                        bidderUnit = file.split(".")[0]  # name without extension
                        bidder_file = os.path.join(folder, file)
                        folder_info["bidder_info"].append(
                            {"bidderUnit": bidderUnit, "bidderFile": bidder_file})
        pre_parse_datasets.append(folder_info)

    # Parse every bidder PDF and persist the TextMind JSON result.
    for pre_parse_dataset in pre_parse_datasets:
        bidder_info = pre_parse_dataset['bidder_info']
        projectName = pre_parse_dataset['projectName']
        buyFile = pre_parse_dataset['buyFile']
        for bidder_firm in bidder_info:
            bidderFile = bidder_firm['bidderFile']
            bidderUnit = bidder_firm['bidderUnit']
            task_id = request1(bidderFile)
            response = request2(task_id)
            # NOTE(review): ``buyFile`` is a full path, so this file name
            # embeds path separators — probably ``projectName`` was intended.
            # Kept as-is pending confirmation.
            with open(f"{save_dir}/{buyFile}_1_{bidderUnit}_textmind.json", 'w', encoding='utf-8') as fp:
                json.dump(response.json(), fp, indent=4, ensure_ascii=False)
    return pre_parse_datasets


def picture_ocr(image_path: str):
    """OCR a single image and save the TextMind JSON result."""
    task_id = request1(image_path)
    response = request2(task_id)
    # e.g. ".../a/b/page-1.jpg" -> "a_b_page-1"
    save_file_path = "_".join(image_path[:-4].split('/')[-3:])
    print(save_file_path)
    with open(f"data/预审查数据/download/{save_file_path}_textmind.json", 'w', encoding='utf-8') as fp:
        json.dump(response.json(), fp, indent=4, ensure_ascii=False)


# picture_ocr('/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4测试数据/水车室复杂高危作业环境的模块化集成检修装备研制/中国科学院沈阳自动化研究所/scanned/page-134.jpg')


def parse_single_file(file_path: str, save_dir: str):
    """Parse a single large file (> 50 MB): stage it on BOS so TextMind can
    fetch it by URL, then save the JSON result and remove the BOS copy."""

    def get_FileSize(filePath):
        # File size in MB, rounded to two decimals.
        fsize = os.path.getsize(filePath)
        fsize = fsize / float(1024 * 1024)
        return round(fsize, 2)

    file_name = os.path.basename(file_path)
    file_name = re.sub(r'\040', '', file_name)  # strip spaces (octal \040)
    # NOTE(review): files <= 49 MB are silently skipped here — the original
    # flattened source suggests the whole pipeline lives inside this guard.
    if get_FileSize(file_path) > 49:
        print('file_size > 50M')
        file_url = put_bos(object_key=file_name, file_name=file_path)
        print(file_url)
        task_id = request1(file_path, file_url)
        response = request2(task_id)
        if file_name:
            # Clean up the temporary BOS copy once parsing is done.
            delete_bos(object_key=file_name)
        save_file_path = os.path.join(save_dir, file_name[:-4])
        with open(f'{save_file_path}_textmind.json', 'w', encoding='utf-8') as fp:
            # BUG FIX: the original passed json.dump() arguments to fp.write()
            # (``fp.write(response.json(), fp, indent=4, ...)``) — TypeError.
            json.dump(response.json(), fp, indent=4, ensure_ascii=False)


file_path = '/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究/投标文件/河海大学.pdf'
save_path = 'data/预审查数据/download'
# parse_single_file(file_path, save_path)