123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368 |
- import requests, os, time, json, base64
- import tqdm, re
- '''bos_sample_conf'''
- from baidubce.bce_client_configuration import BceClientConfiguration
- from baidubce.auth.bce_credentials import BceCredentials
- '''bos'''
- import sys
- import json
- import numpy as np
- from baidubce.services.bos import bos_handler
- from baidubce.services.bos import storage_class
- from baidubce.services.bos import canned_acl
- from baidubce.bce_client_configuration import BceClientConfiguration
- from baidubce.auth.bce_credentials import BceCredentials
- #导入BOS相关模块
- from baidubce import exception
- from baidubce.services import bos
- from baidubce.services.bos import canned_acl
- from baidubce.services.bos.bos_client import BosClient
- '''bos_sample_conf'''
# BOS connection settings: endpoint host plus the Access Key ID / Secret Access Key pair.
# NOTE(review): credentials are committed in source. The literals below are kept only as
# backward-compatible fallbacks (they are the values that were effective before; an earlier
# key pair that was immediately overwritten has been dropped). Prefer supplying the values
# through the environment and rotate the committed keys.
bos_host = os.environ.get("BOS_HOST", "bj.bcebos.com")
access_key_id = os.environ.get("BOS_ACCESS_KEY_ID", "ALTAKEq9L0oxxxDi5jUc3e12gu")
secret_access_key = os.environ.get("BOS_SECRET_ACCESS_KEY", "9336a04f88e845e284bab26bd5fd8182")

# Build the BceClientConfiguration and the shared BosClient used by every helper below.
config = BceClientConfiguration(
    credentials=BceCredentials(access_key_id, secret_access_key),
    endpoint=bos_host,
)
bos_client = BosClient(config)

# Example: list the buckets visible to this account.
# response = bos_client.list_buckets()
# for bucket in response.buckets:
#     print(bucket.name)
def get_objects(prefix, max_keys=10, bucket_name='ocrtrips'):
    """List objects whose key starts with ``prefix`` via the ListObjects API.

    Args:
        prefix: Key prefix used to filter the listing.
        max_keys: Maximum number of entries requested from BOS.
        bucket_name: Bucket to query. Defaults to ``'ocrtrips'``, the bucket
            this helper was previously hard-wired to.

    Returns:
        The ``contents`` attribute of the ``list_objects`` response.
    """
    listing = bos_client.list_objects(bucket_name, max_keys=max_keys, prefix=prefix)
    return listing.contents
def put_bos(object_key, file_name, bucket_name='ctrimgs'):
    """Upload a local file to BOS and return the URL of the stored object.

    Args:
        object_key: Key under which the object is stored.
        file_name: Path of the local file to upload.
        bucket_name: Destination bucket.

    Returns:
        ``https://<bucket_name>.bj.bcebos.com/<object_key>``.
    """
    bos_client.put_object_from_file(bucket_name, object_key, file_name)
    # Build the URL from the bucket actually written to: the previous hard-coded
    # 'ctrimgs' host returned a wrong link whenever bucket_name was overridden.
    return f'https://{bucket_name}.bj.bcebos.com/{object_key}'
def delete_bos(object_key, bucket_name='ctrimgs'):
    """Delete one object from a BOS bucket.

    Args:
        object_key: Key of the object to remove.
        bucket_name: Bucket holding the object.

    Returns:
        Always the empty string.
    """
    empty_result = ''
    bos_client.delete_object(bucket_name, object_key)
    return empty_result
-
def get_bos(bucket_name, object_key, file_name):
    """Download a BOS object into a local file.

    Args:
        bucket_name: Bucket holding the object.
        object_key: Key of the object to fetch.
        file_name: Local path the object is written to.
    """
    download_args = (bucket_name, object_key, file_name)
    bos_client.get_object_to_file(*download_args)
def get_object_lists(buckent_name, prefix, max_keys=10):
    """Query a BOS bucket for objects whose key starts with ``prefix``.

    Args:
        buckent_name: Bucket to query (parameter spelling kept for callers).
        prefix: Key prefix used to filter the listing.
        max_keys: Maximum number of entries requested from BOS.

    Returns:
        The ``contents`` attribute of the ``list_objects`` response.
    """
    return bos_client.list_objects(
        buckent_name,
        max_keys=max_keys,
        prefix=prefix,
    ).contents
def get_multipart(bucket_name, object_key, file_name, part_size=5 * 1024 * 1024):
    """Upload a local file to BOS with the multipart-upload API (intended for large files).

    The file is sent in consecutive parts of ``part_size`` bytes (the last part
    carries the remainder) and the upload is completed once every part is stored.

    Args:
        bucket_name: Destination bucket.
        object_key: Key under which the object is stored.
        file_name: Path of the local file to upload.
        part_size: Size of each part in bytes; defaults to 5 MB, the value
            previously hard-coded.

    Raises:
        ValueError: If ``part_size`` is not positive.
        Exception: Any error raised while uploading is re-raised after the
            multipart session has been aborted.
    """
    # Validate before opening a session: a non-positive size would never
    # shrink the remaining byte count and the loop below would not terminate.
    if part_size <= 0:
        raise ValueError('part_size must be a positive number of bytes')

    upload_id = bos_client.initiate_multipart_upload(bucket_name, object_key).upload_id
    remaining = os.path.getsize(file_name)
    offset = 0  # byte offset of the next part inside the file
    part_number = 1
    part_list = []
    try:
        while remaining > 0:
            chunk = min(part_size, remaining)
            response = bos_client.upload_part_from_file(
                bucket_name, object_key, upload_id, part_number, chunk, file_name, offset)
            remaining -= chunk
            offset += chunk
            part_list.append({
                "partNumber": part_number,
                "eTag": response.metadata.etag,
            })
            part_number += 1
        bos_client.complete_multipart_upload(bucket_name, object_key, upload_id, part_list)
    except Exception:
        # Do not leave a dangling multipart session (and its stored parts) behind.
        bos_client.abort_multipart_upload(bucket_name, object_key, upload_id=upload_id)
        raise
- '''textmind_ocr'''
- # def create_task(url, file_path, file_url):
- # """
- # Args:
- # url: string, 服务请求链接
- # file_path: 本地文件路径
- # file_url: 文件链接
- # Returns: 响应
- # """
- # file = open(file_path, 'rb').read()
- # # 文件请求
- # body = {
- # "file": (os.path.basename(file_path), file, "multipart/form-data"),
- # }
- # data = {
- # "file_name": os.path.basename(file_path),
- # "return_para_nodes": True
- # }
- # response = requests.post(url, data=data, files=body)
- # return response.json()
def create_task_1(url, file_path, file_url):
    """Submit a document-parsing task and return the decoded JSON response.

    Args:
        url: Service request URL.
        file_path: Path of the local file; its base name is sent as ``file_name``.
        file_url: Remote URL of the file. When truthy the service is pointed at
            this URL and the local file is not read; otherwise the local file
            is sent base64-encoded as ``file_data``.

    Returns:
        The JSON-decoded response body.
    """
    file_name = os.path.basename(file_path)
    if file_url:
        # The file is fetched by the service from file_url, so reading and
        # base64-encoding the (potentially very large) local copy is skipped.
        data = {
            "file_url": file_url,
            "file_name": file_name,
        }
    else:
        with open(file_path, "rb") as f:
            file_data = base64.b64encode(f.read())
        data = {
            "file_data": file_data,
            "file_url": file_url,
            "file_name": file_name,
        }

    # Optional document-chunking parameter:
    # return_doc_chunks = json.dumps({"switch": True, "chunk_size": -1})
    # data["return_doc_chunks"] = return_doc_chunks

    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    response = requests.post(url, headers=headers, data=data)
    return response.json()
- # def query_task(url, task_id):
- # """
- # Args:
- # url: string, 请求链接
- # task_id: string, task id
- # Returns: 响应
- # """
- # data = {
- # "task_id": task_id
- # }
- # response = requests.post(url, data=data, files=data)
- # return response.json()
def query_task_1(url, task_id):
    """Ask the parser service for the state of a task.

    Args:
        url: Query request URL.
        task_id: Identifier of the task to look up.

    Returns:
        The JSON-decoded response body.
    """
    form_headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    payload = {"task_id": task_id}
    return requests.post(url, headers=form_headers, data=payload).json()
def request1(bidderFile, bidderUrl: str = '', nums: int = 1, max_nums: int = 50):
    """Create a parsing task, retrying on failure, and return its task id.

    Args:
        bidderFile: Path of the local file to parse.
        bidderUrl: Optional remote URL of the file, forwarded to ``create_task_1``.
        nums: Starting value of the attempt counter.
        max_nums: Attempts stop once the counter reaches this value.

    Returns:
        The task id, or ``None`` when every attempt failed.
    """
    while nums < max_nums:
        try:
            response = create_task_1(request_host, bidderFile, bidderUrl)
            print('res1 :', response)
            # A response without a 'result' object is reported with the same
            # message as a missing task id instead of an opaque KeyError.
            task_id = (response.get('result') or {}).get('task_id')
            if not task_id:
                raise ValueError('task_id is None')
            return task_id
        except Exception as e:
            print("request1 :", e)
            nums += 1
            # Back off only when another attempt will follow.
            if nums < max_nums:
                time.sleep(10)
    return None
-
def request2(task_id, nums: int = 1, max_nums: int = 500):
    """Poll a parsing task until it succeeds and download its result.

    Args:
        task_id: Identifier returned when the task was created.
        nums: Starting value of the attempt counter.
        max_nums: Polling stops once the counter reaches this value.

    Returns:
        The HTTP response for the task's ``parse_result_url`` (its body has
        been checked to decode as JSON), or ``None`` when the task did not
        succeed within the allowed attempts.
    """
    while nums < max_nums:
        try:
            resp = query_task_1(request_query_host, task_id)
            print('res2 :', resp)
            if resp['result']['status'] == 'success':
                url = resp['result']['parse_result_url']
                # url = resp['result']['markdown_url']  # markdown variant, returns text
                response = requests.get(url)
                response.encoding = 'utf-8'
                response.json()  # raises if the body is not JSON, which triggers a retry
                return response
        except Exception as e:
            print("request2 :", e)
        # Count and back off on every unsuccessful poll, including a task that
        # is simply not finished yet, so the loop can neither spin without
        # delay nor run forever.
        nums += 1
        if nums < max_nums:
            time.sleep(20)
    return None
- # def request2(task_id,nums:int=1):
- # try:
- # resp = query_task_1(request_query_host, task_id)
- # print('res2 :',resp)
- # url = resp['result']['parse_result_url']
- # response = requests.get(url)
- # response.encoding = 'utf-8'
- # response.json()
- # except Exception as e:
- # print("request2 :",e)
- # time.sleep(20)
- # nums += 1
- # if nums > 500: return
- # response = request2(task_id,nums)
- # return response
# Access token for the document-parser service.
# NOTE(review): the token is committed in source; the literal is kept only as a
# backward-compatible fallback. Prefer supplying it through the environment.
token = os.environ.get(
    "BAIDU_AIP_ACCESS_TOKEN",
    "24.8dc8595999193e140449656989204d61.2592000.1736062425.282335-86574608",
)
# Superseded v1 endpoints:
# request_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task?access_token={token}"
# request_query_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task/query?access_token={token}"
# Current (updated) v2 endpoints for creating and querying a task.
request_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task?access_token={token}"
request_query_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task/query?access_token={token}"
def test():
    """Smoke test: parse one sample PDF and store the downloaded result text.

    Raises:
        RuntimeError: If no parse result could be obtained.
    """
    # Sample PDF inputs (alternatives kept for reference):
    # file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/结果测试数据/1-监控系统自主可控大型PLC适配研究_中控技术产品_采购程序文件_2/投标文件/北京中天华拓工程技术有限公司-投标文件.pdf"
    # file_path = "data/zhaocai_datasets/30份数据整理/1-2021_2022年三峡电站左岸厂房中央空调系统主机设备改造/投标文件/广东申菱环境系统股份有限公司.pdf"
    file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究/投标文件/北京华科同安监控技术有限公司.pdf"
    task_id = request1(file_path)
    print('1 :', task_id)
    time.sleep(10)
    response = request2(task_id)
    if response is None:
        raise RuntimeError(f'no parse result for {file_path}')
    # print('2 file_name :', response.json()['file_name'])
    # To store the parse result as JSON instead:
    # with open('data/预审查数据/textmind_result/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究_北京华科同安监控技术有限公司.json', 'w', encoding='utf-8') as fp:
    #     json.dump(response.json(), fp, indent=4, ensure_ascii=False)
    out_path = 'data/预审查数据/textmind_result/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究_北京华科同安监控技术有限公司.md'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w', encoding='utf-8') as fp:
        # `text` is a property of the response, not a method; calling it raised TypeError.
        fp.write(response.text)
- # test()
def parse_pdf():
    """Parse every bidder PDF under the project folders of ``base_dir``.

    Each project folder is scanned for a "招标文件" (tender) sub-folder and a
    "投标文件" (bid) sub-folder. Every bidder PDF is submitted for parsing and
    the JSON result is written to ``save_dir``.

    Returns:
        A list with one dict per project folder. Depending on which
        sub-folders exist, a dict holds ``projectName`` and ``buyFile`` (tender
        PDF name without extension, and its path) and ``bidder_info`` (a list
        of ``{"bidderUnit", "bidderFile"}`` dicts).
    """
    base_dir = r'/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4'
    save_dir = 'data/预审查数据/20241122-4/ocr_result'
    os.makedirs(save_dir, exist_ok=True)

    # Step 1: collect the tender file and the bidder files of each project folder.
    pre_parse_datasets = []
    for project_folder in os.listdir(base_dir):
        project_path = os.path.join(base_dir, project_folder)
        if not os.path.isdir(project_path):
            # Stray files next to the project folders cannot be listed.
            continue
        folder_info = {}
        for sub_name in os.listdir(project_path):
            sub_path = os.path.join(project_path, sub_name)
            if sub_name == "招标文件":
                for file in os.listdir(sub_path):
                    if file.endswith(".pdf"):
                        # splitext keeps dots that are part of the name itself.
                        folder_info["projectName"] = os.path.splitext(file)[0]
                        folder_info["buyFile"] = os.path.join(sub_path, file)
            elif sub_name == '投标文件':
                folder_info["bidder_info"] = [
                    {
                        "bidderUnit": os.path.splitext(file)[0],
                        "bidderFile": os.path.join(sub_path, file),
                    }
                    for file in os.listdir(sub_path)
                    if file.endswith(".pdf")
                ]
        pre_parse_datasets.append(folder_info)

    # Step 2: parse every bidder PDF and store the result.
    for pre_parse_dataset in pre_parse_datasets:
        projectName = pre_parse_dataset.get('projectName')
        bidder_info = pre_parse_dataset.get('bidder_info', [])
        if not projectName:
            if bidder_info:
                print('parse_pdf : skipped, no tender pdf found for', bidder_info[0]['bidderFile'])
            continue
        for bidder_firm in bidder_info:
            bidderFile = bidder_firm['bidderFile']
            bidderUnit = bidder_firm['bidderUnit']
            task_id = request1(bidderFile)
            response = request2(task_id)
            if response is None:
                print('parse_pdf : no parse result for', bidderFile)
                continue
            # The output name is built from the project name. It was previously
            # built from buyFile, an absolute path, which produced a path
            # outside save_dir that could not be opened.
            out_path = os.path.join(save_dir, f"{projectName}_1_{bidderUnit}_textmind.json")
            with open(out_path, 'w', encoding='utf-8') as fp:
                json.dump(response.json(), fp, indent=4, ensure_ascii=False)
    return pre_parse_datasets
-
def picture_ocr(image_path: str):
    """Run OCR on a single image and store the JSON result.

    The output file name is built from the last three ``/``-separated
    components of ``image_path`` (extension removed), joined with ``_``.

    Args:
        image_path: Path of the image to process.

    Raises:
        RuntimeError: If no OCR result could be obtained.
    """
    task_id = request1(image_path)
    response = request2(task_id)
    if response is None:
        raise RuntimeError(f'no OCR result for {image_path}')
    # splitext removes the real extension whatever its length (.jpg, .jpeg, ...);
    # slicing off four characters only worked for three-letter extensions.
    stem = os.path.splitext(image_path)[0]
    save_file_path = "_".join(stem.split('/')[-3:])
    print(save_file_path)
    out_dir = "data/预审查数据/download"
    os.makedirs(out_dir, exist_ok=True)
    with open(f"{out_dir}/{save_file_path}_textmind.json", 'w', encoding='utf-8') as fp:
        json.dump(response.json(), fp, indent=4, ensure_ascii=False)
- # picture_ocr('/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4测试数据/水车室复杂高危作业环境的模块化集成检修装备研制/中国科学院沈阳自动化研究所/scanned/page-134.jpg')
def parse_single_file(file_path: str, save_dir: str):
    """Parse one file and store the JSON result in ``save_dir``.

    Files larger than 49 MB are first uploaded to BOS and handed to the parser
    by URL; the temporary object is removed again afterwards. Smaller files
    are sent directly.

    Args:
        file_path: Path of the local file to parse.
        save_dir: Directory receiving ``<name>_textmind.json``.

    Raises:
        RuntimeError: If no parse result could be obtained.
    """
    # Spaces are removed from the name used as object key and output file name.
    file_name = os.path.basename(file_path).replace(' ', '')
    size_mb = round(os.path.getsize(file_path) / float(1024 * 1024), 2)

    # file_url stays empty for small files; it used to be undefined on that path.
    file_url = ''
    uploaded = False
    if size_mb > 49:
        print('file_size > 50M')
        file_url = put_bos(object_key=file_name, file_name=file_path)
        uploaded = True
        print(file_url)

    try:
        task_id = request1(file_path, file_url)
        response = request2(task_id)
    finally:
        # Remove only an object this call uploaded, and do so even when
        # parsing failed.
        if uploaded:
            delete_bos(object_key=file_name)

    if response is None:
        raise RuntimeError(f'no parse result for {file_path}')

    os.makedirs(save_dir, exist_ok=True)
    save_file_path = os.path.join(save_dir, os.path.splitext(file_name)[0])
    with open(f'{save_file_path}_textmind.json', 'w', encoding='utf-8') as fp:
        # json.dump serialises into the file; fp.write() does not accept these arguments.
        json.dump(response.json(), fp, indent=4, ensure_ascii=False)
# Sample arguments for parse_single_file: the directory receiving the parse
# result and the bidder PDF to parse.
save_path = 'data/预审查数据/download'
file_path = (
    '/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4/'
    '基于大数据驱动的水电站辅助设备在线监测与预警诊断研究/投标文件/河海大学.pdf'
)
# parse_single_file(file_path, save_path)
|