@@ -1,28 +1,131 @@
import requests, os, time, json, base64
+import tqdm, re
+'''bos_sample_conf'''
+from baidubce.bce_client_configuration import BceClientConfiguration
+from baidubce.auth.bce_credentials import BceCredentials
+'''bos'''
+import sys
+import json
+import numpy as np
+from baidubce.services.bos import bos_handler
+from baidubce.services.bos import storage_class
+from baidubce.services.bos import canned_acl
+from baidubce.bce_client_configuration import BceClientConfiguration
+from baidubce.auth.bce_credentials import BceCredentials
+# Import BOS-related modules
+from baidubce import exception
+from baidubce.services import bos
+from baidubce.services.bos import canned_acl
+from baidubce.services.bos.bos_client import BosClient

-def create_task(url, file_path, file_url):
-    """
-    Args:
-        url: string, service request URL
-        file_path: local file path
-        file_url: file URL
-    Returns: response
-    """
-    file = open(file_path, 'rb').read()
-
-    # file request body
-    body = {
-        "file": (os.path.basename(file_path), file, "multipart/form-data"),
-    }
+'''bos_sample_conf'''
+# Configure the BosClient host, Access Key ID and Secret Access Key
+bos_host = "bj.bcebos.com"
+access_key_id = "87815919190940dd9ff8a7790281e1e9"
+secret_access_key = "8ac48b64cfe94fd4b4be72a26a48270d"

-    data = {
-        "file_name": os.path.basename(file_path),
-        "return_para_nodes": True
-    }
+access_key_id = "87815919190940dd9ff8a7790281e1e9"
+secret_access_key = "8ac48b64cfe94fd4b4be72a26a48270d"

-    response = requests.post(url, data=data, files=body)
-    return response.json()
+# Only this last assignment takes effect; it overrides the keys above.
+access_key_id = "ALTAKEq9L0oxxxDi5jUc3e12gu"
+secret_access_key = "9336a04f88e845e284bab26bd5fd8182"
+
+# Create the BceClientConfiguration
+config = BceClientConfiguration(credentials=BceCredentials(access_key_id, secret_access_key), endpoint=bos_host)
+
+'''bos'''
+bos_client = BosClient(config)
+"""
+response = bos_client.list_buckets()
+for bucket in response.buckets:
+    print(bucket.name)
+"""
+# List object keys via the ListObjects API; prefix filters by key prefix
+def get_objects(prefix, max_keys=10):
+    objects = bos_client.list_objects('ocrtrips', max_keys=max_keys, prefix=prefix)
+    return objects.contents
+
+# Upload a local file and return its public BOS URL
+def put_bos(object_key, file_name, bucket_name='ctrimgs'):
+    bos_client.put_object_from_file(bucket_name, object_key, file_name)
+    return f'https://{bucket_name}.{bos_host}/{object_key}'
+    #return bos_client.put_object_from_file(bucket_name, object_key, file_name)
+
+# Delete an object
+def delete_bos(object_key, bucket_name='ctrimgs'):
+    bos_client.delete_object(bucket_name, object_key)
+    return ''
+
+# Download an object to a local file
+def get_bos(bucket_name, object_key, file_name):
+    bos_client.get_object_to_file(bucket_name,
+                                  object_key,
+                                  file_name)
+
+# BOS listing for an arbitrary bucket
+def get_object_lists(bucket_name, prefix, max_keys=10):
+    objects = bos_client.list_objects(bucket_name, max_keys=max_keys, prefix=prefix)
+    return objects.contents
+
+# Multipart upload for large files (larger than 5 GB)
+def get_multipart(bucket_name, object_key, file_name):
+
+    upload_id = bos_client.initiate_multipart_upload(bucket_name, object_key).upload_id
+
+    left_size = os.path.getsize(file_name)
+    # Starting offset of the current part
+    offset = 0
+
+    part_number = 1
+    part_list = []
+
+    while left_size > 0:
+        # Each part is 5 MB
+        part_size = 5 * 1024 * 1024
+        if left_size < part_size:
+            part_size = left_size
+
+        response = bos_client.upload_part_from_file(
+            bucket_name, object_key, upload_id, part_number, part_size, file_name, offset)
+
+        left_size -= part_size
+        offset += part_size
+        part_list.append({
+            "partNumber": part_number,
+            "eTag": response.metadata.etag
+        })
+
+        part_number += 1
+
+    bos_client.complete_multipart_upload(bucket_name, object_key, upload_id, part_list)
+
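+# A minimal usage sketch of the BOS helpers above (illustrative only; the object
+# keys and local paths are hypothetical):
+#     file_url = put_bos('sample.pdf', '/tmp/sample.pdf')      # small file
+#     get_multipart('ctrimgs', 'big.pdf', '/tmp/big.pdf')      # large file, 5 MB parts
+#     print(get_object_lists('ctrimgs', 'sample'))             # list uploaded keys
+#     delete_bos('sample.pdf')                                  # clean up
+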
+'''textmind_ocr'''
+# def create_task(url, file_path, file_url):
+#     """
+#     Args:
+#         url: string, service request URL
+#         file_path: local file path
+#         file_url: file URL
+#     Returns: response
+#     """
+#     file = open(file_path, 'rb').read()
+
+#     # file request body
+#     body = {
+#         "file": (os.path.basename(file_path), file, "multipart/form-data"),
+#     }
+
+#     data = {
+#         "file_name": os.path.basename(file_path),
+#         "return_para_nodes": True
+#     }
+
+#     response = requests.post(url, data=data, files=body)
+#     return response.json()

def create_task_1(url, file_path, file_url):
    """
@@ -35,11 +138,17 @@ def create_task_1(url, file_path, file_url):
    # file request body
    with open(file_path, "rb") as f:
        file_data = base64.b64encode(f.read())
-    data = {
-        "file_data": file_data,
-        "file_url": file_url,
-        "file_name": os.path.basename(file_path)
-    }
+    # Prefer the remote file URL when one is supplied (e.g. a BOS link for files
+    # too large to send inline); otherwise send the base64-encoded bytes.
+    if file_url:
+        data = {
+            "file_url": file_url,
+            "file_name": os.path.basename(file_path)
+        }
+    else:
+        data = {
+            "file_data": file_data,
+            "file_url": file_url,
+            "file_name": os.path.basename(file_path)
+        }

    # Document chunking parameters (optional)
    # return_doc_chunks = json.dumps({"switch": True, "chunk_size": -1})
@@ -51,19 +160,19 @@ def create_task_1(url, file_path, file_url):
    return response.json()


-def query_task(url, task_id):
-    """
-    Args:
-        url: string, request URL
-        task_id: string, task id
-    Returns: response
-    """
-    data = {
-        "task_id": task_id
-    }
+# def query_task(url, task_id):
+#     """
+#     Args:
+#         url: string, request URL
+#         task_id: string, task id
+#     Returns: response
+#     """
+#     data = {
+#         "task_id": task_id
+#     }

-    response = requests.post(url, data=data, files=data)
-    return response.json()
+#     response = requests.post(url, data=data, files=data)
+#     return response.json()

def query_task_1(url, task_id):
    """
@@ -80,54 +189,180 @@ def query_task_1(url, task_id):
    response = requests.post(url, headers=headers, data=data)
    return response.json()
-def request1(bidderFile,nums:int=1):
-    try:
-        response = create_task_1(request_host, bidderFile, "")
-        print('res1 :',response)
-        task_id = response['result']['task_id']
-        if not task_id: raise ValueError('task_id is None')
-    except Exception as e:
-        print("request1 :",e)
-        time.sleep(10)
-        nums += 1
-        if nums > 100: return
-        task_id = request1(bidderFile, nums)
-    return task_id
-
-def request2(task_id,nums:int=1):
-    try:
-        resp = query_task_1(request_query_host, task_id)
-        print('res2 :',resp)
-        url = resp['result']['parse_result_url']
-        response = requests.get(url)
-        response.encoding = 'utf-8'
-        response.json()
-    except Exception as e:
-        print("request2 :",e)
-        time.sleep(20)
-        nums += 1
-        if nums > 500: return
-        response = request2(task_id,nums)
-    return response
-
-
-token = "24.87693e5dd8c2d7d7accf260bb2d265d2.2592000.1733970962.282335-86574608"
+# Submit a parse task; retry up to max_nums times, 10 s apart, and return the task_id.
+def request1(bidderFile, bidderUrl:str = '', nums:int = 1, max_nums:int = 50):
+    while nums < max_nums:
+        try:
+            response = create_task_1(request_host, bidderFile, bidderUrl)
+            print('res1 :', response)
+            task_id = response['result'].get('task_id', None)
+            if not task_id: raise ValueError('task_id is None')
+            return task_id
+        except Exception as e:
+            print("request1 :", e)
+            nums += 1
+            time.sleep(10)
+
+
+# Poll a task until it succeeds (or max_nums attempts are used) and download the result.
+def request2(task_id, nums:int = 1, max_nums: int = 500):
+    while nums < max_nums:
+        try:
+            resp = query_task_1(request_query_host, task_id)
+            print('res2 :', resp)
+            if resp['result']['status'] == 'success':
+                url = resp['result']['parse_result_url']
+                # url = resp['result']['markdown_url']  # fetch the markdown output instead
+                response = requests.get(url)
+                response.encoding = 'utf-8'
+                response.json()
+                return response
+        except Exception as e:
+            print("request2 :", e)
+        # Back off between polls whether the task is still running or the query failed,
+        # so a pending task does not turn into a busy loop.
+        nums += 1
+        time.sleep(20)
+
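+# The polling above assumes a query response shaped roughly like the following
+# (keys taken from the accesses in request2; the exact payload depends on the service):
+#     {"result": {"status": "success", "parse_result_url": "...", "markdown_url": "..."}}
+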
+# def request2(task_id,nums:int=1):
+#     try:
+#         resp = query_task_1(request_query_host, task_id)
+#         print('res2 :',resp)
+#         url = resp['result']['parse_result_url']
+#         response = requests.get(url)
+#         response.encoding = 'utf-8'
+#         response.json()
+#     except Exception as e:
+#         print("request2 :",e)
+#         time.sleep(20)
+#         nums += 1
+#         if nums > 500: return
+#         response = request2(task_id,nums)
+#     return response
+
+
+token = "24.8dc8595999193e140449656989204d61.2592000.1736062425.282335-86574608"
|
|
|
# request_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task?access_token={token}"
|
|
|
request_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task?access_token={token}" # 更新
|
|
|
# request_query_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task/query?access_token={token}"
|
|
|
request_query_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task/query?access_token={token}" # 更新
|
|
|
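+# The hard-coded access_token above eventually expires (Baidu AIP tokens are typically
+# valid for about 30 days). A hedged sketch of refreshing it via the standard Baidu AI
+# OAuth endpoint; API_KEY and SECRET_KEY are hypothetical placeholders for the app's credentials:
+#     resp = requests.get("https://aip.baidubce.com/oauth/2.0/token",
+#                         params={"grant_type": "client_credentials",
+#                                 "client_id": API_KEY, "client_secret": SECRET_KEY})
+#     token = resp.json()["access_token"]
+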
-# Test PDF file
-# file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/结果测试数据/1-监控系统自主可控大型PLC适配研究_中控技术产品_采购程序文件_2/投标文件/北京中天华拓工程技术有限公司-投标文件.pdf"
-file_path = "data/zhaocai_datasets/30份数据整理/1-2021_2022年三峡电站左岸厂房中央空调系统主机设备改造/投标文件/广东申菱环境系统股份有限公司.pdf"

-# time.sleep(5)
-task_id = request1(file_path)
-print('1 :',task_id)
+def test():
+    # Test PDF files
+    # file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/结果测试数据/1-监控系统自主可控大型PLC适配研究_中控技术产品_采购程序文件_2/投标文件/北京中天华拓工程技术有限公司-投标文件.pdf"
+    # file_path = "data/zhaocai_datasets/30份数据整理/1-2021_2022年三峡电站左岸厂房中央空调系统主机设备改造/投标文件/广东申菱环境系统股份有限公司.pdf"
+    file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究/投标文件/北京华科同安监控技术有限公司.pdf"
+
+    # time.sleep(5)
+    task_id = request1(file_path)
+    print('1 :', task_id)
+
+    time.sleep(10)
+    response = request2(task_id)
+    # print('2 file_name :', response.json()['file_name'])
+
+    # Save the TextMind parse result
+    # with open('data/预审查数据/textmind_result/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究_北京华科同安监控技术有限公司.json', 'w', encoding='utf-8') as fp:
+    #     json.dump(response.json(), fp, indent=4, ensure_ascii=False)
+    with open('data/预审查数据/textmind_result/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究_北京华科同安监控技术有限公司.md', 'w', encoding='utf-8') as fp:
+        fp.write(response.text)  # response.text is a property, not a method
+
+# test()
+
+
+def parse_pdf():
+    base_dir = r'/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4'
+    save_dir = 'data/预审查数据/20241122-4/ocr_result'
+    os.makedirs(save_dir, exist_ok=True)
+
+    pre_parse_datasets = []
+
+    # Walk every project folder under base_dir
+    for base_folders in os.listdir(base_dir):
+        base_folder = os.path.join(base_dir, base_folders)
+        folder_info = {}
+        for folders in os.listdir(base_folder):
+            folder = os.path.join(base_folder, folders)
+            if folders == "招标文件":
+                for file in os.listdir(folder):
+                    if file.endswith(".pdf"):
+                        projectName = file.split(".")[0]  # file name without the extension
+                        tender_file = os.path.join(folder, file)
+
+                        folder_info["projectName"] = projectName
+                        folder_info["buyFile"] = tender_file
+
+            elif folders == '投标文件':
+                folder_info["bidder_info"] = []
+                for file in os.listdir(folder):
+                    if file.endswith(".pdf"):
+                        bidderUnit = file.split(".")[0]  # file name without the extension
+                        bidder_file = os.path.join(folder, file)
+
+                        folder_info["bidder_info"].append({"bidderUnit": bidderUnit, "bidderFile": bidder_file})
+
+        pre_parse_datasets.append(folder_info)
+        # break
+
+    # pre_parse_datasets = parse_pdf()
+    # print(pre_parse_datasets)
+
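+    # For reference, each entry collected above has this shape (values illustrative):
+    #     {"projectName": "...", "buyFile": ".../招标文件/....pdf",
+    #      "bidder_info": [{"bidderUnit": "...", "bidderFile": ".../投标文件/....pdf"}]}
+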
+    # Start parsing the PDFs
+    for pre_parse_dataset in pre_parse_datasets:
+        bidder_info = pre_parse_dataset['bidder_info']
+        projectName = pre_parse_dataset['projectName']
+        buyFile = pre_parse_dataset['buyFile']
+        for bidder_firm in bidder_info:
+            bidderFile = bidder_firm['bidderFile']
+            bidderUnit = bidder_firm['bidderUnit']
+            task_id = request1(bidderFile)
+            response = request2(task_id)
+            # Use projectName (not the full buyFile path) so the output file name is a valid path
+            with open(f"{save_dir}/{projectName}_1_{bidderUnit}_textmind.json", 'w', encoding='utf-8') as fp:
+                json.dump(response.json(), fp, indent=4, ensure_ascii=False)
+
+    return pre_parse_datasets
+
+
+def picture_ocr(image_path:str):
+    ''' OCR result for a single image '''
+    task_id = request1(image_path)
+    response = request2(task_id)
+    # Build the output name from the last three path components, without the extension
+    save_file_path = "_".join(image_path[:-4].split('/')[-3:])
+    print(save_file_path)
+    with open(f"data/预审查数据/download/{save_file_path}_textmind.json", 'w', encoding='utf-8') as fp:
+        json.dump(response.json(), fp, indent=4, ensure_ascii=False)
+# picture_ocr('/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4测试数据/水车室复杂高危作业环境的模块化集成检修装备研制/中国科学院沈阳自动化研究所/scanned/page-134.jpg')
+
+
+def parse_single_file(file_path:str, save_dir:str):
+    '''
+    Parse a single file (files over 50 MB are routed through BOS)
+    '''
+    def get_FileSize(filePath):
+        # File size in MB, rounded to two decimals
+        fsize = os.path.getsize(filePath)
+        fsize = fsize/float(1024*1024)
+        return round(fsize, 2)
+
+    file_name = os.path.basename(file_path)
+    file_name = re.sub('\040', '', file_name)  # strip spaces from the file name
+
+    # if file_name:
+    #     delete_bos(object_key=file_name)
+
+    file_url = ''  # only set when the file is large enough to go through BOS
+    if get_FileSize(file_path) > 49:
+        print('file_size > 50M')
+        file_url = put_bos(object_key=file_name, file_name=file_path)
+        print(file_url)
+
+    task_id = request1(file_path, file_url)
+    response = request2(task_id)
+
+    if file_url:  # only clean up the BOS object if one was uploaded
+        delete_bos(object_key=file_name)
+
+    save_file_path = os.path.join(save_dir, file_name[:-4])

-time.sleep(10)
-response = request2(task_id)
-print('2 file_name :',response.json()['file_name'])
+    with open(f'{save_file_path}_textmind.json', 'w', encoding='utf-8') as fp:
+        json.dump(response.json(), fp, indent=4, ensure_ascii=False)

-# Save the TextMind parse result
-with open('data/预审查数据/textmind_result/2021_2022年三峡电站左岸厂房中央空调系统主机设备改造_广东申菱环境系统股份有限公司.json', 'w', encoding='utf-8') as fp:
-    json.dump(response.json(), fp, indent=4, ensure_ascii=False)
+file_path = '/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究/投标文件/河海大学.pdf'
+save_path = 'data/预审查数据/download'
+# parse_single_file(file_path, save_path)