# textmind_ocr.py (13 KB)
  1. import requests, os, time, json, base64
  2. import tqdm, re
  3. '''bos_sample_conf'''
  4. from baidubce.bce_client_configuration import BceClientConfiguration
  5. from baidubce.auth.bce_credentials import BceCredentials
  6. '''bos'''
  7. import sys
  8. import json
  9. import numpy as np
  10. from baidubce.services.bos import bos_handler
  11. from baidubce.services.bos import storage_class
  12. from baidubce.services.bos import canned_acl
  13. from baidubce.bce_client_configuration import BceClientConfiguration
  14. from baidubce.auth.bce_credentials import BceCredentials
  15. #导入BOS相关模块
  16. from baidubce import exception
  17. from baidubce.services import bos
  18. from baidubce.services.bos import canned_acl
  19. from baidubce.services.bos.bos_client import BosClient
  20. '''bos_sample_conf'''
  21. #设置BosClient的Host,Access Key ID和Secret Access Key
  22. bos_host = "bj.bcebos.com"
  23. access_key_id = "87815919190940dd9ff8a7790281e1e9"
  24. secret_access_key = "8ac48b64cfe94fd4b4be72a26a48270d"
  25. access_key_id = "87815919190940dd9ff8a7790281e1e9"
  26. secret_access_key = "8ac48b64cfe94fd4b4be72a26a48270d"
  27. access_key_id = "ALTAKEq9L0oxxxDi5jUc3e12gu"
  28. secret_access_key = "9336a04f88e845e284bab26bd5fd8182"
  29. # 创建BceClientConfiguration
  30. config = BceClientConfiguration(credentials=BceCredentials(access_key_id, secret_access_key), endpoint = bos_host)
  31. '''bos'''
  32. bos_client = BosClient(config)
  33. """
  34. response = bos_client.list_buckets()
  35. for bucket in response.buckets:
  36. print (bucket.name)
  37. """
  38. #根据ListObjects接口来获取图片的key,prefix为前缀
  39. def get_objects(prefix, max_keys=10):
  40. objects = bos_client.list_objects('ocrtrips', max_keys=max_keys, prefix=prefix)
  41. return objects.contents
  42. #上传
  43. def put_bos(object_key, file_name, bucket_name='ctrimgs'):
  44. bos_client.put_object_from_file(bucket_name, object_key, file_name)
  45. return 'https://ctrimgs.bj.bcebos.com/' + object_key
  46. #return bos_client.put_object_from_file(bucket_name, object_key, file_name)
  47. #删除
  48. def delete_bos(object_key, bucket_name='ctrimgs'):
  49. bos_client.delete_object(bucket_name, object_key)
  50. return ''
  51. #下载
  52. def get_bos(bucket_name, object_key, file_name):
  53. bos_client.get_object_to_file(bucket_name,
  54. object_key,
  55. file_name)
  56. #bos查询
  57. def get_object_lists(buckent_name, prefix, max_keys=10):
  58. objects = bos_client.list_objects(buckent_name, max_keys=max_keys, prefix=prefix)
  59. return objects.contents
  60. #分块上传 文件大于5G
  61. def get_multipart(bucket_name, object_key, file_name):
  62. upload_id = bos_client.initiate_multipart_upload(bucket_name, object_key).upload_id
  63. left_size = os.path.getsize(file_name)
  64. #设置分块的开始偏移位置
  65. offset = 0
  66. part_number = 1
  67. part_list = []
  68. while left_size > 0:
  69. #设置每块为5MB
  70. part_size = 5 * 1024 * 1024
  71. if left_size < part_size:
  72. part_size = left_size
  73. response = bos_client.upload_part_from_file(
  74. bucket_name, object_key, upload_id, part_number, part_size, file_name, offset)
  75. left_size -= part_size
  76. offset += part_size
  77. part_list.append({
  78. "partNumber": part_number,
  79. "eTag": response.metadata.etag
  80. })
  81. part_number += 1
  82. bos_client.complete_multipart_upload(bucket_name, object_key, upload_id, part_list)
  83. '''textmind_ocr'''
  84. # def create_task(url, file_path, file_url):
  85. # """
  86. # Args:
  87. # url: string, 服务请求链接
  88. # file_path: 本地文件路径
  89. # file_url: 文件链接
  90. # Returns: 响应
  91. # """
  92. # file = open(file_path, 'rb').read()
  93. # # 文件请求
  94. # body = {
  95. # "file": (os.path.basename(file_path), file, "multipart/form-data"),
  96. # }
  97. # data = {
  98. # "file_name": os.path.basename(file_path),
  99. # "return_para_nodes": True
  100. # }
  101. # response = requests.post(url, data=data, files=body)
  102. # return response.json()
  103. def create_task_1(url, file_path, file_url):
  104. """
  105. Args:
  106. url: string, 服务请求链接
  107. file_path: 本地文件路径
  108. file_url: 文件链接
  109. Returns: 响应
  110. """
  111. # 文件请求
  112. with open(file_path, "rb") as f:
  113. file_data = base64.b64encode(f.read())
  114. if file_url:
  115. data = {
  116. "file_url": file_url,
  117. "file_name": os.path.basename(file_path)
  118. }
  119. else:
  120. data = {
  121. "file_data": file_data,
  122. "file_url": file_url,
  123. "file_name": os.path.basename(file_path)
  124. }
  125. # 文档切分参数,非必传
  126. # return_doc_chunks = json.dumps({"switch": True, "chunk_size": -1})
  127. # data["return_doc_chunks"] = return_doc_chunks
  128. headers = {'Content-Type': 'application/x-www-form-urlencoded'}
  129. response = requests.post(url, headers=headers, data=data)
  130. return response.json()
  131. # def query_task(url, task_id):
  132. # """
  133. # Args:
  134. # url: string, 请求链接
  135. # task_id: string, task id
  136. # Returns: 响应
  137. # """
  138. # data = {
  139. # "task_id": task_id
  140. # }
  141. # response = requests.post(url, data=data, files=data)
  142. # return response.json()
  143. def query_task_1(url, task_id):
  144. """
  145. Args:
  146. url: string, 请求链接
  147. task_id: string, task id
  148. Returns: 响应
  149. """
  150. data = {
  151. "task_id": task_id
  152. }
  153. headers = {'Content-Type': 'application/x-www-form-urlencoded'}
  154. response = requests.post(url, headers=headers, data=data)
  155. return response.json()
  156. def request1(bidderFile, bidderUrl:str = '', nums:int = 1, max_nums:int = 50):
  157. while nums < max_nums:
  158. try:
  159. response = create_task_1(request_host, bidderFile, bidderUrl)
  160. print('res1 :',response)
  161. task_id = response['result'].get('task_id', None)
  162. if not task_id: raise ValueError('task_id is None')
  163. return task_id
  164. except Exception as e:
  165. print("request1 :",e)
  166. nums += 1
  167. time.sleep(10)
  168. def request2(task_id, nums:int = 1, max_nums: int = 500):
  169. while nums < max_nums:
  170. try:
  171. resp = query_task_1(request_query_host, task_id)
  172. print('res2 :', resp)
  173. if resp['result']['status'] == 'success':
  174. url = resp['result']['parse_result_url']
  175. # url = resp['result']['markdown_url'] # 取markdown return TXT
  176. response = requests.get(url)
  177. response.encoding = 'utf-8'
  178. response.json()
  179. return response
  180. except Exception:
  181. nums += 1
  182. time.sleep(20)
  183. # def request2(task_id,nums:int=1):
  184. # try:
  185. # resp = query_task_1(request_query_host, task_id)
  186. # print('res2 :',resp)
  187. # url = resp['result']['parse_result_url']
  188. # response = requests.get(url)
  189. # response.encoding = 'utf-8'
  190. # response.json()
  191. # except Exception as e:
  192. # print("request2 :",e)
  193. # time.sleep(20)
  194. # nums += 1
  195. # if nums > 500: return
  196. # response = request2(task_id,nums)
  197. # return response
  198. token = "24.8dc8595999193e140449656989204d61.2592000.1736062425.282335-86574608"
  199. # request_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task?access_token={token}"
  200. request_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task?access_token={token}" # 更新
  201. # request_query_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task/query?access_token={token}"
  202. request_query_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task/query?access_token={token}" # 更新
  203. def test():
  204. # 测试pdf文件
  205. # file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/结果测试数据/1-监控系统自主可控大型PLC适配研究_中控技术产品_采购程序文件_2/投标文件/北京中天华拓工程技术有限公司-投标文件.pdf"
  206. # file_path = "data/zhaocai_datasets/30份数据整理/1-2021_2022年三峡电站左岸厂房中央空调系统主机设备改造/投标文件/广东申菱环境系统股份有限公司.pdf"
  207. file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究/投标文件/北京华科同安监控技术有限公司.pdf"
  208. # time.sleep(5)
  209. task_id = request1(file_path)
  210. print('1 :',task_id)
  211. time.sleep(10)
  212. response = request2(task_id)
  213. # print('2 file_name :',response.json()['file_name'])
  214. # 保存textmind解析结果
  215. # with open('data/预审查数据/textmind_result/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究_北京华科同安监控技术有限公司.json', 'w', encoding='utf-8') as fp:
  216. # json.dump(response.json(), fp, indent=4, ensure_ascii=False)
  217. with open('data/预审查数据/textmind_result/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究_北京华科同安监控技术有限公司.md', 'w', encoding='utf-8') as fp:
  218. fp.write(response.text())
  219. # test()
  220. def parse_pdf():
  221. base_dir = r'/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4'
  222. save_dir = 'data/预审查数据/20241122-4/ocr_result'
  223. os.makedirs(save_dir, exist_ok=True)
  224. pre_parse_datasets = []
  225. # 遍历base_dir下所有文件
  226. for base_folders in os.listdir(base_dir):
  227. base_folder = os.path.join(base_dir, base_folders)
  228. folder_info = {}
  229. for folders in os.listdir(base_folder):
  230. folder = os.path.join(base_folder, folders)
  231. if folders == "招标文件":
  232. for file in os.listdir(folder):
  233. if file.endswith(".pdf"):
  234. projectName = file.split(".")[0] # 去掉后缀之后的文件名
  235. tender_file = os.path.join(folder, file)
  236. folder_info["projectName"] = projectName
  237. folder_info["buyFile"] = tender_file
  238. elif folders == '投标文件':
  239. folder_info["bidder_info"] = []
  240. for file in os.listdir(folder):
  241. if file.endswith(".pdf"):
  242. bidderUnit = file.split(".")[0] # 去掉后缀之后的文件名
  243. bidder_file = os.path.join(folder, file)
  244. folder_info["bidder_info"].append({"bidderUnit":bidderUnit, "bidderFile":bidder_file})
  245. pre_parse_datasets.append(folder_info)
  246. # break
  247. # pre_parse_datasets = parse_pdf()
  248. # print(pre_parse_datasets)
  249. # 开始解析pdf
  250. for pre_parse_dataset in pre_parse_datasets:
  251. bidder_info = pre_parse_dataset['bidder_info']
  252. projectName = pre_parse_dataset['projectName']
  253. buyFile = pre_parse_dataset['buyFile']
  254. for bidder_firm in bidder_info:
  255. bidderFile = bidder_firm['bidderFile']
  256. bidderUnit = bidder_firm['bidderUnit']
  257. task_id = request1(bidderFile)
  258. response = request2(task_id)
  259. with open(f"{save_dir}/{buyFile}_1_{bidderUnit}_textmind.json", 'w', encoding='utf-8') as fp:
  260. json.dump(response.json(), fp, indent=4, ensure_ascii=False)
  261. return pre_parse_datasets
  262. def picture_ocr(image_path:str):
  263. ''' 单个图片OCR结果 '''
  264. task_id = request1(image_path)
  265. response = request2(task_id)
  266. save_file_path = "_".join(image_path[:-4].split('/')[-3:])
  267. print(save_file_path)
  268. with open(f"data/预审查数据/download/{save_file_path}_textmind.json", 'w', encoding='utf-8') as fp:
  269. json.dump(response.json(), fp, indent=4, ensure_ascii=False)
  270. # picture_ocr('/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4测试数据/水车室复杂高危作业环境的模块化集成检修装备研制/中国科学院沈阳自动化研究所/scanned/page-134.jpg')
  271. def parse_single_file(file_path:str, save_dir:str):
  272. '''
  273. parse single file(> 50M)
  274. '''
  275. def get_FileSize(filePath):
  276. fsize = os.path.getsize(filePath)
  277. fsize = fsize/float(1024*1024)
  278. return round(fsize, 2)
  279. file_name = os.path.basename(file_path)
  280. file_name = re.sub('\040', '', file_name)
  281. # if file_name:
  282. # delete_bos(object_key=file_name)
  283. if get_FileSize(file_path) > 49:
  284. print('file_size > 50M')
  285. file_url = put_bos(object_key=file_name, file_name=file_path)
  286. print(file_url)
  287. task_id = request1(file_path, file_url)
  288. response = request2(task_id)
  289. if file_name:
  290. delete_bos(object_key=file_name)
  291. save_file_path = os.path.join(save_dir, file_name[:-4])
  292. with open(f'{save_file_path}_textmind.json', 'w', encoding='utf-8') as fp:
  293. fp.write(response.json(), fp, indent=4, ensure_ascii=False)
# Example arguments for parse_single_file: the PDF to parse and the directory
# for its result. The call itself is left commented out.
file_path = '/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究/投标文件/河海大学.pdf'
save_path = 'data/预审查数据/download'
# parse_single_file(file_path, save_path)