textmind_ocr.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-06-11 13:43:14
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-12-02 17:02:16
  6. import os
  7. import time
  8. import json
  9. import base64
  10. import requests
  11. def create_task(url, file_path, file_url):
  12. """
  13. Args:
  14. url: string, 服务请求链接
  15. file_path: 本地文件路径
  16. file_url: 文件链接
  17. Returns: 响应
  18. """
  19. file = open(file_path, 'rb').read()
  20. # 文件请求
  21. body = {
  22. "file": (os.path.basename(file_path), file, "multipart/form-data"),
  23. }
  24. data = {
  25. "file_name": os.path.basename(file_path),
  26. "return_para_nodes": True
  27. }
  28. response = requests.post(url, data=data, files=body)
  29. return response.json()
  30. def create_task_1(url, file_path, file_url):
  31. """
  32. Args:
  33. url: string, 服务请求链接
  34. file_path: 本地文件路径
  35. file_url: 文件链接
  36. Returns: 响应
  37. """
  38. # 文件请求
  39. with open(file_path, "rb") as f:
  40. file_data = base64.b64encode(f.read())
  41. data = {
  42. "file_data": file_data,
  43. # "file_url": file_url,
  44. "file_name": os.path.basename(file_path)
  45. }
  46. # 文档切分参数,非必传
  47. # return_doc_chunks = json.dumps({"switch": True, "chunk_size": -1})
  48. # data["return_doc_chunks"] = return_doc_chunks
  49. headers = {'Content-Type': 'application/x-www-form-urlencoded'}
  50. response = requests.post(url, headers=headers, data=data)
  51. return response.json()
  52. def query_task(url, task_id):
  53. """
  54. Args:
  55. url: string, 请求链接
  56. task_id: string, task id
  57. Returns: 响应
  58. """
  59. data = {
  60. "task_id": task_id
  61. }
  62. response = requests.post(url, data=data, files=data)
  63. return response.json()
  64. def query_task_1(url, task_id):
  65. """
  66. Args:
  67. url: string, 请求链接
  68. task_id: string, task id
  69. Returns: 响应
  70. """
  71. data = {
  72. "task_id": task_id
  73. }
  74. headers = {'Content-Type': 'application/x-www-form-urlencoded'}
  75. response = requests.post(url, headers=headers, data=data)
  76. return response.json()
  77. def request1(bidderFile, nums: int = 1):
  78. try:
  79. response = create_task_1(request_host, bidderFile, "")
  80. print('res1 :', response)
  81. task_id = response['result']['task_id']
  82. if not task_id:
  83. raise ValueError('task_id is None')
  84. except Exception as e:
  85. print("request1 :", e)
  86. time.sleep(10)
  87. nums += 1
  88. if nums > 100:
  89. return
  90. task_id = request1(bidderFile, nums)
  91. return task_id
  92. def request2(task_id, nums: int = 1, max_nums: int = 50):
  93. nums = 1
  94. while nums < max_nums:
  95. try:
  96. resp = query_task_1(request_query_host, task_id)
  97. print('res2 :', resp)
  98. if resp['result']['status'] == 'success':
  99. url = resp['result']['parse_result_url']
  100. response = requests.get(url)
  101. response.encoding = 'utf-8'
  102. # response.json()
  103. return response
  104. nums += 1
  105. except Exception:
  106. nums += 10
  107. token = "24.87693e5dd8c2d7d7accf260bb2d265d2.2592000.1733970962.282335-86574608"
  108. # request_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task?access_token={token}"
  109. request_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task?access_token={token}" # 更新
  110. # request_query_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task/query?access_token={token}"
  111. request_query_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task/query?access_token={token}" # 更新
  112. # 测试pdf文件
  113. # file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/结果测试数据/1-监控系统自主可控大型PLC适配研究_中控技术产品_采购程序文件_2/投标文件/北京中天华拓工程技术有限公司-投标文件.pdf"
  114. # file_path = "data/zhaocai_datasets/30份数据整理/1-2021_2022年三峡电站左岸厂房中央空调系统主机设备改造/投标文件/广东申菱环境系统股份有限公司.pdf"
  115. # file_path = r'..\data\0预审查初审详审测试数据\三峡左右岸电站厂房清洁水系统改造\湖北海光安全技术工程有限公司_T221100130656\海光 投标文件-PDF.pdf'
  116. file_path = r'..\data\浙江国迈建设集团有限公司技术文件.pdf'
  117. # time.sleep(5)
  118. # task_id = request1(file_path)
  119. # print('1 :', task_id)
  120. task_id = 'task-T0SnDf9Be4QuKemA8apzkgbq5jK7J9fG'
  121. time.sleep(10)
  122. response = request2(task_id)
  123. print('2 file_name :', response.json()['file_name'])
  124. # 保存textmind解析结果
  125. # with open(r'..\data\0预审查初审详审测试数据\textmind_result\三峡左右岸电站厂房清洁水系统改造\湖北海光安全技术工程有限公司.json', 'w', encoding='utf-8') as fp:
  126. # json.dump(response.json(), fp, indent=4, ensure_ascii=False)