parser.py 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-06-11 13:43:14
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-09-19 17:37:49
  6. import os
  7. import base64
  8. import requests
  9. def main(client_id, client_secret):
  10. url = "https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id={}&client_secret={}".format(client_id, client_secret)
  11. payload = ""
  12. headers = {
  13. 'Content-Type': 'application/json',
  14. 'Accept': 'application/json'
  15. }
  16. response = requests.request("POST", url, headers=headers, data=payload)
  17. print(response.text)
  18. return response.json()
  19. def create_task(url, file_path, file_url):
  20. """
  21. Args:
  22. url: string, 服务请求链接
  23. file_path: 本地文件路径
  24. file_url: 文件链接
  25. Returns: 响应
  26. """
  27. file = open(file_path, 'rb').read()
  28. # 文件请求
  29. body = {
  30. "file": (os.path.basename(file_path), file, "multipart/form-data"),
  31. }
  32. # 文件链接请求
  33. # body = {
  34. # "file_url": (file_url, "multipart/form-data")
  35. # }
  36. data = {
  37. "file_name": os.path.basename(file_path),
  38. "return_para_nodes": True
  39. }
  40. response = requests.post(url, data=data, files=body)
  41. return response.json()
  42. def query_task(url, task_id):
  43. """
  44. Args:
  45. url: string, 请求链接
  46. task_id: string, task id
  47. Returns: 响应
  48. """
  49. data = {
  50. "task_id": task_id
  51. }
  52. response = requests.post(url, data=data, files=data)
  53. return response.json()
  54. if __name__ == '__main__':
  55. import time
  56. import json
  57. # client_id = 'DFIQUMXb59oGUDkvGhTw15mE'
  58. # client_secret = 'F5LkFLo4TatiLcCcJgIXbJrv5Kw04Rf0'
  59. # token = main(client_id, client_secret)['access_token']
  60. token = "24.0ab90c2e2b750b61995052ab6b94f62c.2592000.1728805729.282335-86574608"
  61. # # print(token)
  62. request_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task?access_token={token}"
  63. file_path = "D:/desktop/三峡水利/celery.pdf"
  64. response = create_task(request_host, file_path, "")
  65. print(response)
  66. time.sleep(10)
  67. task_id = response['result']['task_id']
  68. request_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task/query?access_token={token}"
  69. resp = query_task(request_host, task_id)
  70. print(resp)
  71. url = resp['result']['parse_result_url']
  72. response = requests.get(url)
  73. response.encoding = 'utf-8'
  74. with open('浙江国迈建设集团有限公司技术文件.json', 'w', encoding='utf-8') as fp:
  75. json.dump(response.json(), fp, indent=4, ensure_ascii=False)