textmind_ocr.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. import requests, os, time, json, base64
  2. def create_task(url, file_path, file_url):
  3. """
  4. Args:
  5. url: string, 服务请求链接
  6. file_path: 本地文件路径
  7. file_url: 文件链接
  8. Returns: 响应
  9. """
  10. file = open(file_path, 'rb').read()
  11. # 文件请求
  12. body = {
  13. "file": (os.path.basename(file_path), file, "multipart/form-data"),
  14. }
  15. data = {
  16. "file_name": os.path.basename(file_path),
  17. "return_para_nodes": True
  18. }
  19. response = requests.post(url, data=data, files=body)
  20. return response.json()
  21. def create_task_1(url, file_path, file_url):
  22. """
  23. Args:
  24. url: string, 服务请求链接
  25. file_path: 本地文件路径
  26. file_url: 文件链接
  27. Returns: 响应
  28. """
  29. # 文件请求
  30. with open(file_path, "rb") as f:
  31. file_data = base64.b64encode(f.read())
  32. data = {
  33. "file_data": file_data,
  34. "file_url": file_url,
  35. "file_name": os.path.basename(file_path)
  36. }
  37. # 文档切分参数,非必传
  38. # return_doc_chunks = json.dumps({"switch": True, "chunk_size": -1})
  39. # data["return_doc_chunks"] = return_doc_chunks
  40. headers = {'Content-Type': 'application/x-www-form-urlencoded'}
  41. response = requests.post(url, headers=headers, data=data)
  42. return response.json()
  43. def query_task(url, task_id):
  44. """
  45. Args:
  46. url: string, 请求链接
  47. task_id: string, task id
  48. Returns: 响应
  49. """
  50. data = {
  51. "task_id": task_id
  52. }
  53. response = requests.post(url, data=data, files=data)
  54. return response.json()
  55. def query_task_1(url, task_id):
  56. """
  57. Args:
  58. url: string, 请求链接
  59. task_id: string, task id
  60. Returns: 响应
  61. """
  62. data = {
  63. "task_id": task_id
  64. }
  65. headers = {'Content-Type': 'application/x-www-form-urlencoded'}
  66. response = requests.post(url, headers=headers, data=data)
  67. return response.json()
  68. def request1(bidderFile,nums:int=1):
  69. try:
  70. response = create_task_1(request_host, bidderFile, "")
  71. print('res1 :',response)
  72. task_id = response['result']['task_id']
  73. if not task_id: raise ValueError('task_id is None')
  74. except Exception as e:
  75. print("request1 :",e)
  76. time.sleep(10)
  77. nums += 1
  78. if nums > 100: return
  79. task_id = request1(bidderFile, nums)
  80. return task_id
  81. def request2(task_id,nums:int=1):
  82. try:
  83. resp = query_task_1(request_query_host, task_id)
  84. print('res2 :',resp)
  85. url = resp['result']['parse_result_url']
  86. response = requests.get(url)
  87. response.encoding = 'utf-8'
  88. response.json()
  89. except Exception as e:
  90. print("request2 :",e)
  91. time.sleep(20)
  92. nums += 1
  93. if nums > 500: return
  94. response = request2(task_id,nums)
  95. return response
  96. token = "24.87693e5dd8c2d7d7accf260bb2d265d2.2592000.1733970962.282335-86574608"
  97. # request_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task?access_token={token}"
  98. request_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task?access_token={token}" # 更新
  99. # request_query_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task/query?access_token={token}"
  100. request_query_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task/query?access_token={token}" # 更新
  101. # 测试pdf文件
  102. # file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/结果测试数据/1-监控系统自主可控大型PLC适配研究_中控技术产品_采购程序文件_2/投标文件/北京中天华拓工程技术有限公司-投标文件.pdf"
  103. file_path = "data/zhaocai_datasets/30份数据整理/1-2021_2022年三峡电站左岸厂房中央空调系统主机设备改造/投标文件/广东申菱环境系统股份有限公司.pdf"
  104. # time.sleep(5)
  105. task_id = request1(file_path)
  106. print('1 :',task_id)
  107. time.sleep(10)
  108. response = request2(task_id)
  109. print('2 file_name :',response.json()['file_name'])
  110. # 保存textmind解析结果
  111. with open('data/预审查数据/textmind_result/2021_2022年三峡电站左岸厂房中央空调系统主机设备改造_广东申菱环境系统股份有限公司.json', 'w', encoding='utf-8') as fp:
  112. json.dump(response.json(), fp, indent=4, ensure_ascii=False)