commonprocess.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-08-30 13:13:03
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-09-06 09:25:00
  6. import os
  7. from glob import glob
  8. from typing import List, Optional
  9. from . import celery_app
  10. from .get_info import PdfExtractAttr
  11. from .document_ import DocumentPreReview
  12. from .ocr import OcrAgent
  13. from .tools import check_scan_pdf
  14. @celery_app.task
  15. def pic_ocr(image_bytes: Optional[bytes] = None, image_type: Optional[str] = None, image_path: Optional[str] = None) -> dict:
  16. """
  17. 图片OCR
  18. """
  19. agent = OcrAgent()
  20. if image_bytes and image_type:
  21. return agent.get_content(image_bytes=image_bytes, image_type=image_type)
  22. elif image_bytes and image_path:
  23. return agent.get_content(image_bytes=image_bytes, image_path=image_path)
  24. elif image_bytes:
  25. return agent.get_content(image_bytes=image_bytes)
  26. elif image_path:
  27. return agent.get_content(image_path=image_path)
  28. else:
  29. return {"status": "error"}
  30. @celery_app.task
  31. def common_document(file_path: str, file_type: str, project_name: str, supplier: Optional[str] = None):
  32. """
  33. 从PDF文件中提取内容
  34. Args:
  35. file_path: 文件路径
  36. file_type: 文件类型<招标|投标>
  37. project_name: 项目名称
  38. supplier: 供应商名称,如果文件类型为投标文件,则必须提供该值
  39. Returns:
  40. ...
  41. """
  42. if not os.path.exists(file_path):
  43. return {"status": "error", "message": "File Not Found!"}
  44. if file_type == '招标':
  45. task = bidding_document.apply_async(
  46. kwargs={'file_path': file_path}
  47. )
  48. return task.id
  49. elif file_type == '投标':
  50. task = tender_document.apply_async(
  51. kwargs={'file_path': file_path, 'project_name': project_name, 'supplier': supplier}
  52. )
  53. return task.id
  54. @celery_app.task
  55. def bidding_document(file_path: str) -> dict:
  56. """
  57. 招标文件
  58. Args:
  59. file_path: 招标文件
  60. Returns:
  61. ...
  62. """
  63. agent = PdfExtractAttr(file_path=file_path)
  64. texts = agent.parse_text()
  65. content = agent.extract_content()
  66. table_list = agent.parse_table_pro()
  67. title = agent.parse_title()
  68. return {
  69. "tables": table_list,
  70. "title": title,
  71. "content": content,
  72. "texts": texts
  73. }
  74. @celery_app.task
  75. def bidding_factor(table_list: list) -> dict:
  76. """
  77. 从招标表格中获取详审因素
  78. """
  79. dpr = DocumentPreReview()
  80. dpr.Bidding_tables = table_list
  81. try:
  82. return dpr.get_table()
  83. except Exception:
  84. return {}
  85. @celery_app.task
  86. def tender_document(file_path: str, project_name: str, supplier: str) -> dict:
  87. """
  88. 投标文件
  89. """
  90. agent = PdfExtractAttr(file_path=file_path)
  91. image_dir = os.path.join(os.path.join(project_name, supplier), 'extracted_images')
  92. # 创建图片保存目录
  93. if not os.path.exists(image_dir):
  94. os.makedirs(image_dir)
  95. texts = agent.parse_text()
  96. tables = agent.parse_table_pro()
  97. outlines = agent.parse_outline()
  98. content = agent.extract_content()
  99. images = agent.parse_image(image_dir=image_dir)
  100. title = agent.parse_title()
  101. return {
  102. "outlines": outlines,
  103. "title": title,
  104. "texts": texts,
  105. "tables": tables,
  106. "content": content,
  107. "images": images,
  108. }
  109. @celery_app.task
  110. def add(x, y):
  111. return x + y
  112. @celery_app.task(ignore_result=True)
  113. def test_all_files(proj_name: str):
  114. for file in glob(f"D:\\desktop\\三峡水利\\data\\0预审查初审详审测试数据\\{proj_name}\\*\\*.pdf"):
  115. try:
  116. print('\033[32m' + f'\n\n*****{file}*****\n\n' + '\033[0m')
  117. if check_scan_pdf(file):
  118. print('扫描件')
  119. continue
  120. text_path = ''.join([file[:-4], '-text.json'])
  121. title_path = ''.join([file[:-4], '-title.json'])
  122. outline_path = ''.join([file[:-4], '-outline.json'])
  123. image_meta_path = ''.join([file[:-4], '-image.json'])
  124. table_path = ''.join([file[:-4], '-table.json'])
  125. content_path = ''.join([file[:-4], '-content.json'])
  126. image_dir = '\\'.join(['\\'.join(file.split('\\')[:-1]), 'extracted_images'])
  127. if not os.path.exists(image_dir):
  128. os.makedirs(image_dir)
  129. agent = PdfExtractAttr(file_path=file)
  130. print(f"{file} on parse_text")
  131. agent.parse_text(text_path)
  132. print(f"{file} on parse_title")
  133. agent.parse_title(title_path)
  134. print(f"{file} on parse_outline")
  135. agent.parse_outline(outline_path=outline_path)
  136. print(f"{file} on parse_image")
  137. agent.parse_image(image_dir=image_dir, image_meta_path=image_meta_path)
  138. print(f"{file} on parse_table")
  139. agent.parse_table_pro(table_path=table_path)
  140. print(f"{file} on parse_content")
  141. agent.extract_content(content_path=content_path)
  142. # dpr = DocumentPreReview()
  143. # dpr.get_Bidding_table(file_path=table_path)
  144. # print(dpr.get_table())
  145. except Exception as e:
  146. print(f'\033[31m {e} \033[0m')