# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2024-08-30 13:13:03
# @Last Modified by:   privacy
# @Last Modified time: 2024-12-02 17:07:07
import logging
import os
from glob import glob
from typing import List, Optional, Union

from . import celery_app
from celery_tasks.get_info import PdfExtractAttr
from celery_tasks.document_ import DocumentPreReview
from celery_tasks.ocr import OcrAgent
from celery_tasks.tools import check_scan_pdf

logger = logging.getLogger(__name__)


@celery_app.task
def pic_ocr(image_bytes: Optional[bytes] = None,
            image_type: Optional[str] = None,
            image_path: Optional[str] = None) -> dict:
    """Run OCR on an image via ``OcrAgent.get_content``.

    The arguments are forwarded with the following precedence:

    * ``image_bytes`` + ``image_type`` (``image_path`` is ignored),
    * ``image_bytes`` + ``image_path``,
    * ``image_bytes`` alone,
    * ``image_path`` alone.

    Args:
        image_bytes: Raw image content.
        image_type: Image type passed to the OCR agent together with the bytes.
        image_path: Path of the image.

    Returns:
        Whatever ``OcrAgent.get_content`` returns, or ``{"status": "error"}``
        when neither ``image_bytes`` nor ``image_path`` is provided.
    """
    # Nothing to recognise: fail fast without constructing the OCR agent.
    if not image_bytes and not image_path:
        return {"status": "error"}

    agent = OcrAgent()

    if not image_bytes:
        # Only a path was given; image_type is not forwarded in this case.
        return agent.get_content(image_path=image_path)

    kwargs = {"image_bytes": image_bytes}
    if image_type:
        kwargs["image_type"] = image_type
    elif image_path:
        kwargs["image_path"] = image_path
    return agent.get_content(**kwargs)


@celery_app.task
def common_document(file_path: str,
                    file_type: str,
                    project_name: str,
                    supplier: Optional[str] = None) -> Union[str, dict]:
    """Dispatch a PDF file to the matching extraction task.

    Args:
        file_path: Path of the PDF file.
        file_type: Either ``'招标'`` (handled by ``bidding_document``) or
            ``'投标'`` (handled by ``tender_document``).
        project_name: Project name; forwarded to ``tender_document``.
        supplier: Supplier name. Required when ``file_type`` is ``'投标'``.

    Returns:
        The id of the queued Celery task on success, otherwise a dict of the
        form ``{"status": "error", "message": ...}``.
    """
    if not os.path.exists(file_path):
        return {"status": "error", "message": "File Not Found!"}

    if file_type == '招标':
        task = bidding_document.apply_async(
            kwargs={'file_path': file_path}
        )
        return task.id

    if file_type == '投标':
        # tender_document builds a directory path from the supplier name, so a
        # missing supplier would only fail later inside the worker.
        if supplier is None:
            return {
                "status": "error",
                "message": "Supplier is required for tender documents!",
            }
        task = tender_document.apply_async(
            kwargs={
                'file_path': file_path,
                'project_name': project_name,
                'supplier': supplier,
            }
        )
        return task.id

    # Previously an unknown type fell through and returned None silently.
    return {
        "status": "error",
        "message": f"Unsupported file type: {file_type!r}",
    }


@celery_app.task
def bidding_document(file_path: str) -> dict:
    """Extract text, content, tables and title from a bidding document.

    Args:
        file_path: Path of the bidding PDF file.

    Returns:
        A dict with the keys ``"tables"``, ``"title"``, ``"content"`` and
        ``"texts"``, each holding the result of the corresponding
        ``PdfExtractAttr`` method.
    """
    agent = PdfExtractAttr(file_path=file_path)
    # Call order is kept as-is; whether the parser methods depend on each
    # other cannot be told from this file.
    texts = agent.parse_text()
    content = agent.extract_content()
    table_list = agent.parse_table_pro()
    title = agent.parse_title()
    return {
        "tables": table_list,
        "title": title,
        "content": content,
        "texts": texts,
    }


@celery_app.task
def bidding_factor(table_list: list) -> dict:
    """Get the detailed-review factors from the bidding tables.

    Args:
        table_list: Tables assigned to ``DocumentPreReview.Bidding_tables``.

    Returns:
        The result of ``DocumentPreReview.get_table()``, or an empty dict if
        that call raises.
    """
    dpr = DocumentPreReview()
    dpr.Bidding_tables = table_list
    try:
        return dpr.get_table()
    except Exception:
        # Best-effort task: keep returning {} but record the full traceback
        # instead of printing only the exception message.
        logger.exception("Failed to extract review factors from bidding tables")
        return {}


@celery_app.task
def tender_document(file_path: str, project_name: str, supplier: str) -> dict:
    """Extract text, tables, outline, content, images and title from a tender document.

    Extracted images are written to ``<project_name>/<supplier>/extracted_images``.

    Args:
        file_path: Path of the tender PDF file.
        project_name: Project name; first component of the image directory.
        supplier: Supplier name; second component of the image directory.

    Returns:
        A dict with the keys ``"outlines"``, ``"title"``, ``"texts"``,
        ``"tables"``, ``"content"`` and ``"images"``.
    """
    agent = PdfExtractAttr(file_path=file_path)
    image_dir = os.path.join(project_name, supplier, 'extracted_images')
    # Create the image directory; exist_ok avoids a race between concurrent
    # workers that handle the same project/supplier.
    os.makedirs(image_dir, exist_ok=True)

    texts = agent.parse_text()
    tables = agent.parse_table_pro()
    outlines = agent.parse_outline()
    content = agent.extract_content()
    images = agent.parse_image(image_dir=image_dir)
    title = agent.parse_title()
    return {
        "outlines": outlines,
        "title": title,
        "texts": texts,
        "tables": tables,
        "content": content,
        "images": images,
    }


# NOTE: disabled local batch-test helper, kept verbatim for reference. It is
# the only user of the `glob` and `check_scan_pdf` imports above.
# @celery_app.task(ignore_result=True)
# def test_all_files(proj_name: str):
#     for file in glob(f"D:\\desktop\\三峡水利\\data\\0预审查初审详审测试数据\\{proj_name}\\*\\*.pdf"):
#         try:
#             print('\033[32m' + f'\n\n*****{file}*****\n\n' + '\033[0m')
#             if check_scan_pdf(file):
#                 print('扫描件')
#                 continue
#             text_path = ''.join([file[:-4], '-text.json'])
#             title_path = ''.join([file[:-4], '-title.json'])
#             outline_path = ''.join([file[:-4], '-outline.json'])
#             image_meta_path = ''.join([file[:-4], '-image.json'])
#             table_path = ''.join([file[:-4], '-table.json'])
#             content_path = ''.join([file[:-4], '-content.json'])
#             image_dir = '\\'.join(['\\'.join(file.split('\\')[:-1]), 'extracted_images'])
#             if not os.path.exists(image_dir):
#                 os.makedirs(image_dir)
#             agent = PdfExtractAttr(file_path=file)
#             print(f"{file} on parse_text")
#             agent.parse_text(text_path)
#             print(f"{file} on parse_title")
#             agent.parse_title(title_path)
#             print(f"{file} on parse_outline")
#             agent.parse_outline(outline_path=outline_path)
#             print(f"{file} on parse_image")
#             agent.parse_image(image_dir=image_dir, image_meta_path=image_meta_path)
#             print(f"{file} on parse_table")
#             agent.parse_table_pro(table_path=table_path)
#             print(f"{file} on parse_content")
#             agent.extract_content(content_path=content_path)
#             # dpr = DocumentPreReview()
#             # dpr.get_Bidding_table(file_path=table_path)
#             # print(dpr.get_table())
#         except Exception as e:
#             print(f'\033[31m {e} \033[0m')