# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2024-08-30 13:13:03
# @Last Modified by:   privacy
# @Last Modified time: 2024-09-06 09:25:00
import logging
import os
from glob import glob
from typing import List, Optional, Union

from . import celery_app
from .get_info import PdfExtractAttr
from .document_ import DocumentPreReview
from .ocr import OcrAgent
from .tools import check_scan_pdf

logger = logging.getLogger(__name__)

# Default root directory scanned by ``test_all_files`` (developer workstation path).
_DEFAULT_TEST_DATA_ROOT = "D:\\desktop\\三峡水利\\data\\0预审查初审详审测试数据"


@celery_app.task
def pic_ocr(image_bytes: Optional[bytes] = None,
            image_type: Optional[str] = None,
            image_path: Optional[str] = None) -> dict:
    """
    Run OCR on an image.

    Argument precedence:
      * ``image_bytes`` + ``image_type``  -> both are forwarded
        (``image_path`` is ignored in this case);
      * ``image_bytes`` + ``image_path``  -> both are forwarded;
      * ``image_bytes`` only              -> forwarded alone;
      * ``image_path`` only               -> forwarded alone.

    Args:
        image_bytes: raw image content
        image_type: image type, only used together with ``image_bytes``
        image_path: path of the image file

    Returns:
        Whatever ``OcrAgent.get_content`` returns, or ``{"status": "error"}``
        when neither ``image_bytes`` nor ``image_path`` is provided.
    """
    if image_bytes:
        ocr_kwargs = {"image_bytes": image_bytes}
        # image_type takes priority over image_path when bytes are supplied.
        if image_type:
            ocr_kwargs["image_type"] = image_type
        elif image_path:
            ocr_kwargs["image_path"] = image_path
    elif image_path:
        ocr_kwargs = {"image_path": image_path}
    else:
        return {"status": "error"}

    # Build the agent only once the input is known to be usable.
    return OcrAgent().get_content(**ocr_kwargs)


@celery_app.task
def common_document(file_path: str,
                    file_type: str,
                    project_name: str,
                    supplier: Optional[str] = None) -> Union[str, dict]:
    """
    Dispatch a PDF file to the matching extraction task.

    Args:
        file_path: path of the file
        file_type: file type, either '招标' (handled by ``bidding_document``)
            or '投标' (handled by ``tender_document``)
        project_name: project name
        supplier: supplier name; required when ``file_type`` is '投标'

    Returns:
        The id of the dispatched Celery task on success, otherwise a dict
        ``{"status": "error", "message": ...}`` describing the problem.
    """
    if not os.path.exists(file_path):
        return {"status": "error", "message": "File Not Found!"}

    if file_type == '招标':
        task = bidding_document.apply_async(kwargs={'file_path': file_path})
        return task.id

    if file_type == '投标':
        # tender_document builds a directory path from the supplier name, so a
        # missing supplier would only fail later inside the worker.
        if not supplier:
            return {"status": "error", "message": "Supplier Is Required!"}
        task = tender_document.apply_async(
            kwargs={
                'file_path': file_path,
                'project_name': project_name,
                'supplier': supplier,
            }
        )
        return task.id

    # Report unknown types explicitly instead of silently returning None.
    return {"status": "error", "message": f"Unsupported File Type: {file_type!r}"}


@celery_app.task
def bidding_document(file_path: str) -> dict:
    """
    Extract the contents of a bidding ('招标') document.

    Args:
        file_path: path of the bidding document

    Returns:
        A dict with the keys ``tables``, ``title``, ``content`` and ``texts``,
        holding the results of the corresponding ``PdfExtractAttr`` parsers.
    """
    agent = PdfExtractAttr(file_path=file_path)
    # Call order is kept as in the original implementation.
    texts = agent.parse_text()
    content = agent.extract_content()
    table_list = agent.parse_table_pro()
    title = agent.parse_title()
    return {
        "tables": table_list,
        "title": title,
        "content": content,
        "texts": texts,
    }


@celery_app.task
def bidding_factor(table_list: list) -> dict:
    """
    Get the detailed-review factors from the bidding tables.

    Args:
        table_list: tables extracted from a bidding document

    Returns:
        The result of ``DocumentPreReview.get_table``; an empty dict when the
        extraction raises (the failure is logged with its traceback).
    """
    dpr = DocumentPreReview()
    dpr.Bidding_tables = table_list
    try:
        return dpr.get_table()
    except Exception:
        # Best-effort task: keep returning {} but leave a trace of the failure.
        logger.exception("Failed to extract review factors from bidding tables")
        return {}


@celery_app.task
def tender_document(file_path: str, project_name: str, supplier: str) -> dict:
    """
    Extract the contents of a tender ('投标') document.

    Args:
        file_path: path of the tender document
        project_name: project name, first component of the image directory
        supplier: supplier name, second component of the image directory

    Returns:
        A dict with the keys ``outlines``, ``title``, ``texts``, ``tables``,
        ``content`` and ``images``.
    """
    agent = PdfExtractAttr(file_path=file_path)

    # Directory the extracted images are saved to (relative path).
    image_dir = os.path.join(project_name, supplier, 'extracted_images')
    # exist_ok avoids a check-then-create race between concurrent workers.
    os.makedirs(image_dir, exist_ok=True)

    texts = agent.parse_text()
    tables = agent.parse_table_pro()
    outlines = agent.parse_outline()
    content = agent.extract_content()
    images = agent.parse_image(image_dir=image_dir)
    title = agent.parse_title()
    return {
        "outlines": outlines,
        "title": title,
        "texts": texts,
        "tables": tables,
        "content": content,
        "images": images,
    }


@celery_app.task
def add(x, y):
    """Return ``x + y`` (trivial task, e.g. for checking that workers respond)."""
    return x + y


@celery_app.task(ignore_result=True)
def test_all_files(proj_name: str, data_root: str = _DEFAULT_TEST_DATA_ROOT):
    """
    Run every parser over all PDF files of one project (manual smoke test).

    Files matching ``<data_root>/<proj_name>/*/*.pdf`` are processed one by
    one. For each PDF, output paths named ``<pdf stem>-text.json``,
    ``-title.json``, ``-outline.json``, ``-image.json``, ``-table.json`` and
    ``-content.json`` are handed to the parsers, together with an
    ``extracted_images`` directory created next to the PDF. Files for which
    ``check_scan_pdf`` is truthy are skipped. An error in one file is printed
    and does not stop the remaining files.

    Args:
        proj_name: name of the project sub-directory under ``data_root``
        data_root: root directory of the test data; defaults to the original
            hard-coded workstation path
    """
    for file in glob(os.path.join(data_root, proj_name, "*", "*.pdf")):
        try:
            print('\033[32m' + f'\n\n*****{file}*****\n\n' + '\033[0m')

            if check_scan_pdf(file):
                print('扫描件')
                continue

            # Path of the PDF without its extension; outputs are siblings of it.
            stem = os.path.splitext(file)[0]
            text_path = f"{stem}-text.json"
            title_path = f"{stem}-title.json"
            outline_path = f"{stem}-outline.json"
            image_meta_path = f"{stem}-image.json"
            table_path = f"{stem}-table.json"
            content_path = f"{stem}-content.json"

            image_dir = os.path.join(os.path.dirname(file), 'extracted_images')
            os.makedirs(image_dir, exist_ok=True)

            agent = PdfExtractAttr(file_path=file)

            print(f"{file} on parse_text")
            agent.parse_text(text_path)

            print(f"{file} on parse_title")
            agent.parse_title(title_path)

            print(f"{file} on parse_outline")
            agent.parse_outline(outline_path=outline_path)

            print(f"{file} on parse_image")
            agent.parse_image(image_dir=image_dir, image_meta_path=image_meta_path)

            print(f"{file} on parse_table")
            agent.parse_table_pro(table_path=table_path)

            print(f"{file} on parse_content")
            agent.extract_content(content_path=content_path)

            # NOTE: disabled follow-up step kept from the original source:
            # dpr = DocumentPreReview()
            # dpr.get_Bidding_table(file_path=table_path)
            # print(dpr.get_table())
        except Exception as e:
            # Keep going with the next file; only report the error.
            print(f'\033[31m {e} \033[0m')