# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2024-08-30 13:13:03
# @Last Modified by:   privacy
# @Last Modified time: 2024-09-06 09:25:00
import os
from glob import glob
from typing import List, Optional

from . import celery_app
from .get_info import PdfExtractAttr
from .document_ import DocumentPreReview
from .ocr import OcrAgent
from .tools import check_scan_pdf


@celery_app.task
def pic_ocr(image_bytes: Optional[bytes] = None, image_type: Optional[str] = None, image_path: Optional[str] = None) -> dict:
    """
    Run OCR on an image given as raw bytes and/or a file path.

    Args:
        image_bytes:    Raw image data, if available.
        image_type:     Image type; only forwarded together with image_bytes.
        image_path:     Path of the image; forwarded with image_bytes when no
                        image_type is given, or on its own when there are no bytes.
    Returns:
        Whatever OcrAgent.get_content returns for the chosen arguments, or
        {"status": "error"} when neither image_bytes nor image_path is provided.
    """
    ocr = OcrAgent()

    # No bytes: the only usable input left is a path.
    if not image_bytes:
        if image_path:
            return ocr.get_content(image_path=image_path)
        return {"status": "error"}

    # Bytes are present; an explicit type takes precedence over a path.
    if image_type:
        return ocr.get_content(image_bytes=image_bytes, image_type=image_type)
    if image_path:
        return ocr.get_content(image_bytes=image_bytes, image_path=image_path)
    return ocr.get_content(image_bytes=image_bytes)


@celery_app.task
def common_document(file_path: str, file_type: str, project_name: str, supplier: Optional[str] = None):
    """
    Queue the extraction task that matches the document type.

    Args:
        file_path:      Path of the file to process.
        file_type:      Document type, either '招标' or '投标'.
        project_name:   Project name; forwarded to the '投标' task only.
        supplier:       Supplier name; must be provided when file_type is '投标'.
    Returns:
        The id of the queued task on success. On failure a dict of the form
        {"status": "error", "message": ...} is returned: when the file does
        not exist, when supplier is missing for a '投标' file, or when
        file_type is not one of the two supported values.
    """
    if not os.path.exists(file_path):
        return {"status": "error", "message": "File Not Found!"}

    if file_type == '招标':
        task = bidding_document.apply_async(
            kwargs={'file_path': file_path}
        )
        return task.id

    if file_type == '投标':
        # tender_document builds a directory path from the supplier name, so a
        # missing supplier would only fail later inside the worker; reject it here.
        if supplier is None:
            return {"status": "error", "message": "Supplier is required for a tender document!"}
        task = tender_document.apply_async(
            kwargs={'file_path': file_path, 'project_name': project_name, 'supplier': supplier}
        )
        return task.id

    # Previously an unknown type fell through and returned None without explanation.
    return {"status": "error", "message": f"Unsupported file type: {file_type!r}"}


@celery_app.task
def bidding_document(file_path: str) -> dict:
    """
    Extract the contents of a bidding (招标) document.

    Args:
        file_path:     Path of the bidding document.
    Returns:
        A dict with the keys "tables", "title", "content" and "texts", each
        holding the result of the corresponding PdfExtractAttr parser.
    """
    extractor = PdfExtractAttr(file_path=file_path)

    # Parsers run in the same order as before in case they share state.
    parsed_texts = extractor.parse_text()
    parsed_content = extractor.extract_content()
    parsed_tables = extractor.parse_table_pro()
    parsed_title = extractor.parse_title()

    result = dict(
        tables=parsed_tables,
        title=parsed_title,
        content=parsed_content,
        texts=parsed_texts,
    )
    return result


@celery_app.task
def bidding_factor(table_list: list) -> dict:
    """
    Get the detailed-review factors from the tables of a bidding document.

    Args:
        table_list:    Tables to analyse; assigned to
                       DocumentPreReview.Bidding_tables before the lookup.
    Returns:
        The result of DocumentPreReview.get_table(), or an empty dict when
        that call raises.
    """
    import logging

    dpr = DocumentPreReview()
    dpr.Bidding_tables = table_list
    try:
        return dpr.get_table()
    except Exception:
        # Still best-effort (callers keep receiving {}), but the failure is
        # now recorded with its traceback instead of vanishing silently.
        logging.getLogger(__name__).exception(
            "bidding_factor: get_table() failed; returning an empty result"
        )
        return {}


@celery_app.task
def tender_document(file_path: str, project_name: str, supplier: str) -> dict:
    """
    Extract the contents of a tender (投标) document.

    Args:
        file_path:      Path of the tender document.
        project_name:   Project name; first component of the image directory.
        supplier:       Supplier name; second component of the image directory.
    Returns:
        A dict with the keys "outlines", "title", "texts", "tables",
        "content" and "images", each holding the result of the corresponding
        PdfExtractAttr parser.
    """
    agent = PdfExtractAttr(file_path=file_path)
    image_dir = os.path.join(project_name, supplier, 'extracted_images')

    # Create the image output directory. exist_ok avoids the race between an
    # existence check and the creation when several workers run concurrently.
    os.makedirs(image_dir, exist_ok=True)

    texts = agent.parse_text()
    tables = agent.parse_table_pro()
    outlines = agent.parse_outline()
    content = agent.extract_content()
    images = agent.parse_image(image_dir=image_dir)
    title = agent.parse_title()

    return {
        "outlines": outlines,
        "title": title,
        "texts": texts,
        "tables": tables,
        "content": content,
        "images": images,
    }


@celery_app.task
def add(x, y):
    """Return the result of ``x + y``."""
    total = x + y
    return total


@celery_app.task(ignore_result=True)
def test_all_files(proj_name: str):
    """
    Run every PdfExtractAttr parser over the PDF files of one test project.

    The PDFs are looked up one directory level below the project's folder
    inside a hard-coded local test-data root. For each PDF that
    check_scan_pdf does not flag, the parsers are invoked with output paths
    named after the PDF (``<name>-text.json``, ``<name>-title.json`` ...),
    and images go to an ``extracted_images`` directory next to the PDF.
    A failure on one file is printed and does not stop the remaining files.

    Args:
        proj_name:     Name of the project folder inside the test-data root.
    """
    for file in glob(f"D:\\desktop\\三峡水利\\data\\0预审查初审详审测试数据\\{proj_name}\\*\\*.pdf"):
        try:
            print('\033[32m' + f'\n\n*****{file}*****\n\n' + '\033[0m')

            # Files flagged by check_scan_pdf are reported and skipped.
            if check_scan_pdf(file):
                print('扫描件')
                continue

            # All outputs share the PDF's path without its extension.
            stem = os.path.splitext(file)[0]
            text_path = f'{stem}-text.json'
            title_path = f'{stem}-title.json'
            outline_path = f'{stem}-outline.json'
            image_meta_path = f'{stem}-image.json'
            table_path = f'{stem}-table.json'
            content_path = f'{stem}-content.json'
            image_dir = os.path.join(os.path.dirname(file), 'extracted_images')

            os.makedirs(image_dir, exist_ok=True)

            agent = PdfExtractAttr(file_path=file)

            print(f"{file} on parse_text")
            agent.parse_text(text_path)

            print(f"{file} on parse_title")
            agent.parse_title(title_path)

            print(f"{file} on parse_outline")
            agent.parse_outline(outline_path=outline_path)

            print(f"{file} on parse_image")
            agent.parse_image(image_dir=image_dir, image_meta_path=image_meta_path)

            print(f"{file} on parse_table")
            agent.parse_table_pro(table_path=table_path)

            print(f"{file} on parse_content")
            agent.extract_content(content_path=content_path)

            # Disabled step, kept for reference:
            # dpr = DocumentPreReview()
            # dpr.get_Bidding_table(file_path=table_path)
            # print(dpr.get_table())

        except Exception as e:
            # Report the error in red and move on to the next file.
            print(f'\033[31m {e} \033[0m')