123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173 |
- # -*- coding: utf-8 -*-
- # @Author: privacy
- # @Date: 2024-08-30 13:13:03
- # @Last Modified by: privacy
- # @Last Modified time: 2024-12-02 17:07:07
- import os
- from glob import glob
- from typing import List, Optional
- from . import celery_app
- from celery_tasks.get_info import PdfExtractAttr
- from celery_tasks.document_ import DocumentPreReview
- from celery_tasks.ocr import OcrAgent
- from celery_tasks.tools import check_scan_pdf
@celery_app.task
def pic_ocr(image_bytes: Optional[bytes] = None, image_type: Optional[str] = None, image_path: Optional[str] = None) -> dict:
    """Run OCR on an image given as raw bytes and/or a file path.

    The keyword arguments forwarded to ``OcrAgent.get_content`` are picked
    by precedence:

    1. ``image_bytes`` + ``image_type`` (``image_path`` is not forwarded),
    2. ``image_bytes`` + ``image_path``,
    3. ``image_bytes`` alone,
    4. ``image_path`` alone.

    Falsy values (``None``, ``b""``, ``""``) count as "not supplied".

    Args:
        image_bytes: Raw image data.
        image_type: Image type string, only used together with ``image_bytes``.
        image_path: Path of the image.

    Returns:
        The result of ``OcrAgent.get_content``, or ``{"status": "error"}``
        when neither ``image_bytes`` nor ``image_path`` is supplied.
    """
    # The agent is created up front, before any argument check.
    ocr = OcrAgent()

    if image_bytes:
        call_kwargs = {"image_bytes": image_bytes}
        # image_type wins over image_path when both accompany the bytes.
        if image_type:
            call_kwargs["image_type"] = image_type
        elif image_path:
            call_kwargs["image_path"] = image_path
    elif image_path:
        call_kwargs = {"image_path": image_path}
    else:
        return {"status": "error"}

    return ocr.get_content(**call_kwargs)
@celery_app.task
def common_document(file_path: str, file_type: str, project_name: str, supplier: Optional[str] = None):
    """Queue the extraction task that matches the document type.

    Args:
        file_path: Path of the file to process; it must exist.
        file_type: Document kind. '招标' is routed to ``bidding_document``
            and '投标' is routed to ``tender_document``.
        project_name: Project name, forwarded to ``tender_document`` only.
        supplier: Supplier name. Required when ``file_type`` is '投标'.

    Returns:
        The id of the queued Celery task on success. Otherwise a dict of the
        form ``{"status": "error", "message": ...}`` when the file does not
        exist, a '投标' document is submitted without a supplier, or
        ``file_type`` is not one of the two supported values.
    """
    if not os.path.exists(file_path):
        return {"status": "error", "message": "File Not Found!"}

    if file_type == '招标':
        task = bidding_document.apply_async(
            kwargs={'file_path': file_path}
        )
        return task.id

    if file_type == '投标':
        # Reject a missing supplier here instead of letting the child task
        # fail later when it builds the image directory path from it.
        if supplier is None:
            return {"status": "error", "message": "Supplier is required for file type '投标'!"}
        task = tender_document.apply_async(
            kwargs={'file_path': file_path, 'project_name': project_name, 'supplier': supplier}
        )
        return task.id

    # Unknown document kind: report it explicitly rather than returning None.
    return {"status": "error", "message": f"Unsupported file type: {file_type!r}"}
@celery_app.task
def bidding_document(file_path: str) -> dict:
    """Extract texts, content, tables and title from a bidding document.

    Args:
        file_path: Path of the bidding file handed to ``PdfExtractAttr``.

    Returns:
        A dict with the keys ``"tables"``, ``"title"``, ``"content"`` and
        ``"texts"``, each holding the result of the corresponding
        ``PdfExtractAttr`` method.
    """
    extractor = PdfExtractAttr(file_path=file_path)

    # Call order is deliberate (text, content, tables, title): the
    # extractor may keep state between calls.
    text_result = extractor.parse_text()
    content_result = extractor.extract_content()
    table_result = extractor.parse_table_pro()

    return {
        "tables": table_result,
        "title": extractor.parse_title(),
        "content": content_result,
        "texts": text_result,
    }
@celery_app.task
def bidding_factor(table_list: list) -> dict:
    """Extract the detailed-review factors from bidding tables.

    Args:
        table_list: Tables assigned to ``DocumentPreReview.Bidding_tables``
            before ``get_table()`` is called.

    Returns:
        The result of ``DocumentPreReview.get_table()``, or an empty dict
        when that call raises.
    """
    # Function-scope import so this block brings its own dependency.
    import logging

    reviewer = DocumentPreReview()
    reviewer.Bidding_tables = table_list
    try:
        return reviewer.get_table()
    except Exception:
        # Best-effort task: keep returning an empty result on failure, but
        # record the exception type and traceback instead of a bare message.
        logging.getLogger(__name__).exception(
            "bidding_factor: failed to extract review factors"
        )
        return {}
@celery_app.task
def tender_document(file_path: str, project_name: str, supplier: str) -> dict:
    """Extract outlines, title, texts, tables, content and images from a tender document.

    Extracted images are written under
    ``<project_name>/<supplier>/extracted_images``, which is created when
    it does not exist yet.

    Args:
        file_path: Path of the tender file handed to ``PdfExtractAttr``.
        project_name: Project name; first component of the image directory.
        supplier: Supplier name; second component of the image directory.

    Returns:
        A dict with the keys ``"outlines"``, ``"title"``, ``"texts"``,
        ``"tables"``, ``"content"`` and ``"images"``, each holding the
        result of the corresponding ``PdfExtractAttr`` method.
    """
    agent = PdfExtractAttr(file_path=file_path)

    # Create the image output directory. exist_ok=True avoids the
    # check-then-create race when several workers handle the same
    # project/supplier at once.
    image_dir = os.path.join(project_name, supplier, 'extracted_images')
    os.makedirs(image_dir, exist_ok=True)

    texts = agent.parse_text()
    tables = agent.parse_table_pro()
    outlines = agent.parse_outline()
    content = agent.extract_content()
    images = agent.parse_image(image_dir=image_dir)
    title = agent.parse_title()

    return {
        "outlines": outlines,
        "title": title,
        "texts": texts,
        "tables": tables,
        "content": content,
        "images": images,
    }
- # @celery_app.task(ignore_result=True)
- # def test_all_files(proj_name: str):
- # for file in glob(f"D:\\desktop\\三峡水利\\data\\0预审查初审详审测试数据\\{proj_name}\\*\\*.pdf"):
- # try:
- # print('\033[32m' + f'\n\n*****{file}*****\n\n' + '\033[0m')
- # if check_scan_pdf(file):
- # print('扫描件')
- # continue
- # text_path = ''.join([file[:-4], '-text.json'])
- # title_path = ''.join([file[:-4], '-title.json'])
- # outline_path = ''.join([file[:-4], '-outline.json'])
- # image_meta_path = ''.join([file[:-4], '-image.json'])
- # table_path = ''.join([file[:-4], '-table.json'])
- # content_path = ''.join([file[:-4], '-content.json'])
- # image_dir = '\\'.join(['\\'.join(file.split('\\')[:-1]), 'extracted_images'])
- # if not os.path.exists(image_dir):
- # os.makedirs(image_dir)
- # agent = PdfExtractAttr(file_path=file)
- # print(f"{file} on parse_text")
- # agent.parse_text(text_path)
- # print(f"{file} on parse_title")
- # agent.parse_title(title_path)
- # print(f"{file} on parse_outline")
- # agent.parse_outline(outline_path=outline_path)
- # print(f"{file} on parse_image")
- # agent.parse_image(image_dir=image_dir, image_meta_path=image_meta_path)
- # print(f"{file} on parse_table")
- # agent.parse_table_pro(table_path=table_path)
- # print(f"{file} on parse_content")
- # agent.extract_content(content_path=content_path)
- # # dpr = DocumentPreReview()
- # # dpr.get_Bidding_table(file_path=table_path)
- # # print(dpr.get_table())
- # except Exception as e:
- # print(f'\033[31m {e} \033[0m')
|