# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2024-08-28 13:17:32
# @Last Modified by: privacy
# @Last Modified time: 2024-09-05 09:25:28
import os
from typing import List, Optional

from requests import post

from celery_tasks.configure import OCRConf
from celery_tasks.singleton import Singleton


class OcrAgent(Singleton):
    """Client that uploads an image to the OCR HTTP endpoint at ``OCRConf.url``."""

    def __init__(self):
        super().__init__()
        self.url = OCRConf.url

    def is_file_exists(self, file_path: str) -> bool:
        """Return True when ``file_path`` exists on disk."""
        return os.path.exists(file_path)

    def estimate_postfix(self, file_path: str) -> str:
        """Guess the image MIME type from the (case-insensitive) file extension.

        Returns:
            ``"image/jpeg"`` for ``.jpg``/``.jpeg``, ``"image/png"`` for
            ``.png``, and the sentinel ``"UNK"`` for anything else.
        """
        filepath_lower = file_path.lower()
        if filepath_lower.endswith(('.jpg', '.jpeg')):
            return "image/jpeg"
        if filepath_lower.endswith('.png'):
            return "image/png"
        return "UNK"

    def get_content(self,
                    image_bytes: Optional[bytes] = None,
                    image_type: Optional[str] = 'image/jpeg',
                    image_path: Optional[str] = None) -> dict:
        """Send an image to the OCR service and return the decoded JSON reply.

        The image may be supplied as raw bytes, as a path, or both. When a
        path is given, the MIME type is always derived from its extension
        (overriding ``image_type``); when bytes are also given, the path is
        used only for that purpose and the file is not read.

        Args:
            image_bytes: Raw image content.
            image_type: MIME type used when only ``image_bytes`` is given.
            image_path: Path of the image file.

        Returns:
            The JSON body of the OCR service response.

        Raises:
            ValueError: If neither bytes nor a path is given, if the path
                must be read but does not exist, if the extension is not
                jpg/jpeg/png, or if the request or JSON decoding fails.
        """
        if not image_bytes and not image_path:
            raise ValueError("Image File Not Exists!")

        if image_path:
            # Only a path that actually has to be read must exist. Without
            # this check a missing file used to surface as a misleading
            # "parse failed" error raised from the request step.
            if not image_bytes and not self.is_file_exists(image_path):
                raise ValueError("Image File Not Exists!")

            image_type = self.estimate_postfix(file_path=image_path)
            if image_type == 'UNK':
                raise ValueError("Image Type Error!")

            if not image_bytes:
                with open(image_path, 'rb') as image:
                    image_bytes = image.read()

        # The upload always uses the fixed name "image.jpg"; the MIME type
        # field carries the real format.
        files = {"file": ("image.jpg", image_bytes, image_type)}

        try:
            response = post(self.url, files=files)
            return response.json()
        except Exception as err:
            # Message means "image parsing failed"; chain the cause so the
            # underlying network/decoding error stays visible in tracebacks.
            raise ValueError("图片解析失败") from err


def find_current_row(ocr_result: List[dict], top: int, bottom: int, float_range: int = 5) -> List[dict]:
    """Select the OCR items whose box lies inside a vertical window.

    Each item must provide ``item['rect']['top']`` and
    ``item['rect']['height']``. The lower edge of a box is computed as
    ``top - height``, so larger values are higher up and ``top`` is
    expected to be numerically greater than ``bottom``.

    Args:
        ocr_result: OCR items to filter.
        top: Upper bound of the window.
        bottom: Lower bound of the window.
        float_range: Non-negative tolerance by which the window is widened
            on both sides.

    Returns:
        The items, in input order, whose box has positive height and fits
        entirely within the widened window.
    """
    assert float_range >= 0
    upper = top + float_range
    lower = bottom - float_range
    return [
        item
        for item in ocr_result
        if upper
        >= item['rect']['top']
        > item['rect']['top'] - item['rect']['height']
        >= lower
    ]


if __name__ == '__main__':
    # Manual smoke test against a local sample image.
    agent = OcrAgent()
    res = agent.get_content(
        image_path=os.path.join('D:\\desktop\\三峡水利\\data\\projects\\2022-2025年度三峡电站9台机组检修密封加工制作重新招标\\投标\\东方电气\\extracted_images\\', 'image_page_27_1.jpg')
    )
    print(res)