xzc
/
pdf_title_image


			
							12345678910111213141516171819202122232425262728293031323334353637383940414243
# Client for the external OCR service interface
import os
from typing import List

from requests import post


class OcrAgent:
    """Small client that uploads images to an external OCR HTTP endpoint."""

    def __init__(self, url):
        """Remember the endpoint to talk to.

        Args:
            url: Address that ``get_content`` sends its POST requests to.
        """
        self.url = url

    def get_content(self, image_path: str) -> dict:
        """Upload one image file and return the decoded JSON reply.

        The file is opened in binary mode and sent as the multipart field
        ``file``. Errors raised while opening the file or performing the
        HTTP request propagate unchanged; only a reply body that cannot be
        decoded as JSON is reported as a damaged image.

        Args:
            image_path: Path of the image file to upload.

        Returns:
            The value produced by ``response.json()`` for the service reply.

        Raises:
            OSError: If ``image_path`` cannot be opened.
            ValueError: If the reply cannot be decoded as JSON. The original
                decoding error is attached as ``__cause__``.
        """
        with open(image_path, 'rb') as image_file:
            # NOTE: the upload is always labelled as a JPEG ("image.jpg",
            # "image/jpeg"), whatever the real type of the file is.
            files = {"file": ("image.jpg", image_file, "image/jpeg")}
            response = post(self.url, files=files)

        # Keep the try body minimal: only the JSON decode is evidence of a
        # bad image. A wider try would relabel unrelated ValueErrors (for
        # example an invalid URL) as image corruption.
        try:
            return response.json()
        except ValueError as err:
            raise ValueError(f"传入图像{image_path}已损坏") from err


def find_current_row(ocr_result: List[dict], top: int, bottom: int, float_range: int = 5):
    """Select the OCR entries whose box lies inside a vertical band.

    The band runs from ``bottom`` up to ``top`` and is widened by
    ``float_range`` on both sides. For each entry the upper edge is
    ``rect['top']`` and the lower edge is ``rect['top'] - rect['height']``.
    An entry is kept when its upper edge is not above the widened top, its
    lower edge is not below the widened bottom, and the upper edge is
    strictly greater than the lower edge.

    Args:
        ocr_result: Entries, each carrying a ``'rect'`` mapping with
            ``'top'`` and ``'height'`` values.
        top: Upper limit of the band before widening.
        bottom: Lower limit of the band before widening.
        float_range: Non-negative tolerance added to ``top`` and subtracted
            from ``bottom``.

    Returns:
        A new list with the matching entries, in their input order.
    """
    assert float_range >= 0
    upper_limit = top + float_range
    lower_limit = bottom - float_range

    def _fits(entry):
        # Read both edges before comparing so that a missing key fails in
        # the same place regardless of how the comparison short-circuits.
        box_top = entry['rect']['top']
        box_bottom = entry['rect']['top'] - entry['rect']['height']
        return upper_limit >= box_top > box_bottom >= lower_limit

    return [entry for entry in ocr_result if _fits(entry)]


if __name__ == '__main__':
    # Manual smoke run: upload one sample image to the OCR endpoint and
    # print whatever the service sends back.
    ocr_client = OcrAgent("http://120.48.103.13:18000/ctr_ocr")
    sample_dir = 'D:\\desktop\\三峡水利\\data\\projects\\2022-2025年度三峡电站9台机组检修密封加工制作重新招标\\投标\\东方电气\\extracted_images\\'
    sample_image = os.path.join(sample_dir, 'image_page_27_1.jpg')
    reply = ocr_client.get_content(sample_image)
    print(reply)