# xzc / pdf_title_image
import os
import re
import json
import logging
import requests
import cv2
import numpy as np

# Client for the external OCR service
class OcrAgent:
    """Thin client for an external OCR HTTP endpoint.

    Besides the endpoint URL, the instance carries regular expressions for a
    validity period and for several kinds of certificate titles. None of the
    methods visible in this class use those patterns themselves.
    """

    def __init__(self, url, timeout=None):
        """Store the endpoint and build the pattern tables.

        Args:
            url: Address the image file is POSTed to.
            timeout: Optional timeout (seconds) forwarded to ``requests.post``.
                The default ``None`` waits indefinitely, which is what the
                client did before this parameter existed.
        """
        self.url = url
        self.timeout = timeout
        # Validity period: "<Y>年<M>月<D>日至<Y>年<M>月<D>日" or "...至长期".
        self.datetime_re = r'\d{4}年\d{1,2}月\d{1,2}日至(?:\d{4}年\d{1,2}月\d{1,2}日|长期)'
        # Title patterns for the different certificate/qualification types.
        # NOTE: the key "qualtifications" is misspelled, but it is a runtime
        # key that other code may look up, so it is left untouched.
        self.re_dict = {
            "business_license": r'营业执照',
            "deposit": r'^(?:开户许可证|[\u4e00-\u9fff]+存款账户[\u4e00-\u9fff]+)$',
            "production_license": r'\b[\u4e00-\u9fff]*许可证\b',
            "qualtifications": r'\b[\u4e00-\u9fff]*证书',
            "proof": r'\b[\u4e00-\u9fff]*证明',
        }

    def get_content(self, image_path):
        """Upload an image file to the OCR endpoint and return the decoded JSON.

        Args:
            image_path: Path of the image file to send.

        Returns:
            Whatever ``response.json()`` yields for the service's reply.

        Raises:
            ValueError: If the file cannot be opened, the request fails, or
                the reply is not valid JSON. The underlying exception is
                chained as ``__cause__`` so the real reason stays visible.
        """
        try:
            with open(image_path, 'rb') as image_file:
                # The upload is always labelled as a JPEG, whatever the file
                # on disk actually is.
                files = {"file": ("image.jpg", image_file, "image/jpeg")}
                response = requests.post(
                    self.url, files=files, timeout=self.timeout
                )
            return response.json()
        except Exception as err:
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate instead of being reported as a bad image.
            raise ValueError(f"传入图像{image_path}已损坏") from err

    # Meant to detect the legal representative's signature at a fixed position
    def signature_recognition(self, image_path):
        """Run OCR on the image; signature detection itself is not implemented.

        NOTE(review): the OCR result is fetched and then discarded, and the
        method returns ``None``. Only the errors of ``get_content`` surface.
        """
        self.get_content(image_path=image_path)


def remove_red_seal(input_img, threshold_scale=1.0):
    """Binarize an image on its red channel to suppress red seal marks.

    Otsu's method picks a threshold on the third channel of the image; the
    threshold is optionally scaled, and the channel is then binarized with
    it. Pixels above the cut-off become 255, the rest 0. The intent, per the
    original author's comment, is that red seal ink ends up on the white side.

    Args:
        input_img: Image array that splits into exactly three channels; they
            are unpacked as blue, green, red (presumably BGR as returned by
            ``cv2.imread`` -- confirm for other sources).
        threshold_scale: Multiplier applied to the Otsu threshold before the
            final binarization. The default 1.0 reproduces the previously
            hard-coded behavior.

    Returns:
        A 3-channel array whose channels all hold the same binary mask.
    """
    # Only the red channel takes part in the computation.
    _, _, red_c = cv2.split(input_img)
    # First pass: let Otsu's method choose the threshold automatically.
    otsu_thresh, _ = cv2.threshold(red_c, 0, 255, cv2.THRESH_OTSU)
    # Tunable adjustment of the automatic threshold.
    cutoff = int(otsu_thresh * threshold_scale)
    # Second pass: binarize with the adjusted threshold.
    _, red_mask = cv2.threshold(red_c, cutoff, 255, cv2.THRESH_BINARY)
    # Replicate the mask so callers still receive a 3-channel image.
    return np.dstack((red_mask, red_mask, red_mask))


# Marks the fixed rectangular region in which signature ink is expected
def ink_recognition(input_img, sign_path, meta: dict):
    """Outline the candidate signature region and save the annotated image.

    NOTE(review): despite the name, no ink detection happens here. The
    function only draws the rectangle described by ``meta`` and writes the
    result to ``sign_path``.

    Args:
        input_img: Image array to annotate. It is copied first, so the
            caller's array is left untouched.
        sign_path: Destination file path for the annotated image.
        meta: Mapping with the keys "left", "right", "top" and "bottom"
            giving the box edges in pixels.

    Returns:
        The status reported by ``cv2.imwrite``.
    """
    # OCR coordinates come from JSON and may be floats; cv2 needs integers.
    left, right = int(meta["left"]), int(meta["right"])
    top, bottom = int(meta["top"]), int(meta["bottom"])
    # cv2.rectangle draws in place, so work on a copy of the input.
    annotated = input_img.copy()
    # Green outline, 2 px wide.
    cv2.rectangle(annotated, (left, top), (right, bottom), (0, 255, 0), 2)
    return cv2.imwrite(sign_path, annotated)


def locate_keywords(image_info, keywords):
    """Map each keyword to the box of the OCR entry whose text contains it.

    Args:
        image_info: Iterable of OCR entries, each a mapping with a "word"
            string and a "rect" mapping holding "left", "top", "width" and
            "height".
        keywords: Keywords in priority order. An entry is filed under the
            first keyword it contains and under no other.

    Returns:
        Dict from keyword to ``{"word", "left", "right", "top", "bottom"}``.
        When several entries contain the same keyword, the last one wins.
    """
    key_pos = {}
    for info in image_info:
        word = info['word']
        rect = info['rect']
        left, top = rect['left'], rect['top']
        for keyword in keywords:
            if keyword in word:
                key_pos[keyword] = {
                    "word": word,
                    "left": left,
                    "right": left + rect['width'],
                    "top": top,
                    "bottom": top + rect['height'],
                }
                break
    return key_pos


if __name__ == "__main__":
    # Ad-hoc driver: endpoint and sample paths are hard-coded.
    url = "http://120.48.103.13:18000/ctr_ocr"
    ocr = OcrAgent(url=url)
    test_img = "/home/stf/miner_pdf/test_img/example_1.png"
    save_path = '/home/stf/miner_pdf/test_img/example_1_roi.png'
    sign_path = '/home/stf/miner_pdf/test_img/example_1_sign.png'

    input_img = cv2.imread(test_img)
    if input_img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise SystemExit(f"cannot read image: {test_img}")

    # OCR runs on the seal-free version of the page.
    remove_seal = remove_red_seal(input_img)
    cv2.imwrite(save_path, remove_seal)

    keywords = ['投标函', '法定代表人CA电子印章', '法定代表人：']
    content = ocr.get_content(save_path)
    # Collect every keyword first, then decide once on the complete result.
    key_pos = locate_keywords(content["rawjson"]["ret"], keywords)

    legal_rep = key_pos.get('法定代表人：')
    ca_seal = key_pos.get('法定代表人CA电子印章')
    # Width (pixels) searched to the right of a lone "法定代表人：" label.
    sign_box_width = 100

    if legal_rep is not None and ca_seal is not None:
        # The signature should sit between the label and the CA seal caption.
        label_is_bare = legal_rep['word'].endswith('法定代表人：')
        caption_is_bare = ca_seal['word'] == '法定代表人CA电子印章'
        if not (label_is_bare and caption_is_bare):
            # Extra text was recognized next to the label or the caption.
            print("找寻到签名")
        else:
            ink_recognition(
                input_img=remove_seal,
                sign_path=sign_path,
                meta={
                    "left": legal_rep['right'],
                    "right": ca_seal['left'],
                    "top": min(legal_rep['top'], ca_seal['top']),
                    "bottom": max(legal_rep['bottom'], ca_seal['bottom']),
                },
            )

    elif ca_seal is not None:
        # Label and caption were recognized as one entry; any text left over
        # after stripping both is taken to be the signature.
        leftover = (
            ca_seal['word']
            .replace('法定代表人CA电子印章', '')
            .replace('法定代表人：', '')
        )
        if leftover != '':
            print("found sign")
        else:
            print("No FOUND")

    elif legal_rep is not None:
        # The signature is either part of the recognized text or to its right.
        if not legal_rep['word'].endswith('法定代表人：'):
            print("found sign")
        else:
            # Search to the RIGHT of the label: start at its right edge, the
            # same anchor the two-keyword branch uses.
            ink_recognition(
                input_img=remove_seal,
                sign_path=sign_path,
                meta={
                    "left": legal_rep['right'],
                    "right": legal_rep['right'] + sign_box_width,
                    "top": legal_rep['top'],
                    "bottom": legal_rep['bottom'],
                },
            )

    else:
        # Neither signature-related keyword was found (this also covers an
        # empty OCR result).
        print("NO FOUND")