123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165 |
- import os
- import re
- import json
- import logging
- import requests
- import cv2
- import numpy as np
class OcrAgent:
    """Client for an external OCR HTTP service.

    Attributes:
        url: Endpoint the image is POSTed to.
        timeout: Value forwarded to ``requests.post(timeout=...)``; ``None``
            (the default) means no timeout is applied.
        datetime_re: Regex source matching a validity period written as
            ``YYYY年M月D日至YYYY年M月D日`` or ``YYYY年M月D日至长期``.
        re_dict: Regex sources keyed by certificate / qualification type.
    """

    def __init__(self, url, timeout=None):
        self.url = url
        self.timeout = timeout
        self.datetime_re = r'\d{4}年\d{1,2}月\d{1,2}日至(?:\d{4}年\d{1,2}月\d{1,2}日|长期)'
        # Regexes for the different certificate / qualification types.
        self.re_dict = {
            "business_license": r'营业执照',
            "deposit": r'^(?:开户许可证|[\u4e00-\u9fff]+存款账户[\u4e00-\u9fff]+)$',
            "production_license": r'\b[\u4e00-\u9fff]*许可证\b',
            "qualtifications": r'\b[\u4e00-\u9fff]*证书',
            "proof": r'\b[\u4e00-\u9fff]*证明',
        }

    def get_content(self, image_path):
        """Upload the image at *image_path* and return the decoded JSON reply.

        The file is always sent under the name ``image.jpg`` with the
        ``image/jpeg`` content type, whatever its real format is.

        Raises:
            ValueError: If the file cannot be opened, the request fails, or
                the response body is not JSON. The message always blames the
                image; the real reason is available as ``__cause__``.
        """
        try:
            with open(image_path, 'rb') as image_file:
                files = {"file": ("image.jpg", image_file, "image/jpeg")}
                response = requests.post(
                    self.url, files=files, timeout=self.timeout
                )
            return response.json()
        except Exception as err:
            # Keep the historical exception type and message for callers, but
            # chain the original error so the root cause is not lost.
            raise ValueError(f"传入图像{image_path}已损坏") from err

    # Intended to detect whether the legal representative's signature is
    # present at a fixed position.
    def signature_recognition(self, image_path):
        """Fetch the OCR result for *image_path*.

        NOTE(review): the detection logic is not implemented here; the OCR
        result is discarded and the method returns ``None``.
        """
        self.get_content(image_path=image_path)
def remove_red_seal(input_img, threshold_scale=1.0):
    """Binarize an image on its red channel so that red seal ink drops out.

    The third channel of *input_img* is thresholded with Otsu's method; the
    resulting threshold is multiplied by *threshold_scale* and applied as a
    plain binary threshold. The single-channel mask is then replicated into
    three identical channels.

    Args:
        input_img: Image array with at least three channels; the third one is
            treated as red (assumes BGR order as returned by ``cv2.imread``).
        threshold_scale: Multiplier applied to the Otsu threshold. The
            default ``1.0`` keeps the automatically chosen threshold.

    Returns:
        A new array whose three channels all hold the binary mask.

    Raises:
        ValueError: If *input_img* is ``None`` (e.g. ``cv2.imread`` failed).
    """
    if input_img is None:
        raise ValueError("input_img is None; the image could not be loaded")

    # Only the red channel is needed: red ink is bright there and therefore
    # ends up on the background side of the threshold.
    red_c = cv2.split(input_img)[2]
    # Let Otsu's method pick the threshold automatically.
    otsu_thresh, _ = cv2.threshold(red_c, 0, 255, cv2.THRESH_OTSU)
    filter_condition = int(otsu_thresh * threshold_scale)
    _, red_thresh = cv2.threshold(red_c, filter_condition, 255, cv2.THRESH_BINARY)
    # Convert the mask back to a three-channel image.
    return np.stack((red_thresh,) * 3, axis=-1)
def ink_recognition(input_img, sign_path, meta: dict):
    """Outline the candidate signature rectangle and save the annotated image.

    Meant to decide whether signature ink exists inside a fixed rectangle.
    NOTE(review): no such decision is made here; the function only draws the
    rectangle on *input_img* (in place) and writes the result to *sign_path*.

    Args:
        input_img: Image array to draw on; it is modified in place.
        sign_path: Destination path for the annotated image.
        meta: Mapping with integer ``left``, ``right``, ``top`` and ``bottom``
            pixel coordinates of the rectangle.
    """
    x_min, x_max, y_min, y_max = (
        meta[edge] for edge in ("left", "right", "top", "bottom")
    )
    green = (0, 255, 0)
    # Green outline, 2 px thick.
    cv2.rectangle(input_img, (x_min, y_min), (x_max, y_max), green, 2)
    cv2.imwrite(sign_path, input_img)
# Keywords searched for in the OCR output.
LEGAL_REP_LABEL = '法定代表人:'
CA_SEAL_LABEL = '法定代表人CA电子印章'
KEYWORDS = ['投标函', CA_SEAL_LABEL, LEGAL_REP_LABEL]


def locate_keywords(image_info, keywords):
    """Map each keyword found in the OCR items to the box of its text item.

    Args:
        image_info: Iterable of OCR items, each a dict with a ``word`` string
            and a ``rect`` dict holding ``left``, ``top``, ``width``, ``height``.
        keywords: Keywords to look for, in priority order. An item is assigned
            to the first keyword it contains.

    Returns:
        Dict keyed by keyword with ``word``, ``left``, ``right``, ``top`` and
        ``bottom``. A later item containing the same keyword overwrites an
        earlier one.
    """
    key_pos = {}
    for info in image_info:
        word = info['word']
        rect = info['rect']
        left, top = rect['left'], rect['top']
        right = left + rect['width']
        bottom = top + rect['height']
        for keyword in keywords:
            if keyword in word:
                key_pos[keyword] = {
                    "word": word,
                    "left": left,
                    "right": right,
                    "top": top,
                    "bottom": bottom,
                }
                break
    return key_pos


def plan_signature_check(key_pos, fallback_width=100):
    """Decide from the keyword positions what to do about the signature.

    Args:
        key_pos: Result of ``locate_keywords``.
        fallback_width: Width in pixels of the region inspected to the right
            of the label when the CA seal caption was not found.

    Returns:
        ``(message, roi)``. When *message* is not ``None`` the outcome is
        already known from the OCR text and *roi* is ``None``. Otherwise
        *roi* is a dict (``left``/``right``/``top``/``bottom``) describing the
        rectangle that still has to be inspected for ink.
    """
    label = key_pos.get(LEGAL_REP_LABEL)
    seal = key_pos.get(CA_SEAL_LABEL)

    if label is not None and seal is not None:
        # The signature is expected between the label and the seal caption.
        # Extra text fused into either OCR item means it was already read.
        if not label['word'].endswith(LEGAL_REP_LABEL) or seal['word'] != CA_SEAL_LABEL:
            return "找寻到签名", None
        return None, {
            "left": label['right'],
            "right": seal['left'],
            "top": min(label['top'], seal['top']),
            "bottom": max(label['bottom'], seal['bottom']),
        }

    if seal is not None:
        # The label was merged into the seal item; whatever text remains
        # after removing both captions is taken as the signature.
        leftover = seal['word'].replace(CA_SEAL_LABEL, '').replace(LEGAL_REP_LABEL, '')
        return ("found sign" if leftover else "No FOUND"), None

    if label is not None:
        if not label['word'].endswith(LEGAL_REP_LABEL):
            return "found sign", None
        # The signature sits to the right of the label, so the region starts
        # at the label's right edge (starting at its left edge would only
        # re-detect the label text itself).
        start = label['right']
        return None, {
            "left": start,
            "right": start + fallback_width,
            "top": label['top'],
            "bottom": label['bottom'],
        }

    return "NO FOUND", None


def main():
    """Run the signature check on the bundled example image."""
    url = "http://120.48.103.13:18000/ctr_ocr"
    ocr = OcrAgent(url=url)
    test_img = "/home/stf/miner_pdf/test_img/example_1.png"
    save_path = '/home/stf/miner_pdf/test_img/example_1_roi.png'
    sign_path = '/home/stf/miner_pdf/test_img/example_1_sign.png'

    input_img = cv2.imread(test_img)
    if input_img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise SystemExit(f"cannot read image: {test_img}")
    remove_seal = remove_red_seal(input_img)
    cv2.imwrite(save_path, remove_seal)

    content = ocr.get_content(save_path)
    key_pos = locate_keywords(content["rawjson"]["ret"], KEYWORDS)

    message, roi = plan_signature_check(key_pos)
    if message is not None:
        print(message)
    else:
        ink_recognition(input_img=remove_seal, sign_path=sign_path, meta=roi)


if __name__ == "__main__":
    main()
|