Selaa lähdekoodia

Merge branch 'master' of http://192.168.1.202:8087/xzc/pdf_title_image

sprivacy 8 kuukautta sitten
vanhempi
commit
54243f853f
3 muutettua tiedostoa jossa 438 lisäystä ja 0 poistoa
  1. 283 0
      base_file.py
  2. 119 0
      bos.py
  3. 36 0
      bos_sample_conf.py

+ 283 - 0
base_file.py

@@ -0,0 +1,283 @@
+#coding:utf-8
+
+import os
+import json
+import re
+import Levenshtein
+
+
+# Scanned bid documents: table column headers.
+HEADERS = {
+    '序号', '项目编码', '项目名称', '项目特征', '单位', '工程量',
+    '全费用综合单价', '合价', '备注', '主材名称', '规格型号',
+    '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量',
+    '含税单价(元)', '含税合价(元)', '条款号', '评分因素', '评分标准', '页码',
+}
+
+
+# Heading detection: a paragraph counts as a title when it carries a
+# numbering or chapter marker (the original note assumes titles are usually a
+# single line set in a larger font).
+def is_title(line: str, list_key=None) -> bool:
+    """Return True when ``line`` looks like a section heading.
+
+    Args:
+        line: Paragraph text; surrounding whitespace is ignored.
+        list_key: Optional iterable of extra regex alternatives. They are
+            joined with ``|`` without escaping and tried together with the
+            built-in markers. Per the original note these are titles or
+            table-of-contents entries taken from the tender's response-file
+            format section.
+
+    Returns:
+        True if any marker matches, or if the text starts with one of the
+        appendix / references / attached-table keywords; otherwise False.
+    """
+    # Raw string: the original plain literal relied on invalid escape
+    # sequences such as "\d" and "\(", which newer Python versions warn about.
+    # The regex text itself is unchanged.
+    builtin_markers = r'^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|^文件[一二三四五六七八九十]+|[一二三四五六七八九十]+[、要是]'
+    text = line.strip()
+    if list_key:
+        pattern = '|'.join(list_key) + '|' + builtin_markers
+    else:
+        pattern = builtin_markers
+    # Only the existence of a match matters, so search() replaces findall().
+    if re.search(pattern, text):
+        return True
+    return re.search(r'^附录|^参考文献|^附表', text) is not None
+
+# Fallback for paragraphs without a title marker: short text that is
+# horizontally centred. (The original note says "fewer than 20 characters";
+# the code itself requires fewer than 15.)
+def is_title_v2(line: str, box=None) -> bool:
+    """Return True when a short, centred paragraph is probably a title.
+
+    Args:
+        line: Paragraph text.
+        box: Four-item sequence unpacked as ``left, <second>, width,
+            height``; only ``left`` and ``width`` are used. Presumably page
+            coordinates from the OCR layout -- TODO confirm units.
+
+    Returns:
+        False when ``box`` is missing or does not hold exactly four items,
+        when the text has fewer than two CJK characters, when it is not
+        centred in the 294-300 band, or when it looks like a figure caption
+        or page marker; True otherwise.
+    """
+    try:
+        left, _, width, _ = box
+    except (TypeError, ValueError):
+        # None / non-iterable box, or wrong number of items.
+        return False
+
+    if len(re.findall('[\u4e00-\u9fa5]', line)) < 2:
+        return False
+
+    center = left + (width / 2)
+    if not (left > 130 and 294 < center < 300 and len(line) < 15):
+        return False
+
+    # Figure captions and page markers are centred too but are not titles.
+    return not re.search('^图|图$|页$', line)
+
+
+
+# Locate the page ranges that hold business-licence, qualification,
+# performance and financial-report images.
+def search_interval(title):
+    """Return merged ``(start_page, end_page)`` ranges opened by keyword titles.
+
+    Args:
+        title: Iterable of dicts with a ``'text'`` string and a
+            ``'page_number'`` integer, in document order.
+
+    Returns:
+        A list of ``(left, right)`` tuples sorted by ``left``. A range opens
+        at a title containing one of the keywords and closes at the next
+        title whose text does not contain '证书'. Overlapping or touching
+        ranges are merged. Ranges whose end precedes their start (including
+        a range still open at the last title) are dropped.
+    """
+    # Fuzzy keyword match on the title text.
+    keywords = ('资格审查资料', '资格审查材料', '其它材料', '其他材料', '其他资料', '附件', '影印件')
+    intervals = []
+    left_pos = -1   # start page of the currently open range, -1 when none
+    right_pos = -1  # end page of the most recently closed range
+
+    for title_block in title:
+        block_text = title_block['text'].replace(' ', '').strip()
+        page_number = title_block['page_number']
+
+        # Close the open range first; titles mentioning '证书' keep it open.
+        if left_pos != -1 and '证书' not in block_text:
+            right_pos = page_number
+            intervals.append((left_pos, right_pos))
+            left_pos = -1
+
+        if not any(keyword in block_text for keyword in keywords):
+            continue
+
+        if '.' in block_text:
+            # Outline-style entry: the text after the last dot may be a page
+            # number. int() replaces eval(), which raised SyntaxError on
+            # leading zeros and would execute arbitrary document text.
+            tail = block_text.split('.')[-1]
+            if tail.isdecimal():
+                left_pos = min(page_number, int(tail))
+        else:
+            left_pos = page_number
+
+    # A range still open after the last title keeps the stale right_pos; it
+    # is filtered out below when that end precedes its start.
+    if left_pos != -1:
+        intervals.append((left_pos, right_pos))
+
+    intervals.sort()
+
+    merged = []
+    for start, end in intervals:
+        if end < start:
+            continue
+        if merged and start <= merged[-1][1]:
+            # max() keeps the wider end when a contained range follows.
+            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
+        else:
+            merged.append((start, end))
+    return merged
+
+
+def locate_business_license(title, start_threshold=0, distance_threshold=0):
+    """Return the candidate page numbers for the business-licence section.
+
+    NOTE(review): the original body read ``start_threshold``,
+    ``distance_threshold`` and ``target_list``, none of which is defined in
+    this file, so every non-empty result raised NameError. The thresholds are
+    now parameters whose defaults disable both heuristics, and the selected
+    pages are returned directly. The original intent ("return image" /
+    "target_path list") needs a page-to-image mapping that is not visible
+    here -- confirm with the author.
+
+    Args:
+        title: Iterable of dicts with a ``'text'`` string and a
+            ``'page_number'`` integer.
+        start_threshold: Title pages at or below this value are ignored when
+            matching against outline pages.
+        distance_threshold: Largest page distance at which a title page and
+            an outline page are paired; their midpoint is then added.
+
+    Returns:
+        None when no title contains a keyword, otherwise a sorted list of
+        page numbers.
+    """
+    keywords = ("资格审查资料", "其它资格审查材料", "资格审查材料")
+    candidate_pages = []  # pages on which a matching title sits
+    center_pages = []     # page numbers parsed from outline-style titles
+
+    for title_block in title:
+        block_text = title_block['text'].replace(' ', '').strip()
+        if not any(keyword in block_text for keyword in keywords):
+            continue
+        if '.' in block_text:
+            # Outline-style entry: the text after the last dot may be a page
+            # number. int() replaces eval(), which failed on leading zeros.
+            tail = block_text.split('.')[-1]
+            if tail.isdecimal():
+                center_pages.append(int(tail))
+        candidate_pages.append(title_block['page_number'])
+
+    if not center_pages and not candidate_pages:
+        return None
+
+    filter_pages = set()
+    if not center_pages:
+        filter_pages.update(candidate_pages)
+    elif not candidate_pages:
+        filter_pages.update(center_pages)
+    else:
+        # Outline pages act as anchors and are always kept.
+        filter_pages.update(center_pages)
+        # Title pages contribute the midpoint to each nearby anchor.
+        for candidate_page in candidate_pages:
+            if candidate_page <= start_threshold:
+                continue
+            for center_page in center_pages:
+                distance = abs(candidate_page - center_page)
+                if distance <= distance_threshold:
+                    filter_pages.add(min(candidate_page, center_page) + distance // 2)
+
+    return sorted(filter_pages)
+
+# textmind
+# Alternative input kept from the original:
+# lines = open('三峡左岸地坪商务标_合并_ocr.txt', 'r', encoding='utf-8').read()
+# The context manager closes the file handle, which the original left open.
+with open('data_1.json', 'r', encoding='utf-8') as _source:
+    lines = _source.read()
+json_line = json.loads(lines)
+print(json_line.keys())
+para_nodes = json_line['para_nodes']
+table_flag = 0
+# Text of the first node typed 'contents', or "" when there is none.
+contents = next(
+    (node['text'] for node in para_nodes if node['node_type'] == 'contents'),
+    "",
+)
+
+# Strip dots and digits from the contents text.
+contents = re.sub(r'[\.\d]+', '', contents)
+# Main scan over para_nodes: collects headings into title_list and the text
+# of quotation tables into table_list.
+# table_flag doubles as "index up to which nodes were already consumed"
+# (first section below) and as a 0/1 marker (second section below).
+table_flag = 0
+title_list = []
+table_list = []
+char_hight = 13  # only referenced by commented-out code below
+_index = 0       # position of the current node within its page (1-based)
+page_num = -1    # page number of the previously visited node
+for i in range(len(para_nodes)):
+    # if not para_nodes[i]['node_type'] in ["contents",'table', 'text', 'head_tail']:
+    #     print(para_nodes[i])
+    # Skip nodes before the index recorded in table_flag.
+    if i < table_flag:
+        continue
+    # Nodes without position information cannot be placed on a page.
+    if not para_nodes[i]['position']:
+        continue
+    # Restart the per-page counter whenever the page changes ...
+    if para_nodes[i]['position'][0]['pageno'] != page_num:
+        page_num = para_nodes[i]['position'][0]['pageno']
+        _index = 0
+    # ... then count this node (always true after the assignment above).
+    if para_nodes[i]['position'][0]['pageno'] == page_num:
+        # page_num = para_nodes[i]['position'][0]['pageno']
+        _index = _index + 1
+    # para_nodes[i]['position'][0]['pageno']
+
+    # Headings: nodes typed 'title' whose last box value is truthy, or one of
+    # the first two positioned nodes on a page that is short and matches
+    # is_title().
+    if para_nodes[i]['node_type'] == 'title' and para_nodes[i]['position'][0]['box'][-1]:
+        title_list.append({'text':para_nodes[i]['text'], 'page_number' : int(para_nodes[i]['position'][0]['pageno'])})
+    elif _index < 3 and is_title(para_nodes[i]['text']) and len(para_nodes[i]['text']) < 20:
+        title_list.append({'text':para_nodes[i]['text'], 'page_number' : int(para_nodes[i]['position'][0]['pageno'])})
+        # print(para_nodes[i]['text'])
+    # elif is_title_v2(para_nodes[i]['text'], para_nodes[i]['position'][0]['box'] ) and len(para_nodes[i]['text']) < 20:
+    #     print(para_nodes[i]['text'])
+    # if para_nodes[i]['node_type'] == 'seal':  # seal / stamp
+    #     print(para_nodes[i])
+    # if len(para_nodes[i]['text']) > 5 and para_nodes[i]['text'] in contents and para_nodes[i]['position'][0]['box'][-1] >= char_hight:
+    #     print(para_nodes[i]['text'])
+    # Quotation lists in quotation files / bid documents.
+    if para_nodes[i]['node_type'] != 'table' and ('报价汇总表' in para_nodes[i]['text'] or '分项报价表' in para_nodes[i]['text'] or '工程量清单报价表' in para_nodes[i]['text'] or '报价明细表' in para_nodes[i]['text'] or '报价清单' in para_nodes[i]['text'] or ('报价表' in  para_nodes[i]['text'] and para_nodes[i]['node_type']=='title')):
+        print(para_nodes[i])
+        # The keyword must end the text (anchored with $) to count as a caption.
+        flag_word = re.findall('报价汇总表$|分项报价表$|工程量清单报价表$|报价明细表$|报价清单$', para_nodes[i]['text'])
+        if not flag_word and re.findall('报价表', para_nodes[i]['text']) and para_nodes[i]['node_type']=='title':
+            # NOTE(review): flag_word becomes a str here, so flag_word[0]
+            # below yields only its first character -- confirm intent.
+            flag_word = '报价表'
+        if not flag_word:
+            continue
+        # Captions starting with the attachment keyword are ignored.
+        if re.findall('^附件', para_nodes[i]['text']):
+            continue
+
+        flag_word = flag_word[0]
+        position_page_id = para_nodes[i]['position'][0]['pageno']
+        # Find j: the first non-table node more than two pages past
+        # position_page_id (or the last node when none qualifies).
+        # NOTE(review): this loop tests 'para_type' while the rest of the
+        # script tests 'node_type' -- confirm both keys exist.
+        for j in range(i, len(para_nodes)):
+            if para_nodes[j]['para_type'] != 'table' and position_page_id + 2 < para_nodes[j]['position'][0]['pageno']:
+                break
+            # NOTE(review): indexes with i, not j, so the difference is
+            # always 0 and position_page_id never advances -- likely meant j.
+            if para_nodes[i]['position'][0]['pageno'] - position_page_id < 2:
+                # print(position_page_id)
+                position_page_id = para_nodes[i]['position'][0]['pageno']
+
+        # print(i, j)
+        # Concatenate the text of table nodes in [i, j] into `lines`.
+        lines = ""
+        for k in range(i, j+1):
+            # word_flag is refreshed on non-table nodes only; on table nodes
+            # the value from the previous non-table node is reused.
+            if para_nodes[k]['node_type'] != 'table':
+                word_flag = re.findall('报价汇总表|分项报价表|工程量清单报价表|报价明细表|安全文明措施|报价清单', para_nodes[k]['text'])
+            # print(word_flag, flag_word)
+            # Record progress so the outer loop skips nodes before k.
+            table_flag = k
+            # A different caption keyword ends the current table group.
+            if word_flag and word_flag[0] != flag_word:
+                break
+            if para_nodes[k]['para_type'] != 'table':
+                # print(para_nodes[k]['text'])
+                continue
+            _lines = para_nodes[k]['text'].split('\n')
+            # When the first row nearly equals the first row collected so far
+            # (ratio > 0.96) it is treated as a repeated header and dropped.
+            # NOTE(review): no '\n' separator is inserted between the chunks.
+            if lines and Levenshtein.ratio(lines.split('\n')[0], _lines[0]) > 0.96:
+                lines = lines + '\n'.join(_lines[1:])
+            else:
+                lines = lines + '\n'.join(_lines[:])
+            # print(_lines)
+        # print(para_nodes[i]['text'], 'xxxxxxxxxx', lines)
+        if not lines:
+            continue
+        table_list.append((para_nodes[i]['text'], lines))
+
+
+    # Bill of quantities inside the technical specification.
+    # NOTE(review): the parentheses in the last alternative form a regex
+    # group, so literal bracket characters in the text are not matched.
+    if para_nodes[i]['node_type'] != 'table' and re.findall('工程量清单|材料清单|工作量清单|报价明细表|主要配置(含备品备件、专用工器具)', para_nodes[i]['text']):
+        position_page_id = para_nodes[i]['position'][0]['pageno']
+        # NOTE(review): this resets the skip index used by `i < table_flag`
+        # at the top of the outer loop.
+        table_flag = 0
+        # Find j: the first node after i whose para_type is not 'table'.
+        for j in range(i, len(para_nodes)):
+            if para_nodes[j]['para_type'] != 'table' and table_flag == 1:
+                break
+            # NOTE(review): indexes with i, not j; always true here.
+            if para_nodes[i]['position'][0]['pageno'] - position_page_id < 2:
+                # print(position_page_id)
+                position_page_id = para_nodes[i]['position'][0]['pageno']
+                table_flag = 1
+        # print(i, j)
+        # NOTE(review): `lines` is built here but never stored or printed.
+        lines = ""
+        for k in range(i, j+1):
+            if para_nodes[k]['para_type'] != 'table':
+                # print(para_nodes[k]['text'])
+                continue
+            lines = lines + para_nodes[k]['text']
+        # print(para_nodes[i]['text'], 'xxxxxxxxxx', lines)
+print(table_list)
+# Table captions or headings preceding a table: bill of quantities, material
+# list, workload list, quotation detail table, main configuration (including
+# spare parts and special tools).
+# Table headers: cost, unit price, price, tax-inclusive price, unit price,
+# total price, estimated quantity, unit.
+
+# file_content = json_line['para_nodes']
+# for y in range(len(file_content[10:20])):
+#     print(file_content[y])
+
+# print(title_list)
+# print(contents)
+
+
+# print(search_interval(title_list))
+
+
+# print(table_list)

+ 119 - 0
bos.py

@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+import os
+import sys
+import json
+import numpy as np
+
+from baidubce.services.bos import bos_handler
+from baidubce.services.bos import storage_class
+from baidubce.services.bos import canned_acl
+from baidubce.bce_client_configuration import BceClientConfiguration
+from baidubce.auth.bce_credentials import BceCredentials
+import bos_sample_conf 
+
+#导入BOS相关模块
+from baidubce import exception
+from baidubce.services import bos
+from baidubce.services.bos import canned_acl
+from baidubce.services.bos.bos_client import BosClient
+
+
+
+# Set the request timeout
+#bos_sample_conf.config.connection_timeout_in_mills = TIMEOUT
+
+# Set the receive buffer size
+#bos_sample_conf.config.recv_buf_size(BUF_SIZE)
+
+# Set the send buffer size
+#bos_sample_conf.config.send_buf_size(BUF_SIZE)
+
+# Set the connection retry policy
+# Three retries with exponential back-off
+#bos_sample_conf.config.retry_policy = BackOffRetryPolicy()
+# No retry
+#bos_sample_conf.config.retry_policy = NoRetryPolicy()
+
+# Module-level client shared by every helper below; it is built at import
+# time from the configuration object in bos_sample_conf.
+bos_client = BosClient(bos_sample_conf.config)
+"""
+response = bos_client.list_buckets()
+for bucket in response.buckets:
+     print (bucket.name)
+"""
+# List object entries (image keys) through the ListObjects API.
+def get_objects(prefix, max_keys=10, bucket_name='ocrtrips'):
+    """Return the ``contents`` of a ListObjects call filtered by ``prefix``.
+
+    Args:
+        prefix: Key prefix passed to ``list_objects``.
+        max_keys: Maximum number of entries requested.
+        bucket_name: Bucket to list; defaults to the bucket that was
+            previously hard-coded.
+    """
+    listing = bos_client.list_objects(bucket_name, max_keys=max_keys, prefix=prefix)
+    return listing.contents
+
+# Upload
+def put_bos(object_key, file_name, bucket_name='ctrimgs'):
+    """Upload a local file and return the URL of the stored object.
+
+    Args:
+        object_key: Key to store the object under.
+        file_name: Path of the local file to upload.
+        bucket_name: Destination bucket.
+
+    Returns:
+        ``https://<bucket_name>.bj.bcebos.com/<object_key>``. The original
+        always returned the 'ctrimgs' host, even for another bucket.
+    """
+    bos_client.put_object_from_file(bucket_name, object_key, file_name)
+    # NOTE(review): the 'bj' region is hard-coded and matches bos_host in
+    # bos_sample_conf; keep the two in sync.
+    return 'https://' + bucket_name + '.bj.bcebos.com/' + object_key
+# Delete
+def delete_bos(object_key, bucket_name='ctrimgs'):
+    """Delete ``object_key`` from ``bucket_name`` and return an empty string."""
+    empty_result = ''
+    bos_client.delete_object(bucket_name, object_key)
+    return empty_result
+
+    
+# Download
+def get_bos(bucket_name, object_key, file_name):
+    """Download ``object_key`` from ``bucket_name`` into local ``file_name``."""
+    bos_client.get_object_to_file(bucket_name, object_key, file_name)
+# BOS query
+def get_object_lists(buckent_name, prefix, max_keys=10):
+    """Return the ``contents`` of a ListObjects call on ``buckent_name``.
+
+    The misspelt parameter name is kept because callers may pass it by
+    keyword.
+    """
+    listing = bos_client.list_objects(buckent_name, max_keys=max_keys, prefix=prefix)
+    return listing.contents
+# Multipart upload (the original note says: for files larger than 5G).
+def get_multipart(bucket_name, object_key, file_name):
+    """Upload ``file_name`` to ``bucket_name``/``object_key`` in 5 MB parts.
+
+    The file size is read before the upload is initiated, so a missing file
+    no longer leaves an empty multipart upload behind. If any part or the
+    final completion call fails, the upload is aborted and the original
+    exception is re-raised.
+
+    Args:
+        bucket_name: Destination bucket.
+        object_key: Key to store the object under.
+        file_name: Path of the local file to upload.
+    """
+    # Size of every part except possibly the last one.
+    max_part_size = 5 * 1024 * 1024
+
+    left_size = os.path.getsize(file_name)
+    upload_id = bos_client.initiate_multipart_upload(bucket_name, object_key).upload_id
+
+    offset = 0       # byte offset of the next part within the file
+    part_number = 1  # part numbers start at 1
+    part_list = []
+
+    try:
+        while left_size > 0:
+            part_size = min(max_part_size, left_size)
+            response = bos_client.upload_part_from_file(
+                bucket_name, object_key, upload_id, part_number, part_size, file_name, offset)
+            # The ETag of each part is needed to complete the upload.
+            part_list.append({
+                "partNumber": part_number,
+                "eTag": response.metadata.etag
+            })
+            left_size -= part_size
+            offset += part_size
+            part_number += 1
+
+        bos_client.complete_multipart_upload(bucket_name, object_key, upload_id, part_list)
+    except Exception:
+        # Abort so the parts already uploaded are not left orphaned.
+        bos_client.abort_multipart_upload(bucket_name, object_key, upload_id)
+        raise
+# Disabled sample calls kept as a bare string literal; nothing inside it is
+# executed. The print statements in it use Python 2 syntax.
+'''
+print bos_client.get_bucket_location(bucket_name)
+print bos_client.does_bucket_exist(bucket_name)
+#put_bos(bucket_name, 'data.jpg', '')
+
+
+response = bos_client.get_object_meta_data(bucket_name, 'data.jpg')
+print(response)
+url = bos_client.generate_pre_signed_url(bucket_name,
+                                         'data.jpg',
+                                         -1)
+print(url)
+
+
+#get_bos(bucket_name, 'hadoop-client.tar', './')
+'''
+
+
+# Example upload call, left disabled:
+#print(put_bos('20210616/16-471802237.jpg', 'image/20210616/16-471802237.jpg'))

+ 36 - 0
bos_sample_conf.py

@@ -0,0 +1,36 @@
+#coding=utf-8
+
+#导入Python标准日志模块
+import logging
+
+#从Python SDK导入BOS配置管理模块以及安全认证模块
+from baidubce.bce_client_configuration import BceClientConfiguration
+from baidubce.auth.bce_credentials import BceCredentials
+
+import os
+
+# BosClient endpoint host, Access Key ID and Secret Access Key.
+bos_host = "bj.bcebos.com"
+
+# SECURITY NOTE(review): these credentials are committed to version control
+# and should be rotated and removed from the repository. Until then the
+# environment variables below take precedence, and the literals are only
+# fallbacks that keep existing deployments working. Two earlier key pairs
+# that were assigned and immediately overwritten have been dropped; they
+# never took effect.
+access_key_id = os.environ.get("BOS_ACCESS_KEY_ID", "ALTAKEq9L0oxxxDi5jUc3e12gu")
+secret_access_key = os.environ.get("BOS_SECRET_ACCESS_KEY", "9336a04f88e845e284bab26bd5fd8182")
+
+"""
+#设置日志文件的句柄和日志级别
+logger = logging.getLogger('baidubce.http.bce_http_client')
+fh = logging.FileHandler("sample.log")
+fh.setLevel(logging.DEBUG)
+
+#设置日志文件输出的顺序、结构和内容
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+fh.setFormatter(formatter)
+logger.setLevel(logging.DEBUG)
+logger.addHandler(fh)
+"""
+# Create the BceClientConfiguration from the credentials and endpoint host
+# defined above; bos.py passes this object to BosClient.
+config = BceClientConfiguration(credentials=BceCredentials(access_key_id, secret_access_key), endpoint = bos_host)
+
+