1 vuosi sitten · 54243f853f
--- a/base_file.py
+++ b/base_file.py
@@ -0,0 +1,283 @@
 
				+#coding:utf-8
			
 
				+
			
 
				+import os
			
 
				+import json
			
 
				+import re
			
 
				+import Levenshtein
			
 
				+
			
 
				+
			
 
				+# 扫描件-投标文件
			
 
				+HEADERS = set({'序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价', '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价（元）', '含税合价（元）', '条款号', '评分因素', '评分标准', '页码'})
			
 
				+
			
 
				+
			
 
				+# 假设标题通常是一行且字体较大
			
 
				+#获取标题段落
			
 
				+#line  段落内容
			
 
				+#list_key  招标文件中响应文件格式（标题或目录）
			
 
				+def is_title(line: str, list_key=[]) -> bool:
			
 
				+    if not list_key:        
			
 
				+        title_word = re.findall('^[（\(][一二三四五六七八九十]+[\)）]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|^文件[一二三四五六七八九十]+|[一二三四五六七八九十]+[、要是]', line.strip())
			
 
				+    else:
			
 
				+        title_word = re.findall('|'.join(list_key) + '|^[（\(][一二三四五六七八九十]+[\)）]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|^文件[一二三四五六七八九十]+|[一二三四五六七八九十]+[、要是]', line.strip())
			
 
				+    if title_word:
			
 
				+        return True
			
 
				+    title_word = re.findall('^附录|^参考文献|^附表', line.strip())
			
 
				+    if title_word:
			
 
				+        return True
			
 
				+    return False
			
 
				+
			
 
				+#不存在标题特征的段落，但是段落内容文本居中了且字符内容少于20个字符
			
 
				+def is_title_v2(line: str, box=[]) -> bool:
			
 
				+    try:
			
 
				+        left, right, width, height = box
			
 
				+    except:
			
 
				+        return False
			
 
				+    # if len(line) < 15 and height > 15:
			
 
				+    #     return True
			
 
				+    
			
 
				+    # if left > 135 and len(line) < 15:
			
 
				+    #     return True
			
 
				+    if len(re.findall('[\u4e00-\u9fa5]', line)) < 2:
			
 
				+        return False
			
 
				+    if left > 130 and left+(width/2) > 294 and left+(width/2) < 300 and len(line) < 15:
			
 
				+        if re.findall('^图|图$|页$', line):
			
 
				+            return False
			
 
				+        return True
			
 
				+
			
 
				+    return False
			
 
				+
			
 
				+
			
 
				+
			
 
				+# 定位营业执照、资质、业绩、财报图像的区间范围
			
 
				+def search_interval(title):
			
 
				+    # 通过关键字模糊定位
			
 
				+    keywords = ['资格审查资料','资格审查材料','其它材料','其他材料','其他资料','附件', '影印件']
			
 
				+    search_interval = []
			
 
				+    # locate in title.json
			
 
				+    left_pos = -1 # 左指针
			
 
				+    right_pos = -1 # 右指针
			
 
				+    for title_block in title:
			
 
				+        # print(title_block)
			
 
				+        block_text = title_block['text'].replace(' ', '').strip()
			
 
				+        
			
 
				+        # 先进行左区间判定
			
 
				+        if left_pos != -1 and '证书' not in block_text:
			
 
				+            right_pos = title_block['page_number']
			
 
				+            search_interval.append((left_pos, right_pos))
			
 
				+            # 重置
			
 
				+            left_pos = -1
			
 
				+
			
 
				+        for keyword in keywords:
			
 
				+            if keyword in block_text:
			
 
				+                # print(title_block)
			
 
				+                # 先进行模糊的outline定位
			
 
				+                center_page = None
			
 
				+                if '.' in block_text:
			
 
				+                    center_page = block_text.split('.')[-1]
			
 
				+                    if center_page.isdigit():
			
 
				+                        center_page = eval(center_page)
			
 
				+                        left_pos = min(title_block['page_number'], center_page)
			
 
				+                else:
			
 
				+                    left_pos = title_block['page_number']
			
 
				+
			
 
				+            
			
 
				+    # 最终判定
			
 
				+    if left_pos != -1:
			
 
				+        search_interval.append((left_pos, right_pos))
			
 
				+
			
 
				+    # 搜寻区间合并
			
 
				+    search_interval.sort()
			
 
				+
			
 
				+    merge_interval = []
			
 
				+    if len(search_interval) > 0:
			
 
				+        left = -1
			
 
				+        right = -1
			
 
				+        for interval in search_interval:
			
 
				+            l, r = interval
			
 
				+            if r < l:
			
 
				+                continue
			
 
				+            if left == -1 and right == -1:
			
 
				+                left = l
			
 
				+                right = r
			
 
				+
			
 
				+            elif l <= right:
			
 
				+                right = r
			
 
				+
			
 
				+            else:
			
 
				+                merge_interval.append((left, right))
			
 
				+                left = l
			
 
				+                right = r
			
 
				+        merge_interval.append((left, right))
			
 
				+
			
 
				+    return merge_interval
			
 
				+
			
 
				+
			
 
				+def locate_business_license(title):
			
 
				+    '''locate business license and return image'''
			
 
				+    keywords = ["资格审查资料", "其它资格审查材料", "资格审查材料"]
			
 
				+    candidate_pages = []
			
 
				+    center_pages = []
			
 
				+    candidate_images = set()
			
 
				+    # locate in title.json
			
 
				+    for title_block in title:
			
 
				+        block_text = title_block['text'].replace(' ', '').strip()
			
 
				+        for keyword in keywords:
			
 
				+            if keyword in block_text:
			
 
				+                # 先进行模糊的outline定位
			
 
				+                center_page = None
			
 
				+                if '.' in block_text:
			
 
				+                    center_page = block_text.split('.')[-1]
			
 
				+                    if center_page.isdigit():
			
 
				+                        center_page = eval(center_page)
			
 
				+                        center_pages.append(center_page)
			
 
				+                candidate_pages.append(title_block['page_number'])
			
 
				+    # information match
			
 
				+    filter_pages = set()
			
 
				+    if len(center_pages) == 0 and len(candidate_pages) == 0:
			
 
				+        return None
			
 
				+    elif len(center_pages) == 0:
			
 
				+        filter_pages.update(candidate_pages)
			
 
				+    elif len(candidate_pages) == 0:
			
 
				+        filter_pages.update(center_pages)
			
 
				+    else:
			
 
				+        # center_pages作为锚点，全部加入
			
 
				+        filter_pages.update(center_pages)
			
 
				+        # candidate_page与center_page进行匹配加入
			
 
				+        for candidate_page in candidate_pages:
			
 
				+            if candidate_page <= start_threshold:
			
 
				+                continue
			
 
				+            for center_page in center_pages:
			
 
				+                distance = abs(candidate_page - center_page)
			
 
				+                if distance <= distance_threshold:
			
 
				+                    filter_pages.add(min(candidate_page, center_page) + distance // 2)
			
 
				+    
			
 
				+    # return target_path list
			
 
				+    return target_list
			
 
				+
			
 
				+#textmind
			
 
				+# lines = open('三峡左岸地坪商务标_合并_ocr.txt', 'r', encoding='utf-8').read()
			
 
				+lines = open('data_1.json', 'r', encoding='utf-8').read()
			
 
				+json_line = json.loads(lines)
			
 
				+print(json_line.keys())
			
 
				+para_nodes = json_line['para_nodes']
			
 
				+table_flag = 0
			
 
				+contents = ""
			
 
				+for i in range(len(para_nodes)):
			
 
				+    # '评审因素'
			
 
				+    # ''
			
 
				+    if para_nodes[i]['node_type'] == 'contents':
			
 
				+        contents = para_nodes[i]['text']
			
 
				+        break
			
 
				+
			
 
				+contents = re.sub('[\.\d]+', '', contents)
			
 
				+table_flag = 0
			
 
				+title_list = []
			
 
				+table_list = []
			
 
				+char_hight = 13
			
 
				+_index = 0
			
 
				+page_num = -1
			
 
				+for i in range(len(para_nodes)):    
			
 
				+    # if not para_nodes[i]['node_type'] in ["contents",'table', 'text', 'head_tail']:
			
 
				+    #     print(para_nodes[i])
			
 
				+    if i < table_flag:
			
 
				+        continue
			
 
				+    if not para_nodes[i]['position']:
			
 
				+        continue
			
 
				+    if para_nodes[i]['position'][0]['pageno'] != page_num:
			
 
				+        page_num = para_nodes[i]['position'][0]['pageno']
			
 
				+        _index = 0
			
 
				+    if para_nodes[i]['position'][0]['pageno'] == page_num:
			
 
				+        # page_num = para_nodes[i]['position'][0]['pageno']
			
 
				+        _index = _index + 1
			
 
				+    # para_nodes[i]['position'][0]['pageno']
			
 
				+    
			
 
				+    if para_nodes[i]['node_type'] == 'title' and para_nodes[i]['position'][0]['box'][-1]:
			
 
				+        title_list.append({'text':para_nodes[i]['text'], 'page_number' : int(para_nodes[i]['position'][0]['pageno'])})
			
 
				+    elif _index < 3 and is_title(para_nodes[i]['text']) and len(para_nodes[i]['text']) < 20:
			
 
				+        title_list.append({'text':para_nodes[i]['text'], 'page_number' : int(para_nodes[i]['position'][0]['pageno'])})
			
 
				+        # print(para_nodes[i]['text'])
			
 
				+    # elif is_title_v2(para_nodes[i]['text'], para_nodes[i]['position'][0]['box'] ) and len(para_nodes[i]['text']) < 20:
			
 
				+    #     print(para_nodes[i]['text'])
			
 
				+    # if para_nodes[i]['node_type'] == 'seal':  #印章
			
 
				+    #     print(para_nodes[i])
			
 
				+    # if len(para_nodes[i]['text']) > 5 and para_nodes[i]['text'] in contents and para_nodes[i]['position'][0]['box'][-1] >= char_hight:
			
 
				+    #     print(para_nodes[i]['text'])
			
 
				+    #报价文件、投标文件中报价清单
			
 
				+    if para_nodes[i]['node_type'] != 'table' and ('报价汇总表' in para_nodes[i]['text'] or '分项报价表' in para_nodes[i]['text'] or '工程量清单报价表' in para_nodes[i]['text'] or '报价明细表' in para_nodes[i]['text'] or '报价清单' in para_nodes[i]['text'] or ('报价表' in  para_nodes[i]['text'] and para_nodes[i]['node_type']=='title')):
			
 
				+        print(para_nodes[i])
			
 
				+        flag_word = re.findall('报价汇总表$|分项报价表$|工程量清单报价表$|报价明细表$|报价清单$', para_nodes[i]['text'])
			
 
				+        if not flag_word and re.findall('报价表', para_nodes[i]['text']) and para_nodes[i]['node_type']=='title':
			
 
				+            flag_word = '报价表'
			
 
				+        if not flag_word:
			
 
				+            continue
			
 
				+        if re.findall('^附件', para_nodes[i]['text']):
			
 
				+            continue
			
 
				+
			
 
				+        flag_word = flag_word[0]
			
 
				+        position_page_id = para_nodes[i]['position'][0]['pageno']
			
 
				+        for j in range(i, len(para_nodes)):
			
 
				+            if para_nodes[j]['para_type'] != 'table' and position_page_id + 2 < para_nodes[j]['position'][0]['pageno']:
			
 
				+                break
			
 
				+            if para_nodes[i]['position'][0]['pageno'] - position_page_id < 2:
			
 
				+                # print(position_page_id)
			
 
				+                position_page_id = para_nodes[i]['position'][0]['pageno']
			
 
				+        
			
 
				+        # print(i, j)
			
 
				+        lines = ""
			
 
				+        for k in range(i, j+1):
			
 
				+            if para_nodes[k]['node_type'] != 'table':
			
 
				+                word_flag = re.findall('报价汇总表|分项报价表|工程量清单报价表|报价明细表|安全文明措施|报价清单', para_nodes[k]['text'])
			
 
				+            # print(word_flag, flag_word)
			
 
				+            table_flag = k
			
 
				+            if word_flag and word_flag[0] != flag_word:
			
 
				+                break
			
 
				+            if para_nodes[k]['para_type'] != 'table':
			
 
				+                # print(para_nodes[k]['text'])
			
 
				+                continue
			
 
				+            _lines = para_nodes[k]['text'].split('\n')
			
 
				+            if lines and Levenshtein.ratio(lines.split('\n')[0], _lines[0]) > 0.96:
			
 
				+                lines = lines + '\n'.join(_lines[1:])
			
 
				+            else:
			
 
				+                lines = lines + '\n'.join(_lines[:])
			
 
				+            # print(_lines)
			
 
				+        # print(para_nodes[i]['text'], 'xxxxxxxxxx', lines)
			
 
				+        if not lines:
			
 
				+            continue
			
 
				+        table_list.append((para_nodes[i]['text'], lines))
			
 
				+
			
 
				+    
			
 
				+    #技术规范中工程量清单
			
 
				+    if para_nodes[i]['node_type'] != 'table' and re.findall('工程量清单|材料清单|工作量清单|报价明细表|主要配置（含备品备件、专用工器具）', para_nodes[i]['text']):
			
 
				+        position_page_id = para_nodes[i]['position'][0]['pageno']
			
 
				+        table_flag = 0
			
 
				+        for j in range(i, len(para_nodes)):
			
 
				+            if para_nodes[j]['para_type'] != 'table' and table_flag == 1:
			
 
				+                break
			
 
				+            if para_nodes[i]['position'][0]['pageno'] - position_page_id < 2:
			
 
				+                # print(position_page_id)
			
 
				+                position_page_id = para_nodes[i]['position'][0]['pageno']
			
 
				+                table_flag = 1
			
 
				+        # print(i, j)
			
 
				+        lines = ""
			
 
				+        for k in range(i, j+1):
			
 
				+            if para_nodes[k]['para_type'] != 'table':
			
 
				+                # print(para_nodes[k]['text'])
			
 
				+                continue
			
 
				+            lines = lines + para_nodes[k]['text']
			
 
				+        # print(para_nodes[i]['text'], 'xxxxxxxxxx', lines)
			
 
				+print(table_list)
			
 
				+# 表标题或者表格前标题：工程量清单、材料清单、工作量清单、报价明细表、主要配置（含备品备件、专用工器具）
			
 
				+# 表头：费用、单价、价格、含税价、单价、合价、估算工程量、单位
			
 
				+
			
 
				+# file_content = json_line['para_nodes']
			
 
				+# for y in range(len(file_content[10:20])):
			
 
				+#     print(file_content[y])
			
 
				+
			
 
				+# print(title_list)
			
 
				+# print(contents)
			
 
				+
			
 
				+
			
 
				+# print(search_interval(title_list))
			
 
				+
			
 
				+
			
 
				+# print(table_list)
			
--- a/bos.py
+++ b/bos.py
@@ -0,0 +1,119 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+import os
			
 
				+import sys
			
 
				+import json
			
 
				+import numpy as np
			
 
				+
			
 
				+from baidubce.services.bos import bos_handler
			
 
				+from baidubce.services.bos import storage_class
			
 
				+from baidubce.services.bos import canned_acl
			
 
				+from baidubce.bce_client_configuration import BceClientConfiguration
			
 
				+from baidubce.auth.bce_credentials import BceCredentials
			
 
				+import bos_sample_conf 
			
 
				+
			
 
				+#导入BOS相关模块
			
 
				+from baidubce import exception
			
 
				+from baidubce.services import bos
			
 
				+from baidubce.services.bos import canned_acl
			
 
				+from baidubce.services.bos.bos_client import BosClient
			
 
				+
			
 
				+
			
 
				+
			
 
				+#设置请求超时时间
			
 
				+#bos_sample_conf.config.connection_timeout_in_mills = TIMEOUT
			
 
				+
			
 
				+#设置接收缓冲区大小
			
 
				+#bos_sample_conf.config.recv_buf_size(BUF_SIZE)
			
 
				+
			
 
				+#设置发送缓冲区大小
			
 
				+#bos_sample_conf.config.send_buf_size(BUF_SIZE)
			
 
				+
			
 
				+#设置连接重试策略
			
 
				+#三次指数退避重试
			
 
				+#bos_sample_conf.config.retry_policy = BackOffRetryPolicy()
			
 
				+#不重试
			
 
				+#bos_sample_conf.config.retry_policy = NoRetryPolicy()
			
 
				+
			
 
# Module-level BOS client built from the configuration object in
# bos_sample_conf; the helper functions in this module all go through it.
bos_client = BosClient(bos_sample_conf.config)
			
 
				+"""
			
 
				+response = bos_client.list_buckets()
			
 
				+for bucket in response.buckets:
			
 
				+     print (bucket.name)
			
 
				+"""
			
 
				+#根据ListObjects接口来获取图片的key，prefix为前缀
			
 
				+def get_objects(prefix, max_keys=10):
			
 
				+     objects = bos_client.list_objects('ocrtrips', max_keys=max_keys, prefix=prefix)
			
 
				+     return objects.contents
			
 
				+
			
 
				+#上传
			
 
				+def put_bos(object_key, file_name, bucket_name='ctrimgs'):
			
 
				+     bos_client.put_object_from_file(bucket_name, object_key, file_name)
			
 
				+     return 'https://ctrimgs.bj.bcebos.com/' + object_key
			
 
				+     #return bos_client.put_object_from_file(bucket_name, object_key, file_name)
			
 
				+#删除
			
 
				+def delete_bos(object_key, bucket_name='ctrimgs'):
			
 
				+    bos_client.delete_object(bucket_name, object_key)
			
 
				+    return ''
			
 
				+
			
 
				+    
			
 
				+#下载
			
 
				+def get_bos(bucket_name, object_key, file_name):
			
 
				+     bos_client.get_object_to_file(bucket_name,
			
 
				+                                  object_key,
			
 
				+                                  file_name)
			
 
				+#bos查询
			
 
				+def get_object_lists(buckent_name, prefix, max_keys=10):
			
 
				+    objects = bos_client.list_objects(buckent_name, max_keys=max_keys, prefix=prefix)
			
 
				+    return objects.contents
			
 
				+#分块上传 文件大于5G
			
 
				+def get_multipart(bucket_name, object_key, file_name):
			
 
				+
			
 
				+    upload_id = bos_client.initiate_multipart_upload(bucket_name, object_key).upload_id
			
 
				+
			
 
				+    left_size = os.path.getsize(file_name)
			
 
				+    #设置分块的开始偏移位置
			
 
				+    offset = 0
			
 
				+
			
 
				+    part_number = 1
			
 
				+    part_list = []
			
 
				+
			
 
				+    while left_size > 0:
			
 
				+        #设置每块为5MB
			
 
				+        part_size = 5 * 1024 * 1024
			
 
				+        if left_size < part_size:
			
 
				+            part_size = left_size
			
 
				+
			
 
				+        response = bos_client.upload_part_from_file(
			
 
				+            bucket_name, object_key, upload_id, part_number, part_size, file_name, offset)
			
 
				+
			
 
				+
			
 
				+        left_size -= part_size
			
 
				+        offset += part_size
			
 
				+        part_list.append({
			
 
				+            "partNumber": part_number,
			
 
				+            "eTag": response.metadata.etag
			
 
				+        })
			
 
				+
			
 
				+
			
 
				+        part_number += 1
			
 
				+
			
 
				+    bos_client.complete_multipart_upload(bucket_name, object_key, upload_id, part_list)
			
 
				+'''
			
 
				+print bos_client.get_bucket_location(bucket_name)
			
 
				+print bos_client.does_bucket_exist(bucket_name)
			
 
				+#put_bos(bucket_name, 'data.jpg', '')
			
 
				+
			
 
				+
			
 
				+response = bos_client.get_object_meta_data(bucket_name, 'data.jpg')
			
 
				+print(response)
			
 
				+url = bos_client.generate_pre_signed_url(bucket_name,
			
 
				+                                         'data.jpg',
			
 
				+                                         -1)
			
 
				+print(url)
			
 
				+
			
 
				+
			
 
				+#get_bos(bucket_name, 'hadoop-client.tar', './')
			
 
				+'''
			
 
				+
			
 
				+
			
 
				+#print(put_bos('20210616/16-471802237.jpg', 'image/20210616/16-471802237.jpg'))
			
--- a/bos_sample_conf.py
+++ b/bos_sample_conf.py
@@ -0,0 +1,36 @@
 
				+#coding=utf-8
			
 
				+
			
 
				+#导入Python标准日志模块
			
 
				+import logging
			
 
				+
			
 
				+#从Python SDK导入BOS配置管理模块以及安全认证模块
			
 
				+from baidubce.bce_client_configuration import BceClientConfiguration
			
 
				+from baidubce.auth.bce_credentials import BceCredentials
			
 
				+
			
 
				+#设置BosClient的Host，Access Key ID和Secret Access Key
			
 
				+bos_host = "bj.bcebos.com"
			
 
				+access_key_id = "87815919190940dd9ff8a7790281e1e9"
			
 
				+secret_access_key = "8ac48b64cfe94fd4b4be72a26a48270d"
			
 
				+
			
 
				+access_key_id = "87815919190940dd9ff8a7790281e1e9"
			
 
				+secret_access_key = "8ac48b64cfe94fd4b4be72a26a48270d"
			
 
				+
			
 
				+access_key_id = "ALTAKEq9L0oxxxDi5jUc3e12gu"
			
 
				+secret_access_key = "9336a04f88e845e284bab26bd5fd8182"
			
 
				+
			
 
				+"""
			
 
				+#设置日志文件的句柄和日志级别
			
 
				+logger = logging.getLogger('baidubce.http.bce_http_client')
			
 
				+fh = logging.FileHandler("sample.log")
			
 
				+fh.setLevel(logging.DEBUG)
			
 
				+
			
 
				+#设置日志文件输出的顺序、结构和内容
			
 
				+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
			
 
				+fh.setFormatter(formatter)
			
 
				+logger.setLevel(logging.DEBUG)
			
 
				+logger.addHandler(fh)
			
 
				+"""
			
 
				+#创建BceClientConfiguration
			
 
				+config = BceClientConfiguration(credentials=BceCredentials(access_key_id, secret_access_key), endpoint = bos_host)
			
 
				+
			
 
				+