# xzc / pdf_title_image
# coding: utf-8

import os
import json
import re
import Levenshtein


# Scanned copies / bid documents (translated from the original note).
# Presumably the header-cell texts expected in their tables; the constant is
# not referenced anywhere else in the visible code.
HEADERS = {
    '序号',
    '项目编码',
    '项目名称',
    '项目特征',
    '单位',
    '工程量',
    '全费用综合单价',
    '合价',
    '备注',
    '主材名称',
    '规格型号',
    '不低于下列同档次品牌',
    '投标选用品牌及规格型号',
    '名称',
    '事项',
    '数量',
    '含税单价（元）',
    '含税合价（元）',
    '条款号',
    '评分因素',
    '评分标准',
    '页码',
}


# Working assumption from the original author: a heading usually occupies a
# single line and is set in a larger font. This function only looks at the text.

# Built-in heading markers, tried after any caller-supplied patterns:
#   （一） / (一)       parenthesised Chinese numeral at the start
#   1.  10.-29.         one digit, or 1x / 2x, followed by a dot at the start
#   第…章 / 节 / 条      "第" + Chinese numerals or digits + 章/节/条 at the start
#   文件一              "文件" + Chinese numerals at the start
#   一、 一要 一是       Chinese numerals followed by 、/要/是 ANYWHERE in the line
_TITLE_MARKER_PATTERN = (
    r'^[（\(][一二三四五六七八九十]+[\)）]|^\d\.|^1\d\.|^2\d\.'
    r'|^[第][一二三四五六七八九十\d]+[章节条]|^文件[一二三四五六七八九十]+'
    r'|[一二三四五六七八九十]+[、要是]'
)
# Back-matter headings: appendix, references, attached table.
_TITLE_APPENDIX_PATTERN = r'^附录|^参考文献|^附表'


def is_title(line: str, list_key=None) -> bool:
    """Return True when *line* carries one of the known heading markers.

    Args:
        line: Paragraph text; leading/trailing whitespace is ignored.
        list_key: Optional extra patterns (per the original note: the
            response-file headings or table-of-contents entries of the
            tender document). Each entry is inserted into the regular
            expression as-is, i.e. it is treated as a regex fragment and is
            NOT escaped.

    Returns:
        True if any extra pattern, any built-in marker, or an appendix-style
        prefix matches; False otherwise.
    """
    text = line.strip()
    # Caller patterns come first in the alternation, exactly as before.
    alternatives = list(list_key) if list_key else []
    alternatives.append(_TITLE_MARKER_PATTERN)
    if re.search('|'.join(alternatives), text):
        return True
    return re.search(_TITLE_APPENDIX_PATTERN, text) is not None

# Fallback for paragraphs without textual heading markers: a short line whose
# box is horizontally centred on the page is treated as a heading.
def is_title_v2(line: str, box=None) -> bool:
    """Return True when a short, centred line looks like a heading.

    Args:
        line: Paragraph text.
        box: Sequence of exactly four numbers. The first is used as the left
            edge and the third as the width; the second and fourth are
            unpacked but not used. (Presumably the layout is
            left/top/width/height - confirm against the producer.)

    Returns:
        False when *box* cannot be unpacked into four values, when *line*
        has fewer than two characters in the range U+4E00-U+9FA5, or when
        the line starts with "图" or ends with "图"/"页". True when
        ``left > 130``, the horizontal centre ``left + width / 2`` lies
        strictly between 294 and 300, and ``len(line) < 15``. False
        otherwise.
    """
    try:
        left, _unused_second, width, _unused_height = box
    except (TypeError, ValueError):
        # None / non-iterable box, or a box that does not hold exactly four
        # values: there is no geometry to judge by.
        return False

    # Require at least two CJK characters so page numbers, symbols and other
    # stray fragments are not promoted to headings.
    if len(re.findall('[\u4e00-\u9fa5]', line)) < 2:
        return False

    # The 130 / 294 / 300 limits are hard-coded page coordinates; they
    # presumably fit one specific page width - TODO confirm.
    if left > 130 and 294 < left + (width / 2) < 300 and len(line) < 15:
        # Lines starting with "图" or ending with "图"/"页" are rejected
        # (presumably figure captions and page markers).
        return not re.search('^图|图$|页$', line)

    return False


# Locate the page ranges that should contain the business-licence,
# qualification, track-record and financial-report images.
def search_interval(title):
    """Return merged page intervals that follow keyword headings.

    The headings are scanned in order. A heading containing one of the
    keywords opens an interval; the next heading whose text does not contain
    "证书" closes it at that heading's page.

    Args:
        title: Iterable of dicts with a ``'text'`` string and a
            ``'page_number'`` value, in document order.

    Returns:
        A list of ``(left, right)`` page tuples sorted by ``left``, with
        overlapping intervals merged. Empty when nothing was found.

    Changes compared with the previous implementation:
        * page numbers embedded in the heading text are parsed with
          ``int()`` instead of ``eval()`` (``eval('012')`` raised
          ``SyntaxError``; evaluating document text is unsafe anyway);
        * merging keeps the larger right edge, so an interval nested inside
          an earlier one no longer shrinks it;
        * no ``(-1, -1)`` placeholder is returned when every collected
          interval was discarded.
    """
    # Fuzzy keyword match on the heading text.
    keywords = ('资格审查资料', '资格审查材料', '其它材料', '其他材料', '其他资料', '附件', '影印件')
    intervals = []
    left_pos = -1   # page where the currently open interval starts, -1 = none
    right_pos = -1  # page where the most recently closed interval ended

    for title_block in title:
        block_text = title_block['text'].replace(' ', '').strip()

        # Close the open interval first; headings mentioning "证书" are
        # treated as still belonging to it and do not close it.
        if left_pos != -1 and '证书' not in block_text:
            right_pos = title_block['page_number']
            intervals.append((left_pos, right_pos))
            left_pos = -1

        if not any(keyword in block_text for keyword in keywords):
            continue

        if '.' in block_text:
            # Text after the last dot is taken as a page reference
            # (presumably a table-of-contents entry such as "附件....12").
            center_page = block_text.split('.')[-1]
            # isdecimal() accepts only characters int() can parse; anything
            # else leaves the interval unopened, as before.
            if center_page.isdecimal():
                left_pos = min(title_block['page_number'], int(center_page))
        else:
            left_pos = title_block['page_number']

    # An interval still open at the end is recorded with whatever right_pos
    # currently holds (-1 or the previous closing page). When that is smaller
    # than left_pos the pair is dropped by the merge step below.
    # NOTE(review): a section running to the end of the document is
    # therefore usually lost - confirm whether the last page should be used.
    if left_pos != -1:
        intervals.append((left_pos, right_pos))

    # Merge overlapping intervals.
    intervals.sort()
    merged = []
    for lo, hi in intervals:
        if hi < lo:
            continue  # inverted / unfinished interval
        if merged and lo <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], hi))
        else:
            merged.append((lo, hi))
    return merged


def locate_business_license(title):
    '''Collect candidate pages for the business licence from the headings.

    Args:
        title: Iterable of dicts with a ``'text'`` string and a
            ``'page_number'`` value.

    Returns:
        ``None`` when no heading contains one of the keywords. Otherwise the
        function returns ``target_list``.

    NOTE(review): ``start_threshold``, ``distance_threshold`` and
    ``target_list`` are not defined anywhere in the visible source. Unless
    they are provided elsewhere, every call that finds a keyword ends in a
    ``NameError``. ``filter_pages`` is computed but never turned into a
    result. The function looks unfinished; the intended thresholds and
    return value cannot be inferred from here.
    '''
    keywords = ("资格审查资料", "其它资格审查材料", "资格审查材料")
    candidate_pages = []  # pages of the headings that matched a keyword
    center_pages = []     # page numbers written in the heading text itself

    # Locate the keyword headings.
    for title_block in title:
        block_text = title_block['text'].replace(' ', '').strip()
        if not any(keyword in block_text for keyword in keywords):
            continue
        # Fuzzy outline lookup: text after the last dot is taken as a page
        # reference (presumably a table-of-contents entry).
        if '.' in block_text:
            center_page = block_text.split('.')[-1]
            # int() replaces the former eval(): eval('012') raised
            # SyntaxError, and document text should never be evaluated.
            if center_page.isdecimal():
                center_pages.append(int(center_page))
        candidate_pages.append(title_block['page_number'])

    # Every matching heading adds a candidate page, so an empty
    # candidate_pages implies an empty center_pages as well.
    if not candidate_pages:
        return None

    filter_pages = set()
    if not center_pages:
        filter_pages.update(candidate_pages)
    else:
        # center_pages act as anchors and are all kept.
        filter_pages.update(center_pages)
        # A candidate page is paired with every anchor that is close enough;
        # the midpoint between the two is added.
        for candidate_page in candidate_pages:
            if candidate_page <= start_threshold:  # NOTE(review): undefined here
                continue
            for center_page in center_pages:
                distance = abs(candidate_page - center_page)
                if distance <= distance_threshold:  # NOTE(review): undefined here
                    filter_pages.add(min(candidate_page, center_page) + distance // 2)

    # NOTE(review): target_list is undefined in the visible source; see the
    # docstring.
    return target_list

# textmind
# Load the parsed document. An earlier variant read an OCR text export
# instead of data_1.json.
with open('data_1.json', 'r', encoding='utf-8') as json_file:
    # The context manager closes the handle; it used to stay open.
    lines = json_file.read()
json_line = json.loads(lines)
print(json_line.keys())
para_nodes = json_line['para_nodes']
table_flag = 0

# Text of the first node whose node_type is 'contents' (presumably the table
# of contents); stays empty when there is none.
contents = ""
for node in para_nodes:
    if node['node_type'] == 'contents':
        contents = node['text']
        break

# Remove every run of dots and digits (dot leaders, page numbers, numbering).
contents = re.sub(r'[\.\d]+', '', contents)
# Main pass over para_nodes: collects headings into title_list and price-list
# tables into table_list.
# NOTE(review): nodes are read through both 'node_type' and 'para_type';
# whether the two keys always agree cannot be told from this file.
table_flag = 0   # nodes with index < table_flag are skipped at the top of the loop
title_list = []  # {'text', 'page_number'} dicts, one per detected heading
table_list = []  # (heading text, concatenated table text) tuples
char_hight = 13  # only referenced by commented-out code below
_index = 0       # number of nodes counted so far on the current page
page_num = -1    # page number of the most recently counted node
for i in range(len(para_nodes)):    
    # if not para_nodes[i]['node_type'] in ["contents",'table', 'text', 'head_tail']:
    #     print(para_nodes[i])
    # Skip nodes already consumed by the price-list section below.
    if i < table_flag:
        continue
    # Nodes without position information are ignored entirely.
    if not para_nodes[i]['position']:
        continue
    # Restart the per-page counter whenever the page changes ...
    if para_nodes[i]['position'][0]['pageno'] != page_num:
        page_num = para_nodes[i]['position'][0]['pageno']
        _index = 0
    # ... then count this node, so the first counted node of a page has _index == 1.
    if para_nodes[i]['position'][0]['pageno'] == page_num:
        # page_num = para_nodes[i]['position'][0]['pageno']
        _index = _index + 1
    # para_nodes[i]['position'][0]['pageno']

    # A heading is either a node typed 'title' whose last box value is truthy,
    # or one of the first two counted nodes of a page that passes is_title()
    # and is shorter than 20 characters.
    if para_nodes[i]['node_type'] == 'title' and para_nodes[i]['position'][0]['box'][-1]:
        title_list.append({'text':para_nodes[i]['text'], 'page_number' : int(para_nodes[i]['position'][0]['pageno'])})
    elif _index < 3 and is_title(para_nodes[i]['text']) and len(para_nodes[i]['text']) < 20:
        title_list.append({'text':para_nodes[i]['text'], 'page_number' : int(para_nodes[i]['position'][0]['pageno'])})
        # print(para_nodes[i]['text'])
    # elif is_title_v2(para_nodes[i]['text'], para_nodes[i]['position'][0]['box'] ) and len(para_nodes[i]['text']) < 20:
    #     print(para_nodes[i]['text'])
    # if para_nodes[i]['node_type'] == 'seal':  # seal / stamp
    #     print(para_nodes[i])
    # if len(para_nodes[i]['text']) > 5 and para_nodes[i]['text'] in contents and para_nodes[i]['position'][0]['box'][-1] >= char_hight:
    #     print(para_nodes[i]['text'])
    # Price lists inside quotation files / bid files: a non-table node whose
    # text names one of the price tables starts a collection run.
    if para_nodes[i]['node_type'] != 'table' and ('报价汇总表' in para_nodes[i]['text'] or '分项报价表' in para_nodes[i]['text'] or '工程量清单报价表' in para_nodes[i]['text'] or '报价明细表' in para_nodes[i]['text'] or '报价清单' in para_nodes[i]['text'] or ('报价表' in  para_nodes[i]['text'] and para_nodes[i]['node_type']=='title')):
        print(para_nodes[i])
        # The table name must END the text to count as the caption.
        flag_word = re.findall('报价汇总表$|分项报价表$|工程量清单报价表$|报价明细表$|报价清单$', para_nodes[i]['text'])
        if not flag_word and re.findall('报价表', para_nodes[i]['text']) and para_nodes[i]['node_type']=='title':
            # NOTE(review): flag_word becomes a str here, so flag_word[0]
            # below yields its first character '报', not '报价表' - confirm
            # whether a one-element list was intended.
            flag_word = '报价表'
        # These `continue`s also skip the bill-of-quantities section further
        # down for this node.
        if not flag_word:
            continue
        # Captions that start with "附件" are not collected.
        if re.findall('^附件', para_nodes[i]['text']):
            continue

        flag_word = flag_word[0]
        position_page_id = para_nodes[i]['position'][0]['pageno']
        # Find the end index j of the run: stop at the first non-table node
        # that lies more than two pages after position_page_id. Without a
        # break, j ends at the last node.
        for j in range(i, len(para_nodes)):
            if para_nodes[j]['para_type'] != 'table' and position_page_id + 2 < para_nodes[j]['position'][0]['pageno']:
                break
            # NOTE(review): this reads para_nodes[i], not para_nodes[j], so
            # position_page_id is only ever re-assigned its own value;
            # presumably j was meant - confirm.
            if para_nodes[i]['position'][0]['pageno'] - position_page_id < 2:
                # print(position_page_id)
                position_page_id = para_nodes[i]['position'][0]['pageno']

        # print(i, j)
        # Concatenate the text of the table nodes in [i, j].
        lines = ""
        for k in range(i, j+1):
            # word_flag is refreshed on non-table nodes only; table nodes
            # reuse the value from the last non-table node.
            if para_nodes[k]['node_type'] != 'table':
                word_flag = re.findall('报价汇总表|分项报价表|工程量清单报价表|报价明细表|安全文明措施|报价清单', para_nodes[k]['text'])
            # print(word_flag, flag_word)
            # Remember how far the run got so the outer loop skips these nodes.
            table_flag = k
            # A caption naming a different table ends the run.
            if word_flag and word_flag[0] != flag_word:
                break
            if para_nodes[k]['para_type'] != 'table':
                # print(para_nodes[k]['text'])
                continue
            _lines = para_nodes[k]['text'].split('\n')
            # When the first line of this table is nearly identical to the
            # first line collected so far (presumably a repeated header row
            # on a continuation page), its first line is dropped.
            # NOTE(review): chunks are appended without a separating '\n'.
            if lines and Levenshtein.ratio(lines.split('\n')[0], _lines[0]) > 0.96:
                lines = lines + '\n'.join(_lines[1:])
            else:
                lines = lines + '\n'.join(_lines[:])
            # print(_lines)
        # print(para_nodes[i]['text'], 'xxxxxxxxxx', lines)
        if not lines:
            continue
        table_list.append((para_nodes[i]['text'], lines))


    # Bill of quantities inside the technical specification.
    # NOTE(review): the `lines` built in this section is never stored or
    # printed, and table_flag is reused here as a 0/1 flag, which overwrites
    # the skip index set by the section above.
    if para_nodes[i]['node_type'] != 'table' and re.findall('工程量清单|材料清单|工作量清单|报价明细表|主要配置（含备品备件、专用工器具）', para_nodes[i]['text']):
        position_page_id = para_nodes[i]['position'][0]['pageno']
        table_flag = 0
        # Stop at the first non-table node met once table_flag has been set.
        for j in range(i, len(para_nodes)):
            if para_nodes[j]['para_type'] != 'table' and table_flag == 1:
                break
            # Reads para_nodes[i] again, so the difference is 0 on the first
            # pass and table_flag is set to 1 right away.
            if para_nodes[i]['position'][0]['pageno'] - position_page_id < 2:
                # print(position_page_id)
                position_page_id = para_nodes[i]['position'][0]['pageno']
                table_flag = 1
        # print(i, j)
        lines = ""
        for k in range(i, j+1):
            if para_nodes[k]['para_type'] != 'table':
                # print(para_nodes[k]['text'])
                continue
            lines = lines + para_nodes[k]['text']
        # print(para_nodes[i]['text'], 'xxxxxxxxxx', lines)
print(table_list)
# 表标题或者表格前标题：工程量清单、材料清单、工作量清单、报价明细表、主要配置（含备品备件、专用工器具）
# 表头：费用、单价、价格、含税价、单价、合价、估算工程量、单位

# file_content = json_line['para_nodes']
# for y in range(len(file_content[10:20])):
#     print(file_content[y])

# print(title_list)
# print(contents)


# print(search_interval(title_list))


# print(table_list)