# coding: utf-8
"""Title, page-interval and quotation-table heuristics for scanned bid documents.

The module defines a few helper predicates/locators and then, at module
level, runs an extraction pass over the parse result stored in
``data_1.json`` (a dict with a ``para_nodes`` list), printing the quotation
tables it finds.
"""
import os
import json
import re

import Levenshtein

# Scanned bid documents; presumably the column headers expected in their tables.
HEADERS = {
    '序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价',
    '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌',
    '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价(元)', '含税合价(元)',
    '条款号', '评分因素', '评分标准', '页码',
}

# Numbering/heading markers that identify a title paragraph.  Raw string so the
# regex escapes are not treated as (invalid) Python string escapes.
_TITLE_PATTERN = r'^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|^文件[一二三四五六七八九十]+|[一二三四五六七八九十]+[、要是]'
_APPENDIX_PATTERN = '^附录|^参考文献|^附表'


def is_title(line: str, list_key=None) -> bool:
    """Return True when *line* looks like a title paragraph.

    Args:
        line: Paragraph text; it is stripped before matching.
        list_key: Optional extra alternatives (titles or table-of-contents
            entries from the tender document's response-file format).  They
            are joined with ``|`` and used as regular expressions, unescaped.

    Returns:
        True if any numbering marker, any ``list_key`` alternative, or an
        appendix/bibliography prefix matches; otherwise False.
    """
    stripped = line.strip()
    pattern = _TITLE_PATTERN
    if list_key:
        pattern = '|'.join(list_key) + '|' + _TITLE_PATTERN
    if re.search(pattern, stripped):
        return True
    return bool(re.search(_APPENDIX_PATTERN, stripped))


def is_title_v2(line: str, box=()) -> bool:
    """Return True for a short, horizontally centred line without title markers.

    Args:
        line: Paragraph text.
        box: Four numbers; the first is used as the left offset and the third
            as the width (presumably ``[left, top, width, height]`` -- confirm
            against the parser output).

    Returns:
        False when *box* cannot be unpacked into four values, when *line* has
        fewer than two CJK characters, or when it is a figure/page caption;
        True when the line is shorter than 15 characters and its centre falls
        in the hard-coded window.
    """
    try:
        left, _, width, _ = box
    except (TypeError, ValueError):
        return False
    if len(re.findall('[\u4e00-\u9fa5]', line)) < 2:
        return False
    center = left + (width / 2)
    # The window 294..300 is hard-coded; presumably tuned to one page width.
    if left > 130 and 294 < center < 300 and len(line) < 15:
        # Figure captions and page markers are centred too but are not titles.
        return not re.findall('^图|图$|页$', line)
    return False


def _trailing_page_number(text):
    """Return the integer after the last '.' in *text*, or None if there is none."""
    if '.' not in text:
        return None
    tail = text.split('.')[-1]
    # isdecimal() guarantees int() succeeds; this replaces eval() on OCR text.
    return int(tail) if tail.isdecimal() else None


def search_interval(title):
    """Locate page intervals likely to hold licences, qualifications and similar scans.

    Args:
        title: Iterable of dicts with ``'text'`` and ``'page_number'`` keys.

    Returns:
        A list of merged ``(start_page, end_page)`` tuples sorted by start
        page.  An interval opens at a block containing one of the keywords
        and closes at the next block whose text does not contain '证书'.
        An interval that is still open after the last block is appended with
        the previous closing page and is normally discarded as inverted.
    """
    keywords = ('资格审查资料', '资格审查材料', '其它材料', '其他材料', '其他资料', '附件', '影印件')
    intervals = []
    left_pos = -1   # start page of the currently open interval, -1 if none
    right_pos = -1  # most recent closing page
    for title_block in title:
        block_text = title_block['text'].replace(' ', '').strip()
        page_number = title_block['page_number']
        # Close the open interval first; certificate headings stay inside it.
        if left_pos != -1 and '证书' not in block_text:
            right_pos = page_number
            intervals.append((left_pos, right_pos))
            left_pos = -1
        if any(keyword in block_text for keyword in keywords):
            # An outline entry may end in ".<page>"; use the earlier page.
            # NOTE(review): the collapsed original did not show which `if` its
            # fallback `else` belonged to; here the block's own page is used
            # whenever no trailing page number can be parsed.
            outline_page = _trailing_page_number(block_text)
            if outline_page is None:
                left_pos = page_number
            else:
                left_pos = min(page_number, outline_page)
    if left_pos != -1:
        intervals.append((left_pos, right_pos))

    # Merge overlapping intervals.
    intervals.sort()
    merged = []
    for start, end in intervals:
        if end < start:
            continue
        if merged and start <= merged[-1][1]:
            # max() keeps an interval from shrinking when a later one is nested.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


def locate_business_license(title, *, start_threshold=0, distance_threshold=0):
    """Locate candidate pages of the qualification-review (business licence) section.

    Args:
        title: Iterable of dicts with ``'text'`` and ``'page_number'`` keys.
        start_threshold: Candidate pages less than or equal to this value are
            not matched against outline pages.
        distance_threshold: Maximum distance between a candidate page and an
            outline page for their midpoint to be added.

    Returns:
        None when no keyword is found, otherwise a sorted list of page numbers.

    NOTE(review): the original body read ``start_threshold`` and
    ``distance_threshold`` and returned ``target_list`` without any of them
    being defined, so it raised NameError.  The defaults here are placeholders
    and the return value is the collected page set -- confirm the intended
    thresholds and return type.
    """
    keywords = ("资格审查资料", "其它资格审查材料", "资格审查材料")
    candidate_pages = []  # pages on which a matching block appears
    center_pages = []     # page numbers printed at the end of outline entries
    for title_block in title:
        block_text = title_block['text'].replace(' ', '').strip()
        if not any(keyword in block_text for keyword in keywords):
            continue
        outline_page = _trailing_page_number(block_text)
        if outline_page is not None:
            center_pages.append(outline_page)
        candidate_pages.append(title_block['page_number'])

    if not center_pages and not candidate_pages:
        return None

    filter_pages = set()
    if not center_pages:
        filter_pages.update(candidate_pages)
    elif not candidate_pages:
        filter_pages.update(center_pages)
    else:
        # Outline pages are anchors and are always kept.
        filter_pages.update(center_pages)
        for candidate_page in candidate_pages:
            if candidate_page <= start_threshold:
                continue
            for center_page in center_pages:
                distance = abs(candidate_page - center_page)
                if distance <= distance_threshold:
                    filter_pages.add(min(candidate_page, center_page) + distance // 2)
    return sorted(filter_pages)


# ---------------------------------------------------------------------------
# Script: scan the parsed document (the original comment says "textmind") for
# titles and quotation tables.
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# source; confirm against the original file.
# NOTE(review): nodes are read through both 'node_type' and 'para_type' keys,
# exactly as in the original; presumably the parser emits both -- confirm.
# ---------------------------------------------------------------------------
with open('data_1.json', 'r', encoding='utf-8') as fp:
    lines = fp.read()
json_line = json.loads(lines)
print(json_line.keys())
para_nodes = json_line['para_nodes']

# Text of the first table-of-contents node, with dots and digits removed.
table_flag = 0
contents = ""
for i in range(len(para_nodes)):
    if para_nodes[i]['node_type'] == 'contents':
        contents = para_nodes[i]['text']
        break
contents = re.sub(r'[\.\d]+', '', contents)

table_flag = 0   # outer-loop skip cursor: nodes with index < table_flag are skipped
title_list = []  # collected {'text', 'page_number'} title entries
table_list = []  # collected (heading text, concatenated table text) pairs
char_hight = 13
_index = 0       # 1-based position of the current node within its page
page_num = -1
for i in range(len(para_nodes)):
    if i < table_flag:
        continue
    node = para_nodes[i]
    if not node['position']:
        continue
    pageno = node['position'][0]['pageno']
    if pageno != page_num:
        page_num = pageno
        _index = 0
    _index = _index + 1
    text = node['text']
    node_type = node['node_type']

    # A node is a title when the parser says so (and its box height is
    # non-zero), or when it is one of the first two nodes on the page, is
    # short, and carries a title marker.
    if node_type == 'title' and node['position'][0]['box'][-1]:
        title_list.append({'text': text, 'page_number': int(pageno)})
    elif _index < 3 and is_title(text) and len(text) < 20:
        title_list.append({'text': text, 'page_number': int(pageno)})

    # Quotation lists in the quotation file / bid file.
    if node_type != 'table' and (
            '报价汇总表' in text
            or '分项报价表' in text
            or '工程量清单报价表' in text
            or '报价明细表' in text
            or '报价清单' in text
            or ('报价表' in text and node_type == 'title')):
        print(node)
        flag_word = re.findall('报价汇总表$|分项报价表$|工程量清单报价表$|报价明细表$|报价清单$', text)
        if not flag_word and re.findall('报价表', text) and node_type == 'title':
            # A list, so that flag_word[0] below yields the whole keyword
            # (the original assigned a bare string and got only '报').
            flag_word = ['报价表']
        if not flag_word:
            continue
        if re.findall('^附件', text):
            continue
        flag_word = flag_word[0]
        position_page_id = pageno
        for j in range(i, len(para_nodes)):
            if (para_nodes[j]['para_type'] != 'table'
                    and position_page_id + 2 < para_nodes[j]['position'][0]['pageno']):
                break
            # NOTE(review): this indexes with i, not j, so position_page_id
            # never advances past the heading's page; kept as in the original.
            if para_nodes[i]['position'][0]['pageno'] - position_page_id < 2:
                position_page_id = para_nodes[i]['position'][0]['pageno']
        lines = ""
        for k in range(i, j + 1):
            if para_nodes[k]['node_type'] != 'table':
                word_flag = re.findall('报价汇总表|分项报价表|工程量清单报价表|报价明细表|安全文明措施|报价清单', para_nodes[k]['text'])
                table_flag = k
                # A heading for a different kind of list ends this table.
                if word_flag and word_flag[0] != flag_word:
                    break
            if para_nodes[k]['para_type'] != 'table':
                continue
            _lines = para_nodes[k]['text'].split('\n')
            # Drop the first row of a continuation table when it nearly
            # repeats the first row already collected.
            if lines and Levenshtein.ratio(lines.split('\n')[0], _lines[0]) > 0.96:
                lines = lines + '\n'.join(_lines[1:])
            else:
                lines = lines + '\n'.join(_lines)
        if not lines:
            continue
        table_list.append((text, lines))

    # Bill of quantities inside the technical specification.
    # NOTE(review): the ASCII parentheses in this pattern form a regex group,
    # so the last alternative does not match literal parentheses -- confirm.
    if node_type != 'table' and re.findall('工程量清单|材料清单|工作量清单|报价明细表|主要配置(含备品备件、专用工器具)', text):
        position_page_id = pageno
        # NOTE(review): this reuses table_flag and therefore overwrites the
        # outer-loop skip cursor; the text gathered below is never stored.
        table_flag = 0
        for j in range(i, len(para_nodes)):
            if para_nodes[j]['para_type'] != 'table' and table_flag == 1:
                break
            if para_nodes[i]['position'][0]['pageno'] - position_page_id < 2:
                position_page_id = para_nodes[i]['position'][0]['pageno']
                table_flag = 1
        lines = ""
        for k in range(i, j + 1):
            if para_nodes[k]['para_type'] != 'table':
                continue
            lines = lines + para_nodes[k]['text']

print(table_list)

# Table titles, or headings that precede a table: 工程量清单, 材料清单, 工作量清单,
# 报价明细表, 主要配置(含备品备件、专用工器具).
# Table header cells: 费用, 单价, 价格, 含税价, 合价, 估算工程量, 单位.