#coding:utf-8
"""Explore an OCR'd bid document: collect headings and price-table text.

The script loads a JSON export (a dict with a ``para_nodes`` list), prints
the document's top-level keys, gathers probable section titles together with
their page numbers, echoes any ``seal`` nodes, and prints the concatenated
text of the table nodes that follow each price-table heading.

NOTE(review): this file had been collapsed onto two physical lines, so the
block nesting below was reconstructed from statement order and variable use.
Confirm it against an intact copy if one exists.
"""
import os
import json
import re

# Scanned document - bid (tender) file.
# Column header strings; not referenced anywhere in the visible code.
HEADERS = {'序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价', '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价(元)', '含税合价(元)', '条款号', '评分因素', '评分标准', '页码'}

# Input file read by main().  A commented-out alternative in the original
# pointed at 'data_1.json'.
INPUT_PATH = '三峡左岸地坪商务标_合并_ocr.txt'

# Numbered-heading shapes: "(一)", "1.", "1x.", "2x.", "第…章/节/条", plus an
# unanchored run of Chinese numerals followed by one of the characters 、要是.
_NUMBERED_TITLE_RE = re.compile(
    r'^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|[一二三四五六七八九十]+[、要是]'
)
# Fixed heading prefixes (appendix, references, attached table).
_KEYWORD_TITLE_RE = re.compile(r'^附录|^参考文献|^附表')
# Headings that open a price-table section.
_PRICE_TABLE_RE = re.compile(r'报价汇总表|分项报价表|工程量清单报价表|报价明细表')
# Headings that may close a price-table section when they differ from the
# heading that opened it.
_SECTION_BREAK_RE = re.compile(r'报价汇总表|分项报价表|工程量清单报价表|报价明细表|安全文明措施')
# Dots and digits removed from the table-of-contents text.
_DOTS_AND_DIGITS_RE = re.compile(r'[\.\d]+')


def is_title(line: str) -> bool:
    """Return True when *line* looks like a section heading.

    Surrounding whitespace is ignored.  A line qualifies when it matches one
    of the numbered-heading shapes or starts with one of the fixed heading
    keywords; see the module-level patterns for the exact forms.
    """
    stripped = line.strip()
    return bool(
        _NUMBERED_TITLE_RE.search(stripped)
        or _KEYWORD_TITLE_RE.search(stripped)
    )


def _contents_text(para_nodes) -> str:
    """Return the first 'contents' node's text with dots and digits removed."""
    raw = next(
        (node['text'] for node in para_nodes if node['node_type'] == 'contents'),
        "",
    )
    return _DOTS_AND_DIGITS_RE.sub('', raw)


def _section_end(para_nodes, start: int, heading_page: int) -> int:
    """Return the index of the last node to inspect for a price-table section.

    Walks forward from *start* and stops at the first non-table node that
    lies more than two pages after *heading_page*; otherwise runs to the end
    of the list.
    """
    end = start
    for j in range(start, len(para_nodes)):
        end = j
        candidate = para_nodes[j]
        if candidate['para_type'] == 'table':
            continue
        if not candidate['position']:
            # No page information, so the page-distance test cannot apply.
            # The original indexed position[0] here unguarded.
            continue
        if heading_page + 2 < candidate['position'][0]['pageno']:
            break
        # NOTE(review): the original also reassigned the reference page here,
        # but indexed it with the heading's index rather than j, which made
        # the statement a no-op.  It is omitted; behaviour is unchanged.
    return end


def main() -> None:
    """Load the OCR JSON and print titles, seals and price-table text."""
    with open(INPUT_PATH, 'r', encoding='utf-8') as handle:
        document = json.load(handle)
    print(document.keys())
    para_nodes = document['para_nodes']

    contents = _contents_text(para_nodes)

    title_list = []
    resume_at = 0       # nodes before this index were consumed by a section
    nodes_on_page = 0   # position-bearing nodes seen so far on this page
    page_num = -1

    for i, node in enumerate(para_nodes):
        if i < resume_at:
            continue
        if not node['position']:
            continue

        first_position = node['position'][0]
        pageno = first_position['pageno']
        if pageno != page_num:
            page_num = pageno
            nodes_on_page = 0
        nodes_on_page += 1

        text = node['text']
        if node['node_type'] == 'title' and first_position['box'][-1]:
            title_list.append((text, pageno))
        elif nodes_on_page < 3 and is_title(text) and len(text) < 20:
            # Untyped headings only count among the first two nodes of a page.
            title_list.append((text, pageno))

        if node['node_type'] == 'seal':  # seal / stamp
            print(node)

        if node['node_type'] == 'table':
            continue
        heading = _PRICE_TABLE_RE.search(text)
        if heading is None:
            continue
        flag_word = heading.group(0)

        end = _section_end(para_nodes, i, pageno)

        pieces = []
        for k in range(i, end + 1):
            current = para_nodes[k]
            # NOTE(review): both 'node_type' and 'para_type' are read, as in
            # the original; confirm whether the two keys really differ.
            if current['node_type'] != 'table':
                resume_at = k
                breaker = _SECTION_BREAK_RE.search(current['text'])
                if breaker and breaker.group(0) != flag_word:
                    # A different section heading ends this section.
                    break
            if current['para_type'] != 'table':
                continue
            pieces.append(current['text'])
        print(text, 'xxxxxxxxxx', "".join(pieces))

    print(title_list)
    print(contents)


if __name__ == "__main__":
    main()