# coding: utf-8
import json
import os
import re

import Levenshtein
# Scanned bid documents: column-header cell texts (presumably used to
# recognise table header rows -- the constant is not referenced in this file).
HEADERS = {
    '序号', '项目编码', '项目名称', '项目特征', '单位', '工程量',
    '全费用综合单价', '合价', '备注', '主材名称', '规格型号',
    '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量',
    '含税单价(元)', '含税合价(元)', '条款号', '评分因素', '评分标准', '页码',
}
def is_title(line: str, list_key=None) -> bool:
    """Return True when *line* looks like a heading.

    The working assumption is that a heading occupies a single line. A line
    is accepted when, after stripping surrounding whitespace, it matches one
    of the numbering patterns below (``(一)``, ``1.`` .. ``29.``, ``第X章/节/条``,
    ``文件一``, ``一、`` ...), one of the extra alternatives in *list_key*, or
    starts with ``附录`` / ``参考文献`` / ``附表``.

    Args:
        line: Paragraph text to test.
        list_key: Optional extra alternatives, e.g. headings or table-of-
            contents entries taken from the tender's response-file format.
            They are joined with ``|`` and used as regex fragments exactly as
            given (not escaped), so regex metacharacters in them are live.

    Returns:
        True if the line matches any heading pattern, otherwise False.
    """
    # Raw strings keep the pattern byte-identical to the historical one while
    # avoiding invalid-escape warnings for ``\(``, ``\d`` and ``\.``.
    numbering = (
        r'^[(\(][一二三四五六七八九十]+[\))]'
        r'|^\d\.|^1\d\.|^2\d\.'
        r'|^[第][一二三四五六七八九十\d]+[章节条]'
        r'|^文件[一二三四五六七八九十]+'
        r'|[一二三四五六七八九十]+[、要是]'
    )
    if list_key:
        pattern = '|'.join(list_key) + '|' + numbering
    else:
        pattern = numbering

    text = line.strip()
    if re.search(pattern, text):
        return True
    # Appendix / bibliography / attached-table headings carry no numbering.
    return re.search(r'^附录|^参考文献|^附表', text) is not None
def is_title_v2(line: str, box=None) -> bool:
    """Detect a heading by layout when the text has no heading numbering.

    A paragraph qualifies when it is short, contains at least two CJK
    characters, and is horizontally centred on the page.

    Args:
        line: Paragraph text.
        box: Four numbers describing the paragraph box. The first is used as
            the left x-coordinate and the third as the width; the other two
            are ignored (presumably top and height -- confirm against the
            producer of ``position[...]['box']``).

    Returns:
        True if the paragraph looks like a centred heading. False otherwise,
        including when *box* is missing or does not hold exactly four values.
    """
    try:
        left, _second, width, _fourth = box
    except (TypeError, ValueError):
        # None / non-iterable -> TypeError; wrong number of items -> ValueError.
        return False

    # Require at least two CJK characters so page numbers, stray digits and
    # symbols are not mistaken for headings.
    if len(re.findall('[\u4e00-\u9fa5]', line)) < 2:
        return False

    centre = left + (width / 2)
    # The 130 left margin and the 294-300 centre band are fixed constants;
    # presumably tuned to one page width -- confirm before reusing elsewhere.
    if left > 130 and 294 < centre < 300 and len(line) < 15:
        # Figure captions ("图...", "...图") and page footers ("...页") are
        # centred too but are not headings.
        return not re.search('^图|图$|页$', line)
    return False
def search_interval(title):
    """Locate the page ranges likely to hold licence / qualification /
    track-record / financial-report images.

    The titles are walked in order. A title containing one of the keywords
    opens an interval; the next title that does not contain ``证书`` closes
    it on that title's page. The collected intervals are then sorted and
    overlapping or touching ones are merged.

    Args:
        title: Iterable of dicts with a ``'text'`` string and an integer
            ``'page_number'``.

    Returns:
        A sorted list of merged ``(start_page, end_page)`` tuples. Intervals
        whose end lies before their start are discarded, which includes an
        interval still open after the last title (no closing page is known).
    """
    # Fuzzy keyword match on the title text.
    keywords = ['资格审查资料', '资格审查材料', '其它材料', '其他材料', '其他资料', '附件', '影印件']
    raw_intervals = []
    left_pos = -1   # start page of the currently open interval, -1 if none
    right_pos = -1  # end page of the most recently closed interval
    for title_block in title:
        block_text = title_block['text'].replace(' ', '').strip()
        page_number = title_block['page_number']

        # Close the open interval first, unless this title is still part of
        # the certificate material.
        if left_pos != -1 and '证书' not in block_text:
            right_pos = page_number
            raw_intervals.append((left_pos, right_pos))
            left_pos = -1

        if any(keyword in block_text for keyword in keywords):
            # Default to the page the title itself sits on. Previously a
            # title containing a dot but no trailing page number
            # (e.g. "3.资格审查资料") did not open an interval at all.
            left_pos = page_number
            # Outline entries such as "附件....12" carry their target page
            # after the last dot; prefer the earlier of the two pages.
            tail = block_text.split('.')[-1] if '.' in block_text else ''
            if tail.isdecimal():
                # int() instead of eval(): eval("07") raises SyntaxError and
                # evaluating document text is unsafe.
                left_pos = min(page_number, int(tail))

    # An interval left open after the last title is recorded with the stale
    # right_pos; it is normally dropped below because its end precedes its start.
    if left_pos != -1:
        raw_intervals.append((left_pos, right_pos))

    # Merge overlapping / touching intervals.
    merged = []
    for start, end in sorted(raw_intervals):
        if end < start:
            continue
        if merged and start <= merged[-1][1]:
            # max() keeps the right edge when this interval is fully contained
            # in the previous one (the old code shrank it).
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged
def locate_business_license(title):
    """Collect the pages on which the business licence is likely to appear.

    Titles containing one of the qualification-review keywords contribute
    their own page (``candidate_pages``) and, when the title text ends in
    ``.<digits>`` like an outline entry, that outline page (``center_pages``).

    NOTE(review): this function is incomplete as it stands. It references
    ``start_threshold``, ``distance_threshold`` and ``target_list``, none of
    which is defined in this file, so every call that finds a matching title
    ends in ``NameError``. The step that turns ``filter_pages`` into the
    returned list appears to be missing. The names are kept unchanged until
    their intended values are known.

    Args:
        title: Iterable of dicts with a ``'text'`` string and an integer
            ``'page_number'``.

    Returns:
        None when no title matches any keyword; otherwise ``target_list``
        (see the note above).
    """
    keywords = ["资格审查资料", "其它资格审查材料", "资格审查材料"]
    candidate_pages = []
    center_pages = []
    for title_block in title:
        block_text = title_block['text'].replace(' ', '').strip()
        for keyword in keywords:
            if keyword not in block_text:
                continue
            # Fuzzy outline lookup: "资格审查资料....12" -> page 12.
            if '.' in block_text:
                tail = block_text.split('.')[-1]
                if tail.isdecimal():
                    # int() instead of eval(): eval("07") raises SyntaxError
                    # and evaluating document text is unsafe.
                    center_pages.append(int(tail))
            candidate_pages.append(title_block['page_number'])

    # Every keyword hit appends to candidate_pages, so an empty
    # candidate_pages implies an empty center_pages as well.
    if not candidate_pages:
        return None

    filter_pages = set()
    if not center_pages:
        filter_pages.update(candidate_pages)
    else:
        # Outline pages act as anchors and are all kept.
        filter_pages.update(center_pages)
        # A candidate page is paired with an anchor when they are close
        # enough; the midpoint of the pair is added.
        for candidate_page in candidate_pages:
            if candidate_page <= start_threshold:  # NOTE(review): undefined name
                continue
            for center_page in center_pages:
                distance = abs(candidate_page - center_page)
                if distance <= distance_threshold:  # NOTE(review): undefined name
                    filter_pages.add(min(candidate_page, center_page) + distance // 2)

    # NOTE(review): target_list is never built from filter_pages in this file.
    return target_list
# --- TextMind document-parsing output -> headings and quotation tables -----
# Input: the JSON produced for one bid document. (During development the
# OCR text file '三峡左岸地坪商务标_合并_ocr.txt' was used as an alternative.)
with open('data_1.json', 'r', encoding='utf-8') as json_file:
    lines = json_file.read()
json_line = json.loads(lines)
print(json_line.keys())
para_nodes = json_line['para_nodes']

# Text of the first table-of-contents node, with dots and digits removed
# (dot leaders and page numbers). Only used by experiments that are
# currently disabled.
contents = ""
for para_node in para_nodes:
    if para_node['node_type'] == 'contents':
        contents = para_node['text']
        break
contents = re.sub(r'[\.\d]+', '', contents)

table_flag = 0    # index of the last node consumed by a quotation-table scan
title_list = []   # [{'text': ..., 'page_number': ...}] detected headings
table_list = []   # [(heading text, concatenated table text)]
char_hight = 13   # only referenced by disabled experiments
_index = 0        # 1-based position of the current node within its page
page_num = -1
for i in range(len(para_nodes)):
    node = para_nodes[i]
    # Skip nodes already swallowed by the previous quotation-table scan.
    if i < table_flag:
        continue
    if not node['position']:
        continue
    node_page = node['position'][0]['pageno']
    node_text = node['text']
    node_type = node['node_type']

    # Restart the per-page counter on a page change, then count this node.
    if node_page != page_num:
        page_num = node_page
        _index = 0
    _index = _index + 1

    # Headings: nodes typed 'title' whose last box value is non-zero, or one
    # of the first two nodes on a page that is short and matches the heading
    # patterns of is_title().
    if node_type == 'title' and node['position'][0]['box'][-1]:
        title_list.append({'text': node_text, 'page_number': int(node_page)})
    elif _index < 3 and is_title(node_text) and len(node_text) < 20:
        title_list.append({'text': node_text, 'page_number': int(node_page)})

    # --- Quotation lists in the quotation file / bid file -----------------
    mentions_quotation = any(
        word in node_text
        for word in ('报价汇总表', '分项报价表', '工程量清单报价表', '报价明细表', '报价清单')
    ) or ('报价表' in node_text and node_type == 'title')
    if node_type != 'table' and mentions_quotation:
        print(node)
        # The heading must END with the keyword; a plain "报价表" only counts
        # on a title node.
        flag_word = re.findall('报价汇总表$|分项报价表$|工程量清单报价表$|报价明细表$|报价清单$', node_text)
        if not flag_word and re.findall('报价表', node_text) and node_type == 'title':
            # Wrapped in a list so flag_word[0] below yields the whole word;
            # the bare string used before produced just '报'.
            flag_word = ['报价表']
        if not flag_word:
            continue
        if re.findall('^附件', node_text):
            continue
        flag_word = flag_word[0]

        # Find the end (index j) of the region that may still belong to this
        # heading: stop at the first non-table node more than two pages on.
        position_page_id = node_page
        for j in range(i, len(para_nodes)):
            if para_nodes[j]['para_type'] != 'table' and position_page_id + 2 < para_nodes[j]['position'][0]['pageno']:
                break
            # NOTE(review): this compares node i (not j) with its own page,
            # so the condition is always true and position_page_id never
            # advances. Possibly `j` was intended -- left unchanged because
            # that would widen the scanned region.
            if para_nodes[i]['position'][0]['pageno'] - position_page_id < 2:
                position_page_id = para_nodes[i]['position'][0]['pageno']

        # Concatenate the tables in [i, j], stopping at the first non-table
        # node that names a different quotation keyword.
        lines = ""
        for k in range(i, j + 1):
            if para_nodes[k]['node_type'] != 'table':
                word_flag = re.findall('报价汇总表|分项报价表|工程量清单报价表|报价明细表|安全文明措施|报价清单', para_nodes[k]['text'])
                table_flag = k
                if word_flag and word_flag[0] != flag_word:
                    break
            if para_nodes[k]['para_type'] != 'table':
                continue
            _lines = para_nodes[k]['text'].split('\n')
            # A continuation table repeats the header row: drop it when it is
            # nearly identical to the first row collected so far.
            # NOTE(review): no '\n' is inserted between consecutive tables, so
            # rows may be glued together unless table text ends with a newline.
            if lines and Levenshtein.ratio(lines.split('\n')[0], _lines[0]) > 0.96:
                lines = lines + '\n'.join(_lines[1:])
            else:
                lines = lines + '\n'.join(_lines[:])
        if not lines:
            continue
        table_list.append((node_text, lines))

    # --- Bill of quantities inside the technical specification ------------
    # NOTE(review): the text collected here is never stored, so this section
    # currently has no effect on table_list.
    if node_type != 'table' and re.findall('工程量清单|材料清单|工作量清单|报价明细表|主要配置(含备品备件、专用工器具)', node_text):
        position_page_id = node_page
        # A dedicated flag: this section used to reuse `table_flag` as a 0/1
        # marker, which wiped the skip index set by the quotation scan above.
        scan_started = False
        for j in range(i, len(para_nodes)):
            if para_nodes[j]['para_type'] != 'table' and scan_started:
                break
            # NOTE(review): always true for the same reason as in the
            # quotation scan above (node i is compared with its own page).
            if para_nodes[i]['position'][0]['pageno'] - position_page_id < 2:
                position_page_id = para_nodes[i]['position'][0]['pageno']
                scan_started = True
        lines = ""
        for k in range(i, j + 1):
            if para_nodes[k]['para_type'] != 'table':
                continue
            lines = lines + para_nodes[k]['text']
print(table_list)
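# Table title, or the heading right before a table: 工程量清单 (bill of quantities), 材料清单 (material list), 工作量清单 (work-quantity list), 报价明细表 (itemised quotation), 主要配置(含备品备件、专用工器具) (main configuration incl. spare parts and special tools)
# Table header cells: 费用 (cost), 单价 (unit price), 价格 (price), 含税价 (tax-inclusive price), 合价 (total price), 估算工程量 (estimated quantity), 单位 (unit)
# file_content = json_line['para_nodes']
# for y in range(len(file_content[10:20])):
#     print(file_content[y])
# print(title_list)
# print(contents)
# print(search_interval(title_list))
# print(table_list)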