|
@@ -0,0 +1,283 @@
|
|
|
+#coding:utf-8
|
|
|
+
|
|
|
+import os
|
|
|
+import json
|
|
|
+import re
|
|
|
+import Levenshtein
|
|
|
+
|
|
|
+
|
|
|
# Scanned documents - bid files.
# Set of header strings; nothing else in this file references it yet.
HEADERS = {
    '序号', '项目编码', '项目名称', '项目特征', '单位', '工程量',
    '全费用综合单价', '合价', '备注', '主材名称', '规格型号',
    '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量',
    '含税单价(元)', '含税合价(元)', '条款号', '评分因素', '评分标准', '页码',
}
|
|
|
+
|
|
|
+
|
|
|
# Built-in heading markers, tried against the stripped line:
#   "(一)" / "（一）" at the start, "3." / "12." / "25." at the start,
#   "第…章/节/条" at the start, "文件一" at the start,
#   a Chinese numeral followed by 、/要/是 anywhere in the line,
#   and the back-matter headings 附录 / 参考文献 / 附表 at the start.
_TITLE_PATTERN = re.compile(
    r'^[(（][一二三四五六七八九十]+[)）]'
    r'|^[12]?\d\.'
    r'|^第[一二三四五六七八九十\d]+[章节条]'
    r'|^文件[一二三四五六七八九十]+'
    r'|[一二三四五六七八九十]+[、要是]'
    r'|^附录|^参考文献|^附表'
)


def is_title(line: str, list_key=None) -> bool:
    """Return True when ``line`` looks like a heading.

    The working assumption (from the original notes) is that a heading
    usually occupies a single line.

    Args:
        line: Paragraph text. Leading/trailing whitespace is ignored.
        list_key: Optional extra patterns; per the original notes these are
            the response-file format entries (titles or table-of-contents
            lines) of the tender document. Each entry is searched for as a
            regular expression anywhere in the line.

    Returns:
        True if any extra pattern or any built-in heading marker matches.

    Differences from the previous implementation:
        * empty entries in ``list_key`` are ignored (an empty alternative
          used to match every line);
        * an entry that is not a valid regular expression is matched as
          literal text instead of raising ``re.error``;
        * full-width parentheses are accepted for "（一）"-style numbering.
    """
    text = line.strip()

    for key in list_key or ():
        if not key:
            continue
        try:
            found = re.search(key, text)
        except re.error:
            # Titles copied from a document often contain unbalanced
            # brackets; fall back to a literal comparison.
            found = re.search(re.escape(key), text)
        if found:
            return True

    return _TITLE_PATTERN.search(text) is not None
|
|
|
+
|
|
|
def is_title_v2(line: str, box=None) -> bool:
    """Detect a heading by layout when the text has no heading markers.

    A line qualifies when it is short, contains at least two CJK
    characters, and its box is horizontally centred.

    Args:
        line: Paragraph text.
        box: Four-item sequence whose first item is the left offset and
            third item is the width; the second and fourth items are not
            used. Anything that cannot be unpacked into four values
            (including the default ``None``) yields False.

    Returns:
        True when every condition below holds:
          * at least two characters in the range U+4E00..U+9FA5;
          * fewer than 15 characters in total;
          * ``left > 130`` and ``294 < left + width / 2 < 300``;
          * the text neither starts with 图 nor ends with 图 or 页.

    NOTE(review): the numeric bounds are hard-coded; they presumably encode
    one specific page width — confirm before reusing on other layouts.
    """
    try:
        left, _unused_second, width, _unused_height = box
    except (TypeError, ValueError):
        # Missing box, or a box that does not have exactly four items.
        return False

    if len(re.findall(r'[\u4e00-\u9fa5]', line)) < 2:
        return False
    if len(line) >= 15 or left <= 130:
        return False

    center_x = left + width / 2
    if not 294 < center_x < 300:
        return False

    # Figure captions and page markers are centred too; exclude them.
    return re.search(r'^图|图$|页$', line) is None
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def search_interval(title):
    """Locate page ranges likely to hold licence/qualification/finance scans.

    Per the original notes, the ranges are meant to bracket images of the
    business licence, qualifications, past performance and financial
    reports. They are found by fuzzy keyword matching on headings.

    Args:
        title: Iterable of dicts with a ``'text'`` string and a
            ``'page_number'`` value, in document order.

    Returns:
        A list of ``(start_page, end_page)`` tuples, sorted and with
        overlapping ranges merged. Empty when nothing usable was found.

    How a range is built:
        * A heading containing one of the keywords opens a range at its
          page. If the heading ends in ``.<digits>`` (e.g. a
          table-of-contents line), the smaller of that number and the
          heading's own page is used.
        * The next heading that does not contain 证书 closes the range at
          its page.

    Differences from the previous implementation:
        * the trailing number is parsed with ``int`` rather than ``eval``
          (``eval("05")`` raised SyntaxError, and ``eval`` on document
          text is unsafe);
        * merging keeps the larger end page instead of overwriting it, so
          a contained range can no longer shrink its container;
        * no ``(-1, -1)`` placeholder is returned when every raw range was
          discarded;
        * a keyword heading always opens a range, with or without a dot
          in its text.
    """
    keywords = ('资格审查资料', '资格审查材料', '其它材料', '其他材料',
                '其他资料', '附件', '影印件')

    raw_intervals = []
    left_pos = -1   # start page of the currently open range, -1 if none
    right_pos = -1  # end page of the most recently closed range

    for title_block in title:
        block_text = title_block['text'].replace(' ', '').strip()
        page_number = title_block['page_number']

        # Close the open range first; the same heading may open a new one.
        if left_pos != -1 and '证书' not in block_text:
            right_pos = page_number
            raw_intervals.append((left_pos, right_pos))
            left_pos = -1

        if any(keyword in block_text for keyword in keywords):
            left_pos = page_number
            tail = block_text.rsplit('.', 1)[-1] if '.' in block_text else ''
            if tail.isdecimal():
                left_pos = min(page_number, int(tail))

    # A range still open at the end has no real end page; it is recorded
    # with the last known end and dropped below if that precedes its start.
    if left_pos != -1:
        raw_intervals.append((left_pos, right_pos))

    merged = []
    for start, end in sorted(raw_intervals):
        if end < start:
            continue
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged
|
|
|
+
|
|
|
+
|
|
|
def locate_business_license(title, start_threshold=0, distance_threshold=5):
    """Pick candidate pages for the business licence from heading data.

    Args:
        title: Iterable of dicts with a ``'text'`` string and a
            ``'page_number'`` value.
        start_threshold: Heading pages less than or equal to this value are
            not paired with anchor pages.
        distance_threshold: Maximum page distance between a heading page
            and an anchor page for their midpoint to be added.

    Returns:
        ``None`` when no heading contains a keyword; otherwise a sorted
        list of distinct page numbers.

    Page selection:
        * "candidate" pages are the pages of headings containing a keyword;
        * "anchor" pages are numbers found after the last ``.`` of such a
          heading (e.g. a table-of-contents entry);
        * with no anchors, the candidates are returned as they are;
        * otherwise all anchors are returned, plus the midpoint of every
          candidate/anchor pair that passes both thresholds.

    NOTE(review): the previous body read ``start_threshold``,
    ``distance_threshold`` and ``target_list`` without defining them
    anywhere in the file, so it raised NameError for any non-empty match.
    The two thresholds are now keyword parameters whose defaults are
    placeholders — confirm the intended values. The page-to-image lookup
    implied by the old "return target_path list" comment was never written;
    the selected pages are returned instead.
    """
    keywords = ("资格审查资料", "其它资格审查材料", "资格审查材料")
    candidate_pages = []
    center_pages = []

    for title_block in title:
        block_text = title_block['text'].replace(' ', '').strip()
        if not any(keyword in block_text for keyword in keywords):
            continue
        # Rough outline-based location: a trailing ".<digits>" is taken as
        # the page number printed in the outline.
        tail = block_text.rsplit('.', 1)[-1] if '.' in block_text else ''
        if tail.isdecimal():
            # int() instead of eval(): safe, and accepts "05".
            center_pages.append(int(tail))
        candidate_pages.append(title_block['page_number'])

    if not candidate_pages and not center_pages:
        return None
    if not center_pages:
        return sorted(set(candidate_pages))

    # Anchors are always kept; candidates contribute a midpoint with each
    # anchor they are close to.
    filter_pages = set(center_pages)
    for candidate_page in candidate_pages:
        if candidate_page <= start_threshold:
            continue
        for center_page in center_pages:
            distance = abs(candidate_page - center_page)
            if distance <= distance_threshold:
                filter_pages.add(min(candidate_page, center_page) + distance // 2)

    return sorted(filter_pages)
|
|
|
+
|
|
|
# textmind: walk the paragraph nodes of one parsed document, collecting
# headings into `title_list` and quotation tables into `table_list`.
# NOTE(review): this runs at import time against a hard-coded input file.
with open('data_1.json', 'r', encoding='utf-8') as json_file:
    json_line = json.load(json_file)
print(json_line.keys())
para_nodes = json_line['para_nodes']

# Text of the first node typed 'contents', with dots and digits removed.
contents = next(
    (node['text'] for node in para_nodes if node['node_type'] == 'contents'),
    "",
)
contents = re.sub(r'[\.\d]+', '', contents)

title_list = []
table_list = []
char_hight = 13   # only referenced by code that is currently disabled
table_flag = 0    # nodes with an index below this were already consumed
_index = 0        # 1-based position of the current node within its page
page_num = -1

for i in range(len(para_nodes)):
    if i < table_flag:
        continue
    node = para_nodes[i]
    if not node['position']:
        continue

    pageno = node['position'][0]['pageno']
    if pageno != page_num:
        page_num = pageno
        _index = 0
    _index = _index + 1

    text = node['text']
    node_type = node['node_type']

    # A heading is either typed 'title' with a truthy last box value, or
    # one of the first two nodes on its page that is short and passes
    # is_title().
    if node_type == 'title' and node['position'][0]['box'][-1]:
        title_list.append({'text': text, 'page_number': int(pageno)})
    elif _index < 3 and is_title(text) and len(text) < 20:
        title_list.append({'text': text, 'page_number': int(pageno)})

    # Both sections below only apply to non-table nodes.
    if node_type == 'table':
        continue

    # --- Quotation lists in quotation / bid documents ---
    is_quote_heading = (
        '报价汇总表' in text
        or '分项报价表' in text
        or '工程量清单报价表' in text
        or '报价明细表' in text
        or '报价清单' in text
        or ('报价表' in text and node_type == 'title')
    )
    if is_quote_heading:
        print(node)
        flag_word = re.findall('报价汇总表$|分项报价表$|工程量清单报价表$|报价明细表$|报价清单$', text)
        if not flag_word and '报价表' in text and node_type == 'title':
            # Kept as a list so the [0] below yields the whole word
            # (the previous code stored a str and ended up with '报').
            flag_word = ['报价表']
        if not flag_word:
            continue
        if text.startswith('附件'):
            continue

        flag_word = flag_word[0]
        position_page_id = pageno

        # Look ahead to the first non-table node more than two pages past
        # the heading. Nodes without position data cannot be placed on a
        # page, so they never end the window (they used to raise
        # IndexError here).
        # NOTE(review): the previous loop also re-assigned position_page_id
        # from node i on every pass, which never changed it; the window is
        # therefore fixed at the heading's page + 2.
        j = i
        for j in range(i, len(para_nodes)):
            ahead = para_nodes[j]
            if (ahead['para_type'] != 'table'
                    and ahead['position']
                    and position_page_id + 2 < ahead['position'][0]['pageno']):
                break

        lines = ""
        for k in range(i, j + 1):
            current = para_nodes[k]
            if current['node_type'] != 'table':
                word_flag = re.findall('报价汇总表|分项报价表|工程量清单报价表|报价明细表|安全文明措施|报价清单', current['text'])
                # Resume the outer loop from the last non-table node seen.
                table_flag = k
                if word_flag and word_flag[0] != flag_word:
                    # A different quotation heading starts here.
                    break
            if current['para_type'] != 'table':
                continue
            _lines = current['text'].split('\n')
            # A continuation table repeats the header row; drop the repeat.
            if lines and Levenshtein.ratio(lines.split('\n')[0], _lines[0]) > 0.96:
                _lines = _lines[1:]
            chunk = '\n'.join(_lines)
            # Keep rows of consecutive tables on separate lines.
            if lines and chunk and not lines.endswith('\n'):
                lines = lines + '\n'
            lines = lines + chunk

        if not lines:
            continue
        table_list.append((text, lines))

    # --- Bills of quantities inside technical specifications ---
    # The parentheses are matched literally (ASCII or full-width) and are
    # optional, so the text the previous pattern matched still matches.
    if re.findall('工程量清单|材料清单|工作量清单|报价明细表|主要配置[(（]?含备品备件、专用工器具[)）]?', text):
        # The scan ends at the first non-table node after the heading.
        # A local name is used so the skip index `table_flag` above is no
        # longer overwritten by this section.
        scan_end = i
        for scan_end in range(i + 1, len(para_nodes)):
            if para_nodes[scan_end]['para_type'] != 'table':
                break
        lines = ""
        for k in range(i, scan_end + 1):
            if para_nodes[k]['para_type'] != 'table':
                continue
            lines = lines + para_nodes[k]['text']
        # NOTE(review): the collected text is not stored anywhere yet.

print(table_list)
|
|
|
+# 表标题或者表格前标题:工程量清单、材料清单、工作量清单、报价明细表、主要配置(含备品备件、专用工器具)
|
|
|
+# 表头:费用、单价、价格、含税价、单价、合价、估算工程量、单位
|
|
|
+
|
|
|
+# file_content = json_line['para_nodes']
|
|
|
+# for y in range(len(file_content[10:20])):
|
|
|
+# print(file_content[y])
|
|
|
+
|
|
|
+# print(title_list)
|
|
|
+# print(contents)
|
|
|
+
|
|
|
+
|
|
|
+# print(search_interval(title_list))
|
|
|
+
|
|
|
+
|
|
|
+# print(table_list)
|