base_file.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. #coding:utf-8
  2. import os
  3. import json
  4. import re
  5. import Levenshtein
  6. # 扫描件-投标文件
  7. HEADERS = set({'序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价', '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价(元)', '含税合价(元)', '条款号', '评分因素', '评分标准', '页码'})
  8. # 假设标题通常是一行且字体较大
  9. #获取标题段落
  10. #line 段落内容
  11. #list_key 招标文件中响应文件格式(标题或目录)
  12. def is_title(line: str, list_key=[]) -> bool:
  13. if not list_key:
  14. title_word = re.findall('^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|^文件[一二三四五六七八九十]+|[一二三四五六七八九十]+[、要是]', line.strip())
  15. else:
  16. title_word = re.findall('|'.join(list_key) + '|^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|^文件[一二三四五六七八九十]+|[一二三四五六七八九十]+[、要是]', line.strip())
  17. if title_word:
  18. return True
  19. title_word = re.findall('^附录|^参考文献|^附表', line.strip())
  20. if title_word:
  21. return True
  22. return False
  23. #不存在标题特征的段落,但是段落内容文本居中了且字符内容少于20个字符
  24. def is_title_v2(line: str, box=[]) -> bool:
  25. try:
  26. left, right, width, height = box
  27. except:
  28. return False
  29. # if len(line) < 15 and height > 15:
  30. # return True
  31. # if left > 135 and len(line) < 15:
  32. # return True
  33. if len(re.findall('[\u4e00-\u9fa5]', line)) < 2:
  34. return False
  35. if left > 130 and left+(width/2) > 294 and left+(width/2) < 300 and len(line) < 15:
  36. if re.findall('^图|图$|页$', line):
  37. return False
  38. return True
  39. return False
  40. # 定位营业执照、资质、业绩、财报图像的区间范围
  41. def search_interval(title):
  42. # 通过关键字模糊定位
  43. keywords = ['资格审查资料','资格审查材料','其它材料','其他材料','其他资料','附件', '影印件']
  44. search_interval = []
  45. # locate in title.json
  46. left_pos = -1 # 左指针
  47. right_pos = -1 # 右指针
  48. for title_block in title:
  49. # print(title_block)
  50. block_text = title_block['text'].replace(' ', '').strip()
  51. # 先进行左区间判定
  52. if left_pos != -1 and '证书' not in block_text:
  53. right_pos = title_block['page_number']
  54. search_interval.append((left_pos, right_pos))
  55. # 重置
  56. left_pos = -1
  57. for keyword in keywords:
  58. if keyword in block_text:
  59. # print(title_block)
  60. # 先进行模糊的outline定位
  61. center_page = None
  62. if '.' in block_text:
  63. center_page = block_text.split('.')[-1]
  64. if center_page.isdigit():
  65. center_page = eval(center_page)
  66. left_pos = min(title_block['page_number'], center_page)
  67. else:
  68. left_pos = title_block['page_number']
  69. # 最终判定
  70. if left_pos != -1:
  71. search_interval.append((left_pos, right_pos))
  72. # 搜寻区间合并
  73. search_interval.sort()
  74. merge_interval = []
  75. if len(search_interval) > 0:
  76. left = -1
  77. right = -1
  78. for interval in search_interval:
  79. l, r = interval
  80. if r < l:
  81. continue
  82. if left == -1 and right == -1:
  83. left = l
  84. right = r
  85. elif l <= right:
  86. right = r
  87. else:
  88. merge_interval.append((left, right))
  89. left = l
  90. right = r
  91. merge_interval.append((left, right))
  92. return merge_interval
  93. def locate_business_license(title):
  94. '''locate business license and return image'''
  95. keywords = ["资格审查资料", "其它资格审查材料", "资格审查材料"]
  96. candidate_pages = []
  97. center_pages = []
  98. candidate_images = set()
  99. # locate in title.json
  100. for title_block in title:
  101. block_text = title_block['text'].replace(' ', '').strip()
  102. for keyword in keywords:
  103. if keyword in block_text:
  104. # 先进行模糊的outline定位
  105. center_page = None
  106. if '.' in block_text:
  107. center_page = block_text.split('.')[-1]
  108. if center_page.isdigit():
  109. center_page = eval(center_page)
  110. center_pages.append(center_page)
  111. candidate_pages.append(title_block['page_number'])
  112. # information match
  113. filter_pages = set()
  114. if len(center_pages) == 0 and len(candidate_pages) == 0:
  115. return None
  116. elif len(center_pages) == 0:
  117. filter_pages.update(candidate_pages)
  118. elif len(candidate_pages) == 0:
  119. filter_pages.update(center_pages)
  120. else:
  121. # center_pages作为锚点,全部加入
  122. filter_pages.update(center_pages)
  123. # candidate_page与center_page进行匹配加入
  124. for candidate_page in candidate_pages:
  125. if candidate_page <= start_threshold:
  126. continue
  127. for center_page in center_pages:
  128. distance = abs(candidate_page - center_page)
  129. if distance <= distance_threshold:
  130. filter_pages.add(min(candidate_page, center_page) + distance // 2)
  131. # return target_path list
  132. return target_list
  133. #textmind
  134. # lines = open('三峡左岸地坪商务标_合并_ocr.txt', 'r', encoding='utf-8').read()
  135. lines = open('data_1.json', 'r', encoding='utf-8').read()
  136. json_line = json.loads(lines)
  137. print(json_line.keys())
  138. para_nodes = json_line['para_nodes']
  139. table_flag = 0
  140. contents = ""
  141. for i in range(len(para_nodes)):
  142. # '评审因素'
  143. # ''
  144. if para_nodes[i]['node_type'] == 'contents':
  145. contents = para_nodes[i]['text']
  146. break
  147. contents = re.sub('[\.\d]+', '', contents)
  148. table_flag = 0
  149. title_list = []
  150. table_list = []
  151. char_hight = 13
  152. _index = 0
  153. page_num = -1
  154. for i in range(len(para_nodes)):
  155. # if not para_nodes[i]['node_type'] in ["contents",'table', 'text', 'head_tail']:
  156. # print(para_nodes[i])
  157. if i < table_flag:
  158. continue
  159. if not para_nodes[i]['position']:
  160. continue
  161. if para_nodes[i]['position'][0]['pageno'] != page_num:
  162. page_num = para_nodes[i]['position'][0]['pageno']
  163. _index = 0
  164. if para_nodes[i]['position'][0]['pageno'] == page_num:
  165. # page_num = para_nodes[i]['position'][0]['pageno']
  166. _index = _index + 1
  167. # para_nodes[i]['position'][0]['pageno']
  168. if para_nodes[i]['node_type'] == 'title' and para_nodes[i]['position'][0]['box'][-1]:
  169. title_list.append({'text':para_nodes[i]['text'], 'page_number' : int(para_nodes[i]['position'][0]['pageno'])})
  170. elif _index < 3 and is_title(para_nodes[i]['text']) and len(para_nodes[i]['text']) < 20:
  171. title_list.append({'text':para_nodes[i]['text'], 'page_number' : int(para_nodes[i]['position'][0]['pageno'])})
  172. # print(para_nodes[i]['text'])
  173. # elif is_title_v2(para_nodes[i]['text'], para_nodes[i]['position'][0]['box'] ) and len(para_nodes[i]['text']) < 20:
  174. # print(para_nodes[i]['text'])
  175. # if para_nodes[i]['node_type'] == 'seal': #印章
  176. # print(para_nodes[i])
  177. # if len(para_nodes[i]['text']) > 5 and para_nodes[i]['text'] in contents and para_nodes[i]['position'][0]['box'][-1] >= char_hight:
  178. # print(para_nodes[i]['text'])
  179. #报价文件、投标文件中报价清单
  180. if para_nodes[i]['node_type'] != 'table' and ('报价汇总表' in para_nodes[i]['text'] or '分项报价表' in para_nodes[i]['text'] or '工程量清单报价表' in para_nodes[i]['text'] or '报价明细表' in para_nodes[i]['text'] or '报价清单' in para_nodes[i]['text'] or ('报价表' in para_nodes[i]['text'] and para_nodes[i]['node_type']=='title')):
  181. print(para_nodes[i])
  182. flag_word = re.findall('报价汇总表$|分项报价表$|工程量清单报价表$|报价明细表$|报价清单$', para_nodes[i]['text'])
  183. if not flag_word and re.findall('报价表', para_nodes[i]['text']) and para_nodes[i]['node_type']=='title':
  184. flag_word = '报价表'
  185. if not flag_word:
  186. continue
  187. if re.findall('^附件', para_nodes[i]['text']):
  188. continue
  189. flag_word = flag_word[0]
  190. position_page_id = para_nodes[i]['position'][0]['pageno']
  191. for j in range(i, len(para_nodes)):
  192. if para_nodes[j]['para_type'] != 'table' and position_page_id + 2 < para_nodes[j]['position'][0]['pageno']:
  193. break
  194. if para_nodes[i]['position'][0]['pageno'] - position_page_id < 2:
  195. # print(position_page_id)
  196. position_page_id = para_nodes[i]['position'][0]['pageno']
  197. # print(i, j)
  198. lines = ""
  199. for k in range(i, j+1):
  200. if para_nodes[k]['node_type'] != 'table':
  201. word_flag = re.findall('报价汇总表|分项报价表|工程量清单报价表|报价明细表|安全文明措施|报价清单', para_nodes[k]['text'])
  202. # print(word_flag, flag_word)
  203. table_flag = k
  204. if word_flag and word_flag[0] != flag_word:
  205. break
  206. if para_nodes[k]['para_type'] != 'table':
  207. # print(para_nodes[k]['text'])
  208. continue
  209. _lines = para_nodes[k]['text'].split('\n')
  210. if lines and Levenshtein.ratio(lines.split('\n')[0], _lines[0]) > 0.96:
  211. lines = lines + '\n'.join(_lines[1:])
  212. else:
  213. lines = lines + '\n'.join(_lines[:])
  214. # print(_lines)
  215. # print(para_nodes[i]['text'], 'xxxxxxxxxx', lines)
  216. if not lines:
  217. continue
  218. table_list.append((para_nodes[i]['text'], lines))
  219. #技术规范中工程量清单
  220. if para_nodes[i]['node_type'] != 'table' and re.findall('工程量清单|材料清单|工作量清单|报价明细表|主要配置(含备品备件、专用工器具)', para_nodes[i]['text']):
  221. position_page_id = para_nodes[i]['position'][0]['pageno']
  222. table_flag = 0
  223. for j in range(i, len(para_nodes)):
  224. if para_nodes[j]['para_type'] != 'table' and table_flag == 1:
  225. break
  226. if para_nodes[i]['position'][0]['pageno'] - position_page_id < 2:
  227. # print(position_page_id)
  228. position_page_id = para_nodes[i]['position'][0]['pageno']
  229. table_flag = 1
  230. # print(i, j)
  231. lines = ""
  232. for k in range(i, j+1):
  233. if para_nodes[k]['para_type'] != 'table':
  234. # print(para_nodes[k]['text'])
  235. continue
  236. lines = lines + para_nodes[k]['text']
  237. # print(para_nodes[i]['text'], 'xxxxxxxxxx', lines)
  238. print(table_list)
  239. # 表标题或者表格前标题:工程量清单、材料清单、工作量清单、报价明细表、主要配置(含备品备件、专用工器具)
  240. # 表头:费用、单价、价格、含税价、单价、合价、估算工程量、单位
  241. # file_content = json_line['para_nodes']
  242. # for y in range(len(file_content[10:20])):
  243. # print(file_content[y])
  244. # print(title_list)
  245. # print(contents)
  246. # print(search_interval(title_list))
  247. # print(table_list)