# base_file.py (4.2 KB)
  1. #coding:utf-8
  2. import os
  3. import json
  4. import re
  5. # 扫描件-投标文件
  6. HEADERS = set({'序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价', '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价(元)', '含税合价(元)', '条款号', '评分因素', '评分标准', '页码'})
  7. def is_title(line: str) -> bool:
  8. title_word = re.findall('^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|[一二三四五六七八九十]+[、要是]', line.strip())
  9. if title_word:
  10. return True
  11. title_word = re.findall('^附录|^参考文献|^附表', line.strip())
  12. if title_word:
  13. return True
  14. return False
  15. lines = open('三峡左岸地坪商务标_合并_ocr.txt', 'r', encoding='utf-8').read()
  16. # lines = open('data_1.json', 'r', encoding='utf-8').read()
  17. json_line = json.loads(lines)
  18. print(json_line.keys())
  19. para_nodes = json_line['para_nodes']
  20. table_flag = 0
  21. contents = ""
  22. for i in range(len(para_nodes)):
  23. # '评审因素'
  24. # ''
  25. if para_nodes[i]['node_type'] == 'contents':
  26. contents = para_nodes[i]['text']
  27. break
  28. contents = re.sub('[\.\d]+', '', contents)
  29. table_flag = 0
  30. title_list = []
  31. char_hight = 13
  32. _index = 0
  33. page_num = -1
  34. for i in range(len(para_nodes)):
  35. # if not para_nodes[i]['node_type'] in ["contents",'table', 'text', 'head_tail']:
  36. # print(para_nodes[i])
  37. if i < table_flag:
  38. continue
  39. if not para_nodes[i]['position']:
  40. continue
  41. if para_nodes[i]['position'][0]['pageno'] != page_num:
  42. page_num = para_nodes[i]['position'][0]['pageno']
  43. _index = 0
  44. if para_nodes[i]['position'][0]['pageno'] == page_num:
  45. # page_num = para_nodes[i]['position'][0]['pageno']
  46. _index = _index + 1
  47. # para_nodes[i]['position'][0]['pageno']
  48. if para_nodes[i]['node_type'] == 'title' and para_nodes[i]['position'][0]['box'][-1]:
  49. title_list.append((para_nodes[i]['text'], para_nodes[i]['position'][0]['pageno']))
  50. elif _index < 3 and is_title(para_nodes[i]['text']) and len(para_nodes[i]['text']) < 20:
  51. title_list.append((para_nodes[i]['text'], para_nodes[i]['position'][0]['pageno']))
  52. # print(para_nodes[i]['text'])
  53. if para_nodes[i]['node_type'] == 'seal': #印章
  54. print(para_nodes[i])
  55. # if len(para_nodes[i]['text']) > 5 and para_nodes[i]['text'] in contents and para_nodes[i]['position'][0]['box'][-1] >= char_hight:
  56. # print(para_nodes[i]['text'])
  57. if para_nodes[i]['node_type'] != 'table' and ('报价汇总表' in para_nodes[i]['text'] or '分项报价表' in para_nodes[i]['text'] or '工程量清单报价表' in para_nodes[i]['text'] or '报价明细表' in para_nodes[i]['text']):
  58. flag_word = re.findall('报价汇总表|分项报价表|工程量清单报价表|报价明细表', para_nodes[i]['text'])[0]
  59. position_page_id = para_nodes[i]['position'][0]['pageno']
  60. for j in range(i, len(para_nodes)):
  61. if para_nodes[j]['para_type'] != 'table' and position_page_id + 2 < para_nodes[j]['position'][0]['pageno']:
  62. break
  63. if para_nodes[i]['position'][0]['pageno'] - position_page_id < 2:
  64. # print(position_page_id)
  65. position_page_id = para_nodes[i]['position'][0]['pageno']
  66. # print(i, j)
  67. lines = ""
  68. for k in range(i, j+1):
  69. if para_nodes[k]['node_type'] != 'table':
  70. word_flag = re.findall('报价汇总表|分项报价表|工程量清单报价表|报价明细表|安全文明措施', para_nodes[k]['text'])
  71. # print(word_flag, flag_word)
  72. table_flag = k
  73. if word_flag and word_flag[0] != flag_word:
  74. break
  75. if para_nodes[k]['para_type'] != 'table':
  76. # print(para_nodes[k]['text'])
  77. continue
  78. lines = lines + para_nodes[k]['text']
  79. print(para_nodes[i]['text'], 'xxxxxxxxxx', lines)
  80. # file_content = json_line['para_nodes']
  81. # for y in range(len(file_content[10:20])):
  82. # print(file_content[y])
  83. print(title_list)
  84. print(contents)