instance_locate.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. from typing import List
  2. from pdfminer.high_level import extract_pages
  3. from pdfminer.layout import LTFigure, LTImage, LTTextBoxHorizontal
  4. from pprint import pprint
  5. from tqdm import tqdm
  6. from text_extractor import similarity_filter, similar_match, parse_title
  7. from get_info import PdfExtractAttr, export_image
  8. import os
  9. import json
  10. os.environ['TRANSFORMERS_OFFLINE'] = '1'
  11. def parse_pages(pdf_path: str, text_path: str, image_dir: str, start_page: int, end_page: int, total_page: int) -> None:
  12. # 用于存储文本和图像
  13. texts = []
  14. images = []
  15. # 读取PDF文件并提取页面
  16. # 调用pdfminer中的extract_page函数提取每一页的页面布局page_layout
  17. for page_number, page_layout in tqdm(enumerate(extract_pages(pdf_path)), total=total_page):
  18. if not start_page <= page_number <= end_page:
  19. continue
  20. title_index = 0
  21. image_index = 0
  22. # 遍历页面布局中的每一个元素
  23. for element in page_layout:
  24. if isinstance(element, LTFigure):
  25. for e_obj in element._objs:
  26. if isinstance(e_obj, LTImage):
  27. # 提取图片数据
  28. image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
  29. image_file = export_image(e_obj, image_file)
  30. images.append(image_file)
  31. # pprint(f'Image saved: {image_file}')
  32. image_index += 1
  33. elif isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
  34. # 提取文本
  35. text = element.get_text().strip()
  36. # # 假设标题通常是一行且字体较大
  37. texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
  38. title_index += 1
  39. # 最终将标题信息保存为本地的json文件
  40. with open(text_path, 'w', encoding='utf-8') as fp:
  41. json.dump(texts, fp, indent=4, ensure_ascii=False)
  42. def get_instances_by_title(path: str, instances: List[str]):
  43. """
  44. Get all tables and figures of given title
  45. """
  46. # path = './投标文件-修改版9-5-1-1.pdf'
  47. # instances = ['近年财务状况表']
  48. file = PdfExtractAttr(file_path=path)
  49. print('解析PDF文字中')
  50. file.parse_text()
  51. # title = file.parse_outline()
  52. print('解析PDF标题中')
  53. all_title = parse_title(path)
  54. # all_text = file.parse_text() # remain for external parse
  55. print('分析标题中')
  56. title_sims = similarity_filter(similar_match(all_title, instances, key='title'), 0.5)
  57. title_f = [i for i in title_sims]
  58. results = []
  59. for i in title_f:
  60. try:
  61. i['end_page'] = all_title[i['seq_num'] + 1]['page_number'] - 1
  62. if i['end_page'] <= i['page_number']:
  63. continue
  64. # i['end_page'] = all_title[i['seq_num']]['page_number'] + 5 # for debug
  65. except IndexError:
  66. i['end_page'] = float('inf')
  67. image_loc = os.path.join(os.path.dirname(path), 'images')
  68. if not os.path.exists(image_loc):
  69. os.makedirs(image_loc)
  70. print('解析标题:\t{}'.format(i['title']))
  71. print('解析图片中')
  72. parse_pages(path, os.path.join(os.path.dirname(path),
  73. '{}_texts_{}_{}.json'.format(i['title'], i['page_number'], i['index'])),
  74. image_loc, i['page_number'], i['end_page'], file.total_page)
  75. table_loc = os.path.join(os.path.dirname(path),
  76. '{}_tables_{}_{}.json'.format(i['title'], i['page_number'], i['index']))
  77. print('解析表格中')
  78. tables = file.parse_table(start=i['page_number'], end=i['end_page'])
  79. i['tables'] = tables
  80. with open(table_loc, 'w', encoding='utf-8') as fp:
  81. json.dump(tables, fp, indent=4, ensure_ascii=False)
  82. i.update({'table_loc': table_loc, 'image_loc': image_loc})
  83. results.append(i)
  84. return results
  85. '''
  86. 大标题 outlines
  87. 小标题 text
  88. 表/图
  89. 1. 文字 + 表格(取第一行为标题)
  90. 2. 文字 + 图片(取第一行为标题)
  91. 3. 纯图片、表格(向上合并)
  92. '''