instance_locate.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. from typing import List, Optional
  2. from pdfminer.high_level import extract_pages
  3. from pdfminer.layout import LTFigure, LTImage, LTTextBoxHorizontal
  4. from pprint import pprint
  5. from tqdm import tqdm
  6. from text_extractor import similarity_filter, similar_match, parse_title
  7. from get_info import PdfExtractAttr, export_image
  8. import os
  9. import json
  10. os.environ['TRANSFORMERS_OFFLINE'] = '1'
  11. def parse_pages(pdf_path: str, text_path: str, image_dir: str, start_page: int, end_page: int, total_page: int) -> None:
  12. # 用于存储文本和图像
  13. texts = []
  14. images = []
  15. # 读取PDF文件并提取页面
  16. # 调用pdfminer中的extract_page函数提取每一页的页面布局page_layout
  17. for page_number, page_layout in tqdm(enumerate(extract_pages(pdf_path)), total=total_page):
  18. if not start_page <= page_number <= end_page:
  19. continue
  20. title_index = 0
  21. image_index = 0
  22. # 遍历页面布局中的每一个元素
  23. for element in page_layout:
  24. if isinstance(element, LTFigure):
  25. for e_obj in element._objs:
  26. if isinstance(e_obj, LTImage):
  27. # 提取图片数据
  28. image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
  29. image_file = export_image(e_obj, image_file)
  30. images.append(image_file)
  31. # pprint(f'Image saved: {image_file}')
  32. image_index += 1
  33. elif isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
  34. # 提取文本
  35. text = element.get_text().strip()
  36. # # 假设标题通常是一行且字体较大
  37. texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
  38. title_index += 1
  39. # 最终将标题信息保存为本地的json文件
  40. with open(text_path, 'w', encoding='utf-8') as fp:
  41. json.dump(texts, fp, indent=4, ensure_ascii=False)
  42. def get_instances_by_title(path: Optional[str] = None, title_list: List[dict], table_list: List[dict], instances: List[str] = ['近年财务状况表']):
  43. """
  44. Get all tables and figures of given title
  45. Args:
  46. path:
  47. title_list: PDF 标题
  48. table_list: PDF 表格
  49. instances:
  50. Returns:
  51. results
  52. """
  53. title_sims = similarity_filter(similar_match(title_list, instances, key='title'), 0.5)
  54. title_f = [i for i in title_sims]
  55. results = []
  56. for i in title_f:
  57. try:
  58. i['end_page'] = title_list[i['seq_num'] + 1]['page_number'] - 1
  59. if i['end_page'] <= i['page_number']:
  60. continue
  61. except IndexError:
  62. i['end_page'] = float('inf')
  63. image_loc = os.path.join(os.path.dirname(path), 'extracted_images')
  64. if not os.path.exists(image_loc):
  65. os.makedirs(image_loc)
  66. print('解析标题:\t{}'.format(i['title']))
  67. print('解析图片中')
  68. parse_pages(
  69. path,
  70. os.path.join(os.path.dirname(path), '{}_texts_{}_{}.json'.format(i['title'], i['page_number'], i['index'])),
  71. image_loc,
  72. i['page_number'],
  73. i['end_page'],
  74. file.total_page
  75. )
  76. table_loc = os.path.join(os.path.dirname(path), '{}_tables_{}_{}.json'.format(i['title'], i['page_number'], i['index']))
  77. print('解析表格中')
  78. tables = file.parse_table(start=i['page_number'], end=i['end_page'])
  79. i['tables'] = tables
  80. with open(table_loc, 'w', encoding='utf-8') as fp:
  81. json.dump(tables, fp, indent=4, ensure_ascii=False)
  82. i.update({'table_loc': table_loc, 'image_loc': image_loc})
  83. results.append(i)
  84. return results
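'''
Major headings: outlines
Minor headings: text
Tables / figures
1. Text + table (use the first line as the title)
2. Text + image (use the first line as the title)
3. Image-only or table-only content (merge upward)
'''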