import json
import os
from pprint import pprint
from typing import List

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTFigure, LTImage, LTTextBoxHorizontal
from tqdm import tqdm

from text_extractor import similarity_filter, similar_match, parse_title
from get_info import PdfExtractAttr, export_image

# NOTE(review): this is set after `text_extractor` has already been imported.
# If that module reads TRANSFORMERS_OFFLINE at import time the flag may be
# applied too late -- confirm against text_extractor.
os.environ['TRANSFORMERS_OFFLINE'] = '1'


def parse_pages(pdf_path: str, text_path: str, image_dir: str, start_page: int, end_page: int, total_page: int) -> None:
    """Export images and single-child text boxes from a page range of a PDF.

    Pages are numbered by ``enumerate`` over ``extract_pages(pdf_path)``, so
    the first page yielded has number 0.  For every page whose number lies in
    ``[start_page, end_page]`` (both inclusive):

    * each ``LTImage`` found directly inside an ``LTFigure`` is handed to
      ``export_image`` with the target path
      ``<image_dir>/image_page_<page_number>_<image_index>``;
    * each ``LTTextBoxHorizontal`` that contains exactly one child object is
      recorded as ``{'index', 'page_number', 'bbox', 'text'}``.

    The collected text records are written to ``text_path`` as UTF-8 JSON.

    Args:
        pdf_path: Path of the PDF to read.
        text_path: Path of the JSON file that receives the text records.
        image_dir: Directory used as the prefix for exported image files.
        start_page: First page number to process (inclusive).
        end_page: Last page number to process (inclusive); may be
            ``float('inf')`` to process through the end of the document.
        total_page: Only used as the ``total`` of the progress bar.
    """
    texts = []
    # Iterate over the per-page layouts produced by pdfminer's extract_pages.
    for page_number, page_layout in tqdm(enumerate(extract_pages(pdf_path)), total=total_page):
        if page_number > end_page:
            # Page numbers only increase, so nothing after this point can be
            # in range; stop instead of laying out the rest of the document.
            break
        if page_number < start_page:
            continue
        # Both counters restart on every page.
        title_index = 0
        image_index = 0
        # Walk every top-level element of the page layout.
        for element in page_layout:
            if isinstance(element, LTFigure):
                # Layout containers are iterable over their child objects.
                for e_obj in element:
                    if isinstance(e_obj, LTImage):
                        # Export the image data to disk.
                        image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
                        export_image(e_obj, image_file)
                        image_index += 1
            elif isinstance(element, LTTextBoxHorizontal) and len(element) == 1:
                # Keep only text boxes with a single child object; the
                # original author's note says titles are assumed to be one
                # line (and usually in a larger font).
                text = element.get_text().strip()
                texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
                title_index += 1
    # Finally, save the collected text records as a local JSON file.
    with open(text_path, 'w', encoding='utf-8') as fp:
        json.dump(texts, fp, indent=4, ensure_ascii=False)


def get_instances_by_title(path: str, instances: List[str]) -> List[dict]:
    """Get all tables and figures that belong to the given titles.

    Titles returned by ``parse_title(path)`` are matched against
    ``instances`` with ``similar_match`` and filtered with
    ``similarity_filter`` (threshold 0.5).  For each remaining match the page
    range runs from the match's ``page_number`` to the page before the next
    title's ``page_number`` (or ``float('inf')`` when there is no next title).
    Matches whose computed ``end_page`` is not greater than their
    ``page_number`` are skipped.

    For every processed match, next to the PDF (``os.path.dirname(path)``):

    * ``images/`` is created if needed and filled by ``parse_pages``;
    * ``<title>_texts_<page_number>_<index>.json`` is written by
      ``parse_pages``;
    * ``<title>_tables_<page_number>_<index>.json`` receives the result of
      ``PdfExtractAttr.parse_table`` for the page range.

    Args:
        path: Path of the PDF file.
        instances: Titles to look for.

    Returns:
        The processed match dicts, each extended in place with ``end_page``,
        ``tables``, ``table_loc`` and ``image_loc``.
    """
    file = PdfExtractAttr(file_path=path)
    print('解析PDF文字中')
    file.parse_text()
    print('解析PDF标题中')
    all_title = parse_title(path)
    print('分析标题中')
    title_sims = similarity_filter(similar_match(all_title, instances, key='title'), 0.5)
    matches = list(title_sims)

    # Loop-invariant output locations.
    base_dir = os.path.dirname(path)
    image_loc = os.path.join(base_dir, 'images')

    results = []
    for match in matches:
        try:
            next_title = all_title[match['seq_num'] + 1]
        except IndexError:
            # No following title: the section runs to the end of the document.
            match['end_page'] = float('inf')
        else:
            match['end_page'] = next_title['page_number'] - 1
            if match['end_page'] <= match['page_number']:
                continue

        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(image_loc, exist_ok=True)
        print('解析标题:\t{}'.format(match['title']))
        print('解析图片中')
        text_loc = os.path.join(
            base_dir,
            '{}_texts_{}_{}.json'.format(match['title'], match['page_number'], match['index']),
        )
        parse_pages(path, text_loc, image_loc, match['page_number'], match['end_page'], file.total_page)

        table_loc = os.path.join(
            base_dir,
            '{}_tables_{}_{}.json'.format(match['title'], match['page_number'], match['index']),
        )
        print('解析表格中')
        tables = file.parse_table(start=match['page_number'], end=match['end_page'])
        match['tables'] = tables
        with open(table_loc, 'w', encoding='utf-8') as fp:
            json.dump(tables, fp, indent=4, ensure_ascii=False)
        match.update({'table_loc': table_loc, 'image_loc': image_loc})
        results.append(match)
    return results


# Author's design notes, kept as a bare string literal (no runtime effect).
'''
大标题 outlines
小标题 text
表/图
1. 文字 + 表格(取第一行为标题)
2. 文字 + 图片(取第一行为标题)
3. 纯图片、表格(向上合并)
'''