123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106 |
- from typing import List
- from pdfminer.high_level import extract_pages
- from pdfminer.layout import LTFigure, LTImage, LTTextBoxHorizontal
- from pprint import pprint
- from tqdm import tqdm
- from text_extractor import similarity_filter, similar_match, parse_title
- from get_info import PdfExtractAttr, export_image
- import os
- import json
- os.environ['TRANSFORMERS_OFFLINE'] = '1'
def parse_pages(pdf_path: str, text_path: str, image_dir: str, start_page: int, end_page: int, total_page: int) -> None:
    """Export images and single-line text boxes from a range of PDF pages.

    Pages are numbered by their zero-based position in the sequence yielded
    by ``extract_pages``. Only pages with
    ``start_page <= page_number <= end_page`` are processed.

    For every processed page:
      * each ``LTImage`` found directly inside an ``LTFigure`` is handed to
        ``export_image`` together with a target path
        ``<image_dir>/image_page_<page>_<n>``;
      * each ``LTTextBoxHorizontal`` holding exactly one child object is
        recorded as ``{'index', 'page_number', 'bbox', 'text'}``.

    The collected text records are written to ``text_path`` as JSON once the
    page loop has finished.

    Args:
        pdf_path: Path of the PDF file to read.
        text_path: Path of the JSON file that receives the text records.
        image_dir: Directory used to build image paths (not created here).
        start_page: First page index to process (inclusive).
        end_page: Last page index to process (inclusive). Any value that
            compares with ``int`` works, e.g. ``float('inf')``.
        total_page: Passed to ``tqdm`` as ``total`` for the progress display.
    """
    texts = []
    for page_number, page_layout in tqdm(enumerate(extract_pages(pdf_path)), total=total_page):
        if page_number > end_page:
            # Nothing after end_page is wanted; stop so the remaining pages
            # are never pulled from (and laid out by) extract_pages.
            break
        if page_number < start_page:
            continue

        # Both counters restart on every page.
        title_index = 0
        image_index = 0
        for element in page_layout:
            if isinstance(element, LTFigure):
                for e_obj in element._objs:
                    if not isinstance(e_obj, LTImage):
                        continue
                    image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
                    # NOTE(review): export_image is presumably what writes the
                    # image to disk; its return value is not used here.
                    export_image(e_obj, image_file)
                    image_index += 1
            elif isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
                # Only text boxes with a single child object are kept; the
                # original author's note assumes titles are a single line.
                text = element.get_text().strip()
                texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
                title_index += 1

    # Persist the collected text records as a local JSON file.
    with open(text_path, 'w', encoding='utf-8') as fp:
        json.dump(texts, fp, indent=4, ensure_ascii=False)
def get_instances_by_title(path: str, instances: List[str]):
    """Collect the tables, images and text found under each matching title.

    Titles are obtained with ``parse_title(path)`` and matched against
    ``instances`` through ``similar_match`` / ``similarity_filter`` (called
    with ``0.5``, presumably a similarity threshold). For every match the
    section is taken to run from the title's own page up to the page before
    the next title, or to ``float('inf')`` when there is no next title.
    Matches whose computed end page is not greater than their start page
    are skipped.

    For every kept match, next to the PDF file:
      * ``<title>_texts_<page>_<index>.json`` is written by ``parse_pages``,
        which also receives the shared ``images`` directory;
      * ``<title>_tables_<page>_<index>.json`` receives the result of
        ``PdfExtractAttr.parse_table`` for the section's page range.

    Args:
        path: Path of the PDF file to analyse.
        instances: Title strings to look for.

    Returns:
        The list of kept match dicts, each extended in place with
        ``end_page``, ``tables``, ``table_loc`` and ``image_loc``.
    """
    file = PdfExtractAttr(file_path=path)
    print('解析PDF文字中')
    file.parse_text()
    print('解析PDF标题中')
    all_title = parse_title(path)
    print('分析标题中')
    title_sims = similarity_filter(similar_match(all_title, instances, key='title'), 0.5)

    base_dir = os.path.dirname(path)
    image_loc = os.path.join(base_dir, 'images')
    results = []
    for i in list(title_sims):
        # Only the lookup of the following title can run past the end of
        # all_title, so that is the only statement guarded by the try.
        try:
            next_title = all_title[i['seq_num'] + 1]
        except IndexError:
            # Last title in the document: the section is open-ended.
            i['end_page'] = float('inf')
        else:
            i['end_page'] = next_title['page_number'] - 1
            if i['end_page'] <= i['page_number']:
                continue

        # Created lazily (only once a match is kept); exist_ok avoids the
        # check-then-create race of a separate os.path.exists test.
        os.makedirs(image_loc, exist_ok=True)

        print('解析标题:\t{}'.format(i['title']))
        print('解析图片中')
        # NOTE(review): the title is used verbatim in file names; a title
        # containing a path separator would break these paths - confirm.
        text_loc = os.path.join(base_dir, '{}_texts_{}_{}.json'.format(i['title'], i['page_number'], i['index']))
        parse_pages(path, text_loc, image_loc, i['page_number'], i['end_page'], file.total_page)

        table_loc = os.path.join(base_dir, '{}_tables_{}_{}.json'.format(i['title'], i['page_number'], i['index']))
        print('解析表格中')
        tables = file.parse_table(start=i['page_number'], end=i['end_page'])
        i['tables'] = tables
        with open(table_loc, 'w', encoding='utf-8') as fp:
            json.dump(tables, fp, indent=4, ensure_ascii=False)

        i.update({'table_loc': table_loc, 'image_loc': image_loc})
        results.append(i)
    return results
- '''
- 大标题 outlines
- 小标题 text
- 表/图
- 1. 文字 + 表格(取第一行为标题)
- 2. 文字 + 图片(取第一行为标题)
- 3. 纯图片、表格(向上合并)
- '''
|