from enum import Enum, auto
from typing import Any, Optional

from pdfminer.pdfdocument import PDFDestinationNotFound, PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdftypes import PDFObjRef


class PDFRefType(Enum):
    """PDF reference type.

    Members:
        PDF_OBJ_REF: An indirect object reference (``PDFObjRef``).
        DICTIONARY: A dictionary that carries a ``"D"`` (destination) entry.
        LIST: A list that contains at least one ``PDFObjRef``.
        NAMED_REF: A named destination given as ``bytes``.
        UNK: Anything else (fallback).
    """

    PDF_OBJ_REF = auto()
    DICTIONARY = auto()
    LIST = auto()
    NAMED_REF = auto()
    UNK = auto()  # fallback


class RefPageNumberResolver:
    """PDF Reference to page number resolver.

    .. note::
        Remote Go-To Actions (see 12.6.4.3 in
        `https://www.adobe.com/go/pdfreference/`__)
        are out of the scope of this resolver.

    Attributes:
        document (:obj:`pdfminer.pdfdocument.PDFDocument`):
            The document that contains the references.
        objid_to_pagenum (:obj:`dict[int, int]`):
            Mapping from an object id to the number of the page that contains
            that object.
    """

    def __init__(self, document: PDFDocument):
        """Index every page of ``document`` by its object id.

        Args:
            document (:obj:`pdfminer.pdfdocument.PDFDocument`):
                The document whose references will be resolved.
        """
        self.document = document
        # obj_id -> page_number (page numbers are 1-based).
        self.objid_to_pagenum: dict[int, int] = {
            page.pageid: page_num
            for page_num, page in enumerate(PDFPage.create_pages(document), 1)
        }

    @classmethod
    def get_ref_type(cls, ref: Any) -> PDFRefType:
        """Get the type of a PDF reference.

        Args:
            ref (:obj:`Any`): The PDF reference.

        Returns:
            :obj:`PDFRefType`: The matching member, or ``PDFRefType.UNK`` if
            the reference matches none of the known shapes.
        """
        if isinstance(ref, PDFObjRef):
            return PDFRefType.PDF_OBJ_REF
        if isinstance(ref, dict) and "D" in ref:
            return PDFRefType.DICTIONARY
        if isinstance(ref, list) and any(isinstance(e, PDFObjRef) for e in ref):
            return PDFRefType.LIST
        if isinstance(ref, bytes):
            return PDFRefType.NAMED_REF
        return PDFRefType.UNK

    @classmethod
    def is_ref_page(cls, ref: Any) -> bool:
        """Check whether a reference is of type '/Page'.

        Args:
            ref (:obj:`Any`): The PDF reference.

        Returns:
            :obj:`bool`: :obj:`True` if the reference references a page,
            :obj:`False` otherwise.
        """
        return isinstance(ref, dict) and "Type" in ref and ref["Type"] is LITERAL_PAGE

    def resolve(self, ref: Any) -> Optional[int]:
        """Resolve a PDF reference to a page number.

        The reference is followed step by step (object reference ->
        dictionary ``"D"`` entry -> list element -> named destination) until
        a page object is reached or the chain cannot be followed further.

        Args:
            ref (:obj:`Any`): The PDF reference.

        Returns:
            :obj:`Optional[int]`: The page number or :obj:`None` if the
            reference could not be resolved (e.g., remote Go-To Actions,
            unknown named destinations, or malformed/cyclic references).
        """
        # A chain can only loop back on itself through an indirect object or
        # a named destination, so remembering those is enough to detect
        # cycles in malformed documents instead of recursing forever.
        seen_objids = set()
        seen_names = set()

        while True:
            ref_type = self.get_ref_type(ref)

            if ref_type is PDFRefType.PDF_OBJ_REF:
                if ref.objid in seen_objids:
                    return None  # cyclic reference chain
                seen_objids.add(ref.objid)
                target = ref.resolve()
                if self.is_ref_page(target):
                    # The reference points directly at a page object.
                    return self.objid_to_pagenum.get(ref.objid)
                # Not a page: keep following whatever it resolved to.
                ref = target
            elif ref_type is PDFRefType.DICTIONARY:
                # Follow the destination entry of the dictionary.
                ref = ref["D"]
            elif ref_type is PDFRefType.LIST:
                # Get the PDFObjRef in the list (usually first element);
                # get_ref_type guarantees that at least one exists.
                ref = next(e for e in ref if isinstance(e, PDFObjRef))
            elif ref_type is PDFRefType.NAMED_REF:
                if ref in seen_names:
                    return None  # cyclic named destination
                seen_names.add(ref)
                try:
                    ref = self.document.get_dest(ref)
                except PDFDestinationNotFound:
                    # Unknown named destination: unresolvable by contract.
                    return None
            else:
                return None  # PDFRefType.UNK