# xzc/pdf_title_image
from enum import Enum, auto
from typing import Any, Optional

from pdfminer.pdfdocument import PDFDestinationNotFound, PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdftypes import PDFObjRef


# Enumeration PDFRefType, with the members:
    # PDF_OBJ_REF: PDF object reference
    # DICTIONARY: dictionary type
    # LIST: list type
    # NAMED_REF: named reference
    # UNK: unknown type
class PDFRefType(Enum):
    """Kinds of PDF reference, as classified by their Python representation.

    Member values are consecutive integers starting at 1, in declaration
    order.
    """

    # An indirect object reference.
    PDF_OBJ_REF = 1
    # A dictionary that carries its destination under the "D" key.
    DICTIONARY = 2
    # A list that contains at least one indirect object reference.
    LIST = 3
    # A named destination, given as a byte string.
    NAMED_REF = 4
    # Fallback for anything that matches none of the kinds above.
    UNK = 5

# Resolves PDF references and maps them to page numbers.
class RefPageNumberResolver:
    """PDF Reference to page number resolver.

    .. note::

       Remote Go-To Actions (see 12.6.4.3 in
       `https://www.adobe.com/go/pdfreference/`__)
       are out of the scope of this resolver.

    Attributes:
        document (:obj:`pdfminer.pdfdocument.PDFDocument`):
            The document that contains the references.
        objid_to_pagenum (:obj:`dict[int, int]`):
            Mapping from the object id of each page to its 1-based page
            number.
    """

    def __init__(self, document: PDFDocument):
        """Index every page of ``document`` by its object id.

        Args:
            document (:obj:`pdfminer.pdfdocument.PDFDocument`):
                The document that contains the references.
        """
        self.document = document
        # obj_id -> page_number (pages are numbered from 1)
        self.objid_to_pagenum: dict[int, int] = {
            page.pageid: page_num
            for page_num, page in enumerate(PDFPage.create_pages(document), 1)
        }

    @classmethod
    def get_ref_type(cls, ref: Any) -> PDFRefType:
        """Get the type of a PDF reference.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`PDFRefType`: The kind of ``ref``, or
            :obj:`PDFRefType.UNK` if it matches none of the known kinds.
        """
        # The order of the checks matters: the first match wins.
        if isinstance(ref, PDFObjRef):
            return PDFRefType.PDF_OBJ_REF
        if isinstance(ref, dict) and "D" in ref:
            return PDFRefType.DICTIONARY
        if isinstance(ref, list) and any(isinstance(e, PDFObjRef) for e in ref):
            return PDFRefType.LIST
        if isinstance(ref, bytes):
            return PDFRefType.NAMED_REF
        return PDFRefType.UNK

    @classmethod
    def is_ref_page(cls, ref: Any) -> bool:
        """Check whether a reference is of type '/Page'.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`bool`: :obj:`True` if the reference references
            a page, :obj:`False` otherwise.
        """
        return isinstance(ref, dict) and ref.get("Type") is LITERAL_PAGE

    def resolve(self, ref: Any) -> Optional[int]:
        """Resolve a PDF reference to a page number.

        The reference is followed step by step (indirect object ->
        dictionary ``"D"`` entry -> first object reference of a list ->
        named destination) until a page object is reached.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`Optional[int]`: The page number or :obj:`None`
            if the reference could not be resolved (e.g., remote Go-To
            Actions, unknown named destinations, cyclic or otherwise
            malformed references).
        """
        # Object ids (int) and destination names (bytes) already followed.
        # Meeting one again means the reference chain is cyclic.
        visited = set()

        while True:
            ref_type = self.get_ref_type(ref)

            if ref_type is PDFRefType.PDF_OBJ_REF:
                if ref.objid in visited:
                    return None
                visited.add(ref.objid)
                target = ref.resolve()
                if self.is_ref_page(target):
                    # The reference points straight at a page object.
                    return self.objid_to_pagenum.get(ref.objid)
                # Not a page: keep following whatever the object holds.
                ref = target
            elif ref_type is PDFRefType.DICTIONARY:
                # The destination is stored under the "D" key.
                ref = ref["D"]
            elif ref_type is PDFRefType.LIST:
                # Get the PDFObjRef in the list (usually first element);
                # get_ref_type guarantees that at least one exists.
                ref = next(e for e in ref if isinstance(e, PDFObjRef))
            elif ref_type is PDFRefType.NAMED_REF:
                if ref in visited:
                    return None
                visited.add(ref)
                try:
                    ref = self.document.get_dest(ref)
                except PDFDestinationNotFound:
                    # The name is not defined in the document.
                    return None
            else:
                return None  # PDFRefType.UNK