# tools.py
from enum import Enum, auto
from typing import Any, Optional

from pdfminer.pdfdocument import (
    PDFDestinationNotFound,
    PDFDocument,
    PDFNoOutlines,
)
from pdfminer.pdfpage import LITERAL_PAGE, PDFPage
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdftypes import PDFObjRef
# Enumeration PDFRefType, consisting of:
# PDF_OBJ_REF: a PDF object reference
# DICTIONARY: a dictionary
# LIST: a list
# NAMED_REF: a named reference
# UNK: unknown type
  13. class PDFRefType(Enum):
  14. """PDF reference type."""
  15. PDF_OBJ_REF = auto()
  16. DICTIONARY = auto()
  17. LIST = auto()
  18. NAMED_REF = auto()
  19. UNK = auto() # fallback
# Resolves PDF references and maps them to page numbers.
  21. class RefPageNumberResolver:
  22. """PDF Reference to page number resolver.
  23. .. note::
  24. Remote Go-To Actions (see 12.6.4.3 in
  25. `https://www.adobe.com/go/pdfreference/`__)
  26. are out of the scope of this resolver.
  27. Attributes:
  28. document (:obj:`pdfminer.pdfdocument.PDFDocument`):
  29. The document that contains the references.
  30. objid_to_pagenum (:obj:`dict[int, int]`):
  31. Mapping from an object id to the number of the page that contains
  32. that object.
  33. """
  34. def __init__(self, document: PDFDocument):
  35. self.document = document
  36. # obj_id -> page_number
  37. self.objid_to_pagenum: dict[int, int] = {
  38. page.pageid: page_num
  39. for page_num, page in enumerate(PDFPage.create_pages(document), 1)
  40. }
  41. # @classmethod装饰器表示该函数不需要实例调用,使用类本身即可调用
  42. # get_ref_type类型用于确定ref type并返回PDFRefType中的类型
  43. @classmethod
  44. def get_ref_type(cls, ref: Any) -> PDFRefType:
  45. """Get the type of a PDF reference."""
  46. if isinstance(ref, PDFObjRef):
  47. return PDFRefType.PDF_OBJ_REF
  48. elif isinstance(ref, dict) and "D" in ref:
  49. return PDFRefType.DICTIONARY
  50. elif isinstance(ref, list) and any(isinstance(e, PDFObjRef) for e in ref):
  51. return PDFRefType.LIST
  52. elif isinstance(ref, bytes):
  53. return PDFRefType.NAMED_REF
  54. else:
  55. return PDFRefType.UNK
  56. # is_ref_page函数用于检查给定的pdf引用是否指向一个页面对象
  57. @classmethod
  58. def is_ref_page(cls, ref: Any) -> bool:
  59. """Check whether a reference is of type '/Page'.
  60. Args:
  61. ref (:obj:`Any`):
  62. The PDF reference.
  63. Returns:
  64. :obj:`bool`: :obj:`True` if the reference references
  65. a page, :obj:`False` otherwise.
  66. """
  67. return isinstance(ref, dict) and "Type" in ref and ref["Type"] is LITERAL_PAGE
  68. # resolve函数用于解析pdf引用并获取其页面编号
  69. # 递归解析pdf引用,尝试获取引用指向的页面编号
  70. def resolve(self, ref: Any) -> Optional[int]:
  71. """Resolve a PDF reference to a page number recursively.
  72. Args:
  73. ref (:obj:`Any`):
  74. The PDF reference.
  75. Returns:
  76. :obj:`Optional[int]`: The page number or :obj:`None`
  77. if the reference could not be resolved (e.g., remote Go-To
  78. Actions or malformed references).
  79. """
  80. # 先获取当前pdf引用的类型
  81. ref_type = self.get_ref_type(ref)
  82. # 如果当前引用是PDF对象引用,且其指向Page,则返回该page对应的page num
  83. if ref_type is PDFRefType.PDF_OBJ_REF and self.is_ref_page(ref.resolve()):
  84. return self.objid_to_pagenum.get(ref.objid)
  85. # 如果当前引用是PDF对象引用,但其没有指向Page,则对其递归调用
  86. elif ref_type is PDFRefType.PDF_OBJ_REF:
  87. return self.resolve(ref.resolve())
  88. # 如果当前引用是字典类型,则尝试解析ref["D"]键对应值
  89. if ref_type is PDFRefType.DICTIONARY:
  90. return self.resolve(ref["D"])
  91. # 如果当前引用是列表类型,则找到第一个PDFObjRef类型的元素后递归解析
  92. if ref_type is PDFRefType.LIST:
  93. # Get the PDFObjRef in the list (usually first element).
  94. return self.resolve(next(filter(lambda e: isinstance(e, PDFObjRef), ref)))
  95. # 如果当前引用是命名引用类型,则调用self.document.get_dest(ref)获取目标并再次递归解析
  96. if ref_type is PDFRefType.NAMED_REF:
  97. return self.resolve(self.document.get_dest(ref))
  98. # 如果当前引用类型未知,则返回None
  99. return None # PDFRefType.UNK