tools.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
from enum import Enum, auto
from typing import Any, Optional

from pdfminer.pdfdocument import PDFDestinationNotFound, PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import LITERAL_PAGE, PDFPage
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdftypes import PDFObjRef
  7. class PDFRefType(Enum):
  8. """PDF reference type."""
  9. PDF_OBJ_REF = auto()
  10. DICTIONARY = auto()
  11. LIST = auto()
  12. NAMED_REF = auto()
  13. UNK = auto() # fallback
  14. class RefPageNumberResolver:
  15. """PDF Reference to page number resolver.
  16. .. note::
  17. Remote Go-To Actions (see 12.6.4.3 in
  18. `https://www.adobe.com/go/pdfreference/`__)
  19. are out of the scope of this resolver.
  20. Attributes:
  21. document (:obj:`pdfminer.pdfdocument.PDFDocument`):
  22. The document that contains the references.
  23. objid_to_pagenum (:obj:`dict[int, int]`):
  24. Mapping from an object id to the number of the page that contains
  25. that object.
  26. """
  27. def __init__(self, document: PDFDocument):
  28. self.document = document
  29. # obj_id -> page_number
  30. self.objid_to_pagenum: dict[int, int] = {
  31. page.pageid: page_num
  32. for page_num, page in enumerate(PDFPage.create_pages(document), 1)
  33. }
  34. @classmethod
  35. def get_ref_type(cls, ref: Any) -> PDFRefType:
  36. """Get the type of a PDF reference."""
  37. if isinstance(ref, PDFObjRef):
  38. return PDFRefType.PDF_OBJ_REF
  39. elif isinstance(ref, dict) and "D" in ref:
  40. return PDFRefType.DICTIONARY
  41. elif isinstance(ref, list) and any(isinstance(e, PDFObjRef) for e in ref):
  42. return PDFRefType.LIST
  43. elif isinstance(ref, bytes):
  44. return PDFRefType.NAMED_REF
  45. else:
  46. return PDFRefType.UNK
  47. @classmethod
  48. def is_ref_page(cls, ref: Any) -> bool:
  49. """Check whether a reference is of type '/Page'.
  50. Args:
  51. ref (:obj:`Any`):
  52. The PDF reference.
  53. Returns:
  54. :obj:`bool`: :obj:`True` if the reference references
  55. a page, :obj:`False` otherwise.
  56. """
  57. return isinstance(ref, dict) and "Type" in ref and ref["Type"] is LITERAL_PAGE
  58. def resolve(self, ref: Any) -> Optional[int]:
  59. """Resolve a PDF reference to a page number recursively.
  60. Args:
  61. ref (:obj:`Any`):
  62. The PDF reference.
  63. Returns:
  64. :obj:`Optional[int]`: The page number or :obj:`None`
  65. if the reference could not be resolved (e.g., remote Go-To
  66. Actions or malformed references).
  67. """
  68. ref_type = self.get_ref_type(ref)
  69. if ref_type is PDFRefType.PDF_OBJ_REF and self.is_ref_page(ref.resolve()):
  70. return self.objid_to_pagenum.get(ref.objid)
  71. elif ref_type is PDFRefType.PDF_OBJ_REF:
  72. return self.resolve(ref.resolve())
  73. if ref_type is PDFRefType.DICTIONARY:
  74. return self.resolve(ref["D"])
  75. if ref_type is PDFRefType.LIST:
  76. # Get the PDFObjRef in the list (usually first element).
  77. return self.resolve(next(filter(lambda e: isinstance(e, PDFObjRef), ref)))
  78. if ref_type is PDFRefType.NAMED_REF:
  79. return self.resolve(self.document.get_dest(ref))
  80. return None # PDFRefType.UNK