tools.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. import os
  2. import json
  3. from enum import Enum, auto
  4. from typing import Any, Optional
  5. import pandas as pd
  6. from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
  7. from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
  8. from pdfminer.pdfparser import PDFParser, PDFSyntaxError
  9. from pdfminer.pdftypes import PDFObjRef
  10. class PDFRefType(Enum):
  11. """PDF reference type."""
  12. PDF_OBJ_REF = auto()
  13. DICTIONARY = auto()
  14. LIST = auto()
  15. NAMED_REF = auto()
  16. UNK = auto() # fallback
  17. class RefPageNumberResolver:
  18. """PDF Reference to page number resolver.
  19. .. note::
  20. Remote Go-To Actions (see 12.6.4.3 in
  21. `https://www.adobe.com/go/pdfreference/`__)
  22. are out of the scope of this resolver.
  23. Attributes:
  24. document (:obj:`pdfminer.pdfdocument.PDFDocument`):
  25. The document that contains the references.
  26. objid_to_pagenum (:obj:`dict[int, int]`):
  27. Mapping from an object id to the number of the page that contains
  28. that object.
  29. """
  30. def __init__(self, document: PDFDocument):
  31. self.document = document
  32. # obj_id -> page_number
  33. self.objid_to_pagenum: dict[int, int] = {
  34. page.pageid: page_num
  35. for page_num, page in enumerate(PDFPage.create_pages(document), 1)
  36. }
  37. @classmethod
  38. def get_ref_type(cls, ref: Any) -> PDFRefType:
  39. """Get the type of a PDF reference."""
  40. if isinstance(ref, PDFObjRef):
  41. return PDFRefType.PDF_OBJ_REF
  42. elif isinstance(ref, dict) and "D" in ref:
  43. return PDFRefType.DICTIONARY
  44. elif isinstance(ref, list) and any(isinstance(e, PDFObjRef) for e in ref):
  45. return PDFRefType.LIST
  46. elif isinstance(ref, bytes):
  47. return PDFRefType.NAMED_REF
  48. else:
  49. return PDFRefType.UNK
  50. @classmethod
  51. def is_ref_page(cls, ref: Any) -> bool:
  52. """Check whether a reference is of type '/Page'.
  53. Args:
  54. ref (:obj:`Any`):
  55. The PDF reference.
  56. Returns:
  57. :obj:`bool`: :obj:`True` if the reference references
  58. a page, :obj:`False` otherwise.
  59. """
  60. return isinstance(ref, dict) and "Type" in ref and ref["Type"] is LITERAL_PAGE
  61. def resolve(self, ref: Any) -> Optional[int]:
  62. """Resolve a PDF reference to a page number recursively.
  63. Args:
  64. ref (:obj:`Any`):
  65. The PDF reference.
  66. Returns:
  67. :obj:`Optional[int]`: The page number or :obj:`None`
  68. if the reference could not be resolved (e.g., remote Go-To
  69. Actions or malformed references).
  70. """
  71. ref_type = self.get_ref_type(ref)
  72. if ref_type is PDFRefType.PDF_OBJ_REF and self.is_ref_page(ref.resolve()):
  73. return self.objid_to_pagenum.get(ref.objid)
  74. elif ref_type is PDFRefType.PDF_OBJ_REF:
  75. return self.resolve(ref.resolve())
  76. if ref_type is PDFRefType.DICTIONARY:
  77. return self.resolve(ref["D"])
  78. if ref_type is PDFRefType.LIST:
  79. # Get the PDFObjRef in the list (usually first element).
  80. return self.resolve(next(filter(lambda e: isinstance(e, PDFObjRef), ref)))
  81. if ref_type is PDFRefType.NAMED_REF:
  82. return self.resolve(self.document.get_dest(ref))
  83. return None # PDFRefType.UNK
  84. class BaseMethods:
  85. ''' base methods class
  86. '''
  87. def __init__(self) -> None:
  88. pass
  89. def pandas_read_xls(self, file_path: str, sheetname: str = "Sheet1"):
  90. ''' 读取xls文件方法
  91. '''
  92. return pd.read_excel(file_path, sheet_name=sheetname)
  93. def json_read(self, file_path: str):
  94. ''' 读取json文件方法
  95. '''
  96. with open(file_path, "r", encoding='utf-8') as fp:
  97. return json.load(fp)
  98. def save_file(self, save_data: list, save_path: str, file_format: str):
  99. ''' 保存文件
  100. '''
  101. if file_format == "json":
  102. with open(save_path,'w',encoding='utf-8') as sf:
  103. sf.write(json.dumps(save_data,ensure_ascii=False))
  104. elif file_format == "xlsx" or file_format == "xls":
  105. with pd.ExcelWriter(save_path) as fp:
  106. save_data.to_excel(fp, sheet_name="Sheet1")
  107. elif file_format == 'txt':
  108. with open(save_path, 'w', encoding='utf-8') as tx:
  109. for data in save_data:
  110. tx.write(data+"\n")
  111. def traverse_file(self, dirpath: str):
  112. '''
  113. 遍历文件夹下文件
  114. '''
  115. filename = tuple()
  116. for root, dir, files in os.walk(dirpath):
  117. for name in files:
  118. filename = filename.__add__((name,))
  119. return filename