123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147 |
import json
import os
from enum import Enum, auto
from typing import Any, Optional

import pandas as pd
from pdfminer.pdfdocument import (
    PDFDestinationNotFound,
    PDFDocument,
    PDFNoOutlines,
)
from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdftypes import PDFObjRef
class PDFRefType(Enum):
    """Kinds of PDF reference that the page-number resolver distinguishes."""

    # An indirect object reference (``pdfminer.pdftypes.PDFObjRef``).
    PDF_OBJ_REF = auto()
    # A dictionary that carries its destination under the "D" key.
    DICTIONARY = auto()
    # A list that contains at least one indirect object reference.
    LIST = auto()
    # A named destination, given as ``bytes``.
    NAMED_REF = auto()
    # Fallback for anything that matches none of the kinds above.
    UNK = auto()
class RefPageNumberResolver:
    """PDF reference to page number resolver.

    .. note::
        Remote Go-To Actions (see 12.6.4.3 in
        `https://www.adobe.com/go/pdfreference/`__)
        are out of the scope of this resolver.

    Attributes:
        document (:obj:`pdfminer.pdfdocument.PDFDocument`):
            The document that contains the references.
        objid_to_pagenum (:obj:`dict[int, int]`):
            Mapping from the object id of each page (``page.pageid``) to
            its 1-based page number, in the order yielded by
            ``PDFPage.create_pages``.
    """

    def __init__(self, document: PDFDocument):
        """Index every page of ``document`` by its object id.

        Args:
            document (:obj:`pdfminer.pdfdocument.PDFDocument`):
                The document whose references will be resolved.
        """
        self.document = document
        # page object id -> page number (numbering starts at 1)
        self.objid_to_pagenum: dict[int, int] = {
            page.pageid: page_num
            for page_num, page in enumerate(PDFPage.create_pages(document), 1)
        }

    @classmethod
    def get_ref_type(cls, ref: Any) -> PDFRefType:
        """Classify a PDF reference.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`PDFRefType`: The kind of reference, or
            :obj:`PDFRefType.UNK` when ``ref`` matches no known kind.
        """
        if isinstance(ref, PDFObjRef):
            return PDFRefType.PDF_OBJ_REF
        if isinstance(ref, dict) and "D" in ref:
            return PDFRefType.DICTIONARY
        if isinstance(ref, list) and any(isinstance(e, PDFObjRef) for e in ref):
            return PDFRefType.LIST
        if isinstance(ref, bytes):
            return PDFRefType.NAMED_REF
        return PDFRefType.UNK

    @classmethod
    def is_ref_page(cls, ref: Any) -> bool:
        """Check whether a reference is of type '/Page'.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`bool`: :obj:`True` if the reference references
            a page, :obj:`False` otherwise.
        """
        # Identity comparison mirrors how the literal is compared elsewhere
        # in this file; a missing "Type" key yields None, which never matches.
        return isinstance(ref, dict) and ref.get("Type") is LITERAL_PAGE

    def resolve(self, ref: Any) -> Optional[int]:
        """Resolve a PDF reference to a page number.

        The reference chain is followed step by step: indirect objects are
        dereferenced, dictionaries are followed through their "D" entry,
        lists through their first indirect object, and named destinations
        through ``document.get_dest``.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`Optional[int]`: The page number or :obj:`None`
            if the reference could not be resolved (e.g., remote Go-To
            Actions, unknown named destinations, cyclic or otherwise
            malformed references).
        """
        # Object ids (int) and destination names (bytes) already followed;
        # seeing one again means the chain is cyclic and cannot resolve.
        visited: set = set()
        current = ref
        while True:
            ref_type = self.get_ref_type(current)

            if ref_type is PDFRefType.PDF_OBJ_REF:
                if current.objid in visited:
                    return None
                visited.add(current.objid)
                # Dereference once and reuse the result for both the page
                # check and the next step of the chain.
                target = current.resolve()
                if self.is_ref_page(target):
                    return self.objid_to_pagenum.get(current.objid)
                current = target
            elif ref_type is PDFRefType.DICTIONARY:
                current = current["D"]
            elif ref_type is PDFRefType.LIST:
                # Get the PDFObjRef in the list (usually first element);
                # get_ref_type guarantees that at least one exists.
                current = next(e for e in current if isinstance(e, PDFObjRef))
            elif ref_type is PDFRefType.NAMED_REF:
                if current in visited:
                    return None
                visited.add(current)
                try:
                    current = self.document.get_dest(current)
                except PDFDestinationNotFound:
                    # Unknown destination name: unresolvable, not an error.
                    return None
            else:
                return None  # PDFRefType.UNK
class BaseMethods:
    """Small helpers for reading files, saving data and listing directories."""

    def __init__(self) -> None:
        """Create the helper object; it keeps no state."""

    def pandas_read_xls(self, file_path: str, sheetname: str = "Sheet1"):
        """Read one sheet of an Excel file with pandas.

        Args:
            file_path: Path of the Excel file.
            sheetname: Name of the sheet to read.

        Returns:
            Whatever ``pandas.read_excel`` returns for that sheet.
        """
        return pd.read_excel(file_path, sheet_name=sheetname)

    def json_read(self, file_path: str):
        """Read a UTF-8 encoded JSON file.

        Args:
            file_path: Path of the JSON file.

        Returns:
            The decoded JSON content.
        """
        with open(file_path, "r", encoding="utf-8") as fp:
            return json.load(fp)

    def save_file(self, save_data: Any, save_path: str, file_format: str):
        """Save data to a file in the requested format.

        Args:
            save_data: The data to save. For "json" any JSON-serializable
                object; for "xlsx"/"xls" a ``pandas.DataFrame`` or anything
                ``pandas.DataFrame`` can be built from; for "txt" an
                iterable whose items are written one per line.
            save_path: Path of the output file.
            file_format: One of "json", "xlsx", "xls" or "txt".

        Raises:
            ValueError: If ``file_format`` is not one of the supported
                formats (previously nothing was written and no error was
                reported).
        """
        if file_format == "json":
            with open(save_path, "w", encoding="utf-8") as sf:
                sf.write(json.dumps(save_data, ensure_ascii=False))
        elif file_format in ("xlsx", "xls"):
            # Plain lists have no ``to_excel``; wrap anything that is not
            # already a DataFrame so the documented list input works too.
            if isinstance(save_data, pd.DataFrame):
                frame = save_data
            else:
                frame = pd.DataFrame(save_data)
            with pd.ExcelWriter(save_path) as fp:
                frame.to_excel(fp, sheet_name="Sheet1")
        elif file_format == "txt":
            with open(save_path, "w", encoding="utf-8") as tx:
                # Formatting each item lets non-string entries be written;
                # string entries are written unchanged.
                tx.writelines(f"{item}\n" for item in save_data)
        else:
            raise ValueError(f"unsupported file format: {file_format!r}")

    def traverse_file(self, dirpath: str):
        """List the names of all files below a directory.

        Args:
            dirpath: Directory to walk, including its sub-directories.

        Returns:
            A tuple of bare file names (without directory parts) in the
            order ``os.walk`` yields them.
        """
        return tuple(
            name
            for _root, _dirs, files in os.walk(dirpath)
            for name in files
        )
|