1 year ago · 808bee4ea6
--- a/tools.py
+++ b/tools.py
@@ -1,101 +1,60 @@
 
				-from enum import Enum, auto
			
 
				-from typing import Any, Optional
			
 
				-from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
			
 
				-from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
			
 
				-from pdfminer.pdfparser import PDFParser, PDFSyntaxError
			
 
				-from pdfminer.pdftypes import PDFObjRef
			
 
				-
			
 
				-
			
 
				-class PDFRefType(Enum):
			
 
				-    """PDF reference type."""
			
 
				-
			
 
				-    PDF_OBJ_REF = auto()
			
 
				-    DICTIONARY = auto()
			
 
				-    LIST = auto()
			
 
				-    NAMED_REF = auto()
			
 
				-    UNK = auto()  # fallback
			
 
				-
			
 
				-
			
 
				-class RefPageNumberResolver:
			
 
				-    """PDF Reference to page number resolver.
			
 
				-
			
 
				-    .. note::
			
 
				-
			
 
				-       Remote Go-To Actions (see 12.6.4.3 in
			
 
				-       `https://www.adobe.com/go/pdfreference/`__)
			
 
				-       are out of the scope of this resolver.
			
 
				-
			
 
				-    Attributes:
			
 
				-        document (:obj:`pdfminer.pdfdocument.PDFDocument`):
			
 
				-            The document that contains the references.
			
 
				-        objid_to_pagenum (:obj:`dict[int, int]`):
			
 
				-            Mapping from an object id to the number of the page that contains
			
 
				-            that object.
			
 
				-    """
			
 
				-
			
 
				-    def __init__(self, document: PDFDocument):
			
 
				-        self.document = document
			
 
				-        # obj_id -> page_number
			
 
				-        self.objid_to_pagenum: dict[int, int] = {
			
 
				-            page.pageid: page_num
			
 
				-            for page_num, page in enumerate(PDFPage.create_pages(document), 1)
			
 
				-        }
			
 
				-
			
 
				-    @classmethod
			
 
				-    def get_ref_type(cls, ref: Any) -> PDFRefType:
			
 
				-        """Get the type of a PDF reference."""
			
 
				-        if isinstance(ref, PDFObjRef):
			
 
				-            return PDFRefType.PDF_OBJ_REF
			
 
				-        elif isinstance(ref, dict) and "D" in ref:
			
 
				-            return PDFRefType.DICTIONARY
			
 
				-        elif isinstance(ref, list) and any(isinstance(e, PDFObjRef) for e in ref):
			
 
				-            return PDFRefType.LIST
			
 
				-        elif isinstance(ref, bytes):
			
 
				-            return PDFRefType.NAMED_REF
			
 
				-        else:
			
 
				-            return PDFRefType.UNK
			
 
				-
			
 
				-    @classmethod
			
 
				-    def is_ref_page(cls, ref: Any) -> bool:
			
 
				-        """Check whether a reference is of type '/Page'.
			
 
				-
			
 
				-        Args:
			
 
				-            ref (:obj:`Any`):
			
 
				-                The PDF reference.
			
 
				-
			
 
				-        Returns:
			
 
				-            :obj:`bool`: :obj:`True` if the reference references
			
 
				-            a page, :obj:`False` otherwise.
			
 
				-        """
			
 
				-        return isinstance(ref, dict) and "Type" in ref and ref["Type"] is LITERAL_PAGE
			
 
				-
			
 
				-    def resolve(self, ref: Any) -> Optional[int]:
			
 
				-        """Resolve a PDF reference to a page number recursively.
			
 
				-
			
 
				-        Args:
			
 
				-            ref (:obj:`Any`):
			
 
				-                The PDF reference.
			
 
				-
			
 
				-        Returns:
			
 
				-            :obj:`Optional[int]`: The page number or :obj:`None`
			
 
				-            if the reference could not be resolved (e.g., remote Go-To
			
 
				-            Actions or malformed references).
			
 
				-        """
			
 
				-        ref_type = self.get_ref_type(ref)
			
 
				-
			
 
				-        if ref_type is PDFRefType.PDF_OBJ_REF and self.is_ref_page(ref.resolve()):
			
 
				-            return self.objid_to_pagenum.get(ref.objid)
			
 
				-        elif ref_type is PDFRefType.PDF_OBJ_REF:
			
 
				-            return self.resolve(ref.resolve())
			
 
				-
			
 
				-        if ref_type is PDFRefType.DICTIONARY:
			
 
				-            return self.resolve(ref["D"])
			
 
				-
			
 
				-        if ref_type is PDFRefType.LIST:
			
 
				-            # Get the PDFObjRef in the list (usually first element).
			
 
				-            return self.resolve(next(filter(lambda e: isinstance(e, PDFObjRef), ref)))
			
 
				-
			
 
				-        if ref_type is PDFRefType.NAMED_REF:
			
 
				-            return self.resolve(self.document.get_dest(ref))
			
 
				-
			
 
				-        return None  # PDFRefType.UNK
			
 
				+''' helper methods
			
 
				+'''
			
 
				+import pandas as pd
			
 
				+import numpy as np
			
 
				+import re
			
 
				+from Timer import timer
			
 
				+import json
			
 
				+import os
			
 
				+import requests
			
 
				+from pprint import pprint
			
 
				+
			
 
				+
			
 
				+class BaseMethods():
			
 
				+    ''' base methods class
			
 
				+    '''
			
 
				+    def __init__(self) -> None:
			
 
				+        pass
			
 
				+
			
 
				+    def pandas_read_xls(self, file_path:str, sheetname:str="Sheet1"):
			
 
				+        ''' 读取xls文件方法
			
 
				+        '''
			
 
				+        return pd.read_excel(file_path,sheet_name=sheetname)
			
 
				+
			
 
				+    def json_read(self, file_path:str):
			
 
				+        ''' 读取json文件方法
			
 
				+        '''
			
 
				+        with open(file_path, "r", encoding='utf-8') as fp:
			
 
				+            return json.load(fp)
			
 
				+        
			
 
				+    def save_file(self, save_data:list, save_path:str, file_format:str):
			
 
				+        ''' 保存文件
			
 
				+        '''
			
 
				+        print(len(save_data))
			
 
				+        if file_format == "json":
			
 
				+            with open(save_path,'w',encoding='utf-8') as sf:
			
 
				+                sf.write(json.dumps(save_data,ensure_ascii=False))
			
 
				+        elif file_format == "xlsx" or file_format == "xls":
			
 
				+            with pd.ExcelWriter(save_path) as fp:
			
 
				+                save_data.to_excel(fp, sheet_name="Sheet1")
			
 
				+        elif file_format == 'txt':
			
 
				+            with open(save_path, 'w', encoding='utf-8') as tx:
			
 
				+                for data in save_data:
			
 
				+                    tx.write(data+"\n")
			
 
				+    
			
 
				+    def traverse_file(self, dirpath:str):
			
 
				+        '''
			
 
				+        遍历文件夹下文件
			
 
				+        '''
			
 
				+        filename = tuple()
			
 
				+        for root, dir, files in os.walk(dirpath):
			
 
				+            for name in files:
			
 
				+                filename = filename.__add__((name,))
			
 
				+        return filename
			
 
				+
			
 
				+
			
 
				+
			
 
				+    
			
 
				+
			
 
if __name__ == "__main__":
    # Nothing is executed when this module is run directly as a script.
    pass