|
@@ -1,101 +1,60 @@
|
|
|
-from enum import Enum, auto
|
|
|
-from typing import Any, Optional
|
|
|
-from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
|
|
|
-from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
|
|
|
-from pdfminer.pdfparser import PDFParser, PDFSyntaxError
|
|
|
-from pdfminer.pdftypes import PDFObjRef
|
|
|
-
|
|
|
-
|
|
|
-class PDFRefType(Enum):
|
|
|
- """PDF reference type."""
|
|
|
-
|
|
|
- PDF_OBJ_REF = auto()
|
|
|
- DICTIONARY = auto()
|
|
|
- LIST = auto()
|
|
|
- NAMED_REF = auto()
|
|
|
- UNK = auto() # fallback
|
|
|
-
|
|
|
-
|
|
|
-class RefPageNumberResolver:
|
|
|
- """PDF Reference to page number resolver.
|
|
|
-
|
|
|
- .. note::
|
|
|
-
|
|
|
- Remote Go-To Actions (see 12.6.4.3 in
|
|
|
- `https://www.adobe.com/go/pdfreference/`__)
|
|
|
- are out of the scope of this resolver.
|
|
|
-
|
|
|
- Attributes:
|
|
|
- document (:obj:`pdfminer.pdfdocument.PDFDocument`):
|
|
|
- The document that contains the references.
|
|
|
- objid_to_pagenum (:obj:`dict[int, int]`):
|
|
|
- Mapping from an object id to the number of the page that contains
|
|
|
- that object.
|
|
|
- """
|
|
|
-
|
|
|
- def __init__(self, document: PDFDocument):
|
|
|
- self.document = document
|
|
|
- # obj_id -> page_number
|
|
|
- self.objid_to_pagenum: dict[int, int] = {
|
|
|
- page.pageid: page_num
|
|
|
- for page_num, page in enumerate(PDFPage.create_pages(document), 1)
|
|
|
- }
|
|
|
-
|
|
|
- @classmethod
|
|
|
- def get_ref_type(cls, ref: Any) -> PDFRefType:
|
|
|
- """Get the type of a PDF reference."""
|
|
|
- if isinstance(ref, PDFObjRef):
|
|
|
- return PDFRefType.PDF_OBJ_REF
|
|
|
- elif isinstance(ref, dict) and "D" in ref:
|
|
|
- return PDFRefType.DICTIONARY
|
|
|
- elif isinstance(ref, list) and any(isinstance(e, PDFObjRef) for e in ref):
|
|
|
- return PDFRefType.LIST
|
|
|
- elif isinstance(ref, bytes):
|
|
|
- return PDFRefType.NAMED_REF
|
|
|
- else:
|
|
|
- return PDFRefType.UNK
|
|
|
-
|
|
|
- @classmethod
|
|
|
- def is_ref_page(cls, ref: Any) -> bool:
|
|
|
- """Check whether a reference is of type '/Page'.
|
|
|
-
|
|
|
- Args:
|
|
|
- ref (:obj:`Any`):
|
|
|
- The PDF reference.
|
|
|
-
|
|
|
- Returns:
|
|
|
- :obj:`bool`: :obj:`True` if the reference references
|
|
|
- a page, :obj:`False` otherwise.
|
|
|
- """
|
|
|
- return isinstance(ref, dict) and "Type" in ref and ref["Type"] is LITERAL_PAGE
|
|
|
-
|
|
|
- def resolve(self, ref: Any) -> Optional[int]:
|
|
|
- """Resolve a PDF reference to a page number recursively.
|
|
|
-
|
|
|
- Args:
|
|
|
- ref (:obj:`Any`):
|
|
|
- The PDF reference.
|
|
|
-
|
|
|
- Returns:
|
|
|
- :obj:`Optional[int]`: The page number or :obj:`None`
|
|
|
- if the reference could not be resolved (e.g., remote Go-To
|
|
|
- Actions or malformed references).
|
|
|
- """
|
|
|
- ref_type = self.get_ref_type(ref)
|
|
|
-
|
|
|
- if ref_type is PDFRefType.PDF_OBJ_REF and self.is_ref_page(ref.resolve()):
|
|
|
- return self.objid_to_pagenum.get(ref.objid)
|
|
|
- elif ref_type is PDFRefType.PDF_OBJ_REF:
|
|
|
- return self.resolve(ref.resolve())
|
|
|
-
|
|
|
- if ref_type is PDFRefType.DICTIONARY:
|
|
|
- return self.resolve(ref["D"])
|
|
|
-
|
|
|
- if ref_type is PDFRefType.LIST:
|
|
|
- # Get the PDFObjRef in the list (usually first element).
|
|
|
- return self.resolve(next(filter(lambda e: isinstance(e, PDFObjRef), ref)))
|
|
|
-
|
|
|
- if ref_type is PDFRefType.NAMED_REF:
|
|
|
- return self.resolve(self.document.get_dest(ref))
|
|
|
-
|
|
|
- return None # PDFRefType.UNK
|
|
|
+''' helper methods
|
|
|
+'''
|
|
|
+import pandas as pd
|
|
|
+import numpy as np
|
|
|
+import re
|
|
|
+from Timer import timer
|
|
|
+import json
|
|
|
+import os
|
|
|
+import requests
|
|
|
+from pprint import pprint
|
|
|
+
|
|
|
+
|
|
|
+class BaseMethods():
|
|
|
+ ''' base methods class
|
|
|
+ '''
|
|
|
+ def __init__(self) -> None:
|
|
|
+ pass
|
|
|
+
|
|
|
+ def pandas_read_xls(self, file_path:str, sheetname:str="Sheet1"):
|
|
|
+ ''' 读取xls文件方法
|
|
|
+ '''
|
|
|
+ return pd.read_excel(file_path,sheet_name=sheetname)
|
|
|
+
|
|
|
+ def json_read(self, file_path:str):
|
|
|
+ ''' 读取json文件方法
|
|
|
+ '''
|
|
|
+ with open(file_path, "r", encoding='utf-8') as fp:
|
|
|
+ return json.load(fp)
|
|
|
+
|
|
|
+ def save_file(self, save_data:list, save_path:str, file_format:str):
|
|
|
+ ''' 保存文件
|
|
|
+ '''
|
|
|
+ print(len(save_data))
|
|
|
+ if file_format == "json":
|
|
|
+ with open(save_path,'w',encoding='utf-8') as sf:
|
|
|
+ sf.write(json.dumps(save_data,ensure_ascii=False))
|
|
|
+ elif file_format == "xlsx" or file_format == "xls":
|
|
|
+ with pd.ExcelWriter(save_path) as fp:
|
|
|
+ save_data.to_excel(fp, sheet_name="Sheet1")
|
|
|
+ elif file_format == 'txt':
|
|
|
+ with open(save_path, 'w', encoding='utf-8') as tx:
|
|
|
+ for data in save_data:
|
|
|
+ tx.write(data+"\n")
|
|
|
+
|
|
|
+ def traverse_file(self, dirpath:str):
|
|
|
+ '''
|
|
|
+ 遍历文件夹下文件
|
|
|
+ '''
|
|
|
+ filename = tuple()
|
|
|
+ for root, dir, files in os.walk(dirpath):
|
|
|
+ for name in files:
|
|
|
+ filename = filename.__add__((name,))
|
|
|
+ return filename
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+if __name__ == "__main__":
|
|
|
+ pass
|