# xzc / pdf_title_image
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2024-06-11 13:43:14
# @Last Modified by:   privacy
# @Last Modified time: 2024-09-04 10:21:48

import os
import re
import json
from enum import Enum, auto
from typing import Any, Optional, List

import pandas as pd
from pdfminer.pdftypes import PDFObjRef
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
import pdfplumber


def check_scan_pdf(file_path: str) -> bool:
    """Heuristically decide whether a PDF is a scanned (image-only) file.

    A page counts as a "text page" when its extracted text is longer than
    50 characters. As soon as the text pages exceed 10% of the total page
    count, the document is reported as not scanned.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        bool: ``True`` if the file looks like a scan, ``False`` otherwise.
        A document without pages is reported as a scan.
    """
    text_pages = 0
    with pdfplumber.open(file_path) as pdf:
        page_num = len(pdf.pages)
        for page in pdf.pages:
            # Some pdfplumber releases return None instead of "" for a page
            # that carries no text layer; normalise so len() cannot fail.
            content = page.extract_text() or ""
            if len(content) > 50:
                text_pages += 1
                # Stop early once the threshold is crossed.
                if (text_pages / page_num) > 0.1:
                    return False
    return True


def num_to_chinese(num: int) -> str:
    """Convert a non-negative integer to Chinese numerals.

    Only a leading "一十" is shortened to "十" (10 -> "十", 15 -> "十五");
    inside a larger number the "一" is kept (110 -> "一百一十"), which is
    the standard reading and the form ``chinese_to_num`` style parsers
    expect.

    Args:
        num: The integer to convert. Values from 0 to 99999 are supported.

    Returns:
        The Chinese representation of ``num``. A negative ``num`` yields an
        empty string.

    Raises:
        IndexError: If ``num`` is 100000 or larger (the unit table stops
            at "万").
    """
    chinese_num = ['零', '一', '二', '三', '四', '五', '六', '七', '八', '九']
    chinese_unit = ['', '十', '百', '千', '万']

    if num == 0:
        return chinese_num[0]

    res = ''
    unit_index = 0
    while num > 0:
        num, digit = divmod(num, 10)
        if digit != 0:
            res = chinese_num[digit] + chinese_unit[unit_index] + res
        elif not res.startswith(chinese_num[0]):
            # Collapse a run of zero digits into a single "零".
            res = chinese_num[0] + res
        unit_index += 1

    # Only the leading "一十" may drop its "一"; a blanket replace would
    # turn 110 into "一百十".
    if res.startswith('一十'):
        res = res[1:]
    # Trailing zeros are never spoken (100 -> "一百", not "一百零").
    return res.rstrip('零')


def chinese_to_num(chinese_num: str) -> int:
    """Convert a Chinese numeral string to an integer.

    A unit that is not preceded by a non-zero digit is read as "one of that
    unit", so "十" is 10, "十一" is 11 and the colloquial "一百十" is 110.
    "万" closes the section accumulated so far and scales it by 10000, so
    "二十万" is 200000.

    Args:
        chinese_num: The Chinese numeral to parse. It may contain the digits
            零-九 and the units 十, 百, 千 and 万.

    Returns:
        The integer value; an empty string yields 0.

    Raises:
        ValueError: If the string contains any other character.
    """
    number_map = {'零': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}

    # Units that scale a single digit inside one section (< 10000).
    unit_map = {'十': 10, '百': 100, '千': 1000}

    total = 0    # value of the sections already closed by '万'
    section = 0  # value accumulated in the current section
    digit = 0    # digit waiting for its unit (0 when none is pending)
    for cn_num in chinese_num:
        if cn_num in number_map:
            digit = number_map[cn_num]
        elif cn_num in unit_map:
            # A bare unit ("十", "一百十") implies a leading one.
            section += (digit or 1) * unit_map[cn_num]
            digit = 0
        elif cn_num == '万':
            section += digit
            total += (section or 1) * 10000
            section = 0
            digit = 0
        else:
            raise ValueError(f"{cn_num} 不在转化范围内")

    # A trailing digit without a unit is the ones place.
    return total + section + digit


def next_chinese_num(chinese_num: str) -> str:
    """Return the Chinese numeral that follows ``chinese_num``.

    The input is parsed with ``chinese_to_num``, incremented by one and
    rendered again with ``num_to_chinese``.

    Args:
        chinese_num: The Chinese numeral to increment.

    Returns:
        The Chinese numeral for the input value plus one.
    """
    return num_to_chinese(chinese_to_num(chinese_num) + 1)


def filter_images(image_list: list, start_page: int, end_page: int) -> List[dict]:
    """Select the parsed images that lie inside a page range.

    Args:
        image_list: Parsed image records; each record is expected to carry
            a ``page_number`` field.
        start_page: First page of the range (inclusive).
        end_page: Last page of the range (inclusive).

    Returns:
        The records whose ``page_number`` is between ``start_page`` and
        ``end_page``, as a list of dicts. An empty input yields an empty
        list.
    """
    # An empty list produces a DataFrame without columns, and the query
    # below would then fail on the unknown name `page_number`.
    if not image_list:
        return []
    df = pd.DataFrame(image_list)
    return df.query(f''' {start_page} <= page_number <= {end_page} ''').to_dict(orient='records')


def filter_tables(table_list: list, start_page: int, end_page: int) -> List[dict]:
    """Select the parsed tables that lie completely inside a page range.

    Args:
        table_list: Parsed table records; each record's ``page_numbers``
            entry holds the pages the table spans.
        start_page: First page of the range (inclusive).
        end_page: Last page of the range (inclusive).

    Returns:
        The tables whose smallest page is not before ``start_page`` and
        whose largest page is not after ``end_page``, in input order.
    """
    selected = []
    for table in table_list:
        pages = table['page_numbers']
        # The table must start inside the range ...
        if not (start_page <= min(pages)):
            continue
        # ... and must also end inside it.
        if not (end_page >= max(pages)):
            continue
        selected.append(table)
    return selected


def rmb_to_digit(rmb_str: str) -> str:
    """Convert an amount written in capital Chinese RMB numerals to digits.

    The amount is parsed section by section: 拾/佰/仟 scale a single digit,
    万 closes the current section and scales it by 10**4, 亿 closes
    everything seen so far and scales it by 10**8, and 角/分 contribute the
    fractional part. A 拾/佰/仟 with no digit in front of it counts as one of
    that unit (e.g. "拾万元" is 100000). Characters that are neither digits
    nor units (such as 整) are ignored.

    Args:
        rmb_str: The amount in capital Chinese numerals,
            e.g. "壹佰贰拾叁元肆角伍分".

    Returns:
        The amount as a string with exactly two decimals, e.g. "123.45".
        An empty string yields "0.00".
    """
    digit_map = {'零': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9}
    section_units = {'拾': 10, '佰': 100, '仟': 1000}
    # Fractional units expressed in fen so the arithmetic stays integral.
    fraction_units = {'角': 10, '分': 1}

    yi_part = 0   # value already closed by '亿'
    wan_part = 0  # value already closed by '万' (below the last '亿')
    section = 0   # value accumulated in the current section (< 10000)
    digit = None  # digit waiting for its unit; None when nothing is pending
    fen = 0       # fractional amount in fen

    for char in rmb_str:
        if char in digit_map:
            digit = digit_map[char]
        elif char in section_units:
            # A unit with no digit at all ("拾万元") implies one; an explicit
            # "零" in front of a unit keeps its value of zero.
            section += (1 if digit is None else digit) * section_units[char]
            digit = None
        elif char == '万':
            wan_part += (section + (digit or 0)) * 10000
            section, digit = 0, None
        elif char == '亿':
            yi_part = (yi_part + wan_part + section + (digit or 0)) * 100000000
            wan_part, section, digit = 0, 0, None
        elif char in ('元', '圆'):
            # The yuan marker closes the integer part.
            section += digit or 0
            digit = None
        elif char in fraction_units:
            fen += (digit or 0) * fraction_units[char]
            digit = None
        # Any other character (整, whitespace, punctuation) carries no value.

    # A trailing digit without a unit is added to the yuan amount.
    yuan = yi_part + wan_part + section + (digit or 0)
    carry, fen = divmod(fen, 100)
    return '{}.{:02d}'.format(yuan + carry, fen)


def match_price_zhs(text: str) -> List[str]:
    """Find amounts written in capital Chinese RMB numerals.

    A match starts with a capital digit or one of 拾/佰/仟 and continues with
    at least two more numeral, unit or filler characters (元, 角, 分, 万, 亿,
    百, 整, 零). When a match ends in "整元", "角元", "分元" or "元元", the
    final "元" is dropped.

    Args:
        text: The text to search.

    Returns:
        All matched amount strings, in order of appearance.
    """
    # Character classes must not contain separators: a comma inside [...]
    # is a literal and would let plain punctuation match as an amount.
    head = "壹贰叁肆伍陆柒捌玖拾佰仟"
    tail = head + "元角万亿分百整零"
    pattern = f"[{head}][{tail}]{{2,}}"

    stray_suffixes = ('整元', '角元', '分元', '元元')
    return [
        found[:-1] if found.endswith(stray_suffixes) else found
        for found in re.findall(pattern, text)
    ]


def match_price_num(text: str) -> List[str]:
    """Find amounts written with digits and a leading currency marker.

    The marker is one of the currency symbols or codes listed in the
    pattern, or any single character that is neither a word character nor
    whitespace. It may be followed by one whitespace character, then by the
    number: plain digits or comma-grouped thousands, optionally with one or
    two decimals, and not directly followed by further digits.

    Args:
        text: The text to search.

    Returns:
        The full text of every match, in order of appearance.
    """
    pattern = (r"(?:\b(?:[BS]/\.|R(?:D?\$|p))|\b(?:[TN]T|[CJZ])\$|Дин\.|\b(?:Bs|Ft|Gs|K[Mč]|Lek|B[Zr]|k[nr]|[PQLSR]|лв|"
               r"ден|RM|MT|lei|zł|USD|GBP|EUR|JPY|CHF|SEK|DKK|NOK|SGD|HKD|AUD|TWD|NZD|CNY|KRW|INR|CAD|VEF|EGP|THB|IDR|"
               r"PKR|MYR|PHP|MXN|VND|CZK|HUF|PLN|TRY|ZAR|ILS|ARS|CLP|BRL|RUB|QAR|AED|COP|PEN|CNH|KWD|SAR)|\$[Ub]|"
               r"[^\w\s])\s?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?(?!\.?\d)")
    # The pattern has no capturing groups, so each hit is the whole match.
    return [found.group(0) for found in re.finditer(pattern, text)]


def match_duration(text: str) -> List[str]:
    """Find construction periods written as "<N>日历天" (calendar days).

    ``N`` is any positive integer without leading zeros, so single-digit
    values ("5日历天") and values containing zeros ("100日历天") are found
    as well.

    Args:
        text: The text to search.

    Returns:
        All matched "<N>日历天" strings, in order of appearance.
    """
    # First digit 1-9, then any digits: the former `[1-9]+[\d]` needed at
    # least two digits and rejected zeros anywhere but the last position.
    pattern = r"[1-9]\d*日历天"
    return re.findall(pattern, text)


def match_quality(text: str) -> List[str]:
    """Find the statements that start with "工程质量".

    Each match runs from "工程质量" to the end of its line and must contain
    at least one character after the keyword.

    Args:
        text: The text to search.

    Returns:
        All matched strings, in order of appearance.
    """
    return [found.group() for found in re.finditer(r"工程质量.+", text)]


class PDFRefType(Enum):
    """Kinds of PDF reference distinguished when resolving destinations."""

    # Values are the consecutive integers starting at 1.
    PDF_OBJ_REF = 1
    DICTIONARY = 2
    LIST = 3
    NAMED_REF = 4
    UNK = 5  # fallback for anything that is none of the above


class RefPageNumberResolver:
    """Resolve PDF references (e.g. outline destinations) to page numbers.

    .. note::

       Remote Go-To Actions (see 12.6.4.3 in
       `https://www.adobe.com/go/pdfreference/`__)
       are out of the scope of this resolver.

    Attributes:
        document (:obj:`pdfminer.pdfdocument.PDFDocument`):
            The document the references belong to.
        objid_to_pagenum (:obj:`dict[int, int]`):
            Maps the object id of every page to its 1-based page number.
    """

    def __init__(self, document: PDFDocument):
        self.document = document
        # Walk the pages once and remember each page's object id together
        # with its position, counting from 1.
        self.objid_to_pagenum: dict[int, int] = {}
        for number, page in enumerate(PDFPage.create_pages(document), 1):
            self.objid_to_pagenum[page.pageid] = number

    @classmethod
    def get_ref_type(cls, ref: Any) -> PDFRefType:
        """Classify a PDF reference by the Python type it arrives as.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`PDFRefType`: The detected kind, or ``PDFRefType.UNK``
            when none of the known shapes matches.
        """
        if isinstance(ref, PDFObjRef):
            return PDFRefType.PDF_OBJ_REF
        if isinstance(ref, dict) and "D" in ref:
            return PDFRefType.DICTIONARY
        if isinstance(ref, list) and any(isinstance(item, PDFObjRef) for item in ref):
            return PDFRefType.LIST
        if isinstance(ref, bytes):
            return PDFRefType.NAMED_REF
        return PDFRefType.UNK

    @classmethod
    def is_ref_page(cls, ref: Any) -> bool:
        """Tell whether a resolved reference is a '/Page' dictionary.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`bool`: :obj:`True` if ``ref`` is a dict whose ``Type``
            entry is the page literal, :obj:`False` otherwise.
        """
        if not isinstance(ref, dict):
            return False
        if "Type" not in ref:
            return False
        return ref["Type"] is LITERAL_PAGE

    def resolve(self, ref: Any) -> Optional[int]:
        """Follow a PDF reference recursively until a page is reached.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`Optional[int]`: The page number, or :obj:`None` when the
            reference cannot be resolved (e.g., remote Go-To Actions or
            malformed references).
        """
        kind = self.get_ref_type(ref)

        if kind is PDFRefType.PDF_OBJ_REF:
            target = ref.resolve()
            if self.is_ref_page(target):
                # The reference points straight at a page object.
                return self.objid_to_pagenum.get(ref.objid)
            # Otherwise keep following whatever the reference points to.
            return self.resolve(target)

        if kind is PDFRefType.DICTIONARY:
            # The destination is stored under the "D" key.
            return self.resolve(ref["D"])

        if kind is PDFRefType.LIST:
            # Follow the first object reference in the list.
            return self.resolve(next(item for item in ref if isinstance(item, PDFObjRef)))

        if kind is PDFRefType.NAMED_REF:
            # Named destinations are looked up through the document.
            return self.resolve(self.document.get_dest(ref))

        return None  # PDFRefType.UNK


class BaseMethods:
    """Helper methods for reading, saving and listing files."""

    def __init__(self) -> None:
        pass

    def pandas_read_xls(self, file_path: str, sheetname: str = "Sheet1"):
        """Read one worksheet of an Excel file with pandas.

        Args:
            file_path: Path of the Excel file.
            sheetname: Name of the worksheet to read.

        Returns:
            Whatever ``pandas.read_excel`` returns for that worksheet.
        """
        return pd.read_excel(file_path, sheet_name=sheetname)

    def json_read(self, file_path: str):
        """Load a UTF-8 encoded JSON file.

        Args:
            file_path: Path of the JSON file.

        Returns:
            The decoded JSON content.
        """
        with open(file_path, "r", encoding='utf-8') as fp:
            return json.load(fp)

    def save_file(self, save_data: list, save_path: str, file_format: str):
        """Write ``save_data`` to ``save_path`` in the requested format.

        Args:
            save_data: The data to store. For ``"json"`` any
                JSON-serialisable object; for ``"xlsx"``/``"xls"`` either an
                object that offers ``to_excel`` (such as a DataFrame) or
                plain data that ``pandas.DataFrame`` can wrap; for ``"txt"``
                an iterable of strings, written one per line.
            save_path: Destination file path.
            file_format: One of ``"json"``, ``"xlsx"``, ``"xls"`` or
                ``"txt"``. Any other value writes nothing.
        """
        if file_format == "json":
            with open(save_path, 'w', encoding='utf-8') as sf:
                sf.write(json.dumps(save_data, ensure_ascii=False))
        elif file_format in ("xlsx", "xls"):
            # The parameter is annotated as a list, which has no to_excel();
            # wrap plain data so the call works for lists as well.
            frame = save_data if hasattr(save_data, "to_excel") else pd.DataFrame(save_data)
            with pd.ExcelWriter(save_path) as fp:
                frame.to_excel(fp, sheet_name="Sheet1")
        elif file_format == 'txt':
            with open(save_path, 'w', encoding='utf-8') as tx:
                tx.writelines(line + "\n" for line in save_data)

    def traverse_file(self, dirpath: str):
        """Collect the names of all files below a directory.

        Args:
            dirpath: Root directory; sub-directories are walked as well.

        Returns:
            A tuple with the bare file names (without their directory) in
            the order ``os.walk`` yields them.
        """
        return tuple(
            name
            for _root, _dirs, files in os.walk(dirpath)
            for name in files
        )