# -*- coding: utf-8 -*-
# xzc / pdf_title_image
# @Author: privacy
# @Date:   2024-06-11 13:43:14
# @Last Modified by:   privacy
# @Last Modified time: 2024-06-11 14:10:56

# import os

# from PIL import Image
# from PyPDF2 import PdfReader


# # Read the PDF file
# with open(pdf_path, 'rb') as file:
#     reader = PdfReader(file)
#     num_pages = len(reader.pages)

#     # Iterate over every page of the PDF
#     for page_num in range(num_pages):
#         page = reader.pages[page_num]

#         # Extract the images found on the page
#         if '/XObject' in page['/Resources']:
#             xobjects = page['/Resources']['/XObject'].get_object()

#             for obj in xobjects:
#                 if xobjects[obj]['/Subtype'] == '/Image':
#                     size = (xobjects[obj]['/Width'], xobjects[obj]['/Height'])
#                     data = xobjects[obj].get_data()
#                     if xobjects[obj]['/ColorSpace'] == '/DeviceRGB':
#                         mode = "RGB"
#                     else:
#                         mode = "P"

#                     img = Image.frombytes(mode, size, data)
#                     img_path = os.path.join(output_dir, f'image_{page_num}_{obj}.png')
#                     img.save(img_path)
#                     print(f'Image saved: {img_path}')


#######################################################################

# import os
# import re
# import fitz

# def pdf2pic(path, save_path):
#     checkXO = r"/Type(?= */XObject)"
#     checkIM = r"/Subtype(?= */Image)"
#     pdf = fitz.open(path)
#     lenXREF = pdf._getXrefLength()
#     imgcount = 0
#     for i in range(1, lenXREF):
#         text = pdf._getXrefString(i)
#         isXObject = re.search(checkXO, text)
#         isImage = re.search(checkIM, text)
#         if not isXObject or not isImage:
#             continue
#         imgcount += 1
#         pix = fitz.Pixmap(pdf, i)
#         new_name = f"img_{imgcount}.png"
#         if pix.n < 5:
#             pix.writePNG(os.path.join(pic_path, new_name))
#         else:
#             pix0 = fitz.Pixmap(fitz.csRGB, pix)
#             pix0.writePNG(os.path.join(pic_path, new_name))
#             pix0 = None
#         pix = None


# if __name__ == '__main__':
#     pdf2pic(pdf_path, image_dir)


#######################################################################

import os
import re
import json
from io import BytesIO
from pprint import pprint

import numpy as np
import cv2

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTRect, LTTextBoxHorizontal, LTLine, LTFigure, LTCurve, LTImage, LTChar
from pdfminer.pdfcolor import LITERAL_DEVICE_CMYK
from pdfminer.pdfcolor import LITERAL_DEVICE_GRAY
from pdfminer.pdfcolor import LITERAL_DEVICE_RGB
from pdfminer.pdftypes import (
    LITERALS_DCT_DECODE,
    LITERALS_JBIG2_DECODE,
    LITERALS_JPX_DECODE,
    LITERALS_FLATE_DECODE,
)


# Numbering styles treated as heading markers: "（一）" / "(一)", "1." .. "29.",
# "第N章/节/条", and a Chinese numeral followed by one of "、", "要", "是".
# NOTE(review): the last alternative is not anchored with ^, so it matches
# anywhere in the line, unlike the others — confirm that this is intended.
_TITLE_NUMBERING_RE = re.compile(
    r'^[（\(][一二三四五六七八九十]+[\)）]'
    r'|^\d\.|^1\d\.|^2\d\.'
    r'|^[第][一二三四五六七八九十\d]+[章节条]'
    r'|[一二三四五六七八九十]+[、要是]'
)

# Fixed keywords that open a heading: appendix, references, attached table.
_TITLE_KEYWORD_RE = re.compile(r'^附录|^参考文献|^附表')


def is_title(line: str) -> bool:
    """Return True when *line* looks like a section heading.

    The line is stripped of surrounding whitespace and then tested against
    the numbering patterns and the heading keywords defined above.

    Args:
        line: A single line of text.

    Returns:
        True if either pattern matches the stripped line, otherwise False.
    """
    stripped = line.strip()
    if _TITLE_NUMBERING_RE.search(stripped):
        return True
    return _TITLE_KEYWORD_RE.search(stripped) is not None

def export_image(image: LTImage, path: str) -> str:
    """Save an LTImage to disk and return the path of the written file.

    The output format is chosen from the stream filters, the bit depth and
    the colour space of the image:

    * a single DCT filter   -> ``_save_jpeg`` (``.jpg``)
    * a single JPX filter   -> ``_save_jpeg2000`` (``.png``)
    * 1 bit per component   -> ``_save_bmp`` (1-bit ``.bmp``)
    * 8-bit DeviceRGB       -> ``_save_bmp`` (24-bit ``.bmp``)
    * 8-bit DeviceGray      -> ``_save_bmp`` (8-bit ``.bmp``)
    * anything else         -> ``_save_raw`` (decoded bytes, ``.img``)

    Args:
        image: The image object taken from the page layout.
        path: Destination path WITHOUT extension; the extension is appended
            by the helper that writes the file.

    Returns:
        The full path (including extension) of the file that was written.
    """
    (width, height) = image.srcsize

    filters = image.stream.get_filters()

    if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
        return _save_jpeg(image, path)

    if len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
        return _save_jpeg2000(image, path)

    if image.bits == 1:
        # One bit per pixel: each row occupies ceil(width / 8) bytes.
        return _save_bmp(image, path, width, height, (width + 7) // 8, image.bits)

    if image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
        # Three 8-bit components per pixel -> 24-bit bitmap.
        return _save_bmp(image, path, width, height, width * 3, image.bits * 3)

    if image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
        return _save_bmp(image, path, width, height, width, image.bits)

    # Every remaining case (including FlateDecode-only streams with another
    # bit depth or colour space) is dumped as decoded sample bytes; no image
    # container is produced for these.
    return _save_raw(image, path)


def _save_bmp(
    image: LTImage,
    path: str,
    width: int,
    height: int,
    bytes_per_line: int,
    bits: int,
) -> str:
    """Write the decoded samples of *image* to ``path + ".bmp"``.

    Args:
        image: Image whose decoded stream data is written.
        path: Destination path without extension.
        width: Image width in pixels.
        height: Image height in pixels.
        bytes_per_line: Number of bytes one row occupies in the stream data.
        bits: Bits per pixel of the bitmap (1, 8 or 24).

    Returns:
        The path of the written bitmap file.
    """
    # Local import: only this fallback needs the bitmap writer.
    from pdfminer.image import BMPWriter

    path = path + ".bmp"
    data = image.stream.get_data()
    with open(path, "wb") as fp:
        bmp = BMPWriter(fp, bits, width, height)
        for row in range(height):
            start = row * bytes_per_line
            bmp.write_line(row, data[start:start + bytes_per_line])
    return path


def _save_raw(image: LTImage, path: str) -> str:
    """Dump the decoded stream bytes of *image* next to *path*.

    The bit depth and pixel size are encoded in the file extension
    (``.<bits>.<width>x<height>.img``) so the data can be interpreted later.

    Returns:
        The path of the written file.
    """
    width, height = image.srcsize
    path = f"{path}.{image.bits}.{width}x{height}.img"
    with open(path, "wb") as fp:
        fp.write(image.stream.get_data())
    return path

def _save_jpeg(image: LTImage, path: str) -> str:
    """Save a JPEG (DCT) encoded image as ``path + ".jpg"``.

    The raw stream bytes are written unchanged, except for images whose
    colour space contains DeviceCMYK: those are opened with Pillow, inverted,
    converted to RGB and re-encoded as JPEG.

    Args:
        image: Image whose raw stream data is a JPEG file.
        path: Destination path without extension.

    Returns:
        The path of the written ``.jpg`` file.

    Raises:
        ValueError: If the stream has no raw data.
        ImportError: If a CMYK image is encountered and Pillow is missing.
    """
    raw_data = image.stream.get_rawdata()
    if raw_data is None:
        raise ValueError("image stream has no raw data to save as JPEG")

    path = path + ".jpg"

    if LITERAL_DEVICE_CMYK not in image.colorspace:
        with open(path, "wb") as fp:
            fp.write(raw_data)
        return path

    try:
        from PIL import Image, ImageChops  # type: ignore[import]
    except ImportError as err:
        raise ImportError(
            "Could not import Pillow, which is required to convert CMYK "
            "JPEG images. Install it with `pip install Pillow`."
        ) from err

    # Decode and convert before opening the output file, so that a failure
    # here does not leave an empty .jpg behind.
    converted = ImageChops.invert(Image.open(BytesIO(raw_data))).convert("RGB")
    with open(path, "wb") as fp:
        converted.save(fp, "JPEG")

    return path

def _save_jpeg2000(image: LTImage, path: str) -> str:
    """Decode a JPEG 2000 (JPX) image and save it as ``path + ".png"``.

    The raw codestream is not written directly; it is decoded with Pillow,
    normalised to RGB and written as PNG through OpenCV.

    Args:
        image: Image whose raw stream data is JPEG 2000 encoded.
        path: Destination path without extension.

    Returns:
        The path of the written ``.png`` file.

    Raises:
        ValueError: If the stream has no raw data.
        ImportError: If Pillow is not installed.
        OSError: If OpenCV reports that the file could not be written.
    """
    raw_data = image.stream.get_rawdata()
    if raw_data is None:
        raise ValueError("image stream has no raw data to save as JPEG 2000")

    path = path + ".png"

    try:
        from PIL import Image  # type: ignore[import]
    except ImportError as err:
        raise ImportError(
            "Could not import Pillow, which is required to decode JPEG 2000 "
            "images. Install it with `pip install Pillow`."
        ) from err

    decoded = Image.open(BytesIO(raw_data))
    # Normalise to 3-channel RGB first: COLOR_RGB2BGR rejects single-channel
    # (e.g. grayscale) arrays.
    rgb_pixels = np.array(decoded.convert("RGB"))
    bgr_pixels = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
    # imwrite signals failure through its return value instead of raising.
    if not cv2.imwrite(path, bgr_pixels):
        raise OSError(f"Could not write image file: {path}")
    return path

def main_parse(pdf_path: str, title_path: str, image_dir: str) -> None:
    """Extract title candidates and embedded images from a PDF.

    Every page is scanned once. Single-line horizontal text boxes that either
    match ``is_title`` or are taller than 15 layout units are recorded as
    titles. Every ``LTImage`` that is a direct child of an ``LTFigure`` is
    exported to *image_dir* through ``export_image``.

    Args:
        pdf_path: Path of the PDF file to read.
        title_path: Path of the JSON file that receives the title list. The
            file is overwritten, so it always holds one valid JSON document.
        image_dir: Directory for the exported images; created if missing.

    Returns:
        None. Results are written to *title_path* and *image_dir*.
    """
    os.makedirs(image_dir, exist_ok=True)

    texts = []
    # Read the PDF and walk its pages; page numbers are zero-based.
    for page_number, page_layout in enumerate(extract_pages(pdf_path)):
        # Both counters restart on every page.
        title_index = 0
        image_index = 0
        for element in page_layout:
            if isinstance(element, LTTextBoxHorizontal):
                # Only text boxes holding exactly one line are considered.
                if len(element) != 1:
                    continue
                text = element.get_text().strip()
                # Assume a title is usually a single line set in a larger font.
                if text and (is_title(text) or element.height > 15):
                    texts.append({'index': title_index, 'pageno': page_number, 'bbox': element.bbox, 'text': text})
                    title_index += 1
            elif isinstance(element, LTFigure):
                for e_obj in element:
                    if not isinstance(e_obj, LTImage):
                        continue
                    # Extract the image data; the extension is added on export.
                    image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
                    image_file = export_image(e_obj, image_file)
                    print(f'Image saved: {image_file}')
                    image_index += 1

    # Write mode, not append: appending a second JSON array to an existing
    # file would make it unparseable.
    with open(title_path, 'w', encoding='utf-8') as fp:
        json.dump(texts, fp, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    # Hard-coded input and output locations for a manual run of the parser.
    run_args = {
        'pdf_path': './投标文件-修改版9-5-1-1.pdf',
        'title_path': './投标文件-修改版9-5-1-1.json',
        'image_dir': './extracted_images',
    }
    # The image directory must exist before images are exported into it.
    os.makedirs(run_args['image_dir'], exist_ok=True)
    main_parse(**run_args)