|
@@ -2,7 +2,7 @@
|
|
|
# @Author: privacy
|
|
|
# @Date: 2024-06-11 13:43:14
|
|
|
# @Last Modified by: privacy
|
|
|
-# @Last Modified time: 2024-06-11 14:10:56
|
|
|
+# @Last Modified time: 2024-07-03 16:44:17
|
|
|
|
|
|
# import os
|
|
|
|
|
@@ -98,6 +98,7 @@ from pdfminer.pdftypes import (
|
|
|
)
|
|
|
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
|
|
|
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
|
|
|
+from pdfminer.image import BMPWriter
|
|
|
import pdfplumber
|
|
|
|
|
|
# 自定义包导入
|
|
@@ -128,13 +129,13 @@ def export_image(image: LTImage, path: str) -> str:
|
|
|
name = _save_jpeg2000(image, path)
|
|
|
|
|
|
elif image.bits == 1:
|
|
|
- name = _save_bmp(image, width, height, (width + 7) // 8, image.bits)
|
|
|
+ name = _save_bmp(image, width, height, (width + 7) // 8, image.bits, path)
|
|
|
|
|
|
elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
|
|
|
- name = _save_bmp(image, width, height, width * 3, image.bits * 3)
|
|
|
+ name = _save_bmp(image, width, height, width * 3, image.bits * 3, path)
|
|
|
|
|
|
elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
|
|
|
- name = _save_bmp(image, width, height, width, image.bits)
|
|
|
+ name = _save_bmp(image, width, height, width, image.bits, path)
|
|
|
|
|
|
elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
|
|
|
name = _save_bytes(image)
|
|
@@ -188,6 +189,18 @@ def _save_jpeg2000(image: LTImage, path: str) -> str:
|
|
|
cv2.imwrite(path, opencv_image)
|
|
|
return path
|
|
|
|
|
|
def _save_bmp(image: LTImage, width: int, height: int, bytes_per_line: int, bits: int, path: str) -> str:
    """Save an image's raw stream data as a BMP file.

    Args:
        image: Image whose ``stream.get_data()`` supplies the pixel bytes.
        width: Image width, forwarded to ``BMPWriter``.
        height: Image height; one row is written per unit of height.
        bytes_per_line: Number of bytes of ``data`` consumed for each row.
        bits: Bits per pixel, forwarded to ``BMPWriter``.
        path: Output path without extension; ``".bmp"`` is appended.

    Returns:
        The path of the file that was written (``path`` + ``".bmp"``).
    """
    path = path + ".bmp"
    with open(path, "wb") as fp:
        bmp = BMPWriter(fp, bits, width, height)
        data = image.stream.get_data()
        for y in range(height):
            # Each row occupies a fixed-size slice of the stream data.
            start = y * bytes_per_line
            bmp.write_line(y, data[start : start + bytes_per_line])
    # Return the written file's path; the previous `return name` referenced
    # an undefined variable and raised NameError on every call.
    return path
|
|
|
+
|
|
|
def main_parse(pdf_path: str, title_path: str, image_dir: str) -> None:
|
|
|
texts = []
|
|
|
images = []
|