|
@@ -2,7 +2,7 @@
|
|
# @Author: privacy
|
|
# @Author: privacy
|
|
# @Date: 2024-06-11 13:43:14
|
|
# @Date: 2024-06-11 13:43:14
|
|
# @Last Modified by: privacy
|
|
# @Last Modified by: privacy
|
|
-# @Last Modified time: 2024-07-03 16:44:17
|
|
|
|
|
|
+# @Last Modified time: 2024-07-04 09:59:10
|
|
|
|
|
|
# import os
|
|
# import os
|
|
|
|
|
|
@@ -124,26 +124,94 @@ def export_image(image: LTImage, path: str) -> str:
|
|
|
|
|
|
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
|
|
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
|
|
name = _save_jpeg(image, path)
|
|
name = _save_jpeg(image, path)
|
|
|
|
+ return name
|
|
|
|
|
|
elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
|
|
elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
|
|
name = _save_jpeg2000(image, path)
|
|
name = _save_jpeg2000(image, path)
|
|
|
|
+ return name
|
|
|
|
|
|
- elif image.bits == 1:
|
|
|
|
- name = _save_bmp(image, width, height, (width + 7) // 8, image.bits, path)
|
|
|
|
|
|
+ # elif image.bits == 1:
|
|
|
|
+ # name = _save_bmp(image, width, height, (width + 7) // 8, image.bits, path)
|
|
|
|
|
|
- elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
|
|
|
|
- name = _save_bmp(image, width, height, width * 3, image.bits * 3, path)
|
|
|
|
|
|
+ # elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
|
|
|
|
+ # name = _save_bmp(image, width, height, width * 3, image.bits * 3, path)
|
|
|
|
|
|
- elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
|
|
|
|
- name = _save_bmp(image, width, height, width, image.bits, path)
|
|
|
|
|
|
+ # elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
|
|
|
|
+ # name = _save_bmp(image, width, height, width, image.bits, path)
|
|
|
|
|
|
- elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
|
|
|
|
- name = _save_bytes(image)
|
|
|
|
|
|
+ # elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
|
|
|
|
+ # name = _save_bytes(image)
|
|
|
|
|
|
|
|
+ # else:
|
|
|
|
+ # name = _save_raw(image)
|
|
|
|
+ data = image.stream.get_data()
|
|
|
|
+ raw_data = image.stream.get_rawdata()
|
|
|
|
+
|
|
|
|
+ if data:
|
|
|
|
+ if data[:2] == b'\xff\xd8' and data[-2:] == b'\xff\xd9':
|
|
|
|
+ path += '.jpg'
|
|
|
|
+ with open(path, 'wb') as file:
|
|
|
|
+ file.write(data)
|
|
|
|
+ return path
|
|
|
|
+ elif data[:8] == b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a':
|
|
|
|
+ path += '.png'
|
|
|
|
+ with open(path, 'wb') as file:
|
|
|
|
+ file.write(data)
|
|
|
|
+ return path
|
|
|
|
+ elif data[:2] == b'\x42\x4d':
|
|
|
|
+ path += '.bmp'
|
|
|
|
+ with open(path, 'wb') as file:
|
|
|
|
+ file.write(data)
|
|
|
|
+ return path
|
|
|
|
+ elif data[:6] == b'\x47\x49\x46\x38\x37\x61' or data[:6] == b'\x47\x49\x46\x38\x39\x61':
|
|
|
|
+ path += '.gif'
|
|
|
|
+ with open(path, 'wb') as file:
|
|
|
|
+ file.write(data)
|
|
|
|
+ return path
|
|
|
|
+ elif data[:2] == b'\x4d\x4d' or data[:2] == b'\x49\x49':
|
|
|
|
+ path += '.tiff'
|
|
|
|
+ with open(path, 'wb') as file:
|
|
|
|
+ file.write(data)
|
|
|
|
+ return path
|
|
|
|
+ else:
|
|
|
|
+ path += '.unk'
|
|
|
|
+ with open(path, 'wb') as file:
|
|
|
|
+ file.write(data)
|
|
|
|
+ return path
|
|
|
|
+ elif raw_data:
|
|
|
|
+ if raw_data[:2] == b'\xff\xd8' and raw_data[-2:] == b'\xff\xd9':
|
|
|
|
+ path += '.jpg'
|
|
|
|
+ with open(path, 'wb') as file:
|
|
|
|
+ file.write(raw_data)
|
|
|
|
+ return path
|
|
|
|
+ elif raw_data[:8] == b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a':
|
|
|
|
+ path += '.png'
|
|
|
|
+ with open(path, 'wb') as file:
|
|
|
|
+ file.write(raw_data)
|
|
|
|
+ return path
|
|
|
|
+ elif raw_data[:2] == b'\x42\x4d':
|
|
|
|
+ path += '.bmp'
|
|
|
|
+ with open(path, 'wb') as file:
|
|
|
|
+ file.write(raw_data)
|
|
|
|
+ return path
|
|
|
|
+ elif raw_data[:6] == b'\x47\x49\x46\x38\x37\x61' or raw_data[:6] == b'\x47\x49\x46\x38\x39\x61':
|
|
|
|
+ path += '.gif'
|
|
|
|
+ with open(path, 'wb') as file:
|
|
|
|
+ file.write(raw_data)
|
|
|
|
+ return path
|
|
|
|
+ elif raw_data[:2] == b'\x4d\x4d' or raw_data[:2] == b'\x49\x49':
|
|
|
|
+ path += '.tiff'
|
|
|
|
+ with open(path, 'wb') as file:
|
|
|
|
+ file.write(raw_data)
|
|
|
|
+ return path
|
|
|
|
+ else:
|
|
|
|
+ path += '.unk'
|
|
|
|
+ with open(path, 'wb') as file:
|
|
|
|
+ file.write(raw_data)
|
|
|
|
+ return path
|
|
else:
|
|
else:
|
|
- name = _save_raw(image)
|
|
|
|
|
|
+ return None
|
|
|
|
|
|
- return name
|
|
|
|
|
|
|
|
def _save_jpeg(image: LTImage, path: str) -> str:
|
|
def _save_jpeg(image: LTImage, path: str) -> str:
|
|
"""Save a JPEG encoded image"""
|
|
"""Save a JPEG encoded image"""
|
|
@@ -191,15 +259,11 @@ def _save_jpeg2000(image: LTImage, path: str) -> str:
|
|
|
|
|
|
def _save_bmp(image: LTImage, width: int, height: int, bytes_per_line: int, bits: int, path: str) -> str:
|
|
def _save_bmp(image: LTImage, width: int, height: int, bytes_per_line: int, bits: int, path: str) -> str:
|
|
"""Save a BMP encoded image"""
|
|
"""Save a BMP encoded image"""
|
|
|
|
+ data = image.stream.get_data()
|
|
path = path + ".bmp"
|
|
path = path + ".bmp"
|
|
with open(path, "wb") as fp:
|
|
with open(path, "wb") as fp:
|
|
- bmp = BMPWriter(fp, bits, width, height)
|
|
|
|
- data = image.stream.get_data()
|
|
|
|
- i = 0
|
|
|
|
- for y in range(height):
|
|
|
|
- bmp.write_line(y, data[i : i + bytes_per_line])
|
|
|
|
- i += bytes_per_line
|
|
|
|
- return name
|
|
|
|
|
|
+ fp.write(data)
|
|
|
|
+ return path
|
|
|
|
|
|
def main_parse(pdf_path: str, title_path: str, image_dir: str) -> None:
|
|
def main_parse(pdf_path: str, title_path: str, image_dir: str) -> None:
|
|
texts = []
|
|
texts = []
|
|
@@ -421,8 +485,10 @@ class PdfExtractAttr(object):
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
- pdf_path = './投标文件-修改版9-5-1-1.pdf'
|
|
|
|
- title_path = './投标文件-修改版9-5-1-1.json'
|
|
|
|
|
|
+ # pdf_path = './投标文件-修改版9-5-1-1.pdf'
|
|
|
|
+ pdf_path = './南方电网数字研究院有限公司.pdf'
|
|
|
|
+ # title_path = './投标文件-修改版9-5-1-1.json'
|
|
|
|
+ title_path = './南方电网数字研究院有限公司.json'
|
|
image_dir = './extracted_images'
|
|
image_dir = './extracted_images'
|
|
os.makedirs(image_dir, exist_ok=True)
|
|
os.makedirs(image_dir, exist_ok=True)
|
|
main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
|
|
main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
|