|
@@ -2,7 +2,7 @@
|
|
|
# @Author: privacy
|
|
|
# @Date: 2024-06-11 13:43:14
|
|
|
# @Last Modified by: privacy
|
|
|
-# @Last Modified time: 2024-07-04 09:59:10
|
|
|
+# @Last Modified time: 2024-07-25 16:36:24
|
|
|
|
|
|
# import os
|
|
|
|
|
@@ -173,6 +173,9 @@ def export_image(image: LTImage, path: str) -> str:
|
|
|
with open(path, 'wb') as file:
|
|
|
file.write(data)
|
|
|
return path
|
|
|
+ elif data[:8] == b'\xffO\xffQ\x00/\x00\x00':
|
|
|
+ name = _save_j2k(image, path)
|
|
|
+ return name
|
|
|
else:
|
|
|
path += '.unk'
|
|
|
with open(path, 'wb') as file:
|
|
@@ -212,6 +215,21 @@ def export_image(image: LTImage, path: str) -> str:
|
|
|
else:
|
|
|
return None
|
|
|
|
|
|
def _save_j2k(image: LTImage, path: str) -> str:
    """Decode an embedded image with Pillow and re-save it as a PNG file.

    The decoded stream of *image* is opened with Pillow and written to
    ``path + ".png"``; Pillow picks the output format from that extension.
    Presumably the stream is a JPEG 2000 codestream, as the function name
    suggests -- decoding then relies on Pillow's JPEG 2000 support.

    Args:
        image: Image object whose ``stream.get_data()`` yields the encoded
            image bytes.
        path: Destination path without extension.

    Returns:
        The path of the written file (``path`` with ``".png"`` appended).

    Raises:
        ImportError: If Pillow is not installed.
    """
    try:
        from PIL import Image
    except ImportError as exc:
        # Chain explicitly so the original import failure stays visible.
        raise ImportError(PIL_ERROR_MESSAGE) from exc

    path = path + ".png"
    data = image.stream.get_data()
    # Kept as an assert so the failure type stays the same as before.
    assert data is not None

    # Image.open is lazy; the context manager guarantees the decoder and its
    # underlying buffer are released even if decoding or saving fails.
    with Image.open(BytesIO(data)) as decoded:
        decoded.save(path)

    return path
|
|
|
+
|
|
|
def _save_jpeg(image: LTImage, path: str) -> str:
|
|
|
"""Save a JPEG encoded image"""
|
|
|
raw_data = image.stream.get_rawdata()
|
|
@@ -392,7 +410,7 @@ class PdfExtractAttr(object):
|
|
|
with pdfplumber.open(self.file_path) as pdf:
|
|
|
for page in pdf.pages:
|
|
|
self.content.append({
|
|
|
- 'page_number': page.page_number,
|
|
|
+ 'page_number': page.page_number - 1,
|
|
|
'text': page.extract_text()
|
|
|
})
|
|
|
return self.content
|
|
@@ -503,6 +521,7 @@ if __name__ == '__main__':
|
|
|
main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
|
|
|
# tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_title='六、已标价工程量清单', end_title = '七、施工组织设计')
|
|
|
# tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_page_number=0, end_page_number=725)
|
|
|
+
|
|
|
agent = PdfExtractAttr(file_path=pdf_path)
|
|
|
print(agent.extract_content())
|
|
|
agent.parse_outline()
|