Parcourir la source

add j2k picture extract

sprivacy il y a 1 an
Parent
commit
ee3755c347
2 fichiers modifiés avec 59 ajouts et 2 suppressions
  1. 21 2
      get_info.py
  2. 38 0
      responser.py

+ 21 - 2
get_info.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-07-04 09:59:10
+# @Last Modified time: 2024-07-25 16:36:24
 
 # import os
 
@@ -173,6 +173,9 @@ def export_image(image: LTImage, path: str) -> str:
             with open(path, 'wb') as file:
                 file.write(data)
             return path
+        elif data[:8] == b'\xffO\xffQ\x00/\x00\x00':
+            name = _save_j2k(image, path)
+            return name
         else:
             path += '.unk'
             with open(path, 'wb') as file:
@@ -212,6 +215,21 @@ def export_image(image: LTImage, path: str) -> str:
     else:
         return None
 
+def _save_j2k(image: LTImage, path: str) -> str:
+    """Decode the image's stream data with Pillow and write it as a PNG to ``path + '.png'``; return that path."""
+    try:
+        from PIL import Image
+    except ImportError as err:
+        raise ImportError(PIL_ERROR_MESSAGE) from err  # keep the root cause visible
+    data = image.stream.get_data()
+    assert data is not None
+    path = path + ".png"  # Pillow picks the output format from this suffix
+
+    # The context manager closes the decoded image once the PNG is written.
+    with Image.open(BytesIO(data)) as decoded:
+        decoded.save(path)
+    return path
+
 def _save_jpeg(image: LTImage, path: str) -> str:
     """Save a JPEG encoded image"""
     raw_data = image.stream.get_rawdata()
@@ -392,7 +410,7 @@ class PdfExtractAttr(object):
         with pdfplumber.open(self.file_path) as pdf:
             for page in pdf.pages:
                 self.content.append({
-                    'page_number': page.page_number,
+                    'page_number': page.page_number - 1,
                     'text': page.extract_text()
                 })
         return self.content
@@ -503,6 +521,7 @@ if __name__ == '__main__':
     main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
     # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_title='六、已标价工程量清单', end_title = '七、施工组织设计')
     # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_page_number=0, end_page_number=725)
+
     agent = PdfExtractAttr(file_path=pdf_path)
     print(agent.extract_content())
     agent.parse_outline()

+ 38 - 0
responser.py

@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+# @Author: privacy
+# @Date:   2024-07-24 14:11:01
+# @Last Modified by:   privacy
+# @Last Modified time: 2024-07-24 14:31:52
+from typing import Optional
+from dataclasses import dataclass
+
+
+@dataclass
+class Pages:  # page reference for one file: name, optional key, start page and optional end page
+    fileName: str
+    pageKey: Optional[str]
+    pageStart: str
+    pageEnd: Optional[str] = None  # was a duplicate `pageStart` (a no-op); presumably the range end — TODO confirm
+
+
+@dataclass
+class Suppliers:  # one entry with a name, a grade, an optional supplier string and optional page records
+    name: str
+    grade: str
+    supplier: Optional[str]  # required argument, but may be None
+    pages: Optional[list[Pages]]  # None or a list of Pages records; `list[...]` needs Python 3.9+
+
+
+@dataclass
+class ScoringCriteria:  # one scoring entry plus the list of Suppliers records attached to it
+    scoringFactors: str
+    scoringStandard: str
+    percentage: str  # declared as a string, not a numeric type
+    expertAdvice: Optional[str]  # required argument, but may be None
+    suppliers: list[Suppliers]  # required; each item is a Suppliers record
+
+
+@dataclass
+class DetailResult:  # outermost record here: DetailResult -> ScoringCriteria -> Suppliers -> Pages
+    name: str
+    scoringCriteria: list[ScoringCriteria]  # required; each item is a ScoringCriteria record