Ver código fonte

添加BMP图形保存,添加摘要生成、关键词检测

sprivacy 1 ano atrás
pai
commit
59d20e64a0
3 arquivos alterados com 69 adições e 4 exclusões
  1. 17 4
      get_info.py
  2. 42 0
      lmu.py
  3. 10 0
      requirements.txt

+ 17 - 4
get_info.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-06-11 14:10:56
+# @Last Modified time: 2024-07-03 16:44:17
 
 # import os
 
@@ -98,6 +98,7 @@ from pdfminer.pdftypes import (
 )
 from pdfminer.pdfparser import PDFParser, PDFSyntaxError
 from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
+from pdfminer.image import BMPWriter
 import pdfplumber
 
 # 自定义包导入
@@ -128,13 +129,13 @@ def export_image(image: LTImage, path: str) -> str:
         name = _save_jpeg2000(image, path)
 
     elif image.bits == 1:
-        name = _save_bmp(image, width, height, (width + 7) // 8, image.bits)
+        name = _save_bmp(image, width, height, (width + 7) // 8, image.bits, path)
 
     elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
-        name = _save_bmp(image, width, height, width * 3, image.bits * 3)
+        name = _save_bmp(image, width, height, width * 3, image.bits * 3, path)
 
     elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
-        name = _save_bmp(image, width, height, width, image.bits)
+        name = _save_bmp(image, width, height, width, image.bits, path)
 
     elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
         name = _save_bytes(image)
@@ -188,6 +189,18 @@ def _save_jpeg2000(image: LTImage, path: str) -> str:
     cv2.imwrite(path, opencv_image)
     return path
 
+def _save_bmp(image: LTImage, width: int, height: int, bytes_per_line: int, bits: int, path: str) -> str:
+    """Write the raw stream data of ``image`` to disk as a BMP file.
+
+    Args:
+        image: Image whose ``stream.get_data()`` bytes are written row by row.
+        width: Image width handed to ``BMPWriter``.
+        height: Number of rows to write.
+        bytes_per_line: Number of stream bytes consumed for each row.
+        bits: Bits-per-pixel value handed to ``BMPWriter``.
+        path: Destination path without extension; ``.bmp`` is appended.
+
+    Returns:
+        The path of the written file, including the ``.bmp`` suffix.
+    """
+    path = path + ".bmp"
+    # Read the stream before opening the file so a failure here does not
+    # leave an empty .bmp behind.
+    data = image.stream.get_data()
+    with open(path, "wb") as fp:
+        bmp = BMPWriter(fp, bits, width, height)
+        for y in range(height):
+            start = y * bytes_per_line
+            bmp.write_line(y, data[start : start + bytes_per_line])
+    # Return the file that was written; ``name`` was never defined in this
+    # function, so the previous ``return name`` raised NameError.
+    return path
+
 def main_parse(pdf_path: str, title_path: str, image_dir: str) -> None:
     texts = []
     images = []

+ 42 - 0
lmu.py

@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+# @Author: privacy
+# @Date:   2024-07-03 11:14:27
+# @Last Modified by:   privacy
+# @Last Modified time: 2024-07-03 15:11:07
+import logging
+from textrank4zh import TextRank4Keyword, TextRank4Sentence
+from paddlenlp import Taskflow
+
+
+class LMU:
+    """Produce a summary and a keyword list for a piece of text.
+
+    Keywords are extracted with ``TextRank4Keyword``; the summary is taken
+    from a PaddleNLP ``Taskflow`` text-summarization pipeline.  Call
+    ``run`` first, then read the results through the getters.
+    """
+
+    def __init__(self):
+        self.tr4w = TextRank4Keyword()
+        # The sentence ranker is constructed but not used by run().
+        self.tr4s = TextRank4Sentence()
+
+        self.summarizer = Taskflow("text_summarization", model="unimo-text-1.0-summary")
+
+        # Start with empty results so the getters do not raise
+        # AttributeError when called before run().
+        self.summary = ""
+        self.key_words = []
+
+    def run(self, text: str, topK: int = 5) -> None:
+        """Analyse ``text`` and store its summary and keywords on the instance.
+
+        Args:
+            text: Text to summarise and extract keywords from.
+            topK: Value passed to ``TextRank4Keyword.get_keywords``.
+        """
+        # The summarizer returns a sequence; only its first element is kept.
+        self.summary = self.summarizer(text)[0]
+
+        self.tr4w.analyze(text, lower=True)
+        self.key_words = self.tr4w.get_keywords(topK)
+
+    def get_summary(self) -> str:
+        """Return the summary stored by the last ``run`` call ("" if none)."""
+        return self.summary
+
+    def get_key_words(self) -> list:
+        """Return the ``word`` of each keyword stored by the last ``run`` call."""
+        return [item.word for item in self.key_words]
+
+
+if __name__ == '__main__':
+    # Demo run: summarise a sample paragraph and print its keywords.
+    sample_text = 'PaddleNLP是一个基于PaddlePaddle深度学习框架的自然语言处理工具包,提供了丰富的文本处理功能。关键词提取是其中一个重要的功能。TextRank算法是PaddleNLP中常用的关键词提取算法,它通过计算词语之间的权重得到关键词。'
+    extractor = LMU()
+    extractor.run(sample_text)
+    print(extractor.get_summary())
+    print(extractor.get_key_words())

+ 10 - 0
requirements.txt

@@ -0,0 +1,10 @@
+numpy==1.24.4
+pandas==1.4.2
+opencv-python==4.6.0.66
+pdfminer.six==20231228
+pdfplumber==0.11.1
+torch==2.3.0
+scikit-learn==1.1.1
+transformers==4.41.2
+textrank4zh==0.3
+jieba==0.42.1