1 ano atrás · 59d20e64a0
--- a/get_info.py
+++ b/get_info.py
@@ -2,7 +2,7 @@
 
				 # @Author: privacy
			
 
				 # @Date:   2024-06-11 13:43:14
			
 
				 # @Last Modified by:   privacy
			
 
				-# @Last Modified time: 2024-06-11 14:10:56
			
 
				+# @Last Modified time: 2024-07-03 16:44:17
			
 
				 
			
 
				 # import os
			
 
				 
			
@@ -98,6 +98,7 @@ from pdfminer.pdftypes import (
 
				 )
			
 
				 from pdfminer.pdfparser import PDFParser, PDFSyntaxError
			
 
				 from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
			
 
				+from pdfminer.image import BMPWriter
			
 
				 import pdfplumber
			
 
				 
			
 
				 # 自定义包导入
			
@@ -128,13 +129,13 @@ def export_image(image: LTImage, path: str) -> str:
 
				         name = _save_jpeg2000(image, path)
			
 
				 
			
 
				     elif image.bits == 1:
			
 
				-        name = _save_bmp(image, width, height, (width + 7) // 8, image.bits)
			
 
				+        name = _save_bmp(image, width, height, (width + 7) // 8, image.bits, path)
			
 
				 
			
 
				     elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
			
 
				-        name = _save_bmp(image, width, height, width * 3, image.bits * 3)
			
 
				+        name = _save_bmp(image, width, height, width * 3, image.bits * 3, path)
			
 
				 
			
 
				     elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
			
 
				-        name = _save_bmp(image, width, height, width, image.bits)
			
 
				+        name = _save_bmp(image, width, height, width, image.bits, path)
			
 
				 
			
 
				     elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
			
 
				         name = _save_bytes(image)
			
@@ -188,6 +189,18 @@ def _save_jpeg2000(image: LTImage, path: str) -> str:
 
				     cv2.imwrite(path, opencv_image)
			
 
				     return path
			
 
				 
			
 
				+def _save_bmp(image: LTImage, width: int, height: int, bytes_per_line: int, bits: int, path: str) -> str:
			
 
				+    """Save a BMP encoded image"""
			
 
				+    path = path + ".bmp"
			
 
				+    with open(path, "wb") as fp:
			
 
				+        bmp = BMPWriter(fp, bits, width, height)
			
 
				+        data = image.stream.get_data()
			
 
				+        i = 0
			
 
				+        for y in range(height):
			
 
				+            bmp.write_line(y, data[i : i + bytes_per_line])
			
 
				+            i += bytes_per_line
			
 
				+    return name
			
 
				+
			
 
				 def main_parse(pdf_path: str, title_path: str, image_dir: str) -> None:
			
 
				     texts = []
			
 
				     images = []
			
--- a/lmu.py
+++ b/lmu.py
@@ -0,0 +1,42 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+# @Author: privacy
			
 
				+# @Date:   2024-07-03 11:14:27
			
 
				+# @Last Modified by:   privacy
			
 
				+# @Last Modified time: 2024-07-03 15:11:07
			
 
				+import logging
			
 
				+from textrank4zh import TextRank4Keyword, TextRank4Sentence
			
 
				+from paddlenlp import Taskflow
			
 
				+
			
 
				+
			
 
				+class LMU(object):
			
 
				+    def __init__(self):
			
 
				+        self.tr4w = TextRank4Keyword()
			
 
				+        self.tr4s = TextRank4Sentence()
			
 
				+
			
 
				+        self.summarizer = Taskflow("text_summarization", model="unimo-text-1.0-summary")
			
 
				+
			
 
				+    def run(self, text: str, topK: int = 5) -> None:
			
 
				+        # self.keywords = jiabe.analyse.textrank(text, topK=20)
			
 
				+
			
 
				+        self.summary = self.summarizer(text)[0]
			
 
				+
			
 
				+        self.tr4w.analyze(text, lower=True)
			
 
				+        self.key_words = self.tr4w.get_keywords(topK)
			
 
				+
			
 
				+        # self.tr4s.analyze(text=text, lower=True, source='all_filters')
			
 
				+        # self.summary = [item.sentence for item in self.tr4s.get_key_sentences(topK)]
			
 
				+
			
 
				+    def get_summary(self) -> str:
			
 
				+        """提取摘要"""
			
 
				+        return self.summary
			
 
				+
			
 
				+    def get_key_words(self) -> list:
			
 
				+        """提取关键词"""
			
 
				+        return [item.word for item in self.key_words]
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    lmu = LMU()
			
 
				+    lmu.run('PaddleNLP是一个基于PaddlePaddle深度学习框架的自然语言处理工具包，提供了丰富的文本处理功能。关键词提取是其中一个重要的功能。TextRank算法是PaddleNLP中常用的关键词提取算法，它通过计算词语之间的权重得到关键词。')
			
 
				+    print(lmu.get_summary())
			
 
				+    print(lmu.get_key_words())
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,10 @@
 
				+numpy==1.24.4
			
 
				+pandas==1.4.2
			
 
				+opencv-python==4.6.0.66
			
 
				+pdfminer.six==20231228
			
 
				+pdfplumber==0.11.1
			
 
				+torch==2.3.0
			
 
				+scikit-learn==1.1.1
			
 
				+transformers==4.41.2
			
 
				+textrank4zh==0.3
			
 
				+jieba==0.42.1