2 tahun lalu · 8b64deab40
--- a/README.md
+++ b/README.md
@@ -47,7 +47,8 @@ Embedchain empowers you to create ChatGPT like apps, on your own dynamic dataset
 
				 * Doc file
			
 
				 * JSON file
			
 
				 * Code documentation website loader
			
 
				-* Notion and many more.
			
 
				+* Notion
			
 
				+* Unstructured file loader and many more.
			
 
				 
			
 
				 You can find the full list of data types on [our documentation](https://docs.embedchain.ai/data-sources/csv).
			
 
				 
			
--- a/embedchain/chunkers/unstructured_file.py
+++ b/embedchain/chunkers/unstructured_file.py
@@ -0,0 +1,22 @@
 
				+from typing import Optional
			
 
				+
			
 
				+from langchain.text_splitter import RecursiveCharacterTextSplitter
			
 
				+
			
 
				+from embedchain.chunkers.base_chunker import BaseChunker
			
 
				+from embedchain.config.add_config import ChunkerConfig
			
 
				+from embedchain.helper.json_serializable import register_deserializable
			
 
				+
			
 
				+
			
 
				+@register_deserializable
				+class UnstructuredFileChunker(BaseChunker):
				+    """Chunker used for content loaded from unstructured files."""
				+
				+    def __init__(self, config: Optional[ChunkerConfig] = None):
				+        """Build a recursive character splitter and pass it to the base chunker.
				+
				+        Args:
				+            config: Chunking settings. When ``None``, a ``ChunkerConfig`` with
				+                ``chunk_size=1000``, ``chunk_overlap=0`` and
				+                ``length_function=len`` is used.
				+        """
				+        settings = (
				+            config
				+            if config is not None
				+            else ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
				+        )
				+        splitter_options = {
				+            "chunk_size": settings.chunk_size,
				+            "chunk_overlap": settings.chunk_overlap,
				+            "length_function": settings.length_function,
				+        }
				+        super().__init__(RecursiveCharacterTextSplitter(**splitter_options))
			
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -6,6 +6,7 @@ from embedchain.chunkers.json import JSONChunker
 
				 from embedchain.chunkers.mdx import MdxChunker
			
 
				 from embedchain.chunkers.notion import NotionChunker
			
 
				 from embedchain.chunkers.pdf_file import PdfFileChunker
			
 
				+from embedchain.chunkers.unstructured_file import UnstructuredFileChunker
			
 
				 from embedchain.chunkers.qna_pair import QnaPairChunker
			
 
				 from embedchain.chunkers.sitemap import SitemapChunker
			
 
				 from embedchain.chunkers.table import TableChunker
			
@@ -30,6 +31,7 @@ from embedchain.loaders.sitemap import SitemapLoader
 
				 from embedchain.loaders.web_page import WebPageLoader
			
 
				 from embedchain.loaders.xml import XmlLoader
			
 
				 from embedchain.loaders.youtube_video import YoutubeVideoLoader
			
 
				+from embedchain.loaders.unstructured_file import UnstructuredLoader
			
 
				 from embedchain.models.data_type import DataType
			
 
				 
			
 
				 
			
@@ -77,6 +79,7 @@ class DataFormatter(JSONSerializable):
 
				             DataType.CSV: CsvLoader,
			
 
				             DataType.MDX: MdxLoader,
			
 
				             DataType.IMAGES: ImagesLoader,
			
 
				+            DataType.UNSTRUCTURED: UnstructuredLoader,
			
 
				             DataType.JSON: JSONLoader,
			
 
				         }
			
 
				         lazy_loaders = {DataType.NOTION}
			
@@ -119,6 +122,7 @@ class DataFormatter(JSONSerializable):
 
				             DataType.MDX: MdxChunker,
			
 
				             DataType.IMAGES: ImagesChunker,
			
 
				             DataType.XML: XmlChunker,
			
 
				+            DataType.UNSTRUCTURED: UnstructuredFileChunker,
			
 
				             DataType.JSON: JSONChunker,
			
 
				         }
			
 
				         if data_type in chunker_classes:
			
--- a/embedchain/loaders/unstructured_file.py
+++ b/embedchain/loaders/unstructured_file.py
@@ -0,0 +1,40 @@
 
				+import hashlib
			
 
				+
			
 
				+try:
				+    from langchain.document_loaders import UnstructuredFileLoader
				+except ImportError:  # dependency missing: replace the raw error with an install hint
				+    raise ImportError(
				+        'Unstructured file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
				+    ) from None
			
 
				+from embedchain.helper.json_serializable import register_deserializable
			
 
				+from embedchain.loaders.base_loader import BaseLoader
			
 
				+from embedchain.utils import clean_string
			
 
				+
			
 
				+
			
 
				+@register_deserializable
				+class UnstructuredLoader(BaseLoader):
				+    """Loader that reads a file through langchain's ``UnstructuredFileLoader``."""
				+
				+    def load_data(self, url):
				+        """Load data from an unstructured file.
				+
				+        Args:
				+            url: Location handed to ``UnstructuredFileLoader``; it is also stored
				+                under the ``"url"`` key of every entry's metadata.
				+
				+        Returns:
				+            A dict with two keys: ``"doc_id"``, the SHA-256 hex digest of the
				+            space-joined cleaned contents followed by ``url``, and ``"data"``,
				+            a list with one ``{"content": ..., "meta_data": ...}`` dict per
				+            page returned by ``load_and_split()``.
				+
				+        Raises:
				+            ValueError: If the loader returns no pages.
				+        """
				+        pages = UnstructuredFileLoader(url).load_and_split()
				+        if not pages:
				+            raise ValueError("No data found")
				+
				+        data = []
				+        for page in pages:
				+            content = clean_string(page.page_content)
				+            # Copy the metadata so the loaded document object is not mutated.
				+            meta_data = {**page.metadata, "url": url}
				+            data.append(
				+                {
				+                    "content": content,
				+                    "meta_data": meta_data,
				+                }
				+            )
				+
				+        # The id depends on both the cleaned text and the source location.
				+        all_content = " ".join(entry["content"] for entry in data)
				+        doc_id = hashlib.sha256((all_content + url).encode()).hexdigest()
				+        return {
				+            "doc_id": doc_id,
				+            "data": data,
				+        }
			
--- a/embedchain/models/data_type.py
+++ b/embedchain/models/data_type.py
@@ -25,6 +25,7 @@ class IndirectDataType(Enum):
 
				     CSV = "csv"
			
 
				     MDX = "mdx"
			
 
				     IMAGES = "images"
			
 
				+    UNSTRUCTURED = 'unstructured'
			
 
				     JSON = "json"
			
 
				 
			
 
				 
			
@@ -50,4 +51,5 @@ class DataType(Enum):
 
				     MDX = IndirectDataType.MDX.value
			
 
				     QNA_PAIR = SpecialDataType.QNA_PAIR.value
			
 
				     IMAGES = IndirectDataType.IMAGES.value
			
 
				+    UNSTRUCTURED = IndirectDataType.UNSTRUCTURED.value
			
 
				     JSON = IndirectDataType.JSON.value