Browse Source

Support for Excel files (#1319)

Dev Khant 1 year ago
parent
commit
6c32d287b5

+ 18 - 0
docs/components/data-sources/excel-file.mdx

@@ -0,0 +1,18 @@
+---
+title: '📄 Excel file'
+---
+
+### Excel file
+
+To add an `.xlsx` or `.xls` file, set `data_type` to `excel_file`. Both remote URLs and local file paths are supported. For example:
+
+```python
+from embedchain import App
+
+app = App()
+app.add('https://example.com/content/intro.xlsx', data_type="excel_file")
+# Or add file using the local file path on your system
+# app.add('content/intro.xls', data_type="excel_file")
+
+app.query("Give brief information about data.")
+```

+ 22 - 0
embedchain/chunkers/excel_file.py

@@ -0,0 +1,22 @@
+from typing import Optional
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from embedchain.chunkers.base_chunker import BaseChunker
+from embedchain.config.add_config import ChunkerConfig
+from embedchain.helpers.json_serializable import register_deserializable
+
+
+@register_deserializable
+class ExcelFileChunker(BaseChunker):
+    """Chunker for Excel file."""
+
+    def __init__(self, config: Optional[ChunkerConfig] = None):
+        if config is None:
+            config = ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=config.chunk_size,
+            chunk_overlap=config.chunk_overlap,
+            length_function=config.length_function,
+        )
+        super().__init__(text_splitter)

+ 2 - 0
embedchain/data_formatter/data_formatter.py

@@ -80,6 +80,7 @@ class DataFormatter(JSONSerializable):
             DataType.SLACK: "embedchain.loaders.slack.SlackLoader",
             DataType.DROPBOX: "embedchain.loaders.dropbox.DropboxLoader",
             DataType.TEXT_FILE: "embedchain.loaders.text_file.TextFileLoader",
+            DataType.EXCEL_FILE: "embedchain.loaders.excel_file.ExcelFileLoader",
         }
 
         if data_type == DataType.CUSTOM or loader is not None:
@@ -127,6 +128,7 @@ class DataFormatter(JSONSerializable):
             DataType.SLACK: "embedchain.chunkers.common_chunker.CommonChunker",
             DataType.DROPBOX: "embedchain.chunkers.common_chunker.CommonChunker",
             DataType.TEXT_FILE: "embedchain.chunkers.common_chunker.CommonChunker",
+            DataType.EXCEL_FILE: "embedchain.chunkers.excel_file.ExcelFileChunker",
         }
 
         if chunker is not None:

+ 40 - 0
embedchain/loaders/excel_file.py

@@ -0,0 +1,40 @@
+import hashlib
+import importlib.util
+
+try:
+    from langchain_community.document_loaders import UnstructuredExcelLoader
+except ImportError:
+    raise ImportError(
+        'Excel file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
+    ) from None
+
+if importlib.util.find_spec("openpyxl") is None and importlib.util.find_spec("xlrd") is None:
+    raise ImportError("Excel file requires extra dependencies. Install with `pip install openpyxl xlrd`") from None
+
+from embedchain.helpers.json_serializable import register_deserializable
+from embedchain.loaders.base_loader import BaseLoader
+from embedchain.utils.misc import clean_string
+
+
+@register_deserializable
+class ExcelFileLoader(BaseLoader):
+    def load_data(self, excel_url):
+        """Load data from a Excel file."""
+        loader = UnstructuredExcelLoader(excel_url)
+        pages = loader.load_and_split()
+
+        data = []
+        for page in pages:
+            content = page.page_content
+            content = clean_string(content)
+
+            metadata = page.metadata
+            metadata["url"] = excel_url
+
+            data.append({"content": content, "meta_data": metadata})
+
+        doc_id = hashlib.sha256((content + excel_url).encode()).hexdigest()
+        return {
+            "doc_id": doc_id,
+            "data": data,
+        }

+ 2 - 0
embedchain/models/data_type.py

@@ -40,6 +40,7 @@ class IndirectDataType(Enum):
     SLACK = "slack"
     DROPBOX = "dropbox"
     TEXT_FILE = "text_file"
+    EXCEL_FILE = "excel_file"
 
 
 class SpecialDataType(Enum):
@@ -79,3 +80,4 @@ class DataType(Enum):
     SLACK = IndirectDataType.SLACK.value
     DROPBOX = IndirectDataType.DROPBOX.value
     TEXT_FILE = IndirectDataType.TEXT_FILE.value
+    EXCEL_FILE = IndirectDataType.EXCEL_FILE.value

+ 1 - 1
pyproject.toml

@@ -190,7 +190,7 @@ dataloaders=[
     "duckduckgo-search",
     "pytube",
     "sentence-transformers",
-    "unstructured",
+    "unstructured"
 ]
 vertexai = ["langchain-google-vertexai"]
 llama2 = ["replicate"]

+ 2 - 0
tests/chunkers/test_chunkers.py

@@ -2,6 +2,7 @@ from embedchain.chunkers.common_chunker import CommonChunker
 from embedchain.chunkers.discourse import DiscourseChunker
 from embedchain.chunkers.docs_site import DocsSiteChunker
 from embedchain.chunkers.docx_file import DocxFileChunker
+from embedchain.chunkers.excel_file import ExcelFileChunker
 from embedchain.chunkers.gmail import GmailChunker
 from embedchain.chunkers.google_drive import GoogleDriveChunker
 from embedchain.chunkers.json import JSONChunker
@@ -43,6 +44,7 @@ chunker_common_config = {
     DiscourseChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
     CommonChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
     GoogleDriveChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
+    ExcelFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
 }
 
 

+ 33 - 0
tests/loaders/test_excel_file.py

@@ -0,0 +1,33 @@
+import hashlib
+from unittest.mock import patch
+
+import pytest
+
+from embedchain.loaders.excel_file import ExcelFileLoader
+
+
+@pytest.fixture
+def excel_file_loader():
+    return ExcelFileLoader()
+
+
+def test_load_data(excel_file_loader):
+    mock_url = "mock_excel_file.xlsx"
+    expected_content = "Sample Excel Content"
+
+    # Mock the load_data method of the excel_file_loader instance
+    with patch.object(
+        excel_file_loader,
+        "load_data",
+        return_value={
+            "doc_id": hashlib.sha256((expected_content + mock_url).encode()).hexdigest(),
+            "data": [{"content": expected_content, "meta_data": {"url": mock_url}}],
+        },
+    ):
+        result = excel_file_loader.load_data(mock_url)
+
+    assert result["data"][0]["content"] == expected_content
+    assert result["data"][0]["meta_data"]["url"] == mock_url
+
+    expected_doc_id = hashlib.sha256((expected_content + mock_url).encode()).hexdigest()
+    assert result["doc_id"] == expected_doc_id