# xzc / mem0
# Unit tests for embedchain.chunkers.base_chunker.BaseChunker.

import hashlib
from unittest.mock import MagicMock

import pytest

from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.add_config import ChunkerConfig
from embedchain.models.data_type import DataType


@pytest.fixture
def text_splitter_mock():
    """Provide a fresh MagicMock standing in for the text splitter."""
    splitter = MagicMock()
    return splitter


@pytest.fixture
def loader_mock():
    """Provide a fresh MagicMock standing in for the data loader."""
    loader = MagicMock()
    return loader


@pytest.fixture
def app_id():
    """Provide the application id string shared by the tests."""
    identifier = "test_app"
    return identifier


@pytest.fixture
def data_type():
    """Provide the DataType the chunker fixture is configured with."""
    configured_type = DataType.TEXT
    return configured_type


@pytest.fixture
def chunker(text_splitter_mock, data_type):
    """Build a BaseChunker around the mocked splitter and set its data type."""
    instance = BaseChunker(text_splitter_mock)
    instance.set_data_type(data_type)
    return instance


def test_create_chunks_with_config(chunker, text_splitter_mock, loader_mock, app_id, data_type):
    """create_chunks with min_chunk_size=10 should keep only the 10-character chunk.

    The splitter mock yields "Chunk 1" (7 characters) and "long chunk"
    (10 characters); only the latter is expected in the resulting documents.
    """
    loaded = {
        "data": [{"content": "Content 1", "meta_data": {"url": "URL 1"}}],
        "doc_id": "DocID",
    }
    loader_mock.load_data.return_value = loaded
    text_splitter_mock.split_text.return_value = ["Chunk 1", "long chunk"]

    chunker_config = ChunkerConfig(
        chunk_size=50,
        chunk_overlap=0,
        length_function=len,
        min_chunk_size=10,
    )
    outcome = chunker.create_chunks(loader_mock, "test_src", app_id, chunker_config)

    assert outcome["documents"] == ["long chunk"]


def test_create_chunks(chunker, text_splitter_mock, loader_mock, app_id, data_type):
    """create_chunks without a config should return documents, ids, metadata and doc_id.

    Expected ids are the app id, "--", and the SHA-256 hex digest of the chunk
    text concatenated with its source URL. Every chunk is expected to carry the
    same metadata, and doc ids are expected to be prefixed with the app id.
    """
    chunk_texts = ["Chunk 1", "Chunk 2"]
    source_url = "URL 1"
    prefixed_doc_id = f"{app_id}--DocID"

    # Hand the mock its own copy so the expectations below stay independent.
    text_splitter_mock.split_text.return_value = list(chunk_texts)
    loader_mock.load_data.return_value = {
        "data": [{"content": "Content 1", "meta_data": {"url": source_url}}],
        "doc_id": "DocID",
    }

    outcome = chunker.create_chunks(loader_mock, "test_src", app_id)

    expected_ids = [
        f"{app_id}--" + hashlib.sha256((text + source_url).encode()).hexdigest()
        for text in chunk_texts
    ]
    expected_metadatas = [
        {
            "url": source_url,
            "data_type": data_type.value,
            "doc_id": prefixed_doc_id,
        }
        for _ in chunk_texts
    ]

    assert outcome["documents"] == chunk_texts
    assert outcome["ids"] == expected_ids
    assert outcome["metadatas"] == expected_metadatas
    assert outcome["doc_id"] == prefixed_doc_id


def test_get_chunks(chunker, text_splitter_mock):
    """get_chunks should return the chunks produced by the splitter mock."""
    expected_chunks = ["Chunk 1", "Chunk 2"]
    # Give the mock a separate list so the comparison is not against itself.
    text_splitter_mock.split_text.return_value = list(expected_chunks)

    chunks = chunker.get_chunks("This is a test content.")

    assert len(chunks) == 2
    assert chunks == expected_chunks


def test_set_data_type(chunker):
    """set_data_type should replace the chunker's data_type attribute."""
    new_type = DataType.MDX
    chunker.set_data_type(new_type)
    assert chunker.data_type == new_type


def test_get_word_count(chunker):
    """get_word_count should total the words across all documents.

    The two sample strings contain 4 and 2 whitespace-separated words.
    """
    sentences = ["This is a test.", "Another test."]
    word_total = chunker.get_word_count(sentences)
    assert word_total == 6