test_add.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  1. import os
  2. import pytest
  3. from embedchain import App
  4. from embedchain.config import AddConfig, AppConfig, ChunkerConfig
  5. from embedchain.models.data_type import DataType
  # Set a placeholder OpenAI key at import time, before any App is built below.
  # NOTE(review): presumably required so App() can be constructed without real
  # credentials — confirm against embedchain's key handling.
  6. os.environ["OPENAI_API_KEY"] = "test_key"
  @pytest.fixture
  def app(mocker):
      """Return an ``App`` built with metrics collection switched off.

      ``chromadb``'s ``Collection.add`` is patched via ``mocker`` before the
      app is constructed (presumably so tests never write to a real
      collection — confirm).
      """
      mocker.patch("chromadb.api.models.Collection.Collection.add")
      app_config = AppConfig(collect_metrics=False)
      return App(config=app_config)
  def test_add(app):
      """Check that adding a plain URL is recorded in ``user_asks`` as ``web_page``."""
      source = "https://example.com"
      app.add(source, metadata={"meta": "meta-data"})

      expected_asks = [[source, "web_page", {"meta": "meta-data"}]]
      assert app.user_asks == expected_asks
  def test_add_sitemap(app):
      """Check that adding a ``sitemap.xml`` URL is recorded in ``user_asks`` as ``sitemap``."""
      source = "https://www.google.com/sitemap.xml"
      app.add(source, metadata={"meta": "meta-data"})

      expected_asks = [[source, "sitemap", {"meta": "meta-data"}]]
      assert app.user_asks == expected_asks
  def test_add_forced_type(app):
      """Check that an explicit ``data_type`` is what gets recorded in ``user_asks``."""
      source = "https://example.com"
      data_type = "text"
      app.add(source, data_type=data_type, metadata={"meta": "meta-data"})

      expected_asks = [[source, data_type, {"meta": "meta-data"}]]
      assert app.user_asks == expected_asks
  def test_dry_run(app):
      """Check the result of ``app.add(..., dry_run=True)`` with a one-character chunker.

      With ``chunk_size=1`` and ``chunk_overlap=0`` the test expects one chunk
      per character of the input, a matching ``count``, a ``DataType.TEXT``
      type, and a metadata dict per entry carrying ``url`` and ``data_type``.
      """
      text = """0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"""
      chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0)

      result = app.add(source=text, config=AddConfig(chunker=chunker_config), dry_run=True)

      # Pull every field up front, in the same order, so a missing key fails
      # before any assertion runs.
      chunks, metadata, count, data_type = (
          result["chunks"],
          result["metadata"],
          result["count"],
          result["type"],
      )

      expected_length = len(text)
      assert len(chunks) == expected_length
      assert count == expected_length
      assert data_type == DataType.TEXT

      for entry in metadata:
          assert isinstance(entry, dict)
          assert "local" in entry["url"]
          assert "text" in entry["data_type"]