# test_add.py (1.7 KB)

import os

import pytest

from embedchain import App
from embedchain.config import AddConfig, AppConfig, ChunkerConfig
from embedchain.models.data_type import DataType
  6. os.environ["OPENAI_API_KEY"] = "test_key"
  7. @pytest.fixture
  8. def app(mocker):
  9. mocker.patch("chromadb.api.models.Collection.Collection.add")
  10. return App(config=AppConfig(collect_metrics=False))
  11. def test_add(app):
  12. app.add("https://example.com", metadata={"foo": "bar"})
  13. assert app.user_asks == [["https://example.com", "web_page", {"foo": "bar"}]]
# TODO: Make this test faster by generating a sitemap locally rather than using a remote one.
# def test_add_sitemap(app):
#     app.add("https://www.google.com/sitemap.xml", metadata={"foo": "bar"})
#     assert app.user_asks == [["https://www.google.com/sitemap.xml", "sitemap", {"foo": "bar"}]]
  18. def test_add_forced_type(app):
  19. data_type = "text"
  20. app.add("https://example.com", data_type=data_type, metadata={"foo": "bar"})
  21. assert app.user_asks == [["https://example.com", data_type, {"foo": "bar"}]]
  22. def test_dry_run(app):
  23. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, min_chunk_size=0)
  24. text = """0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"""
  25. result = app.add(source=text, config=AddConfig(chunker=chunker_config), dry_run=True)
  26. chunks = result["chunks"]
  27. metadata = result["metadata"]
  28. count = result["count"]
  29. data_type = result["type"]
  30. assert len(chunks) == len(text)
  31. assert count == len(text)
  32. assert data_type == DataType.TEXT
  33. for item in metadata:
  34. assert isinstance(item, dict)
  35. assert "local" in item["url"]
  36. assert "text" in item["data_type"]