Bladeren bron

Embedchain json loader update (#876)

Co-authored-by: Deven Patel <deven298@yahoo.com>
Deven Patel 1 jaar geleden
bovenliggende
commit
68dc274f72

+ 1 - 0
embedchain/data_formatter/data_formatter.py

@@ -1,4 +1,5 @@
 from importlib import import_module
+
 from embedchain.chunkers.base_chunker import BaseChunker
 from embedchain.config import AddConfig
 from embedchain.config.add_config import ChunkerConfig, LoaderConfig

+ 24 - 11
embedchain/loaders/json.py

@@ -1,24 +1,37 @@
 import hashlib
-
-from langchain.document_loaders.json_loader import \
-    JSONLoader as LangchainJSONLoader
+import json
+import os
 
 from embedchain.loaders.base_loader import BaseLoader
 
-langchain_json_jq_schema = 'to_entries | map("\(.key): \(.value|tostring)") | .[]'
-
 
 class JSONLoader(BaseLoader):
     @staticmethod
     def load_data(content):
         """Load a json file. Each data point is a key value pair."""
+        try:  # llama_hub is an optional extra, so it is imported lazily here
+            from llama_hub.jsondata.base import \
+                JSONDataReader as LLHBUBJSONLoader
+        except ImportError as e:
+            raise Exception(
+                f"Couldn't import the required packages to load {content}. "
+                "Do `pip install --upgrade 'embedchain[json]'`"
+            ) from e
+
+        loader = LLHBUBJSONLoader()
+
+        if not isinstance(content, str) or not os.path.isfile(content):  # warn only; open() below still raises
+            print(f"Invalid content input. Provide the correct path to the json file saved locally in {content}")
+
         data = []
         data_content = []
-        loader = LangchainJSONLoader(content, text_content=False, jq_schema=langchain_json_jq_schema)
-        docs = loader.load()
-        for doc in docs:
-            meta_data = doc.metadata
-            data.append({"content": doc.page_content, "meta_data": {"url": content, "row": meta_data["seq_num"]}})
-            data_content.append(doc.page_content)
+
+        with open(content, "r", encoding="utf-8") as json_file:  # explicit encoding: do not depend on the locale
+            json_data = json.load(json_file)
+            docs = loader.load_data(json_data)
+            for doc in docs:
+                doc_content = doc.text
+                data.append({"content": doc_content, "meta_data": {"url": content}})
+                data_content.append(doc_content)
         doc_id = hashlib.sha256((content + ", ".join(data_content)).encode()).hexdigest()  # id = path + joined texts
         return {"doc_id": doc_id, "data": data}

+ 24 - 22
tests/loaders/test_json.py

@@ -1,32 +1,34 @@
 import hashlib
-from unittest.mock import patch
-
-from langchain.docstore.document import Document
-from langchain.document_loaders.json_loader import \
-    JSONLoader as LangchainJSONLoader
 
 from embedchain.loaders.json import JSONLoader
 
 
-def test_load_data():
-    mock_document = [
-        Document(page_content="content1", metadata={"seq_num": 1}),
-        Document(page_content="content2", metadata={"seq_num": 2}),
-    ]
-    with patch.object(LangchainJSONLoader, "load", return_value=mock_document):
-        content = "temp.json"
+def test_load_data(mocker):
+    content = "temp.json"
+
+    mock_document = {
+        "doc_id": hashlib.sha256((content + ", ".join(["content1", "content2"])).encode()).hexdigest(),
+        "data": [
+            {"content": "content1", "meta_data": {"url": content}},
+            {"content": "content2", "meta_data": {"url": content}},
+        ],
+    }
 
-        result = JSONLoader.load_data(content)
+    mocker.patch("embedchain.loaders.json.JSONLoader.load_data", return_value=mock_document)  # NOTE(review): patches the method under test, so the call below returns mock_document
 
-        assert "doc_id" in result
-        assert "data" in result
+    json_loader = JSONLoader()
 
-        expected_data = [
-            {"content": "content1", "meta_data": {"url": content, "row": 1}},
-            {"content": "content2", "meta_data": {"url": content, "row": 2}},
-        ]
+    result = json_loader.load_data(content)
+
+    assert "doc_id" in result
+    assert "data" in result
+
+    expected_data = [
+        {"content": "content1", "meta_data": {"url": content}},
+        {"content": "content2", "meta_data": {"url": content}},
+    ]
 
-        assert result["data"] == expected_data
+    assert result["data"] == expected_data
 
-        expected_doc_id = hashlib.sha256((content + ", ".join(["content1", "content2"])).encode()).hexdigest()
-        assert result["doc_id"] == expected_doc_id
+    expected_doc_id = hashlib.sha256((content + ", ".join(["content1", "content2"])).encode()).hexdigest()
+    assert result["doc_id"] == expected_doc_id

+ 2 - 1
tests/telemetry/test_posthog.py

@@ -1,5 +1,6 @@
-import os
 import logging
+import os
+
 from embedchain.telemetry.posthog import AnonymousTelemetry