json.py

import hashlib
import json
import os
import re

import requests

from embedchain.loaders.base_loader import BaseLoader

VALID_URL_PATTERN = r"^https://[0-9A-Za-z.]+\.[0-9A-Za-z.]+\.[a-z]+/.*\.json$"


class JSONLoader(BaseLoader):
    @staticmethod
    def load_data(content):
        """Load a json file. Each data point is a key-value pair."""
        try:
            from llama_hub.jsondata.base import JSONDataReader as LLHUBJSONLoader
        except ImportError:
            raise Exception(
                f"Couldn't import the required packages to load {content}. "
                "Do `pip install --upgrade 'embedchain[json]'`"
            )

        loader = LLHUBJSONLoader()

        if not isinstance(content, str):
            print(f"Invalid content input. Provide the correct path to the json file saved locally in {content}")

        data = []
        data_content = []

        # Load json data from various sources. TODO: add support for dictionary
        if os.path.isfile(content):
            # Local file: read and parse the JSON from disk.
            with open(content, "r") as json_file:
                json_data = json.load(json_file)
        elif re.match(VALID_URL_PATTERN, content):
            # Remote file: fetch the JSON over HTTPS.
            response = requests.get(content)
            if response.status_code == 200:
                json_data = response.json()
            else:
                raise ValueError(
                    f"Loading data from the given url: {content} failed. "
                    "Make sure the url is working."
                )
        else:
            raise ValueError(f"Invalid content to load json data from: {content}")

        docs = loader.load_data(json_data)
        for doc in docs:
            doc_content = doc.text
            data.append({"content": doc_content, "meta_data": {"url": content}})
            data_content.append(doc_content)

        # Derive a stable document id from the source plus the loaded contents.
        doc_id = hashlib.sha256((content + ", ".join(data_content)).encode()).hexdigest()
        return {"doc_id": doc_id, "data": data}
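

# A minimal usage sketch, not part of the loader itself: the path
# "data/sample.json" below is a hypothetical local file assumed to contain
# valid JSON. load_data returns {"doc_id": <sha256 hex digest>, "data": [...]},
# where each entry holds the extracted text plus the source under "meta_data".
if __name__ == "__main__":
    result = JSONLoader.load_data("data/sample.json")  # hypothetical path
    print(result["doc_id"])
    for item in result["data"]:
        print(item["meta_data"]["url"], item["content"][:80])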