json.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. import hashlib
  2. import json
  3. import os
  4. import re
  5. import requests
  6. from embedchain.loaders.base_loader import BaseLoader
  7. from embedchain.utils import clean_string, is_valid_json_string
  8. VALID_URL_PATTERN = "^https:\/\/[0-9A-z.]+.[0-9A-z.]+.[a-z]+\/.*\.json$"
  9. class JSONLoader(BaseLoader):
  10. @staticmethod
  11. def _get_llama_hub_loader():
  12. try:
  13. from llama_hub.jsondata.base import \
  14. JSONDataReader as LLHUBJSONLoader
  15. except ImportError as e:
  16. raise Exception(
  17. f"Failed to install required packages: {e}, \
  18. install them using `pip install --upgrade 'embedchain[json]`"
  19. )
  20. return LLHUBJSONLoader()
  21. @staticmethod
  22. def load_data(content):
  23. """Load a json file. Each data point is a key value pair."""
  24. loader = JSONLoader._get_llama_hub_loader()
  25. data = []
  26. data_content = []
  27. content_url_str = content
  28. # Load json data from various sources.
  29. if os.path.isfile(content):
  30. with open(content, "r", encoding="utf-8") as json_file:
  31. json_data = json.load(json_file)
  32. elif re.match(VALID_URL_PATTERN, content):
  33. response = requests.get(content)
  34. if response.status_code == 200:
  35. json_data = response.json()
  36. else:
  37. raise ValueError(
  38. f"Loading data from the given url: {content} failed. \
  39. Make sure the url is working."
  40. )
  41. elif is_valid_json_string(content):
  42. json_data = content
  43. content_url_str = hashlib.sha256((content).encode("utf-8")).hexdigest()
  44. else:
  45. raise ValueError(f"Invalid content to load json data from: {content}")
  46. docs = loader.load_data(json_data)
  47. for doc in docs:
  48. doc_content = clean_string(doc.text)
  49. data.append({"content": doc_content, "meta_data": {"url": content_url_str}})
  50. data_content.append(doc_content)
  51. doc_id = hashlib.sha256((content_url_str + ", ".join(data_content)).encode()).hexdigest()
  52. return {"doc_id": doc_id, "data": data}