# json.py (2.6 KB)
  1. import hashlib
  2. import json
  3. import os
  4. import re
  5. import requests
  6. from embedchain.loaders.base_loader import BaseLoader
  7. from embedchain.utils import clean_string, is_valid_json_string
  8. VALID_URL_PATTERN = "^https:\/\/[0-9A-z.]+.[0-9A-z.]+.[a-z]+\/.*\.json$"
  9. class JSONLoader(BaseLoader):
  10. @staticmethod
  11. def _get_llama_hub_loader():
  12. try:
  13. from llama_hub.jsondata.base import \
  14. JSONDataReader as LLHUBJSONLoader
  15. except ImportError as e:
  16. raise Exception(
  17. f"Failed to install required packages: {e}, \
  18. install them using `pip install --upgrade 'embedchain[json]`"
  19. )
  20. return LLHUBJSONLoader()
  21. @staticmethod
  22. def _check_content(content):
  23. if not isinstance(content, str):
  24. raise ValueError(
  25. "Invaid content input. \
  26. If you want to upload (list, dict, etc.), do \
  27. `json.dump(data, indent=0)` and add the stringified JSON. \
  28. Check - `https://docs.embedchain.ai/data-sources/json`"
  29. )
  30. @staticmethod
  31. def load_data(content):
  32. """Load a json file. Each data point is a key value pair."""
  33. JSONLoader._check_content(content)
  34. loader = JSONLoader._get_llama_hub_loader()
  35. data = []
  36. data_content = []
  37. content_url_str = content
  38. # Load json data from various sources.
  39. if os.path.isfile(content):
  40. with open(content, "r", encoding="utf-8") as json_file:
  41. json_data = json.load(json_file)
  42. elif re.match(VALID_URL_PATTERN, content):
  43. response = requests.get(content)
  44. if response.status_code == 200:
  45. json_data = response.json()
  46. else:
  47. raise ValueError(
  48. f"Loading data from the given url: {content} failed. \
  49. Make sure the url is working."
  50. )
  51. elif is_valid_json_string(content):
  52. json_data = content
  53. content_url_str = hashlib.sha256((content).encode("utf-8")).hexdigest()
  54. else:
  55. raise ValueError(f"Invalid content to load json data from: {content}")
  56. docs = loader.load_data(json_data)
  57. for doc in docs:
  58. doc_content = clean_string(doc.text)
  59. data.append({"content": doc_content, "meta_data": {"url": content_url_str}})
  60. data_content.append(doc_content)
  61. doc_id = hashlib.sha256((content_url_str + ", ".join(data_content)).encode()).hexdigest()
  62. return {"doc_id": doc_id, "data": data}