json.py 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. import hashlib
  2. import json
  3. import os
  4. import re
  5. from typing import Union
  6. import requests
  7. from embedchain.loaders.base_loader import BaseLoader
  8. from embedchain.utils.misc import clean_string, is_valid_json_string
  9. class JSONReader:
  10. def __init__(self) -> None:
  11. """Initialize the JSONReader."""
  12. pass
  13. @staticmethod
  14. def load_data(json_data: Union[dict, str]) -> list[str]:
  15. """Load data from a JSON structure.
  16. Args:
  17. json_data (Union[dict, str]): The JSON data to load.
  18. Returns:
  19. list[str]: A list of strings representing the leaf nodes of the JSON.
  20. """
  21. if isinstance(json_data, str):
  22. json_data = json.loads(json_data)
  23. else:
  24. json_data = json_data
  25. json_output = json.dumps(json_data, indent=0)
  26. lines = json_output.split("\n")
  27. useful_lines = [line for line in lines if not re.match(r"^[{}\[\],]*$", line)]
  28. return ["\n".join(useful_lines)]
  29. VALID_URL_PATTERN = "^https:\/\/[0-9A-Za-z]+(\.[0-9A-Za-z]+)*\/[0-9A-Za-z_\/]*\.json$"
  30. class JSONLoader(BaseLoader):
  31. @staticmethod
  32. def _check_content(content):
  33. if not isinstance(content, str):
  34. raise ValueError(
  35. "Invaid content input. \
  36. If you want to upload (list, dict, etc.), do \
  37. `json.dump(data, indent=0)` and add the stringified JSON. \
  38. Check - `https://docs.embedchain.ai/data-sources/json`"
  39. )
  40. @staticmethod
  41. def load_data(content):
  42. """Load a json file. Each data point is a key value pair."""
  43. JSONLoader._check_content(content)
  44. loader = JSONReader()
  45. data = []
  46. data_content = []
  47. content_url_str = content
  48. if os.path.isfile(content):
  49. with open(content, "r", encoding="utf-8") as json_file:
  50. json_data = json.load(json_file)
  51. elif re.match(VALID_URL_PATTERN, content):
  52. response = requests.get(content)
  53. if response.status_code == 200:
  54. json_data = response.json()
  55. else:
  56. raise ValueError(
  57. f"Loading data from the given url: {content} failed. \
  58. Make sure the url is working."
  59. )
  60. elif is_valid_json_string(content):
  61. json_data = content
  62. content_url_str = hashlib.sha256((content).encode("utf-8")).hexdigest()
  63. else:
  64. raise ValueError(f"Invalid content to load json data from: {content}")
  65. docs = loader.load_data(json_data)
  66. for doc in docs:
  67. text = doc if isinstance(doc, str) else doc["text"]
  68. doc_content = clean_string(text)
  69. data.append({"content": doc_content, "meta_data": {"url": content_url_str}})
  70. data_content.append(doc_content)
  71. doc_id = hashlib.sha256((content_url_str + ", ".join(data_content)).encode()).hexdigest()
  72. return {"doc_id": doc_id, "data": data}