json.py 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. import hashlib
  2. import json
  3. import os
  4. import re
  5. from typing import Dict, List, Union
  6. import requests
  7. from embedchain.loaders.base_loader import BaseLoader
  8. from embedchain.utils import clean_string, is_valid_json_string
  9. class JSONReader:
  10. def __init__(self) -> None:
  11. """Initialize the JSONReader."""
  12. pass
  13. def load_data(self, json_data: Union[Dict, str]) -> List[str]:
  14. """Load data from a JSON structure.
  15. Args:
  16. json_data (Union[Dict, str]): The JSON data to load.
  17. Returns:
  18. List[str]: A list of strings representing the leaf nodes of the JSON.
  19. """
  20. if isinstance(json_data, str):
  21. json_data = json.loads(json_data)
  22. else:
  23. json_data = json_data
  24. json_output = json.dumps(json_data, indent=0)
  25. lines = json_output.split("\n")
  26. useful_lines = [line for line in lines if not re.match(r"^[{}\[\],]*$", line)]
  27. return ["\n".join(useful_lines)]
  28. VALID_URL_PATTERN = "^https:\/\/[0-9A-z.]+.[0-9A-z.]+.[a-z]+\/.*\.json$"
  29. class JSONLoader(BaseLoader):
  30. @staticmethod
  31. def _check_content(content):
  32. if not isinstance(content, str):
  33. raise ValueError(
  34. "Invaid content input. \
  35. If you want to upload (list, dict, etc.), do \
  36. `json.dump(data, indent=0)` and add the stringified JSON. \
  37. Check - `https://docs.embedchain.ai/data-sources/json`"
  38. )
  39. @staticmethod
  40. def load_data(content):
  41. """Load a json file. Each data point is a key value pair."""
  42. JSONLoader._check_content(content)
  43. loader = JSONReader()
  44. data = []
  45. data_content = []
  46. content_url_str = content
  47. if os.path.isfile(content):
  48. with open(content, "r", encoding="utf-8") as json_file:
  49. json_data = json.load(json_file)
  50. elif re.match(VALID_URL_PATTERN, content):
  51. response = requests.get(content)
  52. if response.status_code == 200:
  53. json_data = response.json()
  54. else:
  55. raise ValueError(
  56. f"Loading data from the given url: {content} failed. \
  57. Make sure the url is working."
  58. )
  59. elif is_valid_json_string(content):
  60. json_data = content
  61. content_url_str = hashlib.sha256((content).encode("utf-8")).hexdigest()
  62. else:
  63. raise ValueError(f"Invalid content to load json data from: {content}")
  64. docs = loader.load_data(json_data)
  65. for doc in docs:
  66. text = doc if isinstance(doc, str) else doc["text"]
  67. doc_content = clean_string(text)
  68. data.append({"content": doc_content, "meta_data": {"url": content_url_str}})
  69. data_content.append(doc_content)
  70. doc_id = hashlib.sha256((content_url_str + ", ".join(data_content)).encode()).hexdigest()
  71. return {"doc_id": doc_id, "data": data}