json.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  1. import hashlib
  2. import json
  3. import os
  4. import re
  5. from typing import Union
  6. import requests
  7. from embedchain.loaders.base_loader import BaseLoader
  8. from embedchain.utils.misc import clean_string, is_valid_json_string
  9. class JSONReader:
  10. def __init__(self) -> None:
  11. """Initialize the JSONReader."""
  12. pass
  13. @staticmethod
  14. def load_data(json_data: Union[dict, str]) -> list[str]:
  15. """Load data from a JSON structure.
  16. Args:
  17. json_data (Union[dict, str]): The JSON data to load.
  18. Returns:
  19. list[str]: A list of strings representing the leaf nodes of the JSON.
  20. """
  21. if isinstance(json_data, str):
  22. json_data = json.loads(json_data)
  23. else:
  24. json_data = json_data
  25. json_output = json.dumps(json_data, indent=0)
  26. lines = json_output.split("\n")
  27. useful_lines = [line for line in lines if not re.match(r"^[{}\[\],]*$", line)]
  28. return ["\n".join(useful_lines)]
  29. VALID_URL_PATTERN = (
  30. "^https?://(?:www\.)?(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[a-zA-Z0-9.-]+)(?::\d+)?/(?:[^/\s]+/)*[^/\s]+\.json$"
  31. )
  32. class JSONLoader(BaseLoader):
  33. @staticmethod
  34. def _check_content(content):
  35. if not isinstance(content, str):
  36. raise ValueError(
  37. "Invaid content input. \
  38. If you want to upload (list, dict, etc.), do \
  39. `json.dump(data, indent=0)` and add the stringified JSON. \
  40. Check - `https://docs.embedchain.ai/data-sources/json`"
  41. )
  42. @staticmethod
  43. def load_data(content):
  44. """Load a json file. Each data point is a key value pair."""
  45. JSONLoader._check_content(content)
  46. loader = JSONReader()
  47. data = []
  48. data_content = []
  49. content_url_str = content
  50. if os.path.isfile(content):
  51. with open(content, "r", encoding="utf-8") as json_file:
  52. json_data = json.load(json_file)
  53. elif re.match(VALID_URL_PATTERN, content):
  54. response = requests.get(content)
  55. if response.status_code == 200:
  56. json_data = response.json()
  57. else:
  58. raise ValueError(
  59. f"Loading data from the given url: {content} failed. \
  60. Make sure the url is working."
  61. )
  62. elif is_valid_json_string(content):
  63. json_data = content
  64. content_url_str = hashlib.sha256((content).encode("utf-8")).hexdigest()
  65. else:
  66. raise ValueError(f"Invalid content to load json data from: {content}")
  67. docs = loader.load_data(json_data)
  68. for doc in docs:
  69. text = doc if isinstance(doc, str) else doc["text"]
  70. doc_content = clean_string(text)
  71. data.append({"content": doc_content, "meta_data": {"url": content_url_str}})
  72. data_content.append(doc_content)
  73. doc_id = hashlib.sha256((content_url_str + ", ".join(data_content)).encode()).hexdigest()
  74. return {"doc_id": doc_id, "data": data}