youtube_video.py 1010 B

12345678910111213141516171819202122232425262728293031323334
  1. import hashlib
  2. from langchain.document_loaders import YoutubeLoader
  3. from embedchain.helper.json_serializable import register_deserializable
  4. from embedchain.loaders.base_loader import BaseLoader
  5. from embedchain.utils import clean_string
  @register_deserializable
  class YoutubeVideoLoader(BaseLoader):
      """Loader that converts a YouTube video URL into a single document record."""

      def load_data(self, url):
          """Load data from a Youtube video.

          The video is fetched through ``YoutubeLoader.from_youtube_url`` with
          ``add_video_info=True``. Only the first returned document is used:
          its page content is passed through ``clean_string`` and its metadata
          is extended with the source ``url``.

          Args:
              url: Address of the YouTube video to load.

          Returns:
              A dict with two keys: ``"doc_id"``, the SHA-256 hex digest of the
              cleaned content concatenated with ``url``, and ``"data"``, a
              one-element list holding the ``"content"`` / ``"meta_data"`` pair.

          Raises:
              ValueError: If the loader returns no documents.
          """
          documents = YoutubeLoader.from_youtube_url(url, add_video_info=True).load()

          # Guard clause: nothing to index when the loader comes back empty.
          if len(documents) == 0:
              raise ValueError("No data found")

          first_doc = documents[0]
          cleaned_text = clean_string(first_doc.page_content)

          # The metadata mapping is updated in place, so the loaded document
          # itself also carries the url afterwards.
          video_info = first_doc.metadata
          video_info["url"] = url

          # The id is derived from both content and url.
          digest = hashlib.sha256((cleaned_text + url).encode()).hexdigest()

          records = [
              {
                  "content": cleaned_text,
                  "meta_data": video_info,
              }
          ]
          return {
              "doc_id": digest,
              "data": records,
          }