youtube_video.py 1.2 KB

1234567891011121314151617181920212223242526272829303132333435363738
  1. import hashlib
  2. try:
  3. from langchain_community.document_loaders import YoutubeLoader
  4. except ImportError:
  5. raise ImportError(
  6. 'YouTube video requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
  7. ) from None
  8. from embedchain.helpers.json_serializable import register_deserializable
  9. from embedchain.loaders.base_loader import BaseLoader
  10. from embedchain.utils.misc import clean_string
  11. @register_deserializable
  12. class YoutubeVideoLoader(BaseLoader):
  13. def load_data(self, url):
  14. """Load data from a Youtube video."""
  15. loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
  16. doc = loader.load()
  17. output = []
  18. if not len(doc):
  19. raise ValueError(f"No data found for url: {url}")
  20. content = doc[0].page_content
  21. content = clean_string(content)
  22. meta_data = doc[0].metadata
  23. meta_data["url"] = url
  24. output.append(
  25. {
  26. "content": content,
  27. "meta_data": meta_data,
  28. }
  29. )
  30. doc_id = hashlib.sha256((content + url).encode()).hexdigest()
  31. return {
  32. "doc_id": doc_id,
  33. "data": output,
  34. }