youtube_video.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
  1. import hashlib
  2. import json
  3. import logging
  # Optional dependency: fail at import time with an actionable install hint
  # rather than a bare ImportError from deep inside the loader.
  try:
      from youtube_transcript_api import YouTubeTranscriptApi
  except ImportError:
      # `from None` hides the original traceback so only the install hint is
      # shown, matching the langchain_community import guard in this module.
      raise ImportError(
          "YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api`"
      ) from None
  8. try:
  9. from langchain_community.document_loaders import YoutubeLoader
  10. except ImportError:
  11. raise ImportError(
  12. 'YouTube video requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
  13. ) from None
  14. from embedchain.helpers.json_serializable import register_deserializable
  15. from embedchain.loaders.base_loader import BaseLoader
  16. from embedchain.utils.misc import clean_string
  @register_deserializable
  class YoutubeVideoLoader(BaseLoader):
      """Loader that turns a YouTube video URL into a single document.

      The text content and base metadata come from ``YoutubeLoader``; the raw
      transcript is fetched separately through ``YouTubeTranscriptApi`` and is
      stored, JSON-encoded, under ``metadata["transcript"]``.
      """

      @staticmethod
      def _extract_video_id(url):
          """Return the video id contained in ``url``, or ``None`` if none is found.

          Looks, in order, at the ``v`` query parameter, the first path segment
          of a ``youtu.be`` link, the segment following ``embed``/``shorts``/
          ``live``/``v`` in the path, and finally falls back to the plain
          ``"v="`` split for any other string containing that marker.
          """
          # Function-scope import keeps this block self-contained.
          from urllib.parse import parse_qs, urlparse

          parsed = urlparse(url)

          # Regular watch URLs: https://www.youtube.com/watch?v=<id>&t=10
          query_ids = parse_qs(parsed.query).get("v")
          if query_ids:
              return query_ids[0]

          segments = [part for part in parsed.path.split("/") if part]
          host = (parsed.hostname or "").lower()

          # Short links: https://youtu.be/<id>
          if host.endswith("youtu.be") and segments:
              return segments[0]

          # Path-style links: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
          if len(segments) >= 2 and segments[0] in {"embed", "shorts", "live", "v"}:
              return segments[1]

          # Last resort: plain substring split, so any URL containing "v=" that
          # the structured parsing above missed still yields an id.
          if "v=" in url:
              return url.split("v=")[1].split("&")[0]

          return None

      def load_data(self, url):
          """Load data from a Youtube video.

          Args:
              url: URL of the video to load.

          Returns:
              A dict with ``"doc_id"`` (SHA-256 hex digest of the cleaned content
              concatenated with ``url``) and ``"data"`` (a one-item list holding
              the ``"content"`` string and its ``"meta_data"`` dict).

          Raises:
              ValueError: If the loader returns no documents for ``url``.
          """
          loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
          docs = loader.load()
          if not docs:
              raise ValueError(f"No data found for url: {url}")

          content = clean_string(docs[0].page_content)
          metadata = docs[0].metadata
          metadata["url"] = url

          # The transcript is best-effort: any failure leaves this placeholder in
          # place instead of aborting the load of the already-fetched content.
          metadata["transcript"] = "Unavailable"
          video_id = self._extract_video_id(url)
          if video_id is None:
              logging.error("Failed to fetch transcript for video %s: no video id found in URL", url)
          else:
              try:
                  # Fetching transcript data
                  transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
                  # Convert transcript to JSON to avoid unicode symbols
                  metadata["transcript"] = json.dumps(transcript, ensure_ascii=True)
              except Exception:
                  logging.exception("Failed to fetch transcript for video %s", url)

          output = [
              {
                  "content": content,
                  "meta_data": metadata,
              }
          ]
          doc_id = hashlib.sha256((content + url).encode()).hexdigest()
          return {
              "doc_id": doc_id,
              "data": output,
          }