youtube_video.py 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. import hashlib
  2. import json
  3. import logging
  4. try:
  5. from youtube_transcript_api import YouTubeTranscriptApi
  6. except ImportError:
  7. raise ImportError("YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api`")
  8. try:
  9. from langchain_community.document_loaders import YoutubeLoader
  10. from langchain_community.document_loaders.youtube import _parse_video_id
  11. except ImportError:
  12. raise ImportError("YouTube video requires extra dependencies. Install with `pip install pytube==15.0.0`") from None
  13. from embedchain.helpers.json_serializable import register_deserializable
  14. from embedchain.loaders.base_loader import BaseLoader
  15. from embedchain.utils.misc import clean_string
  16. @register_deserializable
  17. class YoutubeVideoLoader(BaseLoader):
  18. def load_data(self, url):
  19. """Load data from a Youtube video."""
  20. video_id = _parse_video_id(url)
  21. languages = ["en"]
  22. try:
  23. # Fetching transcript data
  24. languages = [transcript.language_code for transcript in YouTubeTranscriptApi.list_transcripts(video_id)]
  25. transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
  26. # convert transcript to json to avoid unicode symboles
  27. transcript = json.dumps(transcript, ensure_ascii=True)
  28. except Exception:
  29. logging.exception(f"Failed to fetch transcript for video {url}")
  30. transcript = "Unavailable"
  31. loader = YoutubeLoader.from_youtube_url(url, add_video_info=True, language=languages)
  32. doc = loader.load()
  33. output = []
  34. if not len(doc):
  35. raise ValueError(f"No data found for url: {url}")
  36. content = doc[0].page_content
  37. content = clean_string(content)
  38. metadata = doc[0].metadata
  39. metadata["url"] = url
  40. metadata["transcript"] = transcript
  41. output.append(
  42. {
  43. "content": content,
  44. "meta_data": metadata,
  45. }
  46. )
  47. doc_id = hashlib.sha256((content + url).encode()).hexdigest()
  48. return {
  49. "doc_id": doc_id,
  50. "data": output,
  51. }