"""Loader that turns a YouTube video URL into an embedchain document."""

import hashlib
import json
import logging
from urllib.parse import parse_qs, urlparse

try:
    from youtube_transcript_api import YouTubeTranscriptApi
except ImportError:
    raise ImportError(
        "YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api`"
    ) from None
try:
    from langchain_community.document_loaders import YoutubeLoader
except ImportError:
    raise ImportError(
        'YouTube video requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
    ) from None

from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils.misc import clean_string

# Path prefixes of the form /<prefix>/<video_id> that carry the id in the path.
_PATH_ID_PREFIXES = frozenset({"embed", "shorts", "live", "v"})


def _extract_video_id(url):
    """Return the video id embedded in *url*, or ``None`` if none is found.

    Checks, in order: the ``v`` query parameter, a ``youtu.be/<id>`` short
    link, a ``/embed/<id>``-style path, and finally the plain ``"v="`` text
    split so that any URL containing ``v=`` still yields an id.
    """
    parsed = urlparse(url)

    query_ids = parse_qs(parsed.query).get("v")
    if query_ids and query_ids[0]:
        return query_ids[0]

    host = (parsed.hostname or "").lower()
    segments = [segment for segment in parsed.path.split("/") if segment]

    if host == "youtu.be" and segments:
        return segments[0]

    if len(segments) >= 2 and segments[0] in _PATH_ID_PREFIXES:
        return segments[1]

    # Last resort: plain text split on "v=" for URLs the parser could not
    # make sense of (for example "v=" appearing only in the fragment).
    if "v=" in url:
        return url.split("v=")[1].split("&")[0] or None

    return None


@register_deserializable
class YoutubeVideoLoader(BaseLoader):
    """Load the text content and transcript of a single YouTube video."""

    def load_data(self, url):
        """Load data from a Youtube video.

        Args:
            url: Address of the video to load.

        Returns:
            A dict with two keys: ``"doc_id"``, the SHA-256 hex digest of the
            cleaned content concatenated with ``url``; and ``"data"``, a list
            holding one dict with the cleaned ``"content"`` and its
            ``"meta_data"``. The metadata always carries ``"url"`` and
            ``"transcript"``; the latter is a JSON string, or the literal
            ``"Unavailable"`` when the transcript could not be fetched.

        Raises:
            ValueError: If the loader returns no documents for ``url``.
        """
        loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
        doc = loader.load()
        output = []
        if not doc:
            raise ValueError(f"No data found for url: {url}")

        content = clean_string(doc[0].page_content)
        metadata = doc[0].metadata
        metadata["url"] = url

        # The transcript is best-effort: any failure, including a URL whose
        # video id cannot be determined, degrades to "Unavailable" instead of
        # discarding the content that was already loaded.
        video_id = _extract_video_id(url)
        if video_id is None:
            logging.error("Failed to fetch transcript for video %s", url)
            metadata["transcript"] = "Unavailable"
        else:
            try:
                # NOTE(review): newer youtube-transcript-api releases appear to
                # replace get_transcript with an instance-level fetch() API;
                # confirm against the pinned version.
                transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
                # Convert the transcript to JSON to avoid unicode symbols.
                metadata["transcript"] = json.dumps(transcript, ensure_ascii=True)
            except Exception:
                logging.exception("Failed to fetch transcript for video %s", url)
                metadata["transcript"] = "Unavailable"

        output.append(
            {
                "content": content,
                "meta_data": metadata,
            }
        )
        doc_id = hashlib.sha256((content + url).encode()).hexdigest()
        return {
            "doc_id": doc_id,
            "data": output,
        }