youtube_video.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. import hashlib
  2. import json
  3. import logging
  4. try:
  5. from youtube_transcript_api import YouTubeTranscriptApi
  6. except ImportError:
  7. raise ImportError(
  8. 'YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api "`'
  9. )
  10. try:
  11. from langchain_community.document_loaders import YoutubeLoader
  12. except ImportError:
  13. raise ImportError(
  14. 'YouTube video requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
  15. ) from None
  16. from embedchain.helpers.json_serializable import register_deserializable
  17. from embedchain.loaders.base_loader import BaseLoader
  18. from embedchain.utils.misc import clean_string
  19. @register_deserializable
  20. class YoutubeVideoLoader(BaseLoader):
  21. def load_data(self, url):
  22. """Load data from a Youtube video."""
  23. loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
  24. doc = loader.load()
  25. output = []
  26. if not len(doc):
  27. raise ValueError(f"No data found for url: {url}")
  28. content = doc[0].page_content
  29. content = clean_string(content)
  30. metadata = doc[0].metadata
  31. metadata["url"] = url
  32. video_id = url.split("v=")[1].split('&')[0]
  33. try:
  34. # Fetching transcript data
  35. transcript = YouTubeTranscriptApi.get_transcript(video_id,languages=['en'])
  36. # convert transcript to json to avoid unicode symboles
  37. metadata["transcript"] = json.dumps(transcript, ensure_ascii=True)
  38. except Exception as e:
  39. logging.exception(f"Failed to fetch transcript for video {url}")
  40. metadata["transcript"] = "Unavailable"
  41. output.append(
  42. {
  43. "content": content,
  44. "meta_data": metadata,
  45. }
  46. )
  47. doc_id = hashlib.sha256((content + url).encode()).hexdigest()
  48. return {
  49. "doc_id": doc_id,
  50. "data": output,
  51. }