youtube_video.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. import hashlib
  2. import json
  3. import logging
  4. try:
  5. from youtube_transcript_api import YouTubeTranscriptApi
  6. except ImportError:
  7. raise ImportError('YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api "`')
  8. try:
  9. from langchain_community.document_loaders import YoutubeLoader
  10. from langchain_community.document_loaders.youtube import _parse_video_id
  11. except ImportError:
  12. raise ImportError(
  13. 'YouTube video requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
  14. ) from None
  15. from embedchain.helpers.json_serializable import register_deserializable
  16. from embedchain.loaders.base_loader import BaseLoader
  17. from embedchain.utils.misc import clean_string
  18. @register_deserializable
  19. class YoutubeVideoLoader(BaseLoader):
  20. def load_data(self, url):
  21. """Load data from a Youtube video."""
  22. video_id = _parse_video_id(url)
  23. languages = ["en"]
  24. try:
  25. # Fetching transcript data
  26. languages = [transcript.language_code for transcript in YouTubeTranscriptApi.list_transcripts(video_id)]
  27. transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
  28. # convert transcript to json to avoid unicode symboles
  29. transcript = json.dumps(transcript, ensure_ascii=True)
  30. except Exception:
  31. logging.exception(f"Failed to fetch transcript for video {url}")
  32. transcript = "Unavailable"
  33. loader = YoutubeLoader.from_youtube_url(url, add_video_info=True, language=languages)
  34. doc = loader.load()
  35. output = []
  36. if not len(doc):
  37. raise ValueError(f"No data found for url: {url}")
  38. content = doc[0].page_content
  39. content = clean_string(content)
  40. metadata = doc[0].metadata
  41. metadata["url"] = url
  42. metadata["transcript"] = transcript
  43. output.append(
  44. {
  45. "content": content,
  46. "meta_data": metadata,
  47. }
  48. )
  49. doc_id = hashlib.sha256((content + url).encode()).hexdigest()
  50. return {
  51. "doc_id": doc_id,
  52. "data": output,
  53. }