ソースを参照

Add feature to extract timestamps from youtube videos (#1345)

Esparon1 1 年前
コミット
78301ee63d
1 ファイル変更、20 行追加、1 行削除
  1. 20 1
      embedchain/loaders/youtube_video.py

+ 20 - 1
embedchain/loaders/youtube_video.py

@@ -1,5 +1,12 @@
 import hashlib
-
+import json
+import logging
+try:
+    from youtube_transcript_api import YouTubeTranscriptApi
+except ImportError:
+    raise ImportError(
+        'YouTube video requires extra dependencies. Install with `pip install youtube-transcript-api "`'
+    )
 try:
     from langchain_community.document_loaders import YoutubeLoader
 except ImportError:
@@ -25,6 +32,18 @@ class YoutubeVideoLoader(BaseLoader):
         metadata = doc[0].metadata
         metadata["url"] = url
 
+
+        video_id = url.split("v=")[1].split('&')[0]
+        try:
+            # Fetching transcript data
+            transcript = YouTubeTranscriptApi.get_transcript(video_id,languages=['en'])
+            # Serialize the transcript to JSON, escaping non-ASCII characters (ensure_ascii=True)
+            metadata["transcript"] = json.dumps(transcript, ensure_ascii=True)
+        except Exception as e:
+            logging.exception(f"Failed to fetch transcript for video {url}")
+            metadata["transcript"] = "Unavailable"   
+
+
         output.append(
             {
                 "content": content,