
[Features] Add Github and Youtube Channel loaders (#957)

Co-authored-by: Deven Patel <deven298@yahoo.com>
Co-authored-by: Deshraj Yadav <deshrajdry@gmail.com>
Deven Patel 1 year ago
parent
commit
07fb6bee54

+ 12 - 1
embedchain/chunkers/base_chunker.py

@@ -1,5 +1,8 @@
 import hashlib
 
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from embedchain.config.add_config import ChunkerConfig
 from embedchain.helper.json_serializable import JSONSerializable
 from embedchain.models.data_type import DataType
 
@@ -7,7 +10,15 @@ from embedchain.models.data_type import DataType
 class BaseChunker(JSONSerializable):
     def __init__(self, text_splitter):
         """Initialize the chunker."""
-        self.text_splitter = text_splitter
+        if text_splitter is None:
+            config = ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
+            self.text_splitter = RecursiveCharacterTextSplitter(
+                chunk_size=config.chunk_size,
+                chunk_overlap=config.chunk_overlap,
+                length_function=config.length_function,
+            )
+        else:
+            self.text_splitter = text_splitter
         self.data_type = None
 
     def create_chunks(self, loader, src, app_id=None):
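
A minimal sketch (not part of this diff) of the new fallback behaviour, assuming embedchain and langchain are installed:

    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from embedchain.chunkers.base_chunker import BaseChunker

    # With no splitter supplied, BaseChunker now builds a default
    # RecursiveCharacterTextSplitter from ChunkerConfig
    # (chunk_size=1000, chunk_overlap=0, length_function=len).
    chunker = BaseChunker(text_splitter=None)
    print(type(chunker.text_splitter).__name__)  # RecursiveCharacterTextSplitter

    # An explicitly supplied splitter is still used as-is, as before.
    custom = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunker = BaseChunker(custom)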

+ 4 - 0
embedchain/data_formatter/data_formatter.py

@@ -64,6 +64,8 @@ class DataFormatter(JSONSerializable):
             DataType.GMAIL: "embedchain.loaders.gmail.GmailLoader",
             DataType.NOTION: "embedchain.loaders.notion.NotionLoader",
             DataType.SUBSTACK: "embedchain.loaders.substack.SubstackLoader",
+            DataType.GITHUB: "embedchain.loaders.github.GithubLoader",
+            DataType.YOUTUBE_CHANNEL: "embedchain.loaders.youtube_channel.YoutubeChannelLoader",
         }
 
         custom_loaders = set(
@@ -114,6 +116,8 @@ class DataFormatter(JSONSerializable):
             DataType.SLACK: "embedchain.chunkers.slack.SlackChunker",
             DataType.DISCOURSE: "embedchain.chunkers.discourse.DiscourseChunker",
             DataType.SUBSTACK: "embedchain.chunkers.substack.SubstackChunker",
+            DataType.GITHUB: "embedchain.chunkers.base_chunker.BaseChunker",
+            DataType.YOUTUBE_CHANNEL: "embedchain.chunkers.base_chunker.BaseChunker",
         }
 
         if data_type in chunker_classes:
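
Both registries map a DataType to a dotted import path rather than a class, which implies the loader and chunker modules are imported lazily. A rough illustration of how such a path can be resolved (the helper below is hypothetical, not embedchain's actual code):

    import importlib

    def resolve_class(dotted_path: str):
        # e.g. "embedchain.loaders.github.GithubLoader" -> GithubLoader class
        module_path, _, class_name = dotted_path.rpartition(".")
        return getattr(importlib.import_module(module_path), class_name)

    GithubLoader = resolve_class("embedchain.loaders.github.GithubLoader")
    loader = GithubLoader()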

+ 81 - 0
embedchain/loaders/github.py

@@ -0,0 +1,81 @@
+import concurrent.futures
+import hashlib
+import logging
+import os
+
+from embedchain.loaders.base_loader import BaseLoader
+from embedchain.loaders.json import JSONLoader
+from embedchain.loaders.mdx import MdxLoader
+from embedchain.loaders.unstructured_file import UnstructuredLoader
+from embedchain.utils import detect_datatype
+
+
+class GithubLoader(BaseLoader):
+    def load_data(self, repo_url):
+        """Load data from a git repo."""
+        try:
+            from git import Repo
+        except ImportError as e:
+            raise ValueError(
+                "GithubLoader requires extra dependencies. Install with `pip install --upgrade 'embedchain[git]'`"
+            ) from e
+
+        mdx_loader = MdxLoader()
+        json_loader = JSONLoader()
+        unstructured_loader = UnstructuredLoader()
+        data = []
+        data_urls = []
+
+        def _fetch_or_clone_repo(repo_url: str, local_path: str):
+            if os.path.exists(local_path):
+                logging.info("Repository already exists. Fetching updates...")
+                repo = Repo(local_path)
+                origin = repo.remotes.origin
+                origin.fetch()
+                logging.info("Fetch completed.")
+            else:
+                logging.info("Cloning repository...")
+                Repo.clone_from(repo_url, local_path)
+                logging.info("Clone completed.")
+
+        def _load_file(file_path: str):
+            try:
+                data_type = detect_datatype(file_path).value
+            except Exception:
+                data_type = "unstructured"
+
+            if data_type == "mdx":
+                data = mdx_loader.load_data(file_path)
+            elif data_type == "json":
+                data = json_loader.load_data(file_path)
+            else:
+                data = unstructured_loader.load_data(file_path)
+
+            return data.get("data", [])
+
+        def _add_repo_files(repo_path: str):
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future_to_file = {
+                    executor.submit(_load_file, os.path.join(root, filename)): os.path.join(root, filename)
+                    for root, _, files in os.walk(repo_path)
+                    for filename in files
+                }  # noqa: E501
+                for future in concurrent.futures.as_completed(future_to_file):
+                    file = future_to_file[future]
+                    try:
+                        results = future.result()
+                        if results:
+                            data.extend(results)
+                            data_urls.extend([result.get("meta_data").get("url") for result in results])
+                    except Exception as e:
+                        logging.error(f"Failed to process {file}: {e}")
+
+        source_hash = hashlib.sha256(repo_url.encode()).hexdigest()
+        repo_path = f"/tmp/{source_hash}"
+        _fetch_or_clone_repo(repo_url=repo_url, local_path=repo_path)
+        _add_repo_files(repo_path)
+        doc_id = hashlib.sha256((repo_url + ", ".join(data_urls)).encode()).hexdigest()
+        return {
+            "doc_id": doc_id,
+            "data": data,
+        }
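
A minimal usage sketch (not part of this diff), assuming the `git` extra is installed and using embedchain's existing App API; the repository URL is only an example:

    # pip install --upgrade 'embedchain[git]'
    from embedchain import App

    app = App()
    # github.com sources are also auto-detected (see the utils.py change below),
    # so data_type="github" is optional here.
    app.add("https://github.com/embedchain/embedchain", data_type="github")
    print(app.query("What does this repository do?"))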

+ 2 - 2
embedchain/loaders/sitemap.py

@@ -57,8 +57,8 @@ class SitemapLoader(BaseLoader):
                 try:
                     data = future.result()
                     if data:
-                        output.append(data)
+                        output.extend(data)
                 except Exception as e:
                     logging.error(f"Error loading page {link}: {e}")
 
-        return {"doc_id": doc_id, "data": [data[0] for data in output if data]}
+        return {"doc_id": doc_id, "data": output}
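
Since each page loader returns a list of records, `extend` keeps every record flat instead of nesting lists and later dropping all but `data[0]`. A small illustration of the difference:

    page_records = [{"content": "intro"}, {"content": "details"}]

    old, new = [], []
    old.append(page_records)   # nested; the old return kept only the first record per page
    new.extend(page_records)   # flat; the new return keeps every record

    assert [d[0] for d in old] == [{"content": "intro"}]
    assert new == page_records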

+ 70 - 0
embedchain/loaders/youtube_channel.py

@@ -0,0 +1,70 @@
+import concurrent.futures
+import hashlib
+import logging
+
+from embedchain.loaders.base_loader import BaseLoader
+from embedchain.loaders.youtube_video import YoutubeVideoLoader
+
+
+class YoutubeChannelLoader(BaseLoader):
+    """Loader for youtube channel."""
+
+    def load_data(self, channel_name):
+        try:
+            import yt_dlp
+        except ImportError as e:
+            raise ValueError(
+                "YoutubeLoader requires extra dependencies. Install with `pip install --upgrade 'embedchain[youtube_channel]'`"  # noqa: E501
+            ) from e
+
+        data = []
+        data_urls = []
+        youtube_url = f"https://www.youtube.com/{channel_name}/videos"
+        youtube_video_loader = YoutubeVideoLoader()
+
+        def _get_yt_video_links():
+            try:
+                ydl_opts = {
+                    "quiet": True,
+                    "extract_flat": True,
+                }
+                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                    info_dict = ydl.extract_info(youtube_url, download=False)
+                    if "entries" in info_dict:
+                        videos = [entry["url"] for entry in info_dict["entries"]]
+                        return videos
+            except Exception:
+                logging.error(f"Failed to fetch youtube videos for channel: {channel_name}")
+            return []
+
+        def _load_yt_video(video_link):
+            try:
+                each_load_data = youtube_video_loader.load_data(video_link)
+                if each_load_data:
+                    return each_load_data.get("data")
+            except Exception as e:
+                logging.error(f"Failed to load youtube video {video_link}: {e}")
+            return None
+
+        def _add_youtube_channel():
+            video_links = _get_yt_video_links()
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future_to_video = {
+                    executor.submit(_load_yt_video, video_link): video_link for video_link in video_links
+                }  # noqa: E501
+                for future in concurrent.futures.as_completed(future_to_video):
+                    video = future_to_video[future]
+                    try:
+                        results = future.result()
+                        if results:
+                            data.extend(results)
+                            data_urls.extend([result.get("meta_data").get("url") for result in results])
+                    except Exception as e:
+                        logging.error(f"Failed to process youtube video {video}: {e}")
+
+        _add_youtube_channel()
+        doc_id = hashlib.sha256((youtube_url + ", ".join(data_urls)).encode()).hexdigest()
+        return {
+            "doc_id": doc_id,
+            "data": data,
+        }
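
A minimal usage sketch (not part of this diff), assuming the `youtube_channel` extra is installed; the channel handle is only an example and is expanded by the loader to https://www.youtube.com/<channel_name>/videos:

    # pip install --upgrade 'embedchain[youtube_channel]'
    from embedchain import App

    app = App()
    app.add("@OpenAI", data_type="youtube_channel")
    print(app.query("What topics does this channel cover?"))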

+ 4 - 0
embedchain/models/data_type.py

@@ -34,6 +34,8 @@ class IndirectDataType(Enum):
     SLACK = "slack"
     DISCOURSE = "discourse"
     SUBSTACK = "substack"
+    GITHUB = "github"
+    YOUTUBE_CHANNEL = "youtube_channel"
 
 
 class SpecialDataType(Enum):
@@ -67,3 +69,5 @@ class DataType(Enum):
     SLACK = IndirectDataType.SLACK.value
     DISCOURSE = IndirectDataType.DISCOURSE.value
     SUBSTACK = IndirectDataType.SUBSTACK.value
+    GITHUB = IndirectDataType.GITHUB.value
+    YOUTUBE_CHANNEL = IndirectDataType.YOUTUBE_CHANNEL.value

+ 4 - 0
embedchain/utils.py

@@ -255,6 +255,10 @@ def detect_datatype(source: Any) -> DataType:
             logging.debug(f"Source of `{formatted_source}` detected as `docs_site`.")
             return DataType.DOCS_SITE
 
+        if "github.com" in url.netloc:
+            logging.debug(f"Source of `{formatted_source}` detected as `github`.")
+            return DataType.GITHUB
+
         # If none of the above conditions are met, it's a general web page
         logging.debug(f"Source of `{formatted_source}` detected as `web_page`.")
         return DataType.WEB_PAGE
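
A quick check of the new detection branch (the URL is only an example):

    from embedchain.models.data_type import DataType
    from embedchain.utils import detect_datatype

    assert detect_datatype("https://github.com/embedchain/embedchain") == DataType.GITHUB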

+ 118 - 1
poetry.lock

@@ -1691,6 +1691,37 @@ files = [
 [package.dependencies]
 wcwidth = ">=0.2.5"
 
+[[package]]
+name = "gitdb"
+version = "4.0.11"
+description = "Git Object Database"
+optional = true
+python-versions = ">=3.7"
+files = [
+    {file = "gitdb-4.0.11-py3-none-any.whl", hash = "sha256:81a3407ddd2ee8df444cbacea00e2d038e40150acfa3001696fe0dcf1d3adfa4"},
+    {file = "gitdb-4.0.11.tar.gz", hash = "sha256:bf5421126136d6d0af55bc1e7c1af1c397a34f5b7bd79e776cd3e89785c2b04b"},
+]
+
+[package.dependencies]
+smmap = ">=3.0.1,<6"
+
+[[package]]
+name = "gitpython"
+version = "3.1.40"
+description = "GitPython is a Python library used to interact with Git repositories"
+optional = true
+python-versions = ">=3.7"
+files = [
+    {file = "GitPython-3.1.40-py3-none-any.whl", hash = "sha256:cf14627d5a8049ffbf49915732e5eddbe8134c3bdb9d476e6182b676fc573f8a"},
+    {file = "GitPython-3.1.40.tar.gz", hash = "sha256:22b126e9ffb671fdd0c129796343a02bf67bf2994b35449ffc9321aa755e18a4"},
+]
+
+[package.dependencies]
+gitdb = ">=4.0.1,<5"
+
+[package.extras]
+test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest", "pytest-cov", "pytest-instafail", "pytest-subtests", "pytest-sugar"]
+
 [[package]]
 name = "google-api-core"
 version = "2.12.0"
@@ -3380,6 +3411,17 @@ files = [
     {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"},
 ]
 
+[[package]]
+name = "mutagen"
+version = "1.47.0"
+description = "read and write audio tags for many formats"
+optional = true
+python-versions = ">=3.7"
+files = [
+    {file = "mutagen-1.47.0-py3-none-any.whl", hash = "sha256:edd96f50c5907a9539d8e5bba7245f62c9f520aef333d13392a79a4f70aca719"},
+    {file = "mutagen-1.47.0.tar.gz", hash = "sha256:719fadef0a978c31b4cf3c956261b3c58b6948b32023078a2117b1de09f0fc99"},
+]
+
 [[package]]
 name = "mypy-extensions"
 version = "1.0.0"
@@ -4639,6 +4681,47 @@ files = [
     {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"},
 ]
 
+[[package]]
+name = "pycryptodomex"
+version = "3.19.0"
+description = "Cryptographic library for Python"
+optional = true
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+files = [
+    {file = "pycryptodomex-3.19.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff64fd720def623bf64d8776f8d0deada1cc1bf1ec3c1f9d6f5bb5bd098d034f"},
+    {file = "pycryptodomex-3.19.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:61056a1fd3254f6f863de94c233b30dd33bc02f8c935b2000269705f1eeeffa4"},
+    {file = "pycryptodomex-3.19.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:258c4233a3fe5a6341780306a36c6fb072ef38ce676a6d41eec3e591347919e8"},
+    {file = "pycryptodomex-3.19.0-cp27-cp27m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e45bb4635b3c4e0a00ca9df75ef6295838c85c2ac44ad882410cb631ed1eeaa"},
+    {file = "pycryptodomex-3.19.0-cp27-cp27m-musllinux_1_1_aarch64.whl", hash = "sha256:a12144d785518f6491ad334c75ccdc6ad52ea49230b4237f319dbb7cef26f464"},
+    {file = "pycryptodomex-3.19.0-cp27-cp27m-win32.whl", hash = "sha256:1789d89f61f70a4cd5483d4dfa8df7032efab1118f8b9894faae03c967707865"},
+    {file = "pycryptodomex-3.19.0-cp27-cp27m-win_amd64.whl", hash = "sha256:eb2fc0ec241bf5e5ef56c8fbec4a2634d631e4c4f616a59b567947a0f35ad83c"},
+    {file = "pycryptodomex-3.19.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:c9a68a2f7bd091ccea54ad3be3e9d65eded813e6d79fdf4cc3604e26cdd6384f"},
+    {file = "pycryptodomex-3.19.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:8df69e41f7e7015a90b94d1096ec3d8e0182e73449487306709ec27379fff761"},
+    {file = "pycryptodomex-3.19.0-cp27-cp27mu-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:917033016ecc23c8933205585a0ab73e20020fdf671b7cd1be788a5c4039840b"},
+    {file = "pycryptodomex-3.19.0-cp27-cp27mu-musllinux_1_1_aarch64.whl", hash = "sha256:e8e5ecbd4da4157889fce8ba49da74764dd86c891410bfd6b24969fa46edda51"},
+    {file = "pycryptodomex-3.19.0-cp35-abi3-macosx_10_9_universal2.whl", hash = "sha256:a77b79852175064c822b047fee7cf5a1f434f06ad075cc9986aa1c19a0c53eb0"},
+    {file = "pycryptodomex-3.19.0-cp35-abi3-macosx_10_9_x86_64.whl", hash = "sha256:5b883e1439ab63af976656446fb4839d566bb096f15fc3c06b5a99cde4927188"},
+    {file = "pycryptodomex-3.19.0-cp35-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3866d68e2fc345162b1b9b83ef80686acfe5cec0d134337f3b03950a0a8bf56"},
+    {file = "pycryptodomex-3.19.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c74eb1f73f788facece7979ce91594dc177e1a9b5d5e3e64697dd58299e5cb4d"},
+    {file = "pycryptodomex-3.19.0-cp35-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7cb51096a6a8d400724104db8a7e4f2206041a1f23e58924aa3d8d96bcb48338"},
+    {file = "pycryptodomex-3.19.0-cp35-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:a588a1cb7781da9d5e1c84affd98c32aff9c89771eac8eaa659d2760666f7139"},
+    {file = "pycryptodomex-3.19.0-cp35-abi3-musllinux_1_1_i686.whl", hash = "sha256:d4dd3b381ff5a5907a3eb98f5f6d32c64d319a840278ceea1dcfcc65063856f3"},
+    {file = "pycryptodomex-3.19.0-cp35-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:263de9a96d2fcbc9f5bd3a279f14ea0d5f072adb68ebd324987576ec25da084d"},
+    {file = "pycryptodomex-3.19.0-cp35-abi3-win32.whl", hash = "sha256:67c8eb79ab33d0fbcb56842992298ddb56eb6505a72369c20f60bc1d2b6fb002"},
+    {file = "pycryptodomex-3.19.0-cp35-abi3-win_amd64.whl", hash = "sha256:09c9401dc06fb3d94cb1ec23b4ea067a25d1f4c6b7b118ff5631d0b5daaab3cc"},
+    {file = "pycryptodomex-3.19.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:edbe083c299835de7e02c8aa0885cb904a75087d35e7bab75ebe5ed336e8c3e2"},
+    {file = "pycryptodomex-3.19.0-pp27-pypy_73-win32.whl", hash = "sha256:136b284e9246b4ccf4f752d435c80f2c44fc2321c198505de1d43a95a3453b3c"},
+    {file = "pycryptodomex-3.19.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:5d73e9fa3fe830e7b6b42afc49d8329b07a049a47d12e0ef9225f2fd220f19b2"},
+    {file = "pycryptodomex-3.19.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b2f1982c5bc311f0aab8c293524b861b485d76f7c9ab2c3ac9a25b6f7655975"},
+    {file = "pycryptodomex-3.19.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bfb040b5dda1dff1e197d2ef71927bd6b8bfcb9793bc4dfe0bb6df1e691eaacb"},
+    {file = "pycryptodomex-3.19.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:800a2b05cfb83654df80266692f7092eeefe2a314fa7901dcefab255934faeec"},
+    {file = "pycryptodomex-3.19.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:c01678aee8ac0c1a461cbc38ad496f953f9efcb1fa19f5637cbeba7544792a53"},
+    {file = "pycryptodomex-3.19.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2126bc54beccbede6eade00e647106b4f4c21e5201d2b0a73e9e816a01c50905"},
+    {file = "pycryptodomex-3.19.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b801216c48c0886742abf286a9a6b117e248ca144d8ceec1f931ce2dd0c9cb40"},
+    {file = "pycryptodomex-3.19.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:50cb18d4dd87571006fd2447ccec85e6cec0136632a550aa29226ba075c80644"},
+    {file = "pycryptodomex-3.19.0.tar.gz", hash = "sha256:af83a554b3f077564229865c45af0791be008ac6469ef0098152139e6bd4b5b6"},
+]
+
 [[package]]
 name = "pydantic"
 version = "2.4.2"
@@ -6014,6 +6097,17 @@ files = [
 optional = ["SQLAlchemy (>=1.4,<3)", "aiodns (>1.0)", "aiohttp (>=3.7.3,<4)", "boto3 (<=2)", "websocket-client (>=1,<2)", "websockets (>=10,<11)"]
 testing = ["Flask (>=1,<2)", "Flask-Sockets (>=0.2,<1)", "Jinja2 (==3.0.3)", "Werkzeug (<2)", "black (==22.8.0)", "boto3 (<=2)", "click (==8.0.4)", "databases (>=0.5)", "flake8 (>=5,<6)", "itsdangerous (==1.1.0)", "moto (>=3,<4)", "psutil (>=5,<6)", "pytest (>=6.2.5,<7)", "pytest-asyncio (<1)", "pytest-cov (>=2,<3)"]
 
+[[package]]
+name = "smmap"
+version = "5.0.1"
+description = "A pure Python implementation of a sliding window memory map manager"
+optional = true
+python-versions = ">=3.7"
+files = [
+    {file = "smmap-5.0.1-py3-none-any.whl", hash = "sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da"},
+    {file = "smmap-5.0.1.tar.gz", hash = "sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62"},
+]
+
 [[package]]
 name = "sniffio"
 version = "1.3.0"
@@ -7508,6 +7602,27 @@ files = [
 [package.dependencies]
 requests = "*"
 
+[[package]]
+name = "yt-dlp"
+version = "2023.11.16"
+description = "A youtube-dl fork with additional features and patches"
+optional = true
+python-versions = ">=3.7"
+files = [
+    {file = "yt-dlp-2023.11.16.tar.gz", hash = "sha256:f0ccdaf12e08b15902601a4671c7ab12906d7b11de3ae75fa6506811c24ec5da"},
+    {file = "yt_dlp-2023.11.16-py2.py3-none-any.whl", hash = "sha256:0322ba85aa4afdb75f8641ed550e5958964daff034aeb477abb15031fd9a51ed"},
+]
+
+[package.dependencies]
+brotli = {version = "*", markers = "implementation_name == \"cpython\""}
+brotlicffi = {version = "*", markers = "implementation_name != \"cpython\""}
+certifi = "*"
+mutagen = "*"
+pycryptodomex = "*"
+requests = ">=2.31.0,<3"
+urllib3 = ">=1.26.17,<3"
+websockets = "*"
+
 [[package]]
 name = "zipp"
 version = "3.17.0"
@@ -7529,6 +7644,7 @@ community = ["llama-hub"]
 dataloaders = ["beautifulsoup4", "docx2txt", "duckduckgo-search", "pypdf", "pytube", "sentence-transformers", "unstructured"]
 discord = ["discord"]
 elasticsearch = ["elasticsearch"]
+git = ["gitpython"]
 gmail = ["llama-hub", "requests"]
 huggingface-hub = ["huggingface_hub"]
 images = ["ftfy", "pillow", "regex", "torch", "torchvision"]
@@ -7547,8 +7663,9 @@ streamlit = []
 vertexai = ["google-cloud-aiplatform"]
 weaviate = ["weaviate-client"]
 whatsapp = ["flask", "twilio"]
+youtube-channel = ["yt_dlp"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.12"
-content-hash = "fe9ebe5f637303885981d10ace60b955635c7ca7586605546837e59206bfefd7"
+content-hash = "a7282080c7a4379bdc6f33dfe9cae7eb20764aae0176137ba5c7af7cdcc58ede"

+ 8 - 1
pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "embedchain"
-version = "0.1.13"
+version = "0.1.14"
 description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
 authors = [
     "Taranjeet Singh <taranjeet@embedchain.ai>",
@@ -134,6 +134,8 @@ psycopg = { version = "^3.1.12", optional = true }
 psycopg-binary = { version = "^3.1.12", optional = true }
 psycopg-pool = { version = "^3.1.8", optional = true }
 mysql-connector-python = { version = "^8.1.0", optional = true }
+gitpython = { version = "^3.1.38", optional = true }
+yt_dlp = { version = "^2023.11.14", optional = true }
 
 [tool.poetry.group.dev.dependencies]
 black = "^23.3.0"
@@ -190,6 +192,11 @@ gmail = [
 json = ["llama-hub"]
 postgres = ["psycopg", "psycopg-binary", "psycopg-pool"]
 mysql = ["mysql-connector-python"]
+git = ["gitpython"]
+youtube_channel = [
+    "yt_dlp",
+    "youtube-transcripts-api",
+]
 
 [tool.poetry.group.docs.dependencies]