Przeglądaj źródła

Discord loader (#976)

Sidharth Mohanty 1 rok temu
rodzic
commit
38426a7af1

+ 28 - 0
docs/data-sources/discord.mdx

@@ -0,0 +1,28 @@
+---
+title: "💬 Discord"
+---
+
+To add the messages of a Discord channel to your app, pass the `channel_id` as the source and set the `data_type` to `discord`.
+
+<Note>
+    This loader requires a Discord bot token with permission to read messages and with the Message Content intent enabled.
+    To obtain the token, follow the instructions provided in this tutorial:
+    <a href="https://www.writebots.com/discord-bot-token/">How to Get a Discord Bot Token?</a>.
+</Note>
+
+```python
+import os
+from embedchain import Pipeline as App
+
+# add your discord "BOT" token
+os.environ["DISCORD_TOKEN"] = "xxx"
+
+app = App()
+
+app.add("1177296711023075338", data_type="discord")
+
+response = app.query("What is Joe saying about Elon Musk?")
+
+print(response)
+# Answer: Joe is saying "Elon Musk is a genius".
+```

+ 1 - 0
docs/data-sources/overview.mdx

@@ -24,6 +24,7 @@ Embedchain comes with built-in support for various data sources. We handle the c
   <Card title="🐬 MySQL" href="/data-sources/mysql"></Card>
   <Card title="🤖 Slack" href="/data-sources/slack"></Card>
   <Card title="🗨️ Discourse" href="/data-sources/discourse"></Card>
+  <Card title="💬 Discord" href="/data-sources/discord"></Card>
 </CardGroup>
 
 <br/ >

+ 2 - 1
docs/mint.json

@@ -89,7 +89,8 @@
             "data-sources/openapi",
             "data-sources/youtube-video",
             "data-sources/discourse",
-            "data-sources/substack"
+            "data-sources/substack",
+            "data-sources/discord"
           ]
         },
         "data-sources/data-type-handling"

+ 2 - 0
embedchain/data_formatter/data_formatter.py

@@ -66,6 +66,7 @@ class DataFormatter(JSONSerializable):
             DataType.SUBSTACK: "embedchain.loaders.substack.SubstackLoader",
             DataType.GITHUB: "embedchain.loaders.github.GithubLoader",
             DataType.YOUTUBE_CHANNEL: "embedchain.loaders.youtube_channel.YoutubeChannelLoader",
+            DataType.DISCORD: "embedchain.loaders.discord.DiscordLoader",
         }
 
         custom_loaders = set(
@@ -118,6 +119,7 @@ class DataFormatter(JSONSerializable):
             DataType.SUBSTACK: "embedchain.chunkers.substack.SubstackChunker",
             DataType.GITHUB: "embedchain.chunkers.common_chunker.CommonChunker",
             DataType.YOUTUBE_CHANNEL: "embedchain.chunkers.common_chunker.CommonChunker",
+            DataType.DISCORD: "embedchain.chunkers.common_chunker.CommonChunker",
         }
 
         if data_type in chunker_classes:

+ 150 - 0
embedchain/loaders/discord.py

@@ -0,0 +1,150 @@
+import logging
+import os
+import hashlib
+
+from embedchain.helpers.json_serializable import register_deserializable
+from embedchain.loaders.base_loader import BaseLoader
+
+
+@register_deserializable
+class DiscordLoader(BaseLoader):
+    """
+    Load data from a Discord Channel ID.
+    """
+
+    def __init__(self):
+        if not os.environ.get("DISCORD_TOKEN"):
+            raise ValueError("DISCORD_TOKEN is not set")
+
+        self.token = os.environ.get("DISCORD_TOKEN")
+
+    @staticmethod
+    def _format_message(message):
+        return {
+            "message_id": message.id,
+            "content": message.content,
+            "author": {
+                "id": message.author.id,
+                "name": message.author.name,
+                "discriminator": message.author.discriminator,
+            },
+            "created_at": message.created_at.isoformat(),
+            "attachments": [
+                {
+                    "id": attachment.id,
+                    "filename": attachment.filename,
+                    "size": attachment.size,
+                    "url": attachment.url,
+                    "proxy_url": attachment.proxy_url,
+                    "height": attachment.height,
+                    "width": attachment.width,
+                }
+                for attachment in message.attachments
+            ],
+            "embeds": [
+                {
+                    "title": embed.title,
+                    "type": embed.type,
+                    "description": embed.description,
+                    "url": embed.url,
+                    "timestamp": embed.timestamp.isoformat(),
+                    "color": embed.color,
+                    "footer": {
+                        "text": embed.footer.text,
+                        "icon_url": embed.footer.icon_url,
+                        "proxy_icon_url": embed.footer.proxy_icon_url,
+                    },
+                    "image": {
+                        "url": embed.image.url,
+                        "proxy_url": embed.image.proxy_url,
+                        "height": embed.image.height,
+                        "width": embed.image.width,
+                    },
+                    "thumbnail": {
+                        "url": embed.thumbnail.url,
+                        "proxy_url": embed.thumbnail.proxy_url,
+                        "height": embed.thumbnail.height,
+                        "width": embed.thumbnail.width,
+                    },
+                    "video": {
+                        "url": embed.video.url,
+                        "height": embed.video.height,
+                        "width": embed.video.width,
+                    },
+                    "provider": {
+                        "name": embed.provider.name,
+                        "url": embed.provider.url,
+                    },
+                    "author": {
+                        "name": embed.author.name,
+                        "url": embed.author.url,
+                        "icon_url": embed.author.icon_url,
+                        "proxy_icon_url": embed.author.proxy_icon_url,
+                    },
+                    "fields": [
+                        {
+                            "name": field.name,
+                            "value": field.value,
+                            "inline": field.inline,
+                        }
+                        for field in embed.fields
+                    ],
+                }
+                for embed in message.embeds
+            ],
+        }
+
+    def load_data(self, channel_id: str):
+        """Load data from a Discord Channel ID."""
+        import discord
+
+        messages = []
+
+        class DiscordClient(discord.Client):
+            async def on_ready(self) -> None:
+                logging.info("Logged on as {0}!".format(self.user))
+                try:
+                    channel = self.get_channel(int(channel_id))
+                    if not isinstance(channel, discord.TextChannel):
+                        raise ValueError(
+                            f"Channel {channel_id} is not a text channel. " "Only text channels are supported for now."
+                        )
+                    threads = {}
+
+                    for thread in channel.threads:
+                        threads[thread.id] = thread
+
+                    async for message in channel.history(limit=None):
+                        messages.append(DiscordLoader._format_message(message))
+                        if message.id in threads:
+                            async for thread_message in threads[message.id].history(limit=None):
+                                messages.append(DiscordLoader._format_message(thread_message))
+
+                except Exception as e:
+                    logging.error(e)
+                    await self.close()
+                finally:
+                    await self.close()
+
+        intents = discord.Intents.default()
+        intents.message_content = True
+        client = DiscordClient(intents=intents)
+        client.run(self.token)
+
+        meta_data = {
+            "url": channel_id,
+        }
+
+        messages = str(messages)
+
+        doc_id = hashlib.sha256((messages + channel_id).encode()).hexdigest()
+
+        return {
+            "doc_id": doc_id,
+            "data": [
+                {
+                    "content": messages,
+                    "meta_data": meta_data,
+                }
+            ],
+        }

+ 2 - 0
embedchain/models/data_type.py

@@ -36,6 +36,7 @@ class IndirectDataType(Enum):
     SUBSTACK = "substack"
     GITHUB = "github"
     YOUTUBE_CHANNEL = "youtube_channel"
+    DISCORD = "discord"
 
 
 class SpecialDataType(Enum):
@@ -71,3 +72,4 @@ class DataType(Enum):
     SUBSTACK = IndirectDataType.SUBSTACK.value
     GITHUB = IndirectDataType.GITHUB.value
     YOUTUBE_CHANNEL = IndirectDataType.YOUTUBE_CHANNEL.value
+    DISCORD = IndirectDataType.DISCORD.value