Переглянути джерело

feat: add method - detect format / data_type (#380)

cachho 2 роки тому
батько
коміт
4c8876f032

+ 5 - 5
README.md

@@ -28,8 +28,8 @@ pip install embedchain
   zuck_bot = Llama2App()
 
   # Embed your data
-  zuck_bot.add("youtube_video", "https://www.youtube.com/watch?v=Ff4fRgnuFgQ")
-  zuck_bot.add("web_page", "https://en.wikipedia.org/wiki/Mark_Zuckerberg")
+  zuck_bot.add("https://www.youtube.com/watch?v=Ff4fRgnuFgQ")
+  zuck_bot.add("https://en.wikipedia.org/wiki/Mark_Zuckerberg")
 
   # Nice, your bot is ready now. Start asking questions to your bot.
   zuck_bot.query("Who is Mark Zuckerberg?")
@@ -64,9 +64,9 @@ os.environ["OPENAI_API_KEY"] = "YOUR API KEY"
 elon_bot = App()
 
 # Embed online resources
-elon_bot.add("web_page", "https://en.wikipedia.org/wiki/Elon_Musk")
-elon_bot.add("web_page", "https://tesla.com/elon-musk")
-elon_bot.add("youtube_video", "https://www.youtube.com/watch?v=MxZpaJK74Y4")
+elon_bot.add("https://en.wikipedia.org/wiki/Elon_Musk")
+elon_bot.add("https://tesla.com/elon-musk")
+elon_bot.add("https://www.youtube.com/watch?v=MxZpaJK74Y4")
 
 # Query the bot
 elon_bot.query("How many companies does Elon Musk run?")

+ 6 - 6
docs/advanced/adding_data.mdx

@@ -6,20 +6,20 @@ title: '➕ Adding Data'
 
 - This step assumes that you have already created an `app` instance by either using `App`, `OpenSourceApp` or `CustomApp`. We are calling our app instance as `naval_chat_bot` 🤖
 
-- Now use `.add()` function to add any dataset.
+- Now use `.add` method to add any dataset.
 
 ```python
 # naval_chat_bot = App() or
 # naval_chat_bot = OpenSourceApp()
 
 # Embed Online Resources
-naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44")
-naval_chat_bot.add("pdf_file", "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
-naval_chat_bot.add("web_page", "https://nav.al/feedback")
-naval_chat_bot.add("web_page", "https://nav.al/agi")
+naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44")
+naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
+naval_chat_bot.add("https://nav.al/feedback")
+naval_chat_bot.add("https://nav.al/agi")
 
 # Embed Local Resources
-naval_chat_bot.add_local("qna_pair", ("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."))
+naval_chat_bot.add(("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."))
 ```
 
 The possible formats to add data can be found on the [Supported Data Formats](/advanced/data_types) page.

+ 2 - 2
docs/advanced/app_types.mdx

@@ -35,8 +35,8 @@ os.environ['REPLICATE_API_TOKEN'] = "REPLICATE API TOKEN"
 zuck_bot = Llama2App()
 
 # Embed your data
-zuck_bot.add("youtube_video", "https://www.youtube.com/watch?v=Ff4fRgnuFgQ")
-zuck_bot.add("web_page", "https://en.wikipedia.org/wiki/Mark_Zuckerberg")
+zuck_bot.add("https://www.youtube.com/watch?v=Ff4fRgnuFgQ")
+zuck_bot.add("https://en.wikipedia.org/wiki/Mark_Zuckerberg")
 
 # Nice, your bot is ready now. Start asking questions to your bot.
 zuck_bot.query("Who is Mark Zuckerberg?")

+ 8 - 8
docs/advanced/configuration.mdx

@@ -26,17 +26,17 @@ naval_chat_bot = App(config)
 
 # Example: define your own chunker config for `youtube_video`
 chunker_config = ChunkerConfig(chunk_size=1000, chunk_overlap=100, length_function=len)
-naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44", AddConfig(chunker=chunker_config))
+naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44", AddConfig(chunker=chunker_config))
 
 add_config = AddConfig()
-naval_chat_bot.add("pdf_file", "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf", add_config)
-naval_chat_bot.add("web_page", "https://nav.al/feedback", add_config)
-naval_chat_bot.add("web_page", "https://nav.al/agi", add_config)
+naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf", config=add_config)
+naval_chat_bot.add("https://nav.al/feedback", config=add_config)
+naval_chat_bot.add("https://nav.al/agi", config=add_config)
 
-naval_chat_bot.add_local("qna_pair", ("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."), add_config)
+naval_chat_bot.add(("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."), config=add_config)
 
 query_config = QueryConfig()
-print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?", query_config))
+print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?", config=query_config))
 ```
 
 ### Custom prompt template
@@ -53,7 +53,7 @@ einstein_chat_bot = App()
 
 # Embed Wikipedia page
 page = wikipedia.page("Albert Einstein")
-einstein_chat_bot.add("text", page.content)
+einstein_chat_bot.add(page.content)
 
 # Example: use your own custom template with `$context` and `$query`
 einstein_chat_template = Template("""
@@ -75,7 +75,7 @@ queries = [
         "Why did you divorce your first wife?",
 ]
 for query in queries:
-        response = einstein_chat_bot.query(query, query_config)
+        response = einstein_chat_bot.query(query, config=query_config)
         print("Query: ", query)
         print("Response: ", response)
 

+ 47 - 18
docs/advanced/data_types.mdx

@@ -2,14 +2,40 @@
 title: '📋 Supported data formats'
 ---
 
-Embedchain supports following data formats:
+## Automatic data type detection
+The add method automatically tries to detect the data_type, based on your input for the source argument. So `app.add('https://www.youtube.com/watch?v=dQw4w9WgXcQ')` is enough to embed a YouTube video.
+
+This detection is implemented for all formats. It is based on factors such as whether it's a URL, a local file, the source data type, etc.
+
+### Debugging automatic detection
+
+
+Set `log_level=DEBUG` (in [AppConfig](/advanced/query_configuration#appconfig)) and make sure it's working as intended.
+
+Otherwise, you will not know when, for instance, an invalid filepath is interpreted as raw text instead.
+
+### Forcing a data type
+
+To avoid any issues with the data type detection, you can **force** a data_type by passing it as an argument to the `add` method.
+The examples below show you the keyword to force the respective `data_type`.
+
+Forcing can also be used for edge cases, such as interpreting a sitemap as a web_page, for reading its raw text instead of following links.
+
+## Remote Data Types
+
+<Tip>
+**Use local files in remote data types**
+
+Some data_types are meant for remote content and only work with URLs.
+You can pass local files by formatting the path using the `file:` [URI scheme](https://en.wikipedia.org/wiki/File_URI_scheme), e.g. `file:///info.pdf`.
+</Tip>
 
 ### Youtube video
 
 To add any youtube video to your app, use the data_type (the `data_type` argument of the `.add()` method) as `youtube_video`. Eg:
 
 ```python
-app.add('youtube_video', 'a_valid_youtube_url_here')
+app.add('a_valid_youtube_url_here', data_type='youtube_video')
 ```
 
 ### PDF file
@@ -17,7 +43,7 @@ app.add('youtube_video', 'a_valid_youtube_url_here')
 To add any pdf file, use the data_type as `pdf_file`. Eg:
 
 ```python
-app.add('pdf_file', 'a_valid_url_where_pdf_file_can_be_accessed')
+app.add('a_valid_url_where_pdf_file_can_be_accessed', data_type='pdf_file')
 ```
 
 Note that we do not support password protected pdfs.
@@ -27,7 +53,7 @@ Note that we do not support password protected pdfs.
 To add any web page, use the data_type as `web_page`. Eg:
 
 ```python
-app.add('web_page', 'a_valid_web_page_url')
+app.add('a_valid_web_page_url', data_type='web_page')
 ```
 
 ### Sitemap
@@ -35,15 +61,16 @@ app.add('web_page', 'a_valid_web_page_url')
 Add all web pages from an xml-sitemap. Filters non-text files. Use the data_type as `sitemap`. Eg:
 
 ```python
-app.add('sitemap', 'https://example.com/sitemap.xml')
+app.add('https://example.com/sitemap.xml', data_type='sitemap')
 ```
 
 ### Doc file
 
-To add any doc/docx file, use the data_type as `docx`. Eg:
+To add any doc/docx file, use the data_type as `docx`. `docx` allows remote urls and conventional file paths. Eg:
 
 ```python
-app.add('docx', 'a_local_docx_file_path')
+app.add('https://example.com/content/intro.docx', data_type="docx")
+app.add('content/intro.docx', data_type="docx")
 ```
 
 ### Code documentation website loader
@@ -51,27 +78,29 @@ app.add('docx', 'a_local_docx_file_path')
 To add any code documentation website as a loader, use the data_type as `docs_site`. Eg:
 
 ```python
-app.add("docs_site", "https://docs.embedchain.ai/")
+app.add("https://docs.embedchain.ai/", data_type="docs_site")
 ```
 
 ### Notion
 To use notion you must install the extra dependencies with `pip install embedchain[notion]`.
 
-To load a notion page, use the data_type as `notion`.
+To load a notion page, use the data_type as `notion`. Since it is hard to automatically detect, forcing this is advised.
 The source argument must **end** with the `notion page id`. The id is a 32-character string. Eg:
 
 ```python
-app.add("notion", "cfbc134ca6464fc980d0391613959196")
-app.add("notion", "my-page-cfbc134ca6464fc980d0391613959196")
-app.add("notion", "https://www.notion.so/my-page-cfbc134ca6464fc980d0391613959196")
+app.add("cfbc134ca6464fc980d0391613959196", "notion")
+app.add("my-page-cfbc134ca6464fc980d0391613959196", "notion")
+app.add("https://www.notion.so/my-page-cfbc134ca6464fc980d0391613959196", "notion")
 ```
 
+## Local Data Types
+
 ### Text
 
 To supply your own text, use the data_type as `text` and enter a string. The text is not processed, this can be very versatile. Eg:
 
 ```python
-app.add_local('text', 'Seek wealth, not money or status. Wealth is having assets that earn while you sleep. Money is how we transfer time and wealth. Status is your place in the social hierarchy.')
+app.add('Seek wealth, not money or status. Wealth is having assets that earn while you sleep. Money is how we transfer time and wealth. Status is your place in the social hierarchy.', data_type='text')
 ```
 
 Note: This is not used in the examples because in most cases you will supply a whole paragraph or file, which did not fit.
@@ -81,7 +110,7 @@ Note: This is not used in the examples because in most cases you will supply a w
 To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. Eg:
 
 ```python
-app.add_local('qna_pair', ("Question", "Answer"))
+app.add(("Question", "Answer"), data_type="qna_pair")
 ```
 
 ## Reusing a vector database
@@ -94,8 +123,8 @@ Create a local index:
 from embedchain import App
 
 naval_chat_bot = App()
-naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44")
-naval_chat_bot.add("pdf_file", "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
+naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44")
+naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
 ```
 
 You can reuse the local index with the same code, but without adding new documents:
@@ -107,6 +136,6 @@ naval_chat_bot = App()
 print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?"))
 ```
 
-### More formats (coming soon!)
+## More formats (coming soon!)
 
-- If you want to add any other format, please create an [issue](https://github.com/embedchain/embedchain/issues) and we will add it to the list of supported formats.
+- If you want to add any other format, please create an [issue](https://github.com/embedchain/embedchain/issues) and we will add it to the list of supported formats.

+ 1 - 1
docs/advanced/query_configuration.mdx

@@ -25,7 +25,7 @@ Yes, you are passing `ChunkerConfig` to `AddConfig`, like so:
 ```python
 chunker_config = ChunkerConfig(chunk_size=100)
 add_config = AddConfig(chunker=chunker_config)
-app.add_local("text", "lorem ipsum", config=add_config)
+app.add("lorem ipsum", config=add_config)
 ```
 
 ### ChunkerConfig

+ 8 - 8
docs/introduction.mdx

@@ -7,7 +7,7 @@ description: '📝 Embedchain is a framework to easily create LLM powered bots o
 
 Embedchain abstracts the entire process of loading a dataset, chunking it, creating embeddings, and storing it in a vector database.
 
-You can add a single or multiple datasets using the .add and .add_local functions. Then, simply use the .query function to find answers from the added datasets.
+You can add a single or multiple datasets using the `.add` method. Then, simply use the `.query` method to find answers from the added datasets.
 
 If you want to create a Naval Ravikant bot with a YouTube video, a book in PDF format, two blog posts, and a question and answer pair, all you need to do is add the respective links. Embedchain will take care of the rest, creating a bot for you.
 
@@ -16,13 +16,13 @@ from embedchain import App
 
 naval_chat_bot = App()
 # Embed Online Resources
-naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44")
-naval_chat_bot.add("pdf_file", "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
-naval_chat_bot.add("web_page", "https://nav.al/feedback")
-naval_chat_bot.add("web_page", "https://nav.al/agi")
+naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44")
+naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
+naval_chat_bot.add("https://nav.al/feedback")
+naval_chat_bot.add("https://nav.al/agi")
 
 # Embed Local Resources
-naval_chat_bot.add_local("qna_pair", ("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."))
+naval_chat_bot.add(("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."))
 
 naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?")
 # Answer: Naval argues that humans possess the unique capacity to understand explanations or concepts to the maximum extent possible in this physical reality.
@@ -32,7 +32,7 @@ naval_chat_bot.query("What unique capacity does Naval argue humans possess when
 
 Creating a chat bot over any dataset involves the following steps:
 
-1. Load the data
+1. Detect the data type and load the data
 2. Create meaningful chunks
 3. Create embeddings for each chunk
 4. Store the chunks in a vector database
@@ -53,4 +53,4 @@ The process of loading the dataset and querying involves multiple steps, each wi
 
 Embedchain takes care of all these nuances and provides a simple interface to create bots over any dataset.
 
-In the first release, we make it easier for anyone to get a chatbot over any dataset up and running in less than a minute. Just create an app instance, add the datasets using the `.add()` function, and use the `.query()` function to get the relevant answers.
+In the first release, we make it easier for anyone to get a chatbot over any dataset up and running in less than a minute. Just create an app instance, add the datasets using the `.add` method, and use the `.query` method to get the relevant answers.

+ 1 - 1
docs/mint.json

@@ -32,7 +32,7 @@
     },
     {
       "group": "Advanced",
-      "pages": ["advanced/app_types", "advanced/interface_types", "advanced/adding_data","advanced/data_types", "advanced/query_configuration", "advanced/configuration", "advanced/testing", "advanced/vector_database", "advanced/showcase"]
+      "pages": ["advanced/app_types", "advanced/interface_types", "advanced/adding_data", "advanced/data_types", "advanced/query_configuration", "advanced/configuration", "advanced/testing", "advanced/vector_database", "advanced/showcase"]
     },
     {
       "group": "Examples",

+ 2 - 2
docs/quickstart.mdx

@@ -26,8 +26,8 @@ os.environ["OPENAI_API_KEY"] = "xxx"
 elon_musk_bot = App()
 
 # Embed Online Resources
-elon_musk_bot.add("web_page", "https://en.wikipedia.org/wiki/Elon_Musk")
-elon_musk_bot.add("web_page", "https://www.tesla.com/elon-musk")
+elon_musk_bot.add("https://en.wikipedia.org/wiki/Elon_Musk")
+elon_musk_bot.add("https://www.tesla.com/elon-musk")
 
 response = elon_musk_bot.query("How many companies does Elon Musk run?")
 print(response)

+ 6 - 2
embedchain/chunkers/base_chunker.py

@@ -1,5 +1,7 @@
 import hashlib
 
+from embedchain.models.data_type import DataType
+
 
 class BaseChunker:
     def __init__(self, text_splitter):
@@ -26,7 +28,7 @@ class BaseChunker:
 
             meta_data = data["meta_data"]
             # add data type to meta data to allow query using data type
-            meta_data["data_type"] = self.data_type
+            meta_data["data_type"] = self.data_type.value
             url = meta_data["url"]
 
             chunks = self.get_chunks(content)
@@ -52,8 +54,10 @@ class BaseChunker:
         """
         return self.text_splitter.split_text(content)
 
-    def set_data_type(self, data_type):
+    def set_data_type(self, data_type: DataType):
         """
         set the data type of chunker
         """
         self.data_type = data_type
+
+        # TODO: This should be done during initialization. This means it has to be done in the child classes.

+ 23 - 22
embedchain/data_formatter/data_formatter.py

@@ -15,6 +15,7 @@ from embedchain.loaders.pdf_file import PdfFileLoader
 from embedchain.loaders.sitemap import SitemapLoader
 from embedchain.loaders.web_page import WebPageLoader
 from embedchain.loaders.youtube_video import YoutubeVideoLoader
+from embedchain.models.data_type import DataType
 
 
 class DataFormatter:
@@ -24,11 +25,11 @@ class DataFormatter:
     .add or .add_local method call
     """
 
-    def __init__(self, data_type: str, config: AddConfig):
+    def __init__(self, data_type: DataType, config: AddConfig):
         self.loader = self._get_loader(data_type, config.loader)
         self.chunker = self._get_chunker(data_type, config.chunker)
 
-    def _get_loader(self, data_type, config):
+    def _get_loader(self, data_type: DataType, config):
         """
         Returns the appropriate data loader for the given data type.
 
@@ -37,22 +38,22 @@ class DataFormatter:
         :raises ValueError: If an unsupported data type is provided.
         """
         loaders = {
-            "youtube_video": YoutubeVideoLoader,
-            "pdf_file": PdfFileLoader,
-            "web_page": WebPageLoader,
-            "qna_pair": LocalQnaPairLoader,
-            "text": LocalTextLoader,
-            "docx": DocxFileLoader,
-            "sitemap": SitemapLoader,
-            "docs_site": DocsSiteLoader,
+            DataType.YOUTUBE_VIDEO: YoutubeVideoLoader,
+            DataType.PDF_FILE: PdfFileLoader,
+            DataType.WEB_PAGE: WebPageLoader,
+            DataType.QNA_PAIR: LocalQnaPairLoader,
+            DataType.TEXT: LocalTextLoader,
+            DataType.DOCX: DocxFileLoader,
+            DataType.SITEMAP: SitemapLoader,
+            DataType.DOCS_SITE: DocsSiteLoader,
         }
-        lazy_loaders = ("notion",)
+        lazy_loaders = {DataType.NOTION}
         if data_type in loaders:
             loader_class = loaders[data_type]
             loader = loader_class()
             return loader
         elif data_type in lazy_loaders:
-            if data_type == "notion":
+            if data_type == DataType.NOTION:
                 from embedchain.loaders.notion import NotionLoader
 
                 return NotionLoader()
@@ -61,7 +62,7 @@ class DataFormatter:
         else:
             raise ValueError(f"Unsupported data type: {data_type}")
 
-    def _get_chunker(self, data_type, config):
+    def _get_chunker(self, data_type: DataType, config):
         """
         Returns the appropriate chunker for the given data type.
 
@@ -70,15 +71,15 @@ class DataFormatter:
         :raises ValueError: If an unsupported data type is provided.
         """
         chunker_classes = {
-            "youtube_video": YoutubeVideoChunker,
-            "pdf_file": PdfFileChunker,
-            "web_page": WebPageChunker,
-            "qna_pair": QnaPairChunker,
-            "text": TextChunker,
-            "docx": DocxFileChunker,
-            "sitemap": WebPageChunker,
-            "docs_site": DocsSiteChunker,
-            "notion": NotionChunker,
+            DataType.YOUTUBE_VIDEO: YoutubeVideoChunker,
+            DataType.PDF_FILE: PdfFileChunker,
+            DataType.WEB_PAGE: WebPageChunker,
+            DataType.QNA_PAIR: QnaPairChunker,
+            DataType.TEXT: TextChunker,
+            DataType.DOCX: DocxFileChunker,
+            DataType.WEB_PAGE: WebPageChunker,
+            DataType.DOCS_SITE: DocsSiteChunker,
+            DataType.NOTION: NotionChunker,
         }
         if data_type in chunker_classes:
             chunker_class = chunker_classes[data_type]

+ 88 - 43
embedchain/embedchain.py

@@ -1,9 +1,10 @@
+import hashlib
 import importlib.metadata
 import logging
 import os
 import threading
 import uuid
-from typing import Optional
+from typing import Dict, Optional
 
 import requests
 from dotenv import load_dotenv
@@ -17,6 +18,8 @@ from embedchain.config.apps.BaseAppConfig import BaseAppConfig
 from embedchain.config.QueryConfig import DOCS_SITE_PROMPT_TEMPLATE
 from embedchain.data_formatter import DataFormatter
 from embedchain.loaders.base_loader import BaseLoader
+from embedchain.models.data_type import DataType
+from embedchain.utils import detect_datatype
 
 load_dotenv()
 
@@ -47,27 +50,62 @@ class EmbedChain:
         thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("init",))
         thread_telemetry.start()
 
-    def add(self, data_type, url, metadata=None, config: AddConfig = None):
+    def add(
+        self,
+        source,
+        data_type: Optional[DataType] = None,
+        metadata: Optional[Dict] = None,
+        config: Optional[AddConfig] = None,
+    ):
         """
         Adds the data from the given URL to the vector db.
         Loads the data, chunks it, create embedding for each chunk
         and then stores the embedding to vector database.
 
-        :param data_type: The type of the data to add.
-        :param url: The URL where the data is located.
+        :param source: The data to embed, can be a URL, local file or raw content, depending on the data type.
+        :param data_type: Optional. Automatically detected, but can be forced with this argument.
+        The type of the data to add.
         :param metadata: Optional. Metadata associated with the data source.
         :param config: Optional. The `AddConfig` instance to use as configuration
         options.
+        :return: source_id, a md5-hash of the source, in hexadecimal representation.
         """
         if config is None:
             config = AddConfig()
 
+        try:
+            DataType(source)
+            logging.warning(
+                f"""Starting from version v0.0.39, Embedchain can automatically detect the data type. So, in the `add` method, the argument order has changed. You no longer need to specify '{source}' for the `source` argument. So the code snippet will be `.add("{data_type}", "{source}")`"""  # noqa #E501
+            )
+            logging.warning(
+                "Embedchain is swapping the arguments for you. This functionality might be deprecated in the future, so please adjust your code."  # noqa #E501
+            )
+            source, data_type = data_type, source
+        except ValueError:
+            pass
+
+        if data_type:
+            try:
+                data_type = DataType(data_type)
+            except ValueError:
+                raise ValueError(
+                    f"Invalid data_type: '{data_type}'.",
+                    f"Please use one of the following: {[data_type.value for data_type in DataType]}",
+                ) from None
+        if not data_type:
+            data_type = detect_datatype(source)
+
+        # `source_id` is the hash of the source argument
+        hash_object = hashlib.md5(str(source).encode("utf-8"))
+        source_id = hash_object.hexdigest()
+
         data_formatter = DataFormatter(data_type, config)
-        self.user_asks.append([data_type, url, metadata])
+        self.user_asks.append([source, data_type.value, metadata])
         documents, _metadatas, _ids, new_chunks = self.load_and_embed(
-            data_formatter.loader, data_formatter.chunker, url, metadata
+            data_formatter.loader, data_formatter.chunker, source, metadata, source_id
         )
-        if data_type in ("docs_site",):
+        if data_type in {DataType.DOCS_SITE}:
             self.is_docs_site_instance = True
 
         # Send anonymous telemetry
@@ -75,41 +113,35 @@ class EmbedChain:
             # it's quicker to check the variable twice than to count words when they won't be submitted.
             word_count = sum([len(document.split(" ")) for document in documents])
 
-            extra_metadata = {"data_type": data_type, "word_count": word_count, "chunks_count": new_chunks}
+            extra_metadata = {"data_type": data_type.value, "word_count": word_count, "chunks_count": new_chunks}
             thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("add", extra_metadata))
             thread_telemetry.start()
 
-    def add_local(self, data_type, content, metadata=None, config: AddConfig = None):
+        return source_id
+
+    def add_local(self, source, data_type=None, metadata=None, config: AddConfig = None):
         """
-        Adds the data you supply to the vector db.
+        Warning:
+            This method is deprecated and will be removed in future versions. Use `add` instead.
+
+        Adds the data from the given URL to the vector db.
         Loads the data, chunks it, create embedding for each chunk
         and then stores the embedding to vector database.
 
-        :param data_type: The type of the data to add.
-        :param content: The local data. Refer to the `README` for formatting.
+        :param source: The data to embed, can be a URL, local file or raw content, depending on the data type.
+        :param data_type: Optional. Automatically detected, but can be forced with this argument.
+        The type of the data to add.
         :param metadata: Optional. Metadata associated with the data source.
-        :param config: Optional. The `AddConfig` instance to use as
-        configuration options.
+        :param config: Optional. The `AddConfig` instance to use as configuration
+        options.
+        :return: md5-hash of the source, in hexadecimal representation.
         """
-        if config is None:
-            config = AddConfig()
-
-        data_formatter = DataFormatter(data_type, config)
-        self.user_asks.append([data_type, content])
-        documents, _metadatas, _ids, new_chunks = self.load_and_embed(
-            data_formatter.loader, data_formatter.chunker, content, metadata
+        logging.warning(
+            "The `add_local` method is deprecated and will be removed in future versions. Please use the `add` method for both local and remote files."  # noqa: E501
         )
+        return self.add(source=source, data_type=data_type, metadata=metadata, config=config)
 
-        # Send anonymous telemetry
-        if self.config.collect_metrics:
-            # it's quicker to check the variable twice than to count words when they won't be submitted.
-            word_count = sum([len(document.split(" ")) for document in documents])
-
-            extra_metadata = {"data_type": data_type, "word_count": word_count, "chunks_count": new_chunks}
-            thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("add_local", extra_metadata))
-            thread_telemetry.start()
-
-    def load_and_embed(self, loader: BaseLoader, chunker: BaseChunker, src, metadata=None):
+    def load_and_embed(self, loader: BaseLoader, chunker: BaseChunker, src, metadata=None, source_id=None):
         """
         Loads the data from the given URL, chunks it, and adds it to database.
 
@@ -118,12 +150,16 @@ class EmbedChain:
         :param src: The data to be handled by the loader. Can be a URL for
         remote sources or local content for local loaders.
         :param metadata: Optional. Metadata associated with the data source.
+        :param source_id: Hexadecimal hash of the source.
         :return: (List) documents (embedded text), (List) metadata, (list) ids, (int) number of chunks
         """
         embeddings_data = chunker.create_chunks(loader, src)
+
+        # spread chunking results
         documents = embeddings_data["documents"]
         metadatas = embeddings_data["metadatas"]
         ids = embeddings_data["ids"]
+
         # get existing ids, and discard doc if any common id exist.
         where = {"app_id": self.config.id} if self.config.id is not None else {}
         # where={"url": src}
@@ -144,22 +180,31 @@ class EmbedChain:
             ids = list(data_dict.keys())
             documents, metadatas = zip(*data_dict.values())
 
-        # Add app id in metadatas so that they can be queried on later
-        if self.config.id is not None:
-            metadatas = [{**m, "app_id": self.config.id} for m in metadatas]
+        # Loop though all metadatas and add extras.
+        new_metadatas = []
+        for m in metadatas:
+            # Add app id in metadatas so that they can be queried on later
+            if self.config.id:
+                m["app_id"] = self.config.id
 
-        # FIXME: Fix the error handling logic when metadatas or metadata is None
-        metadatas = metadatas if metadatas else []
-        metadata = metadata if metadata else {}
-        chunks_before_addition = self.count()
+            # Add hashed source
+            m["hash"] = source_id
 
-        # Add metadata to each document
-        metadatas_with_metadata = [{**meta, **metadata} for meta in metadatas]
+            # Note: Metadata is the function argument
+            if metadata:
+                # Spread whatever is in metadata into the new object.
+                m.update(metadata)
+
+            new_metadatas.append(m)
+        metadatas = new_metadatas
+
+        # Count before, to calculate a delta in the end.
+        chunks_before_addition = self.count()
 
-        self.db.add(documents=documents, metadatas=metadatas_with_metadata, ids=ids)
+        self.db.add(documents=documents, metadatas=metadatas, ids=ids)
         count_new_chunks = self.count() - chunks_before_addition
-        print((f"Successfully saved {src}. New chunks count: {count_new_chunks}"))
-        return list(documents), metadatas_with_metadata, ids, count_new_chunks
+        print((f"Successfully saved {src} ({chunker.data_type}). New chunks count: {count_new_chunks}"))
+        return list(documents), metadatas, ids, count_new_chunks
 
     def _format_result(self, results):
         return [

+ 13 - 0
embedchain/models/data_type.py

@@ -0,0 +1,13 @@
+from enum import Enum
+
+
+class DataType(Enum):
+    YOUTUBE_VIDEO = "youtube_video"
+    PDF_FILE = "pdf_file"
+    WEB_PAGE = "web_page"
+    SITEMAP = "sitemap"
+    DOCX = "docx"
+    DOCS_SITE = "docs_site"
+    TEXT = "text"
+    QNA_PAIR = "qna_pair"
+    NOTION = "notion"

+ 114 - 0
embedchain/utils.py

@@ -1,6 +1,10 @@
 import logging
+import os
 import re
 import string
+from typing import Any
+
+from embedchain.models.data_type import DataType
 
 
 def clean_string(text):
@@ -89,3 +93,113 @@ def use_pysqlite3():
                 "Error:",
                 e,
             )
+        __import__("pysqlite3")
+        sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
+        # Let the user know what happened.
+        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
+        print(
+            f"{current_time} [embedchain] [INFO]",
+            "Swapped std-lib sqlite3 with pysqlite3 for ChromaDb compatibility.",
+            f"Your original version was {sqlite3.sqlite_version}.",
+        )
+
+
+def format_source(source: str, limit: int = 20) -> str:
+    """
+    Format a string to only take the first x and last x letters.
+    This makes it easier to display a URL, keeping familiarity while ensuring a consistent length.
+    If the string is too short, it is not sliced.
+    """
+    if len(source) > 2 * limit:
+        return source[:limit] + "..." + source[-limit:]
+    return source
+
+
+def detect_datatype(source: Any) -> DataType:
+    """
+    Automatically detect the datatype of the given source.
+
+    :param source: the source to base the detection on
+    :return: data_type string
+    """
+    from urllib.parse import urlparse
+
+    try:
+        if not isinstance(source, str):
+            raise ValueError("Source is not a string and thus cannot be a URL.")
+        url = urlparse(source)
+        # Check if both scheme and netloc are present. Local file system URIs are acceptable too.
+        if not all([url.scheme, url.netloc]) and url.scheme != "file":
+            raise ValueError("Not a valid URL.")
+    except ValueError:
+        url = False
+
+    formatted_source = format_source(str(source), 30)
+
+    if url:
+        from langchain.document_loaders.youtube import \
+            ALLOWED_NETLOCK as YOUTUBE_ALLOWED_NETLOCS
+
+        if url.netloc in YOUTUBE_ALLOWED_NETLOCS:
+            logging.debug(f"Source of `{formatted_source}` detected as `youtube_video`.")
+            return DataType.YOUTUBE_VIDEO
+
+        if url.netloc in {"notion.so", "notion.site"}:
+            logging.debug(f"Source of `{formatted_source}` detected as `notion`.")
+            return DataType.NOTION
+
+        if url.path.endswith(".pdf"):
+            logging.debug(f"Source of `{formatted_source}` detected as `pdf_file`.")
+            return DataType.PDF_FILE
+
+        if url.path.endswith(".xml"):
+            logging.debug(f"Source of `{formatted_source}` detected as `sitemap`.")
+            return DataType.SITEMAP
+
+        if url.path.endswith(".docx"):
+            logging.debug(f"Source of `{formatted_source}` detected as `docx`.")
+            return DataType.DOCX
+
+        if "docs" in url.netloc or ("docs" in url.path and url.scheme != "file"):
+            # `docs_site` detection via path is not accepted for local filesystem URIs,
+            # because that would mean all paths that contain `docs` are now doc sites, which is too aggressive.
+            logging.debug(f"Source of `{formatted_source}` detected as `docs_site`.")
+            return DataType.DOCS_SITE
+
+        # If none of the above conditions are met, it's a general web page
+        logging.debug(f"Source of `{formatted_source}` detected as `web_page`.")
+        return DataType.WEB_PAGE
+
+    elif not isinstance(source, str):
+        # For datatypes where source is not a string.
+
+        if isinstance(source, tuple) and len(source) == 2 and isinstance(source[0], str) and isinstance(source[1], str):
+            logging.debug(f"Source of `{formatted_source}` detected as `qna_pair`.")
+            return DataType.QNA_PAIR
+
+        # Raise an error if it isn't a string and also not a valid non-string type (one of the previous).
+        # We could stringify it, but it is better to raise an error and let the user decide how they want to do that.
+        raise TypeError(
+            "Source is not a string and a valid non-string type could not be detected. If you want to embed it, please stringify it, for instance by using `str(source)` or `(', ').join(source)`."  # noqa: E501
+        )
+
+    elif os.path.isfile(source):
+        # For datatypes that support conventional file references.
+        # Note: checking for string is not necessary anymore.
+
+        if source.endswith(".docx"):
+            logging.debug(f"Source of `{formatted_source}` detected as `docx`.")
+            return DataType.DOCX
+
+        # If the source is a valid file, that's not detectable as a type, an error is raised.
+        # It does not fallback to text.
+        raise ValueError(
+            "Source points to a valid file, but based on the filename, no `data_type` can be detected. Please be aware, that not all data_types allow conventional file references, some require the use of the `file URI scheme`. Please refer to the embedchain documentation (https://docs.embedchain.ai/advanced/data_types#remote-data-types)."  # noqa: E501
+        )
+
+    else:
+        # Source is not a URL.
+
+        # Use text as final fallback.
+        logging.debug(f"Source of `{formatted_source}` detected as `text`.")
+        return DataType.TEXT

+ 7 - 0
tests/chunkers/test_text.py

@@ -4,6 +4,7 @@ import unittest
 
 from embedchain.chunkers.text import TextChunker
 from embedchain.config import ChunkerConfig
+from embedchain.models.data_type import DataType
 
 
 class TestTextChunker(unittest.TestCase):
@@ -15,6 +16,8 @@ class TestTextChunker(unittest.TestCase):
         chunker_config = ChunkerConfig(chunk_size=10, chunk_overlap=0, length_function=len)
         chunker = TextChunker(config=chunker_config)
         text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
+        # Data type must be set manually in the test
+        chunker.set_data_type(DataType.TEXT)
 
         result = chunker.create_chunks(MockLoader(), text)
 
@@ -31,6 +34,8 @@ class TestTextChunker(unittest.TestCase):
         chunker_config = ChunkerConfig(chunk_size=9999999999, chunk_overlap=0, length_function=len)
         chunker = TextChunker(config=chunker_config)
         text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
+        # Data type must be set manually in the test
+        chunker.set_data_type(DataType.TEXT)
 
         result = chunker.create_chunks(MockLoader(), text)
 
@@ -46,6 +51,8 @@ class TestTextChunker(unittest.TestCase):
         chunker = TextChunker(config=chunker_config)
         # We can't test with lorem ipsum because chunks are deduped, so would be recurring characters.
         text = """0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"""
+        # Data type must be set manually in the test
+        chunker.set_data_type(DataType.TEXT)
 
         result = chunker.create_chunks(MockLoader(), text)
 

+ 11 - 2
tests/embedchain/test_add.py

@@ -23,5 +23,14 @@ class TestApp(unittest.TestCase):
         The Collection.add method from the chromadb library is mocked during this test to isolate the behavior of the
         'add' method.
         """
-        self.app.add("web_page", "https://example.com", {"meta": "meta-data"})
-        self.assertEqual(self.app.user_asks, [["web_page", "https://example.com", {"meta": "meta-data"}]])
+        self.app.add("https://example.com", metadata={"meta": "meta-data"})
+        self.assertEqual(self.app.user_asks, [["https://example.com", "web_page", {"meta": "meta-data"}]])
+
+    @patch("chromadb.api.models.Collection.Collection.add", MagicMock)
+    def test_add_forced_type(self):
+        """
+        Test that you can also force a data_type with `add`.
+
+        The source is a URL, but `data_type="text"` is passed explicitly; the test expects the
+        explicit value, not a detected one, to be recorded in `user_asks`.
+        """
+        data_type = "text"
+        self.app.add("https://example.com", data_type=data_type, metadata={"meta": "meta-data"})
+        # The recorded entry is expected in the order [source, data_type, metadata].
+        self.assertEqual(self.app.user_asks, [["https://example.com", data_type, {"meta": "meta-data"}]])

+ 1 - 1
tests/embedchain/test_embedchain.py

@@ -31,7 +31,7 @@ class TestChromaDbHostsLoglevel(unittest.TestCase):
 
         knowledge = "lorem ipsum dolor sit amet, consectetur adipiscing"
 
-        app.add_local("text", knowledge)
+        app.add(knowledge, data_type="text")
 
         app.query("What text did I give you?")
         app.chat("What text did I give you?")

+ 129 - 0
tests/embedchain/test_utils.py

@@ -0,0 +1,129 @@
+import tempfile
+import unittest
+from unittest.mock import patch
+
+from embedchain.models.data_type import DataType
+from embedchain.utils import detect_datatype
+
+
+class TestApp(unittest.TestCase):
+    """Test that the datatype detection is working, based on the input."""
+
+    def test_detect_datatype_youtube(self):
+        self.assertEqual(detect_datatype("https://www.youtube.com/watch?v=dQw4w9WgXcQ"), DataType.YOUTUBE_VIDEO)
+        self.assertEqual(detect_datatype("https://m.youtube.com/watch?v=dQw4w9WgXcQ"), DataType.YOUTUBE_VIDEO)
+        self.assertEqual(
+            detect_datatype("https://www.youtube-nocookie.com/watch?v=dQw4w9WgXcQ"), DataType.YOUTUBE_VIDEO
+        )
+        self.assertEqual(detect_datatype("https://vid.plus/watch?v=dQw4w9WgXcQ"), DataType.YOUTUBE_VIDEO)
+        self.assertEqual(detect_datatype("https://youtu.be/dQw4w9WgXcQ"), DataType.YOUTUBE_VIDEO)
+
+    def test_detect_datatype_local_file(self):
+        self.assertEqual(detect_datatype("file:///home/user/file.txt"), DataType.WEB_PAGE)
+
+    def test_detect_datatype_pdf(self):
+        self.assertEqual(detect_datatype("https://www.example.com/document.pdf"), DataType.PDF_FILE)
+
+    def test_detect_datatype_local_pdf(self):
+        self.assertEqual(detect_datatype("file:///home/user/document.pdf"), DataType.PDF_FILE)
+
+    def test_detect_datatype_xml(self):
+        self.assertEqual(detect_datatype("https://www.example.com/sitemap.xml"), DataType.SITEMAP)
+
+    def test_detect_datatype_local_xml(self):
+        self.assertEqual(detect_datatype("file:///home/user/sitemap.xml"), DataType.SITEMAP)
+
+    def test_detect_datatype_docx(self):
+        self.assertEqual(detect_datatype("https://www.example.com/document.docx"), DataType.DOCX)
+
+    def test_detect_datatype_local_docx(self):
+        self.assertEqual(detect_datatype("file:///home/user/document.docx"), DataType.DOCX)
+
+    @patch("os.path.isfile")
+    def test_detect_datatype_regular_filesystem_docx(self, mock_isfile):
+        with tempfile.NamedTemporaryFile(suffix=".docx", delete=True) as tmp:
+            mock_isfile.return_value = True
+            self.assertEqual(detect_datatype(tmp.name), DataType.DOCX)
+
+    def test_detect_datatype_docs_site(self):
+        self.assertEqual(detect_datatype("https://docs.example.com"), DataType.DOCS_SITE)
+
+    def test_detect_datatype_docs_sitein_path(self):
+        self.assertEqual(detect_datatype("https://www.example.com/docs/index.html"), DataType.DOCS_SITE)
+        self.assertNotEqual(detect_datatype("file:///var/www/docs/index.html"), DataType.DOCS_SITE)  # NOT equal
+
+    def test_detect_datatype_web_page(self):
+        self.assertEqual(detect_datatype("https://nav.al/agi"), DataType.WEB_PAGE)
+
+    def test_detect_datatype_invalid_url(self):
+        self.assertEqual(detect_datatype("not a url"), DataType.TEXT)
+
+    def test_detect_datatype_qna_pair(self):
+        self.assertEqual(
+            detect_datatype(("Question?", "Answer. Content of the string is irrelevant.")), DataType.QNA_PAIR
+        )  #
+
+    def test_detect_datatype_qna_pair_types(self):
+        """Test that a QnA pair needs to be a tuple of length two, and both items have to be strings."""
+        with self.assertRaises(TypeError):
+            self.assertNotEqual(
+                detect_datatype(("How many planets are in our solar system?", 8)), DataType.QNA_PAIR
+            )  # NOT equal
+
+    def test_detect_datatype_text(self):
+        self.assertEqual(detect_datatype("Just some text."), DataType.TEXT)
+
+    def test_detect_datatype_non_string_error(self):
+        """Test type error if the value passed is not a string, and not a valid non-string data_type"""
+        with self.assertRaises(TypeError):
+            detect_datatype(["foo", "bar"])
+
+    @patch("os.path.isfile")
+    def test_detect_datatype_regular_filesystem_file_not_detected(self, mock_isfile):
+        """Test error if a valid file is referenced, but it isn't a valid data_type"""
+        with tempfile.NamedTemporaryFile(suffix=".txt", delete=True) as tmp:
+            mock_isfile.return_value = True
+            with self.assertRaises(ValueError):
+                detect_datatype(tmp.name)
+
+    def test_detect_datatype_regular_filesystem_no_file(self):
+        """Test that if a filepath is not actually an existing file, it is not handled as a file path."""
+        self.assertEqual(detect_datatype("/var/not-an-existing-file.txt"), DataType.TEXT)
+
+    def test_doc_examples_quickstart(self):
+        """Test examples used in the documentation."""
+        self.assertEqual(detect_datatype("https://en.wikipedia.org/wiki/Elon_Musk"), DataType.WEB_PAGE)
+        self.assertEqual(detect_datatype("https://www.tesla.com/elon-musk"), DataType.WEB_PAGE)
+
+    def test_doc_examples_introduction(self):
+        """Test examples used in the documentation."""
+        self.assertEqual(detect_datatype("https://www.youtube.com/watch?v=3qHkcs3kG44"), DataType.YOUTUBE_VIDEO)
+        self.assertEqual(
+            detect_datatype(
+                "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf"
+            ),
+            DataType.PDF_FILE,
+        )
+        self.assertEqual(detect_datatype("https://nav.al/feedback"), DataType.WEB_PAGE)
+
+    def test_doc_examples_app_types(self):
+        """Test examples used in the documentation."""
+        self.assertEqual(detect_datatype("https://www.youtube.com/watch?v=Ff4fRgnuFgQ"), DataType.YOUTUBE_VIDEO)
+        self.assertEqual(detect_datatype("https://en.wikipedia.org/wiki/Mark_Zuckerberg"), DataType.WEB_PAGE)
+
+    def test_doc_examples_configuration(self):
+        """Test examples used in the documentation."""
+        import subprocess
+        import sys
+
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "wikipedia"])
+        import wikipedia
+
+        page = wikipedia.page("Albert Einstein")
+        # TODO: Add a wikipedia type, so wikipedia is a dependency and we don't need this slow test.
+        # (timings: import: 1.4s, fetch wiki: 0.7s)
+        self.assertEqual(detect_datatype(page.content), DataType.TEXT)
+
+
+if __name__ == "__main__":
+    unittest.main()