소스 검색

[Improvement] return all the metadata when citations flag is `True` (#1059)

Co-authored-by: Deven Patel <deven298@yahoo.com>
Deven Patel 1 년 전
부모
커밋
19d80914df

+ 3 - 6
docs/api-reference/pipeline/chat.mdx

@@ -53,18 +53,15 @@ print(sources)
 # [
 #    (
 #        'Elon Musk PROFILEElon MuskCEO, Tesla$247.1B$2.3B (0.96%)Real Time Net Worthas of 12/7/23 ...',
-#        'https://www.forbes.com/profile/elon-musk',
-#        '4651b266--4aa78839fe97'
+#        {'url': 'https://www.forbes.com/profile/elon-musk', ...}
 #    ),
 #    (
 #        '74% of the company, which is now called X.Wealth HistoryHOVER TO REVEAL NET WORTH BY YEARForbes ...',
-#        'https://www.forbes.com/profile/elon-musk',
-#        '4651b266--4aa78839fe97'
+#        {'url': 'https://www.forbes.com/profile/elon-musk', ...}
 #    ),
 #    (
 #        'founded in 2002, is worth nearly $150 billion after a $750 million tender offer in June 2023 ...',
-#        'https://www.forbes.com/profile/elon-musk',
-#        '4651b266--4aa78839fe97'
+#        {'url': 'https://www.forbes.com/profile/elon-musk', ...}
 #    )
 # ]
 ```

+ 3 - 6
docs/api-reference/pipeline/query.mdx

@@ -53,18 +53,15 @@ print(sources)
 # [
 #    (
 #        'Elon Musk PROFILEElon MuskCEO, Tesla$247.1B$2.3B (0.96%)Real Time Net Worthas of 12/7/23 ...',
-#        'https://www.forbes.com/profile/elon-musk',
-#        '4651b266--4aa78839fe97'
+#        {'url': 'https://www.forbes.com/profile/elon-musk', ...}
 #    ),
 #    (
 #        '74% of the company, which is now called X.Wealth HistoryHOVER TO REVEAL NET WORTH BY YEARForbes ...',
-#        'https://www.forbes.com/profile/elon-musk',
-#        '4651b266--4aa78839fe97'
+#        {'url': 'https://www.forbes.com/profile/elon-musk', ...}
 #    ),
 #    (
 #        'founded in 2002, is worth nearly $150 billion after a $750 million tender offer in June 2023 ...',
-#        'https://www.forbes.com/profile/elon-musk',
-#        '4651b266--4aa78839fe97'
+#        {'url': 'https://www.forbes.com/profile/elon-musk', ...}
 #    )
 # ]
 ```

+ 8 - 4
docs/api-reference/pipeline/search.mdx

@@ -39,13 +39,17 @@ print(context)
 # [
 #     {
 #         'context': 'Elon Musk PROFILEElon MuskCEO, Tesla$221.9BReal Time Net Worthas of 10/29/23Reflects change since 5 pm ET of prior trading day. 1 in the world todayPhoto by Martin Schoeller for ForbesAbout Elon MuskElon Musk cofounded six companies, including electric car maker Tesla, rocket producer SpaceX and tunneling startup Boring Company.He owns about 21% of Tesla between stock and options, but has pledged more than half his shares as collateral for personal loans of up to $3.5 billion.SpaceX, founded in',
-#         'source': 'https://www.forbes.com/profile/elon-musk',
-#         'document_id': 'some_document_id'
+#         'metadata': {
+#             'url': 'https://www.forbes.com/profile/elon-musk',
+#             'doc_id': 'some_document_id'
+#         }
 #     },
 #     {
 #         'context': 'company, which is now called X.Wealth HistoryHOVER TO REVEAL NET WORTH BY YEARForbes Lists 1Forbes 400 (2023)The Richest Person In Every State (2023) 2Billionaires (2023) 1Innovative Leaders (2019) 25Powerful People (2018) 12Richest In Tech (2017)Global Game Changers (2016)More ListsPersonal StatsAge52Source of WealthTesla, SpaceX, Self MadeSelf-Made Score8Philanthropy Score1ResidenceAustin, TexasCitizenshipUnited StatesMarital StatusSingleChildren11EducationBachelor of Arts/Science, University',
-#         'source': 'https://www.forbes.com/profile/elon-musk',
-#         'document_id': 'some_document_id'
+#         'metadata': {
+#             'url': 'https://www.forbes.com/profile/elon-musk',
+#             'doc_id': 'some_document_id'
+#         }
 #     }
 # ]
 ```

+ 12 - 6
docs/use-cases/semantic-search.mdx

@@ -48,18 +48,24 @@ app.search("Summarize the features of Next.js 14?")
 [
   {
     'context': 'Next.js 14 | Next.jsBack to BlogThursday, October 26th 2023Next.js 14Posted byLee Robinson@leeerobTim Neutkens@timneutkensAs we announced at Next.js Conf, Next.js 14 is our most focused release with: Turbopack: 5,000 tests passing for App & Pages Router 53% faster local server startup 94% faster code updates with Fast Refresh Server Actions (Stable): Progressively enhanced mutations Integrated with caching & revalidating Simple function calls, or works natively with forms Partial Prerendering',
-    'source': 'https://nextjs.org/blog/next-14',
-    'document_id': '6c8d1a7b-ea34-4927-8823-daa29dcfc5af--b83edb69b8fc7e442ff8ca311b48510e6c80bf00caa806b3a6acb34e1bcdd5d5'
+    'metadata': {
+      'url': 'https://nextjs.org/blog/next-14',
+      'doc_id': '6c8d1a7b-ea34-4927-8823-daa29dcfc5af--b83edb69b8fc7e442ff8ca311b48510e6c80bf00caa806b3a6acb34e1bcdd5d5'
+    }
   },
   {
     'context': 'Next.js 13.3 | Next.jsBack to BlogThursday, April 6th 2023Next.js 13.3Posted byDelba de Oliveira@delba_oliveiraTim Neutkens@timneutkensNext.js 13.3 adds popular community-requested features, including: File-Based Metadata API: Dynamically generate sitemaps, robots, favicons, and more. Dynamic Open Graph Images: Generate OG images using JSX, HTML, and CSS. Static Export for App Router: Static / Single-Page Application (SPA) support for Server Components. Parallel Routes and Interception: Advanced',
-    'source': 'https://nextjs.org/blog/next-13-3',
-    'document_id': '6c8d1a7b-ea34-4927-8823-daa29dcfc5af--b83edb69b8fc7e442ff8ca311b48510e6c80bf00caa806b3a6acb34e1bcdd5d5'
+    'metadata': {
+      'url': 'https://nextjs.org/blog/next-13-3',
+      'doc_id': '6c8d1a7b-ea34-4927-8823-daa29dcfc5af--b83edb69b8fc7e442ff8ca311b48510e6c80bf00caa806b3a6acb34e1bcdd5d5'
+    }
   },
   {
     'context': 'Upgrading: Version 14 | Next.js MenuUsing App RouterFeatures available in /appApp Router.UpgradingVersion 14Version 14 Upgrading from 13 to 14 To update to Next.js version 14, run the following command using your preferred package manager: Terminalnpm i next@latest react@latest react-dom@latest eslint-config-next@latest Terminalyarn add next@latest react@latest react-dom@latest eslint-config-next@latest Terminalpnpm up next react react-dom eslint-config-next -latest Terminalbun add next@latest',
-    'source': 'https://nextjs.org/docs/app/building-your-application/upgrading/version-14',
-    'document_id': '6c8d1a7b-ea34-4927-8823-daa29dcfc5af--b83edb69b8fc7e442ff8ca311b48510e6c80bf00caa806b3a6acb34e1bcdd5d5'
+    'metadata': {
+      'url': 'https://nextjs.org/docs/app/building-your-application/upgrading/version-14',
+      'doc_id': '6c8d1a7b-ea34-4927-8823-daa29dcfc5af--b83edb69b8fc7e442ff8ca311b48510e6c80bf00caa806b3a6acb34e1bcdd5d5'
+    }
   }
 ]
 ```

+ 1 - 7
embedchain/pipeline.py

@@ -237,13 +237,7 @@ class Pipeline(EmbedChain):
             )
             result = []
             for c in context:
-                result.append(
-                    {
-                        "context": c[0],
-                        "source": c[1],
-                        "document_id": c[2],
-                    }
-                )
+                result.append({"context": c[0], "metadata": c[1]})
             return result
         else:
             # Make API call to the backend to get the results

+ 1 - 3
embedchain/vectordb/chroma.py

@@ -250,9 +250,7 @@ class ChromaDB(BaseVectorDB):
             context = result[0].page_content
             if citations:
                 metadata = result[0].metadata
-                source = metadata["url"]
-                doc_id = metadata["doc_id"]
-                contexts.append((context, source, doc_id))
+                contexts.append((context, metadata))
             else:
                 contexts.append(context)
         return contexts

+ 2 - 4
embedchain/vectordb/elasticsearch.py

@@ -202,7 +202,7 @@ class ElasticsearchDB(BaseVectorDB):
         if "app_id" in where:
             app_id = where["app_id"]
             query["script_score"]["query"] = {"match": {"metadata.app_id": app_id}}
-        _source = ["text", "metadata.url", "metadata.doc_id"]
+        _source = ["text", "metadata"]
         response = self.client.search(index=self._get_index(), query=query, _source=_source, size=n_results)
         docs = response["hits"]["hits"]
         contexts = []
@@ -210,9 +210,7 @@ class ElasticsearchDB(BaseVectorDB):
             context = doc["_source"]["text"]
             if citations:
                 metadata = doc["_source"]["metadata"]
-                source = metadata["url"]
-                doc_id = metadata["doc_id"]
-                contexts.append(tuple((context, source, doc_id)))
+                contexts.append(tuple((context, metadata)))
             else:
                 contexts.append(context)
         return contexts

+ 1 - 3
embedchain/vectordb/opensearch.py

@@ -218,9 +218,7 @@ class OpenSearchDB(BaseVectorDB):
         for doc in docs:
             context = doc.page_content
             if citations:
-                source = doc.metadata["url"]
-                doc_id = doc.metadata["doc_id"]
-                contexts.append(tuple((context, source, doc_id)))
+                contexts.append(tuple((context, doc.metadata)))
             else:
                 contexts.append(context)
         return contexts

+ 1 - 3
embedchain/vectordb/pinecone.py

@@ -154,9 +154,7 @@ class PineconeDB(BaseVectorDB):
             metadata = doc["metadata"]
             context = metadata["text"]
             if citations:
-                source = metadata["url"]
-                doc_id = metadata["doc_id"]
-                contexts.append(tuple((context, source, doc_id)))
+                contexts.append(tuple((context, metadata)))
             else:
                 contexts.append(context)
         return contexts

+ 1 - 3
embedchain/vectordb/qdrant.py

@@ -219,9 +219,7 @@ class QdrantDB(BaseVectorDB):
             context = result.payload["text"]
             if citations:
                 metadata = result.payload["metadata"]
-                source = metadata["url"]
-                doc_id = metadata["doc_id"]
-                contexts.append(tuple((context, source, doc_id)))
+                contexts.append(tuple((context, metadata)))
             else:
                 contexts.append(context)
         return contexts

+ 1 - 3
embedchain/vectordb/weaviate.py

@@ -271,9 +271,7 @@ class WeaviateDB(BaseVectorDB):
             context = doc["text"]
             if citations:
                 metadata = doc["metadata"][0]
-                source = metadata["url"]
-                doc_id = metadata["doc_id"]
-                contexts.append((context, source, doc_id))
+                contexts.append((context, metadata))
             else:
                 contexts.append(context)
         return contexts

+ 1 - 3
embedchain/vectordb/zilliz.py

@@ -187,9 +187,7 @@ class ZillizVectorDB(BaseVectorDB):
             data = query[0]["entity"]
             context = data["text"]
             if citations:
-                source = data["url"]
-                doc_id = data["doc_id"]
-                contexts.append(tuple((context, source, doc_id)))
+                contexts.append(tuple((context, data)))
             else:
                 contexts.append(context)
         return contexts

+ 4 - 1
tests/vectordb/test_chroma_db.py

@@ -341,7 +341,10 @@ def test_chroma_db_collection_query(app_with_settings):
     data_with_citations = app_with_settings.db.query(
         input_query=[0, 0, 0], where={}, n_results=2, skip_embedding=True, citations=True
     )
-    expected_value_with_citations = [("document", "url_1", "doc_id_1"), ("document2", "url_2", "doc_id_2")]
+    expected_value_with_citations = [
+        ("document", {"url": "url_1", "doc_id": "doc_id_1"}),
+        ("document2", {"url": "url_2", "doc_id": "doc_id_2"}),
+    ]
     assert data_with_citations == expected_value_with_citations
 
     app_with_settings.db.reset()

+ 2 - 2
tests/vectordb/test_elasticsearch_db.py

@@ -66,8 +66,8 @@ class TestEsDB(unittest.TestCase):
 
         results_with_citations = self.db.query(query, n_results=2, where={}, skip_embedding=False, citations=True)
         expected_results_with_citations = [
-            ("This is a document.", "url_1", "doc_id_1"),
-            ("This is another document.", "url_2", "doc_id_2"),
+            ("This is a document.", {"url": "url_1", "doc_id": "doc_id_1"}),
+            ("This is another document.", {"url": "url_2", "doc_id": "doc_id_2"}),
         ]
         self.assertEqual(results_with_citations, expected_results_with_citations)
 

+ 6 - 2
tests/vectordb/test_zilliz_db.py

@@ -150,7 +150,9 @@ class TestZillizDBCollection:
                 output_fields=["text", "url", "doc_id"],
             )
 
-            assert query_result_with_citations == [("result_doc", "url_1", "doc_id_1")]
+            assert query_result_with_citations == [
+                ("result_doc", {"text": "result_doc", "url": "url_1", "doc_id": "doc_id_1"})
+            ]
 
     @patch("embedchain.vectordb.zilliz.MilvusClient", autospec=True)
     @patch("embedchain.vectordb.zilliz.connections", autospec=True)
@@ -202,4 +204,6 @@ class TestZillizDBCollection:
                 output_fields=["text", "url", "doc_id"],
             )
 
-            assert query_result_with_citations == [("result_doc", "url_1", "doc_id_1")]
+            assert query_result_with_citations == [
+                ("result_doc", {"text": "result_doc", "url": "url_1", "doc_id": "doc_id_1"})
+            ]