# pipeline.py (13 KB)
import ast
import json
import logging
import os
import sqlite3
import uuid
from typing import Optional

import requests
import yaml
from fastapi import FastAPI, HTTPException

from embedchain import Client
from embedchain.config import PipelineConfig
from embedchain.embedchain import CONFIG_DIR, EmbedChain
from embedchain.embedder.base import BaseEmbedder
from embedchain.embedder.openai import OpenAIEmbedder
from embedchain.factory import EmbedderFactory, LlmFactory, VectorDBFactory
from embedchain.helper.json_serializable import register_deserializable
from embedchain.llm.base import BaseLlm
from embedchain.llm.openai import OpenAILlm
from embedchain.vectordb.base import BaseVectorDB
from embedchain.vectordb.chroma import ChromaDB
  21. SQLITE_PATH = os.path.join(CONFIG_DIR, "embedchain.db")
  22. @register_deserializable
  23. class Pipeline(EmbedChain):
  24. """
  25. EmbedChain pipeline lets you create a LLM powered app for your unstructured
  26. data by defining a pipeline with your chosen data source, embedding model,
  27. and vector database.
  28. """
  29. def __init__(
  30. self,
  31. config: PipelineConfig = None,
  32. db: BaseVectorDB = None,
  33. embedding_model: BaseEmbedder = None,
  34. llm: BaseLlm = None,
  35. yaml_path: str = None,
  36. log_level=logging.INFO,
  37. auto_deploy: bool = False,
  38. ):
  39. """
  40. Initialize a new `App` instance.
  41. :param config: Configuration for the pipeline, defaults to None
  42. :type config: PipelineConfig, optional
  43. :param db: The database to use for storing and retrieving embeddings, defaults to None
  44. :type db: BaseVectorDB, optional
  45. :param embedding_model: The embedding model used to calculate embeddings, defaults to None
  46. :type embedding_model: BaseEmbedder, optional
  47. :param llm: The LLM model used to calculate embeddings, defaults to None
  48. :type llm: BaseLlm, optional
  49. :param yaml_path: Path to the YAML configuration file, defaults to None
  50. :type yaml_path: str, optional
  51. :param log_level: Log level to use, defaults to logging.INFO
  52. :type log_level: int, optional
  53. :param auto_deploy: Whether to deploy the pipeline automatically, defaults to False
  54. :type auto_deploy: bool, optional
  55. :raises Exception: If an error occurs while creating the pipeline
  56. """
  57. logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
  58. self.logger = logging.getLogger(__name__)
  59. self.auto_deploy = auto_deploy
  60. # Store the yaml config as an attribute to be able to send it
  61. self.yaml_config = None
  62. self.client = None
  63. # pipeline_id from the backend
  64. self.id = None
  65. if yaml_path:
  66. with open(yaml_path, "r") as file:
  67. config_data = yaml.safe_load(file)
  68. self.yaml_config = config_data
  69. self.config = config or PipelineConfig()
  70. self.name = self.config.name
  71. self.config.id = self.local_id = str(uuid.uuid4()) if self.config.id is None else self.config.id
  72. self.embedding_model = embedding_model or OpenAIEmbedder()
  73. self.db = db or ChromaDB()
  74. self.llm = llm or OpenAILlm()
  75. self._init_db()
  76. # setup user id and directory
  77. self.u_id = self._load_or_generate_user_id()
  78. # Establish a connection to the SQLite database
  79. self.connection = sqlite3.connect(SQLITE_PATH)
  80. self.cursor = self.connection.cursor()
  81. # Create the 'data_sources' table if it doesn't exist
  82. self.cursor.execute(
  83. """
  84. CREATE TABLE IF NOT EXISTS data_sources (
  85. pipeline_id TEXT,
  86. hash TEXT,
  87. type TEXT,
  88. value TEXT,
  89. metadata TEXT,
  90. is_uploaded INTEGER DEFAULT 0,
  91. PRIMARY KEY (pipeline_id, hash)
  92. )
  93. """
  94. )
  95. self.connection.commit()
  96. self.user_asks = [] # legacy defaults
  97. if self.auto_deploy:
  98. self.deploy()
  99. def _init_db(self):
  100. """
  101. Initialize the database.
  102. """
  103. self.db._set_embedder(self.embedding_model)
  104. self.db._initialize()
  105. self.db.set_collection_name(self.db.config.collection_name)
  106. def _init_client(self):
  107. """
  108. Initialize the client.
  109. """
  110. config = Client.load_config()
  111. if config.get("api_key"):
  112. self.client = Client()
  113. else:
  114. api_key = input(
  115. "🔑 Enter your Embedchain API key. You can find the API key at https://app.embedchain.ai/settings/keys/ \n" # noqa: E501
  116. )
  117. self.client = Client(api_key=api_key)
  118. def _create_pipeline(self):
  119. """
  120. Create a pipeline on the platform.
  121. """
  122. print("🛠️ Creating pipeline on the platform...")
  123. # self.yaml_config is a dict. Pass it inside the key 'yaml_config' to the backend
  124. payload = {
  125. "yaml_config": json.dumps(self.yaml_config),
  126. "name": self.name,
  127. "local_id": self.local_id,
  128. }
  129. url = f"{self.client.host}/api/v1/pipelines/cli/create/"
  130. r = requests.post(
  131. url,
  132. json=payload,
  133. headers={"Authorization": f"Token {self.client.api_key}"},
  134. )
  135. if r.status_code not in [200, 201]:
  136. raise Exception(f"❌ Error occurred while creating pipeline. API response: {r.text}")
  137. print(
  138. f"🎉🎉🎉 Pipeline created successfully! View your pipeline: https://app.embedchain.ai/pipelines/{r.json()['id']}\n" # noqa: E501
  139. )
  140. return r.json()
  141. def _get_presigned_url(self, data_type, data_value):
  142. payload = {"data_type": data_type, "data_value": data_value}
  143. r = requests.post(
  144. f"{self.client.host}/api/v1/pipelines/{self.id}/cli/presigned_url/",
  145. json=payload,
  146. headers={"Authorization": f"Token {self.client.api_key}"},
  147. )
  148. r.raise_for_status()
  149. return r.json()
  150. def search(self, query, num_documents=3):
  151. """
  152. Search for similar documents related to the query in the vector database.
  153. """
  154. # TODO: Search will call the endpoint rather than fetching the data from the db itself when deploy=True.
  155. if self.id is None:
  156. where = {"app_id": self.local_id}
  157. return self.db.query(
  158. query,
  159. n_results=num_documents,
  160. where=where,
  161. skip_embedding=False,
  162. )
  163. else:
  164. # Make API call to the backend to get the results
  165. NotImplementedError("Search is not implemented yet for the prod mode.")
  166. def _upload_file_to_presigned_url(self, presigned_url, file_path):
  167. try:
  168. with open(file_path, "rb") as file:
  169. response = requests.put(presigned_url, data=file)
  170. response.raise_for_status()
  171. return response.status_code == 200
  172. except Exception as e:
  173. self.logger.exception(f"Error occurred during file upload: {str(e)}")
  174. print("❌ Error occurred during file upload!")
  175. return False
  176. def _upload_data_to_pipeline(self, data_type, data_value, metadata=None):
  177. payload = {
  178. "data_type": data_type,
  179. "data_value": data_value,
  180. "metadata": metadata,
  181. }
  182. try:
  183. self._send_api_request(f"/api/v1/pipelines/{self.id}/cli/add/", payload)
  184. # print the local file path if user tries to upload a local file
  185. printed_value = metadata.get("file_path") if metadata.get("file_path") else data_value
  186. print(f"✅ Data of type: {data_type}, value: {printed_value} added successfully.")
  187. except Exception as e:
  188. print(f"❌ Error occurred during data upload for type {data_type}!. Error: {str(e)}")
  189. def _send_api_request(self, endpoint, payload):
  190. url = f"{self.client.host}{endpoint}"
  191. headers = {"Authorization": f"Token {self.client.api_key}"}
  192. response = requests.post(url, json=payload, headers=headers)
  193. response.raise_for_status()
  194. return response
  195. def _process_and_upload_data(self, data_hash, data_type, data_value):
  196. if os.path.isabs(data_value):
  197. presigned_url_data = self._get_presigned_url(data_type, data_value)
  198. presigned_url = presigned_url_data["presigned_url"]
  199. s3_key = presigned_url_data["s3_key"]
  200. if self._upload_file_to_presigned_url(presigned_url, file_path=data_value):
  201. metadata = {"file_path": data_value, "s3_key": s3_key}
  202. data_value = presigned_url
  203. else:
  204. self.logger.error(f"File upload failed for hash: {data_hash}")
  205. return False
  206. else:
  207. if data_type == "qna_pair":
  208. data_value = list(ast.literal_eval(data_value))
  209. metadata = {}
  210. try:
  211. self._upload_data_to_pipeline(data_type, data_value, metadata)
  212. self._mark_data_as_uploaded(data_hash)
  213. return True
  214. except Exception:
  215. print(f"❌ Error occurred during data upload for hash {data_hash}!")
  216. return False
  217. def _mark_data_as_uploaded(self, data_hash):
  218. self.cursor.execute(
  219. "UPDATE data_sources SET is_uploaded = 1 WHERE hash = ? AND pipeline_id = ?",
  220. (data_hash, self.local_id),
  221. )
  222. self.connection.commit()
  223. def deploy(self):
  224. if self.client is None:
  225. self._init_client()
  226. pipeline_data = self._create_pipeline()
  227. self.id = pipeline_data["id"]
  228. results = self.cursor.execute(
  229. "SELECT * FROM data_sources WHERE pipeline_id = ? AND is_uploaded = 0", (self.local_id,)
  230. ).fetchall()
  231. if len(results) > 0:
  232. print("🛠️ Adding data to your pipeline...")
  233. for result in results:
  234. data_hash, data_type, data_value = result[1], result[2], result[3]
  235. self._process_and_upload_data(data_hash, data_type, data_value)
  236. @classmethod
  237. def from_config(cls, yaml_path: str, auto_deploy: bool = False):
  238. """
  239. Instantiate a Pipeline object from a YAML configuration file.
  240. :param yaml_path: Path to the YAML configuration file.
  241. :type yaml_path: str
  242. :param auto_deploy: Whether to deploy the pipeline automatically, defaults to False
  243. :type auto_deploy: bool, optional
  244. :return: An instance of the Pipeline class.
  245. :rtype: Pipeline
  246. """
  247. with open(yaml_path, "r") as file:
  248. config_data = yaml.safe_load(file)
  249. pipeline_config_data = config_data.get("pipeline", {}).get("config", {})
  250. db_config_data = config_data.get("vectordb", {})
  251. embedding_model_config_data = config_data.get("embedding_model", {})
  252. llm_config_data = config_data.get("llm", {})
  253. pipeline_config = PipelineConfig(**pipeline_config_data)
  254. db_provider = db_config_data.get("provider", "chroma")
  255. db = VectorDBFactory.create(db_provider, db_config_data.get("config", {}))
  256. if llm_config_data:
  257. llm_provider = llm_config_data.get("provider", "openai")
  258. llm = LlmFactory.create(llm_provider, llm_config_data.get("config", {}))
  259. else:
  260. llm = None
  261. embedding_model_provider = embedding_model_config_data.get("provider", "openai")
  262. embedding_model = EmbedderFactory.create(
  263. embedding_model_provider, embedding_model_config_data.get("config", {})
  264. )
  265. return cls(
  266. config=pipeline_config,
  267. llm=llm,
  268. db=db,
  269. embedding_model=embedding_model,
  270. yaml_path=yaml_path,
  271. auto_deploy=auto_deploy,
  272. )
  273. def start(self, host="0.0.0.0", port=8000):
  274. app = FastAPI()
  275. @app.post("/add")
  276. async def add_document(data_value: str, data_type: str = None):
  277. """
  278. Add a document to the pipeline.
  279. """
  280. try:
  281. document = {"data_value": data_value, "data_type": data_type}
  282. self.add(document)
  283. return {"message": "Document added successfully"}
  284. except Exception as e:
  285. raise HTTPException(status_code=500, detail=str(e))
  286. @app.post("/query")
  287. async def query_documents(query: str, num_documents: int = 3):
  288. """
  289. Query for similar documents in the pipeline.
  290. """
  291. try:
  292. results = self.search(query, num_documents)
  293. return results
  294. except Exception as e:
  295. raise HTTPException(status_code=500, detail=str(e))
  296. import uvicorn
  297. uvicorn.run(app, host=host, port=port)