utils.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462
  1. import itertools
  2. import json
  3. import logging
  4. import os
  5. import re
  6. import string
  7. from typing import Any
  8. from schema import Optional, Or, Schema
  9. from tqdm import tqdm
  10. from embedchain.models.data_type import DataType
  11. def parse_content(content, type):
  12. implemented = ["html.parser", "lxml", "lxml-xml", "xml", "html5lib"]
  13. if type not in implemented:
  14. raise ValueError(f"Parser type {type} not implemented. Please choose one of {implemented}")
  15. from bs4 import BeautifulSoup
  16. soup = BeautifulSoup(content, type)
  17. original_size = len(str(soup.get_text()))
  18. tags_to_exclude = [
  19. "nav",
  20. "aside",
  21. "form",
  22. "header",
  23. "noscript",
  24. "svg",
  25. "canvas",
  26. "footer",
  27. "script",
  28. "style",
  29. ]
  30. for tag in soup(tags_to_exclude):
  31. tag.decompose()
  32. ids_to_exclude = ["sidebar", "main-navigation", "menu-main-menu"]
  33. for id in ids_to_exclude:
  34. tags = soup.find_all(id=id)
  35. for tag in tags:
  36. tag.decompose()
  37. classes_to_exclude = [
  38. "elementor-location-header",
  39. "navbar-header",
  40. "nav",
  41. "header-sidebar-wrapper",
  42. "blog-sidebar-wrapper",
  43. "related-posts",
  44. ]
  45. for class_name in classes_to_exclude:
  46. tags = soup.find_all(class_=class_name)
  47. for tag in tags:
  48. tag.decompose()
  49. content = soup.get_text()
  50. content = clean_string(content)
  51. cleaned_size = len(content)
  52. if original_size != 0:
  53. logging.info(
  54. f"Cleaned page size: {cleaned_size} characters, down from {original_size} (shrunk: {original_size-cleaned_size} chars, {round((1-(cleaned_size/original_size)) * 100, 2)}%)" # noqa:E501
  55. )
  56. return content
  57. def clean_string(text):
  58. """
  59. This function takes in a string and performs a series of text cleaning operations.
  60. Args:
  61. text (str): The text to be cleaned. This is expected to be a string.
  62. Returns:
  63. cleaned_text (str): The cleaned text after all the cleaning operations
  64. have been performed.
  65. """
  66. # Replacement of newline characters:
  67. text = text.replace("\n", " ")
  68. # Stripping and reducing multiple spaces to single:
  69. cleaned_text = re.sub(r"\s+", " ", text.strip())
  70. # Removing backslashes:
  71. cleaned_text = cleaned_text.replace("\\", "")
  72. # Replacing hash characters:
  73. cleaned_text = cleaned_text.replace("#", " ")
  74. # Eliminating consecutive non-alphanumeric characters:
  75. # This regex identifies consecutive non-alphanumeric characters (i.e., not
  76. # a word character [a-zA-Z0-9_] and not a whitespace) in the string
  77. # and replaces each group of such characters with a single occurrence of
  78. # that character.
  79. # For example, "!!! hello !!!" would become "! hello !".
  80. cleaned_text = re.sub(r"([^\w\s])\1*", r"\1", cleaned_text)
  81. return cleaned_text
  82. def is_readable(s):
  83. """
  84. Heuristic to determine if a string is "readable" (mostly contains printable characters and forms meaningful words)
  85. :param s: string
  86. :return: True if the string is more than 95% printable.
  87. """
  88. try:
  89. printable_ratio = sum(c in string.printable for c in s) / len(s)
  90. except ZeroDivisionError:
  91. logging.warning("Empty string processed as unreadable")
  92. printable_ratio = 0
  93. return printable_ratio > 0.95 # 95% of characters are printable
  94. def use_pysqlite3():
  95. """
  96. Swap std-lib sqlite3 with pysqlite3.
  97. """
  98. import platform
  99. import sqlite3
  100. if platform.system() == "Linux" and sqlite3.sqlite_version_info < (3, 35, 0):
  101. try:
  102. # According to the Chroma team, this patch only works on Linux
  103. import datetime
  104. import subprocess
  105. import sys
  106. subprocess.check_call(
  107. [sys.executable, "-m", "pip", "install", "pysqlite3-binary", "--quiet", "--disable-pip-version-check"]
  108. )
  109. __import__("pysqlite3")
  110. sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
  111. # Let the user know what happened.
  112. current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
  113. print(
  114. f"{current_time} [embedchain] [INFO]",
  115. "Swapped std-lib sqlite3 with pysqlite3 for ChromaDb compatibility.",
  116. f"Your original version was {sqlite3.sqlite_version}.",
  117. )
  118. except Exception as e:
  119. # Escape all exceptions
  120. current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
  121. print(
  122. f"{current_time} [embedchain] [ERROR]",
  123. "Failed to swap std-lib sqlite3 with pysqlite3 for ChromaDb compatibility.",
  124. "Error:",
  125. e,
  126. )
  127. def format_source(source: str, limit: int = 20) -> str:
  128. """
  129. Format a string to only take the first x and last x letters.
  130. This makes it easier to display a URL, keeping familiarity while ensuring a consistent length.
  131. If the string is too short, it is not sliced.
  132. """
  133. if len(source) > 2 * limit:
  134. return source[:limit] + "..." + source[-limit:]
  135. return source
  136. def detect_datatype(source: Any) -> DataType:
  137. """
  138. Automatically detect the datatype of the given source.
  139. :param source: the source to base the detection on
  140. :return: data_type string
  141. """
  142. from urllib.parse import urlparse
  143. import requests
  144. import yaml
  145. def is_openapi_yaml(yaml_content):
  146. # currently the following two fields are required in openapi spec yaml config
  147. return "openapi" in yaml_content and "info" in yaml_content
  148. try:
  149. if not isinstance(source, str):
  150. raise ValueError("Source is not a string and thus cannot be a URL.")
  151. url = urlparse(source)
  152. # Check if both scheme and netloc are present. Local file system URIs are acceptable too.
  153. if not all([url.scheme, url.netloc]) and url.scheme != "file":
  154. raise ValueError("Not a valid URL.")
  155. except ValueError:
  156. url = False
  157. formatted_source = format_source(str(source), 30)
  158. if url:
  159. from langchain.document_loaders.youtube import \
  160. ALLOWED_NETLOCK as YOUTUBE_ALLOWED_NETLOCS
  161. if url.netloc in YOUTUBE_ALLOWED_NETLOCS:
  162. logging.debug(f"Source of `{formatted_source}` detected as `youtube_video`.")
  163. return DataType.YOUTUBE_VIDEO
  164. if url.netloc in {"notion.so", "notion.site"}:
  165. logging.debug(f"Source of `{formatted_source}` detected as `notion`.")
  166. return DataType.NOTION
  167. if url.path.endswith(".pdf"):
  168. logging.debug(f"Source of `{formatted_source}` detected as `pdf_file`.")
  169. return DataType.PDF_FILE
  170. if url.path.endswith(".xml"):
  171. logging.debug(f"Source of `{formatted_source}` detected as `sitemap`.")
  172. return DataType.SITEMAP
  173. if url.path.endswith(".csv"):
  174. logging.debug(f"Source of `{formatted_source}` detected as `csv`.")
  175. return DataType.CSV
  176. if url.path.endswith(".mdx") or url.path.endswith(".md"):
  177. logging.debug(f"Source of `{formatted_source}` detected as `mdx`.")
  178. return DataType.MDX
  179. if url.path.endswith(".docx"):
  180. logging.debug(f"Source of `{formatted_source}` detected as `docx`.")
  181. return DataType.DOCX
  182. if url.path.endswith(".yaml"):
  183. try:
  184. response = requests.get(source)
  185. response.raise_for_status()
  186. try:
  187. yaml_content = yaml.safe_load(response.text)
  188. except yaml.YAMLError as exc:
  189. logging.error(f"Error parsing YAML: {exc}")
  190. raise TypeError(f"Not a valid data type. Error loading YAML: {exc}")
  191. if is_openapi_yaml(yaml_content):
  192. logging.debug(f"Source of `{formatted_source}` detected as `openapi`.")
  193. return DataType.OPENAPI
  194. else:
  195. logging.error(
  196. f"Source of `{formatted_source}` does not contain all the required \
  197. fields of OpenAPI yaml. Check 'https://spec.openapis.org/oas/v3.1.0'"
  198. )
  199. raise TypeError(
  200. "Not a valid data type. Check 'https://spec.openapis.org/oas/v3.1.0', \
  201. make sure you have all the required fields in YAML config data"
  202. )
  203. except requests.exceptions.RequestException as e:
  204. logging.error(f"Error fetching URL {formatted_source}: {e}")
  205. if url.path.endswith(".json"):
  206. logging.debug(f"Source of `{formatted_source}` detected as `json_file`.")
  207. return DataType.JSON
  208. if "docs" in url.netloc or ("docs" in url.path and url.scheme != "file"):
  209. # `docs_site` detection via path is not accepted for local filesystem URIs,
  210. # because that would mean all paths that contain `docs` are now doc sites, which is too aggressive.
  211. logging.debug(f"Source of `{formatted_source}` detected as `docs_site`.")
  212. return DataType.DOCS_SITE
  213. if "github.com" in url.netloc:
  214. logging.debug(f"Source of `{formatted_source}` detected as `github`.")
  215. return DataType.GITHUB
  216. # If none of the above conditions are met, it's a general web page
  217. logging.debug(f"Source of `{formatted_source}` detected as `web_page`.")
  218. return DataType.WEB_PAGE
  219. elif not isinstance(source, str):
  220. # For datatypes where source is not a string.
  221. if isinstance(source, tuple) and len(source) == 2 and isinstance(source[0], str) and isinstance(source[1], str):
  222. logging.debug(f"Source of `{formatted_source}` detected as `qna_pair`.")
  223. return DataType.QNA_PAIR
  224. # Raise an error if it isn't a string and also not a valid non-string type (one of the previous).
  225. # We could stringify it, but it is better to raise an error and let the user decide how they want to do that.
  226. raise TypeError(
  227. "Source is not a string and a valid non-string type could not be detected. If you want to embed it, please stringify it, for instance by using `str(source)` or `(', ').join(source)`." # noqa: E501
  228. )
  229. elif os.path.isfile(source):
  230. # For datatypes that support conventional file references.
  231. # Note: checking for string is not necessary anymore.
  232. if source.endswith(".docx"):
  233. logging.debug(f"Source of `{formatted_source}` detected as `docx`.")
  234. return DataType.DOCX
  235. if source.endswith(".csv"):
  236. logging.debug(f"Source of `{formatted_source}` detected as `csv`.")
  237. return DataType.CSV
  238. if source.endswith(".xml"):
  239. logging.debug(f"Source of `{formatted_source}` detected as `xml`.")
  240. return DataType.XML
  241. if source.endswith(".mdx") or source.endswith(".md"):
  242. logging.debug(f"Source of `{formatted_source}` detected as `mdx`.")
  243. return DataType.MDX
  244. if source.endswith(".txt"):
  245. logging.debug(f"Source of `{formatted_source}` detected as `text`.")
  246. return DataType.TEXT_FILE
  247. if source.endswith(".pdf"):
  248. logging.debug(f"Source of `{formatted_source}` detected as `pdf_file`.")
  249. return DataType.PDF_FILE
  250. if source.endswith(".yaml"):
  251. with open(source, "r") as file:
  252. yaml_content = yaml.safe_load(file)
  253. if is_openapi_yaml(yaml_content):
  254. logging.debug(f"Source of `{formatted_source}` detected as `openapi`.")
  255. return DataType.OPENAPI
  256. else:
  257. logging.error(
  258. f"Source of `{formatted_source}` does not contain all the required \
  259. fields of OpenAPI yaml. Check 'https://spec.openapis.org/oas/v3.1.0'"
  260. )
  261. raise ValueError(
  262. "Invalid YAML data. Check 'https://spec.openapis.org/oas/v3.1.0', \
  263. make sure to add all the required params"
  264. )
  265. if source.endswith(".json"):
  266. logging.debug(f"Source of `{formatted_source}` detected as `json`.")
  267. return DataType.JSON
  268. if os.path.exists(source) and is_readable(open(source).read()):
  269. logging.debug(f"Source of `{formatted_source}` detected as `text_file`.")
  270. return DataType.TEXT_FILE
  271. # If the source is a valid file, that's not detectable as a type, an error is raised.
  272. # It does not fallback to text.
  273. raise ValueError(
  274. "Source points to a valid file, but based on the filename, no `data_type` can be detected. Please be aware, that not all data_types allow conventional file references, some require the use of the `file URI scheme`. Please refer to the embedchain documentation (https://docs.embedchain.ai/advanced/data_types#remote-data-types)." # noqa: E501
  275. )
  276. else:
  277. # Source is not a URL.
  278. # TODO: check if source is gmail query
  279. # check if the source is valid json string
  280. if is_valid_json_string(source):
  281. logging.debug(f"Source of `{formatted_source}` detected as `json`.")
  282. return DataType.JSON
  283. # Use text as final fallback.
  284. logging.debug(f"Source of `{formatted_source}` detected as `text`.")
  285. return DataType.TEXT
  286. # check if the source is valid json string
  287. def is_valid_json_string(source: str):
  288. try:
  289. _ = json.loads(source)
  290. return True
  291. except json.JSONDecodeError:
  292. return False
  293. def validate_config(config_data):
  294. schema = Schema(
  295. {
  296. Optional("app"): {
  297. Optional("config"): {
  298. Optional("id"): str,
  299. Optional("name"): str,
  300. Optional("log_level"): Or("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"),
  301. Optional("collect_metrics"): bool,
  302. Optional("collection_name"): str,
  303. }
  304. },
  305. Optional("llm"): {
  306. Optional("provider"): Or(
  307. "openai",
  308. "azure_openai",
  309. "anthropic",
  310. "huggingface",
  311. "cohere",
  312. "together",
  313. "gpt4all",
  314. "ollama",
  315. "jina",
  316. "llama2",
  317. "vertexai",
  318. "google",
  319. ),
  320. Optional("config"): {
  321. Optional("model"): str,
  322. Optional("number_documents"): int,
  323. Optional("temperature"): float,
  324. Optional("max_tokens"): int,
  325. Optional("top_p"): Or(float, int),
  326. Optional("stream"): bool,
  327. Optional("template"): str,
  328. Optional("prompt"): str,
  329. Optional("system_prompt"): str,
  330. Optional("deployment_name"): str,
  331. Optional("where"): dict,
  332. Optional("query_type"): str,
  333. Optional("api_key"): str,
  334. },
  335. },
  336. Optional("vectordb"): {
  337. Optional("provider"): Or(
  338. "chroma", "elasticsearch", "opensearch", "pinecone", "qdrant", "weaviate", "zilliz"
  339. ),
  340. Optional("config"): object, # TODO: add particular config schema for each provider
  341. },
  342. Optional("embedder"): {
  343. Optional("provider"): Or("openai", "gpt4all", "huggingface", "vertexai", "azure_openai", "google"),
  344. Optional("config"): {
  345. Optional("model"): Optional(str),
  346. Optional("deployment_name"): Optional(str),
  347. Optional("api_key"): str,
  348. Optional("title"): str,
  349. Optional("task_type"): str,
  350. },
  351. },
  352. Optional("embedding_model"): {
  353. Optional("provider"): Or("openai", "gpt4all", "huggingface", "vertexai", "azure_openai", "google"),
  354. Optional("config"): {
  355. Optional("model"): str,
  356. Optional("deployment_name"): str,
  357. Optional("api_key"): str,
  358. Optional("title"): str,
  359. Optional("task_type"): str,
  360. },
  361. },
  362. Optional("chunker"): {
  363. Optional("chunk_size"): int,
  364. Optional("chunk_overlap"): int,
  365. Optional("length_function"): str,
  366. Optional("min_chunk_size"): int,
  367. },
  368. Optional("cache"): {
  369. Optional("similarity_threshold"): float,
  370. },
  371. }
  372. )
  373. return schema.validate(config_data)
  374. def chunks(iterable, batch_size=100, desc="Processing chunks"):
  375. """A helper function to break an iterable into chunks of size batch_size."""
  376. it = iter(iterable)
  377. total_size = len(iterable)
  378. with tqdm(total=total_size, desc=desc, unit="batch") as pbar:
  379. chunk = tuple(itertools.islice(it, batch_size))
  380. while chunk:
  381. yield chunk
  382. pbar.update(len(chunk))
  383. chunk = tuple(itertools.islice(it, batch_size))