utils.py 2.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. import re
  2. import string
  3. def clean_string(text):
  4. """
  5. This function takes in a string and performs a series of text cleaning operations.
  6. Args:
  7. text (str): The text to be cleaned. This is expected to be a string.
  8. Returns:
  9. cleaned_text (str): The cleaned text after all the cleaning operations
  10. have been performed.
  11. """
  12. # Replacement of newline characters:
  13. text = text.replace("\n", " ")
  14. # Stripping and reducing multiple spaces to single:
  15. cleaned_text = re.sub(r"\s+", " ", text.strip())
  16. # Removing backslashes:
  17. cleaned_text = cleaned_text.replace("\\", "")
  18. # Replacing hash characters:
  19. cleaned_text = cleaned_text.replace("#", " ")
  20. # Eliminating consecutive non-alphanumeric characters:
  21. # This regex identifies consecutive non-alphanumeric characters (i.e., not
  22. # a word character [a-zA-Z0-9_] and not a whitespace) in the string
  23. # and replaces each group of such characters with a single occurrence of
  24. # that character.
  25. # For example, "!!! hello !!!" would become "! hello !".
  26. cleaned_text = re.sub(r"([^\w\s])\1*", r"\1", cleaned_text)
  27. return cleaned_text
  28. def is_readable(s):
  29. """
  30. Heuristic to determine if a string is "readable" (mostly contains printable characters and forms meaningful words)
  31. :param s: string
  32. :return: True if the string is more than 95% printable.
  33. """
  34. printable_ratio = sum(c in string.printable for c in s) / len(s)
  35. return printable_ratio > 0.95 # 95% of characters are printable
  36. def use_pysqlite3():
  37. """
  38. Swap std-lib sqlite3 with pysqlite3.
  39. """
  40. import platform
  41. import sqlite3
  42. if platform.system() == "Linux" and sqlite3.sqlite_version_info < (3, 35, 0):
  43. # According to the Chroma team, this patch only works on Linux
  44. import datetime
  45. import subprocess
  46. import sys
  47. subprocess.check_call(
  48. [sys.executable, "-m", "pip", "install", "pysqlite3-binary", "--quiet", "--disable-pip-version-check"]
  49. )
  50. __import__("pysqlite3")
  51. sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
  52. # Let the user know what happened.
  53. current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
  54. print(
  55. f"{current_time} [embedchain] [INFO]",
  56. "Swapped std-lib sqlite3 with pysqlite3 for ChromaDb compatibility.",
  57. f"Your original version was {sqlite3.sqlite_version}.",
  58. )