utils.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  1. import logging
  2. import re
  3. import string
  4. def clean_string(text):
  5. """
  6. This function takes in a string and performs a series of text cleaning operations.
  7. Args:
  8. text (str): The text to be cleaned. This is expected to be a string.
  9. Returns:
  10. cleaned_text (str): The cleaned text after all the cleaning operations
  11. have been performed.
  12. """
  13. # Replacement of newline characters:
  14. text = text.replace("\n", " ")
  15. # Stripping and reducing multiple spaces to single:
  16. cleaned_text = re.sub(r"\s+", " ", text.strip())
  17. # Removing backslashes:
  18. cleaned_text = cleaned_text.replace("\\", "")
  19. # Replacing hash characters:
  20. cleaned_text = cleaned_text.replace("#", " ")
  21. # Eliminating consecutive non-alphanumeric characters:
  22. # This regex identifies consecutive non-alphanumeric characters (i.e., not
  23. # a word character [a-zA-Z0-9_] and not a whitespace) in the string
  24. # and replaces each group of such characters with a single occurrence of
  25. # that character.
  26. # For example, "!!! hello !!!" would become "! hello !".
  27. cleaned_text = re.sub(r"([^\w\s])\1*", r"\1", cleaned_text)
  28. return cleaned_text
  29. def is_readable(s):
  30. """
  31. Heuristic to determine if a string is "readable" (mostly contains printable characters and forms meaningful words)
  32. :param s: string
  33. :return: True if the string is more than 95% printable.
  34. """
  35. printable_ratio = sum(c in string.printable for c in s) / len(s)
  36. return printable_ratio > 0.95 # 95% of characters are printable
  37. def use_pysqlite3():
  38. """
  39. Swap std-lib sqlite3 with pysqlite3.
  40. """
  41. import platform
  42. if platform.system() == "Linux":
  43. # According to the Chroma team, this patch only works on Linux
  44. import subprocess
  45. import sys
  46. subprocess.check_call([sys.executable, "-m", "pip", "install", "pysqlite3-binary"])
  47. __import__("pysqlite3")
  48. sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
  49. # Don't be surprised if this doesn't log as you expect, because the logger is instantiated after the import
  50. logging.info("Swapped std-lib sqlite3 with pysqlite3")