12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
- import logging
- import re
- import string
def clean_string(text):
    """
    Normalize a piece of text through a fixed sequence of cleaning steps.

    The steps run in this order:
      1. newlines become spaces, then leading/trailing whitespace is stripped;
      2. every run of whitespace is collapsed to one space;
      3. backslashes are deleted;
      4. each "#" is turned into a space;
      5. runs of the *same* punctuation character are reduced to one.

    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The cleaned text.
    """
    # Steps 1-2: flatten line breaks, trim the ends, squeeze whitespace runs.
    flattened = text.replace("\n", " ").strip()
    squeezed = re.sub(r"\s+", " ", flattened)

    # Steps 3-4: drop backslashes and blank out hash characters.
    # Note: this happens after whitespace squeezing, so a "#" surrounded by
    # spaces leaves several consecutive spaces in the result.
    unmarked = squeezed.replace("\\", "").replace("#", " ")

    # Step 5: the pattern captures one character that is neither a word
    # character nor whitespace, followed by any repeats of that exact
    # character, and keeps a single copy.
    # "!!! hello !!!" -> "! hello !", while "!?" (different characters)
    # is left untouched.
    return re.sub(r"([^\w\s])\1*", r"\1", unmarked)
def is_readable(s):
    """
    Heuristic check for whether a string is "readable", judged by the share
    of its characters that appear in ``string.printable``.

    An empty input logs a warning and is treated as unreadable.

    :param s: string
    :return: True if strictly more than 95% of the characters are printable.
    """
    try:
        printable_count = 0
        for ch in s:
            if ch in string.printable:
                printable_count += 1
        ratio = printable_count / len(s)
    except ZeroDivisionError:
        # len(s) == 0: there is nothing to measure.
        logging.warning("Empty string processed as unreadable")
        return False
    # The threshold is exclusive: exactly 95% printable is not enough.
    return ratio > 0.95
def _log_timestamp():
    """Return the current local time as ``YYYY-MM-DD HH:MM:SS,mmm``."""
    import datetime

    # "%f" renders six microsecond digits; dropping the last three leaves
    # millisecond precision.
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]


def use_pysqlite3():
    """
    Swap std-lib sqlite3 with pysqlite3.

    Does nothing unless the platform is Linux and the std-lib SQLite library
    is older than 3.35.0. Otherwise ``pysqlite3`` is imported (installing the
    ``pysqlite3-binary`` wheel with pip only if the import fails) and
    registered in ``sys.modules`` under the name ``sqlite3``, so subsequent
    ``import sqlite3`` statements receive it.

    This is best-effort: any failure is printed and swallowed, never raised.

    Returns:
        None
    """
    import platform
    import sqlite3

    # According to the Chroma team, this patch only works on Linux
    if platform.system() != "Linux" or sqlite3.sqlite_version_info >= (3, 35, 0):
        return

    # Imported outside the try block so the except handler never depends on
    # a name that is bound inside the guarded region.
    import importlib
    import subprocess
    import sys

    try:
        try:
            # Prefer an already-installed pysqlite3: avoids spawning pip on
            # every call.
            importlib.import_module("pysqlite3")
        except ImportError:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "pysqlite3-binary", "--quiet", "--disable-pip-version-check"]
            )
            # The package appeared after the interpreter started; make sure
            # the import system's finders notice it.
            importlib.invalidate_caches()
            importlib.import_module("pysqlite3")

        sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")

        # Let the user know what happened. The local name `sqlite3` still
        # refers to the std-lib module, so this reports the original version.
        print(
            f"{_log_timestamp()} [embedchain] [INFO]",
            "Swapped std-lib sqlite3 with pysqlite3 for ChromaDb compatibility.",
            f"Your original version was {sqlite3.sqlite_version}.",
        )
    except Exception as e:
        # Escape all exceptions: a failed swap must not break the caller.
        print(
            f"{_log_timestamp()} [embedchain] [ERROR]",
            "Failed to swap std-lib sqlite3 with pysqlite3 for ChromaDb compatibility.",
            "Error:",
            e,
        )
|