import logging
import re
import string


def clean_string(text):
    """
    Normalize a piece of text with a fixed series of cleaning operations.

    The steps are applied in this order:

    1. Backslashes are removed.
    2. Hash characters ("#") are replaced with a space.
    3. Each run of one repeated non-alphanumeric, non-whitespace character is
       reduced to a single occurrence, e.g. "!!! hello !!!" -> "! hello !".
    4. Every run of whitespace (newlines included) is collapsed to a single
       space and leading/trailing whitespace is stripped.

    Whitespace is normalized last on purpose: steps 1 and 2 can create new
    runs of spaces (e.g. "a # b"), which would otherwise survive in the
    result.

    Args:
        text (str): The text to be cleaned. This is expected to be a string.

    Returns:
        cleaned_text (str): The cleaned text after all the cleaning operations
        have been performed.
    """
    # Removing backslashes:
    cleaned_text = text.replace("\\", "")

    # Replacing hash characters:
    cleaned_text = cleaned_text.replace("#", " ")

    # Eliminating consecutive non-alphanumeric characters:
    # The pattern matches a character that is neither a word character
    # [a-zA-Z0-9_] nor whitespace, followed by any repetitions of that same
    # character, and keeps just one of them.
    cleaned_text = re.sub(r"([^\w\s])\1*", r"\1", cleaned_text)

    # Collapsing whitespace runs (this also covers newlines) and trimming the
    # ends. Done after the replacements above so the spaces they introduce
    # are normalized as well.
    return re.sub(r"\s+", " ", cleaned_text).strip()


def is_readable(s):
    """
    Heuristic to determine if a string is "readable", i.e. consists almost
    entirely of characters found in ``string.printable``.

    :param s: string
    :return: True if more than 95% of the characters are printable. An empty
        string is reported as unreadable (False) and a warning is logged.
    """
    if not s:
        # There is no ratio to compute for empty input; treat it as unreadable.
        logging.warning("Empty string processed as unreadable")
        return False

    printable_count = sum(c in string.printable for c in s)
    return printable_count / len(s) > 0.95  # 95% of characters are printable


def _log_timestamp():
    """Return the current local time as "YYYY-MM-DD HH:MM:SS,mmm"."""
    import datetime

    # %f yields microseconds (6 digits); dropping the last 3 leaves milliseconds.
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]


def use_pysqlite3():
    """
    Swap std-lib sqlite3 with pysqlite3.

    Nothing happens unless the platform is Linux and the SQLite library behind
    the std-lib ``sqlite3`` module is older than 3.35.0. In that case
    ``pysqlite3`` is imported (installing ``pysqlite3-binary`` with pip first
    if it is not importable yet) and registered in ``sys.modules`` under the
    name ``sqlite3``, so later ``import sqlite3`` statements receive it.

    This is best effort: any failure is reported on stdout and swallowed, and
    the std-lib module stays in place. The outcome is printed in either case.
    """
    import platform
    import sqlite3

    # According to the Chroma team, this patch only works on Linux
    if platform.system() != "Linux" or sqlite3.sqlite_version_info >= (3, 35, 0):
        return

    import importlib
    import subprocess
    import sys

    try:
        try:
            # Reuse an existing installation so that pip is not spawned on
            # every call once the package is available.
            __import__("pysqlite3")
        except ImportError:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "pysqlite3-binary", "--quiet", "--disable-pip-version-check"]
            )
            # The package appeared after the interpreter started; refresh the
            # import system's caches before retrying the import.
            importlib.invalidate_caches()
            __import__("pysqlite3")

        sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
    except Exception as e:
        # Escape all exceptions
        print(
            f"{_log_timestamp()} [embedchain] [ERROR]",
            "Failed to swap std-lib sqlite3 with pysqlite3 for ChromaDb compatibility.",
            "Error:",
            e,
        )
    else:
        # Let the user know what happened. The local name `sqlite3` still
        # refers to the std-lib module, so this reports the original version.
        print(
            f"{_log_timestamp()} [embedchain] [INFO]",
            "Swapped std-lib sqlite3 with pysqlite3 for ChromaDb compatibility.",
            f"Your original version was {sqlite3.sqlite_version}.",
        )