Jelajahi Sumber

docs: add clean_string() doc string (#43)

cachho 2 tahun lalu
induk
melakukan
3720ded80f
1 mengubah file dengan 23 tambahan dan 0 penghapusan
  1. 23 0
      embedchain/utils.py

+ 23 - 0
embedchain/utils.py

@@ -2,9 +2,32 @@ import re
 
 
 def clean_string(text):
+    """
+    This function takes in a string and performs a series of text cleaning operations. 
+
+    Args:
+        text (str): The text to be cleaned. This is expected to be a string.
+
+    Returns:
+        cleaned_text (str): The cleaned text after all the cleaning operations have been performed.
+    """
+    # Replacement of newline characters:
     text = text.replace('\n', ' ')
+    
+    # Stripping and reducing multiple spaces to single:
     cleaned_text = re.sub(r'\s+', ' ', text.strip())
+    
+    # Removing backslashes:
     cleaned_text = cleaned_text.replace('\\', '')
+    
+    # Replacing hash characters:
     cleaned_text = cleaned_text.replace('#', ' ')
+    
+    # Eliminating consecutive non-alphanumeric characters:
+    # This regex identifies consecutive non-alphanumeric characters (i.e., not a word character [a-zA-Z0-9_] and not a whitespace) in the string 
+    # and replaces each group of such characters with a single occurrence of that character. 
+    # For example, "!!! hello !!!" would become "! hello !".
     cleaned_text = re.sub(r'([^\w\s])\1*', r'\1', cleaned_text)
+    
     return cleaned_text
+