|
@@ -2,9 +2,32 @@ import re
|
|
|
|
|
|
|
|
|
|
def clean_string(text):
|
|
def clean_string(text):
|
|
|
|
+ """
|
|
|
|
+ This function takes in a string and performs a series of text cleaning operations.
|
|
|
|
+
|
|
|
|
+ Args:
|
|
|
|
+ text (str): The text to be cleaned. This is expected to be a string.
|
|
|
|
+
|
|
|
|
+ Returns:
|
|
|
|
+ cleaned_text (str): The cleaned text after all the cleaning operations have been performed.
|
|
|
|
+ """
|
|
|
|
+ # Replacement of newline characters:
|
|
text = text.replace('\n', ' ')
|
|
text = text.replace('\n', ' ')
|
|
|
|
+
|
|
|
|
+ # Stripping and reducing multiple spaces to single:
|
|
cleaned_text = re.sub(r'\s+', ' ', text.strip())
|
|
cleaned_text = re.sub(r'\s+', ' ', text.strip())
|
|
|
|
+
|
|
|
|
+ # Removing backslashes:
|
|
cleaned_text = cleaned_text.replace('\\', '')
|
|
cleaned_text = cleaned_text.replace('\\', '')
|
|
|
|
+
|
|
|
|
+ # Replacing hash characters:
|
|
cleaned_text = cleaned_text.replace('#', ' ')
|
|
cleaned_text = cleaned_text.replace('#', ' ')
|
|
|
|
+
|
|
|
|
+ # Eliminating consecutive non-alphanumeric characters:
|
|
|
|
+ # This regex identifies consecutive non-alphanumeric characters (i.e., not a word character [a-zA-Z0-9_] and not a whitespace) in the string
|
|
|
|
+ # and replaces each group of such characters with a single occurrence of that character.
|
|
|
|
+ # For example, "!!! hello !!!" would become "! hello !".
|
|
cleaned_text = re.sub(r'([^\w\s])\1*', r'\1', cleaned_text)
|
|
cleaned_text = re.sub(r'([^\w\s])\1*', r'\1', cleaned_text)
|
|
|
|
+
|
|
return cleaned_text
|
|
return cleaned_text
|
|
|
|
+
|