1234567891011121314151617181920212223242526272829303132333435 |
- import re
- def clean_string(text):
- """
- This function takes in a string and performs a series of text cleaning operations.
- Args:
- text (str): The text to be cleaned. This is expected to be a string.
- Returns:
- cleaned_text (str): The cleaned text after all the cleaning operations
- have been performed.
- """
- # Replacement of newline characters:
- text = text.replace("\n", " ")
- # Stripping and reducing multiple spaces to single:
- cleaned_text = re.sub(r"\s+", " ", text.strip())
- # Removing backslashes:
- cleaned_text = cleaned_text.replace("\\", "")
- # Replacing hash characters:
- cleaned_text = cleaned_text.replace("#", " ")
- # Eliminating consecutive non-alphanumeric characters:
- # This regex identifies consecutive non-alphanumeric characters (i.e., not
- # a word character [a-zA-Z0-9_] and not a whitespace) in the string
- # and replaces each group of such characters with a single occurrence of
- # that character.
- # For example, "!!! hello !!!" would become "! hello !".
- cleaned_text = re.sub(r"([^\w\s])\1*", r"\1", cleaned_text)
- return cleaned_text
|