utils.py 1.1 KB

123456789101112131415161718192021222324252627282930313233
  1. import re
  2. def clean_string(text):
  3. """
  4. This function takes in a string and performs a series of text cleaning operations.
  5. Args:
  6. text (str): The text to be cleaned. This is expected to be a string.
  7. Returns:
  8. cleaned_text (str): The cleaned text after all the cleaning operations have been performed.
  9. """
  10. # Replacement of newline characters:
  11. text = text.replace('\n', ' ')
  12. # Stripping and reducing multiple spaces to single:
  13. cleaned_text = re.sub(r'\s+', ' ', text.strip())
  14. # Removing backslashes:
  15. cleaned_text = cleaned_text.replace('\\', '')
  16. # Replacing hash characters:
  17. cleaned_text = cleaned_text.replace('#', ' ')
  18. # Eliminating consecutive non-alphanumeric characters:
  19. # This regex identifies consecutive non-alphanumeric characters (i.e., not a word character [a-zA-Z0-9_] and not a whitespace) in the string
  20. # and replaces each group of such characters with a single occurrence of that character.
  21. # For example, "!!! hello !!!" would become "! hello !".
  22. cleaned_text = re.sub(r'([^\w\s])\1*', r'\1', cleaned_text)
  23. return cleaned_text