utils.py 1.1 KB

1234567891011121314151617181920212223242526272829303132333435
  1. import re
  2. def clean_string(text):
  3. """
  4. This function takes in a string and performs a series of text cleaning operations.
  5. Args:
  6. text (str): The text to be cleaned. This is expected to be a string.
  7. Returns:
  8. cleaned_text (str): The cleaned text after all the cleaning operations
  9. have been performed.
  10. """
  11. # Replacement of newline characters:
  12. text = text.replace("\n", " ")
  13. # Stripping and reducing multiple spaces to single:
  14. cleaned_text = re.sub(r"\s+", " ", text.strip())
  15. # Removing backslashes:
  16. cleaned_text = cleaned_text.replace("\\", "")
  17. # Replacing hash characters:
  18. cleaned_text = cleaned_text.replace("#", " ")
  19. # Eliminating consecutive non-alphanumeric characters:
  20. # This regex identifies consecutive non-alphanumeric characters (i.e., not
  21. # a word character [a-zA-Z0-9_] and not a whitespace) in the string
  22. # and replaces each group of such characters with a single occurrence of
  23. # that character.
  24. # For example, "!!! hello !!!" would become "! hello !".
  25. cleaned_text = re.sub(r"([^\w\s])\1*", r"\1", cleaned_text)
  26. return cleaned_text