# utils.py (1.9 KB)
  1. import re
  2. def clean_string(text):
  3. text = text.replace('\n', ' ')
  4. cleaned_text = re.sub(r'\s+', ' ', text.strip())
  5. cleaned_text = cleaned_text.replace('\\', '')
  6. cleaned_text = cleaned_text.replace('#', ' ')
  7. cleaned_text = re.sub(r'([^\w\s])\1*', r'\1', cleaned_text)
  8. return cleaned_text
  9. def markdown_to_plaintext(markdown_string):
  10. # Lines surrounded by empty lines are considered paragraph text
  11. markdown_string = markdown_string.strip().replace("\n\n", "\n")
  12. # Headers
  13. markdown_string = markdown_string.replace("# ", "")
  14. markdown_string = markdown_string.replace("## ", "")
  15. markdown_string = markdown_string.replace("### ", "")
  16. # Bold text
  17. markdown_string = markdown_string.replace("**", "")
  18. markdown_string = markdown_string.replace("__", "")
  19. # Italicized text
  20. markdown_string = markdown_string.replace("*", "")
  21. markdown_string = markdown_string.replace("_", "")
  22. # Ordered lists
  23. markdown_string = markdown_string.replace("1. ", "")
  24. markdown_string = markdown_string.replace("2. ", "")
  25. markdown_string = markdown_string.replace("3. ", "")
  26. # And so on for other numbers
  27. # Unordered lists
  28. markdown_string = markdown_string.replace("- ", "")
  29. markdown_string = markdown_string.replace("* ", "")
  30. markdown_string = markdown_string.replace("+ ", "")
  31. # Links and images
  32. while ("[" in markdown_string and "]" in markdown_string and
  33. "(" in markdown_string and ")" in markdown_string):
  34. start_link = markdown_string.find("[")
  35. end_link = markdown_string.find("]")
  36. start_paren = markdown_string.find("(")
  37. end_paren = markdown_string.find(")")
  38. if start_link < start_paren and end_link < end_paren:
  39. markdown_string = markdown_string[:start_link] + markdown_string[start_paren+1:end_paren] + markdown_string[end_paren+1:]
  40. return markdown_string