1. import re
  2. import os
  3. def cleanup_text_file(filepath):
  4. """
  5. Cleans up a text file by removing various artifacts.
  6. """
  7. try:
  8. with open(filepath, 'r', encoding='utf-8') as f:
  9. text = f.read()
  10. # Remove comments (e.g., # ... , // ...)
  11. text = re.sub(r'#.*', '', text)
  12. text = re.sub(r'//.*', '', text)
  13. # Remove empty lines
  14. text = text.split('\n')
  15. text = [line for line in text if line.strip()]
  16. text = '\n'.join(text)
  17. # Remove extra whitespace (multiple spaces, tabs, newlines)
  18. text = re.sub(r'\s+', ' ', text)
  19. # Remove trailing whitespace
  20. text = text.rstrip()
  21. # Remove specific unwanted characters/patterns (customize as needed)
  22. text = re.sub(r'\[.*?\]', '', text) # Remove bracketed text
  23. text = re.sub(r'\(.*?\)', '', text) # Remove parenthesized text
  24. with open(filepath, 'w', encoding='utf-8') as f:
  25. f.write(text)
  26. print(f"Cleaned: {filepath}")
  27. except FileNotFoundError:
  28. print(f"Error: File not found: {filepath}")
  29. except Exception as e:
  30. print(f"Error cleaning {filepath}: {e}")
  31. def cleanup_directory(directory):
  32. """
  33. Cleans up all text files in a directory.
  34. """
  35. for filename in os.listdir(directory):
  36. if filename.endswith(".txt"):
  37. filepath = os.path.join(directory, filename)
  38. cleanup_text_file(filepath)
  39. if __name__ == '__main__':
  40. # Example usage:
  41. # cleanup_directory("/path/to/your/text/files") # Clean all .txt files in the specified directory
  42. # cleanup_text_file("my_text_file.txt") # Clean a single file
  43. pass

Add your comment