import re
import os
def _clean_text(text):
    """Return *text* with comments, bracketed spans and extra whitespace removed.

    The result is a single line: every run of whitespace (newlines
    included) becomes one space, and both ends are trimmed.
    """
    # Drop everything from a '#' or '//' marker to the end of its line.
    # The lookbehind keeps URL schemes such as "http://" intact instead of
    # cutting the line off at the scheme separator.
    text = re.sub(r'#.*', '', text)
    text = re.sub(r'(?<!:)//.*', '', text)
    # Collapse whitespace first so a bracketed span that was split across
    # lines is still matched by the single-line patterns below. This also
    # swallows blank lines, so no separate empty-line pass is needed.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)  # bracketed text
    text = re.sub(r'\(.*?\)', '', text)  # parenthesized text
    # Removing a span leaves behind the spaces that surrounded it, so
    # normalize once more and trim only after all removals are done.
    return re.sub(r'\s+', ' ', text).strip()


def cleanup_text_file(filepath):
    """Clean a UTF-8 text file in place.

    Strips ``#`` and ``//`` line comments and ``[...]`` / ``(...)`` spans,
    then collapses all whitespace so the file ends up as one trimmed line.
    Failures are reported on stdout rather than raised.

    Args:
        filepath: Path of the file to read and overwrite.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        # Clean before reopening for writing so a failure while reading
        # never truncates the file.
        text = _clean_text(text)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"Cleaned: {filepath}")
    except FileNotFoundError:
        print(f"Error: File not found: {filepath}")
    except Exception as e:
        # Best-effort: report and return so a caller looping over many
        # files is not aborted by a single unreadable one.
        print(f"Error cleaning {filepath}: {e}")
def cleanup_directory(directory):
    """Clean every regular ``.txt`` file directly inside *directory*.

    Subdirectories are not descended into. Entries that carry a ``.txt``
    name but are not files (for example a directory called ``notes.txt``)
    are skipped rather than handed to the file cleaner, which could only
    report an error for them.

    Args:
        directory: Path of the directory to scan.
    """
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            cleanup_text_file(filepath)
if __name__ == '__main__':
    # Running this module as a script does nothing by default. To use it,
    # replace the `pass` below with one of these calls:
    #   cleanup_directory("/path/to/your/text/files")  -> every .txt file in that directory
    #   cleanup_text_file("my_text_file.txt")          -> a single file
    pass
# Add your comment