1. import re
  2. import logging
  3. logging.basicConfig(level=logging.INFO)
  4. def clean_form_data(data, dry_run=False):
  5. """
  6. Cleans data from web forms for hypothesis validation.
  7. Args:
  8. data (dict): A dictionary containing the form data.
  9. dry_run (bool): If True, only print changes without modifying the data.
  10. Returns:
  11. dict: The cleaned data.
  12. """
  13. cleaned_data = {}
  14. for key, value in data.items():
  15. if isinstance(value, str):
  16. # Remove leading/trailing whitespace
  17. value = value.strip()
  18. # Convert to lowercase
  19. value = value.lower()
  20. # Remove special characters (keep alphanumeric and spaces)
  21. value = re.sub(r'[^a-zA-Z0-9\s]', '', value)
  22. cleaned_data[key] = value
  23. elif isinstance(value, (int, float)):
  24. cleaned_data[key] = value # Keep numeric values as they are
  25. elif value is None:
  26. cleaned_data[key] = None # keep null values
  27. else:
  28. cleaned_data[key] = str(value) # Convert other types to strings
  29. logging.warning(f"Type conversion detected for key: {key}")
  30. if dry_run:
  31. print("Dry run mode: Changes will not be applied.")
  32. print("Original Data:", data)
  33. print("Cleaned Data:", cleaned_data)
  34. return data
  35. else:
  36. return cleaned_data
  37. if __name__ == '__main__':
  38. # Example usage
  39. form_data = {
  40. "name": " John Doe! ",
  41. "email": "john.doe@example.com",
  42. "age": "30",
  43. "city": None,
  44. "phone": "123-456-7890",
  45. "other_field": 123.45
  46. }
  47. cleaned_data = clean_form_data(form_data, dry_run=True)
  48. print("Cleaned Data:", cleaned_data)
  49. cleaned_data = clean_form_data(form_data)
  50. print("Cleaned Data:", cleaned_data)

Add your comment