1. import pandas as pd
  2. import time
  3. import logging
  4. logging.basicConfig(level=logging.INFO)
  5. def diff_datasets(dataset1_path, dataset2_path, timeout_seconds=60):
  6. """
  7. Compares two datasets (Pandas DataFrames) and returns the differences.
  8. Args:
  9. dataset1_path (str): Path to the first dataset (e.g., CSV file).
  10. dataset2_path (str): Path to the second dataset.
  11. timeout_seconds (int): Timeout in seconds for the comparison.
  12. Returns:
  13. pandas.DataFrame or None: DataFrame containing the differences between the datasets,
  14. or None if an error occurs or the timeout is reached.
  15. """
  16. try:
  17. # Load datasets
  18. df1 = pd.read_csv(dataset1_path)
  19. df2 = pd.read_csv(dataset2_path)
  20. # Validate that the columns match
  21. if not df1.columns.equals(df2.columns):
  22. logging.error("Columns do not match between datasets.")
  23. return None
  24. # Start timer
  25. start_time = time.time()
  26. # Calculate differences
  27. diff = df1.compare(df2, align_axis=0)
  28. # Check for timeout
  29. if time.time() - start_time > timeout_seconds:
  30. logging.warning(f"Timeout reached ({timeout_seconds} seconds).")
  31. return None
  32. return diff
  33. except Exception as e:
  34. logging.error(f"An error occurred: {e}")
  35. return None
  36. if __name__ == '__main__':
  37. # Example usage:
  38. dataset1 = "dataset1.csv"
  39. dataset2 = "dataset2.csv"
  40. # Create dummy datasets for testing
  41. data1 = {'col1': [1, 2, 3, 4], 'col2': ['a', 'b', 'c', 'd']}
  42. data2 = {'col1': [1, 2, 5, 4], 'col2': ['a', 'b', 'c', 'e']}
  43. df1 = pd.DataFrame(data1)
  44. df2 = pd.DataFrame(data2)
  45. df1.to_csv(dataset1, index=False)
  46. df2.to_csv(dataset2, index=False)
  47. diff_df = diff_datasets(dataset1, dataset2, timeout_seconds=5)
  48. if diff_df is not None:
  49. print(diff_df)
  50. else:
  51. print("No differences found or an error occurred.")

Add your comment