import pandas as pd
import time
import logging
logging.basicConfig(level=logging.INFO)
def diff_datasets(
    dataset1_path: str,
    dataset2_path: str,
    timeout_seconds: float = 60,
) -> "pd.DataFrame | None":
    """Compare two CSV datasets cell by cell and return their differences.

    Both files are loaded with ``pd.read_csv`` and compared with
    ``DataFrame.compare(align_axis=0)``. In the result every differing row
    appears twice, labelled ``self`` (first dataset) and ``other`` (second
    dataset) in the second index level.

    Args:
        dataset1_path: Path to the first dataset (CSV file).
        dataset2_path: Path to the second dataset (CSV file).
        timeout_seconds: Maximum time, in seconds, the comparison step may
            take. The check runs after the comparison has finished: it
            cannot interrupt a comparison in progress, it only discards a
            result that took too long to compute.

    Returns:
        A DataFrame containing the differences (empty when the datasets are
        identical), or None when the columns or row counts do not match, the
        timeout was exceeded, or any error occurred. Every None outcome is
        logged; this function does not raise.
    """
    # A single broad handler is kept on purpose: callers rely on getting
    # None (plus a log entry) instead of an exception for any failure,
    # including unreadable or missing files.
    try:
        first = pd.read_csv(dataset1_path)
        second = pd.read_csv(dataset2_path)

        # Column labels must match exactly, including their order.
        if not first.columns.equals(second.columns):
            logging.error("Columns do not match between datasets.")
            return None

        # DataFrame.compare only accepts identically-labelled frames; check
        # the row count up front so the log names the actual problem instead
        # of surfacing pandas' generic ValueError.
        if len(first) != len(second):
            logging.error(
                "Row counts do not match between datasets (%d vs %d).",
                len(first),
                len(second),
            )
            return None

        # Monotonic clock: elapsed time must not be affected by system
        # clock adjustments while the comparison is running.
        started = time.monotonic()
        diff = first.compare(second, align_axis=0)
        elapsed = time.monotonic() - started

        if elapsed > timeout_seconds:
            logging.warning("Timeout reached (%s seconds).", timeout_seconds)
            return None
        return diff
    except Exception as e:
        logging.error("An error occurred: %s", e)
        return None
if __name__ == '__main__':
    # Example usage: write two small CSV files that differ in two cells,
    # then diff them.
    dataset1 = "dataset1.csv"
    dataset2 = "dataset2.csv"

    # Create dummy datasets for testing (written to the current directory).
    data1 = {'col1': [1, 2, 3, 4], 'col2': ['a', 'b', 'c', 'd']}
    data2 = {'col1': [1, 2, 5, 4], 'col2': ['a', 'b', 'c', 'e']}
    df1 = pd.DataFrame(data1)
    df2 = pd.DataFrame(data2)
    df1.to_csv(dataset1, index=False)
    df2.to_csv(dataset2, index=False)

    diff_df = diff_datasets(dataset1, dataset2, timeout_seconds=5)

    # None signals a failure (error or timeout), never "no differences":
    # identical datasets come back as an empty DataFrame, so the two cases
    # are reported separately.
    if diff_df is None:
        print("An error occurred or the timeout was reached.")
    elif diff_df.empty:
        print("No differences found.")
    else:
        print(diff_df)
Add your comment