1. import pandas as pd
  2. import logging
  3. # Configure logging
  4. logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  5. def deduplicate_records(df, columns):
  6. """
  7. Deduplicates records in a Pandas DataFrame based on specified columns.
  8. Args:
  9. df (pd.DataFrame): The input DataFrame.
  10. columns (list): A list of column names to consider for deduplication.
  11. Returns:
  12. pd.DataFrame: The DataFrame with duplicate rows removed.
  13. """
  14. try:
  15. # Check if columns exist
  16. for col in columns:
  17. if col not in df.columns:
  18. raise ValueError(f"Column '{col}' not found in DataFrame.")
  19. # Drop duplicate rows based on the specified columns
  20. df_deduplicated = df.drop_duplicates(subset=columns, keep='first') # Keep the first occurrence
  21. logging.info(f"Deduplication completed successfully based on columns: {columns}")
  22. logging.info(f"Original DataFrame shape: {df.shape}")
  23. logging.info(f"Deduplicated DataFrame shape: {df_deduplicated.shape}")
  24. return df_deduplicated
  25. except ValueError as e:
  26. logging.error(f"Error during deduplication: {e}")
  27. return None
  28. except Exception as e:
  29. logging.error(f"An unexpected error occurred: {e}")
  30. return None
  31. if __name__ == '__main__':
  32. # Example Usage
  33. data = {'col1': [1, 2, 2, 3, 4, 4, 4],
  34. 'col2': ['A', 'B', 'B', 'C', 'D', 'D', 'D'],
  35. 'col3': [10, 20, 20, 30, 40, 40, 40]}
  36. df = pd.DataFrame(data)
  37. columns_to_deduplicate = ['col1', 'col2']
  38. deduplicated_df = deduplicate_records(df.copy(), columns_to_deduplicate) #Using .copy() to avoid modifying original
  39. if deduplicated_df is not None:
  40. print(deduplicated_df)

Add your comment