import pandas as pd
import logging
# Root-logger setup: emit INFO and above, each record prefixed with a
# timestamp and its level name. basicConfig() only takes effect when the
# root logger has no handlers yet.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def deduplicate_records(df, columns):
    """
    Remove duplicate rows from a DataFrame, comparing only the given columns.

    The first row of each group of duplicates is kept and the original index
    labels are preserved. The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): The input DataFrame.
        columns (list | tuple | str): Column name(s) that define a duplicate.
            A single name may be passed as a plain string.

    Returns:
        pd.DataFrame | None: A new DataFrame with duplicates removed, or
        None if a requested column is missing or any other error occurs
        (the error is logged rather than raised).
    """
    try:
        # A bare string would otherwise be iterated character by character,
        # and a tuple can be interpreted by pandas as a single column label;
        # normalise both to a list of labels.
        subset = [columns] if isinstance(columns, str) else list(columns)

        # Validate up front so a missing column yields a clear message.
        for col in subset:
            if col not in df.columns:
                raise ValueError(f"Column '{col}' not found in DataFrame.")

        df_deduplicated = df.drop_duplicates(subset=subset, keep='first')
    except ValueError as e:
        logging.error("Error during deduplication: %s", e)
        return None
    except Exception as e:
        # Callers rely on None to signal failure, so do not propagate.
        logging.error("An unexpected error occurred: %s", e)
        return None

    logging.info(
        "Deduplication completed successfully based on columns: %s", subset
    )
    logging.info("Original DataFrame shape: %s", df.shape)
    logging.info("Deduplicated DataFrame shape: %s", df_deduplicated.shape)
    return df_deduplicated
if __name__ == '__main__':
    # Demo: a small frame with repeated (col1, col2) pairs.
    df = pd.DataFrame(
        {
            'col1': [1, 2, 2, 3, 4, 4, 4],
            'col2': ['A', 'B', 'B', 'C', 'D', 'D', 'D'],
            'col3': [10, 20, 20, 30, 40, 40, 40],
        }
    )
    columns_to_deduplicate = ['col1', 'col2']

    # Pass a copy so the demo frame is left as constructed.
    deduplicated_df = deduplicate_records(df.copy(), columns_to_deduplicate)

    # None signals that deduplication failed; the error was already logged.
    if deduplicated_df is not None:
        print(deduplicated_df)
# Add your comment