![]() |
VOOZH | about |
This guide walks through the complete process of building and evaluating classification models using scikit-learn.
Before building any model, we need to prepare our data:
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the raw customer records from disk.
data = pd.read_csv('customer_churn.csv')

# The 'churn' column is the label; every other column is a predictor.
y = data['churn']
X = data.drop(columns='churn')

# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so that it is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Understanding your data is crucial before modeling:
import matplotlib.pyplot as plt
import seaborn as sns

# Count missing values per feature and show the relative class frequencies
# of the target in the training split.
print(X_train.isnull().sum())
print(y_train.value_counts(normalize=True))

# Correlation is only defined for numeric columns. The feature frame also
# holds object-dtype columns (they are one-hot encoded later), and calling
# .corr() on those raises in recent pandas releases, so restrict the frame
# to numeric columns first.
numeric_corr = X_train.select_dtypes(include='number').corr()

plt.figure(figsize=(12, 10))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()

# Transforming raw data into model-ready features:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Partition the training columns by dtype: 64-bit ints/floats are treated as
# numeric, object-dtype columns as categorical.
# NOTE(review): columns with any other dtype (e.g. bool, category, int32)
# match neither selector -- confirm the dataset has none that matter.
categorical_features = X_train.select_dtypes(include=['object']).columns
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns

# Standardize the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Fit on the training split only, then reuse the fitted transformer on the
# test split so no test-set statistics influence the preprocessing.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Let's evaluate several classification algorithms:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
# Candidate classifiers, each seeded for reproducible results.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
}

# Compare the candidates with 5-fold cross-validation on the F1 score.
#
# Each model is wrapped in a Pipeline together with the preprocessor and
# scored on the raw training frame. That way the scaler and encoder are
# re-fit on the training portion of every fold; scoring on features that
# were preprocessed once on the whole training split would let the
# validation folds leak into the preprocessing statistics.
# cross_val_score clones the estimator it is given, so the module-level
# `preprocessor` and the entries of `models` are left untouched.
# NOTE(review): scoring='f1' assumes a binary target whose positive label
# is 1 -- confirm against the encoding of the 'churn' column.
for name, model in models.items():
    candidate = Pipeline([('preprocess', preprocessor), ('model', model)])
    scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring='f1')
    print(f"{name}: Mean F1 = {scores.mean():.4f}, Std = {scores.std():.4f}")

# Optimizing model performance through hyperparameter tuning:
from sklearn.model_selection import GridSearchCV
# Example: exhaustive grid search over Random Forest hyperparameters.
# The grid below spans 3 * 4 * 3 * 3 = 108 combinations, each scored with
# 5-fold cross-validation on F1.
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)

# NOTE(review): the search runs on features that were preprocessed outside
# the CV loop; wrap preprocessor and model in a Pipeline if strict
# fold-level isolation is required.
rf_grid.fit(X_train_processed, y_train)

print(f"Best parameters: {rf_grid.best_params_}")
print(f"Best F1 score: {rf_grid.best_score_:.4f}")

# Keep the estimator built from the winning combination for the
# evaluation steps that follow.
best_rf = rf_grid.best_estimator_

# Comprehensive evaluation of model performance:
from sklearn.metrics import (
confusion_matrix, roc_curve, roc_auc_score,
precision_recall_curve, average_precision_score
)
# Score the held-out test split with the tuned forest. Column 1 of
# predict_proba is the probability of the second class -- presumably the
# churn class; verify against best_rf.classes_.
y_pred_proba = best_rf.predict_proba(X_test_processed)[:, 1]
y_pred = best_rf.predict(X_test_processed)

# Per-class precision / recall / F1 summary.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
# Confusion matrix of test-set predictions, drawn as an annotated heatmap
# with integer cell counts.
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
# ROC curve: compute the curve points and the area under it from the
# predicted probabilities, then plot against the chance diagonal.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.4f}')
plt.plot([0, 1], [0, 1], 'k--')  # dashed black reference line
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
# Precision-recall curve, labelled with the average precision score
# computed from the same predicted probabilities.
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
ap = average_precision_score(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'AP = {ap:.4f}')
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

# Understanding what drives your model's predictions:
import shap
# For Random Forest interpretation
explainer = shap.TreeExplainer(best_rf)

# The preprocessor can return a SciPy sparse matrix (the one-hot block is
# mostly zeros). Densify it so both the explainer and the plotting helper
# receive a plain 2-D array.
X_test_dense = (
    X_test_processed.toarray()
    if hasattr(X_test_processed, 'toarray')
    else X_test_processed
)
shap_values = explainer.shap_values(X_test_dense)

# Pick the SHAP values of the second (index 1) class. Depending on the
# installed SHAP version, a classifier yields either a list with one
# (n_samples, n_features) array per class, or a single 3-D array of shape
# (n_samples, n_features, n_classes). Indexing a 3-D array with [1] would
# select the second *sample* instead of the second class, so handle each
# layout explicitly.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
elif getattr(shap_values, 'ndim', 2) == 3:
    positive_class_shap = shap_values[:, :, 1]
else:
    # Already a single (n_samples, n_features) array.
    positive_class_shap = shap_values

# Feature importance plot
plt.figure(figsize=(10, 8))
shap.summary_plot(
    positive_class_shap,
    X_test_dense,
    feature_names=preprocessor.get_feature_names_out(),
)

# When deploying your model to production, consider:
import joblib
# Save model and preprocessor
joblib.dump(best_rf, 'best_model.pkl')
joblib.dump(preprocessor, 'preprocessor.pkl')


def predict_churn(data, model_path='best_model.pkl',
                  preprocessor_path='preprocessor.pkl'):
    """Predict churn for new records using the persisted artifacts.

    Args:
        data: Raw feature records in the form the saved preprocessor's
            ``transform`` accepts (presumably a DataFrame with the same
            columns as the training features -- verify against caller).
        model_path: Path of the serialized classifier. Defaults to the
            file written by the save step above.
        preprocessor_path: Path of the serialized preprocessor. Defaults
            to the file written by the save step above.

    Returns:
        A ``(prediction, probability)`` tuple: the predicted labels and
        column 1 of ``predict_proba`` for each input row.
    """
    # NOTE(security): joblib.load unpickles arbitrary Python objects, so
    # only point these paths at artifact files you trust.
    model = joblib.load(model_path)
    prep = joblib.load(preprocessor_path)

    # Apply the exact transformation that was fitted at training time.
    processed_data = prep.transform(data)

    prediction = model.predict(processed_data)
    probability = model.predict_proba(processed_data)[:, 1]
    return prediction, probability


# By following this comprehensive guide, you'll be able to build, evaluate, interpret, and deploy classification models effectively.