ML Pipelines & Advanced Patterns
Production Pipeline patterns: full sklearn-to-Keras pipelines, feature engineering, custom transformers, serialization, and deployment-ready code.
End-to-End Sklearn Pipeline
Custom Transformers
BaseEstimator + TransformerMixin
Build custom feature engineering steps that plug into sklearn
Pipelines
▾
Example
Internals
python
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
class LogTransformer(BaseEstimator, TransformerMixin):
    """Apply a log1p transform to selected features to reduce skew.

    Parameters
    ----------
    columns : list of column labels or indices, optional
        Features to transform. When None, every feature is transformed.
    """

    def __init__(self, columns=None):
        # Stored as-is; BaseEstimator derives get_params/set_params from it.
        self.columns = columns

    def fit(self, X, y=None):
        # Stateless transform — nothing is learned from the data.
        return self

    def transform(self, X):
        transformed = X.copy()
        if self.columns is None:
            return np.log1p(transformed)
        transformed[self.columns] = np.log1p(transformed[self.columns])
        return transformed
class DomainFeatures(BaseEstimator, TransformerMixin):
    """Add hand-crafted domain features (ratio + binned age) to a DataFrame."""

    def fit(self, X, y=None):
        # Stateless: no statistics are computed during fit.
        return self

    def transform(self, X):
        result = X.copy()
        # Ratio feature; the epsilon guards against a zero income.
        result['balance_to_income'] = X['balance'] / (X['income'] + 1e-9)
        # Coarse age buckets for the downstream model.
        age_bins = [0, 30, 50, 100]
        age_labels = ['young', 'mid', 'senior']
        result['age_group'] = pd.cut(X['age'], bins=age_bins, labels=age_labels)
        return result
# Plug into a Pipeline — fully compatible
# NOTE(review): `Pipeline`, `HistGradientBoostingClassifier`, `preprocessor`,
# and the train/test splits are assumed to be defined earlier in the
# tutorial — confirm against the full example before running.
pipe = Pipeline([
# Step 1: add engineered columns (balance/income ratio + age buckets).
('domain_feats', DomainFeatures()),
# Step 2: log1p-transform the skewed monetary features.
('log_skewed', LogTransformer(columns=['income', 'balance'])),
# Step 3: column-wise encoding/scaling (defined elsewhere in the tutorial).
('preprocessor', preprocessor),
# Step 4: final estimator; fixed seed for reproducibility.
('model', HistGradientBoostingClassifier(random_state=42))
])
pipe.fit(X_train, y_train)
# Mean accuracy of the fitted pipeline on the held-out split.
print(pipe.score(X_test, y_test))
- BaseEstimator: provides `get_params()` / `set_params()` automatically from the `__init__` arguments — required for GridSearchCV compatibility.
- TransformerMixin: provides `fit_transform(X, y) = fit(X, y).transform(X)` for free.
- Stateless transformers: if nothing is learned during `fit()`, just `return self`. Stateful transformers compute statistics in `fit` (e.g., log-transform bounds learned from the training data) and store them as `self.attr_` (trailing-underscore convention).
- set_output API: call `pipe.set_output(transform='pandas')` to propagate DataFrame column names through all transforms (sklearn ≥ 1.2).
Keras Model in Sklearn Pipeline
SciKeras — Wrap Keras in a Sklearn Estimator
Use a Keras model inside a sklearn Pipeline for cross-validation
and hyperparameter search
▾
Setup
Example
# Install: pip install scikeras
# Fix: the import below was split across two lines in the original snippet,
# which is invalid Python — it must be a single statement.
from scikeras.wrappers import KerasClassifier, KerasRegressor

# Define a model factory function. Its keyword arguments (n_units, dropout)
# become tunable sklearn parameters via the wrapper below.
def build_model(n_units=128, dropout=0.3):
    model = keras.Sequential([...])  # placeholder architecture
    model.compile(...)               # placeholder compile arguments
    return model

# Wrap as a sklearn estimator: factory kwargs are routed to build_model,
# fit kwargs (epochs, batch_size, verbose) are used during training.
clf = KerasClassifier(
    model=build_model,
    n_units=128, dropout=0.3,
    epochs=50, batch_size=32,
    verbose=0
)
python
from scikeras.wrappers import KerasClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
def build_model(n_units=64, dropout=0.2, lr=1e-3):
    """Factory for a small binary-classification MLP.

    The keyword arguments mirror the hyperparameters exposed to
    sklearn search via the SciKeras wrapper (units, dropout rate,
    learning rate).
    """
    # NOTE(review): assumes `keras` is imported earlier in the tutorial.
    layers = [
        keras.layers.Input(shape=(30,)),
        keras.layers.Dense(n_units, activation='relu'),
        keras.layers.Dropout(dropout),
        keras.layers.Dense(1, activation='sigmoid'),
    ]
    model = keras.Sequential(layers)
    optimizer = keras.optimizers.Adam(lr)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model
# Wrap + pipeline
# NOTE(review): assumes `keras`, `X_train`, `y_train` are defined earlier
# in the tutorial — confirm against the full example before running.
keras_clf = KerasClassifier(
model=build_model, epochs=50, batch_size=32, verbose=0
)
# Scale inputs inside the pipeline so CV folds are scaled independently.
pipe = Pipeline([
('scaler', StandardScaler()),
('model', keras_clf)
])
# Tune Keras hyperparams via RandomizedSearchCV
# Keys use the `step__param` convention: `model__n_units` is routed by
# SciKeras to build_model's n_units argument.
param_dist = {
'model__n_units': [32, 64, 128, 256],
'model__dropout': [0.1, 0.2, 0.3, 0.4],
'model__lr': [1e-4, 1e-3, 5e-4]
}
rs = RandomizedSearchCV(
pipe, param_dist, n_iter=15, cv=5,
scoring='roc_auc', n_jobs=1 # n_jobs=1 with GPU-based Keras
)
rs.fit(X_train, y_train)
# Best cross-validated score and the hyperparameters that achieved it.
print(f'Best ROC-AUC: {rs.best_score_:.4f}')
print(rs.best_params_)