import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tune_sklearn import TuneGridSearchCV
# Load the Titanic dataset from OpenML as pandas objects: X holds the feature
# columns and y the "survived" target.
X, y = fetch_openml(
    "titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
)
X.head()
| | pclass | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO |
1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON |
2 | 1 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
3 | 1 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | 135.0 | Montreal, PQ / Chesterville, ON |
4 | 1 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
# Preview the first rows of the target labels.
y.head()
0 1
1 1
2 0
3 0
4 0
Name: survived, dtype: category
Categories (2, object): ['0', '1']
# Numeric columns: fill missing values with the median, then standardise.
numeric_features = ["age", "fare"]
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"), StandardScaler()
)

# Categorical columns (one-hot encoded by the preprocessor).
categorical_features = ["embarked", "pclass"]

# Restrict the feature frame to the columns the model actually uses.
selected_features = numeric_features + categorical_features
print(selected_features)
X = X[selected_features]
X.head()
['age', 'fare', 'embarked', 'pclass']
| | age | fare | embarked | pclass |
---|---|---|---|---|
0 | 29.0000 | 211.3375 | S | 1 |
1 | 0.9167 | 151.5500 | S | 1 |
2 | 2.0000 | 151.5500 | S | 1 |
3 | 30.0000 | 151.5500 | S | 1 |
4 | 25.0000 | 151.5500 | S | 1 |
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Number of training samples.
m = len(X_train.index)
# Transform the data: route the numeric columns through the numeric
# transformer and one-hot encode the categorical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        (
            "cat",
            # Categories unseen during fit are encoded as all zeros instead
            # of raising; dense output keeps downstream steps simple.
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_features,
        ),
    ],
    verbose_feature_names_out=True,
)
# Keep the 3 highest-scoring features produced by the preprocessor.
select_features = SelectKBest(k=3)

# Preprocess -> select features -> logistic regression trained with SGD.
steps = [
    ('preprocessor', preprocessor),
    ('select_features', select_features),
    (
        'logistic_regressor_sgd',
        SGDClassifier(
            loss='log_loss',
            learning_rate='constant',
            eta0=1e-4,
            n_jobs=-1,
            shuffle=True,
        ),
    ),
]
logistic_reg_pipeline = Pipeline(steps)

# List the tunable parameter names (the "<step>__<param>" keys used by the
# hyper-parameter search).
logistic_reg_pipeline.get_params().keys()
dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'select_features', 'logistic_regressor_sgd', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__simpleimputer', 'preprocessor__num__standardscaler', 'preprocessor__num__simpleimputer__add_indicator', 'preprocessor__num__simpleimputer__copy', 'preprocessor__num__simpleimputer__fill_value', 'preprocessor__num__simpleimputer__keep_empty_features', 'preprocessor__num__simpleimputer__missing_values', 'preprocessor__num__simpleimputer__strategy', 'preprocessor__num__simpleimputer__verbose', 'preprocessor__num__standardscaler__copy', 'preprocessor__num__standardscaler__with_mean', 'preprocessor__num__standardscaler__with_std', 'preprocessor__cat__categories', 'preprocessor__cat__drop', 'preprocessor__cat__dtype', 'preprocessor__cat__handle_unknown', 'preprocessor__cat__max_categories', 'preprocessor__cat__min_frequency', 'preprocessor__cat__sparse', 'preprocessor__cat__sparse_output', 'select_features__k', 'select_features__score_func', 'logistic_regressor_sgd__alpha', 'logistic_regressor_sgd__average', 'logistic_regressor_sgd__class_weight', 'logistic_regressor_sgd__early_stopping', 'logistic_regressor_sgd__epsilon', 'logistic_regressor_sgd__eta0', 'logistic_regressor_sgd__fit_intercept', 'logistic_regressor_sgd__l1_ratio', 'logistic_regressor_sgd__learning_rate', 'logistic_regressor_sgd__loss', 'logistic_regressor_sgd__max_iter', 'logistic_regressor_sgd__n_iter_no_change', 'logistic_regressor_sgd__n_jobs', 'logistic_regressor_sgd__penalty', 'logistic_regressor_sgd__power_t', 'logistic_regressor_sgd__random_state', 'logistic_regressor_sgd__shuffle', 'logistic_regressor_sgd__tol', 
'logistic_regressor_sgd__validation_fraction', 'logistic_regressor_sgd__verbose', 'logistic_regressor_sgd__warm_start'])
# Candidate hyper-parameters for the SGD step of the pipeline.
param_grid = {
    'logistic_regressor_sgd__eta0': [1e-4, 1e-3, 1e-2],
    # Iteration budget sized so that roughly 1e6 samples are seen in total
    # (m is the number of training rows).
    'logistic_regressor_sgd__max_iter': [int(np.ceil(1e6 / m))],
}

# Every entry of param_grid is a finite list, so the search space is finite.
# Asking for more iterations than there are candidates only makes
# scikit-learn warn and fall back to the grid size, so cap n_iter up front.
n_candidates = int(np.prod([len(values) for values in param_grid.values()]))

randsearch_auc = RandomizedSearchCV(
    estimator=logistic_reg_pipeline,
    param_distributions=param_grid,
    n_iter=min(5, n_candidates),
    scoring='roc_auc',
    cv=5,
    verbose=2,
)
# Out-of-core training through `partial_fit` is not possible here:
# RandomizedSearchCV only exposes `fit`, so the search is run in a single call
# on the in-memory training split. With the default refit behaviour, the best
# candidate is then retrained on the whole training split.
randsearch_auc.fit(X_train, y_train)
/workspaces/data_mining/.venv/lib/python3.9/site-packages/sklearn/model_selection/_search.py:305: UserWarning: The total space of parameters 3 is smaller than n_iter=5. Running 3 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
RandomizedSearchCV(cv=10, estimator=Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('num', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), ['age', 'fare']), ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['embarked', 'pclass'])])), ('select_features', SelectKBest(k=3)), ('logistic_regressor_sgd', SGDClassifier(eta0=0.0001, learning_rate='constant', loss='log_loss'))]), n_iter=5, param_distributions={'logistic_regressor_sgd__eta0': [0.0001, 0.001, 0.01], 'logistic_regressor_sgd__max_iter': [956]}, scoring='roc_auc', verbose=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=10, estimator=Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('num', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), ['age', 'fare']), ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['embarked', 'pclass'])])), ('select_features', SelectKBest(k=3)), ('logistic_regressor_sgd', SGDClassifier(eta0=0.0001, learning_rate='constant', loss='log_loss'))]), n_iter=5, param_distributions={'logistic_regressor_sgd__eta0': [0.0001, 0.001, 0.01], 'logistic_regressor_sgd__max_iter': [956]}, scoring='roc_auc', verbose=False)
Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('num', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), ['age', 'fare']), ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['embarked', 'pclass'])])), ('select_features', SelectKBest(k=3)), ('logistic_regressor_sgd', SGDClassifier(eta0=0.0001, learning_rate='constant', loss='log_loss'))])
ColumnTransformer(transformers=[('num', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), ['age', 'fare']), ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['embarked', 'pclass'])])
['age', 'fare']
SimpleImputer(strategy='median')
StandardScaler()
['embarked', 'pclass']
OneHotEncoder(handle_unknown='ignore', sparse_output=False)
SelectKBest(k=3)
SGDClassifier(eta0=0.0001, learning_rate='constant', loss='log_loss')
# Inspect the per-candidate, per-split results recorded by the search.
randsearch_auc.cv_results_
{'mean_fit_time': array([0.01125438, 0.01076853, 0.01036565]),
'std_fit_time': array([0.00041194, 0.00034422, 0.00011 ]),
'mean_score_time': array([0.00501354, 0.00504141, 0.00494459]),
'std_score_time': array([1.32287857e-04, 1.52852150e-04, 8.08919104e-05]),
'param_logistic_regressor_sgd__max_iter': masked_array(data=[956, 956, 956],
mask=[False, False, False],
fill_value='?',
dtype=object),
'param_logistic_regressor_sgd__eta0': masked_array(data=[0.0001, 0.001, 0.01],
mask=[False, False, False],
fill_value='?',
dtype=object),
'params': [{'logistic_regressor_sgd__max_iter': 956,
'logistic_regressor_sgd__eta0': 0.0001},
{'logistic_regressor_sgd__max_iter': 956,
'logistic_regressor_sgd__eta0': 0.001},
{'logistic_regressor_sgd__max_iter': 956,
'logistic_regressor_sgd__eta0': 0.01}],
'split0_test_score': array([0.59347997, 0.59347997, 0.59347997]),
'split1_test_score': array([0.6948154 , 0.69638649, 0.69874313]),
'split2_test_score': array([0.75490966, 0.75569521, 0.75569521]),
'split3_test_score': array([0.66005499, 0.65534171, 0.65377062]),
'split4_test_score': array([0.70168892, 0.70326002, 0.70404556]),
'split5_test_score': array([0.66297591, 0.66297591, 0.66297591]),
'split6_test_score': array([0.72668998, 0.72707848, 0.72785548]),
'split7_test_score': array([0.65271132, 0.65271132, 0.6523126 ]),
'split8_test_score': array([0.67663477, 0.67503987, 0.6738437 ]),
'split9_test_score': array([0.66846093, 0.66846093, 0.66846093]),
'mean_test_score': array([0.67924218, 0.67904299, 0.67911831]),
'std_test_score': array([0.04187847, 0.04246472, 0.04282848]),
'rank_test_score': array([1, 3, 2], dtype=int32)}
# Predict test-set labels with the refitted best estimator and line them up
# against the true labels for inspection.
y_pred = randsearch_auc.predict(X_test)
pred_df = pd.DataFrame({'y': y_test, 'y_pred': y_pred})

# Mean cross-validated ROC AUC of the best candidate (a validation score, not
# a test-set score).
randsearch_auc.best_score_  # gini = 2*roc_auc_score(y_test, y_pred)-1
0.679242184672271
# Accuracy of the tuned model on the held-out test split.
# `logistic_reg_pipeline` itself is never fitted: the search trains clones of
# it, so the fitted model lives in `randsearch_auc.best_estimator_`.
test_score = randsearch_auc.best_estimator_.score(X_test, y_test)
print(test_score)
0.6221374045801527