from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest
from sklearn.datasets import fetch_openml
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import KFold, cross_val_score
from tune_sklearn import TuneGridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import pandas as pd
# Load the Titanic passenger table from OpenML as pandas objects:
# X holds the feature columns, y the `survived` target.
X, y = fetch_openml(
    name="titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
    parser="pandas",
)
X.head()
| | pclass | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO |
| 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON |
| 2 | 1 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
| 3 | 1 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | 135.0 | Montreal, PQ / Chesterville, ON |
| 4 | 1 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
y.head()
0 1
1 1
2 0
3 0
4 0
Name: survived, dtype: category
Categories (2, object): ['0', '1']
# Columns fed to the model, grouped by how they are preprocessed.
numeric_features = ["age", "fare"]
categorical_features = ["embarked", "pclass"]

# Numeric columns: fill missing values with the column median, then standardise.
# Built with the already-imported `Pipeline` class because `make_pipeline` is
# never imported in this file. The explicit step names are the ones
# `make_pipeline` would have generated, so nested parameter keys such as
# `preprocessor__num__simpleimputer__strategy` stay the same.
numeric_transformer = Pipeline(
    steps=[
        ("simpleimputer", SimpleImputer(strategy="median")),
        ("standardscaler", StandardScaler()),
    ]
)

selected_features = numeric_features + categorical_features
print(selected_features)

# Keep only the modelled columns; every other column of the raw table is dropped.
X = X[selected_features]
X.head()
['age', 'fare', 'embarked', 'pclass']
| | age | fare | embarked | pclass |
|---|---|---|---|---|
| 0 | 29.0000 | 211.3375 | S | 1 |
| 1 | 0.9167 | 151.5500 | S | 1 |
| 2 | 2.0000 | 151.5500 | S | 1 |
| 3 | 30.0000 | 151.5500 | S | 1 |
| 4 | 25.0000 | 151.5500 | S | 1 |
# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Number of training rows; used further down to size the SGD iteration budget.
m = len(X_train)

# Transform the data
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns go through the impute-then-scale sub-pipeline.
        ("num", numeric_transformer, numeric_features),
        # Categorical columns are one-hot encoded into a dense array;
        # categories unseen during fit are ignored at transform time.
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_features,
        ),
    ],
    verbose_feature_names_out=True,
)

# Keep the 3 best-scoring columns of the preprocessed matrix.
select_features = SelectKBest(k=3)

# Logistic regression (log loss) trained by SGD with a constant step size.
steps = [
    ("preprocessor", preprocessor),
    ("select_features", select_features),
    (
        "logistic_regressor_sgd",
        SGDClassifier(
            loss="log_loss",
            learning_rate="constant",
            eta0=1e-4,
            n_jobs=-1,
            shuffle=True,
        ),
    ),
]
logistic_reg_pipeline = Pipeline(steps=steps)
logistic_reg_pipeline.get_params().keys()
dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'select_features', 'logistic_regressor_sgd', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__simpleimputer', 'preprocessor__num__standardscaler', 'preprocessor__num__simpleimputer__add_indicator', 'preprocessor__num__simpleimputer__copy', 'preprocessor__num__simpleimputer__fill_value', 'preprocessor__num__simpleimputer__keep_empty_features', 'preprocessor__num__simpleimputer__missing_values', 'preprocessor__num__simpleimputer__strategy', 'preprocessor__num__simpleimputer__verbose', 'preprocessor__num__standardscaler__copy', 'preprocessor__num__standardscaler__with_mean', 'preprocessor__num__standardscaler__with_std', 'preprocessor__cat__categories', 'preprocessor__cat__drop', 'preprocessor__cat__dtype', 'preprocessor__cat__handle_unknown', 'preprocessor__cat__max_categories', 'preprocessor__cat__min_frequency', 'preprocessor__cat__sparse', 'preprocessor__cat__sparse_output', 'select_features__k', 'select_features__score_func', 'logistic_regressor_sgd__alpha', 'logistic_regressor_sgd__average', 'logistic_regressor_sgd__class_weight', 'logistic_regressor_sgd__early_stopping', 'logistic_regressor_sgd__epsilon', 'logistic_regressor_sgd__eta0', 'logistic_regressor_sgd__fit_intercept', 'logistic_regressor_sgd__l1_ratio', 'logistic_regressor_sgd__learning_rate', 'logistic_regressor_sgd__loss', 'logistic_regressor_sgd__max_iter', 'logistic_regressor_sgd__n_iter_no_change', 'logistic_regressor_sgd__n_jobs', 'logistic_regressor_sgd__penalty', 'logistic_regressor_sgd__power_t', 'logistic_regressor_sgd__random_state', 'logistic_regressor_sgd__shuffle', 
'logistic_regressor_sgd__tol', 'logistic_regressor_sgd__validation_fraction', 'logistic_regressor_sgd__verbose', 'logistic_regressor_sgd__warm_start'])
# Hyper-parameter candidates for the SGD logistic-regression step.
# `max_iter` is pinned to a single value, ceil(1e6 / m) passes over the
# training data, i.e. about one million sample updates for `m` training rows.
param_grid = {
    'logistic_regressor_sgd__eta0': [1e-4, 1e-3, 1e-2],
    'logistic_regressor_sgd__max_iter': [int(np.ceil(1e6 / m))],
}

# Every entry above is a finite list, so the number of distinct candidates is
# the product of the list lengths. Cap `n_iter` at that count: asking for more
# draws than exist only triggers a UserWarning and is truncated anyway.
n_candidates = int(np.prod([len(values) for values in param_grid.values()]))

# Randomised search over the grid, scored by ROC AUC with 5-fold CV.
randsearch_auc = RandomizedSearchCV(
    estimator=logistic_reg_pipeline,
    param_distributions=param_grid,
    n_iter=min(5, n_candidates),
    scoring='roc_auc',
    cv=5,
    verbose=2,
)

# NOTE: an earlier draft followed this with a loop that streamed
# "DATA/creditcard.csv" in chunks and called `randsearch_auc.partial_fit(...)`
# on every chunk. It was removed because it could not run:
#   * RandomizedSearchCV does not provide `partial_fit`;
#   * `tqdm` and `chunksize` are not defined anywhere in this file;
#   * it rebound `X` to each CSV chunk yet passed X_train / y_train anyway.
# The search is fitted in one call on the Titanic training split instead.
randsearch_auc.fit(X_train, y_train)
/workspaces/data_mining/.venv/lib/python3.9/site-packages/sklearn/model_selection/_search.py:305: UserWarning: The total space of parameters 3 is smaller than n_iter=5. Running 3 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
RandomizedSearchCV(cv=10,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('simpleimputer',
SimpleImputer(strategy='median')),
('standardscaler',
StandardScaler())]),
['age',
'fare']),
('cat',
OneHotEncoder(handle_unknown='ignore',
sparse_output=False),
['embarked',
'pclass'])])),
('select_features',
SelectKBest(k=3)),
('logistic_regressor_sgd',
SGDClassifier(eta0=0.0001,
learning_rate='constant',
loss='log_loss'))]),
n_iter=5,
param_distributions={'logistic_regressor_sgd__eta0': [0.0001,
0.001,
0.01],
'logistic_regressor_sgd__max_iter': [956]},
scoring='roc_auc', verbose=False)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=10,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('simpleimputer',
SimpleImputer(strategy='median')),
('standardscaler',
StandardScaler())]),
['age',
'fare']),
('cat',
OneHotEncoder(handle_unknown='ignore',
sparse_output=False),
['embarked',
'pclass'])])),
('select_features',
SelectKBest(k=3)),
('logistic_regressor_sgd',
SGDClassifier(eta0=0.0001,
learning_rate='constant',
loss='log_loss'))]),
n_iter=5,
param_distributions={'logistic_regressor_sgd__eta0': [0.0001,
0.001,
0.01],
'logistic_regressor_sgd__max_iter': [956]},
scoring='roc_auc', verbose=False)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('simpleimputer',
SimpleImputer(strategy='median')),
('standardscaler',
StandardScaler())]),
['age', 'fare']),
('cat',
OneHotEncoder(handle_unknown='ignore',
sparse_output=False),
['embarked', 'pclass'])])),
('select_features', SelectKBest(k=3)),
('logistic_regressor_sgd',
SGDClassifier(eta0=0.0001, learning_rate='constant',
loss='log_loss'))])
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('simpleimputer',
SimpleImputer(strategy='median')),
('standardscaler',
StandardScaler())]),
['age', 'fare']),
('cat',
OneHotEncoder(handle_unknown='ignore',
sparse_output=False),
['embarked', 'pclass'])])
['age', 'fare']
SimpleImputer(strategy='median')
StandardScaler()
['embarked', 'pclass']
OneHotEncoder(handle_unknown='ignore', sparse_output=False)
SelectKBest(k=3)
SGDClassifier(eta0=0.0001, learning_rate='constant', loss='log_loss')
# Show the per-candidate cross-validation record of the fitted search:
# fit/score timings, the sampled parameters, per-split and mean test scores, and ranks.
randsearch_auc.cv_results_
{'mean_fit_time': array([0.01125438, 0.01076853, 0.01036565]),
'std_fit_time': array([0.00041194, 0.00034422, 0.00011 ]),
'mean_score_time': array([0.00501354, 0.00504141, 0.00494459]),
'std_score_time': array([1.32287857e-04, 1.52852150e-04, 8.08919104e-05]),
'param_logistic_regressor_sgd__max_iter': masked_array(data=[956, 956, 956],
mask=[False, False, False],
fill_value='?',
dtype=object),
'param_logistic_regressor_sgd__eta0': masked_array(data=[0.0001, 0.001, 0.01],
mask=[False, False, False],
fill_value='?',
dtype=object),
'params': [{'logistic_regressor_sgd__max_iter': 956,
'logistic_regressor_sgd__eta0': 0.0001},
{'logistic_regressor_sgd__max_iter': 956,
'logistic_regressor_sgd__eta0': 0.001},
{'logistic_regressor_sgd__max_iter': 956,
'logistic_regressor_sgd__eta0': 0.01}],
'split0_test_score': array([0.59347997, 0.59347997, 0.59347997]),
'split1_test_score': array([0.6948154 , 0.69638649, 0.69874313]),
'split2_test_score': array([0.75490966, 0.75569521, 0.75569521]),
'split3_test_score': array([0.66005499, 0.65534171, 0.65377062]),
'split4_test_score': array([0.70168892, 0.70326002, 0.70404556]),
'split5_test_score': array([0.66297591, 0.66297591, 0.66297591]),
'split6_test_score': array([0.72668998, 0.72707848, 0.72785548]),
'split7_test_score': array([0.65271132, 0.65271132, 0.6523126 ]),
'split8_test_score': array([0.67663477, 0.67503987, 0.6738437 ]),
'split9_test_score': array([0.66846093, 0.66846093, 0.66846093]),
'mean_test_score': array([0.67924218, 0.67904299, 0.67911831]),
'std_test_score': array([0.04187847, 0.04246472, 0.04282848]),
'rank_test_score': array([1, 3, 2], dtype=int32)}
# Class predictions for the held-out rows, made through the fitted search object.
y_pred = randsearch_auc.predict(X_test)

# Side-by-side table of true labels ("y") and predicted labels ("y_pred").
pred_df = pd.DataFrame(dict(y=y_test, y_pred=y_pred))

# Best mean cross-validated score found by the search (ROC AUC, per `scoring`).
randsearch_auc.best_score_
#gini = 2*roc_auc_score(y_test, y_pred)-1
0.679242184672271
# Accuracy on the held-out split, taken from the pipeline that the search refit
# with its best parameters. `logistic_reg_pipeline` itself is only the template
# handed to RandomizedSearchCV, which fits clones of it, so calling `.score` on
# it raises NotFittedError unless it has been fitted separately.
test_score = randsearch_auc.best_estimator_.score(X_test, y_test)
print(test_score)
0.6221374045801527