from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest
from sklearn.datasets import fetch_openml
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import KFold, cross_val_score
from tune_sklearn import TuneGridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import pandas as pd

# Fetch version 1 of the OpenML "titanic" dataset as pandas objects:
# X receives the feature table and y the target column.
X, y = fetch_openml(
    "titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
    parser="pandas",
)
# Preview the first rows of the feature table.
X.head()
pclass name sex age sibsp parch ticket fare cabin embarked boat body home.dest
0 1 Allen, Miss. Elisabeth Walton female 29.0000 0 0 24160 211.3375 B5 S 2 NaN St Louis, MO
1 1 Allison, Master. Hudson Trevor male 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN Montreal, PQ / Chesterville, ON
2 1 Allison, Miss. Helen Loraine female 2.0000 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
3 1 Allison, Mr. Hudson Joshua Creighton male 30.0000 1 2 113781 151.5500 C22 C26 S NaN 135.0 Montreal, PQ / Chesterville, ON
4 1 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.0000 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
# Preview the first rows of the target returned by fetch_openml.
y.head()
0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: category
Categories (2, object): ['0', '1']
# Columns fed to the model, grouped by how they are preprocessed.
numeric_features = ["age", "fare"]
categorical_features = ["embarked", "pclass"]

# Numeric columns: fill missing values with the column median, then standardize.
# Built with the Pipeline class because make_pipeline is not imported at the
# top of this file (calling it raised NameError). The explicit step names are
# the ones make_pipeline would generate, so nested parameter keys such as
# "preprocessor__num__simpleimputer__strategy" keep the same spelling.
numeric_transformer = Pipeline(
    steps=[
        ("simpleimputer", SimpleImputer(strategy="median")),
        ("standardscaler", StandardScaler()),
    ]
)

selected_features = numeric_features + categorical_features
print(selected_features)

# Keep only the modelling columns and preview the reduced table.
X = X[selected_features]
X.head()
['age', 'fare', 'embarked', 'pclass']
age fare embarked pclass
0 29.0000 211.3375 S 1
1 0.9167 151.5500 S 1
2 2.0000 151.5500 S 1
3 30.0000 151.5500 S 1
4 25.0000 151.5500 S 1

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Number of training rows (used later to size max_iter).
m = X_train.shape[0]

# Column-wise preprocessing: the numeric transformer is applied to the numeric
# columns, and the categorical columns are one-hot encoded into a dense array,
# with categories unseen during fit ignored at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_features,
        ),
    ],
    verbose_feature_names_out=True,
)

# Keep the 3 best-scoring features after preprocessing.
select_features = SelectKBest(k=3)

# Full modelling chain: preprocess -> select features -> logistic regression
# trained by SGD with a constant learning rate.
steps = [
    ("preprocessor", preprocessor),
    ("select_features", select_features),
    (
        "logistic_regressor_sgd",
        SGDClassifier(
            loss="log_loss",
            learning_rate="constant",
            eta0=1e-4,
            n_jobs=-1,
            shuffle=True,
        ),
    ),
]
logistic_reg_pipeline = Pipeline(steps)

# List every tunable parameter name exposed by the pipeline.
logistic_reg_pipeline.get_params().keys()
dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'select_features', 'logistic_regressor_sgd', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__simpleimputer', 'preprocessor__num__standardscaler', 'preprocessor__num__simpleimputer__add_indicator', 'preprocessor__num__simpleimputer__copy', 'preprocessor__num__simpleimputer__fill_value', 'preprocessor__num__simpleimputer__keep_empty_features', 'preprocessor__num__simpleimputer__missing_values', 'preprocessor__num__simpleimputer__strategy', 'preprocessor__num__simpleimputer__verbose', 'preprocessor__num__standardscaler__copy', 'preprocessor__num__standardscaler__with_mean', 'preprocessor__num__standardscaler__with_std', 'preprocessor__cat__categories', 'preprocessor__cat__drop', 'preprocessor__cat__dtype', 'preprocessor__cat__handle_unknown', 'preprocessor__cat__max_categories', 'preprocessor__cat__min_frequency', 'preprocessor__cat__sparse', 'preprocessor__cat__sparse_output', 'select_features__k', 'select_features__score_func', 'logistic_regressor_sgd__alpha', 'logistic_regressor_sgd__average', 'logistic_regressor_sgd__class_weight', 'logistic_regressor_sgd__early_stopping', 'logistic_regressor_sgd__epsilon', 'logistic_regressor_sgd__eta0', 'logistic_regressor_sgd__fit_intercept', 'logistic_regressor_sgd__l1_ratio', 'logistic_regressor_sgd__learning_rate', 'logistic_regressor_sgd__loss', 'logistic_regressor_sgd__max_iter', 'logistic_regressor_sgd__n_iter_no_change', 'logistic_regressor_sgd__n_jobs', 'logistic_regressor_sgd__penalty', 'logistic_regressor_sgd__power_t', 'logistic_regressor_sgd__random_state', 'logistic_regressor_sgd__shuffle', 'logistic_regressor_sgd__tol', 
'logistic_regressor_sgd__validation_fraction', 'logistic_regressor_sgd__verbose', 'logistic_regressor_sgd__warm_start'])
# Hyper-parameter candidates: three constant learning rates, and a single
# iteration budget of ceil(1e6 / m) where m is the number of training rows.
param_grid = {
    'logistic_regressor_sgd__eta0': [1e-4, 1e-3, 1e-2],
    'logistic_regressor_sgd__max_iter': [int(np.ceil(1e6 / m))],
}

# Randomized search over the pipeline, scored by ROC AUC with 5-fold CV.
# NOTE: the grid above holds only 3 combinations, so with n_iter=5
# scikit-learn warns and runs 3 iterations.
randsearch_auc = RandomizedSearchCV(
    estimator=logistic_reg_pipeline,
    param_distributions=param_grid,
    n_iter=5,
    scoring='roc_auc',
    cv=5,
    verbose=2,
)

# A chunked-CSV loop that called randsearch_auc.partial_fit(...) used to sit
# here. It was removed because it could never run: tqdm and chunksize were
# undefined, RandomizedSearchCV has no partial_fit method, and the loop
# overwrote X with rows from an unrelated file. The search is fitted once on
# the training split instead.
randsearch_auc.fit(X_train, y_train)
/workspaces/data_mining/.venv/lib/python3.9/site-packages/sklearn/model_selection/_search.py:305: UserWarning: The total space of parameters 3 is smaller than n_iter=5. Running 3 iterations. For exhaustive searches, use GridSearchCV.
  warnings.warn(
RandomizedSearchCV(cv=10,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               ['age',
                                                                                'fare']),
                                                                              ('cat',
                                                                               OneHotEncoder(handle_unknown='ignore',
                                                                                             sparse_output=False),
                                                                               ['embarked',
                                                                                'pclass'])])),
                                             ('select_features',
                                              SelectKBest(k=3)),
                                             ('logistic_regressor_sgd',
                                              SGDClassifier(eta0=0.0001,
                                                            learning_rate='constant',
                                                            loss='log_loss'))]),
                   n_iter=5,
                   param_distributions={'logistic_regressor_sgd__eta0': [0.0001,
                                                                         0.001,
                                                                         0.01],
                                        'logistic_regressor_sgd__max_iter': [956]},
                   scoring='roc_auc', verbose=False)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Inspect the per-candidate cross-validation results recorded by the search.
randsearch_auc.cv_results_
{'mean_fit_time': array([0.01125438, 0.01076853, 0.01036565]),
 'std_fit_time': array([0.00041194, 0.00034422, 0.00011   ]),
 'mean_score_time': array([0.00501354, 0.00504141, 0.00494459]),
 'std_score_time': array([1.32287857e-04, 1.52852150e-04, 8.08919104e-05]),
 'param_logistic_regressor_sgd__max_iter': masked_array(data=[956, 956, 956],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'param_logistic_regressor_sgd__eta0': masked_array(data=[0.0001, 0.001, 0.01],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'logistic_regressor_sgd__max_iter': 956,
   'logistic_regressor_sgd__eta0': 0.0001},
  {'logistic_regressor_sgd__max_iter': 956,
   'logistic_regressor_sgd__eta0': 0.001},
  {'logistic_regressor_sgd__max_iter': 956,
   'logistic_regressor_sgd__eta0': 0.01}],
 'split0_test_score': array([0.59347997, 0.59347997, 0.59347997]),
 'split1_test_score': array([0.6948154 , 0.69638649, 0.69874313]),
 'split2_test_score': array([0.75490966, 0.75569521, 0.75569521]),
 'split3_test_score': array([0.66005499, 0.65534171, 0.65377062]),
 'split4_test_score': array([0.70168892, 0.70326002, 0.70404556]),
 'split5_test_score': array([0.66297591, 0.66297591, 0.66297591]),
 'split6_test_score': array([0.72668998, 0.72707848, 0.72785548]),
 'split7_test_score': array([0.65271132, 0.65271132, 0.6523126 ]),
 'split8_test_score': array([0.67663477, 0.67503987, 0.6738437 ]),
 'split9_test_score': array([0.66846093, 0.66846093, 0.66846093]),
 'mean_test_score': array([0.67924218, 0.67904299, 0.67911831]),
 'std_test_score': array([0.04187847, 0.04246472, 0.04282848]),
 'rank_test_score': array([1, 3, 2], dtype=int32)}
# Predict labels for the held-out rows using the fitted search object.
y_pred = randsearch_auc.predict(X_test)

# Put true and predicted labels side by side for inspection.
pred_df = pd.DataFrame(
    data={
        'y': y_test,
        'y_pred': y_pred,
    }
)

# Best mean cross-validated score found by the search.
randsearch_auc.best_score_
# Disabled Gini computation, kept for reference:
#gini = 2*roc_auc_score(y_test, y_pred)-1
0.679242184672271
# Score the model that was actually trained. RandomizedSearchCV fits clones of
# the estimator it is given, so logistic_reg_pipeline itself is never fitted by
# the search (NOTE(review): assuming it is not fitted elsewhere in the
# notebook) and calling .score on it raises NotFittedError. best_estimator_ is
# the refitted winning pipeline; its .score is the pipeline's own default
# metric, not the search's roc_auc scorer.
test_score = randsearch_auc.best_estimator_.score(X_test, y_test)
print(test_score)
0.6221374045801527