Engineering AI Agents

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest
from sklearn.datasets import fetch_openml
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import KFold, cross_val_score
from tune_sklearn import TuneGridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import pandas as pd


# Download version 1 of the "titanic" dataset from OpenML as a pandas feature
# frame (X) plus a separate target (y), then preview the first feature rows.
X, y = fetch_openml(
    name="titanic",
    version=1,
    parser="pandas",
    as_frame=True,
    return_X_y=True,
)
X.head()

	pclass	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body	home.dest
0	1	Allen, Miss. Elisabeth Walton	female	29.0000	0	0	24160	211.3375	B5	S	2	NaN	St Louis, MO
1	1	Allison, Master. Hudson Trevor	male	0.9167	1	2	113781	151.5500	C22 C26	S	11	NaN	Montreal, PQ / Chesterville, ON
2	1	Allison, Miss. Helen Loraine	female	2.0000	1	2	113781	151.5500	C22 C26	S	NaN	NaN	Montreal, PQ / Chesterville, ON
3	1	Allison, Mr. Hudson Joshua Creighton	male	30.0000	1	2	113781	151.5500	C22 C26	S	NaN	135.0	Montreal, PQ / Chesterville, ON
4	1	Allison, Mrs. Hudson J C (Bessie Waldo Daniels)	female	25.0000	1	2	113781	151.5500	C22 C26	S	NaN	NaN	Montreal, PQ / Chesterville, ON

# Preview the first entries of the target returned alongside X.
y.head()

0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: category
Categories (2, object): ['0', '1']

# Numeric columns: fill missing values with the column median, then standardise.
# Built with the already-imported Pipeline class because make_pipeline is never
# imported in this file (the original call raised NameError). The step names are
# the ones make_pipeline would have generated ("simpleimputer",
# "standardscaler"), so nested parameter names stay the same.
numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(
    steps=[
        ("simpleimputer", SimpleImputer(strategy="median")),
        ("standardscaler", StandardScaler()),
    ]
)
categorical_features = ["embarked", "pclass"]

# Restrict the feature frame to the columns the model will actually use.
selected_features = numeric_features + categorical_features
print(selected_features)

X = X[selected_features]
X.head()

['age', 'fare', 'embarked', 'pclass']

	age	fare	embarked	pclass
0	29.0000	211.3375	S	1
1	0.9167	151.5500	S	1
2	2.0000	151.5500	S	1
3	30.0000	151.5500	S	1
4	25.0000	151.5500	S	1


# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Number of training rows.
m = X_train.shape[0]

# Column-wise preprocessing:
#   * "num": numeric_features are passed through numeric_transformer.
#   * "cat": categorical_features are one-hot encoded into a dense array;
#     categories not seen during fit are ignored instead of raising.
categorical_transformer = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    verbose_feature_names_out=True,
)

# Keep the 3 best-scoring preprocessed features (SelectKBest's default score
# function is used, since none is passed).
select_features = SelectKBest(k=3)

# Logistic regression (log loss) trained by SGD with a constant learning rate.
sgd_classifier = SGDClassifier(
    loss="log_loss",
    learning_rate="constant",
    eta0=1e-4,
    n_jobs=-1,
    shuffle=True,
)

# Full modelling pipeline: preprocess -> select features -> classify.
steps = [
    ("preprocessor", preprocessor),
    ("select_features", select_features),
    ("logistic_regressor_sgd", sgd_classifier),
]
logistic_reg_pipeline = Pipeline(steps=steps)

# List every tunable parameter name (useful for building the search grid).
logistic_reg_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'select_features', 'logistic_regressor_sgd', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__simpleimputer', 'preprocessor__num__standardscaler', 'preprocessor__num__simpleimputer__add_indicator', 'preprocessor__num__simpleimputer__copy', 'preprocessor__num__simpleimputer__fill_value', 'preprocessor__num__simpleimputer__keep_empty_features', 'preprocessor__num__simpleimputer__missing_values', 'preprocessor__num__simpleimputer__strategy', 'preprocessor__num__simpleimputer__verbose', 'preprocessor__num__standardscaler__copy', 'preprocessor__num__standardscaler__with_mean', 'preprocessor__num__standardscaler__with_std', 'preprocessor__cat__categories', 'preprocessor__cat__drop', 'preprocessor__cat__dtype', 'preprocessor__cat__handle_unknown', 'preprocessor__cat__max_categories', 'preprocessor__cat__min_frequency', 'preprocessor__cat__sparse', 'preprocessor__cat__sparse_output', 'select_features__k', 'select_features__score_func', 'logistic_regressor_sgd__alpha', 'logistic_regressor_sgd__average', 'logistic_regressor_sgd__class_weight', 'logistic_regressor_sgd__early_stopping', 'logistic_regressor_sgd__epsilon', 'logistic_regressor_sgd__eta0', 'logistic_regressor_sgd__fit_intercept', 'logistic_regressor_sgd__l1_ratio', 'logistic_regressor_sgd__learning_rate', 'logistic_regressor_sgd__loss', 'logistic_regressor_sgd__max_iter', 'logistic_regressor_sgd__n_iter_no_change', 'logistic_regressor_sgd__n_jobs', 'logistic_regressor_sgd__penalty', 'logistic_regressor_sgd__power_t', 'logistic_regressor_sgd__random_state', 'logistic_regressor_sgd__shuffle', 'logistic_regressor_sgd__tol', 
'logistic_regressor_sgd__validation_fraction', 'logistic_regressor_sgd__verbose', 'logistic_regressor_sgd__warm_start'])

# Candidate hyper-parameters for the SGD step of the pipeline.
param_grid = {
    'logistic_regressor_sgd__eta0': [1e-4, 1e-3, 1e-2],
    # Single value: ceil(1e6 / number of training rows) passes over the data.
    'logistic_regressor_sgd__max_iter': [int(np.ceil(1e6 / m))],
}

# Every entry above is a finite list, so the number of distinct candidates is
# the product of the list lengths.
n_candidates = int(np.prod([len(values) for values in param_grid.values()]))

randsearch_auc = RandomizedSearchCV(
    estimator=logistic_reg_pipeline,
    param_distributions=param_grid,
    # Never ask for more iterations than there are candidates: requesting 5
    # from a 3-candidate grid only makes scikit-learn emit a UserWarning and
    # fall back to the grid size anyway.
    n_iter=min(5, n_candidates),
    scoring='roc_auc',
    cv=5,
    verbose=2,
)

# Fit the hyper-parameter search on the Titanic training split.
# A chunked loop over "DATA/creditcard.csv" used to precede this call; it was
# removed because it relied on undefined names (tqdm, chunksize), ignored the
# chunks it read, and called partial_fit, which RandomizedSearchCV does not
# provide. A single fit() is the complete training step.
randsearch_auc.fit(X_train, y_train)

/workspaces/data_mining/.venv/lib/python3.9/site-packages/sklearn/model_selection/_search.py:305: UserWarning: The total space of parameters 3 is smaller than n_iter=5. Running 3 iterations. For exhaustive searches, use GridSearchCV.
  warnings.warn(

# Inspect the search's cross-validation results (fit/score timings, the sampled
# parameters, and per-split / mean / std / rank of the test scores).
randsearch_auc.cv_results_

{'mean_fit_time': array([0.01125438, 0.01076853, 0.01036565]),
 'std_fit_time': array([0.00041194, 0.00034422, 0.00011   ]),
 'mean_score_time': array([0.00501354, 0.00504141, 0.00494459]),
 'std_score_time': array([1.32287857e-04, 1.52852150e-04, 8.08919104e-05]),
 'param_logistic_regressor_sgd__max_iter': masked_array(data=[956, 956, 956],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'param_logistic_regressor_sgd__eta0': masked_array(data=[0.0001, 0.001, 0.01],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'logistic_regressor_sgd__max_iter': 956,
   'logistic_regressor_sgd__eta0': 0.0001},
  {'logistic_regressor_sgd__max_iter': 956,
   'logistic_regressor_sgd__eta0': 0.001},
  {'logistic_regressor_sgd__max_iter': 956,
   'logistic_regressor_sgd__eta0': 0.01}],
 'split0_test_score': array([0.59347997, 0.59347997, 0.59347997]),
 'split1_test_score': array([0.6948154 , 0.69638649, 0.69874313]),
 'split2_test_score': array([0.75490966, 0.75569521, 0.75569521]),
 'split3_test_score': array([0.66005499, 0.65534171, 0.65377062]),
 'split4_test_score': array([0.70168892, 0.70326002, 0.70404556]),
 'split5_test_score': array([0.66297591, 0.66297591, 0.66297591]),
 'split6_test_score': array([0.72668998, 0.72707848, 0.72785548]),
 'split7_test_score': array([0.65271132, 0.65271132, 0.6523126 ]),
 'split8_test_score': array([0.67663477, 0.67503987, 0.6738437 ]),
 'split9_test_score': array([0.66846093, 0.66846093, 0.66846093]),
 'mean_test_score': array([0.67924218, 0.67904299, 0.67911831]),
 'std_test_score': array([0.04187847, 0.04246472, 0.04282848]),
 'rank_test_score': array([1, 3, 2], dtype=int32)}

# Predict class labels for the held-out rows with the fitted search object.
y_pred = randsearch_auc.predict(X_test)

# Side-by-side table of true and predicted labels.
pred_df = pd.DataFrame(data={'y': y_test, 'y_pred': y_pred})

# Best mean cross-validated score found by the search.
randsearch_auc.best_score_
# NOTE(review): a test-set Gini (2 * roc_auc_score(...) - 1) was left disabled
# here. If it is revived, it should presumably be computed from predicted
# probabilities rather than the hard labels in y_pred -- confirm before enabling.

0.679242184672271

# Score the refitted best pipeline on the held-out test set.
# logistic_reg_pipeline itself is never fitted: RandomizedSearchCV trains
# clones of the estimator it is given, so calling score() on the original
# object would raise NotFittedError. The fitted model is best_estimator_.
test_score = randsearch_auc.best_estimator_.score(X_test, y_test)
print(test_score)

0.6221374045801527