import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.datasets import load_iris, load_wine
from sklearn.metrics import accuracy_score
from IPython.display import SVG
from graphviz import Source
from IPython.display import display                               
from ipywidgets import interactive
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, poisson


df = pd.read_csv('./data/Pulsar_data.csv')

df.head(10)


X = df.drop(columns='Class')
y = df['Class']
feature_names = df.columns.tolist()[:-1]

print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=42)
X_train.head(10)

(3278, 4)


y_train.head(10)

233     1
831     1
2658    0
2495    0
2603    0
111     1
1370    1
1124    1
2170    0
2177    0
Name: Class, dtype: int64


y_train.value_counts()

0    1319
1    1303
Name: Class, dtype: int64


def fit_and_grapth_estimator(estimator):
    
    estimator.fit(X_train, y_train)
    
    accuracy = accuracy_score(y_train, estimator.predict(X_train))
    print(f'Training Accuracy: {accuracy:.4f}')
    
    class_names = [str(i) for i in range(y_train.nunique())]
    graph = Source(tree.export_graphviz(estimator, 
                                        out_file=None, 
                                        feature_names=feature_names, 
                                        class_names=class_names, 
                                        filled = True))
    display(SVG(graph.pipe(format='svg')))
    return estimator


def plot_tree(max_depth=1, min_samples_leaf=1):
    
    estimator = DecisionTreeClassifier(random_state=42, 
                                       max_depth=max_depth, 
                                       min_samples_leaf=min_samples_leaf)
    
    return fit_and_grapth_estimator(estimator)

display(interactive(plot_tree, 
                    max_depth=(1, 10, 1), 
                    min_samples_leaf=(1, 100, 1)))


clf_manual = DecisionTreeClassifier(random_state=42, 
                                    max_depth=10, 
                                    min_samples_leaf=5)

clf_manual.fit(X_train, y_train)
accuracy_manual = accuracy_score(y_test, clf_manual.predict(X_test))
print(f'Accuracy Manual: {accuracy_manual:.4f}')

Accuracy Manual: 0.8201


parameters_GridSearch = {'max_depth':[1, 10, 100], 
                         'min_samples_leaf':[1, 10, 100],
                        }


clf_DecisionTree = DecisionTreeClassifier(random_state=42)


GridSearch = GridSearchCV(clf_DecisionTree, 
                          parameters_GridSearch, 
                          cv=5, 
                          return_train_score=True, 
                          refit=True, 
                         )


GridSearch.fit(X_train, y_train);


GridSearch_results = pd.DataFrame(GridSearch.cv_results_)


print("Grid Search: \tBest parameters: ", GridSearch.best_params_, f", Best scores: {GridSearch.best_score_:.4f}\n")

Grid Search: 	Best parameters:  {'max_depth': 1, 'min_samples_leaf': 1} , Best scores: 0.8551


GridSearch_results


clf_GridSearch = GridSearch.best_estimator_


accuracy_GridSearch = accuracy_score(y_test, clf_GridSearch.predict(X_test))
print(f'Accuracy Manual:      {accuracy_manual:.4f}')
print(f'Accuracy Grid Search: {accuracy_GridSearch:.4f}')

Accuracy Manual:      0.8201
Accuracy Grid Search: 0.8430


# specify parameters and distributions to sample from
parameters_RandomSearch = {'max_depth': poisson(25), 
                           'min_samples_leaf': randint(1, 100)}


# run randomized search
n_iter_search = 9
RandomSearch = RandomizedSearchCV(clf_DecisionTree, 
                                  param_distributions=parameters_RandomSearch, 
                                  n_iter=n_iter_search, 
                                  cv=5, 
                                  return_train_score=True,
                                  random_state=42,
                                 )


# fit the random search instance
RandomSearch.fit(X_train, y_train);


RandomSearch_results = pd.DataFrame(RandomSearch.cv_results_)                 
print("Random Search: \tBest parameters: ", RandomSearch.best_params_, f", Best scores: {RandomSearch.best_score_:.3f}")

Random Search: 	Best parameters:  {'max_depth': 26, 'min_samples_leaf': 83} , Best scores: 0.855


RandomSearch_results.head(10)


clf_RandomSearch = RandomSearch.best_estimator_

accuracy_RandomSearch = accuracy_score(y_test, clf_RandomSearch.predict(X_test))
print(f'Accuracy Manual:        {accuracy_manual:.4f}')
print(f'Accuracy Grid search:   {accuracy_GridSearch:.4f}')
print(f'Accuracy Random Search: {accuracy_RandomSearch:.4f}')

Accuracy Manual:        0.8201
Accuracy Grid search:   0.8430
Accuracy Random Search: 0.8430


from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

def DecisionTree_CrossValidation(max_depth, min_samples_leaf, data, targets):
    """Decision Tree cross validation.
       Fits a Decision Tree with the given paramaters to the target 
       given data, calculated a CV accuracy score and returns the mean.
       The goal is to find combinations of max_depth, min_samples_leaf 
       that maximize the accuracy
    """
    
    estimator = DecisionTreeClassifier(random_state=42, 
                                       max_depth=max_depth, 
                                       min_samples_leaf=min_samples_leaf)
    
    cval = cross_val_score(estimator, data, targets, scoring='accuracy', cv=5)
    
    return cval.mean()


def optimize_DecisionTree(data, targets, pars, n_iter=5):
    """Apply Bayesian Optimization to Decision Tree parameters."""
    
    def crossval_wrapper(max_depth, min_samples_leaf):
        """Wrapper of Decision Tree cross validation. 
           Notice how we ensure max_depth, min_samples_leaf 
           are casted to integer before we pass them along.
        """
        return DecisionTree_CrossValidation(max_depth=int(max_depth), 
                                            min_samples_leaf=int(min_samples_leaf), 
                                            data=data, 
                                            targets=targets)

    optimizer = BayesianOptimization(f=crossval_wrapper, 
                                     pbounds=pars, 
                                     random_state=42, 
                                     verbose=2)
    optimizer.maximize(init_points=4, n_iter=n_iter)

    return optimizer


parameters_BayesianOptimization = {"max_depth": (1, 100), 
                                   "min_samples_leaf": (1, 100),
                                  }

BayesianOptimization = optimize_DecisionTree(X_train, 
                                             y_train, 
                                             parameters_BayesianOptimization, 
                                             n_iter=5)
print(BayesianOptimization.max)

|   iter    |  target   | max_depth | min_sa... |
-------------------------------------------------
|  1        |  0.8551   |  38.08    |  95.12    |
|  2        |  0.849    |  73.47    |  60.27    |
|  3        |  0.8524   |  16.45    |  16.44    |
|  4        |  0.8551   |  6.75     |  86.75    |
|  5        |  0.8551   |  22.46    |  67.29    |
|  6        |  0.8551   |  21.86    |  65.88    |
|  7        |  0.8242   |  100.0    |  1.0      |
|  8        |  0.8551   |  100.0    |  100.0    |
|  9        |  0.8551   |  1.0      |  37.51    |
=================================================
{'target': 0.8550716103235187, 'params': {'max_depth': 38.07947176588889, 'min_samples_leaf': 95.1207163345817}}


params = BayesianOptimization.max['params']


clf_BO = DecisionTreeClassifier(random_state=42, **params)
clf_BO = clf_BO.fit(X_train, y_train)


accuracy_BayesianOptimization = accuracy_score(y_test, clf_BO.predict(X_test))
print(f'Accuracy Manual:                {accuracy_manual:.4f}')
print(f'Accuracy Grid Search:           {accuracy_GridSearch:.4f}')
print(f'Accuracy Random Search:         {accuracy_RandomSearch:.4f}')
print(f'Accuracy Bayesian Optimization: {accuracy_BayesianOptimization:.4f}')

Accuracy Manual:                0.8201
Accuracy Grid Search:           0.8430
Accuracy Random Search:         0.8430
Accuracy Bayesian Optimization: 0.8430


max_depth_array = np.arange(1, 30)
min_samples_leaf_array = np.arange(2, 31)
Z = np.zeros((len(max_depth_array), len(min_samples_leaf_array)))

for i, max_depth in enumerate(max_depth_array):
    for j, min_samples_leaf in enumerate(min_samples_leaf_array):
        
        clf = DecisionTreeClassifier(random_state=42, 
                                     max_depth=max_depth, 
                                     min_samples_leaf=
                                     min_samples_leaf)
        clf.fit(X_train, y_train)
        acc = accuracy_score(y_test, clf.predict(X_test))
        Z[i, j] = acc
        
# Notice: have to transpose Z to match up with imshow
Z = Z.T

100%|██████████| 29/29 [00:12<00:00,  2.36it/s]


fig, ax = plt.subplots(figsize=(12, 6))

# notice that we are setting the extent and origin keywords
CS = ax.imshow(Z, extent=[1, 30, 2, 31], cmap='viridis', origin='lower')
ax.set(xlabel='max_depth', ylabel='min_samples_leaf')

fig.colorbar(CS);


import optuna
from optuna.samplers import TPESampler
from optuna.integration import LightGBMPruningCallback
from optuna.pruners import MedianPruner
import lightgbm as lgb

#%%

lgb_data_train = ...


def objective(trial):

    boosting_types = ["gbdt", "rf", "dart"]
    boosting_type = trial.suggest_categorical("boosting_type", boosting_types)

    params = {
        "objective": "binary",
        "boosting": boosting_type,
        "max_depth": trial.suggest_int("max_depth", 2, 63),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-5, 10),
        "scale_pos_weight": trial.suggest_uniform("scale_pos_weight", 10.0, 30.0),
    }

    N_iterations_max = 10_000
    early_stopping_rounds = 50

    if boosting_type == "dart":
        N_iterations_max = 100
        early_stopping_rounds = None

    cv_res = lgb.cv(
        params,
        lgb_data_train,
        num_boost_round=N_iterations_max,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=False,
        seed=42,
        callbacks=[LightGBMPruningCallback(trial, "auc")],
    )

    num_boost_round = len(cv_res["auc-mean"])
    trial.set_user_attr("num_boost_round", num_boost_round)

    return cv_res["auc-mean"][-1]


study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(seed=42),
    pruner=MedianPruner(n_warmup_steps=50),
)

study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2021-05-05 12:45:02,243] A new study created in memory with name: no-name-0cabd40c-f7e5-49ca-9c67-fb4fad9a8305

	Mean_SNR	STD_SNR	Kurtosis_SNR	Skewness_SNR	Class
0	27.555184	61.719016	2.208808	3.662680	1
1	1.358696	13.079034	13.312141	212.597029	1
2	73.112876	62.070220	1.268206	1.082920	1
3	146.568562	82.394624	-0.274902	-1.121848	1
4	6.071070	29.760400	5.318767	28.698048	1
5	32.919732	65.094197	1.605538	0.871364	1
6	34.101171	62.577395	1.890020	2.572133	1
7	50.107860	66.321825	1.456423	1.335182	1
8	176.119565	59.737720	-1.785377	2.940913	1
9	183.622910	79.932815	-1.326647	0.346712	1

	Mean_SNR	STD_SNR	Kurtosis_SNR	Skewness_SNR
233	159.849498	76.740010	-0.575016	-0.941293
831	4.243311	26.746490	7.110978	52.701218
2658	1.015050	10.449662	15.593479	316.011541
2495	2.235786	19.071848	9.659137	99.294390
2603	2.266722	15.512103	9.062942	99.652157
111	121.404682	47.965569	0.663053	1.203139
1370	35.209866	60.573157	1.635995	1.609377
1124	199.577759	58.656643	-1.862320	2.391870
2170	0.663043	8.571517	23.415092	655.614875
2177	3.112876	16.855717	8.301954	90.378150

	mean_fit_time	std_fit_time	mean_score_time	std_score_time	param_max_depth	param_min_samples_leaf	params	split0_test_score	split1_test_score	split2_test_score	...	mean_test_score	std_test_score	rank_test_score	split0_train_score	split1_train_score	split2_train_score	split3_train_score	split4_train_score	mean_train_score	std_train_score
0	0.006253	0.001529	0.003232	0.000902	1	1	{'max_depth': 1, 'min_samples_leaf': 1}	0.849524	0.862857	0.862595	...	0.855072	0.009121	1	0.862184	0.858369	0.858913	0.859390	0.860343	0.859840	0.001340
1	0.005775	0.001584	0.003146	0.001839	1	10	{'max_depth': 1, 'min_samples_leaf': 10}	0.849524	0.862857	0.862595	...	0.855072	0.009121	1	0.862184	0.858369	0.858913	0.859390	0.860343	0.859840	0.001340
2	0.007634	0.002424	0.002856	0.001284	1	100	{'max_depth': 1, 'min_samples_leaf': 100}	0.849524	0.862857	0.862595	...	0.855072	0.009121	1	0.862184	0.858369	0.858913	0.859390	0.860343	0.859840	0.001340
3	0.016240	0.002724	0.004229	0.001771	10	1	{'max_depth': 10, 'min_samples_leaf': 1}	0.847619	0.822857	0.858779	...	0.841729	0.011991	8	0.956128	0.948021	0.954242	0.962345	0.957102	0.955568	0.004633
4	0.014747	0.002093	0.003895	0.001379	10	10	{'max_depth': 10, 'min_samples_leaf': 10}	0.845714	0.847619	0.853053	...	0.846300	0.008757	6	0.896042	0.898903	0.898475	0.893232	0.895615	0.896453	0.002066
5	0.011232	0.001848	0.004265	0.001313	10	100	{'max_depth': 10, 'min_samples_leaf': 100}	0.849524	0.862857	0.862595	...	0.855072	0.009121	1	0.862184	0.858369	0.858913	0.859390	0.860343	0.859840	0.001340
6	0.020646	0.007275	0.006328	0.006625	100	1	{'max_depth': 100, 'min_samples_leaf': 1}	0.826667	0.796190	0.841603	...	0.824190	0.015056	9	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	0.000000
7	0.012461	0.001733	0.003525	0.001143	100	10	{'max_depth': 100, 'min_samples_leaf': 10}	0.845714	0.849524	0.854962	...	0.845154	0.009863	7	0.896996	0.899857	0.898475	0.894185	0.896568	0.897216	0.001909
8	0.006863	0.000958	0.003435	0.001022	100	100	{'max_depth': 100, 'min_samples_leaf': 100}	0.849524	0.862857	0.862595	...	0.855072	0.009121	1	0.862184	0.858369	0.858913	0.859390	0.860343	0.859840	0.001340

	mean_fit_time	std_fit_time	mean_score_time	std_score_time	param_max_depth	param_min_samples_leaf	params	split0_test_score	split1_test_score	split2_test_score	...	mean_test_score	std_test_score	rank_test_score	split0_train_score	split1_train_score	split2_train_score	split3_train_score	split4_train_score	mean_train_score	std_train_score
0	0.008612	0.002231	0.003622	0.001708	23	72	{'max_depth': 23, 'min_samples_leaf': 72}	0.849524	0.862857	0.862595	...	0.854308	0.010440	7	0.862184	0.858369	0.858913	0.859390	0.860343	0.859840	0.001340
1	0.006745	0.000889	0.001851	0.000398	26	83	{'max_depth': 26, 'min_samples_leaf': 83}	0.849524	0.862857	0.862595	...	0.855072	0.009121	1	0.862184	0.858369	0.858913	0.859390	0.860343	0.859840	0.001340
2	0.008370	0.001054	0.002575	0.000637	17	24	{'max_depth': 17, 'min_samples_leaf': 24}	0.847619	0.860952	0.868321	...	0.854310	0.012162	6	0.875536	0.879351	0.878456	0.875596	0.881792	0.878146	0.002373
3	0.008715	0.002070	0.002739	0.000699	27	88	{'max_depth': 27, 'min_samples_leaf': 88}	0.849524	0.862857	0.862595	...	0.855072	0.009121	1	0.862184	0.858369	0.858913	0.859390	0.860343	0.859840	0.001340
4	0.007544	0.000536	0.002442	0.000690	31	64	{'max_depth': 31, 'min_samples_leaf': 64}	0.849524	0.862857	0.862595	...	0.855072	0.009121	1	0.862184	0.858369	0.858913	0.859390	0.861296	0.860031	0.001460
5	0.007143	0.001285	0.002282	0.001095	27	89	{'max_depth': 27, 'min_samples_leaf': 89}	0.849524	0.862857	0.862595	...	0.855072	0.009121	1	0.862184	0.858369	0.858913	0.859390	0.860343	0.859840	0.001340
6	0.010512	0.004536	0.002821	0.000599	22	42	{'max_depth': 22, 'min_samples_leaf': 42}	0.849524	0.836190	0.856870	...	0.845540	0.015574	9	0.866476	0.865999	0.862726	0.865110	0.864156	0.864893	0.001343
7	0.006751	0.000624	0.001920	0.000266	21	62	{'max_depth': 21, 'min_samples_leaf': 62}	0.849524	0.840000	0.862595	...	0.850500	0.009778	8	0.862184	0.858369	0.858913	0.859390	0.861296	0.860031	0.001460
8	0.007471	0.000942	0.002427	0.000536	20	64	{'max_depth': 20, 'min_samples_leaf': 64}	0.849524	0.862857	0.862595	...	0.855072	0.009121	1	0.862184	0.858369	0.858913	0.859390	0.861296	0.860031	0.001460

Hyperparameter Optimization Example¶

Author and Date:¶

Nomenclature¶

Domain of hyperparameters:¶

Goal:¶

Part A: Naïve Approach¶

Part B: Grid Search¶

Part C: Random Search¶

Part D: Bayesian Optimization¶

Part D: Full Scan over Parameter Space¶

Sum up:¶

Part E: New Methods¶

Part F: New Software¶

Questions?¶