# This line will add a button to toggle visibility of code blocks,
# for use with the HTML export version
from IPython.core.display import HTML
HTML('''<button style="margin:0 auto; display: block;" onclick="jQuery('.code_cell .input_area').toggle();
jQuery('.prompt').toggle();">Toggle code</button>''')
Dr. David Elliott
Notes
%matplotlib inline
import os # locating directories
import numpy as np # Arrays
import pandas as pd # DataFrames
# Plotting
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['animation.embed_limit'] = 30000000.0
#plt.rcParams['figure.dpi'] = 300
import seaborn as sns#; sns.set()
from sklearn.datasets import load_iris # for the Iris data
from IPython.display import Image # displaying .png images
from sklearn.svm import SVC, LinearSVC # SVM
from mpl_toolkits.mplot3d import Axes3D # 3d plots
from sklearn.preprocessing import StandardScaler # scaling features
from sklearn.preprocessing import LabelEncoder # binary encoding
from sklearn.pipeline import Pipeline # combining classifier steps
from sklearn.preprocessing import PolynomialFeatures # make PolynomialFeatures
from sklearn.datasets import make_classification, make_moons # make example data
import warnings # prevent warnings
import joblib # saving models
from time import time
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
from scipy.stats.distributions import uniform, loguniform
import itertools
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from imblearn.metrics import classification_report_imbalanced
import re
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# colours for print()
class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'
image_dir = os.path.join(os.getcwd(),"Images")
# Initial fig number
fig_num=29
plt.rcParams['figure.dpi'] = 120
# golden ratio for figures
gr = 1.618
height_pix = 500
width_pix = height_pix*gr
height_inch = 4
width_inch = height_inch*gr
# Centered figures in the notebook and presentation
# ...was a real pain to find this:
# https://gist.githubusercontent.com/maxalbert/800b9f06c7b2dd365ea5
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import urllib
import base64
from io import BytesIO, StringIO
def fig2str(fig, format='svg'):
"""
Return a string containing the raw data of the matplotlib figure in the given format.
"""
assert isinstance(fig, matplotlib.figure.Figure)
imgdata = BytesIO()
fig.savefig(imgdata, format=format, bbox_inches='tight')
imgdata.seek(0) # rewind the data
output = imgdata.getvalue()
if format == 'svg':
return output
else:
return urllib.parse.quote(base64.b64encode(output))
class MatplotlibFigure(object):
"""
Thin wrapper around a matplotlib figure which provides a custom
HTML representation that allows tweaking the appearance
"""
    def __init__(self, fig, centered=False):
        assert isinstance(fig, matplotlib.figure.Figure)
        self.fig = fig  # keep a reference so _repr_html_ can use it
        self.centered = centered
    def _repr_html_(self):
        img_str_png = fig2str(self.fig, format='png')
uri = 'data:image/png;base64,' + img_str_png
html_repr = "<img src='{}'>".format(uri)
if self.centered:
html_repr = "<center>" + html_repr + "</center>"
return html_repr
def hyper_search(model, params, X, y, save_path, n_iter=60, metric="accuracy",
cv = KFold(5), random_state=42, refit=True,
overwrite=False, warning=False):
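    """Cross-validated hyperparameter search with on-disk caching.

    If save_path exists (and overwrite is False) the saved search object is
    loaded with joblib; otherwise a GridSearchCV is run when every value in
    params is a list, and a RandomizedSearchCV is run otherwise, before the
    fitted search object is saved to save_path.
    """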
if os.path.exists(save_path) and overwrite==False:
#load the model
models = joblib.load(save_path)
else:
# check all param inputs are lists
if all(type(x)==list for x in params.values()):
search_type = "Gridsearch"
models = GridSearchCV(model, param_grid=params, scoring=metric, cv=cv,
refit=refit, return_train_score=True)
n_iter = len(list(itertools.product(*list(iter(params.values())))))
else:
search_type = "Randomsearch"
models = RandomizedSearchCV(model, param_distributions=params,
n_iter=n_iter, scoring=metric, cv=cv,
refit=refit, random_state=random_state,
return_train_score=True)
start = time()
if warning:
models.fit(X, y)
else:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
models.fit(X, y)
print(search_type + " took %.2f seconds for %d candidates" % ((time() - start), n_iter))
joblib.dump(models, save_path)
return models
Class imbalance is a common problem when working with real-world data.
It occurs when examples from one or more classes are over-represented in a dataset (e.g. spam filtering, fraud detection, disease screening).
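As a minimal sketch of what this looks like (using the make_classification and Counter imports above; the 95/5 split is just an illustrative choice), we can generate a toy imbalanced dataset:
# toy example: two classes with a 95%/5% split
X_toy, y_toy = make_classification(n_samples=1000, weights=[0.95, 0.05], random_state=42)
print(Counter(y_toy))  # roughly 950 samples of class 0 and 50 of class 1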
Default: Customer default records for a credit card company
"We are interested in predicting whether an individual will default on his or her credit card payment, on the basis of annual income and monthly credit card balance."$^5$
df = pd.read_csv("../Data/Default.csv", index_col=0)
df = df.drop("student", axis=1)
df.head()
balance_label = "Monthly Credit Card Balance"
income_label = "Annual Income"
default_label = "Default"
fig, axes = plt.subplots(ncols = 3, figsize=(width_inch*3, height_inch*2))
sns.scatterplot(data = df[df['default']=="No"], x = "balance", y="income", hue="default", palette='Blues',ax=axes[0])
sns.scatterplot(data = df[df['default']=="Yes"], x = "balance", y="income",hue="default", palette='Oranges',ax=axes[0])
sns.boxplot(data = df, x = "default", y="balance", ax=axes[1])
sns.boxplot(data = df, x = "default", y="income", ax=axes[2])
axes[0].legend(title=default_label)
axes[0].set_xlabel(balance_label)
axes[0].set_ylabel(income_label)
axes[1].set_xlabel(default_label)
axes[1].set_ylabel(balance_label)
axes[2].set_xlabel(default_label)
axes[2].set_ylabel(income_label)
fig_num+=1
plt.suptitle("Figure %d: Annual Income and Monthly Credit Card Balance on Credit Card Payment Defaulting"%fig_num)
plt.tight_layout()
plt.close()
display(MatplotlibFigure(fig, centered=True))
Notes
"We have plotted annual income and monthly credit card balance for a subset of 10, 000 individuals"5
"It appears that individuals who defaulted tended to have higher credit card balances than those who did not."5
Below is a recreation of figure 4.15
print(color.BOLD+color.UNDERLINE+"Class Distribution (%)"+color.END)
display(df['default'].value_counts(normalize=True)*100)
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
data_x, data_y = df.drop(['default'], axis=1), df['default']
data_y = LE.fit_transform(data_y)
X_train, X_test, y_train, y_test = train_test_split(data_x.values, data_y, test_size = 0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.1, random_state=42)
Notes
Notes
train_test_split
Using a random search over the regularisation parameter C, we can find a model with high accuracy.
C_list = []
pwr = -5
for i in range(11):
C_list.append(2**pwr)
pwr+=2
linear_svm = Pipeline([
("scaler", StandardScaler()),
("svm_clf", LinearSVC(random_state=42))
])
# specify parameters and distributions to sample from
lin_param_dist = {'svm_clf__C':loguniform(C_list[0], C_list[-1])}
lin_rs = hyper_search(linear_svm, lin_param_dist, X_train, y_train,
os.path.join(os.getcwd(), "Models", "def_lin_rs.pkl"),
overwrite=False)
lin_rs_df = pd.DataFrame(lin_rs.cv_results_)
lin_rs_df.sort_values("rank_test_score")[["param_svm_clf__C",
"mean_test_score",
"std_test_score"]].head()
model_score = round(accuracy_score(y_true=y_val, y_pred=lin_rs.best_estimator_.predict(X_val))*100,2)
print("Best Linear Model Accuracy: "+str(model_score)+"%")
However, this is not much better than a completely useless model that only predicts "No".
# demonstrate the accuracy of a useless model that always predicts "No" (class 0)
model_score = round(accuracy_score(y_true=y_val, y_pred=[0]*len(y_val))*100,2)
print("Useless Model Accuracy: "+str(model_score)+"%")
Notes
This binary classifier can make two types of errors$^5$:
While the overall error rate is low, the error rate among individuals who defaulted is very high.
Notes
from sklearn.metrics import confusion_matrix
# this creates the matplotlib graph to make the confmat look nicer
def pretty_confusion_matrix(confmat, labels, title, labeling=False, highlight_indexes=[]):
labels_list = [["TN", "FP"], ["FN", "TP"]]
fig, ax = plt.subplots(figsize=(width_inch, height_inch))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
for j in range(confmat.shape[1]):
if labeling:
label = str(confmat[i, j])+" ("+labels_list[i][j]+")"
else:
label = confmat[i, j]
if [i,j] in highlight_indexes:
ax.text(x=j, y=i, s=label, va='center', ha='center',
weight = "bold", fontsize=18, color='#32618b')
else:
ax.text(x=j, y=i, s=label, va='center', ha='center')
# change the labels
with warnings.catch_warnings():
warnings.simplefilter("ignore")
ax.set_xticklabels(['']+[labels[0], labels[1]])
ax.set_yticklabels(['']+[labels[0], labels[1]])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
ax.xaxis.set_label_position('top')
plt.suptitle(title)
plt.tight_layout()
plt.show()
# use the first classifier to predict the validation set
predictions = lin_rs.best_estimator_.predict(X_val)
# get the confusion matrix as a numpy array
confmat = confusion_matrix(y_true=y_val, y_pred=predictions)
fig_num+=1
# use the pretty function to make it nicer
pretty_confusion_matrix(confmat, LE.classes_,
"Figure %d: Validation Confusion Matrix"%fig_num,
labeling=True,
)
Error and Accuracy$^1$
Gives general performance information: the proportion of all predictions, across both the positive and negative labels, that are correct or incorrect.
$$ \begin{align} ERR &= \frac{FP+FN}{FP+FN+TP+TN} \\ \\ ACC &= 1-ERR \end{align} $$
from sklearn.metrics import precision_score, recall_score, f1_score
FP_i = [0,1]
TN_i = [0,0]
TP_i = [1,1]
FN_i = [1,0]
FP = confmat[0,1]
TN = confmat[0,0]
TP = confmat[1,1]
FN = confmat[1,0]
ERR = (FP+FN)/(FP+FN+TP+TN)
ACC = 1-ERR
pretty_confusion_matrix(confmat, LE.classes_,
"Figure %d: Accuracy Validation Confusion Matrix"%fig_num,
labeling=True, highlight_indexes=[FP_i,FN_i,TP_i,TN_i])
#print(color.BOLD+'Error (ERR): '+color.END +'%.3f' % ERR)
print(color.BOLD+'Accuracy (ACC): '+color.END +'%.3f' % ACC)
#print('Accuracy (ACC): %.3f' % accuracy_score(y_true=y_val, y_pred=predictions))
Precision (PRE)$^1$
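Precision measures how many of the samples predicted as positive are actually positive, matching the calculation in the code below:
$$ PRE = \frac{TP}{TP+FP} $$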
PRE = TP/(TP+FP)
pretty_confusion_matrix(confmat, LE.classes_,
"Figure %d: Precision Validation Confusion Matrix"%fig_num,
labeling=True, highlight_indexes=[FP_i,TP_i])
print(color.BOLD+'Precision (PRE): '+color.END +'%.3f' % PRE)
#print('Precision (PRE): %.3f' % precision_score(y_true=y_val, y_pred=predictions))
Recall (or True Positive Rate)$^1$
Calculates how many of the actual positives our model correctly labelled.
This is useful when the fraction of correctly or incorrectly classified samples in the positive class is of interest.
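This matches the calculation in the code below:
$$ REC = TPR = \frac{TP}{FN+TP} $$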
REC = TP/(FN+TP)
pretty_confusion_matrix(confmat, LE.classes_,
"Figure %d: Recall Validation Confusion Matrix"%fig_num,
labeling=True, highlight_indexes=[FN_i,TP_i])
print(color.BOLD+'Recall (REC): '+color.END +'%.3f' % REC)
#print('Recall (REC): %.3f' % recall_score(y_true=y_val, y_pred=predictions))
F1-score$^1$
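The F1-score is the harmonic mean of precision and recall, matching the calculation in the code below:
$$ F1 = 2 \cdot \frac{PRE \times REC}{PRE+REC} $$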
F1 = 2*((PRE*REC)/(PRE+REC))
pretty_confusion_matrix(confmat, LE.classes_,
"Figure %d: F1-Score Validation Confusion Matrix"%fig_num,
labeling=True, highlight_indexes=[FN_i,TP_i, FP_i])
print(color.BOLD+'F1-Score (F1): '+color.END +'%.3f' % F1)
#print('F1-score (F1): %.3f' % f1_score(y_true=y_val, y_pred=predictions))
We can use a classification report, which gives more information such as the macro avg and weighted avg.
Macro Average
Weighted Average
Notes
"yes"
class. This is because in binary classification problems, the default positive label is the target (class 1). You can change this if you are more interested in the other classes performance or the average metrics.from sklearn.metrics import classification_report
pd.DataFrame(classification_report(y_val,
predictions,
labels=None,
target_names=LE.classes_,
sample_weight=None,
digits=2,
output_dict=True)).round(2)
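As a minimal sketch of changing the class of interest (reusing y_val and predictions from above), the pos_label argument of the individual scorers reports a metric for the "No" class (label 0) instead:
# recall for the "No" class rather than the default positive class (1)
print('Recall ("No" class): %.3f' % recall_score(y_true=y_val, y_pred=predictions, pos_label=0))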
Notes
Training Error > Test Error$^5$
Notes
lin_rs_plt = lin_rs_df[['mean_test_score','mean_train_score']].melt(var_name='Dataset', value_name= 'Score')
sns.boxplot(data = lin_rs_plt, x = 'Dataset', y='Score')
fig_num+=1
plt.suptitle("Figure %d: Average Training vs. Validation Accuracy Across Models During GridSearch"%fig_num)
plt.show()
Optimising for Accuracy
During hyperparameter cross-validation we are choosing the model with the best overall accuracy.
This gives us a model with the smallest possible total number of misclassified observations, irrespective of which class the errors come from$^5$.
Because ML algorithms typically optimize a reward or cost function computed as a sum over the training examples, the decision rule is likely to be biased toward the majority class$^9$.
Notes
from mlxtend.plotting import plot_decision_regions
scatter_kwargs = {'edgecolor': None, 'alpha': 0.7}
contourf_kwargs = {'alpha': 0.2}
scatter_highlight_kwargs = {'s': 120, 'label': 'Validation data', 'alpha': 0.7}
with warnings.catch_warnings():
warnings.simplefilter("ignore")
plot_decision_regions(data_x.values, data_y, clf=lin_rs,
legend=1, X_highlight=X_val[y_val==1],
contourf_kwargs=contourf_kwargs,
scatter_kwargs=scatter_kwargs,
scatter_highlight_kwargs=scatter_highlight_kwargs
)
fig_num+=1
plt.suptitle("Figure %d: Model with Best Accuracys Decision Boundary"%fig_num)
There are a number of methods available to address imbalances in a dataset, such as stratified cross-validation folds, class weighting, undersampling, and oversampling (all covered below).
With a standard KFold split, some folds may contain very few minority-class examples, so the validation error we get from those models may be a poor estimate of performance.
KF = KFold(n_splits=5)
SKF = StratifiedKFold(n_splits=5)
fold_names = ["KFold", "StratifiedKFold"]
for i, K in enumerate([KF, SKF]):
print(color.BOLD+color.UNDERLINE+fold_names[i]+color.END)
for j, (train_i, test_i) in enumerate(K.split(X_train, y_train)):
fold_no = pd.DataFrame(pd.Series(y_train[test_i]).value_counts(), columns=["Fold "+str(j)])
if j == 0:
fold_nos = fold_no
else:
fold_nos = pd.concat([fold_nos,fold_no], axis=1)
display(fold_nos)
During model fitting we can assign a larger penalty to wrong predictions on the minority class.
The heuristic used for class_weight="balanced"
in Scikit-Learn (0.23.1) is:
$$ w_j = \frac{n}{N_c \sum_{i=1}^{n} I(y_i \in S_j)}, $$
where $n$ is the number of samples, $N_c$ the number of classes, $I$ an indicator function, and $S_j$ the set of samples belonging to class $j$.
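As a sketch of that heuristic (reusing y_train from above and Scikit-Learn's compute_class_weight helper), the weights can be reproduced directly:
from sklearn.utils.class_weight import compute_class_weight
# "balanced": n_samples / (n_classes * per-class counts)
print(len(y_train) / (len(np.unique(y_train)) * np.bincount(y_train)))
print(compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train))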
lin_param_dist['svm_clf__class_weight'] = [None, "balanced"]
bal_lin_rs = hyper_search(linear_svm, lin_param_dist, X_train, y_train,
os.path.join(os.getcwd(), "Models", "def_bal_lin_rs.pkl"),
metric = ["accuracy", "f1","recall","precision"],
refit=False, # we will refit manually later
cv = SKF, overwrite=False)
bal_lin_rs_df = pd.DataFrame(bal_lin_rs.cv_results_)
bal_lin_rs_df.sort_values("mean_test_accuracy", ascending=False)[["param_svm_clf__class_weight",
"param_svm_clf__C",
"mean_test_accuracy",
"std_test_accuracy"]].head()
from sklearn.base import clone
# with refit=True the search refits the best-accuracy model on all the training data,
# so let's do the same manually for the best models under the other metrics
def manual_refit(input_model, X, y, gs, metric, disp_df=[]):
output_model = clone(input_model)
gs_df = pd.DataFrame(gs.cv_results_).sort_values("mean_test_"+metric, ascending=False)
if disp_df:
display(gs_df[disp_df].head())
params = gs_df["params"].iloc[0]
output_model = output_model.set_params(**params)
output_model = output_model.fit(X, y)
return output_model
acc_model = manual_refit(linear_svm, X_train, y_train, bal_lin_rs, "accuracy")
Extra
So far in these notes we have been using a standard classification report from Scikit-Learn; however, we may wish instead to use one more suited to imbalanced data.
TODO
Notes
def imbalanced_report_df(y_val, y_pred, target_names):
imb_str = classification_report_imbalanced(y_val,
y_pred,
target_names=target_names)
sl = re.split(' |\n|\n\n',imb_str)
scores = []
for string in sl:
try:
scores.append(float(string))
except:
continue
n=7
scores = [scores[i * n:(i + 1) * n] for i in range((len(scores) + n - 1) // n )]
scores_df = pd.DataFrame(scores,
index=[target_names + ['avg / total']],
columns = ['precision','recall','specificity','f1-score','geo','iba','support']).T
return scores_df
display(pd.DataFrame(classification_report(y_val,
acc_model.predict(X_val),
labels=None,
target_names=list(LE.classes_),
sample_weight=None,
digits=2,
output_dict=True)).round(2))
bal_lin_rs_rpt = imbalanced_report_df(y_val, acc_model.predict(X_val), target_names=list(LE.classes_))
display(bal_lin_rs_rpt)
Changing the metric for what is defined as the "best model" can help us prioritise models that make particular errors.
For example, a credit card company might particularly wish to avoid incorrectly classifying an individual who will default, whereas incorrectly classifying an individual who will not default, though still to be avoided, is less problematic.
In this case, recall would therefore be a useful metric to use.
Notes
The class_weight="balanced" models are indeed better if we want a good average recall or f1.
rec_model = manual_refit(linear_svm, X_train, y_train, bal_lin_rs, "recall",
disp_df=["param_svm_clf__class_weight",
"param_svm_clf__C",
"mean_test_recall",
"std_test_recall"])
rec_lin_rs_rpt = imbalanced_report_df(y_val, rec_model.predict(X_val), target_names=list(LE.classes_))
f1_model = manual_refit(linear_svm, X_train, y_train, bal_lin_rs, "f1",
disp_df=["param_svm_clf__class_weight",
"param_svm_clf__C",
"mean_test_f1",
"std_test_f1"])
f1_lin_rs_rpt = imbalanced_report_df(y_val, f1_model.predict(X_val), target_names=list(LE.classes_))
# get the confusion matrix as a numpy array
confmat = confusion_matrix(y_true=y_val, y_pred=acc_model.predict(X_val))
fig_num+=1
# use the pretty function to make it nicer
pretty_confusion_matrix(confmat, LE.classes_,
"Figure %d: Best CV Accuracy Validation Confusion Matrix"%fig_num,
highlight_indexes=[FP_i,FN_i,TP_i,TN_i]
)
# get the confusion matrix as a numpy array
confmat = confusion_matrix(y_true=y_val, y_pred=rec_model.predict(X_val))
fig_num+=1
# use the pretty function to make it nicer
pretty_confusion_matrix(confmat, LE.classes_,
"Figure %d: Best CV Recall Validation Confusion Matrix"%fig_num,
highlight_indexes=[FN_i,TP_i]
)
Extra
This is just some additional visualisations.
# get the confusion matrix as a numpy array
confmat = confusion_matrix(y_true=y_val, y_pred=f1_model.predict(X_val))
fig_num+=1
# use the pretty function to make it nicer
pretty_confusion_matrix(confmat, LE.classes_ ,
"Figure %d: Best CV F1-Score Validation Confusion Matrix"%fig_num,
highlight_indexes=[FN_i,TP_i, FP_i]
)
def imb_report_prep(df, new_col_name, new_col_val):
df_ = df.T.stack()
df_ = df_.reset_index()
df_.columns = ["class", "metric", "score"]
df_[new_col_name] = new_col_val
return df_
bal_lin_rs_rpt_ = imb_report_prep(bal_lin_rs_rpt, 'cv metric', "accuracy")
rec_lin_rs_rpt_ = imb_report_prep(rec_lin_rs_rpt, 'cv metric', "recall")
f1_lin_rs_rpt_ = imb_report_prep(f1_lin_rs_rpt, 'cv metric', "f1")
lin_rs_rpt = pd.concat([bal_lin_rs_rpt_, rec_lin_rs_rpt_, f1_lin_rs_rpt_])
lin_rs_rpt_plt = lin_rs_rpt[lin_rs_rpt['class']=='Yes']
lin_rs_rpt_plt = lin_rs_rpt_plt[lin_rs_rpt_plt["metric"].isin(["recall", "f1-score"])]
sns.catplot(data = lin_rs_rpt_plt,
x = "cv metric", y = 'score',
col='metric', col_wrap=4, kind="bar")
plt.show()
Provided you're comfortable using metrics instead of relying on a confusion_matrix, you can use more of your training data by just passing multiple metrics to the cross-validated grid or random search.
From here on, I will get rid of my separate validation set and just use "recall" as our metric of interest.
X_train, X_test, y_train, y_test = train_test_split(data_x.values, data_y, test_size = 0.1, random_state=42)
wt_lin_rs = hyper_search(linear_svm, lin_param_dist, X_train, y_train,
os.path.join(os.getcwd(), "Models", "def_wt_lin_rs.pkl"),
metric = ["f1","recall","precision"], refit='recall',
cv = SKF, overwrite=False)
Notes
RandomUnderSampler is part of the Imblearn package, which provides a lot of techniques for working with imbalanced data. You could use the resample method in scikit-learn instead, but Imblearn is a bit smoother to work with.
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
def plot_manual_pairplot(data_x, data_y, ax, title=None, col=0):
data_plot = data_x.copy()
data_plot['class'] = np.vectorize({0:'No', 1:'Yes'}.get)(data_y)
if title:
ax[0, col].set_title(title)
sns.scatterplot(data = data_plot[data_plot['class']=="No"], x = "balance", y="income",
hue="class", palette='Blues',ax=ax[0, col], legend=False)
sns.scatterplot(data = data_plot[data_plot['class']=="Yes"], x = "balance", y="income",
hue="class", palette='Oranges',ax=ax[0, col], legend=False)
sns.kdeplot("balance", data=data_plot, hue="class",ax=ax[1, col], fill=True, legend=False)
sns.kdeplot("income", data=data_plot, hue="class",ax=ax[2, col], fill=True, legend=False)
def imblearn_sample(sampler, data_x, data_y, titles, main_title, counter=False):
fig, axes = plt.subplots(3,2, figsize=(width_inch*2, height_inch*3), sharey='row')
plot_manual_pairplot(data_x, data_y, axes, titles[0])
data_x_downsampled, data_y_downsampled = sampler.fit_resample(data_x,
data_y)
if counter:
print(color.UNDERLINE + 'Before Resample' + color.END)
print(Counter(data_y))
print(color.UNDERLINE + 'After Resample' + color.END)
print(Counter(data_y_downsampled))
plot_manual_pairplot(data_x_downsampled, data_y_downsampled, axes, titles[1], 1)
plt.suptitle(main_title)
plt.tight_layout()
plt.show()
fig_num+=1
undersampler = RandomUnderSampler(random_state=123)
imblearn_sample(undersampler, data_x, data_y, ["Raw", "Undersampled"],
'Figure %d: RandomUnderSampler'%fig_num)
from imblearn.pipeline import Pipeline as ImPipeline
if 'svm_clf__class_weight' in lin_param_dist.keys():
del lin_param_dist['svm_clf__class_weight']
linear_svm = ImPipeline([
("scaler", StandardScaler()),
("sampler", undersampler),
("svm_clf", LinearSVC(random_state=42))
])
us_lin_rs = hyper_search(linear_svm, lin_param_dist, X_train, y_train,
os.path.join(os.getcwd(), "Models", "def_us_lin_rs.pkl"),
metric = ["f1","recall","precision"], refit='recall', cv = SKF,
overwrite=False)
Note
Data can be oversampled easily by randomly sampling from minority classes with replacement to duplicate original samples.
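As a minimal sketch (reusing data_x, data_y and Counter from above), we can check the class counts before and after random oversampling:
from imblearn.over_sampling import RandomOverSampler
X_os, y_os = RandomOverSampler(random_state=123).fit_resample(data_x, data_y)
print(Counter(data_y), "->", Counter(y_os))  # minority class duplicated up to the majority count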
Notes
from imblearn.over_sampling import RandomOverSampler
oversampler = RandomOverSampler(random_state=123)
fig_num+=1
imblearn_sample(oversampler, data_x, data_y, ["Raw", "Oversampled"],
'Figure %d: RandomOverSampler'%fig_num
)
linear_svm = ImPipeline([
("scaler", StandardScaler()),
("sampler", oversampler),
("svm_clf", LinearSVC(random_state=42))
])
os_lin_rs = hyper_search(linear_svm, lin_param_dist, X_train, y_train,
os.path.join(os.getcwd(), "Models", "def_os_lin_rs.pkl"),
metric = ["f1","recall","precision"], refit='recall', overwrite=False)
We can see if an RBF kernel improves things, although if you plan on running this yourself (overwrite=True), it is computationally expensive.
# specify parameters and distributions to sample from
rbf_param_dist = lin_param_dist.copy()
gamma_list = []
pwr = -15
for i in range(10):
gamma_list.append(2**pwr)
pwr+=2
rbf_param_dist['svm_clf__gamma'] = loguniform(gamma_list[0], gamma_list[-1])
# -----------
# Undersample
# -----------
rbf_svm = ImPipeline([
("scaler", StandardScaler()),
("sampler", undersampler),
("svm_clf", SVC(random_state=42))])
us_rbf_rs = hyper_search(rbf_svm, rbf_param_dist, X_train, y_train,
os.path.join(os.getcwd(), "Models", "def_us_rbf_rs.pkl"),
metric = ["f1","recall","precision"],
refit='recall', cv = SKF, overwrite=False)
# ----------
# Oversample
# ----------
#rbf_svm = ImPipeline([
# ("scaler", StandardScaler()),
# ("sampler", oversampler),
# ("svm_clf", SVC(random_state=42))])
#os_rbf_rs = hyper_search(rbf_svm, rbf_param_dist, X_train, y_train,
# os.path.join(os.getcwd(), "Models", "def_os_rbf_rs.pkl"),
# metric = ["f1","recall","precision"], refit='f1', cv = SKF, overwrite=False)
# -------
# Weights
# -------
rbf_svm = ImPipeline([
("scaler", StandardScaler()),
("svm_clf", SVC(random_state=42))])
rbf_param_dist['svm_clf__class_weight'] = [None, "balanced"]
#wt_rbf_rs = hyper_search(rbf_svm, rbf_param_dist, X_train, y_train,
# os.path.join(os.getcwd(), "Models", "def_wt_rbf_rs.pkl"),
# metric = ["f1","recall","precision"], refit='f1', cv = SKF, overwrite=False)
A number of undersampling methods use heuristics based on k-nearest neighbours (KNN) classification$^8$. KNN finds a number of samples that are the most similar to a data point we want to classify, based on a given distance metric, with the assigned class label depending on a majority vote by the nearest neighbours$^9$ (we'll come back to this later). NearMiss uses this by selecting samples in the class to be under-sampled where the average distance to the closest or farthest samples of the minority class is smallest$^{10}$.
Undersampling techniques also include data cleaning rules, where the number of samples per class is not specified, but the data is edited based on methods such as removing samples dissimilar to their neighbourhood$^{11}$ or removing one or both samples from different classes when they are nearest neighbours of each other$^{12}$.
Instead of just randomly oversampling, there are also approaches that generate new samples through interpolation, such as SMOTE and ADASYN. However, these methods can generate noisy samples, so cleaning rules can be applied after oversampling$^{13}$.
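As a rough illustration of the effect on class counts (reusing data_x and data_y; exact numbers depend on the data and each method's rules), we can compare a sampler's output with Counter, e.g. for SMOTE:
from imblearn.over_sampling import SMOTE
X_sm, y_sm = SMOTE(random_state=42).fit_resample(data_x, data_y)
print("SMOTE:", Counter(y_sm))  # minority class synthetically boosted to match the majority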
from imblearn.under_sampling import NearMiss, NeighbourhoodCleaningRule
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
fig, axes = plt.subplots(3,5, figsize=(width_inch*2, height_inch*2), sharey='row')
plot_manual_pairplot(data_x, data_y, axes, "Raw")
titles = ["NearMiss", "NeighbourhoodCleaningRule", "SMOTE", "SMOTEENN"]
for i, sampler in enumerate([NearMiss(),
NeighbourhoodCleaningRule(),
SMOTE(random_state=42),
SMOTEENN(random_state=42)
]):
data_x_downsampled, data_y_downsampled = sampler.fit_resample(data_x,
data_y)
plot_manual_pairplot(data_x_downsampled, data_y_downsampled, axes, titles[i], i+1)
plt.suptitle('Figure Extra: Other Sampling Approaches')
plt.tight_layout()
plt.show()
There is no one best approach; it typically depends on the data and the aims of the model.
Below are examples of cross-validation scores for the best models (according to recall) for the different approaches.
Notes
def imb_cv_prep(input_pd, model_name):
metrics = ["f1", "precision", "recall"]
for i, metric in enumerate(metrics):
if isinstance(input_pd, pd.DataFrame):
df_ = input_pd[["split0_test_"+metric, "split1_test_"+metric,
"split2_test_"+metric, "split3_test_"+metric,
"split4_test_"+metric]].stack().reset_index()
df_['fold'] = list(range(5))*(df_["level_0"].iloc[-1] +1)
df_.columns = ["cv_iter", "metric", "score", "fold"]
else:
df_ = pd.DataFrame(input_pd[["split0_test_"+metric, "split1_test_"+metric,
"split2_test_"+metric, "split3_test_"+metric,
"split4_test_"+metric]]).reset_index().reset_index()
df_.columns = ["fold", "metric", "score"]
df_['metric'] = df_.metric.str.replace('split0_test_','')
df_['metric'] = df_.metric.str.replace('split1_test_','')
df_['metric'] = df_.metric.str.replace('split2_test_','')
df_['metric'] = df_.metric.str.replace('split3_test_','')
df_['metric'] = df_.metric.str.replace('split4_test_','')
if i == 0:
df = df_
else:
df = pd.concat([df,df_])
df['model'] = model_name
return df
svm_scores_dict = {
"Weighted Linear SVM": pd.DataFrame(wt_lin_rs.cv_results_).sort_values("rank_test_f1").iloc[0],
"Oversample Linear SVM": pd.DataFrame(os_lin_rs.cv_results_).sort_values("rank_test_f1").iloc[0],
"Undersample Linear SVM": pd.DataFrame(us_lin_rs.cv_results_).sort_values("rank_test_f1").iloc[0],
"Undersample RBF SVM": pd.DataFrame(us_rbf_rs.cv_results_).sort_values("rank_test_f1").iloc[0]
}
for i, model_name in enumerate(svm_scores_dict):
svm_score_df = imb_cv_prep(svm_scores_dict[model_name], model_name)
if i ==0:
svm_scores_df = svm_score_df
else:
svm_scores_df = pd.concat([svm_scores_df, svm_score_df])
sns.catplot(data = svm_scores_df, x = "score", y = 'model',
col='metric', col_wrap=3, kind="box",
legend=False)
fig_num+=1
plt.suptitle("Figure %d: Best Recall Model Across Validation Folds"%fig_num)
plt.tight_layout()
plt.show()
Using the figure above, for the client who wants the model to prioritise avoiding incorrectly classifying an individual who will default, we would probably choose the undersampled linear SVM.
As we can see, on the test set we get similar scores to those we got on the validation set.
Notes
test_rpt = imbalanced_report_df(y_test, us_lin_rs.best_estimator_.predict(X_test), target_names=list(LE.classes_))
display(test_rpt)
# get the confusion matrix as a numpy array
confmat = confusion_matrix(y_true=y_test, y_pred=us_lin_rs.best_estimator_.predict(X_test))
fig_num+=1
# use the pretty function to make it nicer
pretty_confusion_matrix(confmat, LE.classes_, "Figure %d: Test Set Performance"%fig_num)
Would adding whether someone is a student improve the model?
Note
df = pd.read_csv("../Data/Default.csv", index_col=0)
df.head()
Note
We can add a OneHotEncoder into the pipeline for the categorical feature, but not apply it to the continuous data.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
data_x = df.drop(['default'], axis=1)
categorical_features = ['student']
numeric_features = ['balance', 'income']
X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(data_x, data_y, test_size = 0.1, random_state=42)
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])
linear_svm = ImPipeline([
("preprocessor", preprocessor),
("sampler", undersampler),
("svm_clf", LinearSVC(random_state=42))
])
us_stu_lin_rs = hyper_search(linear_svm, lin_param_dist, X_cat_train, y_cat_train,
os.path.join(os.getcwd(), "Models", "def_stud_us_lin_rs.pkl"),
metric = ["f1","recall","precision"], refit='recall', cv = SKF,
overwrite=False)
rbf_svm = ImPipeline([
("preprocessor", preprocessor),
("sampler", undersampler),
("svm_clf", SVC(random_state=42))])
us_stu_rbf_rs = hyper_search(rbf_svm, rbf_param_dist, X_cat_train, y_cat_train,
os.path.join(os.getcwd(), "Models", "def_stud_us_rbf_rs.pkl"),
metric = ["f1","recall","precision"],
refit='recall', cv = SKF, overwrite=False)
So in this case it does not seem to improve the metric of most interest (recall), although it did improve precision at the expense of recall.
svm_scores_dict = {
"Undersample Linear SVM": pd.DataFrame(us_lin_rs.cv_results_).sort_values("rank_test_f1").iloc[0],
"Undersample New Linear SVM": pd.DataFrame(us_stu_lin_rs.cv_results_).sort_values("rank_test_f1").iloc[0],
"Undersample RBF SVM": pd.DataFrame(us_rbf_rs.cv_results_).sort_values("rank_test_f1").iloc[0],
"Undersample New RBF SVM": pd.DataFrame(us_stu_rbf_rs.cv_results_).sort_values("rank_test_f1").iloc[0]
}
for i, model_name in enumerate(svm_scores_dict):
svm_score_df = imb_cv_prep(svm_scores_dict[model_name], model_name)
if i ==0:
svm_scores_df = svm_score_df
else:
svm_scores_df = pd.concat([svm_scores_df, svm_score_df])
svm_scores_df['score'] = svm_scores_df['score'].astype(float)
display(svm_scores_df.drop('fold', axis=1).groupby(['model', 'metric']).mean().unstack('metric'))
sns.catplot(data = svm_scores_df, x = "score", y = 'model',
col='metric', col_wrap=3, kind="box",
legend=False)
plt.suptitle("Extra: Best F1 Model Across Validation Folds")
plt.tight_layout()
plt.show()
Fairness
Imagine we used the model in practice, and those deemed more likely to default were given more unfavourable terms because the algorithm judged them to be more risky.
Imagine adding their student status had improved our model. Would it be fair to judge their chances of defaulting based on their student status rather than their actual financial information?
Maybe we need to know more about the people applying for the loan, but what variables do we use?
This is a difficult but important aspect of ML and I leave it here to make you think about the possibilities. We'll be exploring this more in the last week.
Notes
Some models (e.g. tree-based classifiers) are inherently multiclass, whereas other machine learning algorithms are able to be extended to multi-class classification using techniques such as the One-versus-Rest or One-versus-One methods$^3$.
iris = load_iris(as_frame=True) # data stored in a `sklearn.utils.Bunch`
iris_df = iris['data'] # get features DataFrame
target = iris['target'] # get target Series
# get the labels of flowers capitalised for visualisation
target_names = list(map(lambda s: s.capitalize(), iris['target_names']))
The One-versus-Rest (or One-versus-all) approach is where you train a classifier for each class and select the class from the classifier that outputs the highest score$^3$.
In other terms, if we fit $K$ SVMs, we assign a test observation, $x^*$, to the class for which $\beta_{0k} + \beta_{1k}x^*_1 + \ldots + \beta_{pk}x^*_p$ is largest (the most confident)$^5$.
Advantage: As each class is fitted against all other classes for each classifier, it is relatively interpretable$^{14}$.
Disadvantages: Can result in ambiguous decision regions (e.g. could be class 1 or class 2), and classifiers could suffer from issues of class imbalance$^4$.
Notes
Instead of using the decision_function_shape argument of sklearn.svm.SVC, you could also put the SVC inside sklearn.multiclass.OneVsRestClassifier.
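As a minimal sketch of that explicit wrapper (using the full iris feature set loaded above; LinearSVC and max_iter=10000 are illustrative choices):
from sklearn.multiclass import OneVsRestClassifier
ovr_clf = OneVsRestClassifier(LinearSVC(random_state=42, max_iter=10000))
ovr_clf.fit(iris_df.values, target.values)
print(len(ovr_clf.estimators_))  # one binary classifier per class -> 3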
features = ["petal length (cm)", "petal width (cm)"]
ovr_rbf = SVC(kernel='rbf',
decision_function_shape = 'ovr',
random_state=42)
ovr_pipe = Pipeline([
('scl', StandardScaler()),
('svm_clf', ovr_rbf)])
ovr_rbf_rs = hyper_search(ovr_pipe, rbf_param_dist, iris_df[features].values,
target.values, os.path.join(os.getcwd(), "Models", "iris_ovr_rbf_rs.pkl"),
cv = KFold(n_splits=3, shuffle=True, random_state=42), overwrite=False)
display(pd.DataFrame(ovr_rbf_rs.cv_results_).sort_values("rank_test_score")[["param_svm_clf__class_weight",
"param_svm_clf__C",
"param_svm_clf__gamma",
"mean_test_score",
"std_test_score"]].head())
Another strategy is to use a OneVsOne approach.
This trains $N \times (N-1) / 2$ classifiers by comparing each class against each other.
When a prediction is made, the class that is selected the most is chosen (Majority Vote)$^3$.
Advantage: It is useful where algorithms do not scale well with data size (such as SVM), because training and prediction only need to be run on a small subset of the data for each classifier$^{3,14}$.
Disadvantages: Can still result in ambiguous decision regions and can be computationally expensive$^4$.
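As a quick check of the $N \times (N-1)/2$ count (a sketch using Scikit-Learn's explicit OneVsOneClassifier wrapper on the iris data, rather than SVC's built-in decision_function_shape argument):
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(LinearSVC(random_state=42, max_iter=10000))
ovo_clf.fit(iris_df.values, target.values)
print(len(ovo_clf.estimators_))  # 3 classes -> 3 * 2 / 2 = 3 pairwise classifiers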
Notes
ovo_rbf = SVC(kernel='rbf',
decision_function_shape = 'ovo',
random_state=42)
ovo_pipe = Pipeline([
('scl', StandardScaler()),
('svm_clf', ovo_rbf)])
ovo_rbf_rs = hyper_search(ovo_pipe, rbf_param_dist, iris_df[features].values,
target.values, os.path.join(os.getcwd(), "Models", "iris_ovo_rbf_rs.pkl"),
cv = KFold(n_splits=3, shuffle=True, random_state=42), overwrite=False)
display(pd.DataFrame(ovo_rbf_rs.cv_results_).sort_values("rank_test_score")[["param_svm_clf__class_weight",
"param_svm_clf__C",
"param_svm_clf__gamma",
"mean_test_score",
"std_test_score"]].head())
from mlxtend.plotting import plot_decision_regions
fig, axes = plt.subplots(ncols = 2, figsize=(width_inch*3, height_inch*2), sharey=True, sharex=True)
names = ["One-verses-all", "OneVsOne"]
for i, best_estimator in enumerate([ovr_rbf_rs.best_estimator_,ovo_rbf_rs.best_estimator_]):
ax = axes[i]
ax = plot_decision_regions(iris_df[features].values,
target.values,
clf = best_estimator,
ax = ax
)
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, target_names,
framealpha=0.3, scatterpoints=1)
ax.set_title(names[i])
axes[0].set_xlabel(features[0])
axes[1].set_xlabel(features[0])
axes[0].set_ylabel(features[1])
fig_num+=1
plt.suptitle("Figure %d: Multi-label classification on the Iris Dataset"%fig_num)
plt.show()
scikit-learn
implements macro and micro averaging methods to extend scoring metrics to multiclass problems.
The micro-average is calculated from the individual TPs, TNs, FPs, and FNs of the system.
For example, the micro-average precision score for a $k$-class system is,
$$ PRE_{micro} = \frac{TP_1+\ldots+TP_K}{TP_1+\ldots+TP_K+FP_1+\ldots+FP_K}. $$
This is useful when we want to weight each instance or prediction equally.
The macro-average is the average scores of the different systems:
$$ PRE_{macro} = \frac{PRE_1+\ldots+PRE_K}{K}. $$
This is useful when we want to evaluate the overall performance of a classifier with regard to the most frequent class labels.
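For example (a sketch reusing the fitted ovr_rbf_rs search and the iris features and target from above):
y_iris_pred = ovr_rbf_rs.best_estimator_.predict(iris_df[features].values)
print('Micro-average precision: %.3f' % precision_score(target.values, y_iris_pred, average='micro'))
print('Macro-average precision: %.3f' % precision_score(target.values, y_iris_pred, average='macro'))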
There are always advantages and disadvantages to using any model on a particular dataset.
import sys
from shutil import copyfile
# where the HTML template is located
dst = os.path.join(sys.prefix, 'lib', 'site-packages', 'nbconvert', 'templates', "classic.tplx")
# If its not located where it should be
if not os.path.exists(dst):
# uses a nb_pdf_template
curr_path = os.path.join(os.getcwd(),"..", "Extra", "classic.tplx")
# copy where it is meant to be
copyfile(curr_path, dst)
# Create HTML notes document
!jupyter nbconvert 3_Applications.ipynb \
--to html \
--output-dir . \
--template classic
!jupyter nbconvert 3_Applications.ipynb \
--to slides \
--output-dir . \
--TemplateExporter.exclude_input=True \
--TemplateExporter.exclude_output_prompt=True \
--SlidesExporter.reveal_scroll=True
# Create pdf notes document (issues)
!jupyter nbconvert 3_Applications.ipynb \
--to html \
--output-dir . \
--output 3_Applications_no_code \
--TemplateExporter.exclude_input=True \
--TemplateExporter.exclude_output_prompt=True