# This cell adds a button to toggle the visibility of code blocks,
# for use with the HTML export version
from IPython.core.display import HTML
HTML('''<button style="margin:0 auto; display: block;" onclick="jQuery('.code_cell .input_area').toggle();
jQuery('.prompt').toggle();">Toggle code</button>''')
Dr. David Elliott
%matplotlib inline
import os # locating directories
import numpy as np # Arrays
import pandas as pd # DataFrames
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
# colours for print()
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
image_dir = os.path.join(os.getcwd(),"Images")
# Initial fig number
fig_num=0
plt.rcParams['figure.dpi'] = 120
# golden ratio for figures
gr = 1.618
height_pix = 500
width_pix = height_pix*gr
height_inch = 4
width_inch = height_inch*gr
# Centered figures in the notebook and presentation
# ...was a real pain to find this:
# https://gist.githubusercontent.com/maxalbert/800b9f06c7b2dd365ea5
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import urllib
import base64
from io import BytesIO, StringIO
def fig2str(fig, format='svg'):
    """
    Return a string containing the raw data of the matplotlib figure in the given format.
    """
    assert isinstance(fig, matplotlib.figure.Figure)
    imgdata = BytesIO()
    fig.savefig(imgdata, format=format, bbox_inches='tight')
    imgdata.seek(0)  # rewind the data
    output = imgdata.getvalue()
    if format == 'svg':
        return output.decode('utf-8')
    else:
        return urllib.parse.quote(base64.b64encode(output))
class MatplotlibFigure(object):
    """
    Thin wrapper around a matplotlib figure which provides a custom
    HTML representation that allows tweaking the appearance
    """
    def __init__(self, fig, centered=False):
        assert isinstance(fig, matplotlib.figure.Figure)
        self.fig = fig
        self.centered = centered

    def _repr_html_(self):
        img_str_png = fig2str(self.fig, format='png')
        uri = 'data:image/png;base64,' + img_str_png
        html_repr = "<img src='{}'>".format(uri)
        if self.centered:
            html_repr = "<center>" + html_repr + "</center>"
        return html_repr
# TODO: when tidying this up in the future, put these common functions I use
# all the time in a separate .py file.
import itertools
import warnings
import joblib
from time import time

def hyper_search(model, params, X, y, save_path, n_iter=60, metric="accuracy",
                 cv=KFold(5), random_state=42, refit=True,
                 overwrite=False, warning=False, verbose=0):
    if os.path.exists(save_path) and not overwrite:
        # load the previously fitted search from disk
        models = joblib.load(save_path)
    else:
        # if all param inputs are lists, run an exhaustive grid search
        if all(isinstance(x, list) for x in params.values()):
            search_type = "Gridsearch"
            models = GridSearchCV(model, param_grid=params, scoring=metric, cv=cv,
                                  refit=refit, return_train_score=True, verbose=verbose)
            # number of candidates = size of the parameter grid
            n_iter = len(list(itertools.product(*list(iter(params.values())))))
        else:
            search_type = "Randomsearch"
            if verbose > 0:
                print("Starting " + search_type)
            models = RandomizedSearchCV(model, param_distributions=params,
                                        n_iter=n_iter, scoring=metric, cv=cv,
                                        refit=refit, random_state=random_state,
                                        return_train_score=True, verbose=verbose)
        start = time()
        if warning:
            models.fit(X, y)
        else:
            # suppress fit-time warnings by default
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                models.fit(X, y)
        if verbose > 0:
            print(search_type + " took %.2f seconds for %d candidates" % ((time() - start), n_iter))
        joblib.dump(models, save_path)
    return models
LendingClub was the world's largest peer-to-peer lending platform.
Investors were able to search and browse the loan listings on the LendingClub website and select the loans they wanted to invest in, based on the information supplied about the borrower, the amount of the loan, the loan grade, and the loan purpose.
Notes
df = pd.read_csv('../Data/LCData/LC_reduced.csv')
df = df.drop("Unnamed: 0", axis=1)
df.head()
When a person applies for a loan, there are two types of decisions that could be taken by the company18:
Notes
df['fico_range_low'].min()
When the company receives a loan application, it has to make a decision on loan approval based on the applicant's profile. Two types of risks are associated with the company's decision:
The data given contains the information about past loan applicants and whether they ‘defaulted’ or not. The aim is to identify patterns which indicate if a person is likely to default, which may be used for taking actions such as denying the loan, reducing the amount of loan, lending (to risky applicants) at a higher interest rate, etc.
df.info()
import warnings  # prevent warnings

def set_pandas_display_options() -> None:
    """Set pandas display options."""
    # Ref: https://stackoverflow.com/a/52432757/
    display = pd.options.display
    display.max_columns = 1000
    display.max_rows = 1000
    display.max_colwidth = 1000
    display.width = None
    # display.precision = 2  # set as needed

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    set_pandas_display_options()
info = pd.read_excel("../Data/LCData/LCDataDictionary.xlsx")
info = info[['LoanStatNew','Description']].iloc[0:-2]
display(info.set_index('LoanStatNew').loc[df.columns].sort_index())
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # not sure why this gives all this output
    pd.reset_option('all')
print(color.BOLD+color.UNDERLINE+"Loan Status (%)"+color.END)
((df['loan_status'].value_counts()/len(df['loan_status']))*100).round(2)
For the purposes of this lecture, we are only using a sample of the full dataset (~1%). This is to ensure things don't take an age to run.
# turn interest rate into a float
df['int_rate'] = df['int_rate'].str.replace(r'%', '').astype(float)
# get the employment length in years
df['emp_length'] = df['emp_length'].str.extract(r'(\d+)').astype(int)
# get the loan term in months
df['term'] = df['term'].str.extract(r'(\d+)').astype(int)
# we are going to remove categorical variables for now as scikit-learn
# doesn't like them (see later).
cont_df = df.drop(['home_ownership', 'zip_code'], axis=1)
cont_df.info()
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
LE = LabelEncoder()
data_x, data_y = cont_df.drop(['loan_status'], axis=1), cont_df['loan_status']
# LabelEncoder will sort classes alphabetically, but we want our positive value to
# be "charged off".
classes = list(cont_df['loan_status'].unique())
classes.sort(reverse=True)
LE.fit(classes)
LE.classes_ = np.array(classes)
data_y = LE.transform(data_y)
X_train, X_test, y_train, y_test = train_test_split(data_x.values, data_y, test_size = 0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.1, random_state=42)
We can deal with class imbalance using class_weight='balanced', as discussed last week.
We can also undersample using a balanced random forest, which builds several estimators on different randomly selected (undersampled) subsets of the data.
Generally, which performs better depends on the amount of data you are training on and the aims of the model.
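As a side note, scikit-learn documents the 'balanced' mode as weighting each class by n_samples / (n_classes * class_count), so the minority class receives a proportionally larger weight. A minimal sketch of that computation on toy labels (my own illustration, not from the original notebook):
# sketch: how class_weight='balanced' is computed (per the sklearn docs)
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y_toy = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])   # toy imbalanced labels
counts = np.bincount(y_toy)
manual = len(y_toy) / (len(counts) * counts)        # [0.625, 2.5]
sklearn_w = compute_class_weight('balanced', classes=np.unique(y_toy), y=y_toy)
print(manual, sklearn_w)  # both give the same weights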
Notes
# from https://realpython.com/python-timer/
import time

class TimerError(Exception):
    """A custom exception used to report errors in use of Timer class"""

class Timer:
    def __init__(self):
        self._start_time = None

    def start(self):
        """Start a new timer"""
        if self._start_time is not None:
            raise TimerError("Timer is running. Use .stop() to stop it")
        self._start_time = time.perf_counter()

    def stop(self):
        """Stop the timer, and report the elapsed time"""
        if self._start_time is None:
            raise TimerError("Timer is not running. Use .start() to start it")
        elapsed_time = time.perf_counter() - self._start_time
        self._start_time = None
        print(f"Elapsed time: {elapsed_time:0.2f} seconds")
        return elapsed_time
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.base import clone
t = Timer()
RF = RandomForestClassifier(criterion='gini',
n_estimators=100,
random_state=42,
oob_score = True,
n_jobs=-1)
RF_weight = clone(RF)
RF_weight.set_params(**{'class_weight': 'balanced'})
bal_f = BalancedRandomForestClassifier(criterion='gini',
n_estimators=100,
max_features = 'sqrt',
random_state=42,
oob_score = True,
n_jobs=-1)
rf_dict = {'Random Forest':RF, 'Random Forest (Balanced Class Weight)':RF_weight, 'Balanced Random Forest':bal_f}
for classifier_name in rf_dict:
    t.start()
    rf_dict[classifier_name].fit(X_train, y=y_train)
    print(color.BOLD + color.UNDERLINE + classifier_name + color.END)
    t.stop()
    print('OOB Score: %.3f' % (rf_dict[classifier_name].oob_score_))
Notes
from sklearn.metrics import confusion_matrix
import warnings # prevent warnings
# this creates the matplotlib graph to make the confmat look nicer
def pretty_confusion_matrix(confmat, labels, title, labeling=False, highlight_indexes=[]):
    labels_list = [["TN", "FP"], ["FN", "TP"]]
    fig, ax = plt.subplots(figsize=(width_inch, height_inch))
    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            if labeling:
                label = str(confmat[i, j]) + " (" + labels_list[i][j] + ")"
            else:
                label = confmat[i, j]
            if [i, j] in highlight_indexes:
                ax.text(x=j, y=i, s=label, va='center', ha='center',
                        weight="bold", fontsize=18, color='#32618b')
            else:
                ax.text(x=j, y=i, s=label, va='center', ha='center')
    # change the labels
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ax.set_xticklabels([''] + [labels[0], labels[1]])
        ax.set_yticklabels([''] + [labels[0], labels[1]])
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    ax.xaxis.set_label_position('top')
    plt.suptitle(title)
    plt.tight_layout()
    plt.show()
for classifier_name in rf_dict:
    # use each classifier to predict the validation set
    predictions = rf_dict[classifier_name].predict(X_val)
    # get the confusion matrix as a numpy array
    confmat = confusion_matrix(y_true=y_val, y_pred=predictions)
    fig_num += 1
    # use the pretty function to make it nicer
    pretty_confusion_matrix(confmat, LE.classes_,
                            "Figure %d: " % fig_num + classifier_name + " Validation Confusion Matrix",
                            labeling=True,
                            )
Notes
from sklearn.metrics import classification_report
for i, classifier_name in enumerate(rf_dict):
    # use each classifier to predict the validation set
    predictions = rf_dict[classifier_name].predict(X_val)
    report = pd.DataFrame(classification_report(y_val,
                                                predictions,
                                                labels=None,
                                                target_names=LE.classes_,
                                                sample_weight=None,
                                                digits=2,
                                                output_dict=True)).round(2)
    report['classifier'] = classifier_name
    if i == 0:
        reports = report
    else:
        reports = pd.concat([reports, report])
reports.index.name = 'metric'
reports = reports.set_index('classifier', append=True)
reports = reports.swaplevel()
display(reports)
The above used essentially the default parameters, so you'd want to run some hyperparameter searches to find better ones.
from time import time
import joblib
rf_param_dist = {'n_estimators': range(50, 500),
'max_features': range(1, X_train.shape[1]),
'class_weight': [None, 'balanced']}
lc_rf_rs = hyper_search(RF, rf_param_dist, X_train, y_train,
os.path.join(os.getcwd(), "Models", "lc_rf_rs.pkl"),
n_iter=60, overwrite=False, metric="f1", verbose=2)
display(pd.DataFrame(lc_rf_rs.cv_results_).sort_values("rank_test_score")[["param_n_estimators",
"param_max_features",
"param_class_weight",
"mean_test_score",
"std_test_score"]].head())
del rf_param_dist['class_weight']
lc_bal_rf_rs = hyper_search(bal_f, rf_param_dist, X_train, y_train,
os.path.join(os.getcwd(), "Models", "lc_bal_rf_rs.pkl"),
n_iter=60, overwrite=False, metric="f1", verbose=2)
display(pd.DataFrame(lc_bal_rf_rs.cv_results_).sort_values("rank_test_score")[["param_n_estimators",
"param_max_features",
"mean_test_score",
"std_test_score"]].head())
Feature importances, as we have previously seen, can give us insight into the important features for our model.
fig = plt.figure(figsize = (width_inch, height_inch))
# get the importances for the features
rf_importances = lc_bal_rf_rs.best_estimator_.feature_importances_
rf_importances_series = pd.Series(rf_importances, index=data_x.columns).sort_index(ascending=False)
# plot the important features
rf_importances_series.plot.barh(legend=False, grid=False)
fig_num+=1
plt.suptitle('Figure %d: Average Feature Importances for a Balanced Random Forest'%fig_num)
plt.tight_layout()
plt.close()
display(MatplotlibFigure(fig, centered=True))
We could also use the importance of each feature (using average impurity decrease) for feature selection.
We can put it in a pipeline and use the SelectFromModel class from scikit-learn.
Notes
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
svm = SVC(kernel='rbf', random_state=42, class_weight = 'balanced')
rf_svm = Pipeline([
('feature_selection', SelectFromModel(lc_bal_rf_rs.best_estimator_, threshold = 'mean')),
('classification', svm)
])
svm_dict = {'SVM':svm, 'Forest SVM':rf_svm}
for classifier_name in svm_dict:
    scores = cross_val_score(estimator=svm_dict[classifier_name],
                             X=X_train,
                             y=y_train,
                             scoring='f1',
                             cv=StratifiedKFold(),
                             n_jobs=-1)
    print(color.BOLD + color.UNDERLINE + classifier_name + color.END)
    print('CV f1-score: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
Feature importances can be misleading for high-cardinality features (many unique values)6; a sketch demonstrating this effect follows the heatmap below.
If features are highly correlated, one feature may be ranked highly while the information in the others is not fully captured4.
plt.figure(figsize=(width_inch, height_inch))
sns.heatmap(data_x.corr(), cmap='viridis')
plt.show()
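Returning to the high-cardinality point, here is a hedged sketch (my own toy example, not from the notebook) showing that an entirely uninformative continuous feature can still pick up substantial impurity-based importance, simply because it offers many candidate split points:
# toy demonstration: impurity importance vs a high-cardinality noise feature
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
n = 1000
y_toy = rng.randint(0, 2, n)
informative = y_toy ^ (rng.rand(n) < 0.4)  # noisy copy of the label
noise = rng.rand(n)                        # pure noise, ~n unique values
X_toy = np.c_[informative, noise]

rf_toy = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_toy, y_toy)
print(dict(zip(['informative', 'noise'], rf_toy.feature_importances_.round(2))))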
The permutation feature importance is defined to be the decrease in a model score when a single feature value is randomly shuffled7.
This procedure breaks the relationship between the feature and the target, thus the drop in the model score is indicative of how much the model depends on the feature8.
Notes
" In the permutation-based approach, for each tree, the OOB sample is passed down the tree and the prediction accuracy is recorded. Then the values for each variable (one at a time) are randomly permuted and the accuracy is again computed. The decrease in accuracy as a result of this randomly shuffling of feature values is averaged over all the trees for each predictor. The variables with the largest average decrease in accuracy are considered most important."9
"This technique benefits from being model agnostic and can be calculated many times with different permutations of the feature." https://scikit-learn.org/stable/modules/permutation_importance.html
"Using a held-out set makes it possible to highlight which features contribute the most to the generalization power of the inspected model. Features that are important on the training set but not on the held-out set might cause the model to overfit." https://scikit-learn.org/stable/modules/permutation_importance.html
"Warning: Features that are deemed of low importance for a bad model (low cross-validation score) could be very important for a good model. Therefore it is always important to evaluate the predictive power of a model using a held-out set (or better with cross-validation) prior to computing importances. Permutation importance does not reflect to the intrinsic predictive value of a feature by itself but how important this feature is for a particular model." https://scikit-learn.org/stable/modules/permutation_importance.html
For more examples of methods of interpreting forests see: https://towardsdatascience.com/explaining-feature-importance-by-example-of-a-random-forest-d9166011959e
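To make the procedure concrete, here is a minimal hand-rolled version of permutation importance (an illustrative sketch, assuming a fitted binary classifier `model` and numpy arrays X and y; sklearn's permutation_importance, used below, is the proper tool):
# sketch of the permutation importance procedure described above
import numpy as np
from sklearn.metrics import f1_score

def manual_permutation_importance(model, X, y, n_repeats=10, seed=0):
    rng = np.random.RandomState(seed)
    baseline = f1_score(y, model.predict(X))
    importances = np.zeros((X.shape[1], n_repeats))
    for j in range(X.shape[1]):
        X_perm = X.copy()
        for r in range(n_repeats):
            rng.shuffle(X_perm[:, j])  # break the feature-target link
            importances[j, r] = baseline - f1_score(y, model.predict(X_perm))
    return importances.mean(axis=1)    # average drop in score per feature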
from sklearn.inspection import permutation_importance

fig, axes = plt.subplots(ncols=2, figsize=(width_inch * 2, height_inch), sharey=True)
axes = axes.flatten()
for i, ax in enumerate(axes):
    if i == 0:
        X, y = X_train, y_train
        name = "Training"
    else:
        X, y = X_val, y_val
        name = "Validation"
    imp = permutation_importance(lc_bal_rf_rs.best_estimator_, X, y,
                                 n_repeats=30, scoring='f1', random_state=0)
    imp_df = pd.DataFrame(imp['importances']).T
    imp_df.columns = data_x.columns
    imp_df_ = imp_df.melt(var_name='Feature', value_name='Permutation Importance (F1-Score)')
    sns.barplot(data=imp_df_, x='Permutation Importance (F1-Score)', y='Feature', ax=ax)
    ax.set_title(name)
plt.show()
Extra: Fairness
In last week's "Applications" notes I questioned what information is fair for us to use when making decisions around lending. Well, this week we have a dataset which contains some of these very variables, which could be problematic.
If you look at the full list of variables for this dataset (we've used a subset), you'll see there are even more features whose validity or fairness you may question. Should we be including these variables in our models; in other words, should we be assigning weights to them? Without other demographic information (race, gender, etc.), can we ever try to correct for them in our model?
More on this in week 11...
Note
There are always advantages and disadvantages to using any model on a particular dataset.
* limited or unavailable in Scikit-Learn
Notes
You may also be wondering: where are my previous data visualisations of the categorical data? Well, sklearn's CART implementation of decision trees currently "does not support categorical variables". This means:
Do not use label encoding with DecisionTreeClassifier() if your categorical data is not ordinal; you'll end up with splits that do not make sense, as the data will be treated as numeric13.
Using a OneHotEncoder is currently the only valid way with sklearn, allowing arbitrary splits not dependent on the label ordering, but it is computationally expensive and can deteriorate the performance of decision trees, as it leads to sparse features, which can mess up feature importance12. A rough sketch follows below.
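As a rough sketch (my own illustration, not the notebook's code), one-hot encoding the categorical columns we dropped earlier might look like this; note that zip_code has many levels, which is exactly where the sparsity cost bites:
# hypothetical sketch: one-hot encode the dropped categorical columns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

cat_cols = ['home_ownership', 'zip_code']  # columns we dropped earlier
preprocess = ColumnTransformer(
    [('onehot', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
    remainder='passthrough')  # keep the numeric columns as-is
tree_pipe = Pipeline([
    ('preprocess', preprocess),
    ('tree', DecisionTreeClassifier(random_state=42)),
])
# tree_pipe.fit(df.drop('loan_status', axis=1), data_y)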
Solutions
Currently the best way of handling categorical features is to use h2o.randomForest, a different forest implementation, or catboost, a boosting classifier (a minimal sketch below).
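For instance, a minimal CatBoost sketch (my own illustration, assuming the catboost package is installed; CatBoost encodes the columns named in cat_features internally, so no one-hot encoding is needed):
# hedged sketch: CatBoost handles categorical features natively
from catboost import CatBoostClassifier

X_cat = df.drop('loan_status', axis=1)
cb = CatBoostClassifier(iterations=200, random_seed=42, verbose=0)
cb.fit(X_cat, data_y, cat_features=['home_ownership', 'zip_code'])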
Notes
Comparatively poor generalization performance.
Easy to overfit
High variance
Orthogonal decision boundaries
Cannot guarantee to return the globally optimal decision tree.
Notes
# from https://github.com/ageron/handson-ml2/blob/master/06_decision_trees.ipynb
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap
def plot_decision_boundary(clf, X, y, axes=[0, 7.5, 0, 3], iris=True, legend=False, plot_training=True):
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['#fafab0', '#9898ff', '#a0faa0'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if not iris:
        custom_cmap2 = ListedColormap(['#7d7d58', '#4c4c7f', '#507d50'])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)
    if plot_training:
        plt.plot(X[:, 0][y == 0], X[:, 1][y == 0], "yo", label="Iris setosa")
        plt.plot(X[:, 0][y == 1], X[:, 1][y == 1], "bs", label="Iris versicolor")
        plt.plot(X[:, 0][y == 2], X[:, 1][y == 2], "g^", label="Iris virginica")
        plt.axis(axes)
    if iris:
        plt.xlabel("Petal length", fontsize=14)
        plt.ylabel("Petal width", fontsize=14)
    else:
        plt.xlabel(r"$x_1$", fontsize=18)
        plt.ylabel(r"$x_2$", fontsize=18, rotation=0)
    if legend:
        plt.legend(loc="lower right", fontsize=14)
np.random.seed(6)
Xs = np.random.rand(100, 2) - 0.5
ys = (Xs[:, 0] > 0).astype(np.float32) * 2
angle = np.pi / 4
rotation_matrix = np.array([[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]])
Xsr = Xs.dot(rotation_matrix)
tree_clf_s = DecisionTreeClassifier(random_state=42)
tree_clf_s.fit(Xs, ys)
tree_clf_sr = DecisionTreeClassifier(random_state=42)
tree_clf_sr.fit(Xsr, ys)
fig, axes = plt.subplots(ncols=2, figsize=(10, 4), sharey=True)
plt.sca(axes[0])
plot_decision_boundary(tree_clf_s, Xs, ys, axes=[-0.7, 0.7, -0.7, 0.7], iris=False)
plt.sca(axes[1])
plot_decision_boundary(tree_clf_sr, Xsr, ys, axes=[-0.7, 0.7, -0.7, 0.7], iris=False)
plt.ylabel("")
plt.suptitle("Extra: Sensitivity to Rotation")
plt.show()
Generally improve upon trees, at the expense of interpretability.
Comparatively good generalisability
Comparatively small variability in prediction accuracy when tuning14
Comparatively good “out of the box” performance15
ExtraTrees is faster to train than random forests (see the timing sketch after this list)
Work well on large datasets
Not good for very high-dimensional sparse data17.
Harder than trees to interpret
More computationally demanding to train than other algorithms17
Setting different random states can drastically change the model17.
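As a quick illustration of the ExtraTrees speed claim above, a hedged sketch on synthetic data (my own, not the notebook's benchmark; exact timings will vary by machine):
# sketch: ExtraTrees picks split thresholds at random instead of searching
# exhaustively, so it typically fits faster than a RandomForest
from time import perf_counter
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

X_syn, y_syn = make_classification(n_samples=20000, n_features=20, random_state=0)
for Clf in (RandomForestClassifier, ExtraTreesClassifier):
    start = perf_counter()
    Clf(n_estimators=100, random_state=0, n_jobs=-1).fit(X_syn, y_syn)
    print(Clf.__name__, round(perf_counter() - start, 2), "seconds")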
Notes
import sys
from shutil import copyfile
# where the HTML template is located
dst = os.path.join(sys.prefix, 'lib', 'site-packages', 'nbconvert', 'templates', "classic.tplx")
# If it's not located where it should be
if not os.path.exists(dst):
    # uses a nb_pdf_template
    curr_path = os.path.join(os.getcwd(), "..", "Extra", "classic.tplx")
    # copy it to where it is meant to be
    copyfile(curr_path, dst)
# Create HTML notes document
!jupyter nbconvert 3_Applications.ipynb \
--to html \
--output-dir . \
--template classic
!jupyter nbconvert 3_Applications.ipynb \
--to slides \
--output-dir . \
--TemplateExporter.exclude_input=True \
--TemplateExporter.exclude_output_prompt=True \
--SlidesExporter.reveal_scroll=True
# Create pdf notes document (issues)
!jupyter nbconvert 3_Applications.ipynb \
--to html \
--output-dir ./PDF_Prep \
--output 3_Applications_no_code \
--TemplateExporter.exclude_input=True \
--TemplateExporter.exclude_output_prompt=True