# This cell adds a button to toggle visibility of code blocks,
# for use with the HTML export version
from IPython.core.display import HTML
HTML('''<button style="margin:0 auto; display: block;" onclick="jQuery('.code_cell .input_area').toggle();
jQuery('.prompt').toggle();">Toggle code</button>''')
NOTES
Question: Why not just stick to decision trees?
Answer: Although they are useful for interpretation, they are not as competitive as other supervised learning methods, so we'll also talk about bagging, random forests, and boosting, which all rely on producing multiple trees and combining them to yield a single prediction. This typically improves accuracy considerably, but at the expense of interpretability.
The goal of ensemble methods is to combine different classifiers into a metaclassifier that has better generalization performance than each individual classifier alone [4].
The most popular ensemble methods use the majority (or plurality) voting principle.
Notes
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import matplotlib
from IPython.display import Image
import os
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.tree import plot_tree
from mlxtend.plotting import plot_decision_regions
from time import time
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import warnings # prevent warnings
import joblib # saving models
import itertools
from scipy.stats.distributions import uniform, loguniform
from scipy.stats import distributions
from sklearn.model_selection import LeaveOneOut
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from imblearn.under_sampling import NeighbourhoodCleaningRule
from sklearn.decomposition import PCA
import timeit
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, make_scorer
matplotlib.rcParams['animation.embed_limit'] = 30000000.0
plt.rcParams['figure.dpi'] = 120
image_dir = os.path.join(os.getcwd(),"Images")
data_dir = os.path.join(os.getcwd(),"..","Data")
# colours for printing outputs
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
fig_num=21
# golden ratio for figures
gr = 1.618
height_pix = 500
width_pix = height_pix*gr
height_inch = 4
width_inch = height_inch*gr
# Centered figures in the notebook and presentation
# ...was a real pain to find this:
# https://gist.githubusercontent.com/maxalbert/800b9f06c7b2dd365ea5
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import urllib
import base64
from io import BytesIO, StringIO
def fig2str(fig, format='svg'):
    """
    Return a string containing the raw data of the matplotlib figure in the given format.
    """
    assert isinstance(fig, matplotlib.figure.Figure)
    imgdata = BytesIO()
    fig.savefig(imgdata, format=format, bbox_inches='tight')
    imgdata.seek(0)  # rewind the data
    output = imgdata.getvalue()
    if format == 'svg':
        return output.decode('utf-8')  # decode the raw bytes so we return a string
    else:
        return urllib.parse.quote(base64.b64encode(output))
class MatplotlibFigure(object):
    """
    Thin wrapper around a matplotlib figure which provides a custom
    HTML representation that allows tweaking the appearance
    """
    def __init__(self, fig, centered=False):
        assert isinstance(fig, matplotlib.figure.Figure)
        self.fig = fig  # keep a reference so _repr_html_ can use it
        self.centered = centered
    def _repr_html_(self):
        img_str_png = fig2str(self.fig, format='png')
        uri = 'data:image/png;base64,' + img_str_png
        html_repr = "<img src='{}'>".format(uri)
        if self.centered:
            html_repr = "<center>" + html_repr + "</center>"
        return html_repr
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values,
test_size = 0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
test_size = 0.2, random_state=42)
Majority voting can be done by simply selecting the class label that has been predicted by the majority of the classifiers (more than 50% of votes).
Majority vote refers to binary class decisions but can be generalized to a multi-class setting using plurality voting [4].
To predict a class label via simple majority or plurality voting, we start by training $m$ different classifiers ($C_1,\ldots,C_m$), and then combine the predicted labels of each classifier, $C_j$, selecting the class label, $\hat y$, that received the most votes [4].
Notes
Scikit-learn provides a `VotingClassifier`, where multiple classification pipelines can be combined to create a classifier that aggregates predictions.
fig_num+=1
print(color.BOLD+color.UNDERLINE+"Figure %d: Majority Voting Classifier"%fig_num+color.END)
# From https://github.com/rasbt/stat451-machine-learning-fs20/blob/master/L07/07-ensembles__notes.pdf
#Image(os.path.join(image_dir, "majority_voting.png"), width=600)
# From https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch07/images/07_02.png
Image(os.path.join(image_dir, "07_02.png"), width=600)
In binary classification $(-1, 1)$ we can write the majority vote prediction as [4]:
$$ \hat y = \text{mode}\{C_1(x), C_2(x), \ldots, C_m(x)\} $$$$ C(x) = \operatorname{sgn}\left[ \sum^m_j C_j(x)\right] = \begin{cases} 1 & \text{if} \; \sum_j C_j(x) \geq 0 \\ -1 & \text{otherwise} \end{cases} $$
Instead we may want to weight the votes:
$$ \hat y = \underset{i}{\arg\max} \sum^m_{j=1} w_j \mathbb{1}_A(C_j(x)=i) $$
where $w_j$ is a weight associated with a base classifier, $C_j$; $\hat y$ is the predicted class label of the ensemble; $A$ is the set of unique class labels; and $\mathbb{1}_A$ is the characteristic (indicator) function, which returns 1 if the predicted class of the $j$th classifier matches $i$ (i.e. $C_j(x)=i$).
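As a concrete illustration (with hypothetical predictions and weights for three classifiers), the weighted vote can be computed directly in NumPy:
import numpy as np
preds = np.array([0, 0, 1])    # hypothetical predictions C_1(x), C_2(x), C_3(x)
w = np.array([0.2, 0.2, 0.6])  # hypothetical weights w_j
# arg max_i of sum_j w_j * 1(C_j(x) = i): weighted vote counts per class label
print(np.argmax(np.bincount(preds, weights=w)))  # 1: the single high-weight vote wins 0.6 to 0.4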
Some classifiers return the probability of a predicted class label, and this can be used for "soft voting" instead of class labels [4]. The modified version of the majority vote for predicting class labels from probabilities can be written as:
$$ \hat y = \underset{i}{\arg\max} \sum^m_{j=1} w_j p_{ij}, $$
where $p_{ij}$ is the predicted probability of the $j$th classifier for class label $i$.
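Continuing the same hypothetical example, but now using each classifier's predicted probabilities:
# each row holds one classifier's predicted probabilities p_ij over the two classes
probas = np.array([[0.9, 0.1],   # C_1
                   [0.8, 0.2],   # C_2
                   [0.4, 0.6]])  # C_3
w = np.array([0.2, 0.2, 0.6])    # same hypothetical weights as above
p = np.average(probas, axis=0, weights=w)  # sum_j w_j p_ij for each class i
print(p, np.argmax(p))  # [0.58 0.42] -> class 0: the confident 0.9/0.8 votes outweigh C_3's 0.6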
Soft voting often achieves a higher performance than hard voting because highly confident votes are given more weight [3].
NOTES
Soft voting requires each classifier to implement a `predict_proba` method to compute class probabilities.
RANDOM_STATE=42
clf1 = Pipeline([('scl', StandardScaler()),
('clf', SVC(kernel='rbf',
gamma='auto',
random_state=RANDOM_STATE,
probability = True))])
clf2 = Pipeline([('scl', StandardScaler()),
('clf', LogisticRegression(solver='liblinear',
random_state=RANDOM_STATE))
])
clf3 = DecisionTreeClassifier(random_state=RANDOM_STATE)
clf_labels = ['SVM', # Support Vector Machine
'LR', # LogisticRegression
'DT'] # Decision Tree
# Majority Rule Voting
hard_mv_clf = VotingClassifier(estimators=[(clf_labels[0],clf1),
(clf_labels[1],clf2),
(clf_labels[2],clf3)],
voting='hard')
soft_mv_clf = VotingClassifier(estimators=[(clf_labels[0],clf1),
(clf_labels[1],clf2),
(clf_labels[2],clf3)],
voting='soft')
clf_labels += ['Hard Majority Voting', 'Soft Majority Voting']
all_clf = [clf1, clf2, clf3, hard_mv_clf, soft_mv_clf]
for clf, label in zip(all_clf, clf_labels):
    clf.fit(X_train, y_train)
    print(color.BOLD+color.UNDERLINE+label+color.END)
    print("Training ACC Score: %0.2f"
          % accuracy_score(y_train, clf.predict(X_train)))
    print("Validation ACC Score: %0.2f"
          % accuracy_score(y_val, clf.predict(X_val)))
    print()
A bagging classifier is an ensemble of base classifiers, each fit on random subsets of a dataset. Their predictions are then pooled or aggregated to form a final prediction.
This reduces the variance of an estimator, so it can be a simple way to reduce overfitting and increase prediction accuracy [5].
We could use bagging by taking many separate training sets, $B$, from the population, building a separate prediction model using each training set, $\hat f^1(x), \hat f^2(x), \ldots, \hat f^B(x)$, and averaging the resulting predictions [1]:
$\hat f_{avg}(x)=\frac{1}{B}\sum_{b=1}^B\hat f^b(x)$.
However, we generally do not have access to multiple training sets, so instead we can bootstrap by taking repeated samples from a (single) training data set to create $B$ bootstrapped training data sets. We then train our method on the $b$th bootstrapped training set to get $\hat f^{*b}(x)$, and finally average all the predictions to obtain [1]:
$\hat f_{bag}(x) = \frac{1}{B}\sum_{b=1}^B\hat f^{*b}(x)$.
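A minimal sketch of this bootstrap-and-average idea on a toy (made-up) regression problem:
from sklearn.tree import DecisionTreeRegressor
rng = np.random.RandomState(0)
x_toy = np.sort(rng.uniform(0, 6, 80))[:, None]               # toy inputs
y_toy = np.sin(x_toy).ravel() + rng.normal(scale=0.3, size=80)
B = 50
preds = []
for b in range(B):
    idx = rng.randint(0, len(x_toy), len(x_toy))              # bootstrap sample b (with replacement)
    f_b = DecisionTreeRegressor(random_state=0).fit(x_toy[idx], y_toy[idx])  # \hat f^{*b}
    preds.append(f_b.predict(x_toy))
f_bag = np.mean(preds, axis=0)                                # \hat f_bag: average of the B predictions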
Notes
fig_num+=1
print(color.BOLD+color.UNDERLINE+"Figure %d: Bagging Classifier"%fig_num+color.END)
# From https://github.com/rasbt/stat451-machine-learning-fs20/blob/master/L07/07-ensembles__notes.pdf
#Image(os.path.join(image_dir, "bagging.png"), width=600)
# From https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch07/images/07_06.png
Image(os.path.join(image_dir, "07_06.png"), width=600)
Specifically, bagging is when sampling is performed with replacement [6]; sampling without replacement is called pasting [7].
Pasting is designed to use smaller sample sizes than the training dataset in cases where the training dataset does not fit into memory [7].
Both bagging and pasting allow training instances to be sampled several times across multiple predictors, but only bagging allows instances to be sampled several times for the same predictor [3].
from sklearn.ensemble import BaggingClassifier
print(color.BOLD+color.UNDERLINE+"Original Data"+color.END)
example_np = pd.DataFrame(range(10)).values
display(example_np.flatten())
print(color.BOLD+color.UNDERLINE+"Bagging (0.8 Samples)"+color.END)
bag = BaggingClassifier(base_estimator=clf3,
n_estimators=5,
random_state=42,
max_samples = 0.8,
bootstrap=True
).fit(example_np,example_np.ravel())
display(bag.estimators_samples_)
print(color.BOLD+color.UNDERLINE+"Pasting (0.8 Samples)"+color.END)
bag.set_params(bootstrap=False).fit(example_np,example_np.ravel())
display(bag.estimators_samples_)
Averaging methods generally work best when the predictors are as independent as possible, so one way of achieving this is to use diverse classifiers [3].
Bagging has been demonstrated to give impressive improvements in accuracy by combining hundreds or even thousands of trees into a single procedure.
These trees are grown deep and are not pruned, so each individual tree has high variance but low bias. Averaging these $B$ trees reduces the variance.
To apply bagging to decision trees, we simply construct $B$ trees using $B$ bootstrapped training sets and take a majority vote of the resulting predictions.
Notes
Using a bagged model means we can get a validation/test error without using cross-validation.
As each tree does not use all observations for training, we can predict the remaining, out-of-bag (OOB), observations.
With a sufficiently large number of bags, the OOB error is virtually equivalent to leave-one-out cross-validation error, which is convenient when performing bagging on large data sets for which cross-validation would be computationally onerous [1].
bag = BaggingClassifier(base_estimator=clf3,
n_estimators=100,
max_samples=0.8,
max_features=1.,
bootstrap=True,
bootstrap_features=False,
oob_score=True,
warm_start=False,
n_jobs=-1,
random_state=42)
start = time()
bag.fit(X_train, y_train)
calc_time = time() - start
print("Bagged OOB Accuracy: " + str(round(bag.oob_score_,2)) + " ("+str(round(calc_time, 2))+" secs)")
start = time()
LOO_ave = np.mean(cross_val_score(bag, X_train, y_train, cv=LeaveOneOut()))
calc_time = time() - start
print("Leave-one-out Cross-validation Accuracy: " + str(round(LOO_ave,2)) + " ("+str(round(calc_time, 2))+" secs)")
Bagging improves prediction accuracy at the expense of interpretability.
However, we can use the feature importance method discussed previously to get an overall summary of the importance of each predictor across trees.
NOTES
Each tree in a fitted bagging ensemble exposes a `feature_importances_` attribute, which we can average across trees.
# get the importances for the features
bag_importances = np.mean([
tree.feature_importances_ for tree in bag.estimators_
], axis=0)
bag_importances_sd = np.std([
tree.feature_importances_ for tree in bag.estimators_
], axis=0)
bag_importances_series = pd.Series(bag_importances,index=X.columns)
fig = plt.figure(figsize = (width_inch*1.2, height_inch*1.2))
# plot the important features
bag_importances_series.sort_index(ascending = False).plot.barh(legend =False, grid=False, xerr=bag_importances_sd)
fig_num+=1
plt.title('Figure %d: Average (and Standard Deviation) Decision Tree Feature Importances in a Bagged Classifier'%fig_num)
plt.xlim(0,0.8)
plt.tight_layout()
plt.close()
display(MatplotlibFigure(fig, centered=True))
Random forests are among the most widely used machine learning algorithms [8].
Random forests are essentially bagged tree classifiers, but they decorrelate the trees by using a random sample of features each time a split in a tree is considered. The random forest algorithm can therefore be summarized in four steps [4]:
1. Draw a random bootstrap sample of size $n$ (randomly choose $n$ examples from the training dataset with replacement).
2. Grow a decision tree from the bootstrap sample. At each node:
   a. Randomly select $d$ features without replacement (typically the square root of the total number of predictors).
   b. Split the node using the feature that provides the best split according to the objective function.
3. Repeat the steps above $k$ times, to obtain $k$ trees.
4. Aggregate the predictions of the trees to assign the class label by majority vote.
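A minimal from-scratch sketch of these four steps (illustrative only; in practice we use `RandomForestClassifier`, shown later). Here `max_features='sqrt'` is what makes each tree consider a random subset of features at every node:
rng = np.random.RandomState(42)
k = 100
forest = []
for _ in range(k):
    boot = rng.randint(0, len(X_train), len(X_train))   # step 1: bootstrap sample
    tree = DecisionTreeClassifier(max_features='sqrt',  # steps 2a-b: random d features per node
                                  random_state=rng)
    forest.append(tree.fit(X_train[boot], y_train[boot]))  # step 3: repeat k times
tree_preds = np.array([t.predict(X_val) for t in forest])
maj_vote = (tree_preds.mean(axis=0) > 0.5).astype(int)  # step 4: majority vote (labels are 0/1)
print("Validation ACC Score: %0.2f" % accuracy_score(y_val, maj_vote))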
NOTES
Scikit-learn's `RandomForestClassifier` averages the probabilistic predictions of the trees, rather than letting each tree vote for a single class.
We cannot use `graphviz` or `sklearn.tree.plot_tree` on the whole forest as we did for single trees, as each tree is built differently.
Feature subsampling per estimator (rather than per node) can also be done with `BaggingClassifier` using `bootstrap_features=True`.
By not allowing the model to use the majority of the available predictors, we ensure the bagged trees look different from each other.
If there is a particularly strong set of predictors in the data, then without randomly selecting features, the bagged trees will look quite similar to each other and predictions will be highly correlated.
Averaging highly correlated quantities does not lead to as large a reduction in variance as averaging many uncorrelated quantities [1].
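To make this concrete, a standard result from the ensemble literature: if the $B$ tree predictions at a point $x$ are identically distributed with variance $\sigma^2$ and pairwise correlation $\rho$, then
$$ \operatorname{Var}\left(\frac{1}{B}\sum_{b=1}^B \hat f^b(x)\right) = \rho\sigma^2 + \frac{1-\rho}{B}\sigma^2, $$
so increasing $B$ only shrinks the second term; the first term remains unless the trees are decorrelated (small $\rho$), which is exactly what random feature selection targets.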
Notes
fig, axes = plt.subplots(ncols=2, figsize = (width_inch*2, height_inch*1.4),
sharex=True, sharey=True)
# plot the important features
bag_importances_series.sort_index(ascending = False).plot.barh(legend =False, grid=False, ax=axes[0])
axes[0].set_title('Bagging')
plt.xlim(0,0.6)
plt.tight_layout()
RF = RandomForestClassifier(criterion='gini',
n_estimators=100,
max_samples=0.8,
random_state=42,
n_jobs=-1)
RF.fit(X.values, y.values)
# get the importances for the features
rf_importances = RF.feature_importances_
rf_importances_series = pd.Series(rf_importances,index=X.columns).sort_index(ascending = False)
# plot the important features
rf_importances_series.plot.barh(legend =False, grid=False, ax=axes[1])
axes[1].set_title('Random Forest')
fig_num+=1
plt.suptitle('Figure %d: Average Feature Importances for Bagged Trees and a Random Forest'%fig_num)
plt.tight_layout()
plt.close()
display(MatplotlibFigure(fig, centered=True))
Extra
As a further example, we could look at the trees in our forest directly and see they are quite different.
plot_trees = 3
fig, axes = plt.subplots(ncols=plot_trees, figsize=(10,5))
with plt.style.context("classic"):
    for i in range(plot_trees):
        plt.sca(axes[i])
        plot_tree(RF.estimators_[i],
                  feature_names=X.columns,
                  class_names=load_breast_cancer().target_names,
                  filled = True)
plt.show()
Extra [8]
Earlier random decision forests by Tin Kam Ho [10] used the "random subspace method", where each tree got a random subset of features.
“The essence of the method is to build multiple trees in randomly selected subspaces of the feature space.”
However, a few years later, Leo Breiman [11] described the procedure of selecting different subsets of features at each node, so each tree was given the full set of features. This formulation is now the typical "random forest" algorithm.
“. . . random forest with random features is formed by selecting at random, at each node, a small group of input variables to split on.”
Important parameters to adjust are [13]:
- `n_estimators`: larger is generally better, as averaging more trees will typically yield a more robust ensemble by reducing overfitting.
- `max_features`: determines how random each tree is; a smaller number of features at each split reduces overfitting.
- `max_samples`: the size of the bootstrap sample; reducing it also reduces overfitting.
- Pre-pruning (e.g. `max_depth`, `max_leaf_nodes`): more important for single trees, but can improve performance and reduce space and time requirements.
Notes
The `max_features` default values are `max_features=sqrt(n_features)` for classification and `max_features=n_features` for regression [13].
_"...[`max_features`] values are performing best, then performance may improve by increasing node size (i.e., decreasing tree depth and complexity). Moreover, if computation time is a concern then you can often decrease run time substantially by increasing the node size and have only marginal impacts to your error estimate..."_ [14]
RF = RandomForestClassifier(criterion='gini',
random_state=42,
oob_score = True,
n_jobs=-1)
n_estimators_list = list(range(0, 1050, 50))
n_estimators_list[0] = 1
max_feats_list = list(range(0, X_train.shape[1]+6, 6))
max_feats_list[0] = 2
oob_scores = pd.DataFrame()
for n in n_estimators_list:
    for f in max_feats_list:
        RF.set_params(**{'n_estimators':n, 'max_features':f})
        with warnings.catch_warnings(record=True) as w:
            # Cause all warnings to always be triggered.
            warnings.simplefilter("always")
            try:
                RF.fit(X_train, y_train)
                score = RF.oob_score_
            except Exception:
                score = 0.
        oob_score = pd.DataFrame([n, f, score], index = ['Estimators', 'Max Features', 'OOB Score']).T
        if oob_scores.empty:
            oob_scores = oob_score
        else:
            oob_scores = pd.concat([oob_scores,oob_score])
fig, axes = plt.subplots(2, figsize = (width_inch*2, height_inch*2), sharex=True)
plt.sca(axes[1])
sns.lineplot(data = oob_scores, x = 'Estimators', y = 'OOB Score', hue = 'Max Features')
axes[1].set_ylim([0.5,1.])
axes[1].set_xlim([-10.,1100])
plt.legend([],[], frameon=False)
plt.sca(axes[0])
sns.lineplot(data = oob_scores, x = 'Estimators', y = 'OOB Score', hue = 'Max Features')
axes[0].set_ylim([0.95,0.98])
axes[0].set_xlim([-10.,1100])
fig_num+=1
plt.suptitle('Figure %d: `n_estimators` and `max_features` for Random Forest Performance'%fig_num)
plt.tight_layout()
plt.close()
display(MatplotlibFigure(fig, centered=True))
plt.show()
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
# TODO: when tidying this up in the future, put these common functions I use
# all the time in a separate .py file.
def hyper_search(model, params, X, y, save_path, n_iter=60, metric="accuracy",
                 cv = KFold(5), random_state=42, refit=True,
                 overwrite=False, warning=False, verbose=0):
    if os.path.exists(save_path) and overwrite==False:
        # load the model
        models = joblib.load(save_path)
    else:
        # check all param inputs are lists
        if all(type(x)==list for x in params.values()):
            search_type = "Gridsearch"
            models = GridSearchCV(model, param_grid=params, scoring=metric, cv=cv,
                                  refit=refit, return_train_score=True, verbose=verbose)
            n_iter = len(list(itertools.product(*list(iter(params.values())))))
        else:
            search_type = "Randomsearch"
            if verbose>0:
                print("Starting "+search_type)
            models = RandomizedSearchCV(model, param_distributions=params,
                                        n_iter=n_iter, scoring=metric, cv=cv,
                                        refit=refit, random_state=random_state,
                                        return_train_score=True, verbose=verbose)
        start = time()
        if warning:
            models.fit(X, y)
        else:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                models.fit(X, y)
        if verbose>0:
            print(search_type + " took %.2f seconds for %d candidates" % ((time() - start), n_iter))
        joblib.dump(models, save_path)
    return models
RF = RandomForestClassifier(criterion='gini',
random_state=42,
n_jobs=-1)
rf_param_dict = {'n_estimators': n_estimators_list,
'max_features': max_feats_list}
# To be fair, a random search here would be better, but I'm doing this for
# comparison with the previous plot.
rf_gs = hyper_search(RF, rf_param_dict, X_train, y_train,
os.path.join(os.getcwd(), "Models", "rf_gs.pkl"),
overwrite=False)
display(pd.DataFrame(rf_gs.cv_results_).sort_values("rank_test_score")[["param_n_estimators",
"param_max_features",
"mean_test_score",
"std_test_score"]].head())
# from https://realpython.com/python-timer/
import time
class TimerError(Exception):
    """A custom exception used to report errors in use of Timer class"""

class Timer:
    def __init__(self):
        self._start_time = None
    def start(self):
        """Start a new timer"""
        if self._start_time is not None:
            raise TimerError("Timer is running. Use .stop() to stop it")
        self._start_time = time.perf_counter()
    def stop(self):
        """Stop the timer, and report the elapsed time"""
        if self._start_time is None:
            raise TimerError("Timer is not running. Use .start() to start it")
        elapsed_time = time.perf_counter() - self._start_time
        self._start_time = None
        #print(f"Elapsed time: {elapsed_time:0.4f} seconds")
        return elapsed_time
max_min_samples = 41
min_samples_leaf_list = list(range(1, max_min_samples))
RF = rf_gs.best_estimator_
t = Timer()
oob_scores = pd.DataFrame()
for s in min_samples_leaf_list:
    RF.set_params(**{'min_samples_leaf':s, 'oob_score':True})
    t.start()
    RF.fit(X_train, y_train)
    score = RF.oob_score_
    func_time = t.stop()
    oob_score = pd.DataFrame([s, score, func_time], index = ['Minimum Samples per Leaf', 'OOB Score', 'Run Time (secs)']).T
    if oob_scores.empty:
        oob_scores = oob_score
    else:
        oob_scores = pd.concat([oob_scores,oob_score])
Well, in this case there's not much impact on run time, but there often is (see https://bradleyboehmke.github.io/HOML/random-forest.html).
fig, axes = plt.subplots(2, figsize = (width_inch*2, height_inch*2), sharex=True)
plt.sca(axes[0])
sns.lineplot(data = oob_scores, x = 'Minimum Samples per Leaf', y = 'OOB Score')
axes[0].set_ylim([0.93,0.97])
axes[0].set_xlim([-1.,max_min_samples])
plt.sca(axes[1])
sns.lineplot(data = oob_scores, x = 'Minimum Samples per Leaf', y = 'Run Time (secs)')
axes[1].set_ylim([0.5,1.2])
axes[1].set_xlim([-1.,max_min_samples])
fig_num+=1
plt.suptitle('Figure %d: Reduction in Tree Complexity and Impact on Computation Speed'%fig_num)
plt.tight_layout()
plt.close()
display(MatplotlibFigure(fig, centered=True))
plt.show()
As averaging methods work best when the predictors are as independent as possible [5], we may specifically want the trees in our ensemble to be more independent.
Extremely randomised trees take this further: instead of looking for the most discriminative thresholds, thresholds are drawn at random for each candidate feature and the best of these randomly generated thresholds is picked as the splitting rule.
Notes
from sklearn.tree import ExtraTreeClassifier
ET = ExtraTreeClassifier(criterion='gini',
random_state=42,
max_depth = None)
ET.fit(X.values, y.values)
fig, axes = plt.subplots(figsize = (width_inch*2, height_inch*2))
with plt.style.context("classic"):
    plot_tree(ET,
              feature_names=X.columns,
              class_names=load_breast_cancer().target_names,
              filled = True)
fig_num+=1
plt.suptitle('Figure %d: Extremely Randomised Trees'%fig_num)
plt.close()
display(MatplotlibFigure(fig, centered=True))
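Scikit-learn also provides an ensemble of such trees, `ExtraTreesClassifier`, which averages many extremely randomised trees; a quick illustrative check (these parameter choices are assumptions, not tuned):
from sklearn.ensemble import ExtraTreesClassifier
et_ens = ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=-1)
et_scores = cross_val_score(et_ens, X_train, y_train, cv=5)
print("Extra-Trees CV ACC: %0.2f (+/- %0.2f)" % (et_scores.mean(), et_scores.std()))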
Extra
Let's look at the decision boundaries created by the different trees to demonstrate generalisation performance.
from time import time
cancer_features = ['mean radius','mean smoothness']
# specify parameters and distributions to sample from
param_grid = {"min_samples_leaf":list(range(1,15))}
DT = DecisionTreeClassifier(criterion='gini',
random_state=42)
tree_2d_gs = hyper_search(DT, param_grid, X[cancer_features].values, y,
os.path.join(os.getcwd(), "Models", "tree_2d_gs.pkl"),
overwrite=True)
pd.DataFrame(tree_2d_gs.cv_results_).sort_values("rank_test_score")[["param_min_samples_leaf", "mean_test_score", "std_test_score"]].head()
RF = RandomForestClassifier(criterion='gini',
max_features = 2,
random_state=42,
n_jobs=-1)
rf_param_dist = {'n_estimators': range(1, 1050),  # n_estimators=0 is invalid
                 "min_samples_leaf":range(1,15),
                 # scipy's uniform(loc, scale) spans [loc, loc+scale], and max_samples must be <= 1
                 "max_samples":uniform(0.05, 0.95)}
rf_2d_rs = hyper_search(RF, rf_param_dist, X[cancer_features].values, y,
os.path.join(os.getcwd(), "Models", "rf_2d_rs.pkl"),
overwrite=False, verbose=2)
display(pd.DataFrame(rf_2d_rs.cv_results_).sort_values("rank_test_score")[["param_n_estimators",
"mean_test_score",
"std_test_score"]].head())
tree_dict = {'Tree':tree_2d_gs.best_estimator_, 'Forest':rf_2d_rs.best_estimator_}
fig, axes = plt.subplots(ncols=2, figsize=(15, 5))
for i, classifier_name in enumerate(tree_dict):
    plt.sca(axes[i])
    scatter_kwargs = {'alpha': 0.1}
    plot_decision_regions(X[cancer_features].values, y.values,
                          clf = tree_dict[classifier_name],
                          scatter_kwargs = scatter_kwargs
                          )
    plt.xlabel(cancer_features[0])
    plt.ylabel(cancer_features[1])
    plt.title(classifier_name)
    plt.ylim([0.05,0.18])
    plt.xlim([10,20])
plt.show()
Now might be a good time to try exercises 7-10.
import sys
from shutil import copyfile
# where the HTML template is located
dst = os.path.join(sys.prefix, 'lib', 'site-packages', 'nbconvert', 'templates', "classic.tplx")
# If it's not located where it should be
if not os.path.exists(dst):
    # uses a nb_pdf_template
    curr_path = os.path.join(os.getcwd(),"..", "Extra", "classic.tplx")
    # copy it to where it is meant to be
    copyfile(curr_path, dst)
# Create HTML notes document (preferred)
!jupyter nbconvert 2_Ensemble_Averaging.ipynb \
--to html \
--output-dir . \
--template classic
# Create html slides (issues)
!jupyter nbconvert 2_Ensemble_Averaging.ipynb \
--to slides \
--output-dir . \
--TemplateExporter.exclude_input=True \
--TemplateExporter.exclude_output_prompt=True \
--SlidesExporter.reveal_scroll=True
# Create pdf notes document (issues)
!jupyter nbconvert 2_Ensemble_Averaging.ipynb \
--to html \
--output-dir ./PDF_Prep \
--output 2_Ensemble_Averaging_no_code \
--TemplateExporter.exclude_input=True \
--TemplateExporter.exclude_output_prompt=True