# This line will add a button to toggle visibility of code blocks,
# for use with the HTML export version
from IPython.core.display import HTML
HTML('''<button style="margin:0 auto; display: block;" onclick="jQuery('.code_cell .input_area').toggle();
jQuery('.prompt').toggle();">Toggle code</button>''')
Hierarchical clustering is an approach to clustering that does not require us to commit to a particular number of clusters a priori.
Instead, the number of clusters can be chosen from a tree-based representation of the observations, called a dendrogram1.
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import matplotlib
import os
from IPython.display import Image
from sklearn.cluster import KMeans
from scipy.cluster import hierarchy
import warnings
matplotlib.rcParams['animation.embed_limit'] = 30000000.0
image_dir = os.path.join(os.getcwd(),"Images")
class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'
# Initial fig number
fig_num = 10
anim_num = 5
plt.rcParams['figure.dpi'] = 120
# golden ratio, used to set figure aspect ratios
gr = 1.618
height_pix = 500
width_pix = height_pix*gr
height_inch = 4
width_inch = height_inch*gr
# Pdf conversion can't seem to handle the animations
PDF=False
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
CB_color_cycle = ['#377eb8', '#ff7f00', '#4daf4a',
'#f781bf', '#a65628', '#984ea3',
'#999999', '#e41a1c', '#dede00']
# set a random seed
np.random.seed(1)
# A list of centres of clusters
true_centres = [[-5, 0], [0, 0], [2, 3]]
points, clusters = make_blobs(
n_samples=[18, 11, 16], # number of samples in each cluster
centers=true_centres, # where are the centres?
)
labelled_points = pd.DataFrame({'cluster': clusters,
'x1': points[:, 0],
'x2': points[:, 1]})
def clusters_plt(title):
sns.scatterplot(x = 'x1', y = 'x2', hue="cluster", data = labelled_points, alpha=0.7,
palette = CB_color_cycle[:3], legend=False)
plt.title(title)
plt.show()
Agglomerative (or bottom-up) clustering is the most common type of hierarchical clustering and refers to the fact that a dendrogram (generally depicted as an upside-down tree) is built starting from the leaves and combining clusters up to the trunk1.
Notes
Each leaf of the dendrogram represents an observation and as we move up the tree, some leaves begin to fuse into branches. These correspond to observations that are similar to each other1.
As we move higher up the tree, branches themselves fuse, either with leaves or other branches1.
The earlier (lower in the tree) that fusions occur, the more similar the groups of observations are to each other. On the other hand, observations that fuse later (near the top of the tree) can be quite different1.
fig, axes = plt.subplots(figsize=(15,5))
cluster = hierarchy.dendrogram(hierarchy.complete(points), ax= axes, color_threshold=0)
axes.get_xaxis().set_ticks([])
fig_num+=1
plt.title("Figure %d: Basic Dendrogram"%fig_num)
plt.show()
We will use the following generated data.
There are three distinct classes, shown in separate colors. However, we will treat these class labels as unknown and will seek to cluster the observations in order to discover the classes from the data1.
fig_num+=1
clusters_plt("Figure %d: Forty-five observations generated in two-dimensional space."%fig_num)
Each leaf of the dendrogram represents one of the 45 observations.
For any two observations, we can look for the point in the tree where branches containing those two observations are first fused.
The height of this fusion indicates how different the two observations are. Observations that fuse at the bottom of the tree are quite similar to each other, whereas observations that fuse close to the top will generally be quite different1.
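As an aside not in the original text, the fusion height for every pair of observations is their cophenetic distance, which scipy can compute directly from the linkage matrix; the snippet below is an illustrative sketch using the same complete linkage as the dendrogram above.
# An illustrative sketch: the height at which two observations first fuse is
# their cophenetic distance.
from scipy.spatial.distance import squareform
complete_links = hierarchy.complete(points)
coph = squareform(hierarchy.cophenet(complete_links))  # square matrix of pairwise fusion heights
print("Observations 0 and 1 first fuse at height %.2f" % coph[0, 1])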
def highlight_points(X, highlight, ax):
for h in highlight:
circle = plt.Circle((X[h,0]+0.1, X[h,1]+0.1), 0.15, color='r',fill=False)
ax.add_patch(circle)
def numbered_scatter(X, ax=None, color_dict={}, title=None, highlight=None):
if ax:
plt.sca(ax)
else:
sns.set(rc={"figure.figsize":(10,5)})
if not color_dict:
for i in range(len(X)):
color_dict[i]="purple"
ax = sns.scatterplot(x=X[:,0], y=X[:,1], alpha=0)
if title:
ax.set(title=title)
# label points on the plot
for i, [x, y] in enumerate(zip(X[:,0], X[:,1])):
# the label position can be adjusted by adding/subtracting a value from the x and/or y coordinates
plt.text(x = x, # x-coordinate position of data label
y = y, # y-coordinate position of data label
s = "{:.0f}".format(i), # data label, formatted to ignore decimals
color = color_dict[i]) # set colour
if highlight:
highlight_points(X, highlight, ax)
def heir_scat_plot(linkage, X, threshold=0, highlight=None, title=None):
#sns.set_theme(style = "whitegrid")
links_list = ["Complete", "Average", "Single"]
if linkage not in links_list:
raise NameError("linkage needs to be in: " +str(links_list))
fig, (ax1,ax2) = plt.subplots(2,1, figsize=(15,12))
if title:
plt.suptitle(title)
# hierarchy (dendrogram) plot
plt.sca(ax1)
if linkage == "Complete":
links = hierarchy.complete(X)
elif linkage == "Average":
links = hierarchy.average(X)
elif linkage == "Single":
links = hierarchy.single(X)
hierarchy.set_link_color_palette(CB_color_cycle)
cluster = hierarchy.dendrogram(links, color_threshold=threshold)
if threshold:
plt.hlines(threshold, xmin=0., xmax=pd.DataFrame(cluster['icoord']).max().max(), colors ='k', linestyles = 'dashed')
if highlight:
ax_txt_labels = ax1.get_xticklabels()
for label in ax_txt_labels:
if int(label.get_text()) in highlight:
label.set_fontweight("bold")
label.set_fontsize(label.get_fontsize()*2)
plt.xticks(rotation = 0)
numbered_scatter(X, ax2, dict(zip(cluster['leaves'], cluster['leaves_color_list'])), highlight=highlight)
plt.tight_layout()
plt.show()
fig_num+=1
heir_scat_plot("Complete", points, highlight=[38,3,22,23,2,30],
title = "Figure %d: Complete Linkage"%fig_num)
To identify clusters using a dendrogram, we make a horizontal cut across it. The distinct sets of observations beneath the cut can be interpreted as clusters.
A dendrogram can be used to obtain any number of clusters. In practice, it is common to select a sensible number of clusters by eye, based on the heights of the fusions and the number of clusters desired.
However, the choice of where to cut the dendrogram is often not so clear.
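As a sketch beyond the original notebook, the cut itself can also be performed programmatically with scipy's fcluster, which returns the cluster label of each observation beneath a given cut height.
# A minimal sketch (not part of the original analysis): cutting the
# complete-linkage dendrogram at cut heights 8 and 6, matching the
# thresholds drawn in the next figure.
cut_links = hierarchy.complete(points)
for cut_height in [8, 6]:
    cut_labels = hierarchy.fcluster(cut_links, t=cut_height, criterion='distance')
    print("cut at %.1f -> %d clusters" % (cut_height, len(np.unique(cut_labels))))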
Notes
fig, axes = plt.subplots(1,3, figsize=(15,5), sharey=True)
axes = axes.flatten()
thesh_list = [0,8,6]
for i in range(3):
plt.sca(axes[i])
cluster = hierarchy.dendrogram(hierarchy.complete(points), ax=axes[i], color_threshold=thesh_list[i])
axes[i].get_xaxis().set_ticks([])
if thesh_list[i]:
plt.hlines(thesh_list[i], xmin=0., xmax=pd.DataFrame(cluster['icoord']).max().max(),
colors ='k', linestyles = 'dashed')
fig_num+=1
plt.suptitle("Figure %d: Dendrogram obtained from hierarchical clustering with complete linkage"%fig_num)
plt.show()
row_clusters = hierarchy.complete(points)
pd.DataFrame(row_clusters,
columns=['row label 1', 'row label 2',
'distance', 'no. of items in clust.'],
index=['cluster %d' % (i + 1)
for i in range(row_clusters.shape[0])]).head()
Hierarchical clustering dendrograms are often used in combination with a heat map, which represents each individual value with a color2.
# altered from https://github.com/rasbt/python-machine-learning-book-3rd-edition/blob/master/ch11/ch11.ipynb
df = pd.DataFrame(points, columns = ['X', 'Y'])
# plot row dendrogram
fig = plt.figure(figsize=(8, 8), facecolor='white')
axd = fig.add_axes([0.09, 0.1, 0.5, 0.6])
# note: for matplotlib < v1.5.1, please use orientation='right'
row_dendr = hierarchy.dendrogram(row_clusters, orientation='left')
# reorder data with respect to clustering
df_rowclust = df.iloc[row_dendr['leaves'][::-1]]
axd.set_xticks([])
axd.set_yticks([])
# remove axes spines from dendrogram
for i in axd.spines.values():
i.set_visible(False)
# plot heatmap
axm = fig.add_axes([0.59, 0.1, 0.3, 0.6]) # x-pos, y-pos, width, height
plt.grid(False)
cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r', aspect='auto')
fig.colorbar(cax)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
axm.set_xticklabels([''] + list(df_rowclust.columns))
axm.set_yticklabels([''])
fig_num+=1
plt.suptitle("Figure %d: Dendrogram with Heatmap"%fig_num, y=0.78)
plt.savefig(os.path.join(image_dir,"Dendrogram.png"), bbox_inches="tight")
plt.show()
Begin with $n$ observations and a measure (such as Euclidean distance) of all the $\binom{n}{2} = \frac{n(n-1)}{2}$ pairwise dissimilarities. Treat each observation as its own cluster.
For $i = n, n-1, \ldots, 2$:
(a) Examine all pairwise inter-cluster dissimilarities among the $i$ clusters and identify the pair of clusters that are least dissimilar (that is, most similar). Fuse these two clusters. The dissimilarity between these two clusters indicates the height in the dendrogram at which the fusion should be placed.
(b) Compute the new pairwise inter-cluster dissimilarities among the $i − 1$ remaining clusters.
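The following is a naive sketch of these two steps (our own illustration, not from the original notebook); in practice you would use the optimised scipy routines used elsewhere in this notebook.
# A naive, illustrative implementation of the agglomerative algorithm above,
# using complete linkage. Not efficient; scipy's hierarchy.complete is what the
# rest of this notebook uses.
from scipy.spatial.distance import cdist

def naive_agglomerative(X):
    clusters = [[i] for i in range(len(X))]   # start: each observation is its own cluster
    fusions = []
    while len(clusters) > 1:
        # (a) find the pair of clusters with the smallest complete-linkage dissimilarity
        best_pair, best_dist = None, np.inf
        for a in range(len(clusters)):
            for b in range(a + 1, len(clusters)):
                d = cdist(X[clusters[a]], X[clusters[b]]).max()   # complete linkage
                if d < best_dist:
                    best_pair, best_dist = (a, b), d
        a, b = best_pair
        fusions.append((clusters[a], clusters[b], best_dist))      # best_dist is the fusion height
        # (b) fuse the pair; dissimilarities are recomputed on the next pass
        clusters[a] = clusters[a] + clusters[b]
        del clusters[b]
    return fusions

# the final fusion height should match the top of the complete-linkage dendrogram above
print("Last fusion at height %.2f" % naive_agglomerative(points)[-1][2])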
Notes
The algorithm uses a dissimilarity measure between each pair of observations (e.g. Euclidean distance).
It then proceeds iteratively, starting at the bottom of the dendrogram, where each observation is treated as its own cluster. The two clusters that are most similar to each other are fused, so that there are now $n - 1$ clusters. The algorithm continues in this fashion until all of the observations belong to a single cluster.
from matplotlib import animation, rc
from IPython.display import HTML
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
kclust = labelled_points[['x1', 'x2']]
def update_plot(i, kclust, dist_df, scat, final_colors, curr_colors):
if i == 0:
scat.set_array(curr_colors)
else:
child_1 = dist_df['child_1'][int(i-1)]
child_2 = dist_df['child_2'][int(i-1)]
if child_1 < len(final_colors):
curr_colors[child_1]=final_colors[child_1]
if child_2 < len(final_colors):
curr_colors[child_2]=final_colors[child_2]
# changes color
scat.set_array(curr_colors)
return scat,
def heir_vis(n_clusters=1, random_state=1):
cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='average', compute_distances =True)
cluster.fit(points)
dist_df = pd.DataFrame(cluster.children_,cluster.distances_).reset_index()
dist_df.columns = ["distances", "child_1", "child_2"]
final_colors = cluster.labels_
updated_colors = np.array(range(len(final_colors)))
fig = plt.figure(figsize =(8,5))
scat1 = plt.scatter(kclust['x1'], kclust['x2'], c=updated_colors, cmap=plt.cm.prism)
ani = animation.FuncAnimation(fig, update_plot,
frames=len(dist_df)+1,
fargs=(kclust, dist_df, scat1, final_colors, updated_colors),
interval=200
)
plt.close()
# Note: below is the part which makes it work on Colab
rc('animation', html='jshtml')
return ani
if not PDF:
display(heir_vis())
For this to work, the notion of dissimilarity between a pair of observations needs to be extended to a pair of groups of observations; this extension is called the linkage.
The four most commonly used types of linkage in hierarchical clustering are1 (a short sketch of each rule follows the list):
Complete
Single
## TODO: Find another diagram to demonstrate the other two linkages as well
# https://github.com/rasbt/python-machine-learning-book-3rd-edition/blob/master/ch11/ch11.ipynb
fig_num+=1
print(color.BOLD+color.UNDERLINE+"Figure %d: Different Linkage Methods"%(fig_num)+color.END)
Image(filename=os.path.join(image_dir,"11_07.png"), width=400)
Average
Centroid
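As a sketch (our own illustration, not from the notebook), each linkage rule reduces the set of pairwise dissimilarities between two groups of observations to a single number; the helper below and its name are ours.
# A sketch of how each linkage rule summarises the pairwise dissimilarities
# between two groups A and B as a single number.
from scipy.spatial.distance import cdist

def linkage_distance(A, B, method="complete"):
    d = cdist(A, B)                       # all pairwise Euclidean distances between the groups
    if method == "complete":
        return d.max()                    # largest pairwise dissimilarity
    if method == "single":
        return d.min()                    # smallest pairwise dissimilarity
    if method == "average":
        return d.mean()                   # mean of all pairwise dissimilarities
    if method == "centroid":
        return np.linalg.norm(A.mean(axis=0) - B.mean(axis=0))  # distance between group centroids
    raise ValueError("unknown linkage: %s" % method)

# e.g. compare two of the generated clusters under each rule
A, B = points[clusters == 0], points[clusters == 1]
for m in ["complete", "single", "average", "centroid"]:
    print(m, linkage_distance(A, B, m))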
Notes
fig, axes = plt.subplots(1,2, figsize=(15,5), sharey=True)
axes = axes.flatten()
for i, (links, title) in enumerate(zip([hierarchy.complete(points), hierarchy.single(points)],
["Complete", "Single"])):
plt.sca(axes[i])
cluster = hierarchy.dendrogram(links, ax=axes[i])
axes[i].get_xaxis().set_ticks([])
plt.title(title)
plt.ylim(0,11)
fig_num+=1
plt.suptitle("Figure %d: Dendrograms with different types of linkage"%fig_num)
plt.show()
heir_scat_plot("Complete", points, threshold=5, title = "Extra Figure: Complete Linkage")
So far we have used Euclidean distance, but other dissimilarity measures exist.
Correlation-based distance1
The choice of dissimilarity measure is important and depends on the data being clustered and the scientific question at hand.
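As a hedged sketch (not from the original notebook), scipy's linkage functions also accept a precomputed condensed distance matrix, so a correlation-based distance can be plugged in via pdist. Correlation-based distance only makes sense with three or more features, so a small hypothetical data set is generated here purely for illustration.
# A sketch: hierarchical clustering with a correlation-based distance instead of
# Euclidean distance, using a precomputed condensed distance matrix.
from scipy.spatial.distance import pdist

X_wide = np.random.RandomState(1).normal(size=(20, 10))   # 20 observations, 10 features
corr_dist = pdist(X_wide, metric='correlation')           # 1 - Pearson correlation per pair
corr_links = hierarchy.average(corr_dist)                  # linkage from the precomputed distances
fig, ax = plt.subplots(figsize=(15, 4))
hierarchy.dendrogram(corr_links, ax=ax, color_threshold=0)
plt.title("Extra Figure: Average linkage with correlation-based distance")
plt.show()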
Density-Based Spatial Clustering of Applications with Noise (DBSCAN) is a density-based clustering method.
For each instance:
If there are at least min_samples instances (including itself) within a distance $\epsilon$ of it, then it is considered a core instance3.
If it has fewer than min_samples instances within $\epsilon$, but lies within $\epsilon$ of a core instance, it is considered a border point2.
Otherwise, it is considered noise.
Notes
# from https://github.com/rasbt/python-machine-learning-book-3rd-edition/tree/master/ch11/images
Image(os.path.join(image_dir, "11_13.png"), width=500)
Once each point is labelled as either core, border, or noise, the algorithm2 proceeds as follows: core instances that lie within $\epsilon$ of one another are grouped into the same cluster; each border point is assigned to the cluster of a core instance within $\epsilon$ of it; noise points are not assigned to any cluster.
Notes
# https://github.com/ageron/handson-ml2/blob/master/09_unsupervised_learning.ipynb
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN
X, y = make_moons(n_samples=1000, noise=0.05, random_state=42)
dbscan = DBSCAN(eps=0.05, min_samples=5)
dbscan.fit(X)
dbscan2 = DBSCAN(eps=0.2)
dbscan2.fit(X)
def plot_dbscan(dbscan, X, size, show_xlabels=True, show_ylabels=True):
core_mask = np.zeros_like(dbscan.labels_, dtype=bool)
core_mask[dbscan.core_sample_indices_] = True
anomalies_mask = dbscan.labels_ == -1
non_core_mask = ~(core_mask | anomalies_mask)
cores = dbscan.components_
anomalies = X[anomalies_mask]
non_cores = X[non_core_mask]
plt.scatter(cores[:, 0], cores[:, 1],
c=dbscan.labels_[core_mask], marker='o', s=size, cmap="Paired")
plt.scatter(cores[:, 0], cores[:, 1], marker='*', s=20, c=dbscan.labels_[core_mask])
plt.scatter(anomalies[:, 0], anomalies[:, 1],
c="r", marker="x", s=100)
plt.scatter(non_cores[:, 0], non_cores[:, 1], c=dbscan.labels_[non_core_mask], marker=".")
if show_xlabels:
plt.xlabel("$x_1$", fontsize=14)
else:
plt.tick_params(labelbottom=False)
if show_ylabels:
plt.ylabel("$x_2$", fontsize=14, rotation=0)
else:
plt.tick_params(labelleft=False)
plt.title("eps={:.2f}, min_samples={}".format(dbscan.eps, dbscan.min_samples), fontsize=14)
plt.figure(figsize=(9, 3.2))
plt.subplot(121)
plot_dbscan(dbscan, X, size=100)
plt.subplot(122)
plot_dbscan(dbscan2, X, size=600, show_ylabels=False)
plt.show()
DBSCAN is not well suited to predicting the cluster assignment of new observations, so it is more commonly used in a pipeline with another classification model3.
Notes
# https://github.com/ageron/handson-ml2/blob/master/09_unsupervised_learning.ipynb
dbscan = dbscan2
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(dbscan.components_, dbscan.labels_[dbscan.core_sample_indices_])
X_new = np.array([[-0.5, 0], [0, 0.5], [1, -0.1], [2, 1]])
def plot_data(X):
plt.plot(X[:, 0], X[:, 1], 'k.', markersize=2)
def plot_centroids(centroids, weights=None, circle_color='w', cross_color='k'):
if weights is not None:
centroids = centroids[weights > weights.max() / 10]
plt.scatter(centroids[:, 0], centroids[:, 1],
marker='o', s=35, linewidths=8,
color=circle_color, zorder=10, alpha=0.9)
plt.scatter(centroids[:, 0], centroids[:, 1],
marker='x', s=2, linewidths=12,
color=cross_color, zorder=11, alpha=1)
def plot_decision_boundaries(clusterer, X, resolution=1000, show_centroids=True,
show_xlabels=True, show_ylabels=True):
mins = X.min(axis=0) - 0.1
maxs = X.max(axis=0) + 0.1
xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
np.linspace(mins[1], maxs[1], resolution))
Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
cmap="Pastel2")
plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
linewidths=1, colors='k')
plot_data(X)
if show_centroids:
plot_centroids(clusterer.cluster_centers_)
if show_xlabels:
plt.xlabel("$x_1$", fontsize=14)
else:
plt.tick_params(labelbottom=False)
if show_ylabels:
plt.ylabel("$x_2$", fontsize=14, rotation=0)
else:
plt.tick_params(labelleft=False)
plt.figure(figsize=(6, 3))
plot_decision_boundaries(knn, X, show_centroids=False)
plt.scatter(X_new[:, 0], X_new[:, 1], c="b", marker="+", s=200, zorder=10)
plt.show()
Extra
You may want to try HDBSCAN, which turns DBSCAN into a hierarchical clustering algorithm.
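A minimal sketch follows, assuming a recent scikit-learn (1.3 or later), which provides sklearn.cluster.HDBSCAN; the standalone hdbscan package offers a very similar interface.
# A minimal sketch (assumes scikit-learn >= 1.3; the standalone hdbscan package is similar).
from sklearn.cluster import HDBSCAN

hdb = HDBSCAN(min_cluster_size=5)
hdb_labels = hdb.fit_predict(X)   # X is the two-moons data from above; -1 marks noise
print("Clusters found:", np.unique(hdb_labels))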
Now might be a good time to try exercise 3.
import sys
from shutil import copyfile
# where the HTML template is located
dst = os.path.join(sys.prefix, 'lib', 'site-packages', 'nbconvert', 'templates', "classic.tplx")
# If it's not located where it should be
if not os.path.exists(dst):
# uses a nb_pdf_template
curr_path = os.path.join(os.getcwd(),"..", "Extra", "classic.tplx")
# copy where it is meant to be
copyfile(curr_path, dst)
if not PDF:
# Create HTML notes document (preferred)
!jupyter nbconvert 2_Hierarchical_and_Density_Clustering.ipynb \
--to html \
--output-dir . \
--template classic
# Create html slides (issues)
!jupyter nbconvert 2_Hierarchical_and_Density_Clustering.ipynb \
--to slides \
--output-dir . \
--TemplateExporter.exclude_input=True \
--TemplateExporter.exclude_output_prompt=True \
--SlidesExporter.reveal_scroll=True
else:
# Create pdf notes document (issues)
!jupyter nbconvert 2_Hierarchical_and_Density_Clustering.ipynb \
--to html \
--output-dir ./PDF_Prep \
--output 2_Hierarchical_and_Density_Clustering_no_code \
--TemplateExporter.exclude_input=True \
--TemplateExporter.exclude_output_prompt=True