In [1]:
# This line will add a button to toggle visibility of code blocks,
# for use with the HTML export version
from IPython.core.display import HTML
HTML('''<button style="margin:0 auto; display: block;" onclick="jQuery('.code_cell .input_area').toggle();
    jQuery('.prompt').toggle();">Toggle code</button>''')
Out[1]:


Week 10 - Clustering

Dr. David Elliott

  4. Hierarchical Clustering

  5. DBSCAN

4. Hierarchical Clustering

Hierarchical clustering is an approach to clustering which does not require that we commit to a particular number of clusters a priori.

Instead we can determine the number of clusters using a tree-based representation of the observations, called a dendrogram1.

In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import matplotlib
import os
from IPython.display import Image
from sklearn.cluster import KMeans
from scipy.cluster import hierarchy
import warnings 

matplotlib.rcParams['animation.embed_limit'] = 30000000.0

image_dir = os.path.join(os.getcwd(),"Images")

class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

# Initial fig number
fig_num = 10
anim_num = 5

plt.rcParams['figure.dpi'] = 120
# golden ratio, used for figure aspect ratios
gr = 1.618

height_pix = 500
width_pix = height_pix*gr

height_inch = 4
width_inch = height_inch*gr

# Pdf conversion can't seem to handle the animations
PDF=False
In [3]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

CB_color_cycle = ['#377eb8', '#ff7f00', '#4daf4a',
                  '#f781bf', '#a65628', '#984ea3',
                  '#999999', '#e41a1c', '#dede00']

# set a random seed
np.random.seed(1)

# A list of centres of clusters
true_centres = [[-5, 0], [0, 0], [2, 3]]

points, clusters = make_blobs(
    n_samples=[18, 11, 16], # number of samples in each cluster
    centers=true_centres,     # where are the centres?
)


labelled_points = pd.DataFrame({'cluster': clusters,
                                'x1': points[:, 0],
                                'x2': points[:, 1]})

def clusters_plt(title):
    sns.scatterplot(x = 'x1', y = 'x2', hue="cluster", data = labelled_points, alpha=0.7, 
                    palette = CB_color_cycle[:3], legend=False)
    plt.title(title)
    plt.show()

Agglomerative (or bottom-up) clustering is the most common type of hierarchical clustering and refers to the fact that a dendrogram (generally depicted as an upside-down tree) is built starting from the leaves and combining clusters up to the trunk1.

Notes

  • Each leaf of the dendrogram represents an observation and as we move up the tree, some leaves begin to fuse into branches. These correspond to observations that are similar to each other1.

  • As we move higher up the tree, branches themselves fuse, either with leaves or other branches1.

  • The earlier (lower in the tree) fusions occur, the more similar the groups of observations are to each other. On the other hand, observations that fuse later (near the top of the tree) can be quite different1.

In [4]:
fig, axes = plt.subplots(figsize=(15,5))
cluster = hierarchy.dendrogram(hierarchy.complete(points), ax= axes, color_threshold=0)
axes.get_xaxis().set_ticks([])
fig_num+=1
plt.title("Figure %d: Basic Dendrogram"%fig_num)
plt.show()

Data Example

We will use the following generated data.

There are three distinct classes, shown in separate colors. However, we will treat these class labels as unknown and will seek to cluster the observations in order to discover the classes from the data1.

In [5]:
fig_num+=1
clusters_plt("Figure %d: Forty-five observations generated in two-dimensional space."%fig_num)

Each leaf of the dendrogram represents one of the 45 observations.

For any two observations, we can look for the point in the tree where branches containing those two observations are first fused.

The height of this fusion indicates how different the two observations are. Observations that fuse at the bottom of the tree are quite similar to each other, whereas observations that fuse close to the top will generally be quite different1.

  • Observations close to each other on the horizontal axis are not necessarily similar; we need to use the vertical axis to see where they are fused (see observations 37 and 2, for example; the sketch below makes this precise)
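
The height at which two observations first fuse is known as their cophenetic distance. A minimal sketch, assuming the points array and imports defined above, that reads this height off the complete-linkage tree for observations 2 and 37:

from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import squareform

# cophenet() returns the condensed matrix of fusion heights;
# squareform() turns it into an n x n matrix we can index directly
links = hierarchy.complete(points)
coph = squareform(cophenet(links))

print("observations 2 and 37 first fuse at height", round(coph[2, 37], 2))
print("their plain Euclidean distance is", round(np.linalg.norm(points[2] - points[37]), 2))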
In [6]:
def highlight_points(X, highlight, ax):
    for h in highlight:

        circle = plt.Circle((X[h,0]+0.1, X[h,1]+0.1), 0.15, color='r',fill=False)
        ax.add_patch(circle)


def numbered_scatter(X, ax=None, color_dict={}, title=None, highlight=None):
    
    if ax:
        plt.sca(ax)
    else:
        sns.set(rc={"figure.figsize":(10,5)})
        
    if not color_dict:
        for i in range(len(X)):
            color_dict[i]="purple"

    ax = sns.scatterplot(x=X[:,0], y=X[:,1], alpha=0)
    
    if title:
        ax.set(title=title)
    # label points on the plot
    for i, [x, y] in enumerate(zip(X[:,0], X[:,1])):
        # the position of the data label relative to the data point can be adjusted by adding/subtracting a value from the x &/ y coordinates
        plt.text(x = x, # x-coordinate position of data label
        y = y, # y-coordinate position of data label
        s = "{:.0f}".format(i), # data label, formatted to ignore decimals
        color = color_dict[i]) # set colour
        
    if highlight:
        highlight_points(X, highlight, ax)

def heir_scat_plot(linkage, X, threshold=0, highlight=None, title=None):
    #sns.set_theme(style = "whitegrid") 
    links_list = ["Complete", "Average", "Single"]
    if linkage not in links_list:
        raise NameError("linkage needs to be in: " +str(links_list))
    
    fig, (ax1,ax2) = plt.subplots(2,1, figsize=(15,12))
    
    if title:
        plt.suptitle(title)
    
    # hierarchy (dendrogram) plot
    plt.sca(ax1)
    if linkage == "Complete":
        links = hierarchy.complete(X)
    elif linkage == "Average":
        links = hierarchy.average(X)
    elif linkage == "Single":
        links = hierarchy.single(X)
    hierarchy.set_link_color_palette(CB_color_cycle)
    cluster = hierarchy.dendrogram(links, color_threshold=threshold)
    
    if threshold:
        plt.hlines(threshold, xmin=0., xmax=pd.DataFrame(cluster['icoord']).max().max(), colors ='k', linestyles = 'dashed')
    
    if highlight:
        ax_txt_labels = ax1.get_xticklabels()
        for label in ax_txt_labels:
            if int(label.get_text()) in highlight:
                label.set_fontweight("bold")
                label.set_fontsize(label.get_fontsize()*2)
    

    plt.xticks(rotation = 0)
    
    numbered_scatter(X, ax2, dict(zip(cluster['leaves'], cluster['leaves_color_list'])), highlight=highlight)

    plt.tight_layout()
    plt.show()

fig_num+=1
heir_scat_plot("Complete", points, highlight=[38,3,22,23,2,30],
               title = "Figure %d: Complete Linkage"%fig_num)

In order to identify clusters using a dendrogram, we make a horizontal cut across it. The distinct sets of observations beneath the cut can be interpreted as clusters.

A dendrogram can be used to obtain any number of clusters. In practice, it is common to select a sensible number of clusters by eye, based on the heights of the fusions and the number of clusters desired; a sketch of making such a cut programmatically follows the notes below.

However, often the choice of where to cut the dendrogram is not so clear.

Notes

  • "The height of the cut to the dendrogram serves the same role as the K in K-means clustering: it controls the number of clusters obtained."1
  • "the dendrogram from the left-hand panel, cut at a height of nine (indicated by the dashed line). This cut results in two distinct clusters, shown in different colors. Right: the dendrogram from the left-hand panel, now cut at a height of five. This cut results in three distinct clusters, shown in different colors."1
In [7]:
fig, axes = plt.subplots(1,3, figsize=(15,5), sharey=True)

axes = axes.flatten()

thesh_list = [0,8,6]

for i in range(3):
    
    plt.sca(axes[i])
    cluster = hierarchy.dendrogram(hierarchy.complete(points), ax=axes[i], color_threshold=thesh_list[i])
    axes[i].get_xaxis().set_ticks([])
    if thesh_list[i]:
        plt.hlines(thesh_list[i], xmin=0., xmax=pd.DataFrame(cluster['icoord']).max().max(), 
                   colors ='k', linestyles = 'dashed')

fig_num+=1
plt.suptitle("Figure %d: Dendrogram obtained from hierarchical clustering with complete linkage"%fig_num)
plt.show()
In [8]:
row_clusters = hierarchy.complete(points)
pd.DataFrame(row_clusters,
             columns=['row label 1', 'row label 2',
                      'distance', 'no. of items in clust.'],
             index=['cluster %d' % (i + 1)
                    for i in range(row_clusters.shape[0])]).head()
Out[8]:
row label 1 row label 2 distance no. of items in clust.
cluster 1 22.0 23.0 0.076208 2.0
cluster 2 27.0 35.0 0.125870 2.0
cluster 3 21.0 46.0 0.135057 3.0
cluster 4 36.0 39.0 0.191957 2.0
cluster 5 5.0 15.0 0.223856 2.0

Hierarchical clustering dendrograms are often used in combination with a heat map, which represents each individual value with a color2.
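
Seaborn's clustermap produces a dendrogram-plus-heat-map figure in a single call; a sketch is shown here for comparison, while the cell below builds the same kind of figure manually with matplotlib for finer control.

# one-call dendrogram + heat map on the generated points, using the same
# complete linkage as above; rows are reordered to follow the dendrogram
sns.clustermap(pd.DataFrame(points, columns=['X', 'Y']),
               method='complete', col_cluster=False,
               cmap='hot_r', figsize=(6, 6))
plt.show()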

In [9]:
# altered from https://github.com/rasbt/python-machine-learning-book-3rd-edition/blob/master/ch11/ch11.ipynb

df = pd.DataFrame(points, columns = ['X', 'Y'])

# plot row dendrogram
fig = plt.figure(figsize=(8, 8), facecolor='white')
axd = fig.add_axes([0.09, 0.1, 0.5, 0.6])

# note: for matplotlib < v1.5.1, please use orientation='right'
row_dendr = hierarchy.dendrogram(row_clusters, orientation='left')

# reorder data with respect to clustering
df_rowclust = df.iloc[row_dendr['leaves'][::-1]]

axd.set_xticks([])
axd.set_yticks([])

# remove axes spines from dendrogram
for i in axd.spines.values():
    i.set_visible(False)

# plot heatmap
axm = fig.add_axes([0.59, 0.1, 0.3, 0.6])  # x-pos, y-pos, width, height
plt.grid(False)
cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r', aspect='auto')
fig.colorbar(cax)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    axm.set_xticklabels([''] + list(df_rowclust.columns))
    axm.set_yticklabels([''])

fig_num+=1
plt.suptitle("Figure %d: Dendrogram with Heatmap"%fig_num, y=0.78)

plt.savefig(os.path.join(image_dir,"Dendrogram.png"), bbox_inches="tight")
plt.show()

The Hierarchical Clustering Algorithm1

  1. Begin with n observations and a measure (such as Euclidean distance) of all the $\binom{n}{2} = \frac{n(n-1)}{2}$ pairwise dissimilarities. Treat each observation as its own cluster.

  2. For $i = n, n−1 , \ldots , 2$:

    (a) Examine all pairwise inter-cluster dissimilarities among the $i$ clusters and identify the pair of clusters that are least dissimilar (that is, most similar). Fuse these two clusters. The dissimilarity between these two clusters indicates the height in the dendrogram at which the fusion should be placed.

    (b) Compute the new pairwise inter-cluster dissimilarities among the $i − 1$ remaining clusters.

Notes

  • The algorithm uses a dissimilarity measure between each pair of observations (e.g. Euclidean distance).

  • It then proceeds iteratively, starting at the bottom of the dendrogram where each observation is treated as its own cluster. The two clusters that are most similar to each other are then fused so that there are now $n - 1$ clusters. The algorithm proceeds in this fashion until all of the observations belong to one single cluster (a small from-scratch sketch follows these notes).
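
To make the algorithm above concrete, here is a minimal from-scratch sketch of complete-linkage agglomerative clustering; it is written for readability rather than efficiency, and the small toy array is a hypothetical example.

from scipy.spatial.distance import cdist

def naive_complete_linkage(X):
    """Return the merge history as (cluster_a, cluster_b, height) tuples."""
    # step 1: treat each observation as its own cluster
    clusters = {i: [i] for i in range(len(X))}
    merges = []
    while len(clusters) > 1:
        # step 2(a): find the least dissimilar pair of clusters under
        # complete linkage (the *maximal* pairwise distance between them)
        best = None
        for a in clusters:
            for b in clusters:
                if a < b:
                    d = cdist(X[clusters[a]], X[clusters[b]]).max()
                    if best is None or d < best[2]:
                        best = (a, b, d)
        a, b, d = best
        merges.append(best)  # the fusion is placed at height d in the dendrogram
        # step 2(b): fuse the pair; dissimilarities to the new cluster are
        # recomputed from scratch on the next pass of the while loop
        clusters[a] = clusters[a] + clusters.pop(b)
    return merges

toy = np.array([[0., 0.], [0., 1.], [5., 5.], [5., 6.]])  # hypothetical points
for a, b, height in naive_complete_linkage(toy):
    print("fuse clusters", a, "and", b, "at height", round(height, 2))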

In [10]:
from matplotlib import animation, rc
from IPython.display import HTML
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

kclust = labelled_points[['x1', 'x2']]

def update_plot(i, kclust, dist_df, scat, final_colors, curr_colors):

    if i == 0:
        scat.set_array(curr_colors)
    
    else:
        child_1 = dist_df['child_1'][int(i-1)]
        child_2 = dist_df['child_2'][int(i-1)]

        if child_1 < len(final_colors):
            curr_colors[child_1]=final_colors[child_1]
        if child_2 < len(final_colors):
            curr_colors[child_2]=final_colors[child_2]

        # changes color
        scat.set_array(curr_colors)
                
    
    return scat,

def heir_vis(n_clusters=1, random_state=1):
  
    cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='average', compute_distances =True)
    cluster.fit(points)

    dist_df = pd.DataFrame(cluster.children_,cluster.distances_).reset_index()
    dist_df.columns = ["distances", "child_1", "child_2"]
    
    final_colors = cluster.labels_
    updated_colors = np.array(range(len(final_colors)))

    fig = plt.figure(figsize =(8,5))
    
    scat1 = plt.scatter(kclust['x1'], kclust['x2'], c=updated_colors, cmap=plt.cm.prism)
    
    ani = animation.FuncAnimation(fig, update_plot, 
                                  frames=len(dist_df)+1,
                                  fargs=(kclust, dist_df, scat1, final_colors, updated_colors),
                                  interval=200
                                 )

    plt.close()
    # Note: below is the part which makes it work on Colab
    rc('animation', html='jshtml')
    return ani

if not PDF:
    display(heir_vis())

Dissimilarity Measure

For this to work, the notion of dissimilarity between a pair of observations needs to be extended to a pair of groups of observations; this is done through the choice of linkage.

The four most commonly-used types of linkage in hierarchical clustering are1:

Complete

  • Maximal intercluster dissimilarity
  • Compute all pairwise dissimilarities between the observations in cluster A and the observations in cluster B, and record the largest of these dissimilarities.

Single

  • Minimal intercluster dissimilarity.
  • Compute all pairwise dissimilarities between the observations in cluster A and the observations in cluster B, and record the smallest of these dissimilarities.
  • Single linkage can result in extended, trailing clusters in which single observations are fused one-at-a-time.
In [11]:
## TODO: Find another diagram to demonstrate the other two linkages as well

# https://github.com/rasbt/python-machine-learning-book-3rd-edition/blob/master/ch11/ch11.ipynb
fig_num+=1
print(color.BOLD+color.UNDERLINE+"Figure %d: Different Linkage Methods"%(fig_num)+color.END)
Image(filename=os.path.join(image_dir,"11_07.png"), width=400)
Figure 16: Different Linkage Methods
Out[11]:

Average

  • Mean intercluster dissimilarity.
  • Compute all pairwise dissimilarities between the observations in cluster A and the observations in cluster B, and record the average of these dissimilarities.

Centroid

  • Dissimilarity between the centroid for cluster A (a mean vector of length p) and the centroid for cluster B.
  • Centroid linkage can result in undesirable inversions, whereby two clusters are fused at a height below either of the individual clusters in the dendrogram1.

Notes

  • "Average, complete, and single linkage are most popular among statisticians. Average and complete linkage are generally preferred over single linkage, as they tend to yield more balanced dendrograms."1
In [12]:
fig, axes = plt.subplots(1,2, figsize=(15,5), sharey=True)

axes = axes.flatten()

for i, (links, title) in enumerate(zip([hierarchy.complete(points), hierarchy.single(points)],
                                       ["Complete", "Single"])):
    
    plt.sca(axes[i])
    cluster = hierarchy.dendrogram(links, ax=axes[i])
    axes[i].get_xaxis().set_ticks([])
    plt.title(title)
        
plt.ylim(0,11)
fig_num+=1
plt.suptitle("Figure %d: Dendrograms with different types of linkage"%fig_num)
plt.show()
In [13]:
heir_scat_plot("Complete", points, threshold=5, title = "Extra Figure: Complete Linkage")

Distance Measure

So far we have used Euclidean distance but other metrics exist.

Correlation-based distance1

  • Considers two observations to be similar if their features are highly correlated.
  • Observations may be considered similar despite being far apart in terms of Euclidean distance.

The choice of dissimilarity measure is important and depends on the data being clustered and the scientific question at hand.
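
As a sketch of swapping in a different metric, SciPy's pdist can compute correlation-based distances, and the resulting condensed distance matrix can be passed straight to linkage. Correlation-based distance only makes sense with several features per observation, so a small hypothetical higher-dimensional dataset is used here rather than our two-dimensional points.

from scipy.spatial.distance import pdist

# hypothetical toy data: 6 observations measured on 10 features
rng = np.random.RandomState(0)
toy_hd = rng.normal(size=(6, 10))

# condensed matrix of correlation distances (1 - Pearson correlation
# between the feature vectors of each pair of observations)
corr_dist = pdist(toy_hd, metric='correlation')

# average linkage on the precomputed dissimilarities
links = hierarchy.linkage(corr_dist, method='average')
print(pd.DataFrame(links, columns=['merge 1', 'merge 2', 'distance', 'size']))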

5. DBSCAN

Density-based Spatial Clustering of Applications with Noise (DBSCAN) is a density-based clustering method.

For each instance:

  • The algorithm counts how many instances are located within a small distance, $\epsilon$, of it3.
  • If an instance has at least min_samples instances around it, then it is considered a core instance3.
  • If a point has fewer than min_samples neighbours within $\epsilon$, but lies within $\epsilon$ of a core instance, it is considered a border point2.
  • All points that are neither core points nor border points are considered noise points (or anomalies)2.

Notes

  • Clusters are defined as continuous regions of high density3.
  • The region around each instance is called an $\epsilon$-neighbourhood3.
  • Core instances are instances located in dense regions3.
In [14]:
# from https://github.com/rasbt/python-machine-learning-book-3rd-edition/tree/master/ch11/images
Image(os.path.join(image_dir, "11_13.png"), width=500)
Out[14]:

Once each point is labelled as either core, border, or noise, the algorithm2:

  • Forms a separate cluster for each core point or connected group of core points, provided they are no farther than $\epsilon$ away from each other.
  • Assigns each border point to the cluster of its corresponding core point.

Notes

  • As a neighbourhood can contain multiple core instances, a single cluster can grow into a long sequence of connected core instances3.
  • All instances in the neighbourhood of a core instance belong to a cluster, including other core instances3.
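
A minimal sketch, using scikit-learn's fitted DBSCAN attributes, of splitting our generated points into core, border, and noise points in the sense described above; the eps and min_samples values are hypothetical choices for illustration.

from sklearn.cluster import DBSCAN

db = DBSCAN(eps=1.0, min_samples=4).fit(points)

core_mask = np.zeros(len(points), dtype=bool)
core_mask[db.core_sample_indices_] = True   # core instances
noise_mask = db.labels_ == -1               # labelled -1 by DBSCAN
border_mask = ~core_mask & ~noise_mask      # in a cluster, but not core

print('core:', core_mask.sum(), 'border:', border_mask.sum(), 'noise:', noise_mask.sum())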
In [15]:
# https://github.com/ageron/handson-ml2/blob/master/09_unsupervised_learning.ipynb

from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN

X, y = make_moons(n_samples=1000, noise=0.05, random_state=42)
dbscan = DBSCAN(eps=0.05, min_samples=5)
dbscan.fit(X)
dbscan2 = DBSCAN(eps=0.2)
dbscan2.fit(X)

def plot_dbscan(dbscan, X, size, show_xlabels=True, show_ylabels=True):
    core_mask = np.zeros_like(dbscan.labels_, dtype=bool)
    core_mask[dbscan.core_sample_indices_] = True
    anomalies_mask = dbscan.labels_ == -1
    non_core_mask = ~(core_mask | anomalies_mask)

    cores = dbscan.components_
    anomalies = X[anomalies_mask]
    non_cores = X[non_core_mask]
    
    plt.scatter(cores[:, 0], cores[:, 1],
                c=dbscan.labels_[core_mask], marker='o', s=size, cmap="Paired")
    plt.scatter(cores[:, 0], cores[:, 1], marker='*', s=20, c=dbscan.labels_[core_mask])
    plt.scatter(anomalies[:, 0], anomalies[:, 1],
                c="r", marker="x", s=100)
    plt.scatter(non_cores[:, 0], non_cores[:, 1], c=dbscan.labels_[non_core_mask], marker=".")
    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        plt.tick_params(labelbottom=False)
    if show_ylabels:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft=False)
    plt.title("eps={:.2f}, min_samples={}".format(dbscan.eps, dbscan.min_samples), fontsize=14)
    
plt.figure(figsize=(9, 3.2))

plt.subplot(121)
plot_dbscan(dbscan, X, size=100)

plt.subplot(122)
plot_dbscan(dbscan2, X, size=600, show_ylabels=False)

plt.show()

DBSCAN itself cannot predict the cluster assignment of new observations, so it is more commonly used in a pipeline with another classification model3.

Notes

  • We could train a model using only the core instances, all instances, or just the anomalies, depending on what we are trying to achieve.
  • If used in combination with an algorithm such as k-nearest neighbors, we can also set a maximum distance from the clusters beyond which new points are considered anomalies (a short sketch of this follows the figure below); see pg. 256 of Géron (2019).
In [16]:
# https://github.com/ageron/handson-ml2/blob/master/09_unsupervised_learning.ipynb

dbscan = dbscan2
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(dbscan.components_, dbscan.labels_[dbscan.core_sample_indices_])
X_new = np.array([[-0.5, 0], [0, 0.5], [1, -0.1], [2, 1]])

def plot_data(X):
    plt.plot(X[:, 0], X[:, 1], 'k.', markersize=2)

def plot_centroids(centroids, weights=None, circle_color='w', cross_color='k'):
    if weights is not None:
        centroids = centroids[weights > weights.max() / 10]
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='o', s=35, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=2, linewidths=12,
                color=cross_color, zorder=11, alpha=1)

def plot_decision_boundaries(clusterer, X, resolution=1000, show_centroids=True,
                             show_xlabels=True, show_ylabels=True):
    mins = X.min(axis=0) - 0.1
    maxs = X.max(axis=0) + 0.1
    xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
                         np.linspace(mins[1], maxs[1], resolution))
    Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
                cmap="Pastel2")
    plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
                linewidths=1, colors='k')
    plot_data(X)
    if show_centroids:
        plot_centroids(clusterer.cluster_centers_)

    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        plt.tick_params(labelbottom=False)
    if show_ylabels:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft=False)

plt.figure(figsize=(6, 3))
plot_decision_boundaries(knn, X, show_centroids=False)
plt.scatter(X_new[:, 0], X_new[:, 1], c="b", marker="+", s=200, zorder=10)
plt.show()
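
Following the note above about setting a maximum distance, here is a sketch of the idea described by Géron (2019): look up the distance to the nearest core instance and flag new points beyond a (hypothetical) threshold as anomalies.

# distance to, and index of, the single nearest core instance for each new point
y_dist, y_pred_idx = knn.kneighbors(X_new, n_neighbors=1)

# cluster label of that nearest core instance
y_pred = dbscan.labels_[dbscan.core_sample_indices_][y_pred_idx.ravel()]

# hypothetical maximum distance: anything farther than 0.2 is marked as an anomaly
y_pred[y_dist.ravel() > 0.2] = -1
print(y_pred)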

Extra

You may want to try HDBSCAN, which extends DBSCAN into a hierarchical clustering algorithm.
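
A two-line sketch, assuming the third-party hdbscan package is installed (recent versions of scikit-learn also ship sklearn.cluster.HDBSCAN); min_cluster_size is the main parameter to tune.

import hdbscan  # third-party package: pip install hdbscan

hdb = hdbscan.HDBSCAN(min_cluster_size=10).fit(X)
print(np.unique(hdb.labels_))   # -1 marks points treated as noise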

Associated Exercises

Now might be a good time to try exercise 3.

References

  1. James, G., Witten, D., Hastie, T., & Tibshirani, R. (2013). An Introduction to Statistical Learning (Vol. 112). New York: Springer.
  2. Raschka, S., & Mirjalili, V. (2019). Python machine learning: Machine learning and deep learning with Python, scikit-learn, and TensorFlow 2. Packt Publishing Ltd.
  3. Géron, A. (2019). Hands-on machine learning with Scikit-Learn, Keras, and TensorFlow: Concepts, tools, and techniques to build intelligent systems. O'Reilly Media.
In [ ]:
import sys
from shutil import copyfile

# where the HTML template is located
dst = os.path.join(sys.prefix, 'lib', 'site-packages', 'nbconvert', 'templates', "classic.tplx")

# If its not located where it should be
if not os.path.exists(dst):
    # uses a nb_pdf_template
    curr_path = os.path.join(os.getcwd(),"..", "Extra", "classic.tplx")
    # copy where it is meant to be
    copyfile(curr_path, dst)

if not PDF:   
    # Create HTML notes document (preferred)
    !jupyter nbconvert 2_Hierarchical_and_Density_Clustering.ipynb \
        --to html \
        --output-dir . \
        --template classic
    # Create html slides (issues)
    !jupyter nbconvert 2_Hierarchical_and_Density_Clustering.ipynb \
        --to slides \
        --output-dir . \
        --TemplateExporter.exclude_input=True \
        --TemplateExporter.exclude_output_prompt=True \
        --SlidesExporter.reveal_scroll=True
    
else:
    # Create pdf notes document (issues)
    !jupyter nbconvert 2_Hierarchical_and_Density_Clustering.ipynb \
        --to html \
        --output-dir ./PDF_Prep \
        --output 2_Hierarchical_and_Density_Clustering_no_code \
        --TemplateExporter.exclude_input=True \
        --TemplateExporter.exclude_output_prompt=True
In [ ]: