In [1]:
# This line will add a button to toggle visibility of code blocks,
# for use with the HTML export version
from IPython.core.display import HTML
HTML('''<button style="margin:0 auto; display: block;" onclick="jQuery('.code_cell .input_area').toggle();
    jQuery('.prompt').toggle();">Toggle code</button>''')
Out[1]:
In [2]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import pandas as pd

Notes

  • Clustering has a vast range of use cases; we are going to touch on a few of them in this lecture/notebook.

6. Preprocessing

Dimensionality Reduction2

Once clustered, we can measure the affinity of each instance with each cluster (how well it fits in that cluster).

Cluster affinities can then be used as features (e.g. a $k$-dimensional vector), often giving a lower-dimensional feature space than the original feature vector while still preserving some information.
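
As a minimal sketch (the blob data and the choice of $k$ below are purely illustrative), Scikit-Learn's KMeans.transform returns the distance from each instance to every centroid, which can serve as such a reduced feature vector:

# Hedged sketch: cluster affinities (distances to centroids) as a reduced feature space
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

X_demo, _ = make_blobs(n_samples=500, n_features=100, centers=10, random_state=42)
kmeans = KMeans(n_clusters=10, random_state=42).fit(X_demo)
X_affinity = kmeans.transform(X_demo)          # distance from each instance to each of the 10 centroids
print(X_demo.shape, "->", X_affinity.shape)    # (500, 100) -> (500, 10)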

Example: Image Classification2

The simple MNIST-like digits dataset available in Scikit-Learn is a common classification example.

Notes

  • The example contains 1,797 grayscale 8×8 images representing the digits 0 to 9.
  • The example below clusters the training set into 50 clusters and replaces each image with its distances to the 50 cluster centroids.
  • As k-means here is just a preprocessing step in a classification pipeline, you can use grid search to find the "optimal" value of $k$ (a hedged sketch follows the example below).
In [3]:
# From https://github.com/ageron/handson-ml2/blob/master/09_unsupervised_learning.ipynb

X_digits, y_digits = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, random_state=42)

# initial logistic regression
log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)
log_reg.fit(X_train, y_train)
log_reg_score = log_reg.score(X_test, y_test)

print("Logistic Regression Accuracy: %.2f"%round(log_reg_score*100,2)+"%")

# K-Means preprocessing
# cluster the training set first
pipeline = Pipeline([
    ("kmeans", KMeans(n_clusters=50, random_state=42)),
    ("log_reg", LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)),
])
pipeline.fit(X_train, y_train)
pipeline_score = pipeline.score(X_test, y_test)
print("KMeans and Logistic Regression Accuracy: %.2f"%round(pipeline_score*100,2)+"%")
Logistic Regression Accuracy: 96.89%
KMeans and Logistic Regression Accuracy: 97.78%
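
As noted above, since k-means is just a preprocessing step here, $k$ can be tuned like any other pipeline hyperparameter. A minimal sketch, not from the source; the range of $k$ values and the number of folds are illustrative assumptions:

# Hedged sketch: grid search over the number of clusters in the pipeline above
from sklearn.model_selection import GridSearchCV

param_grid = dict(kmeans__n_clusters=range(30, 100, 10))   # illustrative range of k values
grid_clf = GridSearchCV(pipeline, param_grid, cv=3)
grid_clf.fit(X_train, y_train)
print(grid_clf.best_params_)                  # the k selected by cross-validation
print(grid_clf.score(X_test, y_test))         # test accuracy with that k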

7. Data Exploration and Segmentation

Data Exploration

Clustering is useful when we are looking for new patterns in data.

We may want to cluster our data into groups and analyze each cluster separately.

Example: Cancer Research1

A researcher might assay gene expression levels in patients with breast cancer.

They might look for subgroups among the breast cancer samples, or among the genes, in order to obtain a better understanding of the disease.

Notes

  • "For instance, suppose that we have a set of n observations, each with p features. The n observations could correspond to tissue samples for patients with breast cancer, and the p features could correspond to measurements collected for each tissue sample; these could be clinical measurements, such as tumor stage or grade, or they could be gene expression measurements. We may have a reason to believe that there is some heterogeneity among the n tissue samples; for instance, perhaps there are a few different unknown subtypes of breast cancer. Clustering could be used to find these subgroups. This is an unsupervised problem because we are trying to discover structure—in this case, distinct clusters—on the basis of a data set."1

Recommender Systems

In a very general sense, recommender systems are algorithms aimed at suggesting relevant items to users (e.g. movies, text, products, etc.)3.

Example: Customer Segmentation2

You can cluster your customers based on their purchases and website activity.

It is useful to know who your customers are so you can adapt products and marketing campaigns.

Notes

  • An online shopping site might try to identify groups of shoppers with similar browsing and purchase histories, and items of particular interest to these shoppers. Then an individual shopper can be preferentially shown the items in which he or she is particularly likely to be interested, based on the purchase histories of similar shoppers1.
  • For hierarchical models the choice of dissimilarity measure here is very important, as it has a strong effect on the resulting dendrogram. For instance, the type of dissimilarity measure that should be used to cluster the shoppers depends on the goal. If Euclidean distance is used, then shoppers who have bought very few items overall (i.e. infrequent users of the online shopping site) will be clustered together. If correlation-based distance is used, then shoppers with similar preferences (e.g. shoppers who have bought items A and B but never items C or D) will be clustered together, even if some shoppers with these preferences are higher-volume shoppers than others1. A small sketch comparing these two distance measures follows these notes.
    • This will also be affected by whether the variables are scaled to have standard deviation one before the dissimilarity between the observations is computed. For example, some items may be purchased more frequently than others; for instance, a shopper might buy ten pairs of socks a year, but a computer very rarely. High-frequency purchases like socks therefore tend to have a much larger effect on the inter-shopper dissimilarities, and hence on the clustering ultimately obtained, than rare purchases like computers. This may not be desirable1.
    • We might also want to scale the variables to have standard deviation one if they are measured on different scales; otherwise, the choice of units (e.g. centimeters versus kilometers) for a particular variable will greatly affect the dissimilarity measure obtained.
    • The issue of whether or not to scale the variables before performing clustering applies to K-means clustering as well1.
  • If you're interested in trying this out, this data set may be useful.
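
To make the choice of dissimilarity measure concrete, here is a minimal sketch, not from the source, using a small hypothetical purchase-count matrix (the shoppers, items, and counts are made up for illustration):

# Hedged sketch: Euclidean vs. correlation-based dissimilarity between shoppers
import numpy as np
from scipy.spatial.distance import pdist, squareform

# hypothetical purchase counts for 4 shoppers across items A, B, C, D
purchases = np.array([
    [10, 8, 0, 0],   # shopper 1: high volume, prefers A and B
    [ 1, 1, 0, 0],   # shopper 2: low volume, prefers A and B
    [ 0, 0, 9, 7],   # shopper 3: high volume, prefers C and D
    [ 0, 0, 1, 1],   # shopper 4: low volume, prefers C and D
])

for metric in ("euclidean", "correlation"):
    print(metric)
    print(np.round(squareform(pdist(purchases, metric=metric)), 2))

# Under Euclidean distance the two low-volume shoppers (2 and 4) are closest to each
# other; under correlation-based distance, shoppers with the same preferences
# (1 with 2, and 3 with 4) are closest, regardless of purchase volume.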

Extra Example: Search Recommendations1,2

A search engine might choose what search results to display to a particular individual based on the click histories of other individuals with similar search patterns.

Some search engines allow you to search for images similar to a reference image.

Notes

  • You might first apply a clustering algorithm to the images in a database; similar images then end up in the same cluster, allowing you to return other images from the reference image's cluster.2

Image Segmentation

An image can be segmented in a number of different ways:

  • Semantic Segmentation
    • All pixels part of the same object type get assigned to the same segment
  • Instance Segmentation
    • All pixels part of the same object get assigned to the same segment

Example: Color Segmentation

By clustering pixels according to color, then replacing each pixel with its cluster's mean color, you can reduce the number of colors in an image2.

This is used in many object detection and tracking systems as contours of objects become more defined2.

Notes

  • An example of Semantic Segmentation would be that, for a self-driving car's vision system, all pixels belonging to pedestrians are assigned to the same segment2.
  • An example of Instance Segmentation is that there would be a separate segment for each individual pedestrian2.
  • Semantic and Instance Segmentation are typically done using convolutional neural networks.
  • Another example of Color Segmentation is the use of satellite images to measure total forest area.
In [4]:
import os

image_dir = os.path.join(os.getcwd(),"Images")
from matplotlib.image import imread

# read in the image
image = imread(os.path.join(image_dir, "ladybug.png"))
# the first dimension is the height, second is width, third is number of color channels (RGB)
image.shape
Out[4]:
(533, 800, 3)
In [5]:
# From https://github.com/ageron/handson-ml2/blob/master/09_unsupervised_learning.ipynb

X = image.reshape(-1, 3)
kmeans = KMeans(n_clusters=8, random_state=42).fit(X)
segmented_img = kmeans.cluster_centers_[kmeans.labels_]
segmented_img = segmented_img.reshape(image.shape)
segmented_imgs = []
n_colors = (10, 8, 6, 4, 2)
for n_clusters in n_colors:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
    segmented_img = kmeans.cluster_centers_[kmeans.labels_]
    segmented_imgs.append(segmented_img.reshape(image.shape))

plt.figure(figsize=(10,5))
plt.subplots_adjust(wspace=0.05, hspace=0.1)

plt.subplot(231)
plt.imshow(image)
plt.title("Original image")
plt.axis('off')

for idx, n_clusters in enumerate(n_colors):
    plt.subplot(232 + idx)
    plt.imshow(segmented_imgs[idx])
    plt.title("{} colors".format(n_clusters))
    plt.axis('off')

plt.show()

Notes

  • In the color segmentation example above, we are using the k-means cluster centers to replace each cluster's color.
  • K-means prefers similarly sized clusters, which is why the ladybug's red color disappears below 8 colors.

8. Semi-Supervised Learning

Semi-supervised learning is used when we have lots of unlabeled examples and only a few labeled ones (a common situation).

Example: Building an MNIST-like Dataset2

Returning to the MNIST-like dataset, we got 96.89% accuracy on a test set (25% of the full data) when we trained a logistic regression using the rest (75%).

Imagine that we had only gone through and labeled what is now our test set, and still need to label the training set.

Instead of labeling all 1,347 training instances manually, we could start by training a k-means model and manually labeling only the instance closest to each cluster's centroid.

In [6]:
## https://github.com/ageron/handson-ml2/blob/master/09_unsupervised_learning.ipynb

k = 50
kmeans = KMeans(n_clusters=k, random_state=42)
X_digits_dist = kmeans.fit_transform(X_train)
representative_digit_idx = np.argmin(X_digits_dist, axis=0)
X_representative_digits = X_train[representative_digit_idx]

plt.figure(figsize=(8, 2))
for index, X_representative_digit in enumerate(X_representative_digits):
    plt.subplot(k // 10, 10, index + 1)
    plt.imshow(X_representative_digit.reshape(8, 8), cmap="binary", interpolation="bilinear")
    plt.axis('off')

plt.show()
In [7]:
y_representative_digits = np.array([
     4, 8, 0, 6, 8, 3, 7, 7, 9, 2, 
     5, 5, 8, 5, 2, 1, 2, 9, 6, 1, 
     1, 6, 9, 0, 8, 3, 0, 7, 4, 1, 
     6, 5, 2, 4, 1, 8, 6, 3, 9, 2, 
     4, 2, 9, 4, 7, 6, 2, 3, 1, 1])
#y_representative_digits = y_train[representative_digit_idx]
log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)
log_reg.fit(X_representative_digits, y_representative_digits)
score = log_reg.score(X_test, y_test)
print("KMeans 50 Labels and Logistic Regression Accuracy: %.2f"%round(score*100,2)+"%")
KMeans 50 Labels and Logistic Regression Accuracy: 92.22%

Notes

  • Getting only ~5% lower accuracy from a classifier for which we had to label just 50 observations, instead of the 1,347 in the previous training set, is pretty good.

We could further improve performance by propagating the labels we made to a percentage of the observations closest to each centroid.

In [8]:
percentile_closest = 75

X_cluster_dist = X_digits_dist[np.arange(len(X_train)), kmeans.labels_]
for i in range(k):
    in_cluster = (kmeans.labels_ == i)
    cluster_dist = X_cluster_dist[in_cluster]
    cutoff_distance = np.percentile(cluster_dist, percentile_closest)
    above_cutoff = (X_cluster_dist > cutoff_distance)
    X_cluster_dist[in_cluster & above_cutoff] = -1

partially_propagated = (X_cluster_dist != -1)
X_train_partially_propagated = X_train[partially_propagated]
y_train_propagated = np.empty(len(X_train), dtype=np.int32)
for i in range(k):
    y_train_propagated[kmeans.labels_==i] = y_representative_digits[i]
y_train_partially_propagated = y_train_propagated[partially_propagated]

label_acc = np.mean(y_train_partially_propagated == y_train[partially_propagated])

print("Propogated Label Accuracy: %.2f"%round(label_acc*100,2)+"%")

log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)
log_reg.fit(X_train_partially_propagated, y_train_partially_propagated)

score = log_reg.score(X_test, y_test)

print("Propogated KMeans and Logistic Regression Accuracy: %.2f"%round(score*100,2)+"%")
Propogated Label Accuracy: 97.51%
Propogated KMeans and Logistic Regression Accuracy: 93.56%

Active Learning2

To continue improving the model above, we could use active learning, where human experts provide labels for specific instances when the algorithm requests them.

One strategy for this is uncertainty sampling (a hedged sketch follows the list below):

  1. A model is trained on the labeled instances and used to make predictions on the unlabeled instances.
  2. The instances about which the model is most uncertain (e.g. judged by its predicted probabilities) are given to an expert to label.
  3. This process is iterated until the performance improvement stops being worth the labeling effort.

Other strategies include:

  • Labeling the instances that would result in the largest model change,
  • Labeling the instances that would result in the largest drop in validation error,
  • Labeling the instances on which different models (e.g. an SVM and a Random Forest) disagree the most.
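
A minimal sketch of one round of uncertainty sampling, reusing objects defined in the cells above; the batch size of 20 and the use of y_train to stand in for an expert are illustrative assumptions, not the source's code:

# Hedged sketch: one round of uncertainty sampling on the remaining unlabeled pool
proba = log_reg.predict_proba(X_train)               # treat X_train as the unlabeled pool
uncertainty = 1 - proba.max(axis=1)                  # low top-class probability = high uncertainty
query_idx = np.argsort(uncertainty)[-20:]            # the 20 instances the model is least sure about

# an expert would supply these labels; here we peek at y_train to simulate that step
X_labeled = np.vstack([X_representative_digits, X_train[query_idx]])
y_labeled = np.concatenate([y_representative_digits, y_train[query_idx]])

log_reg_al = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)
log_reg_al.fit(X_labeled, y_labeled)
print("After one active learning round: %.2f%%" % (log_reg_al.score(X_test, y_test) * 100))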

9. Strengths and Limitations

K-Means

Advantages

  • k-means generally has computational complexity that is linear in the number of instances $n$, the number of clusters $k$, and the number of dimensions $d$.

    • This is only true when the data has a clustering structure.
    • K-Means is generally one of the fastest clustering algorithms.
  • Note, however, that clustering methods in general are not very robust to perturbations of the data1 (see the note and sketch below); this limitation is not specific to k-means.

Notes

  • Perturbation Robustness: "For instance, suppose that we cluster n observations, and then cluster the observations again after removing a subset of the n observations at random. One would hope that the two sets of clusters obtained would be quite similar, but often this is not the case!"1
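
One way to get a feel for this on the digits data used above is to compare cluster assignments before and after randomly dropping a fraction of the observations. In the hedged sketch below, the 10% drop rate and $k = 10$ are arbitrary choices, and the adjusted Rand index is used to measure agreement:

# Hedged sketch: perturbation robustness check via the adjusted Rand index
from sklearn.metrics import adjusted_rand_score

rng = np.random.RandomState(42)
keep = rng.rand(len(X_train)) > 0.1                      # randomly drop ~10% of the training set
labels_full = KMeans(n_clusters=10, random_state=42).fit_predict(X_train)
labels_subset = KMeans(n_clusters=10, random_state=42).fit_predict(X_train[keep])

# agreement (1.0 would mean identical clusterings) on the observations both runs saw
print(adjusted_rand_score(labels_full[keep], labels_subset))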

Disadvantages

  • Does not work very well with clusters of different densities or non-spherical shapes2.
  • Requires us to pre-specify the number of clusters1.
  • Generally requires scaling to be effective1.
  • Can result in empty clusters4.
  • Can have problems with convergence4.
  • Assumes that all the data belongs in clusters with no noise5.

Notes

  • Empty clusters are dealt with in Scikit-Learn by searching for the sample that is farthest away from the centroid of the empty cluster and reassigning the centroid to that sample.
  • Convergence problems can be dealt with by changing the tolerance on the change in the within-cluster sum of squared errors required to declare convergence (e.g. tol=1e-04).
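
As a minimal sketch, the convergence tolerance and the iteration cap are exposed directly as KMeans parameters (the values shown match the library's usual defaults):

# Hedged sketch: explicit convergence tolerance and iteration cap for KMeans
km = KMeans(n_clusters=8, init="k-means++", n_init=10, max_iter=300, tol=1e-04, random_state=42)
km.fit(X_train)
print(km.n_iter_)   # iterations actually run before the tolerance was reached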
In [9]:
# from https://github.com/ageron/handson-ml2/blob/master/09_unsupervised_learning.ipynb

X1, y1 = make_blobs(n_samples=1000, centers=((4, -4), (0, 0)), random_state=42)
X1 = X1.dot(np.array([[0.374, 0.95], [0.732, 0.598]]))
X2, y2 = make_blobs(n_samples=250, centers=1, random_state=42)
X2 = X2 + [6, -8]
X = np.r_[X1, X2]
y = np.r_[y1, y2]

def plot_data(X):
    plt.plot(X[:, 0], X[:, 1], 'k.', markersize=2)

def plot_centroids(centroids, weights=None, circle_color='w', cross_color='k'):
    if weights is not None:
        centroids = centroids[weights > weights.max() / 10]
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='o', s=10, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=2, linewidths=5,
                color=cross_color, zorder=11, alpha=1)

def plot_decision_boundaries(clusterer, X, resolution=1000, show_centroids=True,
                             show_xlabels=True, show_ylabels=True):
    mins = X.min(axis=0) - 0.1
    maxs = X.max(axis=0) + 0.1
    xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
                         np.linspace(mins[1], maxs[1], resolution))
    Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
                cmap="Pastel2")
    plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
                linewidths=1, colors='k')
    plot_data(X)
    if show_centroids:
        plot_centroids(clusterer.cluster_centers_)

    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        plt.tick_params(labelbottom=False)
    if show_ylabels:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft=False)

        
kmeans_good = KMeans(n_clusters=3, init=np.array([[-1.5, 2.5], [0.5, 0], [4, 0]]), n_init=1, random_state=42)
kmeans_bad = KMeans(n_clusters=3, random_state=42)
kmeans_good.fit(X)
kmeans_bad.fit(X)

plt.figure(figsize=(10, 3.2))

plt.subplot(121)
plot_decision_boundaries(kmeans_good, X)
plt.title("Inertia = {:.1f}".format(kmeans_good.inertia_), fontsize=14)

plt.subplot(122)
plot_decision_boundaries(kmeans_bad, X, show_ylabels=False)
plt.title("Inertia = {:.1f}".format(kmeans_bad.inertia_), fontsize=14)

#save_fig("bad_kmeans_plot")
plt.suptitle("Example of poor performance on ellipsoidal blobs")
plt.tight_layout()
plt.show()

Hierarchical Clustering

Advantages

  • Results in an interpretable tree-based representation of the observations (a dendrogram)1 (see the sketch after this list).
  • Doesn't need the number of clusters to be pre-specified.
  • Agglomerative clustering is generally stable across runs and the dendrogram shows how it varies over parameter choices5.
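
A minimal sketch of that tree-based representation using SciPy, on a small illustrative blob dataset (not data from this notebook):

# Hedged sketch: build and plot a dendrogram with SciPy
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

X_demo, _ = make_blobs(n_samples=30, centers=3, random_state=42)
Z = linkage(X_demo, method="complete", metric="euclidean")   # sequence of agglomerative merges
dendrogram(Z)
plt.xlabel("observation index")
plt.ylabel("dissimilarity")
plt.show()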

Disadvantages

  • Relies on careful selection of dissimilarity measure1.
  • Generally requires scaling to be effective1.
  • Assumes a hierarchical structure to the clusters, which may be unrealistic1.
  • Assumes that all the data belongs in clusters with no noise5.

Notes

  • "The term hierarchical refers to the fact that clusters obtained by cutting the dendrogram at a given height are necessarily nested within the clusters obtained by cutting the dendrogram at any greater height. However, on an arbitrary data set, this assumption of hierarchical structure might be unrealistic. For instance, suppose that our observations correspond to a group of people with a 50–50 split of males and females, evenly split among Americans, Japanese, and French. We can imagine a scenario in which the best division into two groups might split these people by gender, and the best division into three groups might split them by nationality. In this case, the true clusters are not nested, in the sense that the best division into three groups does not result from taking the best division into two groups and splitting up one of those groups. Consequently, this situation could not be well-represented by hierarchical clustering. Due to situations such as this one, hierarchical clustering can sometimes yield worse (i.e. less accurate) results than K-means clustering for a given number of clusters."1

DBSCAN

Advantages

  • Can identify clusters of any shape2.
  • Is robust to outliers2.
  • Has only two hyperparameters (eps and min_samples)2.
  • Computational cost is roughly linear in the number of instances, about $O(m \log m)$2.
    • Although it can be $O(m^2)$ if eps is large.
  • DBSCAN is generally stable across multiple runs5.
    • But stability over varying eps and min_samples is not so good.

Disadvantages

  • The algorithm can be quite sensitive to eps5.
    • eps can also be hard to pick in practice (a common heuristic is sketched after this list).
  • Performs worse with an increasing number of features.
    • Although the same can be said for k-means and hierarchical methods too.
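
One common heuristic for choosing eps, sketched below under the assumption that min_samples has already been fixed, is to plot each point's distance to its min_samples-th nearest neighbor in sorted order and pick eps near the "elbow" of that curve (this sketch is not from the source):

# Hedged sketch: k-distance plot heuristic for choosing DBSCAN's eps
from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import make_moons
import numpy as np
import matplotlib.pyplot as plt

X_demo, _ = make_moons(n_samples=200, noise=0.05, random_state=0)
min_samples = 5
nn = NearestNeighbors(n_neighbors=min_samples + 1).fit(X_demo)   # +1: each point is its own nearest neighbor
distances, _ = nn.kneighbors(X_demo)
k_dist = np.sort(distances[:, -1])        # sorted distance to each point's min_samples-th neighbor

plt.plot(k_dist)
plt.xlabel("points sorted by k-distance")
plt.ylabel("distance to {}th nearest neighbor".format(min_samples))
plt.show()                                # pick eps near the elbow of this curve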
In [10]:
#https://github.com/rasbt/python-machine-learning-book-3rd-edition/blob/master/ch11/ch11.ipynb

from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN, AgglomerativeClustering

X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 4))

km = KMeans(n_clusters=2, random_state=0)
y_km = km.fit_predict(X)
ax1.scatter(X[y_km == 0, 0], X[y_km == 0, 1],
            edgecolor='black',
            c='lightblue', marker='o', s=40, label='cluster 1')
ax1.scatter(X[y_km == 1, 0], X[y_km == 1, 1],
            edgecolor='black',
            c='red', marker='s', s=40, label='cluster 2')
ax1.set_title('K-means clustering')

ac = AgglomerativeClustering(n_clusters=2,
                             affinity='euclidean',
                             linkage='complete')
y_ac = ac.fit_predict(X)
ax2.scatter(X[y_ac == 0, 0], X[y_ac == 0, 1], c='lightblue',
            edgecolor='black',
            marker='o', s=40, label='Cluster 1')
ax2.scatter(X[y_ac == 1, 0], X[y_ac == 1, 1], c='red',
            edgecolor='black',
            marker='s', s=40, label='Cluster 2')
ax2.set_title('Agglomerative clustering')

db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
y_db = db.fit_predict(X)
ax3.scatter(X[y_db == 0, 0], X[y_db == 0, 1],
            c='lightblue', marker='o', s=40,
            edgecolor='black', 
            label='Cluster 1')
ax3.scatter(X[y_db == 1, 0], X[y_db == 1, 1],
            c='red', marker='s', s=40,
            edgecolor='black', 
            label='Cluster 2')
ax3.set_title('DBSCAN')
plt.tight_layout()
#plt.savefig('images/11_16.png', dpi=300)
plt.show()

Extra

Other Clustering Algorithms2

Scikit-Learn implements many more clustering algorithms, so here is a brief overview of a few (a hedged usage sketch follows at the end of this list):

Affinity Propagation

Instances vote for similar instances to represent them, with representatives and their voters forming a cluster upon convergence.

  • Can find clusters of different sizes
  • Not suited for large datasets

Balanced Iterative Reducing and Clustering using Hierarchies (BIRCH)

Builds a tree structure containing just enough information to quickly assign instances to clusters, without having to store all the instances in memory.

  • Designed for large datasets
  • Can be faster than batch k-means, with similar results, provided the number of features is small ($<20$).

Mean-Shift

Places a circle on each instance (the radius of the circle being the bandwidth hyperparameter), computes the mean of all instances located within the circle, then shifts the circle so it is centered on that mean. This is iterated until all the circles stop moving.

  • Can find clusters of any shape
  • Has few hyperparameters
  • Chops clusters into pieces that have internal density variations
  • Not suited for large datasets

Spectral Clustering

Takes a similarity matrix between the instances, creates a low-dimensional embedding of it, and then runs another clustering algorithm (e.g. k-means) in that embedding.

  • Can capture complex cluster structures
  • Can be used to cut graphs
  • Not suited for large datasets
  • Poor performance when clusters have different sizes
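
All four are available in sklearn.cluster with the usual fit/predict interface. A minimal usage sketch, not from the source; the hyperparameter values are illustrative, and the moons data mirrors the earlier comparison example:

# Hedged sketch: the estimators above share Scikit-Learn's usual clustering interface
from sklearn.cluster import AffinityPropagation, Birch, MeanShift, SpectralClustering, estimate_bandwidth
from sklearn.datasets import make_moons

X_demo, _ = make_moons(n_samples=200, noise=0.05, random_state=0)

models = [
    AffinityPropagation(random_state=0),                              # no n_clusters needed
    Birch(n_clusters=2),                                              # tree-based, memory friendly
    MeanShift(bandwidth=estimate_bandwidth(X_demo, quantile=0.2)),    # bandwidth sets the circle radius
    SpectralClustering(n_clusters=2, affinity="nearest_neighbors"),   # embedding + another clusterer
]
for model in models:
    labels = model.fit_predict(X_demo)
    print(type(model).__name__, "found", len(set(labels)), "clusters")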

References

  1. James, G., Witten, D., Hastie, T., & Tibshirani, R. (2013). An introduction to statistical learning (Vol. 112, p. 18). New York: Springer.
  2. GĂ©ron, A. (2019). Hands-on machine learning with Scikit-Learn, Keras, and TensorFlow: Concepts, tools, and techniques to build intelligent systems. O'Reilly Media.
  3. https://towardsdatascience.com/introduction-to-recommender-systems-6c66cf15ada
  4. Raschka, S., & Mirjalili, V. (2019). Python machine learning: Machine learning and deep learning with Python, scikit-learn, and TensorFlow 2. Packt Publishing Ltd.
  5. https://hdbscan.readthedocs.io/en/latest/comparing_clustering_algorithms.html
In [12]:
import sys
from shutil import copyfile

# where the HTML template is located
dst = os.path.join(sys.prefix, 'lib', 'site-packages', 'nbconvert', 'templates', "classic.tplx")

# If it's not located where it should be
if not os.path.exists(dst):
    # uses a nb_pdf_template
    curr_path = os.path.join(os.getcwd(),"..", "Extra", "classic.tplx")
    # copy where it is meant to be
    copyfile(curr_path, dst)

# Create HTML notes document
!jupyter nbconvert 3_Applications.ipynb \
    --to html \
    --output-dir . \
    --template classic
!jupyter nbconvert 3_Applications.ipynb \
    --to slides \
    --output-dir . \
    --TemplateExporter.exclude_input=True \
    --TemplateExporter.exclude_output_prompt=True \
    --SlidesExporter.reveal_scroll=True

# Create pdf notes document (issues)
!jupyter nbconvert 3_Applications.ipynb \
    --to html \
    --output-dir ./PDF_Prep \
    --output 3_Applications_no_code \
    --TemplateExporter.exclude_input=True \
    --TemplateExporter.exclude_output_prompt=True
[NbConvertApp] Converting notebook 3_Applications.ipynb to html
[NbConvertApp] Writing 799822 bytes to 3_Applications.html
[NbConvertApp] Converting notebook 3_Applications.ipynb to slides
[NbConvertApp] Writing 979089 bytes to 3_Applications.slides.html
[NbConvertApp] Converting notebook 3_Applications.ipynb to html
[NbConvertApp] Writing 1034526 bytes to PDF_Prep\3_Applications_no_code.html
In [ ]: