# This cell adds a button to toggle visibility of code blocks,
# for use with the HTML export version
from IPython.core.display import HTML
HTML('''<button style="margin:0 auto; display: block;" onclick="jQuery('.code_cell .input_area').toggle();
jQuery('.prompt').toggle();">Toggle code</button>''')
Dr. David Elliott
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import pandas as pd
Notes
Once clustered, we can measure the affinity of each instance with its cluster (how well it fits in that cluster).
Cluster affinities can be used as features (e.g. a $K$-dimensional vector), often giving a lower-dimensional feature space than the original feature vector while still preserving some information.
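A minimal sketch of this idea (my own illustration on toy blob data, not part of the original notebook): KMeans.fit_transform returns each instance's distance to every centroid, giving a $K$-dimensional representation.
# Sketch: cluster affinities (distances to centroids) as a reduced feature space.
# The toy data and the choice of 5 clusters are arbitrary, for illustration only.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X_toy, _ = make_blobs(n_samples=300, n_features=10, centers=5, random_state=42)
kmeans_toy = KMeans(n_clusters=5, random_state=42)
X_affinity = kmeans_toy.fit_transform(X_toy)    # distance of each instance to each centroid
print(X_toy.shape, "->", X_affinity.shape)      # (300, 10) -> (300, 5)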
The simple MNIST-like digits dataset available in Scikit-Learn is a common classification example.
Notes
# From https://github.com/ageron/handson-ml2/blob/master/09_unsupervised_learning.ipynb
X_digits, y_digits = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, random_state=42)
# initial logistic regression
log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)
log_reg.fit(X_train, y_train)
log_reg_score = log_reg.score(X_test, y_test)
print("Logistic Regression Accuracy: %.2f"%round(log_reg_score*100,2)+"%")
# K-Means preprocessing
# cluster the training set first
pipeline = Pipeline([
("kmeans", KMeans(n_clusters=50, random_state=42)),
("log_reg", LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)),
])
pipeline.fit(X_train, y_train)
pipeline_score = pipeline.score(X_test, y_test)
print("KMeans and Logistic Regression Accuracy: %.2f"%round(pipeline_score*100,2)+"%")
Clustering is useful when we are looking for new patterns in data.
It may be that we want to cluster our data into groups and analyze each cluster separately.
A researcher might assay gene expression levels in patients with breast cancer.
They might look for subgroups among the breast cancer samples, or among the genes, in order to obtain a better understanding of the disease.
Notes
In a very general sense, recommender systems are algorithms aimed at suggesting relevant items to users (e.g. movies, text, products, etc.)3.
You can cluster your customers based on their purchases and website activity.
It is useful to know who your customers are so you can adapt your products and marketing campaigns to them.
Notes
A search engine might choose what search results to display to a particular individual based on the click histories of other individuals with similar search patterns.
Some search engines allow you to search for images similar to a reference image.
Notes
An image can be segmented in a number of different ways:
By clustering pixels according to color, then replacing each pixel with its cluster's mean color, you can reduce the number of colors in an image2.
This is used in many object detection and tracking systems, as the contours of objects become more defined2.
Notes
import os
image_dir = os.path.join(os.getcwd(),"Images")
from matplotlib.image import imread
# read in the image
image = imread(os.path.join(image_dir, "ladybug.png"))
# the first dimension is the height, second is width, third is number of color channels (RGB)
image.shape
# From https://github.com/ageron/handson-ml2/blob/master/09_unsupervised_learning.ipynb
X = image.reshape(-1, 3)
kmeans = KMeans(n_clusters=8, random_state=42).fit(X)
segmented_img = kmeans.cluster_centers_[kmeans.labels_]
segmented_img = segmented_img.reshape(image.shape)
segmented_imgs = []
n_colors = (10, 8, 6, 4, 2)
for n_clusters in n_colors:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
    segmented_img = kmeans.cluster_centers_[kmeans.labels_]
    segmented_imgs.append(segmented_img.reshape(image.shape))
plt.figure(figsize=(10,5))
plt.subplots_adjust(wspace=0.05, hspace=0.1)
plt.subplot(231)
plt.imshow(image)
plt.title("Original image")
plt.axis('off')
for idx, n_clusters in enumerate(n_colors):
    plt.subplot(232 + idx)
    plt.imshow(segmented_imgs[idx])
    plt.title("{} colors".format(n_clusters))
    plt.axis('off')
plt.show()
Notes
Semi-supervised learning is used when we have lots of unlabelled examples and only a few labelled ones (a common situation).
Returning to the MNIST-like dataset, we got 96.89% accuracy on a test set (25% of the full data) when we trained a logistic regression using the rest (75%).
Imagine we had only gone through and labelled our test set, and now need to label our training set.
Instead of labelling all 1347 instances manually, we could start by training a k-means algorithm and just manually labelling the instance closest to the centroid of each cluster.
## https://github.com/ageron/handson-ml2/blob/master/09_unsupervised_learning.ipynb
k = 50
kmeans = KMeans(n_clusters=k, random_state=42)
X_digits_dist = kmeans.fit_transform(X_train)
representative_digit_idx = np.argmin(X_digits_dist, axis=0)
X_representative_digits = X_train[representative_digit_idx]
plt.figure(figsize=(8, 2))
for index, X_representative_digit in enumerate(X_representative_digits):
    plt.subplot(k // 10, 10, index + 1)
    plt.imshow(X_representative_digit.reshape(8, 8), cmap="binary", interpolation="bilinear")
    plt.axis('off')
plt.show()
y_representative_digits = np.array([
4, 8, 0, 6, 8, 3, 7, 7, 9, 2,
5, 5, 8, 5, 2, 1, 2, 9, 6, 1,
1, 6, 9, 0, 8, 3, 0, 7, 4, 1,
6, 5, 2, 4, 1, 8, 6, 3, 9, 2,
4, 2, 9, 4, 7, 6, 2, 3, 1, 1])
#y_representative_digits = y_train[representative_digit_idx]
log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)
log_reg.fit(X_representative_digits, y_representative_digits)
score = log_reg.score(X_test, y_test)
print("KMeans 50 Labels and Logistic Regression Accuracy: %.2f"%round(score*100,2)+"%")
Notes
We could further improve performance by using the labels we made to automatically label a percentage of the observations closest to each cluster's centroid (label propagation).
percentile_closest = 75
X_cluster_dist = X_digits_dist[np.arange(len(X_train)), kmeans.labels_]
for i in range(k):
    in_cluster = (kmeans.labels_ == i)
    cluster_dist = X_cluster_dist[in_cluster]
    cutoff_distance = np.percentile(cluster_dist, percentile_closest)
    above_cutoff = (X_cluster_dist > cutoff_distance)
    X_cluster_dist[in_cluster & above_cutoff] = -1
partially_propagated = (X_cluster_dist != -1)
X_train_partially_propagated = X_train[partially_propagated]
y_train_propagated = np.empty(len(X_train), dtype=np.int32)
for i in range(k):
    y_train_propagated[kmeans.labels_ == i] = y_representative_digits[i]
y_train_partially_propagated = y_train_propagated[partially_propagated]
label_acc = np.mean(y_train_partially_propagated == y_train[partially_propagated])
print("Propogated Label Accuracy: %.2f"%round(label_acc*100,2)+"%")
log_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42)
log_reg.fit(X_train_partially_propagated, y_train_partially_propagated)
score = log_reg.score(X_test, y_test)
print("Propogated KMeans and Logistic Regression Accuracy: %.2f"%round(score*100,2)+"%")
Active Learning2
To continue improving the model above, we could use active learning, where human experts provide labels for specific instances when the algorithm requests them.
A common strategy for this is uncertainty sampling: the model is trained on the labels gathered so far, and the instances it is least certain about (lowest estimated probability for the most likely class) are sent to the expert to be labelled (a minimal sketch follows below).
Other strategies include labelling the instances that would result in the largest model change or the largest drop in the model's validation error.
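Below is a minimal sketch of one uncertainty-sampling step (my own illustration, not part of the original notebook); it assumes the most recent log_reg model and X_train from above, plus a hypothetical labelling budget of 20 instances.
# Sketch of uncertainty sampling: rank "unlabelled" training instances by how
# confident the current model is, and query the least confident ones.
probas = log_reg.predict_proba(X_train)            # class probabilities per instance
confidence = probas.max(axis=1)                    # confidence in the most likely class
n_queries = 20                                     # hypothetical labelling budget
query_idx = np.argsort(confidence)[:n_queries]     # indices of the least confident instances
print("Instances to send to the expert:", query_idx)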
k-means generally has linear computational complexity with respect to the number of instances $n$, the number of clusters $k$, and the number of dimensions $d$ (roughly $O(nkd)$ per iteration).
Clustering methods generally are not very robust to perturbations to the data1.
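As a small illustration of this point (my own sketch, not taken from the references), we can refit k-means after adding a little noise to the data and compare the two labelings with the adjusted Rand index (1.0 means identical clusterings).
# Sketch: measure how much k-means assignments change under a small perturbation.
# Toy data, noise scale and k are arbitrary choices for illustration.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score
import numpy as np

X_toy, _ = make_blobs(n_samples=500, centers=4, cluster_std=2.0, random_state=0)
labels_orig = KMeans(n_clusters=4, random_state=0).fit_predict(X_toy)

rng = np.random.default_rng(0)
X_noisy = X_toy + rng.normal(scale=0.5, size=X_toy.shape)   # small random perturbation
labels_noisy = KMeans(n_clusters=4, random_state=0).fit_predict(X_noisy)

# Values below 1.0 indicate that some assignments changed after the perturbation
print("Adjusted Rand index:", adjusted_rand_score(labels_orig, labels_noisy))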
Notes
# from https://github.com/ageron/handson-ml2/blob/master/09_unsupervised_learning.ipynb
X1, y1 = make_blobs(n_samples=1000, centers=((4, -4), (0, 0)), random_state=42)
X1 = X1.dot(np.array([[0.374, 0.95], [0.732, 0.598]]))
X2, y2 = make_blobs(n_samples=250, centers=1, random_state=42)
X2 = X2 + [6, -8]
X = np.r_[X1, X2]
y = np.r_[y1, y2]
def plot_data(X):
    plt.plot(X[:, 0], X[:, 1], 'k.', markersize=2)

def plot_centroids(centroids, weights=None, circle_color='w', cross_color='k'):
    # Draw each centroid as a white circle with a black cross
    if weights is not None:
        centroids = centroids[weights > weights.max() / 10]
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='o', s=10, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=2, linewidths=5,
                color=cross_color, zorder=11, alpha=1)

def plot_decision_boundaries(clusterer, X, resolution=1000, show_centroids=True,
                             show_xlabels=True, show_ylabels=True):
    # Color the plane by predicted cluster, then overlay the data and centroids
    mins = X.min(axis=0) - 0.1
    maxs = X.max(axis=0) + 0.1
    xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
                         np.linspace(mins[1], maxs[1], resolution))
    Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
                 cmap="Pastel2")
    plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
                linewidths=1, colors='k')
    plot_data(X)
    if show_centroids:
        plot_centroids(clusterer.cluster_centers_)
    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        plt.tick_params(labelbottom=False)
    if show_ylabels:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft=False)
kmeans_good = KMeans(n_clusters=3, init=np.array([[-1.5, 2.5], [0.5, 0], [4, 0]]), n_init=1, random_state=42)
kmeans_bad = KMeans(n_clusters=3, random_state=42)
kmeans_good.fit(X)
kmeans_bad.fit(X)
plt.figure(figsize=(10, 3.2))
plt.subplot(121)
plot_decision_boundaries(kmeans_good, X)
plt.title("Inertia = {:.1f}".format(kmeans_good.inertia_), fontsize=14)
plt.subplot(122)
plot_decision_boundaries(kmeans_bad, X, show_ylabels=False)
plt.title("Inertia = {:.1f}".format(kmeans_bad.inertia_), fontsize=14)
#save_fig("bad_kmeans_plot")
plt.suptitle("Example of poor performance on ellipsoidal blobs")
plt.tight_layout()
plt.show()
Notes
DBSCAN has two main hyperparameters (eps and min_samples)2. Results can be poor if eps is large, and when the density varies across the data no single choice of eps and min_samples works well. In the example below, eps is 0.2 and min_samples is 5.
#https://github.com/rasbt/python-machine-learning-book-3rd-edition/blob/master/ch11/ch11.ipynb
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN, AgglomerativeClustering
X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 4))
km = KMeans(n_clusters=2, random_state=0)
y_km = km.fit_predict(X)
ax1.scatter(X[y_km == 0, 0], X[y_km == 0, 1],
edgecolor='black',
c='lightblue', marker='o', s=40, label='cluster 1')
ax1.scatter(X[y_km == 1, 0], X[y_km == 1, 1],
edgecolor='black',
c='red', marker='s', s=40, label='cluster 2')
ax1.set_title('K-means clustering')
ac = AgglomerativeClustering(n_clusters=2,
affinity='euclidean',
linkage='complete')
y_ac = ac.fit_predict(X)
ax2.scatter(X[y_ac == 0, 0], X[y_ac == 0, 1], c='lightblue',
edgecolor='black',
marker='o', s=40, label='Cluster 1')
ax2.scatter(X[y_ac == 1, 0], X[y_ac == 1, 1], c='red',
edgecolor='black',
marker='s', s=40, label='Cluster 2')
ax2.set_title('Agglomerative clustering')
db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
y_db = db.fit_predict(X)
ax3.scatter(X[y_db == 0, 0], X[y_db == 0, 1],
c='lightblue', marker='o', s=40,
edgecolor='black',
label='Cluster 1')
ax3.scatter(X[y_db == 1, 0], X[y_db == 1, 1],
c='red', marker='s', s=40,
edgecolor='black',
label='Cluster 2')
ax3.set_title('DBSCAN')
plt.tight_layout()
#plt.savefig('images/11_16.png', dpi=300)
plt.show()
Scikit-Learn implements many more clustering algorithms, so here is a brief overview (a short usage sketch follows the list):
Affinity Propagation
Instances vote for similar instances to represent them, with representatives and voters forming a cluster upon convergence.
Balanced Iterative Reducing and Clustering using Hierarchies (BIRCH)
Builds a minimal tree structure to quickly assign instances to clusters without having to store them all in memory.
Mean-Shift
Places a circle on each instance (the radius of the circle being a bandwidth hyperparameter), calculates the mean of all instances located within the circle, then shifts the circle so it is centred on the mean. This is iterated until all circles stop moving.
Spectral Clustering
Creates a low-dimensional embedding for a similarity matrix between instances, and then uses another clustering algorithm (e.g. K-means).
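A rough sketch of how these estimators can be called in Scikit-Learn (my own illustration; the toy data and hyperparameters are arbitrary choices), showing that they all share the same fit/predict interface as KMeans:
# Sketch: fitting the clustering algorithms mentioned above on toy blob data.
import numpy as np
from sklearn.cluster import AffinityPropagation, Birch, MeanShift, SpectralClustering
from sklearn.datasets import make_blobs

X_toy, _ = make_blobs(n_samples=300, centers=3, random_state=42)
models = {
    "Affinity Propagation": AffinityPropagation(random_state=42),
    "BIRCH": Birch(n_clusters=3),
    "Mean-Shift": MeanShift(),                          # bandwidth estimated automatically
    "Spectral Clustering": SpectralClustering(n_clusters=3, random_state=42),
}
for name, model in models.items():
    labels = model.fit_predict(X_toy)
    print(name, "found", len(np.unique(labels)), "clusters")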
import sys
from shutil import copyfile
# where the HTML template is located
dst = os.path.join(sys.prefix, 'lib', 'site-packages', 'nbconvert', 'templates', "classic.tplx")
# If it's not located where it should be
if not os.path.exists(dst):
    # uses a nb_pdf_template
    curr_path = os.path.join(os.getcwd(), "..", "Extra", "classic.tplx")
    # copy it to where it is meant to be
    copyfile(curr_path, dst)
# Create HTML notes document
!jupyter nbconvert 3_Applications.ipynb \
--to html \
--output-dir . \
--template classic
!jupyter nbconvert 3_Applications.ipynb \
--to slides \
--output-dir . \
--TemplateExporter.exclude_input=True \
--TemplateExporter.exclude_output_prompt=True \
--SlidesExporter.reveal_scroll=True
# Create pdf notes document (issues)
!jupyter nbconvert 3_Applications.ipynb \
--to html \
--output-dir ./PDF_Prep \
--output 3_Applications_no_code \
--TemplateExporter.exclude_input=True \
--TemplateExporter.exclude_output_prompt=True