Create data

library("mlbench")
set.seed(2015)
Cassini <- mlbench.cassini(1250)
plot(Cassini)

x <- Cassini$x
dim(x)
## [1] 1250    2
k <- 3

k-means

system.time(cl <- kmeans(x, centers=k))
##    user  system elapsed 
##   0.002   0.001   0.002
plot(x, col=cl$cluster+1)
points(cl$centers, cex=5, lwd=2)
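
k-means depends on the random initial centers and can converge to a poor local optimum. A minimal refinement (not part of the original code) is to run several random starts via the nstart argument of kmeans():

# several random restarts; kmeans() keeps the solution with the lowest
# total within-cluster sum of squares (nstart = 10 is an arbitrary choice)
cl <- kmeans(x, centers=k, nstart=10)
plot(x, col=cl$cluster+1)
points(cl$centers, cex=5, lwd=2)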

k-medoids (PAM)

library("cluster")
system.time(cl <- pam(x, k=k))
##    user  system elapsed 
##   0.107   0.000   0.107
plot(x, col=cl$cluster+1)
points(cl$medoids, cex=5, lwd=2)

CLARA: a fast version of PAM that uses sampling

system.time(cl <- clara(x, k=k))
##    user  system elapsed 
##   0.003   0.000   0.002
plot(x, col=cl$cluster+1)
points(cl$medoids, cex=5, lwd=2)
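
CLARA runs PAM on several random subsamples and keeps the best set of medoids, so its speed/quality trade-off can be tuned. A sketch using the samples and sampsize arguments of clara() (the values below are arbitrary choices, not from the original code):

# more and larger subsamples approximate a full PAM run more closely, at higher cost
cl <- clara(x, k=k, samples=50, sampsize=100)
plot(x, col=cl$clustering+1)
points(cl$medoids, cex=5, lwd=2)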

Hierarchical clustering

system.time(d <- dist(x))
##    user  system elapsed 
##   0.003   0.004   0.007
system.time(hc <- hclust(d)) ### default is complete linkage
##    user  system elapsed 
##   0.054   0.008   0.062
plot(hc, labels = FALSE)

cl <- cutree(hc, k=k)
plot(x, col=cl+1)
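
To see where this cut falls in the tree, the k clusters can be outlined directly on the dendrogram (a small sketch using rect.hclust() from base R, not part of the original code):

plot(hc, labels = FALSE)
rect.hclust(hc, k=k, border="blue")   # boxes around the k clusters returned by cutree()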

system.time(hc <- hclust(d, method="single"))
##    user  system elapsed 
##   0.054   0.004   0.058
plot(hc, labels = FALSE)

cl <- cutree(hc, k=k)
plot(x, col=cl+1)

system.time(hc <- hclust(d, method="average"))
##    user  system elapsed 
##   0.050   0.004   0.054
plot(hc, labels = FALSE)

cl <- cutree(hc, k=k)
plot(x, col=cl+1)

Spectral clustering

library(kernlab)

Spectral clustering is too slow for a data set of this size, so the call is left commented out.

#cl <- specc(x, centers=3)
#plot(x, col=cl+1)
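
As a workaround (not part of the original code), spectral clustering can be tried on a random subsample; the subsample size of 300 below is an arbitrary choice.

idx <- sample(nrow(x), 300)        # random subsample to keep specc() fast
cl <- specc(x[idx, ], centers=k)
plot(x[idx, ], col=cl)             # a specc object can be used directly as cluster labels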

Model-based clustering

Mclust() estimates a Gaussian mixture model with the EM algorithm and uses BIC to select both the number of components and the covariance structure. It tries several values of k.

library(mclust)
## Package 'mclust' version 5.4.2
## Type 'citation("mclust")' for citing this R package in publications.
system.time(cl <- Mclust(x, G=1:(k+3)))   ### try 1 to 6 components
##    user  system elapsed 
##   1.639   0.008   1.648
cl
## 'Mclust' model object: (EEV,5) 
## 
## Available components: 
##  [1] "call"           "data"           "modelName"      "n"             
##  [5] "d"              "G"              "BIC"            "bic"           
##  [9] "loglik"         "df"             "hypvol"         "parameters"    
## [13] "z"              "classification" "uncertainty"
plot(x, col=cl$classification+1)
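
To see which number of components and covariance model BIC selected, the BIC values of all fitted models can be plotted (a quick sketch using mclust's plot method):

plot(cl, what="BIC")   # the maximum corresponds to the selected (EEV, 5) model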

Density-based clustering (DBSCAN)

library(dbscan)

DBSCAN uses the parameters eps and minPts. Ester et al. (1996) suggest using minPts = 4 and choosing eps from a sorted k-dist graph.

kNNdistplot(x, k = 4)

Find the knee in the plot: the k-dist value at the knee is a good choice for eps, and points to the right of the knee are considered noise.
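
To relate the knee to the eps value used below, a horizontal reference line can be added to the k-dist plot (a small sketch, not part of the original code):

kNNdistplot(x, k = 4)
abline(h=.15, col="red", lty=2)   # eps = 0.15 is the value used in the dbscan() call below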

system.time(cl <- dbscan(x, eps=.15, minPts=4))
##    user  system elapsed 
##   0.004   0.000   0.004
cl
## DBSCAN clustering for 1250 objects.
## Parameters: eps = 0.15, minPts = 4
## The clustering contains 3 cluster(s) and 0 noise points.
## 
##   1   2   3 
## 500 500 250 
## 
## Available fields: cluster, eps, minPts
plot(x, col=cl$cluster+1)

Try other data sets (run the code above with each of the following data sets)

Cassini with noise

Cassini <- mlbench.cassini(1000)
noise <- cbind(runif(250, -3, 3), runif(250, -3, 3))
x <- rbind(Cassini$x, noise)
x <- x[sample(nrow(x)),]
plot(x)

dim(x)
## [1] 1250    2
k <- 3

Cassini (large n)

Cassini <- mlbench.cassini(12500)
plot(Cassini)

x <- Cassini$x
dim(x)
## [1] 12500     2
k <- 3

Twonorm

twonorm <- mlbench.twonorm(500, d=3)
plot(twonorm)

x <- twonorm$x
dim(x)
## [1] 500   3
k <- 2

Smiley

Smiley <- mlbench.smiley()
plot(Smiley)

x <- Smiley$x
dim(x)
## [1] 500   2
k <- 4

Spirals

Spirals <- mlbench.spirals(500,1,0.05)
plot(Spirals)

x <- Spirals$x
k <- 2