Create data

library("mlbench")
set.seed(2015)
Cassini <- mlbench.cassini(1250)
plot(Cassini)

x<-Cassini$x
dim(x)
## [1] 1250    2
k<-3

k-means

system.time(cl <- kmeans(x, centers=k))
##    user  system elapsed 
##   0.006   0.000   0.011
plot(x, col=cl$cluster+1)
points(cl$centers, cex=5, lwd=2)

k-medoids (PAM)

library("cluster")
system.time(cl <- pam(x, k=k))
##    user  system elapsed 
##   0.341   0.004   0.345
plot(x, col=cl$cluster+1)
points(cl$medoids, cex=5, lwd=2)

CLARA: fast version of PAM with sampling

system.time(cl <- clara(x, k=k))
##    user  system elapsed 
##   0.006   0.000   0.007
plot(x, col=cl$cluster+1)
points(cl$medoids, cex=5, lwd=2)

Hierarchical clustering

system.time(d<- dist(x))
##    user  system elapsed 
##   0.056   0.004   0.060
system.time(hc <- hclust(d)) ### default is complete linkage
##    user  system elapsed 
##   0.232   0.008   0.240
plot(hc)

cl <- cutree(hc, k=k)
plot(x, col=cl+1)

system.time(hc <- hclust(d, method="single"))
##    user  system elapsed 
##   0.129   0.000   0.129
plot(hc)

cl <- cutree(hc, k=k)
plot(x, col=cl+1)

system.time(hc <- hclust(d, method="average"))
##    user  system elapsed 
##   0.126   0.000   0.126
plot(hc)

cl <- cutree(hc, k=k)
plot(x, col=cl+1)

Model based clustering

Estimates a Gaussian mixture model using EM and BIC. Tries different k.

library(mclust)
## Package 'mclust' version 4.4
## Type 'citation("mclust")' for citing this R package in publications.
system.time(cl <- Mclust(x, G=1:(k+3)))   ### try 1 to 6 components
##    user  system elapsed 
##   2.161   0.000   2.169
plot(x, col=cl$classification+1)

Density based clustering (DBSCAN)

does not use k but Eps and MinPts Ester et al. (1996) suggest to use MinPts=4 and select Eps using a sorted k-dist graph

d <- dist(x)

DBSCAN_k_dist_graph <- function(d, k=4) {
  m <- as.matrix(d)
  index <- t(apply(m, MARGIN=1, order, decreasing = FALSE))
  k_dist <- sapply(1:nrow(m), FUN = function(i) m[i,index[i,k+1]])
  plot(sort(k_dist, decreasing = TRUE),
    ylab = paste(k, "-dist", sep = ""), main="Sorted k-dist graph")
  invisible(k_dist)
}

DBSCAN_k_dist_graph(d)

All points to the left of the first valley are considered noise.

library(fpc)
system.time(cl <- dbscan(x, eps=.15, MinPts=4))
##    user  system elapsed 
##   0.217   0.004   0.224
cl
## dbscan Pts=1250 MinPts=4 eps=0.15
##          1   2   3
## border   2   5   0
## seed   498 495 250
## total  500 500 250
plot(x, col=cl$cluster+1)

Try other dataset (run the code above with these data sets)

Cassini with noise

Cassini <- mlbench.cassini(1000)
noise <- cbind(runif(250, -3, 3), runif(250, -3, 3))
x <- rbind(Cassini$x, noise)
x <- x[sample(nrow(x)),]
plot(x)

dim(x)
## [1] 1250    2
k<-3

Cassini (large n)

Cassini <- mlbench.cassini(12500)
plot(Cassini)

x<-Cassini$x
dim(x)
## [1] 12500     2
k<-3

Twonorm

twonorm <- mlbench.twonorm(500, d=2)
plot(twonorm)

x <- twonorm$x
dim(x)
## [1] 500   2
k <- 2

Smiley

Smiley <- mlbench.smiley()
plot(Smiley)

x <- Smiley$x
dim(x)
## [1] 500   2
k <- 4

Spirals

Spirals <- mlbench.spirals(500,1,0.05)
plot(Spirals)

x <- Spirals$x
k <- 2