library("mlbench")
set.seed(2015)
Cassini <- mlbench.cassini(1250)
plot(Cassini)
x<-Cassini$x
dim(x)
## [1] 1250 2
k<-3
system.time(cl <- kmeans(x, centers=k))
## user system elapsed
## 0.006 0.000 0.011
plot(x, col=cl$cluster+1)
points(cl$centers, cex=5, lwd=2)
library("cluster")
system.time(cl <- pam(x, k=k))
## user system elapsed
## 0.341 0.004 0.345
plot(x, col=cl$cluster+1)
points(cl$medoids, cex=5, lwd=2)
system.time(cl <- clara(x, k=k))
## user system elapsed
## 0.006 0.000 0.007
plot(x, col=cl$cluster+1)
points(cl$medoids, cex=5, lwd=2)
system.time(d<- dist(x))
## user system elapsed
## 0.056 0.004 0.060
system.time(hc <- hclust(d)) ### default is complete linkage
## user system elapsed
## 0.232 0.008 0.240
plot(hc)
cl <- cutree(hc, k=k)
plot(x, col=cl+1)
system.time(hc <- hclust(d, method="single"))
## user system elapsed
## 0.129 0.000 0.129
plot(hc)
cl <- cutree(hc, k=k)
plot(x, col=cl+1)
system.time(hc <- hclust(d, method="average"))
## user system elapsed
## 0.126 0.000 0.126
plot(hc)
cl <- cutree(hc, k=k)
plot(x, col=cl+1)
Estimates a Gaussian mixture model using EM and BIC. Tries different k.
library(mclust)
## Package 'mclust' version 4.4
## Type 'citation("mclust")' for citing this R package in publications.
system.time(cl <- Mclust(x, G=1:(k+3))) ### try 1 to 6 components
## user system elapsed
## 2.161 0.000 2.169
plot(x, col=cl$classification+1)
does not use k but Eps and MinPts Ester et al. (1996) suggest to use MinPts=4 and select Eps using a sorted k-dist graph
d <- dist(x)
DBSCAN_k_dist_graph <- function(d, k=4) {
m <- as.matrix(d)
index <- t(apply(m, MARGIN=1, order, decreasing = FALSE))
k_dist <- sapply(1:nrow(m), FUN = function(i) m[i,index[i,k+1]])
plot(sort(k_dist, decreasing = TRUE),
ylab = paste(k, "-dist", sep = ""), main="Sorted k-dist graph")
invisible(k_dist)
}
DBSCAN_k_dist_graph(d)
All points to the left of the first valley are considered noise.
library(fpc)
system.time(cl <- dbscan(x, eps=.15, MinPts=4))
## user system elapsed
## 0.217 0.004 0.224
cl
## dbscan Pts=1250 MinPts=4 eps=0.15
## 1 2 3
## border 2 5 0
## seed 498 495 250
## total 500 500 250
plot(x, col=cl$cluster+1)
Cassini <- mlbench.cassini(1000)
noise <- cbind(runif(250, -3, 3), runif(250, -3, 3))
x <- rbind(Cassini$x, noise)
x <- x[sample(nrow(x)),]
plot(x)
dim(x)
## [1] 1250 2
k<-3
Cassini <- mlbench.cassini(12500)
plot(Cassini)
x<-Cassini$x
dim(x)
## [1] 12500 2
k<-3
twonorm <- mlbench.twonorm(500, d=2)
plot(twonorm)
x <- twonorm$x
dim(x)
## [1] 500 2
k <- 2
Smiley <- mlbench.smiley()
plot(Smiley)
x <- Smiley$x
dim(x)
## [1] 500 2
k <- 4
Spirals <- mlbench.spirals(500,1,0.05)
plot(Spirals)
x <- Spirals$x
k <- 2