The following code implements and illustrates some of the concepts of the paper:
Vineet Chaoji, Geng Li, Hilmi Yildirim, Mohammed J. Zaki. ABACUS: Mining Arbitrary Shaped Clusters from Large Datasets based on Backbone Identification, SIAM International Conference on Data Mining, 2011.
NOTE: Incomplete test implementation
library("dbscan")
library("proxy")
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
abacus <- function(x, k, max_it, noise = 0.05) {
plot(
x,
col = "black",
cex = .1,
sub = paste(nrow(x), "points"),
main = "Full data"
)
x_orig <- x
# **compute kNN**
kNN_d <- kNNdist(x, k)
# **estimate globing radius** as the average of the $k$-nearest neighbor distances
# (top noise % removed as potential outliers).
r <- mean(kNN_d[kNN_d <= quantile(kNN_d, 1 - noise)])
# processing order is from most dense to least dense (measured by kNN distance).
proc_order <- order(kNN_d, decreasing = FALSE)
# initial weights (each original point has a weight of 1, i.e., represents 1 point)
w <- rep(1L, nrow(x))
# FIXME: implement the stopping criterion!
for (it in seq_len(max_it)) {
for (i in 1:length(proc_order)) {
# process the points $p$ in order given by density.
p <- proc_order[i]
if (is.na(w[[p]]))
next
# **glob objects**: glob all points to point $p$ that have a distance $d \le r$
d <- as.vector(dist(x[p, ], x))
glob <- setdiff(which(d <= r), p)
# **move objects**: move point $p$ for kNN objects which were not globbed
mv <- setdiff(order(d, decreasing = FALSE)[1:(k + 1L)], p)
if (length(glob) > 1)
mv <- setdiff(mv, glob)
if (length(mv) > 0) {
x[p, ] <- (x[p, ] * w[p] + colSums(x[mv,] * w[mv] /
d[mv])) / (w[p] + sum(w[mv] / d[mv]))
}
# update weight and remove globbed objects
if (length(glob) > 0) {
w[p] <- w[p] + sum(w[glob])
w[glob] <- NA
x[glob, ] <- NA
}
i <- i + 1L
}
plot(
x_orig,
col = "grey",
cex = .1,
sub = paste(sum(!is.na(w)), "of", nrow(x), "backbone points left"),
main = paste("Iteration:" , it)
)
points(x, cex = w / max(w, na.rm = TRUE) * 2, pch = 19)
}
# FIXME: **identify clusters** using e.g., DBSCAN is missing
list(x = x, w = w)
}
Use a dataset from the CHAMELEON paper (available in package seriation).
library("seriation")
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
data("Chameleon")
x <- chameleon_ds4
Figure 2 in the paper uses 8 iterations, but I could not find the value used for k for the figure. In Table 1 \(k = 20\) is specified for DS4, but that does not seem to work well for my partial implementation, so I use 5.
res <- abacus(x, k = 5, max_it = 8, noise = 0.05)
Does data scale make a difference?
res <- abacus(as.data.frame(scale(x)), k = 5, max_it = 8, noise = 0.05)