The following code implements and illustrates some of the concepts of the paper:
Vineet Chaoji, Geng Li, Hilmi Yildirim, Mohammed J. Zaki. ABACUS: Mining Arbitrary Shaped Clusters from Large Datasets based on Backbone Identification, SIAM International Conference on Data Mining, 2011.
NOTE: Incomplete test implementation
library("dbscan")
library("proxy")
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
abacus <- function(x, k, max_it, noise = 0.05) {
plot(x, col = "black", cex = .1, sub = paste(nrow(x), "points"),
main = "Full data")
#' set globing radius as the average of the $k$-nearest neighbor distances
#' (top 5% removed as potential outliers).
kNN_d <- kNNdist(x, k)
r <- mean(kNN_d[kNN_d <= quantile(kNN_d, 1-noise)])
#' processing order is from most dense to least dense (measured by kNN distance).
proc_order <- order(kNN_d, decreasing = FALSE)
#' inital weights (each original point has a weight of 1, i.e., represents 1 point)
w <- rep(1L, nrow(x))
# I need to implement the stopping criterion!
for(it in 1:max_it) {
for(i in 1:length(proc_order)) {
# process the points $p$ in order given by density.
p <- proc_order[i]
if(is.na(w[[p]])) next
# glob all points to point $p$ that have a distance $d \le r$
d <- as.vector(dist(x[p,], x))
glob <- setdiff(which(d <= r), p)
# move point $p$ for kNN objects which were not globbed
mv <- setdiff(order(d, decreasing = FALSE)[1:(k+1L)], p)
if(length(glob)>1) mv <- setdiff(mv, glob)
if(length(mv) > 0) {
x[p,] <- (x[p,] * w[p] + colSums(x[mv, ] * w[mv] /
d[mv])) / (w[p] + sum(w[mv] / d[mv]))
}
# update weight and remove globbed objects
if(length(glob) > 0) {
w[p] <- w[p] + sum(w[glob])
w[glob] <- NA
x[glob,] <- NA
}
i <- i + 1L
}
plot(chameleon_ds4, col = "grey", cex = .1,
sub = paste(sum(!is.na(w)), "of", nrow(x), "points left"),
main = paste("Iteration:" , it))
points(x, cex = w/max(w, na.rm = TRUE)*2, pch=19)
}
list(x = x, w = w)
}
Use a dataset from the CHAMELEON paper (is provided in seriation).
library("seriation")
data("Chameleon")
x <- chameleon_ds4
res <- abacus(x, 10, 8, noise = 0.05)