Introduction

The following code implements and illustrates some of the concepts of the paper:

Vineet Chaoji, Geng Li, Hilmi Yildirim, Mohammed J. Zaki. ABACUS: Mining Arbitrary Shaped Clusters from Large Datasets based on Backbone Identification, SIAM International Conference on Data Mining, 2011.

NOTE: Incomplete test implementation

library("dbscan")
library("proxy")
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix

Partial Implementation of Globbing and Moving

abacus <- function(x, k, max_it, noise = 0.05) {
  
  plot(x, col = "black", cex = .1, sub = paste(nrow(x), "points"), 
    main = "Full data")
  
  
  #' set globing radius as the average of the $k$-nearest neighbor distances 
  #' (top 5% removed as potential outliers).
  kNN_d <- kNNdist(x, k)
  r <- mean(kNN_d[kNN_d <= quantile(kNN_d, 1-noise)])
  
  #' processing order is from most dense to least dense (measured by kNN distance).
  proc_order <- order(kNN_d, decreasing = FALSE)
  
  #' inital weights (each original point has a weight of 1, i.e., represents 1 point)
  w <- rep(1L, nrow(x))
  
  # I need to implement the stopping criterion!
  for(it in 1:max_it) {
    for(i in 1:length(proc_order)) {
      
      # process the points $p$ in order given by density.
      p <- proc_order[i]
      
      if(is.na(w[[p]])) next
      
      # glob all points to point $p$ that have a distance $d \le r$
      d <- as.vector(dist(x[p,], x))
      glob <- setdiff(which(d <= r), p)
      
      # move point $p$ for kNN objects which were not globbed
      mv <- setdiff(order(d, decreasing = FALSE)[1:(k+1L)], p)
      if(length(glob)>1) mv <- setdiff(mv, glob)
      
      if(length(mv) > 0) {
        x[p,] <- (x[p,] * w[p] + colSums(x[mv, ] * w[mv] /
            d[mv])) / (w[p] + sum(w[mv] / d[mv]))
      }
      
      # update weight and remove globbed objects
      if(length(glob) > 0) {
        w[p] <- w[p] + sum(w[glob])
        
        w[glob] <- NA
        x[glob,] <- NA
      }
      
      i <- i + 1L
    }
    
    plot(chameleon_ds4, col = "grey", cex = .1,
      sub = paste(sum(!is.na(w)), "of", nrow(x), "points left"),
      main = paste("Iteration:" , it))
    points(x, cex = w/max(w, na.rm = TRUE)*2, pch=19)
  }
  
  list(x = x, w = w)  
}

Dataset

Use a dataset from the CHAMELEON paper (is provided in seriation).

library("seriation")
data("Chameleon")
x <- chameleon_ds4

Run ABACUS Globbing and Moving

res <- abacus(x, 10, 8, noise = 0.05)