Introduction

The following code implements and illustrates some of the concepts of the paper:

Vineet Chaoji, Geng Li, Hilmi Yildirim, Mohammed J. Zaki. ABACUS: Mining Arbitrary Shaped Clusters from Large Datasets based on Backbone Identification, SIAM International Conference on Data Mining, 2011.

NOTE: Incomplete test implementation

library("dbscan")
library("proxy")
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix

Partial Implementation of Globbing and Moving

abacus <- function(x, k, max_it, noise = 0.05) {
  plot(
    x,
    col = "black",
    cex = .1,
    sub = paste(nrow(x), "points"),
    main = "Full data"
  )
  
  x_orig <- x
  
  # **compute kNN**
  kNN_d <- kNNdist(x, k)
  
  # **estimate globing radius** as the average of the $k$-nearest neighbor distances
  # (top noise % removed as potential outliers).
  r <- mean(kNN_d[kNN_d <= quantile(kNN_d, 1 - noise)])
  
  # processing order is from most dense to least dense (measured by kNN distance).
  proc_order <- order(kNN_d, decreasing = FALSE)
  
  # initial weights (each original point has a weight of 1, i.e., represents 1 point)
  w <- rep(1L, nrow(x))
  
  # FIXME: implement the stopping criterion!
  for (it in seq_len(max_it)) {
    for (i in 1:length(proc_order)) {
      # process the points $p$ in order given by density.
      p <- proc_order[i]
      
      if (is.na(w[[p]]))
        next
      
      # **glob objects**: glob all points to point $p$ that have a distance $d \le r$
      d <- as.vector(dist(x[p, ], x))
      glob <- setdiff(which(d <= r), p)
      
      # **move objects**: move point $p$ for kNN objects which were not globbed
      mv <- setdiff(order(d, decreasing = FALSE)[1:(k + 1L)], p)
      if (length(glob) > 1)
        mv <- setdiff(mv, glob)
      
      if (length(mv) > 0) {
        x[p, ] <- (x[p, ] * w[p] + colSums(x[mv,] * w[mv] /
            d[mv])) / (w[p] + sum(w[mv] / d[mv]))
      }
      
      # update weight and remove globbed objects
      if (length(glob) > 0) {
        w[p] <- w[p] + sum(w[glob])
        
        w[glob] <- NA
        x[glob, ] <- NA
      }
      
      
      i <- i + 1L
    }
    
    plot(
      x_orig,
      col = "grey",
      cex = .1,
      sub = paste(sum(!is.na(w)), "of", nrow(x), "backbone points left"),
      main = paste("Iteration:" , it)
    )
    points(x, cex = w / max(w, na.rm = TRUE) * 2, pch = 19)
  }
  
  # FIXME: **identify clusters** using e.g., DBSCAN is missing
  
  list(x = x, w = w)
}

Dataset

Use a dataset from the CHAMELEON paper (available in package seriation).

library("seriation")
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
data("Chameleon")
x <- chameleon_ds4

Run ABACUS Globbing and Moving on the dataset in the paper

Figure 2 in the paper uses 8 iterations, but I could not find the value used for k for the figure. In Table 1 \(k = 20\) is specified for DS4, but that does not seem to work well for my partial implementation, so I use 5.

res <- abacus(x, k = 5,  max_it = 8, noise = 0.05)

Does data scale make a difference?

res <- abacus(as.data.frame(scale(x)), k = 5,  max_it = 8, noise = 0.05)