Introduction

The following code implements and illustrates some of the concepts of the paper:

R. A. Jarvis and E. A. Patrick. 1973. Clustering Using a Similarity Measure Based on Shared Near Neighbors. IEEE Transactions on Computers 22(11), 1025-1034. https://doi.org/10.1109/T-C.1973.223640

Note: The R package dbscan provides ready-to-use implementations as jpclust() and sNNclust().

library("dbscan") # for kNN search
library("mlbench") # for data
set.seed(2015)

Create Spirals Data

Spirals <- mlbench.spirals(500, 1, 0.05)
plot(Spirals)

x <- Spirals$x

To experiment with clusters of different density instead, uncomment the following code:

# x <- rbind(
#   matrix(rnorm(n = 2*100, mean = -1, sd = .2), ncol = 2),
#   matrix(rnorm(n = 2*200, mean =  1, sd = 1), ncol = 2)
# )

plot(x)

Most standard clustering algorithms do not do well on this kind of data. For example, k-means and single-link hierarchical clustering:

cl <- kmeans(x, centers = 2)
plot(x, col = cl$cluster)

d <- dist(x)
cl <- cutree(hclust(d, method = "single"), k = 2)
plot(x, col = cl)

Show the sparsified kNN graph

d <- as.matrix(dist(x))

nn <- kNN(x, k = 10, sort = FALSE)
nn
## k-nearest neighbors for 500 objects (k=10).
## Available fields: dist, id, k, sort
str(nn)
## List of 4
##  $ dist: num [1:500, 1:10] 0.0143 0.0376 0.0143 0.0203 0.0164 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:10] "1" "2" "3" "4" ...
##  $ id  : int [1:500, 1:10] 3 267 1 340 3 7 9 58 7 21 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:10] "1" "2" "3" "4" ...
##  $ k   : int 10
##  $ sort: logi FALSE
##  - attr(*, "class")= chr [1:2] "kNN" "NN"
head(nn$id)
##        1   2   3   4   5   6   7   8   9  10
## [1,]   3   6   5   7   9  18  20  12  22  10
## [2,] 267  49 422 489 290 404 337  26 360  13
## [3,]   1   5   6   7   9  18  12  20  22  10
## [4,] 340 384 430 469 300 368 174 207 190 243
## [5,]   3   6   1   7   9  18  12  20  10  22
## [6,]   7   3   5   9   1  18  20  12  22  25
plot_nn <- function(x, nn, ...) {
  plot(x, ...)
  # draw a line from each point to each of its k nearest neighbors
  for(i in seq_len(nrow(nn$id)))
    for(j in seq_len(ncol(nn$id)))
      lines(x = c(x[i,1], x[nn$id[i,j],1]), y = c(x[i,2], x[nn$id[i,j],2]))
}

plot_nn(x, nn, col = "grey")
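
The double loop above draws one segment per edge. A vectorized alternative using segments() draws all edges in a single call (a sketch; plot_nn2 is just an illustrative name):

plot_nn2 <- function(x, nn, ...) {
  plot(x, ...)
  # expand the id matrix into from/to index vectors (column-major order)
  from <- rep(seq_len(nrow(nn$id)), times = ncol(nn$id))
  to <- as.vector(nn$id)
  segments(x[from, 1], x[from, 2], x[to, 1], x[to, 2])
}

plot_nn2(x, nn, col = "grey")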

Implementing the Jarvis-Patrick algorithm in R

Parameters:

  • k: the number of nearest neighbors to consider
  • kt: the number of neighbors that need to match to put two points in the same cluster (the similarity threshold)

Algorithm:

  • Step 1: For each point of the data set list the k nearest neighbors.
  • Step 2: Set up an integer label table of length n, with each entry initially set to the first entry of the corresponding neighborhood row.
  • Step 3: All possible pairs of neighborhood rows are tested in the following manner: if both zeroth neighbors (the points being tested) are found in both neighborhood rows and at least kt neighbor matches exist between the two rows (kt is referred to as the similarity threshold), then both label entries are replaced by the smaller of the two existing entries, and all appearances of the higher label throughout the entire label table are replaced with the lower label.
  • Step 4: The clusters under the k, kt selections are now indicated by identical labeling of the points belonging to the clusters.

JP_R <- function(x, k, kt) {
  # Step 1: find the k nearest neighbors of each point
  nn <- kNN(x, k, sort = FALSE)$id
  n <- nrow(nn)

  # Step 2: each point starts with its own label
  # (the first entry of its neighborhood row is the point itself)
  labels <- 1:n

  # Step 3
  for(i in 1:n) {
    # check all neighbors of i
    for(j in nn[i,]) {
      if(j < i) next ### we already checked this edge
      if(labels[i] == labels[j]) next ### already in the same cluster
      ### merge only mutual neighbors; +1 since the paper counts each
      ### point as the zeroth neighbor of its own neighborhood row
      if(i %in% nn[j,] && length(intersect(nn[i,], nn[j,])) + 1L >= kt) {
        labels[labels == max(labels[i], labels[j])] <- min(labels[i], labels[j])
      }
    }
  }

  # Step 4: create contiguous labels
  as.integer(factor(labels))
}


cl <- JP_R(x, k = 10, kt = 6)
cl
##   [1] 1 2 1 2 1 1 1 2 1 1 2 1 2 2 2 2 2 1 2 1 1 1 1 1 1 2 1 1 1 2 2 1 1 1 2
##  [36] 2 1 2 2 1 1 2 2 2 2 1 1 2 2 1 2 2 2 1 2 2 2 2 2 2 1 2 1 1 1 1 1 2 2 1
##  [71] 2 1 1 2 2 1 1 1 2 1 1 3 1 1 2 2 2 4 2 2 1 2 1 1 2 1 2 1 1 2 2 1 2 2 1
## [106] 1 2 1 2 1 2 2 2 2 2 1 2 1 2 2 2 2 1 1 2 2 2 1 2 1 1 2 1 2 1 1 2 1 1 1
## [141] 2 2 1 1 2 1 1 2 2 1 1 1 2 2 1 2 1 1 1 1 1 1 1 2 2 1 1 1 2 1 2 1 2 2 2
## [176] 2 2 2 1 1 2 1 2 2 1 1 2 2 1 2 2 2 2 2 2 1 2 1 1 2 1 1 2 2 1 2 2 1 1 1
## [211] 1 1 1 1 1 1 1 2 1 1 1 2 2 2 1 1 1 2 1 2 1 1 1 1 2 2 2 1 2 1 1 1 2 1 1
## [246] 2 2 2 1 1 2 2 1 2 1 1 2 1 1 2 1 2 2 2 1 1 2 2 2 1 1 1 1 1 2 2 1 1 2 2
## [281] 1 1 2 1 1 1 2 1 2 2 2 1 1 2 1 1 2 2 1 2 1 1 2 1 2 1 1 1 1 2 1 2 2 2 1
## [316] 1 1 2 2 2 2 2 1 2 2 1 1 2 1 1 2 1 2 2 2 1 2 1 1 2 1 2 1 2 2 1 2 2 1 1
## [351] 2 2 2 2 1 1 2 1 2 2 2 2 1 2 1 1 2 2 1 2 2 1 2 1 2 1 1 2 1 2 2 1 1 2 2
## [386] 2 1 1 2 2 1 1 2 2 1 1 2 2 2 2 2 2 1 2 1 2 2 1 1 1 2 1 2 2 2 2 2 2 2 1
## [421] 1 2 2 2 1 1 2 2 2 2 2 1 1 1 1 1 2 1 1 2 2 1 1 1 2 2 1 1 2 2 1 2 2 1 1
## [456] 2 1 2 2 1 1 2 2 1 1 1 1 1 2 1 2 2 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 2 2
## [491] 1 1 2 1 1 2 1 2 2 2
plot(x, col = cl)
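
mlbench.spirals() also returns the true spiral membership in Spirals$classes, so we can cross-tabulate it against the found clustering (up to a permutation of the labels, the two spirals should come out as pure clusters):

table(cl, Spirals$classes)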

Create a similarity matrix using JP strength

That is, the similarity between two points is the number of shared nearest neighbors.

JP_sim <- function(x, k) {
  nn <- kNN(x, k, sort = FALSE)$id
  n <- nrow(nn)

  s <- matrix(0L, nrow = n, ncol = n)

  for(i in 1:n) {
    # check all neighbors of i
    for(j in nn[i,]) {
      if(j < i) next ### we already checked this edge
      ### similarity is only defined for mutual neighbors; +1 since the
      ### paper counts each point as the zeroth neighbor of its own row
      if(i %in% nn[j,]) s[i,j] <- s[j,i] <- length(intersect(nn[i,], nn[j,])) + 1L
    }
  }

  s
}

k <- 10
s <- JP_sim(x, k = k)
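
Since two mutual k-nearest-neighbor lists can share at most k-1 other points (plus one for the point itself), the similarity is bounded by k. A quick check:

max(s)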

Convert the similarity matrix into a distance matrix and then perform single-link hierarchical clustering. Note that the maximum similarity is k!

d <- as.dist(k-s)
cl <- hclust(d, method = "single")
plot(cl, cex=.8)

Try cutting the dendrogram into different numbers of components. Finding the right number might be tricky.

plot(x, col = cutree(cl, k = 2))

plot(x, col = cutree(cl, k = 3))

plot(x, col = cutree(cl, k = 4))

plot(x, col = cutree(cl, k = 5))

Weighted similarity matrix using JP strength

\(str(i,j) = \sum (k+1-m)(k+1-n)\) where \(i_m = j_n\), i.e., the sum runs over all shared entries, and \(m\) and \(n\) are the (0-based) positions of a shared entry in \(i\)'s and \(j\)'s nearest neighbor lists. Note that the first entry (position 0) of each list is the point itself!
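
As a small worked example (hypothetical lists, 0-based positions): for \(k = 3\), if \(i\)'s list is \((i, j, a, b)\) and \(j\)'s list is \((j, i, b, a)\), the shared entries are \(i\) (positions 0 and 1), \(j\) (1 and 0), \(a\) (2 and 3), and \(b\) (3 and 2), so \(str(i,j) = 4 \cdot 3 + 3 \cdot 4 + 2 \cdot 1 + 1 \cdot 2 = 28\).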

JPW_sim <- function(x, k) {
  nn <- kNN(x, k, sort = FALSE)$id
  n <- nrow(nn)

  s <- matrix(0L, nrow = n, ncol = n)

  for(i in 1:n) {
    # check all neighbors of i
    for(j in nn[i,]) {
      if(j < i) next ### we already checked this edge
      if(i %in% nn[j,]) {
        # prepend the points themselves so each acts as its own zeroth neighbor
        m <- match(c(i, nn[i,]), c(j, nn[j,]))
        # sum the position weights over all shared entries; entries that are
        # not shared produce NA in m and are dropped via na.rm
        s[i,j] <- s[j,i] <- sum((k+1L-(0:k))*(k+1L-m-1L), na.rm = TRUE)
        # note: index starts with 0 in the paper
      }
    }
  }

  s
}

s <- JPW_sim(x, k = 10)

Convert to a distance matrix and then perform single-link hierarchical clustering. The maximum similarity is roughly:

m <- sum((k+1-(1:(k+1)))^2)
m
## [1] 385
d <- as.dist(m-s)
cl <- hclust(d, method = "single")
plot(cl, cex=.8)

plot(x, col = cutree(cl, k = 2))

plot(x, col = cutree(cl, k = 3))

plot(x, col = cutree(cl, k = 4))

plot(x, col = cutree(cl, k = 5))

Note: The weighted scheme does not seem to work as well on this data set.
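
The dbscan package also offers sNNclust(), which combines shared-nearest-neighbor similarity with a DBSCAN-style clustering (the SNN clustering of Ertöz, Steinbach and Kumar). A call might look like this (a sketch with hypothetical parameter values; eps is the minimum number of shared neighbors and minPts the core point threshold, see the package documentation):

cl_snn <- sNNclust(x, k = 10, eps = 5, minPts = 5)
# add 1 so that noise points (cluster 0) get a visible color
plot(x, col = cl_snn$cluster + 1L)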

C++ Reimplementation of JP clustering

Loops are slow in interpreted languages like R, so we reimplement the expensive part in C++ using Rcpp.

library("Rcpp")
library("inline")
## 
## Attaching package: 'inline'
## The following object is masked from 'package:Rcpp':
## 
##     registerPlugin

C++ code for Steps 2 and 3 of the algorithm (label initialization and merging)

cppFunction('
IntegerVector JP_C(IntegerMatrix nn, unsigned int kt) {
   int n = nn.nrow();

   // create label vector
   std::vector<int> label(n);
   std::iota(std::begin(label), std::end(label), 1); // Fill with 1, 2, ..., n.

   // create sorted sets so we can use set operations
   std::vector< std::set<int> > nn_set(nn.nrow());
   IntegerVector r;
   std::vector<int> s;
   for(int i = 0; i < n; ++i) {
     r = nn(i,_);
     s =  Rcpp::as<std::vector<int> >(r);
     nn_set[i].insert(s.begin(), s.end());
   }

   std::vector<int> z;
   int newlabel, oldlabel;

   std::set<int>::iterator it;
   int j;

   for(int i = 0; i < n; ++i) {
     // check all neighbors of i
     for (it = nn_set[i].begin(); it != nn_set[i].end(); ++it) {
       j = *it-1;
       if(j<i) continue;
       if(label[i] == label[j]) continue;

       if(nn_set[j].find(i+1) != nn_set[j].end()) {

        z.clear();
        std::set_intersection(nn_set[i].begin(), nn_set[i].end(),
          nn_set[j].begin(), nn_set[j].end(),
          std::back_inserter(z));

        // this could be done faster with set unions
        if(z.size()+1 >= kt) {
          // update labels
          if(label[i] > label[j]) {
            newlabel = label[j]; oldlabel = label[i];
          }else{
            newlabel = label[i]; oldlabel = label[j];
          }

          for(int k = 0; k < n; ++k) {
            if(label[k] == oldlabel) label[k] = newlabel;
          }
        }
       }
      }
    }

   return wrap(label);
 }')

R does the rest: the kNN search (Step 1) and creating contiguous cluster labels (Step 4).

JP <- function(x, k, kt) {
  # Step 1: find the k nearest neighbors of each point
  nn <- kNN(x, k, sort = FALSE)$id

  # Steps 2 and 3: initialize and merge labels (in C++)
  cl <- JP_C(nn, kt = as.integer(kt))

  # Step 4: create contiguous labels
  as.integer(factor(cl))
}

cl <- JP(x, k = 10, kt = 6)
cl
##   [1] 1 2 1 2 1 1 1 2 1 1 2 1 2 2 2 2 2 1 2 1 1 1 1 1 1 2 1 1 1 2 2 1 1 1 2
##  [36] 2 1 2 2 1 1 2 2 2 2 1 1 2 2 1 2 2 2 1 2 2 2 2 2 2 1 2 1 1 1 1 1 2 2 1
##  [71] 2 1 1 2 2 1 1 1 2 1 1 3 1 1 2 2 2 4 2 2 1 2 1 1 2 1 2 1 1 2 2 1 2 2 1
## [106] 1 2 1 2 1 2 2 2 2 2 1 2 1 2 2 2 2 1 1 2 2 2 1 2 1 1 2 1 2 1 1 2 1 1 1
## [141] 2 2 1 1 2 1 1 2 2 1 1 1 2 2 1 2 1 1 1 1 1 1 1 2 2 1 1 1 2 1 2 1 2 2 2
## [176] 2 2 2 1 1 2 1 2 2 1 1 2 2 1 2 2 2 2 2 2 1 2 1 1 2 1 1 2 2 1 2 2 1 1 1
## [211] 1 1 1 1 1 1 1 2 1 1 1 2 2 2 1 1 1 2 1 2 1 1 1 1 2 2 2 1 2 1 1 1 2 1 1
## [246] 2 2 2 1 1 2 2 1 2 1 1 2 1 1 2 1 2 2 2 1 1 2 2 2 1 1 1 1 1 2 2 1 1 2 2
## [281] 1 1 2 1 1 1 2 1 2 2 2 1 1 2 1 1 2 2 1 2 1 1 2 1 2 1 1 1 1 2 1 2 2 2 1
## [316] 1 1 2 2 2 2 2 1 2 2 1 1 2 1 1 2 1 2 2 2 1 2 1 1 2 1 2 1 2 2 1 2 2 1 1
## [351] 2 2 2 2 1 1 2 1 2 2 2 2 1 2 1 1 2 2 1 2 2 1 2 1 2 1 1 2 1 2 2 1 1 2 2
## [386] 2 1 1 2 2 1 1 2 2 1 1 2 2 2 2 2 2 1 2 1 2 2 1 1 1 2 1 2 2 2 2 2 2 2 1
## [421] 1 2 2 2 1 1 2 2 2 2 2 1 1 1 1 1 2 1 1 2 2 1 1 1 2 2 1 1 2 2 1 2 2 1 1
## [456] 2 1 2 2 1 1 2 2 1 1 1 1 1 2 1 2 2 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 2 2
## [491] 1 1 2 1 1 2 1 2 2 2
plot(x, col = cl)
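
As the output above shows, the C++ version produces exactly the same clustering as the pure R implementation:

identical(cl, JP_R(x, k = 10, kt = 6))

For comparison, a call to the ready-made implementation in dbscan might look like this (a sketch; jpclust() may use a slightly different shared-neighbor counting convention, so the labels need not match our implementation exactly):

cl_pkg <- jpclust(x, k = 10, kt = 6)
plot(x, col = cl_pkg$cluster)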

Check timing and compare to other methods

library("microbenchmark")

hc <- function(x, k) {
  d <- dist(x)
  cutree(hclust(d, method = "single"), k = k)
}

rbind(
  microbenchmark(JP_R(x, k=10, kt = 6), times = 10),
  microbenchmark(JP(x, k=10, kt = 6), times = 10),
  microbenchmark(kmeans(x, centers=2), times = 10),
  microbenchmark(hc(x, k=2), times = 10)
)
## Unit: microseconds
##                     expr       min        lq       mean    median
##  JP_R(x, k = 10, kt = 6) 13762.330 13961.181 14779.7089 14375.088
##    JP(x, k = 10, kt = 6)  1926.131  1938.962  2543.5234  2154.980
##   kmeans(x, centers = 2)   223.933   256.302   323.1584   289.722
##             hc(x, k = 2)  7907.122  9178.834 10251.2318  9694.385
##         uq       max neval  cld
##  15253.657 17747.393    10    d
##   2355.752  6264.929    10  b  
##    349.048   498.849    10 a   
##  11534.700 12902.947    10   c

Note: The C++ reimplementation is roughly six times faster than the pure R version (compare the medians above). k-means is by far the fastest, but it does not provide a good clustering for this data!