Introduction

The following code implements and illustrates some of the concepts of the paper:

R. A. Jarvis and E. A. Patrick. 1973. Clustering Using a Similarity Measure Based on Shared Near Neighbors. IEEE Trans. Comput. 22, 11 (November 1973), 1025-1034. DOI: http://dx.doi.org/10.1109/T-C.1973.223640

Note: The R package dbscan implements the Jarvis-Patrick algorithm in jpclust() and the related shared nearest neighbor clustering in sNNclust().

library("dbscan") # for kNN search
library("mlbench") # for data
set.seed(2015)

Create Spirals Data

Spirals <- mlbench.spirals(500, 1, 0.05)
plot(Spirals)

x <- Spirals$x

Alternatively, try clusters with different densities (uncomment to use):

# x <- rbind(
#   matrix(rnorm(n = 2*100, mean = -1, sd = .2), ncol = 2),
#   matrix(rnorm(n = 2*200, mean =  1, sd = 1), ncol = 2)
# )

head(x)
##             [,1]       [,2]
## [1,] -0.28846459 -0.1742258
## [2,] -0.09467295  0.9018287
## [3,]  0.55489363 -0.2318116
## [4,]  0.24941206 -0.4120516
## [5,]  0.45362583  0.6094411
## [6,] -0.96096853  0.2759620
plot(x)

Most clustering algorithms do not do well on this data. For example, k-means and single-link hierarchical clustering:

cl <- kmeans(x, centers = 2)
plot(x, col = cl$cluster)

d <- dist(x)
cl <- cutree(hclust(d, method = "single"), k = 2)
plot(x, col = cl)

Show sparsified kNN graph

d <- dist(x) # kNN() accepts a dist object of precomputed distances

nn <- kNN(d, k = 10)
nn
## k-nearest neighbors for 500 objects (k=10).
## Available fields: dist, id, k, sort
str(nn)
## List of 4
##  $ dist: num [1:500, 1:10] 0.331 0.283 0.516 0.795 0.452 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:500] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:10] "1" "2" "3" "4" ...
##  $ id  : int [1:500, 1:10] 76 271 321 161 167 307 9 173 10 9 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:500] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:10] "1" "2" "3" "4" ...
##  $ k   : int 10
##  $ sort: logi TRUE
##  - attr(*, "class")= chr [1:2] "kNN" "NN"
head(nn$id)
##     1   2   3   4   5   6   7   8   9  10
## 1  76  35 222  78 290  63 456 324 412  14
## 2 271 163  26 160 347  15 235  48 102 495
## 3 321 294  59 159 158 493 465  34  73 263
## 4 161 391 366 309  54 438 275 120 498 484
## 5 167 379 311  53 180 358 105   8 444 483
## 6 307 297 331 318  55  38 101 369 326 291
plot_nn <- function(x, nn, ...) {
  plot(x, ...)
  # draw a line segment from each point to each of its k nearest neighbors
  for (i in 1:nrow(nn$id))
    for (j in 1:ncol(nn$id))
      lines(x = c(x[i, 1], x[nn$id[i, j], 1]), y = c(x[i, 2], x[nn$id[i, j], 2]))
}

plot_nn(x, nn, col = "grey")

Implementing the Jarvis-Patrick algorithm in R

Parameters:

  • k: the number of nearest neighbors to consider
  • kt: the number of neighbors that need to match to put two points in the same cluster (the similarity threshold)

Algorithm:

  • Step 1: For each point of the data set list the k nearest neighbors. The point itself is defined to be its own zeroth neighbor.
  • Step 2: Set up an integer label table of length n, with each entry initially set to the first entry of the corresponding neighborhood row.
  • Step 3: All possible pairs of neighborhood rows are tested in the following manner. Replace both label entries by the smaller of the two existing entries if both zeroth neighbors (the points being tested) are found in both neighborhood rows and at least kt neighbor matches exist between the two rows (kt is referred to as the similarity threshold). Also, replace all appearances of the higher label (throughout the entire label table) with the lower label if the above test is successful.
  • Step 4: The clusters under the k, kt selections are now indicated by identical labeling of the points belonging to the clusters.

JP_R <- function(x, k, kt) {
  # Step 1
  n <- nrow(x)
  nn <- kNN(x, k, sort = FALSE)$id
  
  # Step 2
  labels <- 1:n
  
  # Step 3
  for (i in 1:n) {
    # check all neighbors of i
    for (j in nn[i,]) {
      if (j < i)
        next ### we already checked this edge
      if (labels[i] == labels[j])
        next ### already in the same cluster
      if (i %in% nn[j,] &&
          length(intersect(nn[i,], nn[j,])) + 1L >= kt) {
        labels[labels == max(labels[i], labels[j])] <-
          min(labels[i], labels[j])
      }
    }
  }
  
  # Step 4: create contiguous labels
  as.integer(factor(labels))
}


cl <- JP_R(x, k = 10, kt = 6)
cl
##   [1] 1 1 1 1 1 1 2 1 2 2 1 1 2 1 1 1 2 2 1 2 1 2 2 2 2 1 2 1 2 1 2 1 2 1 1 2 2
##  [38] 1 1 1 2 2 1 1 2 2 2 1 1 2 2 2 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 2 1 1 1 1
##  [75] 2 1 2 1 2 2 2 1 1 2 2 2 2 1 1 1 2 1 1 2 2 2 1 1 1 2 1 1 2 2 1 1 1 2 1 1 1
## [112] 1 2 1 2 2 2 1 1 1 2 1 1 2 2 2 1 2 2 1 1 2 2 2 2 1 2 2 1 2 2 2 2 2 2 1 2 1
## [149] 2 1 1 1 1 1 1 1 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 2 1 1 1 2 2 1 2 2 1 2 1
## [186] 1 2 2 1 1 1 2 1 2 1 2 2 3 1 2 2 1 2 2 2 2 1 2 2 2 2 1 4 4 1 4 1 2 2 2 1 1
## [223] 1 2 2 2 2 2 1 2 2 2 2 2 1 1 2 2 1 1 2 2 1 1 2 2 2 1 2 1 2 2 2 2 1 1 1 1 2
## [260] 1 1 1 1 2 2 1 2 1 2 1 1 1 1 1 1 1 1 2 1 2 2 1 2 2 1 1 1 2 1 1 1 1 2 1 2 1
## [297] 1 2 1 2 2 2 2 1 1 2 1 2 1 1 1 2 2 2 2 2 2 1 2 2 1 1 1 1 2 1 2 2 1 2 1 2 1
## [334] 2 1 1 2 1 2 1 1 1 2 2 2 2 1 2 2 1 2 1 1 1 1 1 2 1 1 1 2 2 1 2 2 1 1 1 1 2
## [371] 1 2 2 2 2 2 2 1 1 2 1 2 1 1 1 2 2 1 1 2 1 1 1 2 1 1 2 1 2 2 1 1 2 2 2 2 2
## [408] 2 2 2 2 1 2 2 1 2 2 1 1 1 2 1 2 1 1 1 2 2 2 1 1 1 1 2 2 1 2 1 2 1 2 2 1 1
## [445] 1 2 2 1 1 1 2 2 2 1 2 1 2 2 2 2 2 2 2 1 1 2 2 2 1 2 2 1 1 2 2 1 2 2 1 2 2
## [482] 1 1 1 2 1 2 1 2 1 1 2 1 2 1 1 1 1 2 1
plot(x, col = cl)
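
For comparison, jpclust() from dbscan (mentioned in the introduction) implements the same algorithm and should produce an equivalent clustering. A minimal sketch, assuming the assignments are returned in the cluster element of the result:

cl_jp <- jpclust(x, k = 10, kt = 6)
plot(x, col = cl_jp$cluster)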

Create a similarity matrix using JP strength

That is, the number of shared nearest neighbors between each pair of points.

JP_sim <- function(x, k) {
  nn <- kNN(x, k, sort = FALSE)$id
  n <- nrow(nn)
  
  s <- matrix(0L, nrow = n, ncol = n)
  
  for (i in 1:n) {
    # check all neighbors of i
    for (j in nn[i,]) {
      if (j < i)
        next
      if (i %in% nn[j,])
        s[i, j] <- s[j, i] <-  length(intersect(nn[i,], nn[j,])) + 1L
    }
  }
  
  s
}

k <- 10
s <- JP_sim(x, k = k)

library("seriation")
pimage(s)
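
The dbscan package also provides sNN() to compute shared nearest neighbor counts directly. A minimal sketch, assuming the counts are returned in the shared element of the result (the counting convention may differ slightly from JP_sim() above, which also counts the points themselves):

nn_s <- sNN(x, k = 10)
head(nn_s$shared) # one row per point, one count per neighbor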

Convert the similarities to distances and then perform single-link hierarchical clustering. Note that the maximum possible similarity is k, so k - s is a valid distance.

d <- as.dist(k - s)
cl <- hclust(d, method = "single")
plot(cl, cex = .8)

Try cutting the dendrogram into different numbers of clusters. Finding the right number can be tricky.

plot(x, col = cutree(cl, k = 2))

plot(x, col = cutree(cl, k = 3))

plot(x, col = cutree(cl, k = 4))

plot(x, col = cutree(cl, k = 5))
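
As noted in the introduction, dbscan also provides sNNclust(), an implementation of the related SNN density-based clustering by Ertöz, Steinbach, and Kumar, which adds core points and noise handling on top of the shared nearest neighbor graph. A minimal sketch; the parameter values below are assumptions that will likely need tuning, and it is assumed that assignments are returned in the cluster element with 0 marking noise:

cl_snn <- sNNclust(x, k = 20, eps = 7, minPts = 16) # assumed parameter values
plot(x, col = cl_snn$cluster + 1L) # shift labels so assumed noise label 0 plots in black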

Weighted similarity matrix using JP strength

\(str(i,j) = \sum_{i_m = j_n} (k+1-m)(k+1-n)\), where the sum runs over all shared neighbors \(i_m = j_n\), and \(m\) and \(n\) are the positions of the shared neighbor in \(i\)’s and \(j\)’s nearest-neighbor lists. Note that the first entry (position 0) of each list is the point itself!
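
For example, with \(k = 10\), a neighbor shared at position \(m = 2\) in \(i\)’s list and position \(n = 5\) in \(j\)’s list contributes \((11-2)(11-5) = 54\) to \(str(i,j)\).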

JPW_sim <- function(x, k) {
  nn <- kNN(x, k, sort = FALSE)$id
  n <- nrow(nn)
  
  s <- matrix(0L, nrow = n, ncol = n)
  
  for (i in 1:n) {
    # check all neighbors of i
    for (j in nn[i, ]) {
      if (j < i)
        next
      if (i %in% nn[j, ]) {
        m <- match(c(i, nn[i, ]), c(j, nn[j, ]))
        # positions in the paper start at 0; match() returns 1-based
        # positions, so the paper position in j's list is m - 1
        s[i, j] <-
          s[j, i] <- sum((k + 1L - (0:k)) * (k + 1L - (m - 1L)), na.rm = TRUE)
      }
    }
  }
  
  s
}

s <- JPW_sim(x, k = 10)
pimage(s)

Convert the similarities to distances and then perform single-link hierarchical clustering. The strength can be at most

m <- sum((k + 1 - (0:k)) ^ 2)
m
## [1] 506
d <- as.dist(m - s)
cl <- hclust(d, method = "single")
plot(cl, cex = .8)

plot(x, col = cutree(cl, k = 2))

plot(x, col = cutree(cl, k = 3))

plot(x, col = cutree(cl, k = 4))

plot(x, col = cutree(cl, k = 5))

Note: The weighted similarity does not seem to work as well on this data set.

C++ Reimplementation of JP clustering

Loops are slow in interpreted languages like R!

library("Rcpp")
library("inline")
## 
## Attaching package: 'inline'
## The following object is masked from 'package:Rcpp':
## 
##     registerPlugin

C++ code for the label initialization and merging (Steps 2 and 3 of the algorithm above)

cppFunction(
  '
IntegerVector JP_step2_cpp(IntegerMatrix nn, unsigned int kt) {
   int n = nn.nrow();

   // create label vector
   std::vector<int> label(n);
   std::iota(std::begin(label), std::end(label), 1); // Fill with 1, 2, ..., n.

   // create sorted sets so we can use set operations
   std::vector< std::set<int> > nn_set(nn.nrow());
   IntegerVector r;
   std::vector<int> s;
   for(int i = 0; i < n; ++i) {
     r = nn(i,_);
     s =  Rcpp::as<std::vector<int> >(r);
     nn_set[i].insert(s.begin(), s.end());
   }

   std::vector<int> z;
   int newlabel, oldlabel;

   std::set<int>::iterator it;
   int j;

   for(int i = 0; i < n; ++i) {
     // check all neighbors of i
     for (it = nn_set[i].begin(); it != nn_set[i].end(); ++it) {
       j = *it-1;
       if(j<i) continue;
       if(label[i] == label[j]) continue;

       if(nn_set[j].find(i+1) != nn_set[j].end()) {

        z.clear();
        std::set_intersection(nn_set[i].begin(), nn_set[i].end(),
          nn_set[j].begin(), nn_set[j].end(),
          std::back_inserter(z));

        // this could be done faster with set unions
        if(z.size()+1 >= kt) {
          // update labels
          if(label[i] > label[j]) {
            newlabel = label[j]; oldlabel = label[i];
          }else{
            newlabel = label[i]; oldlabel = label[j];
          }

          for(int k = 0; k < n; ++k) {
            if(label[k] == oldlabel) label[k] = newlabel;
          }
        }
       }
      }
    }

   return wrap(label);
 }'
)

R handles the kNN search (Step 1) and creating contiguous labels (Step 4).

JP_CPP <- function(x, k, kt) {
  # Step 1: kNN search
  nn <- kNN(x, k, sort = FALSE)$id
  
  # Steps 2 and 3: initialize and merge labels (in C++)
  cl <- JP_step2_cpp(nn, kt = as.integer(kt))
  
  # Step 4: create contiguous labels
  as.integer(factor(cl))
}

cl <- JP_CPP(x, k = 10, kt = 6)
cl
##   [1] 1 1 1 1 1 1 2 1 2 2 1 1 2 1 1 1 2 2 1 2 1 2 2 2 2 1 2 1 2 1 2 1 2 1 1 2 2
##  [38] 1 1 1 2 2 1 1 2 2 2 1 1 2 2 2 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 2 1 1 1 1
##  [75] 2 1 2 1 2 2 2 1 1 2 2 2 2 1 1 1 2 1 1 2 2 2 1 1 1 2 1 1 2 2 1 1 1 2 1 1 1
## [112] 1 2 1 2 2 2 1 1 1 2 1 1 2 2 2 1 2 2 1 1 2 2 2 2 1 2 2 1 2 2 2 2 2 2 1 2 1
## [149] 2 1 1 1 1 1 1 1 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 2 1 1 1 2 2 1 2 2 1 2 1
## [186] 1 2 2 1 1 1 2 1 2 1 2 2 3 1 2 2 1 2 2 2 2 1 2 2 2 2 1 4 4 1 4 1 2 2 2 1 1
## [223] 1 2 2 2 2 2 1 2 2 2 2 2 1 1 2 2 1 1 2 2 1 1 2 2 2 1 2 1 2 2 2 2 1 1 1 1 2
## [260] 1 1 1 1 2 2 1 2 1 2 1 1 1 1 1 1 1 1 2 1 2 2 1 2 2 1 1 1 2 1 1 1 1 2 1 2 1
## [297] 1 2 1 2 2 2 2 1 1 2 1 2 1 1 1 2 2 2 2 2 2 1 2 2 1 1 1 1 2 1 2 2 1 2 1 2 1
## [334] 2 1 1 2 1 2 1 1 1 2 2 2 2 1 2 2 1 2 1 1 1 1 1 2 1 1 1 2 2 1 2 2 1 1 1 1 2
## [371] 1 2 2 2 2 2 2 1 1 2 1 2 1 1 1 2 2 1 1 2 1 1 1 2 1 1 2 1 2 2 1 1 2 2 2 2 2
## [408] 2 2 2 2 1 2 2 1 2 2 1 1 1 2 1 2 1 1 1 2 2 2 1 1 1 1 2 2 1 2 1 2 1 2 2 1 1
## [445] 1 2 2 1 1 1 2 2 2 1 2 1 2 2 2 2 2 2 2 1 1 2 2 2 1 2 2 1 1 2 2 1 2 2 1 2 2
## [482] 1 1 1 2 1 2 1 2 1 1 2 1 2 1 1 1 1 2 1
plot(x, col = cl)

Check timing and compare to other methods

library("microbenchmark")

hc <- function(x, k) {
  d <- dist(x)
  cutree(hclust(d, method = "single"), k = k)
}

jpclust() is the C++ implementation in package dbscan.

rbind(
  microbenchmark(JP_R(x, k = 10, kt = 6), times = 10),
  microbenchmark(JP_CPP(x, k = 10, kt = 6), times = 10),
  microbenchmark(jpclust(x, k = 10, kt = 6), times = 10),
  microbenchmark(kmeans(x, centers = 2), times = 10),
  microbenchmark(hc(x, k = 2), times = 10)
)
## Unit: microseconds
##                        expr       min        lq       mean     median        uq
##     JP_R(x, k = 10, kt = 6) 10922.976 10999.170 11957.2338 11697.9185 12402.986
##   JP_CPP(x, k = 10, kt = 6)  2067.412  2223.690  3091.9826  2413.9955  2675.014
##  jpclust(x, k = 10, kt = 6)  2068.630  2090.276  2332.0848  2241.6065  2376.902
##      kmeans(x, centers = 2)   301.375   329.309   370.1431   358.1585   374.460
##                hc(x, k = 2)  6185.253  6366.743  7159.8364  6802.8775  7117.759
##        max neval  cld
##  14990.272    10    d
##   9345.693    10  b  
##   3198.576    10  b  
##    560.767    10 a   
##  10173.254    10   c

Note: k-means is fast, but does not provide a good clustering!