The following code implements and illustrates some of the concepts of the paper:
R. A. Jarvis and E. A. Patrick. 1973. Clustering Using a Similarity Measure Based on Shared Near Neighbors. IEEE Trans. Comput. 22, 11 (November 1973), 1025-1034. DOI=http://dx.doi.org/10.1109/T-C.1973.223640
Note: The R package dbscan implements this algorithm in jpclust() and a related SNN-based algorithm in sNNclust().
library("dbscan") # for kNN search
library("mlbench") # for data
set.seed(2015)
Spirals <- mlbench.spirals(500, 1, 0.05)
plot(Spirals)
x <- Spirals$x
# Alternative data set: two Gaussian clusters with different spread
# x <- rbind(
#   matrix(rnorm(n = 2*100, mean = -1, sd = .2), ncol = 2),
#   matrix(rnorm(n = 2*200, mean = 1, sd = 1), ncol = 2)
# )
plot(x)
Most standard clustering algorithms do not do well on this data set. For comparison, here are k-means and single-link hierarchical clustering using Euclidean distance.
cl <- kmeans(x, centers = 2)
plot(x, col = cl$cluster)
d <- dist(x)
cl <- cutree(hclust(d, method = "single"), k = 2)
plot(x, col = cl)
d <- as.matrix(dist(x))
nn <- kNN(x, k = 10, sort = FALSE)
nn
## k-nearest neighbors for 500 objects (k=10).
## Available fields: dist, id, k, sort
str(nn)
## List of 4
## $ dist: num [1:500, 1:10] 0.0143 0.0376 0.0143 0.0203 0.0164 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:10] "1" "2" "3" "4" ...
## $ id : int [1:500, 1:10] 3 267 1 340 3 7 9 58 7 21 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:10] "1" "2" "3" "4" ...
## $ k : int 10
## $ sort: logi FALSE
## - attr(*, "class")= chr [1:2] "kNN" "NN"
head(nn$id)
## 1 2 3 4 5 6 7 8 9 10
## [1,] 3 6 5 7 9 18 20 12 22 10
## [2,] 267 49 422 489 290 404 337 26 360 13
## [3,] 1 5 6 7 9 18 12 20 22 10
## [4,] 340 384 430 469 300 368 174 207 190 243
## [5,] 3 6 1 7 9 18 12 20 10 22
## [6,] 7 3 5 9 1 18 20 12 22 25
plot_nn <- function(x, nn, ...) {
  plot(x, ...)
  # draw a line segment from each point to each of its k nearest neighbors
  for(i in 1:nrow(nn$id))
    for(j in 1:ncol(nn$id))
      lines(x = c(x[i, 1], x[nn$id[i, j], 1]),
            y = c(x[i, 2], x[nn$id[i, j], 2]))
}
plot_nn(x, nn, col = "grey")
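The Jarvis-Patrick algorithm only considers pairs of points that appear in each other's neighbor lists. Here is a small sketch that draws only these mutual links (plot_mutual_nn() is a hypothetical helper for illustration, not part of dbscan):
# Sketch: draw only mutual k-nearest-neighbor links
plot_mutual_nn <- function(x, nn, ...) {
  plot(x, ...)
  for(i in 1:nrow(nn$id))
    for(j in nn$id[i, ])
      if(i %in% nn$id[j, ])   # keep the link only if it is mutual
        lines(x = c(x[i, 1], x[j, 1]), y = c(x[i, 2], x[j, 2]))
}
plot_mutual_nn(x, nn, col = "grey")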
Parameters: k, the size of the nearest-neighbor lists, and kt, the threshold on the number of shared neighbors needed to put two points into the same cluster.
Algorithm: Start with each point in its own cluster. Then merge the clusters of two points if the points are in each other's k-nearest-neighbor lists and their lists share at least kt neighbors.
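Before implementing the full algorithm, here is the core test for a single pair of points, using the nn object computed above (points 1 and 3 are mutual first neighbors in the output shown earlier):
# Core test for one pair: are i and j mutual neighbors, and how many
# neighbors do they share?
i <- 1; j <- 3
mutual <- j %in% nn$id[i, ] && i %in% nn$id[j, ]
shared <- length(intersect(nn$id[i, ], nn$id[j, ])) + 1L  # +1 mirrors the counting convention in JP_R below
mutual
shared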
JP_R <- function(x, k, kt) {
  # Step 1: find the k nearest neighbors of each point
  nn <- kNN(x, k, sort = FALSE)$id
  n <- nrow(nn)
  # Step 2: each point starts in its own cluster
  labels <- 1:n
  # Step 3: merge the clusters of qualifying point pairs
  for(i in 1:n) {
    # check all neighbors of i
    for(j in nn[i, ]) {
      if(j < i) next                  ### we already checked this edge
      if(labels[i] == labels[j]) next ### already in the same cluster
      if(i %in% nn[j, ] && length(intersect(nn[i, ], nn[j, ])) + 1L >= kt) {
        labels[labels == max(labels[i], labels[j])] <- min(labels[i], labels[j])
      }
    }
  }
  # Step 4: create contiguous labels
  as.integer(factor(labels))
}
cl <- JP_R(x, k = 10, kt = 6)
cl
## [1] 1 2 1 2 1 1 1 2 1 1 2 1 2 2 2 2 2 1 2 1 1 1 1 1 1 2 1 1 1 2 2 1 1 1 2
## [36] 2 1 2 2 1 1 2 2 2 2 1 1 2 2 1 2 2 2 1 2 2 2 2 2 2 1 2 1 1 1 1 1 2 2 1
## [71] 2 1 1 2 2 1 1 1 2 1 1 3 1 1 2 2 2 4 2 2 1 2 1 1 2 1 2 1 1 2 2 1 2 2 1
## [106] 1 2 1 2 1 2 2 2 2 2 1 2 1 2 2 2 2 1 1 2 2 2 1 2 1 1 2 1 2 1 1 2 1 1 1
## [141] 2 2 1 1 2 1 1 2 2 1 1 1 2 2 1 2 1 1 1 1 1 1 1 2 2 1 1 1 2 1 2 1 2 2 2
## [176] 2 2 2 1 1 2 1 2 2 1 1 2 2 1 2 2 2 2 2 2 1 2 1 1 2 1 1 2 2 1 2 2 1 1 1
## [211] 1 1 1 1 1 1 1 2 1 1 1 2 2 2 1 1 1 2 1 2 1 1 1 1 2 2 2 1 2 1 1 1 2 1 1
## [246] 2 2 2 1 1 2 2 1 2 1 1 2 1 1 2 1 2 2 2 1 1 2 2 2 1 1 1 1 1 2 2 1 1 2 2
## [281] 1 1 2 1 1 1 2 1 2 2 2 1 1 2 1 1 2 2 1 2 1 1 2 1 2 1 1 1 1 2 1 2 2 2 1
## [316] 1 1 2 2 2 2 2 1 2 2 1 1 2 1 1 2 1 2 2 2 1 2 1 1 2 1 2 1 2 2 1 2 2 1 1
## [351] 2 2 2 2 1 1 2 1 2 2 2 2 1 2 1 1 2 2 1 2 2 1 2 1 2 1 1 2 1 2 2 1 1 2 2
## [386] 2 1 1 2 2 1 1 2 2 1 1 2 2 2 2 2 2 1 2 1 2 2 1 1 1 2 1 2 2 2 2 2 2 2 1
## [421] 1 2 2 2 1 1 2 2 2 2 2 1 1 1 1 1 2 1 1 2 2 1 1 1 2 2 1 1 2 2 1 2 2 1 1
## [456] 2 1 2 2 1 1 2 2 1 1 1 1 1 2 1 2 2 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 2 2
## [491] 1 1 2 1 1 2 1 2 2 2
plot(x, col = cl)
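The label vector above contains two large clusters plus two tiny ones (labels 3 and 4 each occur only once). Tabulating the cluster sizes makes this easy to see:
# cluster sizes
table(cl)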
The similarity between two points is defined as the number of nearest neighbors the two points share.
JP_sim <- function(x, k) {
  nn <- kNN(x, k, sort = FALSE)$id
  n <- nrow(nn)
  s <- matrix(0L, nrow = n, ncol = n)
  for(i in 1:n) {
    # check all neighbors of i
    for(j in nn[i, ]) {
      if(j < i) next
      # only mutual neighbors get a non-zero similarity
      if(i %in% nn[j, ])
        s[i, j] <- s[j, i] <- length(intersect(nn[i, ], nn[j, ])) + 1L
    }
  }
  s
}
k <- 10
s <- JP_sim(x, k = k)
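A quick sanity check on the similarity matrix (see the note below on why k is the upper bound):
# the largest possible similarity between any pair is k
max(s)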
Convert the similarities to a distance matrix and then perform single-link hierarchical clustering. Note that the maximum similarity is k!
d <- as.dist(k-s)
cl <- hclust(d, method = "single")
plot(cl, cex=.8)
Try cutting the dendrogram into different numbers of clusters. Finding the right number might be tricky; inspecting the cluster sizes (see the sketch after the plots) can help.
plot(x, col = cutree(cl, k = 2))
plot(x, col = cutree(cl, k = 3))
plot(x, col = cutree(cl, k = 4))
plot(x, col = cutree(cl, k = 5))
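A small sketch for comparing the cluster sizes across different cuts (kk is just a loop variable); very small clusters often indicate outliers that were split off:
# cluster sizes for cuts into 2 to 5 clusters
for(kk in 2:5) print(table(cutree(cl, k = kk)))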
The weighted similarity is \(\mathrm{str}(i,j) = \sum (k+1-m)(k+1-n)\), where the sum is taken over all shared neighbors with \(i_m = j_n\), and \(m\) and \(n\) are the positions of the shared neighbor in \(i\)'s and \(j\)'s nearest-neighbor lists. Note that the first entry (position 0) of each neighbor list is the point itself!
JPW_sim <- function(x, k) {
  nn <- kNN(x, k, sort = FALSE)$id
  n <- nrow(nn)
  s <- matrix(0L, nrow = n, ncol = n)
  for(i in 1:n) {
    # check all neighbors of i
    for(j in nn[i, ]) {
      if(j < i) next
      if(i %in% nn[j, ]) {
        # positions of i's augmented list (point itself first) in j's
        # augmented list; NA for neighbors that are not shared
        m <- match(c(i, nn[i, ]), c(j, nn[j, ]))
        # note: the position index starts with 0 in the paper
        s[i, j] <- s[j, i] <- sum((k + 1L - (0:k)) * (k + 1L - m - 1L), na.rm = TRUE)
      }
    }
  }
  s
}
s <- JPW_sim(x, k = 10)
Convert to a distance matrix and then perform single-link hierarchical clustering. The maximum similarity is roughly:
m <- sum((k+1-(1:(k+1)))^2)
m
## [1] 385
d <- as.dist(m-s)
cl <- hclust(d, method = "single")
plot(cl, cex=.8)
plot(x, col = cutree(cl, k = 2))
plot(x, col = cutree(cl, k = 3))
plot(x, col = cutree(cl, k = 4))
plot(x, col = cutree(cl, k = 5))
Note: The weighted similarity does not seem to work as well on this data set.
Loops are slow in interpreted languages like R! We can speed up the algorithm considerably by reimplementing the merge step in C++ with Rcpp.
library("Rcpp")
library("inline")
##
## Attaching package: 'inline'
## The following object is masked from 'package:Rcpp':
##
## registerPlugin
C++ code for Step 2 (merging the cluster labels of qualifying point pairs):
cppFunction('
IntegerVector JP_C(IntegerMatrix nn, unsigned int kt) {
  int n = nn.nrow();

  // create label vector
  std::vector<int> label(n);
  std::iota(std::begin(label), std::end(label), 1); // fill with 1, 2, ..., n

  // create sorted sets so we can use set operations
  std::vector< std::set<int> > nn_set(nn.nrow());
  IntegerVector r;
  std::vector<int> s;
  for(int i = 0; i < n; ++i) {
    r = nn(i, _);
    s = Rcpp::as<std::vector<int> >(r);
    nn_set[i].insert(s.begin(), s.end());
  }

  std::vector<int> z;
  int newlabel, oldlabel;
  std::set<int>::iterator it;
  int j;
  for(int i = 0; i < n; ++i) {
    // check all neighbors of i
    for(it = nn_set[i].begin(); it != nn_set[i].end(); ++it) {
      j = *it - 1;                        // convert id to 0-based index
      if(j < i) continue;                 // we already checked this edge
      if(label[i] == label[j]) continue;  // already in the same cluster
      // only merge mutual neighbors
      if(nn_set[j].find(i + 1) != nn_set[j].end()) {
        z.clear();
        std::set_intersection(nn_set[i].begin(), nn_set[i].end(),
          nn_set[j].begin(), nn_set[j].end(),
          std::back_inserter(z));
        if(z.size() + 1 >= kt) {
          // update labels; this could be done faster with a
          // union-find (disjoint-set) structure
          if(label[i] > label[j]) {
            newlabel = label[j]; oldlabel = label[i];
          } else {
            newlabel = label[i]; oldlabel = label[j];
          }
          for(int k = 0; k < n; ++k) {
            if(label[k] == oldlabel) label[k] = newlabel;
          }
        }
      }
    }
  }
  return wrap(label);
}')
R does Steps 1 and 3.
JP <- function(x, k, kt) {
  # Step 1: find the k nearest neighbors of each point
  nn <- kNN(x, k, sort = FALSE)$id
  # Step 2: merge cluster labels in C++
  cl <- JP_C(nn, kt = as.integer(kt))
  # Step 3: create contiguous labels
  as.integer(factor(cl))
}
cl <- JP(x, k = 10, kt = 6)
cl
## [1] 1 2 1 2 1 1 1 2 1 1 2 1 2 2 2 2 2 1 2 1 1 1 1 1 1 2 1 1 1 2 2 1 1 1 2
## [36] 2 1 2 2 1 1 2 2 2 2 1 1 2 2 1 2 2 2 1 2 2 2 2 2 2 1 2 1 1 1 1 1 2 2 1
## [71] 2 1 1 2 2 1 1 1 2 1 1 3 1 1 2 2 2 4 2 2 1 2 1 1 2 1 2 1 1 2 2 1 2 2 1
## [106] 1 2 1 2 1 2 2 2 2 2 1 2 1 2 2 2 2 1 1 2 2 2 1 2 1 1 2 1 2 1 1 2 1 1 1
## [141] 2 2 1 1 2 1 1 2 2 1 1 1 2 2 1 2 1 1 1 1 1 1 1 2 2 1 1 1 2 1 2 1 2 2 2
## [176] 2 2 2 1 1 2 1 2 2 1 1 2 2 1 2 2 2 2 2 2 1 2 1 1 2 1 1 2 2 1 2 2 1 1 1
## [211] 1 1 1 1 1 1 1 2 1 1 1 2 2 2 1 1 1 2 1 2 1 1 1 1 2 2 2 1 2 1 1 1 2 1 1
## [246] 2 2 2 1 1 2 2 1 2 1 1 2 1 1 2 1 2 2 2 1 1 2 2 2 1 1 1 1 1 2 2 1 1 2 2
## [281] 1 1 2 1 1 1 2 1 2 2 2 1 1 2 1 1 2 2 1 2 1 1 2 1 2 1 1 1 1 2 1 2 2 2 1
## [316] 1 1 2 2 2 2 2 1 2 2 1 1 2 1 1 2 1 2 2 2 1 2 1 1 2 1 2 1 2 2 1 2 2 1 1
## [351] 2 2 2 2 1 1 2 1 2 2 2 2 1 2 1 1 2 2 1 2 2 1 2 1 2 1 1 2 1 2 2 1 1 2 2
## [386] 2 1 1 2 2 1 1 2 2 1 1 2 2 2 2 2 2 1 2 1 2 2 1 1 1 2 1 2 2 2 2 2 2 2 1
## [421] 1 2 2 2 1 1 2 2 2 2 2 1 1 1 1 1 2 1 1 2 2 1 1 1 2 2 1 1 2 2 1 2 2 1 1
## [456] 2 1 2 2 1 1 2 2 1 1 1 1 1 2 1 2 2 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 2 2
## [491] 1 1 2 1 1 2 1 2 2 2
plot(x, col = cl)
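The R and the C++ implementation should produce the same clustering. A quick check via cross-tabulation (matching clusterings concentrate each row in a single cell):
# cross-tabulate the two clusterings
table(JP_R(x, k = 10, kt = 6), JP(x, k = 10, kt = 6))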
library("microbenchmark")
hc <- function(x, k) {
  d <- dist(x)
  cutree(hclust(d, method = "single"), k = k)
}

rbind(
  microbenchmark(JP_R(x, k = 10, kt = 6), times = 10),
  microbenchmark(JP(x, k = 10, kt = 6), times = 10),
  microbenchmark(kmeans(x, centers = 2), times = 10),
  microbenchmark(hc(x, k = 2), times = 10)
)
## Unit: microseconds
## expr min lq mean median
## JP_R(x, k = 10, kt = 6) 13762.330 13961.181 14779.7089 14375.088
## JP(x, k = 10, kt = 6) 1926.131 1938.962 2543.5234 2154.980
## kmeans(x, centers = 2) 223.933 256.302 323.1584 289.722
## hc(x, k = 2) 7907.122 9178.834 10251.2318 9694.385
## uq max neval cld
## 15253.657 17747.393 10 d
## 2355.752 6264.929 10 b
## 349.048 498.849 10 a
## 11534.700 12902.947 10 c
Note: k-means is fast, but does not provide a good clustering!
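For practical use, the implementation in the dbscan package mentioned at the beginning is the way to go. A sketch of the equivalent call (assuming jpclust() takes the same k and kt parameters and returns the assignment in $cluster):
# Sketch: Jarvis-Patrick clustering via the dbscan package
# (assumption: same k/kt parameterization as our JP() above)
cl_dbscan <- jpclust(x, k = 10, kt = 6)
plot(x, col = cl_dbscan$cluster)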