The following code implements and illustrates some of the concepts of the paper:
R. A. Jarvis and E. A. Patrick. 1973. Clustering Using a Similarity Measure Based on Shared Near Neighbors. IEEE Trans. Comput. 22, 11 (November 1973), 1025-1034. DOI: http://dx.doi.org/10.1109/T-C.1973.223640
Note: The R package dbscan provides fast implementations of these algorithms as jpclust() and sNNclust().
# Load packages: dbscan provides the fast kNN search, mlbench the example data.
library("dbscan") # for kNN search
library("mlbench") # for data
# Fix the RNG seed so the generated data set (and all results below) are reproducible.
set.seed(2015)
# Two intertwined spirals (500 points, sd = 0.05 noise): non-convex clusters
# on which centroid-based methods like k-means fail.
Spirals <- mlbench.spirals(500, 1, 0.05)
plot(Spirals)
x <- Spirals$x
# Alternative data set (two Gaussians of different size/spread) -- kept for experiments:
# x <- rbind(
# matrix(rnorm(n = 2*100, mean = -1, sd = .2), ncol = 2),
# matrix(rnorm(n = 2*200, mean = 1, sd = 1), ncol = 2)
#)
head(x)
## [,1] [,2]
## [1,] -0.28846459 -0.1742258
## [2,] -0.09467295 0.9018287
## [3,] 0.55489363 -0.2318116
## [4,] 0.24941206 -0.4120516
## [5,] 0.45362583 0.6094411
## [6,] -0.96096853 0.2759620
plot(x)
Most clustering algorithms don’t do well here.
# Baseline 1: k-means cannot separate the spirals (clusters are not convex).
cl <- kmeans(x, centers = 2)
plot(x, col = cl$cluster)
# Baseline 2: single-link hierarchical clustering on Euclidean distances.
d <- dist(x)
cl <- cutree(hclust(d, method = "single"), k = 2)
plot(x, col = cl)
# Find the 10 nearest neighbors of each point.
# NOTE(review): as.matrix(dist(x)) is a plain 500 x 500 matrix, so kNN()
# treats each ROW as a point in 500-dimensional space instead of using it as
# precomputed distances; kNN(x, k = 10) or kNN(dist(x), k = 10) is probably
# what was intended -- confirm against the dbscan kNN() documentation.
d <- as.matrix(dist(x))
nn <- kNN(d, k = 10)
nn
## k-nearest neighbors for 500 objects (k=10).
## Available fields: dist, id, k, sort
str(nn)
## List of 4
## $ dist: num [1:500, 1:10] 0.331 0.283 0.516 0.795 0.452 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:500] "1" "2" "3" "4" ...
## .. ..$ : chr [1:10] "1" "2" "3" "4" ...
## $ id : int [1:500, 1:10] 76 271 321 161 167 307 9 173 10 9 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:500] "1" "2" "3" "4" ...
## .. ..$ : chr [1:10] "1" "2" "3" "4" ...
## $ k : int 10
## $ sort: logi TRUE
## - attr(*, "class")= chr [1:2] "kNN" "NN"
head(nn$id)
## 1 2 3 4 5 6 7 8 9 10
## 1 76 35 222 78 290 63 456 324 412 14
## 2 271 163 26 160 347 15 235 48 102 495
## 3 321 294 59 159 158 493 465 34 73 263
## 4 161 391 366 309 54 438 275 120 498 484
## 5 167 379 311 53 180 358 105 8 444 483
## 6 307 297 331 318 55 38 101 369 326 291
plot_nn <- function(x, nn, ...) {
  # Plot the points in x and draw a line segment from each point to each of
  # its k nearest neighbors, visualizing the kNN graph.
  #
  # x  ... numeric matrix of 2D coordinates (one row per point)
  # nn ... an object with an $id matrix of 1-based neighbor row indices
  #        (e.g., as returned by dbscan::kNN)
  # ... further arguments are passed on to plot()
  plot(x, ...)
  ids <- nn$id   # hoist: avoid re-extracting nn$id inside the double loop
  # seq_len() instead of 1:nrow()/1:length() so zero rows/columns are handled
  for (i in seq_len(nrow(ids))) {
    for (j in seq_len(ncol(ids))) {
      nb <- ids[i, j]
      lines(x = c(x[i, 1], x[nb, 1]),
            y = c(x[i, 2], x[nb, 2]))
    }
  }
}
plot_nn(x, nn, col = "grey")
Parameters: the neighborhood size k and the threshold kt on the number of shared neighbors.
Algorithm (pure R implementation):
JP_R <- function(x, k, kt) {
  # Jarvis-Patrick clustering (Jarvis & Patrick, IEEE TC 1973), pure R.
  #
  # Two points end up in the same cluster if they are in each other's
  # k-nearest-neighbor lists AND share at least kt near neighbors
  # (each point counts itself as one of the shared neighbors).
  #
  # x  ... data matrix (one row per object)
  # k  ... neighborhood size for the kNN lists
  # kt ... minimum number of shared neighbors required to merge
  #
  # Returns an integer vector of contiguous cluster labels (1, 2, ...).

  # Step 1: k-nearest-neighbor ids (order within a row does not matter here)
  n <- nrow(x)
  nn <- kNN(x, k, sort = FALSE)$id

  # Step 2: every point starts in its own cluster
  # (seq_len() rather than 1:n guards against the n == 0 degenerate case)
  labels <- seq_len(n)

  # Step 3: merge clusters of mutual neighbors with enough shared neighbors
  for (i in seq_len(n)) {
    # check all neighbors of i
    for (j in nn[i, ]) {
      if (j < i)
        next ### this edge was already considered from the other endpoint
      if (labels[i] == labels[j])
        next ### already in the same cluster
      # mutual-neighbor test plus shared-neighbor count
      # (+ 1L because each point counts itself, see the paper)
      if (i %in% nn[j, ] &&
          length(intersect(nn[i, ], nn[j, ])) + 1L >= kt) {
        # merge: relabel the larger label to the smaller one
        labels[labels == max(labels[i], labels[j])] <-
          min(labels[i], labels[j])
      }
    }
  }

  # Step 4: create contiguous labels
  as.integer(factor(labels))
}
cl <- JP_R(x, k = 10, kt = 6)
cl
## [1] 1 1 1 1 1 1 2 1 2 2 1 1 2 1 1 1 2 2 1 2 1 2 2 2 2 1 2 1 2 1 2 1 2 1 1 2 2
## [38] 1 1 1 2 2 1 1 2 2 2 1 1 2 2 2 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 2 1 1 1 1
## [75] 2 1 2 1 2 2 2 1 1 2 2 2 2 1 1 1 2 1 1 2 2 2 1 1 1 2 1 1 2 2 1 1 1 2 1 1 1
## [112] 1 2 1 2 2 2 1 1 1 2 1 1 2 2 2 1 2 2 1 1 2 2 2 2 1 2 2 1 2 2 2 2 2 2 1 2 1
## [149] 2 1 1 1 1 1 1 1 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 2 1 1 1 2 2 1 2 2 1 2 1
## [186] 1 2 2 1 1 1 2 1 2 1 2 2 3 1 2 2 1 2 2 2 2 1 2 2 2 2 1 4 4 1 4 1 2 2 2 1 1
## [223] 1 2 2 2 2 2 1 2 2 2 2 2 1 1 2 2 1 1 2 2 1 1 2 2 2 1 2 1 2 2 2 2 1 1 1 1 2
## [260] 1 1 1 1 2 2 1 2 1 2 1 1 1 1 1 1 1 1 2 1 2 2 1 2 2 1 1 1 2 1 1 1 1 2 1 2 1
## [297] 1 2 1 2 2 2 2 1 1 2 1 2 1 1 1 2 2 2 2 2 2 1 2 2 1 1 1 1 2 1 2 2 1 2 1 2 1
## [334] 2 1 1 2 1 2 1 1 1 2 2 2 2 1 2 2 1 2 1 1 1 1 1 2 1 1 1 2 2 1 2 2 1 1 1 1 2
## [371] 1 2 2 2 2 2 2 1 1 2 1 2 1 1 1 2 2 1 1 2 1 1 1 2 1 1 2 1 2 2 1 1 2 2 2 2 2
## [408] 2 2 2 2 1 2 2 1 2 2 1 1 1 2 1 2 1 1 1 2 2 2 1 1 1 1 2 2 1 2 1 2 1 2 2 1 1
## [445] 1 2 2 1 1 1 2 2 2 1 2 1 2 2 2 2 2 2 2 1 1 2 2 2 1 2 2 1 1 2 2 1 2 2 1 2 2
## [482] 1 1 1 2 1 2 1 2 1 1 2 1 2 1 1 1 1 2 1
plot(x, col = cl)
Aka the number of shared nearest neighbors.
JP_sim <- function(x, k) {
  # Shared-nearest-neighbor (SNN) similarity matrix.
  #
  # s[i, j] is the number of neighbors points i and j share (each point
  # counts itself), but only if i and j are in each other's kNN lists;
  # otherwise s[i, j] stays 0. The matrix is symmetric.
  #
  # x ... data matrix (one row per object)
  # k ... neighborhood size
  #
  # Returns an n x n integer matrix (maximum possible entry is k + 1... the
  # mutual-neighbor requirement caps it at k in practice).
  nn <- kNN(x, k, sort = FALSE)$id
  n <- nrow(nn)
  s <- matrix(0L, nrow = n, ncol = n)
  for (i in seq_len(n)) {
    # check all neighbors of i
    for (j in nn[i, ]) {
      if (j < i)
        next  # handled from the other endpoint (or fails the mutual test anyway)
      if (i %in% nn[j, ])
        s[i, j] <- s[j, i] <- length(intersect(nn[i, ], nn[j, ])) + 1L
    }
  }
  s
}
k <- 10
s <- JP_sim(x, k = k)
library("seriation")
pimage(s)
Convert the similarity matrix to a distance matrix and then do single-link hierarchical clustering. Note that the maximum similarity is k.
d <- as.dist(k - s)
cl <- hclust(d, method = "single")
plot(cl, cex = .8)
Try to cut the dendrogram into a different number of components. Finding the right number might be tricky.
plot(x, col = cutree(cl, k = 2))
plot(x, col = cutree(cl, k = 3))
plot(x, col = cutree(cl, k = 4))
plot(x, col = cutree(cl, k = 5))
\(str(i,j) = \sum_{i_m = j_n} (k+1-m)(k+1-n)\), where \(m\) and \(n\) are the positions of a shared neighbor in \(i\)'s and \(j\)'s nearest-neighbor lists. Note that the 1st entry of each neighbor list is the point itself!
JPW_sim <- function(x, k) {
  # Weighted SNN similarity (Jarvis & Patrick 1973): each shared near
  # neighbor contributes a weight that is larger when it appears near the
  # front of BOTH points' neighbor lists, so close shared neighbors count more.
  #
  # x ... data matrix (one row per object)
  # k ... neighborhood size
  #
  # Returns an n x n integer similarity matrix (symmetric).
  #
  # Fix: the weights depend on each neighbor's RANK in the list, so the
  # lists must be sorted by distance. The original used sort = FALSE, which
  # does not guarantee distance order and would make the ranks meaningless.
  nn <- kNN(x, k, sort = TRUE)$id
  n <- nrow(nn)
  s <- matrix(0L, nrow = n, ncol = n)
  for (i in seq_len(n)) {
    # check all neighbors of i
    for (j in nn[i, ]) {
      if (j < i)
        next  # handled from the other endpoint (or fails the mutual test anyway)
      if (i %in% nn[j, ]) {
        # positions of i's list entries (the point itself + its k neighbors)
        # within j's list; NA where an entry is not shared
        m <- match(c(i, nn[i, ]), c(j, nn[j, ]))
        # 0:k are the 0-based positions on i's side; match() is 1-based,
        # hence the "- 1L" on the j side (the paper indexes from 0)
        # NOTE(review): the i side weights are k+1-(0:k) but the j side uses
        # k+1-m-1 with 1-based m (i.e., k+1-(m+1) after 0-basing) -- this
        # asymmetry looks like an off-by-one; verify against Eq. in the paper.
        s[i, j] <-
          s[j, i] <- sum((k + 1L - (0:k)) * (k + 1L - m - 1L), na.rm = TRUE)
        # note index starts with 0 in paper
      }
    }
  }
  s
}
s <- JPW_sim(x, k = 10)
pimage(s)
Convert to a distance matrix and then do single-link hierarchical clustering. The maximum possible similarity is roughly
m <- sum((k + 1 - (1:(k + 1))) ^ 2)
m
## [1] 385
d <- as.dist(m - s)
cl <- hclust(d, method = "single")
plot(cl, cex = .8)
plot(x, col = cutree(cl, k = 2))
plot(x, col = cutree(cl, k = 3))
plot(x, col = cutree(cl, k = 4))
plot(x, col = cutree(cl, k = 5))
Note: Does not seem to work so well on this data set.
Loops are slow in interpreted languages like R!
library("Rcpp")
library("inline")
##
## Attaching package: 'inline'
## The following object is masked from 'package:Rcpp':
##
## registerPlugin
C++ code for Step 2
# Compile the label-propagation step (Step 2 of Jarvis-Patrick) as the C++
# function JP_step2_cpp(nn, kt) via Rcpp. nn is the integer matrix of 1-based
# k-nearest-neighbor ids (one row per point); kt is the shared-neighbor
# threshold. It returns an integer label vector (labels not yet contiguous;
# Step 3 below makes them so). The neighbor rows are converted to std::set
# so the shared-neighbor count can use std::set_intersection.
cppFunction(
'
IntegerVector JP_step2_cpp(IntegerMatrix nn, unsigned int kt) {
int n = nn.nrow();
// create label vector
std::vector<int> label(n);
std::iota(std::begin(label), std::end(label), 1); // Fill with 1, 2, ..., n.
// create sorted sets so we can use set operations
std::vector< std::set<int> > nn_set(nn.nrow());
IntegerVector r;
std::vector<int> s;
for(int i = 0; i < n; ++i) {
r = nn(i,_);
s = Rcpp::as<std::vector<int> >(r);
nn_set[i].insert(s.begin(), s.end());
}
std::vector<int> z;
int newlabel, oldlabel;
std::set<int>::iterator it;
int j;
for(int i = 0; i < n; ++i) {
// check all neighbors of i
for (it = nn_set[i].begin(); it != nn_set[i].end(); ++it) {
j = *it-1;
if(j<i) continue;
if(label[i] == label[j]) continue;
if(nn_set[j].find(i+1) != nn_set[j].end()) {
z.clear();
std::set_intersection(nn_set[i].begin(), nn_set[i].end(),
nn_set[j].begin(), nn_set[j].end(),
std::back_inserter(z));
// this could be done faster with set unions
if(z.size()+1 >= kt) {
// update labels
if(label[i] > label[j]) {
newlabel = label[j]; oldlabel = label[i];
}else{
newlabel = label[i]; oldlabel = label[j];
}
for(int k = 0; k < n; ++k) {
if(label[k] == oldlabel) label[k] = newlabel;
}
}
}
}
}
return wrap(label);
}'
)
R does Steps 1 and 3.
JP_CPP <- function(x, k, kt) {
  # Jarvis-Patrick clustering with the merge step done in compiled C++.
  #
  # x  ... data matrix (one row per object)
  # k  ... neighborhood size
  # kt ... shared-neighbor threshold
  #
  # Returns an integer vector of contiguous cluster labels.

  # Step 1: k-nearest-neighbor ids (unsorted is fine for set operations)
  neighbor_ids <- kNN(x, k, sort = FALSE)$id
  # Step 2: merge mutually-near points sharing >= kt neighbors (C++)
  raw_labels <- JP_step2_cpp(neighbor_ids, kt = as.integer(kt))
  # Step 3: map the surviving labels onto 1, 2, ...
  as.integer(factor(raw_labels))
}
cl <- JP_CPP(x, k = 10, kt = 6)
cl
## [1] 1 1 1 1 1 1 2 1 2 2 1 1 2 1 1 1 2 2 1 2 1 2 2 2 2 1 2 1 2 1 2 1 2 1 1 2 2
## [38] 1 1 1 2 2 1 1 2 2 2 1 1 2 2 2 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 2 1 1 1 1
## [75] 2 1 2 1 2 2 2 1 1 2 2 2 2 1 1 1 2 1 1 2 2 2 1 1 1 2 1 1 2 2 1 1 1 2 1 1 1
## [112] 1 2 1 2 2 2 1 1 1 2 1 1 2 2 2 1 2 2 1 1 2 2 2 2 1 2 2 1 2 2 2 2 2 2 1 2 1
## [149] 2 1 1 1 1 1 1 1 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 2 1 1 1 2 2 1 2 2 1 2 1
## [186] 1 2 2 1 1 1 2 1 2 1 2 2 3 1 2 2 1 2 2 2 2 1 2 2 2 2 1 4 4 1 4 1 2 2 2 1 1
## [223] 1 2 2 2 2 2 1 2 2 2 2 2 1 1 2 2 1 1 2 2 1 1 2 2 2 1 2 1 2 2 2 2 1 1 1 1 2
## [260] 1 1 1 1 2 2 1 2 1 2 1 1 1 1 1 1 1 1 2 1 2 2 1 2 2 1 1 1 2 1 1 1 1 2 1 2 1
## [297] 1 2 1 2 2 2 2 1 1 2 1 2 1 1 1 2 2 2 2 2 2 1 2 2 1 1 1 1 2 1 2 2 1 2 1 2 1
## [334] 2 1 1 2 1 2 1 1 1 2 2 2 2 1 2 2 1 2 1 1 1 1 1 2 1 1 1 2 2 1 2 2 1 1 1 1 2
## [371] 1 2 2 2 2 2 2 1 1 2 1 2 1 1 1 2 2 1 1 2 1 1 1 2 1 1 2 1 2 2 1 1 2 2 2 2 2
## [408] 2 2 2 2 1 2 2 1 2 2 1 1 1 2 1 2 1 1 1 2 2 2 1 1 1 1 2 2 1 2 1 2 1 2 2 1 1
## [445] 1 2 2 1 1 1 2 2 2 1 2 1 2 2 2 2 2 2 2 1 1 2 2 2 1 2 2 1 1 2 2 1 2 2 1 2 2
## [482] 1 1 1 2 1 2 1 2 1 1 2 1 2 1 1 1 1 2 1
plot(x, col = cl)
library("microbenchmark")
hc <- function(x, k) {
  # Single-link hierarchical clustering on Euclidean distances,
  # cut into k groups. Returns an integer vector of cluster memberships.
  tree <- hclust(dist(x), method = "single")
  cutree(tree, k = k)
}
jpclust is the C++ implementation in package dbscan
rbind(
microbenchmark(JP_R(x, k = 10, kt = 6), times = 10),
microbenchmark(JP_CPP(x, k = 10, kt = 6), times = 10),
microbenchmark(jpclust(x, k = 10, kt = 6), times = 10),
microbenchmark(kmeans(x, centers = 2), times = 10),
microbenchmark(hc(x, k = 2), times = 10)
)
## Unit: microseconds
## expr min lq mean median uq
## JP_R(x, k = 10, kt = 6) 10922.976 10999.170 11957.2338 11697.9185 12402.986
## JP_CPP(x, k = 10, kt = 6) 2067.412 2223.690 3091.9826 2413.9955 2675.014
## jpclust(x, k = 10, kt = 6) 2068.630 2090.276 2332.0848 2241.6065 2376.902
## kmeans(x, centers = 2) 301.375 329.309 370.1431 358.1585 374.460
## hc(x, k = 2) 6185.253 6366.743 7159.8364 6802.8775 7117.759
## max neval cld
## 14990.272 10 d
## 9345.693 10 b
## 3198.576 10 b
## 560.767 10 a
## 10173.254 10 c
Note: k-means is fast, but does not provide a good clustering!