Implementation of the Cylinder-Bell-Funnel experiment in:

Keogh and Lin, Clustering of time-series subsequences is meaningless: implications for previous and future research, KAIS, 2005.

# Seed the random number generator so that the random draws made after this
# point are reproducible from run to run.
set.seed(seed = 1234)

Create data

Create the patterns used in the paper.

# Every pattern is a series of `pattern.length` points that is zero everywhere
# except for a 60-point shape placed at positions 31..90.
pattern.length <- 128
shape.index <- 31:90

# Pattern 1: a linear ramp rising from 1/6 to 10 over the shape positions.
pattern1 <- rep(0, pattern.length)
pattern1[shape.index] <- seq_along(shape.index) / 6
plot(pattern1, type = "l")

# Pattern 2: pattern 1 reversed in time.
pattern2 <- rev(pattern1)
plot(pattern2, type = "l")

# Pattern 3: a plateau of height 10 over the shape positions.
pattern3 <- rep(0, pattern.length)
pattern3[shape.index] <- 10
plot(pattern3, type = "l")

patterns <- list(pattern1, pattern2, pattern3)

# Draw 90 pattern indices at random, with replacement.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
take <- sample(length(patterns), 90, replace = TRUE)

# Stack the drawn patterns into a matrix with one series per row
# (90 rows x 128 columns). All patterns have the same length, so filling the
# matrix row by row from the flattened list reproduces each series exactly.
y <- patterns[take]
y <- matrix(unlist(y), nrow = length(take), byrow = TRUE)

Add noise.

# Add one Gaussian draw (mean 0, standard deviation 2) to every entry of y.
y <- y + rnorm(n = length(y), mean = 0, sd = 2)

# Show the first three noisy series, one plot per series.
plot(y[1, ], type = "l")
plot(y[2, ], type = "l")
plot(y[3, ], type = "l")

Whole sequence clustering

# Cluster the whole series (the rows of y) into three groups, running k-means
# from 10 random starts.
cl <- kmeans(x = y, centers = 3, nstart = 10)

Look at the centroids (black) and the assigned patterns (grey).

# Three vertically stacked panels, one per cluster. The previous graphics
# settings are kept so they can be restored once the panels are drawn.
old.par <- par(mar = c(0, 2, 0, 0), mfrow = c(3, 1))
for (i in seq_len(3)) {
  # Plotting the centroid first sets up the panel with a fixed y-range.
  plot(cl$centers[i, ], type = "l", ylim = c(-5, 20))

  # Draw every series assigned to this cluster in grey.
  members <- which(cl$cluster == i)
  for (j in members) {
    lines(y[j, ], col = "gray")
  }

  # Redraw the centroid on top of its members with a thicker line.
  lines(cl$centers[i, ], lwd = 2)
}

par(old.par)

Note: Whole sequence clustering works as expected.

Subsequence clustering

# Join the rows of y end to end into a single long series: transposing first
# makes the column-major flattening run through one row after another.
y2 <- c(t(y))

# Show the first 1000 points of the concatenated series.
plot(y2[1:1000], type = "l")

# Extract sliding windows of `window.size` consecutive points from y2, one
# window per row of y.window, with consecutive windows shifted by one point.
window.size <- 128

# Start positions run from 1 to length(y2) - window.size, so the last full
# window (the one ending on the final point of y2) is not included; this
# matches the dimensions printed below. `seq_len()` with a floor of zero
# yields no windows, instead of the descending sequence `1:0`, when y2 is not
# longer than one window.
window.starts <- seq_len(max(length(y2) - window.size, 0))

# Plain indexing replaces the time-series helper `window()`, which is not
# needed for an ordinary vector. `vapply()` guarantees that every window is a
# numeric vector of exactly `window.size` values.
y.window <- t(vapply(
  window.starts,
  FUN = function(i) y2[i:(i + window.size - 1)],
  FUN.VALUE = numeric(window.size)
))

dim(y.window)
## [1] 11392   128

Look at some consecutive windows.

# Overlay the first five windows: window 1 in black, windows 2..5 in grey.
# The y-axis is scaled to the range of all five windows; `lines()` does not
# rescale the axes, so scaling from window 1 alone could clip the others.
plot(y.window[1, ], type = "l", ylim = range(y.window[1:5, ]))
for (i in 2:5) {
  lines(y.window[i, ], type = "l", col = "grey")
}

# Cluster the sliding windows (the rows of y.window) into three groups,
# running k-means from 10 random starts.
cl <- kmeans(x = y.window, centers = 3, nstart = 10)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 569600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 569600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 569600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 569600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 569600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 569600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 569600)

Plot the three centroids together, as in the paper.

# Overlay the three centroids: 1 in black, 2 in red, 3 in blue.
# The y-axis is scaled to the range of all centroids; `lines()` does not
# rescale the axes, so scaling from centroid 1 alone could clip the others.
plot(cl$centers[1, ], type = "l", ylim = range(cl$centers))
lines(cl$centers[2, ], type = "l", col = "red")
lines(cl$centers[3, ], type = "l", col = "blue")

Look at the centroids (black) and the assigned patterns (grey).

# One vertically stacked panel per cluster. The panel count is taken from the
# fitted centroids rather than hard-coded. The previous graphics settings are
# kept so they can be restored once the panels are drawn.
n.clusters <- nrow(cl$centers)
old.par <- par(mar = c(0, 2, 0, 0), mfrow = c(n.clusters, 1))
for (i in seq_len(n.clusters)) {
  # Plotting the centroid first sets up the panel with a fixed y-range.
  plot(cl$centers[i, ], type = "l", ylim = c(-5, 20))

  # Draw a random subset of at most 50 member windows in grey. Sampling
  # positions with `sample.int()` and capping the size at the cluster size
  # avoids two problems with `sample(members, 50)`: it errors when the cluster
  # has fewer than 50 members, and when there is exactly one member `n` it
  # samples from 1:n instead of returning that member.
  members <- which(cl$cluster == i)
  shown <- members[sample.int(length(members), min(50, length(members)))]
  for (j in shown) {
    lines(y.window[j, ], col = "gray")
  }

  # Redraw the centroid on top of the sampled windows with a thicker line.
  lines(cl$centers[i, ], lwd = 2)
}

par(old.par)

Try 5 clusters instead.

# Cluster the sliding windows (the rows of y.window) again, this time into
# five groups, running k-means from 10 random starts.
cl <- kmeans(x = y.window, centers = 5, nstart = 10)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 569600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 569600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 569600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 569600)
# One vertically stacked panel per cluster. The panel count is taken from the
# fitted centroids rather than hard-coded. The previous graphics settings are
# kept so they can be restored once the panels are drawn.
n.clusters <- nrow(cl$centers)
old.par <- par(mar = c(0, 2, 0, 0), mfrow = c(n.clusters, 1))
for (i in seq_len(n.clusters)) {
  # Plotting the centroid first sets up the panel with a fixed y-range.
  plot(cl$centers[i, ], type = "l", ylim = c(-5, 20))

  # Draw a random subset of at most 50 member windows in grey. Sampling
  # positions with `sample.int()` and capping the size at the cluster size
  # avoids two problems with `sample(members, 50)`: it errors when the cluster
  # has fewer than 50 members, and when there is exactly one member `n` it
  # samples from 1:n instead of returning that member.
  members <- which(cl$cluster == i)
  shown <- members[sample.int(length(members), min(50, length(members)))]
  for (j in shown) {
    lines(y.window[j, ], col = "gray")
  }

  # Redraw the centroid on top of the sampled windows with a thicker line.
  lines(cl$centers[i, ], lwd = 2)
}

par(old.par)

Conclusion: Subsequence clustering does not work this way. The clustering just discretizes the signal into k evenly spaced positions where a mixture of all three patterns has its peak. We would need to replace the moving window with a step that identifies the beginning and the end of the patterns that we want to cluster.