Warning: Don’t try this at home! We know that subsequence clustering with sliding windows and Euclidean distances does not work!

Kreogh and Lin, Clustering of time-series subsequences is meaningless: implications for previous and future research, KAIS, 2005.

Get APPL data

Data from Yahoo finance retrieved with package quantmod

library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
getSymbols("AAPL")
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
## 
## This message is shown once per session and may be disabled by setting 
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## [1] "AAPL"
head(AAPL)
##            AAPL.Open AAPL.High AAPL.Low AAPL.Close AAPL.Volume AAPL.Adjusted
## 2007-01-03  3.081786  3.092143 2.925000   2.992857  1238319600      2.569716
## 2007-01-04  3.001786  3.069643 2.993571   3.059286   847260400      2.626753
## 2007-01-05  3.063214  3.078571 3.014286   3.037500   834741600      2.608047
## 2007-01-08  3.070000  3.090357 3.045714   3.052500   797106800      2.620926
## 2007-01-09  3.087500  3.320714 3.041071   3.306071  3349298400      2.838647
## 2007-01-10  3.383929  3.492857 3.337500   3.464286  2952880000      2.974493
nrow(AAPL)
## [1] 3714

Pick the closing price and aggregate by week.

apple <- AAPL[, 4]
#apple <- apply.weekly(apple, max)
plot(apple, ylab = "Close [USD]")

Compute percentage changes.

apple.diff <- (apple - lag(apple, k = 1))/lag(apple, k = 1)[-1]
plot(apple.diff)

Prepare for clustering

(sequences of 12 weeks, we embed the sequence in NAs)

window.size <- 12
apple.diff <- c(rep(0, window.size-1), apple.diff, rep(0, window.size))
apple.window <- t(sapply(1:(length(apple.diff)-window.size), FUN=function(i)
  window(as.numeric(apple.diff), i, i+window.size-1)))
dim(apple.window)
## [1] 3724   12

apple.window is a matrix where each row contains the values a window

Look at the sliding window

#for (i in 1:10) {
#    plot(apple.window[i,], type="l", main=paste("Window:", i))
#    Sys.sleep(1)
#}

plot(apple.window[10,], type="l", main=paste("3 consecutive Windows"))
for (i in 11:13) {
  lines(apple.window[i,], lty=i)
}

Sum over all windows is a horizontal line!

plot(colSums(apple.window, na.rm = TRUE), type = "l")

Sum of cluster with one cluster

cl <- kmeans(apple.window, centers=1)
plot(colSums(cl$centers), type = "l")

Three clusters (look at the oscillations in the results)

cl <- kmeans(apple.window, centers=3, nstart = 5)

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in 1:nrow(cl$centers)) {
  plot(cl$centers[i,], type="l")
 # for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
 #  lines(cl$centers[i,], lwd=2)
}
par(old.par)

Sums up to a straight line

plot(round(colSums(cl$centers*cl$size)), type = "l")

Cluster into 50 patterns

cl <- kmeans(apple.window, centers=50, iter.max=20, nstart = 1)
table(cl$cluster)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##  37  53  80 169 101  44 131  67   8   3 175  36  73 210  31  94 245 119  34  16 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##  57  20  28  59   6  15  89  83  38   8   7  45 114   5  28  80 112   6  29 105 
##  41  42  43  44  45  46  47  48  49  50 
##  58 122 214 190  50  83 177  73  10  87

Inspect the last 100 days and the patterns

plot(tail(cl$cluster, 100), type="s", ylab="cluster id", main ="Trivial Matches?")

Get the upper limit of the number of trivial matches (horizontal lines)

plot(diff(cl$cluster)==0, type = "s", ylab = "Trivial matches")

sum(diff(cl$cluster)==0)
## [1] 189
sum(diff(cl$cluster)==0)/length(cl$cluster)
## [1] 0.05075188

Not too many trivial matches since the data is not very smooth!

Shows cluster centers

strongest.patters <- head(order(table(cl$cluster), decreasing=TRUE), 10)
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) plot(cl$centers[i,], type="l")

par(old.par)

Clusters still add up to a horizontal line

plot(round(colSums(cl$centers*cl$size)), type = "l")

Show the subsequences assigned to each pattern

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) {
  plot(cl$centers[i,], type="l", ylim=c(-.03,.03))
  for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
  lines(cl$centers[i,], lwd=2)
}

par(old.par)

WARNING: The above type of clustering is known to be problematic because of trivial matches introduced by the sliding window!

How can we avoid the problem?

Randomly sample subsequences. Kreogh and Lin call this whole sequence clustering since each subsequence can now be seen as a whole (different) sequence.

Prepare for clustering (sequences of 12 days)

window.size <- 12
i <- sample(length(apple.diff)-window.size, 1000, replace=TRUE)
plot(i, rep(1, length(i)), type="h")

apple.window <- t(sapply(i, FUN=function(i)
  as.numeric(apple.diff[i:(i+window.size)])))

Sum over all windows is now not a straight line!

plot(round(colSums(apple.window, na.rm = TRUE)), type = "l")

Sum of cluster with one cluster

cl <- kmeans(apple.window, centers=1)
plot(colSums(cl$centers), type = "l")

Three clusters

cl <- kmeans(apple.window, centers=3, nstart = 5)

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in 1:nrow(cl$centers)) {
  plot(cl$centers[i,], type="l")
  # for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
  #  lines(cl$centers[i,], lwd=2)
}
par(old.par)

Sum over clusters is the same as sum over windows

plot(round(colSums(cl$centers*cl$size)), type = "l")

FIXME: we should probably correct for holidays so the patterns all start on Monday!

Cluster into 50 patterns

cl <- kmeans(na.omit(apple.window), centers=50, iter.max=20)
table(cl$cluster)
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  8 11  5 18 49  2  1  9 24 69 20 15 11  1 20 15 38 54  5  8 53 27 28  1  5 15 
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
## 42 20 27 39  2  5 32  9  4  2 12 18 20  1 38  3 12 47 23 21 12  3 93  3
strongest.patters <- head(order(table(cl$cluster), decreasing=TRUE),10)

Shows cluster centers

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) plot(cl$centers[i,], type="l")

par(old.par)

See what data was clustered in the strongest clusters

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) {
  plot(cl$centers[i,], type="l", ylim=c(-.03,.03))
  for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
  lines(cl$centers[i,], lwd=2)
}

par(old.par)

Note: We are still clustering data that looks like a random walk, so do this at your own risk!