Warning: Don’t try this at home! We know that subsequence clustering with sliding windows and Euclidean distances does not work!

Kreogh and Lin, Clustering of time-series subsequences is meaningless: implications for previous and future research, KAIS, 2005.

Get some data from Yahoo finance (package quantmod)

library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
getSymbols("AAPL")
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
## 
## This message is shown once per session and may be disabled by setting 
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## 
## WARNING: There have been significant changes to Yahoo Finance data.
## Please see the Warning section of '?getSymbols.yahoo' for details.
## 
## This message is shown once per session and may be disabled by setting
## options("getSymbols.yahoo.warning"=FALSE).
## [1] "AAPL"
head(AAPL)
##            AAPL.Open AAPL.High AAPL.Low AAPL.Close AAPL.Volume
## 2007-01-03  12.32714  12.36857 11.70000   11.97143   309579900
## 2007-01-04  12.00714  12.27857 11.97429   12.23714   211815100
## 2007-01-05  12.25286  12.31428 12.05714   12.15000   208685400
## 2007-01-08  12.28000  12.36143 12.18286   12.21000   199276700
## 2007-01-09  12.35000  13.28286 12.16429   13.22429   837324600
## 2007-01-10  13.53571  13.97143 13.35000   13.85714   738220000
##            AAPL.Adjusted
## 2007-01-03      7.982585
## 2007-01-04      8.159763
## 2007-01-05      8.101658
## 2007-01-08      8.141665
## 2007-01-09      8.817995
## 2007-01-10      9.239983
nrow(AAPL)
## [1] 3059

Pick close price and aggregate by week

apple <- AAPL[, 4]
#apple <- apply.weekly(apple, max)
plot(apple, ylab = "Close [USD]")

Compute percentage changes

apple.diff <- (apple - lag(apple, k = 1))/lag(apple, k = 1)[-1]
plot(apple.diff)

Prepare for clustering

(sequences of 12 weeks, we embed the sequence in NAs)

window.size <- 12
apple.diff <- c(rep(0, window.size-1), apple.diff, rep(0, window.size))
apple.window <- t(sapply(1:(length(apple.diff)-window.size), FUN=function(i)
  window(as.numeric(apple.diff), i, i+window.size-1)))
dim(apple.window)
## [1] 3069   12

apple.window is a matrix where each row contains the values a window

Look at the sliding window

#for (i in 1:10) {
#    plot(apple.window[i,], type="l", main=paste("Window:", i))
#    Sys.sleep(1)
#}

plot(apple.window[10,], type="l", main=paste("3 consecutive Windows"))
for (i in 11:13) {
  lines(apple.window[i,], lty=i)
}

Sum over all windows is a horizontal line!

plot(colSums(apple.window, na.rm = TRUE), type = "l")

Sum of cluster with one cluster

cl <- kmeans(apple.window, centers=1)
plot(colSums(cl$centers), type = "l")

Three clusters (look at the oscillations in the results)

cl <- kmeans(apple.window, centers=3, nstart = 5)

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in 1:nrow(cl$centers)) {
  plot(cl$centers[i,], type="l")
 # for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
 #  lines(cl$centers[i,], lwd=2)
}
par(old.par)

Sums up to a straight line

plot(round(colSums(cl$centers*cl$size)), type = "l")

Cluster into 50 patterns

cl <- kmeans(apple.window, centers=50, iter.max=20, nstart = 1)
table(cl$cluster)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##  12  65  54  44 117   7 213  94  56  42   5   6  42 139  69  58 124  69 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
## 100   5   4  33   7  46  45  83  67  51 109  91   4   5  30  25  20 131 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50 
##  52  62  52  95 281  40  61  71   3 120  56  57   6  41

Inspect the last 100 days and the patterns

plot(tail(cl$cluster, 100), type="s", ylab="cluster id", main ="Trivial Matches?")

Get the upper limit of the number of trivial matches (horizontal lines)

plot(diff(cl$cluster)==0, type = "s", ylab = "Trivial matches")

sum(diff(cl$cluster)==0)
## [1] 151
sum(diff(cl$cluster)==0)/length(cl$cluster)
## [1] 0.04920169

Not too many trivial matches since the data is not very smooth!

Shows cluster centers

strongest.patters <- head(order(table(cl$cluster), decreasing=TRUE), 10)
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) plot(cl$centers[i,], type="l")

par(old.par)

Clusters still add up to a horizontal line

plot(round(colSums(cl$centers*cl$size)), type = "l")

Show the subsequences assigned to each pattern

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) {
  plot(cl$centers[i,], type="l", ylim=c(-.03,.03))
  for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
  lines(cl$centers[i,], lwd=2)
}

par(old.par)

WARNING: The above type of clustering is known to be problematic because of trivial matches introduced by the sliding window!

How can we avoid the problem?

Randomly sample subsequences. Kreogh and Lin call this whole sequence clustering since each subsequence can now be seen as a whole (different) sequence.

Prepare for clustering (sequences of 12 days)

window.size <- 12
i <- sample(length(apple.diff)-window.size, 1000, replace=TRUE)
plot(i, rep(1, length(i)), type="h")

apple.window <- t(sapply(i, FUN=function(i)
  as.numeric(apple.diff[i:(i+window.size)])))

Sum over all windows is now not a straight line!

plot(round(colSums(apple.window, na.rm = TRUE)), type = "l")

Sum of cluster with one cluster

cl <- kmeans(apple.window, centers=1)
plot(colSums(cl$centers), type = "l")

Three clusters

cl <- kmeans(apple.window, centers=3, nstart = 5)

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in 1:nrow(cl$centers)) {
  plot(cl$centers[i,], type="l")
  # for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
  #  lines(cl$centers[i,], lwd=2)
}
par(old.par)

Sum over clusters is the same as sum over windows

plot(round(colSums(cl$centers*cl$size)), type = "l")

FIXME: we should probably correct for holidays so the patterns all start on Monday!

Cluster into 50 patterns

cl <- kmeans(na.omit(apple.window), centers=50, iter.max=20)
table(cl$cluster)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   3   2  17  31  45  10   8  10   1  33   2  41  17  24  17   6  30   1 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##  44   2   7  10  14   7   2  25  10   1   6  36   9   9  81  12  40  23 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50 
##  13  11   5  20  38  20   2  25  23  21  17  24  23 122
strongest.patters <- head(order(table(cl$cluster), decreasing=TRUE),10)

Shows cluster centers

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) plot(cl$centers[i,], type="l")

par(old.par)

See what data was clustered in the strongest clusters

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) {
  plot(cl$centers[i,], type="l", ylim=c(-.03,.03))
  for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
  lines(cl$centers[i,], lwd=2)
}

par(old.par)

Note: We are still clustering data that looks like a random walk, so do this at your own risk!