Warning: Don’t trie this at home!

We know that subsequence clustering with sliding windows does not work (see: Kreogh and Lin: Clustering of time-series subsequences is meaningless: implications for previous and future research, KAIS, 2005). However, if it made sense then this is how you could do it.

Get some data from Yahoo finance (package quantmod)

library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
getSymbols("AAPL")
##     As of 0.4-0, 'getSymbols' uses env=parent.frame() and
##  auto.assign=TRUE by default.
## 
##  This  behavior  will be  phased out in 0.5-0  when the call  will
##  default to use auto.assign=FALSE. getOption("getSymbols.env") and 
##  getOptions("getSymbols.auto.assign") are now checked for alternate defaults
## 
##  This message is shown once per session and may be disabled by setting 
##  options("getSymbols.warning4.0"=FALSE). See ?getSymbols for more details.
## [1] "AAPL"
head(AAPL)
##            AAPL.Open AAPL.High AAPL.Low AAPL.Close AAPL.Volume
## 2007-01-03     86.29     86.58    81.90      83.80   309579900
## 2007-01-04     84.05     85.95    83.82      85.66   211815100
## 2007-01-05     85.77     86.20    84.40      85.05   208685400
## 2007-01-08     85.96     86.53    85.28      85.47   199276700
## 2007-01-09     86.45     92.98    85.15      92.57   837324600
## 2007-01-10     94.75     97.80    93.45      97.00   738220000
##            AAPL.Adjusted
## 2007-01-03      10.85709
## 2007-01-04      11.09807
## 2007-01-05      11.01904
## 2007-01-08      11.07345
## 2007-01-09      11.99333
## 2007-01-10      12.56728
nrow(AAPL)
## [1] 2551

Pick close price and aggregate by week

apple <- AAPL[, 4]
#apple <- apply.weekly(apple, max)
plot(apple, ylab = "Close [USD]")

Compute percentage changes

apple.diff <- (apple - lag(apple, k = 1))/lag(apple, k = 1)[-1]
plot(apple.diff)

Prepare for clustering

(sequences of 12 weeks, we embed the sequence in NAs)

window.size <- 12
apple.diff <- c(rep(0, window.size-1), apple.diff, rep(0, window.size))
apple.window <- t(sapply(1:(length(apple.diff)-window.size), FUN=function(i)
  window(as.numeric(apple.diff), i, i+window.size-1)))
dim(apple.window)
## [1] 2561   12

apple.window is a matrix where each row contains the values a window

Look at the sliding window

#for (i in 1:10) {
#    plot(apple.window[i,], type="l", main=paste("Window:", i))
#    Sys.sleep(1)
#}

plot(apple.window[10,], type="l", main=paste("3 consecutive Windows"))
for (i in 11:13) {
  lines(apple.window[i,], lty=i)
}

Sum over all windows is a line!

plot(colSums(apple.window, na.rm = TRUE), type = "l")

Sum of cluster with one cluster

cl <- kmeans(apple.window, centers=1)
plot(colSums(cl$centers), type = "l")

Three clusters (look at the oscillations in the results)

cl <- kmeans(apple.window, centers=3, nstart = 5)

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in 1:nrow(cl$centers)) {
  plot(cl$centers[i,], type="l")
 # for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
 #  lines(cl$centers[i,], lwd=2)
}
par(old.par)

Sums up to a straight line

plot(round(colSums(cl$centers*cl$size)), type = "l")

Cluster into 50 patterns

cl <- kmeans(apple.window, centers=50, iter.max=20, nstart = 1)
table(cl$cluster)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   1  42   1  26  62   1 143  76   1  42  53  43  38  46   1   1   1 154 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##  61  46   1   5 115  66  22  50  33  34   1 178  24 214  95  72   5  91 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50 
##  92  78   1  79   3  55  46  10   1  99 183   1  35  33

Inspect the last 100 days and the patterns

plot(tail(cl$cluster, 100), type="s", ylab="cluster id", main ="Trivial Matches?")

Get the upper limit of the number of trivial matches (horizontal lines)

plot(diff(cl$cluster)==0, type = "s", ylab = "Trivial matches")

sum(diff(cl$cluster)==0)
## [1] 282
sum(diff(cl$cluster)==0)/length(cl$cluster)
## [1] 0.1101132

Not too many trivial matches since the data is not very smooth!

Shows cluster centers

strongest.patters <- head(order(table(cl$cluster), decreasing=TRUE), 10)
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) plot(cl$centers[i,], type="l")