EMIS/CSE 8331: Example for Subsequence Clustering

Warning: Don’t do this! We know that subsequence clustering with sliding windows does not work (see: Kreogh and Lin: Clustering of time-series subsequences is meaningless: implications for previous and future research, KAIS, 2005). However, if it made sense then this is how you could do it.

Get some data from Yahoo finance (package quantmod)

library(quantmod)

## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.

getSymbols("AAPL")

##     As of 0.4-0, 'getSymbols' uses env=parent.frame() and
##  auto.assign=TRUE by default.
## 
##  This  behavior  will be  phased out in 0.5-0  when the call  will
##  default to use auto.assign=FALSE. getOption("getSymbols.env") and 
##  getOptions("getSymbols.auto.assign") are now checked for alternate defaults
## 
##  This message is shown once per session and may be disabled by setting 
##  options("getSymbols.warning4.0"=FALSE). See ?getSymbol for more details

## [1] "AAPL"

head(AAPL)

##            AAPL.Open AAPL.High AAPL.Low AAPL.Close AAPL.Volume
## 2007-01-03     86.29     86.58    81.90      83.80   309579900
## 2007-01-04     84.05     85.95    83.82      85.66   211815100
## 2007-01-05     85.77     86.20    84.40      85.05   208685400
## 2007-01-08     85.96     86.53    85.28      85.47   199276700
## 2007-01-09     86.45     92.98    85.15      92.57   837324600
## 2007-01-10     94.75     97.80    93.45      97.00   738220000
##            AAPL.Adjusted
## 2007-01-03         11.29
## 2007-01-04         11.54
## 2007-01-05         11.46
## 2007-01-08         11.52
## 2007-01-09         12.47
## 2007-01-10         13.07

nrow(AAPL)

## [1] 2048

Pick close price and aggregate by week

apple <- AAPL[, 4]
#apple <- apply.weekly(apple, max)
plot(apple, ylab = "Close [USD]")

Compute percentage changes

apple.diff <- (apple - lag(apple, k = 1))/lag(apple, k = 1)[-1]
plot(apple.diff)

Prepare for clustering (sequences of 12 weeks, we embedd the sequence in NAs)

window.size <- 12
apple.diff <- c(rep(0, window.size-1), apple.diff, rep(0, window.size))
apple.window <- t(sapply(1:(length(apple.diff)-window.size), FUN=function(i)
  window(as.numeric(apple.diff), i, i+window.size-1)))
dim(apple.window)

## [1] 2058   12

apple.window is a matrix where each row contains the values a window

Look at the sliding window

#for (i in 1:10) {
#    plot(apple.window[i,], type="l", main=paste("Window:", i))
#    Sys.sleep(1)
#}

plot(apple.window[10,], type="l", main=paste("3 consecutive Windows"))
for (i in 11:13) {
  lines(apple.window[i,], lty=i)
}

Sum over all windows is a line!

plot(colSums(apple.window, na.rm = TRUE), type = "l")

Sum of cluster with one cluster

cl <- kmeans(apple.window, centers=1)
plot(colSums(cl$centers), type = "l")

Three clusters (look at the oscillations in the results)

cl <- kmeans(apple.window, centers=3, nstart = 5)

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in 1:nrow(cl$centers)) {
  plot(cl$centers[i,], type="l")
 # for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
 #  lines(cl$centers[i,], lwd=2)
}
par(old.par)

Cluster into 50 patterns

cl <- kmeans(apple.window, centers=50, iter.max=20, nstart = 1)
table(cl$cluster)

## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   1   2  50  21  83  68  99 122   1 114 167  31   1   5   1 119  14   1 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##  47 152   1  77  42  62  90   1   1  56   1  21  52  44   1  20  65   1 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50 
##  27  32 109  32  29  12  13  61  45  27  11   1   9  16

Inspect the last 100 days and the patterns

plot(tail(cl$cluster, 100), type="s", ylab="cluster id", main ="Trivial Matches?")

Get the upper limit of the number of trivial matches

plot(diff(cl$cluster)==0, type = "s", ylab = "Trivial matches")

sum(diff(cl$cluster)==0)

## [1] 132

sum(diff(cl$cluster)==0)/length(cl$cluster)

## [1] 0.06413994

Not to many trivial matches since the data is not very smooth!

Shows cluster centers

strongest.patters <- head(order(table(cl$cluster), decreasing=TRUE), 10)
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) plot(cl$centers[i,], type="l")

par(old.par)

Show the subsequences assigned to each pattern

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) {
  plot(cl$centers[i,], type="l", ylim=c(-.03,.03))
  for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
  lines(cl$centers[i,], lwd=2)
}

par(old.par)

WARNING: The above type of clustering is known to be problematic because of trivial matches introduced by the sliding window!

How can we avoid the problem? Randomly sample subsequences. Kreogh and Lin call this whole sequence clustering since each subsequence can now be seen as a whole (different) sequence.

Prepare for clustering (sequences of 12 days)

window.size <- 12
i <- sample(length(apple.diff)-window.size, 1000, replace=TRUE)
plot(i, rep(1, length(i)), type="h")

apple.window <- t(sapply(i, FUN=function(i)
  as.numeric(apple.diff[i:(i+window.size)])))

Sum over all windows is a line!

plot(colSums(apple.window, na.rm = TRUE), type = "l")

Sum of cluster with one cluster

cl <- kmeans(apple.window, centers=1)
plot(colSums(cl$centers), type = "l")

Three clusters

cl <- kmeans(apple.window, centers=3, nstart = 5)

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in 1:nrow(cl$centers)) {
  plot(cl$centers[i,], type="l")
  # for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
  #  lines(cl$centers[i,], lwd=2)
}
par(old.par)

FIXME: we should probably correct for holidays so the patterns all start on Monday!

Cluster into 50 patterns

cl <- kmeans(na.omit(apple.window), centers=50, iter.max=20)
table(cl$cluster)

## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##  37  25  10  57  17  43   7   7  15   3  17  17   3   2  15  40   5  70 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
## 101  21  20  34  33  35   1   4   1  14  21   8  46  14   1   4   1  25 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50 
##   1   1   1  29  20   1  33  68  18   8  29   8   7   2

strongest.patters <- head(order(table(cl$cluster), decreasing=TRUE),10)

Shows cluster centers

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) plot(cl$centers[i,], type="l")

par(old.par)

See what data was clustered in the strongest clusters

old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) {
  plot(cl$centers[i,], type="l", ylim=c(-.03,.03))
  for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
  lines(cl$centers[i,], lwd=2)
}

par(old.par)

Note: We are still clustering data that looks like a random walk, so do this at your own risk!

EMIS/CSE 8331: Example for Subsequence Clustering

Michael Hahsler

Sat Feb 21 08:35:54 2015