Warning: Don’t try this at home! We know that subsequence clustering with sliding windows and Euclidean distances does not work!
Kreogh and Lin, Clustering of time-series subsequences is meaningless: implications for previous and future research, KAIS, 2005.
Data from Yahoo finance retrieved with package quantmod
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
getSymbols("AAPL")
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## [1] "AAPL"
head(AAPL)
## AAPL.Open AAPL.High AAPL.Low AAPL.Close AAPL.Volume AAPL.Adjusted
## 2007-01-03 3.081786 3.092143 2.925000 2.992857 1238319600 2.569716
## 2007-01-04 3.001786 3.069643 2.993571 3.059286 847260400 2.626753
## 2007-01-05 3.063214 3.078571 3.014286 3.037500 834741600 2.608047
## 2007-01-08 3.070000 3.090357 3.045714 3.052500 797106800 2.620926
## 2007-01-09 3.087500 3.320714 3.041071 3.306071 3349298400 2.838647
## 2007-01-10 3.383929 3.492857 3.337500 3.464286 2952880000 2.974493
nrow(AAPL)
## [1] 3714
Pick the closing price and aggregate by week.
apple <- AAPL[, 4]
#apple <- apply.weekly(apple, max)
plot(apple, ylab = "Close [USD]")
Compute percentage changes.
apple.diff <- (apple - lag(apple, k = 1))/lag(apple, k = 1)[-1]
plot(apple.diff)
(sequences of 12 weeks, we embed the sequence in NAs)
window.size <- 12
apple.diff <- c(rep(0, window.size-1), apple.diff, rep(0, window.size))
apple.window <- t(sapply(1:(length(apple.diff)-window.size), FUN=function(i)
window(as.numeric(apple.diff), i, i+window.size-1)))
dim(apple.window)
## [1] 3724 12
apple.window is a matrix where each row contains the values a window
#for (i in 1:10) {
# plot(apple.window[i,], type="l", main=paste("Window:", i))
# Sys.sleep(1)
#}
plot(apple.window[10,], type="l", main=paste("3 consecutive Windows"))
for (i in 11:13) {
lines(apple.window[i,], lty=i)
}
Sum over all windows is a horizontal line!
plot(colSums(apple.window, na.rm = TRUE), type = "l")
Sum of cluster with one cluster
cl <- kmeans(apple.window, centers=1)
plot(colSums(cl$centers), type = "l")
cl <- kmeans(apple.window, centers=3, nstart = 5)
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in 1:nrow(cl$centers)) {
plot(cl$centers[i,], type="l")
# for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
# lines(cl$centers[i,], lwd=2)
}
par(old.par)
Sums up to a straight line
plot(round(colSums(cl$centers*cl$size)), type = "l")
cl <- kmeans(apple.window, centers=50, iter.max=20, nstart = 1)
table(cl$cluster)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 37 53 80 169 101 44 131 67 8 3 175 36 73 210 31 94 245 119 34 16
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 57 20 28 59 6 15 89 83 38 8 7 45 114 5 28 80 112 6 29 105
## 41 42 43 44 45 46 47 48 49 50
## 58 122 214 190 50 83 177 73 10 87
Inspect the last 100 days and the patterns
plot(tail(cl$cluster, 100), type="s", ylab="cluster id", main ="Trivial Matches?")
Get the upper limit of the number of trivial matches (horizontal lines)
plot(diff(cl$cluster)==0, type = "s", ylab = "Trivial matches")
sum(diff(cl$cluster)==0)
## [1] 189
sum(diff(cl$cluster)==0)/length(cl$cluster)
## [1] 0.05075188
Not too many trivial matches since the data is not very smooth!
Shows cluster centers
strongest.patters <- head(order(table(cl$cluster), decreasing=TRUE), 10)
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) plot(cl$centers[i,], type="l")
par(old.par)
Clusters still add up to a horizontal line
plot(round(colSums(cl$centers*cl$size)), type = "l")
Show the subsequences assigned to each pattern
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) {
plot(cl$centers[i,], type="l", ylim=c(-.03,.03))
for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
lines(cl$centers[i,], lwd=2)
}
par(old.par)
WARNING: The above type of clustering is known to be problematic because of trivial matches introduced by the sliding window!
Randomly sample subsequences. Kreogh and Lin call this whole sequence clustering since each subsequence can now be seen as a whole (different) sequence.
Prepare for clustering (sequences of 12 days)
window.size <- 12
i <- sample(length(apple.diff)-window.size, 1000, replace=TRUE)
plot(i, rep(1, length(i)), type="h")
apple.window <- t(sapply(i, FUN=function(i)
as.numeric(apple.diff[i:(i+window.size)])))
Sum over all windows is now not a straight line!
plot(round(colSums(apple.window, na.rm = TRUE)), type = "l")
Sum of cluster with one cluster
cl <- kmeans(apple.window, centers=1)
plot(colSums(cl$centers), type = "l")
Three clusters
cl <- kmeans(apple.window, centers=3, nstart = 5)
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in 1:nrow(cl$centers)) {
plot(cl$centers[i,], type="l")
# for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
# lines(cl$centers[i,], lwd=2)
}
par(old.par)
Sum over clusters is the same as sum over windows
plot(round(colSums(cl$centers*cl$size)), type = "l")
FIXME: we should probably correct for holidays so the patterns all start on Monday!
Cluster into 50 patterns
cl <- kmeans(na.omit(apple.window), centers=50, iter.max=20)
table(cl$cluster)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 8 11 5 18 49 2 1 9 24 69 20 15 11 1 20 15 38 54 5 8 53 27 28 1 5 15
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 42 20 27 39 2 5 32 9 4 2 12 18 20 1 38 3 12 47 23 21 12 3 93 3
strongest.patters <- head(order(table(cl$cluster), decreasing=TRUE),10)
Shows cluster centers
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) plot(cl$centers[i,], type="l")
par(old.par)
See what data was clustered in the strongest clusters
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) {
plot(cl$centers[i,], type="l", ylim=c(-.03,.03))
for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
lines(cl$centers[i,], lwd=2)
}
par(old.par)
Note: We are still clustering data that looks like a random walk, so do this at your own risk!