Warning: Don’t try this at home! We know that subsequence clustering with sliding windows and Euclidean distances does not work!
Kreogh and Lin, Clustering of time-series subsequences is meaningless: implications for previous and future research, KAIS, 2005.
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
getSymbols("AAPL")
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
##
## WARNING: There have been significant changes to Yahoo Finance data.
## Please see the Warning section of '?getSymbols.yahoo' for details.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.yahoo.warning"=FALSE).
## [1] "AAPL"
head(AAPL)
## AAPL.Open AAPL.High AAPL.Low AAPL.Close AAPL.Volume
## 2007-01-03 12.32714 12.36857 11.70000 11.97143 309579900
## 2007-01-04 12.00714 12.27857 11.97429 12.23714 211815100
## 2007-01-05 12.25286 12.31428 12.05714 12.15000 208685400
## 2007-01-08 12.28000 12.36143 12.18286 12.21000 199276700
## 2007-01-09 12.35000 13.28286 12.16429 13.22429 837324600
## 2007-01-10 13.53571 13.97143 13.35000 13.85714 738220000
## AAPL.Adjusted
## 2007-01-03 7.982585
## 2007-01-04 8.159763
## 2007-01-05 8.101658
## 2007-01-08 8.141665
## 2007-01-09 8.817995
## 2007-01-10 9.239983
nrow(AAPL)
## [1] 3059
Pick close price and aggregate by week
apple <- AAPL[, 4]
#apple <- apply.weekly(apple, max)
plot(apple, ylab = "Close [USD]")
Compute percentage changes
apple.diff <- (apple - lag(apple, k = 1))/lag(apple, k = 1)[-1]
plot(apple.diff)
(sequences of 12 weeks, we embed the sequence in NAs)
window.size <- 12
apple.diff <- c(rep(0, window.size-1), apple.diff, rep(0, window.size))
apple.window <- t(sapply(1:(length(apple.diff)-window.size), FUN=function(i)
window(as.numeric(apple.diff), i, i+window.size-1)))
dim(apple.window)
## [1] 3069 12
apple.window is a matrix where each row contains the values a window
#for (i in 1:10) {
# plot(apple.window[i,], type="l", main=paste("Window:", i))
# Sys.sleep(1)
#}
plot(apple.window[10,], type="l", main=paste("3 consecutive Windows"))
for (i in 11:13) {
lines(apple.window[i,], lty=i)
}
Sum over all windows is a horizontal line!
plot(colSums(apple.window, na.rm = TRUE), type = "l")
Sum of cluster with one cluster
cl <- kmeans(apple.window, centers=1)
plot(colSums(cl$centers), type = "l")
cl <- kmeans(apple.window, centers=3, nstart = 5)
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in 1:nrow(cl$centers)) {
plot(cl$centers[i,], type="l")
# for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
# lines(cl$centers[i,], lwd=2)
}
par(old.par)
Sums up to a straight line
plot(round(colSums(cl$centers*cl$size)), type = "l")
cl <- kmeans(apple.window, centers=50, iter.max=20, nstart = 1)
table(cl$cluster)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 12 65 54 44 117 7 213 94 56 42 5 6 42 139 69 58 124 69
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 100 5 4 33 7 46 45 83 67 51 109 91 4 5 30 25 20 131
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 52 62 52 95 281 40 61 71 3 120 56 57 6 41
Inspect the last 100 days and the patterns
plot(tail(cl$cluster, 100), type="s", ylab="cluster id", main ="Trivial Matches?")
Get the upper limit of the number of trivial matches (horizontal lines)
plot(diff(cl$cluster)==0, type = "s", ylab = "Trivial matches")
sum(diff(cl$cluster)==0)
## [1] 151
sum(diff(cl$cluster)==0)/length(cl$cluster)
## [1] 0.04920169
Not too many trivial matches since the data is not very smooth!
Shows cluster centers
strongest.patters <- head(order(table(cl$cluster), decreasing=TRUE), 10)
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) plot(cl$centers[i,], type="l")
par(old.par)
Clusters still add up to a horizontal line
plot(round(colSums(cl$centers*cl$size)), type = "l")
Show the subsequences assigned to each pattern
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) {
plot(cl$centers[i,], type="l", ylim=c(-.03,.03))
for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
lines(cl$centers[i,], lwd=2)
}
par(old.par)
WARNING: The above type of clustering is known to be problematic because of trivial matches introduced by the sliding window!
Randomly sample subsequences. Kreogh and Lin call this whole sequence clustering since each subsequence can now be seen as a whole (different) sequence.
Prepare for clustering (sequences of 12 days)
window.size <- 12
i <- sample(length(apple.diff)-window.size, 1000, replace=TRUE)
plot(i, rep(1, length(i)), type="h")
apple.window <- t(sapply(i, FUN=function(i)
as.numeric(apple.diff[i:(i+window.size)])))
Sum over all windows is now not a straight line!
plot(round(colSums(apple.window, na.rm = TRUE)), type = "l")
Sum of cluster with one cluster
cl <- kmeans(apple.window, centers=1)
plot(colSums(cl$centers), type = "l")
Three clusters
cl <- kmeans(apple.window, centers=3, nstart = 5)
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in 1:nrow(cl$centers)) {
plot(cl$centers[i,], type="l")
# for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
# lines(cl$centers[i,], lwd=2)
}
par(old.par)
Sum over clusters is the same as sum over windows
plot(round(colSums(cl$centers*cl$size)), type = "l")
FIXME: we should probably correct for holidays so the patterns all start on Monday!
Cluster into 50 patterns
cl <- kmeans(na.omit(apple.window), centers=50, iter.max=20)
table(cl$cluster)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 3 2 17 31 45 10 8 10 1 33 2 41 17 24 17 6 30 1
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 44 2 7 10 14 7 2 25 10 1 6 36 9 9 81 12 40 23
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 13 11 5 20 38 20 2 25 23 21 17 24 23 122
strongest.patters <- head(order(table(cl$cluster), decreasing=TRUE),10)
Shows cluster centers
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) plot(cl$centers[i,], type="l")
par(old.par)
See what data was clustered in the strongest clusters
old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2))
for(i in strongest.patters) {
plot(cl$centers[i,], type="l", ylim=c(-.03,.03))
for(j in which(cl$cluster==i)) lines(apple.window[j,], col="gray")
lines(cl$centers[i,], lwd=2)
}
par(old.par)
Note: We are still clustering data that looks like a random walk, so do this at your own risk!