### CSE 8331 - Spring 2012 - Michael Hahsler ### Example for Subsequence Clustering ### Don't do this! We know that subsequence clustering ### with sliding windows does not work (see: Kreogh and Lin: ### Clustering of time-series ### subsequences is meaningless: implications for previous and future ### research, KAIS, 2005) ### However, if it made sense then this is how you could do it. ### get some data from Yahoo finance URL <- "http://ichart.finance.yahoo.com/table.csv?s=^DJI" dat <- read.csv(URL) dat$Date <- as.Date(dat$Date, "%Y-%m-%d") ### fix the date column head(dat) nrow(dat) dat <- dat[nrow(dat):1,] ### reverse order dat <- dat[, c("Date", "Close")] plot(dat, type="l") ### compute percentage changes dji.diff <- diff(dat[,"Close"]) / head(dat[,"Close"], length(dat[,"Close"])-1) plot(dat[-1, "Date"], dji.diff, type="l") ### prepare for clustering (sequences of 20 days) window.size <- 10 dji.window <- t(sapply(1:length(dji.diff), function(i) dji.diff[i:(i+window.size-1)])) ### dji.window is a matrix where each row contains the 20 values a window ### look at the sliding window for (i in 1:10) { plot(dji.window[i,], type="l", main=paste("Window:", i)) Sys.sleep(1) } ### cluster into 50 patterns cl <- kmeans(na.omit(dji.window), centers=50, iter.max=20) table(cl$cluster) ### inspect the last 100 days and the patterns plot(tail(cl$cluster, 100), type="s", ylab="cluster id", main ="Trivial Matches?") ### get the upper limit of the number of trivial matches sum(diff(cl$cluster)==0) sum(diff(cl$cluster)==0)/length(cl$cluster) ### shows cluster centers strongest.patters <- head(order(table(cl$cluster), decreasing=TRUE),10) old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2)) for(i in strongest.patters) plot(cl$centers[i,], type="l") #for(i in 1:10) plot(cl$centers[i,], type="l") par(old.par) ### WARNING: The above type of clustering is known to be problematic ### because of trivial matches introduced by the sliding window! ### Since this does not make sense, how can we fix it? ### Do not use a moving window! Lets look for 4 week (20 patterns) ### prepare for clustering (sequences of 20 days) window.size <- 10 dji.window <- matrix(dji.diff, ncol=window.size, byrow=TRUE) ### FIXME: we should probably correct for holidays so the patterns patterns ### all start on Monday! ### look at the non-sliding window for (i in 1:10) { plot(dji.window[i,], type="l", ylim=c(-0.05, 0.05)) Sys.sleep(1) } ### cluster into 50 patterns cl <- kmeans(na.omit(dji.window), centers=50, iter.max=20) table(cl$cluster) ### inspect the last 100 days and the patterns plot(tail(cl$cluster, 100), type="s") ### shows cluster centers strongest.patters <- head(order(table(cl$cluster), decreasing=TRUE),10) old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2)) for(i in strongest.patters) plot(cl$centers[i,], type="l") #for(i in 1:10) plot(cl$centers[i,], type="l") par(old.par) ### second alternative is random sampling ### prepare for clustering (sequences of 20 days) window.size <- 10 i <- sample(1:length(dji.diff)-window.size, 500) dji.window <- t(sapply(i,FUN=function(i) dji.diff[i:(i+window.size)])) ### FIXME: we should probably correct for holidays so the patterns patterns ### all start on Monday! ### cluster into 50 patterns cl <- kmeans(na.omit(dji.window), centers=50, iter.max=20) table(cl$cluster) strongest.patters <- head(order(table(cl$cluster), decreasing=TRUE),10) ### shows cluster centers old.par <- par(mar=c(0,0,0,0), mfrow=c(5,2)) for(i in strongest.patters) plot(cl$centers[i,], type="l") #for(i in 1:10) plot(cl$centers[i,], type="l") par(old.par) ### see what data was clustered in the largest cluster sel <- which.max(table(cl$cluster)) sel plot(NA, ylim=c(-.05,.05), xlim=c(1,window.size)) for(i in which(cl$cluster==sel)) lines(dji.window[i,], col="gray") lines(cl$centers[sel,], lwd=2)