Code examples for the paper:

Michael Hahsler and Margaret Dunham, Temporal structure learning for clustering massive data streams in real-time, SIAM International Conference on Data Mining, 2011. #’

library(rEMM)
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
## 
## Attaching package: 'rEMM'
## The following objects are masked from 'package:igraph':
## 
##     as.igraph, clusters
set.seed(1234)

Create a data stream

5 clusters with noise. From ? synthetic_stream

This function creates a synthetic data stream with data points in roughly \([0, 1]^d\) by choosing points form \(k\) clusters following a sequence through these clusters.

stream <-  synthetic_stream(
    k=5,
    d=2,
    n_subseq=100,
    p_transition=.5,
    p_swap=0,
    n_train=0,
    n_test=1000,
    p_outlier = .1,
    rangeVar = c(0.005, 0.01)
    )

str(stream)
## List of 8
##  $ test            : num [1:1000, 1:2] 0.9916 0.0983 0.244 0.1821 0.2008 ...
##  $ train           : logi NA
##  $ sequence_test   : int [1:1000] 1 1 1 1 1 3 2 2 4 2 ...
##  $ sequence_train  : logi NA
##  $ swap_test       : logi NA
##  $ swap_train      : logi NA
##  $ outlier_position: logi [1:1000] TRUE FALSE FALSE TRUE FALSE FALSE ...
##  $ model           :List of 5
##   ..$ k     : num 5
##   ..$ d     : num 2
##   ..$ mu    : num [1:5, 1:2] 0.191 0.598 0.587 0.599 0.789 ...
##   ..$ Sigma :List of 5
##   .. ..$ : num [1:2, 1:2] 0.00772 0.00273 0.00273 0.00641
##   .. ..$ : num [1:2, 1:2] 0.00646 0.00652 0.00652 0.00919
##   .. ..$ : num [1:2, 1:2] 0.00633 -0.00262 -0.00262 0.00593
##   .. ..$ : num [1:2, 1:2] 0.00658 -0.00351 -0.00351 0.00651
##   .. ..$ : num [1:2, 1:2] 0.0052 -0.00384 -0.00384 0.00609
##   ..$ subseq: int [1:100] 1 1 1 1 1 3 2 2 4 2 ...
data <- stream$test
class <- stream$sequence_test+1
class[stream$outlier] <- "gray"

plot(data, col= class)

Model the sequence

Choose a suitable threshold (25% quantile of distances of a sample)

emm_threshold <- quantile(dist(data[sample(1:nrow(data), 100)]), .25)

build EMM (TRACDS+tNN)

emm <- EMM(threshold=emm_threshold)
emm <- build(emm, data, verb=TRUE)
## Adding 1000 observations. 
## Added 100 observations -  32 clusters.
## Added 200 observations -  47 clusters.
## Added 300 observations -  55 clusters.
## Added 400 observations -  59 clusters.
## Added 500 observations -  62 clusters.
## Added 600 observations -  63 clusters.
## Added 700 observations -  67 clusters.
## Added 800 observations -  72 clusters.
## Added 900 observations -  76 clusters.
## Added 1000 observations -  78 clusters.
## Done - 78 clusters.
## Update for 1000 assignments 
## Resizing matrix from 10 to 20 
## Resizing matrix from 20 to 40 
## Processing assignment 100 
## Resizing matrix from 40 to 80 
## Processing assignment 200 
## Processing assignment 300 
## Processing assignment 400 
## Processing assignment 500 
## Processing assignment 600 
## Processing assignment 700 
## Processing assignment 800 
## Processing assignment 900 
## Processing assignment 1000 
## Update done.

show micro-cluster centers

plot(data, col= class)
points(cluster_centers(emm), cex=cluster_counts(emm)/max(cluster_counts(emm))*10)

look at density (count) in each cluster

hist(cluster_counts(emm), breaks=100)

sort(cluster_counts(emm), decreasing = TRUE)
##  7 14  8  6 13 17 20 16 19  9  3 22 10 15 26 31 57 12 39 29 46 47 48 41 54 
## 78 71 51 47 43 41 39 37 36 35 34 34 27 27 26 21 21 20 19 14 14 14 14 13 13 
##  2 37  5 35 36 27 32 44 51 55 43 52 59 21 24 25 49 60 65 70 18 28 38 63 76 
## 12 12 11 11 10  9  9  9  9  9  7  7  7  5  5  5  4  4  4  4  3  3  3  3  3 
##  1  4 11 23 42 45 50 53 58 62 66 73 74 77 78 30 33 34 40 56 61 64 67 68 69 
##  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  1  1  1  1  1  1  1  1  1  1 
## 71 72 75 
##  1  1  1

points in clusters of low density are classified as outliers look at transition probability

head(transition_table(emm, data, match_cluster="exact"))
##   from to probability
## 1    1  2  0.02500000
## 2    2  3  0.02222222
## 3    3  4  0.01785714
## 4    4  5  0.02500000
## 5    5 48  0.02247191
## 6   48  7  0.01086957

points with low transition probabilities are classified as outliers

Visualize the EMM

emm
## EMM with 78 states/clusters.
##  Measure: euclidean 
##  Threshold: 0.07971852 
##  Centroid: TRUE 
##  Lambda: 0
plot(emm)

plotting the EMM as a graph looks not very helpful. Project the data onto two dimensions using multi-dimensional scaling (MDS) on the distances.

plot(emm, method="MDS")
points(data, col = class, xpd = TRUE)

Remove noise

emm2 <- prune(emm, count = 10)
emm2  
## EMM with 29 states/clusters.
##  Measure: euclidean 
##  Threshold: 0.07971852 
##  Centroid: TRUE 
##  Lambda: 0
plot(emm2, method = "MDS")
points(data, col = class, xpd = TRUE)

Cluster states

Note: k=5 was chosen by looking at the dendrogram below.

emmc <- recluster_hclust(emm2, k=5, method ="average")
plot(attr(emmc, "cluster_info")$dendrogram)
abline(h=0.22, col = "red")

plot(emmc, method = "MDS")
points(data, col = class, xpd = TRUE)

Note: Clustering would be better if it also took the transitions into account.