Source

Introduction

This example demonstrates how to use D-Stream for data stream clustering. It is an expanded version of the example on the manual page ?DSC_DStream.

library(stream)
## Loading required package: registry
## Loading required package: proxy
## Registered S3 methods overwritten by 'proxy':
##   method               from    
##   print.registry_field registry
##   print.registry_entry registry
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix

Create a Data Stream

stream <- DSD_BarsAndGaussians(noise = .05)

The stream is a sequence of data points.

get_points(stream)
##          x          y
## 1 3.813348 -0.4695497
get_points(stream, 5)
##           x          y
## 1  1.079473 -0.2459382
## 2 -2.783133 -0.5628950
## 3 -3.341400  1.3418192
## 4 -3.394696  1.1683414
## 5 -1.803918 -3.8143167

Plot 1000 points from the stream.

plot(stream, n = 1000)

Sample from a Stream

# Create a biased sampling operator (k = 100; presumably the number of points
# kept -- confirm in ?DSO_Sample) and feed it 500 points from the stream.
# NOTE(review): the variable name `sample` is shared with base::sample();
# function calls still resolve, but the reuse can confuse readers.
sample <- DSO_Sample(k = 100, biased = TRUE)
update(sample, stream, n = 500)

# Scatter plot of the points currently held by the sample.
plot(get_points(sample))
# Overlay the 4 k-means centers computed on the sample as red "+" marks.
# NOTE(review): kmeans() uses random starting centers and no seed is set here,
# so the centers may differ between runs.
points(kmeans(get_points(sample), centers = 4)$centers, col = "red", pch = "+")

The red crosses mark the centers found by k-means (with 4 centers) on the sample. Note that k-means has an issue with noise.

DSO_Window is also available.

Cluster Part of the Stream

We create an empty DStream clusterer and update it with part of the stream

dstream1 <- DSC_DStream(gridsize = 1, Cm = 1.5)
dstream1
## DStream
## Class: DSC_DStream, DSC_Micro, DSC_R, DSC 
## Number of micro-clusters: 0 
## Number of macro-clusters: 0
update(dstream1, stream, n = 1000)
dstream1
## DStream
## Class: DSC_DStream, DSC_Micro, DSC_R, DSC 
## Number of micro-clusters: 42 
## Number of macro-clusters: 4

Micro-clusters (these are “used” grid cells)

nclusters(dstream1)
## [1] 42
head(get_centers(dstream1))
##     X1   X2
## 1 -4.5 -2.5
## 2 -4.5  2.5
## 3 -3.5 -3.5
## 4 -3.5 -2.5
## 5 -3.5 -1.5
## 6 -3.5 -0.5

plot (DStream provides additional grid visualization)

plot(dstream1, stream)

plot(dstream1, stream, grid = TRUE)

look only at dense grid cells

nclusters(dstream1, grid_type = "dense")
## [1] 28
plot(dstream1, stream, grid = TRUE, grid_type = "dense")

look at transitional and sparse cells

plot(dstream1, stream, grid = TRUE, grid_type = "transitional")

plot(dstream1, stream, grid = TRUE, grid_type = "sparse")

Macro-clusters

standard D-Stream uses reachability

nclusters(dstream1, type = "macro")
## [1] 4
get_centers(dstream1, type = "macro")
##          X1         X2
## 1 -2.443889 -2.5298394
## 2 -3.315029  1.5129748
## 3  3.300452 -0.5058322
## 4  1.738802  1.9660876
plot(dstream1, stream, type = "both", grid = TRUE)

evaluate(dstream1, stream, measure = "crand", type = "macro")
## Evaluation results for macro-clusters.
## Points were assigned to micro-clusters.
## 
##     cRand 
## 0.6640316

use attraction for reclustering

# Second D-Stream clusterer: same gridsize and Cm as dstream1, but with
# attraction = TRUE so that reclustering uses attraction.
dstream2 <- DSC_DStream(gridsize = 1, attraction = TRUE, Cm = 1.5)
# Feed it 1000 points from the stream; `n` is named explicitly to match the
# other update() calls in this example.
update(dstream2, stream, n = 1000)
dstream2
## DStream
## Class: DSC_DStream, DSC_Micro, DSC_R, DSC 
## Number of micro-clusters: 42 
## Number of macro-clusters: 5
plot(dstream2, stream, type = "both", grid = TRUE)

Evaluate the clustering

evaluate(dstream2, stream, measure = "crand", type = "macro")
## Evaluation results for macro-clusters.
## Points were assigned to micro-clusters.
## 
##     cRand 
## 0.8083212