Introduction
This example demonstrates how to use D-Stream for data stream clustering. It is an expanded version of the example on the manual page ?DSC_DStream.
library(stream)
## Loading required package: registry
## Loading required package: proxy
## Registered S3 methods overwritten by 'proxy':
## method from
## print.registry_field registry
## print.registry_entry registry
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
# Create the data stream source used throughout this example
# (bars-and-Gaussians generator with the noise argument set to .05).
stream <- DSD_BarsAndGaussians(noise = .05)
The stream is a sequence of data points.
# Without a count, get_points() returns a single point (one row of x/y).
get_points(stream)
## x y
## 1 3.813348 -0.4695497
# Request 5 points at once; the rows differ from the previous call, so each
# call draws new points from the stream.
get_points(stream, 5)
## x y
## 1 1.079473 -0.2459382
## 2 -2.783133 -0.5628950
## 3 -3.341400 1.3418192
## 4 -3.394696 1.1683414
## 5 -1.803918 -3.8143167
Plot 1000 points from the stream.
# Plot points drawn from the stream (n = 1000).
plot(stream, n = 1000)
# Create a sampling operator with k = 100 and biased = TRUE
# (presumably it retains up to k points of what it is fed -- see ?DSO_Sample).
sample <- DSO_Sample(k = 100, biased = TRUE)
# Feed 500 points from the stream into the sample.
update(sample, stream, n = 500)
# Plot the points currently held by the sample.
plot(get_points(sample))
# Overlay the centers of a 4-cluster k-means fit on the sampled points,
# drawn as red "+" symbols.
points(kmeans(get_points(sample), centers = 4)$centers, col = "red", pch = "+")
The red plus signs mark the k-means centers computed on the sample (note that k-means has an issue with noise). DSO_Window is also available.
We create an empty D-Stream clusterer and update it with part of the stream.
# Create a D-Stream clusterer with gridsize = 1 and Cm = 1.5.
dstream1 <- DSC_DStream(gridsize = 1, Cm = 1.5)
# Printing the new clusterer shows that it holds no clusters yet.
dstream1
## DStream
## Class: DSC_DStream, DSC_Micro, DSC_R, DSC
## Number of micro-clusters: 0
## Number of macro-clusters: 0
# Feed 1000 points from the stream into the clusterer.
update(dstream1, stream, n = 1000)
# Print it again to see the cluster counts after the update.
dstream1
## DStream
## Class: DSC_DStream, DSC_Micro, DSC_R, DSC
## Number of micro-clusters: 42
## Number of macro-clusters: 4
# Number of clusters; with no type given this matches the micro-cluster
# count reported when printing the clusterer.
nclusters(dstream1)
## [1] 42
# Show the first few cluster centers.
head(get_centers(dstream1))
## X1 X2
## 1 -4.5 -2.5
## 2 -4.5 2.5
## 3 -3.5 -3.5
## 4 -3.5 -2.5
## 5 -3.5 -1.5
## 6 -3.5 -0.5
Plot the clustering (D-Stream provides an additional grid visualization).
# Plot the clustering together with data from the stream.
plot(dstream1, stream)
# Same plot with the grid visualization turned on.
plot(dstream1, stream, grid = TRUE)
Look only at dense grid cells.
# Count only the grid cells of type "dense".
nclusters(dstream1, grid_type = "dense")
## [1] 28
# Grid plot restricted to the dense cells.
plot(dstream1, stream, grid = TRUE, grid_type = "dense")
Look at transitional and sparse cells.
# Grid plot restricted to the cells of type "transitional".
plot(dstream1, stream, grid = TRUE, grid_type = "transitional")
# Grid plot restricted to the cells of type "sparse".
plot(dstream1, stream, grid = TRUE, grid_type = "sparse")
Standard D-Stream uses reachability for reclustering into macro-clusters.
# Number of macro-clusters.
nclusters(dstream1, type = "macro")
## [1] 4
# Centers of the macro-clusters.
get_centers(dstream1, type = "macro")
## X1 X2
## 1 -2.443889 -2.5298394
## 2 -3.315029 1.5129748
## 3 3.300452 -0.5058322
## 4 1.738802 1.9660876
# Grid plot with type = "both" (presumably micro- and macro-clusters together).
plot(dstream1, stream, type = "both", grid = TRUE)
# Evaluate the macro-clustering against the stream using the "crand" measure
# (reported as cRand in the output).
evaluate(dstream1, stream, measure = "crand", type = "macro")
## Evaluation results for macro-clusters.
## Points were assigned to micro-clusters.
##
## cRand
## 0.6640316
Use attraction for reclustering.
# Create a second D-Stream clusterer with the same gridsize and Cm as
# dstream1, but with attraction = TRUE so that attraction is used for
# reclustering.
dstream2 <- DSC_DStream(
  gridsize = 1,
  attraction = TRUE,
  Cm = 1.5
)
# Feed 1000 points from the stream into the clusterer. The count is passed
# as the named argument `n`, matching the update call used for dstream1.
update(dstream2, stream, n = 1000)
# Print the clusterer to see the resulting cluster counts.
dstream2
## DStream
## Class: DSC_DStream, DSC_Micro, DSC_R, DSC
## Number of micro-clusters: 42
## Number of macro-clusters: 5
# Grid plot of the attraction-based clusterer with type = "both"
# (presumably micro- and macro-clusters together).
plot(dstream2, stream, type = "both", grid = TRUE)
# Evaluate its macro-clustering against the stream using the "crand" measure
# (reported as cRand in the output).
evaluate(dstream2, stream, measure = "crand", type = "macro")
## Evaluation results for macro-clusters.
## Points were assigned to micro-clusters.
##
## cRand
## 0.8083212