library(seriation)
Note: this requires at least version 1.1-0 of seriation (http://r-forge.r-project.org/R/?group_id=141)
data(iris)
shuffle the data set
# Permute the rows; sample.int(n) draws a permutation of 1..n directly and
# avoids the 1:nrow() form, which yields c(1, 0) for an empty data frame.
iris <- iris[sample.int(nrow(iris)), ]
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 62 5.9 3.0 4.2 1.5 versicolor
## 88 6.3 2.3 4.4 1.3 versicolor
## 38 4.9 3.6 1.4 0.1 setosa
## 91 5.5 2.6 4.4 1.2 versicolor
## 115 5.8 2.8 5.1 2.4 virginica
## 69 6.2 2.2 4.5 1.5 versicolor
plot(iris[,-5], col=iris[,5])
Compute the Euclidean distance on scaled data. Column 5 is the class attribute. Note that in the paper they do not scale the data, but normalize it to the range between 0 and 1.
d <- dist(scale(iris[,-5]))
pimage(d, key=TRUE)
o <- seriate(d, method = "OLO")
pimage(d, o, key=TRUE)
pimage(d, o, key=TRUE, zlim = c(0, 2))
# Mask a distance matrix so that only each object's nearest neighbors remain.
#
# Arguments:
#   d  a dist object (or anything as.matrix() turns into a square matrix).
#   k  number of neighbors to keep per row; a single non-negative number.
#
# Returns the full matrix form of d where, in every row, all entries except
# the k + 1 smallest are set to NA. The extra entry accounts for the object
# itself, which sits at distance 0. If k + 1 exceeds the number of columns,
# the whole row is kept.
knn <- function(d, k) {
  stopifnot(is.numeric(k), length(k) == 1L, !is.na(k), k >= 0)
  m <- as.matrix(d)
  # Clamp so that asking for more neighbors than exist keeps everything
  # instead of failing with "subscript out of bounds".
  n_keep <- min(k + 1, ncol(m))
  all_cols <- seq_len(ncol(m))
  for (i in seq_len(nrow(m))) {
    keep <- order(m[i, ])[seq_len(n_keep)]
    # setdiff() (rather than a negative index) stays correct when nothing
    # is dropped.
    m[i, setdiff(all_cols, keep)] <- NA
  }
  m
}
pimage(knn(d, 50), c(o, o), key=TRUE, prop = TRUE)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
tree <- ctree(Species~., data=iris)
tree
##
## Conditional inference tree with 4 terminal nodes
##
## Response: Species
## Inputs: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width
## Number of observations: 150
##
## 1) Petal.Length <= 1.9; criterion = 1, statistic = 140.264
## 2)* weights = 50
## 1) Petal.Length > 1.9
## 3) Petal.Width <= 1.7; criterion = 1, statistic = 67.894
## 4) Petal.Length <= 4.8; criterion = 0.999, statistic = 13.865
## 5)* weights = 46
## 4) Petal.Length > 4.8
## 6)* weights = 8
## 3) Petal.Width > 1.7
## 7)* weights = 46
plot(tree)
# Cross-tabulate the true species against the terminal node stored in the
# fitted tree's `where` slot.
table(true = iris$Species, terminal_node = tree@where)
## terminal_node
## true 2 5 6 7
## setosa 50 0 0 0
## versicolor 0 45 4 1
## virginica 0 1 4 45
The fitted ctree object has a slot `where` giving, for each observation, the number of the terminal node it belongs to (see ?BinaryTree).
pimage(d, order(tree@where), key = TRUE)
pimage(d, order(tree@where), key = TRUE, zlim = c(0, .7))
hc <- hclust(d)
hc
##
## Call:
## hclust(d = d)
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 150
plot(hc)
cl <- cutree(hc, k=3)
table(true=iris$Species, predict=cl)
## predict
## true 1 2 3
## setosa 0 1 49
## versicolor 29 21 0
## virginica 48 2 0
use just the cluster information
pimage(d, order(cl), key = TRUE)
use leaf node order in dendrogram
pimage(d, hc$order, key = TRUE)
o <- order(iris$Species)
pimage(d, o, key = TRUE)
pimage(d, o, key = TRUE, zlim = c(0, .7))
# Store the permutation as `olo_order` rather than `order`, so that the
# variable does not mask base::order(), which this script calls as a function.
olo_order <- seriate(d, method = "OLO")
olo_order
## object of class 'ser_permutation', 'list'
## contains permutation vectors for 1-mode data
##
## vector length seriation method
## 1 150 OLO
pimage(d, olo_order, key = TRUE)
use different colors
pimage(d, olo_order, key = TRUE,
  col = bluered(100, bias = .25))
see http://en.wikipedia.org/wiki/Heat_map ## On distances
hmap(d)
hmap(d, col = bluered(100, bias=.25))
hmap(iris[,-5], cexCol = 1, margin = c(7,3))
Scale first to represent observations with below-average values (blue) or above-average values (red).
hmap(scale(iris[,-5]), cexCol = 1, margin = c(7,3), col = bluered(100))
data(Zoo)
d <- dist(Zoo[,-17])
unique(sort(Zoo$class))
## [1] amphibian bird fish insect invertebrate
## [6] mammal reptile
## Levels: amphibian bird fish insect invertebrate mammal reptile
pimage(d, order(Zoo$class), key=TRUE)
pimage(d, order(Zoo$class), key = TRUE, zlim = c(0, 2))
pimage(knn(d, 3), ser_permutation(order(Zoo$class), order(Zoo$class)),
key = TRUE, prop = TRUE)
Use dissimilarity plot instead (see http://michael.hahsler.net/research/dissplot_JCGS2011/dissplot_preprint.pdf)
res <- dissplot(d, as.integer(Zoo$class), zlim = c(0,3))
levels(Zoo$class)
## [1] "amphibian" "bird" "fish" "insect"
## [5] "invertebrate" "mammal" "reptile"