library(seriation)

Note: this requires at least version 1.1-0 of seriation (http://r-forge.r-project.org/R/?group_id=141)

data(iris)

shuffle the data set

# Permute the row indices to put the observations in random order.
# seq_len() is used instead of 1:nrow() so an empty data set yields no indices.
iris <- iris[sample(seq_len(nrow(iris))), ]

head(iris)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 62           5.9         3.0          4.2         1.5 versicolor
## 88           6.3         2.3          4.4         1.3 versicolor
## 38           4.9         3.6          1.4         0.1     setosa
## 91           5.5         2.6          4.4         1.2 versicolor
## 115          5.8         2.8          5.1         2.4  virginica
## 69           6.2         2.2          4.5         1.5 versicolor

plot(iris[,-5], col=iris[,5])

Compute Euclidean distance on scaled data. Column 5 is the class attribute. Note that in the paper they do not scale the data, but normalize it between 0 and 1.

d <- dist(scale(iris[,-5]))

Matrix shading

pimage(d, key=TRUE)

Reorder using seriation

o <- seriate(d, method = "OLO")
pimage(d, o, key=TRUE)

Threshold setting

pimage(d, o, key=TRUE, zlim = c(0, 2))

K-nearest neighbor setting

# Keep only the k nearest neighbors of every object in a dissimilarity matrix.
#
# Arguments:
#   d  dissimilarities; anything as.matrix() turns into a square matrix
#      (e.g., a dist object).
#   k  number of nearest neighbors to keep per row (a single non-negative
#      number).
#
# Returns the full matrix in which, for every row, all entries are set to NA
# except the k + 1 smallest ones (for a dist object: the object itself at
# distance 0 plus its k nearest neighbors). Rows are processed independently,
# so the result is generally not symmetric.
knn <- function(d, k) {
  stopifnot(is.numeric(k), length(k) == 1L, k >= 0)
  m <- as.matrix(d)
  # Never request more columns than exist: k >= n - 1 simply keeps everything
  # instead of failing with "subscript out of bounds".
  n_keep <- min(floor(k) + 1, ncol(m))
  for (i in seq_len(nrow(m))) {
    # Rank within the row so that a single kept column (k = 0) is handled the
    # same way as any other k.
    nearest <- order(m[i, ])[seq_len(n_keep)]
    m[i, -nearest] <- NA
  }
  m
}

pimage(knn(d, 50), c(o, o), key=TRUE, prop = TRUE)

Order using a concept tree (a.k.a. decision tree)

Use a ctree as the concept tree

library(party)

## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich

tree <- ctree(Species~., data=iris)
tree

## 
##   Conditional inference tree with 4 terminal nodes
## 
## Response:  Species 
## Inputs:  Sepal.Length, Sepal.Width, Petal.Length, Petal.Width 
## Number of observations:  150 
## 
## 1) Petal.Length <= 1.9; criterion = 1, statistic = 140.264
##   2)*  weights = 50 
## 1) Petal.Length > 1.9
##   3) Petal.Width <= 1.7; criterion = 1, statistic = 67.894
##     4) Petal.Length <= 4.8; criterion = 0.999, statistic = 13.865
##       5)*  weights = 46 
##     4) Petal.Length > 4.8
##       6)*  weights = 8 
##   3) Petal.Width > 1.7
##     7)*  weights = 46

plot(tree)

# Cross-tabulate the true species against the terminal node each observation
# was assigned to (slot `where` of the fitted tree).
table(true = iris$Species, terminal_node = tree@where)

##             terminal_node
## true          2  5  6  7
##   setosa     50  0  0  0
##   versicolor  0 45  4  1
##   virginica   0  1  4 45

The tree returned by ctree has a slot `where` that gives, for each observation, the number of the terminal node it belongs to (see ?BinaryTree).

pimage(d, order(tree@where), key = TRUE)

pimage(d, order(tree@where), key = TRUE, zlim = c(0, .7))

Use hierarchical clustering (cluster tree, a.k.a. dendrogram)

hc <- hclust(d)
hc

## 
## Call:
## hclust(d = d)
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 150

plot(hc)

cl <- cutree(hc, k=3)
table(true=iris$Species, predict=cl)

##             predict
## true          1  2  3
##   setosa      0  1 49
##   versicolor 29 21  0
##   virginica  48  2  0

use just the cluster information

pimage(d, order(cl), key = TRUE)

use leaf node order in dendrogram

pimage(d, hc$order, key = TRUE)

Nearest neighbor on species (order by species)

o <- order(iris$Species)
pimage(d, o, key = TRUE)

pimage(d, o, key = TRUE, zlim = c(0, .7))

Use hierarchical clustering with optimal leaf ordering

# The permutation is stored as `olo_order` rather than `order` so that it does
# not mask base::order(), which this script keeps calling as a function.
olo_order <- seriate(d, method = "OLO")
olo_order

## object of class 'ser_permutation', 'list'
## contains permutation vectors for 1-mode data
## 
##   vector length seriation method
## 1           150              OLO

pimage(d, olo_order, key = TRUE)

use different colors

pimage(d, olo_order, key = TRUE,
  col = bluered(100, bias = .25))

Heat maps

see http://en.wikipedia.org/wiki/Heat_map

On distances

hmap(d)

hmap(d, col = bluered(100, bias=.25))

On original data

hmap(iris[,-5], cexCol = 1, margin = c(7,3))

Scale first to represent observations with below-average values (blue) or above-average values (red)

hmap(scale(iris[,-5]), cexCol = 1, margin = c(7,3), col = bluered(100))

Zoo data

data(Zoo)

d <- dist(Zoo[,-17])
unique(sort(Zoo$class))

## [1] amphibian    bird         fish         insect       invertebrate
## [6] mammal       reptile     
## Levels: amphibian bird fish insect invertebrate mammal reptile

pimage(d, order(Zoo$class), key=TRUE)

pimage(d, order(Zoo$class), key = TRUE, zlim = c(0, 2))

pimage(knn(d, 3), ser_permutation(order(Zoo$class), order(Zoo$class)),
  key = TRUE, prop = TRUE)

Use dissimilarity plot instead (see http://michael.hahsler.net/research/dissplot_JCGS2011/dissplot_preprint.pdf)

res <- dissplot(d, as.integer(Zoo$class), zlim = c(0,3))

levels(Zoo$class)

## [1] "amphibian"    "bird"         "fish"         "insect"      
## [5] "invertebrate" "mammal"       "reptile"

EMIS/CSE 8331: (Dis)similarity matrix shading using concept trees

Michael Hahsler

Tue Mar 3 09:30:18 2015