The following code implements some of the concepts of the paper:

Jun Wang, Bei Yu, and Les Gasser. 2002. Concept Tree Based Clustering Visualization with Shaded Similarity Matrices. In Proceedings of the 2002 IEEE International Conference on Data Mining (ICDM ’02). IEEE Computer Society, Washington, DC, USA.

library(seriation)
data(iris)

Shuffle the data set

iris <- iris[sample(1:nrow(iris)),]

head(iris)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 87           6.7         3.1          4.7         1.5 versicolor
## 75           6.4         2.9          4.3         1.3 versicolor
## 15           5.8         4.0          1.2         0.2     setosa
## 96           5.7         3.0          4.2         1.2 versicolor
## 148          6.5         3.0          5.2         2.0  virginica
## 124          6.3         2.7          4.9         1.8  virginica
plot(iris[,-5], col=iris[,5])

Compute the Euclidean distance on the scaled data. Column 5 is the class attribute. Note that the paper does not scale the data, but normalizes each attribute to the range 0 to 1.

d <- dist(scale(iris[,-5]))
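
As a minimal sketch of that normalization (assuming simple per-attribute min-max scaling; normalize01 is a hypothetical helper, not a seriation function), the distances could instead be computed as follows and used in place of d:

normalize01 <- function(x) (x - min(x)) / (max(x) - min(x))  # map an attribute to [0, 1]
d_norm <- dist(sapply(iris[,-5], normalize01))               # Euclidean distance on normalized data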

Matrix shading

pimage(d)

Reorder using seriation

o <- seriate(d)
pimage(d, o)

Threshold setting

pimage(d, o, zlim = c(0, 1))

K-nearest neighbor setting

Only the k nearest neighbors of each row are displayed; all other entries are set to NA.

knn <- function(d, k) {
  m <- as.matrix(d)
  # indices of the k+1 smallest distances in each row
  # (the first one is the observation itself with distance 0)
  knn <- t(apply(m, MARGIN = 1, FUN = order))[, 1:(k+1)]
  # set all entries that are not among the k nearest neighbors to NA
  for(i in 1:nrow(m)) m[i, -knn[i,]] <- NA
  m
}
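
As a quick sanity check (an illustrative addition, not part of the original code), every row of the result should keep exactly k + 1 finite entries: the k nearest neighbors plus the observation itself.

table(rowSums(!is.na(knn(d, 20))))  # counts of non-NA entries per row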

pimage(knn(d, 20), c(o, o), key = TRUE, prop = TRUE)

Order using a concept tree (a.k.a. a decision tree)

Use a ctree as the concept tree

library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
tree <- ctree(Species~., data=iris)
tree
## 
##   Conditional inference tree with 4 terminal nodes
## 
## Response:  Species 
## Inputs:  Sepal.Length, Sepal.Width, Petal.Length, Petal.Width 
## Number of observations:  150 
## 
## 1) Petal.Length <= 1.9; criterion = 1, statistic = 140.264
##   2)*  weights = 50 
## 1) Petal.Length > 1.9
##   3) Petal.Width <= 1.7; criterion = 1, statistic = 67.894
##     4) Petal.Length <= 4.8; criterion = 0.999, statistic = 13.865
##       5)*  weights = 46 
##     4) Petal.Length > 4.8
##       6)*  weights = 8 
##   3) Petal.Width > 1.7
##     7)*  weights = 46
plot(tree)

table(true=iris$Species, terminal_node=tree@where)
##             terminal_node
## true          2  5  6  7
##   setosa     50  0  0  0
##   versicolor  0 45  4  1
##   virginica   0  1  4 45

The ctree object has a slot called where which gives, for each observation, the number of the terminal node it belongs to (see ? BinaryTree).
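
For illustration (this snippet is not part of the original write-up), the node assignments can be inspected directly next to the species labels:

head(data.frame(Species = iris$Species, node = tree@where))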

pimage(d, order(tree@where))

pimage(d, order(tree@where), zlim = c(0, .7))

Use hierarchical clustering (cluster tree, a.k.a. a dendrogram)

hc <- hclust(d)
hc
## 
## Call:
## hclust(d = d)
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 150
plot(hc)

cl <- cutree(hc, k=3)
table(true=iris$Species, predict=cl)
##             predict
## true          1  2  3
##   setosa      0 49  1
##   versicolor 29  0 21
##   virginica  48  0  2

Use just the cluster information

pimage(d, order(cl))

Use the leaf node order of the dendrogram

pimage(d, hc$order)

pimage(d, hc$order, zlim = c(0,1))

Nearest neighbor on species (just orders by class)

o <- order(iris$Species)
pimage(d, o)

pimage(d, o, zlim = c(0, .7))

Use hierarchical clustering with optimal leaf ordering

order <- seriate(d, method="OLO")
order
## object of class 'ser_permutation', 'list'
## contains permutation vectors for 1-mode data
## 
##   vector length seriation method
## 1           150              OLO
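
As an aside (not in the original write-up), the underlying permutation vector can be extracted with seriation's get_order():

head(get_order(order))  # first few positions of the OLO ordering
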
pimage(d, order)

Use colors to highlight high similarity (low distances)

pimage(d, order, col = bluered(100, bias=.25))

Heat maps

see http://en.wikipedia.org/wiki/Heat_map

On distances

hmap(d)

hmap(d, col = bluered(100, bias=.25))

On original data

hmap(iris[,-5], cexCol = 1, margin = c(7,3))

Scale first to represent observations with below-average values (blue) or above-average values (red)

hmap(scale(iris[,-5]), cexCol = 1, margin = c(7,3), col = bluered(100))

Zoo data

data(Zoo)

d <- dist(Zoo[,-17])

Order by class

o <- order(Zoo$class)
unique(sort(Zoo$class))
## [1] amphibian    bird         fish         insect       invertebrate
## [6] mammal       reptile     
## Levels: amphibian bird fish insect invertebrate mammal reptile
pimage(d, o)

See the emerging structure at increasing distance thresholds

pimage(d, o, zlim = c(0, 1))

pimage(d, o, zlim = c(0, 2))

pimage(d, o, zlim = c(0, 3))

Use knn instead

pimage(knn(d, 7), ser_permutation(o, o),
  prop = TRUE)

Use dissimilarity plot instead (see http://michael.hahsler.net/research/dissplot_JCGS2011/dissplot_preprint.pdf)

res <- dissplot(d, as.integer(Zoo$class), zlim = c(0, 3))

levels(Zoo$class)
## [1] "amphibian"    "bird"         "fish"         "insect"      
## [5] "invertebrate" "mammal"       "reptile"