CC This work is licensed under the Creative Commons Attribution 4.0 International License. For questions please contact Michael Hahsler.

Numbers data set

library(seriation) # used for `pimage()`
set.seed(1234)

numbers <- read.csv("numbers.csv", header=TRUE)
dim(numbers)
## [1] 42000   784

Helper functions

Take a row of pixels and make it into a 28x28 matrix

toMatrix <- function(x) matrix(as.numeric(x), nrow=28, byrow=TRUE)

Convert a matrix back to a vector

toVector <- function(x) as.vector(t(x))

Test if it works

all(toVector(toMatrix(numbers[2,])) == numbers[2,])
## [1] TRUE
toMatrix(numbers[2,])
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
##  [1,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [2,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [3,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [4,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [5,]    0    0    0    0    0    0    0    0    0     0    18    30   137
##  [6,]    0    0    0    0    0    0    0    0   13    86   250   254   254
##  [7,]    0    0    0    0    0    0    0   16  179   254   254   254   254
##  [8,]    0    0    0    0    0    0    0   72  254   254   254   254   254
##  [9,]    0    0    0    0    0    0   61  191  254   254   254   254   254
## [10,]    0    0    0    0    0    0  172  254  254   254   202   147   147
## [11,]    0    0    0    0    0    1  174  254  254    89    67     0     0
## [12,]    0    0    0    0    0   47  254  254  254    29     0     0     0
## [13,]    0    0    0    0    0   80  254  254  240    24     0     0     0
## [14,]    0    0    0    0    0   64  254  254  186     7     0     0     0
## [15,]    0    0    0    0   14  232  254  254  254    29     0     0     0
## [16,]    0    0    0    0   18  254  254  254  254    29     0     0     0
## [17,]    0    0    0    0    2  163  254  254  254    29     0     0     0
## [18,]    0    0    0    0    0   94  254  254  254   200    12     0     0
## [19,]    0    0    0    0    0   15  206  254  254   254   202    66     0
## [20,]    0    0    0    0    0    0   60  212  254   254   254   194    48
## [21,]    0    0    0    0    0    0    0   86  243   254   254   254   254
## [22,]    0    0    0    0    0    0    0    0  114   254   254   254   254
## [23,]    0    0    0    0    0    0    0    0   13   182   254   254   254
## [24,]    0    0    0    0    0    0    0    0    0     8    76   146   254
## [25,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [26,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [27,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [28,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##       [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
##  [1,]     0     0     0     0     0     0     0     0     0     0     0
##  [2,]     0     0     0     0     0     0     0     0     0     0     0
##  [3,]     0     0     0     0     0     0     0     0     0     0     0
##  [4,]     0     0     0     0     0     0     0     0     0     0     0
##  [5,]   137   192    86    72     1     0     0     0     0     0     0
##  [6,]   254   254   217   246   151    32     0     0     0     0     0
##  [7,]   254   254   254   254   254   231    54    15     0     0     0
##  [8,]   254   254   254   254   254   254   254   104     0     0     0
##  [9,]   109    83   199   254   254   254   254   243    85     0     0
## [10,]    45     0    11    29   200   254   254   254   171     0     0
## [11,]     0     0     0     0   128   252   254   254   212    76     0
## [12,]     0     0     0     0     0    83   254   254   254   153     0
## [13,]     0     0     0     0     0    25   240   254   254   153     0
## [14,]     0     0     0     0     0     0   166   254   254   224    12
## [15,]     0     0     0     0     0     0    75   254   254   254    17
## [16,]     0     0     0     0     0     0    48   254   254   254    17
## [17,]     0     0     0     0     0     0    48   254   254   254    17
## [18,]     0     0     0     0     0    16   209   254   254   150     1
## [19,]     0     0     0     0    21   161   254   254   245    31     0
## [20,]    48    34    41    48   209   254   254   254   171     0     0
## [21,]   254   233   243   254   254   254   254   254    86     0     0
## [22,]   254   254   254   254   254   254   239    86    11     0     0
## [23,]   254   254   254   254   254   243    70     0     0     0     0
## [24,]   255   254   255   146    19    15     0     0     0     0     0
## [25,]     0     0     0     0     0     0     0     0     0     0     0
## [26,]     0     0     0     0     0     0     0     0     0     0     0
## [27,]     0     0     0     0     0     0     0     0     0     0     0
## [28,]     0     0     0     0     0     0     0     0     0     0     0
##       [,25] [,26] [,27] [,28]
##  [1,]     0     0     0     0
##  [2,]     0     0     0     0
##  [3,]     0     0     0     0
##  [4,]     0     0     0     0
##  [5,]     0     0     0     0
##  [6,]     0     0     0     0
##  [7,]     0     0     0     0
##  [8,]     0     0     0     0
##  [9,]     0     0     0     0
## [10,]     0     0     0     0
## [11,]     0     0     0     0
## [12,]     0     0     0     0
## [13,]     0     0     0     0
## [14,]     0     0     0     0
## [15,]     0     0     0     0
## [16,]     0     0     0     0
## [17,]     0     0     0     0
## [18,]     0     0     0     0
## [19,]     0     0     0     0
## [20,]     0     0     0     0
## [21,]     0     0     0     0
## [22,]     0     0     0     0
## [23,]     0     0     0     0
## [24,]     0     0     0     0
## [25,]     0     0     0     0
## [26,]     0     0     0     0
## [27,]     0     0     0     0
## [28,]     0     0     0     0
pimage(toMatrix(numbers[2,]))

Some basic image processing

2D convolution

see http://en.wikipedia.org/wiki/Kernel_%28image_processing%29 and http://graphics.stanford.edu/courses/cs178/applets/convolution.html

Note: This is very slow \(O(n^2 k^2)\) (\(n\)=image size, \(k\)=kernel size). So we do it in C++ (Source: http://permalink.gmane.org/gmane.comp.lang.r.rcpp/2926)

For Windows you will need to install Rtools from https://cran.r-project.org/bin/windows/Rtools/ to be able to compile code.

library(Rcpp)
library(inline)
## 
## Attaching package: 'inline'
## 
## The following object is masked from 'package:Rcpp':
## 
##     registerPlugin
convolve_2d <- cxxfunction(signature(sampleS = "numeric",
  kernelS = "numeric"),
  plugin = "Rcpp", '
    Rcpp::NumericMatrix sample(sampleS), kernel(kernelS);
    int x_s = sample.nrow(), x_k = kernel.nrow();
    int y_s = sample.ncol(), y_k = kernel.ncol();

    Rcpp::NumericMatrix output(x_s + x_k - 1, y_s + y_k - 1);
    for (int row = 0; row < x_s; row++) {
     for (int col = 0; col < y_s; col++) {
       for (int i = 0; i < x_k; i++) {
         for (int j = 0; j < y_k; j++) {
           output(row + i, col + j) += sample(row, col) * kernel(i, j);
         }
       }
     }
    }
    return output;
  ')

Blurring images (with a Gaussian Kernel)

blurr <- rbind(
  c(0  , .5, 0 ),
  c(0.5, 1 , .5),
  c(0  , .5, 0 )
)

blurr <- blurr/sum(blurr) #' normalize to sum to 1
pimage(blurr)

#m <- toMatrix(numbers[1,])
#m <- toMatrix(numbers[2,])
#m <- toMatrix(numbers[3,])
m <- toMatrix(numbers[4,])

pimage(m)

mb <- convolve_2d(m, blurr) # blurr
pimage(mb)

mbb <- convolve_2d(mb, blurr) # blurr some more
pimage(mbb)

Keep only 40% of the darkest (non-white) pixels

mbb2 <- mbb>=quantile(mbb[mbb>0], 1-.4)
pimage(mbb2)

Extract patterns

vertical pattern of 2 dark pixels

conv1 <- rbind(
  c(0,0,1,1,0,0),
  c(0,0,1,1,0,0),
  c(0,0,1,1,0,0),
  c(0,0,1,1,0,0),
  c(0,0,1,1,0,0),
  c(0,0,1,1,0,0)
)

make the mean of the kernel 0

conv1 <- conv1-mean(conv1)

pimage(conv1)

the horizontal pattern

conv2 <- t(conv1)
pimage(conv2)

we start with a b/w image

mc1 <- convolve_2d(mbb2, conv1)
pimage(mc1)

# use 5% of the highest values
pimage(mc1>quantile(mc1, .95))

find horizontal lines

mc2 <- convolve_2d(mbb2, conv2)
pimage(mc2)

pimage(mc2>quantile(mc1, .95))

Edge detection

conv3 <- rbind(
  c(0 ,-2 ,0 ),
  c(-2, 8 ,-2),
  c(0 ,-2 ,0 )
)

mc3 <- convolve_2d(mbb2, conv3)
pimage(mc3)

pimage(mc3<0)

End point detection

We use thinning and then the edge detection kernel. Note: There are better ways to do this!

source("thinning.R")
pimage(m>100)

m_thin <- thinImage(m>100)
pimage(m_thin)

mc4 <- convolve_2d(m_thin, conv3)
pimage(mc4)

pimage(mc4>4)

sum(mc4>4)
## [1] 3

Clustering

Simple approach: Take a small sample and cluster the pixels directly. I use 10 clusters, but different writing styles mean that we should use more clusters. You should probably use some preprocessing (e.g., blurring, rotation) and feature engineering instead of the raw pixels.

s <- numbers[sample(1:nrow(numbers), 1000),]
km <- kmeans(s, c=10, nstart=5)
#km

How many are in each cluster?

km$size
##  [1] 120  90  88  70 127 157  52  48  86 162

Within Sum of Squares of the clusters

km$withinss
##  [1] 267419230 135748347 274060197  83591636 365901270 429188824 152646281
##  [8] 135409185 225621254 460167735

Look at some members of cluster 1

s1 <- s[km$cluster==1,]
pimage(toMatrix(s1[1,]))

pimage(toMatrix(s1[2,]))

pimage(toMatrix(s1[3,]))

pimage(toMatrix(s1[4,]))

pimage(toMatrix(s1[5,]))

pimage(toMatrix(s1[6,]))

Look at centroids

pimage(toMatrix(km$centers[1,]))

pimage(toMatrix(km$centers[2,]))

pimage(toMatrix(km$centers[3,]))

pimage(toMatrix(km$centers[4,]))

pimage(toMatrix(km$centers[5,]))

pimage(toMatrix(km$centers[6,]))

pimage(toMatrix(km$centers[7,]))

pimage(toMatrix(km$centers[8,]))

pimage(toMatrix(km$centers[9,]))

pimage(toMatrix(km$centers[10,]))

How similar are the cluster centers?

Measured as Euclidean distance

plot(hclust(dist(km$centers)))

Measured as Pearson Correlation

plot(hclust(as.dist(1-cor(t(km$centers)))))

Measured as amount of ink on the paper

plot(hclust(as.dist(
  abs(outer(rowSums(km$centers), rowSums(km$centers), FUN = "-"))
  )))

Feature Reduction with PCA

pc <- prcomp(s)

# How important are the first principal components?
plot(pc)

str(pc)
## List of 5
##  $ sdev    : num [1:784] 607 503 447 436 421 ...
##  $ rotation: num [1:784, 1:784] 3.85e-19 -1.39e-16 0.00 -2.22e-16 0.00 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:784] "pixel0" "pixel1" "pixel2" "pixel3" ...
##   .. ..$ : chr [1:784] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:784] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "names")= chr [1:784] "pixel0" "pixel1" "pixel2" "pixel3" ...
##  $ scale   : logi FALSE
##  $ x       : num [1:1000, 1:784] 111 -608 450 -603 580 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:1000] "4776" "26136" "25589" "26181" ...
##   .. ..$ : chr [1:784] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"

$x contains the data projected on the PCs. Let’s look at the first two PCs.

plot(pc$x[,1:2])

show the row number in s for the image

plot(pc$x[,1:2], col = 0)
text(pc$x[,1], pc$x[,2], 1:nrow(s), cex = .6)

look at some images to the far left

pimage(toMatrix(s[986,]))

pimage(toMatrix(s[432,]))

pimage(toMatrix(s[110,]))

pimage(toMatrix(s[315,]))

cluster the images using only the first 10 PCs

data_subspace <- pc$x[,1:10]
k <- 12
km <- kmeans(data_subspace, centers = k)
plot(data_subspace, col = km$cluster)

show average image for all clusters

for(i in 1:k)
pimage(toMatrix(colMeans(s[km$cluster == i,])))

25 Best and Worst Written Digits using LOF

library(dbscan)

l <- lof(s)

# ' Best handwriting
for(i in order(l)[1:25]) pimage(toMatrix(s[i,]))

Worst handwriting

for(i in order(l, decreasing = TRUE)[1:25]) pimage(toMatrix(s[i,]))