This work is licensed under the Creative Commons Attribution 4.0 International License. For questions please contact Michael Hahsler.
library(seriation) # used for `pimage()`
set.seed(1234)
numbers <- read.csv("numbers.csv", header=TRUE)
dim(numbers)
## [1] 42000 784
Take a row of pixels and make it into a 28x28 matrix
toMatrix <- function(x) matrix(as.numeric(x), nrow=28, byrow=TRUE)
Convert a matrix back to a vector
toVector <- function(x) as.vector(t(x))
Test if it works
all(toVector(toMatrix(numbers[2,])) == numbers[2,])
## [1] TRUE
toMatrix(numbers[2,])
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0 18 30 137
## [6,] 0 0 0 0 0 0 0 0 13 86 250 254 254
## [7,] 0 0 0 0 0 0 0 16 179 254 254 254 254
## [8,] 0 0 0 0 0 0 0 72 254 254 254 254 254
## [9,] 0 0 0 0 0 0 61 191 254 254 254 254 254
## [10,] 0 0 0 0 0 0 172 254 254 254 202 147 147
## [11,] 0 0 0 0 0 1 174 254 254 89 67 0 0
## [12,] 0 0 0 0 0 47 254 254 254 29 0 0 0
## [13,] 0 0 0 0 0 80 254 254 240 24 0 0 0
## [14,] 0 0 0 0 0 64 254 254 186 7 0 0 0
## [15,] 0 0 0 0 14 232 254 254 254 29 0 0 0
## [16,] 0 0 0 0 18 254 254 254 254 29 0 0 0
## [17,] 0 0 0 0 2 163 254 254 254 29 0 0 0
## [18,] 0 0 0 0 0 94 254 254 254 200 12 0 0
## [19,] 0 0 0 0 0 15 206 254 254 254 202 66 0
## [20,] 0 0 0 0 0 0 60 212 254 254 254 194 48
## [21,] 0 0 0 0 0 0 0 86 243 254 254 254 254
## [22,] 0 0 0 0 0 0 0 0 114 254 254 254 254
## [23,] 0 0 0 0 0 0 0 0 13 182 254 254 254
## [24,] 0 0 0 0 0 0 0 0 0 8 76 146 254
## [25,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [26,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [27,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [28,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,] 0 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0
## [5,] 137 192 86 72 1 0 0 0 0 0 0
## [6,] 254 254 217 246 151 32 0 0 0 0 0
## [7,] 254 254 254 254 254 231 54 15 0 0 0
## [8,] 254 254 254 254 254 254 254 104 0 0 0
## [9,] 109 83 199 254 254 254 254 243 85 0 0
## [10,] 45 0 11 29 200 254 254 254 171 0 0
## [11,] 0 0 0 0 128 252 254 254 212 76 0
## [12,] 0 0 0 0 0 83 254 254 254 153 0
## [13,] 0 0 0 0 0 25 240 254 254 153 0
## [14,] 0 0 0 0 0 0 166 254 254 224 12
## [15,] 0 0 0 0 0 0 75 254 254 254 17
## [16,] 0 0 0 0 0 0 48 254 254 254 17
## [17,] 0 0 0 0 0 0 48 254 254 254 17
## [18,] 0 0 0 0 0 16 209 254 254 150 1
## [19,] 0 0 0 0 21 161 254 254 245 31 0
## [20,] 48 34 41 48 209 254 254 254 171 0 0
## [21,] 254 233 243 254 254 254 254 254 86 0 0
## [22,] 254 254 254 254 254 254 239 86 11 0 0
## [23,] 254 254 254 254 254 243 70 0 0 0 0
## [24,] 255 254 255 146 19 15 0 0 0 0 0
## [25,] 0 0 0 0 0 0 0 0 0 0 0
## [26,] 0 0 0 0 0 0 0 0 0 0 0
## [27,] 0 0 0 0 0 0 0 0 0 0 0
## [28,] 0 0 0 0 0 0 0 0 0 0 0
## [,25] [,26] [,27] [,28]
## [1,] 0 0 0 0
## [2,] 0 0 0 0
## [3,] 0 0 0 0
## [4,] 0 0 0 0
## [5,] 0 0 0 0
## [6,] 0 0 0 0
## [7,] 0 0 0 0
## [8,] 0 0 0 0
## [9,] 0 0 0 0
## [10,] 0 0 0 0
## [11,] 0 0 0 0
## [12,] 0 0 0 0
## [13,] 0 0 0 0
## [14,] 0 0 0 0
## [15,] 0 0 0 0
## [16,] 0 0 0 0
## [17,] 0 0 0 0
## [18,] 0 0 0 0
## [19,] 0 0 0 0
## [20,] 0 0 0 0
## [21,] 0 0 0 0
## [22,] 0 0 0 0
## [23,] 0 0 0 0
## [24,] 0 0 0 0
## [25,] 0 0 0 0
## [26,] 0 0 0 0
## [27,] 0 0 0 0
## [28,] 0 0 0 0
pimage(toMatrix(numbers[2,]))
see http://en.wikipedia.org/wiki/Kernel_%28image_processing%29 and http://graphics.stanford.edu/courses/cs178/applets/convolution.html
Note: This is very slow \(O(n^2 k^2)\) (\(n\)=image size, \(k\)=kernel size). So we do it in C++ (Source: http://permalink.gmane.org/gmane.comp.lang.r.rcpp/2926)
For Windows you will need to install Rtools from https://cran.r-project.org/bin/windows/Rtools/ to be able to compile code.
library(Rcpp)
library(inline)
##
## Attaching package: 'inline'
##
## The following object is masked from 'package:Rcpp':
##
## registerPlugin
convolve_2d <- cxxfunction(signature(sampleS = "numeric",
kernelS = "numeric"),
plugin = "Rcpp", '
Rcpp::NumericMatrix sample(sampleS), kernel(kernelS);
int x_s = sample.nrow(), x_k = kernel.nrow();
int y_s = sample.ncol(), y_k = kernel.ncol();
Rcpp::NumericMatrix output(x_s + x_k - 1, y_s + y_k - 1);
for (int row = 0; row < x_s; row++) {
for (int col = 0; col < y_s; col++) {
for (int i = 0; i < x_k; i++) {
for (int j = 0; j < y_k; j++) {
output(row + i, col + j) += sample(row, col) * kernel(i, j);
}
}
}
}
return output;
')
blurr <- rbind(
c(0 , .5, 0 ),
c(0.5, 1 , .5),
c(0 , .5, 0 )
)
blurr <- blurr/sum(blurr) #' normalize to sum to 1
pimage(blurr)
#m <- toMatrix(numbers[1,])
#m <- toMatrix(numbers[2,])
#m <- toMatrix(numbers[3,])
m <- toMatrix(numbers[4,])
pimage(m)
mb <- convolve_2d(m, blurr) # blurr
pimage(mb)
mbb <- convolve_2d(mb, blurr) # blurr some more
pimage(mbb)
mbb2 <- mbb>=quantile(mbb[mbb>0], 1-.4)
pimage(mbb2)
vertical pattern of 2 dark pixels
conv1 <- rbind(
c(0,0,1,1,0,0),
c(0,0,1,1,0,0),
c(0,0,1,1,0,0),
c(0,0,1,1,0,0),
c(0,0,1,1,0,0),
c(0,0,1,1,0,0)
)
make the mean of the kernel 0
conv1 <- conv1-mean(conv1)
pimage(conv1)
the horizontal pattern
conv2 <- t(conv1)
pimage(conv2)
we start with a b/w image
mc1 <- convolve_2d(mbb2, conv1)
pimage(mc1)
# use 5% of the highest values
pimage(mc1>quantile(mc1, .95))
find horizontal lines
mc2 <- convolve_2d(mbb2, conv2)
pimage(mc2)
pimage(mc2>quantile(mc1, .95))
conv3 <- rbind(
c(0 ,-2 ,0 ),
c(-2, 8 ,-2),
c(0 ,-2 ,0 )
)
mc3 <- convolve_2d(mbb2, conv3)
pimage(mc3)
pimage(mc3<0)
We use thinning and then the edge detection kernel. Note: There are better ways to do this!
source("thinning.R")
pimage(m>100)
m_thin <- thinImage(m>100)
pimage(m_thin)
mc4 <- convolve_2d(m_thin, conv3)
pimage(mc4)
pimage(mc4>4)
sum(mc4>4)
## [1] 3
Simple approach: Take a small sample and cluster the pixels directly. I use 10 clusters, but different writing styles mean that we should use more clusters. You should probably use some preprocessing (e.g., blurring, rotation) and feature engineering instead of the raw pixels.
s <- numbers[sample(1:nrow(numbers), 1000),]
km <- kmeans(s, c=10, nstart=5)
#km
How many are in each cluster?
km$size
## [1] 120 90 88 70 127 157 52 48 86 162
Within Sum of Squares of the clusters
km$withinss
## [1] 267419230 135748347 274060197 83591636 365901270 429188824 152646281
## [8] 135409185 225621254 460167735
s1 <- s[km$cluster==1,]
pimage(toMatrix(s1[1,]))
pimage(toMatrix(s1[2,]))
pimage(toMatrix(s1[3,]))
pimage(toMatrix(s1[4,]))
pimage(toMatrix(s1[5,]))
pimage(toMatrix(s1[6,]))
pimage(toMatrix(km$centers[1,]))
pimage(toMatrix(km$centers[2,]))
pimage(toMatrix(km$centers[3,]))
pimage(toMatrix(km$centers[4,]))
pimage(toMatrix(km$centers[5,]))
pimage(toMatrix(km$centers[6,]))
pimage(toMatrix(km$centers[7,]))
pimage(toMatrix(km$centers[8,]))
pimage(toMatrix(km$centers[9,]))
pimage(toMatrix(km$centers[10,]))
Measured as Euclidean distance
plot(hclust(dist(km$centers)))
Measured as Pearson Correlation
plot(hclust(as.dist(1-cor(t(km$centers)))))
Measured as amount of ink on the paper
plot(hclust(as.dist(
abs(outer(rowSums(km$centers), rowSums(km$centers), FUN = "-"))
)))
pc <- prcomp(s)
# How important are the first principal components?
plot(pc)
str(pc)
## List of 5
## $ sdev : num [1:784] 607 503 447 436 421 ...
## $ rotation: num [1:784, 1:784] 3.85e-19 -1.39e-16 0.00 -2.22e-16 0.00 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:784] "pixel0" "pixel1" "pixel2" "pixel3" ...
## .. ..$ : chr [1:784] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:784] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "names")= chr [1:784] "pixel0" "pixel1" "pixel2" "pixel3" ...
## $ scale : logi FALSE
## $ x : num [1:1000, 1:784] 111 -608 450 -603 580 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:1000] "4776" "26136" "25589" "26181" ...
## .. ..$ : chr [1:784] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
$x contains the data projected on the PCs. Let’s look at the first two PCs.
plot(pc$x[,1:2])
show the row number in s for the image
plot(pc$x[,1:2], col = 0)
text(pc$x[,1], pc$x[,2], 1:nrow(s), cex = .6)
look at some images to the far left
pimage(toMatrix(s[986,]))
pimage(toMatrix(s[432,]))
pimage(toMatrix(s[110,]))
pimage(toMatrix(s[315,]))
cluster the images using only the first 10 PCs
data_subspace <- pc$x[,1:10]
k <- 12
km <- kmeans(data_subspace, centers = k)
plot(data_subspace, col = km$cluster)
show average image for all clusters
for(i in 1:k)
pimage(toMatrix(colMeans(s[km$cluster == i,])))
library(dbscan)
l <- lof(s)
# ' Best handwriting
for(i in order(l)[1:25]) pimage(toMatrix(s[i,]))
Worst handwriting
for(i in order(l, decreasing = TRUE)[1:25]) pimage(toMatrix(s[i,]))