#install.packages("R.matlab")
library("R.matlab")
## R.matlab v3.2.0 (2015-02-24) successfully loaded. See ?R.matlab for help.
## 
## Attaching package: 'R.matlab'
## 
## The following objects are masked from 'package:base':
## 
##     getOption, isOpen
library("caret")
## Loading required package: lattice
## Loading required package: ggplot2

Read and prepare the data from the paper

Source: http://web.mit.edu/stgoh/www/imbalanceddatafolder/

# Load the MATLAB data set. readMat() returns a list with one element per
# variable stored in the .mat file (see the str() output below).
data <- readMat("diamonddistribution.mat")
str(data)
## List of 4
##  $ Apositive      : num [1:386, 1:2] 24.8 16.7 25.6 80.1 11 ...
##  $ Anegative      : num [1:9614, 1:2] 21.4 98 73.2 47.2 65.3 ...
##  $ positivepermute: num [1:10, 1:386] 343 353 31 78 322 312 110 197 293 163 ...
##  $ negativepermute: num [1:10, 1:9614] 7106 3672 4701 2760 2945 ...
##  - attr(*, "header")=List of 3
##   ..$ description: chr "MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Sat Sep 07 15:50:27 2013                                        "
##   ..$ version    : chr "5"
##   ..$ endian     : chr "little"

# Fail early if the file does not have the two-column layout used below;
# otherwise a missing element only surfaces as an obscure error when the
# column names are assigned.
stopifnot(
  is.matrix(data$Apositive), ncol(data$Apositive) == 2L,
  is.matrix(data$Anegative), ncol(data$Anegative) == 2L
)

# Stack the positive and the negative examples into one data frame and tag
# every row with its class label.
d <- rbind(
  data.frame(data$Apositive, class = TRUE),
  data.frame(data$Anegative, class = FALSE)
)
colnames(d) <- c("x", "y", "class")

# Name the columns first, then convert the label by name instead of by
# position. Explicit levels pin the level order to FALSE, TRUE.
d$class <- factor(d$class, levels = c(FALSE, TRUE))

# Shuffle the rows so that the positional train/test split further down is a
# random one. The seed is fixed so the shuffle (and everything derived from
# it) is reproducible from run to run.
set.seed(1234)
d <- d[sample(nrow(d)), ]

Data set statistics

# First rows of the shuffled data set
head(d)
##             x         y class
## 6367 72.86221  3.893148 FALSE
## 9749 59.66190 77.928175 FALSE
## 4607 72.90128 53.873021 FALSE
## 8506 86.14446 88.801245 FALSE
## 4152 92.60641 61.218094 FALSE
## 777  79.35508 21.481507 FALSE

# Number of rows and columns
dim(d)
## [1] 10000     3

# Per-column summary; the counts in the class column show the imbalance
summary(d)
##        x                   y             class     
##  Min.   :  0.01231   Min.   : 0.0011   FALSE:9614  
##  1st Qu.: 24.54731   1st Qu.:24.3116   TRUE : 386  
##  Median : 49.72268   Median :49.2927               
##  Mean   : 49.61158   Mean   :49.7376               
##  3rd Qu.: 74.42327   3rd Qu.:74.9864               
##  Max.   : 99.99728   Max.   :99.9945

# Class frequencies, kept in `tbl` for later use, and the share of rows
# labelled TRUE
tbl <- table(d[["class"]])
prop.table(tbl)["TRUE"]
##   TRUE 
## 0.0386
##   TRUE 
## 0.0386

Imbalance factor

# Reciprocal of the TRUE share: the number of rows per TRUE-labelled row
1 / (tbl["TRUE"] / sum(tbl))
##     TRUE 
## 25.90674

# Scatter plot of the two features. FALSE rows are drawn with colour index 1
# and TRUE rows with colour index 2.
plot(
  d[, c("x", "y")],
  col = ifelse(d[["class"]] == "TRUE", 2L, 1L),
  pch = 20
)

Split into training and testing data

# Rows 1 through 5000 of the shuffled data form the training set.
train <- d[seq_len(5000), ]
# Rows 5001 through 6000 form the test set; later rows are not used here.
test <- d[seq.int(5001, 6000), ]

Use a classification tree

library("rpart")
library("rpart.plot")

# Fit a tree that predicts `class` from every other column of the training
# set, then draw the fitted tree.
rp <- rpart(formula = class ~ ., data = train)
rpart.plot(rp)