Set the random number generator to make the experiments repeatable

set.seed(1234)

Prepare the data

We use the iris data set

data(iris)

shuffle since flowers are in order by species!

# Shuffle the rows; seq_len() is the safe idiom (1:nrow() misbehaves
# when nrow() is 0). sample() on a vector returns a permutation of it.
x <- iris[sample(seq_len(nrow(iris))),]

Make the Iris data a little messy

# Append a pure-noise column (a "useless" predictor the models should
# ignore), then perturb the first three measurement columns with
# standard-normal noise. NOTE: the order of these rnorm() calls matters
# for reproducing the exact values shown in the output below.
x <- cbind(x, useless = rnorm(nrow(x)))
x[,1] <- x[,1] + rnorm(nrow(x))
x[,2] <- x[,2] + rnorm(nrow(x))
x[,3] <- x[,3] + rnorm(nrow(x))

Look at the scatterplot matrix

plot(x, col=x$Species)

plot of chunk unnamed-chunk-5

head(x)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species useless
## 18         4.440       3.632       0.5518         0.3     setosa -0.1534
## 93         8.719       2.324       3.6531         1.2 versicolor -1.3907
## 91         6.177       1.921       4.6275         1.2 versicolor -0.7236
## 92         5.416       3.501       4.5726         1.4 versicolor  0.2583
## 126        7.386       2.868       6.2009         1.8  virginica -0.3171
## 149        5.876       1.565       5.5989         2.3  virginica -0.1778

Create training and testing data (Note: the data is in random order)

# First 100 rows for training, remaining 50 for testing. This simple
# split is safe only because the rows were shuffled above.
train <- x[1:100,]
test <- x[101:150,]

(Multiple) Linear Regression

Let us model Petal.Width as the dependent variable

# Multiple linear regression: Petal.Width as the response, the three
# remaining measurements plus the noise column as predictors. Printing
# the model shows the fitted coefficients.
model <- lm(Petal.Width ~ Sepal.Length
    + Sepal.Width + Petal.Length + useless,
    data = train)
model
## 
## Call:
## lm(formula = Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length + 
##     useless, data = train)
## 
## Coefficients:
##  (Intercept)  Sepal.Length   Sepal.Width  Petal.Length       useless  
##      -0.5266        0.0778        0.0158        0.3402       -0.0512
coef(model)
##  (Intercept) Sepal.Length  Sepal.Width Petal.Length      useless 
##     -0.52664      0.07782      0.01580      0.34024     -0.05121
summary(model)
## 
## Call:
## lm(formula = Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length + 
##     useless, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8128 -0.3370 -0.0093  0.3102  1.3751 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.5266     0.2242   -2.35    0.021 *  
## Sepal.Length   0.0778     0.0381    2.04    0.044 *  
## Sepal.Width    0.0158     0.0377    0.42    0.676    
## Petal.Length   0.3402     0.0255   13.34   <2e-16 ***
## useless       -0.0512     0.0478   -1.07    0.287    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.415 on 95 degrees of freedom
## Multiple R-squared:  0.744,  Adjusted R-squared:  0.733 
## F-statistic:   69 on 4 and 95 DF,  p-value: <2e-16

Petal.Length is highly significant (its estimated coefficient is significantly different from 0). As expected, useless is not significant.

Let us create two additional models with fewer predictors

# model2: drop the (insignificant) useless predictor.
model2 <- lm(Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length,
            data = train)

# model3: keep only the single strongest predictor, Petal.Length.
model3 <- lm(Petal.Width ~ Petal.Length,
            data = train)

Compare nested models with Analysis of variance (ANOVA)

Null hypothesis is that the treatments are the same

anova(model, model2, model3)
## Analysis of Variance Table
## 
## Model 1: Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length + useless
## Model 2: Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length
## Model 3: Petal.Width ~ Petal.Length
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)  
## 1     95 16.4                           
## 2     96 16.6 -1    -0.198 1.15  0.287  
## 3     98 17.6 -2    -0.982 2.85  0.063 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model 2 is not significantly worse than model 1. Model 3 is significantly worse than model 2 at alpha=10%, but not at alpha=5%.

Prediction using the linear model

Use model 2 for prediction

test[1:5,]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species  useless
## 105        5.783       3.548       7.5226         2.2 virginica -0.83758
## 115        6.883       2.296       4.2183         2.4 virginica -1.12376
## 14         3.347       4.059       0.7201         0.1    setosa  3.04377
## 117        7.626       4.097       6.4961         1.8 virginica  0.23502
## 139        5.351       1.833       4.3409         1.8 virginica -0.03326
predict(model2, test[1:5,])
##    105    115     14    117    139 
## 2.5398 1.4839 0.0243 2.3539 1.3892

Plot predictions over actual values for comparison

# Predicted vs. actual Petal.Width on the test set; points on the red
# y = x line correspond to perfect predictions.
plot(test[,"Petal.Width"], predict(model2, test),
  xlim=c(0,4), ylim=c(0,4))
abline(0,1, col="red")

plot of chunk unnamed-chunk-11

See generalized linear models (glm) and nonlinear least squares (nlm)

Classification models

We will model Species

k Nearest Neighbors

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
model_knn3 <- knn3(Species ~ ., k=5, data=train)
model_knn3
## 5-nearest neighbor classification model
## 
## Call:
## knn3.formula(formula = Species ~ ., data = train, k = 5)
## 
## Training set class distribution:
## 
##     setosa versicolor  virginica 
##         38         29         33
predict(model_knn3, test)
##       setosa versicolor virginica
##  [1,]    0.0        0.2       0.8
##  [2,]    0.0        0.2       0.8
##  [3,]    1.0        0.0       0.0
##  [4,]    0.0        0.0       1.0
##  [5,]    0.0        0.4       0.6
##  [6,]    0.0        0.2       0.8
##  [7,]    0.0        0.4       0.6
##  [8,]    0.0        0.4       0.6
##  [9,]    1.0        0.0       0.0
## [10,]    0.0        0.2       0.8
## [11,]    0.6        0.2       0.2
## [12,]    0.0        0.4       0.6
## [13,]    0.0        0.6       0.4
## [14,]    0.0        1.0       0.0
## [15,]    0.0        0.4       0.6
## [16,]    0.0        0.0       1.0
## [17,]    0.0        0.8       0.2
## [18,]    0.2        0.6       0.2
## [19,]    0.0        0.8       0.2
## [20,]    0.0        0.6       0.4
## [21,]    0.0        0.6       0.4
## [22,]    1.0        0.0       0.0
## [23,]    0.0        0.8       0.2
## [24,]    0.0        0.8       0.2
## [25,]    0.0        0.2       0.8
## [26,]    0.0        0.4       0.6
## [27,]    1.0        0.0       0.0
## [28,]    1.0        0.0       0.0
## [29,]    1.0        0.0       0.0
## [30,]    0.0        1.0       0.0
## [31,]    0.0        0.8       0.2
## [32,]    0.0        0.4       0.6
## [33,]    1.0        0.0       0.0
## [34,]    0.6        0.4       0.0
## [35,]    0.6        0.2       0.2
## [36,]    0.0        0.6       0.4
## [37,]    0.0        0.0       1.0
## [38,]    1.0        0.0       0.0
## [39,]    0.0        0.4       0.6
## [40,]    0.0        0.2       0.8
## [41,]    0.0        0.8       0.2
## [42,]    0.8        0.2       0.0
## [43,]    0.0        0.6       0.4
## [44,]    0.0        0.0       1.0
## [45,]    0.0        0.2       0.8
## [46,]    0.0        0.8       0.2
## [47,]    0.0        0.2       0.8
## [48,]    1.0        0.0       0.0
## [49,]    0.0        0.6       0.4
## [50,]    1.0        0.0       0.0

predict() often returns the posterior probabilities for each class

Predict class labels

predict(model_knn3, test, type="class")
##  [1] virginica  virginica  setosa     virginica  virginica  virginica 
##  [7] virginica  virginica  setosa     virginica  setosa     virginica 
## [13] versicolor versicolor virginica  virginica  versicolor versicolor
## [19] versicolor versicolor versicolor setosa     versicolor versicolor
## [25] virginica  virginica  setosa     setosa     setosa     versicolor
## [31] versicolor virginica  setosa     setosa     setosa     versicolor
## [37] virginica  setosa     virginica  virginica  versicolor setosa    
## [43] versicolor virginica  virginica  versicolor virginica  setosa    
## [49] versicolor setosa    
## Levels: setosa versicolor virginica

Compare prediction with real classes (confusion table)

# Confusion table: rows are the true species, columns the predicted
# species; off-diagonal counts are misclassifications.
table(true=test$Species,
  predicted=predict(model_knn3, test, type="class"))
##             predicted
## true         setosa versicolor virginica
##   setosa         12          0         0
##   versicolor      2         13         6
##   virginica       0          3        14

Naive Bayes Classifier

library(klaR)
## Loading required package: MASS
model_nb <- NaiveBayes(Species ~ ., data=train)
model_nb
## $apriori
## grouping
##     setosa versicolor  virginica 
##       0.38       0.29       0.33 
## 
## $tables
## $tables$Sepal.Length
##             [,1]   [,2]
## setosa     5.081 0.8881
## versicolor 6.149 1.2789
## virginica  6.460 1.1762
## 
## $tables$Sepal.Width
##             [,1]  [,2]
## setosa     3.285 1.201
## versicolor 2.652 1.087
## virginica  2.902 1.059
## 
## $tables$Petal.Length
##             [,1]   [,2]
## setosa     1.390 0.8650
## versicolor 4.420 0.8756
## virginica  5.037 0.9558
## 
## $tables$Petal.Width
##              [,1]    [,2]
## setosa     0.2316 0.09036
## versicolor 1.3345 0.20049
## virginica  2.0576 0.27160
## 
## $tables$useless
##               [,1]   [,2]
## setosa      0.1900 0.8527
## versicolor  0.1497 0.8769
## virginica  -0.1926 0.9160
## 
## 
## $levels
## [1] "setosa"     "versicolor" "virginica" 
## 
## $call
## NaiveBayes.default(x = X, grouping = Y)
## 
## $x
##     Sepal.Length Sepal.Width Petal.Length Petal.Width  useless
## 18         4.440      3.6323      0.55180         0.3 -0.15340
## 93         8.719      2.3245      3.65306         1.2 -1.39070
## 91         6.177      1.9212      4.62748         1.2 -0.72358
## 92         5.416      3.5008      4.57260         1.4  0.25826
## 126        7.386      2.8683      6.20088         1.8 -0.31706
## 149        5.876      1.5650      5.59894         2.3 -0.17779
## 2          4.625      0.3483      2.11460         0.2 -0.16999
## 34         4.566      3.6194      1.12682         0.2 -1.37230
## 95         5.717      4.1542      5.83535         1.3 -0.17379
## 73         6.619      3.3381      5.33769         1.5  0.85023
## 98         5.122      4.1151      3.10646         1.3  0.69761
## 76         3.367      3.9825      4.12335         1.4  0.55000
## 40         4.845      3.7158      2.01588         0.2 -0.40273
## 127        6.230      1.2929      4.38894         1.8 -0.19159
## 138        6.994      3.3056      5.84500         1.8 -1.19453
## 114        5.759      4.0972      5.43440         2.0 -0.05316
## 39         4.813     -0.3961      2.60348         0.2  0.25520
## 36         3.902      2.4186      0.54830         0.2  1.70596
## 25         5.511      4.5025      1.88312         0.2  1.00151
## 31         5.519      3.6287      1.03231         0.2 -0.49558
## 42         4.752      3.0894      0.17287         0.3  0.35555
## 136        9.057      3.4571      4.25103         2.3 -1.13461
## 21         5.804      3.9388      1.89553         0.2  0.87820
## 6          5.664      3.9146      2.30157         0.4  0.97292
## 28         5.468      2.5835      2.13916         0.2  2.12112
## 102        6.237      1.4732      3.71001         1.9  0.41452
## 66         7.760      3.1362      3.27032         1.4 -0.47472
## 113        7.252      2.5786      4.35885         2.1  0.06599
## 125        7.363      2.4006      5.01133         2.1 -0.50248
## 137        5.164      3.8174      5.77020         2.4 -0.82600
## 55         6.130      2.9534      5.19448         1.5  0.16699
## 32         6.877      4.8633      0.68497         0.4 -0.89626
## 133        5.176      1.6785      2.73565         2.2  0.16819
## 60         5.458      2.1822      4.85382         1.4  0.35497
## 22         5.505      3.6251      2.11528         0.4 -0.05211
## 88         7.276      0.8922      3.69913         1.3 -0.19593
## 23         4.251      3.3153     -0.07366         0.2 -0.64907
## 30         4.859      2.4918      1.30001         0.2 -1.10977
## 112        4.637      0.5524      5.40811         1.9  0.84927
## 90         5.839      2.2162      4.72320         1.3  0.02236
## 61         4.333      1.4659      4.89010         1.0  0.83114
## 71         5.661      4.3330      3.95112         1.8 -1.24429
## 143        4.612      2.0959      5.39072         1.9  0.16903
## 67         5.985      3.5575      5.83418         1.5  0.67317
## 35         5.567      3.2426      0.59866         0.2 -0.02628
## 53         6.595      1.8631      4.67724         1.5 -0.19139
## 109        8.525      2.8741      4.02853         1.8 -0.78191
## 50         5.671      3.1939      1.68742         0.2  2.05816
## 132        8.849      3.9661      5.75881         2.0  0.75050
## 78         8.749      3.6580      4.07506         1.7  1.82421
## 8          4.349      3.5140      0.72002         0.2  0.08006
## 131        8.209      4.3852      6.54450         1.9 -0.63141
## 104        7.287      3.0156      3.43753         1.8 -1.51329
## 49         5.294      2.4460      1.07449         0.2 -0.63610
## 15         6.119      4.7419      0.18628         0.2  0.22630
## 48         3.588      1.9771      0.43788         0.2  1.01369
## 47         5.570      1.8811      2.07463         0.2  0.25275
## 70         4.899      1.6901      4.25175         1.1 -1.17195
## 17         6.214      3.1553      1.19298         0.4  0.66871
## 101        5.489      2.6925      6.53752         2.5 -1.65010
## 148        6.819      4.6068      4.59790         2.0 -0.36585
## 4          3.753      4.7356     -0.52330         0.2 -0.31612
## 146        6.454      3.7261      6.35442         2.3 -1.94825
## 144        5.247      1.9262      5.66931         2.3  0.92006
## 128        6.228      3.0231      4.53491         1.8 -0.62287
## 110        8.185      4.0871      6.25752         2.5 -0.33404
## 26         5.183      2.8630      2.36593         0.2  1.39515
## 43         2.634      3.3717      1.72609         0.2  0.63667
## 5          4.379      2.6552      2.32727         0.2 -0.10843
## 46         6.456      1.7124      1.44613         0.3  0.51376
## 10         6.710      4.5078      0.99571         0.1  0.39927
## 140        5.725      3.3674      5.09955         2.1  1.66286
## 87         6.333      2.6916      5.52292         1.5  0.27589
## 85         5.754      3.3253      3.65073         1.5  0.50627
## 7          4.919      5.4625      0.44639         0.3  0.34755
## 134        5.720      3.1627      5.41348         1.5 -0.37724
## 29         4.247      4.8114      2.01247         0.2  0.09762
## 121        6.721      4.5675      4.00892         2.3  1.63874
## 24         6.110      2.8928      2.48470         0.5 -0.87559
## 142        6.924      3.8625      5.11193         2.3  0.12176
## 65         4.951      2.2488      3.41874         1.3  1.36213
## 33         4.696      2.6263      2.60695         0.1 -0.23462
## 80         7.314      1.3983      4.97939         1.0 -1.05338
## 37         5.053      3.3513      0.15260         0.2 -0.86978
## 13         5.563      4.7971      2.41165         0.1 -0.39013
## 59         8.072      3.0048      3.96793         1.3 -0.84735
## 122        6.044      1.9972      5.03236         2.0 -0.26064
## 20         4.678      4.0304      1.98093         0.3 -0.41442
## 68         5.760      3.3980      4.23318         1.0 -0.18305
## 120        5.508      0.9048      3.53714         1.5  0.40706
## 62         7.128      1.9453      3.69360         1.5  0.62463
## 54         5.350      0.3584      5.53966         1.3  1.67821
## 100        7.250      1.5304      4.27279         1.3 -0.06869
## 58         4.338      1.5126      2.46418         1.0 -0.32084
## 141        6.053      2.8081      4.94407         2.4  1.47101
## 74         6.243      4.2101      5.75757         1.2  1.70433
## 147        6.324      3.3886      5.34608         1.9  0.04324
## 111        5.996      3.4730      4.20281         2.0 -0.33266
## 145        5.119      2.7334      5.70627         2.5 -1.82224
## 38         4.930      3.5598      1.99667         0.1  1.41126
## 
## $usekernel
## [1] FALSE
## 
## $varnames
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## [5] "useless"     
## 
## attr(,"class")
## [1] "NaiveBayes"
predict(model_nb, test)
## $class
##        105        115         14        117        139         96 
##  virginica  virginica     setosa  virginica  virginica versicolor 
##         81         56         19        103         77         89 
## versicolor versicolor     setosa  virginica versicolor versicolor 
##        107         82         57        119         69         94 
##  virginica versicolor  virginica  virginica versicolor versicolor 
##         75         64        135         27         97         84 
## versicolor versicolor versicolor     setosa versicolor versicolor 
##        124        118         44         12         11         72 
##  virginica  virginica     setosa     setosa     setosa versicolor 
##         63        129          3         99         45         52 
## versicolor  virginica     setosa versicolor     setosa versicolor 
##        130         16         79        108         83          9 
##  virginica     setosa versicolor  virginica versicolor     setosa 
##        150        106        123         86        116         41 
##  virginica  virginica  virginica versicolor  virginica     setosa 
##         51          1 
## versicolor     setosa 
## Levels: setosa versicolor virginica
## 
## $posterior
##         setosa versicolor virginica
## 105 1.110e-112  4.571e-06 1.000e+00
## 115 3.810e-128  1.644e-06 1.000e+00
## 14   1.000e+00  2.916e-13 1.146e-17
## 117  1.094e-73  1.774e-02 9.823e-01
## 139  1.018e-67  2.107e-01 7.893e-01
## 96   2.030e-30  9.427e-01 5.731e-02
## 81   1.255e-23  9.956e-01 4.435e-03
## 56   1.559e-30  9.979e-01 2.061e-03
## 19   1.000e+00  2.907e-08 3.774e-12
## 103  2.221e-96  1.140e-03 9.989e-01
## 77   2.605e-36  9.854e-01 1.457e-02
## 89   5.891e-33  9.970e-01 2.989e-03
## 107  7.215e-61  3.829e-01 6.171e-01
## 82   1.875e-16  9.995e-01 5.360e-04
## 57   7.897e-55  3.577e-01 6.423e-01
## 119 5.398e-124  8.142e-07 1.000e+00
## 69   1.424e-45  7.974e-01 2.026e-01
## 94   5.743e-16  9.986e-01 1.398e-03
## 75   1.027e-33  9.940e-01 5.990e-03
## 64   5.050e-40  9.832e-01 1.685e-02
## 135  2.271e-40  9.667e-01 3.326e-02
## 27   1.000e+00  8.060e-09 4.814e-13
## 97   1.127e-32  9.924e-01 7.580e-03
## 84   3.378e-52  7.504e-01 2.496e-01
## 124  2.273e-69  2.921e-02 9.708e-01
## 118 1.694e-108  4.262e-05 1.000e+00
## 44   9.992e-01  7.825e-04 4.538e-08
## 12   1.000e+00  2.029e-11 1.563e-15
## 11   1.000e+00  4.285e-10 4.202e-14
## 72   2.368e-31  9.935e-01 6.508e-03
## 63   3.127e-18  9.992e-01 7.538e-04
## 129 2.906e-102  3.290e-04 9.997e-01
## 3    1.000e+00  5.904e-12 2.786e-16
## 99   9.714e-20  9.992e-01 8.258e-04
## 45   1.000e+00  2.623e-05 6.852e-09
## 52   9.495e-45  9.522e-01 4.784e-02
## 130  1.216e-59  2.868e-01 7.132e-01
## 16   1.000e+00  6.548e-09 4.321e-13
## 79   1.194e-43  8.630e-01 1.370e-01
## 108  4.556e-69  1.064e-01 8.936e-01
## 83   8.660e-27  9.978e-01 2.241e-03
## 9    1.000e+00  1.189e-12 2.809e-17
## 150  2.559e-67  1.277e-01 8.723e-01
## 106 1.512e-105  1.128e-05 1.000e+00
## 123  2.360e-92  2.644e-03 9.974e-01
## 86   8.993e-54  5.796e-01 4.204e-01
## 116 9.140e-119  1.168e-05 1.000e+00
## 41   1.000e+00  7.883e-12 4.892e-16
## 51   2.483e-39  9.765e-01 2.350e-02
## 1    1.000e+00  8.479e-12 5.509e-16

Helper function to extract the predicted class labels (used below to build the confusion table)

# Map each row of a posterior-probability matrix to the label of its
# most probable class.
#
# predicted: a prediction object carrying a $posterior matrix whose
#            columns are named after the class labels (e.g. the result
#            of predict() on a klaR NaiveBayes model).
# Returns:   a character vector with one class label per row.
posteriorToClass <- function(predicted) {
    post <- predicted$posterior
    best <- apply(post, MARGIN = 1, FUN = which.max)
    colnames(post)[best]
}

# Confusion table for the Naive Bayes predictions on the test set.
table(true=test$Species,
  predicted=posteriorToClass(predict(model_nb, test)))
##             predicted
## true         setosa versicolor virginica
##   setosa         12          0         0
##   versicolor      0         20         1
##   virginica       0          1        16

Artificial Neural Net

library(nnet)
model_nnet <- nnet(Species ~ ., data=train, size=10)
## # weights:  93
## initial  value 130.776648 
## iter  10 value 50.721475
## iter  20 value 10.301070
## iter  30 value 5.297236
## iter  40 value 4.200102
## iter  50 value 4.062111
## iter  60 value 3.296631
## iter  70 value 2.930707
## iter  80 value 2.871913
## iter  90 value 2.869570
## iter 100 value 2.697828
## final  value 2.697828 
## stopped after 100 iterations
model_nnet
## a 5-10-3 network with 93 weights
## inputs: Sepal.Length Sepal.Width Petal.Length Petal.Width useless 
## output(s): Species 
## options were - softmax modelling
predict(model_nnet, test)
##        setosa versicolor  virginica
## 105 4.567e-67  1.069e-14  1.000e+00
## 115 4.567e-67  1.069e-14  1.000e+00
## 14  1.000e+00  1.969e-18  5.293e-60
## 117 4.594e-67  1.074e-14  1.000e+00
## 139 4.567e-67  1.069e-14  1.000e+00
## 96  7.195e-37  1.000e+00  8.028e-31
## 81  1.828e-15  1.000e+00  7.832e-35
## 56  1.167e-42  1.000e+00  1.488e-21
## 19  1.000e+00  5.230e-18  1.289e-67
## 103 4.613e-67  1.072e-14  1.000e+00
## 77  2.363e-14  1.000e+00  7.605e-42
## 89  4.012e-36  1.000e+00  3.885e-31
## 107 5.492e-67  1.198e-14  1.000e+00
## 82  1.828e-15  1.000e+00  7.831e-35
## 57  1.923e-65  1.812e-13  1.000e+00
## 119 4.567e-67  1.069e-14  1.000e+00
## 69  1.477e-51  4.329e-05  1.000e+00
## 94  2.297e-14  1.000e+00  9.170e-42
## 75  7.119e-34  1.000e+00  1.437e-32
## 64  1.134e-42  1.000e+00  1.509e-21
## 135 2.509e-16  1.000e+00  2.065e-34
## 27  1.000e+00  5.230e-18  1.289e-67
## 97  1.341e-55  7.127e-06  1.000e+00
## 84  2.490e-66  2.386e-14  1.000e+00
## 124 6.692e-67  1.211e-14  1.000e+00
## 118 4.567e-67  1.069e-14  1.000e+00
## 44  1.000e+00  5.230e-18  1.289e-67
## 12  1.000e+00  5.230e-18  1.289e-67
## 11  1.000e+00  5.230e-18  1.289e-67
## 72  1.829e-15  1.000e+00  7.823e-35
## 63  7.044e-10  1.000e+00  2.776e-44
## 129 4.568e-67  1.069e-14  1.000e+00
## 3   1.000e+00  5.230e-18  1.289e-67
## 99  1.832e-15  1.000e+00  7.741e-35
## 45  1.000e+00  6.550e-24  3.986e-83
## 52  1.828e-15  1.000e+00  7.831e-35
## 130 7.760e-37  1.000e+00  6.237e-31
## 16  1.000e+00  5.230e-18  1.289e-67
## 79  3.061e-23  1.000e+00  7.061e-09
## 108 4.567e-67  1.069e-14  1.000e+00
## 83  1.373e-09  1.000e+00  2.136e-44
## 9   1.000e+00  3.841e-53 7.875e-112
## 150 7.956e-29  3.667e-02  9.633e-01
## 106 4.567e-67  1.069e-14  1.000e+00
## 123 1.501e-26  2.041e-01  7.959e-01
## 86  4.387e-28  1.000e+00  1.762e-28
## 116 4.567e-67  1.069e-14  1.000e+00
## 41  1.000e+00  1.969e-18  5.286e-60
## 51  1.828e-15  1.000e+00  7.831e-35
## 1   1.000e+00  5.230e-18  1.289e-67
table(true=test$Species,
  predicted=predict(model_nnet, test, type="class"))
##             predicted
## true         setosa versicolor virginica
##   setosa         12          0         0
##   versicolor      0         17         4
##   virginica       0          2        15

Decision Trees

library(party)
## Loading required package: grid
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
## Loading required package: strucchange
## Loading required package: modeltools
## Loading required package: stats4
model_ctree <- ctree(Species ~ ., data=train)
model_ctree
## 
##   Conditional inference tree with 3 terminal nodes
## 
## Response:  Species 
## Inputs:  Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, useless 
## Number of observations:  100 
## 
## 1) Petal.Width <= 0.5; criterion = 1, statistic = 93.141
##   2)*  weights = 38 
## 1) Petal.Width > 0.5
##   3) Petal.Width <= 1.7; criterion = 1, statistic = 42.599
##     4)*  weights = 30 
##   3) Petal.Width > 1.7
##     5)*  weights = 32
plot(model_ctree)

plot of chunk unnamed-chunk-18

predict(model_ctree, test)
##  [1] virginica  virginica  setosa     virginica  virginica  versicolor
##  [7] versicolor versicolor setosa     virginica  versicolor versicolor
## [13] versicolor versicolor versicolor virginica  versicolor versicolor
## [19] versicolor versicolor versicolor setosa     versicolor versicolor
## [25] virginica  virginica  versicolor setosa     setosa     versicolor
## [31] versicolor virginica  setosa     versicolor setosa     versicolor
## [37] versicolor setosa     versicolor virginica  versicolor setosa    
## [43] virginica  virginica  virginica  versicolor virginica  setosa    
## [49] versicolor setosa    
## Levels: setosa versicolor virginica
table(true=test$Species,
  predicted=predict(model_ctree, test))
##             predicted
## true         setosa versicolor virginica
##   setosa         11          1         0
##   versicolor      0         21         0
##   virginica       0          3        14

Random forest

library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
model_rforrest <- randomForest(Species ~ ., data=train)
model_rforrest
## 
## Call:
##  randomForest(formula = Species ~ ., data = train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 4%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         38          0         0     0.00000
## versicolor      0         27         2     0.06897
## virginica       0          2        31     0.06061

Random forest can calculate variable importance

importance(model_rforrest)
##              MeanDecreaseGini
## Sepal.Length            4.872
## Sepal.Width             3.712
## Petal.Length           16.989
## Petal.Width            36.843
## useless                 3.176
predict(model_rforrest, test)
##        105        115         14        117        139         96 
##  virginica  virginica     setosa  virginica  virginica versicolor 
##         81         56         19        103         77         89 
## versicolor versicolor     setosa  virginica versicolor versicolor 
##        107         82         57        119         69         94 
##  virginica versicolor versicolor  virginica versicolor versicolor 
##         75         64        135         27         97         84 
## versicolor versicolor versicolor     setosa versicolor versicolor 
##        124        118         44         12         11         72 
##  virginica  virginica     setosa     setosa     setosa versicolor 
##         63        129          3         99         45         52 
## versicolor  virginica     setosa versicolor     setosa versicolor 
##        130         16         79        108         83          9 
## versicolor     setosa versicolor  virginica versicolor     setosa 
##        150        106        123         86        116         41 
##  virginica  virginica  virginica versicolor  virginica     setosa 
##         51          1 
## versicolor     setosa 
## Levels: setosa versicolor virginica
table(true=test$Species,
  predicted=predict(model_rforrest, test))
##             predicted
## true         setosa versicolor virginica
##   setosa         12          0         0
##   versicolor      0         21         0
##   virginica       0          2        15

Support vector machines

library(e1071)
model_svm <- svm(Species ~ ., data=train)
model_svm
## 
## Call:
## svm(formula = Species ~ ., data = train)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.2 
## 
## Number of Support Vectors:  66
predict(model_svm, test)
##        105        115         14        117        139         96 
##  virginica  virginica     setosa  virginica  virginica versicolor 
##         81         56         19        103         77         89 
## versicolor versicolor     setosa  virginica versicolor versicolor 
##        107         82         57        119         69         94 
##  virginica versicolor versicolor  virginica versicolor versicolor 
##         75         64        135         27         97         84 
## versicolor versicolor versicolor     setosa versicolor versicolor 
##        124        118         44         12         11         72 
##  virginica  virginica     setosa     setosa     setosa versicolor 
##         63        129          3         99         45         52 
## versicolor versicolor     setosa versicolor     setosa versicolor 
##        130         16         79        108         83          9 
##  virginica     setosa versicolor  virginica versicolor     setosa 
##        150        106        123         86        116         41 
##  virginica  virginica  virginica versicolor  virginica     setosa 
##         51          1 
## versicolor     setosa 
## Levels: setosa versicolor virginica
table(true=test$Species,
  predicted=predict(model_svm, test))
##             predicted
## true         setosa versicolor virginica
##   setosa         12          0         0
##   versicolor      0         21         0
##   virginica       0          2        15

Logistic regression

Logistic regression can only model a binary response, so we reduce the problem to two classes: virginica becomes class 1 and the other species become class 0

train$Species2 <- as.integer(train$Species == "virginica")
head(train)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species useless
## 18         4.440       3.632       0.5518         0.3     setosa -0.1534
## 93         8.719       2.324       3.6531         1.2 versicolor -1.3907
## 91         6.177       1.921       4.6275         1.2 versicolor -0.7236
## 92         5.416       3.501       4.5726         1.4 versicolor  0.2583
## 126        7.386       2.868       6.2009         1.8  virginica -0.3171
## 149        5.876       1.565       5.5989         2.3  virginica -0.1778
##     Species2
## 18         0
## 93         0
## 91         0
## 92         0
## 126        1
## 149        1
plot(train, col= train$Species2+1L)

plot of chunk unnamed-chunk-22

test$Species2 <- as.integer(test$Species == "virginica")

Use all but Species

# Logistic regression on all predictors except the original Species
# factor (which would trivially determine the binary Species2 label).
mod_logreg <- glm(Species2 ~ .-Species, data = train,
  family=binomial(logit))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

The warning just means that the data is possibly linearly separable

summary(mod_logreg)
## 
## Call:
## glm(formula = Species2 ~ . - Species, family = binomial(logit), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2619  -0.0006   0.0000   0.0001   1.9146  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -42.67      28.78   -1.48     0.14
## Sepal.Length    -1.08       1.72   -0.63     0.53
## Sepal.Width     -4.23       3.38   -1.25     0.21
## Petal.Length     3.03       3.23    0.94     0.35
## Petal.Width     28.67      19.38    1.48     0.14
## useless         -2.97       3.20   -0.93     0.35
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 126.8357  on 99  degrees of freedom
## Residual deviance:   9.6751  on 94  degrees of freedom
## AIC: 21.68
## 
## Number of Fisher Scoring iterations: 12

Prediction returns log odds

predict(mod_logreg, test)
##     105     115      14     117     139      96      81      56      19 
##  24.481  25.157 -67.449   2.398   8.689   1.047 -18.292  -9.966 -53.444 
##     103      77      89     107      82      57     119      69      94 
##   7.190 -29.780  -3.103   7.369 -23.665  -5.063  23.916  10.146 -26.895 
##      75      64     135      27      97      84     124     118      44 
##  -8.319  -9.161  -9.152 -56.906  -7.496  -4.403   5.145  11.279 -40.254 
##      12      11      72      63     129       3      99      45      52 
## -54.967 -53.744 -19.061 -18.874  17.681 -56.901 -19.889 -44.165  -8.035 
##     130      16      79     108      83       9     150     106     123 
##   4.685 -57.410  -7.643   5.885 -15.381 -39.966   1.171  20.770   1.348 
##      86     116      41      51       1 
##  -9.178  20.844 -65.507 -14.654 -51.323

Response means the probability

predict(mod_logreg, test, type="response")
##       105       115        14       117       139        96        81 
## 1.000e+00 1.000e+00 2.220e-16 9.167e-01 9.998e-01 7.403e-01 1.137e-08 
##        56        19       103        77        89       107        82 
## 4.697e-05 2.220e-16 9.992e-01 1.165e-13 4.300e-02 9.994e-01 5.278e-11 
##        57       119        69        94        75        64       135 
## 6.289e-03 1.000e+00 1.000e+00 2.087e-12 2.438e-04 1.050e-04 1.060e-04 
##        27        97        84       124       118        44        12 
## 2.220e-16 5.547e-04 1.209e-02 9.942e-01 1.000e+00 2.220e-16 2.220e-16 
##        11        72        63       129         3        99        45 
## 2.220e-16 5.270e-09 6.355e-09 1.000e+00 2.220e-16 2.304e-09 2.220e-16 
##        52       130        16        79       108        83         9 
## 3.238e-04 9.908e-01 2.220e-16 4.792e-04 9.972e-01 2.089e-07 2.220e-16 
##       150       106       123        86       116        41        51 
## 7.633e-01 1.000e+00 7.938e-01 1.033e-04 1.000e+00 2.220e-16 4.322e-07 
##         1 
## 2.220e-16

For the confusion table we use a threshold of .5 to separate between class 0 and 1

# Threshold the predicted probabilities at 0.5 to obtain class labels,
# then tabulate against the true binary labels.
table(true=test$Species2,
  predicted=predict(mod_logreg, test, type="response") > .5)
##     predicted
## true FALSE TRUE
##    0    31    2
##    1     1   16