Set the seed of the random number generator to make the experiments repeatable
set.seed(1234)
We use the iris data set
data(iris)
Shuffle the rows since the flowers are ordered by species!
x <- iris[sample(1:nrow(iris)),]
Make the iris data a little messy: add noise to the first three columns and append a useless random variable
x <- cbind(x, useless = rnorm(nrow(x)))
x[,1] <- x[,1] + rnorm(nrow(x))
x[,2] <- x[,2] + rnorm(nrow(x))
x[,3] <- x[,3] + rnorm(nrow(x))
Look at the scatterplot matrix
plot(x, col=x$Species)
head(x)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species useless
## 18 4.440 3.632 0.5518 0.3 setosa -0.1534
## 93 8.719 2.324 3.6531 1.2 versicolor -1.3907
## 91 6.177 1.921 4.6275 1.2 versicolor -0.7236
## 92 5.416 3.501 4.5726 1.4 versicolor 0.2583
## 126 7.386 2.868 6.2009 1.8 virginica -0.3171
## 149 5.876 1.565 5.5989 2.3 virginica -0.1778
Create training and testing data (Note: the data is in random order)
train <- x[1:100,]
test <- x[101:150,]
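Because the rows were shuffled, both sets should contain a mix of all three species. A quick sanity check (a sketch; output omitted):
# Class counts in the training and test sets.
table(train$Species)
table(test$Species)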
Let us model Petal.Width as the dependent variable
model <- lm(Petal.Width ~ Sepal.Length
+ Sepal.Width + Petal.Length + useless,
data = train)
model
##
## Call:
## lm(formula = Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length +
## useless, data = train)
##
## Coefficients:
## (Intercept) Sepal.Length Sepal.Width Petal.Length useless
## -0.5266 0.0778 0.0158 0.3402 -0.0512
coef(model)
## (Intercept) Sepal.Length Sepal.Width Petal.Length useless
## -0.52664 0.07782 0.01580 0.34024 -0.05121
summary(model)
##
## Call:
## lm(formula = Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length +
## useless, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8128 -0.3370 -0.0093 0.3102 1.3751
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.5266 0.2242 -2.35 0.021 *
## Sepal.Length 0.0778 0.0381 2.04 0.044 *
## Sepal.Width 0.0158 0.0377 0.42 0.676
## Petal.Length 0.3402 0.0255 13.34 <2e-16 ***
## useless -0.0512 0.0478 -1.07 0.287
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.415 on 95 degrees of freedom
## Multiple R-squared: 0.744, Adjusted R-squared: 0.733
## F-statistic: 69 on 4 and 95 DF, p-value: <2e-16
Petal.Length is highly significant (the estimated coefficient is clearly different from 0). useless is, as expected, not significant.
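The p-values can also be extracted programmatically from the summary object; a quick sketch (output omitted):
# Column "Pr(>|t|)" of the coefficient matrix holds the p-values.
summary(model)$coefficients[, "Pr(>|t|)"]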
Let us create two additional models with fewer predictors
model2 <- lm(Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length,
data = train)
model3 <- lm(Petal.Width ~ Petal.Length,
data = train)
The null hypothesis is that the smaller model fits the data as well as the larger model (i.e., the dropped predictors do not matter)
anova(model, model2, model3)
## Analysis of Variance Table
##
## Model 1: Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length + useless
## Model 2: Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length
## Model 3: Petal.Width ~ Petal.Length
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 95 16.4
## 2 96 16.6 -1 -0.198 1.15 0.287
## 3 98 17.6 -2 -0.982 2.85 0.063 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Model 2 is not significantly worse than model 1. Model 3 is significantly worse than model 2 at alpha=10%, but not at alpha=5%.
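Another common way to compare the models is the AIC, where smaller values are better; a quick sketch (output omitted):
# AIC penalizes model complexity, so useless predictors increase it.
AIC(model, model2, model3)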
Use model 2 for prediction
test[1:5,]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species useless
## 105 5.783 3.548 7.5226 2.2 virginica -0.83758
## 115 6.883 2.296 4.2183 2.4 virginica -1.12376
## 14 3.347 4.059 0.7201 0.1 setosa 3.04377
## 117 7.626 4.097 6.4961 1.8 virginica 0.23502
## 139 5.351 1.833 4.3409 1.8 virginica -0.03326
predict(model2, test[1:5,])
## 105 115 14 117 139
## 2.5398 1.4839 0.0243 2.3539 1.3892
Plot predictions against actual values for comparison
plot(test[,"Petal.Width"], predict(model2, test),
xlim=c(0,4), ylim=c(0,4))
abline(0,1, col="red")
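The prediction quality can also be summarized numerically with the root mean squared error on the test set; a quick sketch (output omitted):
# RMSE: typical distance between predicted and true Petal.Width.
sqrt(mean((test$Petal.Width - predict(model2, test))^2))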
See also generalized linear models (glm) and nonlinear least squares (nls)
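For example, the same regression can be fit as a Gaussian GLM; a minimal sketch (the name model_glm is ours; output omitted):
# A Gaussian GLM with the identity link is ordinary least squares,
# so the coefficients should match model2 from above.
model_glm <- glm(Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length,
    data = train, family = gaussian())
coef(model_glm)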
We will now model Species (a classification problem)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
model_knn3 <- knn3(Species ~ ., k=5, data=train)
model_knn3
## 5-nearest neighbor classification model
##
## Call:
## knn3.formula(formula = Species ~ ., data = train, k = 5)
##
## Training set class distribution:
##
## setosa versicolor virginica
## 38 29 33
predict(model_knn3, test)
## setosa versicolor virginica
## [1,] 0.0 0.2 0.8
## [2,] 0.0 0.2 0.8
## [3,] 1.0 0.0 0.0
## [4,] 0.0 0.0 1.0
## [5,] 0.0 0.4 0.6
## [6,] 0.0 0.2 0.8
## [7,] 0.0 0.4 0.6
## [8,] 0.0 0.4 0.6
## [9,] 1.0 0.0 0.0
## [10,] 0.0 0.2 0.8
## [11,] 0.6 0.2 0.2
## [12,] 0.0 0.4 0.6
## [13,] 0.0 0.6 0.4
## [14,] 0.0 1.0 0.0
## [15,] 0.0 0.4 0.6
## [16,] 0.0 0.0 1.0
## [17,] 0.0 0.8 0.2
## [18,] 0.2 0.6 0.2
## [19,] 0.0 0.8 0.2
## [20,] 0.0 0.6 0.4
## [21,] 0.0 0.6 0.4
## [22,] 1.0 0.0 0.0
## [23,] 0.0 0.8 0.2
## [24,] 0.0 0.8 0.2
## [25,] 0.0 0.2 0.8
## [26,] 0.0 0.4 0.6
## [27,] 1.0 0.0 0.0
## [28,] 1.0 0.0 0.0
## [29,] 1.0 0.0 0.0
## [30,] 0.0 1.0 0.0
## [31,] 0.0 0.8 0.2
## [32,] 0.0 0.4 0.6
## [33,] 1.0 0.0 0.0
## [34,] 0.6 0.4 0.0
## [35,] 0.6 0.2 0.2
## [36,] 0.0 0.6 0.4
## [37,] 0.0 0.0 1.0
## [38,] 1.0 0.0 0.0
## [39,] 0.0 0.4 0.6
## [40,] 0.0 0.2 0.8
## [41,] 0.0 0.8 0.2
## [42,] 0.8 0.2 0.0
## [43,] 0.0 0.6 0.4
## [44,] 0.0 0.0 1.0
## [45,] 0.0 0.2 0.8
## [46,] 0.0 0.8 0.2
## [47,] 0.0 0.2 0.8
## [48,] 1.0 0.0 0.0
## [49,] 0.0 0.6 0.4
## [50,] 1.0 0.0 0.0
predict() often returns the posterior probabilities for each class
Predict class labels
predict(model_knn3, test, type="class")
## [1] virginica virginica setosa virginica virginica virginica
## [7] virginica virginica setosa virginica setosa virginica
## [13] versicolor versicolor virginica virginica versicolor versicolor
## [19] versicolor versicolor versicolor setosa versicolor versicolor
## [25] virginica virginica setosa setosa setosa versicolor
## [31] versicolor virginica setosa setosa setosa versicolor
## [37] virginica setosa virginica virginica versicolor setosa
## [43] versicolor virginica virginica versicolor virginica setosa
## [49] versicolor setosa
## Levels: setosa versicolor virginica
Compare the predictions with the true classes (confusion table)
table(true=test$Species,
predicted=predict(model_knn3, test, type="class"))
## predicted
## true setosa versicolor virginica
## setosa 12 0 0
## versicolor 2 13 6
## virginica 0 3 14
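A small helper (our addition, not part of caret) turns such a confusion table into an accuracy estimate, since the correct predictions lie on the diagonal; output omitted:
# Accuracy = correct predictions / all predictions.
accuracy <- function(tbl) sum(diag(tbl)) / sum(tbl)
accuracy(table(true=test$Species,
    predicted=predict(model_knn3, test, type="class")))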
library(klaR)
## Loading required package: MASS
model_nb <- NaiveBayes(Species ~ ., data=train)
model_nb
## $apriori
## grouping
## setosa versicolor virginica
## 0.38 0.29 0.33
##
## $tables
## $tables$Sepal.Length
## [,1] [,2]
## setosa 5.081 0.8881
## versicolor 6.149 1.2789
## virginica 6.460 1.1762
##
## $tables$Sepal.Width
## [,1] [,2]
## setosa 3.285 1.201
## versicolor 2.652 1.087
## virginica 2.902 1.059
##
## $tables$Petal.Length
## [,1] [,2]
## setosa 1.390 0.8650
## versicolor 4.420 0.8756
## virginica 5.037 0.9558
##
## $tables$Petal.Width
## [,1] [,2]
## setosa 0.2316 0.09036
## versicolor 1.3345 0.20049
## virginica 2.0576 0.27160
##
## $tables$useless
## [,1] [,2]
## setosa 0.1900 0.8527
## versicolor 0.1497 0.8769
## virginica -0.1926 0.9160
##
##
## $levels
## [1] "setosa" "versicolor" "virginica"
##
## $call
## NaiveBayes.default(x = X, grouping = Y)
##
## $x
## Sepal.Length Sepal.Width Petal.Length Petal.Width useless
## 18 4.440 3.6323 0.55180 0.3 -0.15340
## 93 8.719 2.3245 3.65306 1.2 -1.39070
## 91 6.177 1.9212 4.62748 1.2 -0.72358
## 92 5.416 3.5008 4.57260 1.4 0.25826
## 126 7.386 2.8683 6.20088 1.8 -0.31706
## 149 5.876 1.5650 5.59894 2.3 -0.17779
## 2 4.625 0.3483 2.11460 0.2 -0.16999
## 34 4.566 3.6194 1.12682 0.2 -1.37230
## 95 5.717 4.1542 5.83535 1.3 -0.17379
## 73 6.619 3.3381 5.33769 1.5 0.85023
## 98 5.122 4.1151 3.10646 1.3 0.69761
## 76 3.367 3.9825 4.12335 1.4 0.55000
## 40 4.845 3.7158 2.01588 0.2 -0.40273
## 127 6.230 1.2929 4.38894 1.8 -0.19159
## 138 6.994 3.3056 5.84500 1.8 -1.19453
## 114 5.759 4.0972 5.43440 2.0 -0.05316
## 39 4.813 -0.3961 2.60348 0.2 0.25520
## 36 3.902 2.4186 0.54830 0.2 1.70596
## 25 5.511 4.5025 1.88312 0.2 1.00151
## 31 5.519 3.6287 1.03231 0.2 -0.49558
## 42 4.752 3.0894 0.17287 0.3 0.35555
## 136 9.057 3.4571 4.25103 2.3 -1.13461
## 21 5.804 3.9388 1.89553 0.2 0.87820
## 6 5.664 3.9146 2.30157 0.4 0.97292
## 28 5.468 2.5835 2.13916 0.2 2.12112
## 102 6.237 1.4732 3.71001 1.9 0.41452
## 66 7.760 3.1362 3.27032 1.4 -0.47472
## 113 7.252 2.5786 4.35885 2.1 0.06599
## 125 7.363 2.4006 5.01133 2.1 -0.50248
## 137 5.164 3.8174 5.77020 2.4 -0.82600
## 55 6.130 2.9534 5.19448 1.5 0.16699
## 32 6.877 4.8633 0.68497 0.4 -0.89626
## 133 5.176 1.6785 2.73565 2.2 0.16819
## 60 5.458 2.1822 4.85382 1.4 0.35497
## 22 5.505 3.6251 2.11528 0.4 -0.05211
## 88 7.276 0.8922 3.69913 1.3 -0.19593
## 23 4.251 3.3153 -0.07366 0.2 -0.64907
## 30 4.859 2.4918 1.30001 0.2 -1.10977
## 112 4.637 0.5524 5.40811 1.9 0.84927
## 90 5.839 2.2162 4.72320 1.3 0.02236
## 61 4.333 1.4659 4.89010 1.0 0.83114
## 71 5.661 4.3330 3.95112 1.8 -1.24429
## 143 4.612 2.0959 5.39072 1.9 0.16903
## 67 5.985 3.5575 5.83418 1.5 0.67317
## 35 5.567 3.2426 0.59866 0.2 -0.02628
## 53 6.595 1.8631 4.67724 1.5 -0.19139
## 109 8.525 2.8741 4.02853 1.8 -0.78191
## 50 5.671 3.1939 1.68742 0.2 2.05816
## 132 8.849 3.9661 5.75881 2.0 0.75050
## 78 8.749 3.6580 4.07506 1.7 1.82421
## 8 4.349 3.5140 0.72002 0.2 0.08006
## 131 8.209 4.3852 6.54450 1.9 -0.63141
## 104 7.287 3.0156 3.43753 1.8 -1.51329
## 49 5.294 2.4460 1.07449 0.2 -0.63610
## 15 6.119 4.7419 0.18628 0.2 0.22630
## 48 3.588 1.9771 0.43788 0.2 1.01369
## 47 5.570 1.8811 2.07463 0.2 0.25275
## 70 4.899 1.6901 4.25175 1.1 -1.17195
## 17 6.214 3.1553 1.19298 0.4 0.66871
## 101 5.489 2.6925 6.53752 2.5 -1.65010
## 148 6.819 4.6068 4.59790 2.0 -0.36585
## 4 3.753 4.7356 -0.52330 0.2 -0.31612
## 146 6.454 3.7261 6.35442 2.3 -1.94825
## 144 5.247 1.9262 5.66931 2.3 0.92006
## 128 6.228 3.0231 4.53491 1.8 -0.62287
## 110 8.185 4.0871 6.25752 2.5 -0.33404
## 26 5.183 2.8630 2.36593 0.2 1.39515
## 43 2.634 3.3717 1.72609 0.2 0.63667
## 5 4.379 2.6552 2.32727 0.2 -0.10843
## 46 6.456 1.7124 1.44613 0.3 0.51376
## 10 6.710 4.5078 0.99571 0.1 0.39927
## 140 5.725 3.3674 5.09955 2.1 1.66286
## 87 6.333 2.6916 5.52292 1.5 0.27589
## 85 5.754 3.3253 3.65073 1.5 0.50627
## 7 4.919 5.4625 0.44639 0.3 0.34755
## 134 5.720 3.1627 5.41348 1.5 -0.37724
## 29 4.247 4.8114 2.01247 0.2 0.09762
## 121 6.721 4.5675 4.00892 2.3 1.63874
## 24 6.110 2.8928 2.48470 0.5 -0.87559
## 142 6.924 3.8625 5.11193 2.3 0.12176
## 65 4.951 2.2488 3.41874 1.3 1.36213
## 33 4.696 2.6263 2.60695 0.1 -0.23462
## 80 7.314 1.3983 4.97939 1.0 -1.05338
## 37 5.053 3.3513 0.15260 0.2 -0.86978
## 13 5.563 4.7971 2.41165 0.1 -0.39013
## 59 8.072 3.0048 3.96793 1.3 -0.84735
## 122 6.044 1.9972 5.03236 2.0 -0.26064
## 20 4.678 4.0304 1.98093 0.3 -0.41442
## 68 5.760 3.3980 4.23318 1.0 -0.18305
## 120 5.508 0.9048 3.53714 1.5 0.40706
## 62 7.128 1.9453 3.69360 1.5 0.62463
## 54 5.350 0.3584 5.53966 1.3 1.67821
## 100 7.250 1.5304 4.27279 1.3 -0.06869
## 58 4.338 1.5126 2.46418 1.0 -0.32084
## 141 6.053 2.8081 4.94407 2.4 1.47101
## 74 6.243 4.2101 5.75757 1.2 1.70433
## 147 6.324 3.3886 5.34608 1.9 0.04324
## 111 5.996 3.4730 4.20281 2.0 -0.33266
## 145 5.119 2.7334 5.70627 2.5 -1.82224
## 38 4.930 3.5598 1.99667 0.1 1.41126
##
## $usekernel
## [1] FALSE
##
## $varnames
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "useless"
##
## attr(,"class")
## [1] "NaiveBayes"
predict(model_nb, test)
## $class
## 105 115 14 117 139 96
## virginica virginica setosa virginica virginica versicolor
## 81 56 19 103 77 89
## versicolor versicolor setosa virginica versicolor versicolor
## 107 82 57 119 69 94
## virginica versicolor virginica virginica versicolor versicolor
## 75 64 135 27 97 84
## versicolor versicolor versicolor setosa versicolor versicolor
## 124 118 44 12 11 72
## virginica virginica setosa setosa setosa versicolor
## 63 129 3 99 45 52
## versicolor virginica setosa versicolor setosa versicolor
## 130 16 79 108 83 9
## virginica setosa versicolor virginica versicolor setosa
## 150 106 123 86 116 41
## virginica virginica virginica versicolor virginica setosa
## 51 1
## versicolor setosa
## Levels: setosa versicolor virginica
##
## $posterior
## setosa versicolor virginica
## 105 1.110e-112 4.571e-06 1.000e+00
## 115 3.810e-128 1.644e-06 1.000e+00
## 14 1.000e+00 2.916e-13 1.146e-17
## 117 1.094e-73 1.774e-02 9.823e-01
## 139 1.018e-67 2.107e-01 7.893e-01
## 96 2.030e-30 9.427e-01 5.731e-02
## 81 1.255e-23 9.956e-01 4.435e-03
## 56 1.559e-30 9.979e-01 2.061e-03
## 19 1.000e+00 2.907e-08 3.774e-12
## 103 2.221e-96 1.140e-03 9.989e-01
## 77 2.605e-36 9.854e-01 1.457e-02
## 89 5.891e-33 9.970e-01 2.989e-03
## 107 7.215e-61 3.829e-01 6.171e-01
## 82 1.875e-16 9.995e-01 5.360e-04
## 57 7.897e-55 3.577e-01 6.423e-01
## 119 5.398e-124 8.142e-07 1.000e+00
## 69 1.424e-45 7.974e-01 2.026e-01
## 94 5.743e-16 9.986e-01 1.398e-03
## 75 1.027e-33 9.940e-01 5.990e-03
## 64 5.050e-40 9.832e-01 1.685e-02
## 135 2.271e-40 9.667e-01 3.326e-02
## 27 1.000e+00 8.060e-09 4.814e-13
## 97 1.127e-32 9.924e-01 7.580e-03
## 84 3.378e-52 7.504e-01 2.496e-01
## 124 2.273e-69 2.921e-02 9.708e-01
## 118 1.694e-108 4.262e-05 1.000e+00
## 44 9.992e-01 7.825e-04 4.538e-08
## 12 1.000e+00 2.029e-11 1.563e-15
## 11 1.000e+00 4.285e-10 4.202e-14
## 72 2.368e-31 9.935e-01 6.508e-03
## 63 3.127e-18 9.992e-01 7.538e-04
## 129 2.906e-102 3.290e-04 9.997e-01
## 3 1.000e+00 5.904e-12 2.786e-16
## 99 9.714e-20 9.992e-01 8.258e-04
## 45 1.000e+00 2.623e-05 6.852e-09
## 52 9.495e-45 9.522e-01 4.784e-02
## 130 1.216e-59 2.868e-01 7.132e-01
## 16 1.000e+00 6.548e-09 4.321e-13
## 79 1.194e-43 8.630e-01 1.370e-01
## 108 4.556e-69 1.064e-01 8.936e-01
## 83 8.660e-27 9.978e-01 2.241e-03
## 9 1.000e+00 1.189e-12 2.809e-17
## 150 2.559e-67 1.277e-01 8.723e-01
## 106 1.512e-105 1.128e-05 1.000e+00
## 123 2.360e-92 2.644e-03 9.974e-01
## 86 8.993e-54 5.796e-01 4.204e-01
## 116 9.140e-119 1.168e-05 1.000e+00
## 41 1.000e+00 7.883e-12 4.892e-16
## 51 2.483e-39 9.765e-01 2.350e-02
## 1 1.000e+00 8.479e-12 5.509e-16
Helper function to convert posterior probabilities into class labels
posteriorToClass <- function(predicted) {
  colnames(predicted$posterior)[apply(predicted$posterior,
    MARGIN=1, FUN=which.max)]
}
table(true=test$Species,
predicted=posteriorToClass(predict(model_nb, test)))
## predicted
## true setosa versicolor virginica
## setosa 12 0 0
## versicolor 0 20 1
## virginica 0 1 16
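Note that predict() for NaiveBayes already returns the labels in $class, so the helper is mainly illustrative. A vectorized sketch of the same idea (posteriorToClass2 is our name):
# max.col() finds the column index of each row maximum, replacing
# apply() with which.max() (note: max.col breaks ties at random).
posteriorToClass2 <- function(predicted) {
  colnames(predicted$posterior)[max.col(predicted$posterior)]
}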
library(nnet)
model_nnet <- nnet(Species ~ ., data=train, size=10)
## # weights: 93
## initial value 130.776648
## iter 10 value 50.721475
## iter 20 value 10.301070
## iter 30 value 5.297236
## iter 40 value 4.200102
## iter 50 value 4.062111
## iter 60 value 3.296631
## iter 70 value 2.930707
## iter 80 value 2.871913
## iter 90 value 2.869570
## iter 100 value 2.697828
## final value 2.697828
## stopped after 100 iterations
model_nnet
## a 5-10-3 network with 93 weights
## inputs: Sepal.Length Sepal.Width Petal.Length Petal.Width useless
## output(s): Species
## options were - softmax modelling
predict(model_nnet, test)
## setosa versicolor virginica
## 105 4.567e-67 1.069e-14 1.000e+00
## 115 4.567e-67 1.069e-14 1.000e+00
## 14 1.000e+00 1.969e-18 5.293e-60
## 117 4.594e-67 1.074e-14 1.000e+00
## 139 4.567e-67 1.069e-14 1.000e+00
## 96 7.195e-37 1.000e+00 8.028e-31
## 81 1.828e-15 1.000e+00 7.832e-35
## 56 1.167e-42 1.000e+00 1.488e-21
## 19 1.000e+00 5.230e-18 1.289e-67
## 103 4.613e-67 1.072e-14 1.000e+00
## 77 2.363e-14 1.000e+00 7.605e-42
## 89 4.012e-36 1.000e+00 3.885e-31
## 107 5.492e-67 1.198e-14 1.000e+00
## 82 1.828e-15 1.000e+00 7.831e-35
## 57 1.923e-65 1.812e-13 1.000e+00
## 119 4.567e-67 1.069e-14 1.000e+00
## 69 1.477e-51 4.329e-05 1.000e+00
## 94 2.297e-14 1.000e+00 9.170e-42
## 75 7.119e-34 1.000e+00 1.437e-32
## 64 1.134e-42 1.000e+00 1.509e-21
## 135 2.509e-16 1.000e+00 2.065e-34
## 27 1.000e+00 5.230e-18 1.289e-67
## 97 1.341e-55 7.127e-06 1.000e+00
## 84 2.490e-66 2.386e-14 1.000e+00
## 124 6.692e-67 1.211e-14 1.000e+00
## 118 4.567e-67 1.069e-14 1.000e+00
## 44 1.000e+00 5.230e-18 1.289e-67
## 12 1.000e+00 5.230e-18 1.289e-67
## 11 1.000e+00 5.230e-18 1.289e-67
## 72 1.829e-15 1.000e+00 7.823e-35
## 63 7.044e-10 1.000e+00 2.776e-44
## 129 4.568e-67 1.069e-14 1.000e+00
## 3 1.000e+00 5.230e-18 1.289e-67
## 99 1.832e-15 1.000e+00 7.741e-35
## 45 1.000e+00 6.550e-24 3.986e-83
## 52 1.828e-15 1.000e+00 7.831e-35
## 130 7.760e-37 1.000e+00 6.237e-31
## 16 1.000e+00 5.230e-18 1.289e-67
## 79 3.061e-23 1.000e+00 7.061e-09
## 108 4.567e-67 1.069e-14 1.000e+00
## 83 1.373e-09 1.000e+00 2.136e-44
## 9 1.000e+00 3.841e-53 7.875e-112
## 150 7.956e-29 3.667e-02 9.633e-01
## 106 4.567e-67 1.069e-14 1.000e+00
## 123 1.501e-26 2.041e-01 7.959e-01
## 86 4.387e-28 1.000e+00 1.762e-28
## 116 4.567e-67 1.069e-14 1.000e+00
## 41 1.000e+00 1.969e-18 5.286e-60
## 51 1.828e-15 1.000e+00 7.831e-35
## 1 1.000e+00 5.230e-18 1.289e-67
table(true=test$Species,
predicted=predict(model_nnet, test, type="class"))
## predicted
## true setosa versicolor virginica
## setosa 12 0 0
## versicolor 0 17 4
## virginica 0 2 15
library(party)
## Loading required package: grid
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
## Loading required package: strucchange
## Loading required package: modeltools
## Loading required package: stats4
model_ctree <- ctree(Species ~ ., data=train)
model_ctree
##
## Conditional inference tree with 3 terminal nodes
##
## Response: Species
## Inputs: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, useless
## Number of observations: 100
##
## 1) Petal.Width <= 0.5; criterion = 1, statistic = 93.141
## 2)* weights = 38
## 1) Petal.Width > 0.5
## 3) Petal.Width <= 1.7; criterion = 1, statistic = 42.599
## 4)* weights = 30
## 3) Petal.Width > 1.7
## 5)* weights = 32
plot(model_ctree)
predict(model_ctree, test)
## [1] virginica virginica setosa virginica virginica versicolor
## [7] versicolor versicolor setosa virginica versicolor versicolor
## [13] versicolor versicolor versicolor virginica versicolor versicolor
## [19] versicolor versicolor versicolor setosa versicolor versicolor
## [25] virginica virginica versicolor setosa setosa versicolor
## [31] versicolor virginica setosa versicolor setosa versicolor
## [37] versicolor setosa versicolor virginica versicolor setosa
## [43] virginica virginica virginica versicolor virginica setosa
## [49] versicolor setosa
## Levels: setosa versicolor virginica
table(true=test$Species,
predicted=predict(model_ctree, test))
## predicted
## true setosa versicolor virginica
## setosa 11 1 0
## versicolor 0 21 0
## virginica 0 3 14
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
model_rforrest <- randomForest(Species ~ ., data=train)
model_rforrest
##
## Call:
## randomForest(formula = Species ~ ., data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 38 0 0 0.00000
## versicolor 0 27 2 0.06897
## virginica 0 2 31 0.06061
Random forests can compute variable importance scores
importance(model_rforrest)
## MeanDecreaseGini
## Sepal.Length 4.872
## Sepal.Width 3.712
## Petal.Length 16.989
## Petal.Width 36.843
## useless 3.176
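The importance scores can also be visualized, e.g., with varImpPlot() from randomForest:
# Dot chart of the mean decrease in Gini impurity per variable.
varImpPlot(model_rforrest)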
predict(model_rforrest, test)
## 105 115 14 117 139 96
## virginica virginica setosa virginica virginica versicolor
## 81 56 19 103 77 89
## versicolor versicolor setosa virginica versicolor versicolor
## 107 82 57 119 69 94
## virginica versicolor versicolor virginica versicolor versicolor
## 75 64 135 27 97 84
## versicolor versicolor versicolor setosa versicolor versicolor
## 124 118 44 12 11 72
## virginica virginica setosa setosa setosa versicolor
## 63 129 3 99 45 52
## versicolor virginica setosa versicolor setosa versicolor
## 130 16 79 108 83 9
## versicolor setosa versicolor virginica versicolor setosa
## 150 106 123 86 116 41
## virginica virginica virginica versicolor virginica setosa
## 51 1
## versicolor setosa
## Levels: setosa versicolor virginica
table(true=test$Species,
predicted=predict(model_rforrest, test))
## predicted
## true setosa versicolor virginica
## setosa 12 0 0
## versicolor 0 21 0
## virginica 0 2 15
library(e1071)
model_svm <- svm(Species ~ ., data=train)
model_svm
##
## Call:
## svm(formula = Species ~ ., data = train)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.2
##
## Number of Support Vectors: 66
predict(model_svm, test)
## 105 115 14 117 139 96
## virginica virginica setosa virginica virginica versicolor
## 81 56 19 103 77 89
## versicolor versicolor setosa virginica versicolor versicolor
## 107 82 57 119 69 94
## virginica versicolor versicolor virginica versicolor versicolor
## 75 64 135 27 97 84
## versicolor versicolor versicolor setosa versicolor versicolor
## 124 118 44 12 11 72
## virginica virginica setosa setosa setosa versicolor
## 63 129 3 99 45 52
## versicolor versicolor setosa versicolor setosa versicolor
## 130 16 79 108 83 9
## virginica setosa versicolor virginica versicolor setosa
## 150 106 123 86 116 41
## virginica virginica virginica versicolor virginica setosa
## 51 1
## versicolor setosa
## Levels: setosa versicolor virginica
table(true=test$Species,
predicted=predict(model_svm, test))
## predicted
## true setosa versicolor virginica
## setosa 12 0 0
## versicolor 0 21 0
## virginica 0 2 15
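svm() can also produce class probabilities when trained with probability=TRUE; a minimal sketch (the name model_svm_p is ours; output omitted):
# Refit with probability estimates enabled, then request the
# per-class probabilities at prediction time.
model_svm_p <- svm(Species ~ ., data=train, probability=TRUE)
pred_svm <- predict(model_svm_p, test, probability=TRUE)
head(attr(pred_svm, "probabilities"))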
Logistic regression can only model a binary response, so we reduce the problem to two classes: virginica is class 1 and the other species are class 0
train$Species2 <- as.integer(train$Species == "virginica")
head(train)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species useless
## 18 4.440 3.632 0.5518 0.3 setosa -0.1534
## 93 8.719 2.324 3.6531 1.2 versicolor -1.3907
## 91 6.177 1.921 4.6275 1.2 versicolor -0.7236
## 92 5.416 3.501 4.5726 1.4 versicolor 0.2583
## 126 7.386 2.868 6.2009 1.8 virginica -0.3171
## 149 5.876 1.565 5.5989 2.3 virginica -0.1778
## Species2
## 18 0
## 93 0
## 91 0
## 92 0
## 126 1
## 149 1
plot(train, col= train$Species2+1L)
test$Species2 <- as.integer(test$Species == "virginica")
Use all variables but Species as predictors
mod_logreg <- glm(Species2 ~ .-Species, data = train,
family=binomial(logit))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
The warning just means that the data is possibly linearly separable (some fitted probabilities are numerically 0 or 1)
summary(mod_logreg)
##
## Call:
## glm(formula = Species2 ~ . - Species, family = binomial(logit),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2619 -0.0006 0.0000 0.0001 1.9146
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -42.67 28.78 -1.48 0.14
## Sepal.Length -1.08 1.72 -0.63 0.53
## Sepal.Width -4.23 3.38 -1.25 0.21
## Petal.Length 3.03 3.23 0.94 0.35
## Petal.Width 28.67 19.38 1.48 0.14
## useless -2.97 3.20 -0.93 0.35
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 126.8357 on 99 degrees of freedom
## Residual deviance: 9.6751 on 94 degrees of freedom
## AIC: 21.68
##
## Number of Fisher Scoring iterations: 12
By default, prediction returns log odds (the linear predictor)
predict(mod_logreg, test)
## 105 115 14 117 139 96 81 56 19
## 24.481 25.157 -67.449 2.398 8.689 1.047 -18.292 -9.966 -53.444
## 103 77 89 107 82 57 119 69 94
## 7.190 -29.780 -3.103 7.369 -23.665 -5.063 23.916 10.146 -26.895
## 75 64 135 27 97 84 124 118 44
## -8.319 -9.161 -9.152 -56.906 -7.496 -4.403 5.145 11.279 -40.254
## 12 11 72 63 129 3 99 45 52
## -54.967 -53.744 -19.061 -18.874 17.681 -56.901 -19.889 -44.165 -8.035
## 130 16 79 108 83 9 150 106 123
## 4.685 -57.410 -7.643 5.885 -15.381 -39.966 1.171 20.770 1.348
## 86 116 41 51 1
## -9.178 20.844 -65.507 -14.654 -51.323
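The log odds can be converted into probabilities with the logistic function; a quick sketch (output omitted) that should reproduce the type="response" results below:
# plogis(eta) = 1/(1 + exp(-eta)) maps log odds to probabilities.
head(plogis(predict(mod_logreg, test)))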
With type="response", prediction returns the probabilities
predict(mod_logreg, test, type="response")
## 105 115 14 117 139 96 81
## 1.000e+00 1.000e+00 2.220e-16 9.167e-01 9.998e-01 7.403e-01 1.137e-08
## 56 19 103 77 89 107 82
## 4.697e-05 2.220e-16 9.992e-01 1.165e-13 4.300e-02 9.994e-01 5.278e-11
## 57 119 69 94 75 64 135
## 6.289e-03 1.000e+00 1.000e+00 2.087e-12 2.438e-04 1.050e-04 1.060e-04
## 27 97 84 124 118 44 12
## 2.220e-16 5.547e-04 1.209e-02 9.942e-01 1.000e+00 2.220e-16 2.220e-16
## 11 72 63 129 3 99 45
## 2.220e-16 5.270e-09 6.355e-09 1.000e+00 2.220e-16 2.304e-09 2.220e-16
## 52 130 16 79 108 83 9
## 3.238e-04 9.908e-01 2.220e-16 4.792e-04 9.972e-01 2.089e-07 2.220e-16
## 150 106 123 86 116 41 51
## 7.633e-01 1.000e+00 7.938e-01 1.033e-04 1.000e+00 2.220e-16 4.322e-07
## 1
## 2.220e-16
For the confusion table we use a threshold of 0.5 to separate class 0 from class 1
table(true=test$Species2,
predicted=predict(mod_logreg, test, type="response") > .5)
## predicted
## true FALSE TRUE
## 0 31 2
## 1 1 16
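The same threshold also gives a quick misclassification rate; a sketch (output omitted):
# Fraction of test flowers where the thresholded prediction and the true class disagree.
mean((predict(mod_logreg, test, type="response") > .5) != (test$Species2 == 1))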