Look at the Shanghai Ranking over Time

Can we predict if a university will improve its ranking over time?

# Load the Shanghai ranking table. stringsAsFactors is set explicitly because
# its default changed to FALSE in R 4.0.0; the factor-style summary printed
# for world_rank, university_name and national_rank relies on the string
# columns being read as factors.
shanghai <- read.csv("shanghaiData.csv", stringsAsFactors = TRUE)
summary(shanghai)
##    world_rank                             university_name national_rank 
##  301-400: 600   Queen's University                :  13   1      : 343  
##  401-500: 600   University of Maryland, Baltimore :  12   2      : 206  
##  201-300: 584   Aarhus University                 :  11   3      : 133  
##  151-200: 300   Boston University                 :  11   4      : 122  
##  201-302: 204   Brown University                  :  11   1-2    :  86  
##  101-150: 200   California Institute of Technology:  11   2-3    :  84  
##  (Other):2409   (Other)                           :4828   (Other):3923  
##   total_score         alumni            award              hici       
##  Min.   : 23.50   Min.   :  0.000   Min.   :  0.000   Min.   :  0.00  
##  1st Qu.: 27.40   1st Qu.:  0.000   1st Qu.:  0.000   1st Qu.:  7.30  
##  Median : 31.30   Median :  0.000   Median :  0.000   Median : 12.60  
##  Mean   : 36.38   Mean   :  9.162   Mean   :  7.692   Mean   : 16.22  
##  3rd Qu.: 41.80   3rd Qu.: 15.600   3rd Qu.: 13.400   3rd Qu.: 21.70  
##  Max.   :100.00   Max.   :100.000   Max.   :100.000   Max.   :100.00  
##  NA's   :3796     NA's   :1         NA's   :2         NA's   :2       
##        ns              pub              pcp              year     
##  Min.   :  0.00   Min.   :  7.30   Min.   :  8.30   Min.   :2005  
##  1st Qu.:  8.00   1st Qu.: 28.90   1st Qu.: 15.60   1st Qu.:2007  
##  Median : 12.80   Median : 36.00   Median : 19.00   Median :2009  
##  Mean   : 16.08   Mean   : 38.25   Mean   : 21.24   Mean   :2010  
##  3rd Qu.: 19.80   3rd Qu.: 45.30   3rd Qu.: 24.50   3rd Qu.:2012  
##  Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :2015  
##  NA's   :22       NA's   :2        NA's   :2

clean data

# Banded ranks such as "101-150" are reduced to their lower bound: everything
# from the first "-" onward is dropped, then the text is converted to numeric.
rnk <- as.numeric(sub("-.*", "", as.character(shanghai$world_rank)))
summary(rnk)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0   101.0   201.0   207.6   301.0   403.0
# Overwrite the original column with its numeric version.
shanghai$world_rank <- rnk

# Same treatment for the national rank: keep the part before the first "-"
# (e.g. "1-2" becomes "1") and convert to numeric. The summary shows that one
# entry ends up as NA after conversion.
rnk <- as.numeric(sub("-.*", "", as.character(shanghai$national_rank)))
summary(rnk)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       1       4      10      28      35     141       1
# Overwrite the original column with its numeric version.
shanghai$national_rank <- rnk

split the data by university

# Group the rows by university: `l` is a list holding one data frame per
# university name.
l <- split(shanghai, shanghai$university_name)
# Number of rows per university. vapply() guarantees exactly one integer per
# group, whereas sapply() changes its return type with the input (for example
# it returns a list when the input is empty).
years <- vapply(l, nrow, integer(1))
table(years)
## years
##   1   2   3   4   5   6   7   8   9  10  11  12  13 
##  56  61  21  23  17  24  42  39 126 158  90   1   1

pick two years for comparison

# For every university, pair its 2005 row with its 2015 row. The full outer
# merge (all = TRUE) keeps universities that appear in only one of the two
# years; the columns belonging to the missing year are then NA.
r <- lapply(l, FUN = function(x) {
  d_2005 <- x[x$year == 2005, ]
  d_2015 <- x[x$year == 2015, ]
  # merge()'s argument is named `suffixes`; abbreviating it to `suffix` only
  # works through partial argument matching, so it is spelled out here.
  merge(d_2005, d_2015,
    by = "university_name",
    all = TRUE, suffixes = c("_Y2005", "_Y2015")
  )
})

# Stack the per-university results into a single data frame.
r <- do.call(rbind, r)

head(r)
##                                                           university_name
## Aalborg University                                     Aalborg University
## Aalto University                                         Aalto University
## Aarhus University                                       Aarhus University
## Aix Marseille University                         Aix Marseille University
## Aristotle University of Thessaloniki Aristotle University of Thessaloniki
## Arizona State University                         Arizona State University
##                                      world_rank_Y2005 national_rank_Y2005
## Aalborg University                                 NA                  NA
## Aalto University                                  401                   3
## Aarhus University                                 101                   2
## Aix Marseille University                           NA                  NA
## Aristotle University of Thessaloniki              301                   2
## Arizona State University                           NA                  NA
##                                      total_score_Y2005 alumni_Y2005
## Aalborg University                                  NA           NA
## Aalto University                                    NA          0.0
## Aarhus University                                   NA         15.4
## Aix Marseille University                            NA           NA
## Aristotle University of Thessaloniki                NA          0.0
## Arizona State University                            NA           NA
##                                      award_Y2005 hici_Y2005 ns_Y2005
## Aalborg University                            NA         NA       NA
## Aalto University                             0.0        0.0     11.4
## Aarhus University                           19.3        7.9     22.3
## Aix Marseille University                      NA         NA       NA
## Aristotle University of Thessaloniki         0.0        7.9      2.0
## Arizona State University                      NA         NA       NA
##                                      pub_Y2005 pcp_Y2005 year_Y2005
## Aalborg University                          NA        NA         NA
## Aalto University                          28.6      13.1       2005
## Aarhus University                         41.6      22.4       2005
## Aix Marseille University                    NA        NA         NA
## Aristotle University of Thessaloniki      34.9      15.2       2005
## Arizona State University                    NA        NA         NA
##                                      world_rank_Y2015 national_rank_Y2015
## Aalborg University                                301                   5
## Aalto University                                  401                   4
## Aarhus University                                  73                   2
## Aix Marseille University                          101                   5
## Aristotle University of Thessaloniki              401                   2
## Arizona State University                           93                  47
##                                      total_score_Y2015 alumni_Y2015
## Aalborg University                                  NA          0.0
## Aalto University                                    NA          0.0
## Aarhus University                                 27.3         11.5
## Aix Marseille University                            NA         13.6
## Aristotle University of Thessaloniki                NA          0.0
## Arizona State University                          24.5          0.0
##                                      award_Y2015 hici_Y2015 ns_Y2015
## Aalborg University                           0.0       11.2      4.6
## Aalto University                             0.0        0.0     10.4
## Aarhus University                           22.1       12.3     25.8
## Aix Marseille University                     0.0       15.2     20.3
## Aristotle University of Thessaloniki         0.0        4.9      1.5
## Arizona State University                    20.0       22.2     25.5
##                                      pub_Y2015 pcp_Y2015 year_Y2015
## Aalborg University                        30.4      16.8       2015
## Aalto University                          33.4      17.0       2015
## Aarhus University                         51.8      31.0       2015
## Aix Marseille University                  50.1      27.6       2015
## Aristotle University of Thessaloniki      34.0      16.8       2015
## Arizona State University                  42.6      19.1       2015

calculate improvement

# Rank difference 2005 minus 2015: a positive value means the rank number got
# smaller between the two years.
improved <- r[["world_rank_Y2005"]] - r[["world_rank_Y2015"]]
hist(improved)

hist(improved, breaks = 100)

# Collapse the difference into a logical flag; it is NA wherever one of the two
# ranks is missing.
improved <- improved > 0
table(improved, useNA = "always")
## improved
## FALSE  TRUE  <NA> 
##   227   152   242

add improvement as the class variable (has to be a factor)

# Store the logical flag on `r` as a factor so it can act as the class label.
r[["improved"]] <- factor(improved)
dim(r)
## [1] 621  22
summary(r)
##                              university_name world_rank_Y2005
##  Aalborg University                  :  1    Min.   :  1     
##  Aalto University                    :  1    1st Qu.:101     
##  Aarhus University                   :  1    Median :203     
##  Aix Marseille University            :  1    Mean   :216     
##  Aristotle University of Thessaloniki:  1    3rd Qu.:301     
##  Arizona State University            :  1    Max.   :401     
##  (Other)                             :615    NA's   :121     
##  national_rank_Y2005 total_score_Y2005  alumni_Y2005      award_Y2005     
##  Min.   :  1.00      Min.   : 23.90    Min.   :  0.000   Min.   :  0.000  
##  1st Qu.:  4.00      1st Qu.: 27.62    1st Qu.:  0.000   1st Qu.:  0.000  
##  Median : 12.00      Median : 31.55    Median :  0.000   Median :  0.000  
##  Mean   : 31.98      Mean   : 36.68    Mean   :  9.266   Mean   :  6.685  
##  3rd Qu.: 43.25      3rd Qu.: 42.65    3rd Qu.: 16.600   3rd Qu.:  9.100  
##  Max.   :141.00      Max.   :100.00    Max.   :100.000   Max.   :100.000  
##  NA's   :121         NA's   :521       NA's   :121       NA's   :121      
##    hici_Y2005        ns_Y2005        pub_Y2005        pcp_Y2005     
##  Min.   :  0.00   Min.   :  0.00   Min.   : 10.10   Min.   :  9.00  
##  1st Qu.:  7.90   1st Qu.:  7.80   1st Qu.: 27.40   1st Qu.: 14.07  
##  Median : 11.10   Median : 12.45   Median : 33.90   Median : 17.25  
##  Mean   : 15.11   Mean   : 15.73   Mean   : 36.63   Mean   : 19.82  
##  3rd Qu.: 19.20   3rd Qu.: 19.07   3rd Qu.: 43.75   3rd Qu.: 23.02  
##  Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :100.00  
##  NA's   :121      NA's   :123      NA's   :121      NA's   :121     
##    year_Y2005   world_rank_Y2015 national_rank_Y2015 total_score_Y2015
##  Min.   :2005   Min.   :  1.0    Min.   :  1.00      Min.   : 23.90   
##  1st Qu.:2005   1st Qu.:101.0    1st Qu.:  4.00      1st Qu.: 27.00   
##  Median :2005   Median :201.0    Median : 10.00      Median : 30.75   
##  Mean   :2005   Mean   :215.8    Mean   : 25.78      Mean   : 35.87   
##  3rd Qu.:2005   3rd Qu.:301.0    3rd Qu.: 29.00      3rd Qu.: 39.33   
##  Max.   :2005   Max.   :401.0    Max.   :126.00      Max.   :100.00   
##  NA's   :121    NA's   :121      NA's   :121         NA's   :521      
##   alumni_Y2015      award_Y2015       hici_Y2015         ns_Y2015     
##  Min.   :  0.000   Min.   :  0.00   Min.   :  0.000   Min.   :  0.00  
##  1st Qu.:  0.000   1st Qu.:  0.00   1st Qu.:  6.275   1st Qu.:  8.00  
##  Median :  0.000   Median :  0.00   Median : 12.300   Median : 12.10  
##  Mean   :  7.968   Mean   :  7.47   Mean   : 15.218   Mean   : 15.29  
##  3rd Qu.: 13.600   3rd Qu.: 13.30   3rd Qu.: 20.100   3rd Qu.: 19.00  
##  Max.   :100.000   Max.   :100.00   Max.   :100.000   Max.   :100.00  
##  NA's   :121       NA's   :121      NA's   :121       NA's   :123     
##    pub_Y2015        pcp_Y2015        year_Y2015    improved  
##  Min.   :  7.80   Min.   : 11.20   Min.   :2015   FALSE:227  
##  1st Qu.: 30.55   1st Qu.: 16.60   1st Qu.:2015   TRUE :152  
##  Median : 36.70   Median : 19.90   Median :2015   NA's :242  
##  Mean   : 38.78   Mean   : 21.84   Mean   :2015              
##  3rd Qu.: 45.02   3rd Qu.: 24.20   3rd Qu.:2015              
##  Max.   :100.00   Max.   :100.00   Max.   :2015              
##  NA's   :121      NA's   :121      NA's   :121

Do some decision trees

library(rpart)
library(rpart.plot)
library("caret")
## Loading required package: lattice
## Loading required package: ggplot2

Use all the data

# Fit a tree for `improved` using every other column of `r` as a candidate
# predictor, including the university name.
tree <- rpart(formula = improved ~ ., data = r)
print(tree)
## n=379 (242 observations deleted due to missingness)
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 379 152 FALSE (0.5989446 0.4010554)  
##   2) university_name=Aalto University,Aristotle University of Thessaloniki,Auburn University,Autonomous University of Madrid,Bar-Ilan University,Baylor College of Medicine,Ben-Gurion University of the Negev,Boston College,Brandeis University,Brigham Young University,California Institute of Technology,Carleton University,Carnegie Mellon University,Case Western Reserve University,Catholic University of Louvain,Chalmers University of Technology,Chiba University,City University of New York City College,Clemson University,Colorado State University,Columbia University,Cornell University,Dartmouth College,Ecole Polytechnique,Eindhoven University of Technology,Emory University,Eotvos Lorand University,Federal University of Rio de Janeiro,Florida International University,Florida State University,Georgetown University,Georgia Institute of Technology,Harvard University,Hiroshima University,Hokkaido University,Indiana University Bloomington,Indian Institute of Science,Istanbul University,Jagiellonian University,Kanazawa University,Kansas State University,Karolinska Institute,Keio University,Kobe University,Kyoto University,Kyungpook National University,Kyushu University,Lancaster University,Leiden University,Linkoping University,Lund University,Mayo Medical School,McMaster University,Medical University of Graz,Medical University of South Carolina,Michigan State University,Moscow State University,Nara Institute of Science and Technology,National and Kapodistrian University of Athens,National Autonomous University of Mexico,National Cheng Kung University,National University of Singapore,North Carolina State University - Raleigh,Okayama University,Oregon State University,Osaka University,Pennsylvania State University - University Park,Pohang University of Science and Technology,Queen's University,Queen's University Belfast,Rensselaer Polytechnic Institute,Rice University,Rockefeller University,Rutgers, The State University of New Jersey - New Brunswick,Saint Louis 
University,Saint Petersburg State University,San Diego State University,Scuola Normale Superiore - Pisa,Seoul National University,State University of New York Health Science Center at Brooklyn,Stockholm School of Economics,Syracuse University,Technical University Darmstadt,Technical University of Berlin,Technical University of Braunschweig,Tel Aviv University,Temple University,Texas Tech University,The Australian National University,The Hong Kong Polytechnic University,The Imperial College of Science, Technology and Medicine,The Ohio State University - Columbus,The University of Calgary,The University of Edinburgh,The University of Georgia,The University of Glasgow,The University of Reading,The University of Sheffield,The University of Texas at Austin,The University of Texas at Dallas,The University of Texas Health Science Center at Houston,The University of Texas Health Science Center at San Antonio,The University of Texas Medical Branch at Galveston,The University of Texas Southwestern Medical Center at Dallas,The University of Tokushima,The University of Tokyo,Thomas Jefferson University,Tohoku University,Tokyo Institute of Technology,Tokyo Medical and Dental University,Tufts University,Tulane University,Umea University,University College Cork,University of Alaska - Fairbanks,University of Alberta,University of Amsterdam,University of Antwerp,University of Arizona,University of Basel,University of Bath,University of Bayreuth,University of Birmingham,University of Bochum,University of Bremen,University of Bristol,University of British Columbia,University of California, Berkeley,University of California, Davis,University of California, Irvine,University of California, Riverside,University of California, San Diego,University of California, San Francisco,University of California, Santa Barbara,University of Cambridge,University of Chicago,University of Chile,University of Cincinnati,University of Duesseldorf,University of East Anglia,University of Essex,University 
of Ferrara,University of Florida,University of Frankfurt,University of Freiburg,University of Goettingen,University of Graz,University of Halle-Wittenberg,University of Hamburg,University of Hawaii at Manoa,University of Illinois at Chicago,University of Illinois at Urbana-Champaign,University of Iowa,University of Jena,University of Jyvaskyla,University of Konstanz,University of KwaZulu-Natal,University of Leeds,University of Leicester,University of Leipzig,University of Liverpool,University of Mainz,University of Manitoba,University of Marburg,University of Maryland, Baltimore,University of Maryland, Baltimore County,University of Massachusetts Amherst,University of Miami,University of Milan,University of Missouri - Columbia,University of Muenster,University of Munich,University of Naples Federico II,University of Nebraska - Lincoln,University of New Hampshire - Durham,University of Notre Dame,University of Nottingham,University of Oklahoma - Norman,University of Oregon,University of Oxford,University of Palermo,University of Parma,University of Pavia,University of Pennsylvania,University of Perugia,University of Pisa,University of Quebec,University of Regensburg,University of Rhode Island,University of Rochester,University of Roma - Tor Vergata,University of Rostock,University of Sao Paulo,University of Saskatchewan,University of St Andrews,University of Surrey,University of Sussex,University of Sydney,University of Szeged,University of Toronto,University of Trieste,University of Tsukuba,University of Tuebingen,University of Turku,University of Twente,University of Ulm,University of Utah,University of Valencia,University of Vermont,University of Vienna,University of Virginia,University of Warsaw,University of Wisconsin - Madison,University of Wuerzburg,University of Wyoming,University of Zaragoza,Uppsala University,Utah State University,Utrecht University,Vanderbilt University,Vienna University of Technology,Virginia Polytechnic Institute and State 
University,Wake Forest University,Washington University in St. Louis,Wayne State University,Weizmann Institute of Science,Yale University 227   0 FALSE (1.0000000 0.0000000) *
##   3) university_name=Aarhus University,Autonomous University of Barcelona,Boston University,Brown University,Cardiff University,Charles University in Prague,City University of Hong Kong,Claude Bernard University Lyon 1,Complutense University of Madrid,Dalhousie University,Delft University of Technology,Drexel University,Duke University,Ecole Normale Superieure - Lyon,Ecole Normale Superieure - Paris,Erasmus University,Flinders University,Fudan University,George Mason University,Ghent University,Hannover Medical School,Hanyang University,Indiana University-Purdue University at Indianapolis,Iowa State University,Jilin University,Joseph Fourier University (Grenoble 1),King's College London,Korea Advanced Institute of Science and Technology,Korea University,Laval University,London School of Economics and Political Science,Louisiana State University - Baton Rouge,Macquarie University,Massachusetts Institute of Technology (MIT),McGill University,Monash University,Nagoya University,Nanjing University,Nanyang Technological University,National Chiao Tung University,National Taiwan University,National Tsing Hua University,Newcastle University,New York University,Northeastern University,Northwestern University,Oregon Health and Science University,Paul Sabatier University (Toulouse 3),Peking University,Pierre and Marie  Curie University - Paris 6,Polytechnic Institute of Milan,Polytechnic University of Valencia,Princeton University,Purdue University - West Lafayette,Radboud University Nijmegen,RWTH Aachen University,Shanghai Jiao Tong University,Simon Fraser University,Stanford University,Stockholm University,Sungkyunkwan University,Swedish University of Agricultural Sciences,Swiss Federal Institute of Technology Zurich,Technical University Munich,Technical University of Denmark,Technion-Israel Institute of Technology,The Chinese University of Hong Kong,The George Washington University,The Hebrew University of Jerusalem,The Hong Kong University of Science and Technology,The 
University of Adelaide,The University of Auckland,The University of Dundee,The University of Hong Kong,The University of Manchester,The University of New Mexico - Albuquerque,The University of Queensland,The University of Texas M. D. Anderson Cancer Center,The University of Western Australia,Trinity College Dublin,Tsinghua University,University College Dublin,University College London,University of Aberdeen,University of Barcelona,University of Bergen,University of Bern,University of Bologna,University of Bonn,University of Buenos Aires,University of California, Los Angeles,University of California, Santa Cruz,University of Cape Town,University of Central Florida,University of Colorado at Boulder,University of Copenhagen,University of Delaware,University of Duisburg-Essen,University of Erlangen-Nuremberg,University of Exeter,University of Florence,University of Geneva,University of Giessen,University of Gothenburg,University of Granada,University of Groningen,University of Guelph,University of Helsinki,University of Houston,University of Innsbruck,University of Kentucky,University of Kiel,University of Koeln,University of Lausanne,University of Liege,University of Lisbon,University of Maryland, College Park,University of Massachusetts Medical School - Worcester,University of Minnesota, Twin Cities,University of Montreal,University of North Carolina at Chapel Hill,University of Oslo,University of Otago,University of Ottawa,University of Oulu,University of Padua,University of Paris Descartes (Paris 5),University of Science and Technology of China,University of Southampton,University of South Carolina - Columbia,University of Southern California,University of Southern Denmark,University of South Florida,University of Strasbourg,University of Stuttgart,University of Tasmania,University of Tennessee - Knoxville,University of the Witwatersrand,University of Turin,University of Victoria,University of Wageningen,University of Warwick,University of Washington,University of 
Waterloo,University of York,University of Zurich,Virginia Commonwealth University,VU University Amsterdam,Western University,Yeshiva University,Yonsei University,Zhejiang University 152   0 TRUE (0.0000000 1.0000000) *
# Plot the tree fitted on all columns.
rpart.plot(
  tree,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)

Do in-sample testing (resubstitution error). NOTE: You should use a training and test sample.

# Predict classes for the very rows the tree was fitted on, then tabulate the
# predictions against the observed labels with "TRUE" as the positive class.
pred <- predict(object = tree, newdata = r, type = "class")
confusionMatrix(data = pred, reference = r[["improved"]], positive = "TRUE")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   227    0
##      TRUE      0  152
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9903, 1)
##     No Information Rate : 0.5989     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.4011     
##          Detection Rate : 0.4011     
##    Detection Prevalence : 0.4011     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : TRUE       
## 

Note: Why is this bad?

Without the university name

colnames(r)
##  [1] "university_name"     "world_rank_Y2005"    "national_rank_Y2005"
##  [4] "total_score_Y2005"   "alumni_Y2005"        "award_Y2005"        
##  [7] "hici_Y2005"          "ns_Y2005"            "pub_Y2005"          
## [10] "pcp_Y2005"           "year_Y2005"          "world_rank_Y2015"   
## [13] "national_rank_Y2015" "total_score_Y2015"   "alumni_Y2015"       
## [16] "award_Y2015"         "hici_Y2015"          "ns_Y2015"           
## [19] "pub_Y2015"           "pcp_Y2015"           "year_Y2015"         
## [22] "improved"
# Remove the university identifier and the two year columns (each year column
# holds a single value, 2005 or 2015, per the summary of `r`).
r[["university_name"]] <- NULL
r[["year_Y2015"]] <- NULL
r[["year_Y2005"]] <- NULL

# Refit on the columns that remain once the university name and the year
# columns have been removed from `r`.
tree <- rpart(formula = improved ~ ., data = r)
print(tree)
## n=379 (242 observations deleted due to missingness)
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 379 152 FALSE (0.5989446 0.4010554)  
##     2) world_rank_Y2015>=251 123  13 FALSE (0.8943089 0.1056911)  
##       4) world_rank_Y2005< 351 91   0 FALSE (1.0000000 0.0000000) *
##       5) world_rank_Y2005>=351 32  13 FALSE (0.5937500 0.4062500)  
##        10) world_rank_Y2015>=351 19   0 FALSE (1.0000000 0.0000000) *
##        11) world_rank_Y2015< 351 13   0 TRUE (0.0000000 1.0000000) *
##     3) world_rank_Y2015< 251 256 117 TRUE (0.4570312 0.5429688)  
##       6) world_rank_Y2005< 178 182  65 FALSE (0.6428571 0.3571429)  
##        12) pcp_Y2005>=17.55 169  54 FALSE (0.6804734 0.3195266)  
##          24) world_rank_Y2015>=98.5 81  13 FALSE (0.8395062 0.1604938)  
##            48) world_rank_Y2005< 127 52   0 FALSE (1.0000000 0.0000000) *
##            49) world_rank_Y2005>=127 29  13 FALSE (0.5517241 0.4482759)  
##              98) world_rank_Y2015>=176 16   0 FALSE (1.0000000 0.0000000) *
##              99) world_rank_Y2015< 176 13   0 TRUE (0.0000000 1.0000000) *
##          25) world_rank_Y2015< 98.5 88  41 FALSE (0.5340909 0.4659091)  
##            50) ns_Y2005>=24.3 58  18 FALSE (0.6896552 0.3103448)  
##             100) pcp_Y2015< 23.1 8   0 FALSE (1.0000000 0.0000000) *
##             101) pcp_Y2015>=23.1 50  18 FALSE (0.6400000 0.3600000)  
##               202) award_Y2005>=6.45 43  12 FALSE (0.7209302 0.2790698) *
##               203) award_Y2005< 6.45 7   1 TRUE (0.1428571 0.8571429) *
##            51) ns_Y2005< 24.3 30   7 TRUE (0.2333333 0.7666667) *
##        13) pcp_Y2005< 17.55 13   2 TRUE (0.1538462 0.8461538) *
##       7) world_rank_Y2005>=178 74   0 TRUE (0.0000000 1.0000000) *
# Plot the tree fitted without the university name.
rpart.plot(
  tree,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)

# In-sample evaluation again: predict on the fitting data and compare with the
# observed labels, treating "TRUE" as the positive class.
pred <- predict(object = tree, newdata = r, type = "class")
confusionMatrix(data = pred, reference = r[["improved"]], positive = "TRUE")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   217   12
##      TRUE     10  140
##                                           
##                Accuracy : 0.942           
##                  95% CI : (0.9134, 0.9633)
##     No Information Rate : 0.5989          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8789          
##  Mcnemar's Test P-Value : 0.8312          
##                                           
##             Sensitivity : 0.9211          
##             Specificity : 0.9559          
##          Pos Pred Value : 0.9333          
##          Neg Pred Value : 0.9476          
##              Prevalence : 0.4011          
##          Detection Rate : 0.3694          
##    Detection Prevalence : 0.3958          
##       Balanced Accuracy : 0.9385          
##                                           
##        'Positive' Class : TRUE            
## 

Note: Why is this bad, too?

Remove 2015 World Rank

colnames(r)
##  [1] "world_rank_Y2005"    "national_rank_Y2005" "total_score_Y2005"  
##  [4] "alumni_Y2005"        "award_Y2005"         "hici_Y2005"         
##  [7] "ns_Y2005"            "pub_Y2005"           "pcp_Y2005"          
## [10] "world_rank_Y2015"    "national_rank_Y2015" "total_score_Y2015"  
## [13] "alumni_Y2015"        "award_Y2015"         "hici_Y2015"         
## [16] "ns_Y2015"            "pub_Y2015"           "pcp_Y2015"          
## [19] "improved"
# The 2015 world rank is one of the two values `improved` was computed from,
# so take it out of the predictor set.
r[["world_rank_Y2015"]] <- NULL

# Refit after world_rank_Y2015 has been removed from `r`.
tree <- rpart(formula = improved ~ ., data = r)
print(tree)
## n=379 (242 observations deleted due to missingness)
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 379 152 FALSE (0.59894459 0.40105541)  
##     2) pub_Y2015< 28.4 69   6 FALSE (0.91304348 0.08695652) *
##     3) pub_Y2015>=28.4 310 146 FALSE (0.52903226 0.47096774)  
##       6) world_rank_Y2005< 127 135  39 FALSE (0.71111111 0.28888889)  
##        12) award_Y2015< 11.2 47   3 FALSE (0.93617021 0.06382979) *
##        13) award_Y2015>=11.2 88  36 FALSE (0.59090909 0.40909091)  
##          26) ns_Y2015< 17.6 12   0 FALSE (1.00000000 0.00000000) *
##          27) ns_Y2015>=17.6 76  36 FALSE (0.52631579 0.47368421)  
##            54) pcp_Y2005>=30.8 32   8 FALSE (0.75000000 0.25000000) *
##            55) pcp_Y2005< 30.8 44  16 TRUE (0.36363636 0.63636364)  
##             110) ns_Y2005>=27.2 18   8 FALSE (0.55555556 0.44444444) *
##             111) ns_Y2005< 27.2 26   6 TRUE (0.23076923 0.76923077) *
##       7) world_rank_Y2005>=127 175  68 TRUE (0.38857143 0.61142857)  
##        14) pcp_Y2015< 20.15 112  50 FALSE (0.55357143 0.44642857)  
##          28) ns_Y2015< 7.55 23   1 FALSE (0.95652174 0.04347826) *
##          29) ns_Y2015>=7.55 89  40 TRUE (0.44943820 0.55056180)  
##            58) pub_Y2015< 38.45 64  28 FALSE (0.56250000 0.43750000)  
##             116) hici_Y2015< 6.2 17   1 FALSE (0.94117647 0.05882353) *
##             117) hici_Y2015>=6.2 47  20 TRUE (0.42553191 0.57446809)  
##               234) hici_Y2005>=14.65 19   5 FALSE (0.73684211 0.26315789) *
##               235) hici_Y2005< 14.65 28   6 TRUE (0.21428571 0.78571429) *
##            59) pub_Y2015>=38.45 25   4 TRUE (0.16000000 0.84000000) *
##        15) pcp_Y2015>=20.15 63   6 TRUE (0.09523810 0.90476190) *
# Plot the tree fitted without the 2015 world rank.
rpart.plot(
  tree,
  extra = 2,
  under = TRUE,
  varlen = 0,
  faclen = 0
)

# Predict on the fitting data once more and build the confusion matrix against
# the observed labels ("TRUE" is the positive class).
pred <- predict(object = tree, newdata = r, type = "class")
confusionMatrix(data = pred, reference = r[["improved"]], positive = "TRUE")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   205   32
##      TRUE     22  120
##                                           
##                Accuracy : 0.8575          
##                  95% CI : (0.8182, 0.8911)
##     No Information Rate : 0.5989          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7002          
##  Mcnemar's Test P-Value : 0.2207          
##                                           
##             Sensitivity : 0.7895          
##             Specificity : 0.9031          
##          Pos Pred Value : 0.8451          
##          Neg Pred Value : 0.8650          
##              Prevalence : 0.4011          
##          Detection Rate : 0.3166          
##    Detection Prevalence : 0.3747          
##       Balanced Accuracy : 0.8463          
##                                           
##        'Positive' Class : TRUE            
## 

Notes:

  • It is probably not good to have all 2015 variables as predictors. If you keep any of them, explain why.
  • You should probably add variables for change!!!
  • Add information from other sources (other rankings, additional data, etc.)
  • Do not use in-sample testing.