#' ---
#' title: "Project 2: University Rankings"
#' author: "Michael Hahsler"
#' output:
#'  html_document:
#'    toc: true
#' ---


#' # Look at the Shanghai Ranking over Time
#'
#' Can we predict if a university will improve its ranking over time?
shanghai <- read.csv("shanghaiData.csv")
summary(shanghai)

#' clean data

# Convert a rank column to numeric. Everything from the first "-" onwards is
# dropped before parsing (so a value such as "101-152" becomes 101);
# presumably these are rank ranges and we keep the lower bound.
# as.character() makes this work whether read.csv() delivered the column as
# a factor or as character.
clean_rank <- function(x) {
  as.numeric(sub(pattern = "-.*", replacement = "", x = as.character(x)))
}

shanghai$world_rank <- clean_rank(shanghai$world_rank)
summary(shanghai$world_rank)

shanghai$national_rank <- clean_rank(shanghai$national_rank)
summary(shanghai$national_rank)

#' split the data by university
l <- split(shanghai, shanghai$university_name)
# Number of records available per university. vapply() always yields an
# integer vector (sapply() would return a list if the split were empty).
years <- vapply(l, nrow, integer(1))
table(years)

#' pick two years for comparison
# For every university, place its 2005 and 2015 records side by side in one
# row. all = TRUE keeps universities that appear in only one of the two years
# (the columns of the missing year are then NA). Shared column names get the
# suffixes "_Y2005" / "_Y2015".
r <- lapply(l, FUN = function(x) {
  # %in% never returns NA, so records with a missing year are simply skipped
  # instead of producing all-NA rows.
  d_2005 <- x[x$year %in% 2005, ]
  d_2015 <- x[x$year %in% 2015, ]
  merge(d_2005, d_2015, by = "university_name",
    all = TRUE, suffixes = c("_Y2005", "_Y2015"))
})

# Stack the per-university results into a single data frame.
r <- do.call(rbind, r)

head(r)

#' calculate improvement
# Difference between the 2005 and the 2015 world rank; a positive value means
# the rank number got smaller by 2015.
improved <- r[["world_rank_Y2005"]] - r[["world_rank_Y2015"]]
hist(improved)
hist(improved, breaks = 100)

# Reduce the difference to a logical flag: a difference of 0 counts as not
# improved, and a missing difference stays NA.
improved <- improved > 0
table(improved, useNA = "always")


#' add improvement as the class variable (has to be a factor)
r[["improved"]] <- factor(improved)
dim(r)
summary(r)

#' # Do some decision trees
library(rpart)
library(rpart.plot)
library("caret")

#' ## Use all the data

# Fit a classification tree for `improved` using every other column of r.
tree <- rpart(formula = improved ~ ., data = r)
tree

rpart.plot(x = tree, extra = 2, under = TRUE, varlen = 0, faclen = 0)

#' Do in-sample testing (resubstitution error).
#' __NOTE:__ You should use a training and test sample.
# Predictions are made on the same rows the tree was fitted on.
pred <- predict(object = tree, newdata = r, type = "class")
confusionMatrix(data = pred, reference = r[["improved"]], positive = "TRUE")
#' __Note:__ Why is this bad?
#'

#' ## Without the university name
colnames(r)
# Drop the identifier column and the two year columns before refitting.
r <- r[setdiff(colnames(r), c("university_name", "year_Y2015", "year_Y2005"))]

tree <- rpart(formula = improved ~ ., data = r)
tree

rpart.plot(x = tree, extra = 2, under = TRUE, varlen = 0, faclen = 0)

# Still evaluated on the rows used for fitting.
pred <- predict(object = tree, newdata = r, type = "class")
confusionMatrix(data = pred, reference = r[["improved"]], positive = "TRUE")
#' __Note:__ Why is this bad, too?
#'
#' ## Remove 2015 World Rank
colnames(r)
# The class variable was computed from world_rank_Y2015, so take that column
# out of the predictors.
r[["world_rank_Y2015"]] <- NULL

tree <- rpart(formula = improved ~ ., data = r)
tree

rpart.plot(x = tree, extra = 2, under = TRUE, varlen = 0, faclen = 0)

# Still evaluated on the rows used for fitting.
pred <- predict(object = tree, newdata = r, type = "class")
confusionMatrix(data = pred, reference = r[["improved"]], positive = "TRUE")

#'
#' Notes:
#'
#' * It is probably not good to have all 2015 variables as predictors. If you have some, explain why.
#' * You should probably add variables for change!!!
#' * Add information from other sources (other rankings, additional data, etc.)
#' * Do not use in-sample testing.