#' ---
#' title: "Project 4: University Rankings"
#' author: "Michael Hahsler"
#' output:
#'  html_document:
#'    toc: true
#' ---


#' # Look at the Times Ranking
#'
#' Here I only look at the Times Higher Education World University Ranking.
#'
#' Load data
# Read text columns as plain character vectors on every R version (the
# read.csv() default changed from factors to characters in R 4.0), so the
# script behaves the same regardless of the installed R.
times <- read.csv("timesData.csv", stringsAsFactors = FALSE)

# Quick look at the first rows and at the per-column summaries.
head(times)
summary(times)

#' # Use Cleaning From Project 1
#'
#' Many numeric columns are read in as character strings (factors in R
#' versions before 4.0) because they contain some non-numeric characters
#' (-, %, 'missing', etc.)
#'
#' Ranking variable: drop the first `=` and everything from the first `-`
#' onwards (look up regular expressions!), then convert to a number.

# "=" is matched literally; "-.*" is a regular expression that cuts a range
# down to the part in front of the hyphen.
rnk <- sub("=", "", as.character(times$world_rank), fixed = TRUE)
rnk <- as.numeric(sub(pattern = "-.*", replacement = "", x = rnk))
times$world_rank <- rnk

#' International (`-` marks a missing value)
# Entries that consist of just "-" are turned into NA before the conversion,
# so as.numeric() does not have to coerce them.
intl <- as.character(times$international)
intl <- as.numeric(replace(intl, intl %in% "-", NA))
times$international <- intl

#' Students
# Remove every thousands separator before converting. sub() would only drop
# the first comma, so a value such as "1,234,567" would become NA.
ns <- as.character(times$num_students)
ns <- gsub(pattern = ",", replacement = "", x = ns, fixed = TRUE)
ns <- as.numeric(ns)
times$num_students <- ns

#' Clean the other variables as well!

# Keep only the text in front of the first " :" and convert it to a number.
# NOTE(review): presumably this is the female share of a ratio written like
# "33 : 67" -- confirm against the data.
tmp <- as.character(times$female_male_ratio)
tmp <- sub(pattern = " :.*", "", tmp)
tmp <- as.numeric(tmp)
# Store the cleaned values. Assigning NULL here would delete the column and
# throw away the conversion above.
times$female_male_ratio <- tmp

# Strip the first percent sign so the remaining text parses as a number.
tmp <- sub("%", "", as.character(times$international_students), fixed = TRUE)
tmp <- as.numeric(tmp)
times$international_students <- tmp

# Removing the first "-" turns an entry that is just "-" into "", which
# as.numeric() maps to NA.
tmp <- as.character(times$income)
tmp <- as.numeric(sub("-", "", tmp, fixed = TRUE))
times$income <- tmp

# Drop the first "-" (an entry of just "-" becomes "" and therefore NA), then
# convert the remaining text to numeric.
tmp <- sub("-", "", as.character(times$total_score), fixed = TRUE)
tmp <- as.numeric(tmp)
times$total_score <- tmp

# Inspect the table again after the conversions.
summary(times)


#' # Select some data for clustering

#' Keep the university name plus the numeric features used for clustering.
features <- c("teaching", "research", "citations", "income",
  "international", "num_students")
data <- times[, c("university_name", features)]

#' kmeans does not like missing data! Drop incomplete rows first so that the
#' standardization is computed on exactly the rows that get clustered.
data <- na.omit(data)

# Standardize every feature (mean 0, sd 1) so that variables on large scales,
# such as the number of students, do not dominate the distances.
data[, features] <- scale(data[, features])
summary(data)

pairs(data[, -1])

#' # Create clusters
# k-means starts from randomly chosen centers. Fix the seed so the clusters
# (and therefore the plot colors) are the same on every run, and use several
# random starts so a single poor initialization does not decide the result.
set.seed(1234)
km <- kmeans(data[, -1], centers = 3, nstart = 10)

# Color every observation by its cluster assignment.
plot(data[, c("research", "num_students")], col = km$cluster)
pairs(data[, -1], col = km$cluster)

library(GGally)

# Features without the name column, plus the cluster label as a factor so
# that ggpairs() treats it as a discrete color grouping.
data2 <- cbind(data[, -1], cluster = factor(km$cluster))
ggpairs(data2, mapping = ggplot2::aes(color = cluster))


#' # Cluster just teaching and research
# Restrict the data to the two variables of interest and look at how they
# relate to each other.
data_tr <- data[, c("teaching", "research")]
plot(data_tr)
cor(data_tr)

# k-means is randomized: fix the seed and use several random starts so the
# cluster assignment is reproducible and not tied to one initialization.
set.seed(1234)
km <- kmeans(data_tr, centers = 3, nstart = 10)
plot(data_tr, col = km$cluster)

# Principal component analysis of the two variables. The plot of the component
# scores reuses the k-means cluster labels as point colors.
pr <- prcomp(x = data_tr)
biplot(pr)
plot(pr[["x"]], col = km[["cluster"]])

#' _Note:_ If the data form a single point cloud, k-means merely discretizes
#' it into regions instead of finding real clusters.
#'
#' You should probably check the clusterability of the data.

#' # Possible ground truth
#'
#'  * Group world rank into three groups and check whether k-means on the rest of the data produces a similar grouping
#'


#' # Find the weirdest universities using an outlier score
library(dbscan)

# Local outlier factor computed on all features (name column excluded).
l <- lof(data[, -1])
hist(l)

# Positions of the ten largest scores, computed once and used for both the
# names and the full rows.
weirdest <- order(l, decreasing = TRUE)[1:10]
data$university_name[weirdest]
data[weirdest, ]

#' ignore number of students
# Exclude the name and the student count by column name instead of by
# position (-c(1, 7)), so the selection keeps working if columns are added
# or reordered.
lof_cols <- setdiff(names(data), c("university_name", "num_students"))
l <- lof(data[, lof_cols])
hist(l)

# head() returns at most ten positions, so no NA indices appear when the data
# has fewer than ten rows (which order(...)[1:10] would produce).
weirdest <- head(order(l, decreasing = TRUE), 10)
data$university_name[weirdest]
data[weirdest, ]