#' ---
#' title: "Project 1: University Rankings"
#' author: "Michael Hahsler"
#' output:
#'  html_document:
#'    toc: true
#' ---

#' Here I only look at the Times Higher Education World University Ranking.
#'
#' Load data
times <- read.csv("timesData.csv")
head(times)
summary(times)

#' # This needs some cleaning!
#'
#' Many numeric columns are read in as factors because they contain some
#' non-numeric characters (-, %, 'missing', etc.)
#'
#' ## Ranking variable
rnk <- as.character(times$world_rank)
unique(rnk)
cbind(rnk, as.numeric(rnk))

#' get rid of = and ranges (look up regular expressions!)
rnk <- sub(pattern = "=", "", rnk)
rnk <- sub(pattern = "-.*", "", rnk)
rnk <- as.numeric(rnk)
summary(rnk)

times$world_rank <- rnk

#' ##  Intl
intl <- as.character(times$international)
unique(intl)
intl[intl == '-'] <- NA
unique(intl)
intl <- as.numeric(intl)
summary(intl)

times$international <- intl

#' ## Students
ns <- as.character(times$num_students)
ns <- sub(pattern = ",", "", ns)
ns <- as.numeric(ns)
summary(ns)

times$num_students <- ns

#'  Clean the other variables as well!

tmp <- as.character(times$female_male_ratio)
tmp <- sub(pattern = " :.*", "", tmp)
tmp <- as.numeric(tmp)
times$female_male_ratio <- NULL

tmp <- as.character(times$international_students)
tmp <- sub(pattern = "%", "", tmp)
tmp <- as.numeric(tmp)
times$international_students <- tmp

tmp <- as.character(times$income)
tmp <- sub(pattern = "-", "", tmp)
tmp <- as.numeric(tmp)
times$income <- tmp

tmp <- as.character(times$total_score)
tmp <- sub(pattern = "-", "", tmp)
tmp <- as.numeric(tmp)
times$total_score <- tmp

summary(times)

#' # Some visualization

hist(times$world_rank)

#' __Note:__ This is weird!!!

hist(times$num_students, breaks = 100)
hist(times$international)


plot(times$world_rank, times$num_students)
plot(times$world_rank, times$num_students, log = "y")
plot(times$world_rank, times$international)


#' # Fit a (log) linear model on the data with rank > 10
plot(times$world_rank, times$teaching)

plot(times$world_rank, times$teaching, log ="x")

cor(times$world_rank, times$teaching)
cor(times$world_rank, times$teaching, method = "spearman")
cor(log(times$world_rank), times$teaching)

l <- lm(teaching ~ log(world_rank), data = times[times$world_rank>10,])
summary(l)

x <- 1:600
lines(x, predict(l, data.frame(world_rank=x)), col = "red")

plot(times$world_rank, times$teaching)
lines(x, predict(l, data.frame(world_rank=x)), col = "red")

plot(times$world_rank, times$research)
plot(times$world_rank, times$research, log = "x")


#' # More Analysis

sum(times$world_rank==1)
times[times$world_rank==1, ]

#' There is different year so we probalby need to factor this in!
hist(times[times$year==2016,]$world_rank)

#' OK, this makes more sense!!!
#'
#' What about SMU?
times[times$university_name=="Southern Methodist University",]


#' # Combine two rankings
cwur <- read.csv("cwurData.csv")
head(cwur)
summary(cwur)

#' this needs cleaning

#' Let us combine the rankings for 2015
times2015 <- times[times$year=="2015",]
summary(times2015$year)
cwur2015 <- cwur[cwur$year=="2015",]
summary(cwur2015$year)

#' meger does a database JOIN (by defines the keys and all
#' specifies that we want a FULL JOIN)
ranking <- merge(times2015, cwur2015,
  by.x="university_name", by.y="institution", all = TRUE)

plot(ranking$world_rank.x, ranking$world_rank.y,
  xlab = "Times Rank", ylab= "CWUR Rank")

#' # Do a bar plot for SMU
#'
#' Calculate average over all universities
m <- sapply(cwur2015[, -c(1:3, 14)], median, na.rm = TRUE)
oldpar <- par(mar = c(4,10,1,1))
barplot(m, las =2, horiz = TRUE, xlab = "Median Rank")
par(oldpar)

#' Add SMU
smu <- cwur2015[cwur2015$institution=="Southern Methodist University",-c(1:3, 14)]

data <- as.matrix(rbind(SMU = smu, Median = m))

oldpar <- par(mar = c(4,10,1,1))
#barplot(unlist(smu[,-(2:3)]), las =2, horiz = TRUE)
barplot(data, las =2, horiz = TRUE, beside = TRUE, xlab = "Median Rank")
par(oldpar)

#' _Note:_ Add a legend
#'
#' ## Look at the distribution in the ranking
plot(cwur2015$world_rank, cwur2015$national_rank)
plot(cwur2015$world_rank, cwur2015$national_rank, col = cwur2015$country)

l <- split(cwur2015, cwur2015$country)
plot(NA, xlim = c(0,1000), ylim=c(0,300), xlab = "World Rank", ylab="National Rank")
for(i in 1:length(l)) lines(l[[i]]$world_rank, l[[i]]$national_rank, col = i)

num_of_universities <- sapply(l, nrow)
plot(NA, xlim = c(0,1000), ylim=c(0,1), xlab = "World Rank", ylab="National Rank", main = "Country Comparison")
for(i in 1:length(l)) lines(l[[i]]$world_rank, l[[i]]$national_rank/num_of_universities[i], col = i)

#' Countries with more than 10 universities in the ranking and normalize ranks.
#' It would be better to normalize with the actual number of universities in the
#' country and not with the number of universities in the ranking.
countries <- which(num_of_universities>=10)
plot(NA, xlim = c(0,1), ylim=c(0,1), xlab = "Normalized World Rank",
  ylab="Normalized National Rank", main = "Country Comparison")
for(i in 1:length(countries)) lines(l[[countries[i]]]$world_rank/1000, l[[countries[i]]]$national_rank/num_of_universities[countries[i]], col = i, lty =i)
legend(x = "bottomright", legend = names(countries), col = 1:length(countries),lwd =1, lty = 1:length(countries), cex=.7)
abline(a = 0, b = 1, col ="grey", lwd = 3)

#' Country ranking (area under the courve would be better)
country_rnk <- sapply(1:length(countries),  function(i) mean((l[[countries[i]]]$national_rank/num_of_universities[countries[i]] - l[[countries[i]]]$world_rank/1000)))
names(country_rnk) <- names(countries)
country_rnk
sort(country_rnk, decreasing = TRUE)

#' # Compare countries using the total number of universities
#' from: [http://www.webometrics.info/en/node/54](http://www.webometrics.info/en/node/54)
#'
#' Note: This ranking only uses Google Scholar
universities_by_country <- read.csv("universities_per_country.csv")
head(universities_by_country)
rownames(universities_by_country) <- universities_by_country$COUNTRY

universities_by_country$TOP100 <- universities_by_country[,2]
universities_by_country$TOP500 <- universities_by_country[,2] + universities_by_country[,2] + universities_by_country[,3]


fraction100 <- universities_by_country$TOP100/universities_by_country$ALL
names(fraction100) <- rownames(universities_by_country)

oldpar <- par(mar = c(12,4,4,3))
barplot(sort(fraction100[fraction100>0], decreasing = TRUE), las=2, main = "Proportion of Universities in Top 100")
par(oldpar)

fraction500 <- universities_by_country$TOP500/universities_by_country$ALL
names(fraction500) <- rownames(universities_by_country)
oldpar <- par(mar = c(12,4,4,3))
barplot(sort(fraction500[fraction500>0], decreasing = TRUE), las=2, main = "Proportion of Universities in Top 500")
par(oldpar)

#' ## Look at correlation
library(seriation)
m <- na.omit(times2015)
rownames(m) <- m[,2]
m <- m[,-c(2,3,9,13)]
hmap(cor(m), margins=c(10,10))
hmap(abs(cor(m)), margins=c(10,10))

#' top 25 universities
hmap(scale(m[1:25,]), margins=c(10,20))
hmap(t(scale(m[1:25,])), margins=c(17,10), method = "OLO")


#' ## PCA
pr <- prcomp(scale(m))
plot(pr)
biplot(pr, col = c("grey", "red"), main = "All Universities (2015)")

pr <- prcomp(scale(m[1:10,]))
plot(pr)
biplot(pr, col = c("grey", "red"), xlim=c(-1,1), ylim=c(-.5,1), , main = "Top 10 (2015)")

pr <- prcomp(scale(m[1:50,]))
biplot(pr, col = c("grey", "red"), main = "Top 50 (2015)")