#' ---
#' title: "Project 1: University Rankings - Additional information"
#' author: "Michael Hahsler"
#' output:
#'  html_document:
#'    toc: true
#' ---
#'
#' Libraries DT (datatable) and plotly are for interactive displays
library("DT")
library("plotly")

#' # Attainment
attainment <- read.csv("educational_attainment_supplementary_data.csv")
head(attainment)
colnames(attainment)

#' Countries and series?
head(levels(attainment$country_name))
head(levels(attainment$series_name))

head(attainment[attainment$country_name =="United States",])

#' College age population
head(attainment[attainment$series_name =="Barro-Lee: Population in thousands, age 20-24, total",])

age <- attainment[attainment$series_name =="Barro-Lee: Population in thousands, age 20-24, total",]

age2 <- apply(age[,-(1:2)], MARGIN =1, mean, na.rm = TRUE)
age <- data.frame(country=age$country_name, college_age_pop = age2)

datatable(age) %>% formatRound(2,2)

#' Years in school might be an indicator of education level in a country
head(attainment[attainment$series_name =="UIS: Mean years of schooling of the population age 25+. Total",])

years <- attainment[attainment$series_name =="UIS: Mean years of schooling of the population age 25+. Total",]

matplot(t(years[,-(1:2)]), type = "l",)

years2 <- apply(years[,-(1:2)], MARGIN =1, mean, na.rm = TRUE)
years <- data.frame(country=years$country_name, years_of_education = years2)

merged <- cbind(years, college_age_pop = age[,2])

datatable(merged) %>% formatRound(2:3,2)

#' # Expenditure
expenditure <- read.csv("education_expenditure_supplementary_data.csv")
summary(expenditure)
head(expenditure)

expend <- apply(expenditure[4:9], MARGIN = 1, mean, na.rm = TRUE)
expend <- aggregate(expend, by = list(country = expenditure$country), sum, na.rm = TRUE)
colnames(expend)[2] <- "expenditure"

datatable(expend) %>% formatRound(2,2)

#' # Are expenditures and years of education related?
merged <- merge(expend, merged)

datatable(merged) %>% formatRound(2:4, 2)

#' Scatter plot
plot(merged[,2:3], xlim = c(0, 40), ylim =c(5,15))
text(merged[,2], merged[,3], labels = merged[,1], pos = 3,
  col = rgb(.5,0,0, alpha = .5))

#' Use only Country names
library(wordcloud)
merged_no_na <- na.omit(merged)
textplot(merged_no_na[,2], merged_no_na[,3], words = merged_no_na[,1],
  show.lines = FALSE,
  cex = merged_no_na[,4]/ max(merged_no_na[,4]) + .5,
  xlim = c(0, 35), ylim =c(6,14))

#' Is there a correlation?
cor <- cor(merged[,2:4], use = "pairwise")
cor
library(corrplot)
corrplot(cor, method = "ellipse")

cor.test(merged[,"expenditure"], merged[,"years_of_education"])


#' # Interactive plots
#'
#' (round first for display)
merged <- cbind(country = merged[,1], round(merged[,-1], 2))
plot_ly(merged, x = expenditure, y = years_of_education,
  text = country,
  mode = "text+markers", textposition = "top middle")

plot_ly(merged, x = expenditure, y = years_of_education,
  text = country, size = college_age_pop,
  mode = "markers")

