#' ---
#' title: "Project 3: University Rankings"
#' author: "Michael Hahsler"
#' output:
#'  html_document:
#'    toc: true
#' ---


#' # Look at the Times Ranking
#'
#' Here I only look at the Times Higher Education World University Ranking.
#'
#' Load data
# Read the raw ranking table from the working directory.
times <- read.csv(file = "timesData.csv")
# Peek at the first rows, then at the per-column summaries, to spot columns
# that still need cleaning.
head(times)
summary(times)

#' # Use Cleaning From Project 1
#'
#' Many numeric columns are read in as factors (or as character strings in
#' R >= 4.0) because they contain some non-numeric characters
#' (-, %, 'missing', etc.)
#'
#' Ranking variable
# Work on the text form of the rank so the substitutions below apply
# regardless of how read.csv() typed the column.
rnk <- as.character(times[["world_rank"]])

#' get rid of = and ranges (look up regular expressions!)
# Drop the first literal "=" in each value.
rnk <- sub("=", "", rnk, fixed = TRUE)
# For ranges, keep only what precedes the first "-".
rnk <- sub("-.*", "", rnk)
rnk <- as.numeric(rnk)
times[["world_rank"]] <- rnk

#' International (`-` marks a missing value)
intl <- as.character(times[["international"]])
# A bare "-" is turned into NA before the numeric conversion.
intl <- replace(intl, which(intl == "-"), NA)
intl <- as.numeric(intl)
times[["international"]] <- intl

#' Students
# Convert the student counts from text with thousands separators to numbers.
ns <- as.character(times$num_students)
# gsub() removes every comma; sub() would strip only the first one, so a value
# with two separators (e.g. "1,234,567") would silently become NA below.
ns <- gsub(pattern = ",", replacement = "", ns, fixed = TRUE)
ns <- as.numeric(ns)
times$num_students <- ns

#' Clean the other variables as well!

# Keep only the part of the ratio that precedes " :" and convert it to a
# number; values that are not numeric after stripping become NA.
tmp <- as.character(times$female_male_ratio)
tmp <- sub(pattern = " :.*", "", tmp)
tmp <- as.numeric(tmp)
# Write the cleaned values back, like every other cleaned column. Assigning
# NULL here would delete the column and discard the work above.
times$female_male_ratio <- tmp

# Percentages: drop the first "%" sign, then convert to numeric.
tmp <- as.character(times[["international_students"]])
tmp <- as.numeric(sub("%", "", tmp, fixed = TRUE))
times[["international_students"]] <- tmp

# income and total_score get the same treatment: the first "-" is removed
# (a lone "-" becomes "", which as.numeric() maps to NA) and the result is
# converted to numeric.
for (col in c("income", "total_score")) {
  tmp <- as.character(times[[col]])
  tmp <- as.numeric(sub("-", "", tmp, fixed = TRUE))
  times[[col]] <- tmp
}

# Re-check the column summaries after cleaning.
summary(times)

#' # Create Transactions
library(arules)
library(arulesViz)
library(plotly)

#' Keep year as a nominal variable
# Converting year to a factor keeps it out of the numeric discretization
# below, so it enters the transactions as a nominal item.
times$year <- as.factor(times$year)

#' Discretize all other continuous variables
# vapply() always yields a logical vector (sapply()'s return type depends on
# its input), so which() gets a well-defined mask.
numeric_cols <- which(vapply(times, is.numeric, logical(1)))
# Equal-frequency binning into five labelled levels.
# NOTE(review): newer arules releases name the `categories` argument `breaks`;
# confirm against the installed version before renaming.
for (i in numeric_cols) {
  times[[i]] <- discretize(times[[i]], method = "frequency", categories = 5,
    labels = c("very low", "low", "average", "high", "very high"))
}

#' Get rid of total score (is derived) and university name
# Only these two columns are dropped; year stays in the data as a nominal
# variable.
times$total_score <- NULL
times$university_name <- NULL

summary(times)

# Every remaining column is now a factor, so each row can be coerced to a
# transaction.
trans <- as(times, "transactions")
trans

summary(trans)

#' # Mine Some Rules
#' What minimum support should we use?
# Number of transactions, and the relative support that corresponds to
# 10 transactions, as a guide for choosing the minimum support.
n_trans <- nrow(trans)
n_trans
10 / n_trans

rules <- apriori(trans, parameter = list(support = 0.004))

#' inspect rules
# Show the ten rules with the highest lift, then browse all rules in an
# interactive table.
top_by_lift <- head(rules, n = 10, by = "lift")
inspect(top_by_lift)
inspectDT(rules)

#' visualize rules (interactively)
plotly_arules(rules, jitter = 2)

#' # Look at Rules About China
# %pin% does partial matching on item labels: keep rules with an item whose
# label contains "China".
rules_china <- subset(rules, items %pin% "China")
rules_china
# Ten highest-lift rules among them, followed by the interactive plot.
top_china <- head(rules_china, n = 10, by = "lift")
inspect(top_china)
plotly_arules(rules_china, jitter = 1)


#' # Create a China Data Subset
# Restrict the transactions themselves to those containing an item whose
# label contains "China".
trans_china <- subset(trans, items %pin% "China")
itemFrequencyPlot(trans_china, topN = 20)

# Relative support that corresponds to 10 transactions in this subset.
n_china <- nrow(trans_china)
10 / n_china
# Re-mine on the subset only; this overwrites the earlier rules_china.
rules_china <- apriori(trans_china, parameter = list(support = 0.12))
top_china <- head(rules_china, n = 10, by = "lift")
inspect(top_china)
inspectDT(rules_china)

#' # Todo
#'
#' * Use more data (improvement, etc.)
#' * Compare countries? Datasets and rules?