Here I only look at the Times Higher Education World University Ranking.
Load data
# Load the Times Higher Education ranking table. stringsAsFactors = TRUE is
# spelled out because R >= 4.0.0 no longer converts text columns to factors by
# default, while this analysis relies on factor columns (see the level counts
# printed by summary() for the text columns).
times <- read.csv("timesData.csv", stringsAsFactors = TRUE)
# Peek at the first rows to see which columns carry non-numeric markers.
head(times)
## world_rank university_name
## 1 1 Harvard University
## 2 2 California Institute of Technology
## 3 3 Massachusetts Institute of Technology
## 4 4 Stanford University
## 5 5 Princeton University
## 6 6 University of Cambridge
## country teaching international research citations
## 1 United States of America 99.7 72.4 98.7 98.8
## 2 United States of America 97.7 54.6 98.0 99.9
## 3 United States of America 97.8 82.3 91.4 99.9
## 4 United States of America 98.3 29.5 98.1 99.2
## 5 United States of America 90.9 70.3 95.4 99.9
## 6 United Kingdom 90.5 77.7 94.1 94.0
## income total_score num_students student_staff_ratio
## 1 34.5 96.1 20,152 8.9
## 2 83.7 96.0 2,243 6.9
## 3 87.5 95.6 11,074 9.0
## 4 64.3 94.3 15,596 7.8
## 5 - 94.2 7,929 8.4
## 6 57.0 91.2 18,812 11.8
## international_students female_male_ratio year
## 1 25% 2011
## 2 27% 33 : 67 2011
## 3 33% 37 : 63 2011
## 4 22% 42 : 58 2011
## 5 27% 45 : 55 2011
## 6 34% 46 : 54 2011
# Column-wise overview of the raw table; columns summarised as level counts
# instead of quantiles were read in as factors.
summary(times)
## world_rank university_name
## 301-350: 248 Aarhus University : 6
## 601-800: 200 Arizona State University : 6
## 351-400: 198 Australian National University: 6
## 276-300: 104 Bielefeld University : 6
## 201-225: 103 Bilkent University : 6
## 226-250: 100 Birkbeck, University of London: 6
## (Other):1650 (Other) :2567
## country teaching international
## United States of America: 659 Min. : 9.9 20.7 : 10
## United Kingdom : 300 1st Qu.:24.7 29.6 : 10
## Germany : 152 Median :33.9 - : 9
## Australia : 117 Mean :37.8 34.3 : 9
## Canada : 108 3rd Qu.:46.4 46.8 : 9
## Japan : 98 Max. :99.7 48.4 : 9
## (Other) :1169 (Other):2547
## research citations income total_score
## Min. : 2.90 Min. : 1.20 - : 218 - :1402
## 1st Qu.:19.60 1st Qu.: 45.50 100.0 : 68 49.0 : 13
## Median :30.50 Median : 62.50 28.0 : 26 51.1 : 12
## Mean :35.91 Mean : 60.92 31.1 : 20 46.6 : 11
## 3rd Qu.:47.25 3rd Qu.: 79.05 28.8 : 19 46.9 : 10
## Max. :99.40 Max. :100.00 28.5 : 18 50.1 : 10
## (Other):2234 (Other):1145
## num_students student_staff_ratio international_students
## : 59 Min. : 0.60 7% : 142
## 10,221 : 6 1st Qu.: 11.97 10% : 133
## 10,410 : 6 Median : 16.10 9% : 130
## 10,441 : 6 Mean : 18.45 5% : 120
## 10,901 : 6 3rd Qu.: 21.50 8% : 119
## 10,930 : 6 Max. :162.60 12% : 104
## (Other):2514 NA's :59 (Other):1855
## female_male_ratio year
## : 233 Min. :2011
## 54 : 46: 185 1st Qu.:2013
## 52 : 48: 151 Median :2014
## 53 : 47: 138 Mean :2014
## 55 : 45: 135 3rd Qu.:2016
## 56 : 44: 132 Max. :2016
## (Other):1629
Many numeric columns are read in as factors because they contain some non-numeric characters (-, %, ‘missing’, etc.)
Ranking variable
# Take a character copy of the rank column so it can be edited as text.
rnk <- as.character(times$world_rank)
Remove the "=" characters and reduce rank ranges such as "301-350" to their lower bound (see ?regex for help on regular expressions).
# Remove the first "=" from each rank, then drop everything from the first "-"
# onwards so that a range such as "301-350" is reduced to its lower bound.
rnk <- sub(pattern = "-.*", "", sub(pattern = "=", "", rnk))
# The remaining text is converted to numbers and written back to the table.
rnk <- as.numeric(rnk)
times[["world_rank"]] <- rnk
International score (the `international` column)
# Recode the "-" placeholder as missing, then store the score as a number.
intl <- as.character(times[["international"]])
is.na(intl) <- which(intl == "-")
intl <- as.numeric(intl)
times[["international"]] <- intl
Number of students (the `num_students` column)
# Student counts use "," as a thousands separator (e.g. "20,152"). gsub()
# removes every separator; sub() would only drop the first one, so a value with
# two commas such as "1,234,567" would silently become NA in as.numeric().
ns <- as.character(times$num_students)
ns <- gsub(pattern = ",", replacement = "", ns, fixed = TRUE)
ns <- as.numeric(ns)
times$num_students <- ns
Clean the other variables as well!
# Keep only the first number of strings such as "33 : 67" (presumably the
# female share) by deleting everything from " :" onwards, then convert it.
# NOTE(review): this result is never written back to `times`; `tmp` is
# overwritten further down and the female_male_ratio column is removed instead.
tmp <- as.character(times$female_male_ratio)
tmp <- sub(pattern = " :.*", "", tmp)
tmp <- as.numeric(tmp)
## Warning: NAs introduced by coercion
# The female/male ratio column is dropped from the table.
times$female_male_ratio <- NULL

# Share of international students is stored as text such as "25%": strip the
# percent sign and keep the number.
tmp <- as.character(times$international_students)
tmp <- sub(pattern = "%", "", tmp, fixed = TRUE)
times$international_students <- as.numeric(tmp)

# income and total_score use a lone "-" as their missing-value marker. Only
# cells that are exactly "-" are set to NA (the same rule that is applied to
# the `international` column); deleting the first "-" found anywhere in a cell
# would also strip the sign of a negative number.
for (col in c("income", "total_score")) {
  tmp <- as.character(times[[col]])
  tmp[tmp == "-"] <- NA
  times[[col]] <- as.numeric(tmp)
}

# Check the result: the cleaned columns should now get numeric summaries.
summary(times)
## world_rank university_name
## Min. : 1 Aarhus University : 6
## 1st Qu.:109 Arizona State University : 6
## Median :201 Australian National University: 6
## Mean :235 Bielefeld University : 6
## 3rd Qu.:301 Bilkent University : 6
## Max. :601 Birkbeck, University of London: 6
## (Other) :2567
## country teaching international
## United States of America: 659 Min. : 9.9 Min. : 7.10
## United Kingdom : 300 1st Qu.:24.7 1st Qu.: 33.42
## Germany : 152 Median :33.9 Median : 50.30
## Australia : 117 Mean :37.8 Mean : 52.01
## Canada : 108 3rd Qu.:46.4 3rd Qu.: 69.00
## Japan : 98 Max. :99.7 Max. :100.00
## (Other) :1169 NA's :9
## research citations income total_score
## Min. : 2.90 Min. : 1.20 Min. : 24.20 Min. :41.40
## 1st Qu.:19.60 1st Qu.: 45.50 1st Qu.: 33.00 1st Qu.:50.30
## Median :30.50 Median : 62.50 Median : 41.00 Median :56.00
## Mean :35.91 Mean : 60.92 Mean : 48.98 Mean :59.85
## 3rd Qu.:47.25 3rd Qu.: 79.05 3rd Qu.: 59.00 3rd Qu.:66.20
## Max. :99.40 Max. :100.00 Max. :100.00 Max. :96.10
## NA's :218 NA's :1402
## num_students student_staff_ratio international_students
## Min. : 462 Min. : 0.60 Min. : 0.00
## 1st Qu.: 12638 1st Qu.: 11.97 1st Qu.: 8.00
## Median : 20851 Median : 16.10 Median :13.00
## Mean : 23874 Mean : 18.45 Mean :15.44
## 3rd Qu.: 29991 3rd Qu.: 21.50 3rd Qu.:21.00
## Max. :379231 Max. :162.60 Max. :82.00
## NA's :59 NA's :59 NA's :67
## year
## Min. :2011
## 1st Qu.:2013
## Median :2014
## Mean :2014
## 3rd Qu.:2016
## Max. :2016
##
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
##
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Loading required package: grid
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:graphics':
##
## layout
Keep year as a nominal variable
# Turn year into a factor so the numeric discretization loop below skips it.
# NOTE(review): year is removed again further down (times$year <- NULL), so
# this conversion does not influence the mined rules.
times$year <- as.factor(times$year)
Discretize all other continuous variables
# Replace every numeric column by a five-level factor. method = "frequency"
# asks for bins that hold roughly the same number of rows each.
# vapply() guarantees a logical vector for which(), whatever the column count.
# NOTE(review): newer arules releases call this argument `breaks` and deprecate
# `categories` - confirm against the installed version before renaming.
bin_labels <- c("very low", "low", "average", "high", "very high")
for (i in which(vapply(times, is.numeric, logical(1)))) {
  times[[i]] <- discretize(times[[i]], method = "frequency", categories = 5,
    labels = bin_labels)
}
Get rid of total score (it is derived), university name, and year
# Remove total_score, university_name and year from the table, then review
# the columns that remain.
drop_cols <- c("total_score", "university_name", "year")
times <- times[setdiff(names(times), drop_cols)]
summary(times)
## world_rank country teaching
## very low :522 United States of America: 659 very low :523
## low :527 United Kingdom : 300 low :520
## average :554 Germany : 152 average :521
## high :601 Australia : 117 high :519
## very high:399 Canada : 108 very high:520
## Japan : 98
## (Other) :1169
## international research citations income
## very low :522 very low :524 very low :521 very low :478
## low :516 low :518 low :521 low :483
## average :519 average :522 average :520 average :474
## high :520 high :520 high :521 high :473
## very high:517 very high:519 very high:520 very high:477
## NA's : 9 NA's :218
##
## num_students student_staff_ratio international_students
## very low :509 very low :516 very low :632
## low :512 low :509 low :482
## average :508 average :504 average :453
## high :510 high :521 high :470
## very high:505 very high:494 very high:499
## NA's : 59 NA's : 59 NA's : 67
##
# Coerce the data frame to arules transactions: one transaction per row, with
# items labelled "column=level" (e.g. "world_rank=very low").
trans <- as(times, "transactions")
# Print the compact description (number of transactions and items).
trans
## transactions in sparse format with
## 2603 transactions (rows) and
## 117 items (columns)
# Density, most frequent items and transaction-length distribution.
summary(trans)
## transactions as itemMatrix in sparse format with
## 2603 rows (elements/itemsets/transactions) and
## 117 columns (items) and a density of 0.08411727
##
## most frequent items:
## country=United States of America international_students=very low
## 659 632
## world_rank=high world_rank=average
## 601 554
## world_rank=low (Other)
## 527 22645
##
## element (itemset/transaction) length distribution:
## sizes
## 6 7 8 9 10
## 13 46 6 210 2328
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.000 10.000 10.000 9.842 10.000 10.000
##
## includes extended item information - examples:
## labels variables levels
## 1 world_rank=very low world_rank very low
## 2 world_rank=low world_rank low
## 3 world_rank=average world_rank average
##
## includes extended transaction information - examples:
## transactionID
## 1 1
## 2 2
## 3 3
What minimum support should we use?
# Number of transactions, needed to express a count as a relative support.
nrow(trans)
## [1] 2603
# Relative support that corresponds to 10 transactions.
10/nrow(trans)
## [1] 0.003841721
# Mine rules with a support just above 10/nrow(trans), i.e. an absolute minimum
# count of 10 transactions; all other parameters keep their defaults (the log
# below reports confidence 0.8 and maxlen 10).
rules <- apriori(trans, parameter = list(support = 0.004))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.004 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 10
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[117 item(s), 2603 transaction(s)] done [0.00s].
## sorting and recoding items ... [81 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 done [0.03s].
## writing ... [20974 rule(s)] done [0.00s].
## creating S4 object ... done [0.01s].
inspect rules
# Print the ten rules with the highest lift.
inspect(head(rules, 10, by = "lift"))
## lhs rhs support confidence lift
## [1] {international=very low,
## num_students=very low,
## student_staff_ratio=average,
## international_students=very low} => {country=India} 0.004225893 0.8461538 73.41795
## [2] {income=very high,
## student_staff_ratio=low,
## international_students=very low} => {country=China} 0.004610065 0.9230769 28.94903
## [3] {international=low,
## research=low,
## income=low,
## num_students=very high,
## international_students=very low} => {country=Italy} 0.004994237 1.0000000 27.69149
## [4] {international=low,
## research=low,
## num_students=very high,
## student_staff_ratio=very high,
## international_students=very low} => {country=Italy} 0.006915098 1.0000000 27.69149
## [5] {teaching=low,
## international=low,
## num_students=very high,
## student_staff_ratio=very high,
## international_students=very low} => {country=Italy} 0.006530926 1.0000000 27.69149
## [6] {world_rank=high,
## international=low,
## num_students=very high,
## student_staff_ratio=very high,
## international_students=very low} => {country=Italy} 0.005378410 1.0000000 27.69149
## [7] {world_rank=high,
## teaching=low,
## num_students=very high,
## student_staff_ratio=very high,
## international_students=very low} => {country=Italy} 0.004225893 1.0000000 27.69149
## [8] {world_rank=high,
## international=low,
## research=low,
## citations=average,
## num_students=very high} => {country=Italy} 0.004610065 1.0000000 27.69149
## [9] {international=low,
## research=low,
## citations=average,
## num_students=very high,
## international_students=very low} => {country=Italy} 0.005762582 1.0000000 27.69149
## [10] {teaching=low,
## international=low,
## research=low,
## num_students=very high,
## student_staff_ratio=very high,
## international_students=very low} => {country=Italy} 0.005378410 1.0000000 27.69149
# Browse the full rule set in an interactive table; with this many rules the
# DT package warns that the data is large for client-side rendering.
inspectDT(rules)
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## http://rstudio.github.io/DT/server.html
visualize rules (interactively)
# Interactive scatter plot of the rules; jitter spreads overlapping points.
# Only the best 1000 rules by lift are drawn (see the warning below).
# NOTE(review): plotly_arules() appears to be deprecated in newer arulesViz in
# favour of plot(rules, engine = "plotly") - confirm for the installed version.
plotly_arules(rules, jitter = 2)
## Warning in .plotly_scatter(x, measure, shading, max = max, ...): too
## many rules supplied only plotting the best 1000 rules using lift (change
## parameter max if needed)
## Warning in stats::runif(length(x), -amount, amount): '.Random.seed' is not
## an integer vector but of type 'NULL', so ignored
# Keep the rules that contain an item whose label partially matches "China"
# (%pin% does partial matching), on either side of the rule.
rules_china <- subset(rules, items %pin% "China")
# Print how many rules were kept.
rules_china
## set of 234 rules
# Print the ten China-related rules with the highest lift.
inspect(head(rules_china, 10, by="lift"))
## lhs rhs support confidence lift
## [1] {income=very high,
## student_staff_ratio=low,
## international_students=very low} => {country=China} 0.004610065 0.9230769 28.949027
## [2] {international=very low,
## citations=very low,
## income=very high,
## num_students=very high} => {country=China} 0.006146754 0.8421053 26.409639
## [3] {international=very low,
## citations=very low,
## income=very high,
## num_students=very high,
## international_students=very low} => {country=China} 0.006146754 0.8421053 26.409639
## [4] {citations=very low,
## income=very high,
## num_students=very high} => {country=China} 0.006915098 0.8181818 25.659365
## [5] {citations=very low,
## income=very high,
## num_students=very high,
## international_students=very low} => {country=China} 0.006530926 0.8095238 25.387837
## [6] {country=China,
## teaching=low,
## citations=very low} => {world_rank=very high} 0.004225893 0.9166667 5.980159
## [7] {country=China,
## teaching=low,
## international_students=very low} => {world_rank=very high} 0.004225893 0.9166667 5.980159
## [8] {country=China,
## teaching=low,
## international=very low,
## international_students=very low} => {world_rank=very high} 0.004225893 0.9166667 5.980159
## [9] {country=China,
## teaching=low} => {world_rank=very high} 0.005378410 0.8750000 5.708333
## [10] {country=China,
## teaching=low,
## international=very low} => {world_rank=very high} 0.004610065 0.8571429 5.591837
# Interactive scatter plot of the China-related rules.
plotly_arules(rules_china, jitter = 1)
# Restrict the transactions themselves to those containing a "China" item.
trans_china <- subset(trans, items %pin% "China")
# Bar chart of the 20 most frequent items within that subset.
itemFrequencyPlot(trans_china, topN=20)
# Relative support that corresponds to 10 transactions in the subset.
10/nrow(trans_china)
## [1] 0.1204819
# Mine rules from the China-only transactions; this overwrites the earlier
# rules_china subset.
# NOTE(review): 0.12 is slightly below 10/nrow(trans_china) (0.1204819), so the
# absolute minimum support count reported below is 9 rather than 10.
rules_china <- apriori(trans_china, parameter = list(support = 0.12))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.12 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[42 item(s), 83 transaction(s)] done [0.00s].
## sorting and recoding items ... [31 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.00s].
## writing ... [898 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Print the ten highest-lift rules mined from the China-only transactions.
inspect(head(rules_china, 10, by ="lift"))
## lhs rhs support confidence lift
## [1] {world_rank=very low,
## income=very high} => {research=very high} 0.1445783 1 6.916667
## [2] {world_rank=very low,
## num_students=very high} => {research=very high} 0.1445783 1 6.916667
## [3] {world_rank=very low,
## teaching=very high,
## income=very high} => {research=very high} 0.1445783 1 6.916667
## [4] {world_rank=very low,
## teaching=very high,
## num_students=very high} => {research=very high} 0.1445783 1 6.916667
## [5] {world_rank=very low,
## income=very high,
## num_students=very high} => {research=very high} 0.1445783 1 6.916667
## [6] {world_rank=very low,
## country=China,
## income=very high} => {research=very high} 0.1445783 1 6.916667
## [7] {world_rank=very low,
## country=China,
## num_students=very high} => {research=very high} 0.1445783 1 6.916667
## [8] {world_rank=very low,
## teaching=very high,
## income=very high,
## num_students=very high} => {research=very high} 0.1445783 1 6.916667
## [9] {world_rank=very low,
## country=China,
## teaching=very high,
## income=very high} => {research=very high} 0.1445783 1 6.916667
## [10] {world_rank=very low,
## country=China,
## teaching=very high,
## num_students=very high} => {research=very high} 0.1445783 1 6.916667
# Browse the China-only rule set in an interactive table.
inspectDT(rules_china)