Here I only look at the Times Higher Education World University Ranking.
Load data
times <- read.csv("timesData.csv")
head(times)
## world_rank university_name
## 1 1 Harvard University
## 2 2 California Institute of Technology
## 3 3 Massachusetts Institute of Technology
## 4 4 Stanford University
## 5 5 Princeton University
## 6 6 University of Cambridge
## country teaching international research citations
## 1 United States of America 99.7 72.4 98.7 98.8
## 2 United States of America 97.7 54.6 98.0 99.9
## 3 United States of America 97.8 82.3 91.4 99.9
## 4 United States of America 98.3 29.5 98.1 99.2
## 5 United States of America 90.9 70.3 95.4 99.9
## 6 United Kingdom 90.5 77.7 94.1 94.0
## income total_score num_students student_staff_ratio
## 1 34.5 96.1 20,152 8.9
## 2 83.7 96.0 2,243 6.9
## 3 87.5 95.6 11,074 9.0
## 4 64.3 94.3 15,596 7.8
## 5 - 94.2 7,929 8.4
## 6 57.0 91.2 18,812 11.8
## international_students female_male_ratio year
## 1 25% 2011
## 2 27% 33 : 67 2011
## 3 33% 37 : 63 2011
## 4 22% 42 : 58 2011
## 5 27% 45 : 55 2011
## 6 34% 46 : 54 2011
summary(times)
## world_rank university_name
## 301-350: 248 Aarhus University : 6
## 601-800: 200 Arizona State University : 6
## 351-400: 198 Australian National University: 6
## 276-300: 104 Bielefeld University : 6
## 201-225: 103 Bilkent University : 6
## 226-250: 100 Birkbeck, University of London: 6
## (Other):1650 (Other) :2567
## country teaching international
## United States of America: 659 Min. : 9.9 20.7 : 10
## United Kingdom : 300 1st Qu.:24.7 29.6 : 10
## Germany : 152 Median :33.9 - : 9
## Australia : 117 Mean :37.8 34.3 : 9
## Canada : 108 3rd Qu.:46.4 46.8 : 9
## Japan : 98 Max. :99.7 48.4 : 9
## (Other) :1169 (Other):2547
## research citations income total_score
## Min. : 2.90 Min. : 1.20 - : 218 - :1402
## 1st Qu.:19.60 1st Qu.: 45.50 100.0 : 68 49.0 : 13
## Median :30.50 Median : 62.50 28.0 : 26 51.1 : 12
## Mean :35.91 Mean : 60.92 31.1 : 20 46.6 : 11
## 3rd Qu.:47.25 3rd Qu.: 79.05 28.8 : 19 46.9 : 10
## Max. :99.40 Max. :100.00 28.5 : 18 50.1 : 10
## (Other):2234 (Other):1145
## num_students student_staff_ratio international_students
## : 59 Min. : 0.60 7% : 142
## 10,221 : 6 1st Qu.: 11.97 10% : 133
## 10,410 : 6 Median : 16.10 9% : 130
## 10,441 : 6 Mean : 18.45 5% : 120
## 10,901 : 6 3rd Qu.: 21.50 8% : 119
## 10,930 : 6 Max. :162.60 12% : 104
## (Other):2514 NA's :59 (Other):1855
## female_male_ratio year
## : 233 Min. :2011
## 54 : 46: 185 1st Qu.:2013
## 52 : 48: 151 Median :2014
## 53 : 47: 138 Mean :2014
## 55 : 45: 135 3rd Qu.:2016
## 56 : 44: 132 Max. :2016
## (Other):1629
Many numeric columns are read in as factors because they contain some non-numeric characters (-, %, ‘missing’, etc.)
Ranking variable
rnk <- as.character(times$world_rank)
get rid of = and ranges (look up regular expressions!)
rnk <- sub(pattern = "=", "", rnk)
rnk <- sub(pattern = "-.*", "", rnk)
rnk <- as.numeric(rnk)
times$world_rank <- rnk
Intl
intl <- as.character(times$international)
intl[intl == '-'] <- NA
intl <- as.numeric(intl)
times$international <- intl
Students
ns <- as.character(times$num_students)
ns <- sub(pattern = ",", "", ns)
ns <- as.numeric(ns)
times$num_students <- ns
Clean the other variables as well!
tmp <- as.character(times$female_male_ratio)
tmp <- sub(pattern = " :.*", "", tmp)
tmp <- as.numeric(tmp)
## Warning: NAs introduced by coercion
times$female_male_ratio <- NULL
tmp <- as.character(times$international_students)
tmp <- sub(pattern = "%", "", tmp)
tmp <- as.numeric(tmp)
times$international_students <- tmp
tmp <- as.character(times$income)
tmp <- sub(pattern = "-", "", tmp)
tmp <- as.numeric(tmp)
times$income <- tmp
tmp <- as.character(times$total_score)
tmp <- sub(pattern = "-", "", tmp)
tmp <- as.numeric(tmp)
times$total_score <- tmp
summary(times)
## world_rank university_name
## Min. : 1 Aarhus University : 6
## 1st Qu.:109 Arizona State University : 6
## Median :201 Australian National University: 6
## Mean :235 Bielefeld University : 6
## 3rd Qu.:301 Bilkent University : 6
## Max. :601 Birkbeck, University of London: 6
## (Other) :2567
## country teaching international
## United States of America: 659 Min. : 9.9 Min. : 7.10
## United Kingdom : 300 1st Qu.:24.7 1st Qu.: 33.42
## Germany : 152 Median :33.9 Median : 50.30
## Australia : 117 Mean :37.8 Mean : 52.01
## Canada : 108 3rd Qu.:46.4 3rd Qu.: 69.00
## Japan : 98 Max. :99.7 Max. :100.00
## (Other) :1169 NA's :9
## research citations income total_score
## Min. : 2.90 Min. : 1.20 Min. : 24.20 Min. :41.40
## 1st Qu.:19.60 1st Qu.: 45.50 1st Qu.: 33.00 1st Qu.:50.30
## Median :30.50 Median : 62.50 Median : 41.00 Median :56.00
## Mean :35.91 Mean : 60.92 Mean : 48.98 Mean :59.85
## 3rd Qu.:47.25 3rd Qu.: 79.05 3rd Qu.: 59.00 3rd Qu.:66.20
## Max. :99.40 Max. :100.00 Max. :100.00 Max. :96.10
## NA's :218 NA's :1402
## num_students student_staff_ratio international_students
## Min. : 462 Min. : 0.60 Min. : 0.00
## 1st Qu.: 12638 1st Qu.: 11.97 1st Qu.: 8.00
## Median : 20851 Median : 16.10 Median :13.00
## Mean : 23874 Mean : 18.45 Mean :15.44
## 3rd Qu.: 29991 3rd Qu.: 21.50 3rd Qu.:21.00
## Max. :379231 Max. :162.60 Max. :82.00
## NA's :59 NA's :59 NA's :67
## year
## Min. :2011
## 1st Qu.:2013
## Median :2014
## Mean :2014
## 3rd Qu.:2016
## Max. :2016
##
data <- times[,c("university_name","teaching",
"research", "citations", "income",
"international", "num_students")]
data[,-1] <- scale(data[,-1])
kmeans does not like missing data!
data <- na.omit(data)
summary(data)
## university_name teaching
## Aarhus University : 6 Min. :-1.58493
## Birkbeck, University of London : 6 1st Qu.:-0.76127
## Boston University : 6 Median :-0.27843
## California Institute of Technology: 6 Mean :-0.04339
## Carnegie Mellon University : 6 3rd Qu.: 0.41459
## Delft University of Technology : 6 Max. : 3.51612
## (Other) :2298
## research citations income
## Min. :-1.4355 Min. :-2.58835 Min. :-1.169969
## 1st Qu.:-0.7709 1st Qu.:-0.69330 1st Qu.:-0.749760
## Median :-0.2687 Median : 0.04240 Median :-0.367323
## Mean :-0.0212 Mean :-0.02148 Mean : 0.007903
## 3rd Qu.: 0.4841 3rd Qu.: 0.75643 3rd Qu.: 0.496702
## Max. : 2.9871 Max. : 1.69367 Max. : 2.408889
##
## international num_students
## Min. :-2.03166 Min. :-1.32450
## 1st Qu.:-0.79205 1st Qu.:-0.63707
## Median :-0.02296 Median :-0.17554
## Mean : 0.03830 Mean :-0.00842
## 3rd Qu.: 0.81400 3rd Qu.: 0.33454
## Max. : 2.17123 Max. :20.10400
##
pairs(data[,-1])
km <- kmeans(data[,-1], centers = 3)
plot(data[,c("research", "num_students")], col = km$cluster)
pairs(data[,-1], col = km$cluster)
library("GGally")
data2 <- data[,-1]
data2$cluster <- as.factor(km$cluster)
ggpairs(data2, mapping = ggplot2::aes(color = cluster))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
data_tr <- data[, c("teaching", "research")]
plot(data_tr)
cor(data_tr)
## teaching research
## teaching 1.00000 0.91712
## research 0.91712 1.00000
km <- kmeans(data_tr, centers = 3)
plot(data_tr, col = km$cluster)
# do PCA
pr <- prcomp(data_tr)
biplot(pr)
plot(pr$x, col = km$cluster)
Note: Clustering does just do k-means discretization if we have a single point cloud.
You should probably check the clusterability of the data. # Possible ground truth
library(dbscan)
l <- lof(data[,-1])
hist(l)
data$university_name[order(l, decreasing = TRUE)[1:10]]
## [1] Anadolu University
## [2] Cairo University
## [3] University of Barcelona
## [4] Panthéon-Sorbonne University – Paris 1
## [5] University of South Africa
## [6] University of Naples Federico II
## [7] Wuhan University of Technology
## [8] Qatar University
## [9] Wuhan University of Technology
## [10] King Abdulaziz University
## 818 Levels: Aalborg University Aalto University ... Zhejiang University
data[order(l, decreasing = TRUE)[1:10],]
## university_name teaching research
## 2414 Anadolu University -1.4542820 -0.6262235
## 2431 Cairo University -1.0736914 -1.1672776
## 142 University of Barcelona -0.2102620 -0.1039886
## 2181 Panthéon-Sorbonne University – Paris 1 0.8747052 0.5499812
## 2563 University of South Africa -1.1020937 -1.1578679
## 2129 University of Naples Federico II -0.7442249 -0.9931993
## 1354 Wuhan University of Technology -1.3065902 -1.3225366
## 2541 Qatar University -1.5849325 -1.2049161
## 1803 Wuhan University of Technology -1.2270638 -1.3131270
## 2075 King Abdulaziz University -0.7896686 -1.1343438
## citations income international num_students
## 2414 -2.1679519 2.40888932 -1.7059238 20.1040003
## 2431 -2.1246116 -0.88196077 -0.9503984 11.7712077
## 142 1.3426116 -0.89612512 -1.3439955 1.3361231
## 2181 -2.3456471 -0.97638975 -0.1993972 1.0376384
## 2563 -2.2329623 -0.95278251 -1.0544528 9.8002241
## 2129 0.8832045 -0.07459296 -1.3982847 3.3819541
## 1354 0.7445156 0.45893080 -1.4978150 1.4927767
## 2541 -1.6738726 0.29368008 2.1667092 -0.8084862
## 1803 0.6448329 0.41643776 -1.3847124 1.4927767
## 2075 0.6795052 1.13881948 1.8545460 0.6797509
ignore number of students
l <- lof(data[,-c(1, 7)])
hist(l)
data$university_name[order(l, decreasing = TRUE)[1:10]]
## [1] Qatar University
## [2] École Normale Supérieure
## [3] Stanford University
## [4] King Abdulaziz University
## [5] Tilburg University
## [6] University of Basel
## [7] National Autonomous University of Mexico
## [8] Yuan Ze University
## [9] University of St Gallen
## [10] National Research Nuclear University MePhI
## 818 Levels: Aalborg University Aalto University ... Zhejiang University
data[order(l, decreasing = TRUE)[1:10],]
## university_name teaching research
## 2541 Qatar University -1.58493255 -1.2049161
## 1857 École Normale Supérieure 1.86310471 0.5546860
## 4 Stanford University 3.43659124 2.9259145
## 2075 King Abdulaziz University -0.78966860 -1.1343438
## 463 Tilburg University -0.86351454 0.6346679
## 311 University of Basel 0.06239992 -0.6591572
## 2260 National Autonomous University of Mexico 0.27825727 0.2018246
## 602 Yuan Ze University -1.53380844 -1.2378499
## 2161 University of St Gallen -0.68173992 -1.1625728
## 838 National Research Nuclear University MePhI -0.96008230 -1.1908017
## citations income international num_students
## 2541 -1.6738726 0.29368008 2.1667092 -0.8084862
## 1857 1.1345782 -0.56090223 1.5152382 -1.2148576
## 4 1.6589957 0.72333196 -1.0182600 -0.4683064
## 2075 0.6795052 1.13881948 1.8545460 0.6797509
## 463 -1.6175302 -0.61283817 0.7732852 -0.9220303
## 311 1.0869039 -0.01793557 1.8274014 -0.6405744
## 2260 -1.9989248 0.69500326 -0.7287173 6.4213953
## 602 -0.1136222 -0.93389671 -1.7737853 -0.8605343
## 2161 -0.3216555 -0.33899411 2.0626548 -0.9632162
## 838 1.6936679 0.50142384 -1.5023391 -0.9093011