Look at the Times Ranking
Use Cleaning From Project 1
Select some data for clustering
Create clusters
Cluster just teaching and research
Find the weirdest universities using an outlier score

Look at the Times Ranking

Here I only look at the Times Higher Education World University Ranking.

Load data

times <- read.csv("timesData.csv")
head(times)

##   world_rank                       university_name
## 1          1                    Harvard University
## 2          2    California Institute of Technology
## 3          3 Massachusetts Institute of Technology
## 4          4                   Stanford University
## 5          5                  Princeton University
## 6          6               University of Cambridge
##                    country teaching international research citations
## 1 United States of America     99.7          72.4     98.7      98.8
## 2 United States of America     97.7          54.6     98.0      99.9
## 3 United States of America     97.8          82.3     91.4      99.9
## 4 United States of America     98.3          29.5     98.1      99.2
## 5 United States of America     90.9          70.3     95.4      99.9
## 6           United Kingdom     90.5          77.7     94.1      94.0
##   income total_score num_students student_staff_ratio
## 1   34.5        96.1       20,152                 8.9
## 2   83.7        96.0        2,243                 6.9
## 3   87.5        95.6       11,074                 9.0
## 4   64.3        94.3       15,596                 7.8
## 5      -        94.2        7,929                 8.4
## 6   57.0        91.2       18,812                11.8
##   international_students female_male_ratio year
## 1                    25%                   2011
## 2                    27%           33 : 67 2011
## 3                    33%           37 : 63 2011
## 4                    22%           42 : 58 2011
## 5                    27%           45 : 55 2011
## 6                    34%           46 : 54 2011

summary(times)

##    world_rank                         university_name
##  301-350: 248   Aarhus University             :   6  
##  601-800: 200   Arizona State University      :   6  
##  351-400: 198   Australian National University:   6  
##  276-300: 104   Bielefeld University          :   6  
##  201-225: 103   Bilkent University            :   6  
##  226-250: 100   Birkbeck, University of London:   6  
##  (Other):1650   (Other)                       :2567  
##                      country        teaching    international 
##  United States of America: 659   Min.   : 9.9   20.7   :  10  
##  United Kingdom          : 300   1st Qu.:24.7   29.6   :  10  
##  Germany                 : 152   Median :33.9   -      :   9  
##  Australia               : 117   Mean   :37.8   34.3   :   9  
##  Canada                  : 108   3rd Qu.:46.4   46.8   :   9  
##  Japan                   :  98   Max.   :99.7   48.4   :   9  
##  (Other)                 :1169                  (Other):2547  
##     research       citations          income      total_score  
##  Min.   : 2.90   Min.   :  1.20   -      : 218   -      :1402  
##  1st Qu.:19.60   1st Qu.: 45.50   100.0  :  68   49.0   :  13  
##  Median :30.50   Median : 62.50   28.0   :  26   51.1   :  12  
##  Mean   :35.91   Mean   : 60.92   31.1   :  20   46.6   :  11  
##  3rd Qu.:47.25   3rd Qu.: 79.05   28.8   :  19   46.9   :  10  
##  Max.   :99.40   Max.   :100.00   28.5   :  18   50.1   :  10  
##                                   (Other):2234   (Other):1145  
##   num_students  student_staff_ratio international_students
##         :  59   Min.   :  0.60      7%     : 142          
##  10,221 :   6   1st Qu.: 11.97      10%    : 133          
##  10,410 :   6   Median : 16.10      9%     : 130          
##  10,441 :   6   Mean   : 18.45      5%     : 120          
##  10,901 :   6   3rd Qu.: 21.50      8%     : 119          
##  10,930 :   6   Max.   :162.60      12%    : 104          
##  (Other):2514   NA's   :59          (Other):1855          
##  female_male_ratio      year     
##         : 233      Min.   :2011  
##  54 : 46: 185      1st Qu.:2013  
##  52 : 48: 151      Median :2014  
##  53 : 47: 138      Mean   :2014  
##  55 : 45: 135      3rd Qu.:2016  
##  56 : 44: 132      Max.   :2016  
##  (Other):1629

Use Cleaning From Project 1

Many numeric columns are read in as factors because they contain some non-numeric characters (-, %, ‘missing’, etc.)

Ranking variable

rnk <- as.character(times$world_rank)

get rid of = and ranges (look up regular expressions!)

rnk <- sub(pattern = "=", "", rnk)
rnk <- sub(pattern = "-.*", "", rnk)
rnk <- as.numeric(rnk)
times$world_rank <- rnk

Intl

intl <- as.character(times$international)
intl[intl == '-'] <- NA
intl <- as.numeric(intl)
times$international <- intl

Students

ns <- as.character(times$num_students)
ns <- sub(pattern = ",", "", ns)
ns <- as.numeric(ns)
times$num_students <- ns

Clean the other variables as well!

tmp <- as.character(times$female_male_ratio)
tmp <- sub(pattern = " :.*", "", tmp)
tmp <- as.numeric(tmp)

## Warning: NAs introduced by coercion

times$female_male_ratio <- NULL

tmp <- as.character(times$international_students)
tmp <- sub(pattern = "%", "", tmp)
tmp <- as.numeric(tmp)
times$international_students <- tmp

tmp <- as.character(times$income)
tmp <- sub(pattern = "-", "", tmp)
tmp <- as.numeric(tmp)
times$income <- tmp

tmp <- as.character(times$total_score)
tmp <- sub(pattern = "-", "", tmp)
tmp <- as.numeric(tmp)
times$total_score <- tmp

summary(times)

##    world_rank                        university_name
##  Min.   :  1   Aarhus University             :   6  
##  1st Qu.:109   Arizona State University      :   6  
##  Median :201   Australian National University:   6  
##  Mean   :235   Bielefeld University          :   6  
##  3rd Qu.:301   Bilkent University            :   6  
##  Max.   :601   Birkbeck, University of London:   6  
##                (Other)                       :2567  
##                      country        teaching    international   
##  United States of America: 659   Min.   : 9.9   Min.   :  7.10  
##  United Kingdom          : 300   1st Qu.:24.7   1st Qu.: 33.42  
##  Germany                 : 152   Median :33.9   Median : 50.30  
##  Australia               : 117   Mean   :37.8   Mean   : 52.01  
##  Canada                  : 108   3rd Qu.:46.4   3rd Qu.: 69.00  
##  Japan                   :  98   Max.   :99.7   Max.   :100.00  
##  (Other)                 :1169                  NA's   :9       
##     research       citations          income        total_score   
##  Min.   : 2.90   Min.   :  1.20   Min.   : 24.20   Min.   :41.40  
##  1st Qu.:19.60   1st Qu.: 45.50   1st Qu.: 33.00   1st Qu.:50.30  
##  Median :30.50   Median : 62.50   Median : 41.00   Median :56.00  
##  Mean   :35.91   Mean   : 60.92   Mean   : 48.98   Mean   :59.85  
##  3rd Qu.:47.25   3rd Qu.: 79.05   3rd Qu.: 59.00   3rd Qu.:66.20  
##  Max.   :99.40   Max.   :100.00   Max.   :100.00   Max.   :96.10  
##                                   NA's   :218      NA's   :1402   
##   num_students    student_staff_ratio international_students
##  Min.   :   462   Min.   :  0.60      Min.   : 0.00         
##  1st Qu.: 12638   1st Qu.: 11.97      1st Qu.: 8.00         
##  Median : 20851   Median : 16.10      Median :13.00         
##  Mean   : 23874   Mean   : 18.45      Mean   :15.44         
##  3rd Qu.: 29991   3rd Qu.: 21.50      3rd Qu.:21.00         
##  Max.   :379231   Max.   :162.60      Max.   :82.00         
##  NA's   :59       NA's   :59          NA's   :67            
##       year     
##  Min.   :2011  
##  1st Qu.:2013  
##  Median :2014  
##  Mean   :2014  
##  3rd Qu.:2016  
##  Max.   :2016  
##

Select some data for clustering

data <- times[,c("university_name","teaching",
  "research", "citations", "income",
  "international", "num_students")]
data[,-1] <- scale(data[,-1])

kmeans does not like missing data!

data <- na.omit(data)
summary(data)

##                            university_name    teaching       
##  Aarhus University                 :   6   Min.   :-1.58493  
##  Birkbeck, University of London    :   6   1st Qu.:-0.76127  
##  Boston University                 :   6   Median :-0.27843  
##  California Institute of Technology:   6   Mean   :-0.04339  
##  Carnegie Mellon University        :   6   3rd Qu.: 0.41459  
##  Delft University of Technology    :   6   Max.   : 3.51612  
##  (Other)                           :2298                     
##     research         citations            income         
##  Min.   :-1.4355   Min.   :-2.58835   Min.   :-1.169969  
##  1st Qu.:-0.7709   1st Qu.:-0.69330   1st Qu.:-0.749760  
##  Median :-0.2687   Median : 0.04240   Median :-0.367323  
##  Mean   :-0.0212   Mean   :-0.02148   Mean   : 0.007903  
##  3rd Qu.: 0.4841   3rd Qu.: 0.75643   3rd Qu.: 0.496702  
##  Max.   : 2.9871   Max.   : 1.69367   Max.   : 2.408889  
##                                                          
##  international       num_students     
##  Min.   :-2.03166   Min.   :-1.32450  
##  1st Qu.:-0.79205   1st Qu.:-0.63707  
##  Median :-0.02296   Median :-0.17554  
##  Mean   : 0.03830   Mean   :-0.00842  
##  3rd Qu.: 0.81400   3rd Qu.: 0.33454  
##  Max.   : 2.17123   Max.   :20.10400  
##

pairs(data[,-1])

Create clusters

km <- kmeans(data[,-1], centers = 3)


plot(data[,c("research", "num_students")], col = km$cluster)

pairs(data[,-1], col = km$cluster)

library("GGally")

data2 <- data[,-1]
data2$cluster <- as.factor(km$cluster)
ggpairs(data2, mapping = ggplot2::aes(color = cluster))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Cluster just teaching and research

data_tr <- data[, c("teaching", "research")]
plot(data_tr)

cor(data_tr)

##          teaching research
## teaching  1.00000  0.91712
## research  0.91712  1.00000

km <- kmeans(data_tr, centers = 3)
plot(data_tr, col = km$cluster)

# do PCA
pr <- prcomp(data_tr)
biplot(pr)

plot(pr$x, col = km$cluster)

Note: Clustering does just do k-means discretization if we have a single point cloud.

You should probably check the clusterability of the data. # Possible ground truth

Group world rank into three groups and compare if k-means on the rest of the data produces a similar ranking

Find the weirdest universities using an outlier score

library(dbscan)
l <- lof(data[,-1])
hist(l)

data$university_name[order(l, decreasing = TRUE)[1:10]]

##  [1] Anadolu University                    
##  [2] Cairo University                      
##  [3] University of Barcelona               
##  [4] Panthéon-Sorbonne University – Paris 1
##  [5] University of South Africa            
##  [6] University of Naples Federico II      
##  [7] Wuhan University of Technology        
##  [8] Qatar University                      
##  [9] Wuhan University of Technology        
## [10] King Abdulaziz University             
## 818 Levels: Aalborg University Aalto University ... Zhejiang University

data[order(l, decreasing = TRUE)[1:10],]

##                             university_name   teaching   research
## 2414                     Anadolu University -1.4542820 -0.6262235
## 2431                       Cairo University -1.0736914 -1.1672776
## 142                 University of Barcelona -0.2102620 -0.1039886
## 2181 Panthéon-Sorbonne University – Paris 1  0.8747052  0.5499812
## 2563             University of South Africa -1.1020937 -1.1578679
## 2129       University of Naples Federico II -0.7442249 -0.9931993
## 1354         Wuhan University of Technology -1.3065902 -1.3225366
## 2541                       Qatar University -1.5849325 -1.2049161
## 1803         Wuhan University of Technology -1.2270638 -1.3131270
## 2075              King Abdulaziz University -0.7896686 -1.1343438
##       citations      income international num_students
## 2414 -2.1679519  2.40888932    -1.7059238   20.1040003
## 2431 -2.1246116 -0.88196077    -0.9503984   11.7712077
## 142   1.3426116 -0.89612512    -1.3439955    1.3361231
## 2181 -2.3456471 -0.97638975    -0.1993972    1.0376384
## 2563 -2.2329623 -0.95278251    -1.0544528    9.8002241
## 2129  0.8832045 -0.07459296    -1.3982847    3.3819541
## 1354  0.7445156  0.45893080    -1.4978150    1.4927767
## 2541 -1.6738726  0.29368008     2.1667092   -0.8084862
## 1803  0.6448329  0.41643776    -1.3847124    1.4927767
## 2075  0.6795052  1.13881948     1.8545460    0.6797509

ignore number of students

l <- lof(data[,-c(1, 7)])
hist(l)

data$university_name[order(l, decreasing = TRUE)[1:10]]

##  [1] Qatar University                          
##  [2] École Normale Supérieure                  
##  [3] Stanford University                       
##  [4] King Abdulaziz University                 
##  [5] Tilburg University                        
##  [6] University of Basel                       
##  [7] National Autonomous University of Mexico  
##  [8] Yuan Ze University                        
##  [9] University of St Gallen                   
## [10] National Research Nuclear University MePhI
## 818 Levels: Aalborg University Aalto University ... Zhejiang University

data[order(l, decreasing = TRUE)[1:10],]

##                                 university_name    teaching   research
## 2541                           Qatar University -1.58493255 -1.2049161
## 1857                   École Normale Supérieure  1.86310471  0.5546860
## 4                           Stanford University  3.43659124  2.9259145
## 2075                  King Abdulaziz University -0.78966860 -1.1343438
## 463                          Tilburg University -0.86351454  0.6346679
## 311                         University of Basel  0.06239992 -0.6591572
## 2260   National Autonomous University of Mexico  0.27825727  0.2018246
## 602                          Yuan Ze University -1.53380844 -1.2378499
## 2161                    University of St Gallen -0.68173992 -1.1625728
## 838  National Research Nuclear University MePhI -0.96008230 -1.1908017
##       citations      income international num_students
## 2541 -1.6738726  0.29368008     2.1667092   -0.8084862
## 1857  1.1345782 -0.56090223     1.5152382   -1.2148576
## 4     1.6589957  0.72333196    -1.0182600   -0.4683064
## 2075  0.6795052  1.13881948     1.8545460    0.6797509
## 463  -1.6175302 -0.61283817     0.7732852   -0.9220303
## 311   1.0869039 -0.01793557     1.8274014   -0.6405744
## 2260 -1.9989248  0.69500326    -0.7287173    6.4213953
## 602  -0.1136222 -0.93389671    -1.7737853   -0.8605343
## 2161 -0.3216555 -0.33899411     2.0626548   -0.9632162
## 838   1.6936679  0.50142384    -1.5023391   -0.9093011

Project 4: University Rankings

Michael Hahsler