# Set the default knitr chunk option `echo = TRUE`, so chunk source code is
# shown alongside its output.
knitr::opts_chunk$set(echo = TRUE)
This code shows how to extract JSON tables and perform primary data analysis tasks.
In this example we use data from https://www.topuniversities.com/university-rankings/world-university-rankings/2018
Start with Inspect > Network > Fetch/XHR > copy response.
library(rvest)
library(xml2)
library(jsonlite)
library(tidyverse)
library(ggrepel)
Gather the data with some wrangling.
# JSON endpoint that serves the 2018 ranking table.
XHR <- "https://www.topuniversities.com/sites/default/files/qs-rankings-data/en/357051.txt?rc4i6a"

# Parse the payload, take its `data` element, then keep only the columns
# used in the rest of the analysis.
res <- jsonlite::fromJSON(XHR)$data
res <- res[, c("rank_display", "score", "title", "country", "region")]
#' Split ranking-table title cells into website and university name
#'
#' Removes the `<div class="td-wrap"><a href="` prefix and the `</a></div>`
#' suffix from each cell, prepends the site root to what remains, and splits
#' the result at the first space into a URL and a name.
#'
#' @param s Character vector of raw HTML title cells.
#' @return A data frame with the columns `website`, `name` and `title`, where
#'   `title` is the unmodified input `s`.
sep <- function(s) {
  # Drop the wrapper markup around the link.
  inner <- str_remove(
    str_remove_all(s, '<div class=\"td-wrap\"><a href=\"'),
    '</a></div>'
  )

  # Prefix the site root, then turn the boundary between the href attribute
  # and the link text into a single space.
  url_and_name <- str_replace_all(
    paste0("https://www.topuniversities.com", inner),
    pattern = '\" class=\"uni-link\">',
    replacement = ' '
  )

  cells <- data.frame(x = url_and_name, title = s, stringsAsFactors = FALSE)

  # Only the first space separates URL from name; `extra = "merge"` keeps the
  # remaining spaces inside `name`.
  separate(
    cells, x, c("website", "name"),
    sep = " ", extra = "merge", fill = "left"
  )
}
# Parse every title cell with sep(). sapply() simplifies the per-title results
# into a matrix with one column per title, so it is transposed to get one row
# per university before being converted to a data frame.
res1 <- res$title %>%
  sapply(sep) %>%
  t() %>%
  as.data.frame(optional = TRUE)
rownames(res1) <- NULL

# Attach the parsed website/name to the raw table via the shared `title`
# column, keep the analysis columns, and drop rows whose score does not
# convert to a number.
results <- merge(res, res1, by = "title") %>%
  select(name, rank_display, score, country, region, website) %>%
  mutate(score = as.numeric(score)) %>%
  filter(!is.na(score))
Then we can create informative plots, such as the mean score per country.
# Horizontal bar chart of the mean score per country, filled by region.
results %>%
  group_by(country, region) %>%
  summarize(mean_score = mean(score)) %>%
  ungroup() %>%
  # Order the countries by their mean score so the bars come out sorted.
  mutate(country = reorder(country, mean_score)) %>%
  ggplot(aes(x = country, y = mean_score, fill = region)) +
  geom_col() +
  coord_flip() +
  xlab("") +
  theme(axis.text.y = element_text(size = 7))
We can also see how the score of each of the top 20 universities of 2018 changed by 2022.
# Twenty highest-scoring universities of 2018 (top_n() keeps ties on score).
# `name` and `website` are coerced to plain character so they are usable as
# join keys even if `results` holds them as list columns.
results_2018_top <- results %>%
  mutate(name = as.character(name), website = as.character(website)) %>%
  top_n(20, score)

# 2022 ranking data, obtained the same way as the 2018 data.
XHR_2 <- "https://www.topuniversities.com/sites/default/files/qs-rankings-data/en/3740566.txt?rdin9r"
res_1 <- jsonlite::fromJSON(XHR_2)$data[, c(
  "rank_display", "score", "title", "country", "region"
)]

# sep() works on the whole vector of title cells at once and returns a plain
# data frame (website, name, title) with character columns; going through
# sapply() + t() would instead turn every column into a list column.
res_2 <- sep(res_1$title)
results_2022 <- merge(res_1, res_2, by = "title")
results_2022 <- results_2022 %>%
  select(name, rank_display, score, country, region, website)
results_2022$score <- as.numeric(results_2022$score)
results_2022 <- results_2022 %>% filter(!is.na(score))

# Keep the 2018 top universities that also appear in the 2022 data and put
# both scores side by side (score.x = 2018, score.y = 2022).
re_wide <- inner_join(
  results_2018_top, results_2022,
  by = c("name", "region", "website", "country")
) %>%
  rename("2018" = score.x, "2022" = score.y) %>%
  select(name, "2018", "2022", country, region)

# Slope chart: one segment per university running from its 2018 score to its
# 2022 score, with the name on the 2018 side and the score values as labels
# at both ends.
re_wide %>%
  ggplot(aes(
    x = "2018", xend = "2022", y = `2018`, yend = `2022`,
    color = region, group = country
  )) +
  geom_segment() +
  geom_text_repel(
    aes(label = name, x = "2018", y = `2018`),
    size = 2, nudge_x = -.35
  ) +
  geom_label(
    aes(label = `2022`, x = "2022", y = `2022`),
    size = 2.5, nudge_x = 0.08
  ) +
  geom_label(
    aes(label = `2018`, x = "2018", y = `2018`),
    size = 2.5, nudge_x = -0.08
  ) +
  ylab("score") +
  xlab("year")