diff --git a/10.visualize-gender.Rmd b/10.visualize-gender.Rmd index 46c7d4c..3bec7dd 100644 --- a/10.visualize-gender.Rmd +++ b/10.visualize-gender.Rmd @@ -112,11 +112,31 @@ iscb_pubmed_sum %>% summarise(prob_female_avg = mean(mean_prob)) ``` - ### Supplementary Figure S1 {#sup_fig_s1} -Increasing trend of honorees who were women in each honor category, especially in the group of ISCB Fellows, which markedly increased after 2015. -```{r eval=FALSE} +Additional fig. 1 with separated keynote speakers and fellows + +```{r} +iscb_pubmed %>% + ungroup() %>% + mutate(type2 = case_when( + journal == 'ISCB Fellow' ~ 'ISCB Fellows', + type == 'Keynote speakers/Fellows' ~ 'Keynote speakers', + TRUE ~ 'Pubmed authors' + )) %>% + group_by(type2, year, gender) %>% + summarise( + mean_prob = mean(weighted_probs), + se_prob = sqrt(var(probabilities) * sum(weight^2)/(sum(weight)^2)), + me_prob = alpha_threshold * se_prob, + .groups = 'drop' + ) %>% + gender_breakdown('main', fct_rev(type2)) +``` + + + +```{r eval=FALSE, include=FALSE} # By conference: # fig_1d <- bind_rows(iscb_gender) %>% # gender_breakdown(category = 'sub', journal) + @@ -150,8 +170,9 @@ get_p <- function(inte, colu){ ```{r} iscb_lm <- iscb_pubmed %>% filter(gender == 'probability_female', !is.na(weighted_probs)) %>% - mutate(type = as.factor(type), - type = relevel(type, ref = 'Pubmed authors')) + mutate(type = as.factor(type)) %>% + mutate(type = relevel(type, ref = 'Pubmed authors'), + year = as.factor(year)) main_lm <- glm(type ~ year + weighted_probs, data = iscb_lm, family = 'binomial') @@ -160,15 +181,35 @@ summary(main_lm) ``` The two groups of scientists did not have a significant association with the gender predicted from fore names (_P_ = `r get_p(main_lm, 'p.value')`). -Interaction terms do not predict `probabilities` over and above the main effect of group of scientists and year. +Interaction terms do not predict `type` over and above the main effect of gender probability and year. 
```{r} -inte_lm <- glm(type ~ year * weighted_probs, - data = iscb_lm, family = 'binomial') +scaled_iscb <- iscb_lm +# scaled_iscb$s_prob <- scale(scaled_iscb$weighted_probs, scale = F) +# scaled_iscb$s_year <- scale(scaled_iscb$year, scale = F) +main_lm <- glm(type ~ year + weighted_probs, + data = scaled_iscb %>% mutate(year = as.factor(year)), + family = 'binomial') + +summary(main_lm) +inte_lm <- glm( + # type ~ scale(year, scale = F) * scale(weighted_probs, scale = F), + # type ~ s_year * s_prob, + type ~ year * weighted_probs, + data = scaled_iscb %>% mutate(year = as.factor(year)) + , + family = 'binomial') summary(inte_lm) anova(main_lm, inte_lm, test = 'Chisq') +mean(pull(filter(iscb_pubmed, gender == 'probability_female', !is.na(weighted_probs)), year)) # year is a factor in scaled_iscb (mean() would be NA), so average the unconverted column over the same rows +mean(scaled_iscb$weighted_probs) ``` +```{r} +# inte_lm <- glm(type ~ (year * weighted_probs), +# data = iscb_lm, +# family = 'binomial') +``` ```{r} sessionInfo() diff --git a/11.visualize-name-origins.Rmd b/11.visualize-name-origins.Rmd index f1a951f..4286182 100644 --- a/11.visualize-name-origins.Rmd +++ b/11.visualize-name-origins.Rmd @@ -169,49 +169,51 @@ ggsave('figs/region_breakdown.svg', fig_4, width = 6.7, height = 5.5) ```{r} iscb_lm <- iscb_pubmed_oth %>% - ungroup() %>% - mutate(year = c(scale(year(year))), - type = as.factor(type) %>% relevel(ref = 'Pubmed authors')) + ungroup() %>% + mutate( + # year = c(scale(year(year))), + year = as.factor(year), + type = as.factor(type) %>% relevel(ref = 'Pubmed authors')) main_lm <- function(regioni){ - glm(type ~ weighted_probs + year, - data = iscb_lm %>% - filter(region == regioni, !is.na(probabilities)) , - family = 'binomial') + glm(type ~ year + weighted_probs, + data = iscb_lm %>% + filter(region == regioni, !is.na(weighted_probs)) , + family = 'binomial') } inte_lm <- function(regioni){ glm(type ~ year * weighted_probs, data = iscb_lm %>% - filter(region == regioni, !is.na(weighted_probs)), + filter(region == regioni, !is.na(weighted_probs)), family = 'binomial') } main_list <- lapply(large_regions, main_lm) 
-inte_list <- lapply(large_regions, inte_lm) names(main_list) <- large_regions lapply(main_list, summary) -lapply(inte_list, summary) +inte_list <- lapply(large_regions, inte_lm) +lapply(inte_list, summary) for (i in 1:4){ print(anova(main_list[[i]], inte_list[[i]], test = 'Chisq')) } - ``` +Interaction terms do not predict `type` over and above the main effect of name origin probability and year (_p_ > 0.01). ```{r echo = F} get_p <- function(i, colu){ broom::tidy(main_list[[i]]) %>% filter(term == 'weighted_probs') %>% - pull(colu) %>% - sprintf("%0.5g", .) + pull(colu) } + +print_p <- function(x) sprintf("%0.5g", x) ``` ## Conclusion - -A name coming from the group of honorees has significantly higher probability of being Celtic/English, $\beta_\textrm{Celtic/English} =$ `r get_p(1, 'estimate')` (_P_ = `r get_p(1, 'p.value')`), and lower probability of being East Asian, $\beta_\textrm{East Asian} =$ `r get_p(2, 'estimate')` (_P_ = `r get_p(2, 'p.value')`). The two groups of scientists did not have a significant association with names predicted to be European and in Other categories (_P_ = `r get_p(3, 'p.value')` and _P_ = `r get_p(4, 'p.value')`, respectively). - - +A Celtic/English name has `r exp(get_p(1, 'estimate'))` the odds of being selected as an honoree, significantly higher compared to other names ($\beta_\textrm{Celtic/English} =$ `r print_p(get_p(1, 'estimate'))`, _P_ = `r print_p(get_p(1, 'p.value'))`). +An East Asian name has `r exp(get_p(2, 'estimate'))` the odds of being selected as an honoree, significantly lower compared to other names ($\beta_\textrm{East Asian} =$ `r print_p(get_p(2, 'estimate'))`, _P_ = `r print_p(get_p(2, 'p.value'))`). +The two groups of scientists did not have a significant association with names predicted to be European (_P_ = `r print_p(get_p(3, 'p.value'))`) or in Other categories (_P_ = `r print_p(get_p(4, 'p.value'))`). 
### Supplementary Figure S5 {#sup_fig_s5} It's difficult to come to a conclusion for other regions with so few data points and the imperfect accuracy of our prediction. diff --git a/14.us-name-origin.Rmd b/14.us-name-origin.Rmd index b46f3d0..7e6b080 100644 --- a/14.us-name-origin.Rmd +++ b/14.us-name-origin.Rmd @@ -121,47 +121,51 @@ ggsave('figs/us_name_origin.svg', fig_us_name_origin, width = 6.5, height = 5.5) ```{r} iscb_lm <- iscb_pubmed_oth %>% ungroup() %>% - mutate(year = c(scale(year)), - type = relevel(as.factor(type), ref = 'Pubmed authors')) + mutate( + # year = c(scale(year)), + year = as.factor(year), + type = relevel(as.factor(type), ref = 'Pubmed authors')) main_lm <- function(regioni){ glm(type ~ year + weighted_probs, data = iscb_lm %>% - filter(region == regioni, !is.na(weighted_probs)), + filter(region == regioni, !is.na(weighted_probs)), family = 'binomial') } inte_lm <- function(regioni){ - glm(type ~ year * weighted_probs, + glm(type ~ weighted_probs*year, data = iscb_lm %>% filter(region == regioni, !is.na(weighted_probs)), family = 'binomial') } main_list <- lapply(large_regions, main_lm) -inte_list <- lapply(large_regions, inte_lm) names(main_list) <- large_regions lapply(main_list, summary) -lapply(inte_list, summary) +inte_list <- lapply(large_regions, inte_lm) +lapply(inte_list, summary) for (i in 1:4){ print(anova(main_list[[i]], inte_list[[i]], test = 'Chisq')) } ``` +Interaction terms do not predict `type` over and above the main effect of name origin probability and year (_p_ > 0.01). ```{r echo = F} -get_p <- function(i, colu){ +get_exp <- function(i, colu){ broom::tidy(main_list[[i]]) %>% filter(term == 'weighted_probs') %>% - pull(colu) %>% - sprintf("%0.5g", .) + pull(colu) } + +print_p <- function(x) sprintf("%0.5g", x) ``` ## Conclusion -A name coming from the group of honorees has significantly lower probability of being East Asian, $\beta_\textrm{East Asian} =$ `r get_p(2, 'estimate')` (_P_ = `r get_p(2, 'p.value')`). 
The two groups of scientists did not have a significant association with names predicted to be Celtic/English (_P_ = `r get_p(1, 'p.value')`), European (_P_ = `r get_p(3, 'p.value')`), or in Other categories (_P_ = `r get_p(4, 'p.value')`). - +An East Asian name has `r exp(get_exp(2, 'estimate'))` the odds of being selected as an honoree, significantly lower compared to other names ($\beta_\textrm{East Asian} =$ `r print_p(get_exp(2, 'estimate'))`, _P_ = `r print_p(get_exp(2, 'p.value'))`). +The two groups of scientists did not have a significant association with names predicted to be Celtic/English (_P_ = `r print_p(get_exp(1, 'p.value'))`), European (_P_ = `r print_p(get_exp(3, 'p.value'))`), or in Other categories (_P_ = `r print_p(get_exp(4, 'p.value'))`). ## Supplement diff --git a/docs/091.draw-roc.html b/docs/091.draw-roc.html index 90ab6fe..c2826b5 100644 --- a/docs/091.draw-roc.html +++ b/docs/091.draw-roc.html @@ -13,19 +13,6 @@
library(tidyverse)
-# still need to install caret for the calibration function because tidymodels's
-# probably hasn't published this yet
-library(caret)
-
-source('utils/r-utils.R')
-theme_set(theme_bw())
roc_df <- read_tsv('https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/7c22d0a5f661ce5aeb785215095deda40973ff17/models/NamePrism_roc_curves.tsv') %>%
- rename('region' = category) %>%
- # recode_region_letter() %>%
- recode_region() %>%
- group_by(region) %>%
- mutate(Sensitivity = tpr, Specificity = 1-fpr, dSens = c(abs(diff(1-tpr)), 0)) %>%
- ungroup()
## ── Attaching packages ────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
+## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
+## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
+## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
+## ✓ readr 1.4.0 ✓ forcats 0.5.0
+## ── Conflicts ───────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
+## x dplyr::filter() masks stats::filter()
+## x dplyr::lag() masks stats::lag()
+# still need to install caret for the calibration function because tidymodels's
+# probably hasn't published this yet
+library(caret)
## Loading required package: lattice
+##
+## Attaching package: 'caret'
+## The following object is masked from 'package:purrr':
+##
+## lift
+
+roc_df <- read_tsv('https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/7c22d0a5f661ce5aeb785215095deda40973ff17/models/NamePrism_roc_curves.tsv') %>%
+ rename('region' = category) %>%
+ # recode_region_letter() %>%
+ recode_region() %>%
+ group_by(region) %>%
+ mutate(Sensitivity = tpr, Specificity = 1-fpr, dSens = c(abs(diff(1-tpr)), 0)) %>%
+ ungroup()
##
-## ── Column specification ──────────────────────────────────────────────────────────────────────────
+## ── Column specification ───────────────────────────────────────────────────────────────────────────────
## cols(
## fpr = col_double(),
## tpr = col_double(),
## threshold = col_double(),
## category = col_character()
## )
+## Warning: Problem with `mutate()` input `region`.
+## ℹ Unknown levels in `f`: OtherCategories
+## ℹ Input `region` is `fct_recode(...)`.
## Warning: Unknown levels in `f`: OtherCategories
-auc_df <- roc_df %>%
- group_by(region) %>%
- # add_count() %>%
- summarise(auc = sum((1 - fpr) * dSens),
- n = n()) %>%
- arrange(desc(auc)) %>%
- mutate(auc_pct = 100 * auc,
- reg_auc = paste0(region, ', AUC = ', round(auc_pct, 1), '%'))
-
-# region_levels <- c('Celtic English', 'European', 'East Asian', 'Hispanic', 'South Asian', 'Muslim', 'Israeli', 'African')
-region_levels <- paste(c('Celtic/English', 'European', 'East Asian', 'Hispanic', 'South Asian', 'Arabic', 'Hebrew', 'African', 'Nordic', 'Greek'), 'names')
-region_levels_let <- toupper(letters[1:8])
-region_cols <- c('#b3de69', '#fdb462', '#bc80bd', '#8dd3c7', '#fccde5', '#ffffb3', '#ccebc5', '#bebada', '#80b1d3', '#fb8072')
-
-fig_3a <- roc_df %>%
- left_join(auc_df, by = 'region') %>%
- ggplot(aes(x = Sensitivity, y = Specificity, color = fct_relevel(reg_auc, as.character(auc_df$reg_auc)))) +
- scale_color_manual(values = region_cols) +
- geom_step(size = 1, alpha = 0.8) +
- coord_fixed() +
- scale_x_reverse(breaks = seq(1, 0, -0.2), labels = scales::percent) +
- scale_y_continuous(breaks = seq(0, 1, 0.2), labels = scales::percent, limits = c(NA, 1.05)) +
- theme(legend.position = c(0.62, 0.42),
- legend.title = element_blank(),
- legend.text.align = 1,
- legend.text = element_text(size = 7),
- legend.margin = margin(-0.2, 0.2, 0.2, 0, unit='cm'))
predictions_df <- read_tsv('https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/7c22d0a5f661ce5aeb785215095deda40973ff17/data/NamePrism_results_test.tsv') %>%
- mutate(y_true = as.factor(truth)) %>%
- select(-truth)
auc_df <- roc_df %>%
+ group_by(region) %>%
+ # add_count() %>%
+ summarise(auc = sum((1 - fpr) * dSens),
+ n = n()) %>%
+ arrange(desc(auc)) %>%
+ mutate(auc_pct = 100 * auc,
+ reg_auc = paste0(region, ', AUC = ', round(auc_pct, 1), '%'))
## `summarise()` ungrouping output (override with `.groups` argument)
+# region_levels <- c('Celtic English', 'European', 'East Asian', 'Hispanic', 'South Asian', 'Muslim', 'Israeli', 'African')
+region_levels <- paste(c('Celtic/English', 'European', 'East Asian', 'Hispanic', 'South Asian', 'Arabic', 'Hebrew', 'African', 'Nordic', 'Greek'), 'names')
+region_levels_let <- toupper(letters[1:8])
+region_cols <- c('#b3de69', '#fdb462', '#bc80bd', '#8dd3c7', '#fccde5', '#ffffb3', '#ccebc5', '#bebada', '#80b1d3', '#fb8072')
+
+fig_3a <- roc_df %>%
+ left_join(auc_df, by = 'region') %>%
+ ggplot(aes(x = Sensitivity, y = Specificity, color = fct_relevel(reg_auc, as.character(auc_df$reg_auc)))) +
+ scale_color_manual(values = region_cols) +
+ geom_step(size = 1, alpha = 0.8) +
+ coord_fixed() +
+ scale_x_reverse(breaks = seq(1, 0, -0.2), labels = scales::percent) +
+ scale_y_continuous(breaks = seq(0, 1, 0.2), labels = scales::percent, limits = c(NA, 1.05)) +
+ theme(legend.position = c(0.62, 0.42),
+ legend.title = element_blank(),
+ legend.text.align = 1,
+ legend.text = element_text(size = 7),
+ legend.margin = margin(-0.2, 0.2, 0.2, 0, unit='cm'))
predictions_df <- read_tsv('https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/7c22d0a5f661ce5aeb785215095deda40973ff17/data/NamePrism_results_test.tsv') %>%
+ mutate(y_true = as.factor(truth)) %>%
+ select(-truth)
##
-## ── Column specification ──────────────────────────────────────────────────────────────────────────
+## ── Column specification ───────────────────────────────────────────────────────────────────────────────
## cols(
## African = col_double(),
## CelticEnglish = col_double(),
@@ -1752,21 +1825,25 @@ Name origin prediction method performance
## SouthAsian = col_double(),
## truth = col_character()
## )
-regs <- predictions_df %>% select(African:SouthAsian) %>% colnames()
-cal_dfs <- list()
-for (reg in regs) {
- pred_reg <- predictions_df %>%
- mutate(y_true_bin = as.factor((y_true == reg))) %>%
- rename(prob = reg) %>%
- select(y_true_bin, prob)
-
- cal_dfs[[reg]] <- calibration(y_true_bin ~ prob,
- data = pred_reg,
- cuts = 11,
- class = 'TRUE')$data %>%
- mutate(region = reg)
-}
-cal_dfs$EastAsian
regs <- predictions_df %>% select(African:SouthAsian) %>% colnames()
+cal_dfs <- list()
+for (reg in regs) {
+ pred_reg <- predictions_df %>%
+ mutate(y_true_bin = as.factor((y_true == reg))) %>%
+ rename(prob = reg) %>%
+ select(y_true_bin, prob)
+
+ cal_dfs[[reg]] <- calibration(y_true_bin ~ prob,
+ data = pred_reg,
+ cuts = 11,
+ class = 'TRUE')$data %>%
+ mutate(region = reg)
+}
## Note: Using an external vector in selections is ambiguous.
+## ℹ Use `all_of(reg)` instead of `reg` to silence this message.
+## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
+## This message is displayed once per session.
+
## calibModelVar bin Percent Lower Upper Count midpoint region
## 1 prob [0,0.0909] 0.973038 0.9061138 1.043559 777 4.545455 EastAsian
## 2 prob (0.0909,0.182] 12.715105 10.7555376 14.887108 133 13.636364 EastAsian
@@ -1779,65 +1856,73 @@ Name origin prediction method performance
## 9 prob (0.727,0.818] 61.421320 54.2394760 68.253900 121 77.272727 EastAsian
## 10 prob (0.818,0.909] 71.764706 66.6571343 76.488532 244 86.363636 EastAsian
## 11 prob (0.909,1] 97.209555 96.8524649 97.536348 8953 95.454545 EastAsian
-fig_3b <- bind_rows(cal_dfs) %>%
- recode_region() %>%
- ggplot(aes(x = midpoint/100, y = Percent/100, color = fct_relevel(region, as.character(auc_df$region)))) +
- geom_abline(slope = 1, linetype = 2, alpha = 0.5) +
- scale_y_continuous(labels = scales::percent_format(accuracy = 20L), breaks = seq(0, 1, 0.2), limits = c(-0.005, 1.045)) +
- scale_x_continuous(labels = scales::percent_format(accuracy = 20L), breaks = seq(0, 1, 0.2), limits = c(0, 1)) +
- coord_fixed() +
- geom_point() +
- geom_line() +
- scale_color_manual(values = region_cols) +
- theme(legend.position = 'None') +
- labs(x = 'Predicted probability', y = 'Fraction of names')
fig_3b <- bind_rows(cal_dfs) %>%
+ recode_region() %>%
+ ggplot(aes(x = midpoint/100, y = Percent/100, color = fct_relevel(region, as.character(auc_df$region)))) +
+ geom_abline(slope = 1, linetype = 2, alpha = 0.5) +
+ scale_y_continuous(labels = scales::percent_format(accuracy = 20L), breaks = seq(0, 1, 0.2), limits = c(-0.005, 1.045)) +
+ scale_x_continuous(labels = scales::percent_format(accuracy = 20L), breaks = seq(0, 1, 0.2), limits = c(0, 1)) +
+ coord_fixed() +
+ geom_point() +
+ geom_line() +
+ scale_color_manual(values = region_cols) +
+ theme(legend.position = 'None') +
+ labs(x = 'Predicted probability', y = 'Fraction of names')
## Warning: Problem with `mutate()` input `region`.
+## ℹ Unknown levels in `f`: OtherCategories
+## ℹ Input `region` is `fct_recode(...)`.
+## Warning: Unknown levels in `f`: OtherCategories
+n_obs <- sum(auc_df$n)
+short_regs <- auc_df$region %>%
+ as.character() %>%
+ gsub(' names', '', .)
+
+heat_dat <- predictions_df %>%
+ group_by(y_true) %>%
+ summarise_if(is.numeric, mean, na.rm = T) %>%
+ ungroup() %>%
+ pivot_longer(- y_true, names_to = 'region', values_to = 'pred_prob') %>%
+ recode_region() %>%
+ rename('reg_hat' = region, 'region' = y_true) %>%
+ recode_region() %>%
+ rename('y_true' = region, 'region' = reg_hat) %>%
+ left_join(auc_df, by = 'region') %>%
+ mutate(scale_pred_prob = log2((pred_prob)/(n/n_obs)),
+ region = region %>% gsub(' names', '', .) %>% fct_relevel(short_regs),
+ y_true = y_true %>% gsub(' names', '', .) %>% fct_relevel(short_regs))
## Warning: Problem with `mutate()` input `region`.
+## ℹ Unknown levels in `f`: OtherCategories
+## ℹ Input `region` is `fct_recode(...)`.
+## Warning: Unknown levels in `f`: OtherCategories
+## Warning: Problem with `mutate()` input `region`.
+## ℹ Unknown levels in `f`: OtherCategories
+## ℹ Input `region` is `fct_recode(...)`.
## Warning: Unknown levels in `f`: OtherCategories
-n_obs <- sum(auc_df$n)
-short_regs <- auc_df$region %>%
- as.character() %>%
- gsub(' names', '', .)
-
-heat_dat <- predictions_df %>%
- group_by(y_true) %>%
- summarise_if(is.numeric, mean, na.rm = T) %>%
- ungroup() %>%
- pivot_longer(- y_true, names_to = 'region', values_to = 'pred_prob') %>%
- recode_region() %>%
- rename('reg_hat' = region, 'region' = y_true) %>%
- recode_region() %>%
- rename('y_true' = region, 'region' = reg_hat) %>%
- left_join(auc_df, by = 'region') %>%
- mutate(scale_pred_prob = log2((pred_prob)/(n/n_obs)),
- region = region %>% gsub(' names', '', .) %>% fct_relevel(short_regs),
- y_true = y_true %>% gsub(' names', '', .) %>% fct_relevel(short_regs))
## Warning: Unknown levels in `f`: OtherCategories
-
-## Warning: Unknown levels in `f`: OtherCategories
-fig_3c <- ggplot(heat_dat, aes(y_true, region,
- fill = scale_pred_prob)) +
- geom_tile() +
- scale_fill_gradientn(
- colours = c("#3CBC75FF","white","#440154FF"),
- values = scales::rescale(
- c(min(heat_dat$scale_pred_prob),
- 0,
- max(heat_dat$scale_pred_prob)))
- ) +
- coord_fixed() +
- labs(x = 'True region', y = 'Predicted region', fill = bquote(log[2]~'FC')) +
- theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
- legend.position = 'top',
- legend.key.height = unit(0.2, 'cm'),
- legend.title = element_text(vjust = 1),
- legend.margin = margin(0, 0,0, -1, unit='cm'),
- axis.title.x = element_text(margin = margin(t = 27, r = 0, b = 0, l = 0)),
- axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0)))
-
-fig_3 <- cowplot::plot_grid(fig_3a, fig_3b, fig_3c, labels = 'AUTO', nrow = 1,
- rel_widths = c(2,2,1.6))
-fig_3
fig_3c <- ggplot(heat_dat, aes(y_true, region,
+ fill = scale_pred_prob)) +
+ geom_tile() +
+ scale_fill_gradientn(
+ colours = c("#3CBC75FF","white","#440154FF"),
+ values = scales::rescale(
+ c(min(heat_dat$scale_pred_prob),
+ 0,
+ max(heat_dat$scale_pred_prob)))
+ ) +
+ coord_fixed() +
+ labs(x = 'True region', y = 'Predicted region', fill = bquote(log[2]~'FC')) +
+ theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
+ legend.position = 'top',
+ legend.key.height = unit(0.2, 'cm'),
+ legend.title = element_text(vjust = 1),
+ legend.margin = margin(0, 0,0, -1, unit='cm'),
+ axis.title.x = element_text(margin = margin(t = 27, r = 0, b = 0, l = 0)),
+ axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0)))
+
+fig_3 <- cowplot::plot_grid(fig_3a, fig_3b, fig_3c, labels = 'AUTO', nrow = 1,
+ rel_widths = c(2,2,1.6))
+fig_3
##
-## ── Column specification ──────────────────────────────────────────────────────────────────────────
+## ── Column specification ───────────────────────────────────────────────────────────────────────────────
## cols(
## fore_name = col_character(),
## last_name = col_character(),
@@ -4251,32 +4308,33 @@ General data read-in
## last_name_simple = col_character(),
## full_name = col_character()
## )
-nat_to_reg <- world %>%
- select(- geometry) %>%
- as_tibble() %>%
- select(iso_a2, name, region_wb) %>%
- rename('countries' = iso_a2,
- 'country_name' = name,
- 'region' = region_wb) %>%
- mutate(country_name = country_name %>%
- gsub('United States of America', 'United States', .))
-nat_to_reg[nat_to_reg$country_name == 'Norway', 'countries'] <- 'NO'
-nat_to_reg[nat_to_reg$country_name == 'Somaliland', 'countries'] <- 'SO'
-nat_to_reg[nat_to_reg$country_name == 'France', 'countries'] <- 'FR'
-nat_to_reg %>% filter(is.na(countries))
## # A tibble: 4 x 3
+nat_to_reg <- world %>%
+ select(- geometry) %>%
+ as_tibble() %>%
+ select(iso_a2, name, region_wb) %>%
+ rename('countries' = iso_a2,
+ 'country_name' = name,
+ 'region' = region_wb) %>%
+ mutate(country_name = country_name %>%
+ gsub('United States of America', 'United States', .))
+nat_to_reg[nat_to_reg$country_name == 'Norway', 'countries'] <- 'NO'
+nat_to_reg[nat_to_reg$country_name == 'Somaliland', 'countries'] <- 'SO'
+nat_to_reg[nat_to_reg$country_name == 'France', 'countries'] <- 'FR'
+nat_to_reg %>% filter(is.na(countries))
+## # A tibble: 5 x 3
## countries country_name region
## <chr> <chr> <chr>
-## 1 <NA> N. Cyprus Europe & Central Asia
-## 2 <NA> Indian Ocean Ter. East Asia & Pacific
-## 3 <NA> Ashmore and Cartier Is. East Asia & Pacific
-## 4 <NA> Siachen Glacier South Asia
-
+## 1 <NA> Ashmore and Cartier Is. East Asia & Pacific
+## 2 <NA> N. Cyprus Europe & Central Asia
+## 3 <NA> Indian Ocean Ter. East Asia & Pacific
+## 4 <NA> Siachen Glacier South Asia
+## 5 <NA> Kosovo Europe & Central Asia
+articles <- readr::read_tsv('data/pubmed/articles.tsv.xz') %>%
+ mutate(year = substr(publication_date, 1, 4) %>% ymd(truncated = 2),
+ publication_date = ymd(publication_date, truncated = 2)) %>%
+ filter(year(publication_date) < 2020)
##
-## ── Column specification ──────────────────────────────────────────────────────────────────────────
+## ── Column specification ───────────────────────────────────────────────────────────────────────────────
## cols(
## pmid = col_double(),
## pmcid = col_character(),
@@ -4286,21 +4344,21 @@ General data read-in
## pmc_cited_by_count = col_double(),
## title = col_character()
## )
-# citations <- xml2::read_xml('data/pubmed/esummary/compbio-english.xml.xz')
-
-corr_authors <- readr::read_tsv(
- 'data/names/corresponding-authors.tsv.xz',
- col_types = readr::cols(fore_name_simple = readr::col_character())) %>%
- inner_join(articles, by = 'pmid') %>%
- mutate(adjusted_citations = sqrt(pmc_cited_by_count + 1))
-
-keynotes <- readr::read_tsv('data/iscb/keynotes.tsv') %>%
- mutate(publication_date = ymd(year, truncated = 2),
- year = ymd(year, truncated = 2)) %>%
- left_join(select(all_full_names, - full_name), by = c('fore_name', 'last_name')) %>%
- filter(year(year) < 2020, conference != 'PSB') # remove PSB, exclude ISCB Fellows and ISMB speakers in 2020 for now
# citations <- xml2::read_xml('data/pubmed/esummary/compbio-english.xml.xz')
+
+corr_authors <- readr::read_tsv(
+ 'data/names/corresponding-authors.tsv.xz',
+ col_types = readr::cols(fore_name_simple = readr::col_character())) %>%
+ inner_join(articles, by = 'pmid') %>%
+ mutate(adjusted_citations = sqrt(pmc_cited_by_count + 1))
+
+keynotes <- readr::read_tsv('data/iscb/keynotes.tsv') %>%
+ mutate(publication_date = ymd(year, truncated = 2),
+ year = ymd(year, truncated = 2)) %>%
+ left_join(select(all_full_names, - full_name), by = c('fore_name', 'last_name')) %>%
+ filter(year(year) < 2020, conference != 'PSB') # remove PSB, exclude ISCB Fellows and ISMB speakers in 2020 for now
##
-## ── Column specification ──────────────────────────────────────────────────────────────────────────
+## ── Column specification ───────────────────────────────────────────────────────────────────────────────
## cols(
## year = col_double(),
## full_name = col_character(),
@@ -4311,22 +4369,22 @@ General data read-in
## affiliations = col_character(),
## afflcountries = col_character()
## )
-
+
## # A tibble: 0 x 11
## # … with 11 variables: year <date>, full_name <chr>, fore_name <chr>, last_name <chr>,
-## # conference <chr>, source <chr>, affiliations <chr>, afflcountries <chr>,
-## # publication_date <date>, fore_name_simple <chr>, last_name_simple <chr>
-large_jours <- articles %>%
- count(journal, sort = T) %>%
- head(10)
-
-nationalize_df <- read_tsv('https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/6ab0feeca430ae9997dbaf8f81707359be50a17d/data/NamePrism_results_authors.tsv') %>%
- rename('full_name' = X1) %>%
- distinct(full_name, .keep_all = T) %>%
- left_join(all_full_names, by = 'full_name')
large_jours <- articles %>%
+ count(journal, sort = T) %>%
+ head(10)
+
+nationalize_df <- read_tsv('https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/6ab0feeca430ae9997dbaf8f81707359be50a17d/data/NamePrism_results_authors.tsv') %>%
+ rename('full_name' = X1) %>%
+ distinct(full_name, .keep_all = T) %>%
+ left_join(all_full_names, by = 'full_name')
## Warning: Missing column names filled in: 'X1' [1]
##
-## ── Column specification ──────────────────────────────────────────────────────────────────────────
+## ── Column specification ───────────────────────────────────────────────────────────────────────────────
## cols(
## X1 = col_character(),
## African = col_double(),
@@ -4344,21 +4402,21 @@ General data read-in
Number of articles from 1993-2019: 176773 (~ 100 articles with no authors).
Number of last authors: 176609.
-corr_authors %>%
- count(year, name = 'Number of articles with last authors') %>%
- DT::datatable(rownames = F)
-
-
+corr_authors %>%
+ count(year, name = 'Number of articles with last authors') %>%
+ DT::datatable(rownames = F)
+
+
If we set a threshold at least 200 articles a year, we should only consider articles from 1998 on.
-corr_authors <- corr_authors %>%
- add_count(year, name = 'n_aut_yr') %>%
- filter(n_aut_yr > 200) %>%
- select(- n_aut_yr)
-
-nrow(corr_authors)
+corr_authors <- corr_authors %>%
+ add_count(year, name = 'n_aut_yr') %>%
+ filter(n_aut_yr > 200) %>%
+ select(- n_aut_yr)
+
+nrow(corr_authors)
## [1] 176110
-
+
Number of honorees up until 2019: 412.
Three types of honorees:
## .
## ISCB Fellow ISMB RECOMB
## 75 167 170
There are 412 entries and 290 unique names in the ISCB cohort.
Names sorted by the number of honors:
- - - + + +Number of keynote speakers/fellows across years:
-keynotes %>%
- select(year, conference) %>%
- count(year, conference) %>%
- ggplot(aes(x = year(year), y = n, color = conference)) +
- geom_point() +
- geom_line(alpha = 0.5) +
- coord_cartesian(ylim = c(0, 13)) +
- scale_x_continuous(breaks = seq(1995, 2019, 5)) +
- scale_y_continuous(breaks = seq(0, 14, 2)) +
- scale_color_viridis_d() +
- labs(x = 'Year', y = 'Number of keynote speakers/fellows')
keynotes %>%
+ select(year, conference) %>%
+ count(year, conference) %>%
+ ggplot(aes(x = year(year), y = n, color = conference)) +
+ geom_point() +
+ geom_line(alpha = 0.5) +
+ coord_cartesian(ylim = c(0, 13)) +
+ scale_x_continuous(breaks = seq(1995, 2019, 5)) +
+ scale_y_continuous(breaks = seq(0, 14, 2)) +
+ scale_color_viridis_d() +
+ labs(x = 'Year', y = 'Number of keynote speakers/fellows')
Total number of last authors: 176110.
10 journals with the most computational biology articles:
-corr_authors %>%
- mutate(publication_date = ymd(publication_date, truncated = 2)) %>%
- select(publication_date, journal) %>%
- filter(journal %in% large_jours$journal) %>%
- ggplot(aes(x = publication_date, fill = forcats::fct_infreq(journal))) +
- geom_histogram(size = 0, bins = 27) +
- scale_fill_brewer(palette = 'Set3') +
- theme(legend.position = c(0.2, 0.7)) +
- labs(x = NULL, y = NULL) +
-scale_x_date(
- labels = scales::date_format("%Y"),
- breaks = as.Date(c('2000-01-01', '2010-01-01', '2019-01-01')),
- limits = c(as.Date('1993-01-01'), as.Date('2019-12-31'))) +
- # geom_text(data = jours, aes(x = x, y = y, label = label), color = 'grey10') +
- NULL
corr_authors %>%
+ mutate(publication_date = ymd(publication_date, truncated = 2)) %>%
+ select(publication_date, journal) %>%
+ filter(journal %in% large_jours$journal) %>%
+ ggplot(aes(x = publication_date, fill = forcats::fct_infreq(journal))) +
+ geom_histogram(size = 0, bins = 27) +
+ scale_fill_brewer(palette = 'Set3') +
+ theme(legend.position = c(0.2, 0.7)) +
+ labs(x = NULL, y = NULL) +
+scale_x_date(
+ labels = scales::date_format("%Y"),
+ breaks = as.Date(c('2000-01-01', '2010-01-01', '2019-01-01')),
+ limits = c(as.Date('1993-01-01'), as.Date('2019-12-31'))) +
+ # geom_text(data = jours, aes(x = x, y = y, label = label), color = 'grey10') +
+ NULL
## Warning: Removed 20 rows containing missing values (geom_bar).
-
-nucleic_acids <- corr_authors %>% filter(grepl('Nucleic Acids Res', journal))
-table(nucleic_acids$journal)
nucleic_acids <- corr_authors %>% filter(grepl('Nucleic Acids Res', journal))
+table(nucleic_acids$journal)
##
## Nucleic Acids Res Nucleic Acids Res Suppl
## 3419 5
##
-## ── Column specification ──────────────────────────────────────────────────────────────────────────
+## ── Column specification ───────────────────────────────────────────────────────────────────────────────
## cols(
## fore_name_simple = col_character(),
## n_authors = col_double(),
@@ -4315,69 +4372,68 @@ Gender analysis
## query_date = col_date(format = ""),
## probability_male = col_double()
## )
-pubmed_gender_pmids <- corr_authors %>%
- left_join(gender_df, by = 'fore_name_simple')
-
-iscb_gender_df <- keynotes %>%
- left_join(gender_df, by = 'fore_name_simple')
pubmed_gender_pmids %>%
- mutate(genderized = ifelse(is.na(probability_male), 'NOT genderized', 'Genderized')) %>%
- ggplot(aes(year(year), fill = genderized)) +
- geom_bar() +
- scale_fill_viridis_d() +
- theme(legend.position = c(0.2, 0.8)) +
- ylab('Number of full names')
gender_check <- pubmed_gender_pmids %>%
- mutate(got_gender = case_when(
- is.na(probability_male) ~ 'Gender not predicted',
- TRUE ~ 'Gender predicted'))
-
-pred_all <- pubmed_gender_pmids %>%
- count(is.na(probability_male))
-
-dash_df <- pubmed_gender_pmids %>%
- filter(is.na(probability_male) & !is.na(fore_name_simple)) %>%
- mutate(dashed_name = grepl('-', fore_name_simple)) %>%
- count(dashed_name)
-
-pred_before_2002 <- gender_check %>%
- filter(year(year) < 2002) %>%
- count(is.na(probability_male), is.na(fore_name_simple))
pubmed_gender_pmids <- corr_authors %>%
+ left_join(gender_df, by = 'fore_name_simple')
+
+iscb_gender_df <- keynotes %>%
+ left_join(gender_df, by = 'fore_name_simple')
pubmed_gender_pmids %>%
+ mutate(genderized = ifelse(is.na(probability_male), 'NOT genderized', 'Genderized')) %>%
+ ggplot(aes(year(year), fill = genderized)) +
+ geom_bar() +
+ scale_fill_viridis_d() +
+ theme(legend.position = c(0.2, 0.8)) +
+ ylab('Number of full names')
gender_check <- pubmed_gender_pmids %>%
+ mutate(got_gender = case_when(
+ is.na(probability_male) ~ 'Gender not predicted',
+ TRUE ~ 'Gender predicted'))
+
+pred_all <- pubmed_gender_pmids %>%
+ count(is.na(probability_male))
+
+dash_df <- pubmed_gender_pmids %>%
+ filter(is.na(probability_male) & !is.na(fore_name_simple)) %>%
+ mutate(dashed_name = grepl('-', fore_name_simple)) %>%
+ count(dashed_name)
+
+pred_before_2002 <- gender_check %>%
+ filter(year(year) < 2002) %>%
+ count(is.na(probability_male), is.na(fore_name_simple))
1014 last authors have an empty fore name field (i.e., missing metadata). 12512 authors have no fore_name_simple.
In total, 153655 authors had a gender prediction and 22515 didn’t. 11498 authors have a fore name that is NA once simplified (i.e., initials only). Among the 10003 authors with a fore name but no prediction, ~42% have a dash in their fore name.
Before 2002, 35 authors had gender predictions; 2566 didn’t have gender predictions because these authors only have initials for fore names.
Mean probability of selecting Asian among these names:
-pubmed_gender_pmids %>%
- filter(is.na(probability_male) & !is.na(fore_name_simple)) %>%
- rename('surname' = last_name_simple) %>%
- predict_race(surname.only = T, impute.missing = F) %>%
- pull(pred.asi) %>%
- mean(na.rm = T)
pubmed_gender_pmids %>%
+ filter(is.na(probability_male) & !is.na(fore_name_simple)) %>%
+ rename('surname' = last_name_simple) %>%
+ predict_race(surname.only = T, impute.missing = F) %>%
+ pull(pred.asi) %>%
+ mean(na.rm = T)
## [1] "Proceeding with surname-only predictions..."
-## Warning in merge_surnames(voter.file, impute.missing = impute.missing): 1305 surnames were not
-## matched.
+## Warning in merge_surnames(voter.file, impute.missing = impute.missing): 1305 surnames were not matched.
## [1] 0.8174599
Honorees that didn’t receive a gender prediction: Chung-I Wu.
In summary, the NA predictions mostly include initials only, hyphenated names and perhaps names with accent marks.
pubmed_aff_pmids <- corr_authors %>%
- tidyr::separate_rows(countries, sep = ',') %>%
- filter(countries == 'US') # looking at only US affiliation
-
-keynotes_us <- keynotes %>%
- tidyr::separate_rows(afflcountries, sep = '\\|') %>%
- filter(afflcountries == 'United States')
-
-pubmed_race_pmids <- pubmed_aff_pmids %>%
- rename('surname' = last_name_simple) %>%
- predict_race(surname.only = T, impute.missing = F)
pubmed_aff_pmids <- corr_authors %>%
+ tidyr::separate_rows(countries, sep = ',') %>%
+ filter(countries == 'US') # looking at only US affiliation
+
+keynotes_us <- keynotes %>%
+ tidyr::separate_rows(afflcountries, sep = '\\|') %>%
+ filter(afflcountries == 'United States')
+
+pubmed_race_pmids <- pubmed_aff_pmids %>%
+ rename('surname' = last_name_simple) %>%
+ predict_race(surname.only = T, impute.missing = F)
## [1] "Proceeding with surname-only predictions..."
-iscb_us_race <- keynotes_us %>%
- rename('surname' = last_name_simple) %>%
- predict_race(surname.only = T, impute.missing = F)
iscb_us_race <- keynotes_us %>%
+ rename('surname' = last_name_simple) %>%
+ predict_race(surname.only = T, impute.missing = F)
## [1] "Proceeding with surname-only predictions..."
Number of honorees affiliated with the US: 239.
Number of authors with no race prediction: 5166 (of which 6 did not have a surname).
Number of honorees with no race prediction: 45 (of which 0 did not have a surname).
region_levels <- paste(c('Celtic/English', 'European', 'East Asian', 'Hispanic', 'South Asian', 'Arabic', 'Hebrew', 'African', 'Nordic', 'Greek'), 'names')
-
-pubmed_nat_pmids <- corr_authors %>%
- left_join(nationalize_df, by = c('fore_name', 'last_name'))
-
-pubmed_nat_df <- pubmed_nat_pmids %>%
- group_by(pmid, journal, publication_date, year) %>%
- summarise_at(vars(African:SouthAsian), mean, na.rm = T) %>%
- ungroup()
-
-iscb_nat_df <- keynotes %>%
- left_join(nationalize_df, by = c('fore_name', 'last_name'))
set.seed(0)
-
-top_names <- map_dfc(
- nationalize_df %>%
- select(African:SouthAsian) %>%
- colnames(),
- function(x) {
- nationalize_df %>%
- filter((!!sym(x)) > 0.9) %>%
- sample_n(6) %>%
- select(full_name) %>%
- rename(!!x := full_name)
- })
-
-top_names %>%
- pivot_longer(everything(), names_to = 'region', values_to = 'names') %>%
- recode_region() %>%
- group_by(region) %>%
- summarise(names = paste(names, collapse = ', '), .groups = 'drop') %>%
- arrange(factor(region, levels = region_levels)) %>%
- DT::datatable() %>%
- # write_tsv('data/names/example_name_origin.tsv') %>%
- {.}
region_levels <- paste(c('Celtic/English', 'European', 'East Asian', 'Hispanic', 'South Asian', 'Arabic', 'Hebrew', 'African', 'Nordic', 'Greek'), 'names')
+
+pubmed_nat_pmids <- corr_authors %>%
+ left_join(nationalize_df, by = c('fore_name', 'last_name'))
+
+pubmed_nat_df <- pubmed_nat_pmids %>%
+ group_by(pmid, journal, publication_date, year) %>%
+ summarise_at(vars(African:SouthAsian), mean, na.rm = T) %>%
+ ungroup()
+
+iscb_nat_df <- keynotes %>%
+ left_join(nationalize_df, by = c('fore_name', 'last_name'))
set.seed(0)
+
+top_names <- map_dfc(
+ nationalize_df %>%
+ select(African:SouthAsian) %>%
+ colnames(),
+ function(x) {
+ nationalize_df %>%
+ filter((!!sym(x)) > 0.9) %>%
+ sample_n(6) %>%
+ select(full_name) %>%
+ rename(!!x := full_name)
+ })
+
+top_names %>%
+ pivot_longer(everything(), names_to = 'region', values_to = 'names') %>%
+ recode_region() %>%
+ group_by(region) %>%
+ summarise(names = paste(names, collapse = ', '), .groups = 'drop') %>%
+ arrange(factor(region, levels = region_levels)) %>%
+ DT::datatable() %>%
+ # write_tsv('data/names/example_name_origin.tsv') %>%
+ {.}
## Warning: Problem with `mutate()` input `region`.
+## ℹ Unknown levels in `f`: OtherCategories
+## ℹ Input `region` is `fct_recode(...)`.
## Warning: Unknown levels in `f`: OtherCategories
-
-
-
+
+
+
## # A tibble: 2 x 3
## `!is.na(African)` `is.na(fore_name_simple.x)` n
## <lgl> <lgl> <int>
## 1 FALSE TRUE 12512
## 2 TRUE FALSE 163598
0 ISCB speakers did not have nationality predictions. 12512 PubMed full names were not nationalized. 12512 of these don’t have fore_name_simple. (See earlier conclusion in Gender analysis.)
- - + +corr_authors %>%
- mutate(affi_country_found = ifelse(is.na(countries), 'Country NOT found', 'Country found')) %>%
- ggplot(aes(year(year), fill = affi_country_found)) +
- geom_bar(position = 'stack') +
- scale_fill_viridis_d() +
- theme(legend.position = c(0.2, 0.8)) +
- ylab('Number of full names')
corr_authors %>%
- mutate(affi_country_found = ifelse(
- is.na(countries), 'Country NOT found', 'Country found'),
- has_pmcid = ifelse(is.na(pmcid), 'No PMCID', 'Has PMCID')) %>%
- count(affi_country_found, has_pmcid)
corr_authors %>%
+ mutate(affi_country_found = ifelse(is.na(countries), 'Country NOT found', 'Country found')) %>%
+ ggplot(aes(year(year), fill = affi_country_found)) +
+ geom_bar(position = 'stack') +
+ scale_fill_viridis_d() +
+ theme(legend.position = c(0.2, 0.8)) +
+ ylab('Number of full names')
corr_authors %>%
+ mutate(affi_country_found = ifelse(
+ is.na(countries), 'Country NOT found', 'Country found'),
+ has_pmcid = ifelse(is.na(pmcid), 'No PMCID', 'Has PMCID')) %>%
+ count(affi_country_found, has_pmcid)
## # A tibble: 4 x 3
## affi_country_found has_pmcid n
## <chr> <chr> <int>
@@ -4480,29 +4539,29 @@ Affiliation analysis
pubmed_nat_pmids <- corr_authors %>%
- separate_rows(countries, sep = ',') %>%
- filter(countries == 'US') %>%
- left_join(nationalize_df, by = c('fore_name', 'last_name'))
-
-pubmed_nat_df <- pubmed_nat_pmids %>%
- group_by(pmid, journal, publication_date, year) %>%
- summarise_at(vars(African:SouthAsian), mean, na.rm = T) %>%
- ungroup()
-
-iscb_nat_df <- keynotes %>%
- separate_rows(afflcountries, sep = '\\|') %>%
- filter(afflcountries == 'United States') %>%
- left_join(nationalize_df, by = c('fore_name', 'last_name'))
pubmed_nat_pmids <- corr_authors %>%
+ separate_rows(countries, sep = ',') %>%
+ filter(countries == 'US') %>%
+ left_join(nationalize_df, by = c('fore_name', 'last_name'))
+
+pubmed_nat_df <- pubmed_nat_pmids %>%
+ group_by(pmid, journal, publication_date, year) %>%
+ summarise_at(vars(African:SouthAsian), mean, na.rm = T) %>%
+ ungroup()
+
+iscb_nat_df <- keynotes %>%
+ separate_rows(afflcountries, sep = '\\|') %>%
+ filter(afflcountries == 'United States') %>%
+ left_join(nationalize_df, by = c('fore_name', 'last_name'))
0 US-affiliated ISCB speakers did not have nationality predictions. The number of US-affiliated authors that did not have nationality predictions is given by `pubmed_nat_pmids %>% filter(is.na(African)) %>% nrow()` (this inline expression was not evaluated when the document was rendered).
1205 of these don’t have fore_name_simple. (See earlier conclusion in Gender analysis.)
Adapted from epitools::riskratio().
enrichment_plot <- cowplot::plot_grid(enrichment_plot_left, enrichment_plot_right,
rel_widths = c(1, 1.3))
enrichment_plot
-
+
ggsave('figs/enrichment-plot.png', enrichment_plot, width = 5.5, height = 3.5)
sessionInfo()
## R version 4.0.3 (2020-10-10)
-## Platform: x86_64-apple-darwin17.0 (64-bit)
-## Running under: macOS Catalina 10.15.7
+## Platform: x86_64-pc-linux-gnu (64-bit)
+## Running under: Ubuntu 20.04 LTS
##
## Matrix products: default
-## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
-## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
+## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
-## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
+## [4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
+## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
+## [10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
-## [1] DT_0.16 epitools_0.5-10.1 wru_0.1-9 rnaturalearth_0.2.0
-## [5] caret_6.0-86 lattice_0.20-41 gdtools_0.2.2 lubridate_1.7.9
-## [9] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4
+## [1] DT_0.16 epitools_0.5-10.1 gdtools_0.2.2 wru_0.1-10
+## [5] rnaturalearth_0.1.0 lubridate_1.7.9.2 caret_6.0-86 lattice_0.20-41
+## [9] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2 purrr_0.3.4
## [13] readr_1.4.0 tidyr_1.1.2 tibble_3.0.4 ggplot2_3.3.2
-## [17] tidyverse_1.3.0 devtools_2.3.2 usethis_1.6.3
+## [17] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
-## [1] colorspace_2.0-0 ellipsis_0.3.1 class_7.3-17
-## [4] rprojroot_2.0.2 fs_1.5.0 rstudioapi_0.13
-## [7] farver_2.0.3 audio_0.1-7 remotes_2.2.0
-## [10] prodlim_2019.11.13 fansi_0.4.1 xml2_1.3.2
-## [13] codetools_0.2-16 splines_4.0.3 knitr_1.30
-## [16] pkgload_1.1.0 jsonlite_1.7.2 pROC_1.16.2
-## [19] broom_0.7.1 dbplyr_1.4.4 rgeos_0.5-5
-## [22] compiler_4.0.3 httr_1.4.2 backports_1.2.0
-## [25] assertthat_0.2.1 Matrix_1.2-18 cli_2.2.0
-## [28] beepr_1.3 htmltools_0.5.0 prettyunits_1.1.1
-## [31] tools_4.0.3 gtable_0.3.0 glue_1.4.2
-## [34] rnaturalearthdata_0.2.0 reshape2_1.4.4 Rcpp_1.0.5
-## [37] cellranger_1.1.0 vctrs_0.3.5 svglite_1.2.3.2
-## [40] nlme_3.1-149 crosstalk_1.1.0.1 iterators_1.0.13
-## [43] timeDate_3043.102 xfun_0.21 gower_0.2.2
-## [46] ps_1.5.0 testthat_3.0.0 rvest_0.3.6
-## [49] lifecycle_0.2.0 MASS_7.3-53 scales_1.1.1
-## [52] ipred_0.9-9 hms_0.5.3 RColorBrewer_1.1-2
-## [55] yaml_2.2.1 curl_4.3 memoise_1.1.0
-## [58] rpart_4.1-15 stringi_1.5.3 desc_1.2.0
-## [61] foreach_1.5.1 e1071_1.7-4 pkgbuild_1.2.0
-## [64] lava_1.6.8 rlang_0.4.9 pkgconfig_2.0.3
-## [67] systemfonts_1.0.1 evaluate_0.14 sf_0.9-6
-## [70] htmlwidgets_1.5.2 recipes_0.1.14 labeling_0.4.2
-## [73] cowplot_1.1.0 processx_3.4.5 tidyselect_1.1.0
-## [76] plyr_1.8.6 magrittr_2.0.1 R6_2.5.0
-## [79] generics_0.1.0 DBI_1.1.0 pillar_1.4.7
-## [82] haven_2.3.1 withr_2.3.0 mgcv_1.8-33
-## [85] units_0.6-7 survival_3.2-7 sp_1.4-4
-## [88] nnet_7.3-14 modelr_0.1.8 crayon_1.3.4
-## [91] utf8_1.1.4 KernSmooth_2.23-17 rmarkdown_2.7
-## [94] grid_4.0.3 readxl_1.3.1 data.table_1.13.2
-## [97] blob_1.2.1 callr_3.5.1 ModelMetrics_1.2.2.2
-## [100] reprex_0.3.0 digest_0.6.27 classInt_0.4-3
-## [103] stats4_4.0.3 munsell_0.5.0 viridisLite_0.3.0
-## [106] sessioninfo_1.1.1
+## [1] colorspace_2.0-0 ellipsis_0.3.1 class_7.3-17 rprojroot_1.3-2
+## [5] fs_1.5.0 rstudioapi_0.12 farver_2.0.3 remotes_2.2.0
+## [9] prodlim_2019.11.13 fansi_0.4.1 xml2_1.3.2 codetools_0.2-16
+## [13] splines_4.0.3 knitr_1.30 pkgload_1.1.0 jsonlite_1.7.1
+## [17] pROC_1.16.2 broom_0.7.2 dbplyr_2.0.0 rgeos_0.5-5
+## [21] compiler_4.0.3 httr_1.4.2 backports_1.2.0 assertthat_0.2.1
+## [25] Matrix_1.2-18 cli_2.1.0 htmltools_0.5.0 prettyunits_1.1.1
+## [29] tools_4.0.3 gtable_0.3.0 glue_1.4.2 rnaturalearthdata_0.1.0
+## [33] reshape2_1.4.4 Rcpp_1.0.5 cellranger_1.1.0 vctrs_0.3.4
+## [37] svglite_1.2.3.2 nlme_3.1-149 iterators_1.0.13 crosstalk_1.1.0.1
+## [41] timeDate_3043.102 gower_0.2.2 xfun_0.19 ps_1.4.0
+## [45] testthat_3.0.0 rvest_0.3.6 lifecycle_0.2.0 devtools_2.3.2
+## [49] MASS_7.3-53 scales_1.1.1 ipred_0.9-9 hms_0.5.3
+## [53] RColorBrewer_1.1-2 yaml_2.2.1 curl_4.3 memoise_1.1.0
+## [57] rpart_4.1-15 stringi_1.5.3 desc_1.2.0 foreach_1.5.1
+## [61] e1071_1.7-4 pkgbuild_1.1.0 lava_1.6.8.1 systemfonts_0.3.2
+## [65] rlang_0.4.8 pkgconfig_2.0.3 evaluate_0.14 sf_0.9-6
+## [69] recipes_0.1.15 htmlwidgets_1.5.2 labeling_0.4.2 cowplot_1.1.0
+## [73] tidyselect_1.1.0 processx_3.4.4 plyr_1.8.6 magrittr_1.5
+## [77] R6_2.5.0 generics_0.1.0 DBI_1.1.0 mgcv_1.8-33
+## [81] pillar_1.4.6 haven_2.3.1 withr_2.3.0 units_0.6-7
+## [85] survival_3.2-7 sp_1.4-4 nnet_7.3-14 modelr_0.1.8
+## [89] crayon_1.3.4 KernSmooth_2.23-17 utf8_1.1.4 rmarkdown_2.5
+## [93] usethis_1.6.3 grid_4.0.3 readxl_1.3.1 data.table_1.13.2
+## [97] callr_3.5.1 ModelMetrics_1.2.2.2 reprex_0.3.0 digest_0.6.27
+## [101] classInt_0.4-3 stats4_4.0.3 munsell_0.5.0 viridisLite_0.3.0
+## [105] sessioninfo_1.1.1