091.draw-roc.Rmd

---
title: "Plotting ROC curves"
output:
  html_document:
    theme: flatly
    toc: true
    toc_float: true
    code_download: true
    highlight: tango
knit: (function(inputFile, encoding) {
  rmarkdown::render(inputFile, encoding = encoding, output_dir = "docs") })
---

## Name origin prediction method performance {#auroc}

```{r}
# Data wrangling and plotting (dplyr, tidyr, readr, forcats, ggplot2, ...).
library(tidyverse)
# caret is still needed for its calibration() function: the tidymodels
# `probably` package had not published an equivalent when this was written.
library(caret)

# Project helpers; presumably defines recode_region() used below -- confirm.
source('utils/r-utils.R')
# Use the black-and-white ggplot2 theme for every figure in this document.
theme_set(theme_bw())

```

```{r}
# Pinned-commit location of the NamePrism ROC curve table.
roc_curves_url <- 'https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/7c22d0a5f661ce5aeb785215095deda40973ff17/models/NamePrism_roc_curves.tsv'

# ROC curve points with one curve per region. Within each region, dSens is the
# absolute step in (1 - tpr) from a row to the next one; the last row of every
# region gets 0 so the column keeps the group's length.
roc_df <- roc_curves_url %>%
  read_tsv() %>%
  rename(region = category) %>%
  # recode_region_letter() %>%
  recode_region() %>%
  group_by(region) %>%
  mutate(
    Sensitivity = tpr,
    Specificity = 1 - fpr,
    dSens = c(abs(diff(1 - tpr)), 0)
  ) %>%
  ungroup()

# Per-region summary of the ROC curves, sorted from highest to lowest AUC:
#   auc     - sum of (1 - fpr) weighted by the dSens step widths
#   n       - number of roc_df rows for the region
#   auc_pct - auc expressed in percent
#   reg_auc - legend label combining the region name and the rounded AUC
auc_df <- roc_df %>%
  group_by(region) %>%
  # add_count() %>%
  summarise(
    auc = sum((1 - fpr) * dSens),
    n = n()
  ) %>%
  arrange(desc(auc)) %>%
  mutate(auc_pct = 100 * auc) %>%
  mutate(reg_auc = paste0(region, ', AUC = ', round(auc_pct, 1), '%'))

# Earlier label set, kept for reference:
# region_levels <- c('Celtic English', 'European', 'East Asian', 'Hispanic', 'South Asian', 'Muslim', 'Israeli', 'African')

# Display labels for ten name-origin groups, each suffixed with ' names'.
region_levels <- paste(
  c(
    'Celtic/English', 'European', 'East Asian', 'Hispanic', 'South Asian',
    'Arabic', 'Hebrew', 'African', 'Nordic', 'Greek'
  ),
  'names'
)

# Single-letter codes 'A' through 'H'.
region_levels_let <- LETTERS[1:8]

# Ten-colour palette. It is handed to scale_color_manual() without names, so
# colours are assigned to regions by factor level order.
region_cols <- c(
  '#b3de69', '#fdb462', '#bc80bd', '#8dd3c7', '#fccde5',
  '#ffffb3', '#ccebc5', '#bebada', '#80b1d3', '#fb8072'
)

# Panel A: one ROC step curve per region, drawn with sensitivity on a reversed
# x axis and specificity on the y axis. Legend entries are the reg_auc labels
# (region plus AUC), ordered as in auc_df, and the legend sits inside the panel.
fig_3a <- ggplot(
  data = left_join(roc_df, auc_df, by = 'region'),
  mapping = aes(
    x = Sensitivity,
    y = Specificity,
    color = fct_relevel(reg_auc, as.character(auc_df$reg_auc))
  )
) +
  scale_color_manual(values = region_cols) +
  geom_step(size = 1, alpha = 0.8) +
  coord_fixed() +
  scale_x_reverse(
    breaks = seq(1, 0, -0.2),
    labels = scales::percent
  ) +
  scale_y_continuous(
    breaks = seq(0, 1, 0.2),
    labels = scales::percent,
    limits = c(NA, 1.05)
  ) +
  theme(
    legend.position = c(0.62, 0.42),
    legend.title = element_blank(),
    legend.text.align = 1,
    legend.text = element_text(size = 7),
    legend.margin = margin(-0.2, 0.2, 0.2, 0, unit = 'cm')
  )
```


```{r}
# Pinned-commit location of the NamePrism test-set predictions.
predictions_url <- 'https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/7c22d0a5f661ce5aeb785215095deda40973ff17/data/NamePrism_results_test.tsv'

# Test-set predictions; the `truth` column is replaced by a factor column
# `y_true` appended as the last column.
predictions_df <- read_tsv(predictions_url) %>%
  mutate(
    y_true = as.factor(truth),
    truth = NULL
  )

# Names of the per-region probability columns (African through SouthAsian).
regs <- predictions_df %>%
  select(African:SouthAsian) %>%
  colnames()

# One calibration table per region, stored in a list named by region.
# The list is preallocated so the loop fills slots instead of growing it.
cal_dfs <- setNames(vector('list', length(regs)), regs)
for (reg in regs) {
  # One-vs-rest view of the predictions: is the true label this region, and
  # what probability did the model assign to this region?
  pred_reg <- predictions_df %>%
    mutate(y_true_bin = as.factor((y_true == reg))) %>%
    # all_of() marks `reg` as an external character vector holding a column
    # name; a bare `reg` here is deprecated and ambiguous in tidyselect.
    rename(prob = all_of(reg)) %>%
    select(y_true_bin, prob)

  # caret::calibration() with 'TRUE' as the event class; only the underlying
  # data frame is kept, tagged with the region it belongs to.
  cal_dfs[[reg]] <- calibration(y_true_bin ~ prob,
                                data = pred_reg,
                                cuts = 11,
                                class = 'TRUE')$data %>%
    mutate(region = reg)
}
# Print one table as a quick sanity check.
cal_dfs$EastAsian

```

```{r}
# Panel B: calibration curves, one per region. The calibration tables report
# midpoint and Percent on a 0-100 scale, so both are divided by 100 before
# plotting; the dashed diagonal marks perfect calibration. Colours follow the
# region order of auc_df so they match the ROC panel.
fig_3b <- bind_rows(cal_dfs) %>%
  recode_region() %>%
  ggplot(aes(
    x = midpoint / 100,
    y = Percent / 100,
    color = fct_relevel(region, as.character(auc_df$region))
  )) +
  geom_abline(slope = 1, linetype = 2, alpha = 0.5) +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 20L),
    breaks = seq(0, 1, 0.2),
    limits = c(-0.005, 1.045)
  ) +
  scale_x_continuous(
    labels = scales::percent_format(accuracy = 20L),
    breaks = seq(0, 1, 0.2),
    limits = c(0, 1)
  ) +
  coord_fixed() +
  geom_point() +
  geom_line() +
  scale_color_manual(values = region_cols) +
  # 'none' is the documented value for hiding the legend; the previous
  # capitalised 'None' is not a valid legend position.
  theme(legend.position = 'none') +
  labs(x = 'Predicted probability', y = 'Fraction of names')

```


```{r fig.height=3.5, fig.width=10}
# Total of the per-region row counts stored in auc_df$n.
n_obs <- sum(auc_df$n)
# Region labels in auc_df order with the ' names' suffix removed; used as the
# axis order of the heat map.
short_regs <- auc_df$region %>%
  as.character() %>%
  gsub(' names', '', .)

# Long table with one row per (true region, predicted region) pair holding the
# mean predicted probability and its log2 ratio against n / n_obs.
heat_dat <- predictions_df %>%
  group_by(y_true) %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  summarise_if(is.numeric, mean, na.rm = TRUE) %>%
  ungroup() %>%
  pivot_longer(-y_true, names_to = 'region', values_to = 'pred_prob') %>%
  # recode_region() presumably rewrites the column named `region`, so the
  # predicted labels are recoded first, then the columns are swapped to recode
  # the true labels, and finally swapped back.
  recode_region() %>%
  rename('reg_hat' = region, 'region' = y_true) %>%
  recode_region() %>%
  rename('y_true' = region, 'region' = reg_hat) %>%
  # Joined on the predicted region; brings in auc_df's n (and AUC columns).
  left_join(auc_df, by = 'region') %>%
  # NOTE(review): n is the per-region row count of roc_df (via auc_df), so
  # n / n_obs is each region's share of ROC-curve rows -- confirm this is the
  # intended baseline for the fold change.
  mutate(scale_pred_prob = log2((pred_prob) / (n / n_obs)),
         region = region %>% gsub(' names', '', .) %>% fct_relevel(short_regs),
         y_true = y_true %>% gsub(' names', '', .) %>% fct_relevel(short_regs))


# Smallest and largest log2 fold change; used to pin the white midpoint of the
# colour gradient at 0.
fc_limits <- range(heat_dat$scale_pred_prob)

# Panel C: heat map of log2 fold change, true region on the x axis and
# predicted region on the y axis. The gradient runs green -> white -> purple
# with white placed at a fold change of 0.
fig_3c <- ggplot(
  data = heat_dat,
  mapping = aes(x = y_true, y = region, fill = scale_pred_prob)
) +
  geom_tile() +
  scale_fill_gradientn(
    colours = c("#3CBC75FF", "white", "#440154FF"),
    values = scales::rescale(c(fc_limits[1], 0, fc_limits[2]))
  ) +
  coord_fixed() +
  labs(
    x = 'True region',
    y = 'Predicted region',
    fill = bquote(log[2]~'FC')
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    legend.position = 'top',
    legend.key.height = unit(0.2, 'cm'),
    legend.title = element_text(vjust = 1),
    legend.margin = margin(0, 0, 0, -1, unit = 'cm'),
    axis.title.x = element_text(margin = margin(t = 27, r = 0, b = 0, l = 0)),
    axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0))
  )

# Assemble the three panels side by side with automatic A/B/C labels; the heat
# map gets a slightly smaller share of the width than the other two panels.
fig_3 <- cowplot::plot_grid(
  plotlist = list(fig_3a, fig_3b, fig_3c),
  labels = 'AUTO',
  nrow = 1,
  rel_widths = c(2, 2, 1.6)
)
fig_3
# ggsave('figs/fig_3.png', fig_3, height = 4, width = 10)
```