bruna_supplement.Rmd

```{r GlobalOptions, include = FALSE}
# Chunk defaults for the rest of the document: pin floats with "H" and keep
# code, messages, and warnings out of the rendered supplement.
knitr::opts_chunk$set(
  fig.pos = "H",
  out.extra = "",
  echo = FALSE,
  message = FALSE,
  warning = FALSE
)

# Session-level knitr/kable settings.
options(
  knitr.table.format = "latex",
  knitr.duplicate.label = "allow",
  knitr.kable.NA = ""
)

# Evaluate `fig.cap` after the chunk has run.
knitr::opts_knit$set(eval.after = "fig.cap")

# Packages (load order kept as-is because it determines masking).
library(tidyverse)
library(gridExtra)
library(kableExtra)
library(knitr)
library(cowplot)
library(here)
library(magick)
library(xfun)
library(stopwords)
library(tidytext)
library(janitor)

# Number of distinct journals (`SO`) in the keyword data.
# NOTE(review): `kw` is not created in this file; presumably supplied by the
# parent document or an earlier script -- confirm.
n_jrnls <- summarize(kw, n = n_distinct(SO))

# Number of distinct publications (`refID`): overall, then by article category.
n_pubs <- summarize(kw, n = n_distinct(refID))
n_pubs_trop <- summarize(
  filter(kw, pub_cat_2 == "tropical"),
  n = n_distinct(refID)
)
n_pubs_gen <- summarize(
  filter(kw, pub_cat_2 == "general"),
  n = n_distinct(refID)
)
```

\renewcommand{\appendixname}{Supporting Information}
\renewcommand{\thefigure}{S\arabic{figure}} \setcounter{figure}{0}
\renewcommand{\thetable}{S\arabic{table}} \setcounter{table}{0}
\renewcommand{\theequation}{S\arabic{equation}} \setcounter{equation}{0}
\setcounter{page}{1}


\nolinenumbers

## SUPPORTING INFORMATION 

\bigskip
\bigskip

## Is there really such a thing as _Tropical_ Biology?

\bigskip
\bigskip
\bigskip
\bigskip

\noindent Emilio M. Bruna ^1,2^ $^\ast$

\bigskip
\bigskip

<!-- $^1$ $^,$ $^2$ $^\ast$ -->

\noindent ^1^ Department of Wildlife Ecology and Conservation, University of Florida, PO Box 110430, Gainesville, FL 32611-0430, USA  

\noindent ^2^ Center for Latin American Studies, University of Florida, PO Box 115530, Gainesville, FL 32611-5530, USA 

\bigskip

\noindent $^\ast$ Corresponding author; email: embruna@ufl.edu. 

\newpage
\resetlinenumber
\linenumbers


```{r biblio_stats, include = FALSE,echo = FALSE,message=FALSE,warning=FALSE}

# Helpers ----------------------------------------------------------------

# Recode the article and journal categories to their display labels and
# rename `title` to `Journal`. Expects the columns `pub_cat_2`, `jrnl_cat`,
# and `title`; returns an ungrouped data frame.
label_article_counts <- function(article_counts) {
  article_counts %>%
    rename(`Article Category` = pub_cat_2) %>%
    ungroup() %>%
    mutate(
      `Article Category` = fct_recode(`Article Category`,
        "Non-tropical" = "general",
        "Tropical" = "tropical"
      ),
      jrnl_cat = fct_recode(jrnl_cat,
        "General" = "general",
        "Tropical" = "tropical"
      )
    ) %>%
    rename("Journal" = "title")
}

# Convert `Journal` to a factor whose levels follow the order of `jrnl_cat`,
# so journals of the same category sit next to each other on the x-axis.
order_journals <- function(article_counts) {
  journal_levels <- unique(
    article_counts$Journal[order(article_counts$jrnl_cat)]
  )
  article_counts$Journal <- factor(
    article_counts$Journal,
    levels = journal_levels
  )
  article_counts
}

# Stacked bar chart of article counts (`n`) per `Journal`, filled by
# `Article Category`. Shared by the keyword and title-word figures.
plot_article_counts <- function(article_counts) {
  ggplot(
    article_counts,
    aes(
      fill = `Article Category`,
      x = Journal,
      y = n
    )
  ) +
    labs(y = "No. of Articles") +
    geom_col(position = "stack") +
    scale_y_continuous(expand = c(0, 0)) +
    scale_fill_manual(values = c("darkblue", "darkseagreen")) +
    theme_classic() +
    theme(
      # `linewidth` replaces `size`, which is deprecated for rect/line theme
      # elements as of ggplot2 3.4.0.
      legend.box.background = element_rect(color = "black", linewidth = 1),
      strip.text = element_text(size = 6, color = "black", face = "italic"),
      axis.text.x = element_text(
        color = "black", size = 6, angle = 315, hjust = 0, face = "italic"
      ),
      axis.text.y = element_text(color = "black", size = 6, hjust = 0),
      axis.title.x = element_blank()
    )
}

# N of publications ------------------------------------------------------

# Articles contributing to the combined `terms` data set
n_pubs_terms <- terms %>%
  select(refID) %>%
  n_distinct() %>%
  as.numeric()

# Articles with keywords, by article category
n_pubs_trop <- kw %>%
  filter(pub_cat_2 == "tropical") %>%
  summarize(n = n_distinct(refID))
n_pubs_gen <- kw %>%
  filter(pub_cat_2 == "general") %>%
  summarize(n = n_distinct(refID))

# N of distinct keywords (column `final`)
n_kw <- kw %>% summarize(n = n_distinct(final))

# KEYWORDS ---------------------------------------------------------------

# How many articles in each journal were used in the keyword analysis:
# one row per article, then counted by journal and article category.
kw_articles <- kw %>%
  select(refID, title, jrnl_cat, pub_cat_2) %>%
  distinct(refID, .keep_all = TRUE) %>%
  group_by(jrnl_cat, title, pub_cat_2) %>%
  tally() %>%
  label_article_counts()

# Total articles per journal (computed while `Journal` is still character).
bar_order <- kw_articles %>%
  group_by(Journal) %>%
  summarize(jrnl_total = sum(n))

kw_articles <- order_journals(kw_articles)

kw_articles_plot <- plot_article_counts(kw_articles)

# TITLES / BIGRAMS -------------------------------------------------------

bigrams <- read_csv(here("data", "data_ms", "clean_bigrams.csv"))

# Number of distinct bigrams (1 x 1 tibble with column `n`)
bigrams_count <- bigrams %>%
  select(term) %>%
  distinct() %>%
  tally()

# How many articles in each journal were used in the title-word analysis:
# collapse to one row per article, count articles per journal, year, and
# category, then keep only the study period.
tw_articles <- tw %>%
  group_by(refID, title, PY, jrnl_cat, pub_cat_2) %>%
  tally() %>%
  arrange(jrnl_cat, title, PY) %>%
  group_by(title, PY, jrnl_cat, pub_cat_2) %>%
  tally() %>%
  filter(PY >= start_yr) %>%
  filter(PY <= end_yr) %>%
  label_article_counts() %>%
  order_journals()

tw_articles_plot <- plot_article_counts(tw_articles)
```

## 1. Collection, processing, and visualization of bibliometric data 

\noindent To identify the conceptual domains studied by researchers working in 'Tropical' and 'non-Tropical' locations, I used information extracted from the bibliographic records of articles published from `r (as.numeric(start_yr))`-`r (as.numeric(end_yr))` in N = `r n_jrnls` journals (_`r titles_string`_). Specifically, I extracted and summarized the information from two structural components used by authors to describe the subject of their articles: the title and keywords. These provide distinct but complementary information, and so they are often analyzed both independently and in unison. Below I describe how the article records were identified, downloaded, processed, and assigned to the 'Tropical' and 'non-Tropical' categories using code written in the `R` programming language [@rcoreteamLanguageEnvironmentStatistical2023].  
    On 8 February 2023, I downloaded all bibliographic data available in SCOPUS and the Web of Science 'Core Collection' for all articles published in the focal journals; both SCOPUS and the Web of Science were queried because they differ in the years indexed for each journal. I then used the `refsplitr` package [@fournierRefsplitrAuthorName2020] to process the records and remove any duplicates. After removing all stopwords [@benoitStopwordsMultilingualStopword2021] from article titles and keywords, I spell-checked, stemmed, and lemmatized all of the keywords and title words and extracted the bigrams (i.e., pairs of sequential words, e.g., _seed predation_, _species diversity_) from titles with the `tidytext` library [@silgeTidytextTextMining2016]. Finally, I identified each article as either 'Tropical' or 'non-Tropical'; all articles published in (_`r titles_string`_) were assigned to the 'Tropical' category, while articles published in the other journals were assigned to one of these categories based on a search of the titles, keywords, or abstracts for a list of domain-specific terms (e.g., tropical: _amazon_, _andes_, _congo_, _bci_, _chamela_; non-tropical: _finland_, _boreal_, _eastern decid_, _arctic_, _polar_). These procedures resulted in N = `r scales::comma(as.numeric(n_pubs))` total articles published, of which N = `r scales::comma(as.numeric(n_pubs_trop))` reported research conducted in the tropics and N = `r scales::comma(as.numeric(n_pubs_gen))` were based on work conducted in other locations. 
    Collectively, the N = `r scales::comma(as.numeric(sum(tw_articles$n)))` articles contained a total of N = `r scales::comma(as.numeric(bigrams_count))` bigrams. Not all of the articles included keywords, however; from the N = `r scales::comma(as.numeric(sum(kw_articles$n)))` that did I was able to extract a total of N = `r scales::comma(as.numeric(n_kw))` keywords. There were N = `r scales::comma(as.numeric(n_pubs_terms))` articles from which I was able to extract both title bigrams and keywords. I used these sets of articles to conduct three geographic comparisons: (1) title bigrams, (2) keywords, and (3) title bigrams + keywords (hereafter, 'terms').
    The number of articles varies widely between journals, as does the number of keywords per article or title length. Comparing counts of keyword, bigram, or term frequency in tropical and non-tropical articles could therefore bias results towards the content published in journals allowing more keywords or journals publishing more articles. To correct for this, I calculated the percentage of articles in each geographic category that used each keyword, title bigram, or term. I then selected the N = `r cutoff` most frequently used in each geographic category, and identified (a) any keywords, bigrams, or terms that 'tropical' and 'non-tropical' articles had in common, and (b) any keywords, bigrams, or terms that were unique to each article category.  

## 2. Data and Code

\noindent The data used in this publication, the code used to import, organize, and analyze these data, and the code used to prepare the manuscript are available at Zenodo <https://doi.org/10.5281/zenodo.13821266> and Github <https://github.com/BrunaLab/bruna_biotropica_plenary_ms>.   
  The data used in this paper are part of a larger dataset collected for a longitudinal study of research in the tropics; those data, and the code used to harvest, clean, and organize them, are available at Github <https://github.com/BrunaLab/tropical_bibliometrics>. Questions regarding the data or code, or suggestions for improvement should be posted as Issues on that repository or referred to E. M. Bruna. 

## REFERENCES

\textsc{Benoit, K.}, \textsc{D. Muhr}, and \textsc{K. Watanabe}. 2021. Stopwords: Multilingual stopword lists. https://CRAN.R-project.org/package=stopwords

\textsc{Fournier, A. M. V.}, \textsc{M. E. Boone}, \textsc{F. R. Stevens}, and \textsc{E. M. Bruna}. 2020. \href{https://doi.org/10.21105/joss.02028}{Refsplitr: Author name disambiguation, author georeferencing, and mapping of coauthorship networks with Web of Science data}. Journal of Open Source Software 5: 2028. 

\textsc{R Core Team}. 2023. R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. https://www.R-project.org/

\textsc{Silge, J.}, and \textsc{D. Robinson}. 2016. \href{https://doi.org/10.21105/joss.00037}{Tidytext: Text mining and analysis using tidy data principles in R}. Journal of Open Source Software 1(3).


```{=tex}
\blandscape
```

```{r allterms, fig.align="center", out.height = "85%",out.width = "85%",fig.cap = paste('The N = ', cutoff, 'most common terms (keywords + title bigrams) from articles based on research conducted in (a) the tropics and (b) non-tropical regions. The rank of these words is based on the percentage of articles in each category that included them. Terms reflecting geography (e.g., \\textit{tropics, Peru, Southern}) are indicated in bold and with filled bars.')} 
# Load the plotting script that provides barplot_words().
source(here("code", "plot_kw_bar.R"))

# Range and break settings handed to barplot_words().
# NOTE(review): presumably the x-axis limits and tick positions -- confirm in
# code/plot_kw_bar.R.
breaks_vec <- seq(0, 10, by = 0.5)
min_x <- -1.7
max_x <- 10
kw_fig <- barplot_words(terms, cutoff, min_x, max_x, breaks_vec)

# Make sure the output directory exists so ggsave() does not fail on a fresh
# checkout.
dir.create("./figures", showWarnings = FALSE, recursive = TRUE)

# No `plot` argument is given, so ggsave() writes ggplot2's last plot.
# NOTE(review): consider `plot = kw_fig` once it is confirmed that
# barplot_words() returns the figure object.
ggsave("allterms_fig.jpeg",
  path = "./figures",
  dpi = 700,
  width = 10,
  height = 7,
  units = "in"
)

# Embed the saved JPEG as the chunk's figure.
ggdraw() + draw_image("./figures/allterms_fig.jpeg", scale = 1)
```
\newpage

```{r keywords, fig.align="center", out.height = "90%",out.width = "90%",fig.cap = paste('The N = ', cutoff, 'most common keywords from articles based on research conducted in (a) the tropics and (b) non-tropical regions. The rank of these words is based on the percentage of articles in each category that included them. Terms reflecting geography (e.g., \\textit{tropics, Peru, Southern}) are indicated in bold and with filled bars.')} 
# Load the plotting script that provides barplot_words().
source(here("code", "plot_kw_bar.R"))

# Range and break settings handed to barplot_words().
# NOTE(review): presumably the x-axis limits and tick positions -- confirm in
# code/plot_kw_bar.R.
breaks_vec <- seq(0, 6, by = 0.5)
min_x <- -1.7
max_x <- 6
kw_fig <- barplot_words(kw, cutoff, min_x, max_x, breaks_vec)

# Make sure the output directory exists so ggsave() does not fail on a fresh
# checkout.
dir.create("./figures", showWarnings = FALSE, recursive = TRUE)

# No `plot` argument is given, so ggsave() writes ggplot2's last plot.
# NOTE(review): consider `plot = kw_fig` once it is confirmed that
# barplot_words() returns the figure object.
ggsave("kw_fig.jpeg",
  path = "./figures",
  dpi = 700,
  width = 10,
  height = 7,
  units = "in"
)

# Embed the saved JPEG as the chunk's figure.
ggdraw() + draw_image("./figures/kw_fig.jpeg", scale = 1)
```

\newpage 


```{r bigrams, fig.align="center", out.height = "90%",out.width = "90%",fig.cap = paste('The N = ', cutoff, 'most common bigrams in titles of articles based on research conducted in (a) the tropics and (b) non-tropical regions. The rank of these words is based on the percentage of article titles in each category that included those words. Bigrams reflecting geography (e.g., \\textit{tropics, Peru, Atlantic Forest}) are indicated in bold and with filled bars.'), echo = FALSE,message=FALSE,warning=FALSE}
# Load the bigram plotting script, which is expected to define barplot_words().
# NOTE(review): this is the same function name used by code/plot_kw_bar.R, so
# sourcing this file presumably replaces that definition -- confirm intended.
source(here("code", "plot_bigrams_bar.R"))
# Called with no arguments, so its inputs must come from defaults or from
# objects already in the environment.
# NOTE(review): presumably this call also writes ./figures/bigram_fig.jpeg,
# which is read on the next line -- confirm in the script.
bigram_plot <- barplot_words()
# Embed the JPEG from ./figures as the chunk's figure.
ggdraw() + draw_image("./figures/bigram_fig.jpeg", scale = 1)
```

```{=tex}
\elandscape
\newpage
```

```{r kwtime, fig.align="center",fig.cap='The number of articles from each journal and geographic category that were used in the analysis of keywords.',echo = FALSE,message=FALSE,warning=FALSE}
# Print the stacked bar chart of keyword-analysis article counts that was
# built in the `biblio_stats` chunk.
kw_articles_plot
```
\newpage

```{r twtime, fig.align="center",fig.cap='The number of articles from each journal and geographic category that were used in the analysis of title words and title bigrams.',echo = FALSE,message=FALSE,warning=FALSE}
# Print the stacked bar chart of title-word article counts that was built in
# the `biblio_stats` chunk.
tw_articles_plot
```