-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbruna_supplement.Rmd
301 lines (235 loc) · 13.6 KB
/
bruna_supplement.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
```{r GlobalOptions, include = FALSE}
# ---- knitr and output configuration ----
# Chunk defaults for the whole document: LaTeX float placement "H", an empty
# `out.extra`, and no code, messages, or warnings shown in the output.
# (Alternative placement tried previously: fig.pos = 'h'.)
knitr::opts_chunk$set(
  fig.pos = "H",
  out.extra = "",
  echo = FALSE,
  message = FALSE,
  warning = FALSE
)
# Evaluate `fig.cap` after the chunk body so captions can use objects the
# chunk creates.
knitr::opts_knit$set(eval.after = "fig.cap")
# Tables are emitted as LaTeX, repeated chunk labels are tolerated, and NA
# cells in kable() output are printed as empty strings.
options(
  knitr.table.format = "latex",
  knitr.duplicate.label = "allow",
  knitr.kable.NA = ""
)

# ---- Packages ----
# Load order is kept as-is because it determines which package masks which.
# (papaja was loaded here previously: library("papaja"))
library(tidyverse)
library(gridExtra)
library(kableExtra)
library(knitr)
library(cowplot)
library(here)
library(magick)
library(xfun)
library(stopwords)
library(tidytext)
library(janitor)

# ---- Summary counts used in the text ----
# Each count is a 1 x 1 tibble with a single column `n`.

# Number of distinct journals (column `SO`).
n_jrnls <- summarize(select(kw, SO), n = n_distinct(SO))

# Number of distinct publications: overall, tropical, and general.
n_pubs <- summarize(kw, n = n_distinct(refID))
n_pubs_trop <- summarize(
  filter(kw, pub_cat_2 == "tropical"),
  n = n_distinct(refID)
)
n_pubs_gen <- summarize(
  filter(kw, pub_cat_2 == "general"),
  n = n_distinct(refID)
)
```
\renewcommand{\appendixname}{Supporting Information}
\renewcommand{\thefigure}{S\arabic{figure}} \setcounter{figure}{0}
\renewcommand{\thetable}{S\arabic{table}} \setcounter{table}{0}
\renewcommand{\theequation}{S\arabic{equation}} \setcounter{equation}{0}
\setcounter{page}{1}
\nolinenumbers
## SUPPORTING INFORMATION
\bigskip
\bigskip
## Is there really such a thing as _Tropical_ Biology?
\bigskip
\bigskip
\bigskip
\bigskip
\noindent Emilio M. Bruna ^1,2^ $^\ast$
\bigskip
\bigskip
<!-- $^1$ $^,$ $^2$ $^\ast$ -->
\noindent ^1^ Department of Wildlife Ecology and Conservation, University of Florida, PO Box 110430, Gainesville, FL 32611-0430, USA
\noindent ^2^ Center for Latin American Studies, University of Florida, PO Box 115530, Gainesville, FL 32611-5530, USA
\bigskip
\noindent $^\ast$ Corresponding author; email: [email protected].
\newpage
\resetlinenumber
\linenumbers
```{r biblio_stats, include = FALSE,echo = FALSE,message=FALSE,warning=FALSE}
# ---- Helpers shared by the keyword and title-word summaries ----

# Relabel a table of per-journal article counts for display.
# `article_counts` must contain `title`, `jrnl_cat`, `pub_cat_2`, and `n`.
# Returns an ungrouped table in which `pub_cat_2` is renamed to
# `Article Category`, `title` is renamed to `Journal`, and the lower-case
# category codes are recoded to their display labels.
label_article_counts <- function(article_counts) {
  article_counts %>%
    rename(`Article Category` = pub_cat_2) %>%
    ungroup() %>%
    mutate(
      `Article Category` = fct_recode(`Article Category`,
        "Non-tropical" = "general",
        "Tropical" = "tropical"
      ),
      jrnl_cat = fct_recode(jrnl_cat,
        "General" = "general",
        "Tropical" = "tropical"
      )
    ) %>%
    rename("Journal" = "title")
}

# Convert `Journal` to a factor whose levels follow the sort order of
# `jrnl_cat`, so journals of the same category are adjacent on the x-axis.
order_journals_by_category <- function(article_counts) {
  article_counts$Journal <- factor(
    article_counts$Journal,
    levels = unique(article_counts$Journal[order(article_counts$jrnl_cat)])
  )
  article_counts
}

# Stacked bar chart of article counts (`n`) per `Journal`, filled by
# `Article Category`. Returns the ggplot object without printing it.
plot_article_counts <- function(article_counts) {
  ggplot(
    article_counts,
    aes(fill = `Article Category`, x = Journal, y = n)
  ) +
    labs(y = "No. of Articles") +
    geom_bar(position = "stack", stat = "identity") +
    scale_y_continuous(expand = c(0, 0)) +
    scale_fill_manual(values = c("darkblue", "darkseagreen")) +
    theme_classic() +
    theme(
      # legend.position='top',
      # NOTE(review): ggplot2 >= 3.4.0 deprecates `size` here in favour of
      # `linewidth`; left unchanged to stay compatible with older versions.
      legend.box.background = element_rect(color = "black", size = 1),
      # legend.title = element_blank(),
      # legend.key = element_rect(size = 30,color=alpha("transparent",0)),
      strip.text = element_text(size = 6, color = "black", face = "italic"),
      axis.text.x = element_text(
        color = "black", size = 6, angle = 315, hjust = 0, face = "italic"
      ),
      axis.text.y = element_text(color = "black", size = 6, hjust = 0),
      axis.title.x = element_blank()
    )
}

# ---- Publication and keyword counts ----

# N of publications with terms (distinct refID values), as a plain number
n_pubs_terms <- terms %>%
  select(refID) %>%
  n_distinct() %>%
  as.numeric()

# Same computations as in the GlobalOptions chunk (1 x 1 tibbles, column `n`)
n_pubs_trop <- kw %>%
  filter(pub_cat_2 == "tropical") %>%
  summarize(n = n_distinct(refID))
n_pubs_gen <- kw %>%
  filter(pub_cat_2 == "general") %>%
  summarize(n = n_distinct(refID))

# N of distinct keywords (column `final`)
n_kw <- kw %>% summarize(n = n_distinct(final))

# ---- KEYWORDS ----

# How many articles in each journal used in the keyword analysis:
# one row per refID, then counted per journal category / journal / article
# category.
kw_articles <- kw %>%
  select(refID, title, jrnl_cat, pub_cat_2) %>%
  distinct(refID, .keep_all = TRUE) %>%
  group_by(jrnl_cat, title, pub_cat_2) %>%
  tally() %>%
  label_article_counts()

# Total articles per journal. Not referenced again in this file; kept in
# case other code reads it.
bar_order <- kw_articles %>%
  group_by(Journal) %>%
  summarize(jrnl_total = sum(n))

kw_articles <- order_journals_by_category(kw_articles)
kw_articles_plot <- plot_article_counts(kw_articles)

# ---- TITLES / BIGRAMS ----

bigrams <- read_csv(here("data", "data_ms", "clean_bigrams.csv"))

# Number of distinct bigrams (1 x 1 tibble, column `n`)
bigrams_count <- bigrams %>%
  # unite("bigram", word1:word2, sep = " ") %>%
  select(term) %>%
  distinct() %>%
  tally()

# How many articles in each journal used in TW analysis:
# the first tally() collapses `tw` to one row per article, the second tallies
# rows per journal / year / category, restricted to start_yr..end_yr.
# NOTE(review): dplyr < 1.0 weighted tally() by an existing `n` column instead
# of counting rows; confirm the installed version counts rows here.
tw_articles <- tw %>%
  group_by(refID, title, PY, jrnl_cat, pub_cat_2) %>%
  tally() %>%
  arrange(jrnl_cat, title, PY) %>%
  group_by(title, PY, jrnl_cat, pub_cat_2) %>%
  tally() %>%
  filter(PY >= start_yr) %>%
  filter(PY <= end_yr) %>%
  label_article_counts() %>%
  order_journals_by_category()

tw_articles_plot <- plot_article_counts(tw_articles)
```
## 1. Collection, processing, and visualization of bibliometric data
\noindent To identify the conceptual domains studied by researchers working in 'Tropical' and 'non-Tropical' locations, I used information extracted from the bibliographic records of articles published from `r (as.numeric(start_yr))`-`r (as.numeric(end_yr))` in N = `r n_jrnls` journals (_`r titles_string`_). Specifically, I extracted and summarized the information from two structural components used by authors to describe the subject of their articles: the title and keywords. These provide distinct but complementary information, and so they are often analyzed both independently and in unison. Below I describe how the article records were identified, downloaded, processed, and assigned to the 'Tropical' and 'non-Tropical' categories using code written in the `R` programming language [@rcoreteamLanguageEnvironmentStatistical2023].
On 8 February 2023, I downloaded all bibliographic data available in SCOPUS and the Web of Science 'Core Collection' for all articles published in the focal journals; both SCOPUS and the Web of Science were queried because they differ in the years indexed for each journal. I then used the `refsplitr` package [@fournierRefsplitrAuthorName2020] to process the records and remove any duplicates. After removing all stopwords [@benoitStopwordsMultilingualStopword2021] from article titles and keywords, I spell-checked, stemmed, and lemmatized all of the keywords and title words and extracted the bigrams (i.e., pairs of sequential words, e.g., _seed predation_, _species diversity_) from titles with the `tidytext` library [@silgeTidytextTextMining2016]. Finally, I identified each article as either 'Tropical' or 'non-Tropical'; all articles published in (_`r titles_string`_) were assigned to the 'Tropical' category, while articles published in the other journals were assigned to one of these categories based on a search of the titles, keywords, or abstracts for a list of domain-specific terms (e.g., tropical: _amazon_, _andes_, _congo_, _bci_, _chamela_; non-tropical: _finland_, _boreal_, _eastern decid_, _arctic_, _polar_). These procedures resulted in N = `r scales::comma(as.numeric(n_pubs))` total articles published, of which N = `r scales::comma(as.numeric(n_pubs_trop))` reported research conducted in the tropics and N = `r scales::comma(as.numeric(n_pubs_gen))` were based on work conducted in other locations.
Collectively, the N = `r scales::comma(as.numeric(sum(tw_articles$n)))` articles contained a total of N = `r scales::comma(as.numeric(bigrams_count))` bigrams. Not all of the articles included keywords, however; from the N = `r scales::comma(as.numeric(sum(kw_articles$n)))` that did, I was able to extract a total of N = `r scales::comma(as.numeric(n_kw))` keywords. There were N = `r scales::comma(as.numeric(n_pubs_terms))` articles from which I was able to extract both title bigrams and keywords. I used these sets of articles to conduct three geographic comparisons: (1) title bigrams, (2) keywords, and (3) title bigrams + keywords (hereafter, 'terms').
The number of articles varies widely between journals, as does the number of keywords per article or title length. Comparing counts of keyword, bigram, or term frequency in tropical and non-tropical articles could therefore bias results towards the content published in journals allowing more keywords or journals publishing more articles. To correct for this, I calculated the percentage of articles in each geographic category that used each keyword, title bigram, or term. I then selected the N = `r cutoff` most frequently used in each geographic category, and identified (a) any keywords, bigrams, or terms that 'tropical' and 'non-tropical' articles had in common, and (b) any keywords, bigrams, or terms that were unique to each article category.
## 2. Data and Code
\noindent The data used in this publication, the code used to import, organize, and analyze these data, and the code used to prepare the manuscript are available at Zenodo <https://doi.org/10.5281/zenodo.13821266> and Github <https://github.com/BrunaLab/bruna_biotropica_plenary_ms>.
The data used in this paper are part of a larger dataset collected for a longitudinal study of research in the tropics; those data, and the code used to harvest, clean, and organize them, are available at Github <https://github.com/BrunaLab/tropical_bibliometrics>. Questions regarding the data or code, or suggestions for improvement should be posted as Issues on that repository or referred to E. M. Bruna.
## REFERENCES
\textsc{Benoit, K.}, \textsc{D. Muhr}, and \textsc{K. Watanabe}. 2021. Stopwords: Multilingual stopword lists. https://CRAN.R-project.org/package=stopwords
\textsc{Fournier, A. M. V.}, \textsc{M. E. Boone}, \textsc{F. R. Stevens}, and \textsc{E. M. Bruna}. 2020. \href{https://doi.org/10.21105/joss.02028}{Refsplitr: Author name disambiguation, author georeferencing, and mapping of coauthorship networks with Web of Science data}. Journal of Open Source Software 5: 2028.
\textsc{R Core Team}. 2023. R: {A} language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. https://www.R-project.org/
\textsc{Silge, J.}, and \textsc{D. Robinson}. 2016. \href{https://doi.org/10.21105/joss.00037}{Tidytext: Text mining and analysis using tidy data principles in R}. Journal of Open Source Software 1(3).
```{=tex}
\blandscape
```
```{r allterms, fig.align="center", out.height = "85%",out.width = "85%",fig.cap = paste('The N = ', cutoff, 'most common terms (keywords + title bigrams) from articles based on research conducted in (a) the tropics and (b) non-tropical regions. The rank of these words is based on the percentage of articles in each category that included them. Terms reflecting geography (e.g., \\textit{tropics, Peru, Southern}) are indicated in bold and with filled bars.')}
# Source the keyword/term bar-plot script (presumably defines
# `barplot_words()`, which is called below).
source(here("code", "plot_kw_bar.R"))

# x-axis limits and tick positions passed to barplot_words()
min_x <- -1.7
max_x <- 10
breaks_vec <- seq(0, 10, by = 0.5)

kw_fig <- barplot_words(terms, cutoff, min_x, max_x, breaks_vec)

# No `plot` argument is supplied, so ggsave() writes ggplot2's last plot.
# NOTE(review): presumably that is `kw_fig` -- confirm against the script.
ggsave(
  filename = "allterms_fig.jpeg",
  path = "./figures",
  width = 10,
  height = 7,
  units = "in",
  dpi = 700
)

# Draw the saved JPEG as this chunk's figure.
ggdraw() + draw_image("./figures/allterms_fig.jpeg", scale = 1)
```
\newpage
```{r keywords, fig.align="center", out.height = "90%",out.width = "90%",fig.cap = paste('The N = ', cutoff, 'most common keywords from articles based on research conducted in (a) the tropics and (b) non-tropical regions. The rank of these words is based on the percentage of articles in each category that included them. Terms reflecting geography (e.g., \\textit{tropics, Peru, Southern}) are indicated in bold and with filled bars.')}
# Source the keyword/term bar-plot script (presumably defines
# `barplot_words()`, which is called below).
source(here("code", "plot_kw_bar.R"))

# x-axis limits and tick positions passed to barplot_words()
min_x <- -1.7
max_x <- 6
breaks_vec <- seq(0, 6, by = 0.5)

kw_fig <- barplot_words(kw, cutoff, min_x, max_x, breaks_vec)

# No `plot` argument is supplied, so ggsave() writes ggplot2's last plot.
# NOTE(review): presumably that is `kw_fig` -- confirm against the script.
ggsave(
  filename = "kw_fig.jpeg",
  path = "./figures",
  width = 10,
  height = 7,
  units = "in",
  dpi = 700
)

# Draw the saved JPEG as this chunk's figure.
ggdraw() + draw_image("./figures/kw_fig.jpeg", scale = 1)
```
\newpage
```{r bigrams, fig.align="center", out.height = "90%",out.width = "90%",fig.cap = paste('The N = ', cutoff, 'most common bigrams in titles of articles based on research conducted in (a) the tropics and (b) non-tropical regions. The rank of these words is based on the percentage of article titles in each category that included those words. Bigrams reflecting geography (e.g., \\textit{tropics, Peru, Atlantic Forest}) are indicated in bold and with filled bars.'), echo = FALSE,message=FALSE,warning=FALSE}
# Source the bigram plotting script.
# NOTE(review): this presumably redefines `barplot_words()`, which the
# earlier chunks call with arguments after sourcing plot_kw_bar.R -- the
# chunks therefore depend on being run in document order; confirm.
source(here("code", "plot_bigrams_bar.R"))
# Called with no arguments, and no ggsave() appears in this chunk.
# NOTE(review): presumably the script itself writes
# ./figures/bigram_fig.jpeg -- verify against plot_bigrams_bar.R.
bigram_plot <- barplot_words()
# Draw the JPEG from ./figures as this chunk's figure.
ggdraw() + draw_image("./figures/bigram_fig.jpeg", scale = 1)
```
```{=tex}
\elandscape
\newpage
```
```{r kwtime, fig.align="center",fig.cap='The number of articles from each journal and geographic category that were used in the analysis of keywords.',echo = FALSE,message=FALSE,warning=FALSE}
# Display the stacked bar chart of keyword-analysis article counts built in
# the biblio_stats chunk.
kw_articles_plot
```
\newpage
```{r twtime, fig.align="center",fig.cap='The number of articles from each journal and geographic category that were used in the analysis of title words and title bigrams.',echo = FALSE,message=FALSE,warning=FALSE}
# Display the stacked bar chart of title-word article counts built in the
# biblio_stats chunk.
tw_articles_plot
```