# 03_AJOL_analyze.R
#This script queries the OpenAlex and Crossref APIs to check for presence of AJOL journals in OpenAlex
#More information:
#AJOL: https://www.ajol.info/index.php/ajol
#OpenAlex API: https://docs.openalex.org/api
#rcrossref package: https://cran.r-project.org/web/packages/rcrossref/rcrossref.pdf
#STEP 3 - analyze results
#load packages
library(tidyverse)
# Sampling date: selects the data folder and the suffix of every file name.
# To analyze a sample taken today, use instead: date <- Sys.Date()
date <- "2022-02-19"

# Folder that holds all files of this sampling date
path <- file.path("data", date)

# Full path of the dated csv file "<prefix><date>.csv" inside `path`
dated_csv <- function(prefix) {
  file.path(path, paste0(prefix, date, ".csv"))
}

# AJOL journals in long format with unique issns
# 532 of 545 AJOL journals have issns, 667 issns of which 665 unique
# NB 2 journals (SAFP and SAJCN) have the same issn/eissn
data <- read_csv(dated_csv("AJOL_issns_"))

# Lookup results from OpenAlex and from Crossref
# NB duplicate issn/eissn for SAFP and SAJCN resolve to SAFP in both
# OpenAlex and Crossref
data_openalex <- read_csv(dated_csv("AJOL_OpenAlex_"))
data_cr <- read_csv(dated_csv("AJOL_Crossref_"))
#----------------------------------------------------
# Join OpenAlex and Crossref results to original data

# Prepare the OpenAlex lookup table: one row per queried issn with a
# presence flag, the matched venue and its works count.
data_openalex_join <- data_openalex %>%
  select(-c(publisher, created_date)) %>%
  mutate(
    # flag records for which OpenAlex returned a title
    in_open_alex = if_else(
      !is.na(display_name), "open_alex", NA_character_
    ),
    # a works count of zero is recorded as missing
    open_alex_count = if_else(works_count == 0, NA_real_, works_count)
  ) %>%
  select(
    issn_input,
    in_open_alex,
    open_alex_venue_id = id,
    open_alex_match_issn = match_issn,
    open_alex_match_issn_l = match_issn_l,
    open_alex_count
  )
# Prepare the Crossref lookup table: one row per queried issn with a
# presence flag and the DOI count.
data_cr_join <- data_cr %>%
  select(issn_input_cr, title_cr, total_dois_cr) %>%
  mutate(
    # flag records for which Crossref returned a title
    in_crossref = if_else(!is.na(title_cr), "crossref", NA_character_),
    # a DOI count of zero is recorded as missing
    crossref_count = if_else(total_dois_cr == 0, NA_real_, total_dois_cr)
  ) %>%
  select(issn_input_cr, in_crossref, crossref_count)

# The raw lookup results are no longer needed
rm(list = c("data_cr", "data_openalex"))
# Attach the lookup results to the AJOL records, matching on issn.
# Left joins keep every AJOL record, also those without a match.
data_join <- left_join(
  data, data_openalex_join,
  by = c("issn_value" = "issn_input")
)
data_join <- left_join(
  data_join, data_cr_join,
  by = c("issn_value" = "issn_input_cr")
)

# The prepared lookup tables are no longer needed
rm(list = c("data_cr_join", "data_openalex_join"))
# A title can have several issn records, of which only some are matched.
# Within each journal, copy the Crossref/OpenAlex info to the records
# that lack it (filling downwards first, then upwards).
data_filled <- data_join %>%
  group_by(journal) %>%
  fill(everything(), .direction = "downup") %>%
  ungroup()

# Without the issn column, identical records of one title collapse
data_final <- distinct(select(data_filled, -issn_value))
rm(data_filled)
# 548 instead of 533 records
# 15 titles linked to multiple OpenAlex venue IDs, either with the same or
# different works count
# Decide to *only keep highest count* (but could also decide to add counts
# for both issns)
# For titles with equal count, keep lowest (earliest?) venue ID.
# Venue IDs are text, and text is sorted character by character
# ("V1000" would sort before "V999"). To really keep the lowest ID, the
# trailing digits are compared as a number first; the full ID is only the
# final tie-break.
# NOTE(review): assumes venue IDs end in their numeric part - confirm
# TODO check overlap of records for these title variants in OpenAlex
data_final_corrected <- data_final %>%
  mutate(
    .venue_number = as.numeric(
      str_extract(as.character(open_alex_venue_id), "[0-9]+$")
    )
  ) %>%
  # order all rows first; slice(1) then takes the top row of each journal
  arrange(desc(open_alex_count), .venue_number, open_alex_venue_id) %>%
  group_by(journal) %>%
  slice(1) %>%
  ungroup() %>%
  select(-.venue_number)

rm(data_join, data_final)

# Store the combined result (one record per journal)
filename <- paste0("AJOL_OpenAlex_Crossref_", date, ".csv")
filepath <- file.path(path, filename)
write_csv(data_final_corrected, filepath)
# To resume the analysis from the stored file:
#data_final_corrected <- read_csv(filepath)
#-----------------------------------------------------
# Analyze results
#TODO Create function for counts and counts_openalex

# Number of journals per source: total, in Crossref, in OpenAlex, and split
# into four exclusive categories (Crossref only, OpenAlex only, both, none).
# Each category column holds a label where it applies and NA elsewhere, so
# counting the non-missing values per column gives the number of journals.
counts <- data_final_corrected %>%
  select(journal, in_crossref, in_open_alex) %>%
  mutate(
    crossref_only = if_else(
      !is.na(in_crossref) & is.na(in_open_alex),
      "crossref_only", NA_character_
    ),
    open_alex_only = if_else(
      is.na(in_crossref) & !is.na(in_open_alex),
      "openalex_only", NA_character_
    ),
    both = if_else(
      !is.na(in_crossref) & !is.na(in_open_alex),
      "both", NA_character_
    ),
    none = if_else(
      is.na(in_crossref) & is.na(in_open_alex),
      "none", NA_character_
    )
  ) %>%
  # across() replaces the superseded summarise_all()
  summarise(across(everything(), ~ sum(!is.na(.x))))
# Number of journals by the way they were matched in OpenAlex: on issn
# only, on issn_l only, on both, or on neither.
# Each category column holds a label where it applies and NA elsewhere, so
# counting the non-missing values per column gives the number of journals.
counts_open_alex <- data_final_corrected %>%
  select(
    journal, in_open_alex,
    open_alex_match_issn, open_alex_match_issn_l
  ) %>%
  mutate(
    open_alex_issn_only = if_else(
      !is.na(open_alex_match_issn) & is.na(open_alex_match_issn_l),
      "open_alex_issn_only", NA_character_
    ),
    open_alex_issn_l_only = if_else(
      is.na(open_alex_match_issn) & !is.na(open_alex_match_issn_l),
      "open_alex_issn_l_only", NA_character_
    ),
    both = if_else(
      !is.na(open_alex_match_issn) & !is.na(open_alex_match_issn_l),
      "both", NA_character_
    ),
    none = if_else(
      is.na(open_alex_match_issn) & is.na(open_alex_match_issn_l),
      "none", NA_character_
    )
  ) %>%
  # across() replaces the superseded summarise_all()
  summarise(across(everything(), ~ sum(!is.na(.x))))
# Compare counts for journals in both Crossref and OpenAlex: number of
# journals for which Crossref has more records, OpenAlex has more records,
# or both have the same number.
# NOTE(review): a comparison with a missing count yields NA, so a journal
# with a missing count (zero counts were set to NA earlier in this script)
# is counted in none of the three categories; the categories may therefore
# add up to less than the `journal` total - confirm this is intended
counts_compare <- data_final_corrected %>%
  filter(!is.na(in_crossref) & !is.na(in_open_alex)) %>%
  mutate(
    crossref_more = if_else(
      crossref_count > open_alex_count,
      "crossref_more", NA_character_
    ),
    openalex_more = if_else(
      crossref_count < open_alex_count,
      "open_alex_more", NA_character_
    ),
    equal = if_else(
      crossref_count == open_alex_count,
      "equal", NA_character_
    )
  ) %>%
  select(journal, crossref_more, openalex_more, equal) %>%
  # across() replaces the superseded summarise_all()
  summarise(across(everything(), ~ sum(!is.na(.x))))