-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy patheuropePMC_individualfiles_2023-04-17.R
114 lines (93 loc) · 4.08 KB
/
europePMC_individualfiles_2023-04-17.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
library(XML)
library(tidypmc)
# Parsing EuropePMC preprints - individual XML files - to find short ones
# 2023-04, Jessica Polka w/ ChatGPT
# Search Europe PMC with a string such as: (OPEN_ACCESS:y) AND (SRC:"PPR") AND (FIRST_PDATE:[2023-01-01 TO 2023-03-31])
# Extract the .zip file to the dir specified by dir_path below
# Directory where the extracted Europe PMC XML files are stored.
# The trailing slash matters: csv_path is built by plain concatenation.
dir_path <- "C:/Users/jessi/Downloads/EuropePMCpreprints/"
# The output CSV is written into the same directory as the XML files.
csv_path <- paste0(dir_path, "2021-Q12023.csv")
# Full paths of all files in dir_path whose names end in ".xml"
xml_files <- list.files(dir_path, pattern = "\\.xml$", full.names = TRUE)
# Number of XML files found; used to size the result lists
numberoffiles <- length(xml_files)
# Preallocate one slot per XML file for every field that will be extracted.
# The chained assignment gives each name its own list of length numberoffiles
# (R copies on modification, so the lists do not share state).
#pub_dates <- list()
dois <- titles <- abstracts <- servers <- word_counts <- figure_counts <-
  table_counts <- urls <- vector(mode = "list", length = numberoffiles)
# Return the text of the node matched by `xpath` in `doc`, or the placeholder
# string "null" unless the query yields exactly one character value.
#   doc   - parsed XML document (as returned by xmlParse)
#   xpath - XPath expression to evaluate
#   label - optional field name; when given, a message is emitted on failure
#   index - position of the file in xml_files, used only in that message
extract_single_value <- function(doc, xpath, label = NULL, index = NULL) {
  value <- xpathSApply(doc, xpath, xmlValue)
  # Require exactly one character result: zero matches or several matches
  # both fall back to the placeholder so every slot holds one string.
  if (is.character(value) && length(value) == 1L) {
    return(value)
  }
  if (!is.null(label)) {
    message(paste(index, "has no proper", label))
  }
  "null"
}

# Count whitespace-separated words in the first element of `text`.
# Returns 0 when `text` is empty (no matching node).
count_words <- function(text) {
  if (length(text) == 0) {
    return(0)
  }
  tokens <- strsplit(text[[1]], "\\s+")[[1]]
  # strsplit() yields a leading "" when the text starts with whitespace;
  # drop empty tokens so they are not counted as words.
  sum(nzchar(tokens))
}

# Loop through each XML file and extract the data
for (i in seq_along(xml_files)) {
  # Load the XML file
  xmlfile <- xmlParse(xml_files[i])

  # Extract the DOI, title, abstract, server and archive id.
  # A missing or ambiguous DOI is replaced silently (no message).
  doi <- extract_single_value(
    xmlfile,
    "//front/article-meta/article-id[@pub-id-type='doi']"
  )
  title <- extract_single_value(
    xmlfile,
    "//front/article-meta/title-group/article-title",
    label = "title", index = i
  )
  abstract <- extract_single_value(
    xmlfile,
    "//front/article-meta/abstract",
    label = "abstract", index = i
  )
  # NOTE(review): this XPath matches every journal-id element; a file with
  # more than one is recorded as "null" - confirm that is intended.
  server <- extract_single_value(
    xmlfile,
    "//front/journal-meta/journal-id",
    label = "server", index = i
  )
  pmcid <- extract_single_value(
    xmlfile,
    "//front/article-meta/article-id[@pub-id-type='archive']",
    label = "pmcid", index = i
  )
  # pub_date <- xpathSApply(xmlfile, "//front/article-meta/pub-date[@pub-type='epub']", function(x) {
  # paste(x$year, x$month, x$day, sep = "-")
  # })

  # Extract the main text and count the words (first <body> match only)
  main_text <- xpathSApply(xmlfile, "//body", xmlValue)
  word_count <- count_words(main_text)

  # Count the figure and table-wrap elements anywhere in the document
  figure_count <- length(xpathApply(xmlfile, "//fig", xmlAttrs))
  table_count <- length(xpathApply(xmlfile, "//table-wrap", xmlAttrs))

  # Store the extracted data in the preallocated lists
  dois[[i]] <- doi
  titles[[i]] <- title
  abstracts[[i]] <- abstract
  servers[[i]] <- server
  # pub_dates[[i]] <- pub_date
  word_counts[[i]] <- word_count
  figure_counts[[i]] <- figure_count
  table_counts[[i]] <- table_count
  # When the archive id is missing, the URL ends in the placeholder "null"
  urls[[i]] <- paste0("https://europepmc.org/article/PPR/", pmcid)
}
# Combine the extracted data into a data frame.
# Columns must be named with `=`: writing `DOI <- ...` inside data.frame()
# is an assignment expression, which leaves the column with a mangled
# deparsed name and creates a stray global variable.
data <- data.frame(
  DOI = unlist(dois),
  Title = unlist(titles),
  Abstract = unlist(abstracts),
  Server = unlist(servers),
  # PubDate = unlist(pub_dates),
  WordCount = unlist(word_counts),
  FigureCount = unlist(figure_counts),
  TableCount = unlist(table_counts),
  URL = unlist(urls),
  # Keep text columns as character on R versions before 4.0 as well
  stringsAsFactors = FALSE
)
# Write the data frame to a CSV file
write.csv(data, file = csv_path, row.names = FALSE)