-
Notifications
You must be signed in to change notification settings - Fork 0
/
eu_funding_analyser_XML.R
151 lines (133 loc) · 7.49 KB
/
eu_funding_analyser_XML.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# FP7 & H2020 parser
# libaries
library(tidyverse)
library(magrittr) # more pipes
library(lubridate) # date/time helpers
library(xml2) # xml parsing
library(beepr) # beep functions
library(parallel) # multicore processing
# settings
setwd('C:/Users/piete/Documents/git_repos/ERC_data')
# fp7_folder <- 'cordis-fp7projects-xml'
# h2020_folder <- 'cordis-h2020projects-xml'
# read reference data
# setwd('C:/Users/piete/Documents/git_repos/ERC_data/data/ref_data')
# # matchen van landen gaat goed op 2 cases na
# # Chech Republic (CZ) staat als Czechia in de EU tabel.
# # Macau staat als Macao in de EU tabel (mag op beide manieren worden gespeld)
# # corrigeren qs data, qs data als geheel inlezen en correcties in R doen, zo blijven de wijzigingen bij elkaar
# # even wachten op antwoord AdR en RvdB
# countries_eu <- read.csv2('cordisref-countries.csv', fileEncoding = 'UTF-8-BOM') %>% filter(language == 'en')
# functions
# process XML function
# ADD: error handling, the enrire process can fail on a single worker error
# import_xml <- function(handle, folder) {
# parsed_data <- list()
# xml_content <- read_xml(file.path('.', folder, handle)) %>% xml_ns_strip()
# rcn <- xml_find_first(xml_content, '/project/rcn') %>% xml_text()
# acro <- xml_find_first(xml_content, '/project/acronym') %>% xml_text()
# title <- xml_find_first(xml_content, '/project/title') %>% xml_text()
# total_cost <- xml_find_first(xml_content, '/project/totalCost') %>% xml_text()
# start_date <- xml_find_first(xml_content, '/project/startDate') %>% xml_text()
# end_date <- xml_find_first(xml_content, '/project/endDate') %>% xml_text()
# call_id <- xml_find_first(xml_content, '/project/relations/associations/call/identifier') %>% xml_text()
# associations <- xml_find_all(xml_content, '/project/relations/associations/organization')
# for (assoc in associations) {
# part_rcn <- xml_find_first(assoc, 'rcn') %>% xml_text()
# part_legal_name <- xml_find_first(assoc, 'legalName') %>% xml_text()
# part_country <- xml_find_first(assoc, 'address/country') %>% xml_text()
# part_contr = xml_attr(assoc, 'ecContribution')
# part_contr = xml_attr(assoc, 'terminated')
# if (xml_attr(assoc, 'type') == 'coordinator') {
# part_is_coord <- 'Y'
# } else {
# part_is_coord <- 'N'
# }
# parsed_assoc <- list(handle, rcn, acro, title, call_id, total_cost, start_date, end_date,
# part_rcn, part_legal_name, part_country, part_contr, part_is_coord)
# parsed_data[[length(parsed_data)+1]] <- parsed_assoc
# }
# return(parsed_data)
# }
# file handles
# fp7_filehandles <- list.files(file.path('.', fp7_folder)) %>% as.list()
# # fp7_filehandles <- list.files(file.path('.', fp7_folder))[1:1000] %>% as.list()
# h2020_filehandles <- list.files(file.path('.', h2020_folder)) %>% as.list()
# # h2020_filehandles <- list.files(file.path('.', h2020_folder))[1:1000] %>% as.list()
# import singlecore for testing
# fp7_filehandles <- list.files(file.path('.', fp7_folder))[1:50] %>% as.list()
# h2020_filehandles <- list.files(file.path('.', h2020_folder))[1:50] %>% as.list()
# fp7_parsed <- lapply(fp7_filehandles, import_xml, folder = fp7_folder)
# h2020_parsed <- lapply(h2020_filehandles, import_XML, folder = h2020_folder)
# import multicore
# to fix: generalise
# clust <- makeCluster(7)
# clusterEvalQ(clust, {library(xml2); library(magrittr)})
# clusterExport(clust, varlist = c('fp7_filehandles', 'fp7_folder'))
# parsed_fp7 <- parLapply(clust, fp7_filehandles, import_xml, folder = fp7_folder)
# stopCluster(clust)
# clust <- makeCluster(7)
# clusterEvalQ(clust, {library(xml2); library(magrittr)})
# clusterExport(clust, varlist = c('h2020_filehandles', 'h2020_folder'))
# parsed_h2020 <- parLapply(clust, h2020_filehandles, import_xml, folder = h2020_folder)
# stopCluster(clust)
# create dataframes
# df_fp7 <- parsed_fp7 %>% flatten() %>% transpose()
# names(df_fp7) <- c('handle', 'rcn', 'acro', 'title', 'call_id', 'total_cost', 'start_date', 'end_date',
# 'part_rcn', 'part_legal_name', 'part_country', 'part_contr', 'part_is_coord')
# df_fp7 <- df_fp7 %>% map(unlist) %>% as_tibble()
#
# df_h2020 <- parsed_h2020 %>% flatten() %>% transpose()
# names(df_h2020) <- c('handle', 'rcn', 'acro', 'title', 'call_id', 'total_cost', 'start_date', 'end_date',
# 'part_rcn', 'part_legal_name', 'part_country', 'part_contr', 'part_is_coord')
# df_h2020 <- df_h2020 %>% map(unlist) %>% as_tibble()
# export files
# write_csv2(df_fp7, 'fp7.csv')
# write_csv2(df_h2020, 'h2020.csv')
# save_rds(df_fp7, 'fp7.rds')
# save_rds(df_h2020, 'h2020.rds')
# ANALYSIS
# ideas
# Analysis from a NL perspective (vs other counties) and from a EUR perspective (vs other NL universities)
# EU Open Data has additional lookup tables, country etc.
# Relative succes per country, number op projects (as coordinator?) divided by the number of universities which supply 80% of the projects
# Succes within NL (relative succes maybe)
# Do coordinators also make the most money?
# Network analysis
# Maybe necessary to focus analysis on specific EU progerammes, facet on programmes for now
# Use a spatial map to visualise ther EUR network
# Use a spatial map to visualise the Dutch network
# read and finetune data
fp7 <- read_rds('fp7.rds')
h2020 <- read_rds('h2020.rds')
erc_all <- bind_rows('fp7' = fp7, 'h2020' = h2020, .id = 'source')
# write_csv2(erc_all, 'erc_all.csv')
erc_all$source %<>% as_factor()
erc_all$total_cost %<>% as.numeric()
erc_all$start_date %<>% as.Date()
erc_all$end_date %<>% as.Date()
erc_all$part_country %<>% as_factor()
erc_all$part_contr %<>% as.numeric() # there are quite a lot NAs and 0 in this variable, look into some projects
erc_all$part_is_coord %<>% as_factor()
# rcn eur: 1905738
erc_all %>% filter(str_detect(part_legal_name, 'ERASMUS')) %>% count(part_rcn, part_legal_name)
# some quick counts
erc_all %>% filter(part_rcn == 1905738) %>% tally()
erc_all %>% filter(part_rcn == 1905738) %>% select(rcn) %>% left_join(erc_all) %>% tally()
# some explorations
# do all projects have only 1 coordinator?
erc_all %>% filter(part_is_coord == 'Y') %>% count(rcn) %>% filter(n > 1) %>% arrange(desc(n))
# no, there are quite some projects with 2 coordinators, in a sample of these cases it turns out there are multiple
# legal entities of the same real organisation involved (different part_rcn). Only one of these legal entities has part_cont > 0
# the other legal entities have part_contr == 0
# a visual inspection of projects with part_contr == o and part_is_coord == Y is wise
check_no_part_contr_but_coord <- erc_all %>% filter(part_contr == 0 & part_is_coord == 'Y') %>% select(rcn) %>% distinct() %>%
inner_join(erc_all)
# however not all legal entities which have part_contr == 0 are a duplicate coordinator
# so what's up with the other part_contr == 0?
# there are also quite a lot of entries with part_contr NA, what is going on here?
# could there be a subset of projects for which no participant receives any money? What are these projects?
erc_all %>% filter(is.na(part_contr)) %>% select(rcn) %>% distinct() %>% inner_join(erc_all) %>% count(part_contr)
# no, there are also projects in which one partners has a NA part_contr and the other partners do have a part_contr
# in these cases the partner is listed as Terminated, they probably have left the projects and have returned the part_contr
# add the Terminated attribute to the dataset in the next iteration (DONE) and investigate further