-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunction_extract.R
296 lines (274 loc) · 12 KB
/
function_extract.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
# Functions for the extraction of the refined datasets
library(reshape2) # dcast
library(dplyr) # %>%
source('Filters/functionsFullMatrix.R') # fullness
# source(paste(path,"Filters/functionsFullMatrix.R",sep = "/"))
# 01 getIndicators ----
# getIndicators filter from 'Indicators' the observations which have:
# 1) years in myYear
# 2) countries in myCnt [CountryName]
# 3) indicators in myInd [IndicatorName]
# 4) topics in myTopic
# 5) regions in myRegion (REM: aggregate countries have region = '',
# to pick only them set myRegion = '',
# to exclude them set agg to FALSE)
# giving as an output the rows of 'Indicators' that satisfy every active criterion
# (same structure as Indicators: the columns are returned unchanged)
getIndicators <- function(myYear = NULL, myCnt = NULL, myInd = NULL, myTopic = NULL, myRegion = NULL,
                          agg = TRUE,
                          ind = Indicators, ser = Series, count = Country) {
  # Filter an Indicators-like data frame by indicator, year, country, topic and region.
  # A criterion left to NULL is not applied; the filters are applied in the order
  # indicators -> aggregates -> years -> countries -> topics -> regions.
  # -- INPUT:
  #    - myYear   : years to keep, matched against ind$Year                [vector of int]
  #    - myCnt    : country names to keep, matched against count$CountryName [vector of strings]
  #    - myInd    : indicator names to keep, matched against ind$IndicatorName [vector of strings]
  #    - myTopic  : topics to keep, matched against ser$Topic              [vector of strings]
  #    - myRegion : regions to keep, matched against count$Region          [vector of strings]
  #    - agg      : set to FALSE to drop the countries whose Region is '' in count
  #    - ind      : "Indicators" dataframe (defaults to the global Indicators)
  #    - ser      : "Series" dataframe     (defaults to the global Series)
  #    - count    : "Country" dataframe    (defaults to the global Country)
  # -- OUTPUT:
  #    - the rows of ind that satisfy every active criterion
  # -- ERRORS:
  #    - stops if an element of myInd, myYear or myCnt is not found
  # -- USES:
  #    - %>%, filter (dplyr)

  # Indicators
  if (!is.null(myInd)) { # IndicatorName is an activated criterion
    if (!all(myInd %in% ind$IndicatorName)) {
      print('Indicators in myInd not in Indicators$IndicatorName :')
      print(setdiff(myInd, ind$IndicatorName))
      stop("ERROR: at least one indicator in input is not present in the dataframe")
    }
    ind <- ind %>% filter(IndicatorName %in% myInd)
  }

  # Out the aggregate (rows of count with an empty Region)
  if (!agg) {
    my.agg.code <- filter(count, Region == '')$CountryCode
    ind <- ind %>% filter(!(CountryCode %in% my.agg.code))
  }

  # Years (validated against the rows that survived the previous filters)
  if (!is.null(myYear)) { # Year is an activated criterion
    if (!all(myYear %in% ind$Year)) {
      stop("ERROR: at least one year is not present in the dataframe")
    }
    ind <- ind %>% filter(Year %in% myYear)
  }

  # Countries (names are translated to codes through count)
  if (!is.null(myCnt)) { # CountryName is an activated criterion
    if (!all(myCnt %in% count$CountryName)) {
      print('Countries in myCnt not in Country$CountryName :')
      print(setdiff(myCnt, count$CountryName))
      stop("Error: at least one country is not present in the dataframe Countries")
    }
    my.country.name <- count %>% filter(CountryName %in% myCnt)
    ind <- ind %>% filter(CountryCode %in% my.country.name$CountryCode)
  }

  # Topics (a topic selects the series codes listed for it in ser)
  if (!is.null(myTopic)) { # Topic is an activated criterion
    my.idx.code <- ser %>% filter(Topic %in% myTopic)
    ind <- ind %>% filter(IndicatorCode %in% my.idx.code$SeriesCode)
  }

  # Region
  if (!is.null(myRegion)) { # Region is an activated criterion
    my.country.code <- count %>% filter(Region %in% myRegion)
    ind <- ind %>% filter(CountryCode %in% my.country.code$CountryCode)
  }

  return(ind)
}
# 02.1 getCntInd ----
# getCntInd extract for a given year ('year') from a dataframe like Indicators a dataframe with
# rows : countries
# cols : indicators
# if dropNA =T drop the countries with at least one NA
# if showCnt=T print the countries filtered out
getCntInd <- function(df, year, dropNA = T, showCnt = T){
  # Build the 'countries x indicators' table of an Indicators-like dataframe for one year.
  # -- INPUT:
  #    - df      : dataframe Indicators-like
  #    - year    : fixed year
  #    - dropNA  : TRUE for dropping the countries with a NA value
  #    - showCnt : TRUE for printing the countries filtered out (only used when dropNA is TRUE)
  # -- OUTPUT:
  #    - dataframe with the countries as row names and one column per indicator
  # -- USES:
  #    - filter, select
  #    - dcast
  yearSlice <- filter(df, Year==year)                                         # fix the year
  wide <- dcast(yearSlice, CountryName ~ IndicatorName, value.var = "Value") # reshape
  row.names(wide) <- wide$CountryName                                        # countries become the row names
  wide <- select(wide, -CountryName)

  # Nothing to drop: hand back the full table
  if (!dropNA) {
    return(wide)
  }

  complete <- na.omit(wide) # keep only the countries without missing values
  if (showCnt) {
    # countries present before the NA removal but not after it
    dropped <- setdiff(row.names(wide), row.names(complete))
    if (length(dropped) != 0) {
      print(paste(length(dropped),'Countries out:'))
      print(dropped)
    } else {
      print('No countries has been filtered out')
    }
  }
  return(complete)
}
# 02.2 getIndYear ----
# getIndYear extract for a given country ('cnt') from a dataframe like Indicators a dataframe with
# rows : indicators
# cols : years
# if dropNA =T drop the indicators with at least one NA
# if showInd=T print the indicators filtered out
getIndYear <- function(df, cnt, dropNA = T, showInd = T){
  # Build the 'indicators x years' table of an Indicators-like dataframe for one country.
  # -- INPUT:
  #    - df      : dataframe Indicators-like
  #    - cnt     : fixed country [CountryName]
  #    - dropNA  : TRUE for dropping the indicators with a NA value
  #    - showInd : TRUE for printing the indicators filtered out (only used when dropNA is TRUE)
  # -- OUTPUT:
  #    - dataframe with the indicators as row names and one column per year
  # -- USES:
  #    - filter, select
  #    - dcast
  cntSlice <- filter(df, CountryName==cnt)                           # fix the country
  wide <- dcast(cntSlice, IndicatorName ~ Year, value.var = "Value") # reshape
  row.names(wide) <- wide$IndicatorName                              # indicators become the row names
  wide <- select(wide, -IndicatorName)

  # Nothing to drop: hand back the full table
  if (!dropNA) {
    return(wide)
  }

  complete <- na.omit(wide) # keep only the indicators without missing values
  if (showInd) {
    # indicators present before the NA removal but not after it
    dropped <- setdiff(row.names(wide), row.names(complete))
    if (length(dropped) != 0) {
      print(paste(length(dropped),'Indicators out:'))
      print(dropped)
    } else {
      print('No indicators has been filtered out')
    }
  }
  return(complete)
}
# 02.3 getYearInd ----
# getYearInd extract for a given country ('cnt') from a dataframe like Indicators a dataframe with
# rows : years
# cols : indicators
# if dropNA =T drop the years with at least one NA
# if showY=T print the years filtered out
getYearInd <- function(df, cnt, dropNA = TRUE, showY = TRUE) {
  # Build the 'years x indicators' table of an Indicators-like dataframe for one country.
  # -- INPUT:
  #    - df     : dataframe Indicators-like
  #    - cnt    : fixed country [CountryName]
  #    - dropNA : TRUE for dropping the years with a NA value
  #    - showY  : TRUE for printing the years filtered out (only used when dropNA is TRUE)
  # -- OUTPUT:
  #    - dc : dataframe with the years as row names and one column per indicator
  # -- USES:
  #    - %>%, filter, select
  #    - dcast
  df <- filter(df, CountryName == cnt)                          # fix the country
  dc <- dcast(df, Year ~ IndicatorName, value.var = "Value")    # reshape
  row.names(dc) <- dc$Year                                      # set the years as row names
  dc <- select(dc, -Year)
  dcAll <- dc                                                   # kept to list what gets dropped
  if (dropNA) {                                                 # drop the NA
    dc <- na.omit(dc)
    if (showY) {                                                # show the filtered out
      yIn <- row.names(dc)
      yOut <- setdiff(row.names(dcAll), yIn)
      # The rows of this table are years, so the report talks about years
      if (length(yOut) != 0) {
        print(paste(length(yOut), 'Years out:'))
        print(yOut)
      } else {
        print('No years have been filtered out')
      }
    }
  }
  return(dc)
}
# 02.4 getCntYear ----
# getCntYear extract for a given indicator ('ind') from a dataframe like Indicators a dataframe with
# rows : countries
# cols : years
# if dropNA =T drop the countries with at least one NA
# if showCnt=T print the countries filtered out
getCntYear <- function(df, ind, dropNA = T, showCnt = T){
  # Build the 'countries x years' table of an Indicators-like dataframe for one indicator.
  # -- INPUT:
  #    - df      : dataframe Indicators-like
  #    - ind     : fixed indicator [IndicatorName]
  #    - dropNA  : TRUE for dropping the countries with a NA value
  #    - showCnt : TRUE for printing the countries filtered out (only used when dropNA is TRUE)
  # -- OUTPUT:
  #    - dataframe with the countries as row names and one column per year
  # -- USES:
  #    - filter, select
  #    - dcast
  indSlice <- filter(df, IndicatorName==ind)                       # fix the indicator
  wide <- dcast(indSlice, CountryName ~ Year, value.var = "Value") # reshape
  row.names(wide) <- wide$CountryName                              # countries become the row names
  wide <- select(wide, -CountryName)

  # Nothing to drop: hand back the full table
  if (!dropNA) {
    return(wide)
  }

  complete <- na.omit(wide) # keep only the countries without missing values
  if (showCnt) {
    # countries present before the NA removal but not after it
    dropped <- setdiff(row.names(wide), row.names(complete))
    if (length(dropped) != 0) {
      print(paste(length(dropped),'Countries out:'))
      print(dropped)
    } else {
      print('No countries has been filtered out')
    }
  }
  return(complete)
}
# 03 unifCnt ----
# unifCnt for a dataframe Indicators-like performs an intersection of the countries over years
# so that for each year you will have the same Indicators and the same countries
# (the 3D matrix is full)
# if showCnt=T print the countries filtered out
# if showInd=T print the number of missing values for each indicators
# [use it to figure out if there are some problematic (over time) indicators]
unifCnt <- function(df, showCnt = TRUE, showInd = TRUE) {
  # Keep only the countries that have no missing value in every year of df,
  # so that every year ends up with the same set of countries.
  # -- INPUT:
  #    - df      : dataframe Indicators-like
  #    - showCnt : TRUE for printing the countries filtered out
  #    - showInd : TRUE for showing the number of missing values for each indicator (View)
  # -- OUTPUT:
  #    - df without the countries outside the intersection
  # -- USES:
  #    - %>%, filter
  #    - dcast
  multDc <- dcast(df, CountryName + Year ~ IndicatorName, value.var = "Value") # reshape df

  if (showInd) { # show the number of missing values for each indicator
    # Columns 1 and 2 are the id columns (CountryName, Year). drop = FALSE keeps a
    # data frame even with a single indicator, so the count stays per indicator.
    indCols <- multDc[, -c(1, 2), drop = FALSE]
    numNa <- vapply(indCols, function(x) sum(is.na(x)), integer(1))
    df_numNa <- data.frame(Value = numNa, row.names = names(numNa)) # dataframe for the View
    colnames(df_numNa) <- 'Number of missing values'                # reset the col name
    View(df_numNa)
  }

  years <- unique(df$Year)
  cntAll <- unique(df$CountryName) # all the cnt
  cnt <- cntAll
  for (y in years) {
    dc <- filter(multDc, Year == y)       # only the observations in year y
    dc <- na.omit(dc)                     # drop the cnt with a missing value
    cnt <- intersect(cnt, dc$CountryName) # countries complete in every year seen so far
  }

  if (showCnt) { # show the filtered out
    cntOut <- setdiff(cntAll, cnt)
    if (length(cntOut) != 0) {
      print(paste(length(cntOut), 'Countries out:'))
      print(cntOut)
    } else {
      print('No countries has been filtered out')
    }
  }
  return(filter(df, CountryName %in% cnt))
}
# 04 get3D ----
# get3D
# get3D from a dataframe Indicators-like creates a list of 2D dataframes 'cnt vs ind'
# the i-th element of the list correspond to the i-th year in 'years'
# REM: I need to have 'years' as parameter so I can get the correspondence with the positions
# in the list
get3D <- function(df,years){
  # Represent the 3D matrix as a list of 'cnt vs ind' dataframes, one per requested year.
  # -- INPUT:
  #    - df    : dataframe Indicators-like
  #    - years : years to put in the list [vector of int]
  # -- OUTPUT:
  #    - list whose i-th element is the 'cnt vs ind' dataframe of the i-th year in years
  # -- ERRORS:
  #    - stops if a requested year is not in df$Year
  #    - warns if fullness(df) is different from 1
  # -- USES:
  #    - fullness
  #    - getCntInd
  yearKnown <- years %in% df$Year
  if (!all(yearKnown)) {
    stop("ERROR: at least one year is not present in the dataframe")
  }
  if (fullness(df) != 1) {
    warning("WARNING: the 3D matrix is not full")
  }

  slices <- list()
  for (y in years) {
    # append the table of year y, without dropping or reporting anything
    slices[[length(slices) + 1]] <- getCntInd(df, y, dropNA = FALSE, showCnt = FALSE)
  }
  return(slices)
}