-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_transformations.Rmd
73 lines (65 loc) · 2 KB
/
data_transformations.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
---
title: "data_transformations"
author: "Aaron Weimann"
date: "24/04/2020"
output: html_document
---
```{r setup, include=FALSE}
# Global chunk options: show the code of every chunk, but keep package
# messages and warnings out of the rendered document. TRUE/FALSE are
# spelled out because T and F are ordinary variables that can be reassigned.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
```
## Data transformations
### Mutate, select, filter
```{r}
library(readr)
library(dplyr)
library(tidyr)
# Read in the cases table.
cases <- read_csv("datasets/COVID19_cases.csv")

# Get the UK days with more than 10 cumulative cases, first as two
# separate filter steps ...
cases_uk <- filter(cases, Country == "United Kingdom")
cases_uk_10 <- filter(cases_uk, confirm > 10)

# ... and then with a single filter() call: comma-separated conditions
# are combined with AND, so the result is the same.
cases_uk_10 <- filter(cases, confirm > 10, Country == "United Kingdom")

# Keep only the relevant columns, naming each one explicitly.
cases_uk_10_red <- select(cases_uk_10, date, confirm, death, recov)

# More concise: select the same columns with a range. This starts from
# cases_uk_10 again (not the already reduced table) so that it is a real
# alternative to the call above.
# NOTE(review): assumes confirm, death and recov are adjacent columns in
# the cases table, so confirm:recov covers exactly these three - confirm.
cases_uk_10_red <- select(cases_uk_10, date, confirm:recov)

# Active cases = confirmed cases minus deaths and recoveries.
cases_uk_10_red_act <- mutate(
  cases_uk_10_red,
  active = confirm - death - recov
)
```
### Grouping and summarising
```{r}
# Read in the demographics table (parsed as tab-separated by read_tsv,
# despite the .csv file extension).
demographics <- read_tsv("datasets/COVID19_countries_data_red.csv")

# Mean GDP per capita over all rows. mean() has no `ignore.na` argument:
# it would be silently swallowed by `...` and any NA would still turn the
# result into NA. The argument that drops missing values is `na.rm`.
gdp_mean <- summarise(
  demographics,
  mean_gdp_capita = mean(gdp_capita_2018, na.rm = TRUE)
)

# Mean GDP per capita for every sub-region (the variable name is kept
# as is, although the grouping is by sub-region rather than by country).
# Missing values are dropped here as well, consistent with the overall mean.
gdp_mean_per_country <- demographics %>%
  group_by(`sub-region`) %>%
  summarise(mean_gdp_capita = mean(gdp_capita_2018, na.rm = TRUE))
```
### Pipes
```{r}
# The same steps as one pipeline: each verb receives the result of the
# previous step as its first argument, so no intermediate tables are needed.
cases_uk <- cases %>%
  # UK days with more than 10 confirmed cases
  filter(Country == "United Kingdom", confirm > 10) %>%
  # keep the date plus the columns from confirm through recov
  select(date, confirm:recov) %>%
  # active cases = confirmed cases minus deaths and recoveries
  mutate(active = confirm - death - recov)
```
### Pivoting
```{r}
# Reshape table4b from wide to long: the columns `1999` through `2000`
# are stacked, with the former column names stored in `year` and the
# cell values stored in `cases`.
table4b %>%
  pivot_longer(
    cols = `1999`:`2000`,
    names_to = "year",
    values_to = "cases"
  )
```
### Joins
```{r}
# Read in the cases table.
cases <- read_csv("datasets/COVID19_cases.csv")
# Read in the demographics table (parsed as tab-separated by read_tsv).
demographics <- read_tsv("datasets/COVID19_countries_data_red.csv")

# Pick a single row per country: the first row, in table order, that has
# more than 10 deaths.
# NOTE(review): presumably rows are ordered by date within each country,
# which would make this the first day above 10 deaths - confirm.
# ungroup() drops the per-country grouping so it does not leak into later
# operations on cases_country.
cases_country <- cases %>%
  group_by(Country) %>%
  filter(death > 10) %>%
  slice(1) %>%
  ungroup()

# Join with the demographics data; inner_join() keeps only the countries
# that appear in both tables.
cases_demo <- inner_join(demographics, cases_country, by = "Country")
```