-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtemp_waw_eda.Rmd
237 lines (214 loc) · 9.92 KB
/
temp_waw_eda.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
---
title: "Temperature changes in Warsaw 2004-2019"
output: github_document
---
```{r setup, include=FALSE, }
knitr::opts_chunk$set(echo = FALSE, fig.height=7.5, fig.width=10,
fig.align = 'center')
```
```{r message=FALSE}
# Load packages
library(data.table)
library(dplyr)
library(naniar)
library(lubridate)
library(ggplot2)
devtools::load_all()
```
# Load data
Data was downloaded manually from <https://dane.imgw.pl/> and I stored them in
'data-raw' folder. Here are the dimensions of obtained data frame:
```{r message=FALSE}
# specify location
path <- './data-raw/'
# function for batch .csv upload
do.call_rbind_fread <- function(path, pattern = "*.csv") {
files = list.files(path, pattern, full.names = TRUE)
do.call(rbind, lapply(files, function(x) fread(x, stringsAsFactors = FALSE)))
}
x <- do.call_rbind_fread(path)
dim(x)
```
The data set contained over 140000 rows and 107 columns. Unfortunately tables did not contain column names, so I needed to enter them manually ( I will skip that ugly part).
```{r message=FALSE, include=FALSE}
colnames <- c('Kod stacji','Nazwa stacji', 'Rok', 'Miesiac', 'Dzien', 'Godzina', 'Wysokosc podstawy chmur CL CM szyfrowana','Status pomiaru HPOD', 'Wysokosc podstawy nizszej', 'Status pomiaru HPON', 'Wysokosc podstawy wyzszej', 'Status pomiaru HPOW', 'Wysokosc podstawy tekstowy', 'Pomiar przyrzadem 1 (niższa)', 'Pomiar przyrzadem 2 (wyższa)', 'Widzialnosc', 'Status pomiaru WID', 'Widzialnosc operatora', 'Status pomiaru WIDO','Widzialnosc automat', 'Status pomiaru WIDA', 'Zachmurzenie ogólne', 'Status pomiaru NOG','Kierunek wiatru', 'Status pomiaru KRWR', 'Prędkosc wiatru', 'Status pomiaru FWR ','Poryw wiatru ', 'Status pomiaru PORW ', 'Temperatura powietrza', 'Status pomiaru TEMP','Temperatura termometru zwilżonego', 'Status pomiaru TTZW', 'Wskaznik wentylacji','Wskaznik lodu', 'Cisnienie pary wodnej', 'Status pomiaru CPW', 'Wilgotnosc względna','Status pomiaru WLGW ', 'Temperatura punktu rosy', 'Status pomiaru TPTR', 'Cisnienie na pozimie stacji','Status pomiaru PPPS ', 'Cisnienie na pozimie morza', 'Status pomiaru PPPM', 'Charakterystyka tendencji', 'Wartosc tendencji',
'Status pomiaru APP', 'Opad za 6 godzin', 'Status pomiaru WO6G', 'Rodzaj opadu za 6 godzin',
'Status pomiaru ROPT', 'Pogoda biezaca', 'Pogoda ubiegla', 'Zachmurzenie niskie', 'Status pomiaru CLCM',
'Chmury CL', 'Status pomiaru CHCL', 'Chmury CL tekstem', 'Chmury CM', 'Status pomiaru CHCM',
'Chmury CM tekstem', 'Chmury CH [kod]', 'Status pomiaru CHCH', 'Chmury CH tekstem', 'Stan gruntu',
'Status pomiaru SGRN', 'Niedosyt wilgotnosci', 'Status pomiaru DEFI', 'Usłonecznienie', 'Status pomiaru USLN',
'Wystapienie rosy', 'Status pomiaru ROSW', 'Poryw maksymalny za okres WW', 'Status pomiaru PORK',
'Godzina wystapienia porywu', 'Minuta wystapienia porywu', 'Temperatura gruntu -5', 'Status pomiaru TG05',
'Temperatura gruntu -10', 'Status pomiaru TG10','Temperatura gruntu -20', 'Status pomiaru TG20',
'Temperatura gruntu -50', 'Status pomiaru TG50', 'Temperatura gruntu -100', 'Status pomiaru TG100',
'Temperatura minimalna za 12 godzin', 'Status pomiaru TMIN ', 'Temperatura maksymalna za 12 godzin', 'Status pomiaru TMAX ',
'Temperatura minimalna przy gruncie za 12 godzin', 'Status pomiaru TGMI',
'Równoważnik wodny sniegu', 'Status pomiaru RWSN', 'Wysokosć pokrywy snieżnej', 'Status pomiaru PKSN',
'Wysokosć swieżo spadłego sniegu', 'Status pomiaru HSS', 'Wysokosć sniegu na poletku', 'Status pomiaru GRSN',
'Gatunek sniegu', 'Ukształtowanie pokrywy', 'Wysokosć próbki', 'Status pomiaru HPRO',
'Ciężar próbki', 'Status pomiaru CIPR')
colnames(x) <- colnames
```
This is all about temperature so let's select the right columns and translate
them to English.
```{r message=FALSE}
temp <- x %>% select('Rok', 'Miesiac', 'Dzien', 'Godzina', 'Temperatura powietrza')
colnames(temp) <- c("year", "month", "day", "hour", "air_temp")
```
# Explore data
## Missing values
naniar package is nice for missing data visualization.
```{r echo=FALSE, fig.height=3.5, fig.width=5, eval=FALSE}
naniar::vis_miss(temp)
```
And this is great as we have a complete data set! Let's explore it further.
## Temperature values distribution
```{r echo=FALSE}
temp %>%
ggplot(aes(x=air_temp, fill = as.factor(year)))+
geom_density(alpha = .3)+
facet_wrap(~year)+
labs(fill = "year")+
theme_light()
```
That's quite interesting. We see a bimodal distribution across most of the years (summer and winter? what about spring and autumn? do we still have 4 seasons?). 2012 looks like it had the most moderate temperatures as it is flat at the top. Anyway, let's split it by seasons.
Let's assume that winter is 21-Dec - 20 Mar, spring 21-Mar - 20 Jun, summer 21-Jun - 20-Sept, Autumn 21-Sept - 20-Dec
```{r fig.height=7.5, fig.width=10}
temp_season <- temp %>%
mutate(
month = as.integer(month),
day = as.integer(day),
season = case_when(
month == 12 & day >= 21 ~ "winter",
month %in% c(1,2) ~ "winter",
month == 3 & day < 21~ "winter",
month == 3 & day >= 21 ~ "spring",
month %in% c(4,5) ~ "spring",
month == 6 & day < 21~ "spring",
month == 6 & day >= 21 ~ "summer",
month %in% c(7,8) ~ "summer",
month == 9 & day < 21~ "summer",
month == 9 & day >= 21 ~ "autumn",
month %in% c(10,11) ~ "autumn",
month == 12 & day < 21~ "autumn",
TRUE ~ NA_character_),
season = factor(season, ordered = TRUE, levels = c("spring", "summer", "autumn", "winter"))
)
temp_season_median <- temp_season %>%
group_by(season, year) %>%
summarise(median = median(air_temp))
temp_season %>%
ggplot(aes(x=air_temp, color = as.factor(year)))+
geom_density()+
facet_wrap(~season, ncol = 1)+
labs(color = "year")+
theme_light()
```
It looks like summer is the most stable season. There is a lot of variation in other seasons across the years. How it will look like when we split it by year:
```{r echo=FALSE, fig.height=7.5, fig.width=10}
temp_season %>%
ggplot(aes(x=air_temp, fill = as.factor(season)))+
geom_density(alpha = .3)+
geom_vline(data = temp_season_median,
aes(xintercept = median),
color = 'red')+
facet_grid(year~season, scales = "free_x")+
labs(fill = "year")+
theme_light()
```
Red vertical line indicates the median values per season of each year.
## Trend analysis (seasons)
It's difficult to see the trend on the charts above. Let's plot median values for each season each year:
```{r echo=FALSE}
temp_season_median %>%
ggplot(aes(x=year, y = median, color = season))+
geom_jitter(shape = 21)+
geom_smooth(method = "lm")+
labs(y = "median air_temp")+
theme_light()
```
Yep, the median temperature of all the seasons generally increases in time. Here is the plot with all data points for reference and a boxplot:
```{r echo=FALSE, fig.height=6.5, fig.width=11, fig.align="left"}
p1 <- temp_season %>%
ggplot(aes(x=year, y = air_temp, fill = season))+
geom_jitter(shape = 21, alpha = .1, color = "black")+
geom_smooth(aes(color = season), method = "lm")+
theme_light()+
guides(fill = guide_legend(ncol = 2))
p2 <- temp_season %>%
ggplot(aes(x=as.factor(season), y = air_temp))+
geom_boxplot(aes(fill = as.factor(year)), outlier.shape = 21, outlier.alpha = .1)+
theme_light()+
#facet_wrap(~season, ncol = 4)+
labs(y = "air_temp", x = "season", fill = "year")+
theme(legend.position = "right")+
guides(fill = guide_legend(ncol = 2))
weather::multiplot(p1, p2, cols=1)
```
## Compare trends (seasons)
The way to check which season temperature is increasing faster is to compare betas (slope angle) of linear model. Here are the betas for each season:
```{r echo=FALSE, fig.height=3.5, fig.width=5, message=FALSE}
seasons <- sort(unique(temp_season$season))
trends <- c()
for(s in seq_along(seasons)){
#print(seasons[s])
x <- temp_season %>%
filter(season == seasons[s])
#print(head(x))
mod_lm <- lm(air_temp ~ year, data = x)
#print(summary(mod_lm))
#print('###########################')
slope <- mod_lm$coefficients[which(names(mod_lm$coefficients) == "year")]
#print(slope)
trends[s] <- slope
}
#trends
temp_season %>%
select(season) %>%
distinct() %>%
arrange(season) %>%
mutate(trend = trends) %>%
arrange(desc(trends)) %>%
mutate(season = factor(season, ordered = TRUE, levels = season)) %>%
ggplot(aes(x = season, y = trend))+
geom_col(color = "black")+
geom_text(aes(y = trend + 0.01, label = round(trend, 3)))+
labs(title = "Values of betas of fitted linear model per season")+
theme_light()
```
Winter and spring are warming at the fastest pace. Around 0.16 and 0.15 degrees of Celcius per year. Autumn is the most stable but still warming.
## By months
Let's check how it looks like when we look at the data splitted by months. For me it was quite surprising that during winter months sometimes overlap with temperatures in June, July or August. Maybe those low tempereatures in summere were temperatures during nights?
```{r, by month }
temp_season %>%
ggplot(aes(x=month, y=air_temp)) +
geom_jitter(aes(fill=season), shape=21, color = "black", alpha=.6)+
geom_smooth(color = "darkred")+
theme_light()
```
```{r, by month }
temp_daytime <- temp_season %>%
mutate(
day_time = case_when(
hour %in% c(6:12) ~ "morning",
hour %in% c(13:21) ~ "afternoon",
TRUE ~ "night"),
day_time = as.factor(day_time)
)
temp_daytime %>%
filter(season %in% c("summer", "winter")) %>%
ggplot(aes(x=as.factor(month), y=air_temp)) +
geom_jitter(aes(fill=day_time), shape=21, color = "black", alpha=.6)+
geom_smooth(color = "darkred")+
theme_light()
```
```{r}
temp_daytime %>%
filter(season %in% c("summer", "winter")) %>%
ggplot(aes(x=as.factor(day_time), y=air_temp)) +
geom_jitter(aes(fill=season), shape=21, color = "black", alpha=.6)+
geom_smooth(color = "darkred")+
geom_hline(yintercept = max(temp_daytime$air_temp[which(temp_daytime$season == "winter")]))+
geom_hline(yintercept = min(temp_daytime$air_temp[which(temp_daytime$season == "summer")]))+
theme_light()
```