title | author | date | output | ||||||
---|---|---|---|---|---|---|---|---|---|
| Data from a personal activity monitoring device
| Report
|
Idriss .S |
2/6/2022 |
|
knitr::opts_chunk$set(echo = TRUE)
library(grid)
library(ggplot2)
library(dplyr)
##
## Attachement du package : 'dplyr'
## Les objets suivants sont masqués depuis 'package:stats':
##
## filter, lag
## Les objets suivants sont masqués depuis 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
library(patchwork)
library(lubridate)
##
## Attachement du package : 'lubridate'
## Les objets suivants sont masqués depuis 'package:base':
##
## date, intersect, setdiff, union
1. Unzip file, open and store csv file into the object my_data
# Extract the archive into the working directory, then read the CSV file.
unzip("activity.zip")
my_data <- read.csv("activity.csv")
- The object my_data contains three variables :
names(my_data)
## [1] "steps" "date" "interval"
1. Define the variable date as factor for my_data
my_data$date <- as.factor(my_data$date)
- How many levels do we have now ?
nlevels(my_data$date)
## [1] 61
There are 61 days in my_data.
- Calculation of the sum of steps per day and store it in my_sum
# Total number of steps per day. na.action = NULL keeps the rows with NA so
# that sum() discards them itself through na.rm = TRUE.
my_sum <- aggregate(
  steps ~ date,
  data = my_data,
  FUN = sum,
  na.rm = TRUE,
  na.action = NULL
)
- Function for plots
# Start a ggplot from `my_data` with the look shared by the plots of this
# report: theme_linedraw() plus custom title, axis and legend text styling,
# and the legend placed at the bottom.
# No layer is added here; callers append their own geoms with `+`.
my_plot_fun <- function(my_data) {
  report_theme <- theme(
    plot.title = element_text(color = "red", size = 18, face = "bold.italic", hjust = 0.5),
    axis.title.x = element_text(color = "#993333", size = 18, face = "bold"),
    axis.text.x = element_text(color = "#993333", size = 12, angle = 90, vjust = 0.5, hjust = 1),
    axis.title.y = element_text(color = "darkgreen", size = 18, face = "bold"),
    axis.text.y = element_text(face = "bold", color = "darkgreen", size = 16),
    legend.text = element_text(size = 12),
    legend.position = "bottom"
  )
  ggplot(my_data) + theme_linedraw() + report_theme
}
- Plot my_sum into a histogram
# One bar per day, its height being the total number of steps of that day.
my_hist <- my_plot_fun(my_sum) +
  geom_bar(
    mapping = aes(x = date, y = steps, fill = date),
    stat = "identity",
    color = "black",
    alpha = 0.5
  ) +
  labs(x = "Day", y = "Steps")
print(my_hist)
- In this part, the mean and median are related to the number of steps across all 61 days.
# Mean of the daily totals, printed on the line below its label.
my_means <- mean(my_sum[["steps"]])
writeLines(c("The mean number of steps is :", round(my_means, digits = 1)))
## The mean number of steps is :
## 9354.2
# Median of the daily totals, printed the same way.
my_median <- median(my_sum[["steps"]])
writeLines(c("The median number of steps is :", my_median))
## The median number of steps is :
## 10395
2. A second case is to remove the days with fewer than 500 steps before computing the mean and the median.
# Keep only the days whose total is above 499 steps, then redo the statistics.
my_sub_sum <- filter(my_sum, steps > 499)
my_means2 <- mean(my_sub_sum[["steps"]])
writeLines(c("In the second case, the mean number of steps is :", round(my_means2, digits = 1)))
## In the second case, the mean number of steps is :
## 11185.1
my_median2 <- median(my_sub_sum[["steps"]])
writeLines(c("In the second case, the median number of steps is :", my_median2))
## In the second case, the median number of steps is :
## 11015
- Define the variable interval as factor for my_data and create a new object my_interv_means which contains the mean across the 61 days for each interval.
# Treat the interval as a grouping factor and average the steps per interval.
# na.action = NULL keeps the NA rows so that mean() drops them via na.rm = TRUE.
my_data$interval <- as.factor(my_data$interval)
my_interv_means <- aggregate(steps ~ interval, na.action = NULL, data = my_data, FUN = mean, na.rm = TRUE)
# Turn each row's own interval label back into a number. Going through
# as.character() keeps the value attached to its row, instead of assigning the
# levels of another object by position.
my_interv_means$interval <- as.numeric(as.character(my_interv_means$interval))
- Construction of the plot by using the object my_interv_means :
# Line plot of the mean number of steps against the 5-minute interval.
my_plt <- my_plot_fun(my_interv_means) +
  geom_line(
    mapping = aes(x = interval, y = steps),
    size = 2
  ) +
  ggtitle("Time series plot of the average number of steps taken")
- Print the plot
print(my_plt)
- Another case is to remove the observations with fewer than 5 steps in the interval before averaging.
# Keep the observations with more than 4 steps, then average them per interval.
my_subset_data <- filter(my_data, steps > 4)
my_interv_means_sub <- droplevels(
  aggregate(steps ~ interval, data = my_subset_data, FUN = mean, na.rm = TRUE, na.action = NULL)
)
# Replace the interval factor by the numeric value of its remaining levels.
my_interv_means_sub$interval <- as.numeric(levels(my_interv_means_sub$interval))
# Line plot of the subset means
my_plt_sub <- my_plot_fun(my_interv_means_sub) +
  geom_line(
    mapping = aes(x = interval, y = steps),
    size = 2
  ) +
  ggtitle("Time series plot of the average number of steps taken with subset")
- Print the second plot
print(my_plt_sub)
6. Superposition of the two last plots
# Both series on the same axes. The colour of each line comes only from the
# label mapped in aes(): a constant colour on the layer would override that
# mapping and remove the series from the legend built by scale_color_manual().
my_superposed_plot <-
  my_plot_fun(my_interv_means) +
  geom_line(aes(x = interval, y = steps, colour = "All steps kept")) +
  geom_line(
    data = my_interv_means_sub,
    aes(x = interval, y = steps, colour = "Steps less than 5 are deleted")
  ) +
  scale_color_manual(
    name = "Steps",
    values = c("All steps kept" = "black", "Steps less than 5 are deleted" = "red")
  ) +
  theme(legend.position = "bottom") +
  ggtitle("Two time series plots of the average number of steps taken without and with subset")
print(my_superposed_plot)
# Row of the interval with the highest mean number of steps.
# which.max() returns the first such row.
my_row_max <- which.max(my_interv_means[, 2])
# The maximum itself, reused in the message below
my_max <- my_interv_means[my_row_max, 2]
writeLines(paste("The 5-minute interval that, on average, contains the maximum number of steps is :", my_interv_means[my_row_max, 1], sep = '\n'))
## The 5-minute interval that, on average, contains the maximum number of steps is :
## 835
writeLines(paste("This maximum value is :", round(my_max, 1), "steps.", sep = ' '))
## This maximum value is : 206.2 steps.
1. The idea is, in order, to :
- use the ``` aggregate()``` function with the parameter ```na.action = NULL``` ;
- set the parameter ``` FUN``` equal to a function like ``` sum()``` ;
- add extra parameter ```na.rm = TRUE```.
- this is to get rid of values that seem outliers: if the number of steps in a day is less than 500 steps, the day is not taken into account ;
- in the same interval, through each day, if the number of steps is less than 5, the observation is deleted.
- add extra parameter ```na.rm = TRUE```
1. Define the variable date as factor for my_data
my_data$date <- as.factor(my_data$date)
- Calculation of the sum of steps per day and store it in my_sum after removing
NA
by passing the parameter na.rm = TRUE
to the function sum()
through the function aggregate()
# Total number of steps per day, NA being ignored by sum() through na.rm = TRUE.
my_sum_bis <- aggregate(
  steps ~ date,
  data = my_data,
  FUN = sum,
  na.rm = TRUE,
  na.action = NULL
)
- Plot my_sum_bis as a histogram. The graph below is exactly the same as the one provided in II. 4.
# One bar per day, its height being the total number of steps of that day.
my_hist_2 <- my_plot_fun(my_sum_bis) +
  geom_bar(
    mapping = aes(x = date, y = steps, fill = date),
    stat = "identity",
    color = "black",
    alpha = 0.6
  ) +
  labs(x = "Day", y = "Steps")
print(my_hist_2)
- Another kind of plot with same data
# Same daily totals drawn as a single line; group = 1 joins all the days.
my_plot_2 <- my_plot_fun(my_sum_bis) +
  geom_line(
    mapping = aes(x = date, y = steps, group = 1),
    color = "black"
  ) +
  labs(x = "Day", y = "Steps") +
  theme(legend.position = "bottom")
print(my_plot_2)
VIII. Panel plot comparing the average number of steps taken per 5-minute interval across weekdays and weekends
- Redefine the variable date as date by using the package
lubridate
my_data$date <- ymd(my_data$date)
print(class(my_data$date))
## [1] "Date"
- Select and store in the object my_data_weekDAYS the weekdays by using the function
wday()
.
# wday() numbers the days from Sunday = 1 by default, so 1:5 would select
# Sunday to Thursday. week_start = 1 makes Monday = 1 ... Sunday = 7, so that
# 1:5 is Monday to Friday.
my_data_weekDAYS <- my_data[wday(my_data$date, week_start = 1) %in% 1:5, ]
- Define the variable interval as factor for my_data_weekDAYS. Then, create a new object my_interv_weekDAYS_means which contains the mean across the 61 days for each interval by using the function
aggregate()
.
# Mean number of steps per interval over the selected days.
my_data_weekDAYS$interval <- as.factor(my_data_weekDAYS$interval)
my_interv_weekDAYS_means <- aggregate(steps ~ interval, na.action = NULL, data = my_data_weekDAYS, FUN = mean, na.rm = TRUE)
# Convert each row's own label back to a number; unlike as.numeric(levels(...))
# this does not depend on every factor level being present in the result.
my_interv_weekDAYS_means$interval <- as.numeric(as.character(my_interv_weekDAYS_means$interval))
- Select and store in the object my_data_weekENDS the weekends by using the function
wday()
.
# wday() numbers the days from Sunday = 1 by default, so 6:7 would select
# Friday and Saturday. week_start = 1 makes Monday = 1 ... Sunday = 7, so that
# 6:7 is Saturday and Sunday.
my_data_weekENDS <- my_data[wday(my_data$date, week_start = 1) %in% 6:7, ]
- Define the variable interval as factor for my_data_weekENDS. Then, create a new object my_interv_weekENDS_means which contains the mean across the 61 days for each interval by using the function
aggregate()
.
# Mean number of steps per interval over the selected days.
my_data_weekENDS$interval <- as.factor(my_data_weekENDS$interval)
my_interv_weekENDS_means <- aggregate(steps ~ interval, na.action = NULL, data = my_data_weekENDS, FUN = mean, na.rm = TRUE)
# Convert each row's own label back to a number; unlike as.numeric(levels(...))
# this does not depend on every factor level being present in the result.
my_interv_weekENDS_means$interval <- as.numeric(as.character(my_interv_weekENDS_means$interval))
- With the objects my_interv_weekDAYS_means and my_interv_weekENDS_means, build a panel plot with one panel for weekdays and one for weekends.
# Colour scale shared by the two panels. Each panel is built completely before
# being combined: once two plots are joined with `+`, patchwork applies the
# following additions to the last plot only.
my_panel_colours <- scale_color_manual(name = "Steps", values = c("Weekdays" = "black", "Weekends" = "red"))
# The line colour comes from the label mapped in aes(); no constant colour is
# set on the layer, so the line and its legend entry follow the scale.
my_panel_weekDAYS <- my_plot_fun(my_interv_weekDAYS_means) +
  geom_line(aes(x = interval, y = steps, colour = "Weekdays")) +
  my_panel_colours +
  theme(legend.position = "bottom")
my_panel_weekENDS <- my_plot_fun(my_interv_weekENDS_means) +
  geom_line(aes(x = interval, y = steps, colour = "Weekends")) +
  my_panel_colours +
  theme(legend.position = "bottom")
my_panel_plot_1 <- my_panel_weekDAYS + my_panel_weekENDS +
  plot_annotation(
    title = "Panel plot : weekdays and weekends.",
    theme = theme(plot.title = element_text(color = "red", size = 18, face = "bold.italic", hjust = 0.5))
  )
- Print panel plots
print(my_panel_plot_1)
- We can superpose plots :
# Weekdays and weekends on the same axes. The colour of each line comes only
# from the label mapped in aes(): a constant colour on the layer would override
# that mapping and remove the series from the legend.
my_superposed_plot_2 <-
  my_plot_fun(my_interv_weekDAYS_means) +
  geom_line(aes(x = interval, y = steps, colour = "Weekdays")) +
  geom_line(data = my_interv_weekENDS_means, aes(x = interval, y = steps, colour = "Weekends")) +
  scale_color_manual(name = "Steps", values = c("Weekdays" = "black", "Weekends" = "red")) +
  theme(legend.position = "bottom") +
  ggtitle("Comparaison between weekdays and weekends.")
print(my_superposed_plot_2)
- Another case is to remove the observations with fewer than 5 steps in the interval before averaging.
# Keep only the observations with at least 5 steps in the 5-minute interval
my_subset_data <- my_data %>% filter(steps > 4)
# wday() numbers the days from Sunday = 1 by default; week_start = 1 makes
# Monday = 1 ... Sunday = 7, so 1:5 is Monday-Friday and 6:7 is Saturday-Sunday.
my_data_weekDAYS_sub <- my_subset_data[wday(my_subset_data$date, week_start = 1) %in% 1:5, ]
my_data_weekENDS_sub <- my_subset_data[wday(my_subset_data$date, week_start = 1) %in% 6:7, ]
# Mean per interval for the weekdays
my_data_weekDAYS_sub$interval <- as.factor(my_data_weekDAYS_sub$interval)
my_interv_weekDAYS_means_sub <- aggregate(steps ~ interval, na.action = NULL, data = my_data_weekDAYS_sub, FUN = mean, na.rm = TRUE)
# Convert each row's own label back to a number, so the result does not depend
# on which factor levels survived the subsetting.
my_interv_weekDAYS_means_sub$interval <- as.numeric(as.character(my_interv_weekDAYS_means_sub$interval))
# Mean per interval for the weekends
my_data_weekENDS_sub$interval <- as.factor(my_data_weekENDS_sub$interval)
my_interv_weekENDS_means_sub <- aggregate(steps ~ interval, na.action = NULL, data = my_data_weekENDS_sub, FUN = mean, na.rm = TRUE)
my_interv_weekENDS_means_sub$interval <- as.numeric(as.character(my_interv_weekENDS_means_sub$interval))
# Both series on the same axes. The line colours come only from the labels
# mapped in aes(); a constant colour on a layer would override the mapping and
# remove that series from the legend.
my_superposed_plot_3 <-
  my_plot_fun(my_interv_weekDAYS_means_sub) +
  geom_line(aes(x = interval, y = steps, colour = "Weekdays")) +
  geom_line(data = my_interv_weekENDS_means_sub, aes(x = interval, y = steps, colour = "Weekends")) +
  scale_color_manual(name = "Steps", values = c("Weekdays" = "black", "Weekends" = "red")) +
  theme(legend.position = "bottom") +
  ggtitle("Comparaison between weekdays and weekends with subset.")
- Print the plot
print(my_superposed_plot_3)
11. EXTRA : comparison between weekdays and weekend per hour
# Keep only the observations with at least 5 steps in the 5-minute interval
my_subset_data <- my_data %>% filter(steps > 4)

# wday() numbers the days from Sunday = 1 by default; week_start = 1 makes
# Monday = 1 ... Sunday = 7, so 1:5 is Monday-Friday and 6:7 is Saturday-Sunday.
my_day_number <- wday(my_subset_data$date, week_start = 1)
my_data_weekDAYS_sub <- my_subset_data[my_day_number %in% 1:5, ]
my_data_weekENDS_sub <- my_subset_data[my_day_number %in% 6:7, ]

# Mean number of steps per interval of `my_df`, the interval being returned as
# a number. Each row's own label is converted, so the result does not depend on
# which factor levels are present.
my_interval_means <- function(my_df) {
  my_df$interval <- as.factor(my_df$interval)
  my_means <- aggregate(steps ~ interval, na.action = NULL, data = my_df, FUN = mean, na.rm = TRUE)
  my_means$interval <- as.numeric(as.character(my_means$interval))
  my_means
}
my_interv_weekDAYS_means_sub <- my_interval_means(my_data_weekDAYS_sub)
my_interv_weekENDS_means_sub <- my_interval_means(my_data_weekENDS_sub)

# Common y-axis range for all the hourly plots of a series
my_y_lim_1 <- ylim(0, max(my_interv_weekDAYS_means_sub$steps))
my_y_lim_2 <- ylim(0, max(my_interv_weekENDS_means_sub$steps))

# One plot per hour of the day. The hours are fixed rather than derived from
# the last interval present, because the printing loop below expects 24 plots.
my_hours <- 0:23

# Bar outline colours: red and green rise hour after hour for the weekdays,
# then go back down for the weekends from where the weekdays ramp ends
# (0.4 + 24 * 0.02 and 0.3 + 24 * 0.01).
my_weekDAYS_colours <- rgb(0.4 + 0.02 * my_hours, 0.3 + 0.01 * my_hours, 0.6)
my_weekENDS_colours <- rgb(0.88 - 0.02 * my_hours, 0.54 - 0.01 * my_hours, 0.6)

# Bar plot of the interval means that fall in hour `my_hour`. Intervals are
# written HHMM, so the hour is the integer quotient by 100.
my_hour_plot <- function(my_means, my_hour, my_colour, my_y_lim) {
  my_hour_means <- my_means[my_means$interval %/% 100 == my_hour, ]
  ggplot(my_hour_means, aes(x = interval, y = steps)) +
    geom_bar(stat = "identity", color = my_colour) +
    my_y_lim +
    theme(
      axis.text.x = element_text(color = "#993333", size = 10, angle = 90, vjust = 0.5, hjust = 1)
    )
}

# Each series starts from an empty placeholder plot; the hourly plots are then
# appended one by one with patchwork's `+`.
my_plt_weekDAYS_sub <- ggplot() + ggtitle("One plot per hour") +
  theme(plot.title = element_text(size = 10, hjust = 0.5))
my_plt_weekENDS_sub <- ggplot() + ggtitle("One plot per hour") +
  theme(plot.title = element_text(size = 10, hjust = 0.5))
for (k in seq_along(my_hours)) {
  my_plt_weekDAYS_sub <- my_plt_weekDAYS_sub +
    my_hour_plot(my_interv_weekDAYS_means_sub, my_hours[k], my_weekDAYS_colours[k], my_y_lim_1)
  my_plt_weekENDS_sub <- my_plt_weekENDS_sub +
    my_hour_plot(my_interv_weekENDS_means_sub, my_hours[k], my_weekENDS_colours[k], my_y_lim_2)
}

# Print one two-panel figure per hour, weekdays next to weekends.
for (k in seq_along(my_hours)) {
  # The placeholder is the first element of each series, so the plot of the
  # k-th hour is element k + 1.
  # NOTE(review): after the two plots are joined, the additions that follow
  # apply to the last (weekend) panel only - confirm that this is intended.
  my_tmp_plt <- (my_plt_weekDAYS_sub[[k + 1]] + ggtitle("Weekdays") + theme(plot.title = element_text(hjust = 0.5)) +
    my_plt_weekENDS_sub[[k + 1]] + ggtitle("Weekends")) + theme(plot.title = element_text(hjust = 0.5)) +
    plot_annotation(title = paste("Hour number :", k, sep = " ")) + theme_linedraw() + theme(legend.position = "bottom")
  print(my_tmp_plt)
}