Skip to content

Latest commit

 

History

History
528 lines (417 loc) · 20.7 KB

PA1_template.md

File metadata and controls

528 lines (417 loc) · 20.7 KB
title author date output
| Data from a personal activity monitoring devices | Report
Idriss .S
2/6/2022
html_document
pagetitle keep_md
Data from a personal activity monitoring device
true
<title>Report - Personal activity</title>

Initialization code

knitr::opts_chunk$set(echo = TRUE)
library(grid)
library(ggplot2)
library(dplyr)
## 
## Attachement du package : 'dplyr'
## Les objets suivants sont masqués depuis 'package:stats':
## 
##     filter, lag
## Les objets suivants sont masqués depuis 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)
library(patchwork)
library(lubridate)
## 
## Attachement du package : 'lubridate'
## Les objets suivants sont masqués depuis 'package:base':
## 
##     date, intersect, setdiff, union

I. Code for reading in the dataset

1. Unzip file, open and store csv file into the object my_data
# Extract activity.zip into the working directory, then read activity.csv
# (expected to come from that archive) into my_data.
unzip("activity.zip")
my_data <- read.csv("activity.csv")
  1. The object my_data contains three variables :
names(my_data)
## [1] "steps"    "date"     "interval"

II. Histogram of the total number of steps taken each day

1. Define the variable date as factor for my_data
my_data$date <- as.factor(my_data$date)
  1. How many levels do we have now ?
nlevels(my_data$date)
## [1] 61

There are 61 days in my_data.

  1. Calculation of the sum of steps per day and store it in my_sum
# Total steps per date. na.action = NULL keeps the rows whose steps value is NA,
# and na.rm = TRUE is forwarded to sum(), so a date with only NA values gets a
# total of 0 instead of being dropped.
my_sum <- aggregate(
  steps ~ date,
  data = my_data,
  FUN = sum,
  na.rm = TRUE,
  na.action = NULL
)
  1. Function for plots
#' Start a ggplot that carries the report's shared theme
#'
#' @param my_data Data handed to `ggplot()`. No aesthetics or layers are set
#'   here; the caller adds them.
#' @return A ggplot object with `theme_linedraw()`, the custom title, axis and
#'   legend text styling, and the legend placed at the bottom.
my_plot_fun <- function(my_data) {
  ggplot(my_data) +
    theme_linedraw() +
    theme(
      plot.title = element_text(
        color = "red", size = 18, face = "bold.italic", hjust = 0.5
      ),
      axis.title.x = element_text(color = "#993333", size = 18, face = "bold"),
      axis.text.x = element_text(
        color = "#993333", size = 12, angle = 90, vjust = 0.5, hjust = 1
      ),
      axis.title.y = element_text(color = "darkgreen", size = 18, face = "bold"),
      axis.text.y = element_text(face = "bold", color = "darkgreen", size = 16),
      legend.text = element_text(size = 12),
      legend.position = "bottom"
    )
}
  1. Plot my_sum into a histogram
# One bar per date whose height is that date's total in my_sum; bars are
# filled by date and outlined in black.
my_hist <- my_plot_fun(my_sum) +
  geom_bar(
    mapping = aes(x = date, y = steps, fill = date),
    stat = "identity",
    color = "black",
    alpha = 0.5
  ) +
  labs(x = "Day", y = "Steps")
print(my_hist)


III. Mean and median number of steps taken each day

  1. In this part, the mean and median are related to the number of steps across all 61 days.
# Mean of the daily totals stored in the second column of my_sum.
my_means <- mean(my_sum[[2]])
writeLines(
  paste("The mean number of steps is :", round(my_means, digits = 1), sep = "\n")
)
## The mean number of steps is :
## 9354.2
# Median of the daily totals stored in the second column of my_sum.
my_median <- median(my_sum[[2]])
writeLines(
  paste("The median number of steps is :", my_median, sep = "\n")
)
## The median number of steps is :
## 10395

2. Alternative case: days with fewer than 500 steps in total are excluded before computing the mean and the median.
# Keep only the days whose total exceeds 499 steps, then average those totals.
my_sub_sum <- filter(my_sum, steps > 499)
my_means2 <- mean(my_sub_sum[[2]])
writeLines(
  paste(
    "In the second case, the mean number of steps is :",
    round(my_means2, digits = 1),
    sep = "\n"
  )
)
## In the second case, the mean number of steps is :
## 11185.1
# Median of the daily totals that remain in my_sub_sum.
my_median2 <- median(my_sub_sum[[2]])
writeLines(
  paste(
    "In the second case, the median number of steps is :",
    my_median2,
    sep = "\n"
  )
)
## In the second case, the median number of steps is :
## 11015

IV. Time series plot of the average number of steps taken

  1. Define the variable interval as factor for my_data and create a new object my_interv_means which contains the mean across the 61 days for each interval.
# Treat the interval identifier as a category and average the steps per
# interval. na.action = NULL keeps the NA rows in the model frame and
# na.rm = TRUE makes mean() ignore them.
my_data$interval <- as.factor(my_data$interval)
my_interv_means <- aggregate(
  steps ~ interval,
  data = my_data,
  FUN = mean,
  na.rm = TRUE,
  na.action = NULL
)
# Turn the factor labels back into numbers so the interval can be used as a
# continuous x axis.
my_interv_means$interval <- as.numeric(as.character(my_interv_means$interval))
  1. Construction of the plot by using the object my_interv_means :
# Line plot of the mean steps per interval, on the shared report theme.
my_plt <- my_plot_fun(my_interv_means) +
  geom_line(mapping = aes(x = interval, y = steps), size = 2) +
  labs(title = "Time series plot of the average number of steps taken")
  1. Print the plot
print(my_plt)

  1. Alternative case: observations (one interval on one day) with fewer than 5 steps are removed before averaging.
# Keep only the observations with more than 4 steps, then average per interval.
my_subset_data <- filter(my_data, steps > 4)
my_interv_means_sub <- aggregate(
  steps ~ interval,
  data = my_subset_data,
  FUN = mean,
  na.rm = TRUE,
  na.action = NULL
)
# Drop interval levels that no longer occur after filtering, then convert the
# remaining labels to numbers for plotting.
my_interv_means_sub <- droplevels(my_interv_means_sub)
my_interv_means_sub$interval <-
  as.numeric(as.character(my_interv_means_sub$interval))

# Line plot of the subset means, on the shared report theme.
my_plt_sub <- my_plot_fun(my_interv_means_sub) +
  geom_line(mapping = aes(x = interval, y = steps), size = 2) +
  labs(
    title = "Time series plot of the average number of steps taken with subset"
  )
  1. Print the second plot
print(my_plt_sub)

6. Superposition of the two last plots

# Overlay the interval means computed from all observations and from the
# filtered subset. Colour is mapped inside aes() for both layers so that
# scale_color_manual() assigns the colours and the legend lists both series.
# A constant colour argument on a layer overrides its mapping and removes the
# series from the legend, which is why none is set here.
my_superposed_plot <-
  my_plot_fun(my_interv_means) +
  geom_line(aes(x = interval, y = steps, colour = "All steps kept")) +
  geom_line(
    data = my_interv_means_sub,
    aes(x = interval, y = steps, colour = "Steps less than 5 are deleted")
  ) +
  scale_color_manual(
    name = "Steps",
    values = c(
      "All steps kept" = "black",
      "Steps less than 5 are deleted" = "red"
    )
  ) +
  theme(legend.position = "bottom") +
  ggtitle("Two time series plots of the average number of steps taken without and with subset")

print(my_superposed_plot)


V. The 5-minute interval that, on average, contains the maximum number of steps

# Largest per-interval mean and the row(s) of my_interv_means where it occurs.
my_max <- max(my_interv_means[[2]])
my_row_max <- which(my_interv_means[[2]] == my_max)
writeLines(
  paste(
    "The 5-minute interval that, on average, contains the maximum number of steps is :",
    my_interv_means[my_row_max, 1],
    sep = "\n"
  )
)
## The 5-minute interval that, on average, contains the maximum number of steps is :
## 835
# Report the mean step count of the busiest interval, rounded to one decimal.
writeLines(
  paste(
    "This maximum value is :",
    round(my_interv_means[my_row_max, 2], 1),
    "steps."
  )
)
## This maximum value is : 206.2 steps.

VI. Code to describe and show a strategy for imputing missing data

1. The idea is, in order, to :
  • use the `aggregate()` function with the parameter `na.action = NULL` ;
  • set the parameter `FUN` equal to a function like `sum()` ;
  • add the extra parameter `na.rm = TRUE`.
2. In sections III. 2. and IV. 5. other more advanced explanations have been provided :
  • this is to get rid of values that seem outliers: if the number of steps in a day is less than 500 steps, the day is not taken into account ;
  • in the same interval, through each day, if the number of steps is less than 5, the observation is deleted.
  • add extra parameter ```na.rm = TRUE```

VII. Histogram of the total number of steps taken each day after missing values are imputed

1. Define the variable date as factor for my_data
my_data$date <- as.factor(my_data$date)
  1. Calculation of the sum of steps per day, stored in my_sum_bis; NA values are removed by passing the parameter na.rm = TRUE to the function sum() through the function aggregate().
# Total steps per date. NA rows are kept by na.action = NULL and ignored by
# sum() through na.rm = TRUE, so a date with only NA values gets a total of 0.
my_sum_bis <- aggregate(
  steps ~ date,
  data = my_data,
  FUN = sum,
  na.rm = TRUE,
  na.action = NULL
)
  1. Plot my_sum_bis into a histogram. The graph below is exactly the same as the one provided in II. 4.
# One bar per date whose height is that date's total in my_sum_bis.
my_hist_2 <- my_plot_fun(my_sum_bis) +
  geom_bar(
    mapping = aes(x = date, y = steps, fill = date),
    stat = "identity",
    color = "black",
    alpha = 0.6
  ) +
  labs(x = "Day", y = "Steps")
print(my_hist_2)

  1. Another kind of plot with same data
# Same daily totals drawn as a single line; group = 1 joins all dates into one
# path even though date is a factor.
my_plot_2 <- my_plot_fun(my_sum_bis) +
  geom_line(mapping = aes(x = date, y = steps, group = 1), color = "black") +
  labs(x = "Day", y = "Steps") +
  theme(legend.position = "bottom")
print(my_plot_2)


VIII. Panel plot comparing the average number of steps taken per 5-minute interval across weekdays and weekends

  1. Redefine the variable date as date by using the package lubridate
# Parse the date column as year-month-day and show the resulting class.
my_data[["date"]] <- ymd(my_data[["date"]])
print(class(my_data[["date"]]))
## [1] "Date"
  1. Select and store in the object my_data_weekDAYS the weekdays by using the function wday().
# wday() numbers days from Sunday = 1 by default, so 1:5 would select Sunday to
# Thursday. week_start = 1 makes Monday = 1 ... Sunday = 7, so 1:5 is Monday to
# Friday.
my_data_weekDAYS <- my_data[wday(my_data$date, week_start = 1) %in% 1:5, ]
  1. Define the variable interval as factor for my_data_weekDAYS. Then, create a new object my_interv_weekDAYS_means which contains the mean for each interval across all weekdays by using the function aggregate().
# Mean steps per interval over the rows of my_data_weekDAYS; NA rows are kept
# by na.action = NULL and ignored by mean() through na.rm = TRUE.
my_data_weekDAYS$interval <- as.factor(my_data_weekDAYS$interval)
my_interv_weekDAYS_means <- aggregate(
  steps ~ interval,
  data = my_data_weekDAYS,
  FUN = mean,
  na.rm = TRUE,
  na.action = NULL
)
# Convert the factor labels back to numbers for use as a continuous x axis.
my_interv_weekDAYS_means$interval <-
  as.numeric(as.character(my_interv_weekDAYS_means$interval))
  1. Select and store in the object my_data_weekENDS the weekends by using the function wday().
# wday() numbers days from Sunday = 1 by default, so 6:7 would select Friday
# and Saturday. week_start = 1 makes Monday = 1 ... Sunday = 7, so 6:7 is
# Saturday and Sunday.
my_data_weekENDS <- my_data[wday(my_data$date, week_start = 1) %in% 6:7, ]
  1. Define the variable interval as factor for my_data_weekENDS. Then, create a new object my_interv_weekENDS_means which contains the mean for each interval across all weekend days by using the function aggregate().
# Mean steps per interval over the rows of my_data_weekENDS; NA rows are kept
# by na.action = NULL and ignored by mean() through na.rm = TRUE.
my_data_weekENDS$interval <- as.factor(my_data_weekENDS$interval)
my_interv_weekENDS_means <- aggregate(
  steps ~ interval,
  data = my_data_weekENDS,
  FUN = mean,
  na.rm = TRUE,
  na.action = NULL
)
# Convert the factor labels back to numbers for use as a continuous x axis.
my_interv_weekENDS_means$interval <-
  as.numeric(as.character(my_interv_weekENDS_means$interval))
  1. With the objects my_interv_weekDAYS_means and my_interv_weekENDS_means, making of two superposed plots.
# Two side-by-side panels: weekday means on the left, weekend means on the
# right. Each panel is built completely inside its own parentheses because,
# in a patchwork, components added after the second plot only reach that last
# plot. Both panels get the same manual colour scale, and colour is set only
# through aes() so the scale (Weekdays = black, Weekends = red) is what is
# actually drawn and shown in the legend.
my_panel_plot_1 <-
  (my_plot_fun(my_interv_weekDAYS_means) +
    geom_line(aes(x = interval, y = steps, colour = "Weekdays")) +
    scale_color_manual(
      name = "Steps",
      values = c("Weekdays" = "black", "Weekends" = "red")
    )) +
  (my_plot_fun(my_interv_weekENDS_means) +
    geom_line(aes(x = interval, y = steps, colour = "Weekends")) +
    scale_color_manual(
      name = "Steps",
      values = c("Weekdays" = "black", "Weekends" = "red")
    )) +
  plot_annotation(
    title = "Panel plot : weekdays and weekends.",
    theme = theme(
      plot.title = element_text(
        color = "red", size = 18, face = "bold.italic", hjust = 0.5
      )
    )
  )
  1. Print panel plots
print(my_panel_plot_1)

  1. We can superpose plots :
# Weekday and weekend interval means on one set of axes. Colour is mapped
# inside aes() for both layers; a constant colour on the weekend layer would
# override its mapping and leave "Weekends" out of the legend.
my_superposed_plot_2 <-
  my_plot_fun(my_interv_weekDAYS_means) +
  geom_line(aes(x = interval, y = steps, colour = "Weekdays")) +
  geom_line(
    data = my_interv_weekENDS_means,
    aes(x = interval, y = steps, colour = "Weekends")
  ) +
  scale_color_manual(
    name = "Steps",
    values = c("Weekdays" = "black", "Weekends" = "red")
  ) +
  theme(legend.position = "bottom") +
  ggtitle("Comparaison between weekdays and weekends.")
print(my_superposed_plot_2)

  1. Alternative case: observations (one interval on one day) with fewer than 5 steps are removed before averaging.
# Keep only the observations with more than 4 steps.
my_subset_data <- my_data %>% filter(steps > 4)

# Split by part of the week. wday() numbers days from Sunday = 1 by default;
# week_start = 1 makes Monday = 1 ... Sunday = 7, so 1:5 is Monday to Friday
# and 6:7 is Saturday and Sunday.
my_data_weekDAYS_sub <-
  my_subset_data[wday(my_subset_data$date, week_start = 1) %in% 1:5, ]
my_data_weekENDS_sub <-
  my_subset_data[wday(my_subset_data$date, week_start = 1) %in% 6:7, ]

# Weekday means per interval. Converting through as.character() yields the
# numeric interval of each row directly, so unused factor levels left over
# from filtering cannot misalign the values.
my_data_weekDAYS_sub$interval <- as.factor(my_data_weekDAYS_sub$interval)
my_interv_weekDAYS_means_sub <- aggregate(
  steps ~ interval,
  data = my_data_weekDAYS_sub,
  FUN = mean,
  na.rm = TRUE,
  na.action = NULL
)
my_interv_weekDAYS_means_sub$interval <-
  as.numeric(as.character(my_interv_weekDAYS_means_sub$interval))

# Weekend means per interval, computed the same way.
my_data_weekENDS_sub$interval <- as.factor(my_data_weekENDS_sub$interval)
my_interv_weekENDS_means_sub <- aggregate(
  steps ~ interval,
  data = my_data_weekENDS_sub,
  FUN = mean,
  na.rm = TRUE,
  na.action = NULL
)
my_interv_weekENDS_means_sub$interval <-
  as.numeric(as.character(my_interv_weekENDS_means_sub$interval))

# Both series on one set of axes. Colour is mapped inside aes() only, so the
# manual scale colours both lines and the legend lists both.
my_superposed_plot_3 <-
  my_plot_fun(my_interv_weekDAYS_means_sub) +
  geom_line(aes(x = interval, y = steps, colour = "Weekdays")) +
  geom_line(
    data = my_interv_weekENDS_means_sub,
    aes(x = interval, y = steps, colour = "Weekends")
  ) +
  scale_color_manual(
    name = "Steps",
    values = c("Weekdays" = "black", "Weekends" = "red")
  ) +
  theme(legend.position = "bottom") +
  ggtitle("Comparaison between weekdays and weekends with subset.")
  1. Print the two other plots
print(my_superposed_plot_3)


11. EXTRA : comparison between weekdays and weekend per hour

# Empty template that the hourly loops use to hold the rows of one hour.
my_tmp <- data.frame(interval = integer(),
                     steps = numeric())

# Keep only the observations with more than 4 steps, then split them by part
# of the week. wday() numbers days from Sunday = 1 by default; week_start = 1
# makes Monday = 1 ... Sunday = 7, so 1:5 is Monday to Friday and 6:7 is
# Saturday and Sunday.
my_subset_data <- my_data %>% filter(steps > 4)
my_data_weekDAYS_sub <-
  my_subset_data[wday(my_subset_data$date, week_start = 1) %in% 1:5, ]
my_data_weekENDS_sub <-
  my_subset_data[wday(my_subset_data$date, week_start = 1) %in% 6:7, ]

# Seed plots: the hourly charts are appended to these one by one, so the
# first element of each resulting patchwork is this empty titled plot.
my_plt_weekDAYS_sub <- ggplot() + ggtitle("One plot per hour") +
  theme(plot.title = element_text(size = 10, hjust = 0.5))
my_plt_weekENDS_sub <- ggplot() + ggtitle("One plot per hour") +
  theme(plot.title = element_text(size = 10, hjust = 0.5))

# Weekday means per interval, with the interval converted back to numbers.
my_data_weekDAYS_sub$interval <- as.factor(my_data_weekDAYS_sub$interval)
my_interv_weekDAYS_means_sub <- aggregate(
  steps ~ interval,
  data = my_data_weekDAYS_sub,
  FUN = mean,
  na.rm = TRUE,
  na.action = NULL
)
my_interv_weekDAYS_means_sub$interval <-
  as.numeric(as.character(my_interv_weekDAYS_means_sub$interval))
# Common y-axis range for every weekday chart: 0 to the largest weekday mean.
my_y_lim_1 <- ylim(0, max(my_interv_weekDAYS_means_sub[, 2]))

# Weekend means per interval, computed the same way.
my_data_weekENDS_sub$interval <- as.factor(my_data_weekENDS_sub$interval)
my_interv_weekENDS_means_sub <- aggregate(
  steps ~ interval,
  data = my_data_weekENDS_sub,
  FUN = mean,
  na.rm = TRUE,
  na.action = NULL
)
my_interv_weekENDS_means_sub$interval <-
  as.numeric(as.character(my_interv_weekENDS_means_sub$interval))
# Common y-axis range for every weekend chart: 0 to the largest weekend mean.
my_y_lim_2 <- ylim(0, max(my_interv_weekENDS_means_sub[, 2]))

# Starting green and red components of the bar outline colour; the hourly
# loops shift them a little on every iteration.
green_value <- 0.3
red_value <- 0.4

# Build one bar chart per hour from the weekday interval means and append it
# to my_plt_weekDAYS_sub. The loop steps through the interval values by 100
# and takes the rows whose interval lies in i .. i + 55 as one hour.
weekday_last_interval <- as.numeric(
  my_interv_weekDAYS_means_sub[nrow(my_interv_weekDAYS_means_sub), 1]
)
for (i in seq(0, weekday_last_interval, by = 100)) {
  # Select the hour's rows in one vectorised step instead of adding them to
  # my_tmp one at a time.
  in_hour <- my_interv_weekDAYS_means_sub[, 1] >= i &
    my_interv_weekDAYS_means_sub[, 1] <= i + 55
  my_tmp <- my_interv_weekDAYS_means_sub[in_hour, ]

  my_hist <- my_tmp %>%
    ggplot(aes(x = interval, y = steps)) +
    geom_bar(stat = "identity", color = rgb(red_value, green_value, 0.6)) +
    # Same y range on every chart so the hours are comparable.
    my_y_lim_1 +
    theme(
      axis.text.x = element_text(
        color = "#993333", size = 10, angle = 90, vjust = 0.5, hjust = 1
      )
    )

  my_plt_weekDAYS_sub <- my_plt_weekDAYS_sub + my_hist
  # Shift the outline colour slightly for the next hour.
  green_value <- green_value + 0.01
  red_value <- red_value + 0.02
}

# Leave my_tmp as an empty template, as it was before the loop.
my_tmp <- data.frame(interval = integer(),
                     steps = numeric())

# Build one bar chart per hour from the weekend interval means and append it
# to my_plt_weekENDS_sub. The loop steps through the interval values by 100
# and takes the rows whose interval lies in i .. i + 55 as one hour.
weekend_last_interval <- as.numeric(
  my_interv_weekENDS_means_sub[nrow(my_interv_weekENDS_means_sub), 1]
)
for (i in seq(0, weekend_last_interval, by = 100)) {
  # Select the hour's rows in one vectorised step instead of adding them to
  # my_tmp one at a time.
  in_hour <- my_interv_weekENDS_means_sub[, 1] >= i &
    my_interv_weekENDS_means_sub[, 1] <= i + 55
  my_tmp <- my_interv_weekENDS_means_sub[in_hour, ]

  my_hist <- my_tmp %>%
    ggplot(aes(x = interval, y = steps)) +
    geom_bar(stat = "identity", color = rgb(red_value, green_value, 0.6)) +
    # Same y range on every chart so the hours are comparable.
    my_y_lim_2 +
    theme(
      axis.text.x = element_text(
        color = "#993333", size = 10, angle = 90, vjust = 0.5, hjust = 1
      )
    )

  my_plt_weekENDS_sub <- my_plt_weekENDS_sub + my_hist
  # Shift the outline colour slightly for the next hour (downwards here).
  green_value <- green_value - 0.01
  red_value <- red_value - 0.02
}

# Leave my_tmp as an empty template, as it was before the loop.
my_tmp <- data.frame(interval = integer(),
                     steps = numeric())

# Print, hour by hour, the weekday chart next to the weekend chart.
# Indexing starts at 2 because element 1 of each patchwork is the empty
# titled plot it was seeded with; element i is the i - 1 th hourly chart.

# One theme for both panels. theme_linedraw() is a complete theme and resets
# earlier theme() tweaks, so the centred title and rotated x labels are
# re-applied after it.
hour_panel_theme <- theme_linedraw() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(
      color = "#993333", size = 10, angle = 90, vjust = 0.5, hjust = 1
    ),
    legend.position = "bottom"
  )

# Number of hourly charts available in both series (24 when both reach the
# last hour), derived the same way the charts were generated.
n_hours <- min(
  length(seq(0, max(my_interv_weekDAYS_means_sub$interval), by = 100)),
  length(seq(0, max(my_interv_weekENDS_means_sub$interval), by = 100))
)

for (i in seq_len(n_hours) + 1) {
  my_tmp_plt <-
    (my_plt_weekDAYS_sub[[i]] + ggtitle("Weekdays") + hour_panel_theme) +
    (my_plt_weekENDS_sub[[i]] + ggtitle("Weekends") + hour_panel_theme) +
    plot_annotation(title = paste("Hour number :", i - 1, sep = " "))
  print(my_tmp_plt)
}