-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.R
102 lines (76 loc) · 5.23 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
############################################################
### COURSE PROJECT: Getting and Cleaning Data ###
### AUTHOR: Asish Mahapatra ###
### GITHUB: kluge-iitk ###
############################################################
### Please refer to the README.md for instructions on how
### to utilize this script
### NOTE: This script can take up to 20 seconds to run.
### DEPENDENCIES: This script requires the dplyr package to run
## SETUP ##
##################################################################################
# This section establishes variables for the paths to various files #
# Main_URL is the path to the data set folder #
# the other variables are the relative paths with respect to the data set folder #
##################################################################################
# Root folder of the data set; every other path below is relative to it.
Main_URL <- "./GettingCleaningData/UCI HAR Dataset"

# Lookup tables: feature (column) names and activity id/label pairs.
featuresurl <- "features.txt"
activityurl <- "activity_labels.txt"

# Training partition: measurements, activity codes, subject ids.
x_trainurl <- "train/X_train.txt"
y_trainurl <- "train/y_train.txt"
subtrainurl <- "train/subject_train.txt"

# Test partition: measurements, activity codes, subject ids.
x_testurl <- "test/X_test.txt"
y_testurl <- "test/y_test.txt"
subtesturl <- "test/subject_test.txt"
##########################################################################
## STEP1: Merges the training and the test sets to create one data set. ##
##########################################################################
# read the feature names (the second column holds the names used later on)
features <- read.table(file.path(Main_URL, featuresurl))

# Read both measurement tables with every column parsed as numeric.
x_train_data <- read.table(file.path(Main_URL, x_trainurl), colClasses = "numeric")
x_test_data <- read.table(file.path(Main_URL, x_testurl), colClasses = "numeric")

# Stack the two partitions: training rows first, test rows after them.
merged_data <- rbind(x_train_data, x_test_data)
#####################################################################################################
## STEP 2: Extracts only the measurements on the mean and standard deviation for each measurement. ##
## Used a filter that accepts only feature names that have "mean()" or "std()" ##
#####################################################################################################
# Logical mask over the feature list: TRUE where the feature name contains
# the literal text "mean()" or "std()".
acceptedvars <- grepl("mean\\(\\)|std\\(\\)", features[[2]])
# The mask is applied by position, so it needs exactly one entry per
# measurement column; a shorter mask would otherwise be recycled silently.
stopifnot(length(acceptedvars) == ncol(merged_data))
# drop = FALSE keeps a data frame even if only a single feature matches.
merged_data <- merged_data[, acceptedvars, drop = FALSE]

## load the subjects data, merge the test and train sets and add to the merged data set
sub_test_data <- read.table(file.path(Main_URL, subtesturl), colClasses = "integer")
sub_train_data <- read.table(file.path(Main_URL, subtrainurl), colClasses = "integer")
# Training rows first, then test rows, matching how the measurements were stacked.
sub_data <- rbind(sub_train_data, sub_test_data)
# A subject vector whose length divides the row count would be recycled
# without any warning, so insist on one subject id per observation.
stopifnot(nrow(sub_data) == nrow(merged_data))
merged_data$subject <- sub_data$V1
#########################################################################################
## STEP 3: Uses descriptive activity names to name the activities in the data set ##
## load the test and train data for activity, merge them and load the activity labels ##
#########################################################################################
# Lookup table used to translate activity codes into labels.
activitylabel <- read.table(file.path(Main_URL, activityurl))

# Activity codes for each partition, parsed as integers.
y_train_data <- read.table(file.path(Main_URL, y_trainurl), colClasses = "integer")
y_test_data <- read.table(file.path(Main_URL, y_testurl), colClasses = "integer")

# Stack the codes: training rows first, test rows after them.
activities <- rbind(y_train_data, y_test_data)
#########################################################################################
## define function to convert the numeric activity data to activity names ##
## add activity data to the merged data set and convert activity numeric data to names ##
#########################################################################################
# Translate numeric activity ids into their descriptive labels.
#
# n      : vector of activity ids.
# labels : lookup table with the ids in column 1 and the labels in column 2;
#          defaults to the `activitylabel` table loaded by this script.
#
# Returns a character vector the same length as `n`; an id that has no entry
# in `labels` yields NA.
#
# Ids are matched against column 1 instead of being used as row numbers, so
# the result does not depend on the row order of the lookup table.
num_to_label_activity <- function(n, labels = activitylabel) {
  as.character(labels[[2]][match(n, labels[[1]])])
}
# Add the descriptive activity label of every observation as a new column.
merged_data[["activity"]] <- num_to_label_activity(activities[["V1"]])
#########################################################################################
## STEP 4: Appropriately labels the data set with descriptive variable names. ##
## Selects appropriate feature names and assigns them to names(merged_data) ##
## Variable names are made syntactically valid by replacing '-' with '.' and removing '()' ##
## Also removed duplicate Body from BodyBody ##
#########################################################################################
# Start from the raw names of the retained features, followed by the two
# columns that were appended to the data set.
names(merged_data) <- c(as.character(features[acceptedvars, 2]), "subject", "activity")
# Clean the names one substitution at a time:
# every '-' becomes '.',
names(merged_data) <- gsub("-", ".", names(merged_data))
# the literal "()" is removed,
names(merged_data) <- gsub("\\(\\)", "", names(merged_data))
# and a doubled "BodyBody" is reduced to a single "Body".
names(merged_data) <- gsub("BodyBody", "Body", names(merged_data))
library(dplyr)
#########################################################################################
## STEP 5: From the data set in step 4, creates a second, independent tidy data set ##
## with the average of each variable for each activity and each subject. ##
## group the merged data by activities and then by subject ##
## summarise the grouped data and calculate the means of the other variables ##
## grouped by activity and subject ##
#########################################################################################
# One group per (activity, subject) pair.
grouped_data <- group_by(merged_data, activity, subject)
# Average every remaining column within each group. across() replaces the
# deprecated summarise_each()/funs() pair; grouping columns are never selected
# by across(), so only the measurement columns are averaged.
# .groups = "drop_last" leaves the result grouped by activity, which is the
# grouping summarise_each() returned.
final_data <- summarise(
  grouped_data,
  across(everything(), mean),
  .groups = "drop_last"
)
# View() opens a data viewer window, so only call it when someone is there to
# look at it; batch runs (e.g. Rscript) skip it instead of failing.
if (interactive()) {
  View(final_data)
}