-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.R
110 lines (90 loc) · 4.76 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#run_analysis.R
#Merges the training and the test sets to create one data set.
#Extracts only the measurements on the mean and standard deviation for each measurement.
#Uses descriptive activity names to name the activities in the data set
#Appropriately labels the data set with descriptive activity names.
#Creates a second, independent tidy data set with the average of each variable for each activity and each subject.
library(reshape2) # for melt()
# Load the main body of data.
# Each file is read with read.table() and the resulting data frames are
# stacked row-wise in the order listed (training rows first, then test rows).
# The frames are combined with a single rbind() call instead of growing a
# data frame inside a loop, and the path vector is not named `F`, so the
# built-in shorthand for FALSE is not masked.
measurement_files <- c('UCI HAR Dataset/train/X_train.txt', 'UCI HAR Dataset/test/X_test.txt')
df <- do.call(rbind, lapply(measurement_files, read.table))
# Select the variables to keep from the main body of data and discard the rest.
# Each line of features.txt is split on a single space: the first field is
# taken as the variable's column number, the second as its name.
# Every line containing "mean" or "std" (case-insensitive) is kept, i.e. the
# statistical mean and standard deviation variables.
# keep_varnums  : ordinal position of each kept variable in the original files.
# keep_varnames : the matching names, made R compatible and lower-cased.
feature_lines <- readLines('UCI HAR Dataset/features.txt')
kept_lines <- grep(pattern = 'mean|std', x = feature_lines, ignore.case = TRUE, value = TRUE)
kept_fields <- strsplit(kept_lines, " ", fixed = TRUE)
# vapply() guarantees character results even when nothing matches.
keep_varnums <- as.integer(vapply(kept_fields, function(s) s[1], character(1)))
keep_varnames <- tolower(make.names(vapply(kept_fields, function(s) s[2], character(1))))
# Fail early if features.txt and the measurement data disagree.
stopifnot(!anyNA(keep_varnums), all(keep_varnums <= ncol(df)))
# Replace the dataframe with its useful variable subset only.
# drop = FALSE keeps a data frame even if a single column is selected.
df <- df[, keep_varnums, drop = FALSE] # Use the variable numbers to identify columns to keep.
names(df) <- keep_varnames # Assign them the variable names which are R compatible.
# Now the dataframe looks like this. Same number of rows as originally,
# but only the 86 columns whose names match 'mean' or 'std' remain.
#> df[1:3,c(1,2,3)]
#  tbodyacc.mean...x tbodyacc.mean...y tbodyacc.mean...z
#1         0.2885845       -0.02029417        -0.1329051
#2         0.2784188       -0.01641057        -0.1235202
#3         0.2796531       -0.01946716        -0.1134617
#> ncol(df)
#[1] 86
#> nrow(df)
#[1] 10299
# 'Subject' column will now be added to the dataframe.
# The subject files are stacked train-then-test, the same order used for the
# measurement files, so row i of dfs belongs to row i of df.
subject_files <- c('UCI HAR Dataset/train/subject_train.txt', 'UCI HAR Dataset/test/subject_test.txt')
dfs <- do.call(rbind, lapply(subject_files, read.table))
names(dfs) <- 'subject'
# Guard against silent recycling if the row counts ever disagree.
stopifnot(nrow(dfs) == nrow(df))
df$subject <- dfs$subject # Appends the new column 'subject' to df
# Activity is the label, that is, the prediction for an observation.
# Activity column will now be added to the dataframe.
# The files hold an integer code per row, but a factor column with
# descriptive names is what we want in the dataframe, so the codes are
# translated as follows:
#1 WALKING
#2 WALKING_UPSTAIRS
#3 WALKING_DOWNSTAIRS
#4 SITTING
#5 STANDING
#6 LAYING
# The activity files are stacked train-then-test, the same order used for the
# measurement files.
activity_files <- c('UCI HAR Dataset/train/y_train.txt', 'UCI HAR Dataset/test/y_test.txt')
dfa <- do.call(rbind, lapply(activity_files, read.table))
names(dfa) <- 'activity_int'
activity_labels <- c('WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING', 'STANDING', 'LAYING')
# factor() maps code i straight to the i-th label, so no artificial 0 break
# is needed; any code outside 1..6 becomes NA.
dfa$activity <- factor(dfa$activity_int, levels = seq_along(activity_labels), labels = activity_labels)
# Guard against silent recycling if the row counts ever disagree.
stopifnot(nrow(dfa) == nrow(df))
df$activity <- dfa$activity # Appends the new column 'activity' to df
# Finally, create a second, independent tidy data set
# with the average of each variable for each activity and each subject.
# Every column except the two grouping columns is averaged. The measurement
# columns are picked by excluding 'subject' and 'activity' rather than by a
# hard-coded range such as 1:53, which silently dropped kept variables once
# the number of kept columns changed.
# Rows with NA in a grouping column are omitted. (See: aggregate())
is_measurement <- !(names(df) %in% c('subject', 'activity'))
variable_means <- aggregate(
  df[, is_measurement, drop = FALSE],
  by = list(subject = df$subject, activity = df$activity),
  FUN = mean
)
# Naming the elements of `by` labels the grouping columns 'subject' and
# 'activity' directly instead of the default 'Group.1' / 'Group.2'.
# One observation per row is preferred over one row holding every variable,
# so the wide table is melted into long form.
variable_means_one_observation <- melt(variable_means, id.vars = c('subject', 'activity'))
# Good, now data looks like this, which is a TIDY one value per row.
# (Expected output; there is one row per subject/activity/variable, so with
# 86 kept variables and 180 subject/activity pairs there are 15480 rows.)
#> head(variable_means_one_observation, 40)
#   subject         activity          variable     value
#1        1          WALKING tbodyacc.mean...x 0.2773308
#2        2          WALKING tbodyacc.mean...x 0.2764266
#3        3          WALKING tbodyacc.mean...x 0.2755675
#4        4          WALKING tbodyacc.mean...x 0.2785820
#5        5          WALKING tbodyacc.mean...x 0.2778423
#...
#31       1 WALKING_UPSTAIRS tbodyacc.mean...x 0.2554617
#32       2 WALKING_UPSTAIRS tbodyacc.mean...x 0.2471648
#> tail(variable_means_one_observation, 40)
# ...
#15478   28 LAYING angle.z.gravitymean.  0.0503114269
#15479   29 LAYING angle.z.gravitymean. -0.4234732791
#15480   30 LAYING angle.z.gravitymean. -0.5070199383
# Write the tidy data set out as a single txt file, per the requirement;
# row names and quoting are both switched off.
write.table(
  variable_means_one_observation,
  file = "means.txt",
  quote = FALSE,
  row.names = FALSE
)