-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_analysis.R
80 lines (53 loc) · 3.04 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#Author: Douglas Cesar Pampu
# 1 - Merges the training and the test sets to create one data set.
# 2 - Extracts only the measurements on the mean and standard deviation for each measurement.
# 3 - Uses descriptive activity names to name the Activities in the data set
# 4 - Appropriately labels the data set with descriptive variable names.
# 5 - From the data set in step 4, creates a second, independent tidy data set with the average of each variable for each activity and each subject.
library(data.table)
library(reshape2)
# Directories holding the training and test partitions of the dataset
dir_train <- "data/project/UCI HAR Dataset/train"
dir_tests <- "data/project/UCI HAR Dataset/test"

# Training partition: subject file, X (measurements) file and y file
dt.Subject_train <- read.table(file.path(dir_train, "subject_train.txt"))
dt.x_train <- read.table(file.path(dir_train, "X_train.txt"))
dt.y_train <- read.table(file.path(dir_train, "y_train.txt"))

# Test partition: same three files, read from the test directory
dt.Subject_test <- read.table(file.path(dir_tests, "subject_test.txt"))
dt.x_test <- read.table(file.path(dir_tests, "X_test.txt"))
dt.y_test <- read.table(file.path(dir_tests, "y_test.txt"))

# Feature labels: keep only the second column of features.txt
# (the first column is just the row number)
dt.Labels.features <- read.table("data/project/UCI HAR Dataset/features.txt")[[2]]
# Appropriately label the data set with descriptive variable names, then
# extract only the measurements on the mean and standard deviation.
# NOTE(review): "mean|std" is an unanchored, case-sensitive substring match,
# so every feature whose label merely contains "mean" or "std" is kept --
# confirm that this is the intended selection.

# Test partition: name the columns after the feature labels, then subset
colnames(dt.x_test) <- dt.Labels.features
dt.x_test <- dt.x_test[, grepl("mean|std", dt.Labels.features)]

# Training partition: same labelling and subsetting
colnames(dt.x_train) <- dt.Labels.features
dt.x_train <- dt.x_train[, grepl("mean|std", dt.Labels.features)]

# Stack the two partitions row-wise: test rows first, then training rows
dt.mean_std_train_test <- rbind(dt.x_test, dt.x_train)
# Use descriptive activity names to name the activities in the data set.
# Load the activity lookup table. Column 2 holds the activity names; column 1
# is assumed to hold the activity id used in the y files -- TODO confirm.
dt.activity_lookup <- read.table(
  "data/project/UCI HAR Dataset/activity_labels.txt"
)
dt.Labels.activity <- dt.activity_lookup[, 2]

# Add the activity name as a second column of each y table. The name is
# looked up by matching the id against column 1 of the lookup table rather
# than by row position, so the result stays correct even if the label file
# is not sorted by id. For a file ordered 1..n the result is unchanged.
dt.y_test[, 2] <- dt.Labels.activity[
  match(dt.y_test[, 1], dt.activity_lookup[, 1])
]
dt.y_train[, 2] <- dt.Labels.activity[
  match(dt.y_train[, 1], dt.activity_lookup[, 1])
]
names(dt.y_test) <- c("Activity_ID", "Activity_Name")
names(dt.y_train) <- c("Activity_ID", "Activity_Name")

# Merge the y tables: test rows first, then training rows
dt.Activities <- rbind(dt.y_test, dt.y_train)

# Merge the subject tables in the same test-then-train order and name
# the single column
dt.Subject <- rbind(dt.Subject_test, dt.Subject_train)
colnames(dt.Subject) <- "Subject_Number"
# Merge the subject, activity and measurement tables column-wise. All three
# were stacked test-first, then train, so their rows line up.
dt.Test_Train <- cbind(
  as.data.table(dt.Subject), dt.Activities, dt.mean_std_train_test
)

# From the merged data set, create a second, independent tidy data set with
# the average of each variable for each activity and each subject.
id_cols <- c("Subject_Number", "Activity_ID", "Activity_Name")
measure_cols <- setdiff(colnames(dt.Test_Train), id_cols)

# Long format: one row per (subject, activity, variable) observation
dt.Tidy <- melt(dt.Test_Train, id.vars = id_cols, measure.vars = measure_cols)

# Wide format with one averaged column per variable. Activity_Name is part of
# the formula so the descriptive activity label is kept in the output
# instead of being dropped during the cast.
dt.tidydata <- dcast(
  dt.Tidy,
  Subject_Number + Activity_ID + Activity_Name ~ variable,
  fun.aggregate = mean
)

# Write the new data set (argument spelled out in full: row.names)
write.table(dt.tidydata, file = "tidy_data.txt", row.names = FALSE)