diff --git a/Barbell_lifts.html b/Barbell_lifts.html new file mode 100644 index 0000000..f260e3e --- /dev/null +++ b/Barbell_lifts.html @@ -0,0 +1,650 @@ + + + + + + + + + + + + + + + +Barbell lift + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Introduction

+

This project leverages data from accelerometers worn by six people on +different parts of their bodies while performing barbell lifts. Each +person performed the barbell lift both correctly and incorrectly, in five +different ways. The goal is to build a machine learning model that predicts +the type of barbell lift. The first step is to explore the data, +understand the features, and clean them up. The two data sets contain 160 +columns, and the class label column is “classe”, which indicates the five types of +movement pattern (A, B, C, D, E).

+
# Load the test and training sets from CSV files in the current working directory
+pml_testing <- read.csv("./pml-testing.csv")
+pml_training <- read.csv("./pml-training.csv")
+# Explore the data: bar chart of how many training rows belong to each user_name
+barplot(table(pml_training$user_name),xlab = "Person", ylab = "Frequency",col = rainbow(6), border = "black")
+

+
barplot(table(pml_training$classe),xlab = "Classe", ylab = "Frequency",  # bar chart of training rows per classe value
+        col = rainbow(6), border = "black")  # NOTE(review): six colours requested; the text describes five classes (A-E)
+

+
# Convert the 'classe' column of the training data to a factor (categorical outcome)
+pml_training$classe <- as.factor(pml_training$classe)
+
+
+

Data pre-processing

+

Some columns in the dataset contain missing values. Columns in which 10% or more of the values are missing or empty are +removed before splitting the data. The training data contains 14718 +rows, which are divided into training and validation sets for model +training and evaluation.

+
# Find the columns to keep: those in which fewer than 10% of the values are NA or ""
+keep_col <- pml_training %>% 
+              mutate(across(everything(), ~ ifelse(is.null(.) | . == "", NA, .))) %>%  # recode "" as NA; NOTE(review): is.null(.) tests the whole column, so it is presumably always FALSE here
+              summarise(across(everything(), ~mean(is.na(.)))) %>%  # fraction of NA values in each column
+              select(where(~ . < 0.10)) %>%  # keep only columns whose NA fraction is below 0.10
+              names()
+pml_training_filted <- pml_training[,keep_col]
+pml_training_filted <- pml_training_filted[,-c(1,3:5)]  # drop kept columns 1 and 3-5 by position; presumably row-id/timestamp columns - TODO confirm
+
# Quick look at some of the data: pairs plot of columns 3-5 of the filtered set, with classe as the outcome
+featurePlot(x = pml_training_filted[,3:5], y = pml_training_filted$classe,
+            plot = "pairs", auto.key = list(pml_training_filted$classe))  # NOTE(review): auto.key receives the whole classe vector wrapped in a list - confirm this is the intended legend spec
+

+
# Split the filtered training data into training (p = 3/4, stratified on classe) and validation sets ####
+set.seed(20241)
+inTrain = createDataPartition(pml_training_filted$classe, p = 3/4)[[1]]  # row indices selected for the training set
+training = pml_training_filted[ inTrain,]
+validating = pml_training_filted[-inTrain,]  # all remaining rows form the validation set
+
+
+

Model training

+

The models are trained using four machine learning algorithms: Random +Forest, Gradient Boosting (gbm), Linear Discriminant Analysis (lda), and +Support Vector Machines (svm). Ten-fold cross-validation, repeated three times, is +applied to check that the models generalize well to unseen data. +Random Forest trains very slowly compared with the other three algorithms.

+
set.seed(20242)  # fix the RNG seed before the resampling and model fitting below
+fitControl <- trainControl(method = "repeatedcv",number = 10,repeats = 3, verboseIter = FALSE)  # 10-fold cross-validation repeated 3 times
+# Fit the random forest on all predictors; messages, warnings and printed output from train() are suppressed
+suppressMessages({
+  suppressWarnings({
+    capture.output({
+      pml_rf <- train(classe ~ ., method = "rf", data = training, trControl = fitControl, verbose = FALSE)
+    })
+  })
+})
+
# Train the gbm, lda, and svm models on the same training set and predictors (classe ~ .)
+pml_gbm <- train(classe ~ ., method = "gbm", data = training,
+                 trControl = fitControl,verbose = FALSE)
+pml_lda <- train(classe ~ ., method = "lda", data = training,
+                 trControl = fitControl,verbose = FALSE)
+pml_svm <- svm(classe ~ ., data = training,  # NOTE(review): svm() is called directly, not through train(); presumably e1071::svm - confirm
+               trControl = fitControl,verbose = FALSE)  # NOTE(review): trControl is a train() argument - confirm svm() uses it, otherwise this model is not cross-validated
+
+
+

Model evaluation

+

The models were evaluated on the validation dataset using accuracy +and other metrics such as precision and recall. Feature importance was +also analyzed to identify the variables that had the greatest influence on +the predictions. The evaluation results showed that the Random Forest +model achieved an accuracy of 1.00, gbm achieved 0.99, lda scored 0.75, +and svm scored 0.95. Among these, the Random Forest model performed the +best. For the top two models, Random Forest and gbm, the most important +variables were identified. In both models, the same top three variables come +up: num_window, roll_belt, and pitch_forearm; yaw_belt and magnet_dumbbell_z complete the top five in both.

+
# Validation: predict classe for the held-out validation rows with each fitted model
+rf_val <-  predict(pml_rf, newdata = validating)
+gbm_val <- predict(pml_gbm, newdata = validating)
+lda_val <- predict(pml_lda, newdata = validating)
+svm_val <- predict(pml_svm, newdata = validating)
+# Accuracy: confusion matrix of each model's predictions against the true validation classe
+rf_conf <- confusionMatrix(rf_val, validating$classe)
+gbm_conf <- confusionMatrix(gbm_val, validating$classe)
+lda_conf <- confusionMatrix(lda_val, validating$classe)
+svm_conf <- confusionMatrix(svm_val, validating$classe)
+
+print(paste("Accuracy of Random Forest Model: ",rf_conf$overall['Accuracy']))  # label ends in a space and paste() adds its separator, hence the double space in the output
+
## [1] "Accuracy of Random Forest Model:  0.997553017944535"
+
print(paste("Accuracy of GBM Model:",gbm_conf$overall['Accuracy']))  # overall accuracy taken from the GBM confusion matrix
+
## [1] "Accuracy of GBM Model: 0.986745513866232"
+
print(paste("Accuracy of LDA Model:",lda_conf$overall['Accuracy']))  # overall accuracy taken from the LDA confusion matrix
+
## [1] "Accuracy of LDA Model: 0.752243066884176"
+
print(paste("Accuracy of SVM Model:",svm_conf$overall['Accuracy']))  # overall accuracy taken from the SVM confusion matrix
+
## [1] "Accuracy of SVM Model: 0.945962479608483"
+
# Variable importance of the random forest model, unscaled (scale = FALSE)
+rf_importance <- varImp(pml_rf, scale = FALSE)
+print(rf_importance)
+
## rf variable importance
+## 
+##   only 20 most important variables shown (out of 59)
+## 
+##                      Overall
+## num_window            2004.0
+## roll_belt             1368.0
+## pitch_forearm          846.3
+## yaw_belt               701.5
+## magnet_dumbbell_z      627.7
+## pitch_belt             600.8
+## magnet_dumbbell_y      584.5
+## roll_forearm           487.1
+## accel_dumbbell_y       280.7
+## magnet_dumbbell_x      245.9
+## roll_dumbbell          237.0
+## accel_forearm_x        232.6
+## accel_belt_z           217.1
+## accel_dumbbell_z       192.1
+## total_accel_dumbbell   188.9
+## magnet_belt_z          176.7
+## magnet_forearm_z       158.8
+## magnet_belt_y          152.5
+## magnet_belt_x          135.8
+## roll_arm               128.2
+
gbm_importance <- varImp(pml_gbm, scale = FALSE)  # variable importance of the GBM model, unscaled
+print(rf_importance)  # NOTE(review): prints rf_importance a second time (likely a copy-paste slip); gbm_importance is printed by a later statement
+
## rf variable importance
+## 
+##   only 20 most important variables shown (out of 59)
+## 
+##                      Overall
+## num_window            2004.0
+## roll_belt             1368.0
+## pitch_forearm          846.3
+## yaw_belt               701.5
+## magnet_dumbbell_z      627.7
+## pitch_belt             600.8
+## magnet_dumbbell_y      584.5
+## roll_forearm           487.1
+## accel_dumbbell_y       280.7
+## magnet_dumbbell_x      245.9
+## roll_dumbbell          237.0
+## accel_forearm_x        232.6
+## accel_belt_z           217.1
+## accel_dumbbell_z       192.1
+## total_accel_dumbbell   188.9
+## magnet_belt_z          176.7
+## magnet_forearm_z       158.8
+## magnet_belt_y          152.5
+## magnet_belt_x          135.8
+## roll_arm               128.2
+
print(gbm_importance)  # show the unscaled GBM variable importance computed earlier
+
## gbm variable importance
+## 
+##   only 20 most important variables shown (out of 59)
+## 
+##                   Overall
+## num_window         4691.6
+## roll_belt          2873.4
+## pitch_forearm      1602.8
+## magnet_dumbbell_z  1226.0
+## yaw_belt           1184.3
+## magnet_dumbbell_y   789.9
+## roll_forearm        728.6
+## pitch_belt          546.4
+## magnet_belt_z       508.8
+## accel_forearm_x     408.2
+## accel_dumbbell_z    404.2
+## roll_dumbbell       341.9
+## accel_dumbbell_y    340.7
+## magnet_forearm_z    309.3
+## gyros_belt_z        306.5
+## gyros_dumbbell_y    299.9
+## accel_dumbbell_x    216.2
+## magnet_belt_x       196.9
+## yaw_arm             184.6
+## magnet_belt_y       177.9
+
+
+

Prediction on Test Data:

+

We used all four models to predict the barbell lift patterns on the +testing data. The results show that the top three models gave +largely consistent predictions (Random Forest and GBM agreed on all 20 cases; SVM differed only on case 3), while the LDA model disagreed on seven cases. +Given the lower accuracy of the LDA model, its results were +discarded.

+
# Trim the testing data to the columns used for training, excluding the 56th training column
+pml_testing_filted <- pml_testing[,colnames(training)[-56]]  # NOTE(review): -56 presumably drops 'classe' by position; a name-based drop would be safer - confirm
+# Predict the test cases with each of the four fitted models
+rf_pred <-  predict(pml_rf,  newdata = pml_testing_filted)
+gbm_pred <- predict(pml_gbm, newdata = pml_testing_filted)
+lda_pred <- predict(pml_lda, newdata = pml_testing_filted)
+svm_pred <- predict(pml_svm, newdata = pml_testing_filted)
+
+Results <- data.frame(rf = rf_pred, gbm = gbm_pred,
+                      lda = lda_pred, svm = svm_pred)  # one column of predictions per model, one row per test case
+print(Results)
+
##    rf gbm lda svm
+## 1   B   B   C   B
+## 2   A   A   A   A
+## 3   B   B   B   A
+## 4   A   A   A   A
+## 5   A   A   A   A
+## 6   E   E   D   E
+## 7   D   D   D   D
+## 8   B   B   D   B
+## 9   A   A   A   A
+## 10  A   A   A   A
+## 11  B   B   D   B
+## 12  C   C   A   C
+## 13  B   B   B   B
+## 14  A   A   A   A
+## 15  E   E   B   E
+## 16  E   E   A   E
+## 17  A   A   A   A
+## 18  B   B   B   B
+## 19  B   B   B   B
+## 20  B   B   B   B
+
+
+

Summary

+

In this project, a comprehensive machine learning approach was used +to build a model that predicts barbell lift patterns. The data +preprocessing step removed the columns with missing values and split the +data into training and validation sets. Four machine learning algorithms +were employed: Random Forest, Gradient Boosting, Linear Discriminant +Analysis, and Support Vector Machines. To check that the models generalized +to new data, ten-fold cross-validation was applied, helping to estimate +the expected out-of-sample error and prevent overfitting. The model +evaluation showed that Random Forest had the highest accuracy (0.998), +followed closely by GBM (0.987) and SVM (0.95), while LDA lagged behind +with 0.75 accuracy. The choices made were guided by the goal of +maximizing accuracy and reducing the error. Finally, the prediction +models were used to make predictions on 20 test cases. Since the Random +Forest has the highest accuracy, it was selected as the primary model. If +the speed of training the model is critical, GBM is a good +alternative.

+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/Barbell_lifts_cache/html/__packages b/Barbell_lifts_cache/html/__packages new file mode 100644 index 0000000..f22fac9 --- /dev/null +++ b/Barbell_lifts_cache/html/__packages @@ -0,0 +1,15 @@ +tidyverse +ggplot2 +tibble +tidyr +readr +purrr +dplyr +stringr +forcats +lubridate +lattice +caret +gbm +e1071 +randomForest diff --git a/Barbell_lifts_cache/html/model training 1_79d18f1436de695727695aaf6f496600.RData b/Barbell_lifts_cache/html/model training 1_79d18f1436de695727695aaf6f496600.RData new file mode 100644 index 0000000..8b57544 Binary files /dev/null and b/Barbell_lifts_cache/html/model training 1_79d18f1436de695727695aaf6f496600.RData differ diff --git a/Barbell_lifts_cache/html/model training 1_79d18f1436de695727695aaf6f496600.rdb b/Barbell_lifts_cache/html/model training 1_79d18f1436de695727695aaf6f496600.rdb new file mode 100644 index 0000000..4fd58de Binary files /dev/null and b/Barbell_lifts_cache/html/model training 1_79d18f1436de695727695aaf6f496600.rdb differ diff --git a/Barbell_lifts_cache/html/model training 1_79d18f1436de695727695aaf6f496600.rdx b/Barbell_lifts_cache/html/model training 1_79d18f1436de695727695aaf6f496600.rdx new file mode 100644 index 0000000..053a566 Binary files /dev/null and b/Barbell_lifts_cache/html/model training 1_79d18f1436de695727695aaf6f496600.rdx differ diff --git a/Barbell_lifts_cache/html/model training2_c4a9a0307c8ac0cd9a1cdec19523e3fc.RData b/Barbell_lifts_cache/html/model training2_c4a9a0307c8ac0cd9a1cdec19523e3fc.RData new file mode 100644 index 0000000..fe3492b Binary files /dev/null and b/Barbell_lifts_cache/html/model training2_c4a9a0307c8ac0cd9a1cdec19523e3fc.RData differ diff --git a/Barbell_lifts_cache/html/model training2_c4a9a0307c8ac0cd9a1cdec19523e3fc.rdb b/Barbell_lifts_cache/html/model training2_c4a9a0307c8ac0cd9a1cdec19523e3fc.rdb new file mode 100644 index 0000000..03c53c5 Binary files /dev/null and b/Barbell_lifts_cache/html/model 
training2_c4a9a0307c8ac0cd9a1cdec19523e3fc.rdb differ diff --git a/Barbell_lifts_cache/html/model training2_c4a9a0307c8ac0cd9a1cdec19523e3fc.rdx b/Barbell_lifts_cache/html/model training2_c4a9a0307c8ac0cd9a1cdec19523e3fc.rdx new file mode 100644 index 0000000..55343fb Binary files /dev/null and b/Barbell_lifts_cache/html/model training2_c4a9a0307c8ac0cd9a1cdec19523e3fc.rdx differ diff --git a/Barbell_lifts_files/figure-html/Explore-1.png b/Barbell_lifts_files/figure-html/Explore-1.png new file mode 100644 index 0000000..41949ec Binary files /dev/null and b/Barbell_lifts_files/figure-html/Explore-1.png differ diff --git a/Barbell_lifts_files/figure-html/Explore-2.png b/Barbell_lifts_files/figure-html/Explore-2.png new file mode 100644 index 0000000..283e6bc Binary files /dev/null and b/Barbell_lifts_files/figure-html/Explore-2.png differ diff --git a/Barbell_lifts_files/figure-html/Peprocess-1.png b/Barbell_lifts_files/figure-html/Peprocess-1.png new file mode 100644 index 0000000..8271272 Binary files /dev/null and b/Barbell_lifts_files/figure-html/Peprocess-1.png differ