diff --git a/Barbell_lifts.html b/Barbell_lifts.html new file mode 100644 index 0000000..f260e3e --- /dev/null +++ b/Barbell_lifts.html @@ -0,0 +1,650 @@
+This project leverages data from accelerometers worn by six people on different parts of their bodies while performing barbell lifts. Each person performed the barbell lift both correctly and incorrectly in five different ways. The goal is to build a machine learning model that predicts the type of barbell lift. The first step is to explore the data, understand the features, and clean them up. The two data sets contain 160 columns, and the classifier column is "classe", which indicates the five movement patterns (A, B, C, D, E).
+# Load the required libraries and the data
+library(dplyr)   # mutate, across, summarise
+library(caret)   # featurePlot, createDataPartition, train, confusionMatrix, varImp
+library(e1071)   # svm
+pml_testing <- read.csv("./pml-testing.csv")
+pml_training <- read.csv("./pml-training.csv")
+# Explore the data
+barplot(table(pml_training$user_name), xlab = "Person", ylab = "Frequency",
+        col = rainbow(6), border = "black")
+barplot(table(pml_training$classe), xlab = "Classe", ylab = "Frequency",
+        col = rainbow(6), border = "black")
+# Convert the 'classe' to factor
+pml_training$classe <- as.factor(pml_training$classe)
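+The claims above can be confirmed with a quick sanity check; a minimal sketch (output omitted):
+# Sketch: verify dimensions and class levels of the raw training data
+dim(pml_training)            # expect 160 columns
+table(pml_training$classe)   # expect five levels: A, B, C, D, E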
+Some columns in the dataset contain missing values. These columns will be removed before splitting the data. The filtered training data will then be split three-to-one into a training set (14,718 rows) and a validation set for model training and evaluation.
+# Find the columns that are mostly NA or empty strings and keep the rest.
+# (is.null() never fires on elements of a data frame column, so empty
+# strings are recoded to NA with na_if() instead.)
+keep_col <- pml_training %>%
+  mutate(across(where(is.character), ~ na_if(., ""))) %>%
+  summarise(across(everything(), ~ mean(is.na(.)))) %>%
+  select(where(~ . < 0.10)) %>%   # keep columns with < 10% missing
+  names()
+pml_training_filted <- pml_training[, keep_col]
+# Drop the row index (column 1) and the three timestamp columns (3:5)
+pml_training_filted <- pml_training_filted[, -c(1, 3:5)]
+
+# Quick look at a few of the features, colored by class
+featurePlot(x = pml_training_filted[, 3:5], y = pml_training_filted$classe,
+            plot = "pairs", auto.key = list(columns = 5))
+# Split the filtered training data into training and validation sets
+set.seed(20241)
+inTrain <- createDataPartition(pml_training_filted$classe, p = 3/4)[[1]]
+training   <- pml_training_filted[ inTrain, ]
+validating <- pml_training_filted[-inTrain, ]
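+A quick check of the resulting split sizes (a minimal sketch) confirms the 14,718-row figure mentioned above:
+# Sketch: confirm the split sizes
+nrow(training)     # expect 14,718 (about 3/4 of the filtered rows)
+nrow(validating)   # the remaining ~1/4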
+The data will be trained with four machine learning algorithms: Random Forest (rf), Gradient Boosting (gbm), Linear Discriminant Analysis (lda), and Support Vector Machines (svm). Ten-fold cross-validation, repeated three times, will be applied to ensure that the models generalize well to unseen data. Random Forest trains very slowly compared to the other three algorithms (see the timing sketch after the training code below).
+set.seed(20242)
+# Repeated ten-fold cross-validation: 10 folds, repeated 3 times
+fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3, verboseIter = FALSE)
+# Train the Random Forest model (wrapped to silence its console output)
+suppressMessages({
+  suppressWarnings({
+    capture.output({
+      pml_rf <- train(classe ~ ., method = "rf", data = training, trControl = fitControl, verbose = FALSE)
+    })
+  })
+})
+# Train the gbm and lda models with the same cross-validation settings
+pml_gbm <- train(classe ~ ., method = "gbm", data = training,
+                 trControl = fitControl, verbose = FALSE)
+pml_lda <- train(classe ~ ., method = "lda", data = training,
+                 trControl = fitControl, verbose = FALSE)
+# e1071::svm() is not a caret model and silently ignores trControl and
+# verbose, so it is fitted directly without cross-validation
+pml_svm <- svm(classe ~ ., data = training)
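+The speed difference noted above can be quantified: caret records training times on each fitted train object in its times element. A minimal sketch (timings will vary by machine):
+# Sketch: compare elapsed training times of the caret models
+pml_rf$times$everything    # total elapsed time for the Random Forest fit
+pml_gbm$times$everything   # typically much shorter than rf
+pml_lda$times$everything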
+The models were evaluated on the validation dataset using accuracy and other metrics such as precision and recall (a short sketch of these follows the accuracy output below). Feature importance was also analyzed to identify the variables that had the greatest influence on the predictions. The evaluation showed that the Random Forest model achieved an accuracy of 0.998, gbm achieved 0.987, lda scored 0.752, and svm reached 0.946. Among these, the Random Forest model performed the best. For the top two models, Random Forest and gbm, the most important variables were identified. In both models, the same top four variables come up: num_window, roll_belt, pitch_forearm, and yaw_belt.
+# Validation
+rf_val <- predict(pml_rf, newdata = validating)
+gbm_val <- predict(pml_gbm, newdata = validating)
+lda_val <- predict(pml_lda, newdata = validating)
+svm_val <- predict(pml_svm, newdata = validating)
+# Confusion matrices (accuracy and per-class metrics) on the validation set
+rf_conf <- confusionMatrix(rf_val, validating$classe)
+gbm_conf <- confusionMatrix(gbm_val, validating$classe)
+lda_conf <- confusionMatrix(lda_val, validating$classe)
+svm_conf <- confusionMatrix(svm_val, validating$classe)
+
+print(paste("Accuracy of Random Forest Model: ",rf_conf$overall['Accuracy']))
+## [1] "Accuracy of Random Forest Model: 0.997553017944535"
+print(paste("Accuracy of GBM Model:",gbm_conf$overall['Accuracy']))
+## [1] "Accuracy of GBM Model: 0.986745513866232"
+print(paste("Accuracy of LDA Model:",lda_conf$overall['Accuracy']))
+## [1] "Accuracy of LDA Model: 0.752243066884176"
+print(paste("Accuracy of SVM Model:",svm_conf$overall['Accuracy']))
+## [1] "Accuracy of SVM Model: 0.945962479608483"
+# Variable importance for the two best models
+rf_importance <- varImp(pml_rf, scale = FALSE)
+print(rf_importance)
+## rf variable importance
+##
+## only 20 most important variables shown (out of 59)
+##
+## Overall
+## num_window 2004.0
+## roll_belt 1368.0
+## pitch_forearm 846.3
+## yaw_belt 701.5
+## magnet_dumbbell_z 627.7
+## pitch_belt 600.8
+## magnet_dumbbell_y 584.5
+## roll_forearm 487.1
+## accel_dumbbell_y 280.7
+## magnet_dumbbell_x 245.9
+## roll_dumbbell 237.0
+## accel_forearm_x 232.6
+## accel_belt_z 217.1
+## accel_dumbbell_z 192.1
+## total_accel_dumbbell 188.9
+## magnet_belt_z 176.7
+## magnet_forearm_z 158.8
+## magnet_belt_y 152.5
+## magnet_belt_x 135.8
+## roll_arm 128.2
+gbm_importance <- varImp(pml_gbm, scale = FALSE)
+print(gbm_importance)
+## gbm variable importance
+##
+## only 20 most important variables shown (out of 59)
+##
+## Overall
+## num_window 4691.6
+## roll_belt 2873.4
+## pitch_forearm 1602.8
+## magnet_dumbbell_z 1226.0
+## yaw_belt 1184.3
+## magnet_dumbbell_y 789.9
+## roll_forearm 728.6
+## pitch_belt 546.4
+## magnet_belt_z 508.8
+## accel_forearm_x 408.2
+## accel_dumbbell_z 404.2
+## roll_dumbbell 341.9
+## accel_dumbbell_y 340.7
+## magnet_forearm_z 309.3
+## gyros_belt_z 306.5
+## gyros_dumbbell_y 299.9
+## accel_dumbbell_x 216.2
+## magnet_belt_x 196.9
+## yaw_arm 184.6
+## magnet_belt_y 177.9
+We used all four models to predict the barbell lift patterns on the testing data. The results show that the three strongest models produced largely consistent predictions (quantified in the sketch after the results table), while the LDA model produced different results. Given the lower accuracy of the LDA model, its results were discarded.
+# Trim the testing data to the same predictor columns as the training set
+# (column 56 of training is classe, which the testing data lacks)
+pml_testing_filted <- pml_testing[, colnames(training)[-56]]
+# prediction
+rf_pred <- predict(pml_rf, newdata = pml_testing_filted)
+gbm_pred <- predict(pml_gbm, newdata = pml_testing_filted)
+lda_pred <- predict(pml_lda, newdata = pml_testing_filted)
+svm_pred <- predict(pml_svm, newdata = pml_testing_filted)
+
+Results <- data.frame(rf = rf_pred, gbm = gbm_pred,
+ lda = lda_pred, svm = svm_pred)
+print(Results)
+## rf gbm lda svm
+## 1 B B C B
+## 2 A A A A
+## 3 B B B A
+## 4 A A A A
+## 5 A A A A
+## 6 E E D E
+## 7 D D D D
+## 8 B B D B
+## 9 A A A A
+## 10 A A A A
+## 11 B B D B
+## 12 C C A C
+## 13 B B B B
+## 14 A A A A
+## 15 E E B E
+## 16 E E A E
+## 17 A A A A
+## 18 B B B B
+## 19 B B B B
+## 20 B B B B
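+The consistency claim can be checked directly from the table above; a minimal sketch of each model's agreement with the Random Forest predictions:
+# Sketch: fraction of the 20 test cases on which each model agrees with rf
+mean(rf_pred == gbm_pred)   # 1.00 per the table above
+mean(rf_pred == svm_pred)   # 0.95 (differs only on case 3)
+mean(rf_pred == lda_pred)   # 0.65 (differs on 7 of 20 cases)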
+In this project, a comprehensive machine learning approach was used to build a model that predicts barbell lift patterns. The data preprocessing step removed the columns with missing values and split the data into training and validation sets. Four machine learning algorithms were employed: Random Forest, Gradient Boosting, Linear Discriminant Analysis, and Support Vector Machines. To ensure the models generalized to new data, ten-fold cross-validation (repeated three times) was applied, helping to estimate the expected out-of-sample error and prevent overfitting. The model evaluation showed that Random Forest had the highest accuracy (0.998), followed closely by GBM (0.987) and SVM (0.946), while LDA lagged behind at 0.752. The choices made were guided by the goal of maximizing accuracy and reducing the error. Finally, the prediction model was used to make predictions on 20 test cases. Since the Random Forest model has the highest accuracy, it was selected as the primary model. If the speed of training the model is critical, GBM is a good alternative.
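+The expected out-of-sample error mentioned above can be read directly off the validation results as one minus the validation accuracy; a minimal sketch:
+# Sketch: estimated out-of-sample error from the validation set
+1 - rf_conf$overall['Accuracy']    # ~0.002 for the Random Forest model
+1 - gbm_conf$overall['Accuracy']   # ~0.013 for GBM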
+