-
Notifications
You must be signed in to change notification settings - Fork 0
/
random_Forest.R
52 lines (38 loc) · 1.89 KB
/
random_Forest.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#RandomForest
# All data, both training and test set
all_data
# Passenger on row 62 and 830 do not have a value for embarkment.
# Since many passengers embarked at Southampton, we give them the value S.
all_data$Embarked[c(62, 830)] <- "S"
# Factorize embarkment codes.
all_data$Embarked <- factor(all_data$Embarked)
# Passenger on row 1044 has an NA Fare value. Let's replace it with the median fare value.
all_data$Fare[1044] <- median(all_data$Fare, na.rm = TRUE)
# How to fill in missing Age values?
# We make a prediction of a passengers Age using the other variables and a decision tree model.
# This time you give method = "anova" since you are predicting a continuous variable.
library(rpart)
predicted_age <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + family_size,
data = all_data[!is.na(all_data$Age),], method = "anova")
all_data$Age[is.na(all_data$Age)] <- predict(predicted_age, all_data[is.na(all_data$Age),])
# Split the data back into a train set and a test set
train <- all_data[1:891,]
test <- all_data[892:1309,]
# train and test are available in the workspace
str(train)
str(test)
# Load in the package
library(randomForest)
# Set seed for reproducibility
set.seed(111)
# Apply the Random Forest Algorithm
my_forest <- randomForest(as.factor(Survived) ~ Pclass+Sex+Age+SibSp+Parch+Fare+Embarked+Title,
data=train, importance=TRUE, ntree=1000)
# Make your prediction using the test set
my_prediction <- predict(my_forest, test, type="class")
# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
my_solution <- data.frame(PassengerId = test$PassengerId, Survived = my_prediction)
#See which variables are important in the model
varImpPlot(my_forest)
# Write your solution away to a csv file with the name my_solution.csv
write.csv(my_solution,file="my_solution.csv", row.names=FALSE)