-
Notifications
You must be signed in to change notification settings - Fork 0
/
r_final_project_machineLearning.Rmd
144 lines (104 loc) · 5.48 KB
/
r_final_project_machineLearning.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
---
output:
  html_document:
    toc: yes
    toc_float: yes
  html_notebook: default
title: "R_final_project_machineLearning"
---
# decision tree (boy & girl)
## 實驗方法
以decision tree 決定此個體是否屬於受歡迎的一方,由boxplot發現男性,女性在前25%的人氣指數分別高於2012與8524,所以我以此分別做為男女方是否受歡迎的界線,而模型中變數的選擇,我選取於regression中相對具顯著效果的變數
## 實驗結果
經由圖示,我們可以清楚發現"職業"不論對女性或男性在人氣指數方面都有相當的關聯性,然而,比較不同的是,女性部分,人氣指數似乎具有地緣效果,而男性在體重變數則具一定影響力(受歡迎的一方,體重大多落在68-80間)
```{r}
# Load the survey export and derive the numeric features used by the models
# in the later chunks, then split the table by gender.
library(dplyr)
library(ggplot2)

df <- read.csv("C:/Users/warty/Documents/dsR/data/r_final_project_df.csv")

# 1/0 indicator columns and a cleaned-up city label.
df <- df %>%
  mutate(
    gender = as.numeric(igender %in% "女生"),
    single = as.numeric(iaffe %in% "單身"),
    sig_job = as.numeric(
      ijob %in% c("行銷", "未知", "經商", "金融保險", "交通/運輸", "房地產", "軍警")
    ),
    # Inner gsub: every run of three ASCII letters/digits becomes "foreign".
    # Outer gsub: drop everything from the first "<" to the last ">".
    icity = gsub("(<.*>)", "", gsub("[0-9a-zA-Z]{3}", "foreign", icity))
  )

# Bar chart of per-city mean view count, faceted by gender. The plot object is
# only stored in `a`; nothing in this chunk prints it.
a <- df %>%
  group_by(icity) %>%
  mutate(mean_view = mean(iview)) %>%
  ggplot(aes(x = icity, y = mean_view)) +
  geom_bar(stat = "identity", width = .7) +
  facet_wrap(~igender)

# Habit, education and "has requirements" features. Blank answers become NA.
df <- df %>%
  mutate(
    nosmoke = case_when(
      ismoke %in% c("討厭人抽", "從來不抽") ~ 1,
      ismoke %in% "" ~ NA_real_,
      TRUE ~ 0
    ),
    nodrink = case_when(
      idrunk %in% c("偶爾小酌", "滴酒不沾") ~ 1,
      idrunk %in% c("酒量很好", "應酬交際") ~ 0,
      TRUE ~ NA_real_
    ),
    expertdrunk = as.numeric(idrunk %in% "行家品酒"),
    saiddrink = as.numeric(idrunk %in% "藉酒澆愁"),
    # Ordinal education level 1-5 for the listed answers; any other non-blank
    # answer is coded 6.
    istudy_level = case_when(
      istudy %in% "" ~ NA_real_,
      istudy %in% "國中以下" ~ 1,
      istudy %in% "國中" ~ 2,
      istudy %in% "高中/職" ~ 3,
      istudy %in% "大學/學院" ~ 4,
      istudy %in% "碩士" ~ 5,
      TRUE ~ 6
    ),
    # 0 when the three r_*_start fields are all 0 and rstudy is blank/"不拘"/"無";
    # 1 otherwise.
    ha_re = ifelse(
      r_age_start == 0 &
        r_height_start == 0 &
        r_weight_start == 0 &
        rstudy %in% c("", "不拘", "無"),
      0,
      1
    ),
    # "未知" is blanked out first so that it converts to NA.
    iheight = as.numeric(gsub("未知", "", iheight))
  )

df_girl <- df %>% filter(gender == 1)
df_boy <- df %>% filter(gender == 0)
```
```{r}
# Fit one classification tree per gender that predicts whether a profile is
# "popular", then draw both trees.
library(rpart)
library(rpart.plot)
library(rattle)

# View-count cut-offs for "popular". The accompanying write-up describes them
# as the top-25% boundaries read off a boxplot for each gender.
girl_popular_cutoff <- 8524
boy_popular_cutoff <- 2012

df_girl <- mutate(df_girl, popular = ifelse(iview >= girl_popular_cutoff, 1, 0))
df_boy <- mutate(df_boy, popular = ifelse(iview >= boy_popular_cutoff, 1, 0))

# The girls' model also uses city; the boys' model does not.
model <- rpart(
  popular ~ factor(icity) + factor(ijob) + iweight + single + sig_job +
    nosmoke + istudy_level + ha_re,
  data = df_girl,
  method = "class"
)
summary(model)

model2 <- rpart(
  popular ~ factor(ijob) + iweight + single + sig_job +
    nosmoke + istudy_level + ha_re,
  data = df_boy,
  method = "class"
)

# Merge conflict resolved in favour of HEAD: the main title names the tree and
# the subtitle carries the shared "popular or not" caption.
fancyRpartPlot(model, main = "GIRLs' decision tree", sub = "popular or not")
fancyRpartPlot(model2, main = "BOYs' decision tree", sub = "popular or not")

# Exploratory code kept for reference (not run):
# fit <- rpart(popular ~ iheight+iweight+single+nosmoke+istudy_level+ha_re,
# method="class", data=df_girl)
# printcp(fit) # display the results
# plotcp(fit) # visualize cross-validation results
# summary(fit) # detailed summary of splits
#
# # plot tree
# plot(fit, uniform=TRUE,
# main="Classification Tree for Popular")
# text(fit, use.n=TRUE, all=TRUE, cex=.8)
#
# # create attractive postscript plot of tree
# post(fit, file = "c:/tree.ps",
# title = "Classification Tree for Kyphosis")
```
# decision tree + kmeans clustering
## 實驗方法
藉由kmeans clustering 分類出高人氣與低人氣,再以decision tree 進行特徵分析,有別於上方,我將所有變數納入模型
## 實驗結果
在男性的部分,kmeans+decision tree 所得的結果不甚理想(其原因可能為有太多離群值),故在此不予討論

而對於女性部分,我們得出另一個結果(在此我把居住城市變數拿掉,使圖形呈現較為簡潔),其中縣市同樣也是一個有相對效果的因子,另外,女性似乎在年齡上的要求也有影響,若你要求年齡在24歲以上,有基督教信仰,星座為天蠍金牛獅子等,有較大可能性屬於人氣高的一方
```{r}
# Cluster the boys into two groups by view count with k-means, then fit a
# classification tree that explains cluster membership from profile fields.
library(rpart.plot)
library(RColorBrewer)
library(caret)
set.seed(123)

# Cluster on iview by name rather than by column position, so the rows fed to
# kmeans() are exactly the rows that receive a label below.
# NOTE(review): this replaces df_boy[, 10] -- confirm column 10 is iview.
has_view <- !is.na(df_boy$iview)
mydataCluster <- kmeans(df_boy$iview[has_view], centers = 2, nstart = 20)

# Create the column first: assigning into a missing column by index fails
# when the last rows have no iview.
df_boy$cluster <- NA_integer_
df_boy$cluster[has_view] <- mydataCluster$cluster
plot(df_boy$iview, bg = df_boy$cluster, pch = 21)

# 80/20 random split; only `training` is used in this chunk.
training_size <- floor(0.80 * nrow(df_boy))
train_ind <- sample(seq_len(nrow(df_boy)), size = training_size)
training <- df_boy[train_ind, ]
testing <- df_boy[-train_ind, ]

# Construct decision tree.
# NOTE(review): k-means labels 1/2 carry no inherent order; check
# mydataCluster$centers to see which label is the high-view cluster.
fit_boy <- rpart(
  cluster ~ isign + iweight + iheight + ijob + iaffe + icity + ismoke +
    idrunk + ichara + ireligion + istudy + ischool + r_age_start +
    r_height_start + r_weight_end + rstudy + rchara + ha_re,
  method = "class",
  data = training
)
fancyRpartPlot(fit_boy)
```
```{r}
# Cluster the girls into two groups by view count with k-means, then fit a
# classification tree that explains cluster membership from profile fields.

# Cluster on iview by name rather than by column position, so the rows fed to
# kmeans() are exactly the rows that receive a label below.
# NOTE(review): this replaces df_girl[, 10] -- confirm column 10 is iview.
has_view <- !is.na(df_girl$iview)
mydataCluster <- kmeans(df_girl$iview[has_view], centers = 2, nstart = 20)

# Create the column first: assigning into a missing column by index fails
# when the last rows have no iview.
df_girl$cluster <- NA_integer_
df_girl$cluster[has_view] <- mydataCluster$cluster
plot(df_girl$iview, bg = df_girl$cluster, pch = 21)

# 80/20 random split; only `training` is used in this chunk.
training_size <- floor(0.80 * nrow(df_girl))
train_ind <- sample(seq_len(nrow(df_girl)), size = training_size)
training <- df_girl[train_ind, ]
testing <- df_girl[-train_ind, ]

# Construct decision tree. This formula has no icity term.
# NOTE(review): k-means labels 1/2 carry no inherent order; check
# mydataCluster$centers to see which label is the high-view cluster.
fit_girl <- rpart(
  cluster ~ isign + iweight + iheight + ijob + iaffe + ismoke + idrunk +
    ireligion + istudy + ischool + r_age_start + r_height_start + rstudy +
    ha_re,
  method = "class",
  data = training
)
fancyRpartPlot(fit_girl)
```