-
Notifications
You must be signed in to change notification settings - Fork 19
/
recitation_9.R
166 lines (78 loc) · 3.53 KB
/
recitation_9.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
##Recitation 9
# Kevin Munger
###Running STM
### Make sure you have the appropriate packages installed.
# Install only the packages that are missing, so re-running the script does
# not re-download and reinstall everything each time.
required_pkgs <- c("quanteda", "stm", "ggplot2")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(quanteda)
library(stm)
library(ggplot2)
### First, you need to go to my github and download the data.
### Save the two folders so that they sit at "MA paper/govdates" and
### "MA paper/oppdates" underneath `data_dir`.
# Base directory that contains the "MA paper" folder; edit for your machine.
# Building paths from it avoids setwd(), which would change the working
# directory for the rest of the session.
data_dir <- "C:/Users/kevin/Desktop"

### Get the list of files (government first, then opposition)
# NOTE(review): the covariates built later assume list.files() returns the
# files of each folder in chronological order -- confirm against the file names.
g1 <- list.files(file.path(data_dir, "MA paper", "govdates"), full.names = TRUE)
g2 <- list.files(file.path(data_dir, "MA paper", "oppdates"), full.names = TRUE)
files <- c(g1, g2)
if (length(files) == 0) {
  stop("No tweet files found under ", file.path(data_dir, "MA paper"),
       "; check data_dir.", call. = FALSE)
}

### Read in the tweets: one character vector of lines per file
tweets <- lapply(files, readLines)

## Combine all the tweets per day to form the documents.
# vapply() guarantees exactly one string per file (an empty file gives "").
txt <- vapply(tweets, paste, character(1), collapse = " ")
### Create covariates: one row per document, government files first.
# Take the group sizes from the file lists instead of hard-coding 162, and
# fail early if the documents and the file lists are out of sync.
n_gov <- length(g1)
n_opp <- length(g2)
stopifnot(length(txt) == n_gov + n_opp)

team <- rep(c("gov", "opp"), times = c(n_gov, n_opp))

# Each group is dated on consecutive days starting 2013-12-18.
# `length.out` is spelled out rather than relying on partial matching.
dates <- seq(as.Date("2013/12/18"), by = "days",
             length.out = max(n_gov, n_opp))
days <- c(dates[seq_len(n_gov)], dates[seq_len(n_opp)])

data <- data.frame(team, txt, days)
## Clean the text with STM's own processing function (Spanish, with stemming);
## the full data frame is passed along as document metadata.
processed <- textProcessor(
  documents = data$txt,
  metadata = data,
  language = "spanish",
  stem = TRUE
)
## Drop some words (lower.thresh = 20) for speed purposes
out_20 <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta,
  lower.thresh = 20
)
## Search K over 25, 50 and 75 topics. We're not going to run this now, so it
## is skipped by default; set run_search_k to TRUE to run the search and plot
## its diagnostics.
run_search_k <- FALSE
if (run_search_k) {
  model <- searchK(out_20$documents, out_20$vocab, K = c(25, 50, 75))
  plot(model)
}
### Run the model with the same number of topics as before.
# Both fits share one specification and differ only in K: team is the content
# covariate, prevalence depends on team and the numeric day, EM is capped at
# 30 iterations, and the seed is fixed.
fitSpec50 <- stm(
  documents = out_20$documents,
  vocab = out_20$vocab,
  K = 50,
  prevalence = ~team + as.numeric(days),
  content = ~team,
  data = out_20$meta,
  init.type = "LDA",
  max.em.its = 30,
  seed = 5926696
)
fitSpec25 <- stm(
  documents = out_20$documents,
  vocab = out_20$vocab,
  K = 25,
  prevalence = ~team + as.numeric(days),
  content = ~team,
  data = out_20$meta,
  init.type = "LDA",
  max.em.its = 30,
  seed = 5926696
)
## Find the biggest topics.
# Use the plot() generic rather than calling the S3 method plot.STM directly:
# dispatch works whether or not the installed stm exports the method.
plot(fitSpec25, type = "summary")
plot(fitSpec50, type = "summary")

## Topics of interest in each model (edit after inspecting the summary plots)
big_25 <- c(15)
big_50 <- c(15)

### Words in those topics of interest
labelTopics(fitSpec25, big_25)
labelTopics(fitSpec50, big_50)
### Look at how content varies in these topics
## Change data types in the metadata: days becomes numeric, team becomes a
## factor.
out_20$meta[["days"]] <- as.numeric(out_20$meta[["days"]])
out_20$meta[["team"]] <- as.factor(out_20$meta[["team"]])
## Pick specification: topics of interest regressed on team (25 topics).
# `metadata` is spelled out instead of relying on partial matching of `meta`.
prep <- estimateEffect(big_25 ~ team, fitSpec25, metadata = out_20$meta)
## Plot effects as the gov-minus-opp difference.
# `model` is the fitted STM, not the prepDocuments() output, and plotting goes
# through the plot() generic instead of calling the S3 method by name.
plot(prep, covariate = "team", topics = big_25, model = fitSpec25,
     method = "difference", cov.value1 = "gov", cov.value2 = "opp",
     xlab = "More Opp......More Gov", xlim = c(-.1, .1))

## Pick specification ---- 50 topics
prep <- estimateEffect(big_50 ~ team, fitSpec50, metadata = out_20$meta)
## Plot effects
plot(prep, covariate = "team", topics = big_50, model = fitSpec50,
     method = "difference", cov.value1 = "gov", cov.value2 = "opp",
     xlab = "More Opp......More Gov", xlim = c(-.1, .1))
## Pick specification -- over time (25 topics), with a smooth term in days.
# `metadata` is spelled out instead of relying on partial matching of `meta`.
prep <- estimateEffect(big_25 ~ s(days), fitSpec25, metadata = out_20$meta)
## Plot effects.
# `model` is the fitted STM, not the prepDocuments() output, and plotting goes
# through the plot() generic instead of calling the S3 method by name.
plot(prep, covariate = "days", topics = big_25, model = fitSpec25,
     method = "continuous")

## Pick specification -- over time -- 50 topics
prep <- estimateEffect(big_50 ~ s(days), fitSpec50, metadata = out_20$meta)
## Plot effects
plot(prep, covariate = "days", topics = big_50, model = fitSpec50,
     method = "continuous")
### Let's see how the terms used vary within a topic.
# The original script used an undefined placeholder (`xx`) for most topic
# numbers, which stops the script with an error. The topics to inspect are now
# listed explicitly: topic 12 plus the topics of interest chosen earlier.
# Add further topic numbers to these vectors as needed.
persp_topics_25 <- unique(c(12, big_25))
persp_topics_50 <- unique(big_50)

# One perspectives plot per topic, through the plot() generic rather than by
# calling the S3 method plot.STM directly.
for (topic in persp_topics_25) {
  plot(fitSpec25, type = "perspectives", topics = topic)
}
for (topic in persp_topics_50) {
  plot(fitSpec50, type = "perspectives", topics = topic)
}