-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathFigure_Creator_script.Rmd
390 lines (325 loc) · 17.6 KB
/
Figure_Creator_script.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
---
title: "Mar_Leg_Figures"
author: "Babak"
date: "10/19/2020"
output: pdf_document
---
```{r setup}
options(stringsAsFactors=F)
library(reshape2)
library(ggplot2)
```
## Topic Contribution
### Data Preparation
```{r loading_comment_data}
# read the data file
dim(d <- data_for_R.f)
# remove non-alphanumeric elements (brackets and commas)
d$topic_assignments <- gsub("\\W", " ", d$topic_assignments)
# remove multiple spaces
d$topic_assignments <- gsub(" +", " ", d$topic_assignments)
# remove the initial and final spaces
d$topic_assignments <- gsub("(^ )|( $)", "", d$topic_assignments)
# split topic assignments for each word and save the list in an array
head(d$topicList <- lapply(strsplit(d$topic_assignments, " "), as.numeric))
```
### Per-Comment Topic Contribution
```{r per_comment_topic_contrib}
# define a variable for number of topics
ntopic <- max(unlist(d$topicList))
# add month information (month index, number of comments per month) and comment information (comment length) to the dataframe d in a convenient format
d <- within(d, {
commentLength <- sapply(topicList, length)
yearMonth <- paste(2008 + (year-1), month)
monthNum <- ((2008 + (year-1))-2008)*12 + month
monthComments <- xtabs(~monthNum)[monthNum]
})
# calculate per-comment topic contribution
for (topic in 0:ntopic){
d[,paste0("t", topic)] <- sapply(d$topicList, function(comment){
sum(comment == topic)
}) / d$commentLength / d$monthComments
}
# head(d)
```
### Per-Month Topic Contribution
```{r per_month_topic_contrib}
# calculate per-month topic contribution
perMonth <- sapply(0:ntopic, function(topic){
with(d, tapply(get(paste0("t", topic)), monthNum, sum, na.rm=T))
})
# rename perMonth column names and transform it into a dataframe
colnames(perMonth) <- 0:ntopic
perMonth <- as.data.frame(perMonth)
# each row in perMonth is now a month, each column associated with a topic, with the value at their intersection determining the contribution of a certain topic to comments in a certain month. The following line ensures that the calculation was done properly and contributions for each month sum to (approximately) 1:
apply(perMonth, 1, sum, na.rm=T)
# add a column with the month indices
perMonth$monthNum <- as.numeric(rownames(perMonth))
# rename rows to more interpretable month-year labels
rownames(perMonth) <- as.list(unique(d$yearMonth))
# add the labels as a separate column
perMonth$month <- rownames(perMonth)
# examine the results
dim(perMonth)
# head(perMonth)
# turn data into a long format where each month and topic combination is in a separate row
head(allMonths <- melt(perMonth, id.vars=c("month","monthNum"), variable.name="topic", value.name="contrib"))
```
### Significant Temporal Trends
```{r sig_topic_trends}
# rename column names in allMonths
colnames(perMonth)[1:(length(colnames(perMonth))-2)] <- paste0("t",colnames(perMonth)[1:(length(colnames(perMonth))-2)])
# Run the cubic regression for each topic (.001 is added to the numerator and denominator of log odds to prevent potential division by zero). Also count the number of coefficients with p < 0.05
topic.ts <-
sapply(paste0("t", 0:ntopic), function(topic){
y <- perMonth[,topic]
sum(summary(lm(log((y+.001)/(1-y+.001)) ~
poly(as.numeric(perMonth$monthNum), 3)))$coefficients[-1,"Pr(>|t|)"] < .001)})
# show the number of significant coefficients for each topic
sort(topic.ts)
```
```{r load_data}
#load("/media/babak_daily/Data/Marij_Leg_Updated_New/Marijuana_Legalization_Corpus_Study/LDA_full-corpus:True_50/R_contrib_workspace.RData")
```
### Choice of Top Topics
```{r important_topics}
# take the average of per-month topic contributions
topicContrib <- sort(apply(perMonth[,0:ntopic+1], 2, mean))
# choose topics with topicContrib > .03
important <- names(topicContrib)[topicContrib > .03]
# Result: c(5,6,13,30,32,36,38,49)
# turn the topic labels into numeric values and print
(important <- sort(as.numeric(gsub("t", "", important))))
important <- c(3, 7, 10, 11, 13, 14, 15, 16, 17, 22, 24, 26, 39, 43, 46, 47, 49) # those in any cluster
# what about just linear trends?
(sapply(paste0("t", 0:ntopic), function(topic){
y <- perMonth[,topic]
sum(summary(lm(log((y+.001)/(1-y+.001)) ~
poly(as.numeric(perMonth$monthNum), 3)))$coefficients[2,"Pr(>|t|)"] < .05)}))
# These don't have significant linear trends: 12,14,19,25,26,42
```
#### Plot of top topics combined
```{r combined_contrib_plot}
# create interpretable labels for months
allMonths.translate <- with(allMonths, tapply(month, monthNum, unique))
allMonths.translate <- gsub(" .*", "", allMonths.translate)
# create the plot object, add proper ticks and labels
ggplot(subset(allMonths, topic %in% important),
aes(monthNum,contrib,color=topic)) +
geom_smooth(method=lm, formula=y ~ poly(x, 3)) +
scale_x_continuous(name="Time",
breaks=c(23+(0:9)*12), labels=allMonths.translate[as.character(c(23+(0:9)*12))]) +
theme_bw() +
ylab("Percentage contribution") +
scale_color_discrete("Topics") +
# include vertical lines for important events (must be identified)
# geom_vline(xintercept=75) +
# geom_vline(xintercept=112)
```
## Discourse Categories
### Obsolete code for human ratings
Run the entire file, but ignore sections under this header.
#### Load Human Ratings
```{r load_human_ratings}
# load sample_keys and sample_ratings
sample_keys <- read.csv("C:\\Users\\Babak\\Downloads\\sample_keys.csv" )
# sample_ratings <- read.csv("C:\\Users\\Babak\\Desktop\\Ratings\\sample_ratings_final_1.csv")
# warning! File changed from the original
sample_ratings <- read.csv("C:\\Users\\Babak\\Downloads\\sample_ratings.csv")
# remove non-relevant comments prior to analysis
sample_ratings$interpretability[sample_ratings$interpretability == 'n'] <- 0
sample_ratings$interpretability[sample_ratings$interpretability != 0] <- 1
sample_ratings$interpretability <- as.numeric(sample_ratings$interpretability)
# sample_ratings <- subset(sample_ratings,interpretability == 1) # wrong file loaded for 12.28.18 analysis
# change the column name for the random index, so that the keys and ratings can be merged using those IDs
colnames(sample_keys)[2] <- 'index'
# merge sample_keys and sample_ratings
dim(combined_key <- merge(sample_keys, sample_ratings, by ='index'))
# rename column header for consistency across the dataframes
colnames(combined_key)[2] <- 'number'
```
#### Determine Top Comment Classification Certainty
```{r non_dominant_top_topic_avg_contrib}
# merge sampled comment information with data abiyt the most likely topic for each word
# dim(with_topics <- merge(combined_key,d,by='number'))
with_topics <- d
# # The merging resulted in two copies of year and month. Here I remove the duplicate columns and rename the original ones
# with_topics$year.y <- NULL
# with_topics$month.y <- NULL
# colnames(with_topics)[3] <- 'month'
# colnames(with_topics)[4] <- 'year'
# create a matrix where each row is a top topic (in order of topic number) and each column is a sampled post. The values show the number of words in a sampled post assigned to a topic
top_counts <- sapply(important, function(top){sapply(with_topics$topicList, function(nums){sum(nums == top)})})
top_counts <- t(top_counts)
colnames(top_counts) <- 1:length(top_counts[1,])
# create a vector containing the fraction of words in each comment that belong to non-dominant top topics
other_top_contrib = rep(0, length(top_counts[1,]))
for (i in 1:length(top_counts[1,])){
other_top_contrib[i] = (sum(top_counts[,i]) - max(top_counts[,i])) / sum(top_counts[,i])
}
# calculate the average fraction, as well as the standard deviation
mean(as.matrix(other_top_contrib))
sd(as.matrix(other_top_contrib))
```
#### Discourse Categories Based on Human Ratings
```{r categorize_discourse_type_per_topic}
# # determine the number of comments sampled for each topic that have a higher value-based rating than consequence-based and other
# value_dominant <- with(with_topics,sapply(important, function(top){sum(I(topic == top & values > consequences & values > other))}))
# # determine the number of comments sampled for each topic that have a higher consequence-based rating than value-based and other
# conseq_dominant <- with(with_topics,sapply(important, function(top){sum(I(topic == top & consequences > values & consequences > other))}))
# # the number of comments sampled for each topic that were deemed relevant to same-sex marriage by the raters
# # valid_counts <- as.matrix(xtabs(~topic,subset(with_topics,interpretability == 1))) # wrong file loaded that doesn't have interpretability ratings on 12.28.18
# # determine the number of comments sampled for each topic that have a higher "other" rating than consequence-based and value-based
# valid_counts <- 250 # fake
# neut_dominant <- 0 #valid_counts - (value_dominant+conseq_dominant) # fake
# determine the discourse category associated with each topic based on the described criteria
discourse_cats <- ifelse((value_dominant > conseq_dominant) & (value_dominant > neut_dominant) & (value_dominant > (valid_counts / 2)),"values",ifelse((conseq_dominant > value_dominant) & (conseq_dominant > neut_dominant) & (conseq_dominant > valid_counts / 2),"consequences","neutral"))
# bind the different matrices together into a data frame
disc_cats <- data.frame(cbind(as.numeric(important),value_dominant,conseq_dominant,neut_dominant,valid_counts,discourse_cats))
# rename the column labels for interpretability
colnames(disc_cats) <- c('topic','values','consequences','neutral','valid','categories')
# print the classifications
(disc_cats[,c(1,6)])
```
#### Discourse Categories and Support for Same-sex Marriage
```{r pro_against}
pro_against <- xtabs(~with_topics$pro,with_topics)
years <- xtabs(~with_topics$year,with_topics)
(pro_dist <- as.matrix(xtabs(~with_topics$topic+with_topics$pro,with_topics)))
```
### Discourse Category Regression and Plot
The following code has been adjusted to classify and plot topics based on results of ten ten-fold cross-validations rather than obsolete human ratings data (see readme.txt for more details).
```{r not_summed_discourse}
# choose only the month-topic rows in allMonths that are associated with a top topic
# only_important <- subset(allMonths,topic %in% important) #12.28.18: include all topics
only_important <- allMonths
# add discourse category ratings and classifications to the newly formed data frame
# only_important <- merge(only_important,disc_cats) # orig file needed for disc_cats changed
# Drop topic 44
# only_important <- subset(only_important, topic != 44) # marijuana one. Don't need this anyway for the latest analysis
# Add a column to only_important with the discourse category associated with each topic
# only_important$discourse_category <- ifelse(only_important$topic %in% c(12,48),"value",ifelse(only_important$topic %in% c(22,23,28),"conseq","neutral"))
# ADDED FOR THE NEW TOP TOPIC ANALYSIS
# c(23,47,28,22,4,14,27) conseq., first two below 50, last one 100
# c(29,12,48,49) val., first one below 50, the last three 100
only_important$discourse_category <- ifelse(only_important$topic %in% c(3, 13, 17, 43, 47),"Consequences",ifelse(only_important$topic %in% c(7,15),"Normative",ifelse(only_important$topic %in% c(11,14,16,24),"Treatment",ifelse(only_important$topic %in% c(10, 39, 46, 49),"Future","neither"))))
only_important <- subset(only_important,discourse_category != "neither")
```
### Reported Value-based vs. Consequence-based Categorizations
#### Pooled contribution estimates
```{r pooled_logodds_top_words_distributions}
# re-define discourse categories assigned to topics
only_important$discourse_category <- as.factor(ifelse(only_important$topic %in% c(3, 13, 17, 43, 47),"Consequence",ifelse(only_important$topic %in% c(7,15),"Normative", ifelse(only_important$topic %in% c(11,14,16,24),"Treament", ifelse(only_important$topic %in% c(10,39,46,49),"Future", "neutral")))))
# pool topic contributions within each discourse category and save the result along with dates to a new data frame
bymonth <- aggregate(contrib ~ month + monthNum + discourse_category, only_important, sum)
# calculate the percentage contribution of all top topics to each month's posts and add the resulting value to the recently created data-frame (bymonth). These values will be used in the calculation of log odds
bymonth_allcontrib <- aggregate(contrib ~ month + monthNum, only_important,sum)
bymonth$allcontrib <- bymonth_allcontrib$contrib
# Distribution of discourse category contributions to posts in the dataset
summary(bymonth)
# Summary of the contributions of all top topics to posts in the dataset
summary(bymonth_allcontrib$contrib)
```
## Main Trend Results
Includes topics reported in the paper
### Regression
```{r pooled_logodds_top_words_regression}
# linear regression with log odds of a discourse category's pooled contribution as the predicted value and discourse_category*timestep as the predictors
bymonth$log_odds <- log(bymonth$contrib/(bymonth$allcontrib-bymonth$contrib))
bymonth$discourse_category_new <- ifelse(bymonth$discourse_category == "conseq",1,0)
poly_qr <- with(bymonth,(lm(log_odds ~ polym(discourse_category_new, monthNum, degree=3, raw=TRUE))))
```
### Plots
#### Discourse Categories
The following chunk produces a plot of the pooled discourse category contributions, along with the best-fitting local polynomial regression line.
```{r pooled_top_words_plot}
ggplot(subset(bymonth, bymonth$monthNum >= 25), aes(x=monthNum, y=contrib, color=discourse_category)) + geom_point() + geom_smooth(span=0.4,size = 1) +
scale_x_continuous(name="Time", breaks=seq(25,145,12), labels=as.character(c(2010:2020))) +
theme_light() +
theme(legend.text=element_text(size=12)) +
geom_point(size = 0.1) +
ylab("Proportion contribution") +
scale_color_manual(values=c("blue","red", "pink", "green"),labels=c("Consequence","Normative", "Treatment", "Future")) +
geom_vline(xintercept=121) +
geom_vline(xintercept=130) +
labs(fill="Discourse Category")
# BUG: The years are off by 1
```
#### Consequence Topics
The following plot shows the trends associated with individual value-based topics.
```{r govt_econ_plot}
# Consequence: [3, 13, 17, 43, 47]
# save for ind figures
# save(allMonths, file="Monthly_Topic_Contrib_objs.RData")
ggplot(subset(subset(allMonths, monthNum >= 25), topic %in% c(3, 13, 17, 43, 47)),
aes(monthNum,contrib,color=topic)) +
geom_smooth(span=0.4) +
scale_x_continuous(name="Time", breaks=seq(25,145,12), labels=as.character(c(2010:2020))) +
scale_fill_manual(values=c("red", "blue", "black","purple","green")) +
scale_color_manual(values=c("red", "blue", "black","purple","green"),labels = c("Unintended Consequences","Biological Function \n of Crispr","Children with \n Genetic Mutations","Changing Human Genetics","Complications with \n Genetic Engineering")) +
theme_bw() +
geom_point(size = 0.1) +
geom_vline(xintercept=121) +
geom_vline(xintercept=130) +
ylab("Percentage contribution")
```
#### Normative Topics
The following plot shows the trends associated with individual consequence-based topics.
```{r comp_plot}
# Comparative (vs. other drugs or hot-button topics): [7,10,12,48]
ggplot(subset(subset(allMonths, monthNum >= 25), topic %in% c(7, 15)),
aes(monthNum,contrib,color=topic)) +
geom_smooth(span=0.4) +
scale_x_continuous(name="Time", breaks=seq(25,145,12), labels=as.character(c(2010:2020))) +
scale_fill_manual(values=c("red", "blue")) +
scale_color_manual(values=c("red", "blue"),labels = c("Morality","Should We \n Be Using \nGenetic Engineering")) +
theme_bw() +
geom_point(size = 0.1) +
geom_vline(xintercept=121) +
geom_vline(xintercept=130) +
ylab("Percentage contribution")
```
#### Treatment Topics
```{r everyday_plot}
# treatment: [11,14,16,24]
ggplot(subset(subset(allMonths, monthNum >= 25), topic %in% c(11,14,16,24)),
aes(monthNum,contrib,color=topic)) +
geom_smooth(span=0.4) +
scale_x_continuous(name="Time", breaks=seq(25,145,12), labels=as.character(c(2010:2020))) +
scale_fill_manual(values=c("red", "blue", "black","purple")) +
scale_color_manual(values=c("red", "blue", "black","purple"),labels = c("Genetic Treatment \n to Disease","Cures","Stem Therapies","Gene Therapy \n Development")) +
theme_bw() +
geom_point(size = 0.1) +
geom_vline(xintercept=121) +
geom_vline(xintercept=130) +
ylab("Percentage contribution")
```
```{r ind_exp_plot}
# Future: [10,39,46,49]
ggplot(subset(subset(allMonths, monthNum >= 25), topic %in% c(10,39,46,49)),
aes(monthNum,contrib,color=topic)) +
geom_smooth(span=0.4) +
scale_x_continuous(name="Time", breaks=seq(25,145,12), labels=as.character(c(2010:2020))) +
scale_fill_manual(values=c("red", "blue", "black","purple","green")) +
scale_color_manual(values=c("red", "blue", "black","purple","green"),labels = c("Future Genetic Changes","Postulating on Crispr","Future with Genetic Engineering","Looking Torwards the Future")) +
theme_bw() +
geom_point(size = 0.1) +
geom_vline(xintercept=121) +
geom_vline(xintercept=130) +
ylab("Percentage contribution")
```
```{r high_impact_plot}
# Greater than 3% avg contrib: [6,13,30,32,38,49]
# a few were junk.
ggplot(subset(allMonths, topic %in% c(6,13,30,32,38,49)),
aes(monthNum,contrib,color=topic)) +
geom_smooth(span=0.4) +
scale_x_continuous(name="Time", breaks=seq(1,145,12), labels=as.character(c(2008:2020))) +
scale_fill_manual(values=c("red", "blue", "black","purple","brown","green")) +
scale_color_manual(values=c("red", "blue", "black","purple","brown","green"),labels = c("Government and systemic power","Reasons for legalization","Good feelings & helpfulness", "Personal anecdotes", "Legalization as an electoral issue","Expletives")) +
theme_bw() +
geom_point() +
# geom_vline(xintercept=75) +
# geom_vline(xintercept=112) +
ylab("Percentage contribution")
```