Figure_Creator_script.Rmd

---
title: "Mar_Leg_Figures"
author: "Babak"
date: "10/19/2020"
output: pdf_document
---

```{r setup}
options(stringsAsFactors=F)
library(reshape2)
library(ggplot2)
```

## Topic Contribution

### Data Preparation

```{r loading_comment_data}
# read the data file
dim(d <- data_for_R.f)
# remove non-alphanumeric elements (brackets and commas)
d$topic_assignments <- gsub("\\W", " ", d$topic_assignments)
# remove multiple spaces
d$topic_assignments <- gsub(" +", " ", d$topic_assignments)
# remove the initial and final spaces
d$topic_assignments <- gsub("(^ )|( $)", "", d$topic_assignments)
# split topic assignments for each word and save the list in an array
head(d$topicList <- lapply(strsplit(d$topic_assignments, " "), as.numeric))
```

### Per-Comment Topic Contribution

```{r per_comment_topic_contrib}
# define a variable for number of topics
ntopic <- max(unlist(d$topicList))
# add month information (month index, number of comments per month) and comment information (comment length) to the dataframe d in a convenient format
d <- within(d, {
    commentLength <- sapply(topicList, length)
    yearMonth <- paste(2008 + (year-1), month)
    monthNum <- ((2008 + (year-1))-2008)*12 + month
    monthComments <- xtabs(~monthNum)[monthNum]
})

# calculate per-comment topic contribution
for (topic in 0:ntopic){
    d[,paste0("t", topic)] <- sapply(d$topicList, function(comment){
        sum(comment == topic)
    }) / d$commentLength / d$monthComments
}
# head(d)
```

### Per-Month Topic Contribution

```{r per_month_topic_contrib}
# calculate per-month topic contribution
perMonth <- sapply(0:ntopic, function(topic){
    with(d, tapply(get(paste0("t", topic)), monthNum, sum, na.rm=T))
})
# rename perMonth column names and transform it into a dataframe
colnames(perMonth) <- 0:ntopic
perMonth <- as.data.frame(perMonth)
# each row in perMonth is now a month, each column associated with a topic, with the value at their intersection determining the contribution of a certain topic to comments in a certain month. The following line ensures that the calculation was done properly and contributions for each month sum to (approximately) 1:
apply(perMonth, 1, sum, na.rm=T)
# add a column with the month indices
perMonth$monthNum <- as.numeric(rownames(perMonth))
# rename rows to more interpretable month-year labels
rownames(perMonth) <- as.list(unique(d$yearMonth))
# add the labels as a separate column
perMonth$month <- rownames(perMonth)
# examine the results
dim(perMonth)
# head(perMonth)
# turn data into a long format where each month and topic combination is in a separate row
head(allMonths <- melt(perMonth, id.vars=c("month","monthNum"), variable.name="topic", value.name="contrib"))
```

### Significant Temporal Trends

```{r sig_topic_trends}
# rename column names in allMonths
colnames(perMonth)[1:(length(colnames(perMonth))-2)] <- paste0("t",colnames(perMonth)[1:(length(colnames(perMonth))-2)])
# Run the cubic regression for each topic (.001 is added to the numerator and denominator of log odds to prevent potential division by zero). Also count the number of coefficients with p < 0.05

topic.ts <-
sapply(paste0("t", 0:ntopic), function(topic){
y <- perMonth[,topic]
sum(summary(lm(log((y+.001)/(1-y+.001)) ~
poly(as.numeric(perMonth$monthNum), 3)))$coefficients[-1,"Pr(>|t|)"] < .001)})

# show the number of significant coefficients for each topic
sort(topic.ts)
```

```{r load_data}
#load("/media/babak_daily/Data/Marij_Leg_Updated_New/Marijuana_Legalization_Corpus_Study/LDA_full-corpus:True_50/R_contrib_workspace.RData")
```

### Choice of Top Topics

```{r important_topics}
# take the average of per-month topic contributions
topicContrib <- sort(apply(perMonth[,0:ntopic+1], 2, mean))
# choose topics with topicContrib > .03
important <- names(topicContrib)[topicContrib > .03]
# Result: c(5,6,13,30,32,36,38,49)
# turn the topic labels into numeric values and print
(important <- sort(as.numeric(gsub("t", "", important))))

important <- c(3, 7, 10, 11, 13, 14, 15, 16, 17, 22, 24, 26, 39, 43, 46, 47, 49) # those in any cluster

# what about just linear trends?
(sapply(paste0("t", 0:ntopic), function(topic){
y <- perMonth[,topic]
sum(summary(lm(log((y+.001)/(1-y+.001)) ~
poly(as.numeric(perMonth$monthNum), 3)))$coefficients[2,"Pr(>|t|)"] < .05)}))
# These don't have significant linear trends: 12,14,19,25,26,42
```

#### Plot of top topics combined

```{r combined_contrib_plot}
# create interpretable labels for months
allMonths.translate <- with(allMonths, tapply(month, monthNum, unique))
allMonths.translate <- gsub(" .*", "", allMonths.translate)
# create the plot object, add proper ticks and labels
ggplot(subset(allMonths, topic %in% important),
    aes(monthNum,contrib,color=topic)) +
    geom_smooth(method=lm, formula=y ~ poly(x, 3)) +
    scale_x_continuous(name="Time",
                       breaks=c(23+(0:9)*12), labels=allMonths.translate[as.character(c(23+(0:9)*12))]) +
  theme_bw() +
  ylab("Percentage contribution") +
  scale_color_discrete("Topics") +
  # include vertical lines for important events (must be identified)
  # geom_vline(xintercept=75) + 
  # geom_vline(xintercept=112) 

```

## Discourse Categories

### Obsolete code for human ratings

Run the entire file, but ignore sections under this header.

#### Load Human Ratings

```{r load_human_ratings}
# load sample_keys and sample_ratings
sample_keys <- read.csv("C:\\Users\\Babak\\Downloads\\sample_keys.csv" )
# sample_ratings <- read.csv("C:\\Users\\Babak\\Desktop\\Ratings\\sample_ratings_final_1.csv") 
# warning! File changed from the original
sample_ratings <- read.csv("C:\\Users\\Babak\\Downloads\\sample_ratings.csv") 
# remove non-relevant comments prior to analysis
sample_ratings$interpretability[sample_ratings$interpretability == 'n'] <- 0
sample_ratings$interpretability[sample_ratings$interpretability != 0] <- 1
sample_ratings$interpretability <- as.numeric(sample_ratings$interpretability)
# sample_ratings <- subset(sample_ratings,interpretability == 1) # wrong file loaded for 12.28.18 analysis
# change the column name for the random index, so that the keys and ratings can be merged using those IDs
colnames(sample_keys)[2] <- 'index'
# merge sample_keys and sample_ratings
dim(combined_key <- merge(sample_keys, sample_ratings, by ='index'))
# rename column header for consistency across the dataframes
colnames(combined_key)[2] <- 'number'
```

#### Determine Top Comment Classification Certainty

```{r non_dominant_top_topic_avg_contrib}
# merge sampled comment information with data abiyt the most likely topic for each word
# dim(with_topics <- merge(combined_key,d,by='number'))
with_topics <- d
# # The merging resulted in two copies of year and month. Here I remove the duplicate columns and rename the original ones
# with_topics$year.y <- NULL
# with_topics$month.y <- NULL
# colnames(with_topics)[3] <- 'month'
# colnames(with_topics)[4] <- 'year'
# create a matrix where each row is a top topic (in order of topic number) and each column is a sampled post. The values show the number of words in a sampled post assigned to a topic
top_counts <- sapply(important, function(top){sapply(with_topics$topicList, function(nums){sum(nums == top)})})
top_counts <- t(top_counts)
colnames(top_counts) <- 1:length(top_counts[1,])
# create a vector containing the fraction of words in each comment that belong to non-dominant top topics
other_top_contrib = rep(0, length(top_counts[1,]))
for (i in 1:length(top_counts[1,])){
  other_top_contrib[i] = (sum(top_counts[,i]) - max(top_counts[,i])) / sum(top_counts[,i])
}
# calculate the average fraction, as well as the standard deviation
mean(as.matrix(other_top_contrib))
sd(as.matrix(other_top_contrib))
```

#### Discourse Categories Based on Human Ratings

```{r categorize_discourse_type_per_topic}
# # determine the number of comments sampled for each topic that have a higher value-based rating than consequence-based and other
# value_dominant <- with(with_topics,sapply(important, function(top){sum(I(topic == top & values > consequences & values > other))}))
# # determine the number of comments sampled for each topic that have a higher consequence-based rating than value-based and other
# conseq_dominant <- with(with_topics,sapply(important, function(top){sum(I(topic == top & consequences > values & consequences > other))}))
# # the number of comments sampled for each topic that were deemed relevant to same-sex marriage by the raters
# # valid_counts <- as.matrix(xtabs(~topic,subset(with_topics,interpretability == 1))) # wrong file loaded that doesn't have interpretability ratings on 12.28.18
# # determine the number of comments sampled for each topic that have a higher "other" rating than consequence-based and value-based
# valid_counts <- 250 # fake
# neut_dominant <- 0 #valid_counts - (value_dominant+conseq_dominant) # fake
# determine the discourse category associated with each topic based on the described criteria

discourse_cats <- ifelse((value_dominant > conseq_dominant) & (value_dominant > neut_dominant) & (value_dominant > (valid_counts / 2)),"values",ifelse((conseq_dominant > value_dominant) & (conseq_dominant > neut_dominant) & (conseq_dominant > valid_counts / 2),"consequences","neutral"))
# bind the different matrices together into a data frame
disc_cats <- data.frame(cbind(as.numeric(important),value_dominant,conseq_dominant,neut_dominant,valid_counts,discourse_cats))
# rename the column labels for interpretability
colnames(disc_cats) <- c('topic','values','consequences','neutral','valid','categories')
# print the classifications
(disc_cats[,c(1,6)])
```

#### Discourse Categories and Support for Same-sex Marriage

```{r pro_against}
pro_against <- xtabs(~with_topics$pro,with_topics)
years <- xtabs(~with_topics$year,with_topics)
(pro_dist <- as.matrix(xtabs(~with_topics$topic+with_topics$pro,with_topics)))
```

### Discourse Category Regression and Plot

The following code has been adjusted to classify and plot topics based on results of ten ten-fold cross-validations rather than obsolete human ratings data (see readme.txt for more details).

```{r not_summed_discourse}
# choose only the month-topic rows in allMonths that are associated with a top topic
# only_important <- subset(allMonths,topic %in% important) #12.28.18: include all topics
only_important <- allMonths
# add discourse category ratings and classifications to the newly formed data frame
# only_important <- merge(only_important,disc_cats) # orig file needed for disc_cats changed
# Drop topic 44
# only_important <- subset(only_important, topic != 44) # marijuana one. Don't need this anyway for the latest analysis
# Add a column to only_important with the discourse category associated with each topic
# only_important$discourse_category <- ifelse(only_important$topic %in% c(12,48),"value",ifelse(only_important$topic %in% c(22,23,28),"conseq","neutral"))
# ADDED FOR THE NEW TOP TOPIC ANALYSIS
# c(23,47,28,22,4,14,27) conseq., first two below 50, last one 100
# c(29,12,48,49) val., first one below 50, the last three 100

only_important$discourse_category <- ifelse(only_important$topic %in% c(3, 13, 17, 43, 47),"Consequences",ifelse(only_important$topic %in% c(7,15),"Normative",ifelse(only_important$topic %in% c(11,14,16,24),"Treatment",ifelse(only_important$topic %in% c(10, 39, 46, 49),"Future","neither"))))

only_important <- subset(only_important,discourse_category != "neither") 
```

### Reported Value-based vs. Consequence-based Categorizations

#### Pooled contribution estimates

```{r pooled_logodds_top_words_distributions}
# re-define discourse categories assigned to topics
only_important$discourse_category <- as.factor(ifelse(only_important$topic %in% c(3, 13, 17, 43, 47),"Consequence",ifelse(only_important$topic %in% c(7,15),"Normative", ifelse(only_important$topic %in% c(11,14,16,24),"Treament", ifelse(only_important$topic %in% c(10,39,46,49),"Future", "neutral")))))
# pool topic contributions within each discourse category and save the result along with dates to a new data frame
bymonth <- aggregate(contrib ~ month + monthNum + discourse_category, only_important, sum)
# calculate the percentage contribution of all top topics to each month's posts and add the resulting value to the recently created data-frame (bymonth). These values will be used in the calculation of log odds
bymonth_allcontrib <- aggregate(contrib ~ month + monthNum, only_important,sum)
bymonth$allcontrib <- bymonth_allcontrib$contrib
# Distribution of discourse category contributions to posts in the dataset
summary(bymonth)
# Summary of the contributions of all top topics to posts in the dataset
summary(bymonth_allcontrib$contrib)
```

## Main Trend Results

Includes topics reported in the paper

### Regression

```{r pooled_logodds_top_words_regression}
# linear regression with log odds of a discourse category's pooled contribution as the predicted value and discourse_category*timestep as the predictors
bymonth$log_odds <- log(bymonth$contrib/(bymonth$allcontrib-bymonth$contrib))
bymonth$discourse_category_new <- ifelse(bymonth$discourse_category == "conseq",1,0)
poly_qr <- with(bymonth,(lm(log_odds ~ polym(discourse_category_new, monthNum, degree=3, raw=TRUE))))
```

### Plots

#### Discourse Categories

The following chunk produces a plot of the pooled discourse category contributions, along with the best-fitting local polynomial regression line.

```{r pooled_top_words_plot}
ggplot(subset(bymonth, bymonth$monthNum >= 25), aes(x=monthNum, y=contrib, color=discourse_category)) + geom_point() + geom_smooth(span=0.4,size = 1) +
  scale_x_continuous(name="Time", breaks=seq(25,145,12), labels=as.character(c(2010:2020))) +
  theme_light() +
  theme(legend.text=element_text(size=12)) +
  geom_point(size = 0.1) +
  ylab("Proportion contribution") +
   scale_color_manual(values=c("blue","red", "pink", "green"),labels=c("Consequence","Normative", "Treatment", "Future")) +
   geom_vline(xintercept=121) +
   geom_vline(xintercept=130) + 
  labs(fill="Discourse Category")
# BUG: The years are off by 1
```

#### Consequence Topics

The following plot shows the trends associated with individual value-based topics. 

```{r govt_econ_plot}
# Consequence: [3, 13, 17, 43, 47]

# save for ind figures
# save(allMonths, file="Monthly_Topic_Contrib_objs.RData")

ggplot(subset(subset(allMonths, monthNum >= 25), topic %in% c(3, 13, 17, 43, 47)),
       aes(monthNum,contrib,color=topic)) +
  geom_smooth(span=0.4) +
  scale_x_continuous(name="Time", breaks=seq(25,145,12), labels=as.character(c(2010:2020))) +
  scale_fill_manual(values=c("red", "blue", "black","purple","green")) +
  scale_color_manual(values=c("red", "blue", "black","purple","green"),labels = c("Unintended Consequences","Biological Function \n of Crispr","Children with \n Genetic Mutations","Changing Human Genetics","Complications with \n Genetic Engineering")) +
  theme_bw() +
  geom_point(size = 0.1) +
  geom_vline(xintercept=121) +
  geom_vline(xintercept=130) +
  ylab("Percentage contribution")
```

#### Normative Topics

The following plot shows the trends associated with individual consequence-based topics.

```{r comp_plot}
# Comparative (vs. other drugs or hot-button topics): [7,10,12,48]

ggplot(subset(subset(allMonths, monthNum >= 25), topic %in% c(7, 15)),
       aes(monthNum,contrib,color=topic)) +
  geom_smooth(span=0.4) +
  scale_x_continuous(name="Time", breaks=seq(25,145,12), labels=as.character(c(2010:2020))) +
  scale_fill_manual(values=c("red", "blue")) +
  scale_color_manual(values=c("red", "blue"),labels = c("Morality","Should We \n Be Using \nGenetic Engineering")) +
  theme_bw() +
  geom_point(size = 0.1) +
  geom_vline(xintercept=121) +
  geom_vline(xintercept=130) +
  ylab("Percentage contribution")
```

#### Treatment Topics

```{r everyday_plot}
# treatment: [11,14,16,24]

ggplot(subset(subset(allMonths, monthNum >= 25), topic %in% c(11,14,16,24)),
       aes(monthNum,contrib,color=topic)) +
  geom_smooth(span=0.4) +
  scale_x_continuous(name="Time", breaks=seq(25,145,12), labels=as.character(c(2010:2020))) +
  scale_fill_manual(values=c("red", "blue", "black","purple")) +
  scale_color_manual(values=c("red", "blue", "black","purple"),labels = c("Genetic Treatment \n to Disease","Cures","Stem Therapies","Gene Therapy \n Development")) +
  theme_bw() +
  geom_point(size = 0.1) +
  geom_vline(xintercept=121) +
  geom_vline(xintercept=130) +
  ylab("Percentage contribution")
```

```{r ind_exp_plot}
# Future: [10,39,46,49]

ggplot(subset(subset(allMonths, monthNum >= 25), topic %in% c(10,39,46,49)),
       aes(monthNum,contrib,color=topic)) +
  geom_smooth(span=0.4) +
  scale_x_continuous(name="Time", breaks=seq(25,145,12), labels=as.character(c(2010:2020))) +
  scale_fill_manual(values=c("red", "blue", "black","purple","green")) +
  scale_color_manual(values=c("red", "blue", "black","purple","green"),labels = c("Future Genetic Changes","Postulating on Crispr","Future with Genetic Engineering","Looking Torwards the Future")) +
  theme_bw() +
  geom_point(size = 0.1) +
   geom_vline(xintercept=121) +
   geom_vline(xintercept=130) +
  ylab("Percentage contribution")
```

```{r high_impact_plot}
# Greater than 3% avg contrib: [6,13,30,32,38,49]
# a few were junk. 

ggplot(subset(allMonths, topic %in% c(6,13,30,32,38,49)),
       aes(monthNum,contrib,color=topic)) +
  geom_smooth(span=0.4) +
  scale_x_continuous(name="Time", breaks=seq(1,145,12), labels=as.character(c(2008:2020))) +
  scale_fill_manual(values=c("red", "blue", "black","purple","brown","green")) +
  scale_color_manual(values=c("red", "blue", "black","purple","brown","green"),labels = c("Government and systemic power","Reasons for legalization","Good feelings & helpfulness", "Personal anecdotes", "Legalization as an electoral issue","Expletives")) +
  theme_bw() +
  geom_point() +
  # geom_vline(xintercept=75) +
  # geom_vline(xintercept=112) +
  ylab("Percentage contribution")
```