# sendValPredict: linear regression analyses
# Author: Jessica M. Alexander
# Last Updated: 2025-01-03
# Input:
#   --preprocessed, aggregated data (data.csv)
# Output:
#   For each of four models:
#     *warriner: averaged lexical valence and lexical arousal ratings (from Warriner et al., 2013)
#     *glove: 300-dimension centroid of GloVe embeddings
#     *acoustics: 88 extracted eGeMAPS acoustic features (from Eyben et al., 2016),
#       mapped onto 88 principal components to remove collinearity
#     *combo: combination of all predictors from the glove and acoustics models
#   --plot of predicted vs. human valence ratings for each 5-second window in the dataset, including:
#     -coefficient of determination (r-squared)
#     -Pearson correlation coefficient
#     -concordance correlation coefficient (CCC)
#   Beta weights for statistically significant predictors in the glove, acoustics,
#   and combo models can be explored inline at the end of the script
# Usage Notes:
#   --set the iterations parameter, then run the script

# SETUP
set.seed(2024)
##select number of iterations
iterations = 1000
##load libraries
library(tidyverse)
library(lme4)
library(gridExtra)
library(viridis)
##helper functions
#coefficient of determination between observed (x) and predicted (y) values
rsquared <- function(x, y){
  ssr <- sum((x-y)^2)
  sst <- sum((x-mean(x))^2)
  r2 <- 1 - (ssr / sst)
  return (r2)
}

#Pearson correlation coefficient
pcc <- function(x, y){
  pearson_numerator <- sum((x - mean(x)) * (y - mean(y)))
  pearson_denominator <- sqrt(sum((x - mean(x))**2) * sum((y - mean(y))**2))
  pearson_p <- pearson_numerator / pearson_denominator
  return (pearson_p)
}

#concordance correlation coefficient
ccc <- function(x, y){
  ccc_numerator <- 2 * pcc(x, y) * sd(x) * sd(y)
  ccc_denominator <- (sd(x)**2) + (sd(y)**2) + (mean(x) - mean(y))**2
  ccc <- ccc_numerator / ccc_denominator
  return (ccc)
}

#format numbers for plot annotations (more decimal places for very small values)
digit_display <- function(number){
  if(abs(number)<0.00001){
    x <- sprintf("%.6f", number)
  }else{
    x <- sprintf("%.3f", number)
  }
  return (x)
}
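
##optional sanity check of the helper metrics with toy values (illustrative only, not part of the analysis)
# x_chk <- c(10, 20, 30, 40); y_chk <- c(12, 18, 33, 39)
# rsquared(x_chk, y_chk)   #proportion of variance in x_chk captured by y_chk
# pcc(x_chk, y_chk)        #should match cor(x_chk, y_chk)
# ccc(x_chk, y_chk)        #concordance correlation coefficient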
##load all data
df <- read.csv("data.csv")
##drop rows with missing/infinite values (when unable to calculate either the acoustic features or the glove centroid)
rows_to_drop <- c()
for(x in 1:nrow(df)){
  val <- sum(!is.finite(as.numeric(as.vector(df[x,c(6:ncol(df))]))))
  if(val>0){
    rows_to_drop <- c(rows_to_drop, x)
  }
}
if(length(rows_to_drop)>0){
  df <- df[-rows_to_drop,]
}
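##a vectorized equivalent of the loop above (assumes columns 6 onward are all numeric):
# rows_to_drop <- which(rowSums(!is.finite(as.matrix(df[, 6:ncol(df)]))) > 0)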
##drop first five seconds of each narrative (likely that ratings in first 5 seconds represent a lag in rater reactions)
firstfivesecs <- which(df$time==0)
df <- df[-firstfivesecs,]
##convert speaker to factor
df$spkr <- as.factor(df$spkr)
##visually check distribution of human ratings
plot1 <- df %>%
  ggplot(aes(x=ewe, y=0, color=spkr)) +
  geom_jitter(height=0.25, size=0.25) +
  ylim(c(-0.3, 0.3)) +
  scale_color_viridis(discrete=TRUE, option="viridis") +
  labs(title="Distribution of Human Valence Ratings",
       subtitle="for each 5-second window",
       x="", y="") +
  theme_void() +
  theme(legend.position = "none")

plot2 <- df %>%
  ggplot(aes(x=ewe)) + geom_density() +
  xlim(c(0,100)) +
  labs(x="Valence Rating (per 5-second window)", y="") +
  theme_classic() +
  theme(axis.line.y = element_blank(),
        axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())

comboPlot <- gridExtra::grid.arrange(plot1, plot2,
                                     layout_matrix = matrix(c(1,1,2), byrow = TRUE, ncol = 1))
#output combo: RStudio Export>Save as Image (693 x 352) >> "dist-val-ratings.png"
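##alternatively, save the combined plot directly (assumes the same figs/ directory used for the model plots below):
# ggsave("figs/dist-val-ratings.png", comboPlot, width=6.93, height=3.52, units="in", dpi=100)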
# ORGANIZE BOOTSTRAPPING DATAFRAMES
#warriner
warrinercolnames <- colnames(df)[c(6:7)]
a <- tibble(i=rep(1:iterations,))
a <- mutate(a, intercept=NA)
a <- cbind(a, setNames( lapply(warrinercolnames, function(x) x=NA), warrinercolnames))
#glove
glovecolnames <- colnames(df)[c(97:ncol(df))]
b <- tibble(i=rep(1:iterations,))
b <- mutate(b, intercept=NA)
b <- cbind(b, setNames( lapply(glovecolnames, function(x) x=NA), glovecolnames))
#acoustics
#with thanks to Gavin Simpson for this post: https://stackoverflow.com/questions/12760108/principal-components-analysis-how-to-get-the-contribution-of-each-paramete
acousticDat <- df[,c(9:96)]
acoustic.pca <- stats::prcomp(acousticDat, scale=TRUE)
#summary(acoustic.pca)
acoustic.pca.x <- as.data.frame(acoustic.pca$x)
acousticolnames <- colnames(acoustic.pca.x)
df <- cbind(df, acoustic.pca.x)
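##optional check of how much variance the principal components capture:
# pca_var <- acoustic.pca$sdev^2
# cumsum(pca_var) / sum(pca_var)   #cumulative proportion of variance explained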
c <- tibble(i=rep(1:iterations,))
c <- mutate(c, intercept=NA)
c <- cbind(c, setNames( lapply(acousticolnames, function(x) x=NA), acousticolnames))
#combo glove+acoustics
combocolnames <- c(glovecolnames, acousticolnames)
d <- tibble(i=rep(1:iterations,))
d <- mutate(d, intercept=NA)
d <- cbind(d, setNames( lapply(combocolnames, function(x) x=NA), combocolnames))
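##each bootstrap table holds one row per iteration and one column per fixed effect (plus the iteration index), e.g.:
# dim(a)   #iterations x 4   (i, intercept, valMean, aroMean)
# dim(d)   #iterations x 390 (i, intercept, 300 GloVe dimensions, 88 acoustic PCs)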
# BOOTSTRAP IT!
for (i in 1:iterations){
  sampled_rows <- sample(nrow(df), nrow(df), replace=TRUE)
  data <- df[sampled_rows,]

  #warriner ratings
  print(paste("running warriner model, iteration ", as.character(i)))
  mdl <- lmer(ewe ~ valMean + aroMean + (1|spkr), data=data)
  a[i,2:ncol(a)] <- as.list(fixef(mdl))

  #glove embeddings
  print(paste("running glove model, iteration ", as.character(i)))
  glovecols <- data[,c(2, 8, 97:396)]
  mdl <- lmer(ewe ~ . -spkr + (1|spkr), data=glovecols)
  b[i,2:ncol(b)] <- as.list(fixef(mdl))

  #acoustic features
  print(paste("running acoustic model, iteration ", as.character(i)))
  acousticcols <- data[,c(2, 8, 397:484)]
  mdl <- lmer(ewe ~ . -spkr + (1|spkr), data=acousticcols)
  c[i,2:ncol(c)] <- as.list(fixef(mdl))

  #combo glove + acoustic
  print(paste("running combo model, iteration ", as.character(i)))
  combocols <- data[,c(2, 8, 97:484)]
  mdl <- lmer(ewe ~ . -spkr + (1|spkr), data=combocols)
  d[i,2:ncol(d)] <- as.list(fixef(mdl))
}
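
##optionally persist the bootstrapped coefficient tables so the loop does not need to be re-run (file name is illustrative):
# saveRDS(list(warriner=a, glove=b, acoustics=c, combo=d), "bootstrap_betas.rds")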
# FUNCTIONS FOR MODEL COMPARISON
##function to calculate evaluation metrics and build plot
evaluate_and_plot <- function(dataframe, databoot, title){
  #average the bootstrapped fixed effects
  betas <- colMeans(databoot)
  intercept <- betas[2]
  predictors <- colnames(databoot)[3:ncol(databoot)]
  predbetas <- betas[3:length(betas)]

  #predict a valence rating for each 5-second window from the averaged betas
  relevantcols <- names(dataframe)[which(names(dataframe) %in% names(databoot))]
  trueDat <- select(dataframe, all_of(c("spkr", "ewe", relevantcols)))
  trueDat$pred <- NA_real_
  for(i in 1:nrow(trueDat)){
    y <- intercept
    for(j in 1:length(relevantcols)){
      thisbeta <- predbetas[j]
      val <- trueDat[i, relevantcols[j]]
      addl <- thisbeta * val
      y <- y + addl
    }
    trueDat$pred[i] <- y
  }

  #evaluation metrics
  r2 <- rsquared(trueDat$ewe, trueDat$pred)
  pearson_p <- pcc(trueDat$ewe, trueDat$pred)
  ccc <- ccc(trueDat$ewe, trueDat$pred)
  label1 <- paste("R^2 == ", digit_display(r2), sep="")
  label2 <- paste("Pearson's r = ", digit_display(pearson_p), sep="")
  label3 <- paste("CCC = ", digit_display(ccc), sep="")

  #plot predicted vs. human ratings
  plot <- ggplot(data=trueDat, aes(x=ewe, y=pred)) +
    geom_point(data=trueDat, aes(color=spkr), size=0.5) +
    scale_color_viridis(discrete=TRUE, option="viridis") +
    labs(title=title,
         x="Human Valence Rating", y="Predicted Valence Rating") +
    theme_classic() +
    theme(legend.position = "none") +
    ylim(c(0,100)) +
    annotate("text", x=6, y=95, label=label1, size=3, hjust=0, parse=TRUE) +
    annotate("text", x=6, y=90, label=label2, size=3, hjust=0) +
    annotate("text", x=6, y=85, label=label3, size=3, hjust=0)

  return (plot)
}
##function to calculate significance for each beta
calculate_significance <- function(databoot){
  betas <- colMeans(databoot)
  stderr <- sapply(databoot, function(x) sd(x))   #bootstrap standard error: sd of the bootstrap distribution for each coefficient
  tval <- betas / stderr
  tval_abs <- abs(tval)
  pvals <- lapply(tval_abs, function(q) 2 * pt(q, nrow(databoot), lower.tail=FALSE))
  allps <- unlist(pvals)[3:length(pvals)]   #drop "i" column and intercept
  betas <- betas[3:length(betas)]           #drop "i" column and intercept
  return (betas[allps < 0.05])
}
##############################################################
# COMPARE MODELS
evaluate_and_plot(df, a, "Lexical Valence+Arousal v. Human Raters")
ggsave("figs/warriner_results.png", height=4, width=6, units="in")
evaluate_and_plot(df, b, "GloVe Embeddings v. Human Raters")
ggsave("figs/glove_results.png", height=4, width=6, units="in")
evaluate_and_plot(df, c, "Acoustic Feature PCAs v. Human Raters")
ggsave("figs/acoustics_results.png", height=4, width=6, units="in")
evaluate_and_plot(df, d, "Combo GloVe + Acoustic PCAs v. Human Raters")
ggsave("figs/combo_results.png", height=4, width=6, units="in")
# EXPLORE BETA WEIGHTS
#GloVe model: beta weights for dimensions with p<0.05:
betas_glove <- calculate_significance(b)
round(betas_glove, 3)
#Acoustics model: beta weights for principal components with p<0.05:
betas_acoustic <- calculate_significance(c)
round(betas_acoustic, 3)
#Combo model: beta weights for GloVe dimensions and acoustic PCs with p<0.05:
betas_combo <- calculate_significance(d)
round(betas_combo, 3)