-
Notifications
You must be signed in to change notification settings - Fork 5
/
mindep_analysis.R
68 lines (57 loc) · 2.24 KB
/
mindep_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
library(tidyverse)
library(lme4)
library(broom)
library(stringr)
library(optimx)
# ---- Configuration ----

# CSV file read by the script; the code below references its `lang`, `real`,
# `value`, `length` and `start_line` columns.
DATA_FILENAME <- "mindep_20180309_lin_melted.csv"

# Level of `real` that serves as the reference condition in every comparison.
BASELINE <- "real"

# Levels of `real` that are each compared against BASELINE.
# Alternative comparison set (commented out):
#COMPARISONS = c("free random", "fixed random per language", "free head-consistent random", "fixed head-consistent random", "nonprojective free random", "nonprojective free head-consistent random")
COMPARISONS <- c(
  "free random",
  "rand_proj_lin_r_lic",
  "rand_proj_lin_perplex",
  "rand_proj_lin_meaningsame"
)

# NOTE(review): OPTIMIZER is not referenced by any code visible in this file.
OPTIMIZER <- "Nelder_Mead"

set.seed(1)

# The first trailing command-line argument selects the language to analyse.
args <- commandArgs(trailingOnly = TRUE)
the_lang <- args[1]
# Error handler for tryCatch(): ignores the condition it is given and yields NA.
na_returner <- function(err) {
  NA
}
# Map NULL to NA. Any other value is passed through ifelse() with a length-one
# test, so only its first element comes back.
null_to_na <- function(x) {
  x_is_null <- is.null(x)
  ifelse(x_is_null, NA, x)
}
# Fit the two mixed-effects regressions used to compare `baseline` with
# `comparison`.
#
# dm:         data frame (grouped by the caller) with columns `real`, `value`,
#             `length`, `start_line` and `lang`.
# baseline:   level of `real` placed first, i.e. the reference level.
# comparison: level of `real` compared against the baseline.
#
# Returns the result of do(): one row per group of `dm` holding `lang`,
# `model` (value ~ length * real) and `model_noint` (value ~ length + real).
# Both fits have REML switched off and use by-`start_line` random intercepts
# and `real` slopes. A fit that raises an error is stored as NA.
#
# Author's note: dependency length should be predicted by (squared) length for
# the different factors; what would a main effect of these factors on
# dependency length even mean?
fit_by_lang <- function(dm, baseline, comparison) {
  # Baseline goes first so the real sentences are the reference level.
  condition_levels <- c(baseline, comparison)

  two_conditions <- dm %>%
    filter(real %in% condition_levels) %>%
    mutate(real = factor(real, levels = condition_levels))

  # Two regressions per group: with and without the length-by-real interaction.
  two_conditions %>%
    do(
      lang = first(.$lang),
      model = tryCatch(
        lmer(value ~ length * real + (1 + real | start_line), data = ., REML = FALSE),
        error = na_returner
      ),
      model_noint = tryCatch(
        lmer(value ~ length + real + (1 + real | start_line), data = ., REML = FALSE),
        error = na_returner
      )
    )
}
# Reduce the fitted models to one coefficient and one p-value per row.
#
# dm: output of fit_by_lang(), with columns `lang`, `model` and `model_noint`.
#
# Returns a data frame with
#   coef: the estimate in the 4th row of tidy(model) -- presumably the
#         length-by-condition interaction; verify against the tidy() output,
#   p:    the p-value in the 2nd row of anova(model, model_noint), which
#         compares the fits with and without the interaction,
#   lang: carried over from the input.
# Either value becomes NA when its extraction raises an error (for example
# when the stored fit is NA because lmer() failed).
#
# NOTE(review): newer broom releases may no longer ship tidy() methods for
# lmer fits (they moved to broom.mixed) -- confirm the installed version,
# because a failing tidy() is silently turned into NA here.
summarise_model <- function(dm) {
  dm %>%
    summarise(
      coef = tryCatch(
        null_to_na(tidy(model)[4, ]$estimate),
        error = na_returner
      ),
      # The handler must be an existing function: tryCatch() evaluates its
      # handler arguments up front, so an undefined name aborts every call.
      p = tryCatch(
        null_to_na(tidy(anova(model, model_noint))$p.value[2]),
        error = na_returner
      ),
      lang = lang
    )
}
# Run the baseline-vs-`comparison` analysis on the global data `d`.
#
# comparison: level of `real` to compare against BASELINE.
#
# Prints a progress line, fits the models per `lang` group, summarises them,
# and returns the ungrouped summary with a `comparison` column added.
run_comparison <- function(comparison) {
  print(str_c("Running comparison to ", comparison))

  by_language <- group_by(d, lang)
  fitted_models <- fit_by_lang(by_language, BASELINE, comparison)
  model_summary <- ungroup(summarise_model(fitted_models))

  mutate(model_summary, comparison = comparison)
}
# ---- Load data ----
# Keep only the requested language, drop rows whose `real` value is the
# literal "Unnamed: ", and treat `start_line` as a factor.
d <- read_csv(DATA_FILENAME) %>%
  #select(-X1) %>%
  filter(lang == the_lang, real != "Unnamed: ") %>%
  mutate(start_line = as.factor(start_line))

# ---- Run all comparisons ----
# One summary per entry of COMPARISONS, stacked row-wise.
# NOTE(review): tibble() is passed positionally, so reduce() forwards it to
# bind_rows() as an extra argument rather than using it as `.init` -- confirm
# whether `.init = tibble()` was intended.
result <- reduce(map(COMPARISONS, run_comparison), bind_rows, tibble())

# ---- Write results ----
outfilename <- str_c(the_lang, "_model_coefficients_20180308.csv")
print(outfilename)
write.csv(result, file = outfilename)