Analyses for review patterns

cjvanlissa · Jan 5, 2022 · a5c37a6 · a5c37a6
1 parent a5675b9
commit a5c37a6
Show file tree

Hide file tree

Showing 8 changed files with 638 additions and 156 deletions.
diff --git a/country_variables.png b/country_variables.png
diff --git a/country_variables_byday.png b/country_variables_byday.png
diff --git a/date_distribution.png b/date_distribution.png
diff --git a/date_distribution.svg b/date_distribution.svg
diff --git a/reviews_patterns.R b/reviews_patterns.R
@@ -0,0 +1,98 @@
+# VIFS
+df_training <- read.csv("df_training_imputed.csv", stringsAsFactors = FALSE)
+
+df_training[which(sapply(df_training, is.character))] <- 
+  lapply(df_training[which(sapply(df_training, is.character))], factor)
+
+df_training[which(lapply(sapply(df_training, unique), length) <= 5)] <- 
+  lapply(df_training[which(lapply(sapply(df_training, unique), length) <= 5)], factor)
+
+tmp <- df_training[sapply(df_training, is.factor)]
+
+df2 <- df_training
+
+desc <- read.csv("descriptives_itemscales.csv", stringsAsFactors = FALSE)
+df2 <- df2[, names(df2) %in% desc$name]
+
+df2[names(tmp)] <- NULL
+df2[["date"]] <- NULL
+df2[["countryiso3"]] <- NULL
+
+res <- lm(c19perbeh ~.,data = df2)
+vifs <- car::vif(res)
+
+range(vifs)
+
+
+library(ranger)
+source("scripts/varimpplot_lasso.R")
+source("scripts/model_accuracy.R")
+
+df_training <- read.csv("df_training_imputed.csv", stringsAsFactors = FALSE)
+df_testing <- read.csv("df_testing_imputed.csv", stringsAsFactors = FALSE)
+
+remove <- c("c19normshould",
+  "c19proso",
+  "isoimmi_inperson",
+  "houseleave",
+  "isoothppl_inperson",
+  "c19normdo",
+  "ecoproso")
+
+df_training[remove] <- NULL
+df_testing[remove] <- NULL
+y = "c19perbeh"
+X <- df_training[, !names(df_training) == y]
+X_las <- model.matrix(~., X)[, -1]
+Y <- df_training[[y]]
+
+X_test <- df_testing[, !names(df_testing) == y]
+X_las_test <- model.matrix(~., X_test)[, -1]
+Y_test <- df_testing[[y]]
+
+set.seed(953007)
+res <- ranger(c19perbeh~., data = df_training, num.trees = 1000,
+              min.node.size = 6,
+              mtry = 31,
+              importance = "permutation")
+
+
+fits <- c(
+                    model_accuracy(res,
+                                   olddata = df_training,
+                                   observed = Y,
+                                   ymean = mean(Y, na.rm = TRUE)),
+                    model_accuracy(res,
+                                   newdata = df_testing,
+                                   observed = Y_test,
+                                   ymean = mean(Y, na.rm = TRUE)))
+
+names(fits) <- c(paste0("train_", c("r2", "mse", "r_actual_pred")), paste0("test_", c("r2", "mse", "r_actual_pred")))  
+fits
+
+
+# Separate PDP ------------------------------------------------------------
+
+p <- readRDS("results/pdp_c19perbeh.RData")
+df_vars <- read.csv("scripts/df_training_labs.csv", stringsAsFactors = F)
+var_rename <- tolower(df_vars$lab)
+names(var_rename) <- df_vars$X
+
+p <- for(i in 1:length(p)){
+  thisp <- p[[i]]+facet_grid(.~Variable, labeller = labeller(
+    Variable = setNames(paste0(i, ". ", var_rename[vars[i]]), var_rename[vars[i]])
+  ))
+})
+# Reduce font size
+p <- lapply(p, function(x){ x + theme(strip.text.x = element_text(size = 5),
+                                      axis.text.x = element_text(size = 5),
+                                      axis.text.y = element_text(size = 5))})
+if("countryiso3" %in% vars){
+  p[[which(vars == "countryiso3")]] <- p[[which(vars == "countryiso3")]] + theme(axis.text.x = element_text(angle=90, size = 3))
+}  
+p <- metaforest:::merge_plots(p)
+ggsave(
+  filename = paste0("results/rf_partialdependence_", gsub(".+_(.+)\\.RData", "\\1", thisfile), ".png"),
+  p,
+  device = "png")
+
diff --git a/scripts/conspiracy_analyses.R b/scripts/conspiracy_analyses.R
@@ -0,0 +1,83 @@
+df_vars <- read.csv("scripts/df_training_labs.csv", stringsAsFactors = F)
+df_vars$lab <- tolower(df_vars$lab)
+df_vars <- df_vars[grepl("^c_", df_vars$lab), ]
+df_vars <- df_vars[df_vars$lab %in% c("c_political stability", "c_govt. effectiveness", 
+                                      "c_doctors per 10k", "c_control corruption", "c_govt. response", 
+                                      "c_accountability", "c_containment health index", 
+                                      "c_tourism expenditures", "c_rule of law"), ]
+
+
+tmp <- df_training[, c("countryiso3", df_vars$X)]
+
+names(tmp)[-1] <- df_vars$lab[match(names(tmp)[-1], df_vars$X)]
+tmp <- tmp[!duplicated(tmp$countryiso3), ]
+
+df_plot <- do.call(rbind, lapply(names(tmp)[-1], function(x){
+  data.frame(Country = tmp[["countryiso3"]], Variable = x, Value = tmp[[x]])
+}))
+
+library(ggplot2)
+
+p <- ggplot(df_plot, aes(x = Value, y = 1)) + ggrepel::geom_text_repel(aes(label = Country), angle = 90, size = 3, max.overlaps = 30, direction = "y") + facet_wrap(~Variable, nrow = 3, scales = "free") + theme_bw() + labs(x = NULL, y = NULL) + scale_y_continuous(labels = NULL, breaks = NULL)
+ggsave("country_variables.png", p, "png", width = 8, height = 8)
+
+
+df_plot <- do.call(rbind, lapply(names(tmp)[-1], function(x){
+  data.frame(Country = tmp[["countryiso3"]], Variable = x, Value = tmp[[x]])
+}))
+
+library(ggplot2)
+
+# df_training should be a data.table
+
+df_plot <- lapply(c("deaths", "confirmed", "recovered", "governmentresponseindex", "stringencyindex", "closepublictransport_flag"), function(x){
+  tmp <- df_training[, list(mean=mean(get(x), na.rm = TRUE), sd=sd(get(x), na.rm = TRUE)), by=countryiso3]
+  tmp[, "variable" := x]
+})
+df_plot <- rbindlist(df_plot)
+df_plot$variable <- df_vars$lab[match(df_plot$variable,  df_vars$X)]
+
+p <- ggplot(df_plot, aes(y = countryiso3)) + 
+  geom_errorbarh(aes(xmin = mean-sd, xmax = mean+sd)) +
+  geom_point(aes(x = mean))+
+  facet_wrap(~variable, nrow = 3, scales = "free") + theme_bw() +
+  ylab("Country")
+ggsave("country_variables_byday.png", p, "png", width = 8, height = 10)
+
+
+
+# Check measurement invariance --------------------------------------------
+library(lavaan)
+df_inv <- df$consp01
+df_inv <- df[, c("coded_country", grep("^consp0", names(df), value = T))]
+
+df_tmp <- df_inv
+df_tmp <- df_tmp[!rowSums(is.na(df_tmp)) == 3, ]
+df_tmp <- df_tmp[!df_tmp$coded_country %in% c("Malaysia", "Philippines"), ]
+mod <- paste0('F =~ ', paste0(names(df_tmp[-1]), collapse = " + "))
+# configural invariance
+# configural invariance
+  fit <- cfa(mod, df_tmp)
+
+  fit1 <- cfa(mod, data = df_tmp, group = "coded_country")
+  # metric invariance
+  fit2 <- cfa(mod, data = df_tmp, group = "coded_country",
+              group.equal = "loadings")
+  c(fitmeasures(fit1)[c("chisq", "df", "npar", "bic", "cfi", "tli", "rmsea")],
+    fitmeasures(fit2)[c("chisq", "df", "npar", "bic", "cfi", "tli", "rmsea")],
+    unlist(lavTestLRT(fit1, fit2)[2, 7]))
+  anova(fit1, fit2)
+
+
+df_tmp <- df_inv
+names(df_tmp)[-1] <- paste0("consp_", 1:3)
+library(tidySEM)
+out <- sapply(unique(df_tmp$coded_country), function(i){
+  #i = df_tmp$coded_country[1]
+  tmp <- df_tmp[df_tmp$coded_country == i, -1]
+  tmp <- tidy_sem(tmp)
+  tmp <- create_scales(tmp)
+  c(Country = i, unlist(tmp$descriptives))
+})
+desc <- data.frame(t(out))
+write.csv(desc, "conspiracy_reliability_by_country.csv", row.names = F)
diff --git a/scripts/model_accuracy.R b/scripts/model_accuracy.R
@@ -45,6 +45,13 @@ model_accuracy <-
           predicted <- predict(fit, data = newdata)$predictions
         }
       },
+      lm = {
+        if (is.null(newdata)) {
+          predicted <- fit$fitted.values
+        } else {
+          predicted <- predict(fit, newdata = newdata)
+        }
+      },
       ranger = {
         if (is.null(newdata)) {
           predicted <- fit$predictions