diff --git a/content/publication/franken2024sami.md b/content/publication/franken2024sami.md
index b02dbd7..eefe8e5 100644
--- a/content/publication/franken2024sami.md
+++ b/content/publication/franken2024sami.md
@@ -9,9 +9,9 @@
title = "Self-supervised alignment with mutual information: Learning to follow principles without preference labels"
date = "2024-04-22"
authors = ["J. Fränken","E. Zelikman","R. Rafailov","K. Gandhi","T. Gerstenberg","N. D. Goodman"]
-publication_types = ["1"]
-publication_short = "_arXiv_"
-publication = "Fränken, J., Zelikman, E., Rafailov, R., Gandhi, K., Gerstenberg, T., Goodman, N. D. (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. _arXiv_."
+publication_types = ["3"]
+publication_short = "_Advances in Neural Information Processing Systems_"
+publication = "Fränken, J., Zelikman, E., Rafailov, R., Gandhi, K., Gerstenberg, T., Goodman, N. D. (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. _Advances in Neural Information Processing Systems_."
abstract = "When prompting a language model (LM), users frequently expect the model to adhere to a set of behavioral principles across diverse tasks, such as producing insightful content while avoiding harmful or biased language. Instilling such principles into a model can be resource-intensive and technically challenging, generally requiring human preference labels or examples. We introduce SAMI, a method for teaching a pretrained LM to follow behavioral principles that does not require any preference labels or demonstrations. SAMI is an iterative algorithm that finetunes a pretrained LM to increase the conditional mutual information between constitutions and self-generated responses given queries from a datasest. On single-turn dialogue and summarization, a SAMI-trained mistral-7b outperforms the initial pretrained model, with win rates between 66% and 77%. Strikingly, it also surpasses an instruction-finetuned baseline (mistral-7b-instruct) with win rates between 55% and 57% on single-turn dialogue. SAMI requires a 'principle writer' model; to avoid dependence on stronger models, we further evaluate aligning a strong pretrained model (mixtral-8x7b) using constitutions written by a weak instruction-finetuned model (mistral-7b-instruct). The SAMI-trained mixtral-8x7b outperforms both the initial model and the instruction-finetuned model, achieving a 65% win rate on summarization. Our results indicate that a pretrained LM can learn to follow constitutions without using preference labels, demonstrations, or human oversight."
image_preview = ""
selected = false
diff --git a/content/publication/gandhi2024affective.md b/content/publication/gandhi2024affective.md
index 6fa1d3b..98bc334 100644
--- a/content/publication/gandhi2024affective.md
+++ b/content/publication/gandhi2024affective.md
@@ -16,7 +16,7 @@ abstract = "Understanding emotions is fundamental to human interaction and exper
image_preview = ""
selected = false
projects = []
-#url_pdf = "papers/gandhi2024affective.pdf"
+url_pdf = "papers/gandhi2024affective.pdf"
url_preprint = "https://arxiv.org/abs/2409.11733"
url_code = ""
url_dataset = ""
diff --git a/content/publication/jin2024marple.md b/content/publication/jin2024marple.md
new file mode 100644
index 0000000..3fdb79c
--- /dev/null
+++ b/content/publication/jin2024marple.md
@@ -0,0 +1,33 @@
++++
+# 0 -> 'Forthcoming',
+# 1 -> 'Preprint',
+# 2 -> 'Journal',
+# 3 -> 'Conference Proceedings',
+# 4 -> 'Book chapter',
+# 5 -> 'Thesis'
+
+title = "MARPLE: A Benchmark for Long-Horizon Inference"
+date = "2024-10-04"
+authors = ["E. Jin","Z. Huang","J. Fränken","W. Liu","H. Cha","E. Brockbank","S. Wu","R. Zhang","J. Wu","T. Gerstenberg"]
+publication_types = ["3"]
+publication_short = "_Advances in Neural Information Processing Systems_"
+publication = "Jin, E., Huang, Z., Fränken, J., Liu, W., Cha, H., Brockbank, E., Wu, S., Zhang, R., Wu, J., Gerstenberg, T. (2024). MARPLE: A Benchmark for Long-Horizon Inference. _Advances in Neural Information Processing Systems_."
+abstract = "Reconstructing past events requires reasoning across long time horizons. To figure out what happened, we need to use our prior knowledge about the world and human behavior and draw inferences from various sources of evidence including visual, language, and auditory cues. We introduce MARPLE, a benchmark for evaluating long-horizon inference capabilities using multi-modal evidence. Our benchmark features agents interacting with simulated households, supporting vision, language, and auditory stimuli, as well as procedurally generated environments and agent behaviors. Inspired by classic ``whodunit'' stories, we ask AI models and human participants to infer which agent caused a change in the environment based on a step-by-step replay of what actually happened. The goal is to correctly identify the culprit as early as possible. Our findings show that human participants outperform both traditional Monte Carlo simulation methods and an LLM baseline (GPT-4) on this task. Compared to humans, traditional inference models are less robust and performant, while GPT-4 has difficulty comprehending environmental changes. We analyze what factors influence inference performance and ablate different modes of evidence, finding that all modes are valuable for performance. Overall, our experiments demonstrate that the long-horizon, multimodal inference tasks in our benchmark present a challenge to current models. Project website: https://marple-benchmark.github.io/."
+image_preview = ""
+selected = false
+projects = []
+url_pdf = "papers/jin2024marple.pdf"
+url_preprint = "http://arxiv.org/abs/2410.01926"
+url_code = ""
+url_dataset = ""
+url_slides = ""
+url_video = ""
+url_poster = ""
+url_source = ""
+url_custom = [{name = "Project website", url = "https://marple-benchmark.github.io/"}]
+math = true
+highlight = true
+[header]
+# image = "publications/jin2024marple.png"
+caption = ""
++++
\ No newline at end of file
diff --git a/docs/404.html b/docs/404.html
index d06c8d8..d46c2d6 100644
--- a/docs/404.html
+++ b/docs/404.html
@@ -237,6 +237,10 @@
+
+ E. Jin, Z. Huang, J. Fränken, W. Liu, H. Cha, E. Brockbank, S. Wu, R. Zhang, J. Wu, T. Gerstenberg
+
+ (2024).
+
+ MARPLE: A Benchmark for Long-Horizon Inference.
+ Advances in Neural Information Processing Systems.
+
+
+
+
+
- Fränken, J., Zelikman, E., Rafailov, R., Gandhi, K., Gerstenberg, T., Goodman, N. D. (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. arXiv.
+
+ Fränken, J., Zelikman, E., Rafailov, R., Gandhi, K., Gerstenberg, T., Goodman, N. D. (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. Advances in Neural Information Processing Systems.
+
+ PDF
+
+
diff --git a/docs/publication/index.html b/docs/publication/index.html
index 5b84bc1..f39141b 100644
--- a/docs/publication/index.html
+++ b/docs/publication/index.html
@@ -1597,6 +1597,19 @@
Publications
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -1733,6 +1746,65 @@
Publications
+
+
+
+
+ E. Jin, Z. Huang, J. Fränken, W. Liu, H. Cha, E. Brockbank, S. Wu, R. Zhang, J. Wu, T. Gerstenberg
+
+ (2024).
+
+ MARPLE: A Benchmark for Long-Horizon Inference.
+ Advances in Neural Information Processing Systems.
+
+
+
+
+
+
+ E. Jin, Z. Huang, J. Fränken, W. Liu, H. Cha, E. Brockbank, S. Wu, R. Zhang, J. Wu, T. Gerstenberg
+
+
+
+
+
+
+
+
+
+
+
+
Abstract
+
+ Reconstructing past events requires reasoning across long time horizons. To figure out what happened, we need to use our prior knowledge about the world and human behavior and draw inferences from various sources of evidence including visual, language, and auditory cues. We introduce MARPLE, a benchmark for evaluating long-horizon inference capabilities using multi-modal evidence. Our benchmark features agents interacting with simulated households, supporting vision, language, and auditory stimuli, as well as procedurally generated environments and agent behaviors. Inspired by classic 'whodunit' stories, we ask AI models and human participants to infer which agent caused a change in the environment based on a step-by-step replay of what actually happened. The goal is to correctly identify the culprit as early as possible. Our findings show that human participants outperform both traditional Monte Carlo simulation methods and an LLM baseline (GPT-4) on this task. Compared to humans, traditional inference models are less robust and performant, while GPT-4 has difficulty comprehending environmental changes. We analyze what factors influence inference performance and ablate different modes of evidence, finding that all modes are valuable for performance. Overall, our experiments demonstrate that the long-horizon, multimodal inference tasks in our benchmark present a challenge to current models. Project website: https://marple-benchmark.github.io/.
+ Jin, E., Huang, Z., Fränken, J., Liu, W., Cha, H., Brockbank, E., Wu, S., Zhang, R., Wu, J., Gerstenberg, T. (2024). MARPLE: A Benchmark for Long-Horizon Inference. Advances in Neural Information Processing Systems.
-
- When prompting a language model (LM), users frequently expect the model to adhere to a set of behavioral principles across diverse tasks, such as producing insightful content while avoiding harmful or biased language. Instilling such principles into …
-
-
+
+ Reconstructing past events requires reasoning across long time horizons. To figure out what happened, we need to use our prior knowledge about the world and human behavior and draw inferences from various sources of evidence including visual, …
+
+
-
- As AI systems like language models are increasingly integrated into decision-making processes affecting people's lives, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic …
-
-
- This paper examines the transformative potential of Counterfactual World Simulation Models (CWSMs). CWSMs use pieces of multi-modal evidence, such as the CCTV footage or sound recordings of a road accident, to build a high-fidelity 3D reconstruction …
+ When prompting a language model (LM), users frequently expect the model to adhere to a set of behavioral principles across diverse tasks, such as producing insightful content while avoiding harmful or biased language. Instilling such principles into …
- As AI systems like language models are increasingly integrated into making decisions that affect people, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic evaluations. Recent …
+ As AI systems like language models are increasingly integrated into decision-making processes affecting people's lives, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic …
+
+ This paper examines the transformative potential of Counterfactual World Simulation Models (CWSMs). CWSMs use pieces of multi-modal evidence, such as the CCTV footage or sound recordings of a road accident, to build a high-fidelity 3D reconstruction …
+
+
+
+ As AI systems like language models are increasingly integrated into making decisions that affect people, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic evaluations. Recent …
+
+
-
- When choosing how to describe what happened, we have a number of causal verbs at our disposal. In this paper, we develop a model-theoretic formal semantics for nine causal verbs that span the categories of CAUSE, ENABLE, and PREVENT. We use …
-
-
-
- This work attempts to bridge the divide between accounts of causal reasoning with respect to agents and objects. We begin by examining the influence of animacy. In a collision-based context, we vary the animacy status of an object using 3D …
-
-