diff --git a/content/publication/franken2024sami.md b/content/publication/franken2024sami.md
index b02dbd7..eefe8e5 100644
--- a/content/publication/franken2024sami.md
+++ b/content/publication/franken2024sami.md
@@ -9,9 +9,9 @@
 title = "Self-supervised alignment with mutual information: Learning to follow principles without preference labels"
 date = "2024-04-22"
 authors = ["J. Fränken","E. Zelikman","R. Rafailov","K. Gandhi","T. Gerstenberg","N. D. Goodman"]
-publication_types = ["1"]
-publication_short = "_arXiv_"
-publication = "Fränken, J., Zelikman, E., Rafailov, R., Gandhi, K., Gerstenberg, T., Goodman, N. D. (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. _arXiv_."
+publication_types = ["3"]
+publication_short = "_Advances in Neural Information Processing Systems_"
+publication = "Fränken, J., Zelikman, E., Rafailov, R., Gandhi, K., Gerstenberg, T., Goodman, N. D. (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. _Advances in Neural Information Processing Systems_."
 abstract = "When prompting a language model (LM), users frequently expect the model to adhere to a set of behavioral principles across diverse tasks, such as producing insightful content while avoiding harmful or biased language. Instilling such principles into a model can be resource-intensive and technically challenging, generally requiring human preference labels or examples. We introduce SAMI, a method for teaching a pretrained LM to follow behavioral principles that does not require any preference labels or demonstrations. SAMI is an iterative algorithm that finetunes a pretrained LM to increase the conditional mutual information between constitutions and self-generated responses given queries from a dataset. On single-turn dialogue and summarization, a SAMI-trained mistral-7b outperforms the initial pretrained model, with win rates between 66% and 77%. Strikingly, it also surpasses an instruction-finetuned baseline (mistral-7b-instruct) with win rates between 55% and 57% on single-turn dialogue. SAMI requires a 'principle writer' model; to avoid dependence on stronger models, we further evaluate aligning a strong pretrained model (mixtral-8x7b) using constitutions written by a weak instruction-finetuned model (mistral-7b-instruct). The SAMI-trained mixtral-8x7b outperforms both the initial model and the instruction-finetuned model, achieving a 65% win rate on summarization. Our results indicate that a pretrained LM can learn to follow constitutions without using preference labels, demonstrations, or human oversight."
image_preview = "" selected = false diff --git a/content/publication/gandhi2024affective.md b/content/publication/gandhi2024affective.md index 6fa1d3b..98bc334 100644 --- a/content/publication/gandhi2024affective.md +++ b/content/publication/gandhi2024affective.md @@ -16,7 +16,7 @@ abstract = "Understanding emotions is fundamental to human interaction and exper image_preview = "" selected = false projects = [] -#url_pdf = "papers/gandhi2024affective.pdf" +url_pdf = "papers/gandhi2024affective.pdf" url_preprint = "https://arxiv.org/abs/2409.11733" url_code = "" url_dataset = "" diff --git a/content/publication/jin2024marple.md b/content/publication/jin2024marple.md new file mode 100644 index 0000000..3fdb79c --- /dev/null +++ b/content/publication/jin2024marple.md @@ -0,0 +1,33 @@ ++++ +# 0 -> 'Forthcoming', +# 1 -> 'Preprint', +# 2 -> 'Journal', +# 3 -> 'Conference Proceedings', +# 4 -> 'Book chapter', +# 5 -> 'Thesis' + +title = "MARPLE: A Benchmark for Long-Horizon Inference" +date = "2024-10-04" +authors = ["E. Jin","Z. Huang","J. Fränken","W. Liu","H. Cha","E. Brockbank","S. Wu","R. Zhang","J. Wu","T. Gerstenberg"] +publication_types = ["3"] +publication_short = "_Advances in Neural Information Processing Systems_" +publication = "Jin, E., Huang, Z., Fränken, J., Liu, W., Cha, H., Brockbank, E., Wu, S., Zhang, R., Wu, J., Gerstenberg, T. (2024). MARPLE: A Benchmark for Long-Horizon Inference. _Advances in Neural Information Processing Systems_." +abstract = "Reconstructing past events requires reasoning across long time horizons. To figure out what happened, we need to use our prior knowledge about the world and human behavior and draw inferences from various sources of evidence including visual, language, and auditory cues. We introduce MARPLE, a benchmark for evaluating long-horizon inference capabilities using multi-modal evidence. Our benchmark features agents interacting with simulated households, supporting vision, language, and auditory stimuli, as well as procedurally generated environments and agent behaviors. Inspired by classic ``whodunit'' stories, we ask AI models and human participants to infer which agent caused a change in the environment based on a step-by-step replay of what actually happened. The goal is to correctly identify the culprit as early as possible. Our findings show that human participants outperform both traditional Monte Carlo simulation methods and an LLM baseline (GPT-4) on this task. Compared to humans, traditional inference models are less robust and performant, while GPT-4 has difficulty comprehending environmental changes. We analyze what factors influence inference performance and ablate different modes of evidence, finding that all modes are valuable for performance. Overall, our experiments demonstrate that the long-horizon, multimodal inference tasks in our benchmark present a challenge to current models. Project website: https://marple-benchmark.github.io/." +image_preview = "" +selected = false +projects = [] +url_pdf = "papers/jin2024marple.pdf" +url_preprint = "http://arxiv.org/abs/2410.01926" +url_code = "" +url_dataset = "" +url_slides = "" +url_video = "" +url_poster = "" +url_source = "" +url_custom = [{name = "Project website", url = "https://marple-benchmark.github.io/"}] +math = true +highlight = true +[header] +# image = "publications/jin2024marple.png" +caption = "" ++++ \ No newline at end of file diff --git a/docs/404.html b/docs/404.html index d06c8d8..d46c2d6 100644 --- a/docs/404.html +++ b/docs/404.html @@ -237,6 +237,10 @@
diff --git a/docs/bibtex/cic_papers.bib b/docs/bibtex/cic_papers.bib
index d52d0fd..7023118 100644
--- a/docs/bibtex/cic_papers.bib
+++ b/docs/bibtex/cic_papers.bib
@@ -1,13 +1,24 @@
 %% This BibTeX bibliography file was created using BibDesk.
 %% https://bibdesk.sourceforge.io/
-%% Created for Tobias Gerstenberg at 2024-09-20 15:09:53 -0700
+%% Created for Tobias Gerstenberg at 2024-10-04 08:51:52 -0700
 %% Saved with string encoding Unicode (UTF-8)
+@article{jin2024marple,
+  abstract = {Reconstructing past events requires reasoning across long time horizons. To figure out what happened, we need to use our prior knowledge about the world and human behavior and draw inferences from various sources of evidence including visual, language, and auditory cues. We introduce MARPLE, a benchmark for evaluating long-horizon inference capabilities using multi-modal evidence. Our benchmark features agents interacting with simulated households, supporting vision, language, and auditory stimuli, as well as procedurally generated environments and agent behaviors. Inspired by classic ``whodunit'' stories, we ask AI models and human participants to infer which agent caused a change in the environment based on a step-by-step replay of what actually happened. The goal is to correctly identify the culprit as early as possible. Our findings show that human participants outperform both traditional Monte Carlo simulation methods and an LLM baseline (GPT-4) on this task. Compared to humans, traditional inference models are less robust and performant, while GPT-4 has difficulty comprehending environmental changes. We analyze what factors influence inference performance and ablate different modes of evidence, finding that all modes are valuable for performance. Overall, our experiments demonstrate that the long-horizon, multimodal inference tasks in our benchmark present a challenge to current models. Project website: https://marple-benchmark.github.io/.},
+  annote = {Comment: NeurIPS 2024. First two authors contributed equally. Project page: https://marple-benchmark.github.io/},
+  author = {Jin, Emily and Huang, Zhuoyi and Fr{\"a}nken, Jan-Philipp and Liu, Weiyu and Cha, Hannah and Brockbank, Erik and Wu, Sarah and Zhang, Ruohan and Wu, Jiajun and Gerstenberg, Tobias},
+  date-added = {2024-10-04 08:51:51 -0700},
+  date-modified = {2024-10-04 08:51:51 -0700},
+  journal = {arXiv},
+  note = {http://arxiv.org/abs/2410.01926},
+  title = {{MARPLE: A Benchmark for Long-Horizon Inference}},
+  year = {2024}}
+
 @article{gandhi2024affective,
   abstract = {Understanding emotions is fundamental to human interaction and experience. Humans easily infer emotions from situations or facial expressions, situations from emotions, and do a variety of other affective cognition. How adept is modern AI at these inferences? We introduce an evaluation framework for testing affective cognition in foundation models. Starting from psychological theory, we generate 1,280 diverse scenarios exploring relationships between appraisals, emotions, expressions, and outcomes. We evaluate the abilities of foundation models (GPT-4, Claude-3, Gemini-1.5-Pro) and humans (N = 567) across carefully selected conditions. Our results show foundation models tend to agree with human intuitions, matching or exceeding interparticipant agreement. In some conditions, models are ``superhuman'' -- they better predict modal human judgements than the average human. All models benefit from chain-of-thought reasoning.
This suggests foundation models have acquired a human-like understanding of emotions and their influence on beliefs and behavior.}, author = {Kanishk Gandhi and Zoe Lynch and Jan-Philipp Fr{\"a}nken and Kayla Patterson and Sharon Wambu and Tobias Gerstenberg and Desmond C. Ong and Noah D. Goodman}, diff --git a/docs/index.html b/docs/index.html index ed450d8..57b7851 100644 --- a/docs/index.html +++ b/docs/index.html @@ -110,7 +110,7 @@ - + diff --git a/docs/index.xml b/docs/index.xml index 90f077e..69171ad 100644 --- a/docs/index.xml +++ b/docs/index.xml @@ -6,9 +6,18 @@ Hugo -- gohugo.io en-us © 2024 Tobias Gerstenberg - Fri, 20 Sep 2024 00:00:00 +0000 + Fri, 04 Oct 2024 00:00:00 +0000 + + MARPLE: A Benchmark for Long-Horizon Inference + https://cicl.stanford.edu/publication/jin2024marple/ + Fri, 04 Oct 2024 00:00:00 +0000 + + https://cicl.stanford.edu/publication/jin2024marple/ + + + Causation, Meaning, and Communication https://cicl.stanford.edu/publication/beller2024causation/ @@ -135,14 +144,5 @@ - - STaR-GATE: Teaching Language Models to Ask Clarifying Questions - https://cicl.stanford.edu/publication/andukuri2024stargate/ - Sun, 31 Mar 2024 00:00:00 +0000 - - https://cicl.stanford.edu/publication/andukuri2024stargate/ - - - diff --git a/docs/member/tobias_gerstenberg/index.html b/docs/member/tobias_gerstenberg/index.html index c49cc68..1638089 100644 --- a/docs/member/tobias_gerstenberg/index.html +++ b/docs/member/tobias_gerstenberg/index.html @@ -356,6 +356,53 @@

    Publications

    + + + (2024). + + MARPLE: A Benchmark for Long-Horizon Inference. + Advances in Neural Information Processing Systems. + + + + +

    + + + + + + Preprint + + + + + PDF + + + + + + + + + + + + + + + + + Project website + + + +

    + +
    +
    @@ -420,6 +467,10 @@

    Publications

    + + PDF + + @@ -926,7 +977,7 @@

    Publications

    (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. - arXiv. + Advances in Neural Information Processing Systems. diff --git a/docs/papers/gandhi2024affective.pdf b/docs/papers/gandhi2024affective.pdf new file mode 100644 index 0000000..fb4c39e Binary files /dev/null and b/docs/papers/gandhi2024affective.pdf differ diff --git a/docs/papers/jin2024marple.pdf b/docs/papers/jin2024marple.pdf new file mode 100644 index 0000000..47d575c Binary files /dev/null and b/docs/papers/jin2024marple.pdf differ diff --git a/docs/publication/franken2024sami/index.html b/docs/publication/franken2024sami/index.html index 9dce50f..61a0a2b 100644 --- a/docs/publication/franken2024sami/index.html +++ b/docs/publication/franken2024sami/index.html @@ -266,8 +266,8 @@

    Abstract

    Type
    @@ -283,7 +283,7 @@

    Abstract

    Publication
    -
    Fränken, J., Zelikman, E., Rafailov, R., Gandhi, K., Gerstenberg, T., Goodman, N. D. (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. arXiv.
    +
    Fränken, J., Zelikman, E., Rafailov, R., Gandhi, K., Gerstenberg, T., Goodman, N. D. (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. Advances in Neural Information Processing Systems.
    diff --git a/docs/publication/gandhi2024affective/index.html b/docs/publication/gandhi2024affective/index.html index e14f2f5..a316eab 100644 --- a/docs/publication/gandhi2024affective/index.html +++ b/docs/publication/gandhi2024affective/index.html @@ -320,6 +320,10 @@

    Abstract

    + + PDF + + diff --git a/docs/publication/index.html b/docs/publication/index.html index 5b84bc1..f39141b 100644 --- a/docs/publication/index.html +++ b/docs/publication/index.html @@ -1597,6 +1597,19 @@

    Publications

    + + + + + + + + + + + + + @@ -1733,6 +1746,65 @@

    Publications

    +
    + +
    + + + (2024). + + MARPLE: A Benchmark for Long-Horizon Inference. + Advances in Neural Information Processing Systems. + + + + +

    + + + + + + Preprint + + + + + PDF + + + + + + + + + + + + + + + + + Project website + + + +

    + +
    + + +
    + + + + + + +
    @@ -1812,6 +1884,10 @@

    Publications

    + + PDF + + @@ -2441,7 +2517,7 @@

    Publications

    -
    +
    -
    -

    Self-supervised alignment with mutual information: Learning to follow principles without preference labels

    -
    - - When prompting a language model (LM), users frequently expect the model to adhere to a set of behavioral principles across diverse tasks, such as producing insightful content while avoiding harmful or biased language. Instilling such principles into … - -
    -
    -

    STaR-GATE: Teaching Language Models to Ask Clarifying Questions

    diff --git a/docs/publication_types/1/index.xml b/docs/publication_types/1/index.xml index dd353ba..5475ab9 100644 --- a/docs/publication_types/1/index.xml +++ b/docs/publication_types/1/index.xml @@ -39,15 +39,6 @@ - - Self-supervised alignment with mutual information: Learning to follow principles without preference labels - https://cicl.stanford.edu/publication/franken2024sami/ - Mon, 22 Apr 2024 00:00:00 +0000 - - https://cicl.stanford.edu/publication/franken2024sami/ - - - STaR-GATE: Teaching Language Models to Ask Clarifying Questions https://cicl.stanford.edu/publication/andukuri2024stargate/ diff --git a/docs/publication_types/3/index.html b/docs/publication_types/3/index.html index 861dbc2..5a605e6 100644 --- a/docs/publication_types/3/index.html +++ b/docs/publication_types/3/index.html @@ -111,7 +111,7 @@ - + @@ -238,6 +238,15 @@

    3

    +
    +

    MARPLE: A Benchmark for Long-Horizon Inference

    +
    + + Reconstructing past events requires reasoning across long time horizons. To figure out what happened, we need to use our prior knowledge about the world and human behavior and draw inferences from various sources of evidence including visual, … + +
    +
    +

    Whodunnit? Inferring what happened from multimodal evidence

    @@ -302,28 +311,19 @@

    Resource-rat

    -

    Procedural dilemma generation for evaluating moral reasoning in humans and language models

    -
    - - As AI systems like language models are increasingly integrated into decision-making processes affecting people's lives, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic … - -
    -
    - -
    -

    Anticipating the risks and benefits of counterfactual world simulation models

    +

    Self-supervised alignment with mutual information: Learning to follow principles without preference labels

    - This paper examines the transformative potential of Counterfactual World Simulation Models (CWSMs). CWSMs use pieces of multi-modal evidence, such as the CCTV footage or sound recordings of a road accident, to build a high-fidelity 3D reconstruction … + When prompting a language model (LM), users frequently expect the model to adhere to a set of behavioral principles across diverse tasks, such as producing insightful content while avoiding harmful or biased language. Instilling such principles into …
    -

    Off The Rails: Procedural Dilemma Generation for Moral Reasoning

    +

    Procedural dilemma generation for evaluating moral reasoning in humans and language models

    - As AI systems like language models are increasingly integrated into making decisions that affect people, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic evaluations. Recent … + As AI systems like language models are increasingly integrated into decision-making processes affecting people's lives, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic …
    diff --git a/docs/publication_types/3/index.xml b/docs/publication_types/3/index.xml index 485b729..1c53827 100644 --- a/docs/publication_types/3/index.xml +++ b/docs/publication_types/3/index.xml @@ -7,11 +7,20 @@ Hugo -- gohugo.io en-us © 2024 Tobias Gerstenberg - Mon, 13 May 2024 00:00:00 +0000 + Fri, 04 Oct 2024 00:00:00 +0000 + + MARPLE: A Benchmark for Long-Horizon Inference + https://cicl.stanford.edu/publication/jin2024marple/ + Fri, 04 Oct 2024 00:00:00 +0000 + + https://cicl.stanford.edu/publication/jin2024marple/ + + + Whodunnit? Inferring what happened from multimodal evidence https://cicl.stanford.edu/publication/wu2024whodunnit/ @@ -75,6 +84,15 @@ + + Self-supervised alignment with mutual information: Learning to follow principles without preference labels + https://cicl.stanford.edu/publication/franken2024sami/ + Mon, 22 Apr 2024 00:00:00 +0000 + + https://cicl.stanford.edu/publication/franken2024sami/ + + + Procedural dilemma generation for evaluating moral reasoning in humans and language models https://cicl.stanford.edu/publication/franken2024rails/ diff --git a/docs/publication_types/3/page/2/index.html b/docs/publication_types/3/page/2/index.html index 8d4694b..1465bb3 100644 --- a/docs/publication_types/3/page/2/index.html +++ b/docs/publication_types/3/page/2/index.html @@ -111,7 +111,7 @@ - + @@ -238,6 +238,24 @@

    3

    +
    +

    Anticipating the risks and benefits of counterfactual world simulation models

    +
    + + This paper examines the transformative potential of Counterfactual World Simulation Models (CWSMs). CWSMs use pieces of multi-modal evidence, such as the CCTV footage or sound recordings of a road accident, to build a high-fidelity 3D reconstruction … + +
    +
    + +
    +

    Off The Rails: Procedural Dilemma Generation for Moral Reasoning

    +
    + + As AI systems like language models are increasingly integrated into making decisions that affect people, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic evaluations. Recent … + +
    +
    + -
    -

    A Semantics for Causing, Enabling, and Preventing Verbs Using Structural Causal Models

    -
    - - When choosing how to describe what happened, we have a number of causal verbs at our disposal. In this paper, we develop a model-theoretic formal semantics for nine causal verbs that span the categories of CAUSE, ENABLE, and PREVENT. We use … - -
    -
    - -
    -

    Causal Reasoning Across Agents and Objects

    -
    - - This work attempts to bridge the divide between accounts of causal reasoning with respect to agents and objects. We begin by examining the influence of animacy. In a collision-based context, we vary the animacy status of an object using 3D … - -
    -
    -