
Commit

added off-the-rails cogsci paper
tobiasgerstenberg committed Apr 17, 2024
1 parent bd854e9 commit 4c7a368
Showing 21 changed files with 772 additions and 83 deletions.
33 changes: 33 additions & 0 deletions content/publication/franken2024rails.md
@@ -0,0 +1,33 @@
+++
# 0 -> 'Forthcoming',
# 1 -> 'Preprint',
# 2 -> 'Journal',
# 3 -> 'Conference Proceedings',
# 4 -> 'Book chapter',
# 5 -> 'Thesis'

title = "Procedural dilemma generation for evaluating moral reasoning in humans and language models"
date = "2024-04-17"
authors = ["J. Fränken","K. Gandhi","T. Qiu","A. Khawaja","N. D. Goodman","T. Gerstenberg"]
publication_types = ["3"]
publication_short = "_Proceedings of the 46th Annual Conference of the Cognitive Science Society_"
publication = 'Fränken J., Gandhi K., Qiu T., Khawaja A., Goodman N. D., Gerstenberg T. (2024). Procedural dilemma generation for evaluating moral reasoning in humans and language models. In _Proceedings of the 46th Annual Conference of the Cognitive Science Society_.'
abstract = "As AI systems like language models are increasingly integrated into decision-making processes affecting people's lives, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic evaluations. We provide a framework that uses a language model to translate causal graphs that capture key aspects of moral dilemmas into prompt templates. With this framework, we procedurally generated a large and diverse set of moral dilemmas---the OffTheRails benchmark---consisting of 50 scenarios and 400 unique test items. We collected moral permissibility and intention judgments from human participants for a subset of our items and compared these judgments to those from two language models (GPT-4 and Claude-2) across eight conditions. We find that moral dilemmas in which the harm is a necessary means (as compared to a side effect) resulted in lower permissibility and higher intention ratings for both participants and language models. The same pattern was observed for evitable versus inevitable harmful outcomes. However, there was no clear effect of whether the harm resulted from an agent's action versus from having omitted to act. We discuss limitations of our prompt generation pipeline and opportunities for improving scenarios to increase the strength of experimental effects."
image_preview = ""
selected = false
projects = []
url_pdf = "papers/franken2024rails.pdf"
url_preprint = ""
url_code = ""
url_dataset = ""
url_slides = ""
url_video = ""
url_poster = ""
url_source = ""
url_custom = [{name = "Github", url = "https://github.com/cicl-stanford/moral-evals/tree/main"}]
math = true
highlight = true
[header]
# image = "publications/franken2024rails.png"
caption = ""
+++
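
The abstract in this new publication entry describes the OffTheRails pipeline: causal graphs capturing a dilemma's structure are turned into prompt templates and crossed over three two-level factors (harm as a means vs. a side effect, evitable vs. inevitable outcome, action vs. omission), giving the eight conditions mentioned above. The following is a minimal Python sketch of that 2x2x2 factorial design and of turning one condition spec into a prompt template. All names here (CausalGraphSpec, all_conditions, to_prompt_template) and the template wording are illustrative assumptions, not the authors' actual code; the real implementation is in the linked Github repository.

# Hypothetical sketch of the 2 x 2 x 2 condition design described in the
# abstract. Names and structure are assumptions for illustration only.
from dataclasses import dataclass
from itertools import product

@dataclass(frozen=True)
class CausalGraphSpec:
    """Abstract structure of one moral-dilemma test item."""
    scenario: str      # short setting description
    causal_role: str   # "means" or "side_effect"
    evitability: str   # "evitable" or "inevitable"
    locus: str         # "action" or "omission"

def all_conditions(scenario: str) -> list[CausalGraphSpec]:
    """Cross the three binary factors to get the eight conditions."""
    factors = product(["means", "side_effect"],
                      ["evitable", "inevitable"],
                      ["action", "omission"])
    return [CausalGraphSpec(scenario, role, evit, locus)
            for role, evit, locus in factors]

def to_prompt_template(spec: CausalGraphSpec) -> str:
    """Render a spec as a prompt template; a real pipeline would hand this
    to a language model to fill in a concrete narrative."""
    return (f"Scenario: {spec.scenario}\n"
            f"The harm occurs as a {spec.causal_role.replace('_', ' ')}, "
            f"is {spec.evitability}, and results from an {spec.locus}.\n"
            "Question: Is the agent's behavior morally permissible? "
            "Did the agent intend the harm?")

if __name__ == "__main__":
    for spec in all_conditions("a runaway trolley approaching five workers"):
        print(to_prompt_template(spec))
        print("---")

Under these assumptions, each of the 50 scenarios would yield eight templated items, which is consistent with the benchmark's reported 400 unique test items.
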
8 changes: 4 additions & 4 deletions docs/404.html
@@ -237,6 +237,10 @@ <h1>Page not found</h1>

<h2>Publications</h2>

<ul>
<li><a href="https://cicl.stanford.edu/publication/franken2024rails/">Procedural dilemma generation for evaluating moral reasoning in humans and language models</a></li>
</ul>

<ul>
<li><a href="https://cicl.stanford.edu/publication/andukuri2024stargate/">STaR-GATE: Teaching Language Models to Ask Clarifying Questions</a></li>
</ul>
@@ -253,10 +257,6 @@ <h2>Publications</h2>
<li><a href="https://cicl.stanford.edu/publication/kirfel2023anticipating/">Anticipating the risks and benefits of counterfactual world simulation models</a></li>
</ul>

<ul>
<li><a href="https://cicl.stanford.edu/publication/franken2023rails/">Off The Rails: Procedural Dilemma Generation for Moral Reasoning</a></li>
</ul>




11 changes: 10 additions & 1 deletion docs/bibtex/cic_papers.bib
@@ -1,13 +1,22 @@
%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/
%% Created for Tobias Gerstenberg at 2024-03-31 21:14:50 -0700
%% Created for Tobias Gerstenberg at 2024-04-17 14:22:46 -0700
%% Saved with string encoding Unicode (UTF-8)
@inproceedings{franken2024rails,
abstract = {As AI systems like language models are increasingly integrated into decision-making processes affecting people's lives, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic evaluations. We provide a framework that uses a language model to translate causal graphs that capture key aspects of moral dilemmas into prompt templates. With this framework, we procedurally generated a large and diverse set of moral dilemmas---the OffTheRails benchmark---consisting of 50 scenarios and 400 unique test items. We collected moral permissibility and intention judgments from human participants for a subset of our items and compared these judgments to those from two language models (GPT-4 and Claude-2) across eight conditions. We find that moral dilemmas in which the harm is a necessary means (as compared to a side effect) resulted in lower permissibility and higher intention ratings for both participants and language models. The same pattern was observed for evitable versus inevitable harmful outcomes. However, there was no clear effect of whether the harm resulted from an agent's action versus from having omitted to act. We discuss limitations of our prompt generation pipeline and opportunities for improving scenarios to increase the strength of experimental effects.},
author = {Jan-Philipp Fr{\"a}nken and Kanishk Gandhi and Tori Qiu and Ayesha Khawaja and Noah D. Goodman and Tobias Gerstenberg},
booktitle = {{Proceedings of the 46th Annual Conference of the Cognitive Science Society}},
date-added = {2024-04-17 14:21:36 -0700},
date-modified = {2024-04-17 14:22:45 -0700},
title = {Procedural dilemma generation for evaluating moral reasoning in humans and language models},
year = {2024}}

@article{andukuri2024stargate,
abstract = {When prompting language models to complete a task, users often leave important aspects unsaid. While asking questions could resolve this ambiguity (GATE; Li et al., 2023), models often struggle to ask good questions. We explore a language model's ability to self-improve (STaR; Zelikman et al., 2022) by rewarding the model for generating useful questions---a simple method we dub STaR-GATE. We generate a synthetic dataset of 25,500 unique persona-task prompts to simulate conversations between a pretrained language model---the Questioner---and a Roleplayer whose preferences are unknown to the Questioner. By asking questions, the Questioner elicits preferences from the Roleplayer. The Questioner is iteratively finetuned on questions that increase the probability of high-quality responses to the task, which are generated by an Oracle with access to the Roleplayer's latent preferences. After two iterations of self-improvement, the Questioner asks better questions, allowing it to generate responses that are preferred over responses from the initial model on 72% of tasks. Our results indicate that teaching a language model to ask better questions leads to better personalized responses.},
author = {Andukuri, Chinmaya and Fr{\"a}nken, Jan-Philipp and Gerstenberg, Tobias and Goodman, Noah D.},
2 changes: 1 addition & 1 deletion docs/index.html
@@ -110,7 +110,7 @@
<meta property="og:description" content="">
<meta property="og:locale" content="en-us">

<meta property="og:updated_time" content="2024-03-31T00:00:00&#43;00:00">
<meta property="og:updated_time" content="2024-04-17T00:00:00&#43;00:00">



20 changes: 10 additions & 10 deletions docs/index.xml
@@ -6,9 +6,18 @@
<generator>Hugo -- gohugo.io</generator>
<language>en-us</language>
<copyright>&amp;copy; 2024 Tobias Gerstenberg</copyright>
<lastBuildDate>Sun, 31 Mar 2024 00:00:00 +0000</lastBuildDate>
<lastBuildDate>Wed, 17 Apr 2024 00:00:00 +0000</lastBuildDate>
<atom:link href="/" rel="self" type="application/rss+xml" />

<item>
<title>Procedural dilemma generation for evaluating moral reasoning in humans and language models</title>
<link>https://cicl.stanford.edu/publication/franken2024rails/</link>
<pubDate>Wed, 17 Apr 2024 00:00:00 +0000</pubDate>

<guid>https://cicl.stanford.edu/publication/franken2024rails/</guid>
<description></description>
</item>

<item>
<title>STaR-GATE: Teaching Language Models to Ask Clarifying Questions</title>
<link>https://cicl.stanford.edu/publication/andukuri2024stargate/</link>
@@ -135,14 +144,5 @@
<description></description>
</item>

<item>
<title>A computational model of responsibility judgments from counterfactual simulations and intention inferences</title>
<link>https://cicl.stanford.edu/publication/wu2023computational/</link>
<pubDate>Thu, 11 May 2023 00:00:00 +0000</pubDate>

<guid>https://cicl.stanford.edu/publication/wu2023computational/</guid>
<description></description>
</item>

</channel>
</rss>
43 changes: 43 additions & 0 deletions docs/member/tobias_gerstenberg/index.html
@@ -356,6 +356,49 @@ <h2 id="publications">Publications</h2>


<div class="pub-list-item" style="margin-bottom: 1rem" itemscope itemtype="http://schema.org/CreativeWork">
<span itemprop="author">
J. Fränken, K. Gandhi, T. Qiu, A. Khawaja, N. D. Goodman, T. Gerstenberg</span>

(2024).

<a href="https://cicl.stanford.edu/publication/franken2024rails/" itemprop="name">Procedural dilemma generation for evaluating moral reasoning in humans and language models</a>.
<em>Proceedings of the 46th Annual Conference of the Cognitive Science Society</em>.




<p>





<a class="btn btn-outline-primary my-1 mr-1 btn-sm" href="https://cicl.stanford.edu/papers/franken2024rails.pdf" target="_blank" rel="noopener">
PDF
</a>














<a class="btn btn-outline-primary my-1 mr-1 btn-sm" href="https://github.com/cicl-stanford/moral-evals/tree/main" target="_blank" rel="noopener">
Github
</a>


</p>

</div>
<div class="pub-list-item" style="margin-bottom: 1rem" itemscope itemtype="http://schema.org/CreativeWork">
<span itemprop="author">
C. Andukuri, J. Fränken, T. Gerstenberg, N. D. Goodman</span>

Binary file added docs/papers/franken2024rails.pdf
Binary file not shown.
