
Commit

added off-the-rails cogsci paper
tobiasgerstenberg committed Apr 17, 2024
1 parent bd854e9 commit 4c7a368
Showing 21 changed files with 772 additions and 83 deletions.
33 changes: 33 additions & 0 deletions content/publication/franken2024rails.md
@@ -0,0 +1,33 @@
+++
# 0 -> 'Forthcoming',
# 1 -> 'Preprint',
# 2 -> 'Journal',
# 3 -> 'Conference Proceedings',
# 4 -> 'Book chapter',
# 5 -> 'Thesis'

title = "Procedural dilemma generation for evaluating moral reasoning in humans and language models"
date = "2024-04-17"
authors = ["J. Fränken","K. Gandhi","T. Qiu","A. Khawaja","N. D. Goodman","T. Gerstenberg"]
publication_types = ["3"]
publication_short = "_Proceedings of the 46th Annual Conference of the Cognitive Science Society_"
publication = 'Fränken J., Gandhi K., Qiu T., Khawaja A., Goodman N. D., Gerstenberg T. (2024). Procedural dilemma generation for evaluating moral reasoning in humans and language models. In _Proceedings of the 46th Annual Conference of the Cognitive Science Society_.'
abstract = "As AI systems like language models are increasingly integrated into decision-making processes affecting people's lives, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic evaluations. We provide a framework that uses a language model to translate causal graphs that capture key aspects of moral dilemmas into prompt templates. With this framework, we procedurally generated a large and diverse set of moral dilemmas---the OffTheRails benchmark---consisting of 50 scenarios and 400 unique test items. We collected moral permissibility and intention judgments from human participants for a subset of our items and compared these judgments to those from two language models (GPT-4 and Claude-2) across eight conditions. We find that moral dilemmas in which the harm is a necessary means (as compared to a side effect) resulted in lower permissibility and higher intention ratings for both participants and language models. The same pattern was observed for evitable versus inevitable harmful outcomes. However, there was no clear effect of whether the harm resulted from an agent's action versus from having omitted to act. We discuss limitations of our prompt generation pipeline and opportunities for improving scenarios to increase the strength of experimental effects."
image_preview = ""
selected = false
projects = []
url_pdf = "papers/franken2024rails.pdf"
url_preprint = ""
url_code = ""
url_dataset = ""
url_slides = ""
url_video = ""
url_poster = ""
url_source = ""
url_custom = [{name = "Github", url = "https://github.com/cicl-stanford/moral-evals/tree/main"}]
math = true
highlight = true
[header]
# image = "publications/franken2024rails.png"
caption = ""
+++
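
The abstract in this new publication entry describes the OffTheRails pipeline: causal graphs capturing a dilemma's structure are turned into prompt templates and crossed over three two-level factors (harm as a means vs. a side effect, evitable vs. inevitable outcome, action vs. omission), giving the eight conditions mentioned above. The following is a minimal Python sketch of that 2x2x2 factorial design and of turning one condition spec into a prompt template. All names here (CausalGraphSpec, all_conditions, to_prompt_template) and the template wording are illustrative assumptions, not the authors' actual code; the real implementation is in the linked Github repository.

# Hypothetical sketch of the 2 x 2 x 2 condition design described in the
# abstract. Names and structure are assumptions for illustration only.
from dataclasses import dataclass
from itertools import product

@dataclass(frozen=True)
class CausalGraphSpec:
    """Abstract structure of one moral-dilemma test item."""
    scenario: str      # short setting description
    causal_role: str   # "means" or "side_effect"
    evitability: str   # "evitable" or "inevitable"
    locus: str         # "action" or "omission"

def all_conditions(scenario: str) -> list[CausalGraphSpec]:
    """Cross the three binary factors to get the eight conditions."""
    factors = product(["means", "side_effect"],
                      ["evitable", "inevitable"],
                      ["action", "omission"])
    return [CausalGraphSpec(scenario, role, evit, locus)
            for role, evit, locus in factors]

def to_prompt_template(spec: CausalGraphSpec) -> str:
    """Render a spec as a prompt template; a real pipeline would hand this
    to a language model to fill in a concrete narrative."""
    return (f"Scenario: {spec.scenario}\n"
            f"The harm occurs as a {spec.causal_role.replace('_', ' ')}, "
            f"is {spec.evitability}, and results from an {spec.locus}.\n"
            "Question: Is the agent's behavior morally permissible? "
            "Did the agent intend the harm?")

if __name__ == "__main__":
    for spec in all_conditions("a runaway trolley approaching five workers"):
        print(to_prompt_template(spec))
        print("---")

Under these assumptions, each of the 50 scenarios would yield eight templated items, which is consistent with the benchmark's reported 400 unique test items.
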
8 changes: 4 additions & 4 deletions docs/404.html
@@ -237,6 +237,10 @@ <h1>Page not found</h1>

<h2>Publications</h2>

<ul>
<li><a href="https://cicl.stanford.edu/publication/franken2024rails/">Procedural dilemma generation for evaluating moral reasoning in humans and language models</a></li>
</ul>

<ul>
<li><a href="https://cicl.stanford.edu/publication/andukuri2024stargate/">STaR-GATE: Teaching Language Models to Ask Clarifying Questions</a></li>
</ul>
@@ -253,10 +257,6 @@ <h2>Publications</h2>
<li><a href="https://cicl.stanford.edu/publication/kirfel2023anticipating/">Anticipating the risks and benefits of counterfactual world simulation models</a></li>
</ul>

<ul>
<li><a href="https://cicl.stanford.edu/publication/franken2023rails/">Off The Rails: Procedural Dilemma Generation for Moral Reasoning</a></li>
</ul>




11 changes: 10 additions & 1 deletion docs/bibtex/cic_papers.bib
@@ -1,13 +1,22 @@
%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/
%% Created for Tobias Gerstenberg at 2024-03-31 21:14:50 -0700
%% Created for Tobias Gerstenberg at 2024-04-17 14:22:46 -0700
%% Saved with string encoding Unicode (UTF-8)
@inproceedings{franken2024rails,
abstract = {As AI systems like language models are increasingly integrated into decision-making processes affecting people's lives, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic evaluations. We provide a framework that uses a language model to translate causal graphs that capture key aspects of moral dilemmas into prompt templates. With this framework, we procedurally generated a large and diverse set of moral dilemmas---the OffTheRails benchmark---consisting of 50 scenarios and 400 unique test items. We collected moral permissibility and intention judgments from human participants for a subset of our items and compared these judgments to those from two language models (GPT-4 and Claude-2) across eight conditions. We find that moral dilemmas in which the harm is a necessary means (as compared to a side effect) resulted in lower permissibility and higher intention ratings for both participants and language models. The same pattern was observed for evitable versus inevitable harmful outcomes. However, there was no clear effect of whether the harm resulted from an agent's action versus from having omitted to act. We discuss limitations of our prompt generation pipeline and opportunities for improving scenarios to increase the strength of experimental effects.},
author = {Jan-Philipp Fr{\"a}nken and Kanishk Gandhi and Tori Qiu and Ayesha Khawaja and Noah D. Goodman and Tobias Gerstenberg},
booktitle = {{Proceedings of the 46th Annual Conference of the Cognitive Science Society}},
date-added = {2024-04-17 14:21:36 -0700},
date-modified = {2024-04-17 14:22:45 -0700},
title = {Procedural dilemma generation for evaluating moral reasoning in humans and language models},
year = {2024}}

@article{andukuri2024stargate,
abstract = {When prompting language models to complete a task, users often leave important aspects unsaid. While asking questions could resolve this ambiguity (GATE; Li et al., 2023), models often struggle to ask good questions. We explore a language model's ability to self-improve (STaR; Zelikman et al., 2022) by rewarding the model for generating useful questions---a simple method we dub STaR-GATE. We generate a synthetic dataset of 25,500 unique persona-task prompts to simulate conversations between a pretrained language model---the Questioner---and a Roleplayer whose preferences are unknown to the Questioner. By asking questions, the Questioner elicits preferences from the Roleplayer. The Questioner is iteratively finetuned on questions that increase the probability of high-quality responses to the task, which are generated by an Oracle with access to the Roleplayer's latent preferences. After two iterations of self-improvement, the Questioner asks better questions, allowing it to generate responses that are preferred over responses from the initial model on 72% of tasks. Our results indicate that teaching a language model to ask better questions leads to better personalized responses.},
author = {Andukuri, Chinmaya and Fr{\"a}nken, Jan-Philipp and Gerstenberg, Tobias and Goodman, Noah D.},
2 changes: 1 addition & 1 deletion docs/index.html
@@ -110,7 +110,7 @@
<meta property="og:description" content="">
<meta property="og:locale" content="en-us">

<meta property="og:updated_time" content="2024-03-31T00:00:00&#43;00:00">
<meta property="og:updated_time" content="2024-04-17T00:00:00&#43;00:00">



20 changes: 10 additions & 10 deletions docs/index.xml
@@ -6,9 +6,18 @@
<generator>Hugo -- gohugo.io</generator>
<language>en-us</language>
<copyright>&amp;copy; 2024 Tobias Gerstenberg</copyright>
<lastBuildDate>Sun, 31 Mar 2024 00:00:00 +0000</lastBuildDate>
<lastBuildDate>Wed, 17 Apr 2024 00:00:00 +0000</lastBuildDate>
<atom:link href="/" rel="self" type="application/rss+xml" />

<item>
<title>Procedural dilemma generation for evaluating moral reasoning in humans and language models</title>
<link>https://cicl.stanford.edu/publication/franken2024rails/</link>
<pubDate>Wed, 17 Apr 2024 00:00:00 +0000</pubDate>

<guid>https://cicl.stanford.edu/publication/franken2024rails/</guid>
<description></description>
</item>

<item>
<title>STaR-GATE: Teaching Language Models to Ask Clarifying Questions</title>
<link>https://cicl.stanford.edu/publication/andukuri2024stargate/</link>
@@ -135,14 +144,5 @@
<description></description>
</item>

<item>
<title>A computational model of responsibility judgments from counterfactual simulations and intention inferences</title>
<link>https://cicl.stanford.edu/publication/wu2023computational/</link>
<pubDate>Thu, 11 May 2023 00:00:00 +0000</pubDate>

<guid>https://cicl.stanford.edu/publication/wu2023computational/</guid>
<description></description>
</item>

</channel>
</rss>
43 changes: 43 additions & 0 deletions docs/member/tobias_gerstenberg/index.html
@@ -356,6 +356,49 @@ <h2 id="publications">Publications</h2>


<div class="pub-list-item" style="margin-bottom: 1rem" itemscope itemtype="http://schema.org/CreativeWork">
<span itemprop="author">
J. Fränken, K. Gandhi, T. Qiu, A. Khawaja, N. D. Goodman, T. Gerstenberg</span>

(2024).

<a href="https://cicl.stanford.edu/publication/franken2024rails/" itemprop="name">Procedural dilemma generation for evaluating moral reasoning in humans and language models</a>.
<em>Proceedings of the 46th Annual Conference of the Cognitive Science Society</em>.




<p>





<a class="btn btn-outline-primary my-1 mr-1 btn-sm" href="https://cicl.stanford.edu/papers/franken2024rails.pdf" target="_blank" rel="noopener">
PDF
</a>














<a class="btn btn-outline-primary my-1 mr-1 btn-sm" href="https://github.com/cicl-stanford/moral-evals/tree/main" target="_blank" rel="noopener">
Github
</a>


</p>

</div>
<div class="pub-list-item" style="margin-bottom: 1rem" itemscope itemtype="http://schema.org/CreativeWork">
<span itemprop="author">
C. Andukuri, J. Fränken, T. Gerstenberg, N. D. Goodman</span>

Binary file added docs/papers/franken2024rails.pdf
Binary file not shown.
