-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
bd854e9
commit 4c7a368
Showing
21 changed files
with
772 additions
and
83 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
+++ | ||
# 0 -> 'Forthcoming', | ||
# 1 -> 'Preprint', | ||
# 2 -> 'Journal', | ||
# 3 -> 'Conference Proceedings', | ||
# 4 -> 'Book chapter', | ||
# 5 -> 'Thesis' | ||
|
||
title = "Procedural dilemma generation for evaluating moral reasoning in humans and language models" | ||
date = "2024-04-17" | ||
authors = ["J. Fränken","K. Gandhi","T. Qiu","A. Khawaja","N. D. Goodman","T. Gerstenberg"] | ||
publication_types = ["3"] | ||
publication_short = "_Proceedings of the 46th Annual Conference of the Cognitive Science Society_" | ||
publication = 'Fränken J., Gandhi K., Qiu T., Khawaja A., Goodman N. D., Gerstenberg T. (2024). Procedural dilemma generation for evaluating moral reasoning in humans and language models. In _Proceedings of the 46th Annual Conference of the Cognitive Science Society_.' | ||
abstract = "As AI systems like language models are increasingly integrated into decision-making processes affecting people's lives, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic evaluations. We provide a framework that uses a language model to translate causal graphs that capture key aspects of moral dilemmas into prompt templates. With this framework, we procedurally generated a large and diverse set of moral dilemmas---the OffTheRails benchmark---consisting of 50 scenarios and 400 unique test items. We collected moral permissibility and intention judgments from human participants for a subset of our items and compared these judgments to those from two language models (GPT-4 and Claude-2) across eight conditions. We find that moral dilemmas in which the harm is a necessary means (as compared to a side effect) resulted in lower permissibility and higher intention ratings for both participants and language models. The same pattern was observed for evitable versus inevitable harmful outcomes. However, there was no clear effect of whether the harm resulted from an agent's action versus from having omitted to act. We discuss limitations of our prompt generation pipeline and opportunities for improving scenarios to increase the strength of experimental effects." | ||
image_preview = "" | ||
selected = false | ||
projects = [] | ||
url_pdf = "papers/franken2024rails.pdf" | ||
url_preprint = "" | ||
url_code = "" | ||
url_dataset = "" | ||
url_slides = "" | ||
url_video = "" | ||
url_poster = "" | ||
url_source = "" | ||
url_custom = [{name = "Github", url = "https://github.com/cicl-stanford/moral-evals/tree/main"}] | ||
math = true | ||
highlight = true | ||
[header] | ||
# image = "publications/franken2024rails.png" | ||
caption = "" | ||
+++ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Oops, something went wrong.