Commit 5ec1e0b (1 parent: b16136a)
Showing 35 changed files with 2,385 additions and 358 deletions.
@@ -0,0 +1,33 @@
+++
# 0 -> 'Forthcoming',
# 1 -> 'Preprint',
# 2 -> 'Journal',
# 3 -> 'Conference Proceedings',
# 4 -> 'Book chapter',
# 5 -> 'Thesis'

title = "Off The Rails: Procedural Dilemma Generation for Moral Reasoning"
date = "2023-10-30"
authors = ["J. Fränken", "A. Khawaja", "K. Gandhi", "J. Moore", "N. D. Goodman", "T. Gerstenberg"]
publication_types = ["3"]
publication_short = "_AI Meets Moral Philosophy and Moral Psychology Workshop (NeurIPS 2023)_"
publication = "Fränken J., Khawaja A., Gandhi K., Moore J., Goodman N. D., Gerstenberg T. (2023). Off The Rails: Procedural Dilemma Generation for Moral Reasoning. In _AI Meets Moral Philosophy and Moral Psychology Workshop (NeurIPS 2023)_."
abstract = "As AI systems like language models are increasingly integrated into making decisions that affect people, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic evaluations. Recent work has introduced a method for procedurally generating LLM evaluations from abstract causal templates, and tested this method in the context of social reasoning (i.e., theory-of-mind). In this paper, we extend this method to the domain of moral dilemmas. We develop a framework that translates causal graphs into a prompt template which can then be used to procedurally generate a large and diverse set of moral dilemmas using a language model. Using this framework, we created the OffTheRails dataset, which consists of 50 scenarios and 500 unique test items. We evaluated the quality of our model-written test items using two independent human experts and found that 90% of the test items met the desired structure. We collected moral permissibility and intention judgments from 100 human crowdworkers and compared these judgments with those from GPT-4 and Claude-2 across eight control conditions. Both humans and GPT-4 assigned higher intentionality to agents when a harmful outcome was evitable and a necessary means. However, our findings did not match previous findings on permissibility judgments. This difference may be a result of not controlling the severity of harmful outcomes during scenario generation. We conclude by discussing future extensions of our benchmark to address this limitation."
image_preview = ""
selected = false
projects = []
#url_pdf = "papers/franken2023rails.pdf"
url_preprint = ""
url_code = ""
url_dataset = ""
url_slides = ""
url_video = ""
url_poster = ""
url_source = ""
#url_custom = [{name = "Github", url = ""}]
math = true
highlight = true
[header]
# image = "publications/franken2023rails.png"
caption = ""
+++
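As an aside, each of these files is Hugo front matter delimited by +++, and publication_types stores one of the codes from the legend comments as a string. A minimal sketch of resolving those codes to labels, assuming Python 3.11+ for the standard-library tomllib module (the function names here are illustrative, not part of this repository):

import tomllib  # standard library on Python 3.11+

# Legend copied from the front matter comments above.
PUBLICATION_TYPES = {
    "0": "Forthcoming",
    "1": "Preprint",
    "2": "Journal",
    "3": "Conference Proceedings",
    "4": "Book chapter",
    "5": "Thesis",
}

def parse_front_matter(text: str) -> dict:
    """Parse the TOML between the first pair of +++ fences."""
    return tomllib.loads(text.split("+++")[1])

def publication_labels(meta: dict) -> list[str]:
    """Map each publication_types code to its human-readable label."""
    return [PUBLICATION_TYPES.get(code, "Unknown")
            for code in meta.get("publication_types", [])]

For the file above, publication_labels(parse_front_matter(src)) would return ["Conference Proceedings"].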
@@ -0,0 +1,33 @@
+++
# 0 -> 'Forthcoming',
# 1 -> 'Preprint',
# 2 -> 'Journal',
# 3 -> 'Conference Proceedings',
# 4 -> 'Book chapter',
# 5 -> 'Thesis'

title = "Social Contract AI: Aligning AI Assistants with Implicit Group Norms"
date = "2023-10-30"
authors = ["J. Fränken", "S. Kwok", "P. Ye", "K. Gandhi", "D. Arumugam", "J. Moore", "A. Tamkin", "T. Gerstenberg", "N. D. Goodman"]
publication_types = ["3"]
publication_short = "_Socially Responsible Language Modelling Research Workshop (NeurIPS 2023)_"
publication = "Fränken J., Kwok S., Ye P., Gandhi K., Arumugam D., Moore J., Tamkin A., Gerstenberg T., Goodman N. D. (2023). Social Contract AI: Aligning AI Assistants with Implicit Group Norms. In _Socially Responsible Language Modelling Research Workshop (NeurIPS 2023)_."
abstract = "We explore the idea of aligning an AI assistant by inverting a model of users' (unknown) preferences from observed interactions. To validate our proposal, we run proof-of-concept simulations in the economic ultimatum game, formalizing user preferences as policies that guide the actions of simulated players. We find that the AI assistant accurately aligns its behavior to match standard policies from the economic literature (e.g., selfish, altruistic). However, the assistant's learned policies lack robustness and exhibit limited generalization in an out-of-distribution setting when confronted with a currency (e.g., grams of medicine) that was not included in the assistant's training distribution. Additionally, we find that when there is inconsistency in the relationship between language use and an unknown policy (e.g., an altruistic policy combined with rude language), the assistant's learning of the policy is slowed. Overall, our preliminary results suggest that developing simulation frameworks in which AI assistants need to infer preferences from diverse users can provide a valuable approach for studying practical alignment questions."
image_preview = ""
selected = false
projects = []
url_pdf = "papers/franken2023social.pdf"
url_preprint = "https://arxiv.org/abs/2310.17769"
url_code = ""
url_dataset = ""
url_slides = ""
url_video = ""
url_poster = ""
url_source = ""
url_custom = [{name = "Github", url = "https://github.com/janphilippfranken/scai/tree/release"}]
math = true
highlight = true
[header]
# image = "publications/franken2023social.png"
caption = ""
+++
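One difference from the other two files: url_custom is active here rather than commented out. It is a TOML array of inline tables, so a hypothetical helper (not part of this commit) could render each entry as a markdown link:

def custom_links(meta: dict) -> list[str]:
    """Turn url_custom entries like {name = "Github", url = "..."} into markdown links."""
    return [f"[{entry['name']}]({entry['url']})" for entry in meta.get("url_custom", [])]

Applied to this file's metadata, it would yield ["[Github](https://github.com/janphilippfranken/scai/tree/release)"].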
@@ -0,0 +1,33 @@
+++
# 0 -> 'Forthcoming',
# 1 -> 'Preprint',
# 2 -> 'Journal',
# 3 -> 'Conference Proceedings',
# 4 -> 'Book chapter',
# 5 -> 'Thesis'

title = "Anticipating the risks and benefits of counterfactual world simulation models"
date = "2023-10-30"
authors = ["L. Kirfel", "R. J. MacCoun", "T. Icard", "T. Gerstenberg"]
publication_types = ["3"]
publication_short = "_AI Meets Moral Philosophy and Moral Psychology Workshop (NeurIPS 2023)_"
publication = "Kirfel L., MacCoun R. J., Icard T., Gerstenberg T. (2023). Anticipating the risks and benefits of counterfactual world simulation models. In _AI Meets Moral Philosophy and Moral Psychology Workshop (NeurIPS 2023)_."
abstract = "This paper examines the transformative potential of Counterfactual World Simulation Models (CWSMs). A CWSM uses multi-modal evidence, such as the CCTV footage of a road accident, to build a high-fidelity 3D reconstruction of what happened. It can answer causal questions, such as whether the accident happened because the driver was speeding, by simulating what would have happened in relevant counterfactual situations. We argue for a normative and ethical framework that guides and constrains the simulation of counterfactuals. We address the challenge of ensuring fidelity in reconstructions while simultaneously preventing stereotype perpetuation during counterfactual simulations. We anticipate different modes of how users will interact with CWSMs and discuss how their outputs may be presented. Finally, we address the prospective applications of CWSMs in the legal domain, recognizing both their potential to revolutionize legal proceedings as well as the ethical concerns they engender. Sketching a new genre of AI, this paper seeks to illuminate the path forward for responsible and effective use of CWSMs."
image_preview = ""
selected = false
projects = []
#url_pdf = "papers/kirfel2023anticipating.pdf"
url_preprint = ""
url_code = ""
url_dataset = ""
url_slides = ""
url_video = ""
url_poster = ""
url_source = ""
#url_custom = [{name = "Github", url = ""}]
math = true
highlight = true
[header]
# image = "publications/kirfel2023anticipating.png"
caption = ""
+++