diff --git a/content/publication/franken2024sami.md b/content/publication/franken2024sami.md
index b02dbd7..eefe8e5 100644
--- a/content/publication/franken2024sami.md
+++ b/content/publication/franken2024sami.md
@@ -9,9 +9,9 @@
 title = "Self-supervised alignment with mutual information: Learning to follow principles without preference labels"
 date = "2024-04-22"
 authors = ["J. Fränken","E. Zelikman","R. Rafailov","K. Gandhi","T. Gerstenberg","N. D. Goodman"]
-publication_types = ["1"]
-publication_short = "_arXiv_"
-publication = "Fränken, J., Zelikman, E., Rafailov, R., Gandhi, K., Gerstenberg, T., Goodman, N. D. (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. _arXiv_."
+publication_types = ["3"]
+publication_short = "_Advances in Neural Information Processing Systems_"
+publication = "Fränken, J., Zelikman, E., Rafailov, R., Gandhi, K., Gerstenberg, T., Goodman, N. D. (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. _Advances in Neural Information Processing Systems_."
 abstract = "When prompting a language model (LM), users frequently expect the model to adhere to a set of behavioral principles across diverse tasks, such as producing insightful content while avoiding harmful or biased language. Instilling such principles into a model can be resource-intensive and technically challenging, generally requiring human preference labels or examples. We introduce SAMI, a method for teaching a pretrained LM to follow behavioral principles that does not require any preference labels or demonstrations. SAMI is an iterative algorithm that finetunes a pretrained LM to increase the conditional mutual information between constitutions and self-generated responses given queries from a dataset. On single-turn dialogue and summarization, a SAMI-trained mistral-7b outperforms the initial pretrained model, with win rates between 66% and 77%. Strikingly, it also surpasses an instruction-finetuned baseline (mistral-7b-instruct) with win rates between 55% and 57% on single-turn dialogue. SAMI requires a 'principle writer' model; to avoid dependence on stronger models, we further evaluate aligning a strong pretrained model (mixtral-8x7b) using constitutions written by a weak instruction-finetuned model (mistral-7b-instruct). The SAMI-trained mixtral-8x7b outperforms both the initial model and the instruction-finetuned model, achieving a 65% win rate on summarization. Our results indicate that a pretrained LM can learn to follow constitutions without using preference labels, demonstrations, or human oversight."
image_preview = "" selected = false diff --git a/content/publication/gandhi2024affective.md b/content/publication/gandhi2024affective.md index 6fa1d3b..98bc334 100644 --- a/content/publication/gandhi2024affective.md +++ b/content/publication/gandhi2024affective.md @@ -16,7 +16,7 @@ abstract = "Understanding emotions is fundamental to human interaction and exper image_preview = "" selected = false projects = [] -#url_pdf = "papers/gandhi2024affective.pdf" +url_pdf = "papers/gandhi2024affective.pdf" url_preprint = "https://arxiv.org/abs/2409.11733" url_code = "" url_dataset = "" diff --git a/content/publication/jin2024marple.md b/content/publication/jin2024marple.md new file mode 100644 index 0000000..3fdb79c --- /dev/null +++ b/content/publication/jin2024marple.md @@ -0,0 +1,33 @@ ++++ +# 0 -> 'Forthcoming', +# 1 -> 'Preprint', +# 2 -> 'Journal', +# 3 -> 'Conference Proceedings', +# 4 -> 'Book chapter', +# 5 -> 'Thesis' + +title = "MARPLE: A Benchmark for Long-Horizon Inference" +date = "2024-10-04" +authors = ["E. Jin","Z. Huang","J. Fränken","W. Liu","H. Cha","E. Brockbank","S. Wu","R. Zhang","J. Wu","T. Gerstenberg"] +publication_types = ["3"] +publication_short = "_Advances in Neural Information Processing Systems_" +publication = "Jin, E., Huang, Z., Fränken, J., Liu, W., Cha, H., Brockbank, E., Wu, S., Zhang, R., Wu, J., Gerstenberg, T. (2024). MARPLE: A Benchmark for Long-Horizon Inference. _Advances in Neural Information Processing Systems_." +abstract = "Reconstructing past events requires reasoning across long time horizons. To figure out what happened, we need to use our prior knowledge about the world and human behavior and draw inferences from various sources of evidence including visual, language, and auditory cues. We introduce MARPLE, a benchmark for evaluating long-horizon inference capabilities using multi-modal evidence. Our benchmark features agents interacting with simulated households, supporting vision, language, and auditory stimuli, as well as procedurally generated environments and agent behaviors. Inspired by classic ``whodunit'' stories, we ask AI models and human participants to infer which agent caused a change in the environment based on a step-by-step replay of what actually happened. The goal is to correctly identify the culprit as early as possible. Our findings show that human participants outperform both traditional Monte Carlo simulation methods and an LLM baseline (GPT-4) on this task. Compared to humans, traditional inference models are less robust and performant, while GPT-4 has difficulty comprehending environmental changes. We analyze what factors influence inference performance and ablate different modes of evidence, finding that all modes are valuable for performance. Overall, our experiments demonstrate that the long-horizon, multimodal inference tasks in our benchmark present a challenge to current models. Project website: https://marple-benchmark.github.io/." +image_preview = "" +selected = false +projects = [] +url_pdf = "papers/jin2024marple.pdf" +url_preprint = "http://arxiv.org/abs/2410.01926" +url_code = "" +url_dataset = "" +url_slides = "" +url_video = "" +url_poster = "" +url_source = "" +url_custom = [{name = "Project website", url = "https://marple-benchmark.github.io/"}] +math = true +highlight = true +[header] +# image = "publications/jin2024marple.png" +caption = "" ++++ \ No newline at end of file diff --git a/docs/404.html b/docs/404.html index d06c8d8..d46c2d6 100644 --- a/docs/404.html +++ b/docs/404.html @@ -237,6 +237,10 @@
diff --git a/docs/bibtex/cic_papers.bib b/docs/bibtex/cic_papers.bib
index d52d0fd..7023118 100644
--- a/docs/bibtex/cic_papers.bib
+++ b/docs/bibtex/cic_papers.bib
@@ -1,13 +1,24 @@
 %% This BibTeX bibliography file was created using BibDesk.
 %% https://bibdesk.sourceforge.io/
-%% Created for Tobias Gerstenberg at 2024-09-20 15:09:53 -0700
+%% Created for Tobias Gerstenberg at 2024-10-04 08:51:52 -0700
 %% Saved with string encoding Unicode (UTF-8)
+@article{jin2024marple,
+  abstract = {Reconstructing past events requires reasoning across long time horizons. To figure out what happened, we need to use our prior knowledge about the world and human behavior and draw inferences from various sources of evidence including visual, language, and auditory cues. We introduce MARPLE, a benchmark for evaluating long-horizon inference capabilities using multi-modal evidence. Our benchmark features agents interacting with simulated households, supporting vision, language, and auditory stimuli, as well as procedurally generated environments and agent behaviors. Inspired by classic ``whodunit'' stories, we ask AI models and human participants to infer which agent caused a change in the environment based on a step-by-step replay of what actually happened. The goal is to correctly identify the culprit as early as possible. Our findings show that human participants outperform both traditional Monte Carlo simulation methods and an LLM baseline (GPT-4) on this task. Compared to humans, traditional inference models are less robust and performant, while GPT-4 has difficulty comprehending environmental changes. We analyze what factors influence inference performance and ablate different modes of evidence, finding that all modes are valuable for performance. Overall, our experiments demonstrate that the long-horizon, multimodal inference tasks in our benchmark present a challenge to current models. Project website: https://marple-benchmark.github.io/.},
+  annote = {Comment: NeurIPS 2024. First two authors contributed equally. Project page: https://marple-benchmark.github.io/},
+  author = {Jin, Emily and Huang, Zhuoyi and Fr{\"a}nken, Jan-Philipp and Liu, Weiyu and Cha, Hannah and Brockbank, Erik and Wu, Sarah and Zhang, Ruohan and Wu, Jiajun and Gerstenberg, Tobias},
+  date-added = {2024-10-04 08:51:51 -0700},
+  date-modified = {2024-10-04 08:51:51 -0700},
+  journal = {arXiv},
+  note = {http://arxiv.org/abs/2410.01926},
+  title = {{MARPLE: A Benchmark for Long-Horizon Inference}},
+  year = {2024}}
+
 @article{gandhi2024affective,
   abstract = {Understanding emotions is fundamental to human interaction and experience. Humans easily infer emotions from situations or facial expressions, situations from emotions, and do a variety of other affective cognition. How adept is modern AI at these inferences? We introduce an evaluation framework for testing affective cognition in foundation models. Starting from psychological theory, we generate 1,280 diverse scenarios exploring relationships between appraisals, emotions, expressions, and outcomes. We evaluate the abilities of foundation models (GPT-4, Claude-3, Gemini-1.5-Pro) and humans (N = 567) across carefully selected conditions. Our results show foundation models tend to agree with human intuitions, matching or exceeding interparticipant agreement. In some conditions, models are ``superhuman'' -- they better predict modal human judgements than the average human. All models benefit from chain-of-thought reasoning.
This suggests foundation models have acquired a human-like understanding of emotions and their influence on beliefs and behavior.}, author = {Kanishk Gandhi and Zoe Lynch and Jan-Philipp Fr{\"a}nken and Kayla Patterson and Sharon Wambu and Tobias Gerstenberg and Desmond C. Ong and Noah D. Goodman}, diff --git a/docs/index.html b/docs/index.html index ed450d8..57b7851 100644 --- a/docs/index.html +++ b/docs/index.html @@ -110,7 +110,7 @@ - + diff --git a/docs/index.xml b/docs/index.xml index 90f077e..69171ad 100644 --- a/docs/index.xml +++ b/docs/index.xml @@ -6,9 +6,18 @@ Hugo -- gohugo.io en-us © 2024 Tobias Gerstenberg - Fri, 20 Sep 2024 00:00:00 +0000 + Fri, 04 Oct 2024 00:00:00 +0000 + + MARPLE: A Benchmark for Long-Horizon Inference + https://cicl.stanford.edu/publication/jin2024marple/ + Fri, 04 Oct 2024 00:00:00 +0000 + + https://cicl.stanford.edu/publication/jin2024marple/ + + + Causation, Meaning, and Communication https://cicl.stanford.edu/publication/beller2024causation/ @@ -135,14 +144,5 @@ - - STaR-GATE: Teaching Language Models to Ask Clarifying Questions - https://cicl.stanford.edu/publication/andukuri2024stargate/ - Sun, 31 Mar 2024 00:00:00 +0000 - - https://cicl.stanford.edu/publication/andukuri2024stargate/ - - - diff --git a/docs/member/tobias_gerstenberg/index.html b/docs/member/tobias_gerstenberg/index.html index c49cc68..1638089 100644 --- a/docs/member/tobias_gerstenberg/index.html +++ b/docs/member/tobias_gerstenberg/index.html @@ -356,6 +356,53 @@

    Publications

    + + + (2024). + + MARPLE: A Benchmark for Long-Horizon Inference. + Advances in Neural Information Processing Systems. + + + + +

    + + + + + + Preprint + + + + + PDF + + + + + + + + + + + + + + + + + Project website + + + +

    + +
    +
    @@ -420,6 +467,10 @@

    Publications

    + + PDF + + @@ -926,7 +977,7 @@

    Publications

    (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. - arXiv. + Advances in Neural Information Processing Systems. diff --git a/docs/papers/gandhi2024affective.pdf b/docs/papers/gandhi2024affective.pdf new file mode 100644 index 0000000..fb4c39e Binary files /dev/null and b/docs/papers/gandhi2024affective.pdf differ diff --git a/docs/papers/jin2024marple.pdf b/docs/papers/jin2024marple.pdf new file mode 100644 index 0000000..47d575c Binary files /dev/null and b/docs/papers/jin2024marple.pdf differ diff --git a/docs/publication/franken2024sami/index.html b/docs/publication/franken2024sami/index.html index 9dce50f..61a0a2b 100644 --- a/docs/publication/franken2024sami/index.html +++ b/docs/publication/franken2024sami/index.html @@ -266,8 +266,8 @@

    Abstract

    Type
    @@ -283,7 +283,7 @@

    Abstract

    Publication
    -
    Fränken, J., Zelikman, E., Rafailov, R., Gandhi, K., Gerstenberg, T., Goodman, N. D. (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. arXiv.
    +
    Fränken, J., Zelikman, E., Rafailov, R., Gandhi, K., Gerstenberg, T., Goodman, N. D. (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. Advances in Neural Information Processing Systems.
    diff --git a/docs/publication/gandhi2024affective/index.html b/docs/publication/gandhi2024affective/index.html index e14f2f5..a316eab 100644 --- a/docs/publication/gandhi2024affective/index.html +++ b/docs/publication/gandhi2024affective/index.html @@ -320,6 +320,10 @@

    Abstract

    + + PDF + + diff --git a/docs/publication/index.html b/docs/publication/index.html index 5b84bc1..f39141b 100644 --- a/docs/publication/index.html +++ b/docs/publication/index.html @@ -1597,6 +1597,19 @@

    Publications

    + + + + + + + + + + + + + @@ -1733,6 +1746,65 @@

    Publications

    +
    + +
    + + + (2024). + + MARPLE: A Benchmark for Long-Horizon Inference. + Advances in Neural Information Processing Systems. + + + + +

    + + + + + + Preprint + + + + + PDF + + + + + + + + + + + + + + + + + Project website + + + +

    + +
    + + +
    + + + + + + +
    @@ -1812,6 +1884,10 @@

    Publications

    + + PDF + + @@ -2441,7 +2517,7 @@

    Publications

    -
    +
    -
    -

    Self-supervised alignment with mutual information: Learning to follow principles without preference labels

    -
    - - When prompting a language model (LM), users frequently expect the model to adhere to a set of behavioral principles across diverse tasks, such as producing insightful content while avoiding harmful or biased language. Instilling such principles into … - -
    -
    -

    STaR-GATE: Teaching Language Models to Ask Clarifying Questions

    diff --git a/docs/publication_types/1/index.xml b/docs/publication_types/1/index.xml index dd353ba..5475ab9 100644 --- a/docs/publication_types/1/index.xml +++ b/docs/publication_types/1/index.xml @@ -39,15 +39,6 @@ - - Self-supervised alignment with mutual information: Learning to follow principles without preference labels - https://cicl.stanford.edu/publication/franken2024sami/ - Mon, 22 Apr 2024 00:00:00 +0000 - - https://cicl.stanford.edu/publication/franken2024sami/ - - - STaR-GATE: Teaching Language Models to Ask Clarifying Questions https://cicl.stanford.edu/publication/andukuri2024stargate/ diff --git a/docs/publication_types/3/index.html b/docs/publication_types/3/index.html index 861dbc2..5a605e6 100644 --- a/docs/publication_types/3/index.html +++ b/docs/publication_types/3/index.html @@ -111,7 +111,7 @@ - + @@ -238,6 +238,15 @@

    3

    +
    +

    MARPLE: A Benchmark for Long-Horizon Inference

    +
    + + Reconstructing past events requires reasoning across long time horizons. To figure out what happened, we need to use our prior knowledge about the world and human behavior and draw inferences from various sources of evidence including visual, … + +
    +
    +

    Whodunnit? Inferring what happened from multimodal evidence

    @@ -302,28 +311,19 @@

    Resource-rat

    -

    Procedural dilemma generation for evaluating moral reasoning in humans and language models

    -
    - - As AI systems like language models are increasingly integrated into decision-making processes affecting people's lives, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic … - -
    -
    - -
    -

    Anticipating the risks and benefits of counterfactual world simulation models

    +

    Self-supervised alignment with mutual information: Learning to follow principles without preference labels

    - This paper examines the transformative potential of Counterfactual World Simulation Models (CWSMs). CWSMs use pieces of multi-modal evidence, such as the CCTV footage or sound recordings of a road accident, to build a high-fidelity 3D reconstruction … + When prompting a language model (LM), users frequently expect the model to adhere to a set of behavioral principles across diverse tasks, such as producing insightful content while avoiding harmful or biased language. Instilling such principles into …
    -

    Off The Rails: Procedural Dilemma Generation for Moral Reasoning

    +

    Procedural dilemma generation for evaluating moral reasoning in humans and language models

    - As AI systems like language models are increasingly integrated into making decisions that affect people, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic evaluations. Recent … + As AI systems like language models are increasingly integrated into decision-making processes affecting people's lives, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic …
    diff --git a/docs/publication_types/3/index.xml b/docs/publication_types/3/index.xml index 485b729..1c53827 100644 --- a/docs/publication_types/3/index.xml +++ b/docs/publication_types/3/index.xml @@ -7,11 +7,20 @@ Hugo -- gohugo.io en-us © 2024 Tobias Gerstenberg - Mon, 13 May 2024 00:00:00 +0000 + Fri, 04 Oct 2024 00:00:00 +0000 + + MARPLE: A Benchmark for Long-Horizon Inference + https://cicl.stanford.edu/publication/jin2024marple/ + Fri, 04 Oct 2024 00:00:00 +0000 + + https://cicl.stanford.edu/publication/jin2024marple/ + + + Whodunnit? Inferring what happened from multimodal evidence https://cicl.stanford.edu/publication/wu2024whodunnit/ @@ -75,6 +84,15 @@ + + Self-supervised alignment with mutual information: Learning to follow principles without preference labels + https://cicl.stanford.edu/publication/franken2024sami/ + Mon, 22 Apr 2024 00:00:00 +0000 + + https://cicl.stanford.edu/publication/franken2024sami/ + + + Procedural dilemma generation for evaluating moral reasoning in humans and language models https://cicl.stanford.edu/publication/franken2024rails/ diff --git a/docs/publication_types/3/page/2/index.html b/docs/publication_types/3/page/2/index.html index 8d4694b..1465bb3 100644 --- a/docs/publication_types/3/page/2/index.html +++ b/docs/publication_types/3/page/2/index.html @@ -111,7 +111,7 @@ - + @@ -238,6 +238,24 @@

    3

    +
    +

    Anticipating the risks and benefits of counterfactual world simulation models

    +
    + + This paper examines the transformative potential of Counterfactual World Simulation Models (CWSMs). CWSMs use pieces of multi-modal evidence, such as the CCTV footage or sound recordings of a road accident, to build a high-fidelity 3D reconstruction … + +
    +
    + +
    +

    Off The Rails: Procedural Dilemma Generation for Moral Reasoning

    +
    + + As AI systems like language models are increasingly integrated into making decisions that affect people, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic evaluations. Recent … + +
    +
    + -
    -

    A Semantics for Causing, Enabling, and Preventing Verbs Using Structural Causal Models

    -
    - - When choosing how to describe what happened, we have a number of causal verbs at our disposal. In this paper, we develop a model-theoretic formal semantics for nine causal verbs that span the categories of CAUSE, ENABLE, and PREVENT. We use … - -
    -
    - -
    -

    Causal Reasoning Across Agents and Objects

    -
    - - This work attempts to bridge the divide between accounts of causal reasoning with respect to agents and objects. We begin by examining the influence of animacy. In a collision-based context, we vary the animacy status of an object using 3D … - -
    -
    -