Commit 5225e9a: added sami paper

tobiasgerstenberg committed Apr 23, 2024
1 parent ed73c65

Showing 19 changed files with 773 additions and 68 deletions.
33 changes: 33 additions & 0 deletions content/publication/franken2024sami.md
@@ -0,0 +1,33 @@
+++
# 0 -> 'Forthcoming',
# 1 -> 'Preprint',
# 2 -> 'Journal',
# 3 -> 'Conference Proceedings',
# 4 -> 'Book chapter',
# 5 -> 'Thesis'

title = "Self-supervised alignment with mutual information: Learning to follow principles without preference labels"
date = "2024-04-22"
authors = ["J. Fränken","E. Zelikman","R. Rafailov","K. Gandhi","T. Gerstenberg","N. D. Goodman"]
publication_types = ["2"]
publication_short = "_arXiv_"
publication = "Fränken, J., Zelikman, E., Rafailov, R., Gandhi, K., Gerstenberg, T., Goodman, N. D. (2024). Self-supervised alignment with mutual information: Learning to follow principles without preference labels. _arXiv_."
abstract = "When prompting a language model (LM), users frequently expect the model to adhere to a set of behavioral principles across diverse tasks, such as producing insightful content while avoiding harmful or biased language. Instilling such principles into a model can be resource-intensive and technically challenging, generally requiring human preference labels or examples. We introduce SAMI, a method for teaching a pretrained LM to follow behavioral principles that does not require any preference labels or demonstrations. SAMI is an iterative algorithm that finetunes a pretrained LM to increase the conditional mutual information between constitutions and self-generated responses given queries from a datasest. On single-turn dialogue and summarization, a SAMI-trained mistral-7b outperforms the initial pretrained model, with win rates between 66% and 77%. Strikingly, it also surpasses an instruction-finetuned baseline (mistral-7b-instruct) with win rates between 55% and 57% on single-turn dialogue. SAMI requires a 'principle writer' model; to avoid dependence on stronger models, we further evaluate aligning a strong pretrained model (mixtral-8x7b) using constitutions written by a weak instruction-finetuned model (mistral-7b-instruct). The SAMI-trained mixtral-8x7b outperforms both the initial model and the instruction-finetuned model, achieving a 65% win rate on summarization. Our results indicate that a pretrained LM can learn to follow constitutions without using preference labels, demonstrations, or human oversight."
image_preview = ""
selected = false
projects = []
url_pdf = "papers/franken2024sami.pdf"
url_preprint = "https://arxiv.org/abs/2404.14313"
url_code = ""
url_dataset = ""
url_slides = ""
url_video = ""
url_poster = ""
url_source = ""
url_custom = [{name = "Github", url = "https://github.com/janphilippfranken/sami"}]
math = true
highlight = true
[header]
# image = "publications/franken2024sami.png"
caption = ""
+++
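
The abstract above describes SAMI's core move: finetuning a pretrained LM to increase the conditional mutual information between constitutions and self-generated responses. As a rough illustration of what such an objective can look like, here is a minimal InfoNCE-style sketch in Python. This is an assumption-laden toy, not the paper's implementation (see the linked GitHub repo for that): it presumes you have already sampled one response per constitution for a shared query and assembled a matrix `logp` where `logp[i, j]` is the model's log-probability of response j given constitution i; the function name `sami_loss` is hypothetical.

```python
import torch
import torch.nn.functional as F

def sami_loss(logp: torch.Tensor) -> torch.Tensor:
    """Contrastive lower bound on I(constitution; response | query).

    logp[i, j] = log p(response_j | constitution_i, query), with matched
    pairs on the diagonal (response_i was sampled under constitution_i).
    Minimizing the symmetric cross-entropy pushes each response to be
    most likely under the constitution that produced it.
    """
    n = logp.size(0)
    targets = torch.arange(n, device=logp.device)
    row_loss = F.cross_entropy(logp, targets)      # pick the right response for each constitution
    col_loss = F.cross_entropy(logp.t(), targets)  # pick the right constitution for each response
    return 0.5 * (row_loss + col_loss)
```

Iterating a loss of this shape over freshly self-generated batches is what makes the procedure label-free: the diagonal pairing of constitutions with their own sampled responses stands in for human preference annotations.
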
8 changes: 4 additions & 4 deletions docs/404.html
@@ -237,6 +237,10 @@ <h1>Page not found</h1>

<h2>Publications</h2>

<ul>
<li><a href="https://cicl.stanford.edu/publication/franken2024sami/">Self-supervised alignment with mutual information: Learning to follow principles without preference labels</a></li>
</ul>

<ul>
<li><a href="https://cicl.stanford.edu/publication/franken2024rails/">Procedural dilemma generation for evaluating moral reasoning in humans and language models</a></li>
</ul>
@@ -253,10 +257,6 @@ <h2>Publications</h2>
<li><a href="https://cicl.stanford.edu/publication/prinzing2024purpose/">From Artifacts to Human Lives: Investigating the Domain-Generality of Judgments about Purposes</a></li>
</ul>

<ul>
<li><a href="https://cicl.stanford.edu/publication/kirfel2023anticipating/">Anticipating the risks and benefits of counterfactual world simulation models</a></li>
</ul>




13 changes: 12 additions & 1 deletion docs/bibtex/cic_papers.bib
@@ -1,13 +1,24 @@
%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/
-%% Created for Tobias Gerstenberg at 2024-04-17 14:34:18 -0700
+%% Created for Tobias Gerstenberg at 2024-04-22 21:44:33 -0700
%% Saved with string encoding Unicode (UTF-8)
@article{franken2024sami,
abstract = {When prompting a language model (LM), users frequently expect the model to adhere to a set of behavioral principles across diverse tasks, such as producing insightful content while avoiding harmful or biased language. Instilling such principles into a model can be resource-intensive and technically challenging, generally requiring human preference labels or examples. We introduce SAMI, a method for teaching a pretrained LM to follow behavioral principles that does not require any preference labels or demonstrations. SAMI is an iterative algorithm that finetunes a pretrained LM to increase the conditional mutual information between constitutions and self-generated responses given queries from a dataset. On single-turn dialogue and summarization, a SAMI-trained mistral-7b outperforms the initial pretrained model, with win rates between 66% and 77%. Strikingly, it also surpasses an instruction-finetuned baseline (mistral-7b-instruct) with win rates between 55% and 57% on single-turn dialogue. SAMI requires a "principle writer" model; to avoid dependence on stronger models, we further evaluate aligning a strong pretrained model (mixtral-8x7b) using constitutions written by a weak instruction-finetuned model (mistral-7b-instruct). The SAMI-trained mixtral-8x7b outperforms both the initial model and the instruction-finetuned model, achieving a 65% win rate on summarization. Our results indicate that a pretrained LM can learn to follow constitutions without using preference labels, demonstrations, or human oversight.},
author = {Fr{\"a}nken, Jan-Philipp and Zelikman, Eric and Rafailov, Rafael and Gandhi, Kanishk and Gerstenberg, Tobias and Goodman, Noah D.},
date-added = {2024-04-22 21:44:23 -0700},
date-modified = {2024-04-22 21:44:23 -0700},
journal = {arXiv},
title = {{Self-supervised alignment with mutual information: Learning to follow principles without preference labels}},
url = {http://arxiv.org/abs/2404.14313},
year = {2024},
bdsk-url-1 = {http://arxiv.org/abs/2404.14313}}

@inproceedings{franken2024rails,
abstract = {As AI systems like language models are increasingly integrated into decision-making processes affecting people's lives, it's critical to ensure that these systems have sound moral reasoning. To test whether they do, we need to develop systematic evaluations. We provide a framework that uses a language model to translate causal graphs that capture key aspects of moral dilemmas into prompt templates. With this framework, we procedurally generated a large and diverse set of moral dilemmas---the OffTheRails benchmark---consisting of 50 scenarios and 400 unique test items. We collected moral permissibility and intention judgments from human participants for a subset of our items and compared these judgments to those from two language models (GPT-4 and Claude-2) across eight conditions. We find that moral dilemmas in which the harm is a necessary means (as compared to a side effect) resulted in lower permissibility and higher intention ratings for both participants and language models. The same pattern was observed for evitable versus inevitable harmful outcomes. However, there was no clear effect of whether the harm resulted from an agent's action versus from having omitted to act. We discuss limitations of our prompt generation pipeline and opportunities for improving scenarios to increase the strength of experimental effects.},
author = {Jan-Philipp Fr{\"a}nken and Kanishk Gandhi and Tori Qiu and Ayesha Khawaja and Noah D. Goodman and Tobias Gerstenberg},
2 changes: 1 addition & 1 deletion docs/index.html
@@ -110,7 +110,7 @@
<meta property="og:description" content="">
<meta property="og:locale" content="en-us">

<meta property="og:updated_time" content="2024-04-17T00:00:00&#43;00:00">
<meta property="og:updated_time" content="2024-04-22T00:00:00&#43;00:00">



20 changes: 10 additions & 10 deletions docs/index.xml
@@ -6,9 +6,18 @@
<generator>Hugo -- gohugo.io</generator>
<language>en-us</language>
<copyright>&amp;copy; 2024 Tobias Gerstenberg</copyright>
<lastBuildDate>Wed, 17 Apr 2024 00:00:00 +0000</lastBuildDate>
<lastBuildDate>Mon, 22 Apr 2024 00:00:00 +0000</lastBuildDate>
<atom:link href="/" rel="self" type="application/rss+xml" />

<item>
<title>Self-supervised alignment with mutual information: Learning to follow principles without preference labels</title>
<link>https://cicl.stanford.edu/publication/franken2024sami/</link>
<pubDate>Mon, 22 Apr 2024 00:00:00 +0000</pubDate>

<guid>https://cicl.stanford.edu/publication/franken2024sami/</guid>
<description></description>
</item>

<item>
<title>Procedural dilemma generation for evaluating moral reasoning in humans and language models</title>
<link>https://cicl.stanford.edu/publication/franken2024rails/</link>
@@ -135,14 +144,5 @@
<description></description>
</item>

<item>
<title>Making a positive difference: Criticality in groups</title>
<link>https://cicl.stanford.edu/publication/gerstenberg2023criticality/</link>
<pubDate>Wed, 14 Jun 2023 00:00:00 +0000</pubDate>

<guid>https://cicl.stanford.edu/publication/gerstenberg2023criticality/</guid>
<description></description>
</item>

</channel>
</rss>
47 changes: 47 additions & 0 deletions docs/member/tobias_gerstenberg/index.html
@@ -356,6 +356,53 @@ <h2 id="publications">Publications</h2>


<div class="pub-list-item" style="margin-bottom: 1rem" itemscope itemtype="http://schema.org/CreativeWork">
<span itemprop="author">
J. Fränken, E. Zelikman, R. Rafailov, K. Gandhi, T. Gerstenberg, N. D. Goodman</span>

(2024).

<a href="https://cicl.stanford.edu/publication/franken2024sami/" itemprop="name">Self-supervised alignment with mutual information: Learning to follow principles without preference labels</a>.
<em>arXiv</em>.




<p>




<a class="btn btn-outline-primary my-1 mr-1 btn-sm" href="https://arxiv.org/abs/2404.14313" target="_blank" rel="noopener">
Preprint
</a>


<a class="btn btn-outline-primary my-1 mr-1 btn-sm" href="https://cicl.stanford.edu/papers/franken2024sami.pdf" target="_blank" rel="noopener">
PDF
</a>














<a class="btn btn-outline-primary my-1 mr-1 btn-sm" href="https://github.com/janphilippfranken/sami" target="_blank" rel="noopener">
Github
</a>


</p>

</div>
<div class="pub-list-item" style="margin-bottom: 1rem" itemscope itemtype="http://schema.org/CreativeWork">
<span itemprop="author">
J. Fränken, K. Gandhi, T. Qiu, A. Khawaja, N. D. Goodman, T. Gerstenberg</span>

Binary file added docs/papers/franken2024sami.pdf
Binary file not shown.