From 8610267d6838b2ee9f678f547fc6588a59292cc6 Mon Sep 17 00:00:00 2001 From: Quarto GHA Workflow Runner Date: Wed, 29 Nov 2023 03:44:20 +0000 Subject: [PATCH] Built site for gh-pages --- .nojekyll | 2 +- index.html | 2 +- posts/catalog.html | 2 +- posts/catalog.out.ipynb | 10 +- posts/fight_the_illusion.html | 54 ++--- ...ion.out.ipynb => fight_the_illusion.ipynb} | 186 ++++++++---------- sitemap.xml | 4 +- 7 files changed, 120 insertions(+), 140 deletions(-) rename posts/{fight_the_illusion.out.ipynb => fight_the_illusion.ipynb} (60%) diff --git a/.nojekyll b/.nojekyll index 70f1b3d..9f16e9d 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -735c29ea \ No newline at end of file +fd316aa3 \ No newline at end of file diff --git a/index.html b/index.html index f190839..5a5dd73 100644 --- a/index.html +++ b/index.html @@ -143,7 +143,7 @@
-
+

diff --git a/posts/catalog.html b/posts/catalog.html index e0b3ae2..d421d9f 100644 --- a/posts/catalog.html +++ b/posts/catalog.html @@ -798,7 +798,7 @@

GitHub

});
- + diff --git a/posts/catalog.out.ipynb b/posts/catalog.out.ipynb index 2932f68..7850d3b 100644 --- a/posts/catalog.out.ipynb +++ b/posts/catalog.out.ipynb @@ -297,7 +297,7 @@ "Pythia-12B is miscalibrated on 20% of the bigrams and 45% of the\n", "trigrams when we ask for prediction of $p \\geq 0.45$." ], - "id": "eb864443-abf0-47c9-98d3-9d4872a4decd" + "id": "8166b4e1-fe28-4a80-b13b-40e262561d90" }, { "cell_type": "code", @@ -313,7 +313,7 @@ } ], "source": [], - "id": "fb8e8227-8624-431c-9177-b82f0164343f" + "id": "9a5d22eb-e826-4e06-8620-b2e9b3e9812e" }, { "cell_type": "markdown", @@ -375,7 +375,7 @@ "The dataset is available on Huggingface:\n", "[pile_scan_4](https://huggingface.co/datasets/Confirm-Labs/pile_scan_4)" ], - "id": "bb1d4eed-4a7c-48aa-9d74-f50efc4f485f" + "id": "c2618577-5464-4db9-a381-ebcb860c4b24" }, { "cell_type": "code", @@ -389,7 +389,7 @@ } ], "source": [], - "id": "ed447629-d472-4afa-8683-d77f1bd49513" + "id": "d49b8ca6-83a7-4d0e-93a6-c071bdeb33d2" }, { "cell_type": "markdown", @@ -419,7 +419,7 @@ "Charles Foster, Jason Phang, et al. 2020. “The Pile: An 800GB Dataset of\n", "Diverse Text for Language Modeling.” *arXiv Preprint arXiv:2101.00027*." ], - "id": "1c6bcc70-6c37-4f36-b6c8-b3cb3d7527ce" + "id": "9be8445b-e0de-44ee-add5-30d2114b7acc" } ], "nbformat": 4, diff --git a/posts/fight_the_illusion.html b/posts/fight_the_illusion.html index 3361e82..3b844bc 100644 --- a/posts/fight_the_illusion.html +++ b/posts/fight_the_illusion.html @@ -107,7 +107,7 @@
+

Other Formats

@@ -162,36 +162,38 @@

6 Ways to Fight the Interpretability Illusion

* Source doc: 6 ways to fight the Interpretability illusion ----->

Recommended pre-reading:

-

Atticus Geiger’s DAS and Boundless DAS. Lesswrong post An Interpretability Illusion for Activation Patching of Arbitrary Subspaces. Corresponding ICLR paper, “Is This the Subspace You Are Looking For?

-

__
-
-This post is motivated by Lange, Makelov, and Nanda’s lesswrong post Interpretability Illusion for Activation Patching and ICLR paper. They study Geiger et al’s DAS method, which uses optimization to identify an abstracted causal model with a small subset of dimensions in a neural network’s residual stream or internal MLP layer. Their results show that DAS can, depending on the situation, turn up both “correct” and spurious” findings on the train-set. From the investigations in the ICLR paper and conversations with a few researchers, my understanding is these “spurious” directions have not performed well on held-out generalization sets, so in practice it is easy to distinguish the “illusions” from “real effects”. But, I am interested in developing even stronger optimize-to-interpret methods. With more powerful optimizers, illusion effects should be even stronger, and competition from “spurious” signals may make “true” signals harder to locate in training. So, here are 6 possible ways to fight against the interpretability illusion. Most of them can be tried in combination.

-
    -
  1. The causal model still holds, and may still be what we want. I.e.: We call it an interpretability “illusion” because we are failing to describe the model’s normal functioning. But unusual functioning is fine for some goals! Applications include:

    + +

    __

    +

    This post is motivated by Lange, Makelov, and Nanda’s LessWrong post Interpretability Illusion for Activation Patching and ICLR paper. They study Geiger et al’s DAS method, which uses optimization to identify an abstracted causal model with a small subset of dimensions in a neural network’s residual stream or internal MLP layer. Their results show that DAS can, depending on the situation, turn up both “correct” and “spurious” findings on the train-set. From the investigations in the ICLR paper and conversations with a few researchers, my understanding is these “spurious” directions have not performed well on held-out generalization sets, so in practice it is easy to distinguish the “illusions” from “real effects”. But, I am interested in developing even stronger optimize-to-interpret methods. With more powerful optimizers, illusion effects should be even stronger, and competition from spurious signals may make true signals harder to locate in training. So, here are 6 possible ways to fight against the interpretability illusion. Most of them can be tried in combination.
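To make the setup concrete, a minimal sketch of a DAS-style interchange intervention over a learned subspace follows. This is a toy under stated assumptions: `hidden_dim`, `subspace_dim`, `model_tail`, and `y_counterfactual` are hypothetical names, and the code illustrates the idea rather than reproducing the reference DAS implementation.

```python
# Minimal sketch of a DAS-style interchange intervention (illustrative;
# not the reference implementation from Geiger et al.).
import torch
import torch.nn as nn
from torch.nn.utils.parametrizations import orthogonal

hidden_dim, subspace_dim = 64, 4  # hypothetical sizes

# Learned orthogonal map; its first `subspace_dim` rotated coordinates
# serve as the candidate subspace for the abstract causal variable.
rotation = orthogonal(nn.Linear(hidden_dim, hidden_dim, bias=False))

def interchange(h_base, h_source):
    """Patch the learned subspace of h_source into h_base."""
    z_base, z_source = rotation(h_base), rotation(h_source)
    z_patched = torch.cat(
        [z_source[..., :subspace_dim], z_base[..., subspace_dim:]], dim=-1
    )
    return z_patched @ rotation.weight  # orthogonal inverse = transpose

# Training maximizes interchange intervention accuracy (IIA): the patched
# hidden state, run through the rest of the model, should produce the
# counterfactual output that the abstract causal model predicts, e.g.
#   loss = F.cross_entropy(model_tail(interchange(h_b, h_s)), y_counterfactual)
```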

      -
    1. Finding latent circuits which might be targetable by optimized non-routine inputs (e.g. redteaming)
    2. -
    3. “Pinning” a false belief into the model, for testing or alignment training [e.g., forcing the model to believe it is not being watched, in order to test deception or escape behavior].
    4. -
    -

    The key point is that the interpretability illusion is a failure to describe typical model operation, but a success for enacting the causal model.

  2. -
  3. Study more detailed causal models with multiple output streams, multiple options for the input variables, or more compositions. To start, notice that it is obviously good to have more outputs/consequences of the causal mode in the optimization. Why? First, if we have multiple output-measurements at the end of the causal graph, it is harder for a spurious direction to perform well on all of them by chance. Additionally: if an abstract causal model has modular pieces, then there should be exponentially many combinatorial-swap options that we can test. To score well on the IIA train-loss across all swaps, a ‘spurious’ structure would have to be very sophisticated. While Lange et al. show that spurious solutions may arise for searches in 1 direction, it should be less likely to occur for _pairs _of directions, and less likely yet for full spurious circuits. So, illusion problems may be reduced by scaling up model complexity. Some possible issues remain, though:

    -
      +
    1. The causal model still holds, and may still be what we want. We call it an interpretability illusion because we are failing to describe the model’s normal functioning. But unusual functioning is fine for some goals! Applications include: +
        +
      • Finding latent circuits which might be targetable by optimized non-routine inputs. For example, these circuits might be used in an adversarial attack.
      • +
      • “Pinning” a false belief into the model, for testing or alignment training. For example, forcing the model to believe it is not being watched, in order to test deception or escape behavior.
      • +
      +The key point is that the interpretability illusion is a failure to describe typical model operation, but a success for enacting the causal model.
    2. +
    3. Study more detailed causal models with multiple output streams, multiple options for the input variables, or more compositions. To start, notice that it is obviously good to have more outputs/consequences of the causal model in the optimization. Why? First, if we have multiple output-measurements at the end of the causal graph, it is harder for a spurious direction to perform well on all of them by chance. Additionally, if an abstract causal model has modular pieces, then there should be exponentially many combinatorial-swap options that we can test. To score well on the IIA train-loss across all swaps, a spurious structure would have to be very sophisticated. While Lange et al. show that spurious solutions may arise for searches in a single direction, such solutions should be less likely for pairs of directions, and less likely yet for full spurious circuits. So, illusion problems may be reduced by scaling up model complexity. Some possible issues remain, though: +
      • In some cases we may struggle to identify specific directions within a multi-part model; i.e., we might find convincing overall performance for a circuit, but an individual dimension or two could be spurious, and we might be unable to determine exactly which.
      • -
      • This approach relies on big, deep, abstract causal models existing inside the networks, with sufficient robustness in their functioning across variable changes. While there is some suggestive work on predictable / standardized structures in LLM’s, from investigations like Feng and Steinhardt (2023)’s entity binding case study, the IOI paper, and studies of recursive tasks, the consistency/robustness and DAS-discoverability of larger structures in scaled-up models is not yet clear. More case studies in larger models would be of value.
      • -
  4. -
  5. Measure generalizability, and use it to filter out spurious findings after-the-fact. This is just common-sense, and researchers are already doing this in several ways. We can construct train/test splits with random sampling, and conclude a found direction is spurious if it does not generalize on the test data; or we could ask how the patched model generalizes out-of-training-distribution following a small perturbation, such as adding extra preceding tokens. Spurious solutions are likely to be sensitive to minor changes, and for many purposes we are primarily interested in causal models that generalize well. As mentioned earlier, the ICLR paper’s `spurious’ findings performed sufficiently poorly on generalization sets that they could easily be distinguished from real effects.

  6. -
  7. Quantify a null distribution In the “Illusion” post, Lange et al. show that the strength of the spurious signal depends on how many neurons it is allowed to optimize over. So, a very strong signal, taken over a small optimization set, should be more convincing. Thinking as statisticians, we could attempt to construct a `null distribution’ for the spurious signals; this approach could offer evidence that a causal map element is being represented “at all.” One could imagine doing this kind of inference for individual pieces of a larger causal model, with difference uncertainty bars for different components.

  8. -
  9. Use unsupervised feature extraction as a first step. Recent interpretability work with auto-encoders suggests that many of a small transformer’s most important features can be identified. If this technique scales well, it could vastly reduce the amount of optimization pressure needed to identify the right directions, shrinking the search space and reducing optimistic bias / spurious findings.

  10. -
  11. Incorporate additional information as a prior / penalty for optimization. As Lange et al. note in the “Illusion” post, and as described in Section 5 of the ICLR paper, it is possible to supply additional evidence that a found direction is “faithful” (or not). In the case study with the IOI task, they argued the direction found by DAS on a residual layer fell within the query subspace of human-identified “name mover” heads. More generally, if intuitions about faithfulness can be scored with a quantitative metric, then tacking that metric onto the optimization as a penalty should help the optimizer favor “correct” directions over “spurious” solutions. Still, using this approach requires answering two difficult questions: what additional evidence to choose, and then how to quantify it? Some vague possibilities:

    -
      -
    1. Perhaps next-gen AI will offer accurate “auto-grading”, giving a general yet quantitative evaluation of plausibility of found solutions
    2. +
    3. This approach relies on big, deep, abstract causal models existing inside the networks, with sufficient robustness in their functioning across variable changes. There is some suggestive work on predictable / standardized structures in LLMs, from investigations like Feng and Steinhardt (2023)’s entity binding case study, the indirect object identification (IOI) paper, and studies of recursive tasks. However, the consistency/robustness and DAS-discoverability of larger structures in scaled-up models is not yet clear. More case studies in larger models would be valuable.
    4. + +
    5. Measure generalizability, and use it to filter out spurious findings after-the-fact. This is just common sense, and researchers are already doing this in several ways. We can construct train/test splits with random sampling, and conclude a found direction is spurious if it does not generalize on the test data; or we could ask how the patched model generalizes out-of-training-distribution following a small perturbation, such as adding extra preceding tokens. Spurious solutions are likely to be sensitive to minor changes, and for many purposes we are primarily interested in causal models that generalize well. As mentioned earlier, the ICLR paper’s “spurious” findings performed sufficiently poorly on generalization sets that they could easily be distinguished from real effects.
    6. +
    7. Quantify a null distribution. In the “Illusion” post, Lange et al. show that the strength of the spurious signal depends on how many neurons it is allowed to optimize over. So, a very strong signal, taken over a small optimization set, should be more convincing. Thinking as statisticians, we could attempt to construct a null distribution for the spurious signals; this approach could offer evidence that a causal map element is being represented at all. We can do this inference for individual pieces of a larger causal model, with each component having its own uncertainty. (A toy null-distribution sketch follows after this list.)
    8. +
    9. Use unsupervised feature extraction as a first step. Recent interpretability work with auto-encoders (Bricken et al. 2023, Cunningham et al. 2023) suggests that many of a small transformer’s most important features can be identified. If this technique scales well, it could vastly reduce the amount of optimization pressure needed to identify the right directions, shrinking the search space and reducing optimistic bias and spurious findings. (A minimal sparse auto-encoder sketch follows after this list.)
    10. +
    11. Incorporate additional information as a prior / penalty for optimization. As Lange et al. note in the “Illusion” post, and as described in Section 5 of the ICLR paper, it is possible to supply additional evidence that a found direction is faithful or not. In the case study with the IOI task, they argued the direction found by DAS on a residual layer fell within the query subspace of human-identified name mover heads. More generally, if intuitions about faithfulness can be scored with a quantitative metric, then tacking that metric onto the optimization as a penalty should help the optimizer favor correct directions over spurious solutions. Still, using this approach requires answering two difficult questions: what additional evidence to choose, and then how to quantify it? (A toy penalty sketch follows after this list.) Some rough possibilities: +
        +
      • If we know of structures that should be related to the task, such as entity bindings (Feng and Steinhardt 2023), we can try to build outwards from them. Or, if we have a reliable feature dictionary from sparse auto-encoders, or a “belief graph” per Hase et al. 2021 that offers advance predictions for how subsequent layers’ features may react to a change, we can penalize a lack of correlation or of causal effect on downstream features.
      • Somehow draw information from analyzing very basic components of the network: punish “MLP-in-the-middle” solutions by using some combination of changes in MLP activations / attention, gradients, sizes of the induced changes in the residual stream, etc.
      • -
      • If we know of structures that “should be related” to the task, such as entity bindings (Feng and Steinhardt (2023)), we can try to build outwards from them; or if we have a reliable feature dictionary from sparse auto-encoders or “belief graph” per Hase et al. 2021 which offers advance predictions for how subsequent layers’ features may react to a change, we can penalize lack of correlation or causal effects on downstream features.
      • -
    -

    Using extra information in this way unfortunately spends its usability for validation. But if utilizing it prevents the optimization from getting stuck on false signals, the trade-off should be favorable.

  12. +
  13. Perhaps next-gen AI will offer accurate “auto-grading”, giving a general yet quantitative evaluation of the plausibility of found solutions.
  14. + +Using extra information in this way unfortunately spends its usability for validation. But if extra information prevents the optimization from getting stuck on false signals, the trade-off should be favorable.
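Below is the toy null-distribution sketch promised in the list. It assumes a hypothetical `score_direction` callable (for example, one that returns IIA when patching along a given unit direction). Random directions give a weak null; Lange et al.’s results suggest a stronger null would re-run the optimizer on permuted labels.

```python
# Toy empirical null for patching scores. `score_direction` is a
# hypothetical callable mapping a unit vector to a scalar score
# (e.g., interchange intervention accuracy along that direction).
import numpy as np

def null_distribution(score_direction, hidden_dim, n_samples=1000, seed=0):
    rng = np.random.default_rng(seed)
    scores = np.empty(n_samples)
    for i in range(n_samples):
        v = rng.standard_normal(hidden_dim)
        v /= np.linalg.norm(v)  # random unit direction in activation space
        scores[i] = score_direction(v)
    return scores

# Compare a candidate (e.g., DAS-found) direction against the null:
#   null = null_distribution(score_direction, hidden_dim=512)
#   p_value = (1 + (null >= candidate_score).sum()) / (1 + len(null))
```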
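Next, the minimal sparse auto-encoder sketch promised above, in the spirit of the dictionary-learning work cited in the list; the layer sizes and L1 coefficient are illustrative choices, not values from those papers.

```python
# Minimal sparse autoencoder over residual-stream activations
# (illustrative sizes; not the cited papers' exact architecture).
import torch
import torch.nn as nn
import torch.nn.functional as F

class SparseAutoencoder(nn.Module):
    def __init__(self, d_model=512, d_features=4096):
        super().__init__()
        self.encoder = nn.Linear(d_model, d_features)
        self.decoder = nn.Linear(d_features, d_model)

    def forward(self, activations):
        features = F.relu(self.encoder(activations))  # nonnegative codes
        reconstruction = self.decoder(features)
        return reconstruction, features

def sae_loss(model, activations, l1_coef=1e-3):
    """Reconstruction error plus an L1 sparsity penalty on the codes."""
    reconstruction, features = model(activations)
    return F.mse_loss(reconstruction, activations) + l1_coef * features.abs().mean()
```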
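Finally, the toy penalty sketch promised in the last item. It assumes the faithfulness prior can be expressed as a reference subspace (for instance, one spanned by known name-mover query directions); the names and the specific penalty form are illustrative assumptions rather than a method from the papers.

```python
# Sketch: a faithfulness prior folded into the search as a penalty term.
# `reference_basis` encodes the prior knowledge (hypothetical input).
import torch

def subspace_penalty(direction, reference_basis):
    """Norm of the component of `direction` outside span(reference_basis).

    reference_basis: (hidden_dim, k) tensor with orthonormal columns.
    """
    projection = reference_basis @ (reference_basis.T @ direction)
    return torch.linalg.norm(direction - projection)

# Hypothetical combined objective, weighting the prior by `lam`:
#   total_loss = interchange_loss + lam * subspace_penalty(v, query_basis)
```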
-

—-

-

Thanks to Atticus Geiger, Jing Huang, Ben Thompson, Zygimantas Straznickas and others for conversations and feedback on earlier drafts.


+

Thanks to Atticus Geiger, Jing Huang, Ben Thompson, Zygimantas Straznickas and others for conversations and feedback on earlier drafts.

diff --git a/posts/fight_the_illusion.out.ipynb b/posts/fight_the_illusion.ipynb similarity index 60% rename from posts/fight_the_illusion.out.ipynb rename to posts/fight_the_illusion.ipynb index ded6417..bd137b7 100644 --- a/posts/fight_the_illusion.out.ipynb +++ b/posts/fight_the_illusion.ipynb @@ -9,7 +9,7 @@ "Michael Sklar \n", "2023-11-28" ], - "id": "a4a42736-a8aa-4851-9207-3100bd99e88a" + "id": "424fbf11-ccd4-4568-ab4e-fddffb1f3ad2" }, { "cell_type": "raw", @@ -38,7 +38,7 @@ "* Source doc: 6 ways to fight the Interpretability illusion\n", "----->" ], - "id": "4c0ddb5a-143e-4db3-b1f5-7df53cf6db3a" + "id": "84ddb8d9-6186-4cfe-910e-6a7a3d14517f" }, { "cell_type": "markdown", @@ -46,86 +46,83 @@ "source": [ "Recommended pre-reading:\n", "\n", - "Atticus Geiger’s [DAS](https://arxiv.org/abs/2303.02536) and [Boundless\n", - "DAS](https://arxiv.org/pdf/2305.08809.pdf). Lesswrong post [An\n", - "Interpretability Illusion for Activation Patching of Arbitrary\n", - "Subspaces](https://www.lesswrong.com/posts/RFtkRXHebkwxygDe2/an-interpretability-illusion-for-activation-patching-of).\n", - "Corresponding [ICLR paper, “Is This the Subspace You Are Looking\n", - "For?](https://openreview.net/forum?id=Ebt7JgMHv1)”\n", + "- Atticus Geiger’s [DAS](https://arxiv.org/abs/2303.02536) and\n", + " [Boundless DAS](https://arxiv.org/pdf/2305.08809.pdf).\n", + "- [An Interpretability Illusion for Activation Patching of Arbitrary\n", + " Subspaces](https://www.lesswrong.com/posts/RFtkRXHebkwxygDe2/an-interpretability-illusion-for-activation-patching-of).\n", + "- The corresponding [ICLR paper, “Is This the Subspace You Are Looking\n", + " For?](https://openreview.net/forum?id=Ebt7JgMHv1)”\n", "\n", - "\\_\\_ \n", - " \n", - "This post is motivated by Lange, Makelov, and Nanda’s lesswrong post\n", + "\\_\\_\n", + "\n", + "This post is motivated by Lange, Makelov, and Nanda’s LessWrong post\n", "[Interpretability Illusion for Activation\n", "Patching](https://www.lesswrong.com/posts/RFtkRXHebkwxygDe2/an-interpretability-illusion-for-activation-patching-of)\n", "and [ICLR paper](https://openreview.net/forum?id=Ebt7JgMHv1). They study\n", - "Geiger et al’s [DAS](https://arxiv.org/abs/2303.02536) method, which\n", + "[Geiger et al’s DAS](https://arxiv.org/abs/2303.02536) method, which\n", "uses optimization to identify an abstracted causal model with a small\n", "subset of dimensions in a neural network’s residual stream or internal\n", "MLP layer. Their results show that DAS can, depending on the situation,\n", - "turn up both “correct” and spurious” findings on the train-set. From the\n", - "investigations in the [ICLR\n", + "turn up both “correct” and “spurious” findings on the train-set. From\n", + "the investigations in the [ICLR\n", "paper](https://openreview.net/forum?id=Ebt7JgMHv1) and conversations\n", "with a few researchers, my understanding is these “spurious” directions\n", "have not performed well on held-out generalization sets, so in practice\n", "it is easy to distinguish the “illusions” from “real effects”. But, I am\n", "interested in developing even stronger optimize-to-interpret methods.\n", "With more powerful optimizers, illusion effects should be even stronger,\n", - "and competition from “spurious” signals may make “true” signals harder\n", - "to locate in training. So, here are 6 possible ways to fight against the\n", + "and competition from spurious signals may make true signals harder to\n", + "locate in training. 
So, here are 6 possible ways to fight against the\n", "interpretability illusion. Most of them can be tried in combination.\n", "\n", - "1. **The causal model still holds, and may still be what we want.\n", - " **I.e.: We call it an interpretability “illusion” because we are\n", - " failing to describe the model’s normal functioning. But unusual\n", - " functioning is fine for some goals! Applications include:\n", - "\n", - " 1. Finding latent circuits which might be targetable by optimized\n", - " non-routine inputs (e.g. redteaming)\n", - " 2. “Pinning” a false belief into the model, for testing or\n", - " alignment training \\[e.g., forcing the model to believe it is\n", - " not being watched, in order to test deception or escape\n", - " behavior\\].\n", + "1. **The causal model still holds, and may still be what we want.**: We\n", + " call it an interpretability *illusion* because we are failing to\n", + " describe the model’s normal functioning. But unusual functioning is\n", + " fine for some goals! Applications include:\n", + " - Finding latent circuits which might be targetable by optimized\n", + " non-routine inputs. For example, these circuits might be used in\n", + " an adversarial attack.\n", + " - “Pinning” a false belief into the model, for testing or\n", + " alignment training. For example, forcing the model to believe it\n", + " is not being watched, in order to test deception or escape\n", + " behavior.\n", "\n", " The key point is that the interpretability illusion is a failure to\n", " *describe typical model operation*, but a success for *enacting the\n", - " causal model.*\n", - "\n", + " causal model*.\n", "2. **Study more detailed causal models with multiple output streams,\n", - " multiple options for the input variables, or more compositions. **To\n", + " multiple options for the input variables, or more compositions.** To\n", " start, notice that it is obviously good to have more\n", " outputs/consequences of the causal mode in the optimization. Why?\n", " First, if we have multiple output-measurements at the end of the\n", " causal graph, it is harder for a spurious direction to perform well\n", - " on all of them by chance. Additionally: if an abstract causal model\n", + " on all of them by chance. Additionally, if an abstract causal model\n", " has modular pieces, then there should be exponentially many\n", " combinatorial-swap options that we can test. To score well on the\n", - " IIA train-loss across all swaps, a ‘spurious’ structure would have\n", - " to be very sophisticated. While Lange et al. show that spurious\n", + " IIA train-loss across all swaps, a spurious structure would have to\n", + " be very sophisticated. While Lange et al. show that spurious\n", " solutions may arise for searches in 1 direction, it should be less\n", - " likely to occur for \\_pairs \\_of directions, and less likely yet for\n", + " likely to occur for *pairs* of directions, and less likely yet for\n", " full spurious circuits. So, illusion problems may be reduced by\n", " scaling up model complexity. Some possible issues remain, though:\n", - "\n", - " 1. In some cases we may struggle to identify specific directions\n", + " - In some cases we may struggle to identify specific directions\n", " within a multi-part model; i.e., we might find convincing\n", " overall performance for a circuit, but an individual dimension\n", " or two could be spurious, and we might be unable to determine\n", " exactly which.\n", - " 2. 
This approach relies on big, deep, abstract causal models\n", "          existing inside the networks, with sufficient robustness in\n", "          their functioning across variable changes. There is some\n", "          suggestive work on predictable / standardized structures in\n", "          LLMs, from investigations like [Feng and Steinhardt\n", "          (2023](https://arxiv.org/pdf/2310.17191.pdf))’s entity binding\n", "          case study, the [indirect object identification\n", "          (IOI)](https://github.com/redwoodresearch/Easy-Transformer/blob/main/README.md)\n", "          paper, and studies of [recursive\n", "          tasks](https://arxiv.org/pdf/2305.14699.pdf). However, the\n", "          consistency/robustness and DAS-discoverability of larger\n", "          structures in scaled-up models is not yet clear. More case\n", "          studies in larger models would be valuable.\n", "3. **Measure generalizability, and use it to filter out spurious\n", "    findings after-the-fact.** This is just common sense, and\n", "    researchers are already doing this in several ways. We can construct\n", @@ -136,58 +133,45 @@ "    adding extra preceding tokens. Spurious solutions are likely to be\n", "    sensitive to minor changes, and for many purposes we are primarily\n", "    interested in causal models that generalize well. As mentioned\n", - "    earlier, the [ICLR](https://openreview.net/forum?id=Ebt7JgMHv1)\n", "    paper’s “spurious” findings performed sufficiently poorly on\n", "    generalization sets that they could easily be distinguished from\n", "    real effects.\n", - "\n", - "4. **Quantify a null distribution **In the [“Illusion”\n", + "4. **Quantify a null distribution.** In the [“Illusion”\n", "    post](https://www.lesswrong.com/posts/RFtkRXHebkwxygDe2/an-interpretability-illusion-for-activation-patching-of),\n", "    Lange et al. show that the strength of the spurious signal depends\n", "    on how many neurons it is allowed to optimize over. So, a very\n", "    strong signal, taken over a small optimization set, should be more\n", "    convincing. Thinking as statisticians, we could attempt to construct\n", - "    a \`null distribution’ for the spurious signals; this approach could\n", - "    offer evidence that a causal map element is being represented “at\n", - "    all.” One could imagine doing this kind of inference for individual\n", - "    *pieces* of a larger causal model, with difference uncertainty bars\n", - "    for different components.\n", - "\n", - "5. **Use unsupervised feature extraction as a first step**. Recent\n", - "    interpretability work with\n", - "    [auto-encoders](https://transformer-circuits.pub/2023/monosemantic-features)\n", - "    [suggests](https://arxiv.org/abs/2309.08600) that many of a small\n", - "    transformer’s most important features can be identified. 
If this\n", - " technique scales well, it could ***vastly*** reduce the amount of\n", - " optimization pressure needed to identify the right directions,\n", - " shrinking the search space and reducing optimistic bias / spurious\n", - " findings.\n", - "\n", + " a null distribution for the spurious signals; this approach could\n", + " offer evidence that a causal map element is being represented at\n", + " all. We can do this inference for individual *pieces* of a larger\n", + " causal model, with each component having its own uncertainty.\n", + "5. **Use unsupervised feature extraction as a first step.** Recent\n", + " interpretability work with auto-encoders ([Bricken et\n", + " al. 2023](https://transformer-circuits.pub/2023/monosemantic-features),\n", + " [Cunningham et al. 2023](https://arxiv.org/abs/2309.08600)) suggests\n", + " that many of a small transformer’s most important features can be\n", + " identified. If this technique scales well, it could ***vastly***\n", + " reduce the amount of optimization pressure needed to identify the\n", + " right directions, shrinking the search space and reducing optimistic\n", + " bias and spurious findings.\n", "6. **Incorporate additional information as a prior / penalty for\n", - " optimization. **As Lange et al. note in the [“Illusion”\n", + " optimization.** As Lange et al. note in the [“Illusion”\n", " post](https://www.lesswrong.com/posts/RFtkRXHebkwxygDe2/an-interpretability-illusion-for-activation-patching-of),\n", " and as described in Section 5 of the [ICLR\n", " paper](https://openreview.net/forum?id=Ebt7JgMHv1), it is possible\n", - " to supply additional evidence that a found direction is “faithful”\n", - " (or not). In the case study with the IOI task, they argued the\n", - " direction found by DAS on a residual layer fell within the query\n", - " subspace of human-identified “name mover” heads. More generally, if\n", - " intuitions about faithfulness can be scored with a quantitative\n", - " metric, then tacking that metric onto the optimization as a penalty\n", - " should help the optimizer favor “correct” directions over “spurious”\n", - " solutions. Still, using this approach requires answering two\n", - " difficult questions: what additional evidence to choose, and then\n", - " how to quantify it? Some vague possibilities:\n", - "\n", - " 1. Perhaps next-gen AI will offer accurate “auto-grading”, giving a\n", - " general yet quantitative evaluation of plausibility of found\n", - " solutions\n", - " 2. Somehow draw information from analyzing very basic components of\n", - " the network: punish “MLP-in-the-middle” solutions by using some\n", - " combination of changes in MLP activations / attention,\n", - " gradients, sizes of the induced changes in the residual stream,\n", - " etc.\n", - " 3. If we know of structures that “should be related” to the task,\n", + " to supply additional evidence that a found direction is faithful or\n", + " not. In the case study with the IOI task, they argued the direction\n", + " found by DAS on a residual layer fell within the query subspace of\n", + " human-identified name mover heads. More generally, if intuitions\n", + " about faithfulness can be scored with a quantitative metric, then\n", + " tacking that metric onto the optimization as a penalty should help\n", + " the optimizer favor correct directions over spurious solutions.\n", + " Still, using this approach requires answering two difficult\n", + " questions: what additional evidence to choose, and then how to\n", + " quantify it? 
Some rough possibilities:\n", + " - If we know of structures that should be related to the task,\n", " such as entity bindings ([Feng and Steinhardt\n", " (2023)](https://arxiv.org/pdf/2310.17191.pdf)), we can try to\n", " build outwards from them; or if we have a reliable feature\n", @@ -196,41 +180,35 @@ " al. 2021 which offers advance predictions for how subsequent\n", " layers’ features may react to a change, we can penalize lack of\n", " correlation or causal effects on downstream features.\n", + " - Somehow draw information from analyzing very basic components of\n", + " the network: punish “MLP-in-the-middle” solutions by using some\n", + " combination of changes in MLP activations / attention,\n", + " gradients, sizes of the induced changes in the residual stream,\n", + " etc.\n", + " - Perhaps next-gen AI will offer accurate “auto-grading”, giving a\n", + " general yet quantitative evaluation of plausibility of found\n", + " solutions\n", "\n", " Using extra information in this way unfortunately spends its\n", - " usability for validation. But if utilizing it prevents the\n", + " usability for validation. But if extra information prevents the\n", " optimization from getting stuck on false signals, the trade-off\n", " should be favorable.\n", "\n", - "—-\n", + "------------------------------------------------------------------------\n", "\n", "Thanks to Atticus Geiger, Jing Huang, Ben Thompson, Zygimantas\n", - "Straznickas and others for conversations and feedback on earlier drafts.\n", - "\n", - "------------------------------------------------------------------------" + "Straznickas and others for conversations and feedback on earlier drafts." ], - "id": "665cbeb6-a7a8-4cdc-b87b-62d5dd00e489" + "id": "3498b614-95a8-4bef-8255-56b161af2e4d" } ], "nbformat": 4, "nbformat_minor": 5, "metadata": { "kernelspec": { - "name": "python3", - "display_name": "Python 3 (ipykernel)", - "language": "python" - }, - "language_info": { - "name": "python", - "codemirror_mode": { - "name": "ipython", - "version": "3" - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" + "display_name": "Python 3", + "language": "python", + "name": "python3" } } } \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index 214aa45..133de9c 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,10 +2,10 @@ https://confirmlabs.org/index.html - 2023-11-29T00:29:32.823Z + 2023-11-29T03:44:16.316Z https://confirmlabs.org/posts/catalog.html - 2023-11-29T00:29:35.663Z + 2023-11-29T03:44:19.160Z