Skip to content

Commit

Permalink
Built site for gh-pages
Browse files Browse the repository at this point in the history
  • Loading branch information
Quarto GHA Workflow Runner committed Jul 12, 2024
1 parent e3982e3 commit 3998324
Show file tree
Hide file tree
Showing 11 changed files with 35 additions and 69 deletions.
2 changes: 1 addition & 1 deletion .nojekyll
Original file line number Diff line number Diff line change
@@ -1 +1 @@
7638a8cb
53bbb9e6
10 changes: 5 additions & 5 deletions index.html
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ <h2 class="anchored" data-anchor-id="articles">Articles</h2>

<div class="quarto-listing quarto-listing-container-grid" id="listing-listing">
<div class="list grid quarto-listing-cols-3">
<div class="g-col-1" data-index="0" data-listing-date-sort="1720742400000" data-listing-file-modified-sort="1720817926899" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="7" data-listing-word-count-sort="1236">
<div class="g-col-1" data-index="0" data-listing-date-sort="1720742400000" data-listing-file-modified-sort="1720818013267" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="7" data-listing-word-count-sort="1236">
<a href="./posts/circuit_breaking.html" class="quarto-grid-link">
<div class="quarto-grid-item card h-100 card-left">
<p class="card-img-top"><img src="posts/circuit_breaking_files/figure-html/cell-4-output-1.png" style="height: 150px;" class="thumbnail-image card-img"/></p>
Expand All @@ -230,7 +230,7 @@ <h5 class="no-anchor card-title listing-title">
</div>
</a>
</div>
<div class="g-col-1" data-index="1" data-listing-date-sort="1705968000000" data-listing-file-modified-sort="1720817926911" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="4" data-listing-word-count-sort="771">
<div class="g-col-1" data-index="1" data-listing-date-sort="1705968000000" data-listing-file-modified-sort="1720818013275" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="4" data-listing-word-count-sort="771">
<a href="./posts/dreamy.html" class="quarto-grid-link">
<div class="quarto-grid-item card h-100 card-left">
<p class="card-img-top"><img src="posts/dream_wow.png" style="height: 150px;" class="thumbnail-image card-img"/></p>
Expand All @@ -253,7 +253,7 @@ <h5 class="no-anchor card-title listing-title">
</div>
</a>
</div>
<div class="g-col-1" data-index="2" data-listing-date-sort="1705104000000" data-listing-file-modified-sort="1720817926879" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="25" data-listing-word-count-sort="4985">
<div class="g-col-1" data-index="2" data-listing-date-sort="1705104000000" data-listing-file-modified-sort="1720818013247" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="25" data-listing-word-count-sort="4985">
<a href="./posts/TDC2023.html" class="quarto-grid-link">
<div class="quarto-grid-item card h-100 card-left">
<p class="card-img-top"><img src="posts/TDC2023-sample-instances.png" alt="The Z-scores of activation vector similarity for the provided sample instances" style="height: 150px;" class="thumbnail-image card-img"/></p>
Expand All @@ -276,7 +276,7 @@ <h5 class="no-anchor card-title listing-title">
</div>
</a>
</div>
<div class="g-col-1" data-index="3" data-listing-date-sort="1701302400000" data-listing-file-modified-sort="1720817926911" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="7" data-listing-word-count-sort="1303">
<div class="g-col-1" data-index="3" data-listing-date-sort="1701302400000" data-listing-file-modified-sort="1720818013275" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="7" data-listing-word-count-sort="1303">
<a href="./posts/fight_the_illusion.html" class="quarto-grid-link">
<div class="quarto-grid-item card h-100 card-left">
<div class="listing-item-img-placeholder card-img-top" style="height: 150px;">&nbsp;</div>
Expand All @@ -299,7 +299,7 @@ <h5 class="no-anchor card-title listing-title">
</div>
</a>
</div>
<div class="g-col-1" data-index="4" data-listing-date-sort="1687651200000" data-listing-file-modified-sort="1720817926899" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="7" data-listing-word-count-sort="1247">
<div class="g-col-1" data-index="4" data-listing-date-sort="1687651200000" data-listing-file-modified-sort="1720818013267" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="7" data-listing-word-count-sort="1247">
<a href="./posts/catalog.html" class="quarto-grid-link">
<div class="quarto-grid-item card h-100 card-left">
<p class="card-img-top"><img src="posts/catalog_files/figure-html/cell-9-output-1.png" style="height: 150px;" class="thumbnail-image card-img"/></p>
Expand Down
2 changes: 1 addition & 1 deletion posts/TDC2023.html
Original file line number Diff line number Diff line change
Expand Up @@ -959,7 +959,7 @@ <h4 class="anchored" data-anchor-id="trojan-recovery">Trojan recovery:</h4>
});
</script>
</div> <!-- /content -->
<script>var lightboxQuarto = GLightbox({"descPosition":"bottom","selector":".lightbox","openEffect":"zoom","closeEffect":"zoom","loop":false});
<script>var lightboxQuarto = GLightbox({"selector":".lightbox","loop":false,"openEffect":"zoom","closeEffect":"zoom","descPosition":"bottom"});
(function() {
let previousOnload = window.onload;
window.onload = () => {
Expand Down
2 changes: 1 addition & 1 deletion posts/catalog.html
Original file line number Diff line number Diff line change
Expand Up @@ -1055,7 +1055,7 @@ <h2 class="anchored" data-anchor-id="github">GitHub</h2>
});
</script>
</div> <!-- /content -->
<script>var lightboxQuarto = GLightbox({"selector":".lightbox","descPosition":"bottom","openEffect":"zoom","loop":false,"closeEffect":"zoom"});
<script>var lightboxQuarto = GLightbox({"descPosition":"bottom","closeEffect":"zoom","loop":false,"openEffect":"zoom","selector":".lightbox"});
(function() {
let previousOnload = window.onload;
window.onload = () => {
Expand Down
6 changes: 3 additions & 3 deletions posts/catalog.out.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@
"Pythia-12B is miscalibrated on 20% of the bigrams and 45% of the\n",
"trigrams when we ask for prediction of $p \\geq 0.45$."
],
"id": "2e140ab4-6096-479f-9824-655c27d1328a"
"id": "1bf5c373-0aa5-4018-b9bd-aea7a002b46a"
},
{
"cell_type": "code",
Expand Down Expand Up @@ -377,7 +377,7 @@
"The dataset is available on Huggingface:\n",
"[pile_scan_4](https://huggingface.co/datasets/Confirm-Labs/pile_scan_4)"
],
"id": "4ce4bd71-df7f-47a2-ab65-2641ae87ced3"
"id": "a7f190f3-2824-4d58-9246-d6edfba8870e"
},
{
"cell_type": "code",
Expand Down Expand Up @@ -417,7 +417,7 @@
"Computational Linguistics, May 2022, pp. 95–136. doi:\n",
"[10.18653/v1/2022.bigscience-1.9](https://doi.org/10.18653/v1/2022.bigscience-1.9).</span>"
],
"id": "610dcc82-200b-434d-a73f-20d97eae957d"
"id": "6c411ed0-d58e-4fe3-96f9-bfaff398d7a4"
}
],
"nbformat": 4,
Expand Down
22 changes: 1 addition & 21 deletions posts/circuit_breaking.html
Original file line number Diff line number Diff line change
Expand Up @@ -263,26 +263,6 @@ <h1 class="title">Breaking Circuit Breakers</h1>
</header>


<div id="cell-1" class="cell" data-execution_count="1">
<div class="cell-output cell-output-error">
<div class="ansi-escaped-output">
<pre><span class="ansi-red-fg">---------------------------------------------------------------------------</span>
<span class="ansi-red-fg">FileNotFoundError</span> Traceback (most recent call last)
Cell <span class="ansi-green-fg">In[1], line 14</span>
<span class="ansi-green-fg ansi-bold"> 11</span> <span style="font-weight:bold;color:rgb(0,135,0)">import</span> <span style="font-weight:bold;color:rgb(0,0,255)">flrt</span><span style="font-weight:bold;color:rgb(0,0,255)">.</span><span style="font-weight:bold;color:rgb(0,0,255)">util</span> <span style="font-weight:bold;color:rgb(0,135,0)">as</span> <span style="font-weight:bold;color:rgb(0,0,255)">util</span>
<span class="ansi-green-fg ansi-bold"> 13</span> cutil<span style="color:rgb(98,98,98)">.</span>nb<span style="color:rgb(98,98,98)">.</span>setup_nb()
<span class="ansi-green-fg">---&gt; 14</span> <span class="ansi-yellow-bg">cutil</span><span style="color:rgb(98,98,98)" class="ansi-yellow-bg">.</span><span class="ansi-yellow-bg">chdir_git_root</span><span class="ansi-yellow-bg">(</span><span style="color:rgb(175,0,0)" class="ansi-yellow-bg">"</span><span style="color:rgb(175,0,0)" class="ansi-yellow-bg">flrt</span><span style="color:rgb(175,0,0)" class="ansi-yellow-bg">"</span><span class="ansi-yellow-bg">)</span>
<span class="ansi-green-fg ansi-bold"> 15</span> torch<span style="color:rgb(98,98,98)">.</span>set_grad_enabled(<span style="font-weight:bold;color:rgb(0,135,0)">False</span>)
<span class="ansi-green-fg ansi-bold"> 16</span> torch<span style="color:rgb(98,98,98)">.</span>set_default_device(<span style="color:rgb(175,0,0)">"</span><span style="color:rgb(175,0,0)">cuda</span><span style="color:rgb(175,0,0)">"</span>)

File <span class="ansi-green-fg">~/aiplay/cutil/cutil/__init__.py:17</span>, in <span class="ansi-cyan-fg">chdir_git_root</span><span class="ansi-blue-fg">(suffix)</span>
<span class="ansi-green-fg ansi-bold"> 16</span> <span style="font-weight:bold;color:rgb(0,135,0)">def</span> <span style="color:rgb(0,0,255)">chdir_git_root</span>(suffix):
<span class="ansi-green-fg">---&gt; 17</span> <span class="ansi-yellow-bg">os</span><span style="color:rgb(98,98,98)" class="ansi-yellow-bg">.</span><span class="ansi-yellow-bg">chdir</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">os</span><span style="color:rgb(98,98,98)" class="ansi-yellow-bg">.</span><span class="ansi-yellow-bg">path</span><span style="color:rgb(98,98,98)" class="ansi-yellow-bg">.</span><span class="ansi-yellow-bg">join</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">get_git_root</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">os</span><span style="color:rgb(98,98,98)" class="ansi-yellow-bg">.</span><span class="ansi-yellow-bg">getcwd</span><span class="ansi-yellow-bg">(</span><span class="ansi-yellow-bg">)</span><span class="ansi-yellow-bg">)</span><span class="ansi-yellow-bg">,</span><span class="ansi-yellow-bg"> </span><span class="ansi-yellow-bg">suffix</span><span class="ansi-yellow-bg">)</span><span class="ansi-yellow-bg">)</span>

<span class="ansi-red-fg">FileNotFoundError</span>: [Errno 2] No such file or directory: '/home/ubuntu/confirmlabs/flrt'</pre>
</div>
</div>
</div>
<div class="callout callout-style-simple callout-note no-icon callout-titled" title="Summary">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
Expand Down Expand Up @@ -1277,7 +1257,7 @@ <h2 class="anchored" data-anchor-id="attack-success-fluency">Attack success: flu
});
</script>
</div> <!-- /content -->
<script>var lightboxQuarto = GLightbox({"descPosition":"bottom","closeEffect":"zoom","loop":false,"openEffect":"zoom","selector":".lightbox"});
<script>var lightboxQuarto = GLightbox({"selector":".lightbox","descPosition":"bottom","loop":false,"closeEffect":"zoom","openEffect":"zoom"});
(function() {
let previousOnload = window.onload;
window.onload = () => {
Expand Down
30 changes: 8 additions & 22 deletions posts/circuit_breaking.out.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion posts/dreamy.html
Original file line number Diff line number Diff line change
Expand Up @@ -5417,7 +5417,7 @@ <h2 class="anchored" data-anchor-id="causal-token-attribution">Causal token attr
});
</script>
</div> <!-- /content -->
<script>var lightboxQuarto = GLightbox({"descPosition":"bottom","selector":".lightbox","closeEffect":"zoom","loop":false,"openEffect":"zoom"});
<script>var lightboxQuarto = GLightbox({"closeEffect":"zoom","selector":".lightbox","descPosition":"bottom","loop":false,"openEffect":"zoom"});
(function() {
let previousOnload = window.onload;
window.onload = () => {
Expand Down
14 changes: 7 additions & 7 deletions posts/dreamy.out.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion search.json
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@
"href": "posts/circuit_breaking.html",
"title": "Breaking Circuit Breakers",
"section": "",
"text": "---------------------------------------------------------------------------\nFileNotFoundError Traceback (most recent call last)\nCell In[1], line 14\n 11 import flrt.util as util\n 13 cutil.nb.setup_nb()\n---&gt; 14 cutil.chdir_git_root(\"flrt\")\n 15 torch.set_grad_enabled(False)\n 16 torch.set_default_device(\"cuda\")\n\nFile ~/aiplay/cutil/cutil/__init__.py:17, in chdir_git_root(suffix)\n 16 def chdir_git_root(suffix):\n---&gt; 17 os.chdir(os.path.join(get_git_root(os.getcwd()), suffix))\n\nFileNotFoundError: [Errno 2] No such file or directory: '/home/ubuntu/confirmlabs/flrt'\nA few days ago, GraySwan published code and models for their recent “circuit breakers” method for language models.[1]1\nThe circuit breakers method defends against jailbreaks by training the model to erase “bad” internal representations. Data-efficient defensive methods like this, which use interpretability concepts or tools, are an exciting area with lots of potential.\nIn this post, we briefly investigate three broad topics:\nThe experiments we run only scratch the surface of attacking a circuit-breaker-defended model but we think a more in-depth examination would come to similar conclusions.\nWe list some methodological details that are shared between all the attacks:"
"text": "Summary\n\n\n\nCircuit breakers defend a language model moderately well against token-forcing but fail against a new attack on internal activations. Circuit breakers increase the refusal rate on harmless prompts by nearly 10x.\nA few days ago, GraySwan published code and models for their recent “circuit breakers” method for language models.[1]1\nThe circuit breakers method defends against jailbreaks by training the model to erase “bad” internal representations. Data-efficient defensive methods like this, which use interpretability concepts or tools, are an exciting area with lots of potential.\nIn this post, we briefly investigate three broad topics:\nThe experiments we run only scratch the surface of attacking a circuit-breaker-defended model but we think a more in-depth examination would come to similar conclusions.\nWe list some methodological details that are shared between all the attacks:"
},
{
"objectID": "posts/circuit_breaking.html#high-refusal-rates-on-harmless-prompts",
Expand Down
12 changes: 6 additions & 6 deletions sitemap.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,26 @@
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://confirmlabs.org/index.html</loc>
<lastmod>2024-07-12T20:58:46.875Z</lastmod>
<lastmod>2024-07-12T21:00:13.243Z</lastmod>
</url>
<url>
<loc>https://confirmlabs.org/posts/catalog.html</loc>
<lastmod>2024-07-12T20:58:46.899Z</lastmod>
<lastmod>2024-07-12T21:00:13.267Z</lastmod>
</url>
<url>
<loc>https://confirmlabs.org/posts/fight_the_illusion.html</loc>
<lastmod>2024-07-12T20:58:46.911Z</lastmod>
<lastmod>2024-07-12T21:00:13.275Z</lastmod>
</url>
<url>
<loc>https://confirmlabs.org/posts/dreamy.html</loc>
<lastmod>2024-07-12T20:58:46.911Z</lastmod>
<lastmod>2024-07-12T21:00:13.275Z</lastmod>
</url>
<url>
<loc>https://confirmlabs.org/posts/TDC2023.html</loc>
<lastmod>2024-07-12T20:58:46.879Z</lastmod>
<lastmod>2024-07-12T21:00:13.247Z</lastmod>
</url>
<url>
<loc>https://confirmlabs.org/posts/circuit_breaking.html</loc>
<lastmod>2024-07-12T20:58:46.899Z</lastmod>
<lastmod>2024-07-12T21:00:13.267Z</lastmod>
</url>
</urlset>

0 comments on commit 3998324

Please sign in to comment.