Skip to content

Commit

Permalink
Dreaming post.
Browse files Browse the repository at this point in the history
  • Loading branch information
tbenthompson committed Jan 24, 2024
1 parent 49f801f commit cb38c20
Show file tree
Hide file tree
Showing 6 changed files with 8,189 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Ignore rules for generated content (presumably Quarto build artifacts — confirm).
# Leading "/" anchors the pattern to the directory holding this .gitignore;
# the trailing "/" restricts the match to directories.
/.quarto/
# Trailing "/" only: a directory named _site at any depth.
_site/
# No slash at all: a file or directory named _freeze at any depth.
_freeze
# A slash in the middle makes the pattern relative to the directory holding this .gitignore.
posts/dreamy_clone
82 changes: 82 additions & 0 deletions posts/biblio.bib
Original file line number Diff line number Diff line change
Expand Up @@ -171,4 +171,86 @@ @article{elhage2022superposition
year={2022},
journal={Transformer Circuits Thread},
note={https://transformer-circuits.pub/2022/toy_model/index.html}
}

% arXiv preprint 2110.15317 (cs.CL).
% The acronyms and the first word after "!" are braced so that styles which
% sentence-case titles do not lowercase them.
@misc{yuan2023bridge,
  author        = {Yuan, Lifan and Zhang, Yichi and Chen, Yangyi and Wei, Wei},
  title         = {Bridge the Gap Between {CV} and {NLP}! {A} Gradient-based Textual Adversarial Attack Framework},
  year          = {2023},
  eprint        = {2110.15317},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

% arXiv preprint 2303.04381 (cs.LG).
@misc{jones2023automatically,
  author        = {Jones, Erik and Dragan, Anca and Raghunathan, Aditi and Steinhardt, Jacob},
  title         = {Automatically Auditing Large Language Models via Discrete Optimization},
  year          = {2023},
  eprint        = {2303.04381},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG}
}

% Transformer Circuits Thread article; the web address is carried in the note field.
@article{bricken2023monosemanticity,
  author  = {Bricken, Trenton and Templeton, Adly and Batson, Joshua and
             Chen, Brian and Jermyn, Adam and Conerly, Tom and
             Turner, Nick and Anil, Cem and Denison, Carson and
             Askell, Amanda and Lasenby, Robert and Wu, Yifan and
             Kravec, Shauna and Schiefer, Nicholas and Maxwell, Tim and
             Joseph, Nicholas and Hatfield-Dodds, Zac and Tamkin, Alex and
             Nguyen, Karina and McLean, Brayden and Burke, Josiah E and
             Hume, Tristan and Carter, Shan and Henighan, Tom and
             Olah, Christopher},
  title   = {Towards Monosemanticity: Decomposing Language Models With Dictionary Learning},
  journal = {Transformer Circuits Thread},
  year    = {2023},
  note    = {https://transformer-circuits.pub/2023/monosemantic-features/index.html}
}


% Web publication; the address is given through howpublished with a \url wrapper.
@misc{bills2023language,
  author       = {Bills, Steven and Cammarata, Nick and Mossing, Dan and Tillman, Henk and Gao, Leo and Goh, Gabriel and Sutskever, Ilya and Leike, Jan and Wu, Jeff and Saunders, William},
  title        = {Language models can explain neurons in language models},
  year         = {2023},
  howpublished = {\url{https://openaipublic.blob.core.windows.net/neuron-explainer/paper/index.html}}
}


% arXiv preprint 2104.07143 (cs.CL).
% BERT is braced so sentence-casing styles keep it upper-case; the accented
% letter in the sixth author's surname is written as a BibTeX special
% character so 8-bit BibTeX sorts and labels it correctly.
@misc{bolukbasi2021interpretability,
  author        = {Bolukbasi, Tolga and Pearce, Adam and Yuan, Ann and Coenen, Andy and Reif, Emily and Vi{\'e}gas, Fernanda and Wattenberg, Martin},
  title         = {An Interpretability Illusion for {BERT}},
  year          = {2021},
  eprint        = {2104.07143},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

% arXiv preprint 1506.06579 (cs.CV).
@misc{yosinski2015understanding,
  author        = {Yosinski, Jason and Clune, Jeff and Nguyen, Anh and Fuchs, Thomas and Lipson, Hod},
  title         = {Understanding Neural Networks Through Deep Visualization},
  year          = {2015},
  eprint        = {1506.06579},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

% arXiv preprint 1312.6199 (cs.CV).
@misc{szegedy2014intriguing,
  author        = {Szegedy, Christian and Zaremba, Wojciech and Sutskever, Ilya and Bruna, Joan and Erhan, Dumitru and Goodfellow, Ian and Fergus, Rob},
  title         = {Intriguing properties of neural networks},
  year          = {2014},
  eprint        = {1312.6199},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV}
}

% arXiv preprint 2309.08600 (cs.LG).
@misc{cunningham2023sparse,
  author        = {Cunningham, Hoagy and Ewart, Aidan and Riggs, Logan and Huben, Robert and Sharkey, Lee},
  title         = {Sparse Autoencoders Find Highly Interpretable Features in Language Models},
  year          = {2023},
  eprint        = {2309.08600},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG}
}

% Distill article identified by DOI; the web address is carried in the note field.
@article{cammarata2020thread,
  author  = {Cammarata, Nick and Carter, Shan and Goh, Gabriel and
             Olah, Chris and Petrov, Michael and Schubert, Ludwig and
             Voss, Chelsea and Egan, Ben and Lim, Swee Kiat},
  title   = {Thread: Circuits},
  journal = {Distill},
  year    = {2020},
  doi     = {10.23915/distill.00024},
  note    = {https://distill.pub/2020/circuits}
}
2 changes: 1 addition & 1 deletion posts/catalog.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"metadata": {},
"source": [
"---\n",
"title: 'A catalog of several million tasks Pythia can do.'\n",
"title: 'A catalog of several million tasks Pythia can do'\n",
"date: 06/25/2023\n",
"author:\n",
" - name: \n",
Expand Down
Binary file added posts/dream_wow.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit cb38c20

Please sign in to comment.