Skip to content

Commit

Permalink
Circuit breaking post
Browse files Browse the repository at this point in the history
  • Loading branch information
tbenthompson committed Jul 12, 2024
1 parent 15a1f9a commit 30d247f
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 60 deletions.
27 changes: 27 additions & 0 deletions posts/biblio.bib
Original file line number Diff line number Diff line change
Expand Up @@ -253,4 +253,31 @@ @article{cammarata2020thread
year = {2020},
note = {https://distill.pub/2020/circuits},
doi = {10.23915/distill.00024}
}

% arXiv preprint 2402.15570 (Sadasivan et al., 2024).
% The arXiv identifier lives in the eprint fields rather than in a fake
% journal name, matching the other arXiv entries in this file.
@misc{sadasivan2024fast,
  author        = {Sadasivan, Vinu Sankar and Saha, Shoumik and Sriramanan, Gaurang and Kattakinda, Priyatham and Chegini, Atoosa and Feizi, Soheil},
  title         = {Fast Adversarial Attacks on Language Models In One {GPU} Minute},
  year          = {2024},
  eprint        = {2402.15570},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2402.15570},
}

% arXiv preprint 2406.04313 (Zou et al., 2024).
@misc{zou2024improvingalignmentrobustnesscircuit,
  author        = {Zou, Andy and Phan, Long and Wang, Justin and Duenas, Derek and Lin, Maxwell and Andriushchenko, Maksym and Wang, Rowan and Kolter, Zico and Fredrikson, Matt and Hendrycks, Dan},
  title         = {Improving Alignment and Robustness with Circuit Breakers},
  year          = {2024},
  eprint        = {2406.04313},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2406.04313},
}

% arXiv preprint 2405.20947 (Cui et al., 2024).
% The benchmark name is braced so sentence-casing styles keep its capitals.
@misc{cui2024orbenchoverrefusalbenchmarklarge,
  author        = {Cui, Justin and Chiang, Wei-Lin and Stoica, Ion and Hsieh, Cho-Jui},
  title         = {{OR-Bench}: An Over-Refusal Benchmark for Large Language Models},
  year          = {2024},
  eprint        = {2405.20947},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2405.20947},
}
Loading

0 comments on commit 30d247f

Please sign in to comment.