diff --git a/.gitmodules b/.gitmodules index 1ab84657..e7917f49 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13062,3 +13062,12 @@ [submodule "2024/05/28/neo4j"] path = 2024/05/28/neo4j url = https://github.com/neo4j/neo4j +[submodule "2024/05/28/ot-node"] + path = 2024/05/28/ot-node + url = https://github.com/OriginTrail/ot-node +[submodule "2024/05/28/ChatDKG"] + path = 2024/05/28/ChatDKG + url = https://github.com/OriginTrail/ChatDKG +[submodule "2024/05/28/polygon-edge"] + path = 2024/05/28/polygon-edge + url = https://github.com/0xPolygon/polygon-edge diff --git a/2024/05/26/note-jmd.org b/2024/05/26/note-jmd.org deleted file mode 100644 index 12a8d08f..00000000 --- a/2024/05/26/note-jmd.org +++ /dev/null @@ -1,172 +0,0 @@ -Thoughts on the cost of compute: - -Hosting -Aquisition of hardware -Transportation of hardware -Time to setup -Networking costs -Storage costs -Physical Space required -Power required -Control/Censorship of server action -Privacy -Data storage -Quantum Proof Encryption -Know your peers -software development -shells and layers of privacy and data. - -* examples -Start9.com -Rocket.chat - javascript - - -* submodule - -#+begin_src shell -grep "url =" /mnt/data1/nix/time/.gitmodules |cut -d= -f2 | cut -b2- |sort -u > submodules.txt - -for x in `grep "> " diff.txt |cut -b3-`' do github submodule add $x; done -diff submodules.txt starsrepos.txt > diff.txt -grep "> " diff.txt |cut -b3- >modules.txt -#+end_src - -adding in all submodules - -* consider git - -consider the objects in multiple git repos -we might even find the same object hashs in multiple -repos. - -#+begin_src shell -`find > files.txt` -#+end_src - -running this will produce a bunch of -.git references. - -#+begin_src shell -find . -name \*.idx -exec git verify-pack -v {} \; -print - -sort < packs.txt |uniq -c |sort -n > report.txt - -grep blob packs.txt |cut "-d " -f1 |sort | uniq -c |sort -n |tail - -** example common blobs -#+begin_src shell -grep blob /mnt/data1/nix/packs.txt |cut "-d " -f1 |sort | uniq -c |sort -n |tail -10 -#+end_src - -#+RESULTS: -| 4 | eafb3fa03a67a7a8046e7ca485bee71b26035da6 | -| 4 | ecb8613a7e4dbf9c1f7772db07f8d50dd7280537 | -| 4 | f1c181ec9c5c921245027c6b452ecfc1d3626364 | -| 5 | d00491fd7e5bb6fa28c517a0bb32b8b506539d4d | -| 6 | f288702d2fa16d3cdf0035b15a9fcbc552cd88e7 | -| 9 | 94a9ed024d3859793618152ea559a168bbcbb5e2 | -| 9 | d645695673349e3947e8e5ae42332d0ac3164cd7 | -| 15 | 261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 | -| 17 | 8b137891791fe96927ad78e64b0aad7bded08bdc | -| 53 | e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 | - - -* model - git repo - git sub repo - pack files -> object files. - common object files between repos - (think autogpt clones) - construct a graph of repositories - containing the same or similar files. - branches and commits connecting those and people. - - repo1: - c file - hash 456, user2 - readme - hash 123, user2 - repo2: - readme - hash 123, user1 - - graph -> dkge, rdf4j - - similarity search, -copies and forks of repo from contents -themselves. -deep LLM embedding or vector of contents. - - attributes: - comprehensive, - search, query, edit, delete, insert, - bootstrapable, - secure, - audited, - open source, - self sustaining, - self modifying, - self describing, - unifying framework. - - usage of knowledge graph: - search - -* bootstrap - -1. guix bootstrap starting from objects with hash BOOTSTRAP1. - in git repo at tree commit version V1. -2. processes running on those files to produce new files with hash O1 -3. 
introspection processes that are tracing those derivations. - strace, ltrace, user probes, log files, valgrind information. -4. this will give us a native partitioning and attention vector to - process the information and we can see what intermediate blocks are created - in memory from those inputs. -5. we can look at each instruction and trace it back to source code, - we can show that the instructions are derived from source. -6. we can look at the graph of the compilation, can show the compilation of - the compiler in relationship to the compilation of the user program in a complex relationship. - - -* history - -#+begin_src shell - grep "url =" /mnt/data1/nix/time/.gitmodules |cut -d= -f2 | cut -b2- |sort -u > submodules.txt - cut -d, -f1 stars.csv | sort -u > starsrepos.txt - diff submodules.txt starsrepos.txt > diff.txt - for x in `grep "> " diff.txt |cut -b3-`;' do git submodule add $x; done - 518 grep "> " diff.txt |cut -b3- >modules.txt - 519 bash ./run.sh - - 618 python3 ./export_stars.py --user h4ck3rm1k3 --github-token XXXX - 627 sort -u < stars.csv > stars2.csv - 628 diff stars.csv stars2.csv - 629 cp stars2.csv stars.csv - 630 git commit -m 'merge' stars.csv - - 653 go install github.com/TimothyJones/csv-check@latest - 662 go install github.com/TimothyJones/csv-clean@latest - 666 ~/go/bin/csv-clean < stars.csv >stars2.csv - 667 cp stars2.csv stars.csv - 670 ~/go/bin/csv-check < stars.csv -#+end_src - -* thoughts. - -1. the creation of, - construction of, - stringing together, - composing parts of, - selection of, - symbols is called semiosis, - it can be formal of bloomy. -2. we can consider stream of - consiousness writing, as also reading - while you are writing, - -* user_lists.json - -get the list of user lists and then pull out one list and clone them -#+begin_src shell -./user_lists.sh > user_lists.json -jq -c '.data[]|.lists|.edges[]|.node|{ "name":.name, "items": .items|.edges[]|.node.url }' ./user_lists.json > ./user_lists2.json -for x in `jq . ./user_lists2.json | grep "p2p git" -C2 |grep https | cut -d: -f2-|cut '-d"' -f2`; do git submodule add $x; done -#+end_src - diff --git a/2024/05/29/notes-jmd.bbl b/2024/05/29/notes-jmd.bbl new file mode 100644 index 00000000..d0eb4e17 --- /dev/null +++ b/2024/05/29/notes-jmd.bbl @@ -0,0 +1,5 @@ +# we can include the github docs in our bib +\bibitem[libp2p(2022)]{libp2p-circuit-relay} +libp2p. +\newblock libp2p circuit relay. +\newblock \url{https://docs.libp2p.io/concepts/nat/circuit-relay/}, 2022. diff --git a/2024/05/29/notes-jmd.org b/2024/05/29/notes-jmd.org new file mode 100644 index 00000000..eef4fe8b --- /dev/null +++ b/2024/05/29/notes-jmd.org @@ -0,0 +1,148 @@ +* executable + +The connection between executable and understandable. + +The translation from human readable text +to computer executable actions. + +The translation from to computer executable actions +to human readable text. + +* libp2p deamon go. + +The petals hivemind project +spawns the p2pd service. + +See these projects: + +./04/27/go-libp2p-daemon/ +./04/27/jvm-libp2p/ +./03/27/hivemind/hivemind/ + + +* How do the humanities create new knowledge? + +New Books Network: Chris Haufe, "Do the Humanities Create Knowledge?" (Cambridge UP, 2023) +Starting from: 00:24:00 +Media file: https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/chtbl.com/track/1C3AGD/traffic.megaphone.fm/NBNK4055109581.mp3?updated=1716820376#t=1619 + +** Consensus + +In science also in humanities especially in science these days. 
+Because sometimes it is difficult to tell the difference between
+science, pseudo-science, and anti-science, at the end of the day we need to
+rely on community consensus. You discuss this epistemically
+like-minded community generating knowledge in the natural sciences.
+In the humanities, maybe not to that extent, we have this sort of
+consensus too, but maybe it's not as rigorous as it is in
+science. So can we say that, in this way, the humanities function more or
+less the same way as science does?
+
+On a social level, there are a lot of similarities.
+One important factor for understanding the similarities is
+the recognition that scientific consensus doesn't really work by
+people voting, or even by coming together and agreeing "this is
+what we're going to believe" or "this is what we're going to adopt".
+That does happen, but it's not typical: the object of consensus does not
+usually rise to that status through an explicit discussion and vote.
+It's like a cultural trend (meme) that just percolates up, and
+it's not just a fad in the way that any old cultural trend might arise.
+Things only really percolate up to that level in the natural sciences
+if they satisfy a bunch of important criteria for scientific knowledge.
+So it's not just anything that's going to be able to rise to that level,
+but when it does, it does so in very much the same fashion:
+it's highly uncoordinated. It's just individuals.
+It's something resonating with individuals, satisfying
+the criteria that they insist upon for their own work.
+
+Moving forward with it, broadly speaking, this is how things become
+exemplars: how things rise to a certain, very general level of acceptance
+in the humanities.
+We're employing different criteria; I don't care how many decimal places
+a result in philosophy or an argument in literary criticism has.
+I have different criteria that I use to govern my acceptance
+of, or my interest in, a result.
+
+But I do employ some criteria. I would assume I inherited much of
+those criteria from my training and from my inclusion as a member
+of this discipline. It is just not an accident when
+some major work, say in philosophy, is published and gains
+very, very broad acceptance.
+It has done so because it resonates so powerfully with so many different
+members of the community. And fundamentally, that is what the
+process of scientific consensus looks like.
+
+** Scientific knowledge is to science as canonical texts are to the humanities.
+
+There's this beautiful quote in your book, this sentence that I
+really love:
+"scientific knowledge is to science as canonical texts are to the humanities."
+
+Think of something like Newton's mechanics or contemporary quantum mechanics
+as a stable set of ideas that are not there for the purpose of being believed
+by other scientists, but are there for the purpose of generating new
+pathways of inquiry that the current generation of scientists will move down.
+(note: this is the growing-stock idea)
+
+That, to me, is what scientific knowledge is: it's a stock that
+generates further inquiry.
+
+From my perspective, this is always the way in which canonical texts
+have functioned in the humanities. They're not there as a
+stock of ideas to be believed and accepted into one's heart; rather,
+what they're there for is to get scholars to reflect on what is
+important in this arena and to frame new inquiries on the basis of those
+reflections.
+
+On the basis of those norms of importance or value that they've inherited,
+norms that are reflected or exemplified in those canonical works.
+There are a lot of results in the Principia that are just not correct,
+and that subsequent generations of scholars did not accept and knew were
+wrong. But they accepted the value of Newton's approach to the study of
+nature, and they were not going to give that up no matter what. Even if
+every result that Newton had published was wrong, it wouldn't have mattered.
+
+It was just such a powerfully well-organized, well-conceived way to
+structure problems, quite apart from the specific claims that Newton makes.
+And that's why we're still using it.
+
+** Reflections
+
+I think we can reflect this onto memes. He does not mention
+memes directly, but he talks of the "stock" growing; this is the vine analogy,
+and we can consider it to be like S-combinators representing
+memes in a continuation.
+
+He talks of this memeification: we have a disciplined group of
+people with criteria for accepting knowledge.
+
+My thoughts on broad resonance remind me of spectral decomposition.
+We can think of different groups as holding values dear that resonate
+with each other. Those groups might be holders of certain meme coins or memes,
+or be engaged with certain behaviours or mimicry and the furtherance of those.
+
+My thoughts on broad resonance also lead me to the etymology
+of the word, sonus and hearing, and we can think of many parts or frequencies
+coming together in harmony, like music. This leads us back to the story of
+the Muses and of Mnemosyne, the mother of the Muses, who is cultural memory,
+and to how these timeless metaphors might resonate with the idea of
+consciousness itself.
+
+We can think of the transformers paper as one such paper that changed
+how people think.
See the +The TWIML AI Podcast (formerly This Week in Machine Learning & Artificial Intelligence): Language Understanding and LLMs with Christopher Manning - #686 +Episode webpage: https://twimlai.com/podcast/twimlai/language-understanding-and-llms/ + + + +* bibliography + +we can include the github docs in our bib + + + + + + + diff --git a/2024/05/29/papers/2002.04013/algorithm.sty b/2024/05/29/papers/2002.04013/algorithm.sty new file mode 100644 index 00000000..843e3d5b --- /dev/null +++ b/2024/05/29/papers/2002.04013/algorithm.sty @@ -0,0 +1,79 @@ +% ALGORITHM STYLE -- Released 8 April 1996 +% for LaTeX-2e +% Copyright -- 1994 Peter Williams +% E-mail Peter.Williams@dsto.defence.gov.au +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{algorithm} +\typeout{Document Style `algorithm' - floating environment} + +\RequirePackage{float} +\RequirePackage{ifthen} +\newcommand{\ALG@within}{nothing} +\newboolean{ALG@within} +\setboolean{ALG@within}{false} +\newcommand{\ALG@floatstyle}{ruled} +\newcommand{\ALG@name}{Algorithm} +\newcommand{\listalgorithmname}{List of \ALG@name s} + +% Declare Options +% first appearance +\DeclareOption{plain}{ + \renewcommand{\ALG@floatstyle}{plain} +} +\DeclareOption{ruled}{ + \renewcommand{\ALG@floatstyle}{ruled} +} +\DeclareOption{boxed}{ + \renewcommand{\ALG@floatstyle}{boxed} +} +% then numbering convention +\DeclareOption{part}{ + \renewcommand{\ALG@within}{part} + \setboolean{ALG@within}{true} +} +\DeclareOption{chapter}{ + \renewcommand{\ALG@within}{chapter} + \setboolean{ALG@within}{true} +} +\DeclareOption{section}{ + \renewcommand{\ALG@within}{section} + \setboolean{ALG@within}{true} +} +\DeclareOption{subsection}{ + \renewcommand{\ALG@within}{subsection} + \setboolean{ALG@within}{true} +} +\DeclareOption{subsubsection}{ + \renewcommand{\ALG@within}{subsubsection} + \setboolean{ALG@within}{true} +} +\DeclareOption{nothing}{ + \renewcommand{\ALG@within}{nothing} + \setboolean{ALG@within}{true} +} +\DeclareOption*{\edef\ALG@name{\CurrentOption}} + +% ALGORITHM +% +\ProcessOptions +\floatstyle{\ALG@floatstyle} +\ifthenelse{\boolean{ALG@within}}{ + \ifthenelse{\equal{\ALG@within}{part}} + {\newfloat{algorithm}{htbp}{loa}[part]}{} + \ifthenelse{\equal{\ALG@within}{chapter}} + {\newfloat{algorithm}{htbp}{loa}[chapter]}{} + \ifthenelse{\equal{\ALG@within}{section}} + {\newfloat{algorithm}{htbp}{loa}[section]}{} + \ifthenelse{\equal{\ALG@within}{subsection}} + {\newfloat{algorithm}{htbp}{loa}[subsection]}{} + \ifthenelse{\equal{\ALG@within}{subsubsection}} + {\newfloat{algorithm}{htbp}{loa}[subsubsection]}{} + \ifthenelse{\equal{\ALG@within}{nothing}} + {\newfloat{algorithm}{htbp}{loa}}{} +}{ + \newfloat{algorithm}{htbp}{loa} +} +\floatname{algorithm}{\ALG@name} + +\newcommand{\listofalgorithms}{\listof{algorithm}{\listalgorithmname}} + diff --git a/2024/05/29/papers/2002.04013/algorithmic.sty b/2024/05/29/papers/2002.04013/algorithmic.sty new file mode 100644 index 00000000..ad614783 --- /dev/null +++ b/2024/05/29/papers/2002.04013/algorithmic.sty @@ -0,0 +1,201 @@ +% ALGORITHMIC STYLE -- Released 8 APRIL 1996 +% for LaTeX version 2e +% Copyright -- 1994 Peter Williams +% E-mail PeterWilliams@dsto.defence.gov.au +% +% Modified by Alex Smola (08/2000) +% E-mail Alex.Smola@anu.edu.au +% +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{algorithmic} +\typeout{Document Style `algorithmic' - environment} +% +\RequirePackage{ifthen} +\RequirePackage{calc} +\newboolean{ALC@noend} +\setboolean{ALC@noend}{false} +\newcounter{ALC@line} +\newcounter{ALC@rem} +\newlength{\ALC@tlm} +% 
+\DeclareOption{noend}{\setboolean{ALC@noend}{true}} +% +\ProcessOptions +% +% ALGORITHMIC +\newcommand{\algorithmicrequire}{\textbf{Require:}} +\newcommand{\algorithmicensure}{\textbf{Ensure:}} +\newcommand{\algorithmiccomment}[1]{\{#1\}} +\newcommand{\algorithmicend}{\textbf{end}} +\newcommand{\algorithmicif}{\textbf{if}} +\newcommand{\algorithmicthen}{\textbf{then}} +\newcommand{\algorithmicelse}{\textbf{else}} +\newcommand{\algorithmicelsif}{\algorithmicelse\ \algorithmicif} +\newcommand{\algorithmicendif}{\algorithmicend\ \algorithmicif} +\newcommand{\algorithmicfor}{\textbf{for}} +\newcommand{\algorithmicforall}{\textbf{for all}} +\newcommand{\algorithmicdo}{\textbf{do}} +\newcommand{\algorithmicendfor}{\algorithmicend\ \algorithmicfor} +\newcommand{\algorithmicwhile}{\textbf{while}} +\newcommand{\algorithmicendwhile}{\algorithmicend\ \algorithmicwhile} +\newcommand{\algorithmicloop}{\textbf{loop}} +\newcommand{\algorithmicendloop}{\algorithmicend\ \algorithmicloop} +\newcommand{\algorithmicrepeat}{\textbf{repeat}} +\newcommand{\algorithmicuntil}{\textbf{until}} + +%changed by alex smola +\newcommand{\algorithmicinput}{\textbf{input}} +\newcommand{\algorithmicoutput}{\textbf{output}} +\newcommand{\algorithmicset}{\textbf{set}} +\newcommand{\algorithmictrue}{\textbf{true}} +\newcommand{\algorithmicfalse}{\textbf{false}} +\newcommand{\algorithmicand}{\textbf{and\ }} +\newcommand{\algorithmicor}{\textbf{or\ }} +\newcommand{\algorithmicfunction}{\textbf{function}} +\newcommand{\algorithmicendfunction}{\algorithmicend\ \algorithmicfunction} +\newcommand{\algorithmicmain}{\textbf{main}} +\newcommand{\algorithmicendmain}{\algorithmicend\ \algorithmicmain} +%end changed by alex smola + +\def\ALC@item[#1]{% +\if@noparitem \@donoparitem + \else \if@inlabel \indent \par \fi + \ifhmode \unskip\unskip \par \fi + \if@newlist \if@nobreak \@nbitem \else + \addpenalty\@beginparpenalty + \addvspace\@topsep \addvspace{-\parskip}\fi + \else \addpenalty\@itempenalty \addvspace\itemsep + \fi + \global\@inlabeltrue +\fi +\everypar{\global\@minipagefalse\global\@newlistfalse + \if@inlabel\global\@inlabelfalse \hskip -\parindent \box\@labels + \penalty\z@ \fi + \everypar{}}\global\@nobreakfalse +\if@noitemarg \@noitemargfalse \if@nmbrlist \refstepcounter{\@listctr}\fi \fi +\sbox\@tempboxa{\makelabel{#1}}% +\global\setbox\@labels + \hbox{\unhbox\@labels \hskip \itemindent + \hskip -\labelwidth \hskip -\ALC@tlm + \ifdim \wd\@tempboxa >\labelwidth + \box\@tempboxa + \else \hbox to\labelwidth {\unhbox\@tempboxa}\fi + \hskip \ALC@tlm}\ignorespaces} +% +\newenvironment{algorithmic}[1][0]{ +\let\@item\ALC@item + \newcommand{\ALC@lno}{% +\ifthenelse{\equal{\arabic{ALC@rem}}{0}} +{{\footnotesize \arabic{ALC@line}:}}{}% +} +\let\@listii\@listi +\let\@listiii\@listi +\let\@listiv\@listi +\let\@listv\@listi +\let\@listvi\@listi +\let\@listvii\@listi + \newenvironment{ALC@g}{ + \begin{list}{\ALC@lno}{ \itemsep\z@ \itemindent\z@ + \listparindent\z@ \rightmargin\z@ + \topsep\z@ \partopsep\z@ \parskip\z@\parsep\z@ + \leftmargin 1em + \addtolength{\ALC@tlm}{\leftmargin} + } + } + {\end{list}} + \newcommand{\ALC@it}{\addtocounter{ALC@line}{1}\addtocounter{ALC@rem}{1}\ifthenelse{\equal{\arabic{ALC@rem}}{#1}}{\setcounter{ALC@rem}{0}}{}\item} + \newcommand{\ALC@com}[1]{\ifthenelse{\equal{##1}{default}}% +{}{\ \algorithmiccomment{##1}}} + \newcommand{\REQUIRE}{\item[\algorithmicrequire]} + \newcommand{\ENSURE}{\item[\algorithmicensure]} + \newcommand{\STATE}{\ALC@it} + \newcommand{\COMMENT}[1]{\algorithmiccomment{##1}} +%changes 
by alex smola + \newcommand{\INPUT}{\item[\algorithmicinput]} + \newcommand{\OUTPUT}{\item[\algorithmicoutput]} + \newcommand{\SET}{\item[\algorithmicset]} +% \newcommand{\TRUE}{\algorithmictrue} +% \newcommand{\FALSE}{\algorithmicfalse} + \newcommand{\AND}{\algorithmicand} + \newcommand{\OR}{\algorithmicor} + \newenvironment{ALC@func}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@main}{\begin{ALC@g}}{\end{ALC@g}} +%end changes by alex smola + \newenvironment{ALC@if}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@for}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@whl}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@loop}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@rpt}{\begin{ALC@g}}{\end{ALC@g}} + \renewcommand{\\}{\@centercr} + \newcommand{\IF}[2][default]{\ALC@it\algorithmicif\ ##2\ \algorithmicthen% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\SHORTIF}[2]{\ALC@it\algorithmicif\ ##1\ + \algorithmicthen\ {##2}} + \newcommand{\ELSE}[1][default]{\end{ALC@if}\ALC@it\algorithmicelse% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\ELSIF}[2][default]% +{\end{ALC@if}\ALC@it\algorithmicelsif\ ##2\ \algorithmicthen% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\FOR}[2][default]{\ALC@it\algorithmicfor\ ##2\ \algorithmicdo% +\ALC@com{##1}\begin{ALC@for}} + \newcommand{\FORALL}[2][default]{\ALC@it\algorithmicforall\ ##2\ % +\algorithmicdo% +\ALC@com{##1}\begin{ALC@for}} + \newcommand{\SHORTFORALL}[2]{\ALC@it\algorithmicforall\ ##1\ % + \algorithmicdo\ {##2}} + \newcommand{\WHILE}[2][default]{\ALC@it\algorithmicwhile\ ##2\ % +\algorithmicdo% +\ALC@com{##1}\begin{ALC@whl}} + \newcommand{\LOOP}[1][default]{\ALC@it\algorithmicloop% +\ALC@com{##1}\begin{ALC@loop}} +%changed by alex smola + \newcommand{\FUNCTION}[2][default]{\ALC@it\algorithmicfunction\ ##2\ % + \ALC@com{##1}\begin{ALC@func}} + \newcommand{\MAIN}[2][default]{\ALC@it\algorithmicmain\ ##2\ % + \ALC@com{##1}\begin{ALC@main}} +%end changed by alex smola + \newcommand{\REPEAT}[1][default]{\ALC@it\algorithmicrepeat% + \ALC@com{##1}\begin{ALC@rpt}} + \newcommand{\UNTIL}[1]{\end{ALC@rpt}\ALC@it\algorithmicuntil\ ##1} + \ifthenelse{\boolean{ALC@noend}}{ + \newcommand{\ENDIF}{\end{ALC@if}} + \newcommand{\ENDFOR}{\end{ALC@for}} + \newcommand{\ENDWHILE}{\end{ALC@whl}} + \newcommand{\ENDLOOP}{\end{ALC@loop}} + \newcommand{\ENDFUNCTION}{\end{ALC@func}} + \newcommand{\ENDMAIN}{\end{ALC@main}} + }{ + \newcommand{\ENDIF}{\end{ALC@if}\ALC@it\algorithmicendif} + \newcommand{\ENDFOR}{\end{ALC@for}\ALC@it\algorithmicendfor} + \newcommand{\ENDWHILE}{\end{ALC@whl}\ALC@it\algorithmicendwhile} + \newcommand{\ENDLOOP}{\end{ALC@loop}\ALC@it\algorithmicendloop} + \newcommand{\ENDFUNCTION}{\end{ALC@func}\ALC@it\algorithmicendfunction} + \newcommand{\ENDMAIN}{\end{ALC@main}\ALC@it\algorithmicendmain} + } + \renewcommand{\@toodeep}{} + \begin{list}{\ALC@lno}{\setcounter{ALC@line}{0}\setcounter{ALC@rem}{0}% + \itemsep\z@ \itemindent\z@ \listparindent\z@% + \partopsep\z@ \parskip\z@ \parsep\z@% + \labelsep 0.5em \topsep 0.2em% + \ifthenelse{\equal{#1}{0}} + {\labelwidth 0.5em } + {\labelwidth 1.2em } + \leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep} + \ALC@tlm\labelsep + } + } + {\end{list}} + + + + + + + + + + + + + + diff --git a/2024/05/29/papers/2002.04013/discussion.tex b/2024/05/29/papers/2002.04013/discussion.tex new file mode 100644 index 00000000..54808b05 --- /dev/null +++ b/2024/05/29/papers/2002.04013/discussion.tex @@ -0,0 +1,39 @@ +\section*{Broader Impact} +\label{sect:broader} +\vspace{-4px} + +The approach proposed in this 
work is only a prototype with limited direct consequences, but the long-term goal of training huge models with volunteer computing can have a lasting effect on both the research community and the general public.
+
+\vspace{-6px}
+\subsection*{Funding bias vs crowdsourcing bias}
+\vspace{-6px}
+The main positive outcome we pursue is to let researchers harness volunteer computing and train models on the scale currently available only to large corporations. Ideally, a deep learning researcher with a promising idea will be able to amass the computation needed to realize this idea by involving volunteers. However, the project's appeal for volunteers depends on many factors such as subject area, current societal trends, and even the researcher's personality.
+
+For example, a project about teaching agents to play games~\cite{lc0} or fighting global pandemics~\cite{folding_covid} is likely to attract more resources than deep learning applied to soil science. In essence, volunteer computing is biased towards exciting or socially relevant research the same way as traditional HPC is biased towards the interests of those who fund it.
+
+\vspace{-6px}
+\subsection*{Alternative use and misuse}
+\vspace{-6px}
+The proposed technology can be used with different economic models. If a deep learning system is immediately useful (e.g. for machine translation, information retrieval, etc.), the participants could use it for their needs based on their contributions to training. This can take many forms: several labs combining their hardware and training larger models; a web service that lets people contribute their compute instead of using ads/subscriptions; or simply a framework that someone can use to run distributed training across two or more datacenters.
+
+Unfortunately, this also allows several opportunities for malicious use. If a machine is hacked, the attacker can use its compute unnoticed by the machine owner --- much the same way that botnets are currently used to mine cryptocurrencies. Furthermore, due to their decentralized nature, even legitimate Learning@home projects can be hijacked by hackers.
+
+\vspace{-6px}
+\subsection*{Security}
+\vspace{-6px}
+Using crowdsourced hardware makes Learning@home susceptible to attacks from malicious participants. There are multiple attack vectors already known in the P2P community: denial-of-service attacks, Sybil attacks, Eclipse attacks and more \cite{urdaneta2011survey, sybil_attacks_dht, dos_resistance, sybil_nodes}. Fortunately, there are variations of the DHT protocol that make it resistant to said attacks: if a reader wishes to learn more about DHT security, we recommend starting with \cite{urdaneta2011survey}.
+
+Another source of vulnerability stems from the sequential nature of neural networks. If a single expert were to return incorrect (e.g. NaN) outputs or gradients, it could compromise the outputs of the entire network and even poison adjacent nodes through backpropagation. Recent studies expose similar attack patterns on federated learning systems \cite{bagdasaryan2018backdoor, bhagoji2018analyzing}.
+
+The redundant nature of mixture-of-experts layers provides some degree of resistance against those attacks. A single malicious expert will only affect a small fraction of inputs that pass through this specific expert. Furthermore, a trainer with access to predictions from multiple experts could provide a higher degree of robustness by using statistical techniques (e.g., by ignoring outlier gradients). 
However, such techniques need to be carefully designed so as not to introduce harmful side effects. + +\vspace{-6px} +\subsection*{The burden on the network} +\vspace{-6px} +Finally, we would like to point out the potential harm that our approach can do to network infrastructure. The experiments we ran in Section \ref{sect:exp_throughput} saturate with the bandwidth of $100-200$Mbps, most of which is tensors passed between experts and trainers. + +This coincides with the typical home internet speed available in major cities of developed countries. However, not all ISPs design their infrastructure for users who always use up all their bandwidth. If too many Learning@home participants are located in one LAN or MAN, it can cause congestion or even failures in the network infrastructure. + +Similar situations frequently took place in late 2000s due to growing popularity of BitTorrent for file sharing. Fortunately, the network infrastructure is continually improving, which leads us to believe that this problem will eventually be solved. Until then, we describe several ways to reduce network load of Learning@home in Appendix E. + + diff --git a/2024/05/29/papers/2002.04013/experiments.tex b/2024/05/29/papers/2002.04013/experiments.tex new file mode 100644 index 00000000..b12d18f4 --- /dev/null +++ b/2024/05/29/papers/2002.04013/experiments.tex @@ -0,0 +1,105 @@ +\vspace{-6px} +\section{Experiments}\label{sect:experiments} +\vspace{-4px} + +The design of Learning@home was driven by two key assumptions: first, that MoE-based architectures can maintain high throughput under latency and second, that they can converge despite the presence of stale gradients. In this section we run several benchmarks in order to verify these assumptions. We intentionally focus on small-scale experiments to make them easier to reproduce and analyze. While solving practical vision and NLP problems is certainly our end goal, choosing a particular task would make it much harder to understand the general properties of our approach. + +\vspace{-6px} +\subsection{Model throughput}\label{sect:exp_throughput} +\vspace{-4px} + +Our first benchmark evaluates the performance of asynchronous training schemes under latency. We quantify this with training throughput, i.e., the number of training batches processed per second. +To emulate the distributed training environment, we create a model from a large number of identical blocks distributed evenly across 4 NVIDIA GTX 1080 GPUs. +We simulate network latency by adding an artificial delay after computation of each block. The delay time is sampled from the exponential distribution, which was shown to model latency well \cite{sukhov2016generating}.% + +\vspace{-1px} + +Since our model size exceeds the memory limits of a single consumer GPU, the only mainstream paradigm that can compete with Learning@home is model parallel training. We also report the ``upper bound'' on training throughput by running the same computations with no network delays in a model parallel regime with pipelining similar to~\cite{huang2019gpipe}. For Learning@home, we use 64 trainer processes to send requests to the runtime processes\footnote{See the full setup: \url{https://github.com/mryab/learning-at-home\#running-the-experiments}}. + +\vspace{-1px} + +To measure the effect on blocks with different computation to communication ratio, we evaluate two popular block architectures. 
The first architecture is composed of $224$ feed-forward blocks, each having hidden dimensions of $1024\to4096\to4096\to 1024$ with layer normalization and ReLU activations in between. These blocks are treated as separate ``experts'' and process batches of size $2048$. The second architecture consists of $224$ BERT-like Transformer blocks \cite{bert} with hidden dimension 1024 and GELU activations \cite{hendrycks2016gaussian} applied to sequences of length $512$ with batch size $4$. + +\vspace{-1px} + +With this setup in mind, we can measure the throughput of the entire model as the time it takes to process 10 batches and dividing it by the total number of processed examples. These experiments were repeated 5 times for all methods to measure the mean and standard deviation of throughput. + +\vspace{-1px} + +Figure~\ref{fig:throughput} demonstrates that even with delay times approaching 200ms the asynchronous scheduler we have implemented as part of Learning@home maintains nearly the same throughput. In turn, model-parallel training throughput quickly degrades under latency, which is not surprising as it was not designed with slow communication in mind. % + +\vspace{-1px} + +To verify the validity of our conclusions, we have conducted similar experiments on cloud GPU instances in different regions. +This allows us to measure performance in a non-simulated scenario closer to the desired area of application. +In particular, we rented 3 instances with Tesla K80 hosted in West US, East US, and West Europe with average network latency of $92.49\pm32.42$ ms. The throughput values in Table \ref{tab:cloudk80} are similar to results for simulated latencies~(Figure \ref{fig:throughput}). + +\vspace{-1px} +Finally, we tested the scalability of our infrastructure by deploying DHT nodes in the same cloud regions and measuring the latency of beam search~(batch size 64, see Appendix C). Finding top-4 experts took $317\pm58$ms for 100 nodes, $528\pm127$ms for 1,000 nodes and $764\pm106$ms for 10,000 DHT nodes. + +\begin{figure}[h] +\vspace{-2px} + \hspace{-24px}\begin{minipage}{0.6\textwidth} + \centering + \includegraphics[width=210px]{resources/throughput_new.pdf} + \captionof{figure}{Throughput with simulated latency.} + \label{fig:throughput} + \end{minipage} + \hspace{-10px} + \begin{minipage}{0.48\textwidth} + \setlength{\tabcolsep}{2pt} + \begin{tabular}{ccc} + \toprule + \multirow{2}{*}{Approach} & \multirow{2}{*}{Feed-forward} & Transformer \\ + & & encoder \\ + \midrule + Model parallel & $7.23 \pm 0.06$ & $0.01 \pm 0.001$\\ + Learning@home & $300.8\pm 15.9$ & $0.68 \pm 0.01$\\ + \bottomrule + \end{tabular} + \captionof{table}{Throughput (samples/s) for 3 cloud K80 in East US, West US and West Europe.} + \label{tab:cloudk80} + \end{minipage} +\end{figure} + +\vspace{-16px} + +\subsection{Convergence}\label{sect:exp_convergence} +\vspace{-4px} + +Our second experiment aims to verify the robustness of DMoE to delayed updates. +For this goal, we choose one of the simpler tasks in deep learning, namely the MNIST digit recognition dataset \cite{mnist}, and compare convergence rates under varying network latency. All modern architectures can reliably solve this task, making it easier for us to isolate the effect of gradient staleness. + +We evaluate four models: a traditional feed-forward model and three DMoE variations with different numbers of experts. The feed-forward network (FFN) consists of 4 stacked feed-forward blocks. 
Each block architecture is same as described in Section~\ref{sect:exp_throughput}, but with half as many hidden units. In turn, its DMoE counterparts have four DMoE layers, each composed of blocks with 1/4 of the FFN size. Both DMoE-based models use only 4 experts at a time regardless of their total number, hence being computationally equivalent to the FFN baseline. + +We train all models asynchronously in high-latency and low-latency scenarios, using the same distribution for delay. In the high-latency scenario, each of 64 workers is delayed for 1 second on average while processing a batch. This corresponds to 125ms for each forward and backward pass through DMoE. For low latency emulation, we use 16 workers and 100ms average delay. The third experiment simulates node failure: each expert does not respond to a request with probability 0.1. + +The results are presented in Figure \ref{fig:convergence_mnist}; as expected, the plots demonstrate that the higher latency scenario is more difficult for all models. However, the degree to which it affects the performance of DMoE architectures is much lower, especially for the largest of mixtures. + +\begin{figure}[h] +\vspace{-6px} + \begin{minipage}{0.99\textwidth} + \hspace{-16px}\includegraphics[width=417px]{resources/convergence.pdf} + \end{minipage} + \vspace{-4px} + \captionof{figure}{Convergence plots for feedforward models with different network latencies and failure rates. Pale areas on depict unbiased standard deviations over 5 runs.} + \label{fig:convergence_mnist} +\end{figure} + +\vspace{-4px} + +\subsection{Language models}\label{sect:exp_lm} + +The third and final benchmark is neural language modeling. Specifically, we train Transformer-XL~\cite{dai2019transformer} on the WikiText-2~\cite{wikitext2} dataset. Both baseline and DMoE models use official recommended parameters with additional regularization proposed in \cite{dettmerswikitext2}. + +The \texttt{base} model contains $16$ Transformer layers with the hidden size of $400$ and $900$ units in the feedforward layer. We also train a \texttt{small} baseline model with $200$ hidden and $450$ feedforward units. Our DMoE Transformer uses $256$ experts split evenly between $16$ layers. Each expert is a Transformer layer with the same dimensions as layers of the \texttt{small} baseline model. The DMoE layers route to top-$4$ experts, making our model roughly equivalent to \texttt{base} in terms of FLOPs per sample. Similarly to Section~\ref{sect:exp_convergence}, we train DMoE with 32 trainers (batch size 1 each), $1000$ms average latency, and $10\%$ failure rate. + +\begin{figure}[h] +\vspace{-2px} + \centering + \includegraphics[width=0.5\textwidth]{resources/convergence_wikitext.pdf} + \captionof{figure}{Convergence plots for Transformer language models on the WikiText-2 dataset. Pale areas on depict unbiased standard deviations over 5 runs.} + \label{fig:convergence_lm} +\end{figure} + +The results depicted in Figure~\ref{fig:convergence_lm} demonstrate a similar pattern to what was previously observed on feedforward networks. Curiously enough, we found that in this specific scenario the $10\%$ failure rate has a positive effect on the DMoE performance. We attribute this effect to a form of dropout regularization that prevents our model from overfitting the limited training data. 
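The throughput benchmark described in the experiments section is easy to restate in code. Below is a minimal, self-contained Python sketch of the measurement loop only, not the Learning@home implementation: it strings a few dummy blocks together, adds an exponentially distributed delay after each block (as the paper describes for its latency simulation), and divides the number of processed examples by the wall-clock time for 10 batches. All constants (block count, batch size, mean delay) are placeholder values chosen for illustration, not the paper's setup.

#+begin_src python
# Minimal sketch of the throughput-under-latency measurement (illustrative only).
import time
import random

NUM_BLOCKS = 8          # stand-in for the paper's 224 blocks
BATCH_SIZE = 2048       # examples per batch (feed-forward setting)
NUM_BATCHES = 10        # throughput is measured over 10 batches
MEAN_DELAY_S = 0.010    # mean of the exponential network delay, in seconds

def run_block() -> None:
    """Pretend to compute one block, then wait for the simulated network delay."""
    # A real benchmark would run a forward pass here; we only model the delay.
    time.sleep(random.expovariate(1.0 / MEAN_DELAY_S))

def process_batch() -> None:
    """Send one batch sequentially through every block (model-parallel style)."""
    for _ in range(NUM_BLOCKS):
        run_block()

if __name__ == "__main__":
    start = time.perf_counter()
    for _ in range(NUM_BATCHES):
        process_batch()
    elapsed = time.perf_counter() - start
    throughput = NUM_BATCHES * BATCH_SIZE / elapsed
    print(f"{throughput:.1f} samples/s over {NUM_BATCHES} batches")
#+end_src

In this sequential sketch the per-block delays simply add up, which is the behaviour the paper reports for model-parallel training; the asynchronous Learning@home scheduler keeps many batches in flight at once so that the delays overlap instead.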
diff --git a/2024/05/29/papers/2002.04013/fancyhdr.sty b/2024/05/29/papers/2002.04013/fancyhdr.sty new file mode 100644 index 00000000..77ed4e30 --- /dev/null +++ b/2024/05/29/papers/2002.04013/fancyhdr.sty @@ -0,0 +1,485 @@ +% fancyhdr.sty version 3.2 +% Fancy headers and footers for LaTeX. +% Piet van Oostrum, +% Dept of Computer and Information Sciences, University of Utrecht, +% Padualaan 14, P.O. Box 80.089, 3508 TB Utrecht, The Netherlands +% Telephone: +31 30 2532180. Email: piet@cs.uu.nl +% ======================================================================== +% LICENCE: +% This file may be distributed under the terms of the LaTeX Project Public +% License, as described in lppl.txt in the base LaTeX distribution. +% Either version 1 or, at your option, any later version. +% ======================================================================== +% MODIFICATION HISTORY: +% Sep 16, 1994 +% version 1.4: Correction for use with \reversemargin +% Sep 29, 1994: +% version 1.5: Added the \iftopfloat, \ifbotfloat and \iffloatpage commands +% Oct 4, 1994: +% version 1.6: Reset single spacing in headers/footers for use with +% setspace.sty or doublespace.sty +% Oct 4, 1994: +% version 1.7: changed \let\@mkboth\markboth to +% \def\@mkboth{\protect\markboth} to make it more robust +% Dec 5, 1994: +% version 1.8: corrections for amsbook/amsart: define \@chapapp and (more +% importantly) use the \chapter/sectionmark definitions from ps@headings if +% they exist (which should be true for all standard classes). +% May 31, 1995: +% version 1.9: The proposed \renewcommand{\headrulewidth}{\iffloatpage... +% construction in the doc did not work properly with the fancyplain style. +% June 1, 1995: +% version 1.91: The definition of \@mkboth wasn't restored on subsequent +% \pagestyle{fancy}'s. +% June 1, 1995: +% version 1.92: The sequence \pagestyle{fancyplain} \pagestyle{plain} +% \pagestyle{fancy} would erroneously select the plain version. +% June 1, 1995: +% version 1.93: \fancypagestyle command added. +% Dec 11, 1995: +% version 1.94: suggested by Conrad Hughes +% CJCH, Dec 11, 1995: added \footruleskip to allow control over footrule +% position (old hardcoded value of .3\normalbaselineskip is far too high +% when used with very small footer fonts). +% Jan 31, 1996: +% version 1.95: call \@normalsize in the reset code if that is defined, +% otherwise \normalsize. +% this is to solve a problem with ucthesis.cls, as this doesn't +% define \@currsize. Unfortunately for latex209 calling \normalsize doesn't +% work as this is optimized to do very little, so there \@normalsize should +% be called. Hopefully this code works for all versions of LaTeX known to +% mankind. +% April 25, 1996: +% version 1.96: initialize \headwidth to a magic (negative) value to catch +% most common cases that people change it before calling \pagestyle{fancy}. +% Note it can't be initialized when reading in this file, because +% \textwidth could be changed afterwards. This is quite probable. +% We also switch to \MakeUppercase rather than \uppercase and introduce a +% \nouppercase command for use in headers. and footers. +% May 3, 1996: +% version 1.97: Two changes: +% 1. Undo the change in version 1.8 (using the pagestyle{headings} defaults +% for the chapter and section marks. The current version of amsbook and +% amsart classes don't seem to need them anymore. Moreover the standard +% latex classes don't use \markboth if twoside isn't selected, and this is +% confusing as \leftmark doesn't work as expected. +% 2. 
include a call to \ps@empty in ps@@fancy. This is to solve a problem +% in the amsbook and amsart classes, that make global changes to \topskip, +% which are reset in \ps@empty. Hopefully this doesn't break other things. +% May 7, 1996: +% version 1.98: +% Added % after the line \def\nouppercase +% May 7, 1996: +% version 1.99: This is the alpha version of fancyhdr 2.0 +% Introduced the new commands \fancyhead, \fancyfoot, and \fancyhf. +% Changed \headrulewidth, \footrulewidth, \footruleskip to +% macros rather than length parameters, In this way they can be +% conditionalized and they don't consume length registers. There is no need +% to have them as length registers unless you want to do calculations with +% them, which is unlikely. Note that this may make some uses of them +% incompatible (i.e. if you have a file that uses \setlength or \xxxx=) +% May 10, 1996: +% version 1.99a: +% Added a few more % signs +% May 10, 1996: +% version 1.99b: +% Changed the syntax of \f@nfor to be resistent to catcode changes of := +% Removed the [1] from the defs of \lhead etc. because the parameter is +% consumed by the \@[xy]lhead etc. macros. +% June 24, 1997: +% version 1.99c: +% corrected \nouppercase to also include the protected form of \MakeUppercase +% \global added to manipulation of \headwidth. +% \iffootnote command added. +% Some comments added about \@fancyhead and \@fancyfoot. +% Aug 24, 1998 +% version 1.99d +% Changed the default \ps@empty to \ps@@empty in order to allow +% \fancypagestyle{empty} redefinition. +% Oct 11, 2000 +% version 2.0 +% Added LPPL license clause. +% +% A check for \headheight is added. An errormessage is given (once) if the +% header is too large. Empty headers don't generate the error even if +% \headheight is very small or even 0pt. +% Warning added for the use of 'E' option when twoside option is not used. +% In this case the 'E' fields will never be used. +% +% Mar 10, 2002 +% version 2.1beta +% New command: \fancyhfoffset[place]{length} +% defines offsets to be applied to the header/footer to let it stick into +% the margins (if length > 0). +% place is like in fancyhead, except that only E,O,L,R can be used. +% This replaces the old calculation based on \headwidth and the marginpar +% area. +% \headwidth will be dynamically calculated in the headers/footers when +% this is used. +% +% Mar 26, 2002 +% version 2.1beta2 +% \fancyhfoffset now also takes h,f as possible letters in the argument to +% allow the header and footer widths to be different. +% New commands \fancyheadoffset and \fancyfootoffset added comparable to +% \fancyhead and \fancyfoot. +% Errormessages and warnings have been made more informative. +% +% Dec 9, 2002 +% version 2.1 +% The defaults for \footrulewidth, \plainheadrulewidth and +% \plainfootrulewidth are changed from \z@skip to 0pt. In this way when +% someone inadvertantly uses \setlength to change any of these, the value +% of \z@skip will not be changed, rather an errormessage will be given. + +% March 3, 2004 +% Release of version 3.0 + +% Oct 7, 2004 +% version 3.1 +% Added '\endlinechar=13' to \fancy@reset to prevent problems with +% includegraphics in header when verbatiminput is active. + +% March 22, 2005 +% version 3.2 +% reset \everypar (the real one) in \fancy@reset because spanish.ldf does +% strange things with \everypar between << and >>. 
+ +\def\ifancy@mpty#1{\def\temp@a{#1}\ifx\temp@a\@empty} + +\def\fancy@def#1#2{\ifancy@mpty{#2}\fancy@gbl\def#1{\leavevmode}\else + \fancy@gbl\def#1{#2\strut}\fi} + +\let\fancy@gbl\global + +\def\@fancyerrmsg#1{% + \ifx\PackageError\undefined + \errmessage{#1}\else + \PackageError{Fancyhdr}{#1}{}\fi} +\def\@fancywarning#1{% + \ifx\PackageWarning\undefined + \errmessage{#1}\else + \PackageWarning{Fancyhdr}{#1}{}\fi} + +% Usage: \@forc \var{charstring}{command to be executed for each char} +% This is similar to LaTeX's \@tfor, but expands the charstring. + +\def\@forc#1#2#3{\expandafter\f@rc\expandafter#1\expandafter{#2}{#3}} +\def\f@rc#1#2#3{\def\temp@ty{#2}\ifx\@empty\temp@ty\else + \f@@rc#1#2\f@@rc{#3}\fi} +\def\f@@rc#1#2#3\f@@rc#4{\def#1{#2}#4\f@rc#1{#3}{#4}} + +% Usage: \f@nfor\name:=list\do{body} +% Like LaTeX's \@for but an empty list is treated as a list with an empty +% element + +\newcommand{\f@nfor}[3]{\edef\@fortmp{#2}% + \expandafter\@forloop#2,\@nil,\@nil\@@#1{#3}} + +% Usage: \def@ult \cs{defaults}{argument} +% sets \cs to the characters from defaults appearing in argument +% or defaults if it would be empty. All characters are lowercased. + +\newcommand\def@ult[3]{% + \edef\temp@a{\lowercase{\edef\noexpand\temp@a{#3}}}\temp@a + \def#1{}% + \@forc\tmpf@ra{#2}% + {\expandafter\if@in\tmpf@ra\temp@a{\edef#1{#1\tmpf@ra}}{}}% + \ifx\@empty#1\def#1{#2}\fi} +% +% \if@in +% +\newcommand{\if@in}[4]{% + \edef\temp@a{#2}\def\temp@b##1#1##2\temp@b{\def\temp@b{##1}}% + \expandafter\temp@b#2#1\temp@b\ifx\temp@a\temp@b #4\else #3\fi} + +\newcommand{\fancyhead}{\@ifnextchar[{\f@ncyhf\fancyhead h}% + {\f@ncyhf\fancyhead h[]}} +\newcommand{\fancyfoot}{\@ifnextchar[{\f@ncyhf\fancyfoot f}% + {\f@ncyhf\fancyfoot f[]}} +\newcommand{\fancyhf}{\@ifnextchar[{\f@ncyhf\fancyhf{}}% + {\f@ncyhf\fancyhf{}[]}} + +% New commands for offsets added + +\newcommand{\fancyheadoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyheadoffset h}% + {\f@ncyhfoffs\fancyheadoffset h[]}} +\newcommand{\fancyfootoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyfootoffset f}% + {\f@ncyhfoffs\fancyfootoffset f[]}} +\newcommand{\fancyhfoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyhfoffset{}}% + {\f@ncyhfoffs\fancyhfoffset{}[]}} + +% The header and footer fields are stored in command sequences with +% names of the form: \f@ncy with for [eo], from [lcr] +% and from [hf]. 
+ +\def\f@ncyhf#1#2[#3]#4{% + \def\temp@c{}% + \@forc\tmpf@ra{#3}% + {\expandafter\if@in\tmpf@ra{eolcrhf,EOLCRHF}% + {}{\edef\temp@c{\temp@c\tmpf@ra}}}% + \ifx\@empty\temp@c\else + \@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument: + [#3]}% + \fi + \f@nfor\temp@c{#3}% + {\def@ult\f@@@eo{eo}\temp@c + \if@twoside\else + \if\f@@@eo e\@fancywarning + {\string#1's `E' option without twoside option is useless}\fi\fi + \def@ult\f@@@lcr{lcr}\temp@c + \def@ult\f@@@hf{hf}{#2\temp@c}% + \@forc\f@@eo\f@@@eo + {\@forc\f@@lcr\f@@@lcr + {\@forc\f@@hf\f@@@hf + {\expandafter\fancy@def\csname + f@ncy\f@@eo\f@@lcr\f@@hf\endcsname + {#4}}}}}} + +\def\f@ncyhfoffs#1#2[#3]#4{% + \def\temp@c{}% + \@forc\tmpf@ra{#3}% + {\expandafter\if@in\tmpf@ra{eolrhf,EOLRHF}% + {}{\edef\temp@c{\temp@c\tmpf@ra}}}% + \ifx\@empty\temp@c\else + \@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument: + [#3]}% + \fi + \f@nfor\temp@c{#3}% + {\def@ult\f@@@eo{eo}\temp@c + \if@twoside\else + \if\f@@@eo e\@fancywarning + {\string#1's `E' option without twoside option is useless}\fi\fi + \def@ult\f@@@lcr{lr}\temp@c + \def@ult\f@@@hf{hf}{#2\temp@c}% + \@forc\f@@eo\f@@@eo + {\@forc\f@@lcr\f@@@lcr + {\@forc\f@@hf\f@@@hf + {\expandafter\setlength\csname + f@ncyO@\f@@eo\f@@lcr\f@@hf\endcsname + {#4}}}}}% + \fancy@setoffs} + +% Fancyheadings version 1 commands. These are more or less deprecated, +% but they continue to work. + +\newcommand{\lhead}{\@ifnextchar[{\@xlhead}{\@ylhead}} +\def\@xlhead[#1]#2{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#2}} +\def\@ylhead#1{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#1}} + +\newcommand{\chead}{\@ifnextchar[{\@xchead}{\@ychead}} +\def\@xchead[#1]#2{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#2}} +\def\@ychead#1{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#1}} + +\newcommand{\rhead}{\@ifnextchar[{\@xrhead}{\@yrhead}} +\def\@xrhead[#1]#2{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#2}} +\def\@yrhead#1{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#1}} + +\newcommand{\lfoot}{\@ifnextchar[{\@xlfoot}{\@ylfoot}} +\def\@xlfoot[#1]#2{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#2}} +\def\@ylfoot#1{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#1}} + +\newcommand{\cfoot}{\@ifnextchar[{\@xcfoot}{\@ycfoot}} +\def\@xcfoot[#1]#2{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#2}} +\def\@ycfoot#1{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#1}} + +\newcommand{\rfoot}{\@ifnextchar[{\@xrfoot}{\@yrfoot}} +\def\@xrfoot[#1]#2{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#2}} +\def\@yrfoot#1{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#1}} + +\newlength{\fancy@headwidth} +\let\headwidth\fancy@headwidth +\newlength{\f@ncyO@elh} +\newlength{\f@ncyO@erh} +\newlength{\f@ncyO@olh} +\newlength{\f@ncyO@orh} +\newlength{\f@ncyO@elf} +\newlength{\f@ncyO@erf} +\newlength{\f@ncyO@olf} +\newlength{\f@ncyO@orf} +\newcommand{\headrulewidth}{0.4pt} +\newcommand{\footrulewidth}{0pt} +\newcommand{\footruleskip}{.3\normalbaselineskip} + +% Fancyplain stuff shouldn't be used anymore (rather +% \fancypagestyle{plain} should be used), but it must be present for +% compatibility reasons. + +\newcommand{\plainheadrulewidth}{0pt} +\newcommand{\plainfootrulewidth}{0pt} +\newif\if@fancyplain \@fancyplainfalse +\def\fancyplain#1#2{\if@fancyplain#1\else#2\fi} + +\headwidth=-123456789sp %magic constant + +% Command to reset various things in the headers: +% a.o. 
single spacing (taken from setspace.sty) +% and the catcode of ^^M (so that epsf files in the header work if a +% verbatim crosses a page boundary) +% It also defines a \nouppercase command that disables \uppercase and +% \Makeuppercase. It can only be used in the headers and footers. +\let\fnch@everypar\everypar% save real \everypar because of spanish.ldf +\def\fancy@reset{\fnch@everypar{}\restorecr\endlinechar=13 + \def\baselinestretch{1}% + \def\nouppercase##1{{\let\uppercase\relax\let\MakeUppercase\relax + \expandafter\let\csname MakeUppercase \endcsname\relax##1}}% + \ifx\undefined\@newbaseline% NFSS not present; 2.09 or 2e + \ifx\@normalsize\undefined \normalsize % for ucthesis.cls + \else \@normalsize \fi + \else% NFSS (2.09) present + \@newbaseline% + \fi} + +% Initialization of the head and foot text. + +% The default values still contain \fancyplain for compatibility. +\fancyhf{} % clear all +% lefthead empty on ``plain'' pages, \rightmark on even, \leftmark on odd pages +% evenhead empty on ``plain'' pages, \leftmark on even, \rightmark on odd pages +\if@twoside + \fancyhead[el,or]{\fancyplain{}{\sl\rightmark}} + \fancyhead[er,ol]{\fancyplain{}{\sl\leftmark}} +\else + \fancyhead[l]{\fancyplain{}{\sl\rightmark}} + \fancyhead[r]{\fancyplain{}{\sl\leftmark}} +\fi +\fancyfoot[c]{\rm\thepage} % page number + +% Use box 0 as a temp box and dimen 0 as temp dimen. +% This can be done, because this code will always +% be used inside another box, and therefore the changes are local. + +\def\@fancyvbox#1#2{\setbox0\vbox{#2}\ifdim\ht0>#1\@fancywarning + {\string#1 is too small (\the#1): ^^J Make it at least \the\ht0.^^J + We now make it that large for the rest of the document.^^J + This may cause the page layout to be inconsistent, however\@gobble}% + \dimen0=#1\global\setlength{#1}{\ht0}\ht0=\dimen0\fi + \box0} + +% Put together a header or footer given the left, center and +% right text, fillers at left and right and a rule. +% The \lap commands put the text into an hbox of zero size, +% so overlapping text does not generate an errormessage. +% These macros have 5 parameters: +% 1. LEFTSIDE BEARING % This determines at which side the header will stick +% out. When \fancyhfoffset is used this calculates \headwidth, otherwise +% it is \hss or \relax (after expansion). +% 2. \f@ncyolh, \f@ncyelh, \f@ncyolf or \f@ncyelf. This is the left component. +% 3. \f@ncyoch, \f@ncyech, \f@ncyocf or \f@ncyecf. This is the middle comp. +% 4. \f@ncyorh, \f@ncyerh, \f@ncyorf or \f@ncyerf. This is the right component. +% 5. RIGHTSIDE BEARING. This is always \relax or \hss (after expansion). 
+ +\def\@fancyhead#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset + \@fancyvbox\headheight{\hbox + {\rlap{\parbox[b]{\headwidth}{\raggedright#2}}\hfill + \parbox[b]{\headwidth}{\centering#3}\hfill + \llap{\parbox[b]{\headwidth}{\raggedleft#4}}}\headrule}}#5} + +\def\@fancyfoot#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset + \@fancyvbox\footskip{\footrule + \hbox{\rlap{\parbox[t]{\headwidth}{\raggedright#2}}\hfill + \parbox[t]{\headwidth}{\centering#3}\hfill + \llap{\parbox[t]{\headwidth}{\raggedleft#4}}}}}#5} + +\def\headrule{{\if@fancyplain\let\headrulewidth\plainheadrulewidth\fi + \hrule\@height\headrulewidth\@width\headwidth \vskip-\headrulewidth}} + +\def\footrule{{\if@fancyplain\let\footrulewidth\plainfootrulewidth\fi + \vskip-\footruleskip\vskip-\footrulewidth + \hrule\@width\headwidth\@height\footrulewidth\vskip\footruleskip}} + +\def\ps@fancy{% +\@ifundefined{@chapapp}{\let\@chapapp\chaptername}{}%for amsbook +% +% Define \MakeUppercase for old LaTeXen. +% Note: we used \def rather than \let, so that \let\uppercase\relax (from +% the version 1 documentation) will still work. +% +\@ifundefined{MakeUppercase}{\def\MakeUppercase{\uppercase}}{}% +\@ifundefined{chapter}{\def\sectionmark##1{\markboth +{\MakeUppercase{\ifnum \c@secnumdepth>\z@ + \thesection\hskip 1em\relax \fi ##1}}{}}% +\def\subsectionmark##1{\markright {\ifnum \c@secnumdepth >\@ne + \thesubsection\hskip 1em\relax \fi ##1}}}% +{\def\chaptermark##1{\markboth {\MakeUppercase{\ifnum \c@secnumdepth>\m@ne + \@chapapp\ \thechapter. \ \fi ##1}}{}}% +\def\sectionmark##1{\markright{\MakeUppercase{\ifnum \c@secnumdepth >\z@ + \thesection. \ \fi ##1}}}}% +%\csname ps@headings\endcsname % use \ps@headings defaults if they exist +\ps@@fancy +\gdef\ps@fancy{\@fancyplainfalse\ps@@fancy}% +% Initialize \headwidth if the user didn't +% +\ifdim\headwidth<0sp +% +% This catches the case that \headwidth hasn't been initialized and the +% case that the user added something to \headwidth in the expectation that +% it was initialized to \textwidth. We compensate this now. This loses if +% the user intended to multiply it by a factor. But that case is more +% likely done by saying something like \headwidth=1.2\textwidth. +% The doc says you have to change \headwidth after the first call to +% \pagestyle{fancy}. This code is just to catch the most common cases were +% that requirement is violated. 
+% + \global\advance\headwidth123456789sp\global\advance\headwidth\textwidth +\fi} +\def\ps@fancyplain{\ps@fancy \let\ps@plain\ps@plain@fancy} +\def\ps@plain@fancy{\@fancyplaintrue\ps@@fancy} +\let\ps@@empty\ps@empty +\def\ps@@fancy{% +\ps@@empty % This is for amsbook/amsart, which do strange things with \topskip +\def\@mkboth{\protect\markboth}% +\def\@oddhead{\@fancyhead\fancy@Oolh\f@ncyolh\f@ncyoch\f@ncyorh\fancy@Oorh}% +\def\@oddfoot{\@fancyfoot\fancy@Oolf\f@ncyolf\f@ncyocf\f@ncyorf\fancy@Oorf}% +\def\@evenhead{\@fancyhead\fancy@Oelh\f@ncyelh\f@ncyech\f@ncyerh\fancy@Oerh}% +\def\@evenfoot{\@fancyfoot\fancy@Oelf\f@ncyelf\f@ncyecf\f@ncyerf\fancy@Oerf}% +} +% Default definitions for compatibility mode: +% These cause the header/footer to take the defined \headwidth as width +% And to shift in the direction of the marginpar area + +\def\fancy@Oolh{\if@reversemargin\hss\else\relax\fi} +\def\fancy@Oorh{\if@reversemargin\relax\else\hss\fi} +\let\fancy@Oelh\fancy@Oorh +\let\fancy@Oerh\fancy@Oolh + +\let\fancy@Oolf\fancy@Oolh +\let\fancy@Oorf\fancy@Oorh +\let\fancy@Oelf\fancy@Oelh +\let\fancy@Oerf\fancy@Oerh + +% New definitions for the use of \fancyhfoffset +% These calculate the \headwidth from \textwidth and the specified offsets. + +\def\fancy@offsolh{\headwidth=\textwidth\advance\headwidth\f@ncyO@olh + \advance\headwidth\f@ncyO@orh\hskip-\f@ncyO@olh} +\def\fancy@offselh{\headwidth=\textwidth\advance\headwidth\f@ncyO@elh + \advance\headwidth\f@ncyO@erh\hskip-\f@ncyO@elh} + +\def\fancy@offsolf{\headwidth=\textwidth\advance\headwidth\f@ncyO@olf + \advance\headwidth\f@ncyO@orf\hskip-\f@ncyO@olf} +\def\fancy@offself{\headwidth=\textwidth\advance\headwidth\f@ncyO@elf + \advance\headwidth\f@ncyO@erf\hskip-\f@ncyO@elf} + +\def\fancy@setoffs{% +% Just in case \let\headwidth\textwidth was used + \fancy@gbl\let\headwidth\fancy@headwidth + \fancy@gbl\let\fancy@Oolh\fancy@offsolh + \fancy@gbl\let\fancy@Oelh\fancy@offselh + \fancy@gbl\let\fancy@Oorh\hss + \fancy@gbl\let\fancy@Oerh\hss + \fancy@gbl\let\fancy@Oolf\fancy@offsolf + \fancy@gbl\let\fancy@Oelf\fancy@offself + \fancy@gbl\let\fancy@Oorf\hss + \fancy@gbl\let\fancy@Oerf\hss} + +\newif\iffootnote +\let\latex@makecol\@makecol +\def\@makecol{\ifvoid\footins\footnotetrue\else\footnotefalse\fi +\let\topfloat\@toplist\let\botfloat\@botlist\latex@makecol} +\def\iftopfloat#1#2{\ifx\topfloat\empty #2\else #1\fi} +\def\ifbotfloat#1#2{\ifx\botfloat\empty #2\else #1\fi} +\def\iffloatpage#1#2{\if@fcolmade #1\else #2\fi} + +\newcommand{\fancypagestyle}[2]{% + \@namedef{ps@#1}{\let\fancy@gbl\relax#2\relax\ps@fancy}} diff --git a/2024/05/29/papers/2002.04013/intro.tex b/2024/05/29/papers/2002.04013/intro.tex new file mode 100644 index 00000000..d3fb0559 --- /dev/null +++ b/2024/05/29/papers/2002.04013/intro.tex @@ -0,0 +1,50 @@ +\vspace{-12pt} +\section{Introduction}\label{sect:intro} +\vspace{-4pt} + +Our investigation begins with a thought experiment. Imagine a deep neural network with capacity 1000 times greater than today's most powerful architectures: for example, a language model trained on all digitally available texts or a generative model for all images ever uploaded to the Internet. How can we train such a model? + +\vspace{-1.5pt} + +Viewed from a historical perspective, the 1000-fold increase in capacity is not unrealistic. Over the past decade, the deep learning community has made remarkable progress by training large models on abundant data, and the scale of those models keeps growing. 
Since the advent of the ImageNet challenge \cite{imagenet_cvpr09} with 1.3M labeled images, the typical size of convolutional neural networks increased from a few megabytes to hundreds of megabytes \cite{alexnet, resnet, huang2019gpipe}. Recent studies report even larger models for datasets with hundreds of millions of images \cite{kolesnikovlarge, jft300data}.
+
+\vspace{-1.5pt}
+
+Another trend from natural language processing is to train large Transformer-like language models~\cite{bert, roberta, kaplan2020scaling}. The data for this task is nearly unlimited, allowing researchers to train models with tens or even hundreds of gigabytes of parameters~\cite{brown2020language,shoeybi2019megatron,zellers2019defending,tnlg}. While we may not need the 1000-fold increase at the moment, planning for it will prepare us for the next big leap in model capacity.
+
+\vspace{-1.5pt}
+
+To be specific, let us focus on training large Transformer networks for the language modeling task. At the time of writing, the largest conventional model for that task is GPT-3 with 175 billion parameters. Scaling it up 1000 times gives us 175 trillion; depending on whether you use single or half-precision, this requires 300--600 terabytes of memory just to store the model. No modern mass-produced hardware accelerator is up to such a task. Even high-end servers with 16x V100 accelerators can store only 0.15\% of that model in combined GPU memory, let alone train it.
+
+The dominant way of growing neural network size has so far been to scale up: deploy more powerful computational accelerators in specialized, tightly interconnected clusters. However, this approach will only work up to a point. Models such as T-NLG~\cite{tnlg} and Megatron-LM~\cite{shoeybi2019megatron} were already trained on DGX-SuperPOD --- a supercomputer with hundreds of Tesla V100 GPUs spread over tens of servers. As for GPT-3~\cite{brown2020language}, a single \textit{training run} was estimated to cost 4.6 -- 12 million dollars~\cite{gpt3costlambda,gpt3cost}.
+
+Even today, the need for costly hardware weighs heavily on the research community. Most researchers cannot contribute to the development of large neural networks because conducting the necessary experiments would be too expensive for them. If we continue to increase the model size by scaling up, eventually the only labs that can conduct competitive research will be those with massive budgets.
+
+However, there is another solution: to scale out. Instead of using a supercomputer, researchers could crowdsource the computation from volunteers with regular PCs. %
+This paradigm is known as volunteer computing and was successfully applied to solve problems in biology \cite{larson_crowd}, high energy physics \cite{adam2015atlas}, and other subject areas. While a single volunteer PC may be slow and unreliable, the combined floating-point performance of such projects is on par with the largest supercomputers \cite{gross_folding}.
+
+The main challenge of volunteer computing is how to utilize this performance. Unlike server pods, consumer-grade PCs communicate over the Internet, which is significantly slower, especially in terms of latency. They are also more prone to failures, as they lack many reliability features of their server-grade counterparts. Therefore, volunteer computing has traditionally been used for tasks that have a high computation-to-communication ratio and can recover from individual node failures.
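
The 300--600 terabyte figure above is easy to sanity-check. A minimal sketch in plain Python (the bytes-per-parameter and binary-terabyte assumptions are ours):

#+begin_src python
# Back-of-the-envelope storage estimate for a hypothetical 175-trillion-parameter model.
# Assumptions: 2 bytes/parameter (half precision), 4 bytes/parameter (single precision),
# and 1 TB taken as 2**40 bytes.
params = 175e12
for label, bytes_per_param in [("half precision", 2), ("single precision", 4)]:
    terabytes = params * bytes_per_param / 2**40
    print(f"{label}: ~{terabytes:.0f} TB just to store the parameters")

# Combined memory of a 16x V100 (32 GB) server versus the half-precision model:
print(f"fraction held by one such server: {16 * 32e9 / (params * 2):.2%}")
#+end_src

This lands at roughly 320 and 640 TB, matching the quoted range, and at about 0.15% for the 16x V100 server.
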
+ +Unfortunately, existing paradigms of distributed training require nodes to continuously transfer large amounts of intermediate data \cite{Dettmers20158BitAF,Sun2019OptimizingNP}, making them unsuitable for volunteer computing. In this work, we take a different approach. Instead of adopting the existing distributed training strategies, we identify the advantages of volunteer computing and design a new strategy that capitalizes on them. + +We summarize the contributions of our paper as follows: + +\vspace{-6px} +\begin{minipage}{0.55\textwidth} + +\begin{itemize}[leftmargin=*] + \item We propose Decentralized Mixture of Experts (DMoE) --- a layer designed for training with vast amounts of unreliable consumer-grade hardware;% + \vspace{1px}\item We describe a framework for training large neural networks composed of DMoE layers;% + \vspace{1px}\item We confirm the efficiency and reliability of this approach using formal guarantees and experiments; + \vspace{1px}\item The PyTorch source code that can be used to reproduce our results is available online\footnotemark. +\end{itemize} +\end{minipage} +\hspace{5px} +\begin{minipage}{0.45\textwidth} +\vspace{-6px} + \centering + \raisebox{\dimexpr \topskip-\height}{\includegraphics[width=180px]{resources/teasseract3.pdf}} + \captionof{figure}{High-level scheme of Decentralized Mixture of Experts. See Section \ref{sect:method} for details.} + \label{fig:teaser} +\end{minipage} +\footnotetext{\url{https://github.com/mryab/learning-at-home}} \ No newline at end of file diff --git a/2024/05/29/papers/2002.04013/main.bbl b/2024/05/29/papers/2002.04013/main.bbl new file mode 100644 index 00000000..55818e92 --- /dev/null +++ b/2024/05/29/papers/2002.04013/main.bbl @@ -0,0 +1,482 @@ +\begin{thebibliography}{10} + +\bibitem{imagenet_cvpr09} +J.~Deng, W.~Dong, R.~Socher, L.-J. Li, K.~Li, and L.~Fei-Fei. +\newblock {ImageNet: A Large-Scale Hierarchical Image Database}. +\newblock In {\em CVPR09}, 2009. + +\bibitem{alexnet} +Alex Krizhevsky, Ilya Sutskever, and Geoffrey~E Hinton. +\newblock Imagenet classification with deep convolutional neural networks. +\newblock In F.~Pereira, C.~J.~C. Burges, L.~Bottou, and K.~Q. Weinberger, + editors, {\em Advances in Neural Information Processing Systems 25}, pages + 1097--1105. Curran Associates, Inc., 2012. + +\bibitem{resnet} +Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. +\newblock Deep residual learning for image recognition. +\newblock {\em 2016 IEEE Conference on Computer Vision and Pattern Recognition + (CVPR)}, pages 770--778, 2015. + +\bibitem{huang2019gpipe} +Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, + HyoukJoong Lee, Jiquan Ngiam, Quoc~V Le, Yonghui Wu, et~al. +\newblock Gpipe: Efficient training of giant neural networks using pipeline + parallelism. +\newblock In {\em Advances in Neural Information Processing Systems}, pages + 103--112, 2019. + +\bibitem{kolesnikovlarge} +Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, + Sylvain Gelly, and Neil Houlsby. +\newblock Large scale learning of general visual representations for transfer. +\newblock {\em CoRR}, abs/1912.11370, 2019. + +\bibitem{jft300data} +Baoyuan Wu, Weidong Chen, Yanbo Fan, Yong Zhang, Jinlong Hou, Jie Liu, and Tong + Zhang. +\newblock Tencent ml-images: A large-scale multi-label image database for + visual representation learning. +\newblock {\em IEEE Access}, 7:172683--172693, 2019. + +\bibitem{bert} +Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 
+\newblock Bert: Pre-training of deep bidirectional transformers for language + understanding. +\newblock In {\em NAACL-HLT}, 2019. + +\bibitem{roberta} +Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer + Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. +\newblock Roberta: A robustly optimized bert pretraining approach. +\newblock {\em ArXiv}, abs/1907.11692, 2019. + +\bibitem{kaplan2020scaling} +Jared Kaplan, Sam McCandlish, Tom Henighan, Tom~B. Brown, Benjamin Chess, Rewon + Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. +\newblock Scaling laws for neural language models, 2020. + +\bibitem{brown2020language} +Tom~B Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla + Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, + et~al. +\newblock Language models are few-shot learners. +\newblock {\em arXiv preprint arXiv:2005.14165}, 2020. + +\bibitem{shoeybi2019megatron} +Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, + and Bryan Catanzaro. +\newblock Megatron-lm: Training multi-billion parameter language models using + gpu model parallelism. +\newblock {\em arXiv preprint arXiv:1909.08053}, 2019. + +\bibitem{zellers2019defending} +Rowan Zellers, Ari Holtzman, Hannah Rashkin, Yonatan Bisk, Ali Farhadi, + Franziska Roesner, and Yejin Choi. +\newblock Defending against neural fake news. +\newblock In {\em Advances in Neural Information Processing Systems}, pages + 9051--9062, 2019. + +\bibitem{tnlg} +Corby Rosset. +\newblock Turing-nlg: A 17-billion-parameter language model by microsoft. +\newblock + https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/. + +\bibitem{gpt3costlambda} +Chuan Li. +\newblock Demystifying gpt-3 language model: A technical overview. +\newblock "\url{https://lambdalabs.com/blog/demystifying-gpt-3}". + +\bibitem{gpt3cost} +Elliot Turner. +\newblock Estimate of GPT-3 training cost based on public cloud GPU/TPU cost + models, from Elliot Turner's personal page (accessed on May 29, 2020). + +\bibitem{larson_crowd} +Stefan Larson, Christopher Snow, Michael Shirts, and Vijay Pande. +\newblock Folding@home and genome@home: Using distributed computing to tackle + previously intractable problems in computational biology. +\newblock {\em arXiv}, 02 2009. + +\bibitem{adam2015atlas} +C~Adam-Bourdarios, D~Cameron, A~Filip{\v{c}}i{\v{c}}, E~Lancon, Wenjing Wu, + et~al. +\newblock Atlas@ home: harnessing volunteer computing for hep. +\newblock In {\em Journal of Physics: Conference Series}, volume 664, page + 022009. IOP Publishing, 2015. + +\bibitem{gross_folding} +Michael Gross. +\newblock Folding research recruits unconventional help. +\newblock In {\em Current Biology. 22 (2): R35–R38}, 2012. + +\bibitem{Dettmers20158BitAF} +Tim Dettmers. +\newblock 8-bit approximations for parallelism in deep learning. +\newblock {\em ICLR}, 2015. + +\bibitem{Sun2019OptimizingNP} +Peng Sun, Wansen Feng, Ruobing Han, Shengen Yan, and Yonggang Wen. +\newblock Optimizing network performance for distributed dnn training on gpu + clusters: Imagenet/alexnet training in 1.5 minutes. +\newblock {\em ArXiv}, abs/1902.06855, 2019. + +\bibitem{anderson2004boinc} +David~P Anderson. +\newblock Boinc: A system for public-resource computing and storage. +\newblock In {\em Fifth IEEE/ACM international workshop on grid computing}, + pages 4--10. IEEE, 2004. + +\bibitem{folding_timeline} +{\em Folding@home project timeline}. 
+\newblock \url{https://foldingathome.org/project-timeline}(accessed on May 30, + 2020). + +\bibitem{speedtest} +Speedtest global index for fixed broadband. +\newblock \url{https://www.speedtest.net/global-index} (accessed on 11.08.2020, + bandwidth for top countries and general trend). + +\bibitem{li2017case} +Fuliang Li, Xingwei Wang, Tian Pan, and Jiahai Yang. +\newblock A case study of ipv6 network performance: Packet delay, loss, and + reordering. +\newblock {\em Mathematical Problems in Engineering}, 2017, 2017. + +\bibitem{valiant1990bridging} +Leslie~G Valiant. +\newblock A bridging model for parallel computation. +\newblock {\em Communications of the ACM}, 33(8):103--111, 1990. + +\bibitem{goyal2017accurate} +Priya Goyal, Piotr Dollár, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, + Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. +\newblock Accurate, large minibatch sgd: Training imagenet in 1 hour, 2017. + +\bibitem{You2020Large} +Yang You, Jing Li, Sashank Reddi, Jonathan Hseu, Sanjiv Kumar, Srinadh + Bhojanapalli, Xiaodan Song, James Demmel, Kurt Keutzer, and Cho-Jui Hsieh. +\newblock Large batch optimization for deep learning: Training bert in 76 + minutes. +\newblock In {\em International Conference on Learning Representations}, 2020. + +\bibitem{recht2011hogwild} +Benjamin Recht, Christopher Re, Stephen Wright, and Feng Niu. +\newblock Hogwild: A lock-free approach to parallelizing stochastic gradient + descent. +\newblock In {\em Advances in neural information processing systems}, pages + 693--701, 2011. + +\bibitem{zhang2015staleness} +Wei Zhang, Suyog Gupta, Xiangru Lian, and Ji~Liu. +\newblock Staleness-aware async-sgd for distributed deep learning. +\newblock {\em arXiv preprint arXiv:1511.05950}, 2015. + +\bibitem{stale_gradients_can_win} +Sanghamitra Dutta, Gauri Joshi, Soumyadip Ghosh, Parijat Dube, and Priya + Nagpurkar. +\newblock Slow and stale gradients can win the race: Error-runtime trade-offs + in distributed sgd. +\newblock 03 2018. + +\bibitem{zero} +Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. +\newblock Zero: Memory optimization towards training a trillion parameter + models. +\newblock 10 2019. + +\bibitem{pipemare} +Bowen Yang, Jian Zhang, Jonathan Li, Christopher R{\'e}, Christopher~R. + Aberger, and Christopher~De Sa. +\newblock Pipemare: Asynchronous pipeline parallel dnn training. +\newblock {\em ArXiv}, abs/1910.05124, 2019. + +\bibitem{pipedream} +Deepak Narayanan, Aaron Harlap, Amar Phanishayee, Vivek Seshadri, Nikhil~R. + Devanur, Gregory~R. Ganger, Phillip~B. Gibbons, and Matei Zaharia. +\newblock Pipedream: Generalized pipeline parallelism for dnn training. +\newblock In {\em Proceedings of the 27th ACM Symposium on Operating Systems + Principles}, SOSP ’19, page 1–15, New York, NY, USA, 2019. Association + for Computing Machinery. + +\bibitem{mcmahan2017communication} +Brendan McMahan, Eider Moore, Daniel Ramage, Seth Hampson, and Blaise~Aguera + y~Arcas. +\newblock Communication-efficient learning of deep networks from decentralized + data. +\newblock In {\em Artificial Intelligence and Statistics}, pages 1273--1282, + 2017. + +\bibitem{bonawitz2017practical} +Keith Bonawitz, Vladimir Ivanov, Ben Kreuter, Antonio Marcedone, H~Brendan + McMahan, Sarvar Patel, Daniel Ramage, Aaron Segal, and Karn Seth. +\newblock Practical secure aggregation for privacy-preserving machine learning. 
+\newblock In {\em Proceedings of the 2017 ACM SIGSAC Conference on Computer and + Communications Security}, pages 1175--1191, 2017. + +\bibitem{desell2017} +T.~{Desell}. +\newblock Developing a volunteer computing project to evolve convolutional + neural networks and their hyperparameters. +\newblock In {\em 2017 IEEE 13th International Conference on e-Science + (e-Science)}, pages 19--28, 2017. + +\bibitem{volunteer_dl_async} +Ekasit Kijsipongse, Apivadee Piyatumrong, and Suriya U-ruekolan. +\newblock A hybrid gpu cluster and volunteer computing platform for scalable + deep learning. +\newblock {\em The Journal of Supercomputing}, 04 2018. + +\bibitem{lc0} +{Pascutto, Gian-Carlo and Linscott, Gary}. +\newblock Leela chess zero. +\newblock 2019. + +\bibitem{moe_first} +Robert~A. Jacobs, Michael~I. Jordan, Steven~J. Nowlan, and Geoffrey~E. Hinton. +\newblock Adaptive mixtures of local experts. +\newblock {\em Neural Computation}, 3(1):79–87, March 1991. + +\bibitem{jordan1994hierarchical} +Michael~I Jordan and Robert~A Jacobs. +\newblock Hierarchical mixtures of experts and the em algorithm. +\newblock {\em Neural computation}, 6(2):181--214, 1994. + +\bibitem{yao2009hierarchical} +Bangpeng Yao, Dirk Walther, Diane Beck, and Li~Fei-Fei. +\newblock Hierarchical mixture of classification experts uncovers interactions + between brain regions. +\newblock In {\em Advances in Neural Information Processing Systems}, pages + 2178--2186, 2009. + +\bibitem{moe_lifelong} +Rahaf Aljundi, Punarjay Chakravarty, and Tinne Tuytelaars. +\newblock Expert gate: Lifelong learning with a network of experts. +\newblock pages 7120--7129, 07 2017. + +\bibitem{rasmussen2002infinite} +Carl~E Rasmussen and Zoubin Ghahramani. +\newblock Infinite mixtures of gaussian process experts. +\newblock In {\em Advances in neural information processing systems}, pages + 881--888, 2002. + +\bibitem{moe_svm} +Ronan Collobert, Samy Bengio, and Yoshua Bengio. +\newblock A parallel mixture of svms for very large scale problems. +\newblock In {\em Advances in Neural Information Processing Systems}, pages + 633--640, 2002. + +\bibitem{moe_dirichlet} +Babak Shahbaba and Radford Neal. +\newblock Nonlinear models using dirichlet process mixtures. +\newblock {\em Journal of Machine Learning Research}, 10(Aug):1829--1850, 2009. + +\bibitem{eigen2013learning} +David Eigen, Marc'Aurelio Ranzato, and Ilya Sutskever. +\newblock Learning factored representations in a deep mixture of experts. +\newblock {\em arXiv preprint arXiv:1312.4314}, 2013. + +\bibitem{shazeer2017outrageously} +Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, + Geoffrey Hinton, and Jeff Dean. +\newblock Outrageously large neural networks: The sparsely-gated + mixture-of-experts layer. +\newblock {\em arXiv preprint arXiv:1701.06538}, 2017. + +\bibitem{Lepikhin2020GShardSG} +Dmitry Lepikhin, H.~Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Y.~Huang, + M.~Krikun, Noam Shazeer, and Z.~Chen. +\newblock Gshard: Scaling giant models with conditional computation and + automatic sharding. +\newblock {\em ArXiv}, abs/2006.16668, 2020. + +\bibitem{pkm} +Guillaume Lample, Alexandre Sablayrolles, Marc\'~Aurelio Ranzato, Ludovic + Denoyer, and Herve Jegou. +\newblock Large memory layers with product keys. +\newblock In H.~Wallach, H.~Larochelle, A.~Beygelzimer, F.~d\' Alch\'{e}-Buc, + E.~Fox, and R.~Garnett, editors, {\em Advances in Neural Information + Processing Systems 32}, pages 8546--8557. Curran Associates, Inc., 2019. 
+ +\bibitem{puigcerver2020scalable} +Joan Puigcerver, Carlos Riquelme, Basil Mustafa, Cedric Renggli, + Andr{\'e}~Susano Pinto, Sylvain Gelly, Daniel Keysers, and Neil Houlsby. +\newblock Scalable transfer learning with expert models. +\newblock {\em arXiv preprint arXiv:2009.13239}, 2020. + +\bibitem{tewari1998beyond} +Renu Tewari, Michael Dahlin, Harrick Vin, and John Kay. +\newblock Beyond hierarchies: Design considerations for distributed caching on + the internet. +\newblock Technical report, Citeseer. + +\bibitem{can} +Sylvia Ratnasamy, Paul Francis, Mark Handley, Richard Karp, and Scott Shenker. +\newblock A scalable content-addressable network. +\newblock In {\em Proceedings of the 2001 conference on Applications, + technologies, architectures, and protocols for computer communications}, + pages 161--172, 2001. + +\bibitem{chord} +Hari Balakrishnan, M~Frans Kaashoek, David Karger, Robert Morris, and Ion + Stoica. +\newblock Looking up data in p2p systems. +\newblock {\em Communications of the ACM}, 46(2):43--48, 2003. + +\bibitem{pastry} +Antony Rowstron and Peter Druschel. +\newblock Pastry: Scalable, decentralized object location, and routing for + large-scale peer-to-peer systems. +\newblock In {\em IFIP/ACM International Conference on Distributed Systems + Platforms and Open Distributed Processing}, pages 329--350. Springer, 2001. + +\bibitem{tapestry} +Ben Zhao, Ling Huang, Jeremy Stribling, Sean Rhea, Anthony Joseph, and John + Kubiatowicz. +\newblock Tapestry: A resilient global-scale overlay for service deployment. +\newblock {\em IEEE Journal on Selected Areas in Communications}, 22, 07 2003. + +\bibitem{kademlia} +Petar Maymounkov and David Mazieres. +\newblock Kademlia: A peer-to-peer information system based on the xor metric. +\newblock In {\em International Workshop on Peer-to-Peer Systems}, pages + 53--65. Springer, 2002. + +\bibitem{kaashoek2003koorde} +M~Frans Kaashoek and David~R Karger. +\newblock Koorde: A simple degree-optimal distributed hash table. +\newblock In {\em International Workshop on Peer-to-Peer Systems}, pages + 98--107. Springer, 2003. + +\bibitem{srivastava2014dropout} +Nitish Srivastava, Geoffrey Hinton, Alex Krizhevsky, Ilya Sutskever, and Ruslan + Salakhutdinov. +\newblock Dropout: a simple way to prevent neural networks from overfitting. +\newblock {\em The journal of machine learning research}, 15(1):1929--1958, + 2014. + +\bibitem{gradient_checkpointing_autograd} +Andreas Griewank and Andrea Walther. +\newblock Algorithm 799: revolve: an implementation of checkpointing for the + reverse or adjoint mode of computational differentiation. +\newblock {\em ACM Transactions on Mathematical Software (TOMS)}, 26(1):19--45, + 2000. + +\bibitem{gradient_checkpointing_dl} +Tianqi Chen, Bing Xu, Chiyuan Zhang, and Carlos Guestrin. +\newblock Training deep nets with sublinear memory cost. +\newblock {\em arXiv preprint arXiv:1604.06174}, 2016. + +\bibitem{sukhov2016generating} +Andrei~M Sukhov, MA~Astrakhantseva, AK~Pervitsky, SS~Boldyrev, and AA~Bukatov. +\newblock Generating a function for network delay. +\newblock {\em Journal of High Speed Networks}, 22(4):321--333, 2016. + +\bibitem{hendrycks2016gaussian} +Dan Hendrycks and Kevin Gimpel. +\newblock Gaussian error linear units (gelus), 2016. + +\bibitem{mnist} +Yann LeCun, L{\'e}on Bottou, Yoshua Bengio, and Patrick Haffner. +\newblock Gradient-based learning applied to document recognition. +\newblock {\em Proceedings of the IEEE}, 86(11):2278--2324, 1998. 
+ +\bibitem{dai2019transformer} +Zihang Dai, Zhilin Yang, Yiming Yang, Jaime~G Carbonell, Quoc Le, and Ruslan + Salakhutdinov. +\newblock Transformer-xl: Attentive language models beyond a fixed-length + context. +\newblock In {\em Proceedings of the 57th Annual Meeting of the Association for + Computational Linguistics}, pages 2978--2988, 2019. + +\bibitem{wikitext2} +2016 Stephen Merity~et al. +\newblock Wikitext-2. + +\bibitem{dettmerswikitext2} +Tim Dettmers. +\newblock https://github.com/TimDettmers/transformer-xl/tree/wikitext2. + +\bibitem{folding_covid} +\url{https://foldingathome.org/covid19/}(accessed on June 4, 2020). + +\bibitem{urdaneta2011survey} +Guido Urdaneta, Guillaume Pierre, and Maarten~Van Steen. +\newblock A survey of dht security techniques. +\newblock {\em ACM Computing Surveys (CSUR)}, 43(2):1--49, 2011. + +\bibitem{sybil_attacks_dht} +Liang Wang and Jussi Kangasharju. +\newblock Real-world sybil attacks in bittorrent mainline dht. +\newblock In {\em 2012 IEEE Global Communications Conference (GLOBECOM)}, pages + 826--832. IEEE, 2012. + +\bibitem{dos_resistance} +Baruch Awerbuch and Christian Scheideler. +\newblock A denial-of-service resistant dht. +\newblock In {\em International Symposium on Distributed Computing}, pages + 33--47. Springer, 2007. + +\bibitem{sybil_nodes} +Zied Trifa and Maher Khemakhem. +\newblock Sybil nodes as a mitigation strategy against sybil attack. +\newblock {\em Procedia Computer Science}, 32:1135--1140, 2014. + +\bibitem{bagdasaryan2018backdoor} +Eugene Bagdasaryan, Andreas Veit, Yiqing Hua, Deborah Estrin, and Vitaly + Shmatikov. +\newblock How to backdoor federated learning. +\newblock {\em arXiv preprint arXiv:1807.00459}, 2018. + +\bibitem{bhagoji2018analyzing} +Arjun~Nitin Bhagoji, Supriyo Chakraborty, Prateek Mittal, and Seraphin Calo. +\newblock Analyzing federated learning through an adversarial lens. +\newblock {\em arXiv preprint arXiv:1811.12470}, 2018. + +\bibitem{paszke2019pytorch} +Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory + Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et~al. +\newblock Pytorch: An imperative style, high-performance deep learning library. +\newblock In {\em Advances in Neural Information Processing Systems}, pages + 8024--8035, 2019. + +\bibitem{lambdabenchmarks} +Chuan~Li Stephen~Balaban. +\newblock Deep learning gpu benchmarks, lambda labs website, 2018/10/08. + +\bibitem{natural_compression} +Samuel Horvath, Chen{-}Yu Ho, Ludovit Horvath, Atal~Narayan Sahu, Marco Canini, + and Peter Richt{\'{a}}rik. +\newblock Natural compression for distributed deep learning. +\newblock {\em CoRR}, abs/1905.10988, 2019. + +\bibitem{NIPS2019_8736} +Xiao Sun, Jungwook Choi, Chia-Yu Chen, Naigang Wang, Swagath Venkataramani, + Vijayalakshmi~(Viji) Srinivasan, Xiaodong Cui, Wei Zhang, and Kailash + Gopalakrishnan. +\newblock Hybrid 8-bit floating point (hfp8) training and inference for deep + neural networks. +\newblock In H.~Wallach, H.~Larochelle, A.~Beygelzimer, F.~d\' Alch\'{e}-Buc, + E.~Fox, and R.~Garnett, editors, {\em Advances in Neural Information + Processing Systems 32}, pages 4901--4910. Curran Associates, Inc., 2019. + +\bibitem{ma2019hsic} +Wan-Duo~Kurt Ma, J.~P. Lewis, and W.~Bastiaan Kleijn. +\newblock The hsic bottleneck: Deep learning without back-propagation, 2019. + +\bibitem{jaderberg2017decoupled} +Max Jaderberg, Wojciech~Marian Czarnecki, Simon Osindero, Oriol Vinyals, Alex + Graves, David Silver, and Koray Kavukcuoglu. 
+\newblock Decoupled neural interfaces using synthetic gradients. +\newblock In {\em Proceedings of the 34th International Conference on Machine + Learning-Volume 70}, pages 1627--1635. JMLR. org, 2017. + +\bibitem{real2017large} +Esteban Real, Sherry Moore, Andrew Selle, Saurabh Saxena, Yutaka~Leon Suematsu, + Jie Tan, Quoc~V Le, and Alexey Kurakin. +\newblock Large-scale evolution of image classifiers. +\newblock In {\em Proceedings of the 34th International Conference on Machine + Learning-Volume 70}, pages 2902--2911. JMLR. org, 2017. + +\end{thebibliography} diff --git a/2024/05/29/papers/2002.04013/main.tex b/2024/05/29/papers/2002.04013/main.tex new file mode 100644 index 00000000..7f30a357 --- /dev/null +++ b/2024/05/29/papers/2002.04013/main.tex @@ -0,0 +1,89 @@ +\documentclass{article} + + + + + + +\usepackage[utf8]{inputenc} % +\usepackage[T1]{fontenc} % +\usepackage{url} % +\usepackage{booktabs} % +\usepackage{amsfonts} % +\usepackage{nicefrac} % +\usepackage{microtype} % +\usepackage{lipsum} + +\usepackage{graphicx} +\usepackage{subfigure} +\usepackage{makecell,multirow} % + +\def\UrlBreaks{\do\/\do-} +\usepackage{breakurl} +\usepackage[breaklinks]{hyperref} +\usepackage{amsmath} +\newcommand{\theHalgorithm}{\arabic{algorithm}} +\usepackage{caption} +\usepackage{mwe} + +\usepackage{algorithm} +\usepackage{algorithmic} +\usepackage[final,nonatbib]{neurips_2020} +\usepackage{xcolor} +\usepackage{enumitem} + +\title{Towards Crowdsourced Training of Large Neural Networks using Decentralized Mixture-of-Experts} + +\author{% + Max Ryabinin\thanks{Corresponding author.} \\ + Yandex\\ + National Research University\\ + Higher School of Economics\\ + \texttt{mryabinin@hse.ru} \\ + \And + Anton Gusev \\ + Independent \\ + \texttt{uartman@mail.ru} \\ +} + + +\begin{document} + +\maketitle + +\vspace{-4px} +\begin{abstract} +Many recent breakthroughs in deep learning were achieved by training increasingly larger models on massive datasets. However, training such models can be prohibitively expensive. For instance, the cluster used to train GPT-3 costs over \$250 million\footnote{\hspace{-2px}A conservative estimate based on \url{https://blogs.microsoft.com/ai/openai-azure-supercomputer}}. As a result, most researchers cannot afford to train state of the art models and contribute to their development. Hypothetically, a researcher could crowdsource the training of large neural networks with thousands of regular PCs provided by volunteers. The raw computing power of a hundred thousand \$2500 desktops dwarfs that of a \$250M server pod, but one cannot utilize that power efficiently with conventional distributed training methods. In this work, we propose Learning@home: a novel neural network training paradigm designed to handle large amounts of poorly connected participants. We analyze the performance, reliability, and architectural constraints of this paradigm and compare it against existing distributed training techniques. +\end{abstract} +\input{intro.tex} + +\input{related.tex} + +\input{method.tex} + +\input{experiments.tex} + +\section{Conclusion} +The main purpose of this study is to convey the idea that one \textit{can} train large neural networks on unreliable hardware. We propose a specialized layer and training infrastructure designed to meet the requirements of volunteer computing over the Internet. +The preliminary experiments demonstrate that Learning@home can scale to thousands of nodes and successfully train popular model archetypes despite network latency and node failures. 
+ +We believe that decentralized deep learning will change the way we think about training neural networks. Instead of running isolated experiments, researchers and practitioners will be able to join forces and solve the biggest problems together. Instead of being confined to a single supercomputer, our models will naturally grow in capacity as more people and organizations around the world join in. +We expand on the ramifications of deep learning decentralization in the broader impact statement. + +However, reaching the full potential of this idea requires expertise not only in deep learning, but also information security, distributed systems, crowdsourcing and many other areas. We believe that this monumental task is best solved through scientific collaboration. To that end, we will continue to develop Learning@home as a public open-source project\footnote{\url{https://learning-at-home.github.io}}. + +\section*{Acknowledgements and funding} +We would like to thank Artem Babenko and Vladimir Aliev for their invaluable assistance in both brainstorming and proofreading the final paper. We are also grateful to anonymous reviewers for their helpful suggestions on improving the presentation of the paper. Max Ryabinin was supported by Yandex and National Research University Higher School of Economics. + +\input{discussion.tex} + + +\nocite{paszke2019pytorch} +\bibliography{bibliography} +\bibliographystyle{unsrt} + +\appendix +\input{supplementary.tex} + + +\end{document} \ No newline at end of file diff --git a/2024/05/29/papers/2002.04013/method.tex b/2024/05/29/papers/2002.04013/method.tex new file mode 100644 index 00000000..5d160a0f --- /dev/null +++ b/2024/05/29/papers/2002.04013/method.tex @@ -0,0 +1,117 @@ +\vspace{-4px} +\section{Learning@home}\label{sect:method} +\vspace{-2px} + +Our main idea is to use the existing properties of mixture-of-experts and distributed hash tables to work around the limitations of volunteer computing. We begin with a method for distributed training of MoE layers, then extend it to provide fault tolerance and decentralized bookkeeping. + +\vspace{-2px} +\subsection{Decentralized Mixture-of-Experts}\label{sect:method_dmoe} +\vspace{-2px} + +The fundamental building block of our approach is Decentralized Mixture-of-Experts (DMoE) --- a layer that contains multiple independent ``expert'' sub-networks distributed over a pool of workers. In addition to experts, each worker has a gating function: a lightweight sub-network that selects experts depending on the input. Similarly to regular mixture-of-experts, DMoE is a general-purpose layer that can process any input type by using the appropriate experts (e.g., convolutional or attentive). + +Workers within the DMoE layer interact using Kademlia DHT protocol (Section \ref{sect:related_dht}). This DHT stores metadata, such as expert weights and worker status. Figure \ref{fig:dmoe_inference} explains DMoE inference: + +\begin{figure}[h] +\vspace{-10px} + \centering + \includegraphics[width=400px,height=100px]{resources/schematic-training-v2.pdf} + \caption{Forward and backward passes for Decentralized Mixture of Experts.} + \label{fig:dmoe_inference} +\end{figure} + +\vspace{-4px} + + +This procedure takes at most $O(k \log N)$ DHT queries to locate the chosen experts and $k$ direct interactions with these experts to do the actual processing. As long as $k~\ll~N$, we can increase the total number of experts without compromising the inference speed. 
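
For intuition, the inference path just described can be sketched in a few lines of Python. This is only an illustration under stated assumptions, not the paper's released code: the gating scores, the DHT lookup, and the remote expert call are all passed in as placeholders.

#+begin_src python
import math
from typing import Callable, Dict

def dmoe_forward(x, scores: Dict[str, float],
                 dht_get: Callable, call_expert: Callable, k: int = 4):
    """Sketch of DMoE inference.

    scores:      expert uid -> gating score for this input (computed locally).
    dht_get:     uid -> network address of the worker serving that expert
                 (one DHT lookup, O(log N) messages each).
    call_expert: (address, x) -> expert output; raises TimeoutError on failure.
    """
    chosen = sorted(scores, key=scores.get, reverse=True)[:k]   # k best experts
    outputs, logits = [], []
    for uid in chosen:                                          # at most O(k log N) DHT queries
        try:
            outputs.append(call_expert(dht_get(uid), x))        # k direct interactions
            logits.append(scores[uid])
        except TimeoutError:
            pass                                                # unresponsive experts are skipped
    z = sum(math.exp(l) for l in logits) or 1.0                 # renormalize over responders
    return sum(math.exp(l) / z * out for l, out in zip(logits, outputs))
#+end_src

The TimeoutError branch anticipates the fault-tolerance behaviour discussed next.
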
Furthermore, we argue that DMoE layers automatically solve most of the issues that arise in the volunteer computing scenario. + +\textbf{Fault tolerance.} If some of the $k$ chosen experts fail to respond due to a hardware or network error, DMoE can exclude those experts from averaging. The effect of such exclusion is similar to using Dropout \cite{srivastava2014dropout} with regular mixture-of-experts. As a side effect, training DMoE on a faulty infrastructure will automatically adapt the mixture to the failure points of that infrastructure. + +\textbf{Volunteer hardware.} Compute nodes can serve different numbers of experts based on their hardware capabilities. If one node leaves the network, another can take its place by retrieving the latest expert checkpoints from the DHT. + +\textbf{Load balancing.} Mixture-of-experts layers can be regularized to balance the rate at which they select each expert in the mixture \cite{shazeer2017outrageously, pkm}. Originally designed to improve MoE quality, this regularization has a side-effect of improving resource utilization by balancing computation load between workers. + +\textbf{Asynchronous training.} Due to communication latency in distributed systems, a single input can take a long time to process. The traditional solution is to train asynchronously \cite{volunteer_dl_async}. Instead of waiting for the results on one training batch, a worker can start processing the next batch right away. This approach can significantly improve hardware utilization at the cost of stale gradients. + +Fortunately, Mixture-of-Experts accumulates staleness at a slower pace than regular neural networks. Only a small subset of all experts processes a single input; therefore, two individual inputs are likely to affect completely different experts. In that case, updating expert weights for the first input will not introduce staleness for the second one. +We elaborate on this claim in Section \ref{sect:exp_convergence}. + +\vspace{-3px} +\subsection{Structured Gating Function}\label{sect:method_gating} +\vspace{-2px} + +Since DMoE can use up to millions of experts, the gating function can no longer iterate over each expert in the mixture. Furthermore, the nodes in such a system are continually joining and leaving. Consequently, the expert selection procedure cannot rely on the availability of any individual node. + +\vspace{-1px} + +With this in mind, we propose a gating function inspired by product key layers~\cite{pkm}. First, we organize experts into a $d$-dimensional grid. Each expert $f$ is associated with a unique tuple of integers: $\textrm{uid}(f) = (u_0, u_1, \ldots, u_{d-1}), u_i \in [0, M)$. The grid dimensions $d, M$ should be chosen to accommodate all experts with some level of redundancy. Having extra grid space allows DMoE to allocate additional experts midway through training if more volunteers join. % + +\vspace{-1px} + +The gating function itself consists of $d$ linear layers $g_0,\dots\,g_{d-1}$ and computes expert priority in an additive manner: $g(x, f) = \sum_{i=0}^{d - 1} g_i(x)[u_i]$. Such a function only needs to predict $d$ vectors of size $M$, which makes it significantly easier to compute and send over the network. Furthermore, this gating function can choose top-$k$ highest-scoring experts in logarithmic time (see Appendix B, C). + +\vspace{-4px} + +After choosing the appropriate experts, a worker should find their respective servers (in $O(k \log N)$ time using DHT) and pass the input vector for processing (see Figure \ref{fig:teaser}). 
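
The additive structure of this gating function is straightforward to prototype. Below is a toy PyTorch-style sketch (ours; class and helper names are made up, and the logarithmic-time top-k search of Appendix B is replaced by brute force over the whole grid for clarity):

#+begin_src python
import torch
import torch.nn as nn

class ToyGridGating(nn.Module):
    """Toy gating over a d-dimensional grid of M**d experts (single input vector x)."""
    def __init__(self, dim: int, M: int = 256, d: int = 2):
        super().__init__()
        self.M, self.d = M, d
        self.heads = nn.ModuleList([nn.Linear(dim, M) for _ in range(d)])  # g_0 ... g_{d-1}

    def forward(self, x: torch.Tensor, k: int = 4):
        per_dim = [head(x) for head in self.heads]      # d score vectors of size M
        # g(x, f) = sum_i g_i(x)[u_i]: materialize all additive scores (brute force).
        scores = per_dim[0]
        for s in per_dim[1:]:
            scores = scores.unsqueeze(-1) + s           # broadcast to shape [M, ..., M]
        top = torch.topk(scores.flatten(), k)
        uids = [flat_to_uid(int(i), self.M, self.d) for i in top.indices]
        return uids, top.values                         # k chosen experts and their scores

def flat_to_uid(index: int, M: int, d: int):
    """Convert a flat index back into a grid uid (u_0, ..., u_{d-1})."""
    uid = []
    for _ in range(d):
        index, u = divmod(index, M)
        uid.append(u)
    return tuple(reversed(uid))
#+end_src

The brute-force scoring is only viable for small grids; the point of the grid structure is that the same top-k can be found in logarithmic time without materializing all M**d scores.
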
Once all the experts have finished processing, the worker aggregates expert outputs by weighted averaging: +\begin{equation} +\label{eq:dmoe_averaging} + \textrm{DMoE}(x) = \!\!\!\!\!\! \sum_{f \in \mathrm{TopK}(x)} \!\!\!\!\!\! f(x) \frac{\exp\left({g(x, f)}\right)}{\sum_{f' \in \mathrm{TopK}(x)} \exp\left({g(x, f')}\right)} + \space\text{ , $\mathrm{TopK}(x)$ are $k$ best experts w.r.t. $g$} +\end{equation} +\vspace{-8px} + +If some of the chosen experts have crashed or taken too long to perform the computation, we can exclude them from averaging and renormalize weights so that they still add up to 1. Trained with this exclusion policy, DMoE will learn experts with overlapping specializations that are more resistant to individual node failure. +\vspace{-4px} +\subsection{Training infrastructure}\label{sect:method_athome} +\vspace{-2px} + +Finally, we describe Learning@home --- a deep learning infrastructure that performs distributed training of large models on hardware provided by volunteers. Each worker runs three components: + +\begin{figure}[h!] +\vspace{-6px} + \begin{minipage}{0.45\linewidth} + \begin{itemize}[leftmargin=*] + \item \textbf{Trainer} --- forming batches and training; + \item \textbf{Runtime} --- inference and expert updates; + \item \textbf{DHT Node} --- bookkeeping and routing; + \end{itemize}\end{minipage}\begin{minipage}{0.55\linewidth} + \centering\raisebox{\dimexpr \topskip-\height}{ + \includegraphics[width=90px]{resources/l_at_home.pdf}} + \end{minipage} + \caption{Learning@home components and their interaction.} + \vspace{-12pt} + \label{fig:dmoe} +\end{figure} + + +\textbf{Trainer} generates batches and propagates them through the model. After forming a batch and converting it into an input vector, the trainer iterates over a sequence of DMoE layers and organizes forward and backward passes, as described in Sections \ref{sect:method_dmoe} and \ref{sect:method_gating}. Learning@home fully embraces the asynchronous training paradigm, where a trainer can process hundreds of concurrent batches. + +\vspace{-1px} + +\textbf{Runtime} is responsible for expert inference and training. This is the only process that has access to participant's GPU device(s). Once all the experts are initialized, runtime listens to the incoming connections from trainers and handles two types of requests: +\vspace{-4px} +\begin{itemize}[leftmargin=*] + \item \textbf{Forward}: given inputs, compute and return expert outputs on these inputs (no side-effects); + \item \textbf{Backward}: given inputs and gradients of loss function w.r.t. outputs, return gradients w.r.t. inputs and \textit{update expert parameters by gradient descent}. +\end{itemize} +\vspace{-4px} + +Since trainers can operate under latency, the runtime is not required to process all requests right away. Instead, it aggregates requests into batches for better GPU utilization. + +\vspace{-1px} + +The runtime process relies on gradient checkpointing to avoid storing intermediate expert activations \cite{gradient_checkpointing_autograd,gradient_checkpointing_dl}. +This choice means that the expert $f_i(x)$ is called both during the forward and the backward passes. +We elaborate on the role of gradient checkpointing in Appendix D. + +\vspace{-1px} + +\textbf{DHT Node.} The final component of Learning@home infrastructure is a DHT for bookkeeping. 
For simplicity, we use unmodified Kademlia protocol\footnote{In particular, publicly available Kademlia implementation from \url{github.com/bmuller/kademlia}}, leaving further investigation to future work. + +\vspace{-1px} + +Each runtime periodically announces its experts to the DHT, associating their identifiers with the address of that runtime and the current timestamp (details in Appendix C). Trainers can then use those entries to find the workers responsible for the chosen experts. In addition to timestamps, a runtime also regularly saves latest expert weights into the same DHT for persistence. The resulting infrastructure becomes elastic and fault-tolerant as long as it has enough active participants. + + + + diff --git a/2024/05/29/papers/2002.04013/neurips_2020.sty b/2024/05/29/papers/2002.04013/neurips_2020.sty new file mode 100644 index 00000000..6a1a741d --- /dev/null +++ b/2024/05/29/papers/2002.04013/neurips_2020.sty @@ -0,0 +1,371 @@ +% partial rewrite of the LaTeX2e package for submissions to the +% Conference on Neural Information Processing Systems (NeurIPS): +% +% - uses more LaTeX conventions +% - line numbers at submission time replaced with aligned numbers from +% lineno package +% - \nipsfinalcopy replaced with [final] package option +% - automatically loads times package for authors +% - loads natbib automatically; this can be suppressed with the +% [nonatbib] package option +% - adds foot line to first page identifying the conference +% - adds preprint option for submission to e.g. arXiv +% - conference acronym modified +% +% Roman Garnett (garnett@wustl.edu) and the many authors of +% nips15submit_e.sty, including MK and drstrip@sandia +% +% last revision: January 2020 + +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{neurips_2020}[2020/01/31 NeurIPS 2020 submission/camera-ready style file] + +% declare final option, which creates camera-ready copy +\newif\if@neuripsfinal\@neuripsfinalfalse +\DeclareOption{final}{ + \@neuripsfinaltrue +} + +% declare nonatbib option, which does not load natbib in case of +% package clash (users can pass options to natbib via +% \PassOptionsToPackage) +\newif\if@natbib\@natbibtrue +\DeclareOption{nonatbib}{ + \@natbibfalse +} + +% declare preprint option, which creates a preprint version ready for +% upload to, e.g., arXiv +\newif\if@preprint\@preprintfalse +\DeclareOption{preprint}{ + \@preprinttrue +} + +\ProcessOptions\relax + +% determine whether this is an anonymized submission +\newif\if@submission\@submissiontrue +\if@neuripsfinal\@submissionfalse\fi +\if@preprint\@submissionfalse\fi + +% fonts +\renewcommand{\rmdefault}{ptm} +\renewcommand{\sfdefault}{phv} + +% change this every year for notice string at bottom +\newcommand{\@neuripsordinal}{34th} +\newcommand{\@neuripsyear}{2020} +\newcommand{\@neuripslocation}{Vancouver, Canada} + +% acknowledgments +\usepackage{environ} +\newcommand{\acksection}{\section*{Acknowledgments and Disclosure of Funding}} +\NewEnviron{ack}{% + \acksection + \BODY +} + +% handle tweaks for camera-ready copy vs. submission copy +\if@preprint + \newcommand{\@noticestring}{% + Preprint. Under review.% + } +\else + \if@neuripsfinal + \newcommand{\@noticestring}{% + \@neuripsordinal\/ Conference on Neural Information Processing Systems + (NeurIPS \@neuripsyear), \@neuripslocation.% + } + \else + \newcommand{\@noticestring}{% + Submitted to \@neuripsordinal\/ Conference on Neural Information + Processing Systems (NeurIPS \@neuripsyear). 
Do not distribute.% + } + + % hide the acknowledgements + \NewEnviron{hide}{} + \let\ack\hide + \let\endack\endhide + + % line numbers for submission + \RequirePackage{lineno} + \linenumbers + + % fix incompatibilities between lineno and amsmath, if required, by + % transparently wrapping linenomath environments around amsmath + % environments + \AtBeginDocument{% + \@ifpackageloaded{amsmath}{% + \newcommand*\patchAmsMathEnvironmentForLineno[1]{% + \expandafter\let\csname old#1\expandafter\endcsname\csname #1\endcsname + \expandafter\let\csname oldend#1\expandafter\endcsname\csname end#1\endcsname + \renewenvironment{#1}% + {\linenomath\csname old#1\endcsname}% + {\csname oldend#1\endcsname\endlinenomath}% + }% + \newcommand*\patchBothAmsMathEnvironmentsForLineno[1]{% + \patchAmsMathEnvironmentForLineno{#1}% + \patchAmsMathEnvironmentForLineno{#1*}% + }% + \patchBothAmsMathEnvironmentsForLineno{equation}% + \patchBothAmsMathEnvironmentsForLineno{align}% + \patchBothAmsMathEnvironmentsForLineno{flalign}% + \patchBothAmsMathEnvironmentsForLineno{alignat}% + \patchBothAmsMathEnvironmentsForLineno{gather}% + \patchBothAmsMathEnvironmentsForLineno{multline}% + }{} + } + \fi +\fi + +% load natbib unless told otherwise +\if@natbib + \RequirePackage{natbib} +\fi + +% set page geometry +\usepackage[verbose=true,letterpaper]{geometry} +\AtBeginDocument{ + \newgeometry{ + textheight=9in, + textwidth=5.5in, + top=1in, + headheight=12pt, + headsep=25pt, + footskip=30pt + } + \@ifpackageloaded{fullpage} + {\PackageWarning{neurips_2020}{fullpage package not allowed! Overwriting formatting.}} + {} +} + +\widowpenalty=10000 +\clubpenalty=10000 +\flushbottom +\sloppy + +% font sizes with reduced leading +\renewcommand{\normalsize}{% + \@setfontsize\normalsize\@xpt\@xipt + \abovedisplayskip 7\p@ \@plus 2\p@ \@minus 5\p@ + \abovedisplayshortskip \z@ \@plus 3\p@ + \belowdisplayskip \abovedisplayskip + \belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@ +} +\normalsize +\renewcommand{\small}{% + \@setfontsize\small\@ixpt\@xpt + \abovedisplayskip 6\p@ \@plus 1.5\p@ \@minus 4\p@ + \abovedisplayshortskip \z@ \@plus 2\p@ + \belowdisplayskip \abovedisplayskip + \belowdisplayshortskip 3\p@ \@plus 2\p@ \@minus 2\p@ +} +\renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt} +\renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt} +\renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt} +\renewcommand{\large}{\@setfontsize\large\@xiipt{14}} +\renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}} +\renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}} +\renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}} +\renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}} + +% sections with less space +\providecommand{\section}{} +\renewcommand{\section}{% + \@startsection{section}{1}{\z@}% + {-2.0ex \@plus -0.5ex \@minus -0.2ex}% + { 1.5ex \@plus 0.3ex \@minus 0.2ex}% + {\large\bf\raggedright}% +} +\providecommand{\subsection}{} +\renewcommand{\subsection}{% + \@startsection{subsection}{2}{\z@}% + {-1.8ex \@plus -0.5ex \@minus -0.2ex}% + { 0.8ex \@plus 0.2ex}% + {\normalsize\bf\raggedright}% +} +\providecommand{\subsubsection}{} +\renewcommand{\subsubsection}{% + \@startsection{subsubsection}{3}{\z@}% + {-1.5ex \@plus -0.5ex \@minus -0.2ex}% + { 0.5ex \@plus 0.2ex}% + {\normalsize\bf\raggedright}% +} +\providecommand{\paragraph}{} +\renewcommand{\paragraph}{% + \@startsection{paragraph}{4}{\z@}% + {1.5ex \@plus 0.5ex \@minus 0.2ex}% + {-1em}% + {\normalsize\bf}% +} +\providecommand{\subparagraph}{} 
+\renewcommand{\subparagraph}{% + \@startsection{subparagraph}{5}{\z@}% + {1.5ex \@plus 0.5ex \@minus 0.2ex}% + {-1em}% + {\normalsize\bf}% +} +\providecommand{\subsubsubsection}{} +\renewcommand{\subsubsubsection}{% + \vskip5pt{\noindent\normalsize\rm\raggedright}% +} + +% float placement +\renewcommand{\topfraction }{0.85} +\renewcommand{\bottomfraction }{0.4} +\renewcommand{\textfraction }{0.1} +\renewcommand{\floatpagefraction}{0.7} + +\newlength{\@neuripsabovecaptionskip}\setlength{\@neuripsabovecaptionskip}{7\p@} +\newlength{\@neuripsbelowcaptionskip}\setlength{\@neuripsbelowcaptionskip}{\z@} + +\setlength{\abovecaptionskip}{\@neuripsabovecaptionskip} +\setlength{\belowcaptionskip}{\@neuripsbelowcaptionskip} + +% swap above/belowcaptionskip lengths for tables +\renewenvironment{table} + {\setlength{\abovecaptionskip}{\@neuripsbelowcaptionskip}% + \setlength{\belowcaptionskip}{\@neuripsabovecaptionskip}% + \@float{table}} + {\end@float} + +% footnote formatting +\setlength{\footnotesep }{6.65\p@} +\setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@} +\renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@} +\setcounter{footnote}{0} + +% paragraph formatting +\setlength{\parindent}{\z@} +\setlength{\parskip }{5.5\p@} + +% list formatting +\setlength{\topsep }{4\p@ \@plus 1\p@ \@minus 2\p@} +\setlength{\partopsep }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@} +\setlength{\itemsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} +\setlength{\parsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} +\setlength{\leftmargin }{3pc} +\setlength{\leftmargini }{\leftmargin} +\setlength{\leftmarginii }{2em} +\setlength{\leftmarginiii}{1.5em} +\setlength{\leftmarginiv }{1.0em} +\setlength{\leftmarginv }{0.5em} +\def\@listi {\leftmargin\leftmargini} +\def\@listii {\leftmargin\leftmarginii + \labelwidth\leftmarginii + \advance\labelwidth-\labelsep + \topsep 2\p@ \@plus 1\p@ \@minus 0.5\p@ + \parsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ + \itemsep \parsep} +\def\@listiii{\leftmargin\leftmarginiii + \labelwidth\leftmarginiii + \advance\labelwidth-\labelsep + \topsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ + \parsep \z@ + \partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@ + \itemsep \topsep} +\def\@listiv {\leftmargin\leftmarginiv + \labelwidth\leftmarginiv + \advance\labelwidth-\labelsep} +\def\@listv {\leftmargin\leftmarginv + \labelwidth\leftmarginv + \advance\labelwidth-\labelsep} +\def\@listvi {\leftmargin\leftmarginvi + \labelwidth\leftmarginvi + \advance\labelwidth-\labelsep} + +% create title +\providecommand{\maketitle}{} +\renewcommand{\maketitle}{% + \par + \begingroup + \renewcommand{\thefootnote}{\fnsymbol{footnote}} + % for perfect author name centering + \renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}} + % The footnote-mark was overlapping the footnote-text, + % added the following to fix this problem (MK) + \long\def\@makefntext##1{% + \parindent 1em\noindent + \hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1 + } + \thispagestyle{empty} + \@maketitle + \@thanks + \@notice + \endgroup + \let\maketitle\relax + \let\thanks\relax +} + +% rules for title box at top of first page +\newcommand{\@toptitlebar}{ + \hrule height 4\p@ + \vskip 0.25in + \vskip -\parskip% +} +\newcommand{\@bottomtitlebar}{ + \vskip 0.29in + \vskip -\parskip + \hrule height 1\p@ + \vskip 0.09in% +} + +% create title (includes both anonymized and non-anonymized versions) +\providecommand{\@maketitle}{} +\renewcommand{\@maketitle}{% + \vbox{% + \hsize\textwidth + \linewidth\hsize + \vskip 0.1in + \@toptitlebar + \centering + {\LARGE\bf 
\@title\par} + \@bottomtitlebar + \if@submission + \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@} + Anonymous Author(s) \\ + Affiliation \\ + Address \\ + \texttt{email} \\ + \end{tabular}% + \else + \def\And{% + \end{tabular}\hfil\linebreak[0]\hfil% + \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% + } + \def\AND{% + \end{tabular}\hfil\linebreak[4]\hfil% + \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% + } + \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}% + \fi + \vskip 0.3in \@minus 0.1in + } +} + +% add conference notice to bottom of first page +\newcommand{\ftype@noticebox}{8} +\newcommand{\@notice}{% + % give a bit of extra room back to authors on first page + \enlargethispage{2\baselineskip}% + \@float{noticebox}[b]% + \footnotesize\@noticestring% + \end@float% +} + +% abstract styling +\renewenvironment{abstract}% +{% + \vskip 0.075in% + \centerline% + {\large\bf Abstract}% + \vspace{0.5ex}% + \begin{quote}% +} +{ + \par% + \end{quote}% + \vskip 1ex% +} + +\endinput diff --git a/2024/05/29/papers/2002.04013/related.tex b/2024/05/29/papers/2002.04013/related.tex new file mode 100644 index 00000000..e723814f --- /dev/null +++ b/2024/05/29/papers/2002.04013/related.tex @@ -0,0 +1,106 @@ +\vspace{-14px} +\section{Related work}\label{sect:related} +\vspace{-4px} + +\subsection{Volunteer computing}\label{sect:related_volunteer} +\vspace{-4px} + +Using volunteer hardware has long been a viable alternative to high-performance computing. Since the development of BOINC \cite{anderson2004boinc} research organizations with sufficient public outreach have been able to run massive scientific computations on devices provided by volunteers. Successful projects such as Folding@home can have over $10^5$ active participants, rivaling the floating-point performance of world's fastest supercomputers\footnote{In January 2019, Folding@home reported 146,091 teraflops; in November 2019, the top-1 supercomputer ``Summit'' reported 148,600 teraflops; see \url{top500.org/lists/2019/11} .}. In fact, Folding@home was the first ``supercomputer'' to reach both 1 and 10 petaflops milestones~\cite{folding_timeline}. + +However, unlike traditional HPC, the volunteer nature of these projects imposes some additional limitations. First, the majority of volunteers are only available part-time. +For instance, a participant can provide an office workstation that only contributes compute outside of business hours. +Second, volunteer hardware is heterogeneous: different nodes may have different performance, memory limits, and even operating systems. Finally, participants usually communicate over the Internet, which is 2--3 orders of magnitude slower than typical HPC connections. As a result, both compute nodes and communication channels are not nearly as reliable as in traditional supercomputers. + +Due to the limitations mentioned above, volunteer computing works best for tasks that can be split into many independent chunks. A single Folding@home task is to run a physical simulation of a protein for a specified number of frames. Together, volunteers can perform hundreds of thousands of concurrent tasks and only need to communicate with the server to submit their results. 
Other projects like SETI@home and Einstein@home follow a similar pattern.% + +Based on the existing volunteer computing projects, we formulate the following usage scenario: +\vspace{-4px} +\begin{itemize}[leftmargin=*] + \item \textbf{Large pool of weak computers:} the infrastructure consists of $10^3 \sim 10^6$ heterogeneous PCs\footnote{Typical specifications: 2--8 CPU cores, 4--16GB RAM, and a single customer-grade GPU with 2--12GB of memory and 4--14 float32 TFLOPS (based on \url{https://pcpartpicker.com} and \url{https://techpowerup.com})}; + \item \textbf{Communication:} nodes communicate with speed and reliability of a home internet connection\footnote{We assume 20--250ms latency and 100Mbps symmetric bandwidth, $0.33\%$ packet loss based on \cite{speedtest,li2017case}}; + \item \textbf{Frequent node failures:} a compute node may fail to process a task for a variety of reasons. We expect 5--20\% of computers to have at least one failure a day under normal operating conditions. +\end{itemize} +\vspace{-6px} + +\subsection{Distributed training}\label{sect:related_distributed} +\vspace{-3px} + +To analyze the existing distributed training approaches from the perspective of volunteer computing, we broadly divide them into several categories. + +\textbf{Synchronous data parallel training} \cite{valiant1990bridging}\textbf{.} Each worker stores a copy of model parameters, computing gradients for a fraction of the training batch. The gradients are then averaged across workers and applied to the model, making up the same update on all machines. Due to its simplicity and scalability, this method has been widely used to reduce the training time of large neural networks to the order of minutes \cite{goyal2017accurate,You2020Large}. + +However, with low-end or midrange hardware it is not always possible to store the entire model on each worker. In addition, gradient communication, even when overlapped with computation, requires a high-speed connection between all participants, often faster than hundreds of megabytes per second, which is unrealistic when considering typical household Internet connections. + +\textbf{Asynchronous training} \cite{recht2011hogwild, zhang2015staleness} usually involves a single parameter server and multiple compute nodes fetching the latest parameters, processing batches, and submitting updates back to the server. This technique improves worker throughput, but this improvement comes at a cost. If several workers submit simultaneous updates, they might get applied in an arbitrary order, which leads to the issue of \textit{stale gradients} \cite{stale_gradients_can_win} and possibly hinders model convergence. + +\textbf{Model parallel training.} Each node stores a fraction of model layers, each training batch is processed by all nodes in a sequential order determined by the layer distribution scheme. The training batch can be divided into several micro-batches and processed in a pipeline fashion, significantly increasing hardware utilization \cite{huang2019gpipe,zero,pipemare,pipedream}. + +Unlike the two previous paradigms, this method allows training models that exceed the memory limit of any individual worker. Notable examples of successful model parallel training for large neural networks are \cite{huang2019gpipe} and \cite{shoeybi2019megatron}, yet these systems also have a high-speed network between workers. 
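
A rough back-of-the-envelope estimate (ours, not from the paper) shows why these interconnect requirements clash with the usage scenario above:

#+begin_src python
# Time to ship one full set of fp32 gradients over a household link.
# Assumptions: a 1.5e9-parameter model (roughly GPT-2 scale), 4 bytes per gradient,
# and the 100 Mbps symmetric bandwidth from the usage scenario.
params = 1.5e9
gradient_bytes = params * 4                 # ~6 GB per synchronization
link_bytes_per_s = 100e6 / 8                # 100 Mbps -> 12.5 MB/s
print(f"{gradient_bytes / link_bytes_per_s / 60:.0f} minutes per gradient exchange")
# -> about 8 minutes for a single exchange, before any computation happens.
#+end_src
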
On top of that, model parallelism is highly vulnerable to node and network failures: if a single worker in a chain turns off or stops sending outputs, the training stops entirely. + +It is possible to combine data and model parallelism to mitigate the outlined issues to some degree, but the requirement for fast worker interconnect holds even in that case. In light of this, the method we design has to maintain high throughput even in the presence of slow and unreliable network connections, possibly sacrificing the latency (time to process a given batch) as a necessary tradeoff. + +This constraint may be justified by the following observation: the wall-clock training time of a neural network (with model and optimizer fixed) mostly depends on how many batches it processes per second. As we show in Section \ref{sect:exp_convergence}, the effect of stale gradients can be mitigated with the right architecture. We summarize the desired properties in Table \ref{tab:distributed}. + +\begin{table*}[t] +\caption{Comparison of distributed training schemes in the volunteer computing context. ``Desired'' denotes the algorithm with properties that would be beneficial for this setting. ``Only workers'' means that the system has central components that are not fault-tolerant.} +\setlength{\tabcolsep}{3pt} +\hspace{-6pt}\begin{tabular}{cccccccc} +\toprule + \multirow{2}{*}{Training method}& Model & Training & \multirow{2}{*}{Scalability} & \multirow{2}{*}{Fault tolerance} & Worker & \multicolumn{2}{c}{Network} \\ + & size limit & throughput & & & hot-join & Bandwidth & Latency \\ +\midrule +Data parallel & Worker & \textbf{High } & Medium & \textbf{Full} & \textbf{Yes } & \textbf{High} & Low \\ +Asynchronous & Worker & \textbf{High } & \textbf{High} & Only workers\textbf{} & \textbf{Yes } & Medium & \textbf{Any} \\ +Model parallel & \textbf{System} & Medium & Low & No & No & High & Low \\ +Federated & Worker & Low & \textbf{High} & Only workers\textbf{} & \textbf{Yes } & \textbf{Low} & \textbf{Any} \\ +Desired & \textbf{System} & \textbf{High } & \textbf{High} & \textbf{Full} & \textbf{Yes } & \textbf{Low} & \textbf{Any} \\ +\bottomrule +\end{tabular} +\label{tab:distributed} +\vspace{-12pt} +\end{table*} + +\textbf{Federated learning.} The problem of utilizing large quantities of consumer devices for training a single model has also been discussed within the context of data-private learning. Federated learning \cite{mcmahan2017communication} attempts to mitigate the issue by keeping the data on devices, training a local version of the model, and sending only the parameter updates. These updates are encrypted so that the server can only decrypt their average across several devices. + +\vspace{-1px} + +Unsurprisingly, federated learning sacrifices performance for privacy. Secure aggregation procedures \cite{bonawitz2017practical} require multiple workers to communicate and scale quadratically with their number. These properties hardly align with the scenario from Section \ref{sect:related_volunteer}, making federated learning a poor fit for jointly training large models. + +\textbf{Deep learning with volunteer computing.} To the best of our knowledge, there are three projects that use volunteer computing for training neural networks. The first work~\cite{desell2017} leverages volunteer resources for evaluation of CNN architectures generated by evolution algorithms; each model is trained on a single device. 
+The second study~\cite{volunteer_dl_async} relies on standard asynchronous training and is therefore inapplicable to models that do not fit into a single consumer-grade GPU. Moreover, the architecture described in that study is only partially decentralized, relying on a centralized parameter server that communicates with all nodes. Lastly, the project known as Leela Chess Zero~\cite{lc0}, relies on volunteer hardware to play massive amounts of chess games for generating self-play data used in reinforcement learning. However, the model itself is trained on a single central server. + +Our primary insight from this section is that existing methods for training general large neural networks do not fit well into the volunteer computing scenario. However, there is a subclass of deep learning architectures which is much better suited for this task. + +\vspace{-2px} +\subsection{Mixture-of-Experts}\label{sect:related_moe} +\vspace{-2px} + +Mixture-of-Experts (MoE) was first proposed almost three decades ago as a method to train multiple neural networks (``experts'') for a common task \cite{moe_first}. The intent is for each expert to specialize in making predictions for a small subset of data. Presented with an input, MoE first determines which experts are best suited to process that input using a separate \textit{gating function}. Then it applies the chosen experts and aggregates their outputs into the final prediction. This work has sparked many follow-ups that reveal different MoE structures \cite{jordan1994hierarchical, yao2009hierarchical,moe_lifelong,rasmussen2002infinite} and individual expert types \cite{moe_svm,moe_dirichlet}. + +A subsequent study~\cite{eigen2013learning} demonstrates that Mixture-of-Experts can be used as a layer within larger neural networks and trained jointly by backpropagation. Depending on the task, individual experts can utilize convolutional, recurrent, or other specialized layers. Such MoE can have a large number of experts, but it only needs to compute a few of them to process any given input. + +Shazeer et al.~\cite{shazeer2017outrageously} (and later~\cite{Lepikhin2020GShardSG}) brought that idea to the extreme by training ``outrageously'' large mixtures with thousands of experts. The drastic increase in capacity allows authors to achieve superior performance in large-scale machine translation and language modeling. The paper also addresses problems that arise with increased mixture size. When trained na\"ively, the gating function learns to use a small fraction of available experts for all inputs, not taking full advantage of the available capacity. The authors alleviate this issue by adding a regularization term that promotes ``load-balancing'' across all experts. + +However, scaling this approach from thousands to millions of experts reveals additional problems in the design of a gating function. In order to choose the most appropriate experts for the task, MoE predicts a ``priority'' value for each expert and selects the ones with the highest priority. As the number of experts approaches millions, such a gating function itself becomes computationally intractable, especially in our decentralized setting. + +A popular solution to this problem is to structure the set of experts in a search-friendly way. For instance, Hierarchical Mixture-of-Experts~\cite{jordan1994hierarchical} organizes experts in a tree-like structure. Selecting the best experts is then reduced to a beam search over this tree, which scales logarithmically in the number of experts. 
A more recent study by Lample et al. \cite{pkm} explores this idea at scale by organizing over a million keys in a factorized 1024-by-1024 grid. For this grid, the gating function only needs to predict two vectors of size 1024. This work also demonstrates that such layers can benefit Transformer models in the masked language modeling task. + +However, these works require a centralized infrastructure for training. When the gating function picks appropriate experts for the input at hand, it must somehow find these experts across all nodes. In our scenario, even maintaining the dynamic ``address book'' of all active experts would be infeasible for any single participant. + +\nocite{puigcerver2020scalable} + +\vspace{-2px} + +\subsection{Distributed Hash Tables}\label{sect:related_dht} + +\vspace{-2px} + +Fortunately, there is a way to implement bookkeeping in a decentralized system --- the distributed hash table (DHT). This is a family of distributed data structures that store key-value pairs across multiple computers in a network. A single computer within such a structure only needs to ``know'' $O(\log N)$ out of $N$ computers; at the same time it can look up any key with at most $O(\log N)$ requests to its peers. There are several DHT variants, but they all have common properties: +\vspace{-4px} +\begin{itemize}[leftmargin=*] + \item \textbf{Decentralization:} nodes form and maintain DHT without any central coordination; + \item \textbf{Scalability:} DHT can scale to millions of active nodes that are continually joining and leaving; + \item \textbf{Fault tolerance:} a failure in one or a few nodes does not affect DHT integrity and availability; +\end{itemize} + +A DHT-like protocol was first proposed in 1998 by \cite{tewari1998beyond} and popularized in the early 2000s by four protocols: CAN~\cite{can}, Chord~\cite{chord}, Pastry~\cite{pastry} and Tapestry~\cite{tapestry}. By far, the most popular DHT variation is Kademlia~\cite{kademlia} with numerous applications such as BitTorrent, I2P, and Ethereum. A more recent work~\cite{kaashoek2003koorde} further improves theoretical performance for either lookup time or the number of connections; however, this version is less widespread due to being significantly harder to implement.
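To make the $O(\log N)$ neighborhood and lookup guarantees more tangible, the sketch below illustrates the XOR distance that Kademlia uses to decide which peers are ``closest'' to a key. This is a toy, single-process illustration with made-up peer names, not the routing logic of any particular DHT implementation; a real node would only know $O(\log N)$ of the listed peers and would discover the remaining ones iteratively.

\begin{verbatim}
import hashlib

def node_id(name: str) -> int:
    # peer IDs and keys share one 160-bit space (as in Kademlia)
    return int.from_bytes(hashlib.sha1(name.encode()).digest(), "big")

def xor_distance(a: int, b: int) -> int:
    # Kademlia's distance metric: d(x, y) = int(x XOR y)
    return a ^ b

# toy network of 1000 peers; in a real DHT no node ever sees this full list
peers = [node_id(f"peer-{i}") for i in range(1000)]

def k_closest(key: str, k: int = 8) -> list:
    """Peers whose IDs are closest to hash(key); a (key, value) pair
    is stored on -- and later fetched from -- exactly these k peers."""
    target = node_id(key)
    return sorted(peers, key=lambda p: xor_distance(p, target))[:k]

print([hex(p)[:12] for p in k_closest("some-key")])
\end{verbatim}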
diff --git a/2024/05/29/papers/2002.04013/resources/convergence.pdf b/2024/05/29/papers/2002.04013/resources/convergence.pdf new file mode 100644 index 00000000..77ee3048 Binary files /dev/null and b/2024/05/29/papers/2002.04013/resources/convergence.pdf differ diff --git a/2024/05/29/papers/2002.04013/resources/convergence_wikitext.pdf b/2024/05/29/papers/2002.04013/resources/convergence_wikitext.pdf new file mode 100644 index 00000000..61c22956 Binary files /dev/null and b/2024/05/29/papers/2002.04013/resources/convergence_wikitext.pdf differ diff --git a/2024/05/29/papers/2002.04013/resources/l_at_home.pdf b/2024/05/29/papers/2002.04013/resources/l_at_home.pdf new file mode 100644 index 00000000..06e0d8db Binary files /dev/null and b/2024/05/29/papers/2002.04013/resources/l_at_home.pdf differ diff --git a/2024/05/29/papers/2002.04013/resources/schematic-training-v2.pdf b/2024/05/29/papers/2002.04013/resources/schematic-training-v2.pdf new file mode 100644 index 00000000..a639572c Binary files /dev/null and b/2024/05/29/papers/2002.04013/resources/schematic-training-v2.pdf differ diff --git a/2024/05/29/papers/2002.04013/resources/teasseract3.pdf b/2024/05/29/papers/2002.04013/resources/teasseract3.pdf new file mode 100644 index 00000000..e73c08c0 Binary files /dev/null and b/2024/05/29/papers/2002.04013/resources/teasseract3.pdf differ diff --git a/2024/05/29/papers/2002.04013/resources/throughput_new.pdf b/2024/05/29/papers/2002.04013/resources/throughput_new.pdf new file mode 100644 index 00000000..4ac5f36a Binary files /dev/null and b/2024/05/29/papers/2002.04013/resources/throughput_new.pdf differ diff --git a/2024/05/29/papers/2002.04013/supplementary.tex b/2024/05/29/papers/2002.04013/supplementary.tex new file mode 100644 index 00000000..46331f53 --- /dev/null +++ b/2024/05/29/papers/2002.04013/supplementary.tex @@ -0,0 +1,102 @@ + +\section{Cost and performance estimate of \$2500 desktop PCs} +\vspace{-2px} + +According to several PC building websites (\url{https://pcpartpicker.com}, \url{https://newegg.com}), most popular \$2250--2750 desktops are equipped with RTX 2080/2080Ti or GTX 1080Ti GPU. These GPUs are 50--80\% as fast as Tesla V100 for deep learning \cite{lambdabenchmarks}. As a rough estimate, the combined throughput of 10,000 desktops is 8--15 times that of server pod with 512 V100 GPUs. + +\section{A primer on Distributed Hash Tables} +\vspace{-2px} + +On a high level, DHT is a dictionary that can be accessed by every participant. Each key-value pair is stored on a small subset of peers determined by the hash function of the key. +\begin{itemize} + \item Each participant has a unique identifier (ID) that is sampled uniformly from the space possible outputs of the hash function. + \item When storing a $(key,\ value)$ pair, one should search for $k$ peers whose IDs are closest to $\mathrm{hash}(key)$. Then, request each of these $k$ peers to store the $(key,\ value)$ pair. + \item When retrieving a value for a key, one should compute $\mathrm{hash}(key)$, search for peers with IDs similar to that hash value and request value from those peers. +\end{itemize} + +Specific DHT variants such as Chord~\cite{chord} or Kademlia~\cite{kademlia} employ different hash types and different algorithms for finding nearest peers. For instance, Kademlia DHT selects nearest peers based on the XOR distance function: $d(x, y) = \mathrm{int}(x \oplus y)$. + +Each participant is directly aware of only a small subset of DHT peers. 
When storing or retrieving a key, the participant requests additional peers from its neighbors in a semi-greedy search, minimizing XOR distance until it finds $k$ nearest peers. In Kademlia, nodes form a special navigable graph structure that lets them find nearest peers in at most $O(k + \log_2 N)$ requests to other DHT peers, where $N$ is the total number of participants. + +\section{Finding best experts across the DHT}\label{appendix:find_experts} +\vspace{-2px} + +Recall that the gating function is defined as +\[ +g(x, f) = \sum_{i=0}^{d - 1} g_i(x)[u_i], +\] +where $g_0,\dots\,g_{d-1}$ are linear layers, $u_i$ is the $i$-th component of the expert unique identifier $\mathrm{uid}(f)$, and $[k]$ takes $k$-th component of a vector. Our objective is to find $k$ experts with largest $g(x, \cdot)$. In a centralized setting, one can find $k$ largest scores from each linear layer $g_i$ using the algorithm described in \cite{pkm}. + +Unfortunately, in our case not all combinations of indices correspond to valid experts. Therefore, we developed a specialized beam search algorithm similar to the one used in machine translation. The core idea is to start with top-$k$ indices along the first grid dimension and add one dimension at a time. + +In order for this algorithm to work, participants maintain the following information on the DHT: + +\begin{itemize} + \item For every expert UID, store its server address and the timestamp; + \item For every prefix in expert UID, store all suffixes corresponding to active experts and the timestamp. +\end{itemize} + +For instance, if there are 6 experts: "ffn.1.3", "ffn.2.1", "ffn.2.2", "ffn.2.6" and "ffn.3.2" and "ffn.3.5"; the DHT will contain the following information: + +\begin{figure}[h!] + \centering + \setlength{\tabcolsep}{3pt} + \renewcommand{\arraystretch}{1.2} + \begin{tabular}{c|c|c|c|c|c|c|c|c|c} + \toprule + Key & ffn.1.* & ffn.2.* & ffn.3.* & ffn.1.3 & ffn.2.1 & ffn.2.2 & ffn.2.6 & ffn.3.2 & ffn.3.5 \\ + Value & [3],$t_1$ & [1, 2, 6],$t_2$ & [2, 5],$t_3$ & \multicolumn{6}{c}{[Address of a server that hosts the given expert]}\\ + \bottomrule + \end{tabular} + \caption{DHT keys and values for 6 experts defined above, t corresponds to last update timestamp.} +\end{figure} + +For higher grid dimensions, we store similar information for every grid prefix. For instance, an expert with UID "transformer.10.20.30" will affect 3 keys: "transformer.10.*", "transformer.10.20.*" and "transformer.10.20.30". Each prefix key stores at most as many values as there are indices in the next grid dimension, typically 100 or 256. + +With this data structure, DMoE can use beam search to select the best experts. Algorithm \ref{alg:beam_search} starts from the leftmost dimension of the grid and processes one dimension at each step. The worst case complexity of this algorithm is $O(d k \log N)$ from $O(d k)$ lookups to the DHT. + + +\begin{algorithm}[h] + \caption{SelectExperts} + \label{alg:beam_search} +\begin{algorithmic} + \STATE {\bfseries Input:} $x, k, d, M,\ (g_0, \ldots, g_{d-1})$ + \STATE beam $ := [0, 1, ..., M - 1]$ \quad \quad \quad \quad \quad \quad \quad // all 1-prefixes + \STATE scores $ := [g_0(x, 0) ... 
g_0(x, M - 1)]$ \quad \quad \quad // initial scores + \STATE // select $k$ best starting points + \STATE beam, scores $:=$ TopK(beam, scores, k) + \FOR{$i \in [1,\ \ldots,\ d - 1]$} + \STATE // expand all candidates in beam + \STATE new\_beam, new\_scores $ := [\ ], [\ ]$ + \FOR{prefix, score $\in$ beam, scores} + \FOR{$j \in \mathrm{ActiveSuffixes(prefix)}$} + \STATE new\_beam.add(prefix$ \bigoplus [j]$) // concat + \STATE new\_scores.add(score $ + g_i(x, j)$) + \ENDFOR + \ENDFOR + \STATE // select at most $k$ best prefixes + \STATE beam, scores $:=$ TopK(new\_beam, new\_scores, k) + \ENDFOR + \STATE {\bfseries Return} beam +\end{algorithmic} +\end{algorithm} + +The TopK function simply sorts the inputs by score and returns $k$ inputs with highest scores. In turn, the ActiveSuffixes function queries the DHT for a given prefix and returns a set of all active suffixes as described above. Assuming that servers re-publish their experts every $t$ seconds, the function can simply check whether the timestamp for a given prefix is less than $t$ seconds old. + +\vspace{-4pt} +\section{On gradient checkpointing in Learning@home}\label{appendix:checkpoints} +\vspace{-2px} + +In general, gradient checkpointing increases computation per training batch by approximately 1/3, but allows training larger models with the same GPU memory. More importantly, in our scenario checkpointing also removes the need to store intermediate activations. In our experiments, this has led to both significantly higher training throughput and a smaller memory footprint. + +Without gradient checkpointing, we would have to store intermediate activations in memory. Since the GPU can only fit a few batches at a time, it quickly runs out of memory and is forced to wait for the backward pass. For Transformer layers (see Figure 4, top), this results in approximately 9 times less throughput at 100ms latency. + +\vspace{-4pt} +\section{Reducing the network load}\label{appendix:networkload} +\vspace{-4pt} + +One way to reduce the communication load is to convert tensors to a lower precision before transfer. Prior work in this area suggests that distributed training works even when communicating with 8-bit precision tensors~\cite{Dettmers20158BitAF, natural_compression}. Many popular architectures, including Transformers, can train entirely in that precision mode \cite{NIPS2019_8736}. Consequently, low precision communication appears as a logical way of reducing communication requirements. + +In addition, the deep learning architectures discussed in this work rely on backpropagation for training. With the advancement of optimization methods allowing nearly independent layer-wise training~\cite{ma2019hsic,jaderberg2017decoupled,real2017large}, it might be even more suitable to use these techniques for asynchronous training with fewer restrictions on the architectures being used. + +Another solution is to use experts that have a higher capacity to input size ratio. The architectures used in Section 4.1 are already somewhat biased in that direction, but they are far from optimal. 
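As a complement to Algorithm~\ref{alg:beam_search}, the following is a compact Python sketch of the same beam search over the expert grid. The DHT is replaced by an in-memory dictionary holding the prefix keys from the six-expert example above, and the per-dimension scores stand in for the outputs of the linear layers $g_0, \ldots, g_{d-1}$; it illustrates the procedure rather than reproducing the production implementation.

\begin{verbatim}
# in-memory stand-in for the DHT prefix keys from the six-expert example above
DHT = {"ffn.1.*": [3], "ffn.2.*": [1, 2, 6], "ffn.3.*": [2, 5]}

def active_suffixes(prefix):
    # in the real system: a DHT lookup plus a freshness check on the timestamp
    return DHT.get(prefix + ".*", [])

def select_experts(grid_scores, k, root="ffn"):
    """Beam search over the expert grid; grid_scores[i][j] plays the role
    of g_i(x, j), i.e. the scores are assumed to be precomputed for x."""
    # dimension 0: keep the k highest-scoring first-level prefixes
    beam = [(f"{root}.{j}", s) for j, s in enumerate(grid_scores[0])]
    beam = sorted(beam, key=lambda t: -t[1])[:k]
    for i in range(1, len(grid_scores)):
        candidates = []
        for prefix, score in beam:
            for j in active_suffixes(prefix):     # only expand valid experts
                candidates.append((f"{prefix}.{j}", score + grid_scores[i][j]))
        beam = sorted(candidates, key=lambda t: -t[1])[:k]
    return [uid for uid, _ in beam]

# 4-by-7 toy grid: returns ['ffn.2.2', 'ffn.2.6'] for these scores
print(select_experts([[0.1, 0.5, 0.9, 0.2],
                      [0.0, 0.3, 0.8, 0.1, 0.0, 0.0, 0.4]], k=2))
\end{verbatim}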
\ No newline at end of file diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/albert_hours.pdf b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/albert_hours.pdf new file mode 100644 index 00000000..0edcf2fb Binary files /dev/null and b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/albert_hours.pdf differ diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/algorithm.sty b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/algorithm.sty new file mode 100644 index 00000000..843e3d5b --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/algorithm.sty @@ -0,0 +1,79 @@ +% ALGORITHM STYLE -- Released 8 April 1996 +% for LaTeX-2e +% Copyright -- 1994 Peter Williams +% E-mail Peter.Williams@dsto.defence.gov.au +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{algorithm} +\typeout{Document Style `algorithm' - floating environment} + +\RequirePackage{float} +\RequirePackage{ifthen} +\newcommand{\ALG@within}{nothing} +\newboolean{ALG@within} +\setboolean{ALG@within}{false} +\newcommand{\ALG@floatstyle}{ruled} +\newcommand{\ALG@name}{Algorithm} +\newcommand{\listalgorithmname}{List of \ALG@name s} + +% Declare Options +% first appearance +\DeclareOption{plain}{ + \renewcommand{\ALG@floatstyle}{plain} +} +\DeclareOption{ruled}{ + \renewcommand{\ALG@floatstyle}{ruled} +} +\DeclareOption{boxed}{ + \renewcommand{\ALG@floatstyle}{boxed} +} +% then numbering convention +\DeclareOption{part}{ + \renewcommand{\ALG@within}{part} + \setboolean{ALG@within}{true} +} +\DeclareOption{chapter}{ + \renewcommand{\ALG@within}{chapter} + \setboolean{ALG@within}{true} +} +\DeclareOption{section}{ + \renewcommand{\ALG@within}{section} + \setboolean{ALG@within}{true} +} +\DeclareOption{subsection}{ + \renewcommand{\ALG@within}{subsection} + \setboolean{ALG@within}{true} +} +\DeclareOption{subsubsection}{ + \renewcommand{\ALG@within}{subsubsection} + \setboolean{ALG@within}{true} +} +\DeclareOption{nothing}{ + \renewcommand{\ALG@within}{nothing} + \setboolean{ALG@within}{true} +} +\DeclareOption*{\edef\ALG@name{\CurrentOption}} + +% ALGORITHM +% +\ProcessOptions +\floatstyle{\ALG@floatstyle} +\ifthenelse{\boolean{ALG@within}}{ + \ifthenelse{\equal{\ALG@within}{part}} + {\newfloat{algorithm}{htbp}{loa}[part]}{} + \ifthenelse{\equal{\ALG@within}{chapter}} + {\newfloat{algorithm}{htbp}{loa}[chapter]}{} + \ifthenelse{\equal{\ALG@within}{section}} + {\newfloat{algorithm}{htbp}{loa}[section]}{} + \ifthenelse{\equal{\ALG@within}{subsection}} + {\newfloat{algorithm}{htbp}{loa}[subsection]}{} + \ifthenelse{\equal{\ALG@within}{subsubsection}} + {\newfloat{algorithm}{htbp}{loa}[subsubsection]}{} + \ifthenelse{\equal{\ALG@within}{nothing}} + {\newfloat{algorithm}{htbp}{loa}}{} +}{ + \newfloat{algorithm}{htbp}{loa} +} +\floatname{algorithm}{\ALG@name} + +\newcommand{\listofalgorithms}{\listof{algorithm}{\listalgorithmname}} + diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/algorithmic.sty b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/algorithmic.sty new file mode 100644 index 00000000..ad614783 --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/algorithmic.sty @@ -0,0 +1,201 @@ +% ALGORITHMIC STYLE -- Released 8 APRIL 1996 +% for LaTeX version 2e +% Copyright -- 1994 Peter Williams +% E-mail PeterWilliams@dsto.defence.gov.au +% +% Modified by Alex Smola (08/2000) +% E-mail Alex.Smola@anu.edu.au +% +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{algorithmic} +\typeout{Document Style `algorithmic' - environment} +% +\RequirePackage{ifthen} +\RequirePackage{calc} +\newboolean{ALC@noend} 
+\setboolean{ALC@noend}{false} +\newcounter{ALC@line} +\newcounter{ALC@rem} +\newlength{\ALC@tlm} +% +\DeclareOption{noend}{\setboolean{ALC@noend}{true}} +% +\ProcessOptions +% +% ALGORITHMIC +\newcommand{\algorithmicrequire}{\textbf{Require:}} +\newcommand{\algorithmicensure}{\textbf{Ensure:}} +\newcommand{\algorithmiccomment}[1]{\{#1\}} +\newcommand{\algorithmicend}{\textbf{end}} +\newcommand{\algorithmicif}{\textbf{if}} +\newcommand{\algorithmicthen}{\textbf{then}} +\newcommand{\algorithmicelse}{\textbf{else}} +\newcommand{\algorithmicelsif}{\algorithmicelse\ \algorithmicif} +\newcommand{\algorithmicendif}{\algorithmicend\ \algorithmicif} +\newcommand{\algorithmicfor}{\textbf{for}} +\newcommand{\algorithmicforall}{\textbf{for all}} +\newcommand{\algorithmicdo}{\textbf{do}} +\newcommand{\algorithmicendfor}{\algorithmicend\ \algorithmicfor} +\newcommand{\algorithmicwhile}{\textbf{while}} +\newcommand{\algorithmicendwhile}{\algorithmicend\ \algorithmicwhile} +\newcommand{\algorithmicloop}{\textbf{loop}} +\newcommand{\algorithmicendloop}{\algorithmicend\ \algorithmicloop} +\newcommand{\algorithmicrepeat}{\textbf{repeat}} +\newcommand{\algorithmicuntil}{\textbf{until}} + +%changed by alex smola +\newcommand{\algorithmicinput}{\textbf{input}} +\newcommand{\algorithmicoutput}{\textbf{output}} +\newcommand{\algorithmicset}{\textbf{set}} +\newcommand{\algorithmictrue}{\textbf{true}} +\newcommand{\algorithmicfalse}{\textbf{false}} +\newcommand{\algorithmicand}{\textbf{and\ }} +\newcommand{\algorithmicor}{\textbf{or\ }} +\newcommand{\algorithmicfunction}{\textbf{function}} +\newcommand{\algorithmicendfunction}{\algorithmicend\ \algorithmicfunction} +\newcommand{\algorithmicmain}{\textbf{main}} +\newcommand{\algorithmicendmain}{\algorithmicend\ \algorithmicmain} +%end changed by alex smola + +\def\ALC@item[#1]{% +\if@noparitem \@donoparitem + \else \if@inlabel \indent \par \fi + \ifhmode \unskip\unskip \par \fi + \if@newlist \if@nobreak \@nbitem \else + \addpenalty\@beginparpenalty + \addvspace\@topsep \addvspace{-\parskip}\fi + \else \addpenalty\@itempenalty \addvspace\itemsep + \fi + \global\@inlabeltrue +\fi +\everypar{\global\@minipagefalse\global\@newlistfalse + \if@inlabel\global\@inlabelfalse \hskip -\parindent \box\@labels + \penalty\z@ \fi + \everypar{}}\global\@nobreakfalse +\if@noitemarg \@noitemargfalse \if@nmbrlist \refstepcounter{\@listctr}\fi \fi +\sbox\@tempboxa{\makelabel{#1}}% +\global\setbox\@labels + \hbox{\unhbox\@labels \hskip \itemindent + \hskip -\labelwidth \hskip -\ALC@tlm + \ifdim \wd\@tempboxa >\labelwidth + \box\@tempboxa + \else \hbox to\labelwidth {\unhbox\@tempboxa}\fi + \hskip \ALC@tlm}\ignorespaces} +% +\newenvironment{algorithmic}[1][0]{ +\let\@item\ALC@item + \newcommand{\ALC@lno}{% +\ifthenelse{\equal{\arabic{ALC@rem}}{0}} +{{\footnotesize \arabic{ALC@line}:}}{}% +} +\let\@listii\@listi +\let\@listiii\@listi +\let\@listiv\@listi +\let\@listv\@listi +\let\@listvi\@listi +\let\@listvii\@listi + \newenvironment{ALC@g}{ + \begin{list}{\ALC@lno}{ \itemsep\z@ \itemindent\z@ + \listparindent\z@ \rightmargin\z@ + \topsep\z@ \partopsep\z@ \parskip\z@\parsep\z@ + \leftmargin 1em + \addtolength{\ALC@tlm}{\leftmargin} + } + } + {\end{list}} + \newcommand{\ALC@it}{\addtocounter{ALC@line}{1}\addtocounter{ALC@rem}{1}\ifthenelse{\equal{\arabic{ALC@rem}}{#1}}{\setcounter{ALC@rem}{0}}{}\item} + \newcommand{\ALC@com}[1]{\ifthenelse{\equal{##1}{default}}% +{}{\ \algorithmiccomment{##1}}} + \newcommand{\REQUIRE}{\item[\algorithmicrequire]} + 
\newcommand{\ENSURE}{\item[\algorithmicensure]} + \newcommand{\STATE}{\ALC@it} + \newcommand{\COMMENT}[1]{\algorithmiccomment{##1}} +%changes by alex smola + \newcommand{\INPUT}{\item[\algorithmicinput]} + \newcommand{\OUTPUT}{\item[\algorithmicoutput]} + \newcommand{\SET}{\item[\algorithmicset]} +% \newcommand{\TRUE}{\algorithmictrue} +% \newcommand{\FALSE}{\algorithmicfalse} + \newcommand{\AND}{\algorithmicand} + \newcommand{\OR}{\algorithmicor} + \newenvironment{ALC@func}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@main}{\begin{ALC@g}}{\end{ALC@g}} +%end changes by alex smola + \newenvironment{ALC@if}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@for}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@whl}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@loop}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@rpt}{\begin{ALC@g}}{\end{ALC@g}} + \renewcommand{\\}{\@centercr} + \newcommand{\IF}[2][default]{\ALC@it\algorithmicif\ ##2\ \algorithmicthen% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\SHORTIF}[2]{\ALC@it\algorithmicif\ ##1\ + \algorithmicthen\ {##2}} + \newcommand{\ELSE}[1][default]{\end{ALC@if}\ALC@it\algorithmicelse% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\ELSIF}[2][default]% +{\end{ALC@if}\ALC@it\algorithmicelsif\ ##2\ \algorithmicthen% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\FOR}[2][default]{\ALC@it\algorithmicfor\ ##2\ \algorithmicdo% +\ALC@com{##1}\begin{ALC@for}} + \newcommand{\FORALL}[2][default]{\ALC@it\algorithmicforall\ ##2\ % +\algorithmicdo% +\ALC@com{##1}\begin{ALC@for}} + \newcommand{\SHORTFORALL}[2]{\ALC@it\algorithmicforall\ ##1\ % + \algorithmicdo\ {##2}} + \newcommand{\WHILE}[2][default]{\ALC@it\algorithmicwhile\ ##2\ % +\algorithmicdo% +\ALC@com{##1}\begin{ALC@whl}} + \newcommand{\LOOP}[1][default]{\ALC@it\algorithmicloop% +\ALC@com{##1}\begin{ALC@loop}} +%changed by alex smola + \newcommand{\FUNCTION}[2][default]{\ALC@it\algorithmicfunction\ ##2\ % + \ALC@com{##1}\begin{ALC@func}} + \newcommand{\MAIN}[2][default]{\ALC@it\algorithmicmain\ ##2\ % + \ALC@com{##1}\begin{ALC@main}} +%end changed by alex smola + \newcommand{\REPEAT}[1][default]{\ALC@it\algorithmicrepeat% + \ALC@com{##1}\begin{ALC@rpt}} + \newcommand{\UNTIL}[1]{\end{ALC@rpt}\ALC@it\algorithmicuntil\ ##1} + \ifthenelse{\boolean{ALC@noend}}{ + \newcommand{\ENDIF}{\end{ALC@if}} + \newcommand{\ENDFOR}{\end{ALC@for}} + \newcommand{\ENDWHILE}{\end{ALC@whl}} + \newcommand{\ENDLOOP}{\end{ALC@loop}} + \newcommand{\ENDFUNCTION}{\end{ALC@func}} + \newcommand{\ENDMAIN}{\end{ALC@main}} + }{ + \newcommand{\ENDIF}{\end{ALC@if}\ALC@it\algorithmicendif} + \newcommand{\ENDFOR}{\end{ALC@for}\ALC@it\algorithmicendfor} + \newcommand{\ENDWHILE}{\end{ALC@whl}\ALC@it\algorithmicendwhile} + \newcommand{\ENDLOOP}{\end{ALC@loop}\ALC@it\algorithmicendloop} + \newcommand{\ENDFUNCTION}{\end{ALC@func}\ALC@it\algorithmicendfunction} + \newcommand{\ENDMAIN}{\end{ALC@main}\ALC@it\algorithmicendmain} + } + \renewcommand{\@toodeep}{} + \begin{list}{\ALC@lno}{\setcounter{ALC@line}{0}\setcounter{ALC@rem}{0}% + \itemsep\z@ \itemindent\z@ \listparindent\z@% + \partopsep\z@ \parskip\z@ \parsep\z@% + \labelsep 0.5em \topsep 0.2em% + \ifthenelse{\equal{#1}{0}} + {\labelwidth 0.5em } + {\labelwidth 1.2em } + \leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep} + \ALC@tlm\labelsep + } + } + {\end{list}} + + + + + + + + + + + + + + diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/cloud_costs.tex b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/cloud_costs.tex new file mode 100644 index 00000000..838b13ee 
--- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/cloud_costs.tex @@ -0,0 +1,43 @@ +\section{GPU instance costs} +\label{sect:cloud_costs} + +This section provides a brief cost analysis of typical deep learning compute resources both in the cloud and on-premises. +For brevity, we limit this analysis to the popular GPUs available at the time of submission. Note that the exact costs will depend on a variety of factors such as the cloud provider, the region, electricity costs, and market fluctuations. Therefore, we warn the reader to consider this analysis only as a rough estimate. + +Specifically, we estimate the compute costs for the occasional usage scenario: running a single set of experiments over several weeks or conducting infrequent experiments. This scenario covers most research scientists and small organizations. The most straightforward way to provision a GPU server in such a scenario is to rent it from a cloud provider (e.g., GCP or AWS) or a public marketplace (e.g., Vast.ai or Golem). + +While the exact server specifications vary from one provider to another, there are two broad categories of GPU machines: regular and preemptible. Regular instance types typically offer 1--8 GPUs per node with tight uptime guarantees (typically $99.99\%$) and a high-bandwidth network (tens of Gb/s). In turn, preemptible instances provide the same resource type at a significant discount with the condition that the machine can be terminated at any time after short notice. + +To account for individual variations, we report the average rent price over three popular cloud providers. +We consider three popular instance types: two high-end instances with 8 Tesla V100 or A100 GPUs and a low-end instance with a single Tesla T4 GPU. +We also describe several low-end servers and workstations available on a public marketplace. Unlike cloud VMs, these instances are hosted on non-curated hardware with less uptime guarantees (typically 95\% -- 99.9\%), slower network and significant variation in performance. However, marketplace instances are the cheapest in terms of cost per TFLOPS. To quantify this, we report the average over three most affordable instances that fit the chosen minimum requirements. + +As a point of comparison, we also measure each system's training performance for BERT-Large~\cite{bert} fine-tuning on SQuAD v1.1~\cite{squad} in PyTorch with mixed precision. We follow the official benchmarking protocol by~\cite{nvidia_perf} and reuse the official performance results for V100, A100, and T4 instances. The only exception is GTX 1080Ti, where we use full 32-bit precision because that device does not support efficient half-precision operations. 
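For readers who want a rough throughput number on their own hardware, the snippet below shows the general measurement pattern only (mixed-precision training loop, warm-up excluded, samples per second reported). It uses a deliberately tiny stand-in model rather than BERT-Large and is not the official benchmarking protocol cited above; the figures in Table~\ref{fig:cloud_costs} come from that protocol.

\begin{verbatim}
import time
import torch

# tiny stand-in model: the measurement pattern, not the model, is the point
model = torch.nn.Sequential(torch.nn.Linear(1024, 4096), torch.nn.ReLU(),
                            torch.nn.Linear(4096, 1024)).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = torch.cuda.amp.GradScaler()           # mixed-precision loss scaling
batch = torch.randn(32, 1024, device="cuda")
target = torch.randn(32, 1024, device="cuda")

steps, warmup = 100, 10
for step in range(steps + warmup):
    if step == warmup:                         # exclude warm-up iterations
        torch.cuda.synchronize()
        start = time.time()
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():            # fp16/fp32 mixed precision
        loss = torch.nn.functional.mse_loss(model(batch), target)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
torch.cuda.synchronize()
print("training samples/s:", steps * batch.shape[0] / (time.time() - start))
\end{verbatim}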
+ +\begin{table}[h] +\small +\setlength{\tabcolsep}{2pt} +\renewcommand{\arraystretch}{1} +\centering +\caption{Cloud and marketplace GPU instance pricing for short-term usage.} +\label{fig:cloud_costs} +\begin{tabular}{@{}ccccccc@{}} +\toprule +\multicolumn{4}{c}{Minimum system specifications} & \multicolumn{2}{c}{Average cost, \$/hour} & \multirow{2}[2]{*}{\shortstack{BERT-Large\\ training samples/s}} \\ +\cmidrule(lr){1-4}\cmidrule(lr){5-6} +GPU & CPU cores & CPU type & RAM, GB & Regular & Preemptible & \\ \midrule +\multicolumn{7}{c}{Cloud instances} \\ \midrule +8$\times$ V100 & 64 & Intel Xeon Broadwell & 480 & 23.47 & 7.13 & 354 \\ +8$\times$ A100 & 96 & AMD Epyc ROME & 960 & 30.65 & 10.18 & 755 \\ +1$\times$ T4 & 4 & Intel Xeon Cascade Lake & 16 & 0.46 & 0.18 & 18 \\ \midrule +\multicolumn{7}{c}{Marketplace instances} \\ \midrule +6$\times$ 3090 & 32 & AMD Epyc Rome & 480 & 5.04 & 4.17 & 154 \\ +4$\times$ 2080Ti & 16 & Intel Xeon Haswell & 240 & 0.96 & 0.84 & 83.4 \\ +1$\times$ RTX 1080Ti & 8 & Intel Xeon Haswell & 16 & 0.22 & 0.16 & 12 \\ \bottomrule +\end{tabular} +\end{table} + +Table~\ref{fig:cloud_costs} shows two main tendencies. First, preemptible \textit{cloud} instances are, on average, three times cheaper than their non-preemptible counterparts\footnote{The cost can be up to $11{\times}$ cheaper for some instance types, e.g. Azure V100 instances in the central US region at the time of writing.}. Second, the high-end HPC-grade servers that offer the highest raw performance are less cost-effective than lower-tier servers and marketplace instances. In theory, one could match the raw floating-point performance of a $8{\times}$V100 instance at a fraction of its cost using multiple lower-tier workstations, such as $4{\times}$ RTX 2080Ti, with a smaller total cost. +However, in practice, running distributed training with these workstations is challenging due to their unreliability and slow network connection. + +Note that this analysis does not represent the cloud costs for sustained GPU usage. If an organization plans to constantly use GPU resources over a period of multiple years, they can reduce the costs by deploying their own compute infrastructure or relying on the sustained usage discounts reaching up to 60--70\%. Thus, the long-term compute costs are much harder to analyze and depend on a number of additional factors, such as local electricity prices for on-premise infrastructure. However, this scenario offers similar trade-offs: HPC-grade infrastructure offers greater interconnectivity, but requires expensive network interface cards, high-end switches and a more complex setup process. \ No newline at end of file diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/detailed_setup.tex b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/detailed_setup.tex new file mode 100644 index 00000000..efe5f3f7 --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/detailed_setup.tex @@ -0,0 +1,112 @@ +\section{Detailed experimental setup} +\label{sect:detailed_setup} + +In this section, we provide the detailed hardware configuration of servers used for each of our distributed training experiments. + +\subsection{ImageNet training}\label{sect:detailed_setup_resnet} + +Both homogeneous and heterogeneous training setups for ImageNet are provisioned in our on-premise infrastructure across multiple data centers and an office space (for the heterogeneous setup only). 
+ +\paragraph{Homogeneous.}For the homogeneous setup, we use 16 identical instances with the following specifications: +\begin{itemize} + \item \textbf{GPU:} V100-PCIe, + \item \textbf{CPU:} 6 vCPUs (Xeon E5-2650v4), + \item \textbf{RAM:} 64GB. +\end{itemize} + +\paragraph{Heterogeneous.}In turn, the heterogeneous setup contains multiple instance types listed in Table~\ref{fig:tab_setup_resnet}: +\begin{table}[h] +\centering +\caption{\textbf{Heterogeneous} setup for ImageNet training.} +\label{fig:tab_setup_resnet} +\renewcommand{\arraystretch}{1} +\begin{tabular}{@{}cccccc@{}} +\toprule +Instances & GPUs & GPU type & Cores & RAM, GB & CPU type \\ +\midrule +4 & 1 & V100-PCIe & 6 & 64 & E5-2650v4 \\ +17 & 2 & GTX 1080Ti & 8 & 64 & E5-2650v4 \\ +7 & 1 & GTX 1080Ti & 4 & 32 & E5-2650v4 \\ +16 & 1 & P40 & 4 & 32 & E5-2667v2 \\ +20 & 1 & M40-24GB & 4 & 32 & E5-2667v2 \\ + +\bottomrule +\end{tabular} +\end{table} + + + + +\subsection{ALBERT training}\label{sect:detailed_setup_albert} + + +\paragraph{Homogeneous.}For the homogeneous setup, we use a single virtual machine with the following specifications: +\begin{itemize} + \item \textbf{GPU:} $8{\times}$ V100-PCIe, + \item \textbf{CPU:} 48 vCPUs (Xeon E5-2650v4), + \item \textbf{RAM:} 488GB. +\end{itemize} + +At the time of writing, the cloud rent cost for this instance is \textbf{\$24.48} per hour. + +\paragraph{Heterogeneous.}Our heterogeneous setup is composed of two parts: AWS EC2 Spot instances and crowdsourced machines from the \texttt{Vast.ai} marketplace. For spot instances, we picked the smallest suitable instance size available from the cloud provider and further limited their bandwidth to 1Gb/s\footnote{We use \texttt{tc qdisc} Linux utility to artificially limit the network throughput, similarly to~\cite{MLSYS2019_d09bf415}}. As for marketplace instances, we report the hardware specifications for each worker gathered 1 hour after the start of ALBERT training. + +Since both cloud and marketplace instances are preemptible, the actual cost of the server fleet will vary based on the current price. For simplicity, we report the maximum hourly price we ended up paying for this instance (enforced via maximum bid). Finally, some marketplace instances have missing specifications, such as unknown CPU type. This is likely caused by non-standard virtualization configured by the device owner. The resulting fleet configuration, shown in Table~\ref{fig:tab_setup}, costs up to \$15.43/hour, depending on the number of active instances. + +\begin{table*}[ht!] 
+\centering +\caption{\textbf{Heterogeneous} setup for ALBERT training.} +\label{fig:tab_setup} +\small +\setlength{\tabcolsep}{2pt} +\hspace{7pt}\begin{tabular}{@{}ccccccc@{}} +\toprule +GPU & Cores & RAM, GB & CPU type & Download, Mb/s & Upload, Mb/s & +Cost, \$/hour \\ +\midrule +\multicolumn{7}{c}{Preemptible \texttt{g4dn.xlarge} instances ($32{\times}$)} \\ +\midrule +T4 & 4 & 16 & Xeon Platinum 8259CL & 1000 & 1000 & 0.1578 \\ + +\midrule +\multicolumn{7}{c}{Marketplace instances} \\ +\midrule +GTX 1070Ti & 6 & 16 & E5-2640 & 425 & 255 & 0.036 \\ +GTX 1070Ti & 6 & 16 & i3-6100T & 121 & 36 & 0.06 \\ +GTX 1080Ti & 4 & 20 & i3-6096P & 817 & 308 & 0.101 \\ +GTX 1080Ti & 20 & 129 & E5-2630v4 & 660 & 475 & 0.182 \\ +GTX 1080Ti & 1 & 16 & i7-7700K & 245 & 210 & 0.302 \\ +GTX 1080Ti & 48 & 97 & Xeon Platinum 8124 & 583 & 539 & 0.217 \\ +GTX 1080Ti & 10 & 16 & Unknown & n/a & n/a & 0.15 \\ +GTX 1080Ti & 4 & 16 & Xeon Gold 6149 & 98 & 100 & 0.2 \\ % +GTX 1080Ti & 4 & 16 & Xeon Gold 6149 & 99 & 98 & 0.2 \\ % +GTX 1080Ti & 4 & 16 & Xeon Gold 6149 & 99 & 99 & 0.2 \\ % +GTX 1080Ti & 4 & 16 & Xeon Gold 6149 & 99 & 99 & 0.2 \\ % +RTX 2070S & 24 & 32 & E5-2620v2 & 199 & 25 & 0.199 \\ +RTX 2070S & 32 & 97 & E5-2650 & 162 & 64 & 0.285 \\ +RTX 2080 & 6 & 16 & E5-2620v3 & 271 & 287 & 0.25 \\ +RTX 2080 & 24 & 32 & E5-2630v3 & 199 & 25 & 0.302 \\ +RTX 2080S & 4 & 32 & E5-2697v4 & 101 & 99 & 0.292 \\ % +RTX 2080S & 4 & 32 & E5-2697v4 & 93 & 99 & 0.292 \\ % +RTX 2080S & 4 & 32 & E5-2697v4 & 94 & 98 & 0.292 \\ % +RTX 2080S & 4 & 32 & E5-2697v4 & 94 & 98 & 0.292 \\ % +RTX 2080S & 4 & 32 & E5-2697v4 & 100 & 99 & 0.292 \\ % +RTX 2080Ti & 4 & 16 & Ryzen Threadripper 3960x & 279 & 271 & 0.35 \\ +RTX 2080Ti & 8 & 129 & E5-2670v3 & 616 & 672 & 0.201 \\ +RTX 2080Ti & 6 & 32 & E5-2620v3 & 217 & 61 & 0.22 \\ +RTX 2080Ti & 8 & 16 & E5-2697v2 & 100 & 58 & 0.3 \\ +RTX 2080Ti & 8 & 21 & E5-2697v2 & 145 & 49 & 0.243 \\ +RTX 2080Ti & 12 & 32 & Unknown & 111 & 92 & 0.326 \\ +RTX 2080Ti & 12 & 64 & E5-2690v3 & 205 & 61 & 0.549 \\ +RTX 3080 & 16 & 16 & i7-10700K & 69 & 49 & 0.462 \\ +RTX 3090 & 14 & 32 & E5-2695v3 & 93 & 37 & 0.498 \\ +RTX 3090 & 16 & 32 & Ryzen 9 3950X & 338 & 38 & 0.511 \\ +Titan RTX & 4 & 32 & Xeon W-3223 & 321 & 115 & 1 \\ +Titan RTX & 4 & 32 & Xeon Gold 6149 & 99 & 100 & 0.702 \\ % +Titan V & 8 & 32 & i7-7700K & 97 & 50 & 0.282 \\ +V100-FHHL & 8 & 60 & Xeon Gold 6148 & 544 & 584 & 0.39 \\ +\midrule +\multicolumn{6}{c}{Total hourly cost (as listed):} &\bf 15.43 \\ +\bottomrule +\end{tabular} +\end{table*} diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/experiments.tex b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/experiments.tex new file mode 100644 index 00000000..14822e19 --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/experiments.tex @@ -0,0 +1,96 @@ +\vspace{-10pt} +\section{Experiments}\label{sect:experiments} +\vspace{-2pt} +In this section, we +conduct empirical evaluation of the proposed averaging protocol and its corresponding optimization algorithm. +First, we check the theoretical properties of Moshpit All-Reduce in a controlled setup (Section~\ref{sect:experiments_averaging}). Then, we compare Moshpit SGD with other distributed methods on practical tasks of image classification and masked language model pretraining (Sections~\ref{sect:experiments_vision} and~\ref{sect:experiments_nlp}). 
+ +\vspace{-4pt} +\subsection{Decentralized averaging} +\label{sect:experiments_averaging} +In this series of experiments, we aim to empirically verify the convergence and fault tolerance properties proven in Section~\ref{sect:method_convergence}. +To measure this in a controlled setting, we create peers with parameters that are scalar values drawn from the standard Gaussian distribution. We study the convergence of different distributed methods with respect to the number of workers $N$ and their individual failure rate for a single iteration of averaging $p$ (failed peers return in the next round). + +We compare Moshpit Averaging with the following algorithms from prior work: All-Reduce (with restarts in case of node failures), Gossip, PushSum (equivalent to the method described in~\cite{sgpush}). Also, we provide the results of averaging in random groups as a simpler version of our approach. However, the implementation of group averaging maintains approximately the same group size across all iterations: this property might be hard to achieve in a decentralized setting, and as a result, the estimate of this method's performance should be considered highly optimistic. + +We report the average squared difference between the worker parameters and the actual average of all values; the results are averaged across 100 restarts from different random initializations. +We compare the convergence for 512--1024 peers and consider failure probabilities ranging from 0 to 0.01. For Moshpit Averaging and random group averaging, we use groups of size 32, which corresponds to $M=32$ and $d=2$ for Algorithm~\ref{alg:moshpit}. + +\vspace{-4pt} +\begin{figure}[h] +\noindent +\centering +\includegraphics[width=\textwidth]{resources/averaging.pdf} +\caption{Convergence of averaging algorithms in different configurations.} +\label{fig:averaging} +\end{figure} +\vspace{-8pt} + +Figure~\ref{fig:averaging} displays the results of experiments for several combinations of $N$ and $p$; the complete results with additional grid configurations are available in Appendix~\ref{sect:extra_averaging}. We make several key observations: +\begin{enumerate}[leftmargin=*] + \vspace{-2pt}\item When the failure rate of each peer is zero, standard All-Reduce predictably computes the average faster than all other methods. However, as soon as $p$ reaches a value of at least 0.005, the number of retries needed for the success becomes prohibitively high. + \vspace{-2pt}\item Previous decentralized averaging methods, such as Gossip or PushSum, require significantly more iterations for convergence to the global average than Moshpit All-Reduce, likely due to the structure of their communication graphs. + \vspace{-2pt}\item As discussed in Section~\ref{sect:method_algorithm}, when the total number of peers is equal to the grid capacity and there are no failures, Moshpit All-Reduce matches the result of regular All-Reduce with the number of steps equal to the number of grid dimensions (2 in this case). + \vspace{-2pt}\item Averaging in random groups can perform comparably to Moshpit Averaging when the number of peers is less than half of the grid capacity. The reason for this behavior is that when the workers do not fully occupy the grid, the group sizes are no longer guaranteed to be equal across groups and across iterations. In the worst case, there can be groups of only one peer for certain grid coordinates, which may significantly affect the convergence. 
However, as the grid utilization grows, Moshpit Averaging starts to outperform random group averaging. Moreover, even if we use 512 peers, arranging them in a proper 8x8x8 grid leads to faster convergence. +\end{enumerate} + +\pagebreak[4] + + +\subsection{ImageNet training}\label{sect:experiments_vision} +Here, we evaluate the performance of Moshpit SGD in distributed training. More specifically, we train ResNet-50~\cite{resnet} on the ILSVRC~\cite{imagenet_cvpr09} dataset, following the training protocol of~\cite{goyal2017accurate}. Trainers use SGD with Nesterov momentum with a batch size of 256 and 32-bit precision regardless of the GPU type\footnote{For GPUs that cannot fit this into memory, we accumulate gradients over 2 batches of 128 examples.}. We evaluate the following training strategies: +\begin{itemize}[leftmargin=*]\vspace{-2px} + \item \textbf{All-Reduce SGD (AR-SGD)} --- traditional distributed training with all-reduce gradient averaging; + \item \textbf{Asynchronous Decentralized Parallel SGD (AD-PSGD)} --- parallel SGD that runs gossip communication in a cycle: each worker averages parameters with 2 neighbors~\cite{ad_psgd}. Communication rounds are overlapped with computation; + \item \textbf{Stochastic Gradient Push (SGP)} --- a more advanced algorithm with an exponential communication graph and push-based communication~\cite{sgpush}; + \item \textbf{Moshpit SGD} --- similar to \textbf{SGP}, but with 1 round of Moshpit Averaging instead of PushSum. +\end{itemize}\vspace{-2px} + +We report top-1 validation accuracy as a function of training time in two experimental setups: +\begin{itemize}[leftmargin=*]\vspace{-4px} + \item \textbf{Homogeneous}: 16 servers with a single Tesla V100-PCIe GPU, 6 CPU cores, and 64GB RAM. + \item \textbf{Heterogeneous}: a total of 81 GPUs (V100, 1080Ti, and P40) across 64 servers and workstations.\footnote{We provide a detailed configuration in Appendix~\ref{sect:detailed_setup}.} +\end{itemize}\vspace{-4px} + +All servers and workstations communicate over the network with 1Gb/s Ethernet (non-dedicated symmetric bandwidth). The machines are located in two data centers and one office within 300 km of one another. The communication latency is 1--6ms depending on the location. To simulate shared usage, at the beginning of each communication round we inject additional latency sampled from the exponential distribution~\cite{sukhov2016generating} with the mean of 100ms. + +For Moshpit SGD, we use a two-dimensional ``grid'' with 4 and 8 groups for homogeneous and heterogeneous setups respectively. For AD-PSGD, we attempt to compensate for slow convergence by training for 60 more epochs without changing the learning rate schedule. Finally, we only report AR-SGD in the first setup, as it is unsuitable for heterogeneous hardware.% + + + +The results in Figure~\ref{fig:all} (Left) demonstrate that the two most efficient strategies for our setting are Moshpit SGD and SGP. In the \textbf{homogeneous} setup, Moshpit is only slightly more efficient than SGP, likely due to higher efficiency of all-reduce. This advantage increases to over 30\% for the \textbf{heterogeneous} setup with 64 servers. In turn, AR-SGD demonstrates the best performance per iteration, but its training time is by far the longest due to network latency ($1.5{\times}$ of Moshpit SGD). Finally, AD-PSGD predictably shows the best throughput (time per epoch), but achieves lower accuracy even after training for 150 epochs. 
We report results for smaller setups in Appendix~\ref{sect:extra_classification}. % + + +\subsection{Masked Language Model training} +\label{sect:experiments_nlp} +Finally, we evaluate Moshpit All-Reduce training performance in the wild with preemptible cloud instances. For this experiment, we perform one of the most resource-demanding tasks in modern deep learning --- unsupervised pretraining of Transformers~\cite{bert,roberta,radford2019language,gpt3}. +We opt for the ALBERT model~\cite{albert} to make better use of communication-constrained devices. This model has fewer trainable parameters due to layer-wise weight sharing. + +\begin{figure*}[t] + \noindent + \centering + \vspace{-10pt} + \includegraphics[width=\textwidth]{resources/albert_hours.pdf} + \vspace{-16pt} + \caption{\textbf{(Left, Middle)} ResNet-50 top-1 validation accuracy for ImageNet as a function of training time (left) and epochs (middle). \textbf{(Right)} Full training objective (MLM + SOP) of ALBERT-large on BookCorpus as a function of training time.} + \label{fig:all}\vspace{-6pt} +\end{figure*} + + +Specifically, we train ALBERT-large (18M parameters) on the BookCorpus~\cite{bookcorpus} dataset, following the training setup from the original paper. We minimize the masked language modeling loss (MLM) along with the sentence order prediction loss (SOP) using the LAMB optimizer~\cite{You2020Large} with a global batch size of 4096 and sequence length 512. We measure convergence in terms of full training loss~\cite{lin2020multinode,fedus2021switch}. Similarly to Section~\ref{sect:experiments_vision}, we use two training setups: +\vspace{-4pt}\begin{itemize}[leftmargin=*] + \item \textbf{Homogeneous:} a single cloud instance with $8$ Tesla V100-PCIe GPUs and 56 vCPUs; + \item \textbf{Heterogeneous:} a total of 66 preemptible GPUs, 32 of which are cloud T4, and the remaining 34 are various devices rented on a public marketplace. +\end{itemize}\vspace{-4pt} + +Despite the fact that the latter setup has almost $3{\times}$ more raw compute\footnote{Based on official performance benchmarks~\cite{nvidia_perf}.}, its hourly rent costs less than the homogeneous setup due to relying on preemptible instances\footnote{Please refer to Appendix~\ref{sect:detailed_setup} for full experimental setups.}. This instance type is much cheaper than regular cloud instances, but it can be interrupted at any time. As a side-effect, the participants in \textbf{heterogeneous} setup are also spread across 3 continents with uneven network bandwidth, ranging from 100Mb/s to 1500Mb/s per worker. These limitations make it impractical to deploy conventional all-reduce protocols. By contrast, the fully decentralized nature of Moshpit SGD allows it to operate on unreliable nodes. + +In this setup, the participants accumulate gradients over multiple local batches and use DHT to track the global batch size. Once the swarm collectively accumulates gradients over 4096 training samples, it runs 2 rounds of Moshpit All-Reduce with $M{=}8$ and $d{=}2$. Unfortunately, training with simple parameter averaging does not converge, likely due to diverging LAMB statistics. To mitigate this issue, workers recover ``pseudo-gradients''~\cite{reddi2021adaptive,chen2020toward} after averaging to update the optimizer statistics. + + + + +Figure~\ref{fig:all} (right) demonstrates that Moshpit SGD with a fully preemptible fleet of machines trains 1.5 times faster than the traditional data-parallel setup. 
+The final loss achieved by two training strategies is the same within the margin of error. +A closer investigation reveals that this speedup is entirely explained by the reduced iteration time. +An interesting observation is that the iteration time of Moshpit SGD varies between {10--22} seconds, while AR-SGD consistently spends {25}s per step. This can be explained by natural variation in the preemptible fleet size: there were 30--66 active participants depending on the resource availability. diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/extra_plots.tex b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/extra_plots.tex new file mode 100644 index 00000000..72603c7a --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/extra_plots.tex @@ -0,0 +1,63 @@ +\section{Additional averaging experiments} +\label{sect:extra_averaging} + +In this section, we evaluate the averaging precision with the same methodology as in~\ref{sect:experiments_averaging}, but for multiple different worker configurations. + +Table~\ref{tab:full_averaging} provides the complete results of our experiments that were used to make conclusions in the main experimental section: instead of reporting the mean squared error for different iterations, we provide the number of rounds that was required to achieve the error of $10^{-9}$ and $10^{-4}$. + +In Figure~\ref{fig:many_averagings}, plots 1--5 explore several combinations of grid sizes and failure rates, whereas plot 6 (bottom right) demonstrates a setup with the same number of peers ($10^6$) arranged into several different grid sizes and its relation to convergence. Note that $M{=}32$ outperforms the alternatives only for the specific failure rate of $0.001$. + +\begin{table}[ht] +\centering +\caption{Averaging performance of different algorithms. 
Values denote the number of iterations required to achieve the error of $10^{-9}$ ($10^{-4}$ in parentheses), the best result is in bold.} +\vspace{1em} +\label{tab:full_averaging} +\begin{tabular}{@{}llccccc@{}} +\toprule +$N$ & $p$ & All-Reduce & Gossip & PushSum & Random groups & Moshpit \\ \midrule +512 & 0 & \bf 1.0 (1.0) & 50.0 (50.0) & 47.6 (15.6) & 6.1 (3.0) & 8.2 (3.5) \\ +512 & 0.001 & \bf 1.6 (1.6) & 50.0 (50.0) & 47.6 (15.6) & 6.3 (3.0) & 8.1 (3.7) \\ +512 & 0.005 & 10.9 (10.9) & 50.0 (50.0) & 47.8 (15.6) & \bf 6.3 (3.0) & 8.7 (3.9) \\ +512 & 0.01 & 41.7 (41.7) & 50.0 (50.0) & 47.8 (15.6) & \bf 6.6 (3.0) & 9.1 (3.9) \\ \midrule +768 & 0 & \bf 1.0 (1.0) & 50.0 (50.0) & 43.2 (13.8) & 6.2 (3.0) & 6.0 (3.0) \\ +768 & 0.001 & \bf 1.8 (1.8) & 50.0 (50.0) & 43.2 (13.8) & 6.5 (3.0) & 6.2 (3.0) \\ +768 & 0.005 & 28.7 (28.7) & 50.0 (50.0) & 43.2 (14.1) & \bf 6.6 (3.0) & \bf 6.6 (3.0) \\ +768 & 0.01 & 50.0 (50.0) & 50.0 (50.0) & 43.9 (14.2) & 7.0 (3.0) & \bf 6.8 (3.0) \\ \midrule +900 & 0 & \bf 1.0 (1.0) & 50.0 (50.0) & 45.0 (14.7) & 6.4 (3.0) & 5.0 (2.8) \\ +900 & 0.001 & \bf 1.8 (1.8) & 50.0 (50.0) & 45.0 (14.7) & 6.3 (3.0) & 5.5 (3.0) \\ +900 & 0.005 & 50.0 (50.0) & 50.0 (50.0) & 45.2 (14.7) & 6.7 (3.0) &\bf 5.9 (3.0) \\ +900 & 0.01 & 50.0 (50.0) & 50.0 (50.0) & 45.6 (14.9) & 7.0 (3.1) & \bf 6.4 (3.1) \\ \midrule +1024 & 0 & \bf 1.0 (1.0) & 50.0 (50.0) & 49.0 (16.2) & 6.2 (3.0) & 2.0 (2.0) \\ +1024 & 0.001 & \bf 2.0 (2.0) & 50.0 (50.0) & 49.0 (16.3) & 6.5 (3.0) & 3.4 (2.2) \\ +1024 & 0.005 & 42.6 (42.6) & 50.0 (50.0) & 49.5 (16.3) & 6.7 (3.0) & \bf 5.4 (2.9) \\ +1024 & 0.01 & 50.0 (50.0) & 50.0 (50.0) & 49.5 (16.3) & 6.9 (3.1) & \bf 5.9 (3.0) \\ \bottomrule +\end{tabular} +\end{table} + +\begin{figure}[h] + \centering + \includegraphics[width=\linewidth]{resources/multiple_graphics.pdf} + \vspace{-20pt} + \caption{Averaging error of Moshpit All-Reduce as a function of the iteration number for different configurations and failure rates.} + \label{fig:many_averagings} +\end{figure} + +\section{Additional image classification experiments} +\label{sect:extra_classification} + +Aside from the two evaluation scenarios provided in~\ref{sect:experiments_vision}, we also measure the performance of Moshpit-SGD in a non-distributed setup, i.e. on a single server with multiple GPUs. We conduct this experiment on the same $8{\times}$ V100 machine that was used in the \textbf{homogeneous} setup for training ALBERT (see Appendix~\ref{sect:detailed_setup_albert}). + +\begin{figure}[h] + \centering + \begin{tabular}{cc} + \hspace{-10pt} + \includegraphics[width=0.5\textwidth]{resources/resnet50_local.pdf} & + \includegraphics[width=0.5\textwidth]{resources/resnet50_local_epochs.pdf} + \end{tabular} + \caption{ + ResNet-50 top-1 validation accuracy on ImageNet when training on a single node with $8{\times}$ V100-PCIe GPUs. + \textbf{(Left)} Convergence in terms of training time, \textbf{(Right)} Convergence in terms of training epochs} + \label{fig:resnet_local}\vspace{-8pt} +\end{figure} + +As Figure~\ref{fig:resnet_local} demonstrates, Moshpit SGD is slower than AR-SGD by approximately $25\%$. This result is expected, since our implementation of Moshpit All-Reduce is more general and communicates over a TCP connection, whereas AR-SGD uses direct peer-to-peer GPU communication over PCIe. On average, this incurs a slowdown of $27\%$ in terms of training time. 
\ No newline at end of file diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/intro.tex b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/intro.tex new file mode 100644 index 00000000..e57bc2c0 --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/intro.tex @@ -0,0 +1,28 @@ +\section{Introduction}\label{sect:intro} + +Many recent influential discoveries in deep learning were enabled by the trend of scaling model and dataset size. +Over the last decade, computer vision has grown from training models with 60 million parameters~\cite{alexnet} on 1.3 million images~\cite{imagenet_cvpr09} to 15 times more parameters~\cite{Kolesnikov2020BigT} and 200 times more training data~\cite{jft-300m}. In natural language processing, the state-of-the-art language models~\cite{gpt3} with 175 billion parameters are trained on over 570GB of texts, and even this does not saturate the model quality~\cite{kaplan2020scaling}. +Training these large models can take years even with a top-of-the-line GPU server~\cite{gpt3costlambda}. As a result, researchers and practitioners often have to run distributed training with multiple machines~\cite{mlperf}. + +The dominant approach to distributed deep learning is data-parallel training~\cite{valiant1990bridging}, where each worker processes a fraction of the training batch and then exchanges its gradients with peers. If done naïvely, the gradient exchange step can overload the network as the number of workers increases. To combat this issue, modern distributed training algorithms take advantage of communication-efficient protocols, such as all-reduce~\cite{bandwidth_optimal_allreduce}. These protocols +allow workers to collectively compute the global average gradient with a constant communication overhead, regardless of the total number of peers. + +However, this efficiency makes the protocols more fragile: if any single participant fails or takes too long to process its batch, all other nodes are stalled. +Therefore, scaling all-reduce protocols beyond a couple of servers requires specialized infrastructure with dedicated ultra-high bandwidth networking~\cite{mlperf}. +This kind of infrastructure is notoriously expensive compared to regular +GPU servers or preemptible cloud VMs (see Appendix~\ref{sect:cloud_costs} for details). + +Hence, it is tempting to consider distributed training on cheap unreliable instances as a cost-efficient alternative. A similar scenario arises in federated learning~\cite{mcmahan2017communication}, where a single model is trained on heterogeneous devices due to privacy concerns. +In both scenarios, workers use a shared network, where both latency and bandwidth can vary drastically due to interference from other users~\cite{variability_azure}\nocite{variability_aws}. Furthermore, compute nodes are also subject to failure (or preemption) caused by factors beyond the protocol's control. + +Running large-scale distributed training in these circumstances requires fault- and latency-tolerant algorithms~\cite{lian2017can,sgpush}. Most of these algorithms replace all-reduce averaging with \textbf{gossip}: each participant periodically downloads the latest parameters from their neighbors in a sparsely connected communication graph and averages the results. The updates gradually propagate through the graph over multiple rounds of averaging. +However, the communication required to perform gossip grows linearly with the number of neighbors. 
Hence, when scaling to hundreds of peers, decentralized SGD has to keep the communication graph sparse, slowing down the convergence. + +In this work, we propose an alternative approach. Instead of relying on a predefined communication graph, participants dynamically organize themselves into groups using a fully decentralized matchmaking algorithm called \textbf{Moshpit All-Reduce}. This strategy allows us to use communication-efficient all-reduce protocols that significantly reduce the network load compared to gossip-based averaging, while still being able to operate in unreliable hardware and network conditions. + +Our contributions can be summarized as follows: +\begin{itemize} + \item We propose {\bf Moshpit All-Reduce} --- a novel decentralized averaging protocol for large-scale training with unreliable communication-constrained devices. According to our analysis, this method has exponential convergence rate independent of network topology and size. + \item Armed with this averaging protocol, we develop {\bf Moshpit SGD} for distributed optimization. We derive convergence rates for this algorithm and establish its equivalence to Centralized (Local) SGD in terms of iteration complexity under realistic assumptions. + \item Our experiments demonstrate that Moshpit All-Reduce is significantly more efficient under network latency in realistic conditions. In particular, we train ResNet-50 on ImageNet to 75\% accuracy 1.3 times faster than existing decentralized training algorithms and pretrain ALBERT-large 1.5 times faster on preemptible cloud VMs.\footnote{Implementation and code of experiments are at \href{https://github.com/yandex-research/moshpit-sgd}{\texttt{github.com/yandex-research/moshpit-sgd}}.} +\end{itemize} diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/load_balancing.tex b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/load_balancing.tex new file mode 100644 index 00000000..36b87d18 --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/load_balancing.tex @@ -0,0 +1,38 @@ +\section{Training with a dynamic number of peers} +\label{sect:load_state_from_peers} + +Many practical setups with unreliable devices allow peers to join or leave at any time, which can produce undesirable side-effects. For instance, consider a participant that joins the ``swarm'' midway through the training process. If this participant starts with the initial model parameters, it can undo some of the progress made by other peers. + +To circumvent this issue, we require each new participant to download the latest parameters from a random up-to-date peer discovered through DHT. The same technique is used to synchronize the optimizer statistics and the learning rate schedule. This protocol is also triggered if a peer becomes desynchronized with others, e.g., after a network freeze. + +\section{Load balancing via linear programming} +\label{sect:load_balancing} + +When running Moshpit Averaging on heterogeneous devices, one must regularly perform Butterfly All-Reduce among peers with uneven network bandwidth. +In order to speed up the protocol, we can make low-throughput peers receive, average, and send smaller partitions of the averaged vector; conversely, the high-throughput peers can process greater fractions of the input vector. +To compute the optimal partitioning, peers must solve an optimization problem that minimizes the total time spent on communication during all-reduce. 
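For concreteness, the sketch below sets up and solves the linear program that is derived in the remainder of this section, using SciPy's \texttt{linprog} (the solver mentioned at the end of the section). The helper name \texttt{optimal\_weights} and the example bandwidth values are hypothetical illustrations rather than part of the released implementation, and the sketch relies on SciPy's default LP method rather than the interior-point solver reported in the text.

\begin{verbatim}
# Minimal sketch (not the authors' code): choose the fraction w_i of the averaged
# vector handled by peer i so that the slowest transfer finishes as fast as possible.
#   minimize xi   s.t.   sum(w) = 1,   w >= 0,   xi >= (1 - w_i + (M-1)*w_i) / b_i
import numpy as np
from scipy.optimize import linprog

def optimal_weights(bandwidths):
    b = np.asarray(bandwidths, dtype=float)
    M = len(b)
    c = np.zeros(M + 1)                        # variables: (w_1, ..., w_M, xi)
    c[-1] = 1.0                                # objective: minimize xi
    A_ub = np.zeros((M, M + 1))                # (M-2)*w_i / b_i - xi <= -1 / b_i
    A_ub[np.arange(M), np.arange(M)] = (M - 2) / b
    A_ub[:, -1] = -1.0
    b_ub = -1.0 / b
    A_eq = np.concatenate([np.ones(M), [0.0]]).reshape(1, -1)   # sum(w) = 1
    b_eq = [1.0]
    bounds = [(0, None)] * M + [(None, None)]  # w_i >= 0, xi unbounded
    result = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds)
    return result.x[:M]

# Hypothetical bandwidths: faster peers receive larger fractions of the vector.
print(optimal_weights([1.0, 1.0, 1.0, 0.1]))
\end{verbatim}

Peers with higher bandwidth are assigned larger fractions, and the weight of a sufficiently slow peer can drop to zero, matching the behaviour discussed at the end of this section.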
+ +Consider a group of $M$ peers with network bandwidths $b_1, ..., b_M$, defined for simplicity as the minimum of the upload and download speed for each peer. Our objective is to find $w_i$ --- a fraction of all input vectors to be processed by the $i$-th peer. + +In Butterfly All-Reduce, each peer $i$ splits its vector into parts and sends these parts to corresponding peers. Since there is no need to send $w_i$ to itself, $i$-th peer will upload a total of $1 - w_i$ of the vector to its peers. +On the receiving side, peer $i$ will average $w_i$ of the vector from all peers in its group. To do so, it must download $M-1$ vector parts of size $w_i$ from all other peers. +After that, peers distribute the averaged parts by running the same procedure in reverse (see Figure~\ref{fig:butterfly_allreduce}). + +Thus, the communication time for each peer is proportional to $t_i = (1-w_i+(M-1) w_i) \cdot \frac{1}{b_i}$ and the total runtime of Butterfly All-Reduce is the maximum communication time over all peers: $T = \max_i t_i=\max_i (1-w_i+(M-1) w_i) \cdot \frac{1}{b_i}$. Formally, we minimize $T$ with respect to $w_i$ with two constraints on the fraction weights: +\begin{alignat*}{3} +\min_w&\quad &\max_i &(1-w_i +&(M-1)w_i)\cdot\frac{1}{b_i}&\\ +\text{subject to}&\quad& \sum_{i=1}^M w_i = 1&&&\\ +&&w_i \geq 0 &&&\forall i=1,\ldots,M +\end{alignat*} + +Because the functions being maximized and the constraints are linear in $w_i$, this problem can be reduced to linear programming~\cite{kaplan1974application}. Namely, we can minimize a surrogate variable $\xi$ such that $\forall i, \ \xi \geq (1-w_i+(M-1)\cdot w_i) \cdot \frac{1}{b_i}$. The resulting linear program is formulated as follows: + +\begin{alignat*}{3} +\min_{w,\xi}&\quad& \xi && &\\ +\text{subject to}&\quad& \sum_{i=1}^M w_i& = 1 &&\\ +&\quad& w_i& \geq 0 &&\quad \forall i=1,\ldots,M\\ +&\quad&\xi&\geq (1-&w_i+(M-1)w_i)\cdot\frac{1}{b_i}&\quad\forall i=1,\ldots,M +\end{alignat*} + +We solve this problem using the interior point method~\cite{andersen} implemented as part of the SciPy package (\texttt{scipy.optimize.linprog}). +Note that depending on the conditions given by participant bandwidth, optimal weights of specific peers might be equal to 0 in some cases. In essence, this allows our method to smoothly interpolate between data parallelism~\cite{valiant1990bridging}, parameter server~\cite{parameter_server_first} and sharded parameter server~\cite{sharded_ps_first} in manner similar to BytePS~\cite{byteps}. \ No newline at end of file diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/main.bbl b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/main.bbl new file mode 100644 index 00000000..2ac26dbd --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/main.bbl @@ -0,0 +1,825 @@ +\begin{thebibliography}{100} + +\bibitem{alexnet} +Alex Krizhevsky, Ilya Sutskever, and Geoffrey~E Hinton. +\newblock Imagenet classification with deep convolutional neural networks. +\newblock In F.~Pereira, C.~J.~C. Burges, L.~Bottou, and K.~Q. Weinberger, + editors, {\em Advances in Neural Information Processing Systems 25}, pages + 1097--1105. Curran Associates, Inc., 2012. + +\bibitem{imagenet_cvpr09} +J.~Deng, W.~Dong, R.~Socher, L.-J. Li, K.~Li, and L.~Fei-Fei. +\newblock {ImageNet: A Large-Scale Hierarchical Image Database}. +\newblock In {\em CVPR09}, 2009. + +\bibitem{Kolesnikov2020BigT} +Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, + S.~Gelly, and N.~Houlsby. 
+\newblock Big transfer (bit): General visual representation learning. +\newblock In {\em ECCV}, 2020. + +\bibitem{jft-300m} +Chen Sun, Abhinav Shrivastava, Saurabh Singh, and Abhinav Gupta. +\newblock Revisiting unreasonable effectiveness of data in deep learning era. +\newblock In {\em ICCV}, 2017. + +\bibitem{gpt3} +Tom~B Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla + Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, + et~al. +\newblock Language models are few-shot learners. +\newblock {\em arXiv preprint arXiv:2005.14165}, 2020. + +\bibitem{kaplan2020scaling} +Jared Kaplan, Sam McCandlish, Tom Henighan, Tom~B. Brown, Benjamin Chess, Rewon + Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. +\newblock Scaling laws for neural language models, 2020. + +\bibitem{gpt3costlambda} +Chuan Li. +\newblock Demystifying gpt-3 language model: A technical overview, 2020. +\newblock "\url{https://lambdalabs.com/blog/demystifying-gpt-3}". + +\bibitem{mlperf} +Peter Mattson, Christine Cheng, Cody Coleman, Greg Diamos, Paulius + Micikevicius, David Patterson, Hanlin Tang, Gu-Yeon Wei, Peter Bailis, Victor + Bittorf, David Brooks, Dehao Chen, Debojyoti Dutta, Udit Gupta, Kim + Hazelwood, Andrew Hock, Xinyuan Huang, Bill Jia, Daniel Kang, David Kanter, + Naveen Kumar, Jeffery Liao, Guokai Ma, Deepak Narayanan, Tayo Oguntebi, + Gennady Pekhimenko, Lillian Pentecost, Vijay~Janapa Reddi, Taylor Robie, + Tom~St. John, Carole-Jean Wu, Lingjie Xu, Cliff Young, and Matei Zaharia. +\newblock {MLPerf Training Benchmark}. +\newblock In {\em {Proceedings of the 3rd Conference on Machine Learning and + Systems (MLSys'20)}}, 2020. + +\bibitem{valiant1990bridging} +Leslie~G Valiant. +\newblock A bridging model for parallel computation. +\newblock {\em Communications of the ACM}, 33(8):103--111, 1990. + +\bibitem{bandwidth_optimal_allreduce} +Pitch Patarasuk and Xin Yuan. +\newblock Bandwidth optimal all-reduce algorithms for clusters of workstations. +\newblock {\em J. Parallel Distrib. Comput.}, 69(2):117–124, February 2009. + +\bibitem{mcmahan2017communication} +Brendan McMahan, Eider Moore, Daniel Ramage, Seth Hampson, and Blaise~Aguera + y~Arcas. +\newblock Communication-efficient learning of deep networks from decentralized + data. +\newblock In {\em Artificial Intelligence and Statistics}, pages 1273--1282, + 2017. + +\bibitem{variability_azure} +V.~{Persico}, P.~{Marchetta}, A.~{Botta}, and A.~{Pescape}. +\newblock On network throughput variability in microsoft azure cloud. +\newblock In {\em 2015 IEEE Global Communications Conference (GLOBECOM)}, pages + 1--6, 2015. + +\bibitem{variability_aws} +Valerio Persico, Pietro Marchetta, Alessio Botta, and Antonio Pescapè. +\newblock Measuring network throughput in the cloud: The case of amazon ec2. +\newblock {\em Computer Networks}, 93:408 -- 422, 2015. +\newblock Cloud Networking and Communications II. + +\bibitem{lian2017can} +Xiangru Lian, Ce~Zhang, Huan Zhang, Cho-Jui Hsieh, Wei Zhang, and Ji~Liu. +\newblock Can decentralized algorithms outperform centralized algorithms? a + case study for decentralized parallel stochastic gradient descent. +\newblock In {\em Advances in Neural Information Processing Systems}, pages + 5330--5340, 2017. + +\bibitem{sgpush} +Mahmoud Assran, Nicolas Loizou, Nicolas Ballas, and Mike Rabbat. +\newblock Stochastic gradient push for distributed deep learning. 
+\newblock In Kamalika Chaudhuri and Ruslan Salakhutdinov, editors, {\em + Proceedings of the 36th International Conference on Machine Learning}, + volume~97 of {\em Proceedings of Machine Learning Research}, pages 344--353. + PMLR, 09--15 Jun 2019. + +\bibitem{goyal2017accurate} +Priya Goyal, Piotr Dollár, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, + Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. +\newblock Accurate, large minibatch sgd: Training imagenet in 1 hour, 2017. + +\bibitem{You2020Large} +Yang You, Jing Li, Sashank Reddi, Jonathan Hseu, Sanjiv Kumar, Srinadh + Bhojanapalli, Xiaodan Song, James Demmel, Kurt Keutzer, and Cho-Jui Hsieh. +\newblock Large batch optimization for deep learning: Training bert in 76 + minutes. +\newblock In {\em International Conference on Learning Representations}, 2020. + +\bibitem{parameter_server_first} +Mu~Li. +\newblock Scaling distributed machine learning with the parameter server. +\newblock In {\em Proceedings of the 2014 International Conference on Big Data + Science and Computing}, BigDataScience '14, New York, NY, USA, 2014. + Association for Computing Machinery. + +\bibitem{adam} +Diederik~P. Kingma and Jimmy Ba. +\newblock Adam: {A} method for stochastic optimization. +\newblock In {\em 3rd International Conference on Learning Representations, + {ICLR} 2015}, 2015. + +\bibitem{survey_distributed2} +Salem Alqahtani and Murat Demirbas. +\newblock Performance analysis and comparison of distributed machine learning + systems, 2019. + +\bibitem{survey_distributed} +Joost Verbraeken, Matthijs Wolting, Jonathan Katzy, Jeroen Kloppenburg, Tim + Verbelen, and Jan~S. Rellermeyer. +\newblock A survey on distributed machine learning. +\newblock {\em ACM Comput. Surv.}, 53(2), March 2020. + +\bibitem{localsgd_first} +Martin Zinkevich, Markus Weimer, Lihong Li, and Alex Smola. +\newblock Parallelized stochastic gradient descent. +\newblock In J.~Lafferty, C.~Williams, J.~Shawe-Taylor, R.~Zemel, and + A.~Culotta, editors, {\em Advances in Neural Information Processing Systems}, + volume~23, pages 2595--2603. Curran Associates, Inc., 2010. + +\bibitem{lin2018deep} +Yujun Lin, Song Han, Huizi Mao, Yu~Wang, and Bill Dally. +\newblock Deep gradient compression: Reducing the communication bandwidth for + distributed training. +\newblock In {\em International Conference on Learning Representations}, 2018. + +\bibitem{pmlr-v97-koloskova19a} +Anastasia Koloskova, Sebastian Stich, and Martin Jaggi. +\newblock Decentralized stochastic optimization and gossip algorithms with + compressed communication. +\newblock In Kamalika Chaudhuri and Ruslan Salakhutdinov, editors, {\em + Proceedings of the 36th International Conference on Machine Learning}, + volume~97 of {\em Proceedings of Machine Learning Research}, pages + 3478--3487. PMLR, 09--15 Jun 2019. + +\bibitem{sharded_ps_first} +Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, + Marc\textquotesingle~aurelio Ranzato, Andrew Senior, Paul Tucker, Ke~Yang, + Quoc Le, and Andrew Ng. +\newblock Large scale distributed deep networks. +\newblock In F.~Pereira, C.~J.~C. Burges, L.~Bottou, and K.~Q. Weinberger, + editors, {\em Advances in Neural Information Processing Systems}, volume~25, + pages 1223--1231. Curran Associates, Inc., 2012. + +\bibitem{byteps} +Yimin Jiang, Yibo Zhu, Chang Lan, Bairen Yi, Yong Cui, and Chuanxiong Guo. +\newblock A unified architecture for accelerating distributed {DNN} training in + heterogeneous gpu/cpu clusters. 
+\newblock In {\em 14th {USENIX} Symposium on Operating Systems Design and + Implementation ({OSDI} 20)}, pages 463--479. {USENIX} Association, November + 2020. + +\bibitem{mikami2019massively} +Hiroaki Mikami, Hisahiro Suganuma, Pongsakorn U-chupala, Yoshiki Tanaka, and + Yuichi Kageyama. +\newblock Massively distributed sgd: Imagenet/resnet-50 training in a flash, + 2019. + +\bibitem{shoeybi2019megatron} +Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, + and Bryan Catanzaro. +\newblock Megatron-lm: Training multi-billion parameter language models using + gpu model parallelism. +\newblock {\em arXiv preprint arXiv:1909.08053}, 2019. + +\bibitem{secure_aggregation} +Aaron Segal, Antonio Marcedone, Benjamin Kreuter, Daniel Ramage, H.~Brendan + McMahan, Karn Seth, K.~A. Bonawitz, Sarvar Patel, and Vladimir Ivanov. +\newblock Practical secure aggregation for privacy-preserving machine learning. +\newblock In {\em CCS}, 2017. + +\bibitem{federatedlearningatscale} +K.~A. Bonawitz, Hubert Eichner, Wolfgang Grieskamp, Dzmitry Huba, Alex + Ingerman, Vladimir Ivanov, Chloé~M Kiddon, Jakub Konečný, Stefano + Mazzocchi, Brendan McMahan, Timon~Van Overveldt, David Petrou, Daniel Ramage, + and Jason Roselander. +\newblock Towards federated learning at scale: System design. +\newblock In {\em SysML 2019}, 2019. +\newblock To appear. + +\bibitem{fed_intel} +Micah~J. Sheller, Brandon Edwards, G.~Anthony Reina, Jason Martin, Sarthak + Pati, Aikaterini Kotrotsou, Mikhail Milchenko, Weilin Xu, Daniel Marcus, + Rivka~R. Colen, and Spyridon Bakas. +\newblock Federated learning in medicine: facilitating multi-institutional + collaborations without sharing patient data. +\newblock {\em Scientific Reports}, 10(1):12598, Jul 2020. + +\bibitem{fed_nvidia} +Wenqi Li, Fausto Milletar{\`i}, Daguang Xu, Nicola Rieke, Jonny Hancox, Wentao + Zhu, Maximilian Baust, Yan Cheng, S{\'e}bastien Ourselin, {M. Jorge} Cardoso, + and Andrew Feng. +\newblock {\em Privacy-Preserving Federated Brain Tumour Segmentation}, pages + 133--141. +\newblock Lecture Notes in Computer Science (including subseries Lecture Notes + in Artificial Intelligence and Lecture Notes in Bioinformatics). SPRINGER, + January 2019. +\newblock 10th International Workshop on Machine Learning in Medical Imaging, + MLMI 2019 held in conjunction with the 22nd International Conference on + Medical Image Computing and Computer-Assisted Intervention, MICCAI 2019 ; + Conference date: 13-10-2019 Through 13-10-2019. + +\bibitem{fed_google1} +Andrew Hard, Chloé~M Kiddon, Daniel Ramage, Francoise Beaufays, Hubert + Eichner, Kanishka Rao, Rajiv Mathews, and Sean Augenstein. +\newblock Federated learning for mobile keyboard prediction, 2018. + +\bibitem{fed_google2} +Timothy Yang, Galen Andrew, Hubert Eichner, Haicheng Sun, Wei Li, Nicholas + Kong, Daniel Ramage, and Françoise Beaufays. +\newblock Applied federated learning: Improving google keyboard query + suggestions, 2018. + +\bibitem{volunteer_dl_async} +Ekasit Kijsipongse, Apivadee Piyatumrong, and Suriya U-ruekolan. +\newblock A hybrid gpu cluster and volunteer computing platform for scalable + deep learning. +\newblock {\em The Journal of Supercomputing}, 04 2018. + +\bibitem{learning_at_home} +Max Ryabinin and Anton Gusev. +\newblock Towards crowdsourced training of large neural networks using + decentralized mixture-of-experts. +\newblock In {\em Advances in Neural Information Processing Systems}, 2020. + +\bibitem{proteus} +Aaron Harlap, Alexey Tumanov, Andrew Chung, Gregory~R. 
Ganger, and Phillip~B. + Gibbons. +\newblock Proteus: Agile ml elasticity through tiered reliability in dynamic + resource markets. +\newblock In {\em Proceedings of the Twelfth European Conference on Computer + Systems}, EuroSys '17, page 589–604, New York, NY, USA, 2017. Association + for Computing Machinery. + +\bibitem{boyd2006randomized} +Stephen Boyd, Arpita Ghosh, Balaji Prabhakar, and Devavrat Shah. +\newblock Randomized gossip algorithms. +\newblock {\em IEEE transactions on information theory}, 52(6):2508--2530, + 2006. + +\bibitem{tsitsiklis1984problems} +John~Nikolas Tsitsiklis. +\newblock Problems in decentralized decision making and computation. +\newblock Technical report, Massachusetts Inst of Tech Cambridge Lab for + Information and Decision Systems, 1984. + +\bibitem{scaman2017optimal} +Kevin Scaman, Francis Bach, S{\'e}bastien Bubeck, Yin~Tat Lee, and Laurent + Massouli{\'e}. +\newblock Optimal algorithms for smooth and strongly convex distributed + optimization in networks. +\newblock In {\em International Conference on Machine Learning}, pages + 3027--3036, 2017. + +\bibitem{scaman2018optimal} +Kevin Scaman, Francis Bach, S{\'e}bastien Bubeck, Laurent Massouli{\'e}, and + Yin~Tat Lee. +\newblock Optimal algorithms for non-smooth distributed optimization in + networks. +\newblock In {\em Advances in Neural Information Processing Systems}, pages + 2740--2749, 2018. + +\bibitem{scaman2019optimal} +Kevin Scaman, Francis Bach, S{\'e}bastien Bubeck, Yin Lee, and Laurent + Massouli{\'e}. +\newblock Optimal convergence rates for convex distributed optimization in + networks. +\newblock {\em Journal of Machine Learning Research}, 20:1--31, 2019. + +\bibitem{assran2019stochastic} +Mahmoud Assran, Nicolas Loizou, Nicolas Ballas, and Mike Rabbat. +\newblock Stochastic gradient push for distributed deep learning. +\newblock In {\em International Conference on Machine Learning}, pages + 344--353. PMLR, 2019. + +\bibitem{xiao2004fast} +Lin Xiao and Stephen Boyd. +\newblock Fast linear iterations for distributed averaging. +\newblock {\em Systems \& Control Letters}, 53(1):65--78, 2004. + +\bibitem{merris1994laplacian} +Russell Merris. +\newblock Laplacian matrices of graphs: a survey. +\newblock {\em Linear algebra and its applications}, 197:143--176, 1994. + +\bibitem{uribe2020dual} +C{\'e}sar~A Uribe, Soomin Lee, Alexander Gasnikov, and Angelia Nedi{\'c}. +\newblock A dual approach for optimal algorithms in distributed optimization + over networks. +\newblock {\em Optimization Methods and Software}, pages 1--40, 2020. + +\bibitem{nedic2014distributed} +Angelia Nedi{\'c} and Alex Olshevsky. +\newblock Distributed optimization over time-varying directed graphs. +\newblock {\em IEEE Transactions on Automatic Control}, 60(3):601--615, 2014. + +\bibitem{nedic2016stochastic} +Angelia Nedi{\'c} and Alex Olshevsky. +\newblock Stochastic gradient-push for strongly convex functions on + time-varying directed graphs. +\newblock {\em IEEE Transactions on Automatic Control}, 61(12):3936--3947, + 2016. + +\bibitem{nedic2018network} +Angelia Nedi{\'c}, Alex Olshevsky, and Michael~G Rabbat. +\newblock Network topology and communication-computation tradeoffs in + decentralized optimization. +\newblock {\em Proceedings of the IEEE}, 106(5):953--976, 2018. + +\bibitem{rogozin2019projected} +Alexander Rogozin and Alexander Gasnikov. +\newblock Projected gradient method for decentralized optimization over + time-varying networks. +\newblock {\em arXiv preprint arXiv:1911.08527}, 2019. 
+ +\bibitem{ram2009asynchronous} +S~Sundhar Ram, A~Nedi{\'c}, and Venugopal~V Veeravalli. +\newblock Asynchronous gossip algorithms for stochastic optimization. +\newblock In {\em Proceedings of the 48h IEEE Conference on Decision and + Control (CDC) held jointly with 2009 28th Chinese Control Conference}, pages + 3581--3586. IEEE, 2009. + +\bibitem{yan2012distributed} +Feng Yan, Shreyas Sundaram, SVN Vishwanathan, and Yuan Qi. +\newblock Distributed autonomous online learning: Regrets and intrinsic + privacy-preserving properties. +\newblock {\em IEEE Transactions on Knowledge and Data Engineering}, + 25(11):2483--2493, 2012. + +\bibitem{yuan2016convergence} +Kun Yuan, Qing Ling, and Wotao Yin. +\newblock On the convergence of decentralized gradient descent. +\newblock {\em SIAM Journal on Optimization}, 26(3):1835--1854, 2016. + +\bibitem{torus_allreduce} +Paul Sack and William Gropp. +\newblock Collective algorithms for multiported torus networks. +\newblock {\em ACM Trans. Parallel Comput.}, 1(2), February 2015. + +\bibitem{kademlia} +Petar Maymounkov and David Mazieres. +\newblock Kademlia: A peer-to-peer information system based on the xor metric. +\newblock In {\em International Workshop on Peer-to-Peer Systems}, pages + 53--65. Springer, 2002. + +\bibitem{nemirovski2009robust} +Arkadi Nemirovski, Anatoli Juditsky, Guanghui Lan, and Alexander Shapiro. +\newblock Robust stochastic approximation approach to stochastic programming. +\newblock {\em SIAM Journal on optimization}, 19(4):1574--1609, 2009. + +\bibitem{ghadimi2013stochastic} +Saeed Ghadimi and Guanghui Lan. +\newblock Stochastic first-and zeroth-order methods for nonconvex stochastic + programming. +\newblock {\em SIAM Journal on Optimization}, 23(4):2341--2368, 2013. + +\bibitem{gower2019sgd} +Robert~Mansel Gower, Nicolas Loizou, Xun Qian, Alibek Sailanbayev, Egor + Shulgin, and Peter Richt{\'a}rik. +\newblock Sgd: General analysis and improved rates. +\newblock In {\em International Conference on Machine Learning}, pages + 5200--5209. PMLR, 2019. + +\bibitem{karimireddy2020scaffold} +Sai~Praneeth Karimireddy, Satyen Kale, Mehryar Mohri, Sashank Reddi, Sebastian + Stich, and Ananda~Theertha Suresh. +\newblock Scaffold: Stochastic controlled averaging for federated learning. +\newblock In {\em International Conference on Machine Learning}, pages + 5132--5143. PMLR, 2020. + +\bibitem{gorbunov2020local} +Eduard Gorbunov, Filip Hanzely, and Peter Richtarik. +\newblock Local sgd: Unified theory and new efficient methods. +\newblock In Arindam Banerjee and Kenji Fukumizu, editors, {\em Proceedings of + The 24th International Conference on Artificial Intelligence and Statistics}, + volume 130 of {\em Proceedings of Machine Learning Research}, pages + 3556--3564. PMLR, 13--15 Apr 2021. + +\bibitem{khaled2020tighter} +Ahmed Khaled, Konstantin Mishchenko, and Peter Richt{\'a}rik. +\newblock Tighter theory for local sgd on identical and heterogeneous data. +\newblock In {\em International Conference on Artificial Intelligence and + Statistics}, pages 4519--4529. PMLR, 2020. + +\bibitem{woodworth2020local} +Blake Woodworth, Kumar~Kshitij Patel, Sebastian Stich, Zhen Dai, Brian Bullins, + Brendan Mcmahan, Ohad Shamir, and Nathan Srebro. +\newblock Is local sgd better than minibatch sgd? +\newblock In {\em International Conference on Machine Learning}, pages + 10334--10343. PMLR, 2020. + +\bibitem{koloskova2020unified} +Anastasia Koloskova, Nicolas Loizou, Sadra Boreiri, Martin Jaggi, and Sebastian + Stich. 
+\newblock A unified theory of decentralized sgd with changing topology and + local updates. +\newblock In {\em International Conference on Machine Learning}, pages + 5381--5393. PMLR, 2020. + +\bibitem{li2019communication} +Xiang Li, Wenhao Yang, Shusen Wang, and Zhihua Zhang. +\newblock Communication efficient decentralized training with multiple local + updates. +\newblock {\em arXiv preprint arXiv:1910.09126}, 5, 2019. + +\bibitem{resnet} +Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. +\newblock Deep residual learning for image recognition. +\newblock {\em 2016 IEEE Conference on Computer Vision and Pattern Recognition + (CVPR)}, pages 770--778, 2015. + +\bibitem{ad_psgd} +Xiangru Lian, Wei Zhang, Ce~Zhang, and Ji~Liu. +\newblock Asynchronous decentralized parallel stochastic gradient descent. +\newblock In Jennifer Dy and Andreas Krause, editors, {\em Proceedings of the + 35th International Conference on Machine Learning}, volume~80 of {\em + Proceedings of Machine Learning Research}, pages 3043--3052. PMLR, 10--15 Jul + 2018. + +\bibitem{sukhov2016generating} +Andrei~M Sukhov, MA~Astrakhantseva, AK~Pervitsky, SS~Boldyrev, and AA~Bukatov. +\newblock Generating a function for network delay. +\newblock {\em Journal of High Speed Networks}, 22(4):321--333, 2016. + +\bibitem{bert} +Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. +\newblock Bert: Pre-training of deep bidirectional transformers for language + understanding. +\newblock In {\em NAACL-HLT}, 2019. + +\bibitem{roberta} +Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer + Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. +\newblock Roberta: A robustly optimized bert pretraining approach. +\newblock {\em ArXiv}, abs/1907.11692, 2019. + +\bibitem{radford2019language} +Alec Radford, Jeff Wu, Rewon Child, David Luan, Dario Amodei, and Ilya + Sutskever. +\newblock Language models are unsupervised multitask learners. +\newblock 2019. + +\bibitem{albert} +Zhen-Zhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, + and Radu Soricut. +\newblock Albert: A lite bert for self-supervised learning of language + representations. +\newblock In {\em International Conference on Learning Representations}, 2020. + +\bibitem{bookcorpus} +Yukun Zhu, Ryan Kiros, Rich Zemel, Ruslan Salakhutdinov, Raquel Urtasun, + Antonio Torralba, and Sanja Fidler. +\newblock Aligning books and movies: Towards story-like visual explanations by + watching movies and reading books. +\newblock In {\em Proceedings of the IEEE international conference on computer + vision}, pages 19--27, 2015. + +\bibitem{lin2020multinode} +Jiahuang Lin, Xin Li, and Gennady Pekhimenko. +\newblock Multi-node bert-pretraining: Cost-efficient approach, 2020. + +\bibitem{fedus2021switch} +William Fedus, Barret Zoph, and Noam Shazeer. +\newblock Switch transformers: Scaling to trillion parameter models with simple + and efficient sparsity, 2021. + +\bibitem{nvidia_perf} +NVIDIA. +\newblock Nvidia data center deep learning product performance. +\newblock + "\url{https://developer.nvidia.com/deep-learning-performance-training-inference}", + accessed at 2021.02.03. + +\bibitem{reddi2021adaptive} +Sashank~J. Reddi, Zachary Charles, Manzil Zaheer, Zachary Garrett, Keith Rush, + Jakub Kone{\v{c}}n{\'y}, Sanjiv Kumar, and Hugh~Brendan McMahan. +\newblock Adaptive federated optimization. +\newblock In {\em International Conference on Learning Representations}, 2021. 
+ +\bibitem{chen2020toward} +Xiangyi Chen, Xiaoyun Li, and Ping Li. +\newblock Toward communication efficient adaptive gradient method. +\newblock In {\em Proceedings of the 2020 ACM-IMS on Foundations of Data + Science Conference}, FODS '20, page 119–128, New York, NY, USA, 2020. + Association for Computing Machinery. + +\bibitem{squad} +Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. +\newblock Squad: 100, 000+ questions for machine comprehension of text. +\newblock In {\em EMNLP}, 2016. + +\bibitem{aldous2002reversible} +David Aldous and James~Allen Fill. +\newblock Reversible markov chains and random walks on graphs, 2002. unfinished + monograph, recompiled 2014, 2002. + +\bibitem{xu2020distributed} +Jinming Xu, Ye~Tian, Ying Sun, and Gesualdo Scutari. +\newblock Distributed algorithms for composite optimization: Unified and tight + convergence analysis. +\newblock {\em arXiv preprint arXiv:2002.11534}, 2020. + +\bibitem{fallah2019robust} +Alireza Fallah, Mert Gurbuzbalaban, Asu Ozdaglar, Umut Simsekli, and Lingjiong + Zhu. +\newblock Robust distributed accelerated stochastic gradient methods for + multi-agent networks. +\newblock {\em arXiv preprint arXiv:1910.08701}, 2019. + +\bibitem{kovalev2020optimal} +Dmitry Kovalev, Adil Salim, and Peter Richt{\'a}rik. +\newblock Optimal and practical algorithms for smooth and strongly convex + decentralized optimization. +\newblock {\em Advances in Neural Information Processing Systems}, 33, 2020. + +\bibitem{arjevani2015communication} +Yossi Arjevani and Ohad Shamir. +\newblock Communication complexity of distributed convex learning and + optimization. +\newblock {\em Advances in neural information processing systems}, + 28:1756--1764, 2015. + +\bibitem{seide20141} +Frank Seide, Hao Fu, Jasha Droppo, Gang Li, and Dong Yu. +\newblock 1-bit stochastic gradient descent and its application to + data-parallel distributed training of speech dnns. +\newblock In {\em Fifteenth Annual Conference of the International Speech + Communication Association}, 2014. + +\bibitem{alistarh2017qsgd} +Dan Alistarh, Demjan Grubic, Jerry~Z Li, Ryota Tomioka, and Milan Vojnovic. +\newblock Qsgd: communication-efficient sgd via gradient quantization and + encoding. +\newblock In {\em Proceedings of the 31st International Conference on Neural + Information Processing Systems}, pages 1707--1718, 2017. + +\bibitem{suresh2017distributed} +Ananda~Theertha Suresh, X~Yu Felix, Sanjiv Kumar, and H~Brendan McMahan. +\newblock Distributed mean estimation with limited communication. +\newblock In {\em International Conference on Machine Learning}, pages + 3329--3337. PMLR, 2017. + +\bibitem{ramezani2021nuqsgd} +Ali Ramezani-Kebrya, Fartash Faghri, Ilya Markov, Vitalii Aksenov, Dan + Alistarh, and Daniel~M Roy. +\newblock Nuqsgd: Provably communication-efficient data-parallel sgd via + nonuniform quantization. +\newblock {\em Journal of Machine Learning Research}, 22(114):1--43, 2021. + +\bibitem{faghri2020adaptive} +Fartash Faghri, Iman Tabrizian, Ilia Markov, Dan Alistarh, Daniel~M Roy, and + Ali Ramezani-Kebrya. +\newblock Adaptive gradient quantization for data-parallel sgd. +\newblock {\em Advances in Neural Information Processing Systems}, + 33:3174--3185, 2020. + +\bibitem{horvath2019natural} +Samuel Horvath, Chen-Yu Ho, Ludovit Horvath, Atal~Narayan Sahu, Marco Canini, + and Peter Richtarik. +\newblock Natural compression for distributed deep learning. +\newblock {\em arXiv preprint arXiv:1905.10988}, 2019. 
+ +\bibitem{beznosikov2020biased} +Aleksandr Beznosikov, Samuel Horv{\'a}th, Peter Richt{\'a}rik, and Mher + Safaryan. +\newblock On biased compression for distributed learning. +\newblock {\em arXiv preprint arXiv:2002.12410}, 2020. + +\bibitem{wen2017terngrad} +Wei Wen, Cong Xu, Feng Yan, Chunpeng Wu, Yandan Wang, Yiran Chen, and Hai Li. +\newblock Terngrad: ternary gradients to reduce communication in distributed + deep learning. +\newblock In {\em Proceedings of the 31st International Conference on Neural + Information Processing Systems}, pages 1508--1518, 2017. + +\bibitem{mishchenko2019distributed} +Konstantin Mishchenko, Eduard Gorbunov, Martin Tak{\'a}{\v{c}}, and Peter + Richt{\'a}rik. +\newblock Distributed learning with compressed gradient differences. +\newblock {\em arXiv preprint arXiv:1901.09269}, 2019. + +\bibitem{horvath2019stochastic} +Samuel Horv{\'a}th, Dmitry Kovalev, Konstantin Mishchenko, Sebastian Stich, and + Peter Richt{\'a}rik. +\newblock Stochastic distributed learning with gradient quantization and + variance reduction. +\newblock {\em arXiv preprint arXiv:1904.05115}, 2019. + +\bibitem{li2020acceleration} +Zhize Li, Dmitry Kovalev, Xun Qian, and Peter Richtarik. +\newblock Acceleration for compressed gradient descent in distributed and + federated optimization. +\newblock In {\em International Conference on Machine Learning}, pages + 5895--5904. PMLR, 2020. + +\bibitem{gorbunov2020linearly} +Eduard Gorbunov, Dmitry Kovalev, Dmitry Makarenko, and Peter Richtarik. +\newblock Linearly converging error compensated sgd. +\newblock In H.~Larochelle, M.~Ranzato, R.~Hadsell, M.~F. Balcan, and H.~Lin, + editors, {\em Advances in Neural Information Processing Systems}, volume~33, + pages 20889--20900. Curran Associates, Inc., 2020. + +\bibitem{philippenko2020artemis} +Constantin Philippenko and Aymeric Dieuleveut. +\newblock Artemis: tight convergence guarantees for bidirectional compression + in federated learning. +\newblock {\em arXiv preprint arXiv:2006.14591}, 2020. + +\bibitem{li2020unified} +Zhize Li and Peter Richt{\'a}rik. +\newblock A unified analysis of stochastic gradient methods for nonconvex + federated optimization. +\newblock {\em arXiv preprint arXiv:2006.07013}, 2020. + +\bibitem{haddadpour2020federated} +Farzin Haddadpour, Mohammad~Mahdi Kamani, Aryan Mokhtari, and Mehrdad Mahdavi. +\newblock Federated learning with compression: Unified analysis and sharp + guarantees. +\newblock {\em arXiv preprint arXiv:2007.01154}, 2020. + +\bibitem{das2020improved} +Rudrajit Das, Abolfazl Hashemi, Sujay Sanghavi, and Inderjit~S Dhillon. +\newblock Improved convergence rates for non-convex federated learning with + compression. +\newblock {\em arXiv preprint arXiv:2012.04061}, 2020. + +\bibitem{pmlr-v139-gorbunov21a} +Eduard Gorbunov, Konstantin~P. Burlachenko, Zhize Li, and Peter Richtarik. +\newblock Marina: Faster non-convex distributed learning with compression. +\newblock In Marina Meila and Tong Zhang, editors, {\em Proceedings of the 38th + International Conference on Machine Learning}, volume 139 of {\em Proceedings + of Machine Learning Research}, pages 3788--3798. PMLR, 18--24 Jul 2021. + +\bibitem{stich2018sparsified} +Sebastian~U Stich, Jean-Baptiste Cordonnier, and Martin Jaggi. +\newblock Sparsified sgd with memory. +\newblock In {\em Proceedings of the 32nd International Conference on Neural + Information Processing Systems}, pages 4452--4463, 2018. 
+ +\bibitem{karimireddy2019error} +Sai~Praneeth Karimireddy, Quentin Rebjock, Sebastian Stich, and Martin Jaggi. +\newblock Error feedback fixes signsgd and other gradient compression schemes. +\newblock In {\em International Conference on Machine Learning}, pages + 3252--3261. PMLR, 2019. + +\bibitem{qian2020error} +Xun Qian, Peter Richt{\'a}rik, and Tong Zhang. +\newblock Error compensated distributed sgd can be accelerated. +\newblock {\em arXiv preprint arXiv:2010.00091}, 2020. + +\bibitem{reisizadeh2019exact} +Amirhossein Reisizadeh, Aryan Mokhtari, Hamed Hassani, and Ramtin Pedarsani. +\newblock An exact quantized decentralized gradient descent algorithm. +\newblock {\em IEEE Transactions on Signal Processing}, 67(19):4934--4947, + 2019. + +\bibitem{kovalev2020linearly} +Dmitry Kovalev, Anastasia Koloskova, Martin Jaggi, Peter Richtarik, and + Sebastian Stich. +\newblock A linearly convergent algorithm for decentralized optimization: + Sending less bits for free! +\newblock In Arindam Banerjee and Kenji Fukumizu, editors, {\em Proceedings of + The 24th International Conference on Artificial Intelligence and Statistics}, + volume 130 of {\em Proceedings of Machine Learning Research}, pages + 4087--4095. PMLR, 13--15 Apr 2021. + +\bibitem{Koloskova2020Decentralized} +Anastasia Koloskova, Tao Lin, Sebastian~U Stich, and Martin Jaggi. +\newblock Decentralized deep learning with arbitrary communication compression. +\newblock In {\em International Conference on Learning Representations}, 2020. + +\bibitem{konevcny2016federated} +Jakub Kone{\v{c}}n{\`y}, H~Brendan McMahan, Felix~X Yu, Peter Richt{\'a}rik, + Ananda~Theertha Suresh, and Dave Bacon. +\newblock Federated learning: Strategies for improving communication + efficiency. +\newblock {\em arXiv preprint arXiv:1610.05492}, 2016. + +\bibitem{kairouz2019advances} +Peter Kairouz, H~Brendan McMahan, Brendan Avent, Aur{\'e}lien Bellet, Mehdi + Bennis, Arjun~Nitin Bhagoji, Keith Bonawitz, Zachary Charles, Graham Cormode, + Rachel Cummings, et~al. +\newblock Advances and open problems in federated learning. +\newblock {\em arXiv preprint arXiv:1912.04977}, 2019. + +\bibitem{Stich18local} +Sebastian~Urban Stich. +\newblock Local {SGD} converges fast and communicates little. +\newblock {\em International Conference on Learning Representations (ICLR)}, + page arXiv:1805.09767, 2019. + +\bibitem{LinSPJ2018local} +Tao Lin, Sebastian~Urban Stich, Kumar~Kshitij Patel, and Martin Jaggi. +\newblock Don't use large mini-batches, use local {SGD}. +\newblock {\em ICLR}, page arXiv:1808.07217, 2020. + +\bibitem{woodworth2020minibatch} +Blake Woodworth, Kumar~Kshitij Patel, and Nathan Srebro. +\newblock Minibatch vs local sgd for heterogeneous distributed learning. +\newblock {\em arXiv preprint arXiv:2006.04735}, 2020. + +\bibitem{yuan2020federated} +Honglin Yuan and Tengyu Ma. +\newblock Federated accelerated stochastic gradient descent. +\newblock {\em Advances in Neural Information Processing Systems}, 33, 2020. + +\bibitem{basu2019qsparse} +Debraj Basu, Deepesh Data, Can Karakus, and Suhas Diggavi. +\newblock Qsparse-local-{SGD}: Distributed {SGD} with quantization, + sparsification and local computations. +\newblock In {\em Advances in Neural Information Processing Systems}, pages + 14668--14679, 2019. + +\bibitem{yuan2020federated_comp} +Honglin Yuan, Manzil Zaheer, and Sashank Reddi. +\newblock Federated composite optimization. +\newblock {\em arXiv preprint arXiv:2011.08474}, 2020. 
+ +\bibitem{assran2020advances} +Mahmoud Assran, Arda Aytekin, Hamid~Reza Feyzmahdavian, Mikael Johansson, and + Michael~G Rabbat. +\newblock Advances in asynchronous parallel and distributed optimization. +\newblock {\em Proceedings of the IEEE}, 108(11):2013--2031, 2020. + +\bibitem{recht2011hogwild} +Benjamin Recht, Christopher Re, Stephen Wright, and Feng Niu. +\newblock Hogwild: A lock-free approach to parallelizing stochastic gradient + descent. +\newblock In {\em Advances in neural information processing systems}, pages + 693--701, 2011. + +\bibitem{zhao2016fast} +Shen-Yi Zhao and Wu-Jun Li. +\newblock Fast asynchronous parallel stochastic gradient descent: A lock-free + approach with convergence guarantee. +\newblock In {\em Proceedings of the AAAI Conference on Artificial + Intelligence}, volume~30, 2016. + +\bibitem{leblond2017asaga} +R{\'e}mi Leblond, Fabian Pedregosa, and Simon Lacoste-Julien. +\newblock Asaga: asynchronous parallel saga. +\newblock In {\em Artificial Intelligence and Statistics}, pages 46--54. PMLR, + 2017. + +\bibitem{peng2016arock} +Zhimin Peng, Yangyang Xu, Ming Yan, and Wotao Yin. +\newblock Arock: an algorithmic framework for asynchronous parallel coordinate + updates. +\newblock {\em SIAM Journal on Scientific Computing}, 38(5):A2851--A2879, 2016. + +\bibitem{mishchenko2018delay} +Konstantin Mishchenko, Franck Iutzeler, J{\'e}r{\^o}me Malick, and Massih-Reza + Amini. +\newblock A delay-tolerant proximal-gradient algorithm for distributed + learning. +\newblock In {\em International Conference on Machine Learning}, pages + 3587--3595. PMLR, 2018. + +\bibitem{agarwal2011distributed} +Alekh Agarwal and John~C Duchi. +\newblock Distributed delayed stochastic optimization. +\newblock In {\em Proceedings of the 24th International Conference on Neural + Information Processing Systems}, pages 873--881, 2011. + +\bibitem{feyzmahdavian2016asynchronous} +Hamid~Reza Feyzmahdavian, Arda Aytekin, and Mikael Johansson. +\newblock An asynchronous mini-batch algorithm for regularized stochastic + optimization. +\newblock {\em IEEE Transactions on Automatic Control}, 61(12):3740--3754, + 2016. + +\bibitem{arjevani2020tight} +Yossi Arjevani, Ohad Shamir, and Nathan Srebro. +\newblock A tight convergence analysis for stochastic gradient descent with + delayed updates. +\newblock In {\em Algorithmic Learning Theory}, pages 111--132. PMLR, 2020. + +\bibitem{chord} +Hari Balakrishnan, M~Frans Kaashoek, David Karger, Robert Morris, and Ion + Stoica. +\newblock Looking up data in p2p systems. +\newblock {\em Communications of the ACM}, 46(2):43--48, 2003. + +\bibitem{kaplan1974application} +Seymour Kaplan. +\newblock Application of programs with maximin objective functions to problems + of optimal resource allocation. +\newblock {\em Operations Research}, 22(4):802--807, 1974. + +\bibitem{andersen} +Erling~D. Andersen and Knud~D. Andersen. +\newblock The mosek interior point optimizer for linear programming: An + implementation of the homogeneous algorithm. +\newblock In {\em Applied Optimization}, pages 197--232. Springer {US}, 2000. + +\bibitem{MLSYS2019_d09bf415} +Anand Jayarajan, Jinliang Wei, Garth Gibson, Alexandra Fedorova, and Gennady + Pekhimenko. +\newblock Priority-based parameter propagation for distributed dnn training. +\newblock In A.~Talwalkar, V.~Smith, and M.~Zaharia, editors, {\em Proceedings + of Machine Learning and Systems}, volume~1, pages 132--145, 2019. 
+ +\end{thebibliography} diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/main.tex b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/main.tex new file mode 100644 index 00000000..b88399d4 --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/main.tex @@ -0,0 +1,222 @@ +\documentclass[letterpaper]{article} + + +\usepackage[final,nonatbib]{neurips_2021} + + + + +\usepackage[utf8]{inputenc} % +\usepackage[T1]{fontenc} % +\usepackage{hyperref} % +\usepackage{url} % +\usepackage{booktabs} % +\usepackage{amsfonts} % +\usepackage{nicefrac} % +\usepackage{microtype} % +\usepackage{xcolor} % +\usepackage{graphicx} +\usepackage{subcaption} +\usepackage{booktabs} % +\usepackage{lipsum} +\usepackage{amsmath} +\usepackage{amssymb, amsthm, latexsym} +\usepackage{multirow} +\usepackage{wrapfig} + + +\usepackage{algorithm,algorithmic} + + + +\usepackage{enumitem} +\usepackage{caption} +\setlist[itemize]{itemsep=0pt} + + +\newcommand{\Exp}{\mathbf{E}} +\newcommand{\Prob}{\mathbf{P}} +\newcommand{\R}{\mathbb{R}} +\newcommand{\eqdef}{\stackrel{\text{def}}{=}} +\newcommand{\ve}[2]{\left\langle #1 , #2 \right\rangle} +\def\<#1,#2>{\left\langle #1,#2\right\rangle} + +\usepackage{thmtools} + +\newtheorem{lemma}{Lemma}[section] +\newtheorem{theorem}{Theorem}[section] +\newtheorem{definition}{Definition}[section] +\newtheorem{proposition}{Proposition}[section] +\newtheorem{assumption}{Assumption}[section] +\newtheorem{corollary}{Corollary}[section] +\newtheorem{remark}{Remark}[section] + + + + + +\newcommand\tagthis{\addtocounter{equation}{1}\tag{\theequation}} +\newcommand{\argmin}{\mathop{\arg\!\min}} + +\newcommand{\cA}{{\cal A}} +\newcommand{\cB}{{\cal B}} +\newcommand{\cC}{{\cal C}} +\newcommand{\cD}{{\cal D}} +\newcommand{\cE}{{\cal E}} +\newcommand{\cF}{{\cal F}} +\newcommand{\cG}{{\cal G}} +\newcommand{\cH}{{\cal H}} +\newcommand{\cJ}{{\cal J}} +\newcommand{\cK}{{\cal K}} +\newcommand{\cL}{{\cal L}} +\newcommand{\cM}{{\cal M}} +\newcommand{\cN}{{\cal N}} +\newcommand{\cO}{{\cal O}} +\newcommand{\cP}{{\cal P}} +\newcommand{\cQ}{{\cal Q}} +\newcommand{\cR}{{\cal R}} +\newcommand{\cS}{{\cal S}} +\newcommand{\cT}{{\cal T}} +\newcommand{\cU}{{\cal U}} +\newcommand{\cV}{{\cal V}} +\newcommand{\cX}{{\cal X}} +\newcommand{\cY}{{\cal Y}} +\newcommand{\cW}{{\cal W}} +\newcommand{\cZ}{{\cal Z}} +\newcommand{\Var}{\mathrm{Var}} + +\newcommand{\mA}{{\bf A}} +\newcommand{\mB}{{\bf B}} +\newcommand{\mC}{{\bf C}} +\newcommand{\mE}{{\bf E}} +\newcommand{\mF}{{\bf F}} +\newcommand{\mG}{{\bf G}} +\newcommand{\mH}{{\bf H}} +\newcommand{\mI}{{\bf I}} +\newcommand{\mJ}{{\bf J}} +\newcommand{\mK}{{\bf K}} +\newcommand{\mL}{{\bf L}} +\newcommand{\mM}{{\bf M}} +\newcommand{\mN}{{\bf N}} +\newcommand{\mO}{{\bf O}} +\newcommand{\mP}{{\bf P}} +\newcommand{\mQ}{{\bf Q}} +\newcommand{\mR}{{\bf R}} +\newcommand{\mS}{{\bf S}} +\newcommand{\mT}{{\bf T}} +\newcommand{\mU}{{\bf U}} +\newcommand{\mV}{{\bf V}} +\newcommand{\mW}{{\bf W}} +\newcommand{\mX}{{\bf X}} +\newcommand{\mY}{{\bf Y}} +\newcommand{\mZ}{{\bf Z}} + +\newcommand{\sign}{\mathrm{sign}} +\newcommand{\cnorm}{\omega} +\newcommand{\EE}{\mathbb{E}} +\newcommand{\PP}{\mathbb{P}} +\newcommand{\VV}{\mathbb{V}} + +\newcommand{\prox}{\mathop{\mathrm{prox}}\nolimits} +\newcommand{\proxR}{\prox_{\gamma R}} +\newcommand{\proxkR}{\prox_{\gamma^k R}} +\newcommand{\mean}{\overline} +\newcommand{\sumin}{\sum_{i=1}^n} + + +\newcommand{\Mod}[1]{\ \mathrm{mod}\ #1} + +\title{Moshpit SGD: Communication-Efficient\\ Decentralized Training\\ on Heterogeneous Unreliable 
Devices} + +\author{% + Max Ryabinin\thanks{Equal contribution. Correspondence to \texttt{mryabinin0@gmail.com}.} \\ + Yandex, Russia\\ + HSE University, Russia\\ + \And + Eduard Gorbunov\footnotemark[1]\\ + MIPT, Russia\\ + HSE University, Russia\\ + Yandex, Russia\\ + \And + Vsevolod Plokhotnyuk\\ + Yandex, Russia\\ + HSE University, Russia\\ + \And + Gennady Pekhimenko\\ + University of Toronto, Canada\\ + Vector Institute, Canada +} + +\begin{document} + +\maketitle + +\begin{abstract} +Training deep neural networks on large datasets can often be accelerated by using multiple compute nodes. +This approach, known as distributed training, can utilize hundreds of computers via specialized message-passing protocols such as Ring All-Reduce. +However, running these protocols at scale requires reliable high-speed networking that is only available in dedicated clusters. +In contrast, many real-world applications, such as federated learning and cloud-based distributed training, operate on unreliable devices with unstable network bandwidth. +As a result, these applications are restricted to using parameter servers or gossip-based averaging protocols. +In this work, we lift that restriction by proposing Moshpit All-Reduce --- an iterative averaging protocol that exponentially converges to the global average. +We demonstrate the efficiency of our protocol for distributed optimization with strong theoretical guarantees. +The experiments show 1.3x speedup for ResNet-50 training on ImageNet compared to competitive gossip-based strategies and 1.5x speedup when training ALBERT-large on preemptible compute nodes. +\end{abstract} + +\input{intro.tex} + +\input{related.tex} + +\input{method.tex} + +\input{experiments.tex} + +\vspace{-6pt} +\section{Conclusion and future work} +\vspace{-4pt} +In this work, we propose Moshpit All-Reduce, a decentralized averaging protocol intended for distributed optimization in unstable and network-constrained environments. It has favorable theoretical properties when compared to gossip-based approaches and achieves considerable speedups in distributed training for image classification and masked language modeling. + +Our approach was primarily designed for cloud-based training and federated learning, as well as for distributed training on unreliable instances; future work might explore additional settings, such as collaborative training of neural networks. +Another potential research direction is to study the interactions of Moshpit All-Reduce with other methods that improve communication efficiency of distributed optimization, such as gradient compression. +Finally, the idea of arranging All-Reduce nodes into groups can be improved to address specific issues that may arise in practice, such as the varying number of workers and their geographical distribution. + +\vspace{-6pt} +\section*{Acknowledgements} +\vspace{-4pt} +We would like to thank Anastasia Koloskova, Liudmila Prokhorenkova and Anton Osokin for helpful feedback and discussions. We are also grateful to the anonymous reviewers for their suggestions on improving the paper. Finally, we would like to thank Dmitry Afanasiev, Vladimir Aliev, Anand Jayarajan and Michael Solotky for their suggestions on the technical aspects of our study. +This project was supported in +part by the Canada Foundation for Innovation JELF grant, +NSERC Discovery grant, AWS Machine Learning Research +Award, and Facebook Faculty Research Award. 
The paper was also partially supported by a grant for research centers in the field of artificial intelligence, provided by the Analytical Center for the Government of the Russian Federation in accordance with the subsidy agreement (agreement identifier 000000D730321P5Q0002) and the agreement with the Moscow Institute of Physics and Technology dated November 1, 2021 No. 70-2021-00138. The computational resources for the experiments were provided by the Amazon Research Awards program and Yandex. + +\bibliographystyle{unsrt} +\bibliography{bibliography} + + + + + + + + + + +\clearpage +\part*{Supplementary Material} +\appendix + +\input{cloud_costs.tex} + +\input{post_related.tex} + +\input{proofs_mixing.tex} + +\input{proofs_opt.tex} + +\input{matchmaking.tex} +\input{load_balancing} + +\input{detailed_setup} + +\input{extra_plots} + +\end{document} \ No newline at end of file diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/matchmaking.tex b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/matchmaking.tex new file mode 100644 index 00000000..826a0911 --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/matchmaking.tex @@ -0,0 +1,24 @@ +\section{Decentralized matchmaking} +\label{sect:matchmaking} + +In order to run group all-reduce over unreliable devices, Moshpit Averaging must be able to dynamically form groups of active devices that share the same key $C_i$. +In theory, this matchmaking can be implemented precisely as described in Algorithm~\ref{alg:moshpit}: each peer adds itself to a certain DHT key, waits for a set period of time, and then reads the same key to retrieve a list of its groupmates. + +However, in practice, this kind of matchmaking would be extremely fragile: if any peer arrives late (for example, due to latency), it may join the group when other peers have already finished matchmaking. As a result, some workers will treat this peer as active, while others will behave as though there is no such peer at all, breaking the consensus and rendering all peers unable to run all-reduce in a stable manner. + +To avoid this and other similar inconsistencies, Moshpit All-Reduce employs a more sophisticated matchmaking protocol with the following guarantees: +\begin{enumerate} + \item Peers that join the same group are guaranteed to have the same list of groupmates; + \item The group will have the maximum possible number of peers, unless some of them fail; + \item If some peers fail, matchmaking will still form the group out of the remaining ones. +\end{enumerate} + +To achieve this, each peer first declares itself onto the DHT (as in Algorithm~\ref{alg:moshpit}). Then, peers attempt to form groups by calling the \texttt{REQUEST\_JOIN\_GROUP} remote procedure call. Intuitively, if peer A calls this RPC on peer B, then \textit{peer A requests to join peer B's group}, which can be either accepted or rejected by the group ``leader'' B, which may or may not have other ``followers''. + +If a peer is accepted into a group, it commits to stay active (i.e. to await other peers) for a set period of time and perform all-reduce with the peers supplied by the group ``leader''. On the other hand, a peer can be rejected if (a) the potential ``leader'' is already a follower in another group, (b) the group is already running all-reduce, or (c) the ``leader'' failed or left during matchmaking.
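For concreteness, the sketch below restates these accept/reject rules of the \texttt{REQUEST\_JOIN\_GROUP} handler in Python; the \texttt{MatchmakingState} fields and function names are hypothetical placeholders and do not correspond to the released implementation.

\begin{verbatim}
# Minimal sketch of the REQUEST_JOIN_GROUP accept/reject rules described above.
# All class, field and function names are hypothetical placeholders.
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class MatchmakingState:
    peer_id: str
    is_follower_of: Optional[str] = None  # set once this peer joins another leader's group
    allreduce_started: bool = False       # set once this peer's group starts all-reduce
    followers: List[str] = field(default_factory=list)

def request_join_group(leader: MatchmakingState, requester_id: str) -> bool:
    """Accept/reject logic run by the would-be leader when another peer asks to join."""
    if leader.is_follower_of is not None:
        return False  # (a) the potential leader is already a follower in another group
    if leader.allreduce_started:
        return False  # (b) the group is already running all-reduce
    leader.followers.append(requester_id)
    return True       # accepted: the requester awaits the group and joins its all-reduce

# Case (c), a leader that failed or left during matchmaking, surfaces on the caller's
# side as an RPC error or timeout; the requester then approaches another candidate
# leader (see the priority rule below).
\end{verbatim}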
+ +To ensure that this protocol forms groups of maximum size, each peer generates a unique ``priority'' based on its local timestamp\footnote{More specifically, the priority is a tuple of $\texttt{(timestamp, peer\_id)}$, where \texttt{peer\_id} is used to break ties.}. Peers prioritize joining the group of neighbors that have the lowest ``priority''. Under normal circumstances, all workers will join the group of a peer that was first to start matchmaking according to its own local time. However, if this peer has failed or already finished matchmaking, the group will be formed around one of the remaining peers. + +Matchmaking for 64 peers can take less than 1 second if all workers are located in the same cloud region and are highly synchronized. However, this can grow to 2.9 seconds for two different cloud regions and up to 9 seconds when training with commodity hardware around the world. + +To ensure that this latency does not affect the training performance, Moshpit SGD performs matchmaking asynchronously in a background thread, while the model is accumulating gradients. All peers begin matchmaking 15 seconds before the estimated averaging round, so that in $\ge 95\%$ of averaging iterations, the matchmaking step is already finished by the time peers need to run all-reduce. \ No newline at end of file diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/method.tex b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/method.tex new file mode 100644 index 00000000..29aa318e --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/method.tex @@ -0,0 +1,210 @@ +\section{Moshpit SGD}\label{sect:method} + + +Large-scale training with unreliable participants requires a protocol that is both communication-efficient and fault-tolerant. Unfortunately, existing methods provide only one of these properties. To better address our conditions, we propose Moshpit All-Reduce --- a fully decentralized averaging protocol that combines the efficiency of all-reduce and the fault tolerance of gossip-based averaging. + +The rest of this section is organized as follows: +\begin{itemize} + \item Section~\ref{sect:method_algorithm} describes the protocol and proves its correctness and communication efficiency; + \item Section~\ref{sect:method_convergence} provides the analysis of the protocol and proves an exponential convergence rate for averaging and a rate matching that of centralized Local-SGD for optimization; + \item Section~\ref{sect:method_implementation_details} contains implementation details for training with heterogeneous compute nodes. +\end{itemize} + +\subsection{Moshpit All-Reduce} +\label{sect:method_algorithm} + +The core idea of Moshpit All-Reduce is that workers perform averaging in small independent groups. That way, a single failed participant would only affect its current group. In turn, the composition of each group should be chosen dynamically to converge in the least number of steps. +Ideally, if there are 9 peers with local parameters $\theta$, we can average them in 2 rounds, as demonstrated in Figure~\ref{fig:square_allreduce}: + +\vspace{-4pt} +\noindent +\begin{minipage}{0.45\textwidth} +\centering +\includegraphics[width=\textwidth]{resources/moshpit.pdf} +\captionof{figure}{Example averaging order for 9 peers in 2 rounds.
On each round, peers are split into 3 groups that run All-Reduce in parallel.} +\label{fig:square_allreduce} +\end{minipage} +\begin{minipage}{0.55\textwidth} +\begin{algorithm}[H] +\caption{Moshpit All-Reduce (for $i$-th peer)} + \label{alg:moshpit} +\begin{algorithmic}[H] + \STATE {\bfseries Input:} parameters $\{\theta_j\}_{j=1}^N$, number of peers $N$, $d$, $M$, number of iterations $T$, peer index $i$ + + $\theta_{i}^0 := \theta_i$ + + $C^0_i :=\texttt{get\_initial\_index(i)}$ + + \FOR{$t \in 1 \dots T$} + \STATE $\texttt{DHT}[C^{t-1}_i, t].\texttt{add}(\texttt{address}_i)$ + + \STATE \texttt{Matchmaking()} // wait for peers to assemble + + \STATE $\texttt{peers}_t := \texttt{DHT}.\texttt{get}([C^{t-1}_i, t])$ + + \STATE $\theta_{i}^t, c^t_i := \texttt{AllReduce}(\theta_{i}^{t - 1}, \texttt{peers}_t)$ + + \STATE $C^t_i := (C^{t-1}_i\texttt{[1:]}, c^t_i)$ // same as eq. (1) + \ENDFOR + \STATE {\bfseries Return} $\theta^T_i$ +\end{algorithmic} +\end{algorithm} +\end{minipage} + +To achieve this in a decentralized system, we use Distributed Hash Tables (DHT) --- a decentralized key-value storage; \autoref{sect:post_related} contains its more detailed description. On each averaging round: +\begin{itemize} + \item Each worker computes its group key $C_i$; + \item Workers add their network addresses to the DHT key corresponding to $C_i$; + \item Each worker can now fetch a full list of peers that have the same $C_i$ and run All-Reduce with those peers. +\end{itemize} + +Unfortunately, the averaging structure from Figure~\ref{fig:square_allreduce} is impossible to maintain when participants are constantly joining, leaving, and failing. However, we can achieve equivalent results without global structure using a simple rule: \textit{if two peers were in the same group in round $t$, they must choose different groups in round $t {+} 1$.} + +A natural way to enforce this rule is to take advantage of the chunk indices from Butterfly All-Reduce (see Figure~\ref{fig:butterfly_allreduce}). Recall that each worker accumulates a \textit{unique} chunk of parameters defined by an index $c_i$. By setting $C_i := c_i$, we can guarantee that any workers that were in the same group at a round $t$ will have different group indices in round $t {+} 1$. + +This averaging scheme can be generalized to more than two dimensions in order to fit a larger number of peers or reduce the group size. For a $d$-dimensional hypercube, nodes should find groups of peers that they have not communicated with during $d {-} 1$ previous rounds. To that end, we define $C_i$ as tuples containing chunk indices from $d{-}1$ previous rounds ($t$ denotes the communication round): +\vspace{-2pt} +\begin{equation} + C^t_i := (c^{t-d+1}_i, c^{t-d+2}_i, \ldots, c^{t }_i). + \label{eq:group} +\end{equation} + +The above intuition can be formalized with Algorithm \ref{alg:moshpit}. +Here, $N$ peers form a virtual $d$-dimensional grid with $M$ peers per row and average their parameters $\theta_i$ over $T$ rounds. $\texttt{DHT}[\cdot]$ is a shortcut for using the DHT to add or retrieve values for a given key. The \texttt{Matchmaking} step corresponds to the decentralized matchmaking procedure that organizes active workers with the same index into groups, described in detail in ~\autoref{sect:matchmaking}. In turn, \texttt{AllReduce} denotes running all-reduce to compute the average $\theta$ in a given group. 
The \texttt{get\_initial\_index} function takes the peer index $i$ and returns $d{-}1$ integers
+in the range $[0, M)$ such that the size of the initial groups does not exceed $M$.
+This way, the groups formed on subsequent rounds will also have at most $M$ participants. One possible strategy is:
+
+\vspace{-8pt}
+\begin{equation}
+    \texttt{get\_initial\_index}(i) =
+    \begin{pmatrix}
+    \lfloor i / M^{j{-}1} \rfloor \Mod M \\
+    \end{pmatrix}_{j\in \{1,\ \ldots,\ d{-}1\}}
+    \label{eq:get_initial_index}
+\end{equation}
+
+If $N {=} M^d$ and there are no node/network failures, Algorithm~\ref{alg:moshpit} is equivalent to Torus All-Reduce~\cite{torus_allreduce}, achieving the exact average after $d$ rounds of communication (see Appendix~\ref{sect:equiv_to_torus}).
+However, our typical use case is far from this perfect scenario; for example, some groups can have fewer than $M$ members. Furthermore, a peer might fail during all-reduce, causing its groupmates to skip a round of averaging.
+Still, Moshpit All-Reduce is applicable even in these conditions:
+\begin{theorem}[Correctness]\label{thm:quality_of_avg_deterministic_vectors_0}
+If all workers have a non-zero probability of successfully running a communication round and the order of $\texttt{peers}_t$ is random, then all local vectors $\theta^t_i$ converge to the global average with probability 1:
+\vspace{-4px}
+\begin{equation}
+    \forall i, \Big|\Big|\theta^t_i - \frac1N \sum_i \theta^0_i\Big|\Big|^2_2 \xrightarrow[t\to\infty]{} 0.
+\end{equation}
+\end{theorem}\vspace{-16pt}
+\begin{proof}[Proof (sketch, complete in Appendix~\ref{sect:correctness_proof})]
+Running all-reduce with a subset of peers preserves the invariant $\frac1N \sum_i \theta^t_i=\frac1N \sum_i \theta^{t-1}_i$ and reduces the deviation of $\theta^t_i$ from the overall average.
+\end{proof}\vspace{-6pt}
+
+\textbf{Complexity.} The matchmaking protocol is implemented over Kademlia DHT~\cite{kademlia}, meaning that each read and write operation needs at most $\cO(\log N)$ requests and $\cO(M)$ bandwidth to load $\texttt{peers}_t$.
+
+After the matchmaking is over, each group runs a single all-reduce round to compute the average. In principle, Moshpit Averaging can use any general-purpose all-reduce protocol. We opted for a butterfly-like version (Figure~\ref{fig:butterfly_allreduce}), as it is simpler than Ring All-Reduce while still being communication-efficient. The communication complexity of this algorithm is $\cO\left(\max(s, M) \times \frac{M - 1}{M}\right)$, where $s$ is the size of vector $\theta$. Thus, the total time complexity of Algorithm \ref{alg:moshpit} becomes:
+\begin{equation}
+    \cO\left(T \times \left[\log_2{N} + M + \max(s, M) \times {\frac{M - 1}{M}}\right]\right).
+\end{equation}
+This compares favorably to gossip, where the network load grows linearly with the number of neighbors.
+
+\vspace{-2pt}
+\subsection{Convergence analysis}\label{sect:method_convergence}
+\subsubsection{Mixing properties of Moshpit Averaging}\label{sect:theory_about_avg}
+As stated in the previous section, Moshpit All-Reduce computes the exact average when $N = M^d$, which cannot be guaranteed in practice. Therefore, additional analysis is needed to establish how quickly Moshpit Averaging approximates the actual average of $N$ vectors stored on peers.
+
+In the following theorem, we provide such an analysis for a simplified version of Moshpit Averaging. One can find the full proof in Appendix~\ref{sec:proof_quality_of_avg_deterministic_vectors}. 
+
+\begin{theorem}\label{thm:quality_of_avg_deterministic_vectors}
+    Consider a modification of Moshpit All-Reduce that works as follows: at each iteration $k\ge 1$, 1) peers are randomly split into $r$ disjoint groups of sizes $M_1^k,\ldots, M_r^k$ in such a way that $\sum_{i=1}^r M_i^k = N$ and $M_i^k \ge 1$ for all $i = 1,\ldots,r$, and 2) peers from each group compute their group average via All-Reduce. Let $\theta_1,\ldots,\theta_N$ be the input vectors of this procedure and $\theta_1^T,\ldots,\theta_N^T$ be the outputs after $T$ iterations. Also, let $\overline{\theta} = \frac{1}{N}\sum_{i=1}^N\theta_i$. Then,
+    \begin{equation}
+        \hspace{-0.1cm}\EE\left[\frac{1}{N}\sum\limits_{i=1}^N\|\theta_i^T - \overline{\theta}\|^2\right]= \left(\frac{r-1}{N} + \frac{r}{N^2}\right)^T\frac{1}{N}\sum\limits_{i=1}^N\|\theta_i - \overline{\theta}\|^2. \label{eq:determ_quality_of_avg}
+    \end{equation}
+\end{theorem}
+
+\begin{algorithm}[h]
+   \caption{Moshpit SGD}
+   \label{alg:moshpit_local_sgd}
+\begin{algorithmic}[1]
+   \STATE {\bfseries Input:} starting point $\theta^0$, learning rate $\gamma > 0$, communication period $\tau \ge 1$
+   \FOR{$k = 0, 1, \ldots$}
+   \FOR{each peer $i\in P_{k+1}$ in parallel}
+   \STATE Compute the stochastic gradient $g_i^k$ at the current point $\theta_i^k$
+   \IF{$(k+1) \bmod \tau = 0$}
+   \STATE $\theta_i^{k+1} = \text{Moshpit All-Reduce}_{j\in P_{k+1}}(\theta_j^k - \gamma g_j^k)$ for the $i$-th peer (Algorithm~\ref{alg:moshpit})
+   \ELSE
+   \STATE $\theta_i^{k+1} = \theta_i^k - \gamma g_i^k$
+   \ENDIF
+   \ENDFOR
+   \ENDFOR
+\end{algorithmic}
+\end{algorithm}\setlength{\textfloatsep}{12pt}
+
+In particular, this result implies that even if workers are randomly split into pairs at each iteration, the simplified version of Moshpit Averaging makes the average distortion (the left-hand side of Equation~\ref{eq:determ_quality_of_avg}) less than $\varepsilon$ in expectation after $\cO\left(\log(\nicefrac{1}{\varepsilon})\right)$ iterations. That is, this algorithm finds an $\varepsilon$-accurate average on each node at a rate that \textit{does not} depend on the spectral properties of the communication graph, unlike gossip and its variants (see Section~\ref{sect:related_decentralized_training} and Appendix~\ref{sect:post_related_gossip}). Since Moshpit Averaging prevents two peers from participating in the same groups during successive iterations, the actual algorithm should find $\varepsilon$-accurate averages on participating peers even faster than Equation~\ref{eq:determ_quality_of_avg} predicts. Moreover, in Appendix~\ref{sec:proof_quality_of_avg_deterministic_vectors} we explain how this result can be generalized to the case when $\{M_i^k\}_{i=1}^r$ and $r$ depend on $k$ or are even random. In Appendix~\ref{sec:mix_rand_proof}, we also provide guarantees measuring how fast Algorithm~\ref{alg:moshpit} reduces the variance when averaging random vectors.
+
+\vspace{-4pt}
+\subsubsection{Moshpit SGD}\label{sect:optim_theory}
+We consider a classical distributed optimization problem
+\vspace{-6pt}
+\begin{equation}
+    \min\limits_{\theta\in\R^n}\left\{f(\theta) = \frac{1}{N}\sum\limits_{i=1}^N f_i(\theta)\right\}, \label{eq:main_problem}
+\end{equation}
+\vspace{-6pt}
+where $N$ is the number of workers and worker $i$ has access only to the function $f_i$.
+
+We propose a new algorithm called Moshpit SGD to solve this problem (see Algorithm~\ref{alg:moshpit_local_sgd}). 
In this algorithm, workers perform independent local SGD steps and periodically synchronize their parameters $\theta_i^k$ with other peers using Moshpit All-Reduce. Moreover, we define the indices of participating nodes at iteration $k$ as $P_{k+1}$ ($P_0 = \{1,\ldots,N\}$) allowing peers to vanish. + + +First of all, we list the key assumptions that we use in the convergence analysis of Moshpit SGD. +\begin{assumption}[Bounded variance]\label{as:bounded_var} + We assume that for all $k\ge 0$ and $i=1,\ldots, N$ stochastic gradients $g_i^k$ satisfy $\EE\left[g_i^k\mid \theta_i^k\right] = \nabla f_i(\theta_i^k)$ and + \begin{eqnarray} + \EE\left[\|g_i^k - \nabla f_i(\theta_i^k)\|^2\mid \theta_i^k\right] &\le& \sigma^2.\label{eq:bounded_variance} + \end{eqnarray} +\end{assumption}\vspace{-6px} +This assumption is classical in the stochastic optimization literature \cite{nemirovski2009robust,ghadimi2013stochastic}. We notice that our analysis can be generalized to the settings when the stochastic gradients satisfy less restrictive assumptions such as expected smoothness \cite{gower2019sgd} or have more sophisticated structure similar to \cite{karimireddy2020scaffold} using the theoretical framework from \cite{gorbunov2020local}. + +The following assumption controls the averaging properties and the effect of the peers' vanishing. +\begin{assumption}[Averaging quality \& peers' vanishing]\label{as:averaging_quality} + We assume that the vanishing of peers does not change the global average of the iterates of Moshpit SGD too much, i.e., $P_{k+1}\subseteq P_{k}$ and $|P_k| \ge N_{\min}$ for all $k\ge 0$, $|P_{a\tau}| \le 2|P_{a(\tau+1)}|$ for all non-negative integers $a\ge 0$, and there exist such $\widetilde{\theta}\in \R^n$ and a sequence of non-negative numbers $\{\Delta_{pv}^k\}_{k\ge 0}$ that $\forall k \ge 0$ + \begin{align} + \EE\left[\langle\theta^{k+1} - \widehat{\theta}^{k+1}, \theta^{k+1}+\widehat{\theta}^{k+1} - 2\widetilde\theta\rangle\right] \!\le\! \Delta_{pv}^k\label{eq:stationary_avg_almost}&,f\text{ convex;}\\ + \EE\!\left[\langle\nabla f(\theta^k), \theta^{k+1}-\widehat{\theta}^{k+1}\rangle + L\|\widehat{\theta}^{k+1} - \theta^{k+1}\|^2\right] \!\le\! \Delta_{pv}^k\label{eq:stationary_avg_almost_2}&,f\text{ non-convex, $L$-smooth, (Def.~\ref{def:L_smoothness})} + \end{align} + where $N_k = |P_k|$, $\theta^{k+1} = \frac{1}{N_{k+1}}\sum_{i\in P_{k+1}}\theta_i^{k+1}$, and $\widehat \theta^{k+1} = \frac{1}{N_{k}}\sum_{i\in P_{k}}(\theta_i^{k}-\gamma g_i^k)$ for $k\ge 0$. + + Moreover, we assume that for some $\delta_{aq} \ge 0$ and for all non-negative integers $a\ge 0$, + \begin{eqnarray} + \EE\left[\frac{1}{N_{a\tau}}\sum\limits_{i\in P_{a\tau}}\|\theta_i^{a\tau} - \theta^{a\tau}\|^2\right] &\le& \gamma^2\delta_{aq}^2.\label{eq:quality_of_avg} + \end{eqnarray} +\end{assumption} +If $P_k = P_{k+1} = \{1,\ldots,N\}$ for all $k\ge 0$, i.e., peers do not vanish, then $\theta^{k} = \widehat{\theta}^{k}$ and properties (\ref{eq:stationary_avg_almost}, \ref{eq:stationary_avg_almost_2}) hold with $\Delta_{pv}^k \equiv 0$ for all $k\ge 0$. Moreover, according to the mixing properties of Moshpit Averaging established in Theorem~\ref{thm:quality_of_avg_deterministic_vectors}, inequality \ref{eq:quality_of_avg} holds after $\cO\left(\log\left(\nicefrac{1}{\gamma^2\delta_{aq}^2}\right)\right)$ iterations of Algorithm~\ref{alg:moshpit}. Therefore, the assumption above is natural and well-motivated. 
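+
+For illustration, the following is a minimal simulation sketch of the training loop in Algorithm~\ref{alg:moshpit_local_sgd}. It is a sketch under simplifying assumptions rather than the actual implementation: the DHT-based matchmaking of Moshpit All-Reduce is replaced by a random split of the surviving peers into groups (as in the simplified scheme of Theorem~\ref{thm:quality_of_avg_deterministic_vectors}), and the quadratic objective, noise level, and peer-dropout schedule are chosen purely for illustration.
+\begin{verbatim}
+# Sketch of the Moshpit SGD loop (Algorithm 2); all constants are illustrative.
+import numpy as np
+
+rng = np.random.default_rng(0)
+N, dim = 16, 10                    # number of peers, parameter dimension
+tau, gamma, steps = 5, 0.1, 100    # communication period, step size, iterations
+target = rng.normal(size=dim)      # optimum of the toy quadratic objective
+
+def stochastic_grad(theta):
+    # gradient of f(theta) = 0.5*||theta - target||^2 plus noise (Assumption 1)
+    return (theta - target) + 0.1 * rng.normal(size=theta.shape)
+
+def group_average(params, n_groups=4):
+    # stand-in for Moshpit All-Reduce: random disjoint groups, in-group averaging
+    order = rng.permutation(len(params))
+    for group in np.array_split(order, n_groups):
+        params[group] = params[group].mean(axis=0)
+    return params
+
+theta = rng.normal(size=(N, dim))  # local parameters theta_i^0
+alive = np.arange(N)               # indices of participating peers P_k
+for k in range(steps):
+    grads = np.stack([stochastic_grad(theta[i]) for i in alive])
+    theta[alive] -= gamma * grads                   # local SGD step
+    if (k + 1) % tau == 0:
+        theta[alive] = group_average(theta[alive])  # periodic averaging round
+    if k % 20 == 19 and len(alive) > N // 2:        # peers may vanish (Assumption 2)
+        alive = alive[:-1]
+
+print(np.linalg.norm(theta[alive] - target, axis=1).mean())
+\end{verbatim}
+Averaging only within small random groups keeps every round cheap, while the periodic rounds are enough for the local replicas to stay close to their global mean, mirroring the role of Assumption~\ref{as:averaging_quality}.
+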
+ +Under these assumptions, we derive the convergence rates both for convex and non-convex problems. The full statements and complete proofs are deferred to Appendix~\ref{sect:missing_proofs_local_sgd}. +\begin{theorem}[Convex case]\label{thm:cvx_convergence} + Let $f_1 = \ldots = f_N = f$, function $f$ be $\mu$-strongly convex (Def.~\ref{def:str_cvx}) and $L$-smooth (see Def.~\ref{def:L_smoothness}), and Assumptions~\ref{as:bounded_var}~and~\ref{as:averaging_quality} hold with $\Delta_{pv}^k = \delta_{pv,1}\gamma\mu\EE[\|\theta^k-\theta^*\|^2] + \gamma^2\delta_{pv,2}^2$ and $\widetilde{\theta} = \theta^*$, where $\theta^* \in \argmin_{\theta\in\R^n} f(\theta)$ and $\delta_{pv,1}\in [0,1)$, $\delta_{pv,2}\ge 0$. Then there exists a choice of $\gamma$ such that $\EE\left[f(\overline{\theta}^K) - f(\theta^*)\right]\le \varepsilon$ after $K$ iterations of Moshpit SGD, where $K$ equals + \vspace{-2pt} + \begin{align} + \widetilde{\cO}\!\left(\!\frac{L}{(1\!-\!\delta_{pv,1})\mu}\! +\! \frac{\delta_{pv,2}^2\!+\!\nicefrac{\sigma^2}{N_{\min}}}{(1-\delta_{pv,1})\mu\varepsilon}\! +\! \sqrt{\frac{L((\tau\!-\!1)\sigma^2\!+\!\delta_{aq}^2)}{(1\!-\!\delta_{pv,1})^2\mu^2\varepsilon}}\!\right)&,\ \mu>0;\\ + \cO\!\left(\!\frac{LR_0^2}{\varepsilon}\!+\! \frac{R_0^2(\delta_{pv,2}^2\!+\!\nicefrac{\sigma^2}{N_{\min}})}{\varepsilon^2}\!+\! \frac{R_0^2\!\sqrt{L\!(\!(\tau\!-\!1)\!\sigma^2\!+\!\delta_{aq}^2)}}{\varepsilon^{\nicefrac{3}{2}}}\!\right)&,\ \mu=0, + \end{align} + where $\overline{\theta}^K = \frac{1}{W_K}\sum\limits_{k=0}^K\frac{1}{N_k}\sum\limits_{i\in P_k} w_k \theta_i^k$, $w_k = (1-\gamma\mu)^{-(k+1)}$, $W_K = \sum_{k=0}^Kw_k$, $R_0 = \|\theta^0 - \theta^*\|$ and $\widetilde{\cO}(\cdot)$ hides constant and $\log(\nicefrac{1}{\varepsilon})$ factors. +\end{theorem} +That is, if $\delta_{pv,1} \le \nicefrac{1}{2}$, $N_{\min} = \Omega(N)$, $\delta_{pv,2}^2 = \cO(\nicefrac{\sigma^2}{N_{\min}})$, and $\delta_{aq}^2 = \cO((\tau-1)\sigma^2)$, then Moshpit SGD has the same iteration complexity as Local-SGD in the homogeneous case \cite{khaled2020tighter,woodworth2020local}. However, the averaging steps of Moshpit SGD are much faster than those of the parameter-server architecture when the number of peers is large. Also, unlike the state-of-the-art convergence guarantees for Decentralized Local-SGD \cite{koloskova2020unified}, our bounds do not depend on the spectral properties of the communication graph (see Appendix~\ref{sect:post_related_gossip} for the details). + +\begin{theorem}[Non-convex case]\label{thm:non_cvx_convergence} + Let $f_1 = \ldots = f_N = f$, function $f$ be $L$-smooth and bounded from below by $f_*$, and Assumptions~\ref{as:bounded_var}~and~\ref{as:averaging_quality} hold with $\Delta_{pv}^k = \delta_{pv,1}\gamma\EE[\|\nabla f(\theta^k)\|^2] + L\gamma^2\delta_{pv,2}^2$, $\delta_{pv,1}\in [0,\nicefrac{1}{2})$, $\delta_{pv,2}\ge 0$. Then there exists such choice of $\gamma$ that $\EE\left[\|\nabla f(\theta_{\text{rand}}^K)\|^2\right]\le \varepsilon^2$ after $K$ iterations of Moshpit SGD, where $K$ equals + {\begin{eqnarray*} + \cO\Bigg(\tfrac{L\Delta_0}{(\!1\!-\!2\delta_{pv,1}\!)^2\varepsilon^2}\!\Bigg[\!1\! +\!\tau\sqrt{1\!-\!2\delta_{pv,1}}\! +\! \tfrac{\delta_{pv,2}^2 + \nicefrac{\sigma^2}{N_{\min}}}{\varepsilon^2}\!+\! 
\tfrac{\sqrt{(1-2\delta_{pv,1})(\delta_{aq}^2+(\tau-1)\sigma^2)}}{\varepsilon}\!\Bigg]\!\Bigg), + \end{eqnarray*}} + $\Delta_0 = f(\theta^0) - f(\theta^*)$ and $\theta_{\text{rand}}^K$ is chosen uniformly from $\{\theta^0,\theta^1,\ldots,\theta^{K-1}\}$ defined in As.~\ref{as:averaging_quality}. +\end{theorem} +Again, if $\delta_{pv,1} \le \nicefrac{1}{3}$, $N_{\min} = \Omega(N)$, $\delta_{pv,2}^2 = \cO(\nicefrac{\sigma^2}{N_{\min}})$, and $\delta_{aq}^2 = \cO((\tau-1)\sigma^2)$, then the above theorem recovers the state-of-the-art results in the non-convex case for Local-SGD \cite{li2019communication,koloskova2020unified}. + +\subsection{Implementation details} +\label{sect:method_implementation_details} + +Training on heterogeneous unreliable hardware also poses a number of engineering challenges. The most obvious one is that the system must be able to recover from node failures. To address this challenge, we use a fully decentralized infrastructure where all information is replicated in a Distributed Hash Table; see Appendix~\ref{sect:related_dht} for details. When a new worker joins midway through training, it can download the latest model parameters and metadata from any other peer (see \autoref{sect:load_state_from_peers}). Another challenge arises when devices in a group have uneven network bandwidth. In that case, we dynamically adjust the communication load of each peer to avoid being bottlenecked. More information on this procedure can be found in \autoref{sect:load_balancing}. + + + + + + + diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/neurips_2021.sty b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/neurips_2021.sty new file mode 100644 index 00000000..da925294 --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/neurips_2021.sty @@ -0,0 +1,377 @@ +% partial rewrite of the LaTeX2e package for submissions to the +% Conference on Neural Information Processing Systems (NeurIPS): +% +% - uses more LaTeX conventions +% - line numbers at submission time replaced with aligned numbers from +% lineno package +% - \nipsfinalcopy replaced with [final] package option +% - automatically loads times package for authors +% - loads natbib automatically; this can be suppressed with the +% [nonatbib] package option +% - adds foot line to first page identifying the conference +% - adds preprint option for submission to e.g. 
arXiv +% - conference acronym modified +% +% Roman Garnett (garnett@wustl.edu) and the many authors of +% nips15submit_e.sty, including MK and drstrip@sandia +% +% last revision: March 2021 + +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{neurips_2021}[2021/03/31 NeurIPS 2021 submission/camera-ready style file] + +% declare final option, which creates camera-ready copy +\newif\if@neuripsfinal\@neuripsfinalfalse +\DeclareOption{final}{ + \@neuripsfinaltrue +} + +% declare nonatbib option, which does not load natbib in case of +% package clash (users can pass options to natbib via +% \PassOptionsToPackage) +\newif\if@natbib\@natbibtrue +\DeclareOption{nonatbib}{ + \@natbibfalse +} + +% declare preprint option, which creates a preprint version ready for +% upload to, e.g., arXiv +\newif\if@preprint\@preprintfalse +\DeclareOption{preprint}{ + \@preprinttrue +} + +\ProcessOptions\relax + +% determine whether this is an anonymized submission +\newif\if@submission\@submissiontrue +\if@neuripsfinal\@submissionfalse\fi +\if@preprint\@submissionfalse\fi + +% fonts +\renewcommand{\rmdefault}{ptm} +\renewcommand{\sfdefault}{phv} + +% change this every year for notice string at bottom +\newcommand{\@neuripsordinal}{35th} +\newcommand{\@neuripsyear}{2021} +\newcommand{\@neuripslocation}{virtual} + +% acknowledgments +\usepackage{environ} +\newcommand{\acksection}{\section*{Acknowledgments and Disclosure of Funding}} +\NewEnviron{ack}{% + \acksection + \BODY +} + +% handle tweaks for camera-ready copy vs. submission copy +\if@preprint + \newcommand{\@noticestring}{% + Preprint. Under review.% + } +\else + \if@neuripsfinal + \newcommand{\@noticestring}{% + \@neuripsordinal\/ Conference on Neural Information Processing Systems + (NeurIPS \@neuripsyear).%, \@neuripslocation.% + } + \else + \newcommand{\@noticestring}{% + Submitted to \@neuripsordinal\/ Conference on Neural Information + Processing Systems (NeurIPS \@neuripsyear). Do not distribute.% + } + + % hide the acknowledgements + \NewEnviron{hide}{} + \let\ack\hide + \let\endack\endhide + + % line numbers for submission + \RequirePackage{lineno} + \linenumbers + + % fix incompatibilities between lineno and amsmath, if required, by + % transparently wrapping linenomath environments around amsmath + % environments + \AtBeginDocument{% + \@ifpackageloaded{amsmath}{% + \newcommand*\patchAmsMathEnvironmentForLineno[1]{% + \expandafter\let\csname old#1\expandafter\endcsname\csname #1\endcsname + \expandafter\let\csname oldend#1\expandafter\endcsname\csname end#1\endcsname + \renewenvironment{#1}% + {\linenomath\csname old#1\endcsname}% + {\csname oldend#1\endcsname\endlinenomath}% + }% + \newcommand*\patchBothAmsMathEnvironmentsForLineno[1]{% + \patchAmsMathEnvironmentForLineno{#1}% + \patchAmsMathEnvironmentForLineno{#1*}% + }% + \patchBothAmsMathEnvironmentsForLineno{equation}% + \patchBothAmsMathEnvironmentsForLineno{align}% + \patchBothAmsMathEnvironmentsForLineno{flalign}% + \patchBothAmsMathEnvironmentsForLineno{alignat}% + \patchBothAmsMathEnvironmentsForLineno{gather}% + \patchBothAmsMathEnvironmentsForLineno{multline}% + }{} + } + \fi +\fi + +% load natbib unless told otherwise +\if@natbib + \RequirePackage{natbib} +\fi + +% set page geometry +\usepackage[verbose=true,letterpaper]{geometry} +\AtBeginDocument{ + \newgeometry{ + textheight=9in, + textwidth=5.5in, + top=1in, + headheight=12pt, + headsep=25pt, + footskip=30pt + } + \@ifpackageloaded{fullpage} + {\PackageWarning{neurips_2021}{fullpage package not allowed! 
Overwriting formatting.}} + {} +} + +\widowpenalty=10000 +\clubpenalty=10000 +\flushbottom +\sloppy + +% font sizes with reduced leading +\renewcommand{\normalsize}{% + \@setfontsize\normalsize\@xpt\@xipt + \abovedisplayskip 7\p@ \@plus 2\p@ \@minus 5\p@ + \abovedisplayshortskip \z@ \@plus 3\p@ + \belowdisplayskip \abovedisplayskip + \belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@ +} +\normalsize +\renewcommand{\small}{% + \@setfontsize\small\@ixpt\@xpt + \abovedisplayskip 6\p@ \@plus 1.5\p@ \@minus 4\p@ + \abovedisplayshortskip \z@ \@plus 2\p@ + \belowdisplayskip \abovedisplayskip + \belowdisplayshortskip 3\p@ \@plus 2\p@ \@minus 2\p@ +} +\renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt} +\renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt} +\renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt} +\renewcommand{\large}{\@setfontsize\large\@xiipt{14}} +\renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}} +\renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}} +\renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}} +\renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}} + +% sections with less space +\providecommand{\section}{} +\renewcommand{\section}{% + \@startsection{section}{1}{\z@}% + {-2.0ex \@plus -0.5ex \@minus -0.2ex}% + { 1.5ex \@plus 0.3ex \@minus 0.2ex}% + {\large\bf\raggedright}% +} +\providecommand{\subsection}{} +\renewcommand{\subsection}{% + \@startsection{subsection}{2}{\z@}% + {-1.8ex \@plus -0.5ex \@minus -0.2ex}% + { 0.8ex \@plus 0.2ex}% + {\normalsize\bf\raggedright}% +} +\providecommand{\subsubsection}{} +\renewcommand{\subsubsection}{% + \@startsection{subsubsection}{3}{\z@}% + {-1.5ex \@plus -0.5ex \@minus -0.2ex}% + { 0.5ex \@plus 0.2ex}% + {\normalsize\bf\raggedright}% +} +\providecommand{\paragraph}{} +\renewcommand{\paragraph}{% + \@startsection{paragraph}{4}{\z@}% + {1.5ex \@plus 0.5ex \@minus 0.2ex}% + {-1em}% + {\normalsize\bf}% +} +\providecommand{\subparagraph}{} +\renewcommand{\subparagraph}{% + \@startsection{subparagraph}{5}{\z@}% + {1.5ex \@plus 0.5ex \@minus 0.2ex}% + {-1em}% + {\normalsize\bf}% +} +\providecommand{\subsubsubsection}{} +\renewcommand{\subsubsubsection}{% + \vskip5pt{\noindent\normalsize\rm\raggedright}% +} + +% float placement +\renewcommand{\topfraction }{0.85} +\renewcommand{\bottomfraction }{0.4} +\renewcommand{\textfraction }{0.1} +\renewcommand{\floatpagefraction}{0.7} + +\newlength{\@neuripsabovecaptionskip}\setlength{\@neuripsabovecaptionskip}{7\p@} +\newlength{\@neuripsbelowcaptionskip}\setlength{\@neuripsbelowcaptionskip}{\z@} + +\setlength{\abovecaptionskip}{\@neuripsabovecaptionskip} +\setlength{\belowcaptionskip}{\@neuripsbelowcaptionskip} + +% swap above/belowcaptionskip lengths for tables +\renewenvironment{table} + {\setlength{\abovecaptionskip}{\@neuripsbelowcaptionskip}% + \setlength{\belowcaptionskip}{\@neuripsabovecaptionskip}% + \@float{table}} + {\end@float} + +% footnote formatting +\setlength{\footnotesep }{6.65\p@} +\setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@} +\renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@} +\setcounter{footnote}{0} + +% paragraph formatting +\setlength{\parindent}{\z@} +\setlength{\parskip }{5.5\p@} + +% list formatting +\setlength{\topsep }{4\p@ \@plus 1\p@ \@minus 2\p@} +\setlength{\partopsep }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@} +\setlength{\itemsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} +\setlength{\parsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} +\setlength{\leftmargin }{3pc} +\setlength{\leftmargini }{\leftmargin} 
+\setlength{\leftmarginii }{2em} +\setlength{\leftmarginiii}{1.5em} +\setlength{\leftmarginiv }{1.0em} +\setlength{\leftmarginv }{0.5em} +\def\@listi {\leftmargin\leftmargini} +\def\@listii {\leftmargin\leftmarginii + \labelwidth\leftmarginii + \advance\labelwidth-\labelsep + \topsep 2\p@ \@plus 1\p@ \@minus 0.5\p@ + \parsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ + \itemsep \parsep} +\def\@listiii{\leftmargin\leftmarginiii + \labelwidth\leftmarginiii + \advance\labelwidth-\labelsep + \topsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ + \parsep \z@ + \partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@ + \itemsep \topsep} +\def\@listiv {\leftmargin\leftmarginiv + \labelwidth\leftmarginiv + \advance\labelwidth-\labelsep} +\def\@listv {\leftmargin\leftmarginv + \labelwidth\leftmarginv + \advance\labelwidth-\labelsep} +\def\@listvi {\leftmargin\leftmarginvi + \labelwidth\leftmarginvi + \advance\labelwidth-\labelsep} + +% create title +\providecommand{\maketitle}{} +\renewcommand{\maketitle}{% + \par + \begingroup + \renewcommand{\thefootnote}{\fnsymbol{footnote}} + % for perfect author name centering + \renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}} + % The footnote-mark was overlapping the footnote-text, + % added the following to fix this problem (MK) + \long\def\@makefntext##1{% + \parindent 1em\noindent + \hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1 + } + \thispagestyle{empty} + \@maketitle + \@thanks + \@notice + \endgroup + \let\maketitle\relax + \let\thanks\relax +} + +% rules for title box at top of first page +\newcommand{\@toptitlebar}{ + \hrule height 4\p@ + \vskip 0.25in + \vskip -\parskip% +} +\newcommand{\@bottomtitlebar}{ + \vskip 0.29in + \vskip -\parskip + \hrule height 1\p@ + \vskip 0.09in% +} + +% create title (includes both anonymized and non-anonymized versions) +\providecommand{\@maketitle}{} +\renewcommand{\@maketitle}{% + \vbox{% + \hsize\textwidth + \linewidth\hsize + \vskip 0.1in + \@toptitlebar + \centering + {\LARGE\bf \@title\par} + \@bottomtitlebar + \if@submission + \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@} + Anonymous Author(s) \\ + Affiliation \\ + Address \\ + \texttt{email} \\ + \end{tabular}% + \else + \def\And{% + \end{tabular}\hfil\linebreak[0]\hfil% + \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% + } + \def\AND{% + \end{tabular}\hfil\linebreak[4]\hfil% + \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% + } + \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}% + \fi + \vskip 0.3in \@minus 0.1in + } +} + +% add conference notice to bottom of first page +\newcommand{\ftype@noticebox}{8} +\newcommand{\@notice}{% + % give a bit of extra room back to authors on first page + \enlargethispage{2\baselineskip}% + \@float{noticebox}[b]% + \footnotesize\@noticestring% + \end@float% +} + +% abstract styling +\renewenvironment{abstract}% +{% + \vskip 0.075in% + \centerline% + {\large\bf Abstract}% + \vspace{0.5ex}% + \begin{quote}% +} +{ + \par% + \end{quote}% + \vskip 1ex% +} + +% For the paper checklist +\newcommand{\answerYes}[1][]{\textcolor{blue}{[Yes] #1}} +\newcommand{\answerNo}[1][]{\textcolor{orange}{[No] #1}} +\newcommand{\answerNA}[1][]{\textcolor{gray}{[N/A] #1}} +\newcommand{\answerTODO}[1][]{\textcolor{red}{\bf [TODO]}} + +\endinput diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/post_related.tex b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/post_related.tex new file mode 100644 index 00000000..b6aa2a1a --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/post_related.tex @@ -0,0 +1,43 @@ 
+\section{Additional Related Work}
+\label{sect:post_related}
+
+In this section, we review some of the papers that are relevant to our work but were omitted from the main part due to space constraints.
+
+\subsection{Decentralized training}\label{sect:post_related_gossip}
+In this subsection, we give additional details about the dependence of gossip-based optimization methods on the spectral properties of the communication graph, expressed through the spectral properties of the mixing matrix~\cite{xiao2004fast,scaman2019optimal} or the Laplacian matrix~\cite{merris1994laplacian,uribe2020dual} of the network.
+That is, gossip finds an approximate average on the nodes with accuracy $\varepsilon$ after $\cO\left((1-\lambda_2(\mM))^{-1}\log(\varepsilon^{-1})\right)$ iterations, where $\mM$ is the mixing matrix and $\lambda_2(\mM)$ is the second largest eigenvalue of $\mM$ when sorted by absolute value.
+The quantity $\eta = 1-\lambda_2(\mM)$ is called the spectral gap of the mixing matrix $\mM$, and $\eta^{-1}$ is typically polynomial in the total number of nodes $N$ when the maximal degree of a node is $\cO(1)$. For example, for a uniformly averaging $\mM$ one can show that $\eta^{-1} = \cO(N^2)$ for the ring topology (node degree $2$), $\eta^{-1} = \cO(N)$ for the two-dimensional torus topology (node degree $4$), and $\eta^{-1} = \cO(1)$ for the fully connected graph (node degree $N-1$); one can find more examples in~\cite{aldous2002reversible}. Similarly, the communication complexity of decentralized optimization methods often has a multiplicative dependence on either $\cO(\eta^{-1})$ (see~\cite{xu2020distributed} and references therein) or $\cO(\eta^{-\nicefrac{1}{2}})$~\cite{scaman2019optimal,uribe2020dual,fallah2019robust,kovalev2020optimal}, which is not improvable for gossip-based methods~\cite{arjevani2015communication,scaman2017optimal}.
+
+Contrary to this, Moshpit All-Reduce does not depend on a fixed communication graph and the properties of its mixing matrix.
+However, it depends on the number of averaging groups and the total number of peers (see Theorem~\ref{thm:quality_of_avg_deterministic_vectors}), which can be viewed as properties of a time-varying random communication graph. Fortunately, this dependence is often much better than in gossip: as we mentioned in the main part of the paper, even if workers are randomly split into pairs at each iteration, the simplified version of Moshpit All-Reduce makes the average distortion (the left-hand side of Equation~\ref{eq:determ_quality_of_avg}) at least $2$ times smaller after each round on average.
+
+\subsection{Compressed communication}
+Another popular approach to address the communication bottleneck is communication compression~\cite{seide20141,alistarh2017qsgd,suresh2017distributed, ramezani2021nuqsgd, faghri2020adaptive}: before sending any information (e.g., iterates, gradients, Hessians or more sophisticated data) over the network, peers compress this information by applying a possibly random transformation. As a result, peers send fewer bits in each communication round, but the total number of communication rounds needed to achieve the predefined accuracy of the solution increases. However, compression can be useful in situations where the reduction in communication costs of one round is more important than the increase in the number of these rounds~\cite{horvath2019natural}. 
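+
+To make the notion of an unbiased compression operator concrete, below is a minimal sketch of Rand-K sparsification; the \texttt{rand\_k} helper and all constants are illustrative assumptions and do not come from any of the cited implementations. The operator transmits only $K$ randomly chosen coordinates, rescaled by $\nicefrac{n}{K}$ so that the compressed vector is an unbiased estimator of the input.
+\begin{verbatim}
+# Sketch of an unbiased Rand-K sparsifier; illustrative only.
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def rand_k(x, k):
+    """Keep k random coordinates of x, scaled by n/k so that E[rand_k(x)] = x."""
+    n = x.size
+    out = np.zeros_like(x)
+    idx = rng.choice(n, size=k, replace=False)  # coordinates that get transmitted
+    out[idx] = x[idx] * (n / k)                 # rescaling gives unbiasedness
+    return out
+
+x = rng.normal(size=1000)
+# Averaging many independent compressions recovers x, confirming unbiasedness;
+# the relative error shrinks as more compressed copies are averaged.
+approx = np.mean([rand_k(x, k=50) for _ in range(2000)], axis=0)
+print(np.linalg.norm(approx - x) / np.linalg.norm(x))
+\end{verbatim}
+Each peer would send only the $K$ selected values and their indices, which is the source of the per-round bandwidth savings discussed above; the price is the extra variance that the convergence analyses cited below have to control.
+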
+
+There are two distinct groups of works on distributed training with compressed communication: ones that focus on unbiased compression operators (e.g., Rand-K, $\ell_p$-quantization) and ones studying algorithms with biased compressors (e.g., Top-K); see~\cite{beznosikov2020biased} for a detailed summary of popular compression operators.
+Quantized SGD (QSGD)~\cite{alistarh2017qsgd} and TernGrad~\cite{wen2017terngrad} were among the first compression methods with convergence guarantees. Next, the convergence analysis of these methods was generalized and tightened in the (strongly) convex case in~\cite{mishchenko2019distributed}. Moreover, the authors of \cite{mishchenko2019distributed} proposed a modification of QSGD called DIANA: this algorithm is based on the quantization of gradients' differences, which helps it achieve linear convergence in the strongly convex case when peers compute full gradients. Next, DIANA was generalized to arbitrary unbiased compression in~\cite{horvath2019stochastic}, where the authors also developed and analyzed the variance-reduced version of DIANA. After that, several further modifications, such as Accelerated DIANA~\cite{li2020acceleration} and DIANA with bidirectional compression~\cite{gorbunov2020linearly,philippenko2020artemis}, were proposed. Finally, we refer the reader to~\cite{li2020unified,haddadpour2020federated,das2020improved, pmlr-v139-gorbunov21a} for state-of-the-art results for distributed methods with unbiased compression in the non-convex case.
+
+However, naïve application of biased compression operators can lead to significantly worse performance in practice. For instance, as was shown recently in~\cite{beznosikov2020biased}, parallel SGD with Top-1 compression can diverge exponentially fast. Therefore, biased compressors are used jointly with so-called error compensation~\cite{seide20141}. The first analysis of Error-Compensated SGD (EC-SGD) was proposed in~\cite{stich2018sparsified,karimireddy2019error}, which was then generalized and tightened in~\cite{beznosikov2020biased}. Next, several further improvements, such as an accelerated version of EC-SGD~\cite{qian2020error} and linearly converging EC-SGD~\cite{gorbunov2020linearly}, were recently proposed. However, current theory does not show any superiority of distributed methods with biased compressors over the ones with unbiased compression operators.
+In addition, one can combine decentralized communication with compression. Such combinations with unbiased compression operators were studied in~\cite{reisizadeh2019exact,kovalev2020linearly} and with biased operators in~\cite{pmlr-v97-koloskova19a,Koloskova2020Decentralized}.
+In this paper, we do not study the interaction between different compression methods and Moshpit Averaging, leaving this promising direction to future work.
The first results on its convergence were given in \cite{Stich18local,LinSPJ2018local}, and later they were tightened and generalized both for homogeneous~\cite{khaled2020tighter,woodworth2020local} and heterogeneous cases~\cite{khaled2020tighter,woodworth2020minibatch}. Recently, further modifications of Local-SGD were proposed and analyzed: these modifications include acceleration \cite{yuan2020federated}, variance reduction \cite{gorbunov2020local}, communication compression \cite{basu2019qsparse,haddadpour2020federated,das2020improved}, decentralization \cite{li2019communication,koloskova2020unified}, adaptive and proximal methods \cite{reddi2021adaptive,yuan2020federated_comp}, and resistance to client drift \cite{karimireddy2020scaffold}. +Moshpit SGD can perform multiple local gradient steps before synchronization by design, as shown in Algorithm~\ref{alg:moshpit_local_sgd}. + + +\subsection{Asynchronous methods} +In the previous subsections, we mostly discussed synchronous distributed methods, since they are more widespread and better studied than asynchronous ones. Mainly, this is because asynchronous methods are more difficult to implement, debug and analyze under general assumptions. However, such methods can be more efficient in terms of using computational resources, which leads to faster wall-clock convergence \cite{assran2020advances}. In recent years, several asynchronous stochastic methods~\cite{recht2011hogwild,zhao2016fast,leblond2017asaga}, methods with no shared memory~\cite{peng2016arock,mishchenko2018delay}, and methods with delayed updates~\cite{agarwal2011distributed,feyzmahdavian2016asynchronous,arjevani2020tight,gorbunov2020linearly} were proposed and analyzed: one can find more details in a recent survey~\cite{assran2020advances}. +Moshpit SGD belongs to this family of asynchronous approaches as well, because the averaging steps happen in smaller groups and can be interleaved with local parameter updates. + +\subsection{Distributed Hash Tables} +\label{sect:related_dht} + +In this work, we set out to improve distributed averaging with a dynamic matchmaking protocol. Without a central server, this protocol relies on decentralized data structures to organize peers. The main data structure we use is the Distributed Hash Table, or DHT. On a high level, DHT is a distributed fault-tolerant ``dictionary'' that can be accessed by every participant. Each key-value pair is stored on a subset of peers determined by the $\mathrm{hash}$ function of the key. + +Each participant has a unique identifier (ID) sampled uniformly from the $\mathrm{hash}$ function output range. When storing a $(key,\ value)$ pair, one must find $k$ peers whose IDs are nearest to $\mathrm{hash}(key)$ according to a chosen metric. After that, the participant requests each of those peers to store $(key,\ value)$. When retrieving a value for a key, one should compute $\mathrm{hash}(key)$, search for peers with IDs nearest to that $\mathrm{hash}$ value and request the value from those peers. + +Specific DHT versions, such as Chord~\cite{chord} or Kademlia~\cite{kademlia}, employ different hash types and algorithms for finding nearest peers. For instance, Kademlia DHT sorts peers based on the XOR distance function: $d(x, y) = \mathrm{int}(x \oplus y)$. + +In DHT, each participant is directly aware of only a small subset of peers. 
When storing or retrieving a key, the participant requests additional peers from its neighbors in a semi-greedy search, minimizing the XOR distance until it finds the $k$ nearest peers. In Kademlia, nodes form a special navigable graph structure that lets them find the nearest peers in at most $\cO(k + \log N)$ requests to other peers, where $N$ is the total number of participants. Due to their scalability and fault-tolerance, DHTs have found numerous applications, including BitTorrent, Ethereum, I2P, and decentralized deep learning~\cite{learning_at_home}.
\ No newline at end of file
diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/proofs_mixing.tex b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/proofs_mixing.tex
new file mode 100644
index 00000000..4919cb99
--- /dev/null
+++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/proofs_mixing.tex
@@ -0,0 +1,338 @@
+\section{Proofs of Mixing Properties of Moshpit All-Reduce}\label{sect:missing_proofs}
+
+\textbf{Notation.} Throughout the following sections, we use the standard notation from the literature on stochastic optimization. That is, for any $n$-dimensional vectors $x = (x_1,\ldots,x_n)^\top,y = (y_1,\ldots,y_n)^\top\in\R^n$ we use $\langle x,y\rangle$ to denote the standard inner product: $\langle x, y\rangle = x_1y_1 + \ldots + x_ny_n$. Next, we use $\|x\|$ to denote the $\ell_2$-norm of $x$ ($\|x\| = \sqrt{\langle x, x\rangle}$), $\EE[\xi]$ to denote the expectation of a random variable $\xi$, $\EE[\xi\mid \eta]$ is used for the conditional expectation of $\xi$ given $\eta$, and $\PP\{E\}$ denotes the probability of an event $E$.
+
+\subsection{Computing exact average in a full grid}\label{sect:equiv_to_torus}
+As discussed in Section~\ref{sect:method_algorithm}, Moshpit All-Reduce obtains the exact average of parameter vectors from $N$ peers arranged in a grid with $d$ coordinates and $M$ positions per coordinate when $N = M^d$. That is, when the grid is full and each step averages $M$ parameter values along a single grid coordinate without repetitions, the algorithm needs only $d$ steps to compute the actual average across all nodes. In this section, we give a proof of this fact.
+
+First, let us formally define the setting and the averaging steps of Moshpit All-Reduce in this specific case. Let $\theta_{i_1 i_2\ldots i_d}$ be the parameter vector of the worker with coordinates $i_1, i_2,\ldots, i_d$; each coordinate $i_k$ takes values from $1$ to $M$, because the hypercube of peers is completely full (thus, due to the pigeonhole principle, there are no unoccupied coordinates). Next, arrange the coordinates of these vectors according to the order of averaging iterations: namely, at iteration 1
+\begin{equation}
+    \overline{\theta}_{i_1 i_2\ldots i_d}^1=\frac{1}{M}\sum_{j_1=1}^M \theta_{j_1 i_2\ldots i_d},\quad i_1\in\{1,\ldots,M\},
+\end{equation}
+which means that for the first iteration, we take the average $\overline{\theta}^1$ across the first axis and replicate it across all $M$ resulting vectors regardless of their index $i_1$. The next averaging steps can be expressed similarly with a simple recurrence relation:
+\begin{equation}
+\label{eqn:avg_recurrence}
+    \overline{\theta}_{i_1 i_2 \ldots i_d}^t=\frac{1}{M}\sum_{j_t=1}^M \overline{\theta}_{i_1\ldots i_{t-1} j_t i_{t+1}\ldots i_d}^{t-1}.
+\end{equation}
+Given this formal definition, we can now state and prove the exact averaging result:
+\begin{theorem}[Exact average in a full $d$-dimensional hypercube after $d$ steps]
+Assume that $M^d$ peers are arranged in a $d$-dimensional hypercube with $M$ positions in each dimension. Also, assume that each peer fully participates in every averaging step and that the $M$-sized groups for each averaging iteration are determined based on the hypercube coordinates. Then, if Moshpit All-Reduce is run in the above setup for $d$ iterations without repeating groups (i.e., averaging across each dimension exactly once), its result for each participant is the average value of $\theta$ across all $M^d$ peers.
+\end{theorem}
+\begin{proof}
+We can directly obtain the expression for the average by expanding the recurrence and rearranging the sums:
+\begin{eqnarray*}
+    \overline{\theta}_{i_1 i_2\ldots i_d}^d &=& \frac{1}{M}\sum_{j_d=1}^M\overline{\theta}_{i_1\ldots i_{d-1} j_d}^{d-1}=\frac{1}{M}\sum_{j_d=1}^M\left(\frac{1}{M}\sum_{j_{d-1}=1}^M \overline{\theta}_{i_1 i_2\ldots j_{d-1}j_d}\right)=\ldots\\
+    &=& \frac{1}{M}\Bigg(\underbrace{\sum_{j_d=1}^M\Bigg(\frac{1}{M}\sum_{j_{d-1}=1}^M\ldots\sum_{j_2=1}^M\Bigg(\frac{1}{M}\sum_{j_1=1}^M}_{d\textrm{ summations}} \theta_{j_1 \ldots j_d}\Bigg)\Bigg)\Bigg)\\
+    &=& \frac{1}{M^d}\sum_{j_d=1}^M\sum_{j_{d-1}=1}^M\ldots\sum_{j_2=1}^M\sum_{j_1=1}^M \theta_{j_1 \ldots j_d} =\frac{1}{M^d}\sum_{j_1, \ldots, j_d=1}^M \theta_{j_1 \ldots j_d}.
+\end{eqnarray*}
+But this is exactly the global average of all $\theta$, since there are $M^d$ participants and each vector is represented in the sum because of the summation over all possible indices.
+\end{proof}
+
+Notice that for a given grid of peers, if some of its indices do not have corresponding parameter vectors, Equation~\ref{eqn:avg_recurrence} may result in different average vectors on different workers due to different numbers of peers along a coordinate for different indices. For example, running two iterations of Moshpit Averaging with $d=2,\ M=2$ and three parameter vectors $\theta_{11},\ \theta_{21},\ \theta_{22}$ results in $\frac{\theta_{11}+\theta_{21}}{2}$ on the first worker and $\frac{\theta_{11}+\theta_{21}}{4}+\frac{\theta_{22}}{2}$ on the other workers, with neither equal to the global average. However, the variance of the averaged vectors does decrease, which is formally proven in Section~\ref{sec:proof_quality_of_avg_deterministic_vectors}.
+
+\subsection{Proof of Theorem~\ref{thm:quality_of_avg_deterministic_vectors_0}}\label{sect:correctness_proof}
+Below we provide the complete proof of Theorem~\ref{thm:quality_of_avg_deterministic_vectors_0}. For the readers' convenience, we restate the theorem.
+\begin{theorem}[Theorem~\ref{thm:quality_of_avg_deterministic_vectors_0}]\label{thm:quality_of_avg_deterministic_vectors_0_supp} +If all workers have non-zero probability of successfully running a communication round in Moshpit Averaging and the order of $\texttt{peers}_t$ is random, then all local vectors $\theta^t_i$ converge to the global average with probability $1$: +\begin{equation} + \forall i = 1,\ldots, N\quad \left\|\theta^t_i - \frac1N \sum_{i=1}^N \theta^0_i\right\|^2 \xrightarrow[t\to\infty]{} 0.\label{eq:quality_of_avg_deterministic_vectors_0_supp} +\end{equation} +\end{theorem} +\begin{proof}[Proof of Theorem~\ref{thm:quality_of_avg_deterministic_vectors_0}] + First of all, we notice that \eqref{eq:quality_of_avg_deterministic_vectors_0_supp} is equivalent to + \begin{equation} + \forall i = 1,\ldots, N,\;\forall j=1,\ldots,n\quad \left(\theta^t_i(j) - \frac1N \sum_{i=1}^N \theta^0_i(j)\right)^2 \xrightarrow[t\to\infty]{} 0,\label{eq:quality_of_avg_deterministic_vectors_0_supp_tech_1} + \end{equation} + where $\theta_i^t(j)$ denotes $j$-th component of $\theta_i^t$. Consider an arbitrary component $j \in \{1,\ldots,n\}$ and the sequence of intervals $\{I_{j,t}\}_{t\ge 0}$ where $I_{j,t} = \text{conv}\{\theta_1^t(j),\theta_2^t(j),\ldots, \theta_N^t(j)\}$. Then, $\{I_{j,t}\}_{t\ge 0}$ is a sequence of nested intervals ($I_{j,t+1} \subseteq I_{j,t} \forall t\ge 0$), since averaging in groups does not expand the convex hull of $\{\theta_1^t,\theta_2^t,\ldots, \theta_N^t\}$. For convenience, we specify the bounds of the intervals: $I_{j,t} = [a_{j,t}, b_{j,t}]$. Using the Cantor's intersection theorem, we conclude that + \begin{equation*} + \bigcap\limits_{t=0}^\infty I_{j,t} = I_j = [a_j, b_j], + \end{equation*} + where $\overline{\theta}(j) = \frac{1}{N}\sum_{i=1}^n\theta_i^0(j) \in [a_j, b_j]$. If $[a_j, b_j] = \{\overline{\theta}(j)\}$ with probability $1$, then \eqref{eq:quality_of_avg_deterministic_vectors_0_supp_tech_1} holds with probability $1$ as well. Suppose the opposite: there exist such $j \in \{1,\ldots,n\}$, $[a,b]$ and $\delta,\Delta > 0$ that $\overline{\theta}(j) \in [a,b]$, $b-a = \Delta$ and + \begin{equation*} + \PP\Bigg\{\underbrace{[a,b] \subseteq \bigcap\limits_{t=0}^\infty I_{j,t}}_{E}\Bigg\} = \delta > 0\quad \text{ and }\quad \forall \varepsilon > 0\; \PP\Bigg\{\underbrace{[a-\varepsilon,b+\varepsilon] \subseteq \bigcap\limits_{t=0}^\infty I_{j,t}}_{E_{\varepsilon}}\Bigg\} < \delta. + \end{equation*} + This implies that for all $\varepsilon > 0$ there exists such $T_{\varepsilon} > 0$ that + \begin{equation*} + \PP\Big\{\underbrace{\forall t \ge T_{\varepsilon}\;\; a_{j,t}\in [a-\varepsilon,a], b_{j,t}\in[b,b+\varepsilon]}_{E_{\varepsilon}'}\Big\} = \delta_{\varepsilon} > 0. + \end{equation*} + Consider $\varepsilon = \frac{\Delta}{(2N+100)^{2N}}$ and assume that the event $E_{\varepsilon}'$ holds. Next, we introduce new notation: $J_{\text{left}}^t = \{i \in \{1,\ldots, n\}\mid \theta_{i}^t(j) \in [a-\varepsilon,a]\}$ and $J_{\text{right}}^t = \{i \in \{1,\ldots, n\}\mid \theta_{i}^t(j) \in [b,b+\varepsilon]\}$. Since $E_{\varepsilon}'$ holds the sets $J_{\text{left}}^t$ and $J_{\text{right}}^t$ are non-empty for all $t\ge T_{\varepsilon}$ with probability $\delta_{\varepsilon} > 0$: + \begin{equation} + \PP\left\{\forall t \ge T_{\varepsilon}\;\; J_{\text{left}}^t \neq \varnothing\text{ and } J_{\text{right}}^t \neq \varnothing\right\} = \delta_{\varepsilon} > 0. 
\label{eq:quality_of_avg_deterministic_vectors_0_supp_tech_2} + \end{equation} + We notice that every pair of workers $i_1,i_2$ has a non-zero probability of taking part in the averaging inside the common group at each iteration since all workers have a non-zero probability of successfully running a communication round and the order of $\texttt{peers}_t$ is random. This implies that every pair of workers $i_1,i_2$ with probability $1$ take part in the averaging inside the common group infinitely many times when $t$ goes to the infinity. + + Next, we choose some $t_0 \ge T_{\varepsilon}$. Let $J_{\text{left}}^{t_0} = \{i_{l,1},\ldots, i_{l,q_l}\}$ and $J_{\text{right}}^{t_0} = \{i_{r,1},\ldots, i_{r,q_r}\}$. Consider the event $E_{\varepsilon,0}' \subseteq E_{\varepsilon}'$ such that in $E_{\varepsilon,0}'$ peer $i_{l,1}$ computes an average in the group containing any peer from $J_{\text{right}}^{t_0}$ at some iteration $t_1 > t_0$. Our observations above imply that $\PP\{E_{\varepsilon,0}'\} = \PP\{E_{\varepsilon}'\} = \delta_{\varepsilon} > 0$. Then, $\theta_{i_{l,1}}^{t_1}(j) \ge \frac{N-1}{N}(a-\varepsilon) + \frac{1}{N}b = a-\varepsilon + \frac{1}{N}(\Delta + \varepsilon) = a - \frac{\Delta}{(2N+100)^{2N}} + \frac{1}{N}\left(\Delta + \frac{\Delta}{(2N+100)^{2N}}\right) > a + \frac{\Delta}{2N}$, i.e., $\theta_{i_{l,1}}^{t_1}(j) \in (a,b]$ meaning that $i_{l,1} \not\in J_{\text{left}}^{t_1}$. The last part of the proof shows that for any $t\ge t_1$, the peer $i_{l,1}$ will never be the part of $J_{\text{left}}^t$ and after a finite number of iterations $J_{\text{left}}^t = \varnothing$ with probability $\delta_{\varepsilon} > 0$ when $E_{\varepsilon,0}'$ holds, implying the contradiction with \eqref{eq:quality_of_avg_deterministic_vectors_0_supp_tech_2}. + + To show that, we consider the following set of peers: $\widehat{J}_{\text{left}}^{t_1} = \{i\in\{1,\ldots,n\}\mid \exists t \ge t_1:\; \theta_i^{t}(j)\in [a-\varepsilon, a+\frac{\Delta}{2N})\}$. Next, we consider the event $E_{\varepsilon,1}'\subseteq E_{\varepsilon,0}'$ such that in $E_{\varepsilon,1}'$ peer $i_{l,1}$ computes an average in the group containing some peer $i_{l,avg,1}$ from $\widehat{J}_{\text{left}}^{t_1}$ at some iteration $t_2 > t_1$ (and $t_2$ is the first such moment after $t_1$). Again, our observations imply $\PP\{E_{\varepsilon,1}'\} = \PP\{E_{\varepsilon,0}'\} = \delta_{\varepsilon}>0$. Then, $\theta_{i_{l,1}}^{t_2}(j) = \theta_{i_{l,avg,1}}^{t_2}(j) > \frac{N-1}{N}(a-\varepsilon) + \frac{1}{N}\left(a+\frac{\Delta}{2N}\right) = a + \frac{\Delta}{2N^2} - \frac{(N-1)\Delta}{N(2N+100)^{2N}} > a + \frac{\Delta}{4N^2}$. After that, we consider the event $E_{\varepsilon,2}'\subseteq E_{\varepsilon,1}'$ such that in $E_{\varepsilon,2}'$ peer $i_{l,1}$ or $i_{l,avg,1}$ computes an average in the group containing a peer $i_{l,avg,2}\neq i_{l,avg,1}$ from $\widehat{J}_{\text{left}}^{t_1}$ at an iteration $t_3 > t_2$ (and $t_3$ is the first such moment after $t_2$). Then, $\theta_{i_{l,1}}^{t_3}(j), \theta_{i_{l,avg,1}}^{t_3}(j)$ and $\theta_{i_{l,avg,2}}^{t_3}(j)$ are greater than $\frac{N-1}{N}(a-\varepsilon) + \frac{1}{N}\left(a + \frac{\Delta}{4N^2}\right) = a + \frac{\Delta}{4N^3} - \frac{(N-1)\Delta}{N(2N+100)^{2N}} > a + \frac{\Delta}{8N^3}$. + + Therefore, after at least $N-1$ of such averaging iterations, with probability $\delta_\varepsilon$ all $\theta_i^t(j)$ will be greater than $a + \frac{\Delta}{(2N)^N} > a$ while $E_{\varepsilon}'$ holds. 
This contradicts \eqref{eq:quality_of_avg_deterministic_vectors_0_supp_tech_2}. Therefore, + \begin{equation*} + \bigcap\limits_{t=0}^\infty I_{j,t} = \{\overline{\theta}(j)\} + \end{equation*} + with probability $1$, which concludes the proof. +\end{proof} + + +\subsection{Proof of Theorem~\ref{thm:quality_of_avg_deterministic_vectors}}\label{sec:proof_quality_of_avg_deterministic_vectors} +In this section, we provide the complete proof of Theorem~\ref{thm:quality_of_avg_deterministic_vectors}. For convenience, we restate the theorem below. +\begin{theorem}[Theorem~\ref{thm:quality_of_avg_deterministic_vectors}, averaging convergence rate]\label{thm:quality_of_avg_deterministic_vectors_supp} + Consider the modification of Moshpit All-Reduce that works as follows: at each iteration $k\geq 1$ 1) peers are randomly split into $r$ disjoint groups of sizes $M_1^k,\ldots, M_r^k$ in such a way that $\sum_{i=1}^r M_i^k = N$ and $M_i^k \ge 1\ \forall i = 1,\ldots,r$ and 2) peers from each group compute their group average via All-Reduce. Let $\theta_1,\ldots,\theta_N$ be the input vectors of this procedure and $\theta_1^T,\ldots,\theta_N^T$ be the outputs after $T$ iterations. Then, + \begin{eqnarray} + \EE\left[\frac{1}{N}\sum\limits_{i=1}^N\|\theta_i^T - \overline{\theta}\|^2\right] = \left(\frac{r-1}{N} + \frac{r}{N^2}\right)^T\cdot\frac{1}{N}\sum\limits_{i=1}^N\|\theta_i - \overline{\theta}\|^2, \label{eq:determ_quality_of_avg_supp} + \end{eqnarray} + where $\overline{\theta} = \frac{1}{N}\sum_{i=1}^N\theta_i$. +\end{theorem} +\begin{proof} +First of all, let us clarify the procedure of random splitting of peers in $r$ groups. We assume that at iteration $k$ of the modified algorithm we generate a random permutation $\pi^k = (\pi_1^k,\ldots,\pi_N^k)$ of $1,\ldots, N$. Next, $J_1^k = \{\pi_1^k,\ldots,\pi_{M_1^k}^k\}$ form the indices of the first group of workers, $J_2^k = \{\pi_{M_1^k+1}^k,\ldots,\pi_{M_2^k}^k\}$ are the indices of the second group, and $J_r^k = \{\pi_{M_1^k+M_2^k+\ldots+M_{r-1}^k+1}^k,\ldots,\pi_{N}^k\}$ are the indices of group $r$. In other words, we generate a random permutation and take contiguous subgroups of indices corresponding to predefined group sizes $M_i^k$, starting from the first group. + +By definition, we have $\bigsqcup_{i=1}^r J_i^k = \{1,2,\ldots,N\}$, where $\sqcup$ defines the disjoint union operator. Moreover, notice that group sizes $M_1^k,\ldots,M_r^k$ can depend on $k$ and even be random: for our analysis, it is sufficient that the randomness defining the permutation is independent from $M_1^k,\ldots,M_r^k$. Next, vectors $\theta_1^k,\ldots,\theta_N^k$ are obtained by the following formula: +\begin{equation*} + \forall j=1,\ldots,N,\quad \theta_j^k = \frac{1}{M_i^k}\sum\limits_{t\in J_i^k}\theta_t^{k-1},\quad \text{where } J_i^k \text{ is the group for which } j\in J_i^k. +\end{equation*} +Using this, we show that the average of vectors $\{\theta_i^k\}_{i=1}^n$ remains the same throughout the iterations of Moshpit All-Reduce: +\begin{equation*} + \frac{1}{N}\sum\limits_{j=1}^N\theta_j^k = \frac{1}{N}\sum\limits_{i=1}^rM_i^k\cdot\frac{1}{M_i^k}\sum\limits_{t\in J_i^k}\theta_t^{k-1} = \frac{1}{N}\sum\limits_{i=1}^r\sum\limits_{t\in J_i^k}\theta_t^{k-1} = \frac{1}{N}\sum\limits_{j=1}^N\theta_j^{k-1}. +\end{equation*} +Therefore, the quantity $\frac{1}{N}\sum_{j=1}^N\|\theta_j^k - \overline{\theta}\|^2$ (average distortion) measures the quality of averaging. 
For this quantity, we can derive the following expression: +\begin{eqnarray} + \frac{1}{N}\sum\limits_{j=1}^N\|\theta_j^k - \overline{\theta}\|^2 &=& \frac{1}{N}\sum\limits_{i=1}^r M_i^k\left\|\frac{1}{M_i^k}\sum\limits_{t\in J_i^k}\theta_t^{k-1} - \overline{\theta}\right\|^2\notag\\ + &=& \frac{1}{N}\sum\limits_{i=1}^r\frac{1}{M_i^k}\left(\sum\limits_{t\in J_i^k}\|\theta_t^{k-1} - \overline{\theta}\|^2 + 2\sum\limits_{t,l\in J_i^k, t < l}\langle \theta_t^{k-1} - \overline{\theta}, \theta_l^{k-1} - \overline{\theta} \rangle\right).\notag +\end{eqnarray} +Taking the expectation $\EE_{\pi^k}[\cdot]$ with respect to the randomness coming from the choice of $\pi^k$ we get +\begin{eqnarray} + \EE_{\pi^k}\left[\frac{1}{N}\sum\limits_{j=1}^N\|\theta_j^k - \overline{\theta}\|^2\right] &\notag\\ + &\hspace{-2.5cm}= \frac{1}{N}\sum\limits_{i=1}^r\frac{1}{M_i^k}\left(\EE_{\pi^k}\left[\sum\limits_{t\in J_i^k}\|\theta_t^{k-1} - \overline{\theta}\|^2\!\right] \!+\! 2\EE_{\pi^k}\!\left[\sum\limits_{t,l\in J_i^k, t < l}\langle \theta_t^{k-1} - \overline{\theta}, \theta_l^{k-1} - \overline{\theta} \rangle\right]\right).\notag +\end{eqnarray} +Since $\forall j,j_1,j_2 \in\{1,\ldots,N\},j_1\neq j_2$ and for all $i=1,\ldots,r$ +\begin{equation*} + \PP\left\{j\in J_i^k\right\} = \frac{M_i^k}{N},\quad \PP\left\{j_1,j_2 \in J_i^k\right\} = \frac{M_{i}^k(M_i^k - 1)}{N^2}, +\end{equation*} +we have +\begin{eqnarray*} + \EE_{\pi^k}\left[\frac{1}{N}\sum\limits_{j=1}^N\|\theta_j^k - \overline{\theta}\|^2\right] &=& \frac{1}{N}\sum\limits_{i=1}^r\frac{1}{N}\sum\limits_{j=1}^N\|\theta_j^{k-1} - \overline{\theta}\|^2\\ + &&\quad +\frac{1}{N}\sum\limits_{i=1}^r2\frac{M_i^k - 1}{N^2}\sum\limits_{1 \le j_1 < j_2 \le N}\langle \theta_{j_1}^{k-1} - \overline{\theta}, \theta_{j_2}^{k-1} - \overline{\theta}\rangle\\ + &=& \frac{r}{N^2}\sum\limits_{j=1}^N\|\theta_j^{k-1} - \overline{\theta}\|^2 + 2\frac{N-r}{N^3}\sum\limits_{1 \le j_1 < j_2 \le N}\langle \theta_{j_1}^{k-1} - \overline{\theta}, \theta_{j_2}^{k-1} - \overline{\theta}\rangle\\ + &=& \left(\frac{r}{N^2} - \frac{N-r}{N^3}\right)\sum\limits_{j=1}^N\|\theta_j^{k-1} - \overline{\theta}\|^2 +\frac{N-r}{N^3}\sum\limits_{j=1}^N\|\theta_j^{k-1} - \overline{\theta}\|^2\\ + &&\quad +2\frac{N-r}{N^3}\sum\limits_{1 \le j_1 < j_2 \le N}\langle \theta_{j_1}^{k-1} - \overline{\theta}, \theta_{j_2}^{k-1} - \overline{\theta}\rangle\\ + &=& \frac{N(r-1)+r}{N^3}\sum\limits_{j=1}^N\|\theta_j^{k-1} - \overline{\theta}\|^2 + \frac{N-r}{N^3}\underbrace{\left\|\sum\limits_{j=1}^N(\theta_j^{k-1} - \overline{\theta})\right\|^2}_{\|N\overline{\theta} - N\overline{\theta}\|^2 = 0}\\ + &=& \left(\frac{r-1}{N} + \frac{r}{N^2}\right)\cdot\frac{1}{N}\sum\limits_{j=1}^N\|\theta_j^{k-1} - \overline{\theta}\|^2. +\end{eqnarray*} +Finally, we take the full expectation from the both sides of the above equation and apply the tower property $\EE\left[\EE_{\pi^k}\left[\cdot\right]\right] = \EE\left[\cdot\right]$: +\begin{equation*} + \EE\left[\frac{1}{N}\sum\limits_{j=1}^N\|\theta_j^k - \overline{\theta}\|^2\right] = \left(\frac{r-1}{N} + \frac{r}{N^2}\right)\EE\left[\frac{1}{N}\sum\limits_{j=1}^N\|\theta_j^{k-1} - \overline{\theta}\|^2\right]. +\end{equation*} +Unrolling the recurrence for $k=T$, we establish \eqref{eq:determ_quality_of_avg_supp}. +\end{proof} + +\begin{remark} + The result implies that increasing the group size $\alpha > 1$ times implies almost $\alpha$ times faster convergence to the average. 
+\end{remark} + +\begin{remark} + Our analysis can be easily generalized to the case when number of groups $r$ can depend on $k$ and be a random variable independent from the choice of permutations and the number of groups at previous steps. In this case, \eqref{eq:determ_quality_of_avg_supp} transforms into + \begin{equation} + \EE\left[\frac{1}{N}\sum\limits_{i=1}^N\|\theta_i^T - \overline{\theta}\|^2\right] = \frac{1}{N}\sum\limits_{i=1}^N\|\theta_i - \overline{\theta}\|^2\cdot\prod_{k=1}^T\left(\frac{\EE[r_k]-1}{N} + \frac{\EE[r_k]}{N^2}\right), \label{eq:determ_quality_of_avg_generalized_supp} + \end{equation} + where $r_k$ is the number of groups at iteration $k$. +\end{remark} + +\subsection{Additional Guarantees For Moshpit Averaging}\label{sec:mix_rand_proof} +In this section, we derive the result measuring the rate of variance reduction when averaging random vectors with Algorithm~\ref{alg:moshpit}. We start with the following technical lemma: +\begin{lemma}\label{lem:ode_lemma} + Let $\xi \sim \text{Binom}(M,p)$ have a binomial distribution with parameters $M$ (number of trials) and $p$ (probability of success for each trial). Then + \begin{eqnarray} + m_1(M,p) := \EE\left[\min\left\{\frac{1}{\xi},1\right\}\right] &=& (1-p)^M + \sum\limits_{i=1}^M\frac{(1-p)^{M-i} - (1-p)^M}{i}, \label{eq:binom_first_inverse_moment}\\ + m_2(M,p) := \EE\left[\min\left\{\frac{1}{\xi^2},1\right\}\right] &=& (1-p)^M + \sum\limits_{i=1}^M\frac{(1-p)^{M-i} - (1-p)^M}{i}\sum\limits_{j=i}^M\frac{1}{j}. \label{eq:binom_second_inverse_moment} + \end{eqnarray} +\end{lemma} +\begin{proof} + We start with the proof of \eqref{eq:binom_first_inverse_moment}. By definition of the expectation, we have + \begin{eqnarray*} + \EE\left[\min\left\{\frac{1}{\xi},1\right\}\right] &=& (1-p)^M + \sum\limits_{i=1}^M \frac{1}{i}p^i(1-p)^{M-i}\binom{M}{i}. + \end{eqnarray*} + For simplicity of further derivations, we introduce the following notation: $m_1(M,p) = \EE\left[\min\left\{\frac{1}{\xi},1\right\}\right]$ and $m_2(M,p) = \EE\left[\min\left\{\frac{1}{\xi^2},1\right\}\right]$. Taking the derivative of $m_1(M,p)$ by $p$, we obtain + \begin{eqnarray*} + m_1'(M,p) &=& -M(1-p)^{M-1} + \sum\limits_{i=1}^Mp^{i-1}(1-p)^{M-i}\binom{M}{i} \\ + &&\quad - \sum\limits_{i=1}^M\frac{M-i}{i}p^i(1-p)^{M-i-1}\binom{M}{i}\\ + &=& -M(1-p)^{M-1} + \frac{1}{p}\left(-(1-p)^M + \sum\limits_{i=0}^Mp^{i}(1-p)^{M-i}\binom{M}{i}\right)\\ + && - \frac{M}{1-p}\sum\limits_{i=1}^M\frac{1}{i}p^i(1-p)^{M-i}\binom{M}{i}\\ + &&\quad + \frac{1}{1-p}\left(-(1-p)^M + \sum\limits_{i=0}^Mp^i(1-p)^{M-i}\binom{M}{i}\right)\\ + &=& -M(1-p)^{M-1} + \frac{1}{p}\left(1 - (1-p)^M\right) - \frac{M}{1-p}\left(m_1(M,p) - (1-p)^M\right)\\ + &&\quad+ \frac{1}{1-p}\left(1- (1-p)^M\right)\\ + &=& \frac{1}{p(1-p)} - \frac{(1-p)^{M-1}}{p} - \frac{M}{1-p}m_1(M,p). + \end{eqnarray*} + Rearranging the terms, we get the following linear first-order ODE + \begin{equation} + m_1'(M,p) + \frac{M}{1-p}m_1(M,p) = \frac{1}{p(1-p)} - \frac{(1-p)^{M-1}}{p}. \label{eq:first_moment_ODE} + \end{equation} + To solve it, we consider the following homogeneous ODE: + \begin{equation*} + m_1'(M,p) + \frac{M}{1-p}m_1(M,p) = 0. + \end{equation*} + The solution of this ODE is $m_1(M,p) = C(1-p)^M$, where $C\in\R$ is an arbitrary real constant. 
Next, we go back to the initial ODE \eqref{eq:first_moment_ODE} and try to find a solution of the form $m_1(M,p) = C(p)(1-p)^M$, where $C(p):\R \to \R$ is a differentiable function: + \begin{eqnarray*} + \left(C(p)(1-p)^M\right)' + \frac{M}{1-p}C(p)(1-p)^M &=& \frac{1}{p(1-p)} - \frac{(1-p)^{M-1}}{p}\\ + &\Downarrow&\\ + C'(p)(1-p)^M &=& \frac{1}{p(1-p)} - \frac{(1-p)^{M-1}}{p}\\ + &\Downarrow&\\ + C'(p) &=& \frac{1}{p(1-p)^{M+1}} - \frac{1}{p(1-p)}. + \end{eqnarray*} + Since + \begin{equation} + \frac{1}{x(1-x)^{k+1}} = \frac{1}{x(1-x)^{k}} + \frac{1}{(1-x)^{k+1}}\label{eq:technical_expansion} + \end{equation} + for all $x\not\in \{0,1\}$ and all non-negative integers $k$, we have + \begin{eqnarray*} + C'(p) &=& \frac{1}{p} + \frac{1}{1-p} + \frac{1}{(1-p)^2} + \ldots + \frac{1}{(1-p)^{M+1}} - \frac{1}{p} - \frac{1}{1-p}\\ + &\Downarrow&\\ + C'(p) &=& \sum\limits_{i=1}^M(1-p)^{-i-1}, + \end{eqnarray*} + hence + \begin{eqnarray*} + C(p) = \hat{C} + \sum\limits_{i=1}^M\frac{1}{i}(1-p)^{-i}, + \end{eqnarray*} + where $\hat{C}$ is a real constant. Putting all together, we obtain + \begin{eqnarray*} + m_1(M,p) &=& C(p)(1-p)^M = \hat{C}(1-p)^M + \sum\limits_{i=1}^M\frac{1}{i}(1-p)^{M-i}. + \end{eqnarray*} + Taking $m_1(M,0) = 1$ into account, we conclude that $\hat{C} = 1 - \sum_{i=1}^M\frac{1}{i}$ and obtain \eqref{eq:binom_first_inverse_moment}. + + Using a similar technique, we derive \eqref{eq:binom_second_inverse_moment}. By definition of the expectation, we have + \begin{eqnarray*} + m_2(M,p) &=& (1-p)^M + \sum\limits_{i=1}^M \frac{1}{i^2}p^i(1-p)^{M-i}\binom{M}{i}. + \end{eqnarray*} + Taking the derivative of $m_2(M,p)$ by $p$, we obtain + \begin{eqnarray*} + m_2'(M,p) &=& -M(1-p)^{M-1} + \sum\limits_{i=1}^M\frac{1}{i}p^{i-1}(1-p)^{M-i}\binom{M}{i}\\ + &&\quad - \sum\limits_{i=1}^M\frac{M-i}{i^2}p^i(1-p)^{M-i-1}\binom{M}{i}\\ + &=& -M(1-p)^{M-1} + \frac{1}{p} \sum\limits_{i=1}^M\frac{1}{i}p^{i}(1-p)^{M-i}\binom{M}{i}\\ + && - \frac{M}{1-p}\sum\limits_{i=1}^M\frac{1}{i^2}p^i(1-p)^{M-i}\binom{M}{i} + \frac{1}{1-p}\sum\limits_{i=1}^M\frac{1}{i}p^i(1-p)^{M-i}\binom{M}{i}\\ + &=& -M(1-p)^{M-1} + \frac{1}{p}\left(m_1(M,p) - (1-p)^M\right) \\ + &&\quad + \frac{1}{1-p}\left(-M m_2(M,p) + M(1-p)^M + m_1(M,p) - (1-p)^M\right)\\ + &=& \frac{m_1(M,p)}{p(1-p)} - \frac{(1-p)^{M-1}}{p} - \frac{M}{1-p}m_2(M,p). + \end{eqnarray*} + Rearranging the terms, we get the following linear first-order ODE + \begin{equation} + m_2'(M,p) + \frac{M}{1-p}m_2(M,p) = \frac{m_1(M,p)}{p(1-p)} - \frac{(1-p)^{M-1}}{p}. \label{eq:second_moment_ODE} + \end{equation} + To solve this ODE, we consider the homogeneous ODE: + \begin{equation*} + m_2'(M,p) + \frac{M}{1-p}m_2(M,p) = 0. + \end{equation*} + The solution of this ODE is $m_2(M,p) = C(1-p)^M$, where $C\in\R$ is an arbitrary real constant. Next, we go back to the initial ODE \eqref{eq:second_moment_ODE} and try to find a solution of the form $m_2(M,p) = C(p)(1-p)^M$, where $C(p):\R \to \R$ is a differentiable function: + \begin{eqnarray*} + \left(C(p)(1-p)^M\right)' + \frac{M}{1-p}C(p)(1-p)^M &=& \frac{m_1(M,p)}{p(1-p)} - \frac{(1-p)^{M-1}}{p}\\ + &\Downarrow&\\ + C'(p)(1-p)^M &=& \frac{m_1(M,p)}{p(1-p)} - \frac{(1-p)^{M-1}}{p}\\ + &\Downarrow&\\ + C'(p) &=& \frac{m_1(M,p)}{p(1-p)^{M+1}} - \frac{1}{p(1-p)}. 
+ \end{eqnarray*} + Using \eqref{eq:technical_expansion} and \eqref{eq:binom_first_inverse_moment}, we derive + \begin{eqnarray*} + C'(p) &\overset{\eqref{eq:binom_first_inverse_moment}}{=}& -\frac{\sum\limits_{i=1}^M\frac{1}{i}}{p(1-p)} + \frac{\sum\limits_{i=1}^M\frac{1}{i}(1-p)^{M-i}}{p(1-p)^{M+1}}\\ + &=& -\sum\limits_{i=1}^M \frac{1}{ip(1-p)} + \sum\limits_{i=1}^M\frac{1}{ip(1-p)^{i+1}}\\ + &\overset{\eqref{eq:technical_expansion}}{=}& -\sum\limits_{i=1}^M\frac{1}{i}\left(\frac{1}{p} + \frac{1}{1-p}\right)\\ + &&\quad + \sum\limits_{i=1}^M\frac{1}{i}\left(\frac{1}{p} + \frac{1}{1-p} + \frac{1}{(1-p)^2} + \ldots + \frac{1}{(1-p)^{i+1}}\right)\\ + &=& \sum\limits_{i=1}^M\frac{1}{i}\left(\frac{1}{(1-p)^2} + \ldots + \frac{1}{(1-p)^{i+1}}\right) = \sum\limits_{i=1}^M \frac{1}{(1-p)^{i+1}}\sum\limits_{j=i}^M\frac{1}{j}, + \end{eqnarray*} + hence + \begin{eqnarray*} + C(p) = \hat{C} + \sum\limits_{i=1}^M\frac{1}{i}(1-p)^{-i}\sum\limits_{j=i}^M\frac{1}{j}, + \end{eqnarray*} + where $\hat{C}$ is a real constant. Putting all together, we obtain + \begin{eqnarray*} + m_2(M,p) &=& C(p)(1-p)^M = \hat{C}(1-p)^M + \sum\limits_{i=1}^M\frac{1}{i}(1-p)^{M-i}\sum\limits_{j=i}^M\frac{1}{j}. + \end{eqnarray*} + Taking $m_2(M,0) = 1$ into account, we conclude that $\hat{C} = 1 - \sum_{i=1}^M\frac{1}{i}\sum_{j=i}^M\frac{1}{j}$ and obtain \eqref{eq:binom_second_inverse_moment}. +\end{proof} + +Using this lemma, we derive the following result: +\begin{theorem}\label{thm:quality_of_avg_supp} + Assume that peers participating in Moshpit Averaging have independent random vectors $\theta_1,\ldots,\theta_N$ with means $\overline{\theta}_1,\ldots,\overline{\theta}_N$ and variances bounded by $\sigma^2$ before the averaging. Let $\theta_1^T,\ldots,\theta_N^T$ be the outputs of Moshpit Averaging after $T$ iterations. Finally, we assume that each peer from the grid can be dropped out for the whole averaging process before averaging independently from other peers, i.e., $N \sim \text{Binom}(M^d,p)$. Then, for all $i = 1,\ldots,N$ we have + \begin{equation} + \EE\left[\left\|\theta_i^T - \EE_{\theta}\left[\theta_i^T\right]\right\|^2\right] \leq M^{T-1}\sigma^2 m_1(M-1,p)\left(m_2(M-1,p)\right)^{T-1},\label{eq:variance_bound_supp} + \end{equation} + where functions $m_1(M,p)$ and $m_2(M,p)$ are defined in \eqref{eq:binom_first_inverse_moment} and \eqref{eq:binom_second_inverse_moment} respectively, and $\EE_\theta\left[\cdot\right]$ denotes the expectation w.r.t.\ the randomness from $\theta_1,\ldots,\theta_N$. Moreover, if $p \ge \frac{2}{3}$ and $M \ge 11$, then $m_1(M-1,p) \le \frac{2}{M}$, $m_2(M-1,p) \le \frac{3}{M^2}$ and + \begin{equation} + \EE\left[\left\|\theta_i^T - \EE_{\theta}\left[\theta_i^T\right]\right\|^2\right] \leq \frac{2\sigma^2}{M(\nicefrac{M}{3})^{T-1}}.\label{eq:variance_bound_2_supp} + \end{equation} +\end{theorem} +\begin{proof} +First of all, we recall an equivalent formulation of Moshpit Averaging. Consider a hypercube $\{1,\ldots,M\}^d$. One can consider the elements of this hypercube as hyperindices and assign a unique hyperindex to each peer so that peers can be viewed as vertices in the hypercube. Then, during the $k$-th iteration of Moshpit All-Reduce, each worker computes the average among those peers that have hyperindices with the same values except the $k$-th index; in other words, peers compute averages along the $k$-th dimension of the hypercube. Next, if $N = 0$, we assume that $\theta_i^T = \EE_{\theta}\left[\theta_i^T\right]$ and \eqref{eq:variance_bound_supp} holds for free. 
Therefore, to derive \eqref{eq:variance_bound_supp}, we assume that $N > 0$.
+
+More formally, we use the following notation: $\theta_{C_i} = \theta_i$ for all $i= 1,\ldots,N$, where $C_{i} = (c_{1}^i, c_2^i,\ldots, c_d^i)$, $c_{j}^i \in \{1,\ldots,M\}$ for all $j = 1,\ldots,d$, and $C_{i} \neq C_k$ for $i\neq k$. Let $\cC$ be the set of hyperindices corresponding to all peers. Next, we use $\theta_{C_i}^t$ to denote the vector stored on the $i$-th peer after $t$ iterations of Moshpit Averaging. Then, for all $i = 1,\ldots,N$ we have $\theta_{C_i}^0 = \theta_{C_i}$ and for all $t = 1,\ldots,d$
+\begin{equation*}
+    \theta_{C_i}^{t} = \frac{1}{b_{i,t}}\sum\limits_{k\in J_{i,t}}\theta_{C_k}^{t-1},
+\end{equation*}
+where $J_{i,t} = \{k \in \{1,\ldots,N\}\mid C_k = (c_1^k,\ldots,c_d^k) \in \cC \text{ and } c_j^k = c_j^i\; \forall j \neq t\}$ and $b_{i,t} = |J_{i,t}|$. Using this, we derive the following formula for $\theta_{C_i}^T$:
+\begin{equation*}
+    \theta_i^T \equiv \theta_{C_i}^T = \frac{1}{b_{i,T}}\sum\limits_{i_1\in J_{i,T}}\frac{1}{b_{i_1,T-1}}\sum\limits_{i_2\in J_{i_1,T-1}}\frac{1}{b_{i_2,T-2}}\sum\limits_{i_3\in J_{i_2,T-2}}\ldots\frac{1}{b_{i_{T-1},1}}\sum\limits_{i_T\in J_{i_{T-1},1}}\theta_{i_{T}}.
+\end{equation*}
+Taking the expectation w.r.t. $\theta_1,\ldots,\theta_N$, we get
+\begin{equation*}
+    \EE_{\theta}\left[\theta_i^T\right] = \frac{1}{b_{i,T}}\sum\limits_{i_1\in J_{i,T}}\frac{1}{b_{i_1,T-1}}\sum\limits_{i_2\in J_{i_1,T-1}}\frac{1}{b_{i_2,T-2}}\sum\limits_{i_3\in J_{i_2,T-2}}\ldots\frac{1}{b_{i_{T-1},1}}\sum\limits_{i_T\in J_{i_{T-1},1}}\overline{\theta}_{i_{T}}.
+\end{equation*}
+Using the independence of $\theta_1,\ldots,\theta_N$, we derive
+\begin{eqnarray*}
+    \EE_\theta\left[\left\|\theta_i^T - \EE_{\theta}\left[\theta_i^T\right]\right\|^2\right] &=& \EE_\theta\left[\left\|\sum\limits_{i_1\in J_{i,T}}\sum\limits_{i_2\in J_{i_1,T-1}}\ldots \sum\limits_{i_{T}\in J_{i_{T-1},1}}\frac{\theta_{i_T} - \overline{\theta}_{i_T}}{b_{i,T} b_{i_1,T-1}\ldots b_{i_{T-1},1}}\right\|^2\right]\\
+    &=& \sum\limits_{i_1\in J_{i,T}}\sum\limits_{i_2\in J_{i_1,T-1}}\ldots \sum\limits_{i_{T}\in J_{i_{T-1},1}}\frac{\EE_\theta\left[\|\theta_{i_T} - \overline{\theta}_{i_T}\|^2\right]}{b_{i,T}^2 b_{i_1,T-1}^2\ldots b_{i_{T-1},1}^2}\\
+    &\le& \sum\limits_{i_1\in J_{i,T}}\sum\limits_{i_2\in J_{i_1,T-1}}\ldots \sum\limits_{i_{T}\in J_{i_{T-1},1}}\frac{\sigma^2}{b_{i,T}^2 b_{i_1,T-1}^2\ldots b_{i_{T-1},1}^2}\\
+    &=& \sum\limits_{i_1\in J_{i,T}}\sum\limits_{i_2\in J_{i_1,T-1}}\ldots \sum\limits_{i_{T-1}\in J_{i_{T-2},2}}\frac{\sigma^2}{b_{i,T}^2 b_{i_1,T-1}^2\ldots b_{i_{T-2},2}^2b_{i_{T-1},1}}.
+\end{eqnarray*}
+Next, taking the full expectation of both sides of the previous inequality and using the tower property, we obtain
+\begin{equation}
+    \EE\!\left[\!\left\|\theta_i^T - \EE_{\theta}\left[\theta_i^T\right]\right\|^2\!\right] \!\le\! \EE\!\left[\!\sum\limits_{i_1\in J_{i,T}}\sum\limits_{i_2\in J_{i_1,T-1}}\ldots \sum\limits_{i_{T-1}\in J_{i_{T-2},2}}\frac{\sigma^2}{b_{i,T}^2 b_{i_1,T-1}^2\ldots b_{i_{T-2},2}^2b_{i_{T-1},1}}\!\right]\!. \label{eq:rand_mix_thm_technical_1}
+\end{equation}
+Notice that $J_{i_k,T-k} \cap J_{i_{k+1},T-k-1} = \{i_{k+1}\}$ for all $k=0,\ldots,T-1$, where $i_0 = i$. Moreover, for $k_1, k_2 \in\{0,1,\ldots,T\}$, $k_1 < k_2$, either $J_{i_{k_1},T-k_1} \cap J_{i_{k_2},T-k_2} = \{i_{k_2}\}$ or $J_{i_{k_1},T-k_1} \cap J_{i_{k_2},T-k_2} = \varnothing$. The first situation is possible iff $i_{k_1} = i_{k_1+1} = \ldots = i_{k_2-1}$.
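Aside (illustrative, not part of the proof): the hypercube formulation introduced at the start of this proof is easy to simulate directly. The sketch below builds an $M^d$ grid of peers with no dropout, averages along one dimension per iteration exactly as described above, and confirms that after $d$ iterations every peer holds the global average. All sizes are arbitrary. The counting argument resumes right after this aside.

#+begin_src python
# Illustration of the grid ("hypercube") formulation of Moshpit All-Reduce:
# peers carry hyperindices in {0,...,M-1}^d and, at iteration t, each peer averages
# over the peers that share all coordinates except the t-th one. With a full grid
# (no dropout), d iterations leave every peer with the exact global average.
import itertools
import numpy as np

M, d, dim = 4, 3, 5          # grid side, grid dimension, parameter dimension (arbitrary)
rng = np.random.default_rng(0)

theta = {c: rng.normal(size=dim) for c in itertools.product(range(M), repeat=d)}
global_avg = np.mean(list(theta.values()), axis=0)

for t in range(d):
    new_theta = {}
    for c in theta:
        # group J_{c,t}: peers matching c in every coordinate except coordinate t
        group = [c[:t] + (m,) + c[t + 1:] for m in range(M)]
        new_theta[c] = np.mean([theta[g] for g in group], axis=0)
    theta = new_theta

assert all(np.allclose(v, global_avg) for v in theta.values())
print("after d iterations every peer holds the global average")
#+end_src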
+ +Taking these observations about sets $J_{i_{k}, T-k}$ into account, we consider the sets $J_{i_k,T-k}' = J_{i_k,T-k}\setminus\{i_{k}\}$ for $k = 0, 1, \ldots, T-1$. These sets are pairwise disjoint and their cardinalities $b_{i_k,T-k}' = |J_{i_k,T-k}'|$ satisfy the following relations: $b_{i_k,T-k} = 1 + b_{i_k,T-k}' \ge \max\{1, b_{i_k,T-k}'\} =: \hat{b}_{i_k,T-k}$ for $k = 1, 2, \ldots, T-1$. Moreover, $b_{i,T}', b_{i_1,T-1}',\ldots, b_{i_{T-1},1}'$ are independent random variables from the binomial distribution $\text{Binom}(M-1, p)$. Finally, we notice that the number of terms in \eqref{eq:rand_mix_thm_technical_1} is upper-bounded by $M^{T-1}$, since $|J_{i,t}| \le M$ for all $i = 1,\ldots,N$ and $t=0,\ldots,T$. + +Putting all together, we obtain +\begin{eqnarray*} + \EE\left[\left\|\theta_i^T - \EE_{\theta}\left[\theta_i^T\right]\right\|^2\right] &\le& \EE\left[\sum\limits_{i_1\in J_{i,T}}\sum\limits_{i_2\in J_{i_1,T-1}}\ldots \sum\limits_{i_{T-1}\in J_{i_{T-2},2}}\frac{\sigma^2}{\hat b_{i,T}^2 \hat b_{i_1,T-1}^2\ldots \hat b_{i_{T-2},2}^2\hat b_{i_{T-1},1}}\right]\\ + &\le& M^{T-1}\sigma^2\EE\left[\frac{1}{\hat\xi_{1}^2 \hat\xi_{2}^2\ldots \hat\xi_{T-1}^2\hat\xi_{T}}\right]\\ + &=& M^{T-1}\sigma^2\EE\left[\frac{1}{\hat\xi_{1}^2}\right]\EE\left[\frac{1}{\hat\xi_{2}^2}\right]\ldots \EE\left[\frac{1}{\hat\xi_{T-1}^2}\right]\EE\left[\frac{1}{\hat\xi_{T}}\right], +\end{eqnarray*} +where $\hat \xi_k^2 = \max\{1,\xi_1^2\}$ for $k=1,\ldots,T$ and $\xi_1,\ldots,\xi_T$ are i.i.d.\ random variables having the binomial distribution $\text{Binom}(M-1, p)$. Then one can simplify the inequality above using Lemma~\ref{lem:ode_lemma} and get +\begin{eqnarray*} + \EE\left[\left\|\theta_i^T - \EE_{\theta}\left[\theta_i^T\right]\right\|^2\right] &\le& M^{T-1}\sigma^2 m_1(M-1,p)\left(m_2(M-1,p)\right)^{T-1}, +\end{eqnarray*} +where functions $m_1(M,p)$ and $m_2(M,p)$ are defined in \eqref{eq:binom_first_inverse_moment} and \eqref{eq:binom_second_inverse_moment} respectively. + +Next, we simplify the obtained upper bound under the assumption that $M$ and $p$ are not too small; specifically, $M\ge 11$ and $p\ge \nicefrac{2}{3}$. From \eqref{eq:binom_first_inverse_moment}, we have +\begin{eqnarray*} + m_1(M-1,p) &=& (1-p)^{M-1} + \sum\limits_{i=1}^{M-1}\frac{1}{i}\left((1-p)^{M-1-i} - (1-p)^{M-1}\right)\\ + &\le& (1-p)^{M-1}\sum\limits_{i=1}^{M-1}\frac{1}{i(1-p)^{i}}. +\end{eqnarray*} +Since +\begin{equation*} + \frac{1}{(k+1)(1-p)^{k+1}}\cdot\frac{k(1-p)^k}{1} = \frac{k}{(k+1)(1-p)} \xrightarrow[k\to\infty]{}\frac{1}{1-p} \ge 3, +\end{equation*} +we have +\begin{equation*} + (1-p)^{M-1}\sum\limits_{i=1}^{M-1}\frac{1}{i(1-p)^{i}} = \Theta\left((1-p)^M\cdot\frac{1}{M(1-p)^M}\right) = \Theta\left(\frac{1}{M}\right). +\end{equation*} +Using simple algebra, one can prove that for $M\ge 11$ and $p \ge\nicefrac{2}{3}$ the following inequality holds: +\begin{equation*} + m_1(M-1,p)\le (1-p)^{M-1}\sum\limits_{i=1}^{M-1}\frac{1}{i(1-p)^{i}} \le \frac{2}{M}. +\end{equation*} +Similarly, we analyze $m_2(M-1, p)$: +\begin{eqnarray*} + m_2(M-1,p) &=& (1-p)^{M-1} + \sum\limits_{i=1}^{M-1}\frac{1}{i}\left((1-p)^{M-1-i} - (1-p)^{M-1}\right)\sum\limits_{j=i}^{M-1}\frac{1}{j}\\ + &\le& (1-p)^{M-1}\sum\limits_{i=1}^{M-1}\frac{1}{i(1-p)^i}\sum\limits_{j=i}^{M-1}\frac{1}{j}. 
+\end{eqnarray*} +Since +\begin{eqnarray*} + \frac{\frac{1}{k(1-p)^k}\sum\limits_{j=k}^{M-1}\frac{1}{j}}{\frac{1}{(k-1)(1-p)^{k-1}}\sum\limits_{j=k-1}^{M-1}\frac{1}{j}} &=& \frac{(k-1)\sum\limits_{j=k}^{M-1}\frac{1}{j}}{k(1-p)\left(\frac{1}{k-1} + \sum\limits_{j=k}^{M-1}\frac{1}{j}\right)} \ge \frac{3(k-1)\cdot\frac{1}{k}}{k\left(\frac{1}{k-1}+\frac{1}{k}\right)}\\ + &=& \frac{3(k-1)^2}{k(2k-1)}\xrightarrow[k\to\infty]{} \frac{3}{2}, +\end{eqnarray*} +we have +\begin{equation*} + (1-p)^{M-1}\sum\limits_{i=1}^{M-1}\frac{1}{i(1-p)^i}\sum\limits_{j=i}^{M-1}\frac{1}{j} = \Theta\left((1-p)^M\cdot\frac{1}{M^2(1-p)^M}\right) = \Theta\left(\frac{1}{M^2}\right). +\end{equation*} +Next, one can prove with simple algebra that for $M\ge 11$ and $p \ge\nicefrac{2}{3}$ the following inequality holds: +\begin{equation*} + m_2(M-1,p) \le (1-p)^{M-1}\sum\limits_{i=1}^{M-1}\frac{1}{i(1-p)^i}\sum\limits_{j=i}^{M-1}\frac{1}{j} \le \frac{3}{M^2}. +\end{equation*} +Plugging the obtained upper bounds for $m_1(M-1,p)$ and $m_2(M-1,p)$ in \eqref{eq:variance_bound_supp}, we obtain \eqref{eq:variance_bound_2_supp}. +\end{proof} diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/proofs_opt.tex b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/proofs_opt.tex new file mode 100644 index 00000000..9d397299 --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/proofs_opt.tex @@ -0,0 +1,484 @@ +\section{Convergence Proofs of Moshpit SGD}\label{sect:missing_proofs_local_sgd} +In this section, we provide the complete statements of the theorems establishing the convergence of Moshpit SGD together with the full proofs. First, we introduce all necessary definitions, basic inequalities and auxiliary lemmas; then we prove the convergence in strongly convex and convex cases; lastly, we provide the proofs for the non-convex case. + +\subsection{Definitions, Basic Facts and Auxiliary Results}\label{sect:basic_facts} + + +Below we provide several classical definitions and results which are used in our proofs. + +\subsubsection{Standard Definitions from Optimization Theory} + +\begin{definition}[$L$-smoothness]\label{def:L_smoothness} +A function $f:\R^n \to \R$ is called $L$-smooth if for all $x,y\in \R^n$, the following inequality holds: +\begin{equation} + \|\nabla f(x) - \nabla f(y)\| \le L\|x-y\|.\label{eq:L_smoothness_def} +\end{equation} +\end{definition} +If the function $f$ is $L$-smooth, then for all $x,y\in\R^n$ +\begin{equation} + f(y) \le f(x) + \langle\nabla f(x), y-x \rangle + \frac{L}{2}\|y-x\|^2. \label{eq:L_smoothness_cor} +\end{equation} +Next, if $f$ is additionally convex and $x^*$ is its minimizer, then for all $x\in\R^d$ +\begin{equation} + \|\nabla f(x)\|^2 \le 2L\left(f(x) - f(x^*)\right). \label{eq:L_smoothness_cor_2} +\end{equation} + + +\begin{definition}[$\mu$-strong convexity]\label{def:str_cvx} + A differentiable function $f:\R^n \to\R$ is called $\mu$-strongly convex if there exists a constant $\mu \ge 0$ such that for all $x,y\in \R^n$ + \begin{equation} + f(y) \ge f(x) + \langle\nabla f(x), y-x \rangle + \frac{\mu}{2}\|y-x\|^2. 
\label{eq:str_cvx_def} + \end{equation} +\end{definition} + +\subsubsection{Basic Facts} +For all $a,b,\theta_1,\ldots,\theta_N\in\R^n$ and $\alpha > 0$, the following inequalities hold: +\begin{eqnarray} + \|a+b\|^2 &\le& 2\|a\|^2 + 2\|b\|^2, \label{eq:a+b}\\ + \left\|\frac{1}{N}\sum\limits_{i=1}^N\theta_i\right\|^2 &\le& \frac{1}{N}\sum\limits_{i=1}^N\|\theta_i\|^2, \label{eq:jensen_ineq}\\ + \langle a,b\rangle &\le& \frac{\|a\|^2}{2\alpha} + \frac{\alpha\|b\|^2}{2}. \label{eq:young_inequality} +\end{eqnarray} + +\subsubsection{Properties of Expectation} +\textbf{Variance decomposition.} For a random vector $\eta \in \R^d$ and any deterministic vector $x \in \R^d$, the variance satisfies +\begin{equation}\label{eq:variance_decomposition} + \EE\left[\left\|\eta - \EE\eta\right\|^2\right] = \EE\left[\|\eta-x\|^2\right] - \left\|\EE\eta - x\right\|^2 +\end{equation} + +\textbf{Tower property of expectation.} For any random variables $\xi,\eta\in \R^d$ we have +\begin{equation} + \EE\left[\xi\right] = \EE\left[\EE\left[\xi\mid \eta\right]\right]\label{eq:tower_property} +\end{equation} +under the assumption that $\EE[\xi]$ and $\EE\left[\EE\left[\xi\mid \eta\right]\right]$ are well-defined. + +\subsubsection{Auxiliary Results} +For the readers' convenience, we list all auxiliary results that we use in our proofs below. The first result is classical and establishes that the gradient descent step is a contractive operator. +\begin{lemma}[Lemma 6 from \cite{karimireddy2020scaffold}]\label{lem:gd_contraction} + For any $L$-smooth and $\mu$-strongly convex function $f:\R^n\to\R$, points $x,y\in \R^n$, and stepsize $\gamma \in (0,\nicefrac{1}{L}]$, the following inequality holds: + \begin{equation} + \|x - \gamma\nabla f(x) - y + \gamma\nabla f(y)\|^2 \le (1-\gamma\mu)\|x-y\|^2. \label{eq:gd_contraction} + \end{equation} +\end{lemma} + +The next two lemmas are useful for estimating typical recurrences appearing in the analysis. +\begin{lemma}[Lemma~I.2 from \cite{gorbunov2020local}]\label{lem:lemma_i_2_gorbunov} + Let $\{r_k\}_{k\ge 0}$ satisfy + \begin{equation*} + r_K \le \frac{a}{\gamma W_K} + c_1\gamma + c_2\gamma^2 + \end{equation*} + for all $K \ge 0$ with some constants $a,c_2 \ge 0$, $c_1 \ge 0$, where $w_k = (1-\gamma\mu(1-\delta_{pv,1}))^{-(k+1)}$, $W_K = \sum_{k=0}^Kw_k$, $\mu > 0$, $\delta_{pv,1}\in [0,1)$ and $\gamma \le \gamma_0$ for some $\gamma_0 > 0$, $\gamma_0 \le \nicefrac{1}{\mu(1-\delta_{pv,1})}$. Then, for all $K$ such that + \begin{align*} + \text{either } & \frac{\ln\left(\max\left\{2, \min\left\{\nicefrac{a\mu^2(1-\delta_{pv,1})^2K^2}{c_1},\nicefrac{a\mu^3(1-\delta_{pv,1})^3K^3}{c_2}\right\}\right\}\right)}{K} \le 1\\ + \text{or } & \gamma_0 \le \frac{\ln\left(\max\left\{2, \min\left\{\nicefrac{a\mu^2(1-\delta_{pv,1})^2K^2}{c_1},\nicefrac{a\mu^3(1-\delta_{pv,1})^3K^3}{c_2}\right\}\right\}\right)}{(1-\delta_{pv,1})\mu K} + \end{align*} + and + \begin{equation*} + \gamma = \min\left\{\gamma_0, \frac{\ln\left(\max\left\{2, \min\left\{\nicefrac{a\mu^2(1-\delta_{pv,1})^2K^2}{c_1},\nicefrac{a\mu^3(1-\delta_{pv,1})^3K^3}{c_2}\right\}\right\}\right)}{(1-\delta_{pv,1})\mu K}\right\} + \end{equation*} + we have that + \begin{equation*} + r_K = \widetilde{\cO}\left(\frac{a}{\gamma_0}\exp\left(-\gamma_0\mu(1-\delta_{pv,1})K\right) + \frac{c_1}{(1-\delta_{pv,1})\mu K} + \frac{c_2}{(1-\delta_{pv,1})^2\mu^2 K^2}\right). 
+ \end{equation*} +\end{lemma} + +\begin{lemma}[Lemma~I.3 from \cite{gorbunov2020local}]\label{lem:lemma_i_3_gorbunov} + Let $\{r_k\}_{k\ge 0}$ satisfy + \begin{equation*} + r_K \le \frac{a}{\gamma K} + c_1\gamma + c_2\gamma^2 + \end{equation*} + for all $K \ge 0$ with some constants $a,c_2 \ge 0$, $c_1 \ge 0$ where $\gamma \le \gamma_0$ for some $\gamma_0 > 0$. Then for all $K$ and + \begin{equation*} + \gamma = \min\left\{\gamma_0, \sqrt{\frac{a}{c_1 K}}, \sqrt[3]{\frac{a}{c_2 K}}\right\} + \end{equation*} + we have that + \begin{equation*} + r_K = \cO\left(\frac{a}{\gamma_0 K} + \sqrt{\frac{ac_1}{K}} + \frac{\sqrt[3]{a^2c_2}}{K^{\nicefrac{2}{3}}}\right). + \end{equation*} +\end{lemma} + +Finally, the lemma below is useful for our convergence analysis in the non-convex case. +\begin{lemma}[Lemma~I.1 from \cite{gorbunov2020local}]\label{lem:lemma_i_1_gorbunov} + For any $\tau$ random vectors $\xi_1,\ldots,\xi_\tau\in\R^d$ such that $\forall t=2,\ldots,\tau$ the random vector $\xi_t$ depends on $\xi_{1},\ldots,\xi_{t-1}$ and does not depend on $\xi_{t+1},\ldots,\xi_{\tau}$ the following inequality holds + \begin{equation} + \EE\left[\left\|\sum\limits_{t=1}^\tau\xi_t\right\|^2\right] \le e\tau\sum\limits_{t=1}^\tau\EE\left[\left\|\EE_t[\xi_{t}]\right\|^2\right] + e\sum\limits_{t=1}^\tau\EE\left[\left\|\xi_t-\EE_t[\xi_{t}]\right\|^2\right], \label{eq:lemma_i_1_gorbunov} + \end{equation} + where $\EE_t[\cdot]$ denotes the conditional expectation $\EE[\ \cdot\mid \xi_{t-1},\ldots,\xi_1]$. +\end{lemma} + +\subsection{Convex Case} +In this section, we give the full proof of Theorem~\ref{thm:cvx_convergence} about the convergence of Moshpit SGD for convex and strongly convex problems. The scheme of the proof follows the similar steps as in the state-of-the-art analysis of Local-SGD \cite{khaled2020tighter,woodworth2020local,gorbunov2020local}. We start with the following lemma: +\begin{lemma}\label{lem:key_lemma_cvx} + Let $f_1 = \ldots = f_N = f$, function $f$ be $\mu$-strongly convex (Def.~\ref{def:str_cvx}) and $L$-smooth (see Def.~\ref{def:L_smoothness}), and Assumptions~\ref{as:bounded_var}~and~\ref{as:averaging_quality} hold with $\Delta_{pv}^k = \delta_{pv,1}\gamma\mu\EE[\|\theta^k-\theta^*\|^2] + \gamma^2\delta_{pv,2}^2$ and $\widetilde{\theta} = \theta^*$, where $\theta^* \in \argmin_{\theta\in\R^n} f(\theta)$ and $\delta_{pv,1}\in [0,1)$, $\delta_{pv,2}\ge 0$. Then, for any $k \ge 0$ the iterates produced by Moshpit SGD with $\gamma \le \nicefrac{1}{4L}$ satisfy + \begin{eqnarray} + \gamma\EE\left[f(\theta^k) - f(\theta^*)\right] &\le& (1-\gamma\mu(1-\delta_{pv,1}))\EE\left[\|\theta^k - \theta^*\|^2\right] - \EE\left[\|\theta^{k+1} - \theta^*\|^2\right]\notag\\ + &&\quad+ \frac{3L\gamma}{2}\EE[V_k] + \gamma^2\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2\right),\label{eq:key_lemma_cvx} + \end{eqnarray} + where $V_k = \frac{1}{N_k}\sum_{i\in P_k}\|\theta_i^k - \theta^k\|^2$ and $\theta^k = \frac{1}{N_k}\sum_{i\in P_k}\theta_i^k$. 
+\end{lemma} +\begin{proof} +Recall that Assumption~\ref{as:averaging_quality} with $\Delta_{pv}^k = \delta_{pv,1}\gamma\mu\EE[\|\theta^k-\theta^*\|^2] + \gamma^2\delta_{pv,2}^2$ and $\widetilde{\theta} = \theta^*$ states +\begin{equation} + \EE\left[\langle\theta^{k+1} - \widehat{\theta}^{k+1}, \theta^{k+1}+\widehat{\theta}^{k+1} - 2\theta^*\rangle\right] \le \delta_{pv,1}\gamma\mu\EE[\|\theta^k-\theta^*\|^2] + \gamma^2\delta_{pv,2}^2, \label{eq:key_lemma_cvx_tech_1} +\end{equation} +where $\widehat \theta^{k+1} = \frac{1}{N_{k}}\sum_{i\in P_{k}}(\theta_i^{k}-\gamma g_i^k)$. Next, the definition of $\widehat \theta^{k+1}$ implies +\begin{equation} + \widehat \theta^{k+1} = \frac{1}{N_k}\sum\limits_{i\in P_{k}}\theta_i^{k} - \frac{\gamma}{N_k}\sum\limits_{i\in P_{k}} g_i^k = \theta^k - \gamma g^k,\notag +\end{equation} +where $g^k = \frac{1}{N_k}\sum_{i\in P_k}g_i^k$. Using this, we derive +\begin{eqnarray} + \|\theta^{k+1} - \theta^*\|^2 &=& \|\widehat{\theta}^{k+1} - \theta^*\|^2 + 2\langle \theta^{k+1} - \widehat{\theta}^{k+1}, \widehat{\theta}^{k+1} - \theta^* \rangle + \|\theta^{k+1} - \widehat{\theta}^{k+1}\|^2\notag\\ + &=& \|\theta^k - \theta^* - \gamma g^k\|^2 + \langle\theta^{k+1} - \widehat{\theta}^{k+1}, \theta^{k+1}+\widehat{\theta}^{k+1} - 2\theta^*\rangle \notag\\ + &=& \|\theta^k - \theta^*\|^2 -2\gamma\langle\theta^k - \theta^*, g^k\rangle + \gamma^2\|g^k\|^2\notag\\ + &&\quad + \langle\theta^{k+1} - \widehat{\theta}^{k+1}, \theta^{k+1}+\widehat{\theta}^{k+1} - 2\theta^*\rangle. \notag +\end{eqnarray} +Taking the conditional expectation $\EE\left[\ \cdot \mid \theta^k\right] := \EE\left[\ \cdot \mid P_k, \theta_i^k, i\in P_k\right]$ from the both sides of the previous equation and using Assumption~\ref{as:bounded_var}, we obtain +\begin{eqnarray} + \EE\left[\|\theta^{k+1} - \theta^*\|^2\mid \theta^k\right] &=& \|\theta^k - \theta^*\|^2 -2\gamma\left\langle\theta^k - \theta^*, \frac{1}{N_k}\sum\limits_{i\in P_k}\nabla f(\theta_i^k)\right\rangle\notag\\ + &&\quad + \gamma^2\EE\left[\left\|\frac{1}{N_k}\sum\limits_{i\in P_k}g_i^k\right\|^2\mid \theta^k\right] \notag\\ + &&\quad + \EE\left[\langle\theta^{k+1} - \widehat{\theta}^{k+1}, \theta^{k+1}+\widehat{\theta}^{k+1} - 2\theta^*\rangle\mid \theta^k\right]. \label{eq:key_lemma_cvx_tech_2} +\end{eqnarray} +Next, we estimate the second and the third terms in the right-hand side of \eqref{eq:key_lemma_cvx_tech_2}. First, +\begin{eqnarray} + -2\gamma\left\langle\theta^k - \theta^*, \frac{1}{N_k}\sum\limits_{i\in P_k}\nabla f(\theta_i^k)\right\rangle &=& \frac{2\gamma}{N_k}\sum\limits_{i\in P_k}\left(\langle\theta^* - \theta_i^k, \nabla f(\theta_i^k) \rangle + \langle\theta_i^k - \theta^k, \nabla f(\theta_i^k) \rangle \right)\notag\\ + &\overset{\eqref{eq:str_cvx_def},\eqref{eq:L_smoothness_cor}}{\le}& \frac{2\gamma}{N_k}\sum\limits_{i\in P_k}\left( f(\theta^*) - f(\theta_i^k) - \frac{\mu}{2}\|\theta_i^k - \theta^*\|^2\right)\notag\\ + &&\quad + \frac{2\gamma}{N_k}\sum\limits_{i\in P_k}\left(f(\theta_i^k) - f(\theta^k) + \frac{L}{2}\|\theta_i^k - \theta^k\|^2\right)\notag\\ + &\overset{\eqref{eq:jensen_ineq}}{\le}& 2\gamma\left(f(\theta^*) - f(\theta^k)\right) -\gamma\mu\|\theta^k - \theta^*\|^2 + L\gamma V_k, \label{eq:key_lemma_cvx_tech_3} +\end{eqnarray} +where $V_k = \frac{1}{N_k}\sum_{i\in P_k}\|\theta_i^k - \theta^k\|^2$. 
Secondly, since stochastic gradients $\{g_i^k\}_{i\in P_k}$ are computed independently, we get +\begin{eqnarray} + \gamma^2\EE\left[\left\|\frac{1}{N_k}\sum\limits_{i\in P_k}g_i^k\right\|^2\mid \theta^k\right] &\overset{\eqref{eq:variance_decomposition}}{=}& \gamma^2\left\|\frac{1}{N_k}\sum\limits_{i\in P_k}\nabla f(\theta_i^k)\right\|^2\notag\\ + &&\quad + \gamma^2\EE\left[\left\|\frac{1}{N_k}\sum\limits_{i\in P_k}(g_i^k-\nabla f(\theta_i^k))\right\|^2\mid \theta^k\right]\notag\\ + &\overset{\eqref{eq:jensen_ineq}}{\le}& 2\gamma^2 \left\|\frac{1}{N_k}\sum\limits_{i\in P_k}(\nabla f(\theta_i^k)-\nabla f(\theta^k))\right\|^2 + 2\gamma^2\|\nabla f(\theta^k)\|^2 \notag\\ + &&\quad + \frac{\gamma^2}{N_k^2}\sum\limits_{i\in P_k}\EE\left[\|g_i^k - \nabla f(\theta_i^k)\|^2\mid \theta^k\right]\notag\\ + &\overset{\eqref{eq:jensen_ineq},\eqref{eq:L_smoothness_cor_2},\eqref{eq:bounded_variance}}{\le}& \frac{2\gamma^2}{N_k}\sum\limits_{i\in P_k}\|\nabla f(\theta_i^k)-\nabla f(\theta^k)\|^2 \notag\\ + &&\quad + 4L\gamma^2\left(f(\theta^k) - f(\theta^*)\right) + \frac{\gamma^2\sigma^2}{N_k}\notag\\ + &\overset{\eqref{eq:L_smoothness_def}}{\le}& \underbrace{\frac{2L^2\gamma^2}{N_k}\sum\limits_{i\in P_k}\|\theta_i^k - \theta^k\|^2}_{2L^2\gamma^2 V_k}\notag\\ + &&\quad + 4L\gamma^2\left(f(\theta^k) - f(\theta^*)\right) + \frac{\gamma^2\sigma^2}{N_{\min}}. \label{eq:key_lemma_cvx_tech_4} +\end{eqnarray} +Plugging \eqref{eq:key_lemma_cvx_tech_3} and \eqref{eq:key_lemma_cvx_tech_4} in \eqref{eq:key_lemma_cvx_tech_2}, we obtain +\begin{eqnarray} + \EE\left[\|\theta^{k+1} - \theta^*\|^2\mid \theta^k\right] &\le& (1-\gamma\mu)\|\theta^k - \theta^*\|^2 - 2\gamma\left(1 - 2L\gamma\right)\left(f(\theta^k) - f(\theta^*)\right)\notag\\ + &&\quad + L\gamma\left(1+2L\gamma\right)V_k + \frac{\gamma^2\sigma^2}{N_{\min}} \notag\\ + &&\quad + \EE\left[\langle\theta^{k+1} - \widehat{\theta}^{k+1}, \theta^{k+1}+\widehat{\theta}^{k+1} - 2\theta^*\rangle\mid \theta^k\right], \notag +\end{eqnarray} +and +\begin{eqnarray} + \EE\left[\|\theta^{k+1} - \theta^*\|^2\right] &\overset{\eqref{eq:key_lemma_cvx_tech_1}}{\le}& (1-\gamma\mu(1-\delta_{pv,1}))\EE\left[\|\theta^k - \theta^*\|^2\right] - 2\gamma\left(1 - 2L\gamma\right)\EE\left[f(\theta^k) - f(\theta^*)\right]\notag\\ + &&\quad+ L\gamma\left(1+2L\gamma\right)\EE[V_k] + \gamma^2\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2\right)\notag\\ + &\le& (1-\gamma\mu(1-\delta_{pv,1}))\EE\left[\|\theta^k - \theta^*\|^2\right] - \gamma\EE\left[f(\theta^k) - f(\theta^*)\right]\notag\\ + &&\quad+ \frac{3L\gamma}{2}\EE[V_k] + \gamma^2\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2\right),\notag +\end{eqnarray} +where in the last inequality we use $\gamma \le \nicefrac{1}{4L}$. +\end{proof} + +Next, we estimate the term $\EE[V_k]$ measuring the expected dissimilarity between local iterates and their global average at iteration $k$. + +\begin{lemma}\label{lem:V_k_lemma_cvx} + Let $f_1 = \ldots = f_N = f$, function $f$ be $\mu$-strongly convex (Def.~\ref{def:str_cvx}) and $L$-smooth (see Def.~\ref{def:L_smoothness}), and Assumptions~\ref{as:bounded_var}~and~\ref{as:averaging_quality} hold with $\Delta_{pv}^k = \delta_{pv,1}\gamma\mu\EE[\|\theta^k-\theta^*\|^2] + \gamma^2\delta_{pv,2}^2$ and $\widetilde{\theta} = \theta^*$, where $\theta^* \in \argmin_{\theta\in\R^n} f(\theta)$ and $\delta_{pv,1}\in [0,1)$, $\delta_{pv,2}\ge 0$. 
Then, for any $k \ge 0$ the iterates produced by Moshpit SGD with $\gamma \le \nicefrac{1}{4L}$ satisfy + \begin{equation} + \EE[V_k] \le 2\gamma^2\left(4\delta_{aq}^2 + (\tau-1)\sigma^2\right), \label{eq:V_k_bound_cvx} + \end{equation} + where $V_k = \frac{1}{N_k}\sum_{i\in P_k}\|\theta_i^k - \theta^k\|^2$ and $\theta^k = \frac{1}{N_k}\sum_{i\in P_k}\theta_i^k$. +\end{lemma} +\begin{proof} + First of all, if $k = a\tau$ for some integer $a\ge 0$, then \eqref{eq:V_k_bound_cvx} follows from Assumption~\ref{as:averaging_quality} (eq.~\eqref{eq:quality_of_avg}). Therefore, we consider such $k$ that $k = a\tau + t'$ for some $t'\in (0,\tau)$. Then, for any $i,j \in P_{k}$, $i\neq j$ + \begin{eqnarray*} + \EE\left[\|\theta_i^k - \theta_j^k\|^2\mid \theta^{k-1}\right] &=& \EE\left[\|\theta_i^{k-1} - \gamma g_i^{k-1} - \theta_j^{k-1} + \gamma g_{j}^{k-1}\|^2\mid \theta^{k-1}\right]\\ + &\overset{\eqref{eq:variance_decomposition}}{=}& \|\theta_i^{k-1} - \gamma \nabla f(\theta_i^{k-1}) - \theta_j^{k-1} + \gamma \nabla f(\theta_j^{k-1})\|^2\\ + &&\quad +\gamma^2\EE\left[\|g_i^{k-1} - \nabla f(\theta_i^{k-1}) + g_{j}^{k-1} - \nabla f(\theta_j^{k-1})\|^2\mid \theta^{k-1}\right]. + \end{eqnarray*} + Using Lemma~\ref{lem:gd_contraction} and independence of $g_i^{k-1}$ and $g_j^{k-1}$ for given $\theta_i^{k-1}, \theta_j^{k-1}$, $i\neq j$ we derive + \begin{eqnarray*} + \EE\left[\|\theta_i^k - \theta_j^k\|^2\mid \theta^{k-1}\right] &\overset{\eqref{eq:gd_contraction}}{\le}& (1-\gamma\mu)\|\theta_i^{k-1} - \theta_j^{k-1}\|^2 +\gamma^2\EE\left[\|g_i^{k-1} - \nabla f(\theta_i^{k-1})\|^2\mid \theta^{k-1}\right]\\ + &&\quad +\gamma^2\EE\left[\|g_j^{k-1} - \nabla f(\theta_j^{k-1})\|^2\mid \theta^{k-1}\right]\\ + &\overset{\eqref{eq:bounded_variance}}{\le}& (1-\gamma\mu)\|\theta_i^{k-1} - \theta_j^{k-1}\|^2 + 2\gamma^2\sigma^2, + \end{eqnarray*} + from which we get the following: + \begin{equation} + \EE_g\left[\|\theta_i^k - \theta_j^k\|^2\right] \le (1-\gamma\mu)\EE_g\left[\|\theta_i^{k-1} - \theta_j^{k-1}\|^2\right] + 2\gamma^2\sigma^2 \le \EE_g\left[\|\theta_i^{k-1} - \theta_j^{k-1}\|^2\right] + 2\gamma^2\sigma^2.\notag % + \end{equation} + Here, $\EE_g[\cdot]$ denotes the expectation conditioned on $\{P_k\}_{k = a\tau}^{(a+1)\tau-1}$. 
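Aside (illustrative, not part of the proof): the contraction property \eqref{eq:gd_contraction} used in the step above can be checked numerically on a simple quadratic objective. The sketch below draws a random symmetric matrix with spectrum inside $[\mu, L]$ (so the quadratic is $L$-smooth and $\mu$-strongly convex) and verifies the inequality on random pairs of points; all constants are arbitrary.

#+begin_src python
# Numerical illustration of the contraction inequality (eq:gd_contraction):
# for an L-smooth, mu-strongly convex quadratic f(x) = 0.5 * x^T A x and gamma <= 1/L,
# ||x - gamma*grad f(x) - y + gamma*grad f(y)||^2 <= (1 - gamma*mu) * ||x - y||^2.
import numpy as np

rng = np.random.default_rng(1)
n, mu, L = 8, 0.5, 10.0

# Random symmetric matrix with eigenvalues inside [mu, L]
Q, _ = np.linalg.qr(rng.normal(size=(n, n)))
A = Q @ np.diag(rng.uniform(mu, L, size=n)) @ Q.T
grad = lambda x: A @ x

gamma = 1.0 / L
for _ in range(1000):
    x, y = rng.normal(size=n), rng.normal(size=n)
    lhs = np.linalg.norm(x - gamma * grad(x) - y + gamma * grad(y)) ** 2
    rhs = (1 - gamma * mu) * np.linalg.norm(x - y) ** 2
    assert lhs <= rhs + 1e-9
print("contraction inequality holds on all sampled pairs")
#+end_src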
Unrolling the recurrence, we get + \begin{eqnarray} + \EE_g\left[\|\theta_i^k - \theta_j^k\|^2\right] &\le& \EE_g\left[\|\theta_i^{a\tau} - \theta_j^{a\tau}\|^2\right] + 2(k-a\tau)\gamma^2\sigma^2\notag \\ + &\le& \EE_g\left[\|\theta_i^{a\tau} - \theta_j^{a\tau}\|^2\right] + 2(\tau-1)\gamma^2\sigma^2.\label{eq:V_k_lemma_technical_1} + \end{eqnarray} + Using this, we estimate $\EE_{g}[V_k]$: + \begin{eqnarray*} + \EE_g[V_k] &=& \frac{1}{N_k}\sum\limits_{i\in P_k}\EE_g\left[\left\|\theta_i^k - \frac{1}{N_k}\sum\limits_{j\in P_k}\theta_j^k\right\|^2\right] \overset{\eqref{eq:jensen_ineq}}{\le} \frac{1}{N_k^2}\sum\limits_{i,j \in P_k}\EE_g\left[\|\theta_i^k - \theta_j^k\|^2\right]\\ + &\overset{\eqref{eq:V_k_lemma_technical_1}}{\le}& \frac{1}{N_k^2}\sum\limits_{i,j \in P_k}\EE_g\left[\|\theta_i^{a\tau} - \theta_j^{a\tau}\|^2\right] + 2(\tau-1)\gamma^2\sigma^2 \\ + &\overset{\eqref{eq:a+b}}{\le}& \frac{2}{N_k^2}\sum\limits_{i,j \in P_k}\left(\EE_g\left[\|\theta_i^{a\tau} - \theta^{a\tau}\|^2\right] + \EE_g\left[\|\theta_j^{a\tau} - \theta^{a\tau}\|^2\right]\right) + 2(\tau-1)\gamma^2\sigma^2\\ + &=& \frac{4}{N_k}\sum\limits_{i\in P_k}\EE_g\left[\|\theta_i^{a\tau} - \theta^{a\tau}\|^2\right]+ 2(\tau-1)\gamma^2\sigma^2\\ + &\le& \frac{4}{N_{a\tau}}\cdot\frac{N_{a\tau}}{N_k}\sum\limits_{i\in P_{a\tau}}\EE_g\left[\|\theta_i^{a\tau} - \theta^{a\tau}\|^2\right]+ 2(\tau-1)\gamma^2\sigma^2\\ + &\le& \EE_g\left[\frac{8}{N_{a\tau}}\sum\limits_{i\in P_{a\tau}}\|\theta_i^{a\tau} - \theta^{a\tau}\|^2\right]+ 2(\tau-1)\gamma^2\sigma^2, + \end{eqnarray*} + where in the last inequality we use $2N_{(a+1)\tau} = 2|P_{(a+1)\tau}| \ge |P_{a\tau}| = N_{a\tau}$ and $|N_k|\le |N_{k-1}|$ following from Assumption~\ref{as:averaging_quality}. Finally, we take the full expectation from the previous inequality: + \begin{eqnarray*} + \EE[V_k] &\overset{\eqref{eq:tower_property}}{\le}& 8\EE\left[\frac{1}{N_{a\tau}}\sum\limits_{i\in P_{a\tau}}\|\theta_i^{a\tau} - \theta^{a\tau}\|^2\right]+ 2(\tau-1)\gamma^2\sigma^2 \overset{\eqref{eq:quality_of_avg}}{\le} 2\gamma^2\left(4\delta_{aq}^2 + (\tau-1)\sigma^2\right). + \end{eqnarray*} + This finishes the proof. +\end{proof} + +Combining Lemmas~\ref{lem:key_lemma_cvx}~and~\ref{lem:V_k_lemma_cvx}, we get the following result: +\begin{theorem}[Theorem~\ref{thm:cvx_convergence}, convergence in the convex case]\label{thm:cvx_convergence_supp} + Let $f_1 = \ldots = f_N = f$ be $\mu$-strongly convex (Def.~\ref{def:str_cvx}) and $L$-smooth (see Def.~\ref{def:L_smoothness}), and Assumptions~\ref{as:bounded_var}~and~\ref{as:averaging_quality} hold with $\Delta_{pv}^k = \delta_{pv,1}\gamma\mu\EE[\|\theta^k-\theta^*\|^2] + \gamma^2\delta_{pv,2}^2$ and $\widetilde{\theta} = \theta^*$, where $\theta^* \in \argmin_{\theta\in\R^n} f(\theta)$ and $\delta_{pv,1}\in [0,1)$, $\delta_{pv,2}\ge 0$. 
Then, for any $K \ge 0$, the iterates produced by Moshpit SGD with $\gamma \le \nicefrac{1}{4L}$ satisfy + \begin{eqnarray} + \EE\left[f(\overline{\theta}^K) - f(\theta^*)\right] &\le& (1-\gamma\mu(1-\delta_{pv,1}))^K\frac{R_0^2}{\gamma}\notag\\ + &&\quad + \gamma\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2 + 3L\gamma\left(4\delta_{aq}^2 + (\tau-1)\sigma^2\right)\right), \label{eq:str_cvx_bound_supp} + \end{eqnarray} + when $\mu > 0$, and + \begin{equation} + \EE\left[f(\overline{\theta}^K) - f(\theta^*)\right] \le \frac{R_0^2}{\gamma K} + \gamma\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2 + 3L\gamma\left(4\delta_{aq}^2 + (\tau-1)\sigma^2\right)\right), \label{eq:cvx_bound_supp} + \end{equation} + when $\mu = 0$, where $R_0 = \|\theta^0 - \theta^*\|$, $\overline{\theta}^K = \frac{1}{W_K}\sum_{k=0}^Kw_k\theta^k = \frac{1}{W_K}\sum_{k=0}^K\frac{w_k}{N_k}\sum_{i\in P_k}\theta_i^k$, $w_k = (1-\gamma\mu(1-\delta_{pv,1}))^{-(k+1)}$, and $W_K = \sum_{k=0}^Kw_k$. That is, Moshpit SGD achieves $\EE[f(\overline{\theta}^K) - f(\theta^*)] \le \varepsilon$ after + \begin{equation} + K = \widetilde{\cO}\left(\frac{L}{(1-\delta_{pv,1})\mu} + \frac{\sigma^2}{N_{\min}(1-\delta_{pv,1})\mu\varepsilon} + \frac{\delta_{pv,2}^2}{(1-\delta_{pv,1})\mu\varepsilon} + \sqrt{\frac{L((\tau-1)\sigma^2+\delta_{aq}^2)}{(1-\delta_{pv,1})^2\mu^2\varepsilon}}\right)\label{eq:str_cvx_bound_2_supp} + \end{equation} + iterations with + \begin{equation*} + \gamma = \min\left\{\frac{1}{4L}, \frac{\ln\left(\max\left\{2, \min\left\{\frac{R_0^2\mu^2(1-\delta_{pv,1})^2K^2}{(\delta_{pv,2}^2 + \nicefrac{\sigma^2}{N_{\min}}) },\frac{R_0^2\mu^3(1-\delta_{pv,1})^3K^3}{3L\left(4\delta_{aq}^2 + (\tau-1)\sigma^2\right)}\right\}\right\}\right)}{(1-\delta_{pv,1})\mu K}\right\} + \end{equation*} + when $\mu > 0$, and after + \begin{equation} + K = \cO\left(\frac{LR_0^2}{\varepsilon} + \frac{R_0^2\sigma^2}{N_{\min}\varepsilon^2} + \frac{R_0^2\delta_{pv,2}^2}{\varepsilon^2} + \frac{R_0^2\sqrt{L((\tau-1)\sigma^2+\delta_{aq}^2)}}{\varepsilon^{\nicefrac{3}{2}}}\right)\label{eq:cvx_bound_2_supp} + \end{equation} + iterations with + \begin{equation*} + \gamma = \min\left\{\frac{1}{4L} \sqrt{\frac{R_0}{(\delta_{pv,2}^2 + \nicefrac{\sigma^2}{N_{\min}})K}}, \sqrt[3]{\frac{R_0^2}{3L\left(4\delta_{aq}^2 + (\tau-1)\sigma^2\right) K}}\right\} + \end{equation*} + when $\mu = 0$. 
+\end{theorem}
+\begin{proof}
+    Plugging the result of Lemma~\ref{lem:V_k_lemma_cvx} in inequality \eqref{eq:key_lemma_cvx} from Lemma~\ref{lem:key_lemma_cvx}, we obtain
+    \begin{eqnarray}
+        \gamma\EE\left[f(\theta^k) - f(\theta^*)\right] &\le& (1-\gamma\mu(1-\delta_{pv,1}))\EE\left[\|\theta^k - \theta^*\|^2\right] - \EE\left[\|\theta^{k+1} - \theta^*\|^2\right]\notag\\
+        &&\quad+ 3L\gamma^3\left(4\delta_{aq}^2 + (\tau-1)\sigma^2\right) + \gamma^2\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2\right).\notag
+    \end{eqnarray}
+    Next, we sum up these inequalities for $k=0,\ldots, K$ with weights $w_k = (1-\gamma\mu(1-\delta_{pv,1}))^{-(k+1)}$ and divide both sides by $\gamma W_K$, where $W_K = \sum_{k=0}^Kw_k$:
+    \begin{eqnarray*}
+        \frac{1}{W_K}\sum\limits_{k=0}^K w_k\EE\left[f(\theta^k) - f(\theta^*)\right] &\le& \frac{1}{\gamma W_K}\sum\limits_{k=0}^K(1-\gamma\mu(1-\delta_{pv,1}))w_k\EE\left[\|\theta^k - \theta^*\|^2\right]\notag\\
+        &&\quad - \frac{1}{\gamma W_K}\sum\limits_{k=0}^K w_k\EE\left[\|\theta^{k+1} - \theta^*\|^2\right]\notag\\
+        &&\quad+ \gamma\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2 + 3L\gamma\left(4\delta_{aq}^2 + (\tau-1)\sigma^2\right)\right)\\
+        &=& \frac{1}{\gamma W_K}\sum\limits_{k=0}^K\left(w_{k-1}\EE\left[\|\theta^k - \theta^*\|^2\right] - w_k\EE\left[\|\theta^{k+1} - \theta^*\|^2\right]\right)\notag\\
+        &&\quad+ \gamma\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2 + 3L\gamma\left(4\delta_{aq}^2 + (\tau-1)\sigma^2\right)\right)\\
+        &=& \frac{w_{-1}\|\theta^0 - \theta^*\|^2 - w_K\EE\left[\|\theta^{K+1}-\theta^*\|^2\right]}{\gamma W_K}\\
+        &&\quad+ \gamma\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2 + 3L\gamma\left(4\delta_{aq}^2 + (\tau-1)\sigma^2\right)\right)\\
+        &\le& \frac{\|\theta^0 - \theta^*\|^2}{\gamma W_K} \\
+        &&\quad + \gamma\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2 + 3L\gamma\left(4\delta_{aq}^2 + (\tau-1)\sigma^2\right)\right).
+    \end{eqnarray*}
+    Since $f$ is convex, we apply Jensen's inequality
+    \begin{eqnarray*}
+        f\left(\frac{1}{W_K}\sum\limits_{k=0}^K w_k\theta^k\right) &\le& \frac{1}{W_K}\sum\limits_{k=0}^K w_k f(\theta^k)
+    \end{eqnarray*}
+    to the previous result and get
+    \begin{eqnarray*}
+        \EE\left[f(\overline{\theta}^K) - f(\theta^*)\right] &\le& \frac{R_0^2}{\gamma W_K} + \gamma\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2 + 3L\gamma\left(4\delta_{aq}^2 + (\tau-1)\sigma^2\right)\right),
+    \end{eqnarray*}
+    where $R_0 = \|\theta^0 - \theta^*\|$ and $\overline{\theta}^K = \frac{1}{W_K}\sum_{k=0}^Kw_k\theta^k = \frac{1}{W_K}\sum_{k=0}^K\frac{w_k}{N_k}\sum_{i\in P_k}\theta_i^k$. If $\mu > 0$, then $W_K \ge w_K \ge (1-\gamma\mu(1-\delta_{pv,1}))^{-K}$, implying \eqref{eq:str_cvx_bound_supp}. Next, $w_k = 1$ and $W_K = K$ when $\mu = 0$ gives \eqref{eq:cvx_bound_supp}. It remains to estimate the total number of iterations $K$ required by Moshpit SGD to find an $\varepsilon$-solution, i.e., to achieve $\EE[f(\overline{\theta}^K) - f(\theta^*)] \le \varepsilon$.
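Aside (illustrative, not part of the proof): before applying Lemma~\ref{lem:lemma_i_2_gorbunov}, note that the step size prescribed by the theorem for $\mu > 0$ is straightforward to evaluate numerically. The sketch below does so for hypothetical constants; every value is a placeholder, not taken from the paper.

#+begin_src python
# Illustrative evaluation of the step size prescribed by the theorem for mu > 0.
# All problem constants below are hypothetical placeholders.
from math import log

L, mu = 10.0, 0.1            # smoothness and strong convexity constants (assumed)
R0, sigma2 = 1.0, 4.0        # initial distance and gradient-noise variance (assumed)
N_min, tau = 64, 10          # smallest number of participants, averaging period (assumed)
delta_pv1, delta_pv2, delta_aq = 0.1, 0.5, 0.5   # averaging-quality parameters (assumed)
K = 10_000                   # number of iterations

c1 = delta_pv2**2 + sigma2 / N_min
c2 = 3 * L * (4 * delta_aq**2 + (tau - 1) * sigma2)
inner = min(
    R0**2 * mu**2 * (1 - delta_pv1) ** 2 * K**2 / c1,
    R0**2 * mu**3 * (1 - delta_pv1) ** 3 * K**3 / c2,
)
gamma = min(1 / (4 * L), log(max(2.0, inner)) / ((1 - delta_pv1) * mu * K))
print(f"prescribed step size gamma = {gamma:.3e}")
#+end_src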
Applying Lemma~\ref{lem:lemma_i_2_gorbunov} to \eqref{eq:str_cvx_bound_supp}, we get the following result: if $\mu > 0$ and + \begin{equation*} + \gamma = \min\left\{\frac{1}{4L}, \frac{\ln\left(\max\left\{2, \min\left\{\frac{R_0^2\mu^2(1-\delta_{pv,1})^2K^2}{\delta_{pv,2}^2 + \nicefrac{\sigma^2}{N_{\min}} },\frac{R_0^2\mu^3(1-\delta_{pv,1})^3K^3}{3L\left(4\delta_{aq}^2 + (\tau-1)\sigma^2\right)}\right\}\right\}\right)}{(1-\delta_{pv,1})\mu K}\right\}, + \end{equation*} + then $\EE\left[f(\overline{\theta}^K) - f(\theta^*)\right]$ equals + \begin{equation*} + \widetilde{\cO}\left(LR_0^2\exp\left(-\frac{\mu}{L}(1-\delta_{pv,1})K\right) + \frac{\delta_{pv,2}^2 + \nicefrac{\sigma^2}{N_{\min}}}{(1-\delta_{pv,1})\mu K} + \frac{L\left(\delta_{aq}^2 + (\tau-1)\sigma^2\right)}{(1-\delta_{pv,1})^2\mu^2 K^2}\right), + \end{equation*} + implying \eqref{eq:str_cvx_bound_2_supp}. Similarly, we apply Lemma~\ref{lem:lemma_i_3_gorbunov} to \eqref{eq:cvx_bound_supp} and get that for $\mu = 0$ and + \begin{equation*} + \gamma = \min\left\{\frac{1}{4L} \sqrt{\frac{R_0}{(\delta_{pv,2}^2 + \nicefrac{\sigma^2}{N_{\min}})K}}, \sqrt[3]{\frac{R_0^2}{3L\left(4\delta_{aq}^2 + (\tau-1)\sigma^2\right) K}}\right\}, + \end{equation*} + \begin{equation*} + \EE\left[f(\overline{\theta}^K) - f(\theta^*)\right] = \cO\left(\frac{LR_0^2}{K} + \sqrt{\frac{R_0^2(\delta_{pv,2}^2 + \nicefrac{\sigma^2}{N_{\min}})}{K}} + \frac{\sqrt[3]{R_0^4L\left(\delta_{aq}^2 + (\tau-1)\sigma^2\right)}}{K^{\nicefrac{2}{3}}}\right), + \end{equation*} + implying \eqref{eq:cvx_bound_2_supp}. +\end{proof} + + + + + + + + +\subsection{Non-Convex Case} +In this section, we give the full proof of Theorem~\ref{thm:non_cvx_convergence} about convergence of Moshpit SGD for general non-convex problems. The proof follows the similar steps as in the state-of-the-art analysis of Local-SGD in non-convex case~\cite{li2019communication,koloskova2020unified}. We start with the following lemma: +\begin{lemma}\label{lem:key_lemma_non_cvx} + Let $f_1 = \ldots = f_N = f$, function $f$ be $L$-smooth and bounded from below by $f_*$, and Assumptions~\ref{as:bounded_var}~and~\ref{as:averaging_quality} hold with $\Delta_{pv}^k = \delta_{pv,1}\gamma\EE[\|\nabla f(\theta^k)\|^2] + L\gamma^2\delta_{pv,2}^2$, $\delta_{pv,1}\in [0,\nicefrac{1}{2})$, $\delta_{pv,2}\ge 0$. Then, for any $K \ge 0$ the iterates produced by Moshpit SGD with $\gamma \le \nicefrac{(1-2\delta_{pv,1})}{8L}$ satisfy + \begin{eqnarray} + \frac{(1-2\delta_{pv,1})\gamma}{4}\sum\limits_{k=0}^{K-1}\EE\left[\|\nabla f(\theta^k)\|^2\right] &\le& f(\theta^0) - f_* + \gamma L^2\sum\limits_{k=0}^{K-1} \EE[V_k]\notag\\ + &&\quad + KL\gamma^2\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2\right),\label{eq:key_lemma_non_cvx} + \end{eqnarray} + where $V_k = \frac{1}{N_k}\sum_{i\in P_k}\|\theta_i^k - \theta^k\|^2$ and $\theta^k = \frac{1}{N_k}\sum_{i\in P_k}\theta_i^k$. +\end{lemma} +\begin{proof} + Recall that Assumption~\ref{as:averaging_quality} with $\Delta_{pv}^k = \delta_{pv,1}\gamma\EE[\|\nabla f(\theta^k)\|^2] + L\gamma^2\delta_{pv,2}^2$ states +\begin{equation} + \EE\left[\langle\nabla f(\theta^k), \theta^{k+1}-\widehat{\theta}^{k+1}\rangle + L\|\widehat{\theta}^{k+1} - \theta^{k+1}\|^2\right] \le \delta_{pv,1}\gamma\EE[\|\nabla f(\theta^k)\|^2] + L\gamma^2\delta_{pv,2}^2, \label{eq:key_lemma_non_cvx_tech_1} +\end{equation} +where $\widehat \theta^{k+1} = \frac{1}{N_{k}}\sum_{i\in P_{k}}(\theta_i^{k}-\gamma g_i^k)$. 
As for the convex case, the definition of $\widehat \theta^{k+1}$ implies +\begin{equation} + \widehat \theta^{k+1} = \frac{1}{N_k}\sum\limits_{i\in P_{k}}\theta_i^{k} - \frac{\gamma}{N_k}\sum\limits_{i\in P_{k}} g_i^k = \theta^k - \gamma g^k,\notag +\end{equation} +where $g^k = \frac{1}{N_k}\sum_{i\in P_k}g_i^k$. Using this and L-smoothness of $f$, we derive + \begin{eqnarray*} + f(\theta^{k+1}) - f(\theta^k) &\overset{\eqref{eq:L_smoothness_cor}}{\le}& \langle\nabla f(\theta^k), \theta^{k+1} - \theta^k \rangle + \frac{L}{2}\|\theta^{k+1} - \theta^k\|^2\\ + &\overset{\eqref{eq:a+b}}{\le}& \langle\nabla f(\theta^k), \widehat{\theta}^{k+1} - \theta^k \rangle + \langle\nabla f(\theta^k), \theta^{k+1} - \widehat{\theta}^{k+1} \rangle\\ + &&\quad+ L\|\widehat{\theta}^{k+1} - \theta^k\|^2 + L\|\theta^{k+1} - \widehat{\theta}^{k+1}\|^2\\ + &=& - \gamma\langle\nabla f(\theta^k), g^k\rangle + L\gamma^2\|g^k\|^2 + \langle\nabla f(\theta^k), \theta^{k+1} - \widehat{\theta}^{k+1} \rangle\\ + &&\quad + L\|\theta^{k+1} - \widehat{\theta}^{k+1}\|^2, + \end{eqnarray*} + from which it follows that + \begin{eqnarray} + \EE\left[f(\theta^{k+1}) - f(\theta^k)\mid \theta^k\right] &\le& -\gamma\left\langle\nabla f(\theta^k), \frac{1}{N_k}\sum\limits_{i\in P_k}\nabla f(\theta_i^k) \right\rangle\notag\\ + &&\quad + \EE\left[\langle\nabla f(\theta^k), \theta^{k+1} - \widehat{\theta}^{k+1} \rangle\mid \theta^k\right]\notag\\ + &&\quad + \EE\left[L\|\theta^{k+1} - \widehat{\theta}^{k+1}\|^2\mid \theta^k\right]\notag\\ + &&\quad + L\gamma^2\EE\left[\left\|\frac{1}{N_k}\sum\limits_{i\in P_k}g_i^k\right\|^2\mid \theta^k\right],\label{eq:key_lemma_non_cvx_tech_2} + \end{eqnarray} + where $\EE\left[\ \cdot \mid \theta^k\right] := \EE\left[\ \cdot \mid P_k, \theta_i^k, i\in P_k\right]$. Next, we estimate the last three terms in the right-hand side of \eqref{eq:key_lemma_non_cvx_tech_2}. First of all, +\begin{eqnarray} + -\gamma\left\langle\nabla f(\theta^k), \frac{1}{N_k}\sum\limits_{i\in P_k}\nabla f(\theta_i^k)\right\rangle &=& -\gamma\|\nabla f(\theta^k)\|^2 \notag\\ + &&\quad - \gamma\left\langle\nabla f(\theta^k), \frac{1}{N_k}\sum\limits_{i\in P_k}\nabla f(\theta_i^k) - \nabla f(\theta^k)\right\rangle \notag\\ + &\overset{\eqref{eq:young_inequality}}{\le}& -\gamma\|\nabla f(\theta^k)\|^2 + \frac{\gamma}{2}\|\nabla f(\theta^k)\|^2\notag\\ + &&\quad+ \frac{\gamma}{2}\left\|\frac{1}{N_k}\sum\limits_{i\in P_k}(\nabla f(\theta_i^k) - \nabla f(\theta^k))\right\|^2\notag\\ + &\overset{\eqref{eq:jensen_ineq}}{\le}& - \frac{\gamma}{2}\|\nabla f(\theta^k)\|^2 + \frac{\gamma}{2N_k}\sum\limits_{i\in P_k}\|\nabla f(\theta_i^k) - \nabla f(\theta^k)\|^2\notag\\ + &\overset{\eqref{eq:L_smoothness_def}}{\le}& - \frac{\gamma}{2}\|\nabla f(\theta^k)\|^2 + \frac{\gamma L^2}{2}V_k, \label{eq:key_lemma_non_cvx_tech_3} +\end{eqnarray} +where $V_k = \frac{1}{N_k}\sum_{i\in P_k}\|\theta_i^k - \theta^k\|^2$. 
Secondly, since the stochastic gradients $\{g_i^k\}_{i\in P_k}$ are computed independently, we derive +\begin{eqnarray} + L\gamma^2\EE\left[\left\|\frac{1}{N_k}\sum\limits_{i\in P_k}g_i^k\right\|^2\mid \theta^k\right] &\overset{\eqref{eq:variance_decomposition}}{=}& L\gamma^2\left\|\frac{1}{N_k}\sum\limits_{i\in P_k}\nabla f(\theta_i^k)\right\|^2\notag\\ + &&\quad + L\gamma^2\EE\left[\left\|\frac{1}{N_k}\sum\limits_{i\in P_k}(g_i^k-\nabla f(\theta_i^k))\right\|^2\mid \theta^k\right]\notag\\ + &\overset{\eqref{eq:jensen_ineq}}{\le}& 2L\gamma^2 \left\|\frac{1}{N_k}\sum\limits_{i\in P_k}(\nabla f(\theta_i^k)-\nabla f(\theta^k))\right\|^2 \notag\\ + &&\quad + 2L\gamma^2\|\nabla f(\theta^k)\|^2 \notag\\ + &&\quad + \frac{\gamma^2L}{N_k^2}\sum\limits_{i\in P_k}\EE\left[\|g_i^k - \nabla f(\theta_i^k)\|^2\mid \theta^k\right]\notag\\ + &\overset{\eqref{eq:jensen_ineq},\eqref{eq:bounded_variance}}{\le}& \frac{2\gamma^2L}{N_k}\sum\limits_{i\in P_k}\|\nabla f(\theta_i^k)-\nabla f(\theta^k)\|^2\notag\\ + &&\quad + 2L\gamma^2\|\nabla f(\theta^k)\|^2 + \frac{\gamma^2L\sigma^2}{N_k}\notag\\ + &\overset{\eqref{eq:L_smoothness_def}}{\le}& \underbrace{\frac{2L^3\gamma^2}{N_k}\sum\limits_{i\in P_k}\|\theta_i^k - \theta^k\|^2}_{2L^3\gamma^2 V_k} + 2L\gamma^2\|\nabla f(\theta^k)\|^2\notag\\ + &&\quad + \frac{\gamma^2L\sigma^2}{N_{\min}}. \label{eq:key_lemma_non_cvx_tech_4} +\end{eqnarray} +Plugging \eqref{eq:key_lemma_non_cvx_tech_3} and \eqref{eq:key_lemma_non_cvx_tech_4} in \eqref{eq:key_lemma_non_cvx_tech_2}, we obtain +\begin{eqnarray} + \EE\left[f(\theta^{k+1}) - f(\theta^k)\mid \theta^k\right] &\le& -\frac{\gamma}{2}\left(1 - 4L\gamma\right)\|\nabla f(\theta^k)\|^2 + \frac{\gamma L^2}{2}\left(1 + 4L\gamma\right)V_k + \frac{L\gamma^2\sigma^2}{N_{\min}}\notag\\ + &&\quad + \EE\left[\langle\nabla f(\theta^k), \theta^{k+1} - \widehat{\theta}^{k+1} \rangle + L\|\theta^{k+1} - \widehat{\theta}^{k+1}\|^2\mid \theta^k\right].\notag +\end{eqnarray} +Next, we take the full expectation from the both sides of the above inequality, apply the tower property \eqref{eq:tower_property} and take into account that $\gamma \le \nicefrac{(1-2\delta_{pv,1})}{8L}$: +\begin{eqnarray*} + \EE\left[f(\theta^{k+1}) - f(\theta^k)\right] &\le& -\frac{\gamma}{2}\left(1 - 4L\gamma\right)\EE\left[\|\nabla f(\theta^k)\|^2\right] + \frac{\gamma L^2}{2}\left(1 + 4L\gamma\right)\EE[V_k] + \frac{L\gamma^2\sigma^2}{N_{\min}}\\ + &&\quad + \EE\left[\langle\nabla f(\theta^k), \theta^{k+1} - \widehat{\theta}^{k+1} \rangle + L\|\theta^{k+1} - \widehat{\theta}^{k+1}\|^2\right]\\ + &\overset{\eqref{eq:key_lemma_non_cvx_tech_1}}{\le}& -\frac{\gamma}{2}\left(1 - 2\delta_{pv,1} - 4L\gamma\right)\EE\left[\|\nabla f(\theta^k)\|^2\right] + \frac{\gamma L^2}{2}\left(1 + 4L\gamma\right)\EE[V_k] \notag\\ + &&\quad + L\gamma^2\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2\right)\\ + &\le& -\frac{(1-2\delta_{pv,1})\gamma}{4}\EE\left[\|\nabla f(\theta^k)\|^2\right] + \gamma L^2 \EE[V_k]\notag\\ + &&\quad + L\gamma^2\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2\right). 
+\end{eqnarray*} +Summing up the obtained inequalities for $k = 0,\ldots, K-1$ and rearranging the terms, we derive +\begin{eqnarray*} + \frac{(1-2\delta_{pv,1})\gamma}{4}\sum\limits_{k=0}^{K-1}\EE\left[\|\nabla f(\theta^k)\|^2\right] &\le& \sum\limits_{k=0}^{K-1} \EE\left[f(\theta^k) - f(\theta^{k+1})\right] + \gamma L^2\sum\limits_{k=0}^{K-1} \EE[V_k]\notag\\ + &&\quad + KL\gamma^2\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2\right)\\ + &=& f(\theta^0) - \EE[f(\theta^{K})] + \gamma L^2\sum\limits_{k=0}^{K-1} \EE[V_k] \\ + &&\quad + KL\gamma^2\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2\right)\\ + &\le& f(\theta^0) - f_* + \gamma L^2\sum\limits_{k=0}^{K-1} \EE[V_k]\\ + &&\quad + KL\gamma^2\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2\right), +\end{eqnarray*} +where $f_*$ is a uniform lower bound for $f$. +\end{proof} +The next step towards completing the proof of Theorem~\ref{thm:non_cvx_convergence} gives the upper bound for $\sum_{k=0}^{K-1} \EE[V_k]$ that appeared in \eqref{eq:key_lemma_non_cvx}. + +\begin{lemma}\label{lem:V_k_lemma_non_cvx} + Let $f_1 = \ldots = f_N = f$ be $L$-smooth and bounded from below by $f_*$, and Assumptions~\ref{as:bounded_var}~and~\ref{as:averaging_quality} hold with $\Delta_{pv}^k = \delta_{pv,1}\gamma\EE[\|\nabla f(\theta^k)\|^2] + L\gamma^2\delta_{pv,2}^2$, $\delta_{pv,1}\in [0,\nicefrac{1}{2})$, $\delta_{pv,2}\ge 0$. Then, for any $K \ge 0$ the iterates produced by Moshpit SGD with $\gamma \le \nicefrac{1}{\left(4\sqrt{e}L(\tau-1)\right)}$ satisfy + \begin{eqnarray} + \sum\limits_{k=0}^{K-1}\EE[V_k] &\le& 8e\gamma^2(\tau-1)^2\sum\limits_{k=0}^{K-1}\EE[\|\nabla f(\theta^k)\|^2] + 4\gamma^2K\left(2\delta_{aq}^2 + e(\tau-1)\sigma^2\right) ,\label{eq:V_k_lemma_non_cvx} + \end{eqnarray} + where $V_k = \frac{1}{N_k}\sum_{i\in P_k}\|\theta_i^k - \theta^k\|^2$ and $\theta^k = \frac{1}{N_k}\sum_{i\in P_k}\theta_i^k$. +\end{lemma} +\begin{proof} + First of all, consider $k$ such that $k = a\tau + t'$ for some $t'\in [0,\tau)$. Let $\EE_g[\cdot]$ denote the expectation conditioned on $\{P_t\}_{t=a\tau}^{(a+1)\tau-1}$. Then + \begin{eqnarray} + \EE_g[V_k] &=& \frac{1}{N_k}\sum\limits_{i\in P_k}\EE_g\left[\|\theta_i^k - \theta^k\|^2\right] \overset{\eqref{eq:variance_decomposition}}{\le} \frac{1}{N_k}\sum\limits_{i\in P_k}\EE_g\left[\|\theta_i^k - \theta^{a\tau}\|^2\right] \notag\\ + &=& \frac{1}{N_k}\sum\limits_{i\in P_k}\EE_g\left[\left\|\theta_i^{a\tau} - \theta^{a\tau} - \gamma\sum\limits_{t=a\tau}^{k-1} g_i^t\right\|^2\right]\notag\\ + &\overset{\eqref{eq:a+b}}{\le}& \frac{2}{N_k} \sum\limits_{i\in P_k}\EE_g\left[\|\theta_i^{a\tau} - \theta^{a\tau}\|^2\right] + \frac{2\gamma^2}{N_k}\sum\limits_{i\in P_k}\EE_g\left[\left\|\sum\limits_{t=a\tau}^{k-1} g_i^t\right\|^2\right]. 
\label{eq:V_k_lemma_non_cvx_tech_1} + \end{eqnarray} + Next, we estimate the second term in the right-hand side of \eqref{eq:V_k_lemma_non_cvx_tech_1} using Lemma~\ref{lem:lemma_i_1_gorbunov}: + \begin{eqnarray} + \frac{2\gamma^2}{N_k}\sum\limits_{i\in P_k}\EE_g\left[\left\|\sum\limits_{t=a\tau}^{k-1} g_i^t\right\|^2\right] &\overset{\eqref{eq:lemma_i_1_gorbunov}}{\le}& \frac{2e\gamma^2(k - a\tau)}{N_k} \sum\limits_{i\in P_k} \sum\limits_{t=a\tau}^{k-1}\EE_g[\|\nabla f(\theta_i^t)\|^2]\notag\\ + &&\quad + \frac{2e\gamma^2}{N_k}\sum\limits_{i\in P_k} \sum\limits_{t=a\tau}^{k-1}\EE_g[\|g_i^t - \nabla f(\theta_i^t)\|^2]\notag\\ + &\overset{\eqref{eq:a+b},\eqref{eq:bounded_variance}}{\le}& 4e\gamma^2(\tau-1) \sum\limits_{t=a\tau}^{k-1}\EE_g[\|\nabla f(\theta^t)\|^2] \notag\\ + &&\quad+ 4e\gamma^2(\tau-1) \sum\limits_{t=a\tau}^{k-1}\frac{1}{N_k}\sum\limits_{i\in P_k}\EE_g[\|\nabla f(\theta_i^t) - \nabla f(\theta^t)\|^2] \notag\\ + &&\quad+ 2e\gamma^2 (k - a\tau)\sigma^2\notag\\ + &\overset{\eqref{eq:L_smoothness_def}}{\le}& 4e\gamma^2(\tau-1) \sum\limits_{t=a\tau}^{k-1}\EE_g[\|\nabla f(\theta^t)\|^2]\notag\\ + &&\quad + 4e\gamma^2L^2(\tau-1) \sum\limits_{t=a\tau}^{k-1}\frac{N_t}{N_k}\cdot\frac{1}{N_t}\sum\limits_{i\in P_t}\EE_g[\|\theta_i^t - \theta^t\|^2]\notag\\ + &&\quad + 2e\gamma^2(\tau-1)\sigma^2\notag\\ + &\le& 4e\gamma^2(\tau-1) \sum\limits_{t=a\tau}^{k-1}\EE_g[\|\nabla f(\theta^t)\|^2] \notag\\ + &&\quad + 8e\gamma^2L^2(\tau-1) \sum\limits_{t=a\tau}^{k-1}\EE_g[V_t] + 2e\gamma^2(\tau-1)\sigma^2,\notag + \end{eqnarray} + where in the last two inequalities we use $N_k = |P_k| \le |P_{k-1}| = N_{k-1}$ for all $k\ge 1$ and $N_{a\tau} \le 2 N_{(a+1)\tau}$ for all integer $a \ge 0$. Plugging this inequality in \eqref{eq:V_k_lemma_non_cvx_tech_1} and taking the full expectation from the result, we get + \begin{eqnarray} + \EE[V_k] &\le& 2\EE\left[\frac{1}{N_k}\sum\limits_{i\in P_k}\|\theta_i^{a\tau} - \theta^{a\tau}\|^2\right] + 4e\gamma^2(\tau-1) \sum\limits_{t=a\tau}^{k-1}\EE[\|\nabla f(\theta^t)\|^2]\notag\\ + &&\quad + 8e\gamma^2L^2(\tau-1) \sum\limits_{t=a\tau}^{k-1}\EE[V_t] + 2e\gamma^2(\tau-1)\sigma^2\notag\\ + &\le& 4\EE\left[\frac{1}{N_{a\tau}}\sum\limits_{i\in P_{a\tau}}\|\theta_i^{a\tau} - \theta^{a\tau}\|^2\right] + 4e\gamma^2(\tau-1) \sum\limits_{t=a\tau}^{k-1}\EE[\|\nabla f(\theta^t)\|^2] \notag\\ + &&\quad + 8e\gamma^2L^2(\tau-1) \sum\limits_{t=a\tau}^{k-1}\EE[V_t] + 2e\gamma^2(\tau-1)\sigma^2\notag\\ + &\overset{\eqref{eq:quality_of_avg}}{\le}& 4e\gamma^2(\tau-1) \sum\limits_{t=a\tau}^{k-1}\EE[\|\nabla f(\theta^t)\|^2] + 8e\gamma^2L^2(\tau-1) \sum\limits_{t=a\tau}^{k-1}\EE[V_t]\notag\\ + &&\quad + 2\gamma^2\left(2\delta_{aq}^2 + e(\tau-1)\sigma^2\right),\notag + \end{eqnarray} + where in the second inequality we also use $N_k = |P_k| \le |P_{k-1}| = N_{k-1}$ for all $k\ge 1$ and $N_{a\tau} \le 2 N_{(a+1)\tau}$ for all integer $a \ge 0$. 
Summing up the obtained inequalities for $k = a\tau, a\tau+1,\ldots, K'$ for some $K' \in[a\tau, (a+1)\tau-1]$ we derive + \begin{eqnarray*} + \sum\limits_{k=a\tau}^{K'}\EE[V_k] &\le& 4e\gamma^2(\tau-1)\sum\limits_{k=a\tau}^{K'} \sum\limits_{t=a\tau}^{k-1}\EE[\|\nabla f(\theta^t)\|^2] + 8e\gamma^2L^2(\tau-1) \sum\limits_{k=a\tau}^{K'}\sum\limits_{t=a\tau}^{k-1}\EE[V_t]\\ + &&\quad + 2\gamma^2(K'-a\tau+1)\left(2\delta_{aq}^2 + e(\tau-1)\sigma^2\right)\\ + &\le& 4e\gamma^2(\tau-1)^2\sum\limits_{k=a\tau}^{K'} \EE[\|\nabla f(\theta^k)\|^2] + 8e\gamma^2L^2(\tau-1)^2 \sum\limits_{k=a\tau}^{K'}\EE[V_k]\\ + &&\quad + 2\gamma^2(K'-a\tau+1)\left(2\delta_{aq}^2 + e(\tau-1)\sigma^2\right)\\ + &\le& 4e\gamma^2(\tau-1)^2\sum\limits_{k=a\tau}^{K'} \EE[\|\nabla f(\theta^k)\|^2] + \frac{1}{2} \sum\limits_{k=a\tau}^{K'}\EE[V_k]\notag\\ + &&\quad + 2\gamma^2(K'-a\tau+1)\left(2\delta_{aq}^2 + e(\tau-1)\sigma^2\right), + \end{eqnarray*} + where in the last inequality we use $\gamma \le \nicefrac{1}{\left(4\sqrt{e}L(\tau-1)\right)}$. Rearranging the terms, we get that for $K' \ge 0$ + \begin{eqnarray*} + \sum\limits_{k=a\tau}^{K'} \EE[V_k] &\le& 8e\gamma^2(\tau-1)^2\sum\limits_{k=a\tau}^{K'}\EE[\|\nabla f(\theta^k)\|^2] + 4\gamma^2(K'-a\tau+1)\left(2\delta_{aq}^2 + e(\tau-1)\sigma^2\right), + \end{eqnarray*} + where $a\ge 0$ is an integer such that $a\tau \le K' \le (a+1)\tau - 1$. Summing up the obtained inequalities for $K' = \tau-1, 2\tau-1,\ldots, \tau\lfloor\nicefrac{(K-1)}{\tau}\rfloor - 1, K-1$, we derive \eqref{eq:V_k_lemma_non_cvx}. +\end{proof} + +Combining Lemmas~\ref{lem:key_lemma_non_cvx}~and~\ref{lem:V_k_lemma_non_cvx}, we get the following result: +\begin{theorem}[Theorem~\ref{thm:non_cvx_convergence}] + Let $f_1 = \ldots = f_N = f$, function $f$ be $L$-smooth and bounded from below by $f_*$, and Assumptions~\ref{as:bounded_var}~and~\ref{as:averaging_quality} hold with $\Delta_{pv}^k = \delta_{pv,1}\gamma\EE[\|\nabla f(\theta^k)\|^2] + L\gamma^2\delta_{pv,2}^2$, $\delta_{pv,1}\in [0,\nicefrac{1}{2})$, $\delta_{pv,2}\ge 0$. Then, for any $K \ge 0$ the iterates produced by Moshpit SGD with + \begin{equation*} + \gamma \le \min\left\{\frac{1-2\delta_{pv,1}}{8L},\frac{\sqrt{1-2\delta_{pv,1}}}{8\sqrt{e}L(\tau-1)}\right\} + \end{equation*} + satisfy + \begin{eqnarray} + \EE\left[\|\nabla f(\theta_{\text{rand}}^K)\|^2\right] &\le& \frac{8\Delta_0}{(1-2\delta_{pv,1})K\gamma} \notag\\ + &&\quad + \frac{8L\gamma}{1-2\delta_{pv,1}}\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2 + 4\gamma L\left(2\delta_{aq}^2 + e(\tau-1)\sigma^2\right)\right), \label{eq:non_cvx_bound_supp} + \end{eqnarray} + where $\Delta_0 = f(\theta^0) - f_*$ and $\theta_{\text{rand}}^K$ is chosen uniformly at random from $\{\theta^0,\theta^1,\ldots,\theta^{K-1}\}$. 
That is, Moshpit SGD achieves $\EE\left[\|\nabla f(\theta_{\text{rand}}^K)\|^2\right] \le \varepsilon^2$ after
+    \begin{eqnarray}
+        \cO\Bigg(\frac{L\Delta_0}{(1-2\delta_{pv,1})^2\varepsilon^2}\Bigg[1 +(\tau-1)\sqrt{1-2\delta_{pv,1}} + \frac{\delta_{pv,2}^2 + \nicefrac{\sigma^2}{N_{\min}}}{\varepsilon^2}&\notag\\
+        &\hspace{-2cm} + \frac{\sqrt{(1-2\delta_{pv,1})(\delta_{aq}^2+(\tau-1)\sigma^2)}}{\varepsilon}\Bigg]\Bigg)\label{eq:non_cvx_bound_2_supp}
+    \end{eqnarray}
+    iterations with
+    \begin{equation*}
+        \gamma = \min\left\{\frac{1-2\delta_{pv,1}}{8L},\frac{\sqrt{1-2\delta_{pv,1}}}{8\sqrt{e}L(\tau-1)}, \sqrt{\frac{\Delta_0}{LK\left(\delta_{pv,2}^2 + \nicefrac{\sigma^2}{N_{\min}}\right)}}, \sqrt[3]{\frac{\Delta_0}{4L^2\left(2\delta_{aq}^2 + e(\tau-1)\sigma^2\right)}}\right\}.
+    \end{equation*}
+\end{theorem}
+\begin{proof}[Proof of Theorem~\ref{thm:non_cvx_convergence}]
+    Plugging the result of Lemma~\ref{lem:V_k_lemma_non_cvx} into the inequality \eqref{eq:key_lemma_non_cvx} from Lemma~\ref{lem:key_lemma_non_cvx}, we obtain
+    \begin{eqnarray*}
+        \frac{(1-2\delta_{pv,1})\gamma}{4}\sum\limits_{k=0}^{K-1}\EE\left[\|\nabla f(\theta^k)\|^2\right] &\le& f(\theta^0) - f_* + 8e\gamma^3L^2\tau(\tau-1)\sum\limits_{k=0}^{K-1}\EE[\|\nabla f(\theta^k)\|^2] \\
+        &&\quad + KL\gamma^2\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2\right)\\
+        &&\quad + 4KL^2\gamma^3\left(2\delta_{aq}^2 + e(\tau-1)\sigma^2\right)\\
+        &\le& f(\theta^0) - f_* + \frac{(1-2\delta_{pv,1})\gamma}{8}\sum\limits_{k=0}^{K-1}\EE\left[\|\nabla f(\theta^k)\|^2\right] \\
+        &&\quad + KL\gamma^2\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2\right)\\
+        &&\quad + 4KL^2\gamma^3\left(2\delta_{aq}^2 + e(\tau-1)\sigma^2\right).
+    \end{eqnarray*}
+    Next,
+    \begin{eqnarray*}
+        \frac{1}{K}\sum\limits_{k=0}^{K-1}\EE\left[\|\nabla f(\theta^k)\|^2\right] &\le& \frac{8\Delta_0}{(1-2\delta_{pv,1})K\gamma} \\
+        &&\quad + \frac{8L\gamma}{1-2\delta_{pv,1}}\left(\frac{\sigma^2}{N_{\min}} + \delta_{pv,2}^2 + 4\gamma L\left(2\delta_{aq}^2 + e(\tau-1)\sigma^2\right)\right),
+    \end{eqnarray*}
+    where $\Delta_0 = f(\theta^0) - f_*$. Since $\theta_{\text{rand}}^K$ is chosen uniformly at random from $\{\theta^0,\theta^1,\ldots,\theta^{K-1}\}$,
+    \begin{equation*}
+        \EE\left[\|\nabla f(\theta_{\text{rand}}^K)\|^2\right] \overset{\eqref{eq:tower_property}}{=} \frac{1}{K}\sum\limits_{k=0}^{K-1}\EE\left[\|\nabla f(\theta^k)\|^2\right]
+    \end{equation*}
+    and \eqref{eq:non_cvx_bound_supp} holds. Applying Lemma~\ref{lem:lemma_i_3_gorbunov} to \eqref{eq:non_cvx_bound_supp}, we get the following result: if
+    \begin{equation*}
+        \gamma = \min\left\{\frac{1-2\delta_{pv,1}}{8L},\frac{\sqrt{1-2\delta_{pv,1}}}{8\sqrt{e}L(\tau-1)}, \sqrt{\frac{\Delta_0}{LK\left(\delta_{pv,2}^2 + \nicefrac{\sigma^2}{N_{\min}}\right)}}, \sqrt[3]{\frac{\Delta_0}{4L^2\left(2\delta_{aq}^2 + e(\tau-1)\sigma^2\right)}}\right\},
+    \end{equation*}
+    then $\EE\left[\|\nabla f(\theta_{\text{rand}}^K)\|^2\right]$ equals
+    \begin{equation*}
+        \cO\!\left(\!\frac{L\Delta_0\left(1\!+\! (\tau\!-\!1)\sqrt{1\!-\!2\delta_{pv,1}}\right)}{(1\!-\!2\delta_{pv,1})^2K} + \sqrt{\frac{L\Delta_0\left(\delta_{pv,2}^2\! +\! \nicefrac{\sigma^2}{N_{\min}}\right)}{(1\!-\!2\delta_{pv,1})^2K}} + \frac{\sqrt[3]{L^2\Delta_0^2(\delta_{aq}^2\! +\! (\tau\!-\!1)\sigma^2)}}{(1\!-\!2\delta_{pv,1})K^{\nicefrac{2}{3}}}\!\right)\!,
+    \end{equation*}
+    which implies the desired convergence result from \eqref{eq:non_cvx_bound_2_supp}.
+\end{proof} \ No newline at end of file diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/related.tex b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/related.tex new file mode 100644 index 00000000..32d6d18d --- /dev/null +++ b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/related.tex @@ -0,0 +1,42 @@ +\vspace{-10px}% +\section{Related Work}\label{sect:related} +\vspace{-4px} +\subsection{Data parallel training}\label{sect:related_data_parallel} +\vspace{-4px} + +The most popular way to accelerate neural network training with multiple devices is data-parallel training~\cite{valiant1990bridging,goyal2017accurate,You2020Large}. On each optimization step, this strategy splits the training batch among participants. Each participant then runs forward and backward passes to obtain gradients of the objective function on their part of the training batch. After that, we can aggregate the gradients from workers and perform an optimization step. There are two main strategies for this aggregation. + +Historically, the first solution to gradient aggregation was to use Parameter Server (PS)~\cite{parameter_server_first}: a separate process or a dedicated server that keeps track of model parameters and optimizer statistics. After each round, the PS accumulates the gradients from each worker and updates the model parameters using SGD or any other optimizer, such as Adam~\cite{adam}. Finally, the server distributes the updated model parameters to workers. + +This strategy is robust and easy to implement, but it requires the server to regularly download full model gradients from every single worker. As a result, the parameter server can quickly become a bottleneck for large-scale training~\cite{survey_distributed2}\nocite{survey_distributed}. Since the original PS, researchers have proposed several modifications that reduce the communication load: accumulating multiple batches~\cite{localsgd_first}, compression~\cite{lin2018deep,pmlr-v97-koloskova19a}, server sharding~\cite{sharded_ps_first,byteps}. A more detailed overview is given in Appendix~\ref{sect:post_related}. + +In turn, many practical distributed training systems have instead switched to averaging with All-Reduce ~\cite{goyal2017accurate,mikami2019massively,shoeybi2019megatron,You2020Large}. This name refers to a collection of protocols originally developed for HPC applications. Workers can follow these protocols to collectively compute the average\footnote{All-Reduce works with any commutative associative operation, such as min, max, or product.} gradient more efficiently than with a central server. + +\subsection{Communication-efficient All-Reduce}\label{sect:related_allreduce} +There are several all-reduce protocols optimized for different network topologies. The simplest one is known as Butterfly All-Reduce~\cite{bandwidth_optimal_allreduce}. Each of $N$ participants splits its local vector into $N$ chunks. Then, $i$-th worker aggregates $i$-th chunk of data from all peers and sends back the averaged chunk. + +\begin{figure}[h!] + \centering + \includegraphics[width=0.65\linewidth]{resources/butterfly.pdf} + \caption{A schematic illustration of Butterfly All-Reduce.} + \label{fig:butterfly_allreduce} +\end{figure} + +As long as the vector size $s$ is greater than $N$, this protocol uses $\cO\left(s \times \frac{N - 1}{N}\right)$ total bandwidth on each worker. However, it requires all-to-all communication, which is not always practical for the HPC infrastructure due to network contention~\cite{bandwidth_optimal_allreduce}. 
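+As a concrete illustration of this chunk-averaging pattern, the following minimal single-process sketch simulates Butterfly All-Reduce over in-memory ``peers'' (illustrative only: NumPy is assumed, and the peer-to-peer transport and failure handling that motivate the protocol are not modelled):
+\begin{verbatim}
+import numpy as np
+
+def butterfly_allreduce(local_vectors):
+    """local_vectors: list of N equal-length 1-D arrays; returns their average."""
+    n = len(local_vectors)
+    # every peer splits its local vector into n chunks
+    chunks = [np.array_split(v, n) for v in local_vectors]
+    # peer i gathers the i-th chunk from all peers and averages it
+    averaged = [np.mean([chunks[p][i] for p in range(n)], axis=0)
+                for i in range(n)]
+    # peers receive the averaged chunks back and re-assemble the full vector
+    return np.concatenate(averaged)
+
+peers = [np.random.randn(12) for _ in range(4)]
+assert np.allclose(butterfly_allreduce(peers), np.mean(peers, axis=0))
+\end{verbatim}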
As a result, real-world systems typically use Ring or Tree All-Reduce, where each worker only communicates with a small subset of its peers. + +These protocols enable highly efficient and scalable averaging with $\cO(1)$ or $\cO(\log N)$ total communication per worker, but they also share a common drawback: they cannot tolerate node failures or network instability. If any single participant fails to execute its part or takes long to respond, this paralyzes all other workers. + +\subsection{Distributed training in unstable conditions}\label{sect:related_unreliable} +Some distributed training applications must deal with unstable network bandwidth and/or unreliable workers. This issue is most prevalent in federated learning~\cite{mcmahan2017communication,secure_aggregation,federatedlearningatscale}. When dealing with privacy-sensitive data distributed across multiple actors, such as hospital servers~\cite{fed_intel,fed_nvidia} or mobile phones~\cite{fed_google1,fed_google2}, one must train the model using whichever hardware and network available to those actors. + +Another important motivational factor is cost: HPC-grade infrastructure can be prohibitively expensive, pushing researchers and practitioners towards commodity servers or preemptible cloud VMs that are significantly cheaper (see Appendix~\ref{sect:cloud_costs}). Another solution is to use volunteer computing~\cite{volunteer_dl_async, learning_at_home} with abundant, but even less reliable, compute resources. + +Training under these conditions requires specialized strategies. At a small scale, one can deploy one or a few reliable parameter servers to aggregate the updates from workers. This strategy can tolerate individual node failures~\cite{proteus}, but scales poorly due to the reasons discussed in Section~\ref{sect:related_data_parallel}. + +\subsection{Decentralized training}\label{sect:related_decentralized_training} +If there are too many participants for PS, it can be advantageous to use decentralized SGD via \textbf{gossip-based} averaging \cite{boyd2006randomized,tsitsiklis1984problems,lian2017can}. In this scenario, participants form a sparse graph: each worker periodically downloads parameters from its neighbors and mixes them with local parameters. + +In essence, gossip-based averaging removes the communication bottlenecks of PS at the cost of using different local parameters on each peer. That said, gossip-based optimization algorithms can match, and sometimes even outperform, their centralized counterparts in terms of training speed~\cite{scaman2017optimal,scaman2018optimal,scaman2019optimal,lian2017can,assran2019stochastic}. However, the convergence properties of gossip averaging and gossip-based optimization methods significantly depend on the communication graph through the spectral properties of the mixing matrix~\cite{xiao2004fast,scaman2019optimal} or the Laplacian matrix of the network~\cite{merris1994laplacian,uribe2020dual}. + +Consequently, as the number of peers increases, gossip-based averaging has to either increase the number of neighbors (hence more communication) or accept slower convergence speed. Because of this, gossip is less communication-efficient than all-reduce algorithms reviewed in Section~\ref{sect:related_allreduce}. 
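+To make the mixing step concrete, here is a minimal sketch of synchronous gossip averaging on a ring topology (illustrative only: the mixing weights, topology and number of rounds are arbitrary choices, and decentralized SGD would interleave such rounds with local gradient updates):
+\begin{verbatim}
+import numpy as np
+
+def ring_mixing_matrix(n, self_weight=0.5):
+    # doubly-stochastic matrix: each peer mixes with its two ring neighbours
+    w = np.zeros((n, n))
+    for i in range(n):
+        w[i, i] = self_weight
+        w[i, (i - 1) % n] += (1 - self_weight) / 2
+        w[i, (i + 1) % n] += (1 - self_weight) / 2
+    return w
+
+n, dim = 16, 4
+params = np.random.randn(n, dim)   # one local parameter vector per peer
+w = ring_mixing_matrix(n)
+for _ in range(100):               # repeated gossip rounds
+    params = w @ params            # x_i <- sum_j W_ij x_j (mix with neighbours)
+# the spread around the global average shrinks with every round
+print(np.abs(params - params.mean(axis=0)).max())
+\end{verbatim}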
However, gossip-based algorithms are more robust to changes, which makes them applicable to time-varying networks~\cite{nedic2014distributed,nedic2016stochastic,nedic2018network,rogozin2019projected} and federated learning~\cite{ram2009asynchronous,yan2012distributed,yuan2016convergence}. + diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/albert_hours.pdf b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/albert_hours.pdf new file mode 100644 index 00000000..0edcf2fb Binary files /dev/null and b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/albert_hours.pdf differ diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/averaging.pdf b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/averaging.pdf new file mode 100644 index 00000000..745e334f Binary files /dev/null and b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/averaging.pdf differ diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/butterfly.pdf b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/butterfly.pdf new file mode 100644 index 00000000..0fe739d2 Binary files /dev/null and b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/butterfly.pdf differ diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/moshpit.pdf b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/moshpit.pdf new file mode 100644 index 00000000..ca3cfd9a Binary files /dev/null and b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/moshpit.pdf differ diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/multiple_graphics.pdf b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/multiple_graphics.pdf new file mode 100644 index 00000000..2ad13270 Binary files /dev/null and b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/multiple_graphics.pdf differ diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/resnet50_local.pdf b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/resnet50_local.pdf new file mode 100644 index 00000000..dbe4d4f3 Binary files /dev/null and b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/resnet50_local.pdf differ diff --git a/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/resnet50_local_epochs.pdf b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/resnet50_local_epochs.pdf new file mode 100644 index 00000000..120206e7 Binary files /dev/null and b/2024/05/29/papers/2103.03239/2103.03239.tar.gz/resources/resnet50_local_epochs.pdf differ diff --git a/2024/05/29/papers/2105.06987/IEEEbib.bst b/2024/05/29/papers/2105.06987/IEEEbib.bst new file mode 100644 index 00000000..f9bf3cca --- /dev/null +++ b/2024/05/29/papers/2105.06987/IEEEbib.bst @@ -0,0 +1,1021 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% IEEE.bst %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Bibliography Syle file for articles according to IEEE instructions +% balemi@aut.ee.ethz.ch <22-JUN-93> +% modified from unsrt.bib. Contributions by Richard H. 
Roy + +ENTRY + { address + author + booktitle + chapter + edition + editor + howpublished + institution + journal + key + month + note + number + organization + pages + publisher + school + series + title + type + volume + year + } + {} + { label } + +INTEGERS { output.state before.all mid.sentence after.sentence after.block } + +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.sentence := + #3 'after.block := +} + +STRINGS { s t } + +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = + { ", " * write$ } + { output.state after.block = +% next line commented out by rhr and changed to write comma +% { add.period$ write$ + { ", " * write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ + { add.period$ " " * write$ } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} + +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 'output.nonnull + if$ +} + +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} + +FUNCTION {output.bibitem} +{ newline$ + "\bibitem{" write$ + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION {fin.entry} +{ add.period$ + write$ + newline$ +} + +% 5/24/89 rhr +% modified fin.entry function - prints note field after body of entry +%FUNCTION {fin.entry} +%{ add.period$ +% note empty$ +% 'write$ +% { "\par\bgroup\parindent=0em " * annote * "\par\egroup " * write$ +% } +% if$ +% newline$ +%} + +FUNCTION {new.block} +{ output.state before.all = + 'skip$ + { after.block 'output.state := } + if$ +} + +% new block without terminating last block with a comma +FUNCTION {new.ncblock} +{ + write$ + newline$ + "\newblock " + before.all 'output.state := +} + +FUNCTION {new.nccont} +{ + write$ + " " + before.all 'output.state := +} + +FUNCTION {new.sentence} +{ output.state after.block = + 'skip$ + { output.state before.all = + 'skip$ + { after.sentence 'output.state := } + if$ + } + if$ +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} + +FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} + +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} + +FUNCTION {new.block.checka} +{ empty$ + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.block.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.sentence.checka} +{ empty$ + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {new.sentence.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {field.or.null} +{ duplicate$ empty$ + { pop$ "" } + 'skip$ + if$ +} + +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } + { "{\em " swap$ * "}" * } + if$ +} + +FUNCTION {boldface} +{ duplicate$ empty$ + { pop$ "" } + { "{\bf " swap$ * "}" * } + if$ +} + +%FUNCTION {boldface} +%{ 's swap$ := +% s "" = +% { "" } +% { "{\bf " s * "}" * } +% if$ +%} +% +INTEGERS { nameptr namesleft numnames } + +FUNCTION {format.names} +{ 's := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't := + nameptr #1 > + { namesleft #1 > + { ", " * t * } + { numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { " et~al." 
* } + { " and " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {format.authors} +{ author empty$ + { "" } + { author format.names } + if$ +} + +FUNCTION {format.editors} +{ editor empty$ + { "" } + { editor format.names + editor num.names$ #1 > + { ", Eds." * } + { ", Ed." * } + if$ + } + if$ +} + +FUNCTION {format.title} +{ title empty$ + { "" } + { "``" title "t" change.case$ * } + if$ +} + +FUNCTION {n.dashify} +{ 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "--" = not + { "--" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {format.date} +{ year empty$ + { month empty$ + { "" } + { "there's a month but no year in " cite$ * warning$ + month + } + if$ + } + { month empty$ + 'year + { month " " * year * } + if$ + } + if$ +} + +% FUNCTION {format.date} +% { year empty$ +% 'year +% { " " year * } +% if$ +% } + +FUNCTION {format.btitle} +{ title emphasize +} + +FUNCTION {tie.or.space.connect} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ * * +} + +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "can't use both " swap$ * " fields in " * cite$ * warning$ } + if$ +} + +FUNCTION {format.bvolume} +{ volume empty$ + { "" } + { "vol." volume tie.or.space.connect + series empty$ + 'skip$ + { " of " * series emphasize * } + if$ + "volume and number" number either.or.check + } + if$ +} + +FUNCTION {format.number.series} +{ volume empty$ + { number empty$ + { series field.or.null } + { output.state mid.sentence = + { "number" } + { "Number" } + if$ + number tie.or.space.connect + series empty$ + { "there's a number but no series in " cite$ * warning$ } + { " in " * series * } + if$ + } + if$ + } + { "" } + if$ +} + +FUNCTION {format.edition} +{ edition empty$ + { "" } + { output.state mid.sentence = + { edition "l" change.case$ " edition" * } + { edition "t" change.case$ " edition" * } + if$ + } + if$ +} + +INTEGERS { multiresult } + +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} + +FUNCTION {format.pages} +{ pages empty$ + { "" } + { pages multi.page.check + { "pp." pages n.dashify tie.or.space.connect } + { "p." pages tie.or.space.connect } + if$ + } + if$ +} + +FUNCTION {format.vol.num.pages} +{ +volume empty$ + {"" } + {"vol. " volume *} +if$ +number empty$ + 'skip$ + {", no. " number * *} +if$ +pages empty$ + 'skip$ + { duplicate$ empty$ + { pop$ format.pages } + { ", pp. 
" * pages n.dashify * } + if$ + } +if$ +} + +%FUNCTION {format.vol.num.pages} +%%boldface added 3/17/87 rhr +%{ volume field.or.null boldface +% number empty$ +% 'skip$ +% { "(" number * ")" * * +% volume empty$ +% { "there's a number but no volume in " cite$ * warning$ } +% 'skip$ +% if$ +% } +% if$ +% pages empty$ +% 'skip$ +% { duplicate$ empty$ +% { pop$ format.pages } +% { ":" * pages n.dashify * } +% if$ +% } +% if$ +%} + +FUNCTION {format.chapter.pages} +{ chapter empty$ + 'format.pages + { type empty$ + { "chapter" } + { type "l" change.case$ } + if$ + chapter tie.or.space.connect + pages empty$ + 'skip$ + { ", " * format.pages * } + if$ + } + if$ +} + +FUNCTION {format.in.ed.booktitle} +{ booktitle empty$ + { "" } + { editor empty$ + { "in " booktitle emphasize * } + { "in " booktitle emphasize * ", " * format.editors * } + if$ + } + if$ +} + +FUNCTION {empty.misc.check} +{ author empty$ title empty$ howpublished empty$ + month empty$ year empty$ note empty$ + and and and and and + { "all relevant fields are empty in " cite$ * warning$ } + 'skip$ + if$ +} + +FUNCTION {format.thesis.type} +{ type empty$ + 'skip$ + { pop$ + type "t" change.case$ + } + if$ +} + +FUNCTION {format.tr.number} +{ type empty$ + { "Tech. {R}ep." } + 'type + if$ + number empty$ + { "t" change.case$ } + { number tie.or.space.connect } + if$ +} + +FUNCTION {format.article.crossref} +{ key empty$ + { journal empty$ + { "need key or journal for " cite$ * " to crossref " * crossref * + warning$ + "" + } + { "In {\em " journal * "\/}" * } + if$ + } + { "In " key * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {format.crossref.editor} +{ editor #1 "{vv~}{ll}" format.name$ + editor num.names$ duplicate$ + #2 > + { pop$ " et~al." * } + { #2 < + 'skip$ + { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = + { " et~al." * } + { " and " * editor #2 "{vv~}{ll}" format.name$ * } + if$ + } + if$ + } + if$ +} + +FUNCTION {format.book.crossref} +{ volume empty$ + { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + "In " + } + { "vol." 
volume tie.or.space.connect + " of " * + } + if$ + editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { series empty$ + { "need editor, key, or series for " cite$ * " to crossref " * + crossref * warning$ + "" * + } + { "{\em " * series * "\/}" * } + if$ + } + { key * } + if$ + } + { format.crossref.editor * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {format.incoll.inproc.crossref} +{ editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { booktitle empty$ + { "need editor, key, or booktitle for " cite$ * " to crossref " * + crossref * warning$ + "" + } + { "In {\em " booktitle * "\/}" * } + if$ + } + { "In " key * } + if$ + } + { "In " format.crossref.editor * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + new.block + format.title ",''" * "title" output.check + new.ncblock + crossref missing$ + { journal emphasize "journal" output.check + format.vol.num.pages output + format.date "year" output.check + } + { format.article.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {book} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {booklet} +{ output.bibitem + format.authors output + new.block + format.title ",''" * "title" output.check + new.nccont + howpublished address new.block.checkb + howpublished output + address output + format.date output + new.block + note output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + format.chapter.pages "chapter and pages" output.check + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { format.chapter.pages "chapter and pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + format.authors "author" output.check + new.block + format.title ",''" * "title" output.check + new.ncblock + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.chapter.pages output + new.sentence + publisher "publisher" output.check + address output + format.edition output + format.date "year" output.check + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {inproceedings} +{ output.bibitem + format.authors "author" output.check + new.block + format.title ",''" * "title" output.check + new.ncblock + crossref missing$ + { format.in.ed.booktitle 
"booktitle" output.check + address empty$ + { organization publisher new.sentence.checkb + organization output + format.date "year" output.check + } + { address output.nonnull + format.date "year" output.check + organization output + } + if$ + format.bvolume output + format.number.series output + format.pages output + publisher output + } + { format.incoll.inproc.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {conference} { inproceedings } + +FUNCTION {manual} +{ output.bibitem + author empty$ + { organization empty$ + 'skip$ + { organization output.nonnull + address output + } + if$ + } + { format.authors output.nonnull } + if$ + new.block + format.btitle "title" output.check + author empty$ + { organization empty$ + { address new.block.checka + address output + } + 'skip$ + if$ + } + { organization address new.block.checkb + organization output + address output + } + if$ + format.edition output + format.date output + new.block + note output + fin.entry +} + +FUNCTION {mastersthesis} +{ output.bibitem + format.authors "author" output.check + new.block + format.title ",''" * "title" output.check + new.ncblock + "M.S. thesis" format.thesis.type output.nonnull + school "school" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {misc} +{ output.bibitem + format.authors output + title howpublished new.block.checkb + format.title ",''" * output + new.nccont + howpublished new.block.checka + howpublished output + format.date output + new.block + note output + fin.entry + empty.misc.check +} + +FUNCTION {phdthesis} +{ output.bibitem + format.authors "author" output.check + new.block + format.btitle "title" output.check + new.block + "Ph.D. 
thesis" format.thesis.type output.nonnull + school "school" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {proceedings} +{ output.bibitem + editor empty$ + { organization output } + { format.editors output.nonnull } + if$ + new.block + format.btitle "title" output.check + format.bvolume output + format.number.series output + address empty$ + { editor empty$ + { publisher new.sentence.checka } + { organization publisher new.sentence.checkb + organization output + } + if$ + publisher output + format.date "year" output.check + } + { address output.nonnull + format.date "year" output.check + new.sentence + editor empty$ + 'skip$ + { organization output } + if$ + publisher output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {techreport} +{ output.bibitem + format.authors "author" output.check + new.block + format.title ",''" * "title" output.check + new.ncblock + format.tr.number output.nonnull + institution "institution" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {unpublished} +{ output.bibitem + format.authors "author" output.check + new.block + format.title ",''" * "title" output.check + new.ncblock + note "note" output.check + format.date output + fin.entry +} + +FUNCTION {default.type} { misc } + +MACRO {jan} {"Jan."} + +MACRO {feb} {"Feb."} + +MACRO {mar} {"Mar."} + +MACRO {apr} {"Apr."} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"Aug."} + +MACRO {sep} {"Sept."} + +MACRO {oct} {"Oct."} + +MACRO {nov} {"Nov."} + +MACRO {dec} {"Dec."} + +MACRO {acmcs} {"ACM Computing Surveys"} + +MACRO {acta} {"Acta Informatica"} + +MACRO {cacm} {"Communications of the ACM"} + +MACRO {ibmjrd} {"IBM Journal of Research and Development"} + +MACRO {ibmsj} {"IBM Systems Journal"} + +MACRO {ieeese} {"IEEE Transactions on Software Engineering"} + +MACRO {ieeetc} {"IEEE Transactions on Computers"} + +MACRO {ieeetcad} + {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"} + +MACRO {ipl} {"Information Processing Letters"} + +MACRO {jacm} {"Journal of the ACM"} + +MACRO {jcss} {"Journal of Computer and System Sciences"} + +MACRO {scp} {"Science of Computer Programming"} + +MACRO {sicomp} {"SIAM Journal on Computing"} + +MACRO {tocs} {"ACM Transactions on Computer Systems"} + +MACRO {tods} {"ACM Transactions on Database Systems"} + +MACRO {tog} {"ACM Transactions on Graphics"} + +MACRO {toms} {"ACM Transactions on Mathematical Software"} + +MACRO {toois} {"ACM Transactions on Office Information Systems"} + +MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"} + +MACRO {tcs} {"Theoretical Computer Science"} + +READ + +STRINGS { longest.label } + +INTEGERS { number.label longest.label.width } + +FUNCTION {initialize.longest.label} +{ "" 'longest.label := + #1 'number.label := + #0 'longest.label.width := +} + +FUNCTION {longest.label.pass} +{ number.label int.to.str$ 'label := + number.label #1 + 'number.label := + label width$ longest.label.width > + { label 'longest.label := + label width$ 'longest.label.width := + } + 'skip$ + if$ +} + +EXECUTE {initialize.longest.label} + +ITERATE {longest.label.pass} + +FUNCTION {begin.bib} +{ preamble$ empty$ + 'skip$ + { preamble$ write$ newline$ } + if$ + "\begin{thebibliography}{" longest.label * "}" * write$ newline$ +} + +EXECUTE {begin.bib} + +EXECUTE {init.state.consts} + +ITERATE {call.type$} + +FUNCTION {end.bib} +{ newline$ + 
"\end{thebibliography}" write$ newline$ +} + +EXECUTE {end.bib} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%% End of IEEE.bst %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/2024/05/29/papers/2105.06987/abstract.tex b/2024/05/29/papers/2105.06987/abstract.tex new file mode 100644 index 00000000..fdd59b98 --- /dev/null +++ b/2024/05/29/papers/2105.06987/abstract.tex @@ -0,0 +1,4 @@ +Ensembles of machine learning models yield improved system performance as well as robust and interpretable uncertainty estimates; however, their inference costs may often be prohibitively high. +\emph{Ensemble Distribution Distillation} is an approach that allows a single model to efficiently capture both the predictive performance and uncertainty estimates of an ensemble. For classification, this is achieved by training a Dirichlet distribution over the ensemble members' output distributions via the maximum likelihood criterion. Although theoretically principled, this criterion exhibits poor convergence when applied to large-scale tasks where the number of classes is very high. +In our work, we analyze this effect and show that for the Dirichlet log-likelihood criterion classes with low probability induce larger gradients than high-probability classes. This forces the model to focus on the distribution of the ensemble tail-class probabilities. +We propose a new training objective which minimizes the reverse KL-divergence to a \emph{Proxy-Dirichlet} target derived from the ensemble. This loss resolves the gradient issues of Ensemble Distribution Distillation, as we demonstrate both theoretically and empirically on the ImageNet and WMT17 En-De datasets containing 1000 and 40,000 classes, respectively. \ No newline at end of file diff --git a/2024/05/29/papers/2105.06987/algorithm.sty b/2024/05/29/papers/2105.06987/algorithm.sty new file mode 100644 index 00000000..843e3d5b --- /dev/null +++ b/2024/05/29/papers/2105.06987/algorithm.sty @@ -0,0 +1,79 @@ +% ALGORITHM STYLE -- Released 8 April 1996 +% for LaTeX-2e +% Copyright -- 1994 Peter Williams +% E-mail Peter.Williams@dsto.defence.gov.au +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{algorithm} +\typeout{Document Style `algorithm' - floating environment} + +\RequirePackage{float} +\RequirePackage{ifthen} +\newcommand{\ALG@within}{nothing} +\newboolean{ALG@within} +\setboolean{ALG@within}{false} +\newcommand{\ALG@floatstyle}{ruled} +\newcommand{\ALG@name}{Algorithm} +\newcommand{\listalgorithmname}{List of \ALG@name s} + +% Declare Options +% first appearance +\DeclareOption{plain}{ + \renewcommand{\ALG@floatstyle}{plain} +} +\DeclareOption{ruled}{ + \renewcommand{\ALG@floatstyle}{ruled} +} +\DeclareOption{boxed}{ + \renewcommand{\ALG@floatstyle}{boxed} +} +% then numbering convention +\DeclareOption{part}{ + \renewcommand{\ALG@within}{part} + \setboolean{ALG@within}{true} +} +\DeclareOption{chapter}{ + \renewcommand{\ALG@within}{chapter} + \setboolean{ALG@within}{true} +} +\DeclareOption{section}{ + \renewcommand{\ALG@within}{section} + \setboolean{ALG@within}{true} +} +\DeclareOption{subsection}{ + \renewcommand{\ALG@within}{subsection} + \setboolean{ALG@within}{true} +} +\DeclareOption{subsubsection}{ + \renewcommand{\ALG@within}{subsubsection} + \setboolean{ALG@within}{true} +} +\DeclareOption{nothing}{ + \renewcommand{\ALG@within}{nothing} + \setboolean{ALG@within}{true} +} +\DeclareOption*{\edef\ALG@name{\CurrentOption}} + +% ALGORITHM +% +\ProcessOptions +\floatstyle{\ALG@floatstyle} +\ifthenelse{\boolean{ALG@within}}{ + \ifthenelse{\equal{\ALG@within}{part}} + 
{\newfloat{algorithm}{htbp}{loa}[part]}{} + \ifthenelse{\equal{\ALG@within}{chapter}} + {\newfloat{algorithm}{htbp}{loa}[chapter]}{} + \ifthenelse{\equal{\ALG@within}{section}} + {\newfloat{algorithm}{htbp}{loa}[section]}{} + \ifthenelse{\equal{\ALG@within}{subsection}} + {\newfloat{algorithm}{htbp}{loa}[subsection]}{} + \ifthenelse{\equal{\ALG@within}{subsubsection}} + {\newfloat{algorithm}{htbp}{loa}[subsubsection]}{} + \ifthenelse{\equal{\ALG@within}{nothing}} + {\newfloat{algorithm}{htbp}{loa}}{} +}{ + \newfloat{algorithm}{htbp}{loa} +} +\floatname{algorithm}{\ALG@name} + +\newcommand{\listofalgorithms}{\listof{algorithm}{\listalgorithmname}} + diff --git a/2024/05/29/papers/2105.06987/algorithmic.sty b/2024/05/29/papers/2105.06987/algorithmic.sty new file mode 100644 index 00000000..ad614783 --- /dev/null +++ b/2024/05/29/papers/2105.06987/algorithmic.sty @@ -0,0 +1,201 @@ +% ALGORITHMIC STYLE -- Released 8 APRIL 1996 +% for LaTeX version 2e +% Copyright -- 1994 Peter Williams +% E-mail PeterWilliams@dsto.defence.gov.au +% +% Modified by Alex Smola (08/2000) +% E-mail Alex.Smola@anu.edu.au +% +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{algorithmic} +\typeout{Document Style `algorithmic' - environment} +% +\RequirePackage{ifthen} +\RequirePackage{calc} +\newboolean{ALC@noend} +\setboolean{ALC@noend}{false} +\newcounter{ALC@line} +\newcounter{ALC@rem} +\newlength{\ALC@tlm} +% +\DeclareOption{noend}{\setboolean{ALC@noend}{true}} +% +\ProcessOptions +% +% ALGORITHMIC +\newcommand{\algorithmicrequire}{\textbf{Require:}} +\newcommand{\algorithmicensure}{\textbf{Ensure:}} +\newcommand{\algorithmiccomment}[1]{\{#1\}} +\newcommand{\algorithmicend}{\textbf{end}} +\newcommand{\algorithmicif}{\textbf{if}} +\newcommand{\algorithmicthen}{\textbf{then}} +\newcommand{\algorithmicelse}{\textbf{else}} +\newcommand{\algorithmicelsif}{\algorithmicelse\ \algorithmicif} +\newcommand{\algorithmicendif}{\algorithmicend\ \algorithmicif} +\newcommand{\algorithmicfor}{\textbf{for}} +\newcommand{\algorithmicforall}{\textbf{for all}} +\newcommand{\algorithmicdo}{\textbf{do}} +\newcommand{\algorithmicendfor}{\algorithmicend\ \algorithmicfor} +\newcommand{\algorithmicwhile}{\textbf{while}} +\newcommand{\algorithmicendwhile}{\algorithmicend\ \algorithmicwhile} +\newcommand{\algorithmicloop}{\textbf{loop}} +\newcommand{\algorithmicendloop}{\algorithmicend\ \algorithmicloop} +\newcommand{\algorithmicrepeat}{\textbf{repeat}} +\newcommand{\algorithmicuntil}{\textbf{until}} + +%changed by alex smola +\newcommand{\algorithmicinput}{\textbf{input}} +\newcommand{\algorithmicoutput}{\textbf{output}} +\newcommand{\algorithmicset}{\textbf{set}} +\newcommand{\algorithmictrue}{\textbf{true}} +\newcommand{\algorithmicfalse}{\textbf{false}} +\newcommand{\algorithmicand}{\textbf{and\ }} +\newcommand{\algorithmicor}{\textbf{or\ }} +\newcommand{\algorithmicfunction}{\textbf{function}} +\newcommand{\algorithmicendfunction}{\algorithmicend\ \algorithmicfunction} +\newcommand{\algorithmicmain}{\textbf{main}} +\newcommand{\algorithmicendmain}{\algorithmicend\ \algorithmicmain} +%end changed by alex smola + +\def\ALC@item[#1]{% +\if@noparitem \@donoparitem + \else \if@inlabel \indent \par \fi + \ifhmode \unskip\unskip \par \fi + \if@newlist \if@nobreak \@nbitem \else + \addpenalty\@beginparpenalty + \addvspace\@topsep \addvspace{-\parskip}\fi + \else \addpenalty\@itempenalty \addvspace\itemsep + \fi + \global\@inlabeltrue +\fi +\everypar{\global\@minipagefalse\global\@newlistfalse + \if@inlabel\global\@inlabelfalse \hskip -\parindent 
\box\@labels + \penalty\z@ \fi + \everypar{}}\global\@nobreakfalse +\if@noitemarg \@noitemargfalse \if@nmbrlist \refstepcounter{\@listctr}\fi \fi +\sbox\@tempboxa{\makelabel{#1}}% +\global\setbox\@labels + \hbox{\unhbox\@labels \hskip \itemindent + \hskip -\labelwidth \hskip -\ALC@tlm + \ifdim \wd\@tempboxa >\labelwidth + \box\@tempboxa + \else \hbox to\labelwidth {\unhbox\@tempboxa}\fi + \hskip \ALC@tlm}\ignorespaces} +% +\newenvironment{algorithmic}[1][0]{ +\let\@item\ALC@item + \newcommand{\ALC@lno}{% +\ifthenelse{\equal{\arabic{ALC@rem}}{0}} +{{\footnotesize \arabic{ALC@line}:}}{}% +} +\let\@listii\@listi +\let\@listiii\@listi +\let\@listiv\@listi +\let\@listv\@listi +\let\@listvi\@listi +\let\@listvii\@listi + \newenvironment{ALC@g}{ + \begin{list}{\ALC@lno}{ \itemsep\z@ \itemindent\z@ + \listparindent\z@ \rightmargin\z@ + \topsep\z@ \partopsep\z@ \parskip\z@\parsep\z@ + \leftmargin 1em + \addtolength{\ALC@tlm}{\leftmargin} + } + } + {\end{list}} + \newcommand{\ALC@it}{\addtocounter{ALC@line}{1}\addtocounter{ALC@rem}{1}\ifthenelse{\equal{\arabic{ALC@rem}}{#1}}{\setcounter{ALC@rem}{0}}{}\item} + \newcommand{\ALC@com}[1]{\ifthenelse{\equal{##1}{default}}% +{}{\ \algorithmiccomment{##1}}} + \newcommand{\REQUIRE}{\item[\algorithmicrequire]} + \newcommand{\ENSURE}{\item[\algorithmicensure]} + \newcommand{\STATE}{\ALC@it} + \newcommand{\COMMENT}[1]{\algorithmiccomment{##1}} +%changes by alex smola + \newcommand{\INPUT}{\item[\algorithmicinput]} + \newcommand{\OUTPUT}{\item[\algorithmicoutput]} + \newcommand{\SET}{\item[\algorithmicset]} +% \newcommand{\TRUE}{\algorithmictrue} +% \newcommand{\FALSE}{\algorithmicfalse} + \newcommand{\AND}{\algorithmicand} + \newcommand{\OR}{\algorithmicor} + \newenvironment{ALC@func}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@main}{\begin{ALC@g}}{\end{ALC@g}} +%end changes by alex smola + \newenvironment{ALC@if}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@for}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@whl}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@loop}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@rpt}{\begin{ALC@g}}{\end{ALC@g}} + \renewcommand{\\}{\@centercr} + \newcommand{\IF}[2][default]{\ALC@it\algorithmicif\ ##2\ \algorithmicthen% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\SHORTIF}[2]{\ALC@it\algorithmicif\ ##1\ + \algorithmicthen\ {##2}} + \newcommand{\ELSE}[1][default]{\end{ALC@if}\ALC@it\algorithmicelse% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\ELSIF}[2][default]% +{\end{ALC@if}\ALC@it\algorithmicelsif\ ##2\ \algorithmicthen% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\FOR}[2][default]{\ALC@it\algorithmicfor\ ##2\ \algorithmicdo% +\ALC@com{##1}\begin{ALC@for}} + \newcommand{\FORALL}[2][default]{\ALC@it\algorithmicforall\ ##2\ % +\algorithmicdo% +\ALC@com{##1}\begin{ALC@for}} + \newcommand{\SHORTFORALL}[2]{\ALC@it\algorithmicforall\ ##1\ % + \algorithmicdo\ {##2}} + \newcommand{\WHILE}[2][default]{\ALC@it\algorithmicwhile\ ##2\ % +\algorithmicdo% +\ALC@com{##1}\begin{ALC@whl}} + \newcommand{\LOOP}[1][default]{\ALC@it\algorithmicloop% +\ALC@com{##1}\begin{ALC@loop}} +%changed by alex smola + \newcommand{\FUNCTION}[2][default]{\ALC@it\algorithmicfunction\ ##2\ % + \ALC@com{##1}\begin{ALC@func}} + \newcommand{\MAIN}[2][default]{\ALC@it\algorithmicmain\ ##2\ % + \ALC@com{##1}\begin{ALC@main}} +%end changed by alex smola + \newcommand{\REPEAT}[1][default]{\ALC@it\algorithmicrepeat% + \ALC@com{##1}\begin{ALC@rpt}} + \newcommand{\UNTIL}[1]{\end{ALC@rpt}\ALC@it\algorithmicuntil\ ##1} + \ifthenelse{\boolean{ALC@noend}}{ 
+ \newcommand{\ENDIF}{\end{ALC@if}} + \newcommand{\ENDFOR}{\end{ALC@for}} + \newcommand{\ENDWHILE}{\end{ALC@whl}} + \newcommand{\ENDLOOP}{\end{ALC@loop}} + \newcommand{\ENDFUNCTION}{\end{ALC@func}} + \newcommand{\ENDMAIN}{\end{ALC@main}} + }{ + \newcommand{\ENDIF}{\end{ALC@if}\ALC@it\algorithmicendif} + \newcommand{\ENDFOR}{\end{ALC@for}\ALC@it\algorithmicendfor} + \newcommand{\ENDWHILE}{\end{ALC@whl}\ALC@it\algorithmicendwhile} + \newcommand{\ENDLOOP}{\end{ALC@loop}\ALC@it\algorithmicendloop} + \newcommand{\ENDFUNCTION}{\end{ALC@func}\ALC@it\algorithmicendfunction} + \newcommand{\ENDMAIN}{\end{ALC@main}\ALC@it\algorithmicendmain} + } + \renewcommand{\@toodeep}{} + \begin{list}{\ALC@lno}{\setcounter{ALC@line}{0}\setcounter{ALC@rem}{0}% + \itemsep\z@ \itemindent\z@ \listparindent\z@% + \partopsep\z@ \parskip\z@ \parsep\z@% + \labelsep 0.5em \topsep 0.2em% + \ifthenelse{\equal{#1}{0}} + {\labelwidth 0.5em } + {\labelwidth 1.2em } + \leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep} + \ALC@tlm\labelsep + } + } + {\end{list}} + + + + + + + + + + + + + + diff --git a/2024/05/29/papers/2105.06987/apn.tex b/2024/05/29/papers/2105.06987/apn.tex new file mode 100644 index 00000000..baf073ec --- /dev/null +++ b/2024/05/29/papers/2105.06987/apn.tex @@ -0,0 +1,115 @@ +\section{Emulating Ensembles via Autoregressive Prior Networks} +While ensemble methods give a theoretically meaningful approach to modelling uncertainty, they can be prohibitively computationally expensive for many real applications. This is even more true for large auto-regressive models like Transformers~\cite{attentionisallyouneed,vgg-transformer}. One approach + +Here we consider how to generalize distribution distillation to auto-regressive structured predictions models, yielding Sequence Ensemble Distributions Distillation (SEnD$^2$). Unlike, ensembles of classification models, which can be interpreted as samples from implicit conditional distribution over discrete output distributions, ensembles of auto-regressive models can be interpreted as samples of prefix-trees samples from a distribution over prefix-trees conditioned on the input $\bm{x}f$. Unfortunately, it is not possible to explicitly parameterize such a distribution in practice. However, it was shown in ~\cite{malinin-structured-2020}, ensemble-combination should be done at the token, not sequence level for improved prediction and uncertainty estimation. The reason being that an 'atomic' component of an auto-regressive model is an unstructured prediction of the next token $y_l$ given a context $\bm{y}_{ 0,\ \hat \alpha_0^{(l)} = \sum_{c=1}^K \hat \alpha_c^{(l)} +\end{split} +\label{eqn:DPN1} +\end{empheq} + + + +Let's examine how given this model we can obtain measures of sequence-level \emph{total} and \emph{knowledge} uncertainty. \emph{Total Uncertainty} will be given by the sequence-level entropy. +\begin{empheq}{align} +\begin{split} +\mathcal{\hat H}\big[{\tt P}(\bm{y}| \bm{x}; \bm{\hat \phi})\big] =&\ \frac{1}{L} \mathbb{E}_{{\tt P}(\bm{y}| \bm{x}; \bm{\hat \phi})}\big[{\tt P}(\bm{y}| \bm{x}; \bm{\hat \phi})\big] =\ \frac{1}{L}\sum_{l=1}^L \mathbb{E}_{{\tt P}(\bm{y}_{0\\ +\mathcal{C}(\bm{\alpha}) =&\ \frac{\Gamma(\alpha_0)}{\prod_{c=1}^K\Gamma(\alpha_c)},\quad \alpha_0 = \sum_{c=1}^K \alpha_c +\end{split} +\end{empheq} +where $\Gamma(\cdot)$ is the \emph{Gamma function}. 
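+For reference, $\mathcal{C}(\bm{\alpha})$ above is the normalizing constant of the Dirichlet density over the probability simplex $\mathcal{S}_K$, which in the notation used here reads
+\begin{empheq}{align}
+\begin{split}
+{\tt Dir}(\bm{\pi}|\bm{\alpha}) =&\ \mathcal{C}(\bm{\alpha})\prod_{c=1}^K \pi_c^{\alpha_c - 1},\quad \alpha_c > 0;
+\end{split}
+\end{empheq}
+the expectations in the following subsections are taken with respect to this density.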
+ +\subsection{Differential Entropy} +The differential entropy of the Dirichlet distribution can be derived as follows: +\begin{empheq}{align} +\begin{split} + \mathcal{H}[{\tt p}(\bm{\pi}|\bm{x}^{*};\bm{\hat \theta})]=&\ -\mathbb{E}_{{\tt p}(\bm{\pi}|\bm{x};\bm{\hat \theta})}[\ln({\tt p}(\bm{\pi}|\bm{x};\bm{\hat \theta}))] \\ +=&\ \sum_{c=1}^K\ln\Gamma(\hat \alpha_c)-\ln\Gamma(\hat \alpha_0) - \sum_{c=1}^K(\hat \alpha_c-1)\mathbb{E}_{{\tt p}(\bm{\pi}|\bm{x};\bm{\hat \theta})}[\ln\pi_c] \\ +=&\ \sum_{c=1}^K\ln\Gamma(\hat \alpha_c)-\ln\Gamma(\hat \alpha_0) - \sum_{c=1}^K(\hat \alpha_c-1)\cdot\big(\psi(\hat \alpha_c)-\psi(\hat \alpha_0)\big) +\end{split} +\end{empheq} +where $\psi$ is the \emph{digamma function} and $\mathbb{E}_{{\tt p}(\bm{\pi}|\bm{\hat \alpha})}[\ln(\pi_c)]=\psi(\hat \alpha_c)-\psi(\hat \alpha_0)$ is a standard result. + +\subsection{Mutual Information} +The mutual information between the labels y and the categorical $\bm{\pi}$ for a Dirichlet distribution can be calculated as follows, using the fact that mutual information is the difference of the entropy of the expected distribution and the expected entropy of the distribution. +\begin{empheq}{align} +\begin{split} +\underbrace{\mathcal{I}[y,\bm{\pi} |\bm{x}^{*},\bm{\hat \theta}]}_{Knowledge\ Uncertainty} = &\ \underbrace{\mathcal{H}[ \mathbb{E}_{{\tt p}(\bm{\pi}|\bm{x}^{*}, \bm{\hat \theta})}[{\tt P}(y|\bm{\pi}]]}_{Total\ Uncertainty} - \underbrace{\mathbb{E}_{{\tt p}(\bm{\pi}|\bm{x}^{*}, \bm{\hat \theta})}[\mathcal{H}[{\tt P}(y|\bm{\pi})]]}_{Expected\ Data\ Uncertainty} \\ += &\ \mathcal{H}[{\tt P}(y|\bm{x}^{*},\bm{\hat \theta})] + \sum_{c=1}^K \mathbb{E}_{{\tt p}(\bm{\pi}|\bm{x}^{*}, \bm{\hat \theta})}[\pi_c\ln\pi_c] \\ +=&\ -\sum_{c=1}^K\frac{\hat \alpha_c}{\hat \alpha_0}\Big(\ln\frac{\hat \alpha_c}{\hat \alpha_0} - \psi(\hat \alpha_c+1) +\psi(\hat \alpha_0+1) \Big) +\end{split} +\end{empheq} +The second term in this derivation is a non-standard result. The expected entropy of the distribution can be calculated in the following way: +\begin{empheq}{align} +\begin{split} +\mathbb{E}_{{\tt p}(\bm{\pi}|\bm{x}^{*}, \bm{\hat \theta})}[\pi_c\ln\pi_c] = &\ \frac{\Gamma(\hat \alpha_0)}{\prod_{c=1}^K\Gamma(\hat \alpha_c)}\int_{\mathcal{S}_K}\pi_c\ln\pi_c\prod_{c=1}^K \pi_c^{\hat \alpha_c -1}d\bm{\pi} \\ += &\ \frac{\hat \alpha_c}{\hat \alpha_0}\frac{\Gamma(\hat \alpha_0+1)}{\Gamma(\hat \alpha_c+1)\prod_{c'=1, \neq c}^K\Gamma(\hat \alpha_{c'})}\int_{\mathcal{S}_K}\pi_c^{\hat \alpha_c}\ln\pi_c\prod_{c'=1,\neq c}^K \pi_{c'}^{\hat \alpha_{c'} -1}d\bm{\pi} \\ += &\ \frac{\hat \alpha_c}{\hat \alpha_0}\big(\psi(\hat \alpha_c+1) - \psi(\hat \alpha_0+1)\big) +\end{split} +\end{empheq} +Here the expectation is calculated by noting that the standard result of the expectation of $\ln\pi_c$ with respect to a Dirichlet distribution can be used if the extra factor $\pi_c$ is accounted for by adding 1 to the associated concentration parameter $\hat \alpha_c$ and multiplying by $\frac{\hat \alpha_c}{\hat \alpha_0}$ in order to have the correct normalizing constant. 
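+The closed-form expression above is straightforward to check numerically. The following sketch (illustrative only: SciPy is assumed and the concentration parameters are chosen arbitrarily) compares the analytic mutual information with a Monte-Carlo estimate of the entropy of the expected categorical minus the expected entropy:
+\begin{verbatim}
+import numpy as np
+from scipy.stats import dirichlet
+from scipy.special import digamma
+
+alphas = np.array([4.0, 2.0, 1.5, 2.5])
+a0 = alphas.sum()
+
+# analytic: -sum_c (a_c/a0) * (ln(a_c/a0) - psi(a_c + 1) + psi(a0 + 1))
+mi_closed = -np.sum((alphas / a0) *
+                    (np.log(alphas / a0) - digamma(alphas + 1) + digamma(a0 + 1)))
+
+# Monte-Carlo estimate of H[E[pi]] - E[H[pi]]
+samples = dirichlet.rvs(alphas, size=200000, random_state=0)
+mean_pi = samples.mean(axis=0)
+total = -np.sum(mean_pi * np.log(mean_pi))
+expected_data = np.mean(-np.sum(samples * np.log(samples), axis=1))
+print(mi_closed, total - expected_data)  # the two values should closely agree
+\end{verbatim}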
+ +\subsection{Expected Pairwise KL-divergence} +Similarly, the Expected Pairwise KL-divergence can also be analytically calculated for the Dirichlet distribution using the following derivation: +\begin{empheq}{align} +\begin{split} +\mathcal{K}[{\tt p}(\bm{\pi}|\bm{x}^{*};\bm{\hat \theta})] = &\ \mathbb{E}_{{\tt p}(\bm{\pi}^{(1)}|\bm{x}^{*};\bm{\hat \theta}),{\tt p}(\bm{\pi}^{(2)}|\bm{x}^{*};\bm{\hat \theta})}\big[{\tt KL}[{\tt P}(y|\bm{\pi}^{(1)})||{\tt P}(y|\bm{\pi}^{(2)})]\big] \\ += &\ - \sum_{c=1}^K\mathbb{E}_{{\tt p}(\bm{\pi}^{(1)}|\bm{x}^{*};\bm{\hat \theta})}[{\tt P}(\omega_c|\bm{\pi}^{(1)})]\mathbb{E}_{{\tt p}(\bm{\pi}^{(2)}|\bm{x}^{*};\bm{\hat \theta})}[\ln {\tt P}(\omega_c|\bm{\pi}^{(2)})] \\ +- &\ \mathbb{E}_{{\tt p}(\bm{\pi}^{(1)}|\bm{x}^{*};\bm{\hat \theta})}\big[\mathcal{H}[{\tt P}(y|\bm{\pi}^{(1)})]\big] \\ += &\ \sum_{c=1}^K\mathbb{E}_{{\tt p}(\bm{\pi}|\bm{x}^{*};\bm{\hat \theta})}[\pi_c\ln\pi_c] - \sum_{c=1}^K\mathbb{E}_{{\tt p}(\bm{\pi}|\bm{x}^{*};\bm{\hat \theta})}[\pi_c]\mathbb{E}_{{\tt p}(\bm{\pi}|\bm{x}^{*};\bm{\hat \theta})}[\ln\pi_c] +\end{split} +\end{empheq} +The last step is valid only if ${\tt p}(\bm{\pi}^{(1)}|\bm{x}^{*};\bm{\hat \theta}) = {\tt p}(\bm{\pi}^{(2)}|\bm{x}^{*};\bm{\hat \theta}) = {\tt p}(\bm{\pi}|\bm{x}^{*};\bm{\hat \theta})$, which represents independent draws of categorical from the Dirichlet. This expression then leads to a particularly elegant solution: +\begin{empheq}{align} +\begin{split} +\mathcal{K}[{\tt p}(\bm{\pi}|\bm{x}^{*};\bm{\hat \theta})] = &\ \sum_{c=1}^K\frac{\hat \alpha_c}{\hat \alpha_0}\big(\psi(\hat \alpha_c+1) -\psi(\hat \alpha_0+1)\big) - \sum_{c=1}^K\frac{\hat \alpha_c}{\hat \alpha_0}\big(\psi(\hat \alpha_c)-\psi(\hat \alpha_0)\big) \\ += &\ \frac{K-1}{\hat \alpha_0} +\end{split} +\end{empheq} +Thus, the expected pairwise KL-divergence is inversely proportional to the concentration of the Dirichlet and is maximized when the concentration $\hat \alpha_0$ tends to 0. \ No newline at end of file diff --git a/2024/05/29/papers/2105.06987/background.tex b/2024/05/29/papers/2105.06987/background.tex new file mode 100644 index 00000000..22ec44a5 --- /dev/null +++ b/2024/05/29/papers/2105.06987/background.tex @@ -0,0 +1,89 @@ +\section{Preliminaries: Ensembles and Distillation} + +We view ensembles within a Bayesian framework where the model parameters $\bm{\theta}$ are random variables over which a prior distribution ${\tt p}(\bm{\theta})$ is placed. The posterior distribution ${\tt p}(\bm{\theta}|\mathcal{D})$ is obtained via Bayes' rule: +\begin{empheq}{align} +\begin{split} + {\tt p}(\bm{\theta}|\mathcal{D}) &= \frac{{\tt p}(\mathcal{D}|\bm{\theta}){\tt p}(\bm{\theta})}{{\tt p}(\mathcal{D})} \propto {\tt p}(\mathcal{D}|\bm{\theta}){\tt p}(\bm{\theta}) +\end{split} +\label{eqn:bayesposterior} +\end{empheq} +Consider an ensemble of models $\{{\tt P}(y|\bm{x}^{*}, \bm{\theta}^{(m)})\}_{m=1}^M $ sampled from the posterior: +\begin{empheq}{align} +\begin{split} +\big\{{\tt P}(y| \bm{x}, \bm{\theta}^{(m)} )\big\}_{m=1}^M \rightarrow& \big\{{\tt P}(y| \bm{\pi}^{(m)} )\big\}_{m=1}^M,\quad \bm{\pi}^{(m)} =\ \bm{f}(\bm{x}; \bm{\theta}^{(m)}),\ \bm{\theta}^{(m)}\sim {\tt p}(\bm{\theta}|\mathcal{D}) +\end{split} +\end{empheq} +where $\bm{\pi}$ are the parameters of a categorical distribution $[ {\tt P}(y=\omega_1),\cdots, {\tt P}(y=\omega_K)]^{\tt T}$. 
The predictive distribution, or \emph{predictive posterior}, for a test input $\bm{x}^{*}$ is obtained by taking the expectation with respect to the model posterior:
+\begin{empheq}{align}
+\begin{split}
+    {\tt P}(y| \bm{x}^{*}, \mathcal{D}) = &\ \mathbb{E}_{{\tt p}(\bm{\theta}|\mathcal{D})}\big[{\tt P}(y|\bm{x}^{*}, \bm{\theta})\big]
+    \approx \ \frac{1}{M}\sum_{m=1}^M{\tt P}(y|\bm{x}^{*}, \bm{\theta}^{(m)})
+\end{split}
+\label{eqn:modunc}
+\end{empheq}
+In practice this expectation is intractable, so we approximate it via Monte-Carlo sampling. Given the ensemble, the entropy of the predictive posterior is a measure of \emph{total uncertainty}. \emph{Knowledge uncertainty} can be assessed via measures of the spread, or `disagreement', of the ensemble such as \emph{mutual information}:
+\begin{empheq}{align}
+\begin{split}
+\underbrace{\mathcal{I}[y,\bm{\theta}| \bm{x}^{*},\mathcal{D}]}_{\text{Knowledge Uncertainty}} = &\ \underbrace{\mathcal{H}\big[\mathbb{E}_{{\tt p}(\bm{\theta}|\mathcal{D})}[{\tt P}(y|\bm{x}^{*}, \bm{\theta})]\big]}_{\text{Total Uncertainty}} - \underbrace{\mathbb{E}_{{\tt p}(\bm{\theta}|\mathcal{D})}\big[\mathcal{H}[{\tt P}(y|\bm{x}^{*},\bm{\theta})]\big]}_{\text{Expected Data Uncertainty}}
+\end{split}
+\label{eqn:mibayes}
+\end{empheq}
+
+While ensembles yield improved predictive performance and theoretically interpretable uncertainty estimates, they are expensive during training, and especially so during inference. Thus, it is common to \emph{distill} an ensemble into a single model. Typically, this is done by minimizing the KL-divergence to the predictive posterior of the ensemble:
+\begin{empheq}{align}
+\begin{split}
+\mathcal{L}^{\text{EnD}}(\bm{\phi},\mathcal{D}_{\tt ens}) =& \mathbb{E}_{{\tt \hat p}(\bm{x})}\Big[{\tt KL}\big[{\tt P}(y| \bm{x}, \mathcal{D})\ ||\ {\tt P}(y| \bm{x};\bm{\phi})\big] \Big]
+\end{split}
+\end{empheq}
+This approach has been thoroughly investigated for a range of tasks, such as image classification and machine translation.
+While distillation allows a single model to capture the predictive quality and estimates of \emph{total uncertainty} of the ensemble at low computational and memory cost, information about the diversity of the ensemble is lost. Consequently, it is no longer possible to obtain estimates of \emph{knowledge uncertainty}, which are particularly useful for anomaly detection~\cite{malinin-thesis, malinin-endd-2019}.
+
+\cite{malinin-endd-2019} recently proposed a class of distillation techniques called \emph{Ensemble Distribution Distillation} (\Endd), where the goal is to capture both the mean and the diversity of an ensemble within a single model.
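+Before turning to how \Endd parameterizes this, the ensemble-side quantities in \eqref{eqn:modunc} and \eqref{eqn:mibayes} can be made concrete with a short numerical sketch (illustrative only: NumPy is assumed and the member predictions for a single input are hand-picked):
+\begin{verbatim}
+import numpy as np
+
+def ensemble_uncertainties(member_probs, eps=1e-12):
+    """member_probs: (M, K) array, row m = softmax output of ensemble member m."""
+    predictive = member_probs.mean(axis=0)                  # predictive posterior
+    total = -np.sum(predictive * np.log(predictive + eps))  # entropy of the mean
+    expected_data = np.mean(                                # mean member entropy
+        -np.sum(member_probs * np.log(member_probs + eps), axis=1))
+    knowledge = total - expected_data                       # mutual information
+    return predictive, total, expected_data, knowledge
+
+agree = np.array([[0.70, 0.20, 0.10],     # members agree -> low knowledge unc.
+                  [0.65, 0.25, 0.10],
+                  [0.70, 0.15, 0.15]])
+disagree = np.array([[0.90, 0.05, 0.05],  # members disagree -> high knowledge unc.
+                     [0.05, 0.90, 0.05],
+                     [0.05, 0.05, 0.90]])
+print(ensemble_uncertainties(agree)[3], ensemble_uncertainties(disagree)[3])
+\end{verbatim}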
The proposed solution to \Endd was to distill an ensemble into a Prior Network model which parameterizes the Dirichlet distribution as follows: +% \begin{empheq}{align} +% \begin{split} +% \big\{{\tt P}(y| \bm{x}^{*}, \bm{\theta}^{(m)} )\big\}_{m=1}^M \rightarrow& \big\{{\tt P}(y| \bm{\pi}^{(m)} )\big\}_{m=1}^M \\ +% \bm{\pi}^{(m)} \sim&\ {\tt p}(\bm{\pi} | \bm{x}^{*}, \mathcal{D}) +% \end{split} +% \end{empheq} +\begin{empheq}{align} +\begin{split} +{\tt p}(\bm{\pi} | \bm{x};\bm{\hat \phi}) =& {\tt Dir}(\bm{\pi} | \bm{\hat \alpha}), \bm{\hat \alpha} = e^{\bm{z}}, \bm{z}= \bm{f}(\bm{x};\bm{\hat \phi}),\ +\hat \alpha_c > 0,\ \hat \alpha_0 = \sum_{c=1}^K \hat \alpha_c +\end{split} +\label{eqn:DPN1} +\end{empheq} + +% In this work we consider how an ensemble, which is a set of samples from an \emph{implicit} distribution over distributions, can be \emph{distribution distilled} into an \emph{explicit} distribution over distributions modelled using a single Prior Network model, ie: $\big\{{\tt P}(y | \bm{x} ; \bm{\theta}^{(m)} )\big\}_{m=1}^M \rightarrow {\tt p}(\bm{\pi} | \bm{x};\bm{\hat \phi})$. + +Distribution distillation is then accomplished as follows. Firstly, a \emph{transfer dataset} $\mathcal{D}_{\tt ens}= \{\bm{x}^{(i)}, \bm{\pi}^{(i,1:M)} \}_{i=1}^N \sim {\tt \hat p}(\bm{x},\bm{\pi})$ is composed of the inputs $\bm{x}_i$ from the original training set $\mathcal{D}=\{\bm{x}^{(i)},y^{(i)}\}_{i=1}^N$ and the categorical distributions $\{\bm{\pi}^{(i,1:M)}\}_{i=1}^N$ derived from the ensemble for each input. Secondly, given this transfer set, the model ${\tt p}(\bm{\pi} | \bm{x};\bm{\phi})$ is trained by minimizing the negative log-likelihood of each categorical distribution $\bm{\pi}^{(im)}$: +\begin{empheq}{align} +\begin{split} +\mathcal{L}^{\text{EnD}^2}(\bm{\phi},\mathcal{D}_{\tt ens}) =&\ -\mathbb{E}_{{\tt \hat p}(\bm{x})}\big[\mathbb{E}_{{\tt \hat p}(\bm{\pi}|\bm{x})}[\ln{\tt p}(\bm{\pi} | \bm{x};\bm{\phi}) ] \big] %\\ +%=&\ - \frac{1}{N}\sum_{i=1}^N\Big[\ln\Gamma(\hat \alpha_{0}^{(i)}) - \sum_{c=1}^K\ln\Gamma(\hat \alpha_{c}^{(i)}) + \frac{1}{M}\sum_{m=1}^M\sum_{c=1}^K(\hat \alpha_{c}^{(i)} -1)\ln\pi_{c}^{(im)}\Big] +\end{split} +\label{eqn:endd-loss1} +\end{empheq} +Given a distribution-distilled Prior Network, the predictive distribution is given by the expected categorical distribution $\bm{\hat \pi}$ under the Dirichlet prior: +\begin{empheq}{align} +\begin{split} +{\tt P}(y = \omega_c| \bm{x}^{*};\bm{\hat \phi}) = &\ \mathbb{E}_{{\tt p}(\bm{\pi} | \bm{x}^{*};\bm{\hat \phi})}[{\tt P}(y = \omega_c | \bm{\pi})]=\ \hat \pi_c\ = \frac{\hat \alpha_c}{\sum_{k=1}^K \hat \alpha_k} =\ \frac{ e^{\hat z_c}}{\sum_{k=1}^K e^{\hat z_k}} +\end{split}\label{eqn:dirposterior} +\end{empheq} +Measures of \emph{total} and \emph{knowledge uncertainty} are obtained by considering the mutual information between the prediction $y$ and the parameters of $\bm{\pi}$ of the categorical: +\begin{empheq}{align} +\begin{split} + \underbrace{\mathcal{I}[y,{\tt \bm{\pi}} |\bm{x}^{*};\bm{\hat \phi}]}_{\text{Knowledge Uncertainty}}=&\ \underbrace{\mathcal{H}\big[\mathbb{E}_{{\tt p}({\tt \bm{\pi}}|\bm{x}^{*};\bm{\hat \phi})}[{\tt P}(y|{\tt \bm{\pi}})]\big]}_{\text{Total Uncertainty}} - \underbrace{\mathbb{E}_{{\tt p}({\tt \bm{\pi}}|\bm{x}^{*};\bm{\hat \phi})}\big[\mathcal{H}[{\tt P}(y|{\tt \bm{\pi}})]\big]}_{\text{Expected Data Uncertainty}} +\end{split} + \label{eqn:mipn} +\end{empheq} + +It is important to highlight that \Endd can also be accomplished by distilling an ensemble into a mixture model which 
yields a separate softmax for each ensemble member~\cite{hydra,mdd}. The principal downside of this approach is that it requires more parameters, and attempts to model the ensemble in excessive detail, which requires more flexible and powerful models. As a result, for good performance, it is necessary to split the model into multiple heads at an earlier stage, which significantly increases computational and memory complexity. In contrast, \Endd via Prior Networks has a fixed computational and memory cost of one model regardless of the size of the original ensemble. + + + + + +% Specifically, for an in-domain test input $\bm{x}^{*}$, the ensemble should produce a consistent set of predictions with little spread, as described in figure~\ref{fig:dirs-confident} and figure~\ref{fig:dirs-dataunc}. In other words, the models should agree in their estimates of \emph{data uncertainty}. On the other hand, for inputs which are different from the training data, the models in the ensemble should `disagree' and produce a diverse set of predictions, as shown in figure~\ref{fig:dirs-knowunc}. Ideally, the models should yield increasingly diverse predictions as input $\bm{x}^{*}$ moves further away from the training data. If an input is completely unlike the training data, then the level of disagreement should be significant. Hence, the measures of \emph{model uncertainty} will capture \emph{knowledge uncertainty} given an appropriate choice of prior. + +%G +%This formulation of mutual information allows the \emph{total uncertainty} to be decomposed into \emph{knowledge uncertainty} and \emph{expected data uncertainty}~\citep{mutual-information,mutual-information2}. The entropy of the predictive posterior, or \emph{total uncertainty}, will be high whenever the model is uncertain - both in regions of severe class overlap and out-of-domain. However, the difference of the entropy of the predictive posterior and the expected entropy of the individual models will be non-zero only if the models disagree. For example, in regions of class overlap, \emph{each} member of the ensemble will yield a high entropy distribution (figure~\ref{fig:dirs}b) - the entropy of the predictive posterior and the expected entropy will be similar and mutual information will be low. In this situation \emph{total uncertainty} is dominated by \emph{data uncertainty}. On the other hand, for out-of-domain inputs the ensemble yields diverse distributions over classes such that the predictive posterior is near uniform (figure~\ref{fig:dirs-knowunc}), while the expected entropy of each model may be much lower. In this region of input space the models' understanding of data is low and, therefore, \emph{knowledge uncertainty} is high. \ No newline at end of file diff --git a/2024/05/29/papers/2105.06987/bibliography.bib new file mode 100644 index 00000000..21cd7858 --- /dev/null +++ b/2024/05/29/papers/2105.06987/bibliography.bib @@ -0,0 +1,3288 @@ +@COMMENT{Abbreviations: long versions} + +@STRING{P_EUROSPEECH = {Proc. Eurospeech}} +@STRING{P_INTERSPEECH = {Proc. INTERSPEECH}} +@STRING{P_EMNLP = {Proc. EMNLP}} +@STRING{P_SLATE = {Proc. ISCA Workshop on Speech and Language Technology for Education (SLaTE)}} +@STRING{P_ICSLP = {Proc. International Conference on Spoken Language Processing (ICSLP)}} +@STRING{P_ASRU = {Proc. Automatic Speech Recognition and Understanding Workshop (ASRU)}} +@STRING{P_ICASSP = {Proc.
International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}} + +@STRING{PIEEE = {Proceedings of the \textsc{ieee}}} +@STRING{TASSP = {\textsc{ieee} Transactions on Acoustics, Speech and Signal Processing}} +@STRING{TSAP = {\textsc{ieee} Transactions on Speech and Audio Processing}} +@STRING{TASLP = {\textsc{ieee} Transactions on Audio, Speech, and Language Processing}} +@STRING{SPL = {\textsc{ieee} Signal Processing Letters}} +@STRING{STSP = {\textsc{ieee} Transactions on Signal Processing}} +@STRING{TNN = {\textsc{ieee} Transactions on Neural Networks}} + +@STRING{CSL = {Computer Speech and Language}} +@STRING{SPEECHCOM = {Speech Communication}} +@STRING{ACL = {Proc. of the 34th Annual Meeting of the ACL}} +@STRING{VLSISP = {Journal of \textsc{vlsi} Signal Processing}} +@STRING{NC = {Journal of Neural Computation}} + +@STRING{NIPS = {Proc. Conference on Neural Information Processing Systems (NIPS)}} +@STRING{ICML = {Proc. International Conference on Machine Learning (ICML)}} +@STRING{ICLR = {Proc. International Conference on Learning Representations (ICLR)}} +@STRING{CNLL = {Proc. International Conference on Computational Natural Language Learning}} +@STRING{JMLR = {Journal of Machine Learning Research}} +@STRING{MITPress = {\textsc{mit} Press}} +@STRING{NAACL-HLT = {Proc. NAACL-HLT}} + +@COMMENT{End abbreviations} + +@misc{minka2000estimating, + title={Estimating a Dirichlet distribution}, + author={Minka, Thomas}, + year={2000}, + publisher={Technical report, MIT} +} + +@article{malinin2020regression, + title={Regression Prior Networks}, + author={Malinin, Andrey and Chervontsev, Sergey and Provilkov, Ivan and Gales, Mark}, + journal={arXiv preprint arXiv:2006.11590}, + year={2020} +} + +@article{mdd, + title={Ensemble Approaches for Uncertainty in Spoken Language Assessment}, + author={Wu, Xixin and Knill, Kate M and Gales, Mark JF and Malinin, Andrey}, + journal={Proc. Interspeech 2020}, + pages={3860--3864}, + year={2020} +} + +@misc{hydra, +title={Hydra: Preserving Ensemble Diversity for Model Distillation}, +author={Linh Tran and Bastiaan S. Veeling and Kevin Roth and Jakub {\'S}wi{\k{a}}tkowski and Joshua V.
Dillon and Jasper Snoek and Stephan Mandt and Tim Salimans and Sebastian Nowozin and Rodolphe Jenatton}, +year={2020}, +url={https://openreview.net/forum?id=ByeaXeBFvH} +} + +@article{der, + title={Deep Evidential Regression}, + author={Amini, Alexander and Schwarting, Wilko and Soleimany, Ava and Rus, Daniela}, + journal={Advances in Neural Information Processing Systems}, + volume={33}, + year={2020} +} + + +@book{koehn, + title={Statistical machine translation}, + author={Koehn, Philipp}, + year={2009}, + publisher={Cambridge University Press} +} + +@inproceedings{kim2016sequence, + title={Sequence-Level Knowledge Distillation}, + author={Kim, Yoon and Rush, Alexander M}, + booktitle={Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing}, + pages={1317--1327}, + year={2016} +} + +@article{malinin-structured-2020, + title={Uncertainty in Structured Prediction}, + author={Malinin, Andrey and Gales, Mark}, + journal={arXiv preprint arXiv:2002.07650}, + year={2020} +} + +@article{mt-uncertainty, + title={Analyzing uncertainty in neural machine translation}, + author={Ott, Myle and Auli, Michael and Grangier, David and Ranzato, Marc'Aurelio}, + journal={arXiv preprint arXiv:1803.00047}, + year={2018} +} + +@inproceedings{chen2017confidence, + title={Confidence measures for ctc-based phone synchronous decoding}, + author={Chen, Zhehuai and Zhuang, Yimeng and Yu, Kai}, + booktitle={2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, + pages={4850--4854}, + year={2017}, + organization={IEEE} +} + +@article{del2018speaker, + title={Speaker-adapted confidence measures for ASR using deep bidirectional recurrent neural networks}, + author={Del-Agua, Miguel Angel and Gimenez, Adria and Sanchis, Albert and Civera, Jorge and Juan, Alfons}, + journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing}, + volume={26}, + number={7}, + pages={1198--1206}, + year={2018}, + publisher={IEEE} +} + +@inproceedings{asr-confidence-decoding, + title={Uncertainty decoding for noise robust speech recognition}, + author={Liao, Hank and Gales, Mark JF}, + booktitle={Proceedings of Interspeech}, + volume={37}, + year={2007}, + organization={Citeseer} +} + +@inproceedings{asr-confidence-lattice-lstm, + title={Bi-directional lattice recurrent neural networks for confidence estimation}, + author={Li, Qiujia and Ness, PM and Ragni, Anton and Gales, Mark JF}, + booktitle={ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, + pages={6755--6759}, + year={2019}, + organization={IEEE} +} + +@inproceedings{asr-confidence-bilstm, + title={Confidence estimation and deletion prediction using bidirectional recurrent neural networks}, + author={Ragni, Anton and Li, Qiujia and Gales, Mark JF and Wang, Yongqiang}, + booktitle={2018 IEEE Spoken Language Technology Workshop (SLT)}, + pages={204--211}, + year={2018}, + organization={IEEE} +} + +@inproceedings{sacrebleu, + title = "A Call for Clarity in Reporting {BLEU} Scores", + author = "Post, Matt", + booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers", + month = oct, + year = "2018", + address = "Belgium, Brussels", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/W18-6319", + pages = "186--191", +} + +@article{sennrich2015neural, + title={Neural machine translation of rare words with subword units}, + author={Sennrich, Rico and Haddow, Barry and Birch, 
Alexandra}, + journal={arXiv preprint arXiv:1508.07909}, + year={2015} +} + +@article{ardila2019common, + title={Common Voice: A Massively-Multilingual Speech Corpus}, + author={Ardila, Rosana and Branson, Megan and Davis, Kelly and Henretty, Michael and Kohler, Michael and Meyer, Josh and Morais, Reuben and Saunders, Lindsay and Tyers, Francis M and Weber, Gregor}, + journal={arXiv preprint arXiv:1912.06670}, + year={2019} +} + +@article{ott2018scaling, + title={Scaling neural machine translation}, + author={Ott, Myle and Edunov, Sergey and Grangier, David and Auli, Michael}, + journal={arXiv preprint arXiv:1806.00187}, + year={2018} +} + +@article{losslandscape, + title={Deep Ensembles: A Loss Landscape Perspective}, + author={Fort, Stanislav and Hu, Huiyi and Lakshminarayanan, Balaji}, + journal={arXiv preprint arXiv:1912.02757}, + year={2019} +} + +@article{ami-dataset, + title={The AMI meeting corpus}, + author={Kraaij, Wessel and Hain, Thomas and Lincoln, Mike and Post, Wilfried}, + year={2005} +} + +@inproceedings{librispeech, + title={Librispeech: an asr corpus based on public domain audio books}, + author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev}, + booktitle={2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, + pages={5206--5210}, + year={2015}, + organization={IEEE} +} + +@article{vgg-transformer, + title={Transformers with convolutional context for ASR}, + author={Mohamed, Abdelrahman and Okhonko, Dmytro and Zettlemoyer, Luke}, + journal={arXiv preprint arXiv:1904.11660}, + year={2019} +} + +@inproceedings{fairseq, + title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling}, + author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli}, + booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations}, + year = {2019}, +} + +@article{trust-uncertainty, + title={Can You Trust Your Model's Uncertainty? Evaluating Predictive Uncertainty Under Dataset Shift}, + author={Ovadia, Yaniv and Fertig, Emily and Ren, Jie and Nado, Zachary and Sculley, D and Nowozin, Sebastian and Dillon, Joshua V and Lakshminarayanan, Balaji and Snoek, Jasper}, + journal={Advances in Neural Information Processing Systems}, + year={2019} +} + +@inproceedings{ashukha2020pitfalls, +title={Pitfalls of In-Domain Uncertainty Estimation and Ensembling in Deep Learning}, +author={Arsenii Ashukha and Alexander Lyzhov and Dmitry Molchanov and Dmitry Vetrov}, +booktitle={International Conference on Learning Representations}, +year={2020}, +url={https://openreview.net/forum?id=BJxI5gHKDr} +} + +@phdthesis{malinin-thesis, + title={Uncertainty Estimation in Deep Learning with application to Spoken Language Assessment}, + author={Malinin, Andrey}, + year={2019}, + school={University of Cambridge} +} + +@article{nmt-calibration, + title={Calibration of Encoder Decoder Models for Neural Machine Translation}, + author={Kumar, Aviral and Sarawagi, Sunita}, + journal={arXiv preprint arXiv:1903.00802}, + year={2019} +} + +@inproceedings{malinin-rkl-2019, + title={Reverse KL-Divergence Training of Prior Networks: Improved Uncertainty and Adversarial Robustness}, + author={Malinin, Andrey and Gales, Mark JF}, + journal={Advances in Neural Information Processing Systems}, + year={2019} +} + +@article{yarin-mt-uncertainty, + title={Wat heb je gezegd? 
Detecting Out-of-Distribution Translations with Variational Transformers}, + author={Xiao, Tim Z and Gomez, Aidan N and Gal, Yarin}, + year={2019}, + url={http://bayesiandeeplearning.org/2019/papers/90.pdf} +} + +@article{back-uncertainty, + title={Improving Back-Translation with Uncertainty-based Confidence Estimation}, + author={Wang, Shuo and Liu, Yang and Wang, Chao and Luan, Huanbo and Sun, Maosong}, + journal={arXiv preprint arXiv:1909.00157}, + year={2019} +} + +@inproceedings{nlp-uncertainty, + author={Yijun Xiao and William Yang Wang}, + title={Quantifying Uncertainties in Natural Language Processing Tasks.}, + year={2019}, + cdate={1546300800000}, + pages={7322-7329}, + url={https://doi.org/10.1609/aaai.v33i01.33017322}, + booktitle={AAAI}, + crossref={conf/aaai/2019} +} + +@misc{batchbald, + title={BatchBALD: Efficient and Diverse Batch Acquisition for Deep Bayesian Active Learning}, + author={Andreas Kirsch and Joost van Amersfoort and Yarin Gal}, + year={2019}, + eprint={1906.08158}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} + +@inproceedings{chen2018ead, + title={EAD: elastic-net attacks to deep neural networks via adversarial examples}, + author={Chen, Pin-Yu and Sharma, Yash and Zhang, Huan and Yi, Jinfeng and Hsieh, Cho-Jui}, + booktitle={Thirty-second AAAI conference on artificial intelligence}, + year={2018} +} + +@inproceedings{metzen-detecting-2017, + title={On Detecting Adversarial Perturbations}, + author={Metzen, Jan Hendrik and Genewein, Tim and Fischer, Volker and Bischoff, Bastian}, + booktitle = {Proceedings of 5th International Conference on Learning Representations (ICLR)}, + year={2017}, + url = {https://arxiv.org/abs/1702.04267} +} + +@article{gong-detection-2017, + author = {Zhitao Gong and + Wenlu Wang and + Wei{-}Shinn Ku}, + title = {Adversarial and Clean Data Are Not Twins}, + journal = {CoRR}, + volume = {abs/1704.04960}, + year = {2017}, + url = {http://arxiv.org/abs/1704.04960} +} + +@article{grosse-detection-2017, + author = {Kathrin Grosse and + Praveen Manoharan and + Nicolas Papernot and + Michael Backes and + Patrick D. McDaniel}, + title = {On the (Statistical) Detection of Adversarial Examples}, + journal = {CoRR}, + volume = {abs/1702.06280}, + year = {2017}, + url = {http://arxiv.org/abs/1702.06280} +} + + +@article{gu-adversarial-2014, + author = {Shixiang Gu and + Luca Rigazio}, + title = {Towards Deep Neural Network Architectures Robust to Adversarial Examples}, + journal = {CoRR}, + volume = {abs/1412.5068}, + year = {2014}, + url = {http://arxiv.org/abs/1412.5068} +} + +@article{papernot-blackbox, + author = {Nicolas Papernot and + Patrick D. McDaniel and + Ian J. Goodfellow and + Somesh Jha and + Z. Berkay Celik and + Ananthram Swami}, + title = {Practical Black-Box Attacks against Deep Learning Systems using Adversarial + Examples}, + journal = {CoRR}, + volume = {abs/1602.02697}, + year = {2016}, + url = {http://arxiv.org/abs/1602.02697} +} + +@inproceedings{papernot-limitation-2016, + author = {Nicolas Papernot and + Patrick D. McDaniel and + Somesh Jha and + Matt Fredrikson and + Z. Berkay Celik and + Ananthram Swami}, + title = {The Limitations of Deep Learning in Adversarial Settings}, + booktitle = {{IEEE} European Symposium on Security and Privacy, EuroS{\&}P + 2016, Saarbr{\"{u}}cken, Germany, March 21-24, 2016}, + pages = {372--387}, + year = {2016}, + url = {https://doi.org/10.1109/EuroSP.2016.36} +} + +@inproceedings{papernote-distllaition-2016, + author = {Nicolas Papernot and + Patrick D. 
McDaniel and + Xi Wu and + Somesh Jha and + Ananthram Swami}, + title = {Distillation as a Defense to Adversarial Perturbations Against Deep + Neural Networks}, + booktitle = {{IEEE} Symposium on Security and Privacy, {SP} 2016, San Jose, CA, + USA, May 22-26, 2016}, + pages = {582--597}, + year = {2016}, + url = {https://doi.org/10.1109/SP.2016.41} +} + +@article{liu-delving-2016, + author = {Yanpei Liu and + Xinyun Chen and + Chang Liu and + Dawn Song}, + title = {Delving into Transferable Adversarial Examples and Black-box Attacks}, + journal = {CoRR}, + volume = {abs/1611.02770}, + year = {2016}, + url = {http://arxiv.org/abs/1611.02770}, + archivePrefix = {arXiv}, + eprint = {1611.02770}, + timestamp = {Mon, 13 Aug 2018 01:00:00 +0200}, + biburl = {https://dblp.org/rec/bib/journals/corr/LiuCLS16}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + + +@inproceedings{BIM, + title={Adversarial examples in the physical world}, + author={Alexey Kurakin and Ian J. Goodfellow and Samy Bengio}, + journal={CoRR}, + year={2016}, + volume={abs/1607.02533} +} + +@inproceedings{MIM, + title={Boosting Adversarial Attacks with Momentum}, + author={Yinpeng Dong and Fangzhou Liao and Tianyu Pang and Hang Su and Jun Zhu and Xiaolin Hu and Jianguo Li}, + year={2018}, + booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)} +} + +@article{carlini-evaluating, + title={On evaluating adversarial robustness}, + author={Carlini, Nicholas and Athalye, Anish and Papernot, Nicolas and Brendel, Wieland and Rauber, Jonas and Tsipras, Dimitris and Goodfellow, Ian and Madry, Aleksander}, + journal={arXiv preprint arXiv:1902.06705}, + year={2019} +} + +@article{kolchinsky2017estimating, + title={Estimating mixture entropy with pairwise distances}, + author={Kolchinsky, Artemy and Tracey, Brendan}, + journal={Entropy}, + volume={19}, + number={7}, + pages={361}, + year={2017}, + publisher={Multidisciplinary Digital Publishing Institute} +} + +@inproceedings{huber2008entropy, + title={On entropy approximation for Gaussian mixture random vectors}, + author={Huber, Marco F and Bailey, Tim and Durrant-Whyte, Hugh and Hanebeck, Uwe D}, + booktitle={2008 IEEE International Conference on Multisensor Fusion and Integration for Intelligent Systems}, + pages={181--188}, + year={2008}, + organization={IEEE} +} + +@article{densenet, + title={Densenet: Implementing efficient convnet descriptor pyramids}, + author={Iandola, Forrest and Moskewicz, Matt and Karayev, Sergey and Girshick, Ross and Darrell, Trevor and Keutzer, Kurt}, + journal={arXiv preprint arXiv:1404.1869}, + year={2014} +} + +@inproceedings{attentionisallyouneed, + title={Attention is all you need}, + author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, + booktitle={Advances in neural information processing systems}, + pages={5998--6008}, + year={2017} +} + +@inproceedings{resnet, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} + +@misc{one-cycle-fast-ai, + title = {The 1cycle Policy}, + author = {Sylvain Gugger}, + howpublished = {\url{https://sgugger.github.io/the-1cycle-policy.html}}, +} + +@misc{linguaskill, + title = {LinguaSkill}, + author = {Cambridge English Language Assessment}, + 
howpublished = {\url{https://www.cambridgeenglish.org/exams-and-tests/linguaskill/}}, +} + +@misc{vet, + title = {Computer says no: Irish vet fails oral English test needed to stay in Australia}, + author = {The Guardian}, + howpublished = {\url{https://www.theguardian.com/australia-news/2017/aug/08/computer-says-no-irish-vet-fails-oral-english-test-needed-to-stay-in-australia}}, +} + +@article{arellano2013shannon, + title={Shannon Entropy and Mutual Information for Multivariate Skew-Elliptical Distributions}, + author={ARELLANO-VALLE, REINALDO B and CONTRERAS-REYES, JAVIER E and Genton, Marc G}, + journal={Scandinavian Journal of Statistics}, + volume={40}, + number={1}, + pages={42--62}, + year={2013}, + publisher={Wiley Online Library} +} + +@article{gupta2010parametric, + title={Parametric Bayesian estimation of differential entropy and relative entropy}, + author={Gupta, Maya and Srivastava, Santosh}, + journal={Entropy}, + volume={12}, + number={4}, + pages={818--843}, + year={2010}, + publisher={Molecular Diversity Preservation International} +} + +@inproceedings{malinin2017hierarchical, + title={A hierarchical attention based model for off-topic spontaneous spoken response detection}, + author={Malinin, Andrey and Knill, Kate and Gales, Mark JF}, + booktitle={2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)}, + pages={397--403}, + year={2017}, + organization={IEEE} +} + +@InProceedings{confidence-scores, + author = {G. Evermann and P.C. Woodland}, + title = {{Large vocabulary decoding and confidence estimation using word posterior probabilities}}, + booktitle = {Proc. of {IEEE} Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)}, + year = 2000 +} + +@book{information-theory, + title={Elements of information theory}, + author={Cover, Thomas M and Thomas, Joy A}, + year={2006}, + publisher={John Wiley \& Sons} +} + +@InProceedings{Birkenesab2006, + author = {Birkenesab, {\O}ystein and Matsuia, Tomoko and Tanabec, Kunio}, + title = {Isolated-word recognition with penalized logistic regression machines}, + booktitle = {Proc. of {IEEE} Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)}, + year = {2006}, + volume = {hen}, +} + +@InCollection{Carletta2006, + author = {Carletta, J. and Ashby, S. and Bourban, S. and Flynn, M. and Guillemot, M. and Hain, T. and Kadlec, J. and Karaiskos, V. and Kraaij, W. and Kronenthal, M. and others}, + title = {The {AMI} meeting corpus: A pre-announcement}, + booktitle = {Machine Learning for Multimodal Interaction}, + publisher = {Springer}, + year = {2006}, + pages = {28--39}, + owner = {yw}, +} + +@InProceedings{Dalen2015a, + author = {van Dalen, R. C. and Knill, K. M. and Tsiakoulis, P. and Gales, M. J. F.}, + title = {Improving multiple-crowd-sourced transcriptions using a speech recogniser}, + booktitle = {Proc. of {IEEE} Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)}, + year = {2015}, +} + +@ARTICLE{Gales2008, + author = {Gales, M. and Young, S.}, + title = {{The application of hidden Markov models in speech recognition}}, + journal = {Foundations and Trends in Signal Processing}, + year = {2008}, + volume = {1}, + pages = {195--304}, + number = {3}, + publisher = {Now Publishers Inc.} +} + +@INPROCEEDINGS{Gimpel2010, + author = {Gimpel, K. and Smith, N. A.}, + title = {{Softmax-margin CRFs: Training log-linear models with cost functions}}, + booktitle = {Proc. 
of Human Language Technologies: The 2010 Annual Conference of the North + American Chapter of the Association for Computational Linguistics}, + year = {2010}, + pages = {733--736}, + organization = {Association for Computational Linguistics} +} + +@Article{Hinton2012, + author = {Hinton, G. and Deng, L. and Yu, D. and Dahl, G. E. and Mohamed, A. and Jaitly, N. and Senior, A. and Vanhoucke, V. and Nguyen, P. and Sainath, T. N. and Kingsbury, B.}, + title = {{Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups}}, + journal = {IEEE Signal Processing Magazine}, + year = {2012}, + volume = {29}, + number = {6}, + pages = {82--97}, +} + +@INPROCEEDINGS{Layton2006, + author = {Layton, M. I. and Gales, M. J. F.}, + title = {Augmented statistical models for speech recognition}, + booktitle = {Proc. of {IEEE} Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)}, + year = {2006}, + volume = {1}, + pages = {I--I}, + organization = {IEEE} +} + +@Article{Mohamed2012, + author = {Mohamed, A. and Dahl, G. E. and Hinton, G.}, + title = {Acoustic modeling using deep belief networks}, + journal = {IEEE Trans. on Audio, Speech, and Language Processing}, + year = {2012}, + volume = {20}, + number = {1}, + pages = {14--22}, + publisher = {IEEE}, +} + +@ARTICLE{Nguyen2010, + author = {Nguyen, P. and Heigold, G. and Zweig, G.}, + title = {Speech recognition with flat direct models}, + journal = {IEEE Journal of Selected Topics in Signal Processing}, + year = {2010}, + volume = {4}, + pages = {994--1006}, + number = {6}, + publisher = {IEEE} +} + +@InProceedings{park-2011-phoneme, + author = {H. Park and S. Yun}, + year = 2011, + title = {{Phoneme Classification using Constrained Variational Gaussian Process Dynamical System}}, + booktitle = {Proc. of Conference on Neural Information Processing Systems (NIPS)} +} + + +@Article{Park2011, + author = {J. Park and F. Diehl and M.J.F. Gales and M. Tomalin and P.C. Woodland}, + title = {The efficient incorporation of {MLP} features into automatic speech recognition systems}, + journal = {Computer Speech and Language}, + year = {2011}, + volume = {25}, + number = {3}, + pages = {519 - 534}, + doi = {http://dx.doi.org/10.1016/j.csl.2010.07.005}, + issn = {0885-2308}, + keywords = {Automatic speech recognition}, + owner = {yw}, + timestamp = {2015.09.24}, + url = {http://www.sciencedirect.com/science/article/pii/S0885230810000628}, +} + +@PHDTHESIS{Povey2003, + author = {Povey, D.}, + title = {Discriminative training for large vocabulary speech recognition}, + school = {Eng. Dept., Cambridge Univ., Cambridge, U.K.}, + year = {2003} +} + +@InProceedings{Taskar2005, + author = {Taskar, B. and Chatalbashev, V. and Koller, D. and Guestrin, C.}, + title = {{Learning structured prediction models: A large margin approach}}, + booktitle = {Proc. of 22nd Intl. Conf. on Machine Learning (ICML)}, + year = {2005}, + pages = {896--903}, + organization = {ACM}, +} + +@InProceedings{Vesely2013, + author = {Vesel{\`y}, K. and Ghoshal, A. and Burget, L. and Povey, D.}, + title = {Sequence discriminative training of deep neural networks}, + booktitle = {Proc. of INTERSPEECH}, + year = {2013}, + pages = {2345--2349}, + month = {Aug}, +} + +@InProceedings{Knill2017, + author = {Knill, K. M. and Gales, M. J. F. and K. Kyriakopoulos and Ragni, A. and Wang Y.}, + title = {Use of Graphemic Lexicon for Spoken Language Assessment}, + booktitle = {Proc. 
of INTERSPEECH}, + year = {2017}, + month = {Aug}, +} + +@INPROCEEDINGS{Wang2015, + author = {Wang, H. and Ragni, A. and Gales, M. J. F. and Knill, K. M. and Woodland, + P. C. and Zhang, C.}, + title = {Joint decoding of tandem and hybrid systems for improved keyword + spotting on low resource languages}, + booktitle = {Proc. of INTERSPEECH}, + year = {2015}, + volume = {15}, + pages = {3660--3664}, + month = {{S}ep}, + owner = {yw} +} + +@Book{Young2009, + author = {Young, S. and Evermann, G. and Gales, M. and Hain, T. and Kershaw, D. and Liu, X. and Moore, G. and Odell, J. and Ollason, D. and Povey, D. and Valtchev, V. and Woodland, P.}, + title = {The {HTK} book (for {HTK} version 3.4.1)}, + PUBLISHER = {University of Cambridge}, + url = {http://htk.eng.cam.ac.uk}, + year = {2009}, +} + +@Book{Young2015_htk, + author = {Young, S. and Evermann, G. and Gales, M. and Hain, T. and Kershaw, D. and Liu, X. and Moore, G. and Odell, J. and Ollason, D. and Povey, D. and Valtchev, V. and Woodland, P.}, + title = {The {HTK} book (for {HTK} version 3.5)}, + PUBLISHER = {University of Cambridge}, + year = {2015}, + url = {http://htk.eng.cam.ac.uk} +} + +@ARTICLE{Zhang2015, + author = {Zhang, C. and Woodland, P. C. }, + title = {A general artificial neural network extension for {HTK}}, + journal = {Proc. of INTERSPEECH}, + year = {2015}, + month = {{S}ep} +} + +@INPROCEEDINGS{Zweig2009, + author = {Zweig, G. and Nguyen, P.}, + title = {A segmental {CRF} approach to large vocabulary continuous speech + recognition}, + booktitle = {{IEEE} Workshop on Automatic Speech Recognition \& Understanding + (ASRU)}, + year = {2009}, + pages = {152--157}, + organization = {IEEE} +} + +@InProceedings{Dalen2015, + author = {van Dalen, R. C. and Knill, K. M. and Gales, M. J. F.}, + title = {{Automatically grading learners' English using a Gaussian process}}, + year = {2015}, + month = {Aug}, + booktitle = {Proc of {ISCA} International Workshop on Speech and Language Technology in + Education (SLaTE)}, + organization = {ISCA}, +} + +@inproceedings{henter-2012-gaussian_process, + author = {G. E. Henter and M. R. Frean and W. B. Kleijn}, + year = 2012, + title = {Gaussian process dynamical models for nonparametric speech representation and synthesis}, + booktitle = {Proc. of {IEEE} Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)}, + +} + +@article{Hu2015, + author = {Hu, W. and Qian, Y. and Soong, F.K. and Wang, Y.}, + title = {{Improved mispronunciation detection with deep neural network trained acoustic models and transfer learning based logistic regression classifiers}}, + journal = {Speech Communication}, + year = {2015}, + volume = {67}, + pages = {154--165}, +} + +@article{Cheng2015, + author = {Cheng, J. and Chen, X. and Metallinou, A.}, + title = {{Deep neural network acoustic models for spoken assessment applications}}, + journal = {Speech Communication}, + year = {2015}, + volume = {73}, + pages = {14--27}, +} + + +@InProceedings{Metallinou2014, + author = {Metallinou, A. and Cheng, J.}, + title = {{Using deep neural networks to improve proficiency assessment for children English language learners}}, + booktitle = {Proc of INTERSPEECH}, + year = {2014}, + pages = {1468--1472}, +} + +@InProceedings{Tao2015, + author = {Tao, J. and Ghaffarzadegan, S. and Chen, L. 
and Zechner, K.}, + title = {{Exploring deep learning architectures for automatically grading non-native spontaneous speech}}, + booktitle = {Proc of INTERSPEECH}, + year = {2015}, +} + +@InProceedings{Cucchiarini1997, + author = {Cucchiarini, C. and Strik, H. and Boves, L.}, + title = {{Automatic evaluation of Dutch pronunciation by using speech recognition technology}}, + booktitle = {Proc. of {IEEE} Workshop on Automatic Speech Recognition \& Understanding + (ASRU)}, + year = {1997}, + pages = {622--629}, +} + +@Book{Rasmussen2006, + title = {Gaussian Processes for Machine Learning}, + publisher = {MIT Press}, + year = {2006}, + author = {Rasmussen, C. E. and Williams, C. K. I.}, +} + +@Article{Franco2000, + author = {Franco, H. and Abrash, V. and Precoda, K. and Bratt, H. and Rao, R. and Butzberger, J. and Rossier, R. and Cesari, F.}, + title = {{The SRI EduSpeakTM system: Recognition and pronunciation scoring for language learning}}, + journal = {Proc. of InSTILL 2000}, + year = {2000}, + pages = {123--128}, +} + +@Article{Zechner2009, + author = {Zechner, K. and Higgins, D. and Xi, X. and Williamson, D. M.}, + title = {{Automatic scoring of non-native spontaneous speech in tests of spoken English}}, + journal = {Speech Communication}, + year = {2009}, + volume = {51}, + number = {10}, + pages = {883--895}, +} + +@Article{Higgins2011, + author = {Higgins, D. and Xi, X. and Zechner, K. and Williamson, D.}, + title = {A three-stage approach to the automated scoring of spontaneous spoken responses}, + journal = {Computer Speech and Language}, + year = {2011}, + volume = {25}, + number = {2}, + pages = {282--306}, +} + +@InProceedings{Zechner2006, + author = {Zechner, K. and Bejar, I. I.}, + title = {Towards automatic scoring of non-native spontaneous speech}, + booktitle = {Proc. of Human Language Technology Conference of the North American chapter of the Association of Computational Linguistics}, + year = {2006}, + pages = {216--223}, + organization = {Association for Computational Linguistics}, +} + +@InProceedings{Doremalen2009, + author = {van Doremalen, J. and Cucchiarini, C. and Strik, H.}, + title = {Automatic detection of vowel pronunciation errors using multiple information sources}, + booktitle = {Proc. of IEEE Workshop on Automatic Speech Recognition \& Understanding (ASRU)}, + year = {2009}, + pages = {580--585}, +} + +@InProceedings{Nicolao2015, + author = {Nicolao, M. and Beeston, A. V. and Hain, T.}, + title = {{Automatic assessment of English learner pronunciation using discriminative classifiers}}, + booktitle = {Proc. of {IEEE} Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)}, + year = {2015}, + pages = {5351--5355}, + organization = {IEEE}, +} + +@Article{Maqsood2016, + author = {Maqsood, M. and Habib, H. A. and Nawaz, T. and Haider, K. Z.}, + title = {{A Complete Mispronunciation Detection System for Arabic Phonemes using SVM}}, + journal = {Intl. Journal of Computer Science and Network Security (IJCSNS)}, + year = {2016}, + volume = {16}, + number = {3}, + pages = {30--34}, + publisher = {International Journal of Computer Science and Network Security}, +} + +@InProceedings{Eskenazi1996, + author = {Eskenazi, M.}, + title = {Detection of foreign speakers' pronunciation errors for second language training-preliminary results}, + booktitle = {Proc. of Fourth International Conference on Spoken Language (ICSLP)}, + year = {1996}, + volume = {3}, + pages = {1465--1468}, + organization = {ISCA}, +} + +@Article{Witt2000, + author = {Witt, S. M. and Young, S. 
J.}, + title = {Phone-level pronunciation scoring and assessment for interactive language learning}, + journal = {Speech Communication}, + year = {2000}, + volume = {30}, + number = {2}, + pages = {95--108}, +} + +@InProceedings{Muller2009, + author = {M{\"u}ller, Pieter and De Wet, Febe and Van Der Walt, Christa and Niesler, Thomas}, + title = {{Automatically assessing the oral proficiency of proficient L2 speakers}}, + booktitle = {{ISCA} International Workshop on Speech and Language Technology in + Education (SLaTE)}, + year = {2009}, + pages = {29--32}, + month = {Sep}, +} + +@InProceedings{Kim1997, + author = {Kim, Y. and Franco, H. and Neumeyer, L.}, + title = {Automatic pronunciation scoring of specific phone segments for language instruction.}, + booktitle = {Proc. of Eurospeech}, + year = {1997}, + month = {Sep}, +} + +@Article{Cincarek2009, + author = {Cincarek, T. and Gruhn, R. and Hacker, C. and N{\"o}th, E. and Nakamura, S.}, + title = {Automatic pronunciation scoring of words and sentences independent from the non-native's first language}, + journal = {Computer Speech and Language}, + year = {2009}, + volume = {23}, + number = {1}, + pages = {65--88}, +} + +@PhdThesis{Witt1999, + author = {Witt, S. M.}, + title = {Use of speech recognition in computer-assisted language learning}, + school = {University of Cambridge}, + year = {1999}, +} + + + +@Misc{BULATS, + title = {{BULATS}. {B}usiness {L}anguage {T}esting {S}ervice}, + howpublished = {\url{http://www.bulats.org/computer-based-tests/online-tests}}, +} + +@Book{CEFR2001, + title = {{C}ommon {E}uropean {F}ramework of {R}eference for {L}anguages: learning, teaching, assessment}, + publisher = {Cambridge University Press}, + author = {Council of Europe}, + year = {2001}, +} + +@InProceedings{van2015, + author = {van Dalen, R. C. and Knill, K. M. and Tsiakoulis, P. and Gales, M. J. F.}, + title = {Improving multiple-crowd-sourced transcriptions using a speech recogniser}, + booktitle = {Proc. of {IEEE} Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)}, + year = {2015}, + pages = {4709--4713}, + organization = {IEEE}, +} + +@InProceedings{Snow2008, + author = {Snow, R. and O'Connor, B. and Jurafsky, D. and Ng, Andrew Y.}, + title = {Cheap and fast---but is it good?: evaluating non-expert annotations for natural language tasks}, + booktitle = {Proc. of Conference on Empirical Methods in Natural Language Processing (ENMLP)}, + year = {2008}, + pages = {254--263}, + organization = {Association for Computational Linguistics}, +} + +@article{chambers-2011-bulats, + AUTHOR = {Lucy Chambers and Kate Ingham}, + TITLE = {The {BULATS} Online Speaking Test}, + JOURNAL = {Research Notes}, + VOLUME = 43, + PAGES = {21-25}, + YEAR = 2011, + howpublished = {\url{http://www.cambridgeenglish.org/images/23161-research-notes-43.pdf}} +} + +@InProceedings{Parent2011, + author = {Parent, G. and Eskenazi, M.}, + title = {{Speaking to the Crowd: Looking at Past Achievements in Using Crowdsourcing for Speech and Predicting Future Challenges}}, + booktitle = {Proc. of INTERSPEECH}, + year = {2011}, + pages = {3037--3040}, +} + +@InProceedings{Graham2016, + author = {Graham, C. and Buttery, P. and Nolan, F.}, + title = {{Vowel Characteristics in the Assessment of L2 English Pronunciation}}, + booktitle = {Proc. of INTERSPEECH}, + year = {2016}, +} + +@InProceedings{Qian2016, + author = {Qian, Y. and Wang, X. and Evanini, K. 
and Suendermann-Oeft, D.}, + title = {{Self-Adaptive DNN for Improving Spoken Language Proficiency Assessment}}, + booktitle = {Proc. of INTERSPEECH}, + year = {2016}, +} + +@InProceedings{Evanini2010, + author = {Evanini, K. and Higgins, D. and Zechner, K.}, + title = {{Using Amazon Mechanical Turk for transcription of non-native speech}}, + booktitle = {Proc. of the {NAACL} {HLT} 2010 Workshop on Creating Speech and Language Data with Amazon's Mechanical Turk}, + year = {2010}, + pages = {53--56}, + organization = {Association for Computational Linguistics}, +} + +@InProceedings{Yannakoudakis2011, + author = {Yannakoudakis, H. and Briscoe, T. and Medlock, B.}, + title = {{A new dataset and method for automatically grading ESOL texts}}, + booktitle = {Proc. of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies}, + year = {2011}, + pages = {180--189}, + organization = {Association for Computational Linguistics}, +} + +@InProceedings{Briscoe2006, + author = {Briscoe, T. and Carroll, J. and Watson, R.}, + title = {The second release of the {RASP} system}, + booktitle = {Proc. of the {COLING}/{ACL} on Interactive Presentation Sessions}, + year = {2006}, + pages = {77--80}, + organization = {Association for Computational Linguistics}, +} + +@InProceedings{Collins2001, + author = {Collins, M. and Duffy, N.}, + title = {Convolution kernels for natural language}, + booktitle = {Advances in Neural Information Processing Systems}, + year = {2001}, + pages = {625--632}, +} + +@Article{Rabiner1986, + author = {Rabiner, L. and Juang, B.}, + title = {{An introduction to hidden Markov models}}, + journal = {IEEE ASSP Magazine}, + year = {1986}, + volume = {3}, + number = {1}, + pages = {4--16}, + publisher = {IEEE}, +} + +@Misc{Brookes1998-2016, + author = {M. Brookes}, + title = {{The Matrix Reference Manual}}, + howpublished = {\url{http://www.ee.imperial.ac.uk/hp/staff/dmb/matrix/intro.html}}, + year = {1998-2016}, + institution = {Imperial College}, + keywords = {DMB=M}, + owner = {dmb}, + type = {Website}, + url = {http://www.ee.imperial.ac.uk/hp/staff/dmb/matrix/intro.html}, +} + +@InProceedings{Evermann2000, + author = {G. Evermann and P.C. Woodland}, + title = {{Large vocabulary decoding and confidence estimation using word posterior probabilities}}, + booktitle = {Proc. of {IEEE} Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)}, + year = {2000}, +} + +@INPROCEEDINGS{SRILM-2002, +author = {A. Stolcke}, +title = {{SRILM - An Extensible Language Modeling Toolkit}}, +booktitle = {Proc. of ICSLP}, +year = {2002} +} + +@InProceedings{Mangu1999, + author = {Mangu, L. and Brill, E. and Stolcke, A.}, + title = {Finding consensus among words: lattice-based word error minimization}, + booktitle = {Proc. of Eurospeech}, + year = {1999}, +} + +@Article{Johnson2001, + author = {Johnson, D. and Sinanovic, S.}, + title = {{Symmetrizing the Kullback-Leibler Distance}}, + journal = {IEEE Trans. on Information Theory}, + year = {2001}, +} + +@Article{Endres2003, + author = {Endres, D. M. and Schindelin, J. E.}, + title = {A new metric for probability distributions}, + journal = {IEEE Trans. on Information theory}, + year = {2003}, +} + +@PhdThesis{Longworth2010, + author = {Longworth, C.}, + title = {Kernel methods for text-independent speaker verification}, + school = {University of Cambridge}, + year = {2010}, +} + +@InProceedings{Chen2010, + author = {Chen, L. and Evanini, K. 
and Sun, X.}, + title = {Assessment of non-native speech using vowel space characteristics}, + booktitle = {Proc. of IEEE Workshop on Spoken Language Technology (SLT)}, + year = {2010}, + pages = {139--144}, +} + +@InProceedings{Minematsu2006, + author = {N. Minematsu and S. Asakawa and K. Hirose}, + title = {{Structural representation of the pronunciation and its use for CALL}}, + booktitle = {Proc. of IEEE Workshop on Spoken Language Technology (SLT)}, + year = {2006}, + pages = {126-129}, + month = {Dec}, + keywords = {computer aided instruction;linguistics;computer-aided language learning;language learner;pronunciation portfolio;structural representation;structural visualization;vowel structure;Acoustic distortion;Data mining;Frequency;Loudspeakers;Microphones;Portfolios;Spectrogram;Speech;Vectors;Visualization}, +} + +@Article{Franco2000a, + author = {Franco, H. and Neumeyer, L. and Digalakis, V. and Ronen, O.}, + title = {Combination of machine scores for automatic grading of pronunciation quality}, + journal = {Speech Communication}, + year = {2000}, + volume = {30}, + number = {2}, + pages = {121--130}, +} + +@InProceedings{Lee2013, + author = {Lee, A. and Glass, J. R.}, + title = {Pronunciation assessment via a comparison-based system}, + booktitle = {Proc. of {ISCA} International Workshop on Speech and Language Technology in + Education (SLaTE)}, + year = {2013}, + pages = {122--126}, +} + +@InProceedings{Koniaris2011, + author = {Koniaris, C. and Engwall, O.}, + title = {{Phoneme Level Non-Native Pronunciation Analysis by an Auditory Model-Based Native Assessment Scheme}}, + booktitle = {Proc. of INTERSPEECH}, + year = {2011}, + pages = {1157--1160}, +} + +@InProceedings{Asakawa2005, + author = {Asakawa, S. and Minematsu, N. and Isei-Jaak, T. and Hirose, K.}, + title = {Structural representation of the non-native pronunciations}, + booktitle = {Proc. of INTERSPEECH}, + year = {2005}, + pages = {165--168}, +} + +@Article{Wei2009, + author = {Wei, S. and Hu, G. and Hu, Y. and Wang, R.}, + title = {A new method for mispronunciation detection using support vector machine based on pronunciation space models}, + journal = {Speech Communication}, + year = {2009}, + volume = {51}, + number = {10}, + pages = {896--905}, +} + +@Article{Strik2009, + author = {Strik, H. and Truong, K. and De Wet, F. and Cucchiarini, C.}, + title = {Comparing different approaches for automatic pronunciation error detection}, + journal = {Speech Communication}, + year = {2009}, + volume = {51}, + number = {10}, + pages = {845--852}, +} + +@InProceedings{Kim2002, + author = {Kim, C. and Sung, W.}, + title = {Implementation of an intonational quality assessment system}, + booktitle = {Proc. of INTERSPEECH}, + year = {2002}, +} + +@Book{Hartung2011, + title = {Statistical meta-analysis with applications}, + publisher = {John Wiley \& Sons}, + year = {2011}, + author = {Hartung, J. and Knapp, G. and Sinha, B. K.}, +} + +@Article{Gales1999, + author = {Gales, M. J. F.}, + title = {{Semi-tied covariance matrices for hidden Markov models}}, + journal = {IEEE Trans. on Speech and Audio Processing}, + year = {1999}, + volume = {7}, + number = {3}, + pages = {272--281}, + publisher = {IEEE}, +} + +@Article{Gales1998, + author = {Gales, M. J. F.}, + title = {{Maximum Likelihood Linear Transformations for HMM-Based Speech Recognition}}, + journal = {Computer Speech and Language}, + year = {1998}, + volume = {12}, + number = {2}, + pages = {75--98}, +} + +@InProceedings{Povey2002, + author = {Povey, D. and Woodland, P. 
C.}, + title = {Minimum phone error and {I}-smoothing for improved discriminative training}, + booktitle = {Proc. of {IEEE} Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)}, + year = {2002}, + volume = {1}, + pages = {I--105}, +} + +@InProceedings{Ramos2003, + author = {Ramos, J.}, + title = {Using {TF}-{IDF} to determine word relevance in document queries}, + booktitle = {Proceedings of the first instructional conference on machine learning}, + year = {2003}, + volume = {242}, + pages = {133--142}, +} + +@InProceedings{Kyriakopoulos2017, + author = {Kyriakopoulos, K. and Gales, M. J. F. and Knill, K. M.}, + title = {Automatic Characterisation of the Pronunciation of Non-native English Speakers using Phone Distance Features}, + year = {2017}, + month = {Aug}, + booktitle = {Proc. of {ISCA} International Workshop on Speech and Language Technology in Education (SLaTE)}, + organization = {ISCA}, +} + +@InProceedings{Malinin2017, + author = {Malinin, A. and Ragni, A. and Knill, K. M. and Gales, M. J. F.}, + title = {Incorporating Uncertainty into Deep Learning for Spoken Language Assessment}, + year = {2017}, + organization = {Association for Computational Linguistics}, +} + +@Article{Zhang2011, + author = {Zhang, W. and Yoshida, T. and Tang, X.}, + title = {A comparative study of {TF}* {IDF}, {LSI} and multi-words for text classification}, + journal = {Expert Systems with Applications}, + year = {2011}, + volume = {38}, + number = {3}, + pages = {2758--2765}, +} + + + + + + + + + + + + + + + + + + + + + + + + + + + +@inproceedings{nallapati2016abstractive, + title={Abstractive Text Summarization using Sequence-to-sequence RNNs and Beyond}, + author={Nallapati, Ramesh and Zhou, Bowen and dos Santos, Cicero and Gulcehre, Caglar and Xiang, Bing}, + booktitle={Proceedings of The 20th SIGNLL Conference on Computational Natural Language Learning}, + pages={280--290}, + year={2016} +} + +@inproceedings{zhou2018neural, + title={Neural Document Summarization by Jointly Learning to Score and Select Sentences}, + author={Zhou, Qingyu and Yang, Nan and Wei, Furu and Huang, Shaohan and Zhou, Ming and Zhao, Tiejun}, + booktitle={Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages={654--663}, + year={2018} +} + +@inproceedings{see2017get, + title={Get To The Point: Summarization with Pointer-Generator Networks}, + author={See, Abigail and Liu, Peter J and Manning, Christopher D}, + booktitle={Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages={1073--1083}, + year={2017} +} + +@inproceedings{lu2016hierarchical, + title={Hierarchical question-image co-attention for visual question answering}, + author={Lu, Jiasen and Yang, Jianwei and Batra, Dhruv and Parikh, Devi}, + booktitle={Advances In Neural Information Processing Systems}, + pages={289--297}, + year={2016} +} + +@inproceedings{hermann2015teaching, + title={Teaching machines to read and comprehend}, + author={Hermann, Karl Moritz and Kocisky, Tomas and Grefenstette, Edward and Espeholt, Lasse and Kay, Will and Suleyman, Mustafa and Blunsom, Phil}, + booktitle={Advances in neural information processing systems}, + pages={1693--1701}, + year={2015} +} + +@article{cho2014learning, + title={Learning phrase representations using RNN encoder-decoder for statistical machine translation}, + author={Cho, Kyunghyun and Van Merri{\"e}nboer, Bart and Gulcehre, Caglar and Bahdanau, Dzmitry and Bougares, Fethi 
and Schwenk, Holger and Bengio, Yoshua}, + journal={arXiv preprint arXiv:1406.1078}, + year={2014} +} + +@article{robbins1951stochastic, + title={A stochastic approximation method}, + author={Robbins, Herbert and Monro, Sutton}, + journal={The annals of mathematical statistics}, + pages={400--407}, + year={1951}, + publisher={JSTOR} +} + +@article{amsgrad, + title={On the convergence of adam and beyond}, + author={Reddi, Sashank J and Kale, Satyen and Kumar, Sanjiv}, + journal={arXiv preprint arXiv:1904.09237}, + year={2019} +} + +@article{smith2017super, + title={Super-convergence: Very fast training of neural networks using large learning rates}, + author={Smith, Leslie N and Topin, Nicholay}, + journal={arXiv preprint arXiv:1708.07120}, + year={2017} +} + +@article{smith2018disciplined, + title={A disciplined approach to neural network hyper-parameters: Part 1--learning rate, batch size, momentum, and weight decay}, + author={Smith, Leslie N}, + journal={arXiv preprint arXiv:1803.09820}, + year={2018} +} + +@article{wong1998generalized, + title={Generalized Dirichlet distribution in Bayesian analysis}, + author={Wong, Tzu-Tsung}, + journal={Applied Mathematics and Computation}, + volume={97}, + number={2-3}, + pages={165--181}, + year={1998}, + publisher={Elsevier} +} + +@article{connor1969concepts, + title={Concepts of independence for proportions with a generalization of the Dirichlet distribution}, + author={Connor, Robert J and Mosimann, James E}, + journal={Journal of the American Statistical Association}, + volume={64}, + number={325}, + pages={194--206}, + year={1969}, + publisher={Taylor \& Francis} +} + +@article{mead1965generalised, + title={A generalised logit-normal distribution}, + author={Mead, R}, + journal={Biometrics}, + volume={21}, + number={3}, + pages={721--732}, + year={1965}, + publisher={JSTOR} +} + +@article{wang2018towards, + title={Towards automatic assessment of spontaneous spoken English}, + author={Wang, Yu and Gales, MJF and Knill, Katherine Mary and Kyriakopoulos, K and Malinin, Andrey and van Dalen, RC and Rashid, M}, + journal={Speech Communication}, + volume={104}, + pages={47--56}, + year={2018}, + publisher={Elsevier} +} + +@inproceedings{marcopaper, + title={Improved Auto-Marking Confidence for Spoken Language Assessment}, + author={Del Vecchio, M and Malinin, A and Gales, MJF}, + booktitle={2018 IEEE Spoken Language Technology Workshop (SLT)}, + pages={957--963}, + year={2018}, + organization={IEEE} +} + +@inproceedings{snn-relevance2, + title={Off-Topic Spoken Response Detection Using Siamese Convolutional Neural Networks.}, + author={Lee, Chong Min and Yoon, Su-Youn and Wang, Xihao and Mulholland, Matthew and Choi, Ikkyu and Evanini, Keelan}, + booktitle={INTERSPEECH}, + pages={1427--1431}, + year={2017} +} + +@inproceedings{snn-relevance1, + title={Off-Topic Spoken Response Detection with Word Embeddings.}, + author={Yoon, Su-Youn and Lee, Chong Min and Choi, Ikkyu and Wang, Xinhao and Mulholland, Matthew and Evanini, Keelan}, + booktitle={INTERSPEECH}, + pages={2754--2758}, + year={2017} +} + +@inproceedings{snn, + title={Signature verification using a" siamese" time delay neural network}, + author={Bromley, Jane and Guyon, Isabelle and LeCun, Yann and S{\"a}ckinger, Eduard and Shah, Roopak}, + booktitle={Advances in neural information processing systems}, + pages={737--744}, + year={1994} +} + +@inproceedings{osband2016risk, + title={Risk versus uncertainty in deep learning: Bayes, bootstrap and the dangers of dropout}, + author={Osband, Ian}, 
+ booktitle={NIPS Workshop on Bayesian Deep Learning}, + year={2016} +} + +@article{blundell2015weight, + title={Weight uncertainty in neural networks}, + author={Blundell, Charles and Cornebise, Julien and Kavukcuoglu, Koray and Wierstra, Daan}, + journal={arXiv preprint arXiv:1505.05424}, + year={2015} +} + +@inproceedings{probabilisticbackpropagation, + title={Probabilistic backpropagation for scalable learning of bayesian neural networks}, + author={Hern{\'a}ndez-Lobato, Jos{\'e} Miguel and Adams, Ryan}, + booktitle={International Conference on Machine Learning}, + pages={1861--1869}, + year={2015} +} + +@incollection{evidential, +title = {Evidential Deep Learning to Quantify Classification Uncertainty}, +author = {Sensoy, Murat and Kaplan, Lance and Kandemir, Melih}, +booktitle = {Advances in Neural Information Processing Systems 31}, +editor = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett}, +pages = {3179--3189}, +year = {2018}, +publisher = {Curran Associates, Inc.}, +url = {http://papers.nips.cc/paper/7580-evidential-deep-learning-to-quantify-classification-uncertainty.pdf} +} + + +@inproceedings{malinin-pn-2018, + title={Predictive uncertainty estimation via prior networks}, + author={Malinin, Andrey and Gales, Mark}, + booktitle={Advances in Neural Information Processing Systems}, + pages={7047--7058}, + year={2018} +} + +@inproceedings{lstm-rnn, + title={LSTM neural networks for language modeling}, + author={Sundermeyer, Martin and Schl{\"u}ter, Ralf and Ney, Hermann}, + booktitle={Thirteenth annual conference of the international speech communication association}, + year={2012} +} + +@article{adagrad, + title={Adaptive subgradient methods for online learning and stochastic optimization}, + author={Duchi, John and Hazan, Elad and Singer, Yoram}, + journal={Journal of Machine Learning Research}, + volume={12}, + number={Jul}, + pages={2121--2159}, + year={2011} +} + +@article{adadelta, + title={ADADELTA: an adaptive learning rate method}, + author={Zeiler, Matthew D}, + journal={arXiv preprint arXiv:1212.5701}, + year={2012} +} + +@article{adamW, + title={Fixing weight decay regularization in adam}, + author={Loshchilov, Ilya and Hutter, Frank}, + journal={arXiv preprint arXiv:1711.05101}, + year={2017} +} + +@inproceedings{van2016conditional, + title={Conditional image generation with pixelcnn decoders}, + author={Van den Oord, Aaron and Kalchbrenner, Nal and Espeholt, Lasse and Vinyals, Oriol and Graves, Alex and others}, + booktitle={Advances in neural information processing systems}, + pages={4790--4798}, + year={2016} +} + +@article{oord2016pixel, + title={Pixel recurrent neural networks}, + author={Oord, Aaron van den and Kalchbrenner, Nal and Kavukcuoglu, Koray}, + journal={arXiv preprint arXiv:1601.06759}, + year={2016} +} + +@article{lecun1995convolutional, + title={Convolutional networks for images, speech, and time series}, + author={LeCun, Yann and Bengio, Yoshua and others}, + journal={The handbook of brain theory and neural networks}, + volume={3361}, + number={10}, + pages={1995}, + year={1995} +} + +@book{bishop, + title={Pattern recognition and machine learning}, + author={Bishop, Christopher M}, + year={2006}, + publisher={springer} +} + +@book{deeplearning, + title={Deep learning}, + author={Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron}, + year={2016}, + publisher={MIT press} +} + +@techreport{active-learning, + title={Active learning literature survey}, + author={Settles, Burr}, + year={2009}, + 
institution={University of Wisconsin-Madison Department of Computer Sciences} +} + +@article{inhibitedsoftmax, + title={Inhibited Softmax for Uncertainty Estimation in Neural Networks}, + author={Mo{\.z}ejko, Marcin and Susik, Mateusz and Karczewski, Rafa{\l}}, + journal={arXiv preprint arXiv:1810.01861}, + year={2018} +} + +@article{mutual-information, + title={Decomposition of uncertainty for active learning and reliable reinforcement learning in stochastic systems}, + author={Depeweg, Stefan and Hern{\'a}ndez-Lobato, Jos{\'e} Miguel and Doshi-Velez, Finale and Udluft, Steffen}, + journal={stat}, + volume={1050}, + pages={11}, + year={2017} +} + +@incollection{bootstrapdqn, +title = {Deep Exploration via Bootstrapped DQN}, +author = {Osband, Ian and Blundell, Charles and Pritzel, Alexander and Van Roy, Benjamin}, +booktitle = {Advances in Neural Information Processing Systems 29}, +editor = {D. D. Lee and M. Sugiyama and U. V. Luxburg and I. Guyon and R. Garnett}, +pages = {4026--4034}, +year = {2016}, +publisher = {Curran Associates, Inc.}, +url = {http://papers.nips.cc/paper/6501-deep-exploration-via-bootstrapped-dqn.pdf} +} + + +@INPROCEEDINGS{ensemble-asr2, +author={Y. {Wang} and J. H. M. {Wong} and M. J. F. {Gales} and K. M. {Knill} and A. {Ragni}}, +booktitle={2018 IEEE Spoken Language Technology Workshop (SLT)}, +title={Sequence Teacher-Student Training of Acoustic Models for Automatic Free Speaking Language Assessment}, +year={2018}, +pages={994-1000}, +doi={10.1109/SLT.2018.8639557}} + +@INPROCEEDINGS{ensemble-asr, +author={J. H. M. {Wong} and M. J. F. {Gales}}, +booktitle={2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)}, +title={Multi-task ensembles with teacher-student training}, +year={2017}, +pages={84-90}, +doi={10.1109/ASRU.2017.8268920}} + +@inproceedings{krizhevsky2012imagenet, + title={Imagenet classification with deep convolutional neural networks}, + author={Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E}, + booktitle={Advances in neural information processing systems}, + pages={1097--1105}, + year={2012} +} + +@article{rosenblatt1958perceptron, + title={The perceptron: a probabilistic model for information storage and organization in the brain.}, + author={Rosenblatt, Frank}, + journal={Psychological review}, + volume={65}, + number={6}, + pages={386}, + year={1958}, + publisher={American Psychological Association} +} + +@article{rumelhart1988learning, + title={Learning representations by back-propagating errors}, + author={Rumelhart, David E and Hinton, Geoffrey E and Williams, Ronald J and others}, + journal={Cognitive modeling}, + volume={5}, + number={3}, + pages={1}, + year={1988} +} + +@article{carlini-robustness, + author = {Nicholas Carlini and + David A. Wagner}, + title = {Towards Evaluating the Robustness of Neural Networks}, + journal = {CoRR}, + year = {2016}, + url = {http://arxiv.org/abs/1608.04644} +} + +@article{carlini-detected, + author = {Nicholas Carlini and + David A. 
Wagner}, + title = {Adversarial Examples Are Not Easily Detected: Bypassing Ten Detection + Methods}, + journal = {CoRR}, + year = {2017}, + url = {http://arxiv.org/abs/1705.07263}, + archivePrefix = {arXiv} +} + +@InProceedings{nguyen-fooled, +author = {Nguyen, Anh and Yosinski, Jason and Clune, Jeff}, +title = {Deep Neural Networks Are Easily Fooled: High Confidence Predictions for Unrecognizable Images}, +booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, +month = {June}, +year = {2015} +} + +@article{tanay-adversarial, + author = {Thomas Tanay and + Lewis D. Griffin}, + title = {A Boundary Tilting Persepective on the Phenomenon of Adversarial Examples}, + journal = {CoRR}, + year = {2016}, + url = {http://arxiv.org/abs/1608.07690} +} + +@ARTICLE{feinman-adversarial, + author = {{Feinman}, R. and {Curtin}, R.~R. and {Shintre}, S. and {Gardner}, A.~B. + }, + title = "{Detecting Adversarial Samples from Artifacts}", + journal = {CoRR}, + year = {2018}, + url = {https://arxiv.org/pdf/1703.00410.pdf} +} + +@article{akhtar-adversarial, + author = {Naveed Akhtar and + Ajmal Mian}, + title = {Threat of Adversarial Attacks on Deep Learning in Computer Vision: + {A} Survey}, + journal = {CoRR}, + year = {2018}, + url = {http://arxiv.org/abs/1801.00553}, +} + +@inproceedings{szegedy-adversarial, +title = {Deep Neural Networks for Object Detection}, +author = {Christian Szegedy and Alexander Toshev and Dumitru Erhan}, +year = {2013}, +URL = {http://papers.nips.cc/paper/5207-deep-neural-networks-for-object-detection}, +booktitle = {Advances in Neural Information Processing Systems} +} + + +@inproceedings{goodfellow-adversarial, +title = {Explaining and Harnessing Adversarial Examples}, +author = {Ian Goodfellow and Jonathon Shlens and Christian Szegedy}, +year = {2015}, +URL = {http://arxiv.org/abs/1412.6572}, +booktitle = {International Conference on Learning Representations} +} + +@inproceedings{gal-adversarial, + author = {{Smith}, L. and {Gal}, Y.}, + title = "{Understanding Measures of Uncertainty for Adversarial Example Detection}", + booktitle = {UAI}, + year = 2018, +} + +@inproceedings{malinin-endd-2019, +title={Ensemble Distribution Distillation}, +author={Malinin, Andrey and Mlodozeniec, Bruno and Gales, Mark JF}, +booktitle={International Conference on Learning Representations}, +year={2020}, +url={https://openreview.net/forum?id=BygSP6Vtvr}, +} + +@inproceedings{odin, +title={Enhancing The Reliability of Out-of-distribution Image Detection in Neural Networks}, +author={Shiyu Liang and Yixuan Li and R. Srikant}, +booktitle={Proc. International Conference on Learning Representations}, +year={2018}, +url={https://openreview.net/forum?id=H1VGkIxRZ}, +} + +@BOOK{Datasetshift, + AUTHOR = {Quiñonero-Candela, Joaquin}, + TITLE = {{Dataset Shift in Machine Learning}}, + year = {2009}, + publisher = {The MIT Press}, +} + +@BOOK{NealBNN, + AUTHOR = {Radford M. Neal}, + TITLE = {{Bayesian learning for neural networks}}, + year = {1996}, + publisher = {Springer Science \& Business Media}, +} + +@inproceedings{Hinton1993KNNS, + author = {Hinton, Geoffrey E. and van Camp, Drew}, + title = {Keeping the Neural Networks Simple by Minimizing the Description Length of the Weights}, + booktitle = {Proc. 
Sixth Annual Conference on Computational Learning Theory}, + series = {COLT '93}, + year = {1993}, + isbn = {0-89791-611-5}, + location = {Santa Cruz, California, USA}, + pages = {5--13}, + numpages = {9}, + url = {http://doi.acm.org/10.1145/168304.168306}, + doi = {10.1145/168304.168306}, + acmid = {168306}, + publisher = {ACM}, + address = {New York, NY, USA}, +} + +@inproceedings{Caruana2015, + author = {Caruana, Rich and Lou, Yin and Gehrke, Johannes and Koch, Paul and Sturm, Marc and Elhadad, Noemie}, + title = {Intelligible Models for HealthCare: Predicting Pneumonia Risk and Hospital 30-day Readmission}, + booktitle = {Proc. 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, + series = {KDD '15}, + year = {2015}, + isbn = {978-1-4503-3664-2}, + location = {Sydney, NSW, Australia}, + pages = {1721--1730}, + numpages = {10}, + url = {http://doi.acm.org/10.1145/2783258.2788613}, + doi = {10.1145/2783258.2788613}, + acmid = {2788613}, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {additive models, classification, healthcare, intelligibility, interaction detection, logistic regression, risk prediction}, +} + +@phdthesis{mackay1992bayesian, + title={Bayesian methods for adaptive models}, + author={MacKay, David JC}, + year={1992}, + school={California Institute of Technology} +} + +@article{mackay1992practical, + title={A practical Bayesian framework for backpropagation networks}, + author={MacKay, David JC}, + journal={Neural computation}, + volume={4}, + number={3}, + pages={448--472}, + year={1992}, + publisher={MIT Press} +} + +@article{saito2015precision, + title={The precision-recall plot is more informative than the ROC plot when evaluating binary classifiers on imbalanced datasets}, + author={Saito, Takaya and Rehmsmeier, Marc}, + journal={PloS one}, + volume={10}, + number={3}, + pages={e0118432}, + year={2015}, + publisher={Public Library of Science} +} + +@misc{aisafety, + author = {Dario Amodei and + Chris Olah and + Jacob Steinhardt and + Paul F. Christiano and + John Schulman and + Dan Man{\'{e}}}, + title = {Concrete Problems in {AI} Safety}, + note = {arXiv: 1606.06565}, + year = {2016}, + howpublished = {\url{http://arxiv.org/abs/1606.06565}}, + archivePrefix = {arXiv}, + eprint = {1606.06565}, + timestamp = {Mon, 27 Nov 2017 09:22:56 +0100}, + biburl = {http://dblp.org/rec/bib/journals/corr/AmodeiOSCSM16}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + +@article{dnarna, + author = {Alipanahi, Babak and Delong, Andrew and Weirauch, Matthew T. and Frey, Brendan J.}, + day = {27}, + doi = {10.1038/nbt.3300}, + issn = {1087-0156}, + journal = {Nature Biotechnology}, + month = jul, + number = {8}, + pages = {831--838}, + posted-at = {2015-08-07 21:33:40}, + priority = {2}, + publisher = {Nature Research}, + title = {{Predicting the sequence specificities of DNA- and RNA-binding proteins by deep learning}}, + url = {http://dx.doi.org/10.1038/nbt.3300}, + volume = {33}, + year = {2015} +} + +@inproceedings{Girshick2015, + author = {Girshick, Ross}, + title = {{Fast R-CNN}}, + booktitle = {Proc. 
2015 IEEE International Conference on Computer Vision (ICCV)}, + year = {2015}, + pages = {1440--1448}, +} + +@misc{videopredictionarxiv, +title = {{Learning to Generate Long-term Future via Hierarchical Prediction}}, +author = {Ruben Villegas and Jimei Yang and Yuliang Zou and Sungryull Sohn and Xunyu Lin and Honglak Lee}, +year = {2017}, + note = {arXiv:1704.05831}, +howpublished = {\url{https://arxiv.org/abs/1704.05831}}, + archivePrefix = {arXiv}, +} + +@inproceedings{videoprediction, +title = {{Learning to Generate Long-term Future via Hierarchical Prediction}}, +author = {Ruben Villegas and Jimei Yang and Yuliang Zou and Sungryull Sohn and Xunyu Lin and Honglak Lee}, +booktitle = ICML, +year = {2017}, +} + + + +@misc{baselinedetecting, + author = {Dan Hendrycks and + Kevin Gimpel}, + title = {{A Baseline for Detecting Misclassified and Out-of-Distribution Examples in Neural Networks}}, + note = {arXiv:1610.02136}, + year = {2016}, + howpublished = {\url {http://arxiv.org/abs/1610.02136}}, + archivePrefix = {arXiv}, + eprint = {1610.02136}, + timestamp = {Wed, 07 Jun 2017 14:40:14 +0200}, + biburl = {http://dblp.org/rec/bib/journals/corr/HendrycksG16c}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + + +@misc{DeepSpeech, + author = {Awni Y. Hannun and + Carl Case and + Jared Casper and + Bryan Catanzaro and + Greg Diamos and + Erich Elsen and + Ryan Prenger and + Sanjeev Satheesh and + Shubho Sengupta and + Adam Coates and + Andrew Y. Ng}, + title = {Deep Speech: Scaling up end-to-end speech recognition}, + note = {arXiv:1412.5567}, + year = {2014}, + url = {http://arxiv.org/abs/1412.5567}, + archivePrefix = {arXiv}, + eprint = {1412.5567}, + timestamp = {Wed, 07 Jun 2017 14:42:46 +0200}, + biburl = {http://dblp.org/rec/bib/journals/corr/HannunCCCDEPSSCN14}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + + + +@MISC{tinyimagenet, + AUTHOR={Stanford CS231N}, + TITLE = {{Tiny ImageNet}}, + YEAR= {2017}, + howpublished = {\url{https://tiny-imagenet.herokuapp.com/}} +} + + +@inproceedings{pbp, + author = {Charles Blundell and Julien Cornebise and Koray Kavukcuoglu and Daan Wierstra}, + title = {{Weight Uncertainty in Neural Networks}}, + year = {2015}, + booktitle=ICML, +} + +@inproceedings{langevin, + author = {Max Welling and Yee Whye Teh}, + title = {{Bayesian Learning via Stochastic Gradient Langevin Dynamics}}, + year = {2011}, + booktitle=ICML, +} + +@inproceedings{nadam, +author={Timothy Dozat}, +title={{Incorporating Nesterov Momentum into Adam}}, +year = {2016}, +booktitle=ICLR, +} + +@inproceedings{vgg, + author = {Karen Simonyan and + Andrew Zisserman}, + title = {{Very Deep Convolutional Networks for Large-Scale Image Recognition}}, + booktitle = ICLR, + year = 2015 + } + +@misc{vggarxiv, + author = {Karen Simonyan and + Andrew Zisserman}, + title = {{Very Deep Convolutional Networks for Large-Scale Image Recognition}}, + note = {arXiv:1409.1556}, + year = {2014}, + url = {http://arxiv.org/abs/1409.1556}, + archivePrefix = {arXiv}, + eprint = {1409.1556}, + timestamp = {Wed, 07 Jun 2017 14:41:51 +0200}, + biburl = {http://dblp.org/rec/bib/journals/corr/SimonyanZ14a}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + +@misc{svhn, + author = {Ian J. Goodfellow and + Yaroslav Bulatov and + Julian Ibarz and + Sacha Arnoud and + Vinay D. 
Shet}, + title = {Multi-digit Number Recognition from Street View Imagery using Deep + Convolutional Neural Networks}, + year = {2013}, + url = {http://arxiv.org/abs/1312.6082}, + archivePrefix = {arXiv}, + note = {arXiv:1312.6082}, + timestamp = {Wed, 07 Jun 2017 14:40:39 +0200}, + biburl = {http://dblp.org/rec/bib/journals/corr/GoodfellowBIAS13}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + + +@inproceedings{imagenet, + AUTHOR = {Deng, J. and Dong, W. and Socher, R. and Li, L.-J. and Li, K. and Fei-Fei, L.}, + TITLE = {{ImageNet: A Large-Scale Hierarchical Image Database}}, + BOOKTITLE = {CVPR09}, + YEAR = {2009}, + BIBSOURCE = "http://www.image-net.org/papers/imagenet_cvpr09.bib"} + +@misc{lsun, + author = {Fisher Yu and + Yinda Zhang and + Shuran Song and + Ari Seff and + Jianxiong Xiao}, + title = {{LSUN:} Construction of a Large-scale Image Dataset using Deep Learning + with Humans in the Loop}, + note = {arXiv:1506.03365}, + year = {2015}, + url = {http://arxiv.org/abs/1506.03365}, + archivePrefix = {arXiv}, + eprint = {1506.03365}, + timestamp = {Wed, 07 Jun 2017 14:40:17 +0200}, + biburl = {http://dblp.org/rec/bib/journals/corr/YuZSSX15}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + +@article {cifar, + author = {Alex Krizhevsky}, + title = {Learning Multiple Layers of Features from Tiny Images}, + year = {2009}, + URL = {https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf} + } + +@article {omniglot, + author = {Lake, Brenden M. and Salakhutdinov, Ruslan and Tenenbaum, Joshua B.}, + title = {Human-level concept learning through probabilistic program induction}, + volume = {350}, + number = {6266}, + pages = {1332--1338}, + year = {2015}, + doi = {10.1126/science.aab3050}, + publisher = {American Association for the Advancement of Science}, + issn = {0036-8075}, + URL = {http://science.sciencemag.org/content/350/6266/1332}, + eprint = {http://science.sciencemag.org/content/350/6266/1332.full.pdf}, + journal = {Science} + } + +@PhdThesis{galthesis, + title={Uncertainty in Deep Learning}, + author={Gal, Yarin}, + year={2016}, + school={University of Cambridge} +} + +@article{mnist, + author={LeCun, Y. and Bottou, L. and Bengio, Y. and Haffner, P.}, + title = {Gradient-based learning applied to document recognition}, + journal = PIEEE, + volume={86}, + pages={2278–2324}, + year = 1998, + } + + +@misc{mnisturl, +author={Yann LeCun and Corinna Cortes and Christopher J.C. Burges}, +title={{The MNIST database of handwritten digits}}, +note={http://yann.lecun.com/exdb/mnist/}, +URL={https://ci.nii.ac.jp/naid/10027939599/en/}, +} + +@article{schuster1997bidirectional, + title={Bidirectional recurrent neural networks}, + author={Schuster, Mike and Paliwal, Kuldip K}, + journal={IEEE Transactions on Signal Processing}, + volume={45}, + number={11}, + pages={2673--2681}, + year={1997}, + publisher={IEEE} +} + +@article{lee2018training, +title={Training Confidence-calibrated Classifiers for Detecting Out-of-Distribution Samples}, +author={Kimin Lee and Honglak Lee and Kibok Lee and Jinwoo Shin}, +journal={International Conference on Learning Representations}, +year={2018}, +url={https://openreview.net/forum?id=ryiAv2xAZ}, +} + +@INPROCEEDINGS{calibration2017, + title={On Fairness and Calibration}, + author={G. Pleiss and M. Raghavan and J. Kleinberg and K. Q. 
Weinberger}, + booktitle= NIPS, + year={2017} +} + +@INPROCEEDINGS{cv_uncertainty2017, + title={{What Uncertainties Do We Need in Bayesian Deep Learning for Computer Vision}}, + author={A. Kendall and Y. Gal}, + booktitle=NIPS, + year={2017} +} + +@INPROCEEDINGS{mt_uncertainty2017, + title={{Multi-Task Learning Using Uncertainty to Weight Losses for Scene Geometry and Semantics}}, + author={A. Kendall and Y. Gal and R. Cipolla}, + booktitle=NIPS, + year={2017} +} + +@INPROCEEDINGS{deepensemble2017, + title={{Simple and Scalable Predictive Uncertainty Estimation using Deep Ensembles}}, + author={B. Lakshminarayanan and A. Pritzel and C. Blundell}, + booktitle=NIPS, + year={2017} +} + +@misc{graves2014ntm, + author = {Alex Graves and + Greg Wayne and + Ivo Danihelka}, + title = {{Neural Turing Machines}}, + note = {arXiv:1410.5401}, + year = {2014}, + url = {http://arxiv.org/abs/1410.5401}, + timestamp = {Sun, 02 Nov 2014 11:25:59 +0100}, + biburl = {http://dblp.uni-trier.de/rec/bib/journals/corr/GravesWD14}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + +@INPROCEEDINGS{luong2015effective, + title={Effective approaches to attention-based neural machine translation}, + author={Luong, Minh-Thang and Pham, Hieu and Manning, Christopher D}, + booktitle= P_EMNLP, + year={2015} +} + +@INPROCEEDINGS{yu2015bilstm, +author={Z. Yu and others}, +booktitle=P_ASRU, +title={{Using bidirectional LSTM recurrent neural networks to learn high-level abstractions of sequential features for automated scoring of non-native spontaneous speech}}, +year={2015}, +pages={338-345}, +doi={10.1109/ASRU.2015.7404814}, +} + +@INPROCEEDINGS{yu2015bilstm-fullauthor, +author={Z. Yu and V. Ramanarayanan and D. Suendermann-Oeft and X. Wang and K. Zechner and L. Chen and J. Tao and A. Ivanou and Y. 
Qian}, +booktitle= P_ASRU, +title={{Using bidirectional LSTM recurrent neural networks to learn high-level abstractions of sequential features for automated scoring of non-native spontaneous speech}}, +year={2015}, +pages={338-345}, +doi={10.1109/ASRU.2015.7404814}, +month={Dec},} + +@article{maddox2019simple, + title={A simple baseline for bayesian uncertainty in deep learning}, + author={Maddox, Wesley and Garipov, Timur and Izmailov, Pavel and Vetrov, Dmitry and Wilson, Andrew Gordon}, + journal={arXiv preprint arXiv:1902.02476}, + year={2019} +} + + + +@article{diverse-beam-search, + title={Diverse beam search: Decoding diverse solutions from neural sequence models}, + author={Vijayakumar, Ashwin K and Cogswell, Michael and Selvaraju, Ramprasath R and Sun, Qing and Lee, Stefan and Crandall, David and Batra, Dhruv}, + journal={arXiv preprint arXiv:1610.02424}, + year={2016} +} + +@inproceedings{Tseng2016TowardsMC, + title={{Towards Machine Comprehension of Spoken Content: Initial TOEFL Listening Comprehension Test by Machine}}, + author={Bo-Hsiang Tseng and Sheng-Syun Shen and Hung-Yi Lee and Lin-Shan Lee}, + booktitle= P_INTERSPEECH, + year={2016} +} + +@inproceedings{bahdanau2015nmt, + title={Neural machine translation by jointly learning to align and translate}, + author={Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua}, + booktitle=ICLR, + year={2015} +} + +@inproceedings{glorot2010understanding, + title={Understanding the difficulty of training deep feedforward neural networks}, + author={Glorot, Xavier and Bengio, Yoshua}, + booktitle={Aistats}, + volume={9}, + pages={249--256}, + year={2010} +} + +@article{Dropout, + title={Dropout: a simple way to prevent neural networks from overfitting}, + author={Srivastava, Nitish and others}, + journal={Journal of Machine Learning Research}, + volume={15}, + number={1}, + pages={1929--1958}, + year={2014} +} + +@article{Dropout-fullauthor, + title={Dropout: a simple way to prevent neural networks from overfitting}, + author={Srivastava, Nitish and Hinton, Geoffrey E and Krizhevsky, Alex and Sutskever, Ilya and Salakhutdinov, Ruslan}, + journal={Journal of Machine Learning Research}, + volume={15}, + number={1}, + pages={1929--1958}, + year={2014} +} + +@misc{tensorflow, +title={{TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems}}, +url={http://tensorflow.org/}, +note={Software available from tensorflow.org}, +author={ + Mart\'{\i}n~Abadi and + others}, + year={2015}, +} + + + +@misc{tensorflow-fullauthor, +title={{TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems}}, +url={http://tensorflow.org/}, +note={Software available from tensorflow.org}, +author={ + Mart\'{\i}n~Abadi and + Ashish~Agarwal and + Paul~Barham and + Eugene~Brevdo and + Zhifeng~Chen and + Craig~Citro and + Greg~S.~Corrado and + Andy~Davis and + Jeffrey~Dean and + Matthieu~Devin and + Sanjay~Ghemawat and + Ian~Goodfellow and + Andrew~Harp and + Geoffrey~Irving and + Michael~Isard and + Yangqing Jia and + Rafal~Jozefowicz and + Lukasz~Kaiser and + Manjunath~Kudlur and + Josh~Levenberg and + Dan~Man\'{e} and + Rajat~Monga and + Sherry~Moore and + Derek~Murray and + Chris~Olah and + Mike~Schuster and + Jonathon~Shlens and + Benoit~Steiner and + Ilya~Sutskever and + Kunal~Talwar and + Paul~Tucker and + Vincent~Vanhoucke and + Vijay~Vasudevan and + Fernanda~Vi\'{e}gas and + Oriol~Vinyals and + Pete~Warden and + Martin~Wattenberg and + Martin~Wicke and + Yuan~Yu and + Xiaoqiang~Zheng}, + year={2015}, +} + +@article{lsa, + 
title={{Introduction to Latent Semantic Analysis}}, + author={Thomas K Landauer and Peter W. Foltz and Darrell Laham}, + journal={Discourse Processes}, + volume={25}, + pages={259--284}, + year={1998} +} + +@article{madry2017towards, + title={Towards deep learning models resistant to adversarial attacks}, + author={Madry, Aleksander and Makelov, Aleksandar and Schmidt, Ludwig and Tsipras, Dimitris and Vladu, Adrian}, + journal={arXiv preprint arXiv:1706.06083}, + year={2017} +} + +@inproceedings{vae, + title={{Auto-Encoding Variational Bayes}}, + author={Kingma, Diederik P. and Welling, Max}, + BOOKTITLE = ICLR, + year={2014} +} + +@inproceedings{he2015delving, + title={Delving deep into rectifiers: Surpassing human-level performance on imagenet classification}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + pages={1026--1034}, + year={2015} +} + +@incollection{GAN, +title = {{Generative Adversarial Nets}}, +author = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua}, +booktitle = {Advances in Neural Information Processing Systems 27}, +pages = {2672--2680}, +year = {2014}, +publisher = {Curran Associates, Inc.}, +url = {http://papers.nips.cc/paper/5423-generative-adversarial-nets.pdf} +} + +@inproceedings{Gal2016Dropout, +Author = {Yarin Gal and Zoubin Ghahramani}, +Title = {{Dropout as a Bayesian Approximation: Representing Model Uncertainty in Deep Learning}}, +booktitle = {Proc. 33rd International Conference on Machine Learning (ICML-16)}, +year={2016} +} + +@article{scikit-learn, + title={{Scikit-learn: Machine Learning in Python}}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. + and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and + Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} +} + +@ARTICLE{elf, +author = {Barbara Seidlhofer}, + title = {English as a lingua franca}, + journal = { ELT journal}, + VOLUME={59}, +NUMBER={4}, +PAGES={339}, +YEAR = 2005 +} + +@misc{tonyrobinson, +author = {Will Williams and Niranjani Prasad and David Mrva and Tom Ash and Tony Robinson}, + title = {{Scaling Recurrent Neural Network Language Models}}, + note = {arXiv:1502.00512 [cs.CL]}, +howpublished = {\url{http://arxiv.org/pdf/1502.00512v1.pdf}}, +YEAR = 2015 +} + +@INPROCEEDINGS{plagiarism, +AUTHOR = {Keelan Evanini and Xinhao Wang}, +TITLE = {{Automatic detection of plagiarized spoken responses}}, +BOOKTITLE={Proc. Ninth Workshop on Innovative Use of NLP for Building Educational Applications}, +YEAR = {2014} +} + +@INPROCEEDINGS{nonscorable, +AUTHOR = {Su-Youn Yoon and Shasha Xie}, +TITLE = {{Similarity-Based Non-Scorable Response Detection for Automated Speech +Scoring}}, +BOOKTITLE={Proc. Ninth Workshop on Innovative Use of NLP for Building Educational Applications}, +YEAR = {2014} +} + +@INPROCEEDINGS{coherence, +AUTHOR = {Xinhao Wang, Keelan Evanini, Klaus Zechner}, +TITLE = {{Coherence Modeling for the Automated Assessment of Spontaneous Spoken Responses}}, +BOOKTITLE={Proc. 
NAACL-HLT}, +YEAR = {2013} +} + +@INPROCEEDINGS{ukwac, +AUTHOR = {Adriano Ferraresi and others}, +TITLE = {{Introducing and evaluating ukWaC, a very large web-derived corpus of English}}, +BOOKTITLE={Proc. LREC}, +YEAR = {2008} +} + +@ARTICLE{Cummins-ACM2015, +AUTHOR = {Ronan Cummins and Jiaul H. Paik and Yuanhua Lv}, +TITLE = {{A {{\'{o}}lya Urn Document Language Model for Improved Information Retrieval}}}, +JOURNAL={{ACM} Trans. Inf. Syst.}, +VOLUME={33}, +NUMBER={4}, +PAGES={21}, +YEAR = 2015, +} + +@INPROCEEDINGS{qir, +AUTHOR = {Victor Lavrenko and W. Bruce Croft}, +TITLE = {{ Relevance Base Language Models}}, +JOURNAL={Proc. SIGIR}, +YEAR = 2001, +} + +@INPROCEEDINGS{moses, +AUTHOR = {Philip Koehn and others}, +TITLE = {{ Moses: Open Source Toolkit for Statistical Machine Translation}}, +BOOKTITLE= {Proc. ACL}, +NOTE={demonstation session}, +YEAR = 2007, +} + +@ARTICLE{giza, +AUTHOR = {Franz Josef Och and Hermann Ney}, +TITLE = {{A Systematic Comparison of Various Statistical Alignment Models}}, +JOURNAL= {CL}, +NUMBER = {1}, +VOLUME = {29}, +YEAR = {2003}, +PAGES = {19--51} +} + +@INPROCEEDINGS{felice, + author = {Felice, Mariano and Yuan, Zheng and Andersen, {\O}istein E. and Yannakoudakis, Helen and Kochmar, Ekaterina}, + title = {Grammatical error correction using hybrid systems and type filtering}, + booktitle = {Proc. Eighteenth Conference on Computational Natural Language Learning: Shared Task}, + YEAR= 2014, +} + +@INPROCEEDINGS{zheng, + author = {Yuan, Zheng and Felice, Mariano}, + title = {{Constrained Grammatical Error Correction using Statistical Machine Translation}}, + booktitle = {Proc. Seventeenth Conference on Computational Natural Language Learning: Shared Task}, + YEAR= 2013, +} + +@INPROCEEDINGS{theano, + author = {Bastien, Fr{\'{e}}d{\'{e}}ric and Lamblin, Pascal and Pascanu, Razvan and Bergstra, James and Goodfellow, Ian J. and Bergeron, Arnaud and Bouchard, Nicolas and Bengio, Yoshua}, + TITLE = {Theano: new features and speed improvement}, + BOOKTITLE = {Proc. Deep Learning and Unsupervised Feature Learning NIPS 2012 Workshop}, + YEAR= 2012, +} + +@MISC{word2vec, + AUTHOR={Tomas Mikolov}, + TITLE = { word2vec}, + YEAR= 2013, + howpublished = {\url{https://code.google.com/p/word2vec/}} +} + +@MISC{gibbs-lda, + AUTHOR={Xuan-Hieu Phan and Cam-Tu Nguyen}, + TITLE = {{GibbsLDA++: A C/C++ implementation of latent Dirichlet allocation (LDA)}}, + YEAR= 2007, + howpublished= {\url{http://gibbslda.sourceforge.net/}} +} + +@ARTICLE{griffiths, + AUTHOR={Thomas L. Griffiths and Mark Steyvers}, + TITLE = {{Finding Scientific Topics}}, + JOURNAL= {Proceedings of the National Academy of Sciences}, + VOLUME={101}, + PAGES={5228--5235}, + YEAR= 2004, +} + +@article{blei, + author = {Blei, David M. and Ng, Andrew Y. and Jordan, Michael I.}, + TITLE = {{Latent Dirichlet Allocation}}, + journal = JMLR, +volume = {3}, + month = mar, + pages = {993--1022}, + year = {2003}, +} + +@INPROCEEDINGS{hyvarinen-nce, + AUTHOR = {Michael Gutman and Aapo Hyvarinen}, + TITLE = {{ Noise-contrastive estimation: A new estimation principle for unnormalized statistical models}}, + JOURNAL = {Proc. International Conference of Artificial Intelligence and Statistics}, + YEAR = 2010 +} + +@INPROCEEDINGS{chen-2015-improving, + AUTHOR = {Xie Chen and Xunying. Liu and Mark J.F. Gales and Philip .C. 
Woodland}, + TITLE = {{Improving the Training and Evaluation Efficiency of Recurrent Neural Network Language Models}}, + BOOKTITLE = P_ICASSP, + YEAR = 2015 +} + +@INPROCEEDINGS{jeff-nce, + AUTHOR = {Xie Chen and Xunying Liu and Mark J.F. Gales and Philip C. Woodland}, + TITLE = {{Recurrent Neural Network Language Model Training with Noise Contrastive Estimation for Speech Recognition}}, + BOOKTITLE = P_ICASSP, + YEAR = 2015 +} + +@INPROCEEDINGS{chen-2014-efficient, + AUTHOR = {Xie Chen and Yongqiang Wang and Xunying Liu and Mark J.F. Gales and P.C. Woodland}, + TITLE = {{Efficient GPU-based Training of Recurrent Neural Network Language Models Using Spliced Sentence Bunch}}, + BOOKTITLE = P_INTERSPEECH, + YEAR = 2014 +} + +@ARTICLE{deep-learning, +author = {Bengio, Yoshua}, + title = {{Learning Deep Architectures for AI}}, + journal = {Found. Trends Mach. Learn.}, +volume = {2}, + number = {1}, + pages = {1--127}, + YEAR = 2009 +} + +@INPROCEEDINGS{deep-learning2, + AUTHOR = {Delalleau, O. and Bengio, Y.}, + TITLE = {Shallow vs. deep sum-product networks}, + BOOKTITLE = {Proc. NIPS}, + YEAR = 2011 +} + +@INPROCEEDINGS{max-out, + author = {Ian J. Goodfellow and David Warde-Farley and Mehdi Mirza and Aaron Courville and Yoshua Bengio}, + TITLE = {{Maxout Networks}}, + BOOKTITLE = ICML, + YEAR = 2013 +} + +@MISC{yu-report, + AUTHOR = {Yu Wang}, + TITLE = {{ALTA Progress Report}}, + note = {Internal ALTA Project Report, Cambridge University Engineering Department}, + MONTH={Jul}, + YEAR = {2015} +} + +@MISC{BULATS-2015, +TITLE = {{Business Language Testing Service}}, +AUTHOR={BULATS}, +YEAR= 2012, +howpublished = {\url{http://www.bulats.org}} +} + + +@BOOK{Verhelst-2009, + AUTHOR = {Verhelst, N. and others}, + TITLE = {{Common European Framework of Reference for Languages: learning, teaching, assessment}}, + YEAR = {2009}, + PUBLISHER = {Cambridge University Press} +} + + +@INPROCEEDINGS{Evanini-2013, + AUTHOR = {Keelan Evanini and Shasha Xie and Klaus Zechner}, + TITLE = {{Prompt-based Content Scoring for Automated Spoken Language Assessment}}, + BOOKTITLE = {Proc. Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)}, + YEAR = {2013} +} + +@INPROCEEDINGS{Xie-2012, + AUTHOR = {Shasha Xie and Keelan Evanini and Klaus Zechner}, + TITLE = {{Exploring Content Features for Automated Speech Scoring}}, + BOOKTITLE = {Proc. Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)}, + YEAR = 2012, +} + +@TECHREPORT{Briscoe-2010, + AUTHOR = {Briscoe, T. and others}, + TITLE = {{Automated assessment of ESOL free text examinations}}, + NUMBER = {UCAM-CL-TR-790}, + INSTITUTION = {University of Cambridge Computer Laboratory}, + YEAR = {2010}, + howpublished = {\url{http://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-790.pdf}} +} + +@ARTICLE{Barzilay-2008, + AUTHOR = {Barzilay, R. and Lapata, M.}, + TITLE = {{Modeling Local Coherence: An Entity-Based Approach}}, + JOURNAL = {Computational Linguistics}, +volume = {34}, + number = {1}, + pages = {1--34}, + YEAR = 2008, +} + +@BOOK{Halliday-and-Hasan, + AUTHOR = {Halliday, M. A. K. and Hasan, R}, + TITLE = {{Cohesion in English}}, + YEAR = 1976, + PUBLISHER = {Longman Pub Group, UK} +} + +@MISC{Myles-2011, + AUTHOR = {F. 
Myles}, + TITLE = {{Second language acquisition (SLA) research: its significance for learning and teaching issues}}, + howpublished = {\url{https://www.llas.ac.uk//resources/gpg/421}}, + YEAR = 2002, +} + + +@misc{word2vec1, + AUTHOR = {Alex Minnaar}, + TITLE = {{Word2Vec Tutorial Part I: The Skip-Gram Model}}, + YEAR = 2015, + howpublished= {\url{http://alexminnaar.com/word2vec-tutorial-part-i-the-skip-gram-model.html}} +} + +@misc{word2vec2, + AUTHOR = {Alex Minnaar}, + TITLE = {{Word2Vec Tutorial Part II: The Continuous Bag-of-Words Model}}, + YEAR = 2015, + howpublished = {\url{http://alexminnaar.com/word2vec-tutorial-part-ii-the-continuous-bag-of-words-model.html}} +} + +@TECHREPORT{Yannakoudakis-2011, + AUTHOR = {Helen Yannakoudakis}, + TITLE = {{Automated assessment of English-learner writing}}, + NUMBER = {UCAM-CL-TR-842}, + INSTITUTION = {University of Cambridge Computer Laboratory}, + YEAR = 2013, + howpublished = {\url{https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-842.pdf}} +} + +@ARTICLE{vanishing1, + author = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo}, + TITLE = {{Learning Long-Term Dependencies with Gradient Descent is Difficult}}, + journal = {IEEE Transactions on Neural Networks}, + volume = {5}, + number = {2}, + pages = {157--166}, + year = {1994}, +} + +@INPROCEEDINGS{vanishing2, + author = {Pascanu, Razvan and Mikolov, Tomas and Bengio, Yoshua}, + TITLE = {On the difficulty of training recurrent neural networks}, + BOOKTITLE = ICML, + YEAR = 2013 +} + +@ARTICLE{hmm-gmm, + AUTHOR = {Mark J.F. Gales and Steven Young}, + TITLE = {{The Application of Hidden Markov Models in Speech Recognition}}, + JOURNAL = {Foundations and Trends in Signal Processing}, + volume = {1}, + number = {3}, + YEAR = 2008, +} + +@misc{bahdanau2014nmtarxiv, +author = {Dzmitry Bahdanau and + Kyunghyun Cho and + Yoshua Bengio}, + title = {{Neural Machine Translation by Jointly Learning to Align and Translate}}, + note = {arXiv:1409.0473}, + YEAR = 2014, +howpublished = {\url{http://arxiv.org/abs/1409.0473}}, +} + +@misc{las, + author = {William Chan and + Navdeep Jaitly and + Quoc V. Le and + Oriol Vinyals}, + title = {{Listen, Attend and Spell}}, + note = {arXiv:1508.01211}, + year = {2015}, + howpublished= {\url{http://arxiv.org/abs/1508.01211}}, +} + + +@INPROCEEDINGS{rnnlda, + AUTHOR = {Tomas Mikolov and Geoffrey Zweig}, + YEAR = 2012, + TITLE = {{Context Dependent Recurrent Neural Network Language Model}}, + BOOKTITLE ={Proc. IEEE Spoken Language Technology Workshop (SLT)}, +} + +@INPROCEEDINGS{rnnlda2, + author={Xie Chen and Tian Tan and Xunying Liu and Pierre Lanchantin and Moquan Wan and Mark J.F. Gales and Philip C. Woodland}, + YEAR = 2015, + TITLE = {{Recurrent Neural Network Language Model Adaptation for Multi-Genre Broadcast Speech Recognition}}, + BOOKTITLE = P_INTERSPEECH, +} + +@misc{drnn, +author = {Razvan Pascanu and + {\c{C}}aglar G{\"{u}}l{\c{c}}ehre and + Kyunghyun Cho and + Yoshua Bengio}, + TITLE = {{How to Construct Deep Recurrent Neural Networks}}, +note = {arXiv:1312.6026}, + year = {2013}, + howpublished = {\url{http://arxiv.org/abs/1312.6026}} +} + +@misc{Dropout_old, +author = {Geoffrey E. Hinton and + Nitish Srivastava and + Alex Krizhevsky and + Ilya Sutskever and + Ruslan Salakhutdinov}, + title = {Improving neural networks by preventing co-adaptation of feature detectors}, + note = {arXiv:1207.0580}, + year = {2012}, + howpublished = {\url{http://arxiv.org/abs/1207.0580}}, +} +@ARTICLE{lstm1, + AUTHOR = {S. Hochreiter and J. 
Schmidhuber}, + TITLE = {{Long Short-Term Memory}}, +journal = {Neural Comput.}, + volume = {9}, + number = {8}, + year = {1997}, + pages = {1735--1780}, +} + +@INPROCEEDINGS{Chen-Goodman, + AUTHOR = {S. Chen and J. Goodman}, + YEAR = 1996, + TITLE = {{An Empirical Study of Smoothing Techniques for Language Modelling}}, + BOOKTITLE ={Proc. ACL} +} + +@INPROCEEDINGS{Deoras, +author = {Anoop Deoras and Tom{\'{a}}{\v{s}} Mikolov and Stefan Kombrink and Martin Karafi{\'{a}}t and Sanjeev Khudanpur}, + TITLE = {{Variational Approximation of Long-Span Language Models for LVCSR}}, + BOOKTITLE = P_ICASSP, + YEAR = 2011, +} + +@INPROCEEDINGS{Gales-NNLM-adaptation, +author = {Junho Park and + Xunying Liu and + Mark J. F. Gales and + Philip C. Woodland}, + TITLE = {{Improved Neural Network Based Language Modelling and Adaptation}}, + BOOKTITLE = P_INTERSPEECH, + YEAR = 2010, +} + +@misc{hinton2015distilling, + author={Hinton, Geoffrey and Vinyals, Oriol and Dean, Jeff}, + title={Distilling the knowledge in a neural network}, + archivePrefix = "arXiv", + note = {arXiv:1503.02531}, + year={2015} +} + +@ARTICLE{batch_norm, +author= {Sergey Ioffe and Christian Szegedy}, + TITLE = {{Batch Normalization: Accelerating Deep Network Training by Reducing +Internal Covariate Shift}}, + JOURNAL = {Journal of Machine Learning Research}, + YEAR = 2015, +} + +@BOOK{jurafsky, + AUTHOR = {D. Jurafsky and J. Martin.}, + TITLE = {{Speech and Natural Language Processing (2nd Edition)}}, + year = {2009}, + publisher = {Prentice-Hall, Inc.}, +} + +@BOOK{murphy, + AUTHOR = {Kevin P. Murphy}, + TITLE = {{Machine Learning}}, + year = {2012}, + publisher = {The MIT Press}, +} + +@BOOK{graves-rnn, + AUTHOR = {Alex Graves}, + TITLE = {{Supervised Sequence Labelling with Recurrent Neural Networks}}, + YEAR = 2009, + PUBLISHER = {Pre-print}, + howpublished = {\url{http://www.cs.toronto.edu/~graves/preprint.pdf}} +} + +@INPROCEEDINGS{liu-2014-efficient, + AUTHOR = {Xunying Liu and Y. Wang and Xie Chen and Mark J.F. Gales and Philip C. Woodland}, + YEAR = 2014, + TITLE = {{Efficient Lattice Rescoring using Recurrent Neural Network Language Models}}, + BOOKTITLE = P_INTERSPEECH +} + +@INPROCEEDINGS{mikolov-rnn, +author = {Tomas Mikolov and + Martin Karafi{\'{a}}t and + Luk{\'{a}}s Burget and + Jan Cernock{\'{y}} and + Sanjeev Khudanpur}, + YEAR = 2010, + TITLE = {{Recurrent Neural Network Based Language Model}}, + BOOKTITLE = P_INTERSPEECH +} + +@INPROCEEDINGS{mikolov-rnn2, + author = {Tomas Mikolov and + Stefan Kombrink and + Luk{\'{a}}s Burget and + Jan Cernock{\'{y}} and + Sanjeev Khudanpur}, + YEAR = 2011, + TITLE = {{Extensions of Recurrent Neural Network Language Model}}, + BOOKTITLE = P_ICASSP +} + +@PHDTHESIS{mikolov-rnn3, + AUTHOR = {Tomas Mikolov}, + TITLE = {{Statistical Language Models Based on Neural Networks}}, + SCHOOL = {Brno University of Technology}, + YEAR = 2012 +} + +@INPROCEEDINGS{srilm, + AUTHOR = {A. Stolcke}, + TITLE = {{SRILM – an extensible language modelling toolkit}}, + BOOKTITLE = P_ICSLP, + YEAR = 2002 +} + + +@INPROCEEDINGS{chen-rnnlm-toolkit, + author = {X. Chen and X. Liu and Y. Qian and M.J.F. Gales and P.C. Woodland}, + YEAR = 2016, + TITLE = {{CUED-RNNLM -- An Open-Source Toolkit for Efficient Training and Evaluation of Recurrent Neural Network Language Models}}, + BOOKTITLE = P_ICASSP +} + + +@PHDTHESIS{sutskever-phd, + AUTHOR = {I. 
Sutskever}, + TITLE = {{Training Recurrent Neural Networks}}, + SCHOOL = {University of Toronto}, + YEAR = 2013, +} + +@INPROCEEDINGS{Dropconnect, + author = {Li Wan and + Matthew D. Zeiler and + Sixin Zhang and + Yann LeCun and + Rob Fergus}, + title = {{Regularization of Neural Networks using DropConnect}}, + booktitle = ICML, + year = {2013} +} + +@BOOK{lstm2, + AUTHOR = {Alex Graves}, + TITLE = {{Supervised Sequence Labelling with Recurrent Neural Networks}}, + YEAR = 2012, + PUBLISHER = {Studies in Computational Intelligence, Springer} +} + + +@BOOK{smt-Koehn, + AUTHOR = {Philip Koehn}, + TITLE = {{Statistical Machine Translation}}, + YEAR = 2013, + PUBLISHER = {Cambridge University Press} +} + +@ARTICLE{nnlm, +author = {Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Janvin, Christian}, + title = {{A Neural Probabilistic Language Model}}, + journal = JMLR, + volume = {3}, + year = {2003}, + pages = {1137--1155}, +} + +@BOOK{nn-book, + AUTHOR = {C. Bishop}, + TITLE = {Neural Networks for Pattern Recognition}, + YEAR = 1995, + PUBLISHER = {Oxford University Press} +} + +@ARTICLE{brnn, + AUTHOR= {M. Schuster and K. Paliwal}, + TITLE = {{Bidirectional Recurrent Neural Networks}}, +volume = {45}, + number = {11}, + pages = {2673--2681}, + YEAR = 1997, + JOURNAL ={IEEE Transactions on Signal Processing} +} + +@INPROCEEDINGS{embedding1, + AUTHOR = {Tomas Mikolov and others}, + TITLE = {{Linguistic Regularities in Continuous Space Word Representations}}, + BOOKTITLE = {Proc. NAACL-HLT}, + YEAR = 2013, +} + +@article{dnnspeech, +title = {Deep Neural Networks for Acoustic Modeling in Speech Recognition}, +author = {Geoffrey Hinton and Li Deng and Dong Yu and George Dahl and Abdel-rahman Mohamed and Navdeep Jaitly and Andrew Senior and Vincent Vanhoucke and Patrick Nguyen and Tara Sainath and Brian Kingsbury}, +year = {2012}, +journal = {Signal Processing Magazine} +} + +@misc{embedding2, +author = {Tomas Mikolov and + Kai Chen and + Greg Corrado and + Jeffrey Dean}, + title = {{Efficient Estimation of Word Representations in Vector Space}}, + archivePrefix = "arXiv", + note = {arXiv:1301.3781}, + year = {2013}, + howpublished = {\url{http://arxiv.org/abs/1301.3781}}, +} + +@misc{embedding3, +author = {Tomas Mikolov and + Ilya Sutskever and + Kai Chen and + Greg Corrado and + Jeffrey Dean}, + title = {{Distributed Representations of Words and Phrases and their Compositionality}}, + archivePrefix = "arXiv", + note = {arXiv:1310.4546}, + year = {2013}, + howpublished = {\url{http://arxiv.org/abs/1310.4546}}, +} + +@misc{embedding4, +author = {Quoc V. Le and + Tomas Mikolov}, + title = {Distributed Representations of Sentences and Documents}, + archivePrefix = "arXiv", + note = {arXiv:1405.4053}, + year = {2014}, + url = {\url{http://arxiv.org/abs/1405.4053}}, +} + +@INPROCEEDINGS{Dalen-ICASSP2015, + AUTHOR = {Rogier C. van Dalen and Kate M. Knill and Pirros Tsiakoulis and Mark J. F. Gales}, + YEAR = 2015, + TITLE = {{Improving Multiple-Crowd-Sourced Transcriptions Using a Speech Recogniser}}, + BOOKTITLE = P_ICASSP +} + +@INPROCEEDINGS{van_dalen-2015-grading, + AUTHOR = {Rogier C. van Dalen and Kate M. Knill and Mark J. F. Gales}, + YEAR = 2015, + TITLE = {{Automatically Grading Learners' English Using a Gaussian Process}}, + BOOKTITLE = P_SLATE, +} + +@INPROCEEDINGS{adam, + author={Kingma, Diederik P. and Ba, Jimmy}, + YEAR = 2015, + TITLE = {{Adam: A Method for Stochastic Optimization}}, + BOOKTITLE = {Proc. 
3rd International Conference on Learning Representations (ICLR)} +} + +@ARTICLE{gales-1998-transformations, + AUTHOR = {Mark J. F. Gales}, + TITLE = {{Maximum Likelihood Linear Transformations for HMM-based Speech Recognition}}, + JOURNAL = {Computer Speech and Language}, + YEAR = 1998, + VOLUME = 12, + NUMBER = 2, + PAGES = {75-98} +} + +@BOOK{rasmussen-2006, + AUTHOR = {Carl Edward Rasmussen and Christopher K. I. Williams}, + TITLE = {{Gaussian Processes for Machine Learning}}, + YEAR = 2006, + PUBLISHER = MITPress, +} + +@book{Young2006, + AUTHOR = {Steve Young and others}, + TITLE = {The \textsc{htk} book (for \textsc{htk} Version 3.4.1)}, + YEAR = 2009, + PUBLISHER = {University of Cambridge}, + howpublished = {\url{http://htk.eng.cam.ac.uk/docs/docs.shtml}} +} + +@book{Young2006-fullauthor, + AUTHOR = {Steve Young and Gunnar Evermann and Mark J. F. Gales and Thomas Hain and Dan Kershaw and Xunying {(Andrew)} Liu and Gareth Moore and Julian Odell and Dave Ollason and Dan Povey and Valtcho Valtchev and Phil Woodland}, + TITLE = {The \textsc{htk} book (for \textsc{htk} Version 3.4.1)}, + YEAR = 2009, + PUBLISHER = {University of Cambridge}, + howpublished = {\url{http://htk.eng.cam.ac.uk/docs/docs.shtml}} +} + +@ARTICLE{gales-1999-semi-tied, + AUTHOR = {Mark J. F. Gales}, + TITLE = {{Semi-Tied Covariance Matrices for Hidden Markov Models}}, + JOURNAL = {IEEE Trans. Speech Audio Processing}, + YEAR = 1999, + VOLUME = 7, + NUMBER = 3, + PAGES = {272-281} +} + +@inproceedings{povey-2002-mpe, + author = {D. Povey and P. C. Woodland}, + title = {{Minimum Phone Error and I-smoothing for improved discriminative training}}, + booktitle = P_ICASSP, + year = 2002 +} + +@incollection{Carletta2006, + title={{The AMI meeting corpus: A pre-announcement}}, + author={Jean Carletta and Simone Ashby and Sebastien Bourban and Mike Flynn and Mael Guillemot and Thomas Hain and Jaroslav Kadlec and Vasilis Karaiskos and Wessel Kraaij and Melissa Kronenthal and Guillaume Lathoud and Mike Lincoln and Agnes Lisowska and Iain McCowan and Wilfried Post and Dennis Reidsma and Pierre Wellner}, + booktitle={Machine learning for multimodal interaction}, + pages={28--39}, + year={2006}, + publisher={Springer} +} + +@misc{quicknet, + title = {{QuickNet}}, + author={David Johnson and others}, + note = {\url{http://www1.icsi.berkeley.edu/Speech/qn.html}} +} + +@MISC{matrix_cookbook, + author = "K. B. Petersen and M. S. Pedersen", + title = {{The Matrix Cookbook}}, + year = "2008", + month = "Nov", + keywords = "Matrix identity, matrix relations, inverse, matrix derivative", + publisher = "Technical University of Denmark", + url = {\url{http://matrixcookbook.com/}} +} + +@comment{Language learning} + +@book{cefr-2001, + author = {Council of Europe}, + title={{Common European framework of reference for languages: Learning, teaching, assessment}}, + publisher= {Cambridge, U.K: Press Syndicate of the University of Cambridge}, + isbn={9780521005319}, + year={2001}, +} + +@inproceedings{cucchiarini-1997-automatic, + title={{Automatic evaluation of Dutch pronunciation by using speech recognition technology}}, + author={Cucchiarini, Catia and Strik, Helmer and Boves, Lou}, + booktitle= P_ASRU, + year={1997} +} + +@inproceedings{franco-2000-sri, + title={{The SRI EduSpeak system: Recognition and pronunciation scoring for language learning}}, + author={Franco, Horacio and Abrash, Victor and Precoda, Kristin and Bratt, Harry and Rao, Ramana and Butzberger, John and Rossier, Romain and Cesari, Federico}, + booktitle={Proc. 
InSTILL 2000}, + year={2000} +} + +@article{bernstein-2007-logic, + title={Logic and validation of fully automatic spoken {E}nglish test}, + author={Bernstein, Jared and Cheng, Jian}, + journal={The path of speech technologies in computer assisted language learning: From research toward practice}, + pages={174--194}, + year={2007}, + publisher={Routledge Florence, KY} +} + +@article{Zechner-2009, + author = {Klaus Zechner and others}, + title = {{Automatic scoring of non-native spontaneous speech in tests of spoken English}}, + journal = {Speech Communication}, + volume = "51", + number = "10", + pages = {883--895}, + year = "2009", +} + +@article{Zechner-2009-fullauthor, + author = {Klaus Zechner and Derrick Higgins and Xiaoming Xi and David M. Williamson}, + title = {{Automatic scoring of non-native spontaneous speech in tests of spoken English}}, + journal = {Speech Communication}, + volume = "51", + number = "10", + pages = {883--895}, + year = "2009", +} + +@article{higgins-2011, + author = {Derrick Higgins and Xiaoming Xi and Klaus Zechner and David Williamson}, + title = {A three-stage approach to the automated scoring of spontaneous spoken responses }, + journal = {Computer Speech and Language}, + volume = "25", + number = "2", + pages = "282-306", + year = "2011" +} + +@inproceedings{metallinou-2014-deep_neural, + AUTHOR = {Angeliki Metallinou and Jian Cheng}, + TITLE = {{Using Deep Neural Networks to Improve Proficiency Assessment for Children English Language Learners}}, + BOOKTITLE = P_INTERSPEECH, + YEAR = {2014} +} + +@inproceedings{Wang2015, + author={Haipeng Wang and others}, + title={{Joint Decoding of Tandem and Hybrid Systems for Improved Keyword Spotting on Low Resource Languages}}, + booktitle=P_INTERSPEECH, + year=2015, +} + +@inproceedings{Wang2015-fullauthor, + author={Haipeng Wang and Anton Ragni and Mark J. F. Gales and Kate M. Knill and Philip C. Woodland and Chao Zhang}, + title={{Joint Decoding of Tandem and Hybrid Systems for Improved Keyword Spotting on Low Resource Languages}}, + booktitle=P_INTERSPEECH, + year=2015, +} + +@inproceedings{Zhang2015, +author={Chau Zhang and Philip C. Woodland}, +title={{A General Artificial Neural Network Extension for HTK}}, +booktitle=P_INTERSPEECH, +year={2015} +} + +@ARTICLE{Park2011, + author = {Junho Park and Frank Diehl and Mark J.F. Gales and Moquan Tomalin and Philip C. Woodland}, + title = {The efficient incorporation of \{MLP\} features into automatic speech + recognition systems }, + journal = {Computer Speech and Language }, + year = {2011}, + volume = {25}, + pages = {519 - 534}, + number = {3}, +} + +@ARTICLE{Mohamed2012, + author = {Mohamed, Abdel-rahman and Dahl, George E and Hinton, Geoffrey}, + title = {Acoustic modeling using deep belief networks}, + journal = {IEEE Transactions on Audio, Speech, and Language Processing}, + year = {2012}, + volume = {20}, + pages = {14--22}, + number = {1}, +} + + +@ARTICLE{mdn, + author = {C. M. 
Bishop}, + title = {{Mixture Density Networks}}, + journal = {Technical Report NCRG 4288, Neural Computing Research Group, Department of Computer Science, Aston University}, + year = {1994} +} + +@INPROCEEDINGS{Vesely2013, + author = {Vesel{\`y}, Karel and Ghoshal, Arnab and Burget, Luk{\'a}s and Povey, + Daniel}, + title = {Sequence-discriminative training of deep neural networks}, + booktitle = P_INTERSPEECH, + year = {2013}, + pages = {2345--2349} +} + +@InProceedings{malinin2016offtopic, + author = {Malinin, Andrey and van Dalen, Rogier and Knill, Kate and Wang, Yu and Gales, Mark}, + title = {{Off-topic Response Detection for Spontaneous Spoken English Assessment}}, + booktitle = {Proc. 54th Annual Meeting of the Association for Computational Linguistics (ACL)}, + year = {2016}, + address = {Berlin, Germany}, + pages = {1075--1084}, +} + +@Book{htk-2015, + AUTHOR = {Steve Young and others}, + title = {The \textsc{HTK} book (for \textsc{HTK} version 3.5)}, + PUBLISHER = {University of Cambridge}, + year = {2015}, + note = {\url{http://htk.eng.cam.ac.uk}}, +} + +@Book{htk-2015-fullauthor, + AUTHOR = {Steve Young and Gunnar Evermann and Mark J. F. Gales and Thomas Hain and Dan Kershaw and Xunying {(Andrew)} Liu and Gareth Moore and Julian Odell and Dave Ollason and Dan Povey and Valtcho Valtchev and Phil Woodland}, + title = {The \textsc{HTK} book (for \textsc{HTK} version 3.5)}, + PUBLISHER = {University of Cambridge}, + year = {2015}, + note = {\url{http://htk.eng.cam.ac.uk}}, +} + +@inproceedings{malinin2017incorporating, + author = {A. Malinin and A. Ragni and M.J.F. Gales and K.M. Knill}, + title = {{Incorporating Uncertainty into Deep Learning for Spoken Language Assessment}}, + booktitle = {Proc. 55th Annual Meeting of the Association for Computational Linguistics (ACL)}, + year = 2017, +} + +@inproceedings{malinin2017attention, + author = {A, Malinin and K. Knill and A. Ragni and Y. Wang and M.J.F. Gales}, + title = {{An attention based model for off-topic spontaneous spoken respnse detection: An Initial Study}}, + booktitle = {Proc. ISCA Workshop on Speech and Language Technology for Education (SLaTE)}, + year = 2017 + } + + @inproceedings{evermann2000large, + author = {G. Evermann and P. C. Woodland}, + title = {{Large vocabulary decoding and confidence estimation using word posterior probabilities}}, + booktitle = P_ICASSP, + year = 2000, +} + +@article{vandermaaten2008visualizing, + author = {van~der~Maaten, L. and Hinton, G.}, + title = {{Visualizing Data using t-SNE}}, + journal = {{J. 
MLR}}, + year = {2008}, + volume = {1}, + pages = {1--49} +} + +@article{semeion, + author = {M Buscema}, + title = {MetaNet: The Theory of Independent Judges}, + journal={Substance Use \& Misuse}, + volume = 33, + number = 2, + year = 1998, + pages={439-461}, + } + + + +@inproceedings{dietterich2000ensemble, +author = {Dietterich, Thomas G.}, +title = {Ensemble Methods in Machine Learning}, +year = {2000}, +isbn = {3540677046}, +publisher = {Springer-Verlag}, +address = {Berlin, Heidelberg}, +booktitle = {Proceedings of the First International Workshop on Multiple Classifier Systems}, +pages = {1–15}, +numpages = {15}, +series = {MCS '00} +} + +@article{hendrycks2021nae, + title={Natural Adversarial Examples}, + author={Dan Hendrycks and Kevin Zhao and Steven Basart and Jacob Steinhardt and Dawn Song}, + journal={CVPR}, + year={2021} +} + +@article{hendrycks2020many, + title={The Many Faces of Robustness: A Critical Analysis of Out-of-Distribution Generalization}, + author={Dan Hendrycks and Steven Basart and Norman Mu and Saurav Kadavath and Frank Wang and Evan Dorundo and Rahul Desai and Tyler Zhu and Samyak Parajuli and Mike Guo and Dawn Song and Jacob Steinhardt and Justin Gilmer}, + journal={arXiv preprint arXiv:2006.16241}, + year={2020} +} + +@article{hendrycks2019robustness, + title={Benchmarking Neural Network Robustness to Common Corruptions and Perturbations}, + author={Dan Hendrycks and Thomas Dietterich}, + journal={Proceedings of the International Conference on Learning Representations}, + year={2019} +} + +@inproceedings{papineni2002bleu, + title={Bleu: a method for automatic evaluation of machine translation}, + author={Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing}, + booktitle={Proceedings of the 40th annual meeting of the Association for Computational Linguistics}, + pages={311--318}, + year={2002} +} + +@misc{vaswani2017attention, + title={Attention Is All You Need}, + author={Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin}, + year={2017}, + eprint={1706.03762}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} + +@misc{goyal2018accurate, + title={Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour}, + author={Priya Goyal and Piotr Dollár and Ross Girshick and Pieter Noordhuis and Lukasz Wesolowski and Aapo Kyrola and Andrew Tulloch and Yangqing Jia and Kaiming He}, + year={2018}, + eprint={1706.02677}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +@inproceedings{ +zhang2018residual, +title={Residual Learning Without Normalization via Better Initialization}, +author={Hongyi Zhang and Yann N. Dauphin and Tengyu Ma}, +booktitle={International Conference on Learning Representations}, +year={2019}, +url={https://openreview.net/forum?id=H1gsz30cKX}, +} + +@inproceedings{rezero, + title = "ReZero is All You Need: Fast Convergence at Large Depth", + author = "Bachlechner, Thomas and + Majumder, Bodhisattwa Prasad + Mao, Huanru Henry and + Cottrell, Garrison W. and + McAuley, Julian", + booktitle = "arXiv", + year = "2020", + url = "https://arxiv.org/abs/2003.04887" +} + +@article{albumentations, + author = {A. Buslaev and A. Parinov and E. Khvedchenya and V.~I. Iglovikov and A.~A. 
Kalinin},
+ title = "{Albumentations: fast and flexible image augmentations}",
+ journal = {ArXiv e-prints},
+ eprint = {1809.06839},
+ year = 2018
+}
+
+@inproceedings{touvron2019FixRes,
+ author = {Touvron, Hugo and Vedaldi, Andrea and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
+ title = {Fixing the train-test resolution discrepancy},
+ booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
+ year = {2019},
+}
+
+@inproceedings{naeini2015obtaining,
+ title={Obtaining well calibrated probabilities using bayesian binning},
+ author={Naeini, Mahdi Pakdaman and Cooper, Gregory and Hauskrecht, Milos},
+ booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
+ volume={29},
+ number={1},
+ year={2015}
+}
+
+@inproceedings{sennrich-etal-2016-neural,
+ title = "Neural Machine Translation of Rare Words with Subword Units",
+ author = "Sennrich, Rico and
+ Haddow, Barry and
+ Birch, Alexandra",
+ booktitle = "Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
+ month = aug,
+ year = "2016",
+ address = "Berlin, Germany",
+ publisher = "Association for Computational Linguistics",
+ url = "https://www.aclweb.org/anthology/P16-1162",
+ doi = "10.18653/v1/P16-1162",
+ pages = "1715--1725",
+}
diff --git a/2024/05/29/papers/2105.06987/conclusion.tex b/2024/05/29/papers/2105.06987/conclusion.tex
new file mode 100644
index 00000000..1bdd117c
--- /dev/null
+++ b/2024/05/29/papers/2105.06987/conclusion.tex
@@ -0,0 +1,3 @@
+\section{Conclusion}\label{sec:conclusion}
+
+This work examined the poor convergence of Ensemble Distribution Distillation when applied to large-scale tasks where the number of classes is very high. We investigated the Dirichlet log-likelihood loss and showed that classes with low probability induce larger gradients than high-probability classes, forcing the model to focus on the distribution of the ensemble tail-class probabilities. We proposed a new training objective which minimizes the reverse KL-divergence to a \emph{Proxy-Dirichlet} target derived from the ensemble. This loss resolves the gradient issues of Ensemble Distribution Distillation, as we demonstrate both theoretically and empirically on the ImageNet and WMT17 En-De datasets containing 1000 and 40,000 classes, respectively. Thus, this work allows Ensemble Distribution Distillation to be applied to tasks with arbitrary numbers of classes and complexity, enabling fast ensemble inference through distillation in compute-bound, risk-critical applications.
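A quick sketch of the loss described in the conclusion above: the reverse KL-divergence from the distilled model's predicted Dirichlet to a Proxy-Dirichlet target built from the ensemble. This is not the paper's implementation; the proxy construction (ensemble mean probabilities scaled by an assumed precision beta0, plus one) and all names and values are illustrative assumptions.

#+begin_src python
# Minimal sketch (not the paper's code): reverse KL between the student's
# predicted Dirichlet and a Proxy-Dirichlet target from an ensemble.
# The proxy form (mean probability * beta0 + 1) is an assumption here.
import torch


def dirichlet_kl(alpha: torch.Tensor, beta: torch.Tensor) -> torch.Tensor:
    """KL[ Dir(alpha) || Dir(beta) ] for concentration tensors of shape [B, K]."""
    a0 = alpha.sum(-1)
    b0 = beta.sum(-1)
    return (torch.lgamma(a0) - torch.lgamma(alpha).sum(-1)
            - torch.lgamma(b0) + torch.lgamma(beta).sum(-1)
            + ((alpha - beta)
               * (torch.digamma(alpha) - torch.digamma(a0).unsqueeze(-1))).sum(-1))


def proxy_dirichlet_loss(student_logits: torch.Tensor,
                         ensemble_probs: torch.Tensor,
                         beta0: float = 100.0) -> torch.Tensor:
    """Reverse KL to a Proxy-Dirichlet target.

    student_logits: [B, K] raw outputs of the distilled model.
    ensemble_probs: [B, M, K] class probabilities from M ensemble members.
    """
    alpha = torch.exp(student_logits) + 1.0  # student concentrations, kept > 1
    pi_bar = ensemble_probs.mean(dim=1)      # ensemble mean prediction
    beta = pi_bar * beta0 + 1.0              # proxy target concentrations (assumed form)
    return dirichlet_kl(alpha, beta).mean()  # reverse KL: student as first argument


if __name__ == "__main__":
    B, M, K = 4, 8, 1000
    logits = torch.randn(B, K, requires_grad=True)
    ens = torch.softmax(torch.randn(B, M, K), dim=-1)
    loss = proxy_dirichlet_loss(logits, ens)
    loss.backward()
    print(float(loss))
#+end_src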
diff --git a/2024/05/29/papers/2105.06987/distillation_structured_uncertainty.tex b/2024/05/29/papers/2105.06987/distillation_structured_uncertainty.tex
new file mode 100644
index 00000000..35901de2
--- /dev/null
+++ b/2024/05/29/papers/2105.06987/distillation_structured_uncertainty.tex
@@ -0,0 +1,181 @@
+\section{Distillation for Structured Prediction}
+
+\begin{eqnarray*}
+{\cal D}
+%% =\left\{\{{\bm x}^{(1)},y^{(1)}\},\ldots,{\bm x}^{(N)},y^{(N)}\}\right\}
+=
+\left\{\{{\bm x}^{(i)},y^{(i)}\}\right\}_{i=1}^N
+; \:\:\:\:
+y^{(i)}\in\{\omega_1,\ldots,\omega_K\}
+\end{eqnarray*}
+In terms of standard notation, given the training data ${\cal D}$, training can be expressed as
+\begin{eqnarray}
+{\overline{\bm\theta}} =
+\argmax_{\bm\theta}\left\{
+\sum_{i=1}^N
+\log\left({\tt P}(y^{(i)}|\bm{x}^{(i)}; {{\bm{\theta}}})\right)
+\right\}
+\end{eqnarray}
+The labels $y$ from the joint distribution are now discarded, and the posteriors from the trained model are used to estimate the distilled model parameters ${\hat{\bm\theta}}$
+\begin{eqnarray}
+{\hat{\bm\theta}} = \argmin_{\bm\theta}\left\{
+\sum_{i=1}^N{\tt KL}\big[
+{\tt P}(y|\bm{x}^{(i)}; {\overline{\bm{\theta}}})
+\big|\big|
+{{\tt P}(y|\bm{x}^{(i)}; {{\bm{\theta}}})}
+\big]
+\right\}
+\end{eqnarray}
+where
+\begin{eqnarray}
+{\tt KL}\big[
+{\tt P}(y|\bm{x}^{(i)}; {\overline{\bm{\theta}}})
+\big|\big|
+{{\tt P}(y|\bm{x}^{(i)}; {{\bm{\theta}}})}
+\big] =
+\sum_{k=1}^K{\tt P}(y=\omega_k|{\bm x}^{(i)}; {\overline{\bm{\theta}}})\log
+\left(
+\frac{{\tt P}(y=\omega_k|{\bm x}^{(i)}; {\overline{\bm{\theta}}})}
+{{\tt P}(y=\omega_k|{\bm x}^{(i)};{\bm\theta})}
+\right)
+\end{eqnarray}
+When the data is structured, for example in tasks such as speech recognition, speech synthesis and machine translation, the joint distribution of the data is more complicated as there are now observation sequences and label sequences. The data distribution can be expressed as~\footnote{Of course $T$ and $L$ vary between samples, but this is ignored for notational simplicity.} ${\tt p}_{\tt tr}({\bm x}_{1:T},y_{1:L})$. Again, only samples from the true distribution are available
+\begin{eqnarray*}
+{\cal D}
+%% =\left\{\{{\bm x}^{(1)},y^{(1)}\},\ldots,{\bm x}^{(N)},y^{(N)}\}\right\}
+=
+\left\{\{{\bm x}^{(i)}_{1:T},y^{(i)}_{1:L}\}\right\}_{i=1}^N
+; \:\:\:\:
+y^{(i)}\in\{\omega_1,\ldots,\omega_K\}
+\end{eqnarray*}
+This data can again be used to train a standard sequence model with parameters ${\overline{\bm\theta}}$. One optional difference is that, in place of the simple (conditional) maximum likelihood training used for unstructured data, training can use minimum Bayes' risk training with an appropriate sequence loss function. The original implementation of knowledge distillation, or teacher-student training, for this form of model used a simple per-sample approximation
+\begin{eqnarray}
+{\hat{\bm\theta}} = \argmin_{\bm\theta}\left\{
+\sum_{i=1}^N\sum_{j=1}^T{\tt KL}\big[
+{\tt P}(y|\bm{x}^{(i)}_j; {\overline{\bm{\theta}}})
+\big|\big|
+{{\tt P}(y|\bm{x}^{(i)}_j; {{\bm{\theta}}})}
+\big]
+\right\}
+\end{eqnarray}
+Effectively this ignores the sequence nature of the problem. Sequence teacher-student training, or sequence distillation, aims to address this problem by considering posterior distributions over complete sequences, compressed into a lattice ${\cal L}$ generated from model ${\overline{\bm\theta}}$ for the $i$-th sample, ${\cal L}|\bm{x}^{(i)}_{1:T}; {\overline{\bm{\theta}}}$.
The expression then becomes +\begin{eqnarray} +{\hat{\bm\theta}} = \argmin_{\bm\theta}\left\{ +\sum_{i=1}^N{\tt KL}\big[ +{\cal L}|\bm{x}^{(i)}_{1:T}; {\overline{\bm{\theta}}} +\big|\big| +{\cal L}|\bm{x}^{(i)}_{1:T}; {\bm{\theta}} +\big] +\right\} +\end{eqnarray} + +For these approaches there is no assumption that the complexity of the models is consistent between ${\overline{\bm\theta}}$ and ${\hat{\bm\theta}}$. This distillation can be used to simplify the complexity of the model, yielding speed and memory advantages. + +The second application of distillation is to distill an ensemble of models into a single model. Applying standard ensemble distillation +\begin{eqnarray} +{\hat{\bm\theta}} = \argmin_{\bm\theta}\left\{ +\sum_{i=1}^N\sum_{m=1}^M{\tt KL}\big[ +{\tt p}(y|\bm{x}^{(i)}; {\overline{\bm{\theta}}}^{(m)}) +\big|\big| +{\tt p}(y|\bm{x}^{(i)}; {\bm{\theta}}) +\big] +\right\} +\end{eqnarray} +where the ensemble of models comprises parameters, $\left\{{\overline{\bm\theta}}^{(m)}\right\}_{m=1}^M$. As previously mentioned this has ignored the diversity associated with the model. To address this problem ensemble distribution distillation can be used. Here the distribution of the predictions in the ensemble is modelled. Denote the prediction of sample ${\bm x}^{(i)}$ using model ${\overline{\bm\theta}}^{(m)}$ as +\begin{eqnarray} +{\bm\pi}^{(mi)} = \left[ +\begin{array}{c} +{\tt P}(y=\omega_1|{\bm x}^{(i)};{\overline{\bm\theta}}^{(m)}) \\ +\vdots\\ +{\tt P}(y=\omega_K|{\bm x}^{(i)};{\overline{\bm\theta}}^{(m)}) +\end{array} +\right]; +\:\:\:\: +{\bm\pi}^{(mi)} \sim {\tt p}_{\pi}({\bm x}^{(i)}) +\end{eqnarray} +To model the ensemble distribution these samples can then be used for training the model +\begin{eqnarray} +{\hat{\bm\theta}} = +\argmin_{\bm\theta}\left\{ +\sum_{i=1}^N +{\tt KL}\big[ +{\tt p}_{\pi}(\bm{x}^{(i)}) +\big|\big| +{\tt p}_{\pi}(\bm{x}^{(i)}; {\bm{\theta}}) +\big] +\right\} +\approx +\argmax_{\bm\theta}\left\{ +\sum_{i=1}^N +\sum_{m=1}^M +\log\left( +{\tt p}_{\pi}({\bm\pi}^{(mi)} | \bm{x}^{(i)}; {\bm{\theta}}) +\right) +\right\} +\end{eqnarray} +This form of model, associated with distributions over distributions, is a prior network. This has the same form as density networks, networks that predict the parameters of parametric distributions, but the form here is a distribution over distributions. Thus +\begin{eqnarray*} +{\tt p}_{\pi}({\bm\pi}^{(mi)} | \bm{x}^{(i)}; {\bm{\theta}}) += {\tt Dir}({\bm\pi}^{(mi)}; {\cal F}_\alpha(\bm{x}^{(i)}; {\bm{\theta}})) +\end{eqnarray*} +where ${\cal F}_\alpha(\bm{x}^{(i)}; {\bm{\theta}})$ yields the appropriate Dirichlet distribution parameters. + +Extending this form of distribution to handle structured data is non-trivial. It is possible to extend the notation above to handle lattices +\begin{eqnarray} +{\cal L}^{(mi)} = +{\cal L}|{\bm x}^{(i)};{\overline{\bm\theta}}^{(m)}; +\:\:\:\: +{\cal L}^{(mi)} \sim {\tt p}_{\cal L}({\bm x}^{(i)}) +\end{eqnarray} +The challenge is to decide on the appropriate parametric form for ${\tt p}_{\cal L}({\bm x}^{(i)};{\bm\theta})$. + +\section{Sequence Distribution Distillation} +The simplest approach to a distribution over the lattice~\footnote{The initial form considered here is a prefix-tree not a lattice.} is to make an additional conditional independence assumption: the distribution over the predicted word distributions is only dependent on the history of words to that point. 
Thus consider adding a new arc ${\cal A}$ to the lattice having taken word sequence $w_{1:\tau-1}=w_1,\ldots,w_{\tau-1}$ where $w_\tau\in\{\omega_k\}_{k=1}^K$. Initially, consider the scenario where only the word-id information is used (so duration and score are not considered). The task is to get the distribution over the predictions for word $w_\tau$. Consider
+\begin{eqnarray*}
+{\tt p}_{\pi}({{\bm\pi}}_{\tau}|w_{1:\tau-1};{\bm\theta})
+\end{eqnarray*}
+where
+\begin{eqnarray}
+{\bm\pi}_{\tau} = \left[
+\begin{array}{c}
+{\tt P}(y=\omega_1|w_{1:\tau-1}) \\
+\vdots\\
+{\tt P}(y=\omega_K|w_{1:\tau-1})
+\end{array}
+\right]
+\end{eqnarray}
+
+Consider the following form
+\begin{eqnarray}
+{\tt p}_{\pi}({{\bm\pi}}_{\tau}|w_{1:\tau-1};{\bm\theta})
+\approx
+{\tt p}_{\pi}({{\bm\pi}}_{\tau}|w_{\tau-1},{\bm\alpha}_{\tau-1};{\bm\theta})
+= {\tt Dir}({{\bm\pi}}_{\tau};{\cal F}_\alpha(w_{\tau-1},{\bm\alpha}_{\tau-1};{\bm\theta}))
+= {\tt Dir}({{\bm\pi}}_{\tau};{\bm\alpha}_{\tau})
+\end{eqnarray}
+This effectively replaces the standard PMF prediction from an RNN-LM with a Dirichlet distribution prediction. Following this logic through
+\begin{eqnarray}
+{\cal F}_\alpha(w_{\tau},{\bm\alpha}_{\tau};{\bm\theta}) \approx
+{\cal F}_\alpha(w_{\tau},{\bm h}_{\tau};{\bm\theta}); \:\:\:\:
+{\bm h}_{\tau} = {\cal F}_{\tt h}(w_{\tau-1},{\bm h}_{\tau-1};{\bm\theta})
+\end{eqnarray}
+This form of model can be trained in the same fashion as standard LMs, but now using the distributions that result from an ensemble of language models. Let
+\begin{eqnarray}
+{\bm\pi}^{(m)}_{\tau} = \left[
+\begin{array}{c}
+{\tt P}(y=\omega_1|w_{1:\tau-1};{\bm\theta}^{(m)}) \\
+\vdots\\
+{\tt P}(y=\omega_K|w_{1:\tau-1};{\bm\theta}^{(m)})
+\end{array}
+\right]
+\end{eqnarray}
+The optimisation criterion then becomes
+\begin{eqnarray}
+{\hat{\bm\theta}} = \argmax_{{\bm\theta}}\left\{
+\sum_{\tau=1}^L\sum_{m=1}^M\log\left(
+{\tt p}_{\pi}({\bm\pi}^{(m)}_{\tau}|w_{1:\tau-1};{\bm\theta})
+\right)
+\right\}
+\end{eqnarray}
+
diff --git a/2024/05/29/papers/2105.06987/ensemble_uncertainty.tex b/2024/05/29/papers/2105.06987/ensemble_uncertainty.tex
new file mode 100644
index 00000000..54b6fa41
--- /dev/null
+++ b/2024/05/29/papers/2105.06987/ensemble_uncertainty.tex
@@ -0,0 +1,62 @@
+\section{Uncertainty Estimation via Ensembles}\label{sec:ensembles}
+%%%%FLOW!!!!!!%%%%
+
+In this work we take a Bayesian viewpoint on ensembles, as it yields an elegant probabilistic framework within which interpretable uncertainty estimates can be obtained. The core of the Bayesian approach is to treat the model parameters $\bm{\theta}$ as random variables and place a prior ${\tt p}(\bm{\theta})$ over them to compute a posterior ${\tt p}(\bm{\theta}|\mathcal{D})$ via Bayes' rule:
+\begin{empheq}{align}
+\begin{split}
+    {\tt p}(\bm{\theta}|\mathcal{D}) &= \frac{{\tt p}(\mathcal{D}|\bm{\theta}){\tt p}(\bm{\theta})}{{\tt p}(\mathcal{D})}
+\end{split}
+\label{eqn:bayesposterior}
+\end{empheq}
+However, exact Bayesian inference is intractable for neural networks. It is therefore necessary to consider an explicit or implicit approximation ${\tt q}(\bm{\theta})$ to the true posterior ${\tt p}(\bm{\theta}|\mathcal{D})$ to generate an ensemble of models. A number of different approaches to generating an ensemble of models have been developed, such as Monte-Carlo Dropout~\cite{Gal2016Dropout}, DeepEnsembles~\cite{deepensemble2017}, and Stochastic Weight Averaging Gaussian (SWAG)~\cite{maddox2019simple}.
A full overview is available in~\cite{ashukha2020pitfalls, trust-uncertainty}.
+
+Consider an ensemble of models $\{{\tt P}(y | \bm{x}; \bm{\theta}^{(m)})\}_{m=1}^M$ sampled from an approximate posterior ${\tt q}(\bm{\theta})$, where each model captures an \emph{unstructured} mapping $\bm{x} \rightarrow y$, with $\bm{x} \in \mathcal{R}^D$ and $y \in \{\omega_1,\cdots,\omega_K\}$. Each of the models ${\tt P}(y|\bm{x}, \bm{\theta}^{(m)})$ yields a \emph{different} estimate of \emph{data uncertainty}. Uncertainty in predictions due to \emph{knowledge uncertainty} is expressed as the level of spread, or `disagreement', of models in the ensemble~\cite{malinin-thesis}. The \emph{predictive posterior} of the ensemble is obtained by taking the expectation with respect to the models in the ensemble:
+\begin{empheq}{align}
+\begin{split}
+{\tt P}(y | \bm{x}, \mathcal{D}) =&\ \mathbb{E}_{{\tt p}(\bm{\theta}|\mathcal{D})}\big[{\tt P}(y | \bm{x} ; \bm{\theta}) \big] \\
+\approx&\ \frac{1}{M}\sum_{m=1}^M {\tt P}(y | \bm{x}; \bm{\theta}^{(m)}),\ \bm{\theta}^{(m)} \sim {\tt q}(\bm{\theta})
+\end{split}
+\end{empheq}
+The entropy of the predictive posterior will be an estimate of \emph{total uncertainty} in predictions:
+\begin{empheq}{align}
+\mathcal{H}\big[{\tt P}(y | \bm{x}, \mathcal{D})\big] =&\ \mathbb{E}_{{\tt P}(y | \bm{x}, \mathcal{D})}\big[ -\ln {\tt P}(y | \bm{x}, \mathcal{D}) \big]
+\end{empheq}
+\emph{Total uncertainty} in the prediction is due to both \emph{data uncertainty} and \emph{knowledge uncertainty}. However, for tasks such as misclassification detection, considering the log-likelihood score assigned by the predictive posterior to a particular class can yield superior performance as it is more sensitive to the prediction made~\cite{malinin-thesis}:
+\begin{empheq}{align}
+\begin{split}
+\text{SCR} =&\ \ln{\tt P}(y = \hat \omega| \bm{x}, \mathcal{D})
+\end{split}
+\end{empheq}
+
+In certain situations, such as active learning~\cite{batchbald} and out-of-distribution input detection, it is desirable to evaluate uncertainty in predictions due to \emph{knowledge uncertainty}. The sources of uncertainty can be decomposed by considering the \emph{mutual information} between the model parameters $\bm{\theta}$ and the prediction $y$~\cite{mutual-information}:
+\begin{empheq}{align}
+\begin{split}
+\underbrace{\mathcal{I}\big[y, \bm{\theta} | \bm{x}, \mathcal{D}\big]}_{\text{Knowledge Uncertainty}} =&\ \underbrace{\mathcal{H}\big[{\tt P}(y | \bm{x}, \mathcal{D})\big]}_{\text{Total Uncertainty}} \\
+- & \underbrace{\mathbb{E}_{{\tt q}(\bm{\theta})}\big[\mathcal{H}[{\tt P}(y | \bm{x}; \bm{\theta})]\big]}_{\text{Expected Data Uncertainty}}
+\end{split}\label{eqn:mi}
+\end{empheq}
+This is expressed as the difference between the entropy of the predictive posterior and the expected entropy of each model in the ensemble. The former is a measure of \emph{total uncertainty}, while the latter is a measure of \emph{expected data uncertainty}. Their difference will be a measure of the spread of the ensemble and an estimate of \emph{knowledge uncertainty}.
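This decomposition is straightforward to compute from an ensemble's per-model class probabilities. A minimal NumPy sketch of total uncertainty, expected data uncertainty, and the resulting mutual-information estimate of knowledge uncertainty (array shapes, function names, and the toy example are assumptions of this sketch):

#+begin_src python
import numpy as np

def uncertainty_decomposition(ensemble_probs, eps=1e-12):
    """ensemble_probs: array of shape (M, K), P(y|x; theta^(m)) for each member m."""
    mean_probs = ensemble_probs.mean(axis=0)                        # predictive posterior
    total = -(mean_probs * np.log(mean_probs + eps)).sum()          # H[ P(y|x, D) ]
    per_model_entropy = -(ensemble_probs * np.log(ensemble_probs + eps)).sum(axis=1)
    expected_data = per_model_entropy.mean()                        # E_q[ H[ P(y|x; theta) ] ]
    knowledge = total - expected_data                               # mutual information I[y, theta]
    return total, expected_data, knowledge

# toy example: a diverse 4-member ensemble over 3 classes
probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1],
                  [0.3, 0.3, 0.4],
                  [0.6, 0.3, 0.1]])
print(uncertainty_decomposition(probs))
#+end_src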
It is also possible to consider the \emph{expected pairwise KL-divergence} between models in an ensemble as an alternative measure of ensemble diversity:
+\begin{empheq}{align}
+\begin{split}
+\mathcal{K}[y, \bm{\theta}] =&\ \mathbb{E}_{{\tt q}(\bm{\theta}){\tt q}(\bm{\tilde \theta})}\big[{\tt KL}[{\tt P}(y|\bm{x},\bm{\theta})||{\tt P}(y|\bm{x},\bm{\tilde \theta})]\big] \\
+=& \underbrace{-\sum_{y} \mathbb{E}_{{\tt q}(\bm{\theta})}[{\tt P}(y|\bm{x}, \bm{\theta})]\mathbb{E}_{{\tt q}(\bm{\tilde \theta})}[\ln{\tt P}(y|\bm{x}, \bm{\tilde \theta})]}_{\text{Total Uncertainty}}\\
+- & \underbrace{\mathbb{E}_{{\tt q}(\bm{\theta})}\big[\mathcal{H}[{\tt P}(y | \bm{x}; \bm{\theta})]\big]}_{\text{Expected Data Uncertainty}}
+\end{split}
+\label{eqn:epklbayes}
+\end{empheq}
+where ${\tt q}(\bm{\theta})={\tt q}(\bm{\tilde \theta})$. This measure is an upper bound on the mutual information and also allows \emph{total uncertainty} to be decomposed into \emph{knowledge uncertainty} and \emph{data uncertainty}~\cite{malinin-thesis}. Notably, only the estimates of \emph{total uncertainty} differ, while the estimate of \emph{data uncertainty} provided by both decompositions is the same.
+
+
+
+
+
+
+
+
+% First, let's consider the entropy of a single model in the ensemble at an input $\bm{x}$:
+% \begin{empheq}{align}
+% \begin{split}
+% \mathcal{H}\big[{\tt P}(y | \bm{x}; \bm{\theta})\big]
+% =&\ \mathbb{E}_{{\tt P}(y | \bm{x}; \bm{\theta})}\big[ -\ln {\tt P}(y | \bm{x}; \bm{\theta}) \big]
+% \end{split}
+% \end{empheq}
+% This will be an estimate of \emph{data uncertainty}, or uncertainty due to the natural complexity of the data~\cite{malinin-thesis}.
\ No newline at end of file
diff --git a/2024/05/29/papers/2105.06987/experiments.tex b/2024/05/29/papers/2105.06987/experiments.tex
new file mode 100644
index 00000000..7a673c3e
--- /dev/null
+++ b/2024/05/29/papers/2105.06987/experiments.tex
@@ -0,0 +1,159 @@
+\section{Experiments}
+\label{sec:experiments}
+
+In this section, we evaluate \Endd via minimization of Reverse KL-divergence between the model and a Proxy Dirichlet target. We apply distribution distillation to ensembles of convolutional networks trained on the ImageNet dataset and to an ensemble of Transformer models trained on WMT'17 En-De. Our goal here is to demonstrate that, given an ensemble, we can successfully distribution-distill it into a single model. Note that we do not provide results for \Endd accomplished by optimizing Dirichlet NLL or forward KL-divergence, because we could not get them to even begin to converge on the tasks considered here.
+
+\subsection{Setup}
+\label{sec:experiments_setup}
+We consider two large-scale tasks involving classification: 1000-class image classification and sequence-to-sequence modeling of natural language. For each task, we first train the ensemble of regular models and then distill it with \Endd. For comparison, we also report the average single-model performance along with the following baselines:
+
+\begin{itemize}
+\item \textbf{Ensemble} refers to the performance of an ensemble of independently trained models, which was previously shown to yield high-quality uncertainty estimates~\cite{deepensemble2017} and to outperform more sophisticated methods using only a few models~\cite{ashukha2020pitfalls}.
+\item \textbf{Ensemble Distillation} (EnD) is a common approach to model and ensemble distillation, first proposed in~\cite{hinton2015distilling}. It involves training the student model with the soft target distribution of averaged ensemble predictions.
Notably, we do not add the cross-entropy loss for ground-truth labels, because we focus on the comparison of distillation objectives and not only classification performance.
+\end{itemize}
+
+% TODO discuss why not MC-dropout?
+We do not use Hydra~\cite{hydra} or similar multi-head approaches for distilling each separate ensemble member, because with a high number of models in the ensemble and even 1000 classes the computational overhead is no longer negligible. In all experiments with \Endd, we add 1 both to the predicted parameters of the Dirichlet distribution and the Dirichlet proxy parameters.
+% : we evaluate the performance of versions without these modifications in Section~\ref{sec:experiments_ablation}
+
+Both for error rejection and out-of-distribution detection, we use several information-theoretic measures of uncertainty; in particular, we use the entropy of the expected predictive distribution (EoE) for total uncertainty and Reverse Mutual Information (RMI) for knowledge uncertainty throughout this section.
+Derivations of these measures both for \Endd and ensembles are available in~\cite{malinin-thesis} and~\cite{malinin-structured-2020}.
+For the Single and EnD single-model baselines, we use the entropy of the output distribution as the only valid uncertainty estimate.
+% the same measures of uncertainty by interpreting exponents of logits as parameters of a Dirichlet distribution.
+% As we show later, in some setups the performance of such models can be surprisingly competitive with that of \Endd and even ensembles; we leave the study of this phenomenon to future work.
+
+\subsection{Large-scale image classification}
+\label{experiments:imagenet}
+
+For the first experiment, we run distillation of the ensemble that contains 10 ResNet-50~\cite{resnet} models trained on the ImageNet~\cite{imagenet} image classification dataset. We use the standard training setup outlined in~\cite{touvron2019FixRes}; specifically, we train for 90 epochs using stochastic gradient descent with momentum of 0.9 and a learning rate of $0.1\times B/256$ (first proposed in~\cite{goyal2018accurate}), where $B$ is the per-device batch size multiplied by the number of GPUs.
+In our experiments, we use a single-GPU batch size of 256 and 8 NVIDIA V100 GPUs. The learning rate is divided by 10 every 30 epochs. For data augmentation, we use a standard combination of random resized crops and horizontal flips implemented in the Albumentations library~\cite{albumentations}.
+In all experiments, we found it beneficial to initialize the last batch normalization $\gamma$ in each residual branch to zero, which agrees with previous results~\cite{goyal2018accurate, zhang2018residual, rezero}.
+
+For a thorough evaluation of all methods, we use several different characteristics of performance. First, we measure the in-domain classification accuracy on the original ImageNet validation subset~\cite{imagenet}, which is commonly used for comparison of image classification models. Second, we compare the robustness of all approaches to different domain shifts, also measured by accuracy on datasets corresponding to these shifts. In particular, we use adversarial examples from ImageNet-A~\cite{hendrycks2021nae}, corrupted and perturbed versions of the original ImageNet validation data from ImageNet-C~\cite{hendrycks2019robustness}, and artistic renditions from ImageNet-R~\cite{hendrycks2020many}. Next, these domain-shift datasets and the original validation dataset are used to compare the calibration of models with Expected Calibration Error (ECE).
+Finally, we measure the out-of-distribution detection error in terms of the area under the Receiver Operating Characteristic curve (ROC AUC) on the domain-shift datasets together with ImageNet-O~\cite{hendrycks2021nae}.
+
+% \begin{itemize}
+% \item In-domain classification accuracy
+% \item Robustness is specifically classification accuracy for out-of-domain data
+% \item Expected calibration error (ECE)
+% \item Out-of-domain detection error: , measured in terms of the area under the ROC AUC curve
+% \end{itemize}
+
+% Finally, we also evaluate the out-of-distribution detection error, measured in terms of Receiver Operating Characteristic area under curve (ROC AUC).
+
+
+We report the results for all metrics in Tables~\ref{tab:imagenet_pred} and~\ref{tab:imagenet_ood} for prediction quality and out-of-distribution detection, respectively.
+Here, the metrics on ImageNet-C are averaged over all degrees of corruption; in Figure~\ref{fig:imagenet_breakdown}, we provide the detailed results of evaluation on each degree separately.
+For out-of-distribution detection, we also provide the results of the Dirichlet Proxy to verify that this approximation of the ensemble predictive distribution does not significantly affect its performance.
+
+Table~\ref{tab:imagenet_pred} shows that \Endd is capable of accurately emulating the ensemble in terms of classification performance: its accuracy is on par with or slightly better than that of regular distillation, while its calibration errors are smaller. Also, in Table~\ref{tab:imagenet_ood}, it can be seen that for most datasets (except the hardest, ImageNet-O) Proxy-Dirichlet distillation can closely match the out-of-distribution performance of the ensemble. As expected, both distillation methods outperform training a single model from scratch while having the same computational complexity.
+
+Furthermore, Figure~\ref{fig:imagenet_breakdown} shows that as the domain shift increases, all models suffer from a drop in accuracy and calibration quality; notably, EnD and \Endd have the same calibration performance on the original data, but Dirichlet network distillation has lower calibration errors for the highest degrees of corruption. Unsurprisingly, the further the data is from the original training images, the better the models are at out-of-distribution detection.
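For reference, ECE of the kind reported in the tables below is typically computed by binning predictions by confidence and averaging the gap between accuracy and confidence in each bin, weighted by bin size. A minimal sketch with equal-width bins (the bin count and binning scheme are assumptions of this sketch, not necessarily the exact procedure used here):

#+begin_src python
import numpy as np

def expected_calibration_error(probs, labels, n_bins=15):
    """probs: (N, K) predicted class probabilities; labels: (N,) integer class labels."""
    confidences = probs.max(axis=1)
    predictions = probs.argmax(axis=1)
    accuracies = (predictions == labels).astype(float)
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(bins[:-1], bins[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if mask.any():
            # weight of the bin times the |accuracy - confidence| gap inside it
            ece += mask.mean() * abs(accuracies[mask].mean() - confidences[mask].mean())
    return ece
#+end_src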
+ +\begin{table} +\centering +\small +\caption{Prediction quality results for image classification.} +\label{tab:imagenet_pred} +\begin{tabular}{lcccccccc} +\toprule +{} & \multicolumn{2}{c}{ImageNet-val} & \multicolumn{2}{c}{ImageNet-A} & \multicolumn{2}{c}{ImageNet-C} & \multicolumn{2}{c}{ImageNet-R} \\ +{} & Acc & ECE & Acc & ECE & Acc & ECE & Acc & ECE \\ +\midrule +Single & 75.9±0.1 & 4.8±0.1 & 4.4±0.2 & 51.1±0.3 & 39.1±0.7 & 11.3±0.7 & 35.0±0.2 & 21.3±0.4 \\ +Ensemble & 79.0 & 2.3 & 3.9 & 42.0 & 43.5 & 4.5 & 38.8 & 9.8 \\ +EnD & 77.0 & 1.6 & 3.8 & 46.6 & 40.6 & 5.9 & 36.9 & 16.1 \\ +\Endd & 77.1 & 1.6 & 3.9 & 42.8 & 40.6 & 4.5 & 37.0 & 11.8 \\ +\bottomrule +\end{tabular} +\end{table} + +\begin{table} +\centering +\small +\caption{Out-of-distribution detection results for image classification.} +\label{tab:imagenet_ood} +\begin{tabular}{lcccccccc} +\toprule +{} & \multicolumn{2}{c}{ImageNet-O} & \multicolumn{2}{c}{ImageNet-A} & \multicolumn{2}{c}{ImageNet-C} & \multicolumn{2}{c}{ImageNet-R} \\ +{} & EoE & RMI & EoE & RMI & EoE & RMI & EoE & RMI \\ +\midrule +Single & 50.7±0.3 & - & 85.8±0.1 & - & 79.9±0.4 & - & 83.0±0.2 & - \\ +Ensemble & 54.6 & 62.7 & 88.8 & 86.7 & 82.0 & 77.5 & 86.1 & 84.1 \\ +Proxy & 54.6 & 62.9 & 88.8 & 86.5 & 82.0 & 77.3 & 86.1 & 84.0 \\ +EnD & 48.4 & - & 87.2 & - & 80.8 & - & 83.9 & - \\ +\Endd & 52.0 & 53.2 & 86.8 & 84.6 & 80.1 & 76.9 & 83.7 & 81.4 \\ +\bottomrule +\end{tabular} +\end{table} + +\begin{figure} + \centering + \includegraphics[width=\textwidth]{figures/breakdown.pdf} + \caption{Performance of image classification models depending on the level of ImageNet-C corruption. No corruption corresponds to the original ImageNet validation data.} + \label{fig:imagenet_breakdown} +\end{figure} + +\subsection{Machine translation} +\label{experiments:nmt} +For this experiment, we train standard Transformer-big~\cite{vaswani2017attention} models on the WMT'17 English-German machine translation dataset with the vocabulary of 40,000 Byte-Pair Encoding tokens~\cite{sennrich-etal-2016-neural}. Each of the 10 ensemble members is trained with the setup described in~\cite{ott2018scaling}: in particular, we train them for 193,000 steps with Adam~\cite{adam} on 8 NVIDIA V100 GPUs with a batch size of 4096 tokens per GPU. We train all distillation models for 20,000 steps with the increased batch size of 32K tokens. Because our approach requires fitting all 10 ensemble members in GPU memory, we reduce the immediate batch size for each step to 1024, but compensate for it with gradient accumulation over 32 steps. For output generation and estimation of uncertainty measures (where applicable), we use beam search with beam size 5. + +To compare the approaches in terms of translation quality, we use the BLEU score~\cite{papineni2002bleu} computed with SacreBLEU~\cite{sacrebleu} and sequence-level Prediction Rejection Ratio~\cite{malinin-thesis} on the newstest14 English-German test set. For out-of-distribution detection, we also compute ROC AUC and use several datasets with different characteristics and degrees of domain shift: sentences with permuted tokens in the input, LibriSpeech~\cite{librispeech} test-clean speech transcriptions, and source sentences from newstest14 in German and French languages respectively. We average the results of both distillation methods over 5 random seeds and provide standard deviations of all metrics. 
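A rough sketch of the BLEU part of this evaluation, assuming the sacrebleu Python package and placeholder hypothesis and reference lists (the sentences below are illustrative only, not data from newstest14):

#+begin_src python
import sacrebleu

# placeholder system outputs and one reference stream of matching length
hypotheses = ["Das ist ein Test .", "Ein weiterer Satz ."]
references = [["Das ist ein Test .", "Noch ein Satz ."]]

bleu = sacrebleu.corpus_bleu(hypotheses, references)
print(bleu.score)  # corpus-level BLEU
#+end_src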
+ +\begin{table} +\centering +\small +\caption{Prediction quality results for machine translation.} +\label{tab:wmt_pred} +\begin{tabular}{lccc} +\toprule +{} & BLEU & EoE & RMI \\ +\midrule +Single & 28.8±0.1 & 36.0±1.3 & - \\ +Ensemble & 30.1 & 30.2 & 26.0 \\ +EnD & 29.4±0.1 & 35.6±0.4 & - \\ +\Endd & 29.5±0.1 & 35.9±0.8 & 35.8±0.5 \\ +\bottomrule +\end{tabular} +\end{table} + +\begin{table} +\centering +\small +\caption{Out-of-distribution detection results for machine translation.} +\label{tab:wmt_ood} +\begin{tabular}{lcccccccc} +\toprule +{} & \multicolumn{2}{c}{Permuted} & \multicolumn{2}{c}{Speech} & \multicolumn{2}{c}{German} & \multicolumn{2}{c}{French} \\ +{} & EoE & RMI & EoE & RMI & EoE & RMI & EoE & RMI \\ +\midrule +Single & 80.7±1.5 & - & 73.7±1.2 & - & 32.8±2.8 & - & 27.1±6.3 & - \\ +Ensemble & 83.7 & 97.4 & 67.8 & 73.7 & 39.5 & 82.4 & 25.0 & 73.6 \\ +EnD & 79.5±1.1 & - & 75.9±0.6 & - & 35.4±1.6 & - & 15.6±3.2 & - \\ +\Endd & 78.3±1.6 & 97.1±0.3 & 77.0±0.3 & 78.5±0.2 & 38.3±1.6 & 70.9±0.7 & 15.9±3.0 & 60.1±3.6 \\ +\bottomrule +\end{tabular} +\end{table} + +Table~\ref{tab:wmt_pred} further confirms the findings made in the previous section: \Endd via Dirichlet-Proxy outperforms regular ensemble distillation in terms of translation quality and sequence-level error detection. Furthermore, in Table~\ref{tab:wmt_ood} we see that, compared to image classification, the OOD performance gap between total uncertainty and knowledge uncertainty is significantly larger. This might be explained by a significantly larger output space (40,000 classes instead of 1000) or the sequential nature of NMT predictions: because the model generates candidates in a large output space of all possible sequences, its prediction entropy might be high regardless of presence of a domain shift. 
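The ROC AUC numbers in these tables come, in essence, from treating an uncertainty measure (EoE or RMI) as a score for separating in-domain from shifted or out-of-distribution data. A minimal sketch of that evaluation with scikit-learn, using synthetic scores as placeholders for the real measures:

#+begin_src python
import numpy as np
from sklearn.metrics import roc_auc_score

def ood_detection_auc(in_domain_scores, ood_scores):
    """Uncertainty scores (e.g. EoE or RMI); OOD examples form the positive class."""
    scores = np.concatenate([in_domain_scores, ood_scores])
    targets = np.concatenate([np.zeros(len(in_domain_scores)), np.ones(len(ood_scores))])
    return roc_auc_score(targets, scores)

# toy usage: shifted data should receive higher uncertainty on average
rng = np.random.default_rng(0)
print(ood_detection_auc(rng.normal(1.0, 0.5, 1000), rng.normal(2.0, 0.5, 1000)))
#+end_src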
+ +% \subsection{Ablation study} +% \label{sec:experiments_ablation} + + +% \begin{table}[t] +% \centering +% \begin{tabular}{@{}lllll@{}} +% \toprule +% & & \multicolumn{3}{l}{OOD detection} \\ +% & Accuracy & Imagenet-C & Imagenet-R & Imagenet-A \\ \midrule +% END\textasciicircum{}2 & & & & \\ +% END\textasciicircum{}2+RKL mediator etc & & & & \\ \midrule +% - target smoothing & & & & \\ +% - shifted parametrization & & & & \\ +% RKL -\textgreater Forward KL & & & & \\ \bottomrule +% \end{tabular}% +% \end{table} + +% Several additional modifications are given in the Appendix\ref{TODOappendix_ablation} \ No newline at end of file diff --git a/2024/05/29/papers/2105.06987/figures/Rejection-Curve-oracle.png b/2024/05/29/papers/2105.06987/figures/Rejection-Curve-oracle.png new file mode 100644 index 00000000..3738d41d Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/Rejection-Curve-oracle.png differ diff --git a/2024/05/29/papers/2105.06987/figures/Rejection-Curve-uncertainty.png b/2024/05/29/papers/2105.06987/figures/Rejection-Curve-uncertainty.png new file mode 100644 index 00000000..2b7d6939 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/Rejection-Curve-uncertainty.png differ diff --git a/2024/05/29/papers/2105.06987/figures/algorithm.sty b/2024/05/29/papers/2105.06987/figures/algorithm.sty new file mode 100644 index 00000000..843e3d5b --- /dev/null +++ b/2024/05/29/papers/2105.06987/figures/algorithm.sty @@ -0,0 +1,79 @@ +% ALGORITHM STYLE -- Released 8 April 1996 +% for LaTeX-2e +% Copyright -- 1994 Peter Williams +% E-mail Peter.Williams@dsto.defence.gov.au +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{algorithm} +\typeout{Document Style `algorithm' - floating environment} + +\RequirePackage{float} +\RequirePackage{ifthen} +\newcommand{\ALG@within}{nothing} +\newboolean{ALG@within} +\setboolean{ALG@within}{false} +\newcommand{\ALG@floatstyle}{ruled} +\newcommand{\ALG@name}{Algorithm} +\newcommand{\listalgorithmname}{List of \ALG@name s} + +% Declare Options +% first appearance +\DeclareOption{plain}{ + \renewcommand{\ALG@floatstyle}{plain} +} +\DeclareOption{ruled}{ + \renewcommand{\ALG@floatstyle}{ruled} +} +\DeclareOption{boxed}{ + \renewcommand{\ALG@floatstyle}{boxed} +} +% then numbering convention +\DeclareOption{part}{ + \renewcommand{\ALG@within}{part} + \setboolean{ALG@within}{true} +} +\DeclareOption{chapter}{ + \renewcommand{\ALG@within}{chapter} + \setboolean{ALG@within}{true} +} +\DeclareOption{section}{ + \renewcommand{\ALG@within}{section} + \setboolean{ALG@within}{true} +} +\DeclareOption{subsection}{ + \renewcommand{\ALG@within}{subsection} + \setboolean{ALG@within}{true} +} +\DeclareOption{subsubsection}{ + \renewcommand{\ALG@within}{subsubsection} + \setboolean{ALG@within}{true} +} +\DeclareOption{nothing}{ + \renewcommand{\ALG@within}{nothing} + \setboolean{ALG@within}{true} +} +\DeclareOption*{\edef\ALG@name{\CurrentOption}} + +% ALGORITHM +% +\ProcessOptions +\floatstyle{\ALG@floatstyle} +\ifthenelse{\boolean{ALG@within}}{ + \ifthenelse{\equal{\ALG@within}{part}} + {\newfloat{algorithm}{htbp}{loa}[part]}{} + \ifthenelse{\equal{\ALG@within}{chapter}} + {\newfloat{algorithm}{htbp}{loa}[chapter]}{} + \ifthenelse{\equal{\ALG@within}{section}} + {\newfloat{algorithm}{htbp}{loa}[section]}{} + \ifthenelse{\equal{\ALG@within}{subsection}} + {\newfloat{algorithm}{htbp}{loa}[subsection]}{} + \ifthenelse{\equal{\ALG@within}{subsubsection}} + {\newfloat{algorithm}{htbp}{loa}[subsubsection]}{} + \ifthenelse{\equal{\ALG@within}{nothing}} + 
{\newfloat{algorithm}{htbp}{loa}}{} +}{ + \newfloat{algorithm}{htbp}{loa} +} +\floatname{algorithm}{\ALG@name} + +\newcommand{\listofalgorithms}{\listof{algorithm}{\listalgorithmname}} + diff --git a/2024/05/29/papers/2105.06987/figures/algorithmic.sty b/2024/05/29/papers/2105.06987/figures/algorithmic.sty new file mode 100644 index 00000000..ad614783 --- /dev/null +++ b/2024/05/29/papers/2105.06987/figures/algorithmic.sty @@ -0,0 +1,201 @@ +% ALGORITHMIC STYLE -- Released 8 APRIL 1996 +% for LaTeX version 2e +% Copyright -- 1994 Peter Williams +% E-mail PeterWilliams@dsto.defence.gov.au +% +% Modified by Alex Smola (08/2000) +% E-mail Alex.Smola@anu.edu.au +% +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{algorithmic} +\typeout{Document Style `algorithmic' - environment} +% +\RequirePackage{ifthen} +\RequirePackage{calc} +\newboolean{ALC@noend} +\setboolean{ALC@noend}{false} +\newcounter{ALC@line} +\newcounter{ALC@rem} +\newlength{\ALC@tlm} +% +\DeclareOption{noend}{\setboolean{ALC@noend}{true}} +% +\ProcessOptions +% +% ALGORITHMIC +\newcommand{\algorithmicrequire}{\textbf{Require:}} +\newcommand{\algorithmicensure}{\textbf{Ensure:}} +\newcommand{\algorithmiccomment}[1]{\{#1\}} +\newcommand{\algorithmicend}{\textbf{end}} +\newcommand{\algorithmicif}{\textbf{if}} +\newcommand{\algorithmicthen}{\textbf{then}} +\newcommand{\algorithmicelse}{\textbf{else}} +\newcommand{\algorithmicelsif}{\algorithmicelse\ \algorithmicif} +\newcommand{\algorithmicendif}{\algorithmicend\ \algorithmicif} +\newcommand{\algorithmicfor}{\textbf{for}} +\newcommand{\algorithmicforall}{\textbf{for all}} +\newcommand{\algorithmicdo}{\textbf{do}} +\newcommand{\algorithmicendfor}{\algorithmicend\ \algorithmicfor} +\newcommand{\algorithmicwhile}{\textbf{while}} +\newcommand{\algorithmicendwhile}{\algorithmicend\ \algorithmicwhile} +\newcommand{\algorithmicloop}{\textbf{loop}} +\newcommand{\algorithmicendloop}{\algorithmicend\ \algorithmicloop} +\newcommand{\algorithmicrepeat}{\textbf{repeat}} +\newcommand{\algorithmicuntil}{\textbf{until}} + +%changed by alex smola +\newcommand{\algorithmicinput}{\textbf{input}} +\newcommand{\algorithmicoutput}{\textbf{output}} +\newcommand{\algorithmicset}{\textbf{set}} +\newcommand{\algorithmictrue}{\textbf{true}} +\newcommand{\algorithmicfalse}{\textbf{false}} +\newcommand{\algorithmicand}{\textbf{and\ }} +\newcommand{\algorithmicor}{\textbf{or\ }} +\newcommand{\algorithmicfunction}{\textbf{function}} +\newcommand{\algorithmicendfunction}{\algorithmicend\ \algorithmicfunction} +\newcommand{\algorithmicmain}{\textbf{main}} +\newcommand{\algorithmicendmain}{\algorithmicend\ \algorithmicmain} +%end changed by alex smola + +\def\ALC@item[#1]{% +\if@noparitem \@donoparitem + \else \if@inlabel \indent \par \fi + \ifhmode \unskip\unskip \par \fi + \if@newlist \if@nobreak \@nbitem \else + \addpenalty\@beginparpenalty + \addvspace\@topsep \addvspace{-\parskip}\fi + \else \addpenalty\@itempenalty \addvspace\itemsep + \fi + \global\@inlabeltrue +\fi +\everypar{\global\@minipagefalse\global\@newlistfalse + \if@inlabel\global\@inlabelfalse \hskip -\parindent \box\@labels + \penalty\z@ \fi + \everypar{}}\global\@nobreakfalse +\if@noitemarg \@noitemargfalse \if@nmbrlist \refstepcounter{\@listctr}\fi \fi +\sbox\@tempboxa{\makelabel{#1}}% +\global\setbox\@labels + \hbox{\unhbox\@labels \hskip \itemindent + \hskip -\labelwidth \hskip -\ALC@tlm + \ifdim \wd\@tempboxa >\labelwidth + \box\@tempboxa + \else \hbox to\labelwidth {\unhbox\@tempboxa}\fi + \hskip \ALC@tlm}\ignorespaces} +% 
+\newenvironment{algorithmic}[1][0]{ +\let\@item\ALC@item + \newcommand{\ALC@lno}{% +\ifthenelse{\equal{\arabic{ALC@rem}}{0}} +{{\footnotesize \arabic{ALC@line}:}}{}% +} +\let\@listii\@listi +\let\@listiii\@listi +\let\@listiv\@listi +\let\@listv\@listi +\let\@listvi\@listi +\let\@listvii\@listi + \newenvironment{ALC@g}{ + \begin{list}{\ALC@lno}{ \itemsep\z@ \itemindent\z@ + \listparindent\z@ \rightmargin\z@ + \topsep\z@ \partopsep\z@ \parskip\z@\parsep\z@ + \leftmargin 1em + \addtolength{\ALC@tlm}{\leftmargin} + } + } + {\end{list}} + \newcommand{\ALC@it}{\addtocounter{ALC@line}{1}\addtocounter{ALC@rem}{1}\ifthenelse{\equal{\arabic{ALC@rem}}{#1}}{\setcounter{ALC@rem}{0}}{}\item} + \newcommand{\ALC@com}[1]{\ifthenelse{\equal{##1}{default}}% +{}{\ \algorithmiccomment{##1}}} + \newcommand{\REQUIRE}{\item[\algorithmicrequire]} + \newcommand{\ENSURE}{\item[\algorithmicensure]} + \newcommand{\STATE}{\ALC@it} + \newcommand{\COMMENT}[1]{\algorithmiccomment{##1}} +%changes by alex smola + \newcommand{\INPUT}{\item[\algorithmicinput]} + \newcommand{\OUTPUT}{\item[\algorithmicoutput]} + \newcommand{\SET}{\item[\algorithmicset]} +% \newcommand{\TRUE}{\algorithmictrue} +% \newcommand{\FALSE}{\algorithmicfalse} + \newcommand{\AND}{\algorithmicand} + \newcommand{\OR}{\algorithmicor} + \newenvironment{ALC@func}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@main}{\begin{ALC@g}}{\end{ALC@g}} +%end changes by alex smola + \newenvironment{ALC@if}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@for}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@whl}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@loop}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@rpt}{\begin{ALC@g}}{\end{ALC@g}} + \renewcommand{\\}{\@centercr} + \newcommand{\IF}[2][default]{\ALC@it\algorithmicif\ ##2\ \algorithmicthen% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\SHORTIF}[2]{\ALC@it\algorithmicif\ ##1\ + \algorithmicthen\ {##2}} + \newcommand{\ELSE}[1][default]{\end{ALC@if}\ALC@it\algorithmicelse% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\ELSIF}[2][default]% +{\end{ALC@if}\ALC@it\algorithmicelsif\ ##2\ \algorithmicthen% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\FOR}[2][default]{\ALC@it\algorithmicfor\ ##2\ \algorithmicdo% +\ALC@com{##1}\begin{ALC@for}} + \newcommand{\FORALL}[2][default]{\ALC@it\algorithmicforall\ ##2\ % +\algorithmicdo% +\ALC@com{##1}\begin{ALC@for}} + \newcommand{\SHORTFORALL}[2]{\ALC@it\algorithmicforall\ ##1\ % + \algorithmicdo\ {##2}} + \newcommand{\WHILE}[2][default]{\ALC@it\algorithmicwhile\ ##2\ % +\algorithmicdo% +\ALC@com{##1}\begin{ALC@whl}} + \newcommand{\LOOP}[1][default]{\ALC@it\algorithmicloop% +\ALC@com{##1}\begin{ALC@loop}} +%changed by alex smola + \newcommand{\FUNCTION}[2][default]{\ALC@it\algorithmicfunction\ ##2\ % + \ALC@com{##1}\begin{ALC@func}} + \newcommand{\MAIN}[2][default]{\ALC@it\algorithmicmain\ ##2\ % + \ALC@com{##1}\begin{ALC@main}} +%end changed by alex smola + \newcommand{\REPEAT}[1][default]{\ALC@it\algorithmicrepeat% + \ALC@com{##1}\begin{ALC@rpt}} + \newcommand{\UNTIL}[1]{\end{ALC@rpt}\ALC@it\algorithmicuntil\ ##1} + \ifthenelse{\boolean{ALC@noend}}{ + \newcommand{\ENDIF}{\end{ALC@if}} + \newcommand{\ENDFOR}{\end{ALC@for}} + \newcommand{\ENDWHILE}{\end{ALC@whl}} + \newcommand{\ENDLOOP}{\end{ALC@loop}} + \newcommand{\ENDFUNCTION}{\end{ALC@func}} + \newcommand{\ENDMAIN}{\end{ALC@main}} + }{ + \newcommand{\ENDIF}{\end{ALC@if}\ALC@it\algorithmicendif} + \newcommand{\ENDFOR}{\end{ALC@for}\ALC@it\algorithmicendfor} + 
\newcommand{\ENDWHILE}{\end{ALC@whl}\ALC@it\algorithmicendwhile} + \newcommand{\ENDLOOP}{\end{ALC@loop}\ALC@it\algorithmicendloop} + \newcommand{\ENDFUNCTION}{\end{ALC@func}\ALC@it\algorithmicendfunction} + \newcommand{\ENDMAIN}{\end{ALC@main}\ALC@it\algorithmicendmain} + } + \renewcommand{\@toodeep}{} + \begin{list}{\ALC@lno}{\setcounter{ALC@line}{0}\setcounter{ALC@rem}{0}% + \itemsep\z@ \itemindent\z@ \listparindent\z@% + \partopsep\z@ \parskip\z@ \parsep\z@% + \labelsep 0.5em \topsep 0.2em% + \ifthenelse{\equal{#1}{0}} + {\labelwidth 0.5em } + {\labelwidth 1.2em } + \leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep} + \ALC@tlm\labelsep + } + } + {\end{list}} + + + + + + + + + + + + + + diff --git a/2024/05/29/papers/2105.06987/figures/breakdown.pdf b/2024/05/29/papers/2105.06987/figures/breakdown.pdf new file mode 100644 index 00000000..23034990 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/breakdown.pdf differ diff --git a/2024/05/29/papers/2105.06987/figures/example_paper.aux b/2024/05/29/papers/2105.06987/figures/example_paper.aux new file mode 100644 index 00000000..d13c9a39 --- /dev/null +++ b/2024/05/29/papers/2105.06987/figures/example_paper.aux @@ -0,0 +1,48 @@ +\relax +\providecommand\hyper@newdestlabel[2]{} +\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} +\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined +\global\let\oldcontentsline\contentsline +\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} +\global\let\oldnewlabel\newlabel +\gdef\newlabel#1#2{\newlabelxx{#1}#2} +\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} +\AtEndDocument{\ifx\hyper@anchor\@undefined +\let\contentsline\oldcontentsline +\let\newlabel\oldnewlabel +\fi} +\fi} +\global\let\hyper@last\relax +\gdef\HyperFirstAtBeginDocument#1{#1} +\providecommand\HyField@AuxAddToFields[1]{} +\providecommand\HyField@AuxAddToCoFields[2]{} +\newlabel{submission}{{1}{1}{}{section.1}{}} +\citation{langley00} +\citation{anonymous} +\newlabel{author info}{{2.3}{2}{}{subsection.2.3}{}} +\newlabel{final author}{{2.3.2}{3}{}{subsubsection.2.3.2}{}} +\citation{Samuel59} +\citation{Samuel59} +\citation{kearns89,Samuel59,mitchell80} +\citation{MachineLearningI} +\citation{Samuel59} +\citation{langley00} +\citation{Newell81} +\citation{DudaHart2nd} +\citation{MachineLearningI} +\citation{mitchell80} +\citation{kearns89} +\newlabel{icml-historical}{{1}{4}{Historical locations and number of accepted papers for International Machine Learning Conferences (ICML 1993 -- ICML 2008) and International Workshops on Machine Learning (ML 1988 -- ML 1992). 
At the time this figure was produced, the number of accepted papers for ICML 2008 was unknown and instead estimated}{figure.1}{}} +\newlabel{alg:example}{{1}{4}{}{algorithm.1}{}} +\newlabel{sample-table}{{1}{4}{Classification accuracies for naive Bayes and flexible Bayes on various data sets}{table.1}{}} +\citation{langley00} +\bibdata{example_paper} +\bibcite{anonymous}{{1}{2021}{{Author}}{{}}} +\bibcite{DudaHart2nd}{{2}{2000}{{Duda et~al.}}{{Duda, Hart, and Stork}}} +\bibcite{kearns89}{{3}{1989}{{Kearns}}{{}}} +\bibcite{langley00}{{4}{2000}{{Langley}}{{}}} +\bibcite{MachineLearningI}{{5}{1983}{{Michalski et~al.}}{{Michalski, Carbonell, and Mitchell}}} +\bibcite{mitchell80}{{6}{1980}{{Mitchell}}{{}}} +\bibcite{Newell81}{{7}{1981}{{Newell \& Rosenbloom}}{{Newell and Rosenbloom}}} +\bibcite{Samuel59}{{8}{1959}{{Samuel}}{{}}} +\bibstyle{icml2021} diff --git a/2024/05/29/papers/2105.06987/figures/example_paper.blg b/2024/05/29/papers/2105.06987/figures/example_paper.blg new file mode 100644 index 00000000..0dc0330a --- /dev/null +++ b/2024/05/29/papers/2105.06987/figures/example_paper.blg @@ -0,0 +1,46 @@ +This is BibTeX, Version 0.99d (TeX Live 2015) +Capacity: max_strings=35307, hash_size=35307, hash_prime=30011 +The top-level auxiliary file: example_paper.aux +The style file: icml2021.bst +Database file #1: example_paper.bib +You've used 8 entries, + 2773 wiz_defined-function locations, + 645 strings with 5915 characters, +and the built_in function-call counts, 3248 in all, are: += -- 293 +> -- 140 +< -- 9 ++ -- 49 +- -- 41 +* -- 223 +:= -- 507 +add.period$ -- 25 +call.type$ -- 8 +change.case$ -- 36 +chr.to.int$ -- 8 +cite$ -- 16 +duplicate$ -- 174 +empty$ -- 295 +format.name$ -- 51 +if$ -- 691 +int.to.chr$ -- 1 +int.to.str$ -- 1 +missing$ -- 6 +newline$ -- 47 +num.names$ -- 37 +pop$ -- 81 +preamble$ -- 1 +purify$ -- 29 +quote$ -- 0 +skip$ -- 127 +stack$ -- 0 +substring$ -- 100 +swap$ -- 24 +text.length$ -- 3 +text.prefix$ -- 0 +top$ -- 0 +type$ -- 78 +warning$ -- 0 +while$ -- 34 +width$ -- 0 +write$ -- 113 diff --git a/2024/05/29/papers/2105.06987/figures/example_paper.log b/2024/05/29/papers/2105.06987/figures/example_paper.log new file mode 100644 index 00000000..2009e9d1 --- /dev/null +++ b/2024/05/29/papers/2105.06987/figures/example_paper.log @@ -0,0 +1,581 @@ +This is pdfTeX, Version 3.14159265-2.6-1.40.16 (TeX Live 2015) (preloaded format=pdflatex 2015.5.24) 30 DEC 2020 09:51 +entering extended mode + restricted \write18 enabled. + %&-line parsing enabled. +**example_paper.tex +(./example_paper.tex +LaTeX2e <2015/01/01> +Babel <3.9l> and hyphenation patterns for 79 languages loaded. +(/usr/local/texlive/2015/texmf-dist/tex/latex/base/article.cls +Document Class: article 2014/09/29 v1.4h Standard LaTeX document class +(/usr/local/texlive/2015/texmf-dist/tex/latex/base/size10.clo +File: size10.clo 2014/09/29 v1.4h Standard LaTeX file (size option) +) +\c@part=\count79 +\c@section=\count80 +\c@subsection=\count81 +\c@subsubsection=\count82 +\c@paragraph=\count83 +\c@subparagraph=\count84 +\c@figure=\count85 +\c@table=\count86 +\abovecaptionskip=\skip41 +\belowcaptionskip=\skip42 +\bibindent=\dimen102 +) +(/usr/local/texlive/2015/texmf-dist/tex/latex/microtype/microtype.sty +Package: microtype 2013/05/23 v2.5a Micro-typographical refinements (RS) + +(/usr/local/texlive/2015/texmf-dist/tex/latex/graphics/keyval.sty +Package: keyval 2014/10/28 v1.15 key=value parser (DPC) +\KV@toks@=\toks14 +) +\MT@toks=\toks15 +\MT@count=\count87 +LaTeX Info: Redefining \textls on input line 766. 
+\MT@outer@kern=\dimen103 +LaTeX Info: Redefining \textmicrotypecontext on input line 1285. +\MT@listname@count=\count88 + +(/usr/local/texlive/2015/texmf-dist/tex/latex/microtype/microtype-pdftex.def +File: microtype-pdftex.def 2013/05/23 v2.5a Definitions specific to pdftex (RS) + +LaTeX Info: Redefining \lsstyle on input line 915. +LaTeX Info: Redefining \lslig on input line 915. +\MT@outer@space=\skip43 +) +Package microtype Info: Loading configuration file microtype.cfg. + +(/usr/local/texlive/2015/texmf-dist/tex/latex/microtype/microtype.cfg +File: microtype.cfg 2013/05/23 v2.5a microtype main configuration file (RS) +)) +(/usr/local/texlive/2015/texmf-dist/tex/latex/graphics/graphicx.sty +Package: graphicx 2014/10/28 v1.0g Enhanced LaTeX Graphics (DPC,SPQR) + +(/usr/local/texlive/2015/texmf-dist/tex/latex/graphics/graphics.sty +Package: graphics 2014/10/28 v1.0p Standard LaTeX Graphics (DPC,SPQR) + +(/usr/local/texlive/2015/texmf-dist/tex/latex/graphics/trig.sty +Package: trig 1999/03/16 v1.09 sin cos tan (DPC) +) +(/usr/local/texlive/2015/texmf-dist/tex/latex/latexconfig/graphics.cfg +File: graphics.cfg 2010/04/23 v1.9 graphics configuration of TeX Live +) +Package graphics Info: Driver file: pdftex.def on input line 94. + +(/usr/local/texlive/2015/texmf-dist/tex/latex/pdftex-def/pdftex.def +File: pdftex.def 2011/05/27 v0.06d Graphics/color for pdfTeX + +(/usr/local/texlive/2015/texmf-dist/tex/generic/oberdiek/infwarerr.sty +Package: infwarerr 2010/04/08 v1.3 Providing info/warning/error messages (HO) +) +(/usr/local/texlive/2015/texmf-dist/tex/generic/oberdiek/ltxcmds.sty +Package: ltxcmds 2011/11/09 v1.22 LaTeX kernel commands for general use (HO) +) +\Gread@gobject=\count89 +)) +\Gin@req@height=\dimen104 +\Gin@req@width=\dimen105 +) +(/usr/local/texlive/2015/texmf-dist/tex/latex/subfigure/subfigure.sty +Package: subfigure 2002/03/15 v2.1.5 subfigure package +\subfigtopskip=\skip44 +\subfigcapskip=\skip45 +\subfigcaptopadj=\dimen106 +\subfigbottomskip=\skip46 +\subfigcapmargin=\dimen107 +\subfiglabelskip=\skip47 +\c@subfigure=\count90 +\c@lofdepth=\count91 +\c@subtable=\count92 +\c@lotdepth=\count93 + +**************************************** +* Local config file subfigure.cfg used * +**************************************** +(/usr/local/texlive/2015/texmf-dist/tex/latex/subfigure/subfigure.cfg) +\subfig@top=\skip48 +\subfig@bottom=\skip49 +) +(/usr/local/texlive/2015/texmf-dist/tex/latex/booktabs/booktabs.sty +Package: booktabs 2005/04/14 v1.61803 publication quality tables +\heavyrulewidth=\dimen108 +\lightrulewidth=\dimen109 +\cmidrulewidth=\dimen110 +\belowrulesep=\dimen111 +\belowbottomsep=\dimen112 +\aboverulesep=\dimen113 +\abovetopsep=\dimen114 +\cmidrulesep=\dimen115 +\cmidrulekern=\dimen116 +\defaultaddspace=\dimen117 +\@cmidla=\count94 +\@cmidlb=\count95 +\@aboverulesep=\dimen118 +\@belowrulesep=\dimen119 +\@thisruleclass=\count96 +\@lastruleclass=\count97 +\@thisrulewidth=\dimen120 +) +(/usr/local/texlive/2015/texmf-dist/tex/latex/hyperref/hyperref.sty +Package: hyperref 2012/11/06 v6.83m Hypertext links for LaTeX + +(/usr/local/texlive/2015/texmf-dist/tex/generic/oberdiek/hobsub-hyperref.sty +Package: hobsub-hyperref 2012/05/28 v1.13 Bundle oberdiek, subset hyperref (HO) + + +(/usr/local/texlive/2015/texmf-dist/tex/generic/oberdiek/hobsub-generic.sty +Package: hobsub-generic 2012/05/28 v1.13 Bundle oberdiek, subset generic (HO) +Package: hobsub 2012/05/28 v1.13 Construct package bundles (HO) +Package hobsub Info: Skipping package `infwarerr' (already loaded). 
+Package hobsub Info: Skipping package `ltxcmds' (already loaded). +Package: ifluatex 2010/03/01 v1.3 Provides the ifluatex switch (HO) +Package ifluatex Info: LuaTeX not detected. +Package: ifvtex 2010/03/01 v1.5 Detect VTeX and its facilities (HO) +Package ifvtex Info: VTeX not detected. +Package: intcalc 2007/09/27 v1.1 Expandable calculations with integers (HO) +Package: ifpdf 2011/01/30 v2.3 Provides the ifpdf switch (HO) +Package ifpdf Info: pdfTeX in PDF mode is detected. +Package: etexcmds 2011/02/16 v1.5 Avoid name clashes with e-TeX commands (HO) +Package etexcmds Info: Could not find \expanded. +(etexcmds) That can mean that you are not using pdfTeX 1.50 or +(etexcmds) that some package has redefined \expanded. +(etexcmds) In the latter case, load this package earlier. +Package: kvsetkeys 2012/04/25 v1.16 Key value parser (HO) +Package: kvdefinekeys 2011/04/07 v1.3 Define keys (HO) +Package: pdftexcmds 2011/11/29 v0.20 Utility functions of pdfTeX for LuaTeX (HO +) +Package pdftexcmds Info: LuaTeX not detected. +Package pdftexcmds Info: \pdf@primitive is available. +Package pdftexcmds Info: \pdf@ifprimitive is available. +Package pdftexcmds Info: \pdfdraftmode found. +Package: pdfescape 2011/11/25 v1.13 Implements pdfTeX's escape features (HO) +Package: bigintcalc 2012/04/08 v1.3 Expandable calculations on big integers (HO +) +Package: bitset 2011/01/30 v1.1 Handle bit-vector datatype (HO) +Package: uniquecounter 2011/01/30 v1.2 Provide unlimited unique counter (HO) +) +Package hobsub Info: Skipping package `hobsub' (already loaded). +Package: letltxmacro 2010/09/02 v1.4 Let assignment for LaTeX macros (HO) +Package: hopatch 2012/05/28 v1.2 Wrapper for package hooks (HO) +Package: xcolor-patch 2011/01/30 xcolor patch +Package: atveryend 2011/06/30 v1.8 Hooks at the very end of document (HO) +Package atveryend Info: \enddocument detected (standard20110627). +Package: atbegshi 2011/10/05 v1.16 At begin shipout hook (HO) +Package: refcount 2011/10/16 v3.4 Data extraction from label references (HO) +Package: hycolor 2011/01/30 v1.7 Color options for hyperref/bookmark (HO) +) +(/usr/local/texlive/2015/texmf-dist/tex/generic/ifxetex/ifxetex.sty +Package: ifxetex 2010/09/12 v0.6 Provides ifxetex conditional +) +(/usr/local/texlive/2015/texmf-dist/tex/latex/oberdiek/auxhook.sty +Package: auxhook 2011/03/04 v1.3 Hooks for auxiliary files (HO) +) +(/usr/local/texlive/2015/texmf-dist/tex/latex/oberdiek/kvoptions.sty +Package: kvoptions 2011/06/30 v3.11 Key value format for package options (HO) +) +\@linkdim=\dimen121 +\Hy@linkcounter=\count98 +\Hy@pagecounter=\count99 + +(/usr/local/texlive/2015/texmf-dist/tex/latex/hyperref/pd1enc.def +File: pd1enc.def 2012/11/06 v6.83m Hyperref: PDFDocEncoding definition (HO) +) +\Hy@SavedSpaceFactor=\count100 + +(/usr/local/texlive/2015/texmf-dist/tex/latex/latexconfig/hyperref.cfg +File: hyperref.cfg 2002/06/06 v1.2 hyperref configuration of TeXLive +) +Package hyperref Info: Hyper figures OFF on input line 4443. +Package hyperref Info: Link nesting OFF on input line 4448. +Package hyperref Info: Hyper index ON on input line 4451. +Package hyperref Info: Plain pages OFF on input line 4458. +Package hyperref Info: Backreferencing OFF on input line 4463. +Package hyperref Info: Implicit mode ON; LaTeX internals redefined. +Package hyperref Info: Bookmarks ON on input line 4688. +\c@Hy@tempcnt=\count101 + +(/usr/local/texlive/2015/texmf-dist/tex/latex/url/url.sty +\Urlmuskip=\muskip10 +Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc. 
+) +LaTeX Info: Redefining \url on input line 5041. +\XeTeXLinkMargin=\dimen122 +\Fld@menulength=\count102 +\Field@Width=\dimen123 +\Fld@charsize=\dimen124 +Package hyperref Info: Hyper figures OFF on input line 6295. +Package hyperref Info: Link nesting OFF on input line 6300. +Package hyperref Info: Hyper index ON on input line 6303. +Package hyperref Info: backreferencing OFF on input line 6310. +Package hyperref Info: Link coloring OFF on input line 6315. +Package hyperref Info: Link coloring with OCG OFF on input line 6320. +Package hyperref Info: PDF/A mode OFF on input line 6325. +LaTeX Info: Redefining \ref on input line 6365. +LaTeX Info: Redefining \pageref on input line 6369. +\Hy@abspage=\count103 +\c@Item=\count104 +\c@Hfootnote=\count105 +) + +Package hyperref Message: Driver (autodetected): hpdftex. + +(/usr/local/texlive/2015/texmf-dist/tex/latex/hyperref/hpdftex.def +File: hpdftex.def 2012/11/06 v6.83m Hyperref driver for pdfTeX +\Fld@listcount=\count106 +\c@bookmark@seq@number=\count107 + +(/usr/local/texlive/2015/texmf-dist/tex/latex/oberdiek/rerunfilecheck.sty +Package: rerunfilecheck 2011/04/15 v1.7 Rerun checks for auxiliary files (HO) +Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 2 +82. +) +\Hy@SectionHShift=\skip50 +) +(./icml2021.sty +Package: icml2021 2020/11/18 v2.0 ICML Conference Style File + (/usr/local/texlive/2015/texmf-dist/tex/latex/psnfss/times.sty +Package: times 2005/04/12 PSNFSS-v9.2a (SPQR) +) (./fancyhdr.sty +\fancy@headwidth=\skip51 +\f@ncyO@elh=\skip52 +\f@ncyO@erh=\skip53 +\f@ncyO@olh=\skip54 +\f@ncyO@orh=\skip55 +\f@ncyO@elf=\skip56 +\f@ncyO@erf=\skip57 +\f@ncyO@olf=\skip58 +\f@ncyO@orf=\skip59 +) +(/usr/local/texlive/2015/texmf-dist/tex/latex/graphics/color.sty +Package: color 2014/10/28 v1.1a Standard LaTeX Color (DPC) + +(/usr/local/texlive/2015/texmf-dist/tex/latex/latexconfig/color.cfg +File: color.cfg 2007/01/18 v1.5 color configuration of teTeX/TeXLive +) +Package color Info: Driver file: pdftex.def on input line 142. +) +(./algorithm.sty +Package: algorithm + +Document Style `algorithm' - floating environment +(/usr/local/texlive/2015/texmf-dist/tex/latex/float/float.sty +Package: float 2001/11/08 v1.3d Float enhancements (AL) +\c@float@type=\count108 +\float@exts=\toks16 +\float@box=\box26 +\@float@everytoks=\toks17 +\@floatcapt=\box27 +) +(/usr/local/texlive/2015/texmf-dist/tex/latex/base/ifthen.sty +Package: ifthen 2014/09/29 v1.1c Standard LaTeX ifthen package (DPC) +) +\@float@every@algorithm=\toks18 +\c@algorithm=\count109 +) +(./algorithmic.sty +Package: algorithmic + +Document Style `algorithmic' - environment +(/usr/local/texlive/2015/texmf-dist/tex/latex/tools/calc.sty +Package: calc 2014/10/28 v4.3 Infix arithmetic (KKT,FJ) +\calc@Acount=\count110 +\calc@Bcount=\count111 +\calc@Adimen=\dimen125 +\calc@Bdimen=\dimen126 +\calc@Askip=\skip60 +\calc@Bskip=\skip61 +LaTeX Info: Redefining \setlength on input line 80. +LaTeX Info: Redefining \addtolength on input line 81. +\calc@Ccount=\count112 +\calc@Cskip=\skip62 +) +\c@ALC@line=\count113 +\c@ALC@rem=\count114 +\ALC@tlm=\skip63 +) +(/usr/local/texlive/2015/texmf-dist/tex/latex/natbib/natbib.sty +Package: natbib 2010/09/13 8.31b (PWD, AO) +\bibhang=\skip64 +\bibsep=\skip65 +LaTeX Info: Redefining \cite on input line 694. 
+\c@NAT@ctr=\count115 +) +(/usr/local/texlive/2015/texmf-dist/tex/latex/eso-pic/eso-pic.sty +Package: eso-pic 2015/04/20 v2.0e eso-pic (RN) +) +(/usr/local/texlive/2015/texmf-dist/tex/latex/forloop/forloop.sty +Package: forloop 2006/09/18 v3.0 For Loops for LaTeX +) +Package hyperref Info: Option `colorlinks' set `true' on input line 108. +\titrun=\box28 +\c@@affiliationcounter=\count116 +\c@@affilnum=\count117 +\newcaptionbox=\box29 +\newcaptionboxwid=\dimen127 +\icmlrulerbox=\box30 +\icmlrulercount=\count118 +\icmlruleroffset=\dimen128 +\cv@lineheight=\dimen129 +\cv@boxheight=\dimen130 +\cv@tmpbox=\box31 +\cv@refno=\count119 +\cv@tot=\count120 +\cv@tmpc@=\count121 +\cv@tmpc=\count122 +) +(./example_paper.aux) +\openout1 = `example_paper.aux'. + +LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 30. +LaTeX Font Info: ... okay on input line 30. +LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 30. +LaTeX Font Info: ... okay on input line 30. +LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 30. +LaTeX Font Info: ... okay on input line 30. +LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 30. +LaTeX Font Info: ... okay on input line 30. +LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 30. +LaTeX Font Info: ... okay on input line 30. +LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 30. +LaTeX Font Info: ... okay on input line 30. +LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 30. +LaTeX Font Info: ... okay on input line 30. +LaTeX Font Info: Try loading font information for OT1+ptm on input line 30. + +(/usr/local/texlive/2015/texmf-dist/tex/latex/psnfss/ot1ptm.fd +File: ot1ptm.fd 2001/06/04 font definitions for OT1/ptm. +) +LaTeX Info: Redefining \microtypecontext on input line 30. +Package microtype Info: Generating PDF output. +Package microtype Info: Character protrusion enabled (level 2). +Package microtype Info: Using default protrusion set `alltext'. +Package microtype Info: Automatic font expansion enabled (level 2), +(microtype) stretch: 20, shrink: 20, step: 1, non-selected. +Package microtype Info: Using default expansion set `basictext'. +Package microtype Info: No adjustment of tracking. +Package microtype Info: No adjustment of interword spacing. +Package microtype Info: No adjustment of character kerning. + +(/usr/local/texlive/2015/texmf-dist/tex/latex/microtype/mt-ptm.cfg +File: mt-ptm.cfg 2006/04/20 v1.7 microtype config. file: Times (RS) +) +(/usr/local/texlive/2015/texmf-dist/tex/context/base/supp-pdf.mkii +[Loading MPS to PDF converter (version 2006.09.02).] +\scratchcounter=\count123 +\scratchdimen=\dimen131 +\scratchbox=\box32 +\nofMPsegments=\count124 +\nofMParguments=\count125 +\everyMPshowfont=\toks19 +\MPscratchCnt=\count126 +\MPscratchDim=\dimen132 +\MPnumerator=\count127 +\makeMPintoPDFobject=\count128 +\everyMPtoPDFconversion=\toks20 +) (/usr/local/texlive/2015/texmf-dist/tex/latex/oberdiek/epstopdf-base.sty +Package: epstopdf-base 2010/02/09 v2.5 Base part for package epstopdf + +(/usr/local/texlive/2015/texmf-dist/tex/latex/oberdiek/grfext.sty +Package: grfext 2010/08/19 v1.1 Manage graphics extensions (HO) +) +Package grfext Info: Graphics extension search list: +(grfext) [.png,.pdf,.jpg,.mps,.jpeg,.jbig2,.jb2,.PNG,.PDF,.JPG,.JPE +G,.JBIG2,.JB2,.eps] +(grfext) \AppendGraphicsExtensions on input line 452. 
+ +(/usr/local/texlive/2015/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg +File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Liv +e +)) +\AtBeginShipoutBox=\box33 +Package hyperref Info: Link coloring ON on input line 30. + +(/usr/local/texlive/2015/texmf-dist/tex/latex/hyperref/nameref.sty +Package: nameref 2012/10/27 v2.43 Cross-referencing by name of section + +(/usr/local/texlive/2015/texmf-dist/tex/generic/oberdiek/gettitlestring.sty +Package: gettitlestring 2010/12/03 v1.4 Cleanup title references (HO) +) +\c@section@level=\count129 +) +LaTeX Info: Redefining \ref on input line 30. +LaTeX Info: Redefining \pageref on input line 30. +LaTeX Info: Redefining \nameref on input line 30. + +(./example_paper.out) (./example_paper.out) +\@outlinefile=\write3 +\openout3 = `example_paper.out'. + + + +Package hyperref Warning: Token not allowed in a PDF string (PDFDocEncoding): +(hyperref) removing `\\' on input line 79. + +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <9> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 79. +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <14.4> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 79. +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <10> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 79. +\c@@affil@anon=\count130 +(/usr/local/texlive/2015/texmf-dist/tex/latex/microtype/mt-cmr.cfg +File: mt-cmr.cfg 2013/05/19 v2.2 microtype config. file: Computer Modern Roman +(RS) +) +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <7> on input line 79. +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <5> on input line 79. +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <9> on input line 90. +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <6> on input line 90. + + +Package hyperref Warning: Ignoring empty anchor on input line 90. + +LaTeX Font Info: Try loading font information for OML+ptm on input line 90. +(/usr/local/texlive/2015/texmf-dist/tex/latex/psnfss/omlptm.fd +File: omlptm.fd +) +LaTeX Font Info: Font shape `OML/ptm/m/n' in size <9> not available +(Font) Font shape `OML/cmm/m/it' tried instead on input line 90. +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <12> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 92. +LaTeX Font Info: Try loading font information for OT1+pcr on input line 105. + + +(/usr/local/texlive/2015/texmf-dist/tex/latex/psnfss/ot1pcr.fd +File: ot1pcr.fd 2001/06/04 font definitions for OT1/pcr. +) +LaTeX Font Info: Font shape `OT1/pcr/bx/n' in size <10> not available +(Font) Font shape `OT1/pcr/b/n' tried instead on input line 105. +Package microtype Info: Loading generic settings for font family +(microtype) `pcr' (encoding: OT1). +(microtype) For optimal results, create family-specific settings. +(microtype) See the microtype manual for details. +LaTeX Font Info: Try loading font information for OMS+ptm on input line 111. + + +(/usr/local/texlive/2015/texmf-dist/tex/latex/psnfss/omsptm.fd +File: omsptm.fd +) +LaTeX Font Info: Font shape `OMS/ptm/m/n' in size <10> not available +(Font) Font shape `OMS/cmsy/m/n' tried instead on input line 111. +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <7> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 166. 
+ [1{/usr/local/texlive/2015/texmf-var/fonts/map/pdftex/updmap/pdftex.map} + + + +] +Overfull \hbox (13.52222pt too wide) in paragraph at lines 174--174 +[]\OT1/pcr/m/n/9 dvips -Ppdf -tletter -G0 -o paper.ps paper.dvi[] + [] + +Package microtype Info: Loading generic settings for font family +(microtype) `cmtt' (encoding: OT1). +(microtype) For optimal results, create family-specific settings. +(microtype) See the microtype manual for details. + +Underfull \hbox (badness 1371) in paragraph at lines 208--211 +\OT1/ptm/m/n/10 (+20) oth-ers) is han-dled au-to-mat-i-cally by sim-ply chang-i +ng + [] + + +Underfull \hbox (badness 1755) in paragraph at lines 260--267 +\OT1/ptm/m/n/10 (+20) must not ap-pear. If you are us-ing L[]T[]X and the + [] + + +Underfull \vbox (badness 10000) has occurred while \output is active [] + + [2] +LaTeX Font Info: Font shape `OML/ptm/m/n' in size <10> not available +(Font) Font shape `OML/cmm/m/it' tried instead on input line 308. + +Underfull \hbox (badness 1189) in paragraph at lines 311--315 +[]\OT1/ptm/m/n/10 (+20) A sam-ple file with au-thor names is in-cluded in the + [] + + +Underfull \hbox (badness 1460) in paragraph at lines 318--325 +[]\OT1/ptm/m/n/10 (+20) The pa-per ab-stract should be-gin in the left col-umn, + + [] + + +Underfull \vbox (badness 10000) has occurred while \output is active [] + + + +File: icml_numpapers.pdf Graphic file (type pdf) + +Package pdftex.def Info: icml_numpapers.pdf used on input line 368. +(pdftex.def) Requested size: 234.8775pt x 185.90239pt. + +Underfull \vbox (badness 10000) has occurred while \output is active [] + + [3] +Underfull \vbox (badness 1248) has occurred while \output is active [] + + [4pdfTeX warning (ext4): destination with the same identifier (name{figure.1}) + has been already used, duplicate ignored + +\AtBegShi@Output ...ipout \box \AtBeginShipoutBox + \fi \fi +l.503 + pdfTeX warning (ext4): destination with the same identifier (name{table.1 +}) has been already used, duplicate ignored + +\AtBegShi@Output ...ipout \box \AtBeginShipoutBox + \fi \fi +l.503 + <./icml_numpapers.pdf>] (./example_paper.bbl) +LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <12> not available +(Font) Font shape `OT1/ptm/b/it' tried instead on input line 555. +LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <10> not available +(Font) Font shape `OT1/ptm/b/it' tried instead on input line 557. +Package atveryend Info: Empty hook `BeforeClearDocument' on input line 574. + [5] +Package atveryend Info: Empty hook `AfterLastShipout' on input line 574. + (./example_paper.aux) +Package atveryend Info: Executing hook `AtVeryEndDocument' on input line 574. +Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 574. +Package rerunfilecheck Info: File `example_paper.out' has not changed. +(rerunfilecheck) Checksum: D41D8CD98F00B204E9800998ECF8427E;0. +Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 574. + ) +Here is how much of TeX's memory you used: + 8244 strings out of 493089 + 123729 string characters out of 6134842 + 252821 words of memory out of 5000000 + 11369 multiletter control sequences out of 15000+600000 + 43285 words of font info for 188 fonts, out of 8000000 for 9000 + 1141 hyphenation exceptions out of 8191 + 31i,11n,45p,411b,577s stack positions out of 5000i,500n,10000p,200000b,80000s +{/usr/local/texlive/2015/texmf-dist/fonts/enc/dvips/base/8r.enc} +Output written on example_paper.pdf (5 pages, 189624 bytes). +PDF statistics: + 180 PDF objects out of 1000 (max. 
8388607) + 154 compressed objects within 2 object streams + 40 named destinations out of 1000 (max. 500000) + 40966 words of extra memory for PDF output out of 42996 (max. 10000000) + diff --git a/2024/05/29/papers/2105.06987/figures/grad_ratio_conv.png b/2024/05/29/papers/2105.06987/figures/grad_ratio_conv.png new file mode 100644 index 00000000..59a5d1e4 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/grad_ratio_conv.png differ diff --git a/2024/05/29/papers/2105.06987/figures/grad_ratio_conv_smooth.png b/2024/05/29/papers/2105.06987/figures/grad_ratio_conv_smooth.png new file mode 100644 index 00000000..f9549e07 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/grad_ratio_conv_smooth.png differ diff --git a/2024/05/29/papers/2105.06987/figures/grad_ratio_init.png b/2024/05/29/papers/2105.06987/figures/grad_ratio_init.png new file mode 100644 index 00000000..605505b4 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/grad_ratio_init.png differ diff --git a/2024/05/29/papers/2105.06987/figures/grad_ratio_init_smooth.png b/2024/05/29/papers/2105.06987/figures/grad_ratio_init_smooth.png new file mode 100644 index 00000000..0ca388b4 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/grad_ratio_init_smooth.png differ diff --git a/2024/05/29/papers/2105.06987/figures/grad_ratio_misc.png b/2024/05/29/papers/2105.06987/figures/grad_ratio_misc.png new file mode 100644 index 00000000..feb1e388 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/grad_ratio_misc.png differ diff --git a/2024/05/29/papers/2105.06987/figures/histogram_confidence_asr.png b/2024/05/29/papers/2105.06987/figures/histogram_confidence_asr.png new file mode 100644 index 00000000..3b98ee16 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/histogram_confidence_asr.png differ diff --git a/2024/05/29/papers/2105.06987/figures/histogram_confidence_nmt.png b/2024/05/29/papers/2105.06987/figures/histogram_confidence_nmt.png new file mode 100644 index 00000000..5f994019 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/histogram_confidence_nmt.png differ diff --git a/2024/05/29/papers/2105.06987/figures/naive-distillation-2.png b/2024/05/29/papers/2105.06987/figures/naive-distillation-2.png new file mode 100644 index 00000000..02b14c08 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/naive-distillation-2.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/gnorms.png b/2024/05/29/papers/2105.06987/figures/plots/gnorms.png new file mode 100644 index 00000000..20f8d6a7 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/gnorms.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/precisions.png b/2024/05/29/papers/2105.06987/figures/plots/precisions.png new file mode 100644 index 00000000..5d898ef4 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/precisions.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test12_ens_pred_entropy.png b/2024/05/29/papers/2105.06987/figures/plots/test12_ens_pred_entropy.png new file mode 100644 index 00000000..c07172e8 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/test12_ens_pred_entropy.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test12_ens_pred_precision.png b/2024/05/29/papers/2105.06987/figures/plots/test12_ens_pred_precision.png new file mode 100644 index 00000000..b70128d0 Binary files /dev/null and 
b/2024/05/29/papers/2105.06987/figures/plots/test12_ens_pred_precision.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test12_ens_pred_seq_entropy.png b/2024/05/29/papers/2105.06987/figures/plots/test12_ens_pred_seq_entropy.png new file mode 100644 index 00000000..87895c87 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/test12_ens_pred_seq_entropy.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test12_ens_pred_seq_precision.png b/2024/05/29/papers/2105.06987/figures/plots/test12_ens_pred_seq_precision.png new file mode 100644 index 00000000..cf758d4a Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/test12_ens_pred_seq_precision.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test12_entropy.png b/2024/05/29/papers/2105.06987/figures/plots/test12_entropy.png new file mode 100644 index 00000000..4984696b Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/test12_entropy.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test12_precision.png b/2024/05/29/papers/2105.06987/figures/plots/test12_precision.png new file mode 100644 index 00000000..85e5a17c Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/test12_precision.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test12_seq_entropy.png b/2024/05/29/papers/2105.06987/figures/plots/test12_seq_entropy.png new file mode 100644 index 00000000..92e861c3 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/test12_seq_entropy.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test_ens_pred_entropy.png b/2024/05/29/papers/2105.06987/figures/plots/test_ens_pred_entropy.png new file mode 100644 index 00000000..23f5250c Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/test_ens_pred_entropy.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test_ens_pred_precision.png b/2024/05/29/papers/2105.06987/figures/plots/test_ens_pred_precision.png new file mode 100644 index 00000000..439501c3 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/test_ens_pred_precision.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test_ens_pred_seq_entropy.png b/2024/05/29/papers/2105.06987/figures/plots/test_ens_pred_seq_entropy.png new file mode 100644 index 00000000..c5d70e98 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/test_ens_pred_seq_entropy.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test_ens_pred_seq_precision.png b/2024/05/29/papers/2105.06987/figures/plots/test_ens_pred_seq_precision.png new file mode 100644 index 00000000..2f0d4881 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/test_ens_pred_seq_precision.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test_entropy.png b/2024/05/29/papers/2105.06987/figures/plots/test_entropy.png new file mode 100644 index 00000000..746c14fc Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/test_entropy.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test_precision.png b/2024/05/29/papers/2105.06987/figures/plots/test_precision.png new file mode 100644 index 00000000..acc0a68f Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/test_precision.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test_seq_entropy.png b/2024/05/29/papers/2105.06987/figures/plots/test_seq_entropy.png 
new file mode 100644 index 00000000..c76341ad Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/test_seq_entropy.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/test_seq_precision.png b/2024/05/29/papers/2105.06987/figures/plots/test_seq_precision.png new file mode 100644 index 00000000..2cae0d8f Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/test_seq_precision.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/train_loss.png b/2024/05/29/papers/2105.06987/figures/plots/train_loss.png new file mode 100644 index 00000000..d5c71f5a Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/train_loss.png differ diff --git a/2024/05/29/papers/2105.06987/figures/plots/train_nll.png b/2024/05/29/papers/2105.06987/figures/plots/train_nll.png new file mode 100644 index 00000000..383ba32c Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/plots/train_nll.png differ diff --git a/2024/05/29/papers/2105.06987/figures/proxy_distillation.png b/2024/05/29/papers/2105.06987/figures/proxy_distillation.png new file mode 100644 index 00000000..2d7c4d15 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/proxy_distillation.png differ diff --git a/2024/05/29/papers/2105.06987/figures/seq_reject_ami.png b/2024/05/29/papers/2105.06987/figures/seq_reject_ami.png new file mode 100644 index 00000000..e7089932 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/seq_reject_ami.png differ diff --git a/2024/05/29/papers/2105.06987/figures/seq_reject_deen.png b/2024/05/29/papers/2105.06987/figures/seq_reject_deen.png new file mode 100644 index 00000000..dcb8173a Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/seq_reject_deen.png differ diff --git a/2024/05/29/papers/2105.06987/figures/seq_reject_devc.png b/2024/05/29/papers/2105.06987/figures/seq_reject_devc.png new file mode 100644 index 00000000..ec96ca1c Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/seq_reject_devc.png differ diff --git a/2024/05/29/papers/2105.06987/figures/seq_reject_devo.png b/2024/05/29/papers/2105.06987/figures/seq_reject_devo.png new file mode 100644 index 00000000..e03340fd Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/seq_reject_devo.png differ diff --git a/2024/05/29/papers/2105.06987/figures/seq_reject_ende.png b/2024/05/29/papers/2105.06987/figures/seq_reject_ende.png new file mode 100644 index 00000000..b1042c38 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/seq_reject_ende.png differ diff --git a/2024/05/29/papers/2105.06987/figures/seq_reject_enfr.png b/2024/05/29/papers/2105.06987/figures/seq_reject_enfr.png new file mode 100644 index 00000000..8b5af04a Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/seq_reject_enfr.png differ diff --git a/2024/05/29/papers/2105.06987/figures/seq_reject_fren.png b/2024/05/29/papers/2105.06987/figures/seq_reject_fren.png new file mode 100644 index 00000000..a5d15363 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/seq_reject_fren.png differ diff --git a/2024/05/29/papers/2105.06987/figures/seq_reject_tc.png b/2024/05/29/papers/2105.06987/figures/seq_reject_tc.png new file mode 100644 index 00000000..efa98b5f Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/seq_reject_tc.png differ diff --git a/2024/05/29/papers/2105.06987/figures/seq_reject_to.png b/2024/05/29/papers/2105.06987/figures/seq_reject_to.png new file mode 100644 index 
00000000..342b4805 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/seq_reject_to.png differ diff --git a/2024/05/29/papers/2105.06987/figures/slbeu_hist_deen.png b/2024/05/29/papers/2105.06987/figures/slbeu_hist_deen.png new file mode 100644 index 00000000..83b838e2 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/slbeu_hist_deen.png differ diff --git a/2024/05/29/papers/2105.06987/figures/slbeu_hist_ende.png b/2024/05/29/papers/2105.06987/figures/slbeu_hist_ende.png new file mode 100644 index 00000000..f9817d6a Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/slbeu_hist_ende.png differ diff --git a/2024/05/29/papers/2105.06987/figures/slbeu_hist_enfr.png b/2024/05/29/papers/2105.06987/figures/slbeu_hist_enfr.png new file mode 100644 index 00000000..071296fb Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/slbeu_hist_enfr.png differ diff --git a/2024/05/29/papers/2105.06987/figures/slbeu_hist_fren.png b/2024/05/29/papers/2105.06987/figures/slbeu_hist_fren.png new file mode 100644 index 00000000..598340b8 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/slbeu_hist_fren.png differ diff --git a/2024/05/29/papers/2105.06987/figures/swer_hist.png b/2024/05/29/papers/2105.06987/figures/swer_hist.png new file mode 100644 index 00000000..a00a1223 Binary files /dev/null and b/2024/05/29/papers/2105.06987/figures/swer_hist.png differ diff --git a/2024/05/29/papers/2105.06987/introduction.tex b/2024/05/29/papers/2105.06987/introduction.tex new file mode 100644 index 00000000..c71369c6 --- /dev/null +++ b/2024/05/29/papers/2105.06987/introduction.tex @@ -0,0 +1,29 @@ +\section{Introduction} +\label{sec:introduction} + + +%Ensemble-based uncertainty estimates have been successfully applied to detecting misclassifications, out-of-distribution inputs and adversarial attacks \citep{carlini-detected, gal-adversarial, malinin-rkl-2019} and to active learning~\citep{batchbald}. + +%%% THIS NEEDS MORE DETAIL +Ensembles of machine learning models are known to yield improved predictive performance relative to single models~\cite{dietterich2000ensemble}. With the increasing popularity of neural networks, ensemble methods have been rapidly adopted in numerous sub-fields of machine learning~\cite{ashukha2020pitfalls, trust-uncertainty}. More importantly, \cite{deepensemble2017} demonstrated that although single neural networks are often overconfident in their predictions, their ensembles can output reliable uncertainty estimates. Furthermore, ensembles allow \emph{total uncertainty} to be decomposed into \emph{data} and \emph{knowledge uncertainty}\footnote{Data and Knowledge Uncertainty are also known as Aleatoric and Epistemic uncertainty.}. The former is the intrinsic uncertainty due to class overlap and noise inherent in the data, while the latter is the model's uncertainty due to lack of understanding of the test data~\cite{malinin-thesis}. Estimates of \emph{knowledge uncertainty} are often used to detect anomalous and unfamiliar inputs~\cite{batchbald,gal-adversarial, malinin-rkl-2019, malinin-thesis}. Given the increased usage of deep learning for safety-critical applications such as self-driving cars or medical diagnostics, obtaining reliable uncertainty estimates becomes ever more important each year. + +However, using ensembles for inference can be computationally prohibitive in certain applications. 
Obtaining predictions in real time can often be quite involved even for a single model, and the hardware requirements for serving the ensemble of neural networks scale linearly with its size. As a result, over the past several years the area of ensemble distillation has gained increasing attention from the research community. Broadly speaking, distillation methods aim to train a single model which can approximate the behavior of the ensemble sufficiently well.
+
+In the simplest and most frequently used form of distillation \cite{hinton2015distilling}, the student model is trained to capture the average prediction of the ensemble: for example, in the case of classification this reduces to minimizing the KL-divergence between the model and the ensemble mean. While this method allows the student to obtain predictive performance comparable to that of the original ensemble, the information about its distributional properties (in other words, its diversity) is lost in the process. As ensemble-based uncertainty estimation methods often utilize the information about disagreement between its members, such distillation approaches achieve only one of two favorable ensemble properties which we would like to preserve.
+
+% Ensemble Distribution Distillation is a general process. In the original paper we did EnDD via Dirichlet max likelihood.
+
+%%%% Need a clear contrast with Hydra and so on and a clearer statement
+Recently, several works have proposed distillation procedures that capture information about both the mean as well as the distribution of ensemble predictions within a single model~\cite{malinin-endd-2019,hydra,mdd,malinin2020regression}. We will broadly refer to this class of distillation approaches as \emph{Ensemble Distribution Distillation} (\Endd). Ensemble Distribution Distillation offers a straightforward way to model the ensemble predictions~\cite{malinin-endd-2019,malinin2020regression}. Outputs of each member are viewed as samples from a higher-order Dirichlet or Normal-Wishart distribution, and the student model attempts to learn the parameters of that distribution. Typically, \Endd is done by maximizing the likelihood of the ensemble's output distributions under the Dirichlet or Normal-Wishart Prior. While theoretically sound, for large-scale classification tasks with many classes, gradient-based optimization of this criterion is highly problematic, which limits its usefulness in real-life production scenarios.
+
+In this work, we investigate the poor convergence of models trained with Ensemble Distribution Distillation at scale. We analyze the Dirichlet log-likelihood criterion and show that it leads to high gradient norm values that affect the optimization procedure. Specifically, if a particular ensemble member's output distribution has most probability mass allocated to a few classes, with the remainder spread among a long tail of exponentially less-probable classes, then the gradients associated with the tail-classes will be significantly larger than those associated with high-probability classes. As a result, the model focuses on modelling the distribution of tail-class probabilities.
+
+To solve this, we propose to transform the empirical distribution of ensemble member predictions into a \emph{Proxy-target} Dirichlet distribution with the same statistics and to use this distribution as the target during distillation.
Furthermore, we show that it is crucial to minimize the \emph{reverse} KL-divergence between the model and the Proxy-Dirichlet, as minimizing the \emph{forward} KL-divergence exacerbates the optimization issues. The proposed training procedure allows the model to converge, mitigating the issue of gradient explosion. We demonstrate this by distribution-distilling ensembles of models trained on both the ImageNet classification and WMT17 English-German language translation datasets, where there are 1000 and 40,000 classes, respectively. On both datasets the distribution-distilled models outperform models trained from scratch and yield uncertainty estimates competitive with those of the original ensemble.
+
+Thus, our contributions are as follows:
+\begin{itemize}
+    \item We analyze the issues of the Dirichlet distribution likelihood when applied to a large number of classes and confident predictions
+    \item We propose several improvements to the {Ensemble Distribution Distillation} framework, each of them arising from the Dirichlet distribution properties in the context of deep learning
+    \item We adapt \emph{Ensemble Distribution Distillation} to auto-regressive models and propose {Sequence Ensemble-Distribution Distillation} (SEnD$^2$)
+    \item We examine and propose solutions for a range of technical challenges associated with scaling {Ensemble Distribution Distillation} to large output spaces.
+\end{itemize}
\ No newline at end of file
diff --git a/2024/05/29/papers/2105.06987/main.bbl b/2024/05/29/papers/2105.06987/main.bbl
new file mode 100644
index 00000000..e19f56e5
--- /dev/null
+++ b/2024/05/29/papers/2105.06987/main.bbl
@@ -0,0 +1,191 @@
+\begin{thebibliography}{10}
+
+\bibitem{dietterich2000ensemble}
+Thomas~G. Dietterich,
+\newblock ``Ensemble methods in machine learning,''
+\newblock in {\em Proceedings of the First International Workshop on Multiple
+  Classifier Systems}, Berlin, Heidelberg, 2000, MCS '00, p. 1–15,
+  Springer-Verlag.
+
+\bibitem{ashukha2020pitfalls}
+Arsenii Ashukha, Alexander Lyzhov, Dmitry Molchanov, and Dmitry Vetrov,
+\newblock ``Pitfalls of in-domain uncertainty estimation and ensembling in deep
+  learning,''
+\newblock in {\em International Conference on Learning Representations}, 2020.
+
+\bibitem{trust-uncertainty}
+Yaniv Ovadia, Emily Fertig, Jie Ren, Zachary Nado, D~Sculley, Sebastian
+  Nowozin, Joshua~V Dillon, Balaji Lakshminarayanan, and Jasper Snoek,
+\newblock ``Can you trust your model's uncertainty? evaluating predictive
+  uncertainty under dataset shift,''
+\newblock {\em Advances in Neural Information Processing Systems}, 2019.
+
+\bibitem{deepensemble2017}
+B.~Lakshminarayanan, A.~Pritzel, and C.~Blundell,
+\newblock ``{Simple and Scalable Predictive Uncertainty Estimation using Deep
+  Ensembles},''
+\newblock in {\em Proc. Conference on Neural Information Processing Systems
+  (NIPS)}, 2017.
+
+\bibitem{malinin-thesis}
+Andrey Malinin,
+\newblock {\em Uncertainty Estimation in Deep Learning with application to
+  Spoken Language Assessment},
+\newblock Ph.D. thesis, University of Cambridge, 2019.
+
+\bibitem{batchbald}
+Andreas Kirsch, Joost van Amersfoort, and Yarin Gal,
+\newblock ``Batchbald: Efficient and diverse batch acquisition for deep
+  bayesian active learning,'' 2019.
+
+\bibitem{gal-adversarial}
+L.~{Smith} and Y.~{Gal},
+\newblock ``{Understanding Measures of Uncertainty for Adversarial Example
+  Detection},''
+\newblock in {\em UAI}, 2018.
+ +\bibitem{malinin-rkl-2019} +Andrey Malinin and Mark~JF Gales, +\newblock ``Reverse kl-divergence training of prior networks: Improved + uncertainty and adversarial robustness,'' +\newblock 2019. + +\bibitem{hinton2015distilling} +Geoffrey Hinton, Oriol Vinyals, and Jeff Dean, +\newblock ``Distilling the knowledge in a neural network,'' 2015, +\newblock arXiv:1503.02531. + +\bibitem{malinin-endd-2019} +Andrey Malinin, Bruno Mlodozeniec, and Mark~JF Gales, +\newblock ``Ensemble distribution distillation,'' +\newblock in {\em International Conference on Learning Representations}, 2020. + +\bibitem{hydra} +Linh Tran, Bastiaan~S. Veeling, Kevin Roth, Jakub {\'S}wi{\k{a}}tkowski, + Joshua~V. Dillon, Jasper Snoek, Stephan Mandt, Tim Salimans, Sebastian + Nowozin, and Rodolphe Jenatton, +\newblock ``Hydra: Preserving ensemble diversity for model distillation,'' + 2020. + +\bibitem{mdd} +Xixin Wu, Kate~M Knill, Mark~JF Gales, and Andrey Malinin, +\newblock ``Ensemble approaches for uncertainty in spoken language + assessment,'' +\newblock {\em Proc. Interspeech 2020}, pp. 3860--3864, 2020. + +\bibitem{malinin2020regression} +Andrey Malinin, Sergey Chervontsev, Ivan Provilkov, and Mark Gales, +\newblock ``Regression prior networks,'' +\newblock {\em arXiv preprint arXiv:2006.11590}, 2020. + +\bibitem{minka2000estimating} +Thomas Minka, +\newblock ``Estimating a dirichlet distribution,'' 2000. + +\bibitem{malinin-structured-2020} +Andrey Malinin and Mark Gales, +\newblock ``Uncertainty in structured prediction,'' +\newblock {\em arXiv preprint arXiv:2002.07650}, 2020. + +\bibitem{resnet} +Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, +\newblock ``Deep residual learning for image recognition,'' +\newblock in {\em Proceedings of the IEEE conference on computer vision and + pattern recognition}, 2016, pp. 770--778. + +\bibitem{imagenet} +J.~Deng, W.~Dong, R.~Socher, L.-J. Li, K.~Li, and L.~Fei-Fei, +\newblock ``{ImageNet: A Large-Scale Hierarchical Image Database},'' +\newblock in {\em CVPR09}, 2009. + +\bibitem{touvron2019FixRes} +Hugo Touvron, Andrea Vedaldi, Matthijs Douze, and Herv{\'e} J{\'e}gou, +\newblock ``Fixing the train-test resolution discrepancy,'' +\newblock in {\em Advances in Neural Information Processing Systems (NeurIPS)}, + 2019. + +\bibitem{goyal2018accurate} +Priya Goyal, Piotr Dollár, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, + Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He, +\newblock ``Accurate, large minibatch sgd: Training imagenet in 1 hour,'' 2018. + +\bibitem{albumentations} +A.~Buslaev, A.~Parinov, E.~Khvedchenya, V.~I. Iglovikov, and A.~A. Kalinin, +\newblock ``{Albumentations: fast and flexible image augmentations},'' +\newblock {\em ArXiv e-prints}, 2018. + +\bibitem{zhang2018residual} +Hongyi Zhang, Yann~N. Dauphin, and Tengyu Ma, +\newblock ``Residual learning without normalization via better + initialization,'' +\newblock in {\em International Conference on Learning Representations}, 2019. + +\bibitem{rezero} +Thomas Bachlechner, Huanru~Henry Majumder, Bodhisattwa Prasad~Mao, Garrison~W. + Cottrell, and Julian McAuley, +\newblock ``Rezero is all you need: Fast convergence at large depth,'' +\newblock in {\em arXiv}, 2020. + +\bibitem{hendrycks2021nae} +Dan Hendrycks, Kevin Zhao, Steven Basart, Jacob Steinhardt, and Dawn Song, +\newblock ``Natural adversarial examples,'' +\newblock {\em CVPR}, 2021. 
+ +\bibitem{hendrycks2019robustness} +Dan Hendrycks and Thomas Dietterich, +\newblock ``Benchmarking neural network robustness to common corruptions and + perturbations,'' +\newblock {\em Proceedings of the International Conference on Learning + Representations}, 2019. + +\bibitem{hendrycks2020many} +Dan Hendrycks, Steven Basart, Norman Mu, Saurav Kadavath, Frank Wang, Evan + Dorundo, Rahul Desai, Tyler Zhu, Samyak Parajuli, Mike Guo, Dawn Song, Jacob + Steinhardt, and Justin Gilmer, +\newblock ``The many faces of robustness: A critical analysis of + out-of-distribution generalization,'' +\newblock {\em arXiv preprint arXiv:2006.16241}, 2020. + +\bibitem{vaswani2017attention} +Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, + Aidan~N. Gomez, Lukasz Kaiser, and Illia Polosukhin, +\newblock ``Attention is all you need,'' 2017. + +\bibitem{sennrich-etal-2016-neural} +Rico Sennrich, Barry Haddow, and Alexandra Birch, +\newblock ``Neural machine translation of rare words with subword units,'' +\newblock in {\em Proceedings of the 54th Annual Meeting of the Association for + Computational Linguistics (Volume 1: Long Papers)}, Berlin, Germany, Aug. + 2016, pp. 1715--1725, Association for Computational Linguistics. + +\bibitem{ott2018scaling} +Myle Ott, Sergey Edunov, David Grangier, and Michael Auli, +\newblock ``Scaling neural machine translation,'' +\newblock {\em arXiv preprint arXiv:1806.00187}, 2018. + +\bibitem{adam} +Diederik~P. Kingma and Jimmy Ba, +\newblock ``{Adam: A Method for Stochastic Optimization},'' +\newblock in {\em Proc. 3rd International Conference on Learning + Representations (ICLR)}, 2015. + +\bibitem{papineni2002bleu} +Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu, +\newblock ``Bleu: a method for automatic evaluation of machine translation,'' +\newblock in {\em Proceedings of the 40th annual meeting of the Association for + Computational Linguistics}, 2002, pp. 311--318. + +\bibitem{sacrebleu} +Matt Post, +\newblock ``A call for clarity in reporting {BLEU} scores,'' +\newblock in {\em Proceedings of the Third Conference on Machine Translation: + Research Papers}, Belgium, Brussels, Oct. 2018, pp. 186--191, Association for + Computational Linguistics. + +\bibitem{librispeech} +Vassil Panayotov, Guoguo Chen, Daniel Povey, and Sanjeev Khudanpur, +\newblock ``Librispeech: an asr corpus based on public domain audio books,'' +\newblock in {\em 2015 IEEE International Conference on Acoustics, Speech and + Signal Processing (ICASSP)}. IEEE, 2015, pp. 5206--5210. + +\end{thebibliography} diff --git a/2024/05/29/papers/2105.06987/main.tex b/2024/05/29/papers/2105.06987/main.tex new file mode 100644 index 00000000..d6dc7afe --- /dev/null +++ b/2024/05/29/papers/2105.06987/main.tex @@ -0,0 +1,191 @@ +\documentclass{article} +\pdfoutput=1 +% if you need to pass options to natbib, use, e.g.: +% \PassOptionsToPackage{numbers, compress}{natbib} +% before loading neurips_2021 + +% ready for submission +\usepackage[nonatbib,preprint]{neurips_2021} + +% to compile a preprint version, e.g., for submission to arXiv, add add the +% [preprint] option: +% \usepackage[preprint]{neurips_2021} + +% to compile a camera-ready version, add the [final] option, e.g.: +% \usepackage[final]{neurips_2021} + +% to avoid loading the natbib package, add option nonatbib: +% \usepackage[nonatbib]{neurips_2021} + +\title{Scaling Ensemble Distribution Distillation to Many Classes with Proxy Targets} + +% The \author macro works with any number of authors. 
There are two commands +% used to separate the names and addresses of multiple authors: \And and \AND. +% +% Using \And between authors leaves it to LaTeX to determine where to break the +% lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4 +% authors names on the first line, and the last on the second line, try using +% \AND instead of \And before the third author name. + +\author{% + Max Ryabinin\thanks{Equal contribution.} \\ + Yandex, HSE University\\ + Moscow, Russia \\ + \texttt{mryabinin0@gmail.com} \\ + % examples of more authors + \And + Andrey Malinin + \footnotemark[1] \\ + Yandex, HSE University \\ + Moscow, Russia \\ + \texttt{am969@yandex-team.ru} \\ + \And + Mark Gales \\ + University of Cambridge \\ + Cambridge, United Kingdom \\ + \texttt{mjfg@eng.cam.ac.uk} \\ + % \And + % Coauthor \\ + % Affiliation \\ + % Address \\ + % \texttt{email} \\ + % \And + % Coauthor \\ + % Affiliation \\ + % Address \\ + % \texttt{email} \\ +} + +\input{math_commands.tex} + +\usepackage{hyperref} +\usepackage{url} + +\usepackage[utf8]{inputenc} % allow utf-8 input +\usepackage[T1]{fontenc} % use 8-bit T1 fonts +\usepackage{hyperref} % hyperlinks +\usepackage{url} % simple URL typesetting +\usepackage{booktabs} % professional-quality tables +\usepackage{amsfonts} % blackboard math symbols +\usepackage{nicefrac} % compact symbols for 1/2, etc. +\usepackage{microtype} % microtypography +\usepackage{xcolor} % colors + +%\usepackage{amsmath,graphicx} +%\usepackage{pifont} +\usepackage{subfigure} +\usepackage[titletoc,title]{appendix} +\usepackage{mathtools} +\usepackage{subfiles} +%\usepackage{times} +\usepackage{latexsym} +\usepackage{multirow} +\usepackage{empheq} +\usepackage{bm} +\usepackage{graphicx} + +\usepackage[normalem]{ulem} +\useunder{\uline}{\ul}{} +\usepackage{xspace} +\usepackage{siunitx} +\newcommand{\Endd}{EnD$^2$\xspace} + + +\begin{document} + +\maketitle + +\begin{abstract} + \subfile{abstract} +\end{abstract} + +\subfile{introduction} +\subfile{background} +\subfile{method} +\subfile{experiments} +\subfile{conclusion} + +\bibliographystyle{IEEEbib} +\bibliography{bibliography} + +% \section*{Checklist} + +% %%% BEGIN INSTRUCTIONS %%% +% The checklist follows the references. Please +% read the checklist guidelines carefully for information on how to answer these +% questions. For each question, change the default \answerTODO{} to \answerYes{}, +% \answerNo{}, or \answerNA{}. You are strongly encouraged to include a {\bf +% justification to your answer}, either by referencing the appropriate section of +% your paper or providing a brief inline description. For example: +% \begin{itemize} +% \item Did you include the license to the code and datasets? \answerYes{See Section~\ref{gen_inst}.} +% \item Did you include the license to the code and datasets? \answerNo{The code and the data are proprietary.} +% \item Did you include the license to the code and datasets? \answerNA{} +% \end{itemize} +% Please do not modify the questions and only use the provided macros for your +% answers. Note that the Checklist section does not count towards the page +% limit. In your paper, please delete this instructions block and only keep the +% Checklist section heading above along with the questions/answers below. +% %%% END INSTRUCTIONS %%% + +% \begin{enumerate} + +% \item For all authors... +% \begin{enumerate} +% \item Do the main claims made in the abstract and introduction accurately reflect the paper's contributions and scope? 
+% \answerTODO{} +% \item Did you describe the limitations of your work? +% \answerTODO{} +% \item Did you discuss any potential negative societal impacts of your work? +% \answerTODO{} +% \item Have you read the ethics review guidelines and ensured that your paper conforms to them? +% \answerTODO{} +% \end{enumerate} + +% \item If you are including theoretical results... +% \begin{enumerate} +% \item Did you state the full set of assumptions of all theoretical results? +% \answerTODO{} +% \item Did you include complete proofs of all theoretical results? +% \answerTODO{} +% \end{enumerate} + +% \item If you ran experiments... +% \begin{enumerate} +% \item Did you include the code, data, and instructions needed to reproduce the main experimental results (either in the supplemental material or as a URL)? +% \answerTODO{} +% \item Did you specify all the training details (e.g., data splits, hyperparameters, how they were chosen)? +% \answerTODO{} +% \item Did you report error bars (e.g., with respect to the random seed after running experiments multiple times)? +% \answerTODO{} +% \item Did you include the total amount of compute and the type of resources used (e.g., type of GPUs, internal cluster, or cloud provider)? +% \answerTODO{} +% \end{enumerate} + +% \item If you are using existing assets (e.g., code, data, models) or curating/releasing new assets... +% \begin{enumerate} +% \item If your work uses existing assets, did you cite the creators? +% \answerTODO{} +% \item Did you mention the license of the assets? +% \answerTODO{} +% \item Did you include any new assets either in the supplemental material or as a URL? +% \answerTODO{} +% \item Did you discuss whether and how consent was obtained from people whose data you're using/curating? +% \answerTODO{} +% \item Did you discuss whether the data you are using/curating contains personally identifiable information or offensive content? +% \answerTODO{} +% \end{enumerate} + +% \item If you used crowdsourcing or conducted research with human subjects... +% \begin{enumerate} +% \item Did you include the full text of instructions given to participants and screenshots, if applicable? +% \answerTODO{} +% \item Did you describe any potential participant risks, with links to Institutional Review Board (IRB) approvals, if applicable? +% \answerTODO{} +% \item Did you include the estimated hourly wage paid to participants and the total amount spent on participant compensation? +% \answerTODO{} +% \end{enumerate} + +% \end{enumerate} + +\end{document} diff --git a/2024/05/29/papers/2105.06987/math_commands.tex b/2024/05/29/papers/2105.06987/math_commands.tex new file mode 100644 index 00000000..0668f931 --- /dev/null +++ b/2024/05/29/papers/2105.06987/math_commands.tex @@ -0,0 +1,508 @@ +%%%%% NEW MATH DEFINITIONS %%%%% + +\usepackage{amsmath,amsfonts,bm} + +% Mark sections of captions for referring to divisions of figures +\newcommand{\figleft}{{\em (Left)}} +\newcommand{\figcenter}{{\em (Center)}} +\newcommand{\figright}{{\em (Right)}} +\newcommand{\figtop}{{\em (Top)}} +\newcommand{\figbottom}{{\em (Bottom)}} +\newcommand{\captiona}{{\em (a)}} +\newcommand{\captionb}{{\em (b)}} +\newcommand{\captionc}{{\em (c)}} +\newcommand{\captiond}{{\em (d)}} + +% Highlight a newly defined term +\newcommand{\newterm}[1]{{\bf #1}} + + +% Figure reference, lower-case. +\def\figref#1{figure~\ref{#1}} +% Figure reference, capital. 
For start of sentence +\def\Figref#1{Figure~\ref{#1}} +\def\twofigref#1#2{figures \ref{#1} and \ref{#2}} +\def\quadfigref#1#2#3#4{figures \ref{#1}, \ref{#2}, \ref{#3} and \ref{#4}} +% Section reference, lower-case. +\def\secref#1{section~\ref{#1}} +% Section reference, capital. +\def\Secref#1{Section~\ref{#1}} +% Reference to two sections. +\def\twosecrefs#1#2{sections \ref{#1} and \ref{#2}} +% Reference to three sections. +\def\secrefs#1#2#3{sections \ref{#1}, \ref{#2} and \ref{#3}} +% Reference to an equation, lower-case. +\def\eqref#1{equation~\ref{#1}} +% Reference to an equation, upper case +\def\Eqref#1{Equation~\ref{#1}} +% A raw reference to an equation---avoid using if possible +\def\plaineqref#1{\ref{#1}} +% Reference to a chapter, lower-case. +\def\chapref#1{chapter~\ref{#1}} +% Reference to an equation, upper case. +\def\Chapref#1{Chapter~\ref{#1}} +% Reference to a range of chapters +\def\rangechapref#1#2{chapters\ref{#1}--\ref{#2}} +% Reference to an algorithm, lower-case. +\def\algref#1{algorithm~\ref{#1}} +% Reference to an algorithm, upper case. +\def\Algref#1{Algorithm~\ref{#1}} +\def\twoalgref#1#2{algorithms \ref{#1} and \ref{#2}} +\def\Twoalgref#1#2{Algorithms \ref{#1} and \ref{#2}} +% Reference to a part, lower case +\def\partref#1{part~\ref{#1}} +% Reference to a part, upper case +\def\Partref#1{Part~\ref{#1}} +\def\twopartref#1#2{parts \ref{#1} and \ref{#2}} + +\def\ceil#1{\lceil #1 \rceil} +\def\floor#1{\lfloor #1 \rfloor} +\def\1{\bm{1}} +\newcommand{\train}{\mathcal{D}} +\newcommand{\valid}{\mathcal{D_{\mathrm{valid}}}} +\newcommand{\test}{\mathcal{D_{\mathrm{test}}}} + +\def\eps{{\epsilon}} + + +% Random variables +\def\reta{{\textnormal{$\eta$}}} +\def\ra{{\textnormal{a}}} +\def\rb{{\textnormal{b}}} +\def\rc{{\textnormal{c}}} +\def\rd{{\textnormal{d}}} +\def\re{{\textnormal{e}}} +\def\rf{{\textnormal{f}}} +\def\rg{{\textnormal{g}}} +\def\rh{{\textnormal{h}}} +\def\ri{{\textnormal{i}}} +\def\rj{{\textnormal{j}}} +\def\rk{{\textnormal{k}}} +\def\rl{{\textnormal{l}}} +% rm is already a command, just don't name any random variables m +\def\rn{{\textnormal{n}}} +\def\ro{{\textnormal{o}}} +\def\rp{{\textnormal{p}}} +\def\rq{{\textnormal{q}}} +\def\rr{{\textnormal{r}}} +\def\rs{{\textnormal{s}}} +\def\rt{{\textnormal{t}}} +\def\ru{{\textnormal{u}}} +\def\rv{{\textnormal{v}}} +\def\rw{{\textnormal{w}}} +\def\rx{{\textnormal{x}}} +\def\ry{{\textnormal{y}}} +\def\rz{{\textnormal{z}}} + +% Random vectors +\def\rvepsilon{{\mathbf{\epsilon}}} +\def\rvtheta{{\mathbf{\theta}}} +\def\rva{{\mathbf{a}}} +\def\rvb{{\mathbf{b}}} +\def\rvc{{\mathbf{c}}} +\def\rvd{{\mathbf{d}}} +\def\rve{{\mathbf{e}}} +\def\rvf{{\mathbf{f}}} +\def\rvg{{\mathbf{g}}} +\def\rvh{{\mathbf{h}}} +\def\rvu{{\mathbf{i}}} +\def\rvj{{\mathbf{j}}} +\def\rvk{{\mathbf{k}}} +\def\rvl{{\mathbf{l}}} +\def\rvm{{\mathbf{m}}} +\def\rvn{{\mathbf{n}}} +\def\rvo{{\mathbf{o}}} +\def\rvp{{\mathbf{p}}} +\def\rvq{{\mathbf{q}}} +\def\rvr{{\mathbf{r}}} +\def\rvs{{\mathbf{s}}} +\def\rvt{{\mathbf{t}}} +\def\rvu{{\mathbf{u}}} +\def\rvv{{\mathbf{v}}} +\def\rvw{{\mathbf{w}}} +\def\rvx{{\mathbf{x}}} +\def\rvy{{\mathbf{y}}} +\def\rvz{{\mathbf{z}}} + +% Elements of random vectors +\def\erva{{\textnormal{a}}} +\def\ervb{{\textnormal{b}}} +\def\ervc{{\textnormal{c}}} +\def\ervd{{\textnormal{d}}} +\def\erve{{\textnormal{e}}} +\def\ervf{{\textnormal{f}}} +\def\ervg{{\textnormal{g}}} +\def\ervh{{\textnormal{h}}} +\def\ervi{{\textnormal{i}}} +\def\ervj{{\textnormal{j}}} +\def\ervk{{\textnormal{k}}} +\def\ervl{{\textnormal{l}}} 
+\def\ervm{{\textnormal{m}}} +\def\ervn{{\textnormal{n}}} +\def\ervo{{\textnormal{o}}} +\def\ervp{{\textnormal{p}}} +\def\ervq{{\textnormal{q}}} +\def\ervr{{\textnormal{r}}} +\def\ervs{{\textnormal{s}}} +\def\ervt{{\textnormal{t}}} +\def\ervu{{\textnormal{u}}} +\def\ervv{{\textnormal{v}}} +\def\ervw{{\textnormal{w}}} +\def\ervx{{\textnormal{x}}} +\def\ervy{{\textnormal{y}}} +\def\ervz{{\textnormal{z}}} + +% Random matrices +\def\rmA{{\mathbf{A}}} +\def\rmB{{\mathbf{B}}} +\def\rmC{{\mathbf{C}}} +\def\rmD{{\mathbf{D}}} +\def\rmE{{\mathbf{E}}} +\def\rmF{{\mathbf{F}}} +\def\rmG{{\mathbf{G}}} +\def\rmH{{\mathbf{H}}} +\def\rmI{{\mathbf{I}}} +\def\rmJ{{\mathbf{J}}} +\def\rmK{{\mathbf{K}}} +\def\rmL{{\mathbf{L}}} +\def\rmM{{\mathbf{M}}} +\def\rmN{{\mathbf{N}}} +\def\rmO{{\mathbf{O}}} +\def\rmP{{\mathbf{P}}} +\def\rmQ{{\mathbf{Q}}} +\def\rmR{{\mathbf{R}}} +\def\rmS{{\mathbf{S}}} +\def\rmT{{\mathbf{T}}} +\def\rmU{{\mathbf{U}}} +\def\rmV{{\mathbf{V}}} +\def\rmW{{\mathbf{W}}} +\def\rmX{{\mathbf{X}}} +\def\rmY{{\mathbf{Y}}} +\def\rmZ{{\mathbf{Z}}} + +% Elements of random matrices +\def\ermA{{\textnormal{A}}} +\def\ermB{{\textnormal{B}}} +\def\ermC{{\textnormal{C}}} +\def\ermD{{\textnormal{D}}} +\def\ermE{{\textnormal{E}}} +\def\ermF{{\textnormal{F}}} +\def\ermG{{\textnormal{G}}} +\def\ermH{{\textnormal{H}}} +\def\ermI{{\textnormal{I}}} +\def\ermJ{{\textnormal{J}}} +\def\ermK{{\textnormal{K}}} +\def\ermL{{\textnormal{L}}} +\def\ermM{{\textnormal{M}}} +\def\ermN{{\textnormal{N}}} +\def\ermO{{\textnormal{O}}} +\def\ermP{{\textnormal{P}}} +\def\ermQ{{\textnormal{Q}}} +\def\ermR{{\textnormal{R}}} +\def\ermS{{\textnormal{S}}} +\def\ermT{{\textnormal{T}}} +\def\ermU{{\textnormal{U}}} +\def\ermV{{\textnormal{V}}} +\def\ermW{{\textnormal{W}}} +\def\ermX{{\textnormal{X}}} +\def\ermY{{\textnormal{Y}}} +\def\ermZ{{\textnormal{Z}}} + +% Vectors +\def\vzero{{\bm{0}}} +\def\vone{{\bm{1}}} +\def\vmu{{\bm{\mu}}} +\def\vtheta{{\bm{\theta}}} +\def\va{{\bm{a}}} +\def\vb{{\bm{b}}} +\def\vc{{\bm{c}}} +\def\vd{{\bm{d}}} +\def\ve{{\bm{e}}} +\def\vf{{\bm{f}}} +\def\vg{{\bm{g}}} +\def\vh{{\bm{h}}} +\def\vi{{\bm{i}}} +\def\vj{{\bm{j}}} +\def\vk{{\bm{k}}} +\def\vl{{\bm{l}}} +\def\vm{{\bm{m}}} +\def\vn{{\bm{n}}} +\def\vo{{\bm{o}}} +\def\vp{{\bm{p}}} +\def\vq{{\bm{q}}} +\def\vr{{\bm{r}}} +\def\vs{{\bm{s}}} +\def\vt{{\bm{t}}} +\def\vu{{\bm{u}}} +\def\vv{{\bm{v}}} +\def\vw{{\bm{w}}} +\def\vx{{\bm{x}}} +\def\vy{{\bm{y}}} +\def\vz{{\bm{z}}} + +% Elements of vectors +\def\evalpha{{\alpha}} +\def\evbeta{{\beta}} +\def\evepsilon{{\epsilon}} +\def\evlambda{{\lambda}} +\def\evomega{{\omega}} +\def\evmu{{\mu}} +\def\evpsi{{\psi}} +\def\evsigma{{\sigma}} +\def\evtheta{{\theta}} +\def\eva{{a}} +\def\evb{{b}} +\def\evc{{c}} +\def\evd{{d}} +\def\eve{{e}} +\def\evf{{f}} +\def\evg{{g}} +\def\evh{{h}} +\def\evi{{i}} +\def\evj{{j}} +\def\evk{{k}} +\def\evl{{l}} +\def\evm{{m}} +\def\evn{{n}} +\def\evo{{o}} +\def\evp{{p}} +\def\evq{{q}} +\def\evr{{r}} +\def\evs{{s}} +\def\evt{{t}} +\def\evu{{u}} +\def\evv{{v}} +\def\evw{{w}} +\def\evx{{x}} +\def\evy{{y}} +\def\evz{{z}} + +% Matrix +\def\mA{{\bm{A}}} +\def\mB{{\bm{B}}} +\def\mC{{\bm{C}}} +\def\mD{{\bm{D}}} +\def\mE{{\bm{E}}} +\def\mF{{\bm{F}}} +\def\mG{{\bm{G}}} +\def\mH{{\bm{H}}} +\def\mI{{\bm{I}}} +\def\mJ{{\bm{J}}} +\def\mK{{\bm{K}}} +\def\mL{{\bm{L}}} +\def\mM{{\bm{M}}} +\def\mN{{\bm{N}}} +\def\mO{{\bm{O}}} +\def\mP{{\bm{P}}} +\def\mQ{{\bm{Q}}} +\def\mR{{\bm{R}}} +\def\mS{{\bm{S}}} +\def\mT{{\bm{T}}} +\def\mU{{\bm{U}}} +\def\mV{{\bm{V}}} +\def\mW{{\bm{W}}} +\def\mX{{\bm{X}}} +\def\mY{{\bm{Y}}} 
+\def\mZ{{\bm{Z}}} +\def\mBeta{{\bm{\beta}}} +\def\mPhi{{\bm{\Phi}}} +\def\mLambda{{\bm{\Lambda}}} +\def\mSigma{{\bm{\Sigma}}} + +% Tensor +\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl} +\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n} +\newcommand{\tens}[1]{\bm{\mathsfit{#1}}} +\def\tA{{\tens{A}}} +\def\tB{{\tens{B}}} +\def\tC{{\tens{C}}} +\def\tD{{\tens{D}}} +\def\tE{{\tens{E}}} +\def\tF{{\tens{F}}} +\def\tG{{\tens{G}}} +\def\tH{{\tens{H}}} +\def\tI{{\tens{I}}} +\def\tJ{{\tens{J}}} +\def\tK{{\tens{K}}} +\def\tL{{\tens{L}}} +\def\tM{{\tens{M}}} +\def\tN{{\tens{N}}} +\def\tO{{\tens{O}}} +\def\tP{{\tens{P}}} +\def\tQ{{\tens{Q}}} +\def\tR{{\tens{R}}} +\def\tS{{\tens{S}}} +\def\tT{{\tens{T}}} +\def\tU{{\tens{U}}} +\def\tV{{\tens{V}}} +\def\tW{{\tens{W}}} +\def\tX{{\tens{X}}} +\def\tY{{\tens{Y}}} +\def\tZ{{\tens{Z}}} + + +% Graph +\def\gA{{\mathcal{A}}} +\def\gB{{\mathcal{B}}} +\def\gC{{\mathcal{C}}} +\def\gD{{\mathcal{D}}} +\def\gE{{\mathcal{E}}} +\def\gF{{\mathcal{F}}} +\def\gG{{\mathcal{G}}} +\def\gH{{\mathcal{H}}} +\def\gI{{\mathcal{I}}} +\def\gJ{{\mathcal{J}}} +\def\gK{{\mathcal{K}}} +\def\gL{{\mathcal{L}}} +\def\gM{{\mathcal{M}}} +\def\gN{{\mathcal{N}}} +\def\gO{{\mathcal{O}}} +\def\gP{{\mathcal{P}}} +\def\gQ{{\mathcal{Q}}} +\def\gR{{\mathcal{R}}} +\def\gS{{\mathcal{S}}} +\def\gT{{\mathcal{T}}} +\def\gU{{\mathcal{U}}} +\def\gV{{\mathcal{V}}} +\def\gW{{\mathcal{W}}} +\def\gX{{\mathcal{X}}} +\def\gY{{\mathcal{Y}}} +\def\gZ{{\mathcal{Z}}} + +% Sets +\def\sA{{\mathbb{A}}} +\def\sB{{\mathbb{B}}} +\def\sC{{\mathbb{C}}} +\def\sD{{\mathbb{D}}} +% Don't use a set called E, because this would be the same as our symbol +% for expectation. +\def\sF{{\mathbb{F}}} +\def\sG{{\mathbb{G}}} +\def\sH{{\mathbb{H}}} +\def\sI{{\mathbb{I}}} +\def\sJ{{\mathbb{J}}} +\def\sK{{\mathbb{K}}} +\def\sL{{\mathbb{L}}} +\def\sM{{\mathbb{M}}} +\def\sN{{\mathbb{N}}} +\def\sO{{\mathbb{O}}} +\def\sP{{\mathbb{P}}} +\def\sQ{{\mathbb{Q}}} +\def\sR{{\mathbb{R}}} +\def\sS{{\mathbb{S}}} +\def\sT{{\mathbb{T}}} +\def\sU{{\mathbb{U}}} +\def\sV{{\mathbb{V}}} +\def\sW{{\mathbb{W}}} +\def\sX{{\mathbb{X}}} +\def\sY{{\mathbb{Y}}} +\def\sZ{{\mathbb{Z}}} + +% Entries of a matrix +\def\emLambda{{\Lambda}} +\def\emA{{A}} +\def\emB{{B}} +\def\emC{{C}} +\def\emD{{D}} +\def\emE{{E}} +\def\emF{{F}} +\def\emG{{G}} +\def\emH{{H}} +\def\emI{{I}} +\def\emJ{{J}} +\def\emK{{K}} +\def\emL{{L}} +\def\emM{{M}} +\def\emN{{N}} +\def\emO{{O}} +\def\emP{{P}} +\def\emQ{{Q}} +\def\emR{{R}} +\def\emS{{S}} +\def\emT{{T}} +\def\emU{{U}} +\def\emV{{V}} +\def\emW{{W}} +\def\emX{{X}} +\def\emY{{Y}} +\def\emZ{{Z}} +\def\emSigma{{\Sigma}} + +% entries of a tensor +% Same font as tensor, without \bm wrapper +\newcommand{\etens}[1]{\mathsfit{#1}} +\def\etLambda{{\etens{\Lambda}}} +\def\etA{{\etens{A}}} +\def\etB{{\etens{B}}} +\def\etC{{\etens{C}}} +\def\etD{{\etens{D}}} +\def\etE{{\etens{E}}} +\def\etF{{\etens{F}}} +\def\etG{{\etens{G}}} +\def\etH{{\etens{H}}} +\def\etI{{\etens{I}}} +\def\etJ{{\etens{J}}} +\def\etK{{\etens{K}}} +\def\etL{{\etens{L}}} +\def\etM{{\etens{M}}} +\def\etN{{\etens{N}}} +\def\etO{{\etens{O}}} +\def\etP{{\etens{P}}} +\def\etQ{{\etens{Q}}} +\def\etR{{\etens{R}}} +\def\etS{{\etens{S}}} +\def\etT{{\etens{T}}} +\def\etU{{\etens{U}}} +\def\etV{{\etens{V}}} +\def\etW{{\etens{W}}} +\def\etX{{\etens{X}}} +\def\etY{{\etens{Y}}} +\def\etZ{{\etens{Z}}} + +% The true underlying data generating distribution +\newcommand{\pdata}{p_{\rm{data}}} +% The empirical distribution defined by the training set 
+\newcommand{\ptrain}{\hat{p}_{\rm{data}}} +\newcommand{\Ptrain}{\hat{P}_{\rm{data}}} +% The model distribution +\newcommand{\pmodel}{p_{\rm{model}}} +\newcommand{\Pmodel}{P_{\rm{model}}} +\newcommand{\ptildemodel}{\tilde{p}_{\rm{model}}} +% Stochastic autoencoder distributions +\newcommand{\pencode}{p_{\rm{encoder}}} +\newcommand{\pdecode}{p_{\rm{decoder}}} +\newcommand{\precons}{p_{\rm{reconstruct}}} + +\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution + +\newcommand{\E}{\mathbb{E}} +\newcommand{\Ls}{\mathcal{L}} +\newcommand{\R}{\mathbb{R}} +\newcommand{\emp}{\tilde{p}} +\newcommand{\lr}{\alpha} +\newcommand{\reg}{\lambda} +\newcommand{\rect}{\mathrm{rectifier}} +\newcommand{\softmax}{\mathrm{softmax}} +\newcommand{\sigmoid}{\sigma} +\newcommand{\softplus}{\zeta} +\newcommand{\KL}{D_{\mathrm{KL}}} +\newcommand{\Var}{\mathrm{Var}} +\newcommand{\standarderror}{\mathrm{SE}} +\newcommand{\Cov}{\mathrm{Cov}} +% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors +% But then they seem to use $L^2$ for vectors throughout the site, and so does +% wikipedia. +\newcommand{\normlzero}{L^0} +\newcommand{\normlone}{L^1} +\newcommand{\normltwo}{L^2} +\newcommand{\normlp}{L^p} +\newcommand{\normmax}{L^\infty} + +\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book. + +\DeclareMathOperator*{\argmax}{arg\,max} +\DeclareMathOperator*{\argmin}{arg\,min} + +\DeclareMathOperator{\sign}{sign} +\DeclareMathOperator{\Tr}{Tr} +\let\ab\allowbreak diff --git a/2024/05/29/papers/2105.06987/method.tex b/2024/05/29/papers/2105.06987/method.tex new file mode 100644 index 00000000..a409283e --- /dev/null +++ b/2024/05/29/papers/2105.06987/method.tex @@ -0,0 +1,121 @@ +\begin{figure}[ht] + \centering + \subfigure[Initialization]{\includegraphics[width=0.32\textwidth]{figures/grad_ratio_init.png}} + \subfigure[Near Convergence]{\includegraphics[width=0.32\textwidth]{figures/grad_ratio_conv.png}} + \subfigure[Misclassification]{\includegraphics[width=0.32\textwidth]{figures/grad_ratio_misc.png}} + \caption{Gradient Ratio} + \label{fig:grad_ratio} +\end{figure} + + + +\section{Theoretical Analysis and Alternative Loss functions} + +In the previous section we described how \Endd can be done by maximising the log-likelihood of the ensemble's output distributions under a conditional Dirichlet Prior. However, we empirically observed significant convergence issues when applying this approach to tasks with large numbers of classes. Thus, in this section we examine the gradients of the Dirichlet NLL loss and propose an alternate training approach which overcomes them. + + +\textbf{First-Order Analysis} + +The setup which will consider in our analysis is the following. 
First, we have a Prior Network model which is initialized such that it always returns a uniform Dirichlet distribution ($\bm{\alpha} = \bm{1}$), while the target distribution whose probability is being maximized is a sparse K-length vector of probabilities:
+\begin{empheq}{align*}
+    \bm{\pi}_{tgt} = \big[1-\epsilon, \epsilon/(K-1), \epsilon/(K-1), \cdots \big]^{\tt T},\quad \epsilon = \text{1e-4}
+\end{empheq}
+Second, we have a Prior Network which is \emph{near convergence} with the following output distribution:
+\begin{empheq}{align*}
+    \bm{\alpha}_{cnv} =&\ \bm{\pi}_{cnv} \cdot \alpha_0,\ \alpha_0 = 90K,\quad
+    \bm{\pi}_{cnv} =\ \big[1-5\epsilon, \frac{5\epsilon}{K-1}, \frac{5\epsilon}{K-1}, \cdots \big]^{\tt T}
+\end{empheq}
+Finally, we have a Prior Network which has made a strong mistake, which represents a situation which could occur somewhere in the middle of training, far from convergence:
+\begin{empheq}{align*}
+    \bm{\alpha}_{msc} =&\ \bm{\pi}_{msc} \cdot \alpha_0,\ \alpha_0 = 90K,\quad
+    \bm{\pi}_{msc} =\ \big[\frac{5\epsilon}{K-1}, \frac{5\epsilon}{K-1}, \cdots, 1-5\epsilon \big]^{\tt T}
+\end{empheq}
+
+First, let us consider the standard cross-entropy loss between a predicted and a target discrete distribution and its gradient with respect to the logit $z_k$:
+\begin{empheq}{align}
+    \mathcal{L}^{\text{CE}} =&\ -\sum_{k=1}^K \hat \pi_k \ln\big(\frac{\alpha_k}{\alpha_0}\big),\quad
+    \frac{\partial\mathcal{L}^{\text{CE}}}{\partial z_k} =\ \frac{\alpha_k}{\alpha_0} - \hat \pi_k
+\end{empheq}
+
+Second, consider the NLL loss of a Dirichlet distribution and its gradient with respect to logit $z_k$:
+\begin{empheq}{align}
+    \mathcal{L} \small{=} \sum_{k=1}^K\big[\ln\Gamma(\alpha_k) \small{-}(\alpha_k \small{-} 1)\sum_{m=1}^M\frac{\ln\pi_k^{(m)}}{M}\big] \small{-} \ln\Gamma(\alpha_0), \ \frac{\partial\mathcal{L}}{\partial z_k} \small{=} \big(\psi(\alpha_k) \small{-} \psi(\alpha_0) \small{-}\sum_{m=1}^M\frac{\ln\pi_k^{(m)}}{M}\big) \cdot \alpha_k
+\end{empheq}
+
+Finally, consider the dimensionality-normalized ratio of the gradient with respect to logit 1 to the gradient with respect to logit 2, which represents the relative contribution of the gradient for the class we are interested in modelling compared to that of the long tail.
+\begin{empheq}{align}
+\begin{split}
+    \rho = \frac{1}{K} \Big| \frac{\partial\mathcal{L}}{\partial z_1} \Big| \Big/ \Big|\frac{\partial\mathcal{L}}{\partial z_2}\Big|
+\end{split}
+\end{empheq}
+Figure~\ref{fig:grad_ratio} shows that, at initialization, as the number of classes is increased, the standard cross-entropy loss primarily focuses on the high-probability class and ignores the long tail. In contrast, for the Dirichlet NLL loss this ratio diminishes as the number of classes is increased. This means that the loss will focus on modelling the probability distribution of the high-probability classes only after it \emph{perfectly} models the long tail. As the loss is also very sensitive, on complex tasks the model is perpetually stuck modelling the probabilities of tail classes. Note that even near convergence, the ratio $\rho$ is far smaller for the NLL criterion than for discrete cross-entropy. Finally, if a significant error is made on the training data, $\rho$ becomes very large for cross-entropy, and increasingly small for Dirichlet NLL as the number of classes increases.
This analysis shows that a desirable property of the loss which ensures good convergence is that the ratio $\rho$ is high and either constant or increasing as the number of classes grows, otherwise the model focuses on modelling the distribution of tail-class probabilities across the ensemble.
+
+An additional issue to consider is that the NLL loss is also noisy, as for each input $\bm{x}$ we only have a few discrete distributions; it may be necessary to use far more samples to get a good estimate of the ensemble's distribution. Furthermore, this distribution may be poorly matched to the Dirichlet, which introduces additional issues. Thus, a natural solution to consider would be to introduce a \emph{Proxy Dirichlet Distribution} to which we can minimize either the \emph{KL-divergence} or \emph{reverse KL divergence}. We leave discussion of the details of the Proxy Dirichlet until later and only consider the gradients which arise from minimizing either loss.
+
+For this analysis we consider a target Dirichlet distribution with parameters $\bm{\beta} = \bm{\pi}_{tgt} \cdot \beta_0$ where $\beta_0 = 100K$. The explicit forms of the KL-divergence between two Dirichlet distributions, as well as the gradients of the forward and reverse KL-divergence, are provided below:
+\begin{empheq}{align}
+\begin{split}
+    & \mathcal{L}^{\text{KL}} =\ \ \sum_{k=1}^K\ln\Gamma(\alpha_k) - \sum_{k=1}^K\ln\Gamma(\beta_k) + \ln\Gamma(\beta_0) - \ln\Gamma(\alpha_0) + \sum_{k=1}^K(\beta_k - \alpha_k)\Big(\psi(\beta_k)-\psi(\beta_0)\Big)
+\end{split} \\
+\begin{split}
+    & \mathcal{L}^{\text{RKL}} =\ \ \sum_{k=1}^K\ln\Gamma(\beta_k) - \sum_{k=1}^K\ln\Gamma(\alpha_k) + \ln\Gamma(\alpha_0) - \ln\Gamma(\beta_0) + \sum_{k=1}^K(\alpha_k - \beta_k)\Big(\psi(\alpha_k)-\psi(\alpha_0)\Big)
+\end{split} \\
+\begin{split}
+    &\frac{\partial\mathcal{L}^{\text{KL}}}{\partial z_k} =\ \big(\psi(\alpha_k) - \psi(\alpha_0) - \psi(\beta_k) + \psi(\beta_0)\big) \cdot \alpha_k
+\end{split} \\
+\begin{split}
+    &\frac{\partial\mathcal{L}^{\text{\tiny RKL}}}{\partial z_k} =\ \big((\alpha_k - \beta_k)\psi'(\alpha_k) - (\alpha_0 - \beta_0)\psi'(\alpha_0)\big) \cdot \alpha_k
+\end{split}
+\end{empheq}
+
+Figure~\ref{fig:grad_ratio} additionally displays the ratio $\rho$ for both the forward and reverse KL-divergence losses. The forward KL-divergence displays the same issues as the NLL loss and $\rho$ continues to decrease as the number of classes is increased. This is unsurprising, as the NLL is equivalent to the KL-divergence in the limit. However, the \emph{reverse KL-divergence} displays the desirable property that $\rho$ grows and stabilizes as the number of classes is increased. This suggests that if we were to minimize the \emph{reverse KL-divergence} to an appropriately chosen \emph{Proxy-Target Dirichlet distribution}, then we would be able to avoid convergence issues.
+
+% \textbf{Second-Order Analysis}
+
+% In addition to the first-order analysis provided above, we also conduct a second order analysis by considering the eigenvalues of the Hessian of the loss.
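The gradient ratios described above are easy to check numerically. The following is a minimal sketch (not taken from the paper's code; it assumes only NumPy and SciPy) that evaluates the per-logit gradients of the cross-entropy, Dirichlet NLL, forward-KL and reverse-KL losses at the uniform initialization $\bm{\alpha} = \bm{1}$ and prints the dimensionality-normalized ratio $\rho$ as the number of classes $K$ grows. As a simplification, the single target distribution $\bm{\pi}_{tgt}$ stands in for the ensemble average of $\ln\pi_k^{(m)}$, and the proxy-target precision is set to $\beta_0 = 100K$ as in the analysis.

#+begin_src python
# Minimal sketch: per-logit gradients and the ratio rho at initialization
# (alpha_k = 1 for all k), following the formulas in this section.
# The ensemble is approximated by the single target distribution pi_tgt,
# so (1/M) sum_m ln pi_k^(m) is replaced by ln pi_tgt_k.
import numpy as np
from scipy.special import digamma, polygamma

def target_pi(K, eps=1e-4):
    pi = np.full(K, eps / (K - 1))
    pi[0] = 1.0 - eps
    return pi

def per_logit_grads(K, eps=1e-4):
    alpha = np.ones(K)                    # model Dirichlet at initialization
    alpha0 = alpha.sum()
    pi_hat = target_pi(K, eps)            # target / ensemble-mean probabilities
    mean_log_pi = np.log(pi_hat)          # stand-in for (1/M) sum_m ln pi_k^(m)
    beta = pi_hat * 100.0 * K             # proxy-target Dirichlet, beta_0 = 100K
    beta0 = beta.sum()

    g_ce  = alpha / alpha0 - pi_hat
    g_nll = (digamma(alpha) - digamma(alpha0) - mean_log_pi) * alpha
    g_kl  = (digamma(alpha) - digamma(alpha0)
             - digamma(beta) + digamma(beta0)) * alpha
    g_rkl = ((alpha - beta) * polygamma(1, alpha)
             - (alpha0 - beta0) * polygamma(1, alpha0)) * alpha
    return {"CE": g_ce, "NLL": g_nll, "KL": g_kl, "RKL": g_rkl}

def rho(g, K):
    # dimensionality-normalized ratio |dL/dz_1| / |dL/dz_2|
    return abs(g[0]) / abs(g[1]) / K

for K in (10, 100, 1000, 10000):
    grads = per_logit_grads(K)
    print(K, {name: float(rho(g, K)) for name, g in grads.items()})
#+end_src

At initialization this should show $\rho$ staying roughly constant for the cross-entropy and reverse-KL losses while shrinking with $K$ for the Dirichlet NLL and forward-KL losses, matching the behaviour described around Figure~\ref{fig:grad_ratio}.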
\textbf{Proxy-Dirichlet distribution}

\begin{figure*}[ht]
    \centering
    \subfigure[Naive \Endd]{\includegraphics[scale=0.067]{figures/naive-distillation-2.png}}
    \subfigure[Proxy \Endd]{\includegraphics[scale=0.067]{figures/proxy_distillation.png}}
    \caption{Schematic of Distillation Approaches}
    \label{fig:distillation overview}
\end{figure*}

It is important to remember that the ensemble may be poorly modelled via a Dirichlet distribution, so it is necessary to ask which properties of the ensemble we are actually interested in capturing. Clearly, we would like to capture the mean of the ensemble, as that typically has better predictive accuracy and calibration. Additionally, we would like to capture the \emph{bulk-diversity properties} of the ensemble, such that the measures of divergence derived from the Proxy Dirichlet are similar to those of the original ensemble and therefore provide a similar rank-ordering of the data. At the same time, we are \emph{not} interested in modelling properties like multi-modality and skew.

Clearly, obtaining the mean of the ensemble is trivial. Obtaining an estimate of the precision $\beta_0$ is more challenging. One approach, based on Stirling's approximation, is described in~\cite{minka2000estimating} and yields the following estimate:
\begin{empheq}{align}
\begin{split}
    \hat \pi_k (\bm{x})=&\ \frac{1}{M}\sum_{m=1}^M {\tt P}(y=\omega_k|\bm{x}, \bm{\theta}^{(m)}) \\
    \tilde \beta_0(\bm{x}) =& \frac{K-1}{2 \sum_{k=1}^K\hat \pi_k (\ln \hat \pi_k - \frac{1}{M}\sum_{m=1}^M\ln \pi_k^{(m)})},\quad \beta_k (\bm{x}) = \ \hat \pi_k(\bm{x}) \cdot \tilde \beta_0(\bm{x}) + 1
\end{split}
\end{empheq}

We found that it is important to also add 1 to all the target concentration parameters. Figure~\ref{fig:grad_ratio_smooth} shows that for the reverse KL loss, adding 1 to \emph{both} the target Proxy-Dirichlet and \emph{the model} yields an improved ratio $\rho$ both at initialization and near convergence. Heuristically, it seems to make the loss more linear and stable by preventing the digamma and trigamma functions $\psi$ and $\psi'$ in the reverse-KL loss from dropping into the highly non-linear regime where $\alpha_k < 1$ and $\beta_k < 1$.
\begin{figure}[ht]
    \centering
    \subfigure[Initialization]{\includegraphics[scale=0.49]{figures/grad_ratio_init_smooth.png}}
    \subfigure[Near Convergence]{\includegraphics[scale=0.49]{figures/grad_ratio_conv_smooth.png}}
    \caption{Gradient Ratio}
    \label{fig:grad_ratio_smooth}
\end{figure}

Note that while this solution may seem similar to the work done in \cite{malinin-rkl-2019}, the fundamental underlying reason for using this loss is altogether different. Here, the issue is due to large gradients from low-probability tail classes, while in~\cite{malinin-rkl-2019} the reverse KL loss is used to avoid inducing a multi-modal target Dirichlet distribution in expectation.
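A minimal sketch of the resulting target construction is given below. It assumes NumPy, takes an $M \times K$ matrix of ensemble-member probabilities as input, and follows the ensemble mean, the Stirling-based precision estimate of~\cite{minka2000estimating}, and the $+1$ offset described above; the helper name and the toy data are purely illustrative.
\begin{verbatim}
# Illustrative sketch of the Proxy-Dirichlet target: ensemble mean,
# Stirling-based precision estimate, and the +1 offset on every class.
import numpy as np

def proxy_dirichlet_target(probs, eps=1e-12):
    """probs: [M, K] array, each row one member's categorical distribution."""
    M, K = probs.shape
    pi_hat = probs.mean(axis=0)                   # ensemble mean
    mean_log = np.log(probs + eps).mean(axis=0)   # (1/M) sum_m ln pi_k^(m)
    # precision estimate; the denominator is zero only if all members agree exactly
    beta_0 = (K - 1) / (2.0 * np.sum(pi_hat * (np.log(pi_hat + eps) - mean_log)))
    return pi_hat * beta_0 + 1.0                  # beta_k = pi_hat_k * beta_0 + 1

# toy usage: M = 5 members over K = 4 classes
members = np.random.dirichlet([20.0, 2.0, 1.0, 1.0], size=5)
print(proxy_dirichlet_target(members))
\end{verbatim}
With this $+1$ offset, the reverse KL-divergence to the Proxy-Dirichlet separates into a weighted reconstruction term and a prior term, where $Z$ collects terms that do not depend on the model parameters: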
+ +\begin{empheq}{align} +\begin{split} +{\tt KL}[{\tt p}(\bm{\pi}|\bm{x},\bm{\theta}) \| {\tt p}(\bm{\pi}|\bm{\hat \beta})] =&\ \underbrace{ \beta_0\cdot\mathbb{E}_{{\tt p}(\bm{\pi}|\bm{x},\bm{\theta})}\big[-\sum_{k=1}^K\hat \pi_k\ln \pi_k\big]}_{\text{Reconstruction term}} + \underbrace{{\tt KL}[{\tt p}(\bm{\pi}|\bm{x},\bm{\theta}) \| {\tt p}(\bm{\pi}|\bm{1})]}_{\text{Prior}} +Z +\end{split} +\end{empheq} +% \textbf{Alternative solutions (if it fits) } + +% If not, we'll move that to the appendix (along with comparisons) + +% \begin{itemize} +% \item Top-k aggregation +% \item Softplus parametrization +% \end{itemize} \ No newline at end of file diff --git a/2024/05/29/papers/2105.06987/natbib.sty b/2024/05/29/papers/2105.06987/natbib.sty new file mode 100644 index 00000000..ff0d0b91 --- /dev/null +++ b/2024/05/29/papers/2105.06987/natbib.sty @@ -0,0 +1,1246 @@ +%% +%% This is file `natbib.sty', +%% generated with the docstrip utility. +%% +%% The original source files were: +%% +%% natbib.dtx (with options: `package,all') +%% ============================================= +%% IMPORTANT NOTICE: +%% +%% This program can be redistributed and/or modified under the terms +%% of the LaTeX Project Public License Distributed from CTAN +%% archives in directory macros/latex/base/lppl.txt; either +%% version 1 of the License, or any later version. +%% +%% This is a generated file. +%% It may not be distributed without the original source file natbib.dtx. +%% +%% Full documentation can be obtained by LaTeXing that original file. +%% Only a few abbreviated comments remain here to describe the usage. +%% ============================================= +%% Copyright 1993-2009 Patrick W Daly +%% Max-Planck-Institut f\"ur Sonnensystemforschung +%% Max-Planck-Str. 2 +%% D-37191 Katlenburg-Lindau +%% Germany +%% E-mail: daly@mps.mpg.de +\NeedsTeXFormat{LaTeX2e}[1995/06/01] +\ProvidesPackage{natbib} + [2009/07/16 8.31 (PWD, AO)] + + % This package reimplements the LaTeX \cite command to be used for various + % citation styles, both author-year and numerical. It accepts BibTeX + % output intended for many other packages, and therefore acts as a + % general, all-purpose citation-style interface. + % + % With standard numerical .bst files, only numerical citations are + % possible. With an author-year .bst file, both numerical and + % author-year citations are possible. + % + % If author-year citations are selected, \bibitem must have one of the + % following forms: + % \bibitem[Jones et al.(1990)]{key}... + % \bibitem[Jones et al.(1990)Jones, Baker, and Williams]{key}... + % \bibitem[Jones et al., 1990]{key}... + % \bibitem[\protect\citeauthoryear{Jones, Baker, and Williams}{Jones + % et al.}{1990}]{key}... + % \bibitem[\protect\citeauthoryear{Jones et al.}{1990}]{key}... + % \bibitem[\protect\astroncite{Jones et al.}{1990}]{key}... + % \bibitem[\protect\citename{Jones et al., }1990]{key}... + % \harvarditem[Jones et al.]{Jones, Baker, and Williams}{1990}{key}... + % + % This is either to be made up manually, or to be generated by an + % appropriate .bst file with BibTeX. + % Author-year mode || Numerical mode + % Then, \citet{key} ==>> Jones et al. (1990) || Jones et al. 
[21] + % \citep{key} ==>> (Jones et al., 1990) || [21] + % Multiple citations as normal: + % \citep{key1,key2} ==>> (Jones et al., 1990; Smith, 1989) || [21,24] + % or (Jones et al., 1990, 1991) || [21,24] + % or (Jones et al., 1990a,b) || [21,24] + % \cite{key} is the equivalent of \citet{key} in author-year mode + % and of \citep{key} in numerical mode + % Full author lists may be forced with \citet* or \citep*, e.g. + % \citep*{key} ==>> (Jones, Baker, and Williams, 1990) + % Optional notes as: + % \citep[chap. 2]{key} ==>> (Jones et al., 1990, chap. 2) + % \citep[e.g.,][]{key} ==>> (e.g., Jones et al., 1990) + % \citep[see][pg. 34]{key}==>> (see Jones et al., 1990, pg. 34) + % (Note: in standard LaTeX, only one note is allowed, after the ref. + % Here, one note is like the standard, two make pre- and post-notes.) + % \citealt{key} ==>> Jones et al. 1990 + % \citealt*{key} ==>> Jones, Baker, and Williams 1990 + % \citealp{key} ==>> Jones et al., 1990 + % \citealp*{key} ==>> Jones, Baker, and Williams, 1990 + % Additional citation possibilities (both author-year and numerical modes) + % \citeauthor{key} ==>> Jones et al. + % \citeauthor*{key} ==>> Jones, Baker, and Williams + % \citeyear{key} ==>> 1990 + % \citeyearpar{key} ==>> (1990) + % \citetext{priv. comm.} ==>> (priv. comm.) + % \citenum{key} ==>> 11 [non-superscripted] + % Note: full author lists depends on whether the bib style supports them; + % if not, the abbreviated list is printed even when full requested. + % + % For names like della Robbia at the start of a sentence, use + % \Citet{dRob98} ==>> Della Robbia (1998) + % \Citep{dRob98} ==>> (Della Robbia, 1998) + % \Citeauthor{dRob98} ==>> Della Robbia + % + % + % Citation aliasing is achieved with + % \defcitealias{key}{text} + % \citetalias{key} ==>> text + % \citepalias{key} ==>> (text) + % + % Defining the citation mode and punctual (citation style) + % \setcitestyle{} + % Example: \setcitestyle{square,semicolon} + % Alternatively: + % Use \bibpunct with 6 mandatory arguments: + % 1. opening bracket for citation + % 2. closing bracket + % 3. citation separator (for multiple citations in one \cite) + % 4. the letter n for numerical styles, s for superscripts + % else anything for author-year + % 5. punctuation between authors and date + % 6. punctuation between years (or numbers) when common authors missing + % One optional argument is the character coming before post-notes. It + % appears in square braces before all other arguments. May be left off. + % Example (and default) \bibpunct[, ]{(}{)}{;}{a}{,}{,} + % + % To make this automatic for a given bib style, named newbib, say, make + % a local configuration file, natbib.cfg, with the definition + % \newcommand{\bibstyle@newbib}{\bibpunct...} + % Then the \bibliographystyle{newbib} will cause \bibstyle@newbib to + % be called on THE NEXT LATEX RUN (via the aux file). + % + % Such preprogrammed definitions may be invoked anywhere in the text + % by calling \citestyle{newbib}. This is only useful if the style specified + % differs from that in \bibliographystyle. + % + % With \citeindextrue and \citeindexfalse, one can control whether the + % \cite commands make an automatic entry of the citation in the .idx + % indexing file. For this, \makeindex must also be given in the preamble. + % + % Package Options: (for selecting punctuation) + % round - round parentheses are used (default) + % square - square brackets are used [option] + % curly - curly braces are used {option} + % angle - angle brackets are used