From a97ddfa621dcfc6987fb1abaac775d3657647c17 Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 10 Dec 2024 09:07:28 +0000 Subject: [PATCH] Add pages for volume v262 --- Gemfile | 15 + README.md | 28 ++ _config.yml | 110 ++++++ _posts/2024-12-10-agrawal24a.md | 60 +++ _posts/2024-12-10-ali-sadraei-javaheri24a.md | 57 +++ _posts/2024-12-10-alizadeh-vahid24a.md | 73 ++++ _posts/2024-12-10-ardestani24a.md | 45 +++ _posts/2024-12-10-ashkboos24a.md | 58 +++ _posts/2024-12-10-azimi24a.md | 57 +++ _posts/2024-12-10-bhendawade24a.md | 64 +++ _posts/2024-12-10-chen24a.md | 57 +++ _posts/2024-12-10-chung24a.md | 66 ++++ _posts/2024-12-10-d-chaparala24a.md | 54 +++ _posts/2024-12-10-doubov24a.md | 50 +++ _posts/2024-12-10-f-thielmann24a.md | 45 +++ _posts/2024-12-10-fathan24a.md | 54 +++ _posts/2024-12-10-g-lawton24a.md | 62 +++ _posts/2024-12-10-g-vasudev24a.md | 49 +++ _posts/2024-12-10-hajimolahoseini24a.md | 54 +++ _posts/2024-12-10-hajimolahoseini24b.md | 51 +++ _posts/2024-12-10-kang24a.md | 69 ++++ _posts/2024-12-10-khera24a.md | 55 +++ _posts/2024-12-10-kimhi24a.md | 54 +++ _posts/2024-12-10-kumar24a.md | 51 +++ _posts/2024-12-10-liu24a.md | 63 +++ _posts/2024-12-10-lu24a.md | 60 +++ _posts/2024-12-10-mamou24a.md | 57 +++ _posts/2024-12-10-panda24a.md | 59 +++ _posts/2024-12-10-pieler24a.md | 79 ++++ _posts/2024-12-10-praveen-rajasekhar24a.md | 55 +++ _posts/2024-12-10-qiao24a.md | 69 ++++ _posts/2024-12-10-rajabzadeh24a.md | 68 ++++ _posts/2024-12-10-rajput24a.md | 53 +++ _posts/2024-12-10-saheb-pasand24a.md | 50 +++ _posts/2024-12-10-samragh24a.md | 67 ++++ _posts/2024-12-10-sarkar24a.md | 65 ++++ _posts/2024-12-10-sarwar24a.md | 62 +++ _posts/2024-12-10-seng-chua24a.md | 50 +++ _posts/2024-12-10-sharify24a.md | 56 +++ _posts/2024-12-10-sharma24a.md | 49 +++ _posts/2024-12-10-shinde24a.md | 60 +++ _posts/2024-12-10-shiraee-kasmaee24a.md | 66 ++++ _posts/2024-12-10-stewart24a.md | 54 +++ _posts/2024-12-10-timor24a.md | 70 ++++ _posts/2024-12-10-wang24a.md | 69 ++++ _posts/2024-12-10-wu24a.md | 64 +++ _posts/2024-12-10-xu24a.md | 55 +++ _posts/2024-12-10-yang24a.md | 53 +++ _posts/2024-12-10-zayats24a.md | 58 +++ enlsp24.bib | 390 +++++++++++++++++++ index.html | 3 + 51 files changed, 3242 insertions(+) create mode 100644 Gemfile create mode 100644 README.md create mode 100644 _config.yml create mode 100644 _posts/2024-12-10-agrawal24a.md create mode 100644 _posts/2024-12-10-ali-sadraei-javaheri24a.md create mode 100644 _posts/2024-12-10-alizadeh-vahid24a.md create mode 100644 _posts/2024-12-10-ardestani24a.md create mode 100644 _posts/2024-12-10-ashkboos24a.md create mode 100644 _posts/2024-12-10-azimi24a.md create mode 100644 _posts/2024-12-10-bhendawade24a.md create mode 100644 _posts/2024-12-10-chen24a.md create mode 100644 _posts/2024-12-10-chung24a.md create mode 100644 _posts/2024-12-10-d-chaparala24a.md create mode 100644 _posts/2024-12-10-doubov24a.md create mode 100644 _posts/2024-12-10-f-thielmann24a.md create mode 100644 _posts/2024-12-10-fathan24a.md create mode 100644 _posts/2024-12-10-g-lawton24a.md create mode 100644 _posts/2024-12-10-g-vasudev24a.md create mode 100644 _posts/2024-12-10-hajimolahoseini24a.md create mode 100644 _posts/2024-12-10-hajimolahoseini24b.md create mode 100644 _posts/2024-12-10-kang24a.md create mode 100644 _posts/2024-12-10-khera24a.md create mode 100644 _posts/2024-12-10-kimhi24a.md create mode 100644 _posts/2024-12-10-kumar24a.md create mode 100644 _posts/2024-12-10-liu24a.md create mode 100644 _posts/2024-12-10-lu24a.md create mode 100644 
_posts/2024-12-10-mamou24a.md create mode 100644 _posts/2024-12-10-panda24a.md create mode 100644 _posts/2024-12-10-pieler24a.md create mode 100644 _posts/2024-12-10-praveen-rajasekhar24a.md create mode 100644 _posts/2024-12-10-qiao24a.md create mode 100644 _posts/2024-12-10-rajabzadeh24a.md create mode 100644 _posts/2024-12-10-rajput24a.md create mode 100644 _posts/2024-12-10-saheb-pasand24a.md create mode 100644 _posts/2024-12-10-samragh24a.md create mode 100644 _posts/2024-12-10-sarkar24a.md create mode 100644 _posts/2024-12-10-sarwar24a.md create mode 100644 _posts/2024-12-10-seng-chua24a.md create mode 100644 _posts/2024-12-10-sharify24a.md create mode 100644 _posts/2024-12-10-sharma24a.md create mode 100644 _posts/2024-12-10-shinde24a.md create mode 100644 _posts/2024-12-10-shiraee-kasmaee24a.md create mode 100644 _posts/2024-12-10-stewart24a.md create mode 100644 _posts/2024-12-10-timor24a.md create mode 100644 _posts/2024-12-10-wang24a.md create mode 100644 _posts/2024-12-10-wu24a.md create mode 100644 _posts/2024-12-10-xu24a.md create mode 100644 _posts/2024-12-10-yang24a.md create mode 100644 _posts/2024-12-10-zayats24a.md create mode 100644 enlsp24.bib create mode 100644 index.html diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..a13bb1c --- /dev/null +++ b/Gemfile @@ -0,0 +1,15 @@ +source "https://rubygems.org" + +git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } + +gem 'jekyll' + +group :jekyll_plugins do + gem 'github-pages' + gem 'jekyll-remote-theme' + gem 'jekyll-include-cache' + gem 'webrick' +end + +# gem "rails" + diff --git a/README.md b/README.md new file mode 100644 index 0000000..52271ed --- /dev/null +++ b/README.md @@ -0,0 +1,28 @@ +# PMLR 262 + +To suggest fixes to this volume please make a pull request containing the changes requested and a justification for the changes. + +To edit the details of this conference work edit the [_config.yml](./_config.yml) file and submit a pull request. + +To make changes to the individual paper details, edit the associated paper file in the [./_posts](./_posts) subdirectory. + +For details of how to publish in PMLR please check https://proceedings.mlr.press/faq.html + +For details of what is required to submit a proceedings please check https://proceedings.mlr.press/spec.html + + + +Published as Volume 262 by the Proceedings of Machine Learning Research on 10 December 2024. + +Volume Edited by: + * Mehdi Rezagholizadeh + * Peyman Passban + * Soheila Samiee + * Vahid Partovi Nia + * Yu Cheng + * Yue Deng + * Qun Liu + * Boxing Chen + +Series Editors: + * Neil D. 
Lawrence diff --git a/_config.yml b/_config.yml new file mode 100644 index 0000000..6cd1b36 --- /dev/null +++ b/_config.yml @@ -0,0 +1,110 @@ +--- +booktitle: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech Processing + Workshop +shortname: ENLSP-IV 2024 +sections: +- name: Training + title: Training +- name: Model Design \& Architecture + title: Model Design \& Architecture +- name: Model Efficiency \& Compression + title: Model Efficiency \& Compression +- name: Inference + title: Inference +- name: " Benchmark \\& Evaluation" + title: " Benchmark \\& Evaluation" +- name: 'Applications ' + title: 'Applications ' +volume: '262' +year: '2024' +start: &1 2024-12-14 +end: 2024-12-14 +published: 2024-12-10 +layout: proceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: ENLSP-2024 +month: 0 +cycles: false +bibtex_editor: Rezagholizadeh, Mehdi and Passban, Peyman and Samiee, Soheila and Partovi + Nia, Vahid and Cheng, Yu and Deng, Yue and Liu, Qun and Chen, Boxing +editor: +- given: Mehdi + family: Rezagholizadeh +- given: Peyman + family: Passban +- given: Soheila + family: Samiee +- given: Vahid + family: Partovi Nia +- given: Yu + family: Cheng +- given: Yue + family: Deng +- given: Qun + family: Liu +- given: Boxing + family: Chen +title: Proceedings of Machine Learning Research +description: | + Proceedings of The 4th NeurIPS Efficient Natural Language and Speech Processing Workshop + Held in Vancouver, British Columbia, Canada on 14 December 2024 + + Published as Volume 262 by the Proceedings of Machine Learning Research on 10 December 2024. + + Volume Edited by: + Mehdi Rezagholizadeh + Peyman Passban + Soheila Samiee + Vahid Partovi Nia + Yu Cheng + Yue Deng + Qun Liu + Boxing Chen + + Series Editors: + Neil D. Lawrence +date_str: 14 Dec +url: https://proceedings.mlr.press +author: + name: PMLR +baseurl: "/v262" +twitter_username: MLResearchPress +github_username: mlresearch +markdown: kramdown +exclude: +- README.md +- Gemfile +- ".gitignore" +plugins: +- jekyll-feed +- jekyll-seo-tag +- jekyll-remote-theme +remote_theme: mlresearch/jekyll-theme +style: pmlr +permalink: "/:title.html" +ghub: + edit: true + repository: v262 +display: + copy_button: + bibtex: true + endnote: true + apa: true + comments: false +volume_type: Volume +volume_dir: v262 +email: '' +conference: + name: NeurIPS Efficient Natural Language and Speech Processing Workshop + url: https://neurips2024-enlsp.github.io/ + location: Vancouver, British Columbia, Canada + dates: + - *1 +analytics: + google: + tracking_id: UA-92432422-1 +orig_bibfile: "/Users/neil/mlresearch/v262/enlsp24.bib" +# Site settings +# Original source: /Users/neil/mlresearch/v262/enlsp24.bib diff --git a/_posts/2024-12-10-agrawal24a.md b/_posts/2024-12-10-agrawal24a.md new file mode 100644 index 0000000..d00da3d --- /dev/null +++ b/_posts/2024-12-10-agrawal24a.md @@ -0,0 +1,60 @@ +--- +title: 'AdaEDL: Early Draft Stopping for Speculative Decoding of Large Language Models + via an Entropy-based Lower Bound on Token Acceptance Probability' +section: Inference +abstract: 'Speculative decoding is a powerful technique that attempts to circumvent + the autoregressive constraint of modern Large Language Models (LLMs). The aim of + speculative decoding techniques is to improve the average inference time of a large, + target model without sacrificing its accuracy, by using a more efficient draft model + to propose draft tokens which are then verified in parallel. 
The number of draft + tokens produced in each drafting round is referred to as the draft length and is + often a static hyperparameter chosen based on the acceptance rate statistics of + the draft tokens. However, setting a static draft length can negatively impact performance, + especially in scenarios where drafting is expensive and there is a high variance + in the number of tokens accepted. Adaptive Entropy-based Draft Length (AdaEDL) is + a simple, training- and parameter-free criterion that allows for early stopping of + the token drafting process by approximating a lower bound on the expected acceptance + probability of the drafted token based on the currently observed entropy of the + drafted logits. We show that AdaEDL consistently outperforms static draft-length + speculative decoding by 10%-57% as well as other training-free draft-stopping techniques + by up to 10% in a variety of settings and datasets. At the same time, we show that + AdaEDL is more robust than these techniques and preserves performance in high-sampling-temperature + scenarios. Since it is training-free, in contrast to techniques that rely on the + training of dataset-specific draft-stopping predictors, AdaEDL can seamlessly be + integrated into a variety of pre-existing LLM systems. ' +layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: agrawal24a +month: 0 +tex_title: "{AdaEDL}: Early Draft Stopping for Speculative Decoding of Large Language + Models via an Entropy-based Lower Bound on Token Acceptance Probability" +firstpage: 355 +lastpage: 369 +page: 355-369 +order: 355 +cycles: false +bibtex_author: Agrawal, Sudhanshu and Jeon, Wonseok and Lee, Mingu +author: +- given: Sudhanshu + family: Agrawal +- given: Wonseok + family: Jeon +- given: Mingu + family: Lee +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/agrawal24a/agrawal24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-ali-sadraei-javaheri24a.md b/_posts/2024-12-10-ali-sadraei-javaheri24a.md new file mode 100644 index 0000000..88ea820 --- /dev/null +++ b/_posts/2024-12-10-ali-sadraei-javaheri24a.md @@ -0,0 +1,57 @@ +--- +title: 'SuperPos-Prompt: Enhancing Soft Prompt Tuning of Language Models with Superposition + of Multi Token Embeddings' +section: Training +abstract: 'Soft prompt tuning techniques have recently gained traction as an effective + strategy for the parameter-efficient tuning of pre-trained language models, particularly + minimizing the required adjustment of model parameters. Despite their growing use, + achieving optimal tuning with soft prompts, especially with smaller datasets, remains + a substantial challenge. This study makes two contributions in this domain: (i) + we introduce SuperPos-Prompt, a new reparameterization technique employing the superposition + of multiple pre-trained vocabulary embeddings to improve the learning of soft prompts. + Our experiments across several GLUE and SuperGLUE benchmarks consistently highlight + SuperPos-Prompt’s superiority over Residual Prompt tuning, exhibiting an average + score increase of +6.4 in T5-Small and +5.0 in T5-Base along with a faster convergence.
+ Remarkably, SuperPos-Prompt occasionally outperforms even full fine-tuning methods. + (ii) Additionally, we demonstrate enhanced performance and rapid convergence by + omitting dropouts from the frozen network, yielding consistent improvements across + various scenarios and tuning methods.' +layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: ali-sadraei-javaheri24a +month: 0 +tex_title: "{SuperPos-Prompt}: Enhancing Soft Prompt Tuning of Language Models with + Superposition of Multi Token Embeddings" +firstpage: 34 +lastpage: 46 +page: 34-46 +order: 34 +cycles: false +bibtex_author: Ali Sadraei Javaheri, Mohammad and Asgari, Ehsaneddin and C. McHardy, + Alice and R. Rabiee, Hamid +author: +- given: Mohammad + family: Ali Sadraei Javaheri +- given: Ehsaneddin + family: Asgari +- given: Alice + family: C. McHardy +- given: Hamid + family: R. Rabiee +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/ali-sadraei-javaheri24a/ali-sadraei-javaheri24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-alizadeh-vahid24a.md b/_posts/2024-12-10-alizadeh-vahid24a.md new file mode 100644 index 0000000..5958386 --- /dev/null +++ b/_posts/2024-12-10-alizadeh-vahid24a.md @@ -0,0 +1,73 @@ +--- +title: 'Duo-LLM: A Framework for Studying Adaptive Computation in Large Language Models' +section: Inference +abstract: 'Large Language Models (LLMs) typically generate outputs token by token + using a fixed compute budget, leading to inefficient resource utilization. To address + this shortcoming, recent advancements in mixture of expert (MoE) models, speculative + decoding, and early exit strategies leverage the insight that computational demands + can vary significantly based on the complexity and nature of the input. However, + identifying optimal routing patterns for dynamic execution remains an open challenge, + limiting the full potential of these adaptive methods. To address this need, we + study adaptive computation in LLMs more systematically. We propose a novel framework + that integrates smaller auxiliary modules within each Feed-Forward Network layer + of the LLM. This design enables dynamic routing of tokens based on task complexity: + tokens can be processed by either the small or big modules at each layer, or even + bypass certain layers entirely. This allows us to introduce a novel notion of a + token’s difficulty, defined by its potential to benefit from additional computational + resources. Importantly, by employing oracles to identify optimal patterns of adaptive + computations, we gain valuable insights into the internal workings of LLMs and the + routing processes in a simplified heterogeneous MoE setup. We show that trained + routers operate differently from oracles and often yield suboptimal solutions. Notably, + activating a large module in just one layer outperforms models that use large modules + across all layers, underscoring the gap between practical implementations of routing + in MoE models and theoretical optima for adaptive computation.' 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: alizadeh-vahid24a +month: 0 +tex_title: "{Duo-LLM}: A Framework for Studying Adaptive Computation in Large Language + Models" +firstpage: 443 +lastpage: 455 +page: 443-455 +order: 443 +cycles: false +bibtex_author: Alizadeh-Vahid, Keivan and Iman Mirzadeh, Seyed and Shahrkokhi, Hooman + and Belenko, Dmitry and Sun, Frank and Cho, Minsik and Hossein Sekhavat, Mohammad + and Nabi, Moin and Farajtabar, Mehrdad +author: +- given: Keivan + family: Alizadeh-Vahid +- given: Seyed + family: Iman Mirzadeh +- given: Hooman + family: Shahrkokhi +- given: Dmitry + family: Belenko +- given: Frank + family: Sun +- given: Minsik + family: Cho +- given: Mohammad + family: Hossein Sekhavat +- given: Moin + family: Nabi +- given: Mehrdad + family: Farajtabar +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/alizadeh-vahid24a/alizadeh-vahid24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-ardestani24a.md b/_posts/2024-12-10-ardestani24a.md new file mode 100644 index 0000000..da2da03 --- /dev/null +++ b/_posts/2024-12-10-ardestani24a.md @@ -0,0 +1,45 @@ +--- +title: Text Summarization With Graph Attention Networks +section: Applications +abstract: This study aimed to leverage graph information, particularly Rhetorical + Structure Theory (RST) and Co-reference (Coref) graphs, to enhance the performance + of our baseline summarization models. Specifically, we experimented with a Graph + Attention Network architecture to incorporate graph information. However, this architecture + did not enhance the performance. Subsequently, we used a simple Multi-layer Perceptron + architecture, which improved the results in our proposed model on our primary dataset, + CNN/DM. Additionally, we annotated XSum dataset with RST graph information, establishing + a benchmark for future graph-based summarization models. This secondary dataset + posed multiple challenges, revealing both the merits and limitations of our models. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: ardestani24a +month: 0 +tex_title: Text Summarization With Graph Attention Networks +firstpage: 540 +lastpage: 553 +page: 540-553 +order: 540 +cycles: false +bibtex_author: Ardestani, Mohammadreza and Chali, Yllias +author: +- given: Mohammadreza + family: Ardestani +- given: Yllias + family: Chali +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/ardestani24a/ardestani24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-ashkboos24a.md b/_posts/2024-12-10-ashkboos24a.md new file mode 100644 index 0000000..d1d7808 --- /dev/null +++ b/_posts/2024-12-10-ashkboos24a.md @@ -0,0 +1,58 @@ +--- +title: Computational Bottlenecks of Training Small-scale Large Language Models +section: Training +abstract: While large language models (LLMs) dominate the AI landscape, Small-scale + large Language Models (SLMs) are gaining attention due to cost and efficiency demands + from consumers. However, there is limited research on the training behavior and + computational requirements of SLMs. In this study, we explore the computational + bottlenecks of training SLMs (up to 2B parameters) by examining the effects of various + hyperparameters and configurations, including GPU type, batch size, model size, + communication protocol, attention type, and the number of GPUs. We assess these + factors on popular cloud services using metrics such as loss per dollar and tokens + per second. Our findings aim to support the broader adoption and optimization of + language model training for low-resource AI research institutes. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: ashkboos24a +month: 0 +tex_title: Computational Bottlenecks of Training Small-scale Large Language Models +firstpage: 14 +lastpage: 21 +page: 14-21 +order: 14 +cycles: false +bibtex_author: Ashkboos, Saleh and Iman Mirzadeh, Seyed and Alizadeh-Vahid, Keivan + and Hossein Sekhavat, Mohammad and Nabi, Moin and Farajtabar, Mehrdad and Faghri, + Fartash +author: +- given: Saleh + family: Ashkboos +- given: Seyed + family: Iman Mirzadeh +- given: Keivan + family: Alizadeh-Vahid +- given: Mohammad + family: Hossein Sekhavat +- given: Moin + family: Nabi +- given: Mehrdad + family: Farajtabar +- given: Fartash + family: Faghri +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/ashkboos24a/ashkboos24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-azimi24a.md b/_posts/2024-12-10-azimi24a.md new file mode 100644 index 0000000..536f590 --- /dev/null +++ b/_posts/2024-12-10-azimi24a.md @@ -0,0 +1,57 @@ +--- +title: 'KD-LoRA: A Hybrid Approach to Efficient Fine-Tuning with LoRA and Knowledge + Distillation' +section: Training +abstract: 'Large language models (LLMs) have demonstrated remarkable performance across + various downstream tasks. However, the high computational and memory requirements + of LLMs are a major bottleneck. To address this, parameter-efficient fine-tuning + (PEFT) methods such as low-rank adaptation (LoRA) have been proposed to reduce computational + costs while ensuring minimal loss in performance. Additionally, knowledge distillation + (KD) has been a popular choice for obtaining compact student models from teacher + models. In this work, we present KD-LoRA, a novel fine-tuning method that combines + LoRA with KD. Our results demonstrate that KD-LoRA achieves performance comparable + to full fine-tuning (FFT) and LoRA while significantly reducing resource requirements. + Specifically, KD-LoRA retains 98% of LoRA’s performance on the GLUE benchmark, while + being 40% more compact. Additionally, KD-LoRA reduces GPU memory usage by 30% compared + to LoRA, while decreasing inference time by 30% compared to both FFT and LoRA. We + evaluate KD-LoRA across three encoder-only models: BERT, RoBERTa, and DeBERTaV3. + Code is available at https://github.com/rambodazimi/KD-LoRA.' 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: azimi24a +month: 0 +tex_title: "{KD-LoRA}: A Hybrid Approach to Efficient Fine-Tuning with LoRA and Knowledge + Distillation" +firstpage: 73 +lastpage: 80 +page: 73-80 +order: 73 +cycles: false +bibtex_author: Azimi, Rambod and Rishav, Rishav and Teichmann, Marek and Ebrahimi + Kahou, Samira +author: +- given: Rambod + family: Azimi +- given: Rishav + family: Rishav +- given: Marek + family: Teichmann +- given: Samira + family: Ebrahimi Kahou +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/azimi24a/azimi24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-bhendawade24a.md b/_posts/2024-12-10-bhendawade24a.md new file mode 100644 index 0000000..de6a857 --- /dev/null +++ b/_posts/2024-12-10-bhendawade24a.md @@ -0,0 +1,64 @@ +--- +title: 'Speculative Streaming: Fast LLM Inference without Auxiliary Models' +section: Inference +abstract: Speculative decoding is a prominent technique to accelerate large language + model inference by leveraging predictions from an auxiliary draft model. While effective, + in application-specific settings, it often involves fine-tuning both draft and target + models to achieve high acceptance rates. As the number of downstream tasks grows, + draft models add significant complexity to inference systems. Recently several single + model architectures viz. Medusa have been proposed to speculate tokens in non-autoregressive + manner, however, their effectiveness is limited due to lack of dependency between + speculated tokens. We introduce a novel speculative decoding method that integrates + drafting within the target model by using Multi-stream attention and incorporates + future token planning into supervised fine-tuning objective. To the best of our + knowledge, it is the first parameter-efficient approach that scales well with number + of downstream tasks while improving downstream metrics. Speculative Streaming speeds + up decoding by 1.9 - 3X in a diverse set of tasks, such as Summarization, Structured + Queries, and Meaning Representation, while improving generation quality and using + 10000X fewer extra parameters than alternative architectures, making it ideal for + resource-constrained devices. Our approach can also be effectively deployed in lossless + settings for generic chatbot applications that do not necessitate fine-tuning. In + such setups, we achieve 2.9 - 3.2X speedup while maintaining the integrity of the + base model’s output. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: bhendawade24a +month: 0 +tex_title: 'Speculative Streaming: Fast {LLM} Inference without Auxiliary Models' +firstpage: 395 +lastpage: 413 +page: 395-413 +order: 395 +cycles: false +bibtex_author: Bhendawade, Nikhil and Belousova, Irina and Fu, Qichen and Mason, Henry + and Rastegari, Mohammad and Najibi, Mahyar +author: +- given: Nikhil + family: Bhendawade +- given: Irina + family: Belousova +- given: Qichen + family: Fu +- given: Henry + family: Mason +- given: Mohammad + family: Rastegari +- given: Mahyar + family: Najibi +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/bhendawade24a/bhendawade24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-chen24a.md b/_posts/2024-12-10-chen24a.md new file mode 100644 index 0000000..46c1f2a --- /dev/null +++ b/_posts/2024-12-10-chen24a.md @@ -0,0 +1,57 @@ +--- +title: 'OnlySportsLM: Optimizing Sports-Domain Language Models with SOTA Performance + under Billion Parameters' +section: Applications +abstract: 'This paper explores the potential of a small, domain-specific language + model trained exclusively on sports-related data. We investigate whether extensive + training data with specially designed small model structures can overcome model + size constraints. The study introduces the OnlySports collection, comprising OnlySportsLM, + OnlySports Dataset, and OnlySports Benchmark. Our approach involves: 1) creating + a massive 600 billion tokens OnlySports Dataset from FineWeb, 2) optimizing the + RWKV architecture for sports-related tasks, resulting in a 196M parameters model + with 20-layer, 640-dimension structure, 3) training the OnlySportsLM on part of + OnlySports Dataset, and 4) testing the resultant model on OnlySports Benchmark. + OnlySportsLM achieves a 37.62%/34.08% accuracy improvement over previous 135M/360M + state-of-the-art models and matches the performance of larger models such as SomlLM + 1.7B and Qwen 1.5B in the sports domain. Additionally, the OnlySports collection + presents a comprehensive workflow for building high-quality, domain-specific language + models, providing a replicable blueprint for efficient AI development across various + specialized fields.' 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: chen24a +month: 0 +tex_title: "{OnlySportsLM}: Optimizing Sports-Domain Language Models with {SOTA} Performance + under Billion Parameters" +firstpage: 596 +lastpage: 610 +page: 596-610 +order: 596 +cycles: false +bibtex_author: Chen, Zexin and Li, Chengxi and Xie, Xiangyu and Dube, Parijat +author: +- given: Zexin + family: Chen +- given: Chengxi + family: Li +- given: Xiangyu + family: Xie +- given: Parijat + family: Dube +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/chen24a/chen24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-chung24a.md b/_posts/2024-12-10-chung24a.md new file mode 100644 index 0000000..b5aabfc --- /dev/null +++ b/_posts/2024-12-10-chung24a.md @@ -0,0 +1,66 @@ +--- +title: 'Beyond Parameter Count: Implicit Bias in Soft Mixture of Experts' +section: Model Design \& Architecture +abstract: The traditional viewpoint on Sparse Mixture of Experts (MoE) models is that + instead of training a single large expert, which is computationally expensive, we + can train many small experts. The hope is that if the total parameter count of the + small experts equals that of the singular large expert, then we retain the representation + power of the large expert while gaining computational tractability and promoting + expert specialization. The recently introduced Soft MoE replaces the Sparse MoE’s + discrete routing mechanism with a differentiable gating function that smoothly mixes + tokens. While this smooth gating function successfully mitigates the various training + instabilities associated with Sparse MoE, it is unclear whether it induces implicit + biases that affect Soft MoE’s representation power or potential for expert specialization. + We prove that Soft MoE with a single arbitrarily powerful expert cannot represent + simple convex functions. This justifies that Soft MoE’s success cannot be explained + by the traditional viewpoint of many small experts collectively mimicking the representation + power of a single large expert, and that multiple experts are actually necessary + to achieve good representation power (even for a fixed total parameter count). Continuing + along this line of investigation, we introduce a notion of expert specialization + for Soft MoE, and while varying the number of experts yet fixing the total parameter + count, we consider the following (computationally intractable) task. Given any input, + how can we discover the expert subset that is specialized to predict this input’s + label? We empirically show that when there are many small experts, the architecture + is implicitly biased in a fashion that allows us to efficiently approximate the + specialized expert subset. Our method can be easily implemented to potentially reduce + computation during inference. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: chung24a +month: 0 +tex_title: 'Beyond Parameter Count: Implicit Bias in Soft Mixture of Experts' +firstpage: 145 +lastpage: 164 +page: 145-164 +order: 145 +cycles: false +bibtex_author: Chung, Youngseog and Malik, Dhruv and Schneider, Jeff and Li, Yuanzhi + and Singh, Aarti +author: +- given: Youngseog + family: Chung +- given: Dhruv + family: Malik +- given: Jeff + family: Schneider +- given: Yuanzhi + family: Li +- given: Aarti + family: Singh +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/chung24a/chung24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-d-chaparala24a.md b/_posts/2024-12-10-d-chaparala24a.md new file mode 100644 index 0000000..095f860 --- /dev/null +++ b/_posts/2024-12-10-d-chaparala24a.md @@ -0,0 +1,54 @@ +--- +title: 'Mai Ho‘omāuna i ka ‘Ai: Language Models Improve Automatic Speech Recognition + in Hawaiian ' +section: Applications +abstract: In this paper we address the challenge of improving Automatic Speech Recognition + (ASR) for a low-resource language, Hawaiian, by incorporating large amounts of independent + text data into an ASR foundation model, Whisper. To do this, we train an external + language model (LM) on ∼1.5M words of Hawaiian text. We then use the LM to rescore + Whisper and compute word error rates (WERs) on a manually curated test set of labeled + Hawaiian data. As a baseline, we use Whisper without an external LM. Experimental + results reveal a small but significant improvement in WER when ASR outputs are rescored + with a Hawaiian LM. The results support leveraging all available data in the development + of ASR systems for underrepresented languages. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: d-chaparala24a +month: 0 +tex_title: "{Mai Ho‘omāuna i ka ‘Ai}: Language Models Improve Automatic Speech Recognition + in Hawaiian " +firstpage: 576 +lastpage: 583 +page: 576-583 +order: 576 +cycles: false +bibtex_author: D Chaparala, Kaavya and Zarrella, Guido and Torres Fischer, Bruce and + Kimura, Larry and Parker Jones, Oiwi +author: +- given: Kaavya + family: D Chaparala +- given: Guido + family: Zarrella +- given: Bruce + family: Torres Fischer +- given: Larry + family: Kimura +- given: Oiwi + family: Parker Jones +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/d-chaparala24a/d-chaparala24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-doubov24a.md b/_posts/2024-12-10-doubov24a.md new file mode 100644 index 0000000..3b11be4 --- /dev/null +++ b/_posts/2024-12-10-doubov24a.md @@ -0,0 +1,50 @@ +--- +title: 'Sparse Upcycling: Inference Inefficient Finetuning' +section: Model Design \& Architecture +abstract: Small, highly trained, open-source LLMs are widely used due to their inference + efficiency, but further improving their quality remains a challenge. Sparse upcycling + is a promising approach that transforms a pretrained dense model into a Mixture-of-Experts + (MoE) architecture, increasing the model’s parameter count and potential quality. + In this work, we compare the effectiveness of sparse upcycling against continued + pretraining (CPT) across different model sizes, FLOP budgets, and pretraining durations. + Our experiments show that sparse upcycling can achieve better quality, with improvements + of over 20% relative to CPT in certain scenarios. However, this comes with a significant + inference cost, leading to 40% slowdowns in high-demand inference settings for larger + models. These results highlight the trade-off between model quality and inference + efficiency, offering insights for practitioners seeking to balance performance with + practical deployment costs. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: doubov24a +month: 0 +tex_title: 'Sparse Upcycling: Inference Inefficient Finetuning' +firstpage: 194 +lastpage: 205 +page: 194-205 +order: 194 +cycles: false +bibtex_author: Doubov, Sasha and Sardana, Nikhil and Chiley, Vitaliy +author: +- given: Sasha + family: Doubov +- given: Nikhil + family: Sardana +- given: Vitaliy + family: Chiley +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/doubov24a/doubov24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-f-thielmann24a.md b/_posts/2024-12-10-f-thielmann24a.md new file mode 100644 index 0000000..3e5f557 --- /dev/null +++ b/_posts/2024-12-10-f-thielmann24a.md @@ -0,0 +1,45 @@ +--- +title: On the Efficiency of NLP-Inspired Methods for Tabular Deep Learning +section: Benchmark \& Evaluation +abstract: Recent advancements in tabular deep learning (DL) have led to substantial + performance improvements, surpassing the capabilities of traditional models. With + the adoption of techniques from natural language processing (NLP), such as language + model-based approaches, DL models for tabular data have also grown in complexity + and size. Although tabular datasets do not typically pose scalability issues, the + escalating size of these models has raised efficiency concerns. Despite its importance, + efficiency has been relatively underexplored in tabular DL research. This paper + critically examines the latest innovations in tabular DL, with a dual focus on performance + and computational efficiency. The source code is available at https://github.com/basf/mamba-tabular. +layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: f-thielmann24a +month: 0 +tex_title: On the Efficiency of {NLP}-Inspired Methods for Tabular Deep Learning +firstpage: 532 +lastpage: 539 +page: 532-539 +order: 532 +cycles: false +bibtex_author: F Thielmann, Anton and Samiee, Soheila +author: +- given: Anton + family: F Thielmann +- given: Soheila + family: Samiee +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/f-thielmann24a/f-thielmann24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-fathan24a.md b/_posts/2024-12-10-fathan24a.md new file mode 100644 index 0000000..e721481 --- /dev/null +++ b/_posts/2024-12-10-fathan24a.md @@ -0,0 +1,54 @@ +--- +title: Enhanced label noise robustness through early adaptive filtering for the self-supervised + speaker verification task +section: Applications +abstract: Using clustering-driven annotations to train a neural network can be a tricky + task because of label noise. 
In this paper, we propose a dynamic and adaptive label + noise filtering method, called AdaptiveDrop, which combines label noise cleansing + and correction simultaneously in cascade to leverage their advantages. Contrary to + other label noise filtering approaches, our method filters noisy samples on the + fly from an early stage of training. We also provide a variant that incorporates + sub-centers per class for enhanced robustness to label noise by continuously + tracking the dominant sub-centers via a dictionary table. AdaptiveDrop is a simple, + general-purpose method that is performed end-to-end in only one stage of training, can + be integrated with any loss function, and does not require training from scratch + on the cleansed dataset. We show through extensive ablation studies for the self-supervised + speaker verification task that our method is effective, benefits from long epochs + of iterative filtering, and provides consistent performance gains across various + loss functions and real-world pseudo-labels. +layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: fathan24a +month: 0 +tex_title: Enhanced label noise robustness through early adaptive filtering for the + self-supervised speaker verification task +firstpage: 564 +lastpage: 575 +page: 564-575 +order: 564 +cycles: false +bibtex_author: Fathan, Abderrahim and Zhu, Xiaolin and Alam, Jahangir +author: +- given: Abderrahim + family: Fathan +- given: Xiaolin + family: Zhu +- given: Jahangir + family: Alam +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/fathan24a/fathan24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-g-lawton24a.md b/_posts/2024-12-10-g-lawton24a.md new file mode 100644 index 0000000..8e59ddd --- /dev/null +++ b/_posts/2024-12-10-g-lawton24a.md @@ -0,0 +1,62 @@ +--- +title: 'QuAILoRA: Quantization-Aware Initialization for LoRA' +section: Training +abstract: QLoRA reduces the memory-cost of fine-tuning a large language model (LLM) + with LoRA by quantizing the base LLM. However, quantization introduces quantization + errors that negatively impact model performance after fine-tuning. In this paper, + we introduce QuAILoRA, a quantization-aware initialization for LoRA that mitigates + this negative impact by decreasing quantization errors at initialization. Our method + incurs a small amount of computational overhead to compute this quantization-aware + initialization, without increasing the memory-cost of fine-tuning. We evaluate our + method on several causal language modeling and downstream evaluation tasks using + several different model sizes and families. We observe that almost all LLMs fine-tuned + with QuAILoRA achieve better validation perplexity. When evaluated on downstream + tasks, we find that QuAILoRA yields improvements proportional to the negative effect + of quantization error. On average, applying QuAILoRA to 4-bit QLoRA models yields + 75% of the validation perplexity decrease and 86% of the downstream task accuracy + increase obtained by doubling the quantization precision to 8-bit, without increasing GPU + memory utilization during fine-tuning.
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: g-lawton24a +month: 0 +tex_title: "{QuAILoRA}: Quantization-Aware Initialization for {LoRA}" +firstpage: 22 +lastpage: 33 +page: 22-33 +order: 22 +cycles: false +bibtex_author: G Lawton, Neal and Padmakumar, Aishwarya and Gaspers, Judith and FitzGerald, + Jack and Kumar, Anoop and Ver Steeg, Greg and Galstyan, Aram +author: +- given: Neal + family: G Lawton +- given: Aishwarya + family: Padmakumar +- given: Judith + family: Gaspers +- given: Jack + family: FitzGerald +- given: Anoop + family: Kumar +- given: Greg + family: Ver Steeg +- given: Aram + family: Galstyan +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/g-lawton24a/g-lawton24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-g-vasudev24a.md b/_posts/2024-12-10-g-vasudev24a.md new file mode 100644 index 0000000..ea2a089 --- /dev/null +++ b/_posts/2024-12-10-g-vasudev24a.md @@ -0,0 +1,49 @@ +--- +title: 'The EarlyBird Gets the WORM: Heuristically Accelerating EarlyBird Convergence' +section: Model Efficiency \& Compression +abstract: The Lottery Ticket hypothesis proposes that ideal, sparse subnetworks, called + lottery tickets, exist in untrained dense neural networks. The Early Bird hypothesis + proposes an efficient algorithm to find these winning lottery tickets in convolutional + neural networks, using the novel concept of distance between subnetworks to detect + convergence in the subnetworks of a model. However, this approach overlooks unchanging + groups of unimportant neurons near the search’s end. We propose WORM, a method that + exploits these static groups by truncating their gradients, forcing the model to + rely on other neurons. Experiments show WORM achieves faster ticket identification + during training on convolutional neural networks, despite the additional computational + overhead, when compared to EarlyBird Search. Additionally, WORM-pruned models lose + less accuracy during pruning and recover accuracy faster, improving the robustness + of a given model. Furthermore, WORM is also able to generalize the Early Bird hypothesis + reasonably well to larger models, such as transformers, displaying its flexibility + to adapt to more complex architectures. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: g-vasudev24a +month: 0 +tex_title: 'The {EarlyBird} Gets the {WORM}: Heuristically Accelerating {EarlyBird} + Convergence' +firstpage: 232 +lastpage: 240 +page: 232-240 +order: 232 +cycles: false +bibtex_author: G Vasudev, Adithya +author: +- given: Adithya + family: G Vasudev +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/g-vasudev24a/g-vasudev24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-hajimolahoseini24a.md b/_posts/2024-12-10-hajimolahoseini24a.md new file mode 100644 index 0000000..7851e3b --- /dev/null +++ b/_posts/2024-12-10-hajimolahoseini24a.md @@ -0,0 +1,54 @@ +--- +title: Is 3D Convolution with 5D Tensors Really Necessary for Video Analysis? +section: Model Design \& Architecture +abstract: In this paper, we present a comprehensive study and propose several novel + techniques for implementing 3D convolutional blocks using 2D and/or 1D convolutions + with only 4D and/or 3D tensors. Our motivation is that 3D convolutions with 5D tensors + are computationally very expensive and they may not be supported by some of the + edge devices used in real-time applications such as robots. The existing approaches + mitigate this by splitting the 3D kernels into spatial and temporal domains, but + they still use 3D convolutions with 5D tensors in their implementations. We resolve + this issue by introducing some appropriate 4D/3D tensor reshaping as well as new + combination techniques for spatial and temporal splits. The proposed implementation + methods show significant improvement both in terms of efficiency and accuracy. The + experimental results confirm that the proposed spatio-temporal processing structure + outperforms the original model in terms of speed and accuracy using only 4D tensors + with fewer parameters. +layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: hajimolahoseini24a +month: 0 +tex_title: Is {3D} Convolution with {5D} Tensors Really Necessary for Video Analysis? 
+firstpage: 136 +lastpage: 144 +page: 136-144 +order: 136 +cycles: false +bibtex_author: Hajimolahoseini, Habib and Ahmed, Walid and Wen, Shuangyue and Liu, + Yang +author: +- given: Habib + family: Hajimolahoseini +- given: Walid + family: Ahmed +- given: Shuangyue + family: Wen +- given: Yang + family: Liu +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/hajimolahoseini24a/hajimolahoseini24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-hajimolahoseini24b.md b/_posts/2024-12-10-hajimolahoseini24b.md new file mode 100644 index 0000000..86a38a3 --- /dev/null +++ b/_posts/2024-12-10-hajimolahoseini24b.md @@ -0,0 +1,51 @@ +--- +title: Accelerating the Low-Rank Decomposed Models +section: Model Efficiency \& Compression +abstract: 'Tensor decomposition is a mathematically supported technique for data compression. + It consists of applying a low-rank decomposition technique to the tensors + or matrices in order to reduce the redundancy of the data. However, it is not a + popular technique for compressing AI models due to the high number of new layers + added to the architecture after decomposition. Although the number of parameters + could shrink significantly, it could result in the model being more than twice as deep, + which could add latency to training or inference. In this paper, we present + a comprehensive study of how to modify the low-rank decomposition technique in AI + models so that we can benefit from both high accuracy and low memory consumption + while also speeding up training and inference. ' +layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: hajimolahoseini24b +month: 0 +tex_title: Accelerating the Low-Rank Decomposed Models +firstpage: 222 +lastpage: 231 +page: 222-231 +order: 222 +cycles: false +bibtex_author: Hajimolahoseini, Habib and Ahmed, Walid and Wen, Shuangyue and Liu, + Yang +author: +- given: Habib + family: Hajimolahoseini +- given: Walid + family: Ahmed +- given: Shuangyue + family: Wen +- given: Yang + family: Liu +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/hajimolahoseini24b/hajimolahoseini24b.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-kang24a.md b/_posts/2024-12-10-kang24a.md new file mode 100644 index 0000000..5dd0b32 --- /dev/null +++ b/_posts/2024-12-10-kang24a.md @@ -0,0 +1,69 @@ +--- +title: 'GEAR: An Efficient Error Reduction Framework for KV Cache Compression in LLM + Inference' +section: Inference +abstract: 'Key-value (KV) caching has become the de facto technique to accelerate + generation speed for large language model (LLM) inference. However, the growing + cache demand with increasing sequence length has transformed LLM inference into + a memory-bound problem, significantly constraining the system throughput.
Existing + methods rely on dropping unimportant tokens or quantizing entries group-wise. Such + methods, however, often incur high approximation errors to represent the compressed + matrices. The autoregressive decoding process further compounds the error of each + step, resulting in critical deviation in model generation and deterioration of performance. + To tackle this challenge, we propose GEAR, an efficient error reduction framework + that augments a quantization scheme with two error reduction components and achieves + near-lossless performance at high compression ratios. GEAR first quantizes + the majority of entries of similar magnitudes to ultra-low precision. It then employs + a low-rank matrix to approximate the quantization error, and a sparse matrix to + remedy individual errors from outlier entries. By adeptly integrating these three techniques, + GEAR is able to fully exploit their synergistic potential. Our experiments show + that GEAR can maintain similar accuracy to that of the FP16 cache with an improvement of up + to 24.42% over the SOTA baselines at 2-bit compression. Additionally, compared to + LLM inference with an FP16 KV cache, GEAR can reduce peak memory by up to $2.39\times$, + bringing $2.1\times\sim 5.07\times$ throughput improvement. Our code will be publicly + available. ' +layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: kang24a +month: 0 +tex_title: "{GEAR}: An Efficient Error Reduction Framework for {KV} Cache Compression + in {LLM} Inference" +firstpage: 305 +lastpage: 321 +page: 305-321 +order: 305 +cycles: false +bibtex_author: Kang, Hao and Zhang, Qingru and Kundu, Souvik and Jeong, Geonhwa and + Liu, Zaoxing and Krishna, Tushar and Zhao, Tuo +author: +- given: Hao + family: Kang +- given: Qingru + family: Zhang +- given: Souvik + family: Kundu +- given: Geonhwa + family: Jeong +- given: Zaoxing + family: Liu +- given: Tushar + family: Krishna +- given: Tuo + family: Zhao +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/kang24a/kang24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-khera24a.md b/_posts/2024-12-10-khera24a.md new file mode 100644 index 0000000..0c934f7 --- /dev/null +++ b/_posts/2024-12-10-khera24a.md @@ -0,0 +1,55 @@ +--- +title: Efficient Alignment of Large Language Models via Data Sampling +section: Training +abstract: Despite the capabilities of Large Language Models (LLMs), the output is + not always safe or desirable. Aligning the models to human values is a critical + step for the safe adoption of these models. Aligning LLMs employs huge amounts of + data, computation, and time. Moreover, curating data with human feedback is expensive + and takes time. Recent research demonstrates the benefit of data engineering in the fine-tuning + and pre-training paradigms to bring down such costs. However, alignment differs + from the aforementioned paradigms, and it is unclear if data-efficient alignment + is feasible. In this work, we first aim to understand how the performance of LLM + alignment scales with data.
We find that LLM alignment performance follows an + exponential plateau pattern which tapers off after a rapid initial increase. We identify + data subsampling as a viable method to reduce resources required for alignment. + Further, we propose a methodology for efficient alignment by identifying a small, + high-quality subset, thereby reducing the computation and time required for alignment. + We evaluate the proposed methodology over multiple datasets and compare the results. + We find that the model aligned using our proposed methodology outperforms other + sampling methods and performs comparably to the model aligned with the full dataset + while using a fraction of the resources. +layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: khera24a +month: 0 +tex_title: Efficient Alignment of Large Language Models via Data Sampling +firstpage: 55 +lastpage: 72 +page: 55-72 +order: 55 +cycles: false +bibtex_author: Khera, Amrit and Ghosh, Rajat and Dutta, Debojyoti +author: +- given: Amrit + family: Khera +- given: Rajat + family: Ghosh +- given: Debojyoti + family: Dutta +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/khera24a/khera24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-kimhi24a.md b/_posts/2024-12-10-kimhi24a.md new file mode 100644 index 0000000..0302255 --- /dev/null +++ b/_posts/2024-12-10-kimhi24a.md @@ -0,0 +1,54 @@ +--- +title: Hysteresis Activation Function for Efficient Inference +section: Inference +abstract: The widely used ReLU is favored for its hardware efficiency yet suffers + from issues such as the “dying ReLU” problem, where during training, neurons fail + to activate and constantly remain at zero, as highlighted by Lu et al. \citep{lu2018collapse}. + Traditional approaches to mitigate this issue often introduce more complex and less + hardware-friendly activation functions. In this work, we propose a Hysteresis Rectified + Linear Unit (HeLU), an efficient activation function designed to address the “dying + ReLU” problem with minimal complexity. Unlike traditional activation functions with + fixed thresholds for training and inference, HeLU employs a variable threshold that + refines the backpropagation. This refined mechanism allows simpler activation functions + to achieve competitive performance comparable to their more complex counterparts + without introducing unnecessary complexity or requiring inductive biases. Empirical + evaluations demonstrate that HeLU enhances model generalization across diverse datasets, + offering a promising solution for efficient and effective inference suitable for + a wide range of neural network architectures.
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: kimhi24a +month: 0 +tex_title: Hysteresis Activation Function for Efficient Inference +firstpage: 414 +lastpage: 422 +page: 414-422 +order: 414 +cycles: false +bibtex_author: Kimhi, Moshe and Kashani, Idan and Baskin, Chaim and Mendelson, Avi +author: +- given: Moshe + family: Kimhi +- given: Idan + family: Kashani +- given: Chaim + family: Baskin +- given: Avi + family: Mendelson +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/kimhi24a/kimhi24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-kumar24a.md b/_posts/2024-12-10-kumar24a.md new file mode 100644 index 0000000..140f31d --- /dev/null +++ b/_posts/2024-12-10-kumar24a.md @@ -0,0 +1,51 @@ +--- +title: Residual vector quantization for KV cache compression in large language model +section: Inference +abstract: 'KV cache compression methods have mainly relied on scalar quantization + techniques to reduce the memory requirements during decoding. In this work, we apply + residual vector quantization, which has been widely used for high-fidelity audio + compression, to compress the KV cache in large language models (LLMs). We adapt the standard + recipe with minimal changes to compress the output of any key or value projection + matrix in a pretrained LLM: we scale the vector by its standard deviation, divide + channels into groups and then quantize each group with the same residual vector + quantizer. We learn the codebook using an exponential moving average, and there are + no other learnable parameters, including the input and output projections normally + used in a vector quantization setup. We find that a residual depth of 8 recovers + most of the performance of the unquantized model. We also find that grouping non-contiguous + channels together works better than grouping contiguous channels for compressing the + key matrix, and the method further benefits from lightweight fine-tuning of the LLM + together with the quantization. Overall, the proposed technique is competitive with + existing quantization methods while being much simpler and results in 5.5x compression + compared to half precision.' 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: kumar24a +month: 0 +tex_title: Residual vector quantization for {KV} cache compression in large language + model +firstpage: 485 +lastpage: 490 +page: 485-490 +order: 485 +cycles: false +bibtex_author: Kumar, Ankur +author: +- given: Ankur + family: Kumar +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/kumar24a/kumar24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-liu24a.md b/_posts/2024-12-10-liu24a.md new file mode 100644 index 0000000..4393a25 --- /dev/null +++ b/_posts/2024-12-10-liu24a.md @@ -0,0 +1,63 @@ +--- +title: 'MisD-MoE: A Multimodal Misinformation Detection Framework with Adaptive Feature + Selection' +section: Model Design \& Architecture +abstract: The rapid growth of social media has led to the widespread dissemination + of misinformation across multiple content forms, including text, images, audio, + and video. Compared to unimodal misinformation detection, multimodal misinformation + detection benefits from the increased availability of information across multiple + modalities. However, these additional features may introduce redundancy, where overlapping + or irrelevant information is included, potentially disrupting the feature space + and consequently impairing the model’s performance. To address the issue, we propose + a novel framework, Misinformation Detection Mixture of Experts (MisD-MoE), which + employs distinct expert models for each modality and incorporates an adaptive feature + selection mechanism using top-k gating and Gumbel-Sigmoid. This approach dynamically + filters relevant features, reducing redundancy and improving detection accuracy. + Extensive experiments on the FakeSV and FVC-2018 datasets demonstrate that MisD-MoE + significantly outperforms state-of-the-art methods, with accuracy improvements of + 3.45% and 3.71% on the respective datasets compared to baseline models. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: liu24a +month: 0 +tex_title: "{MisD-MoE}: A Multimodal Misinformation Detection Framework with Adaptive + Feature Selection" +firstpage: 114 +lastpage: 122 +page: 114-122 +order: 114 +cycles: false +bibtex_author: Liu, Moyang and Yan, Kaiying and Liu, Yukun and Fu, Ruibo and Wen, + Zhengqi and Liu, Xuefei and Li, Chenxing +author: +- given: Moyang + family: Liu +- given: Kaiying + family: Yan +- given: Yukun + family: Liu +- given: Ruibo + family: Fu +- given: Zhengqi + family: Wen +- given: Xuefei + family: Liu +- given: Chenxing + family: Li +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/liu24a/liu24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-lu24a.md b/_posts/2024-12-10-lu24a.md new file mode 100644 index 0000000..aaf8140 --- /dev/null +++ b/_posts/2024-12-10-lu24a.md @@ -0,0 +1,60 @@ +--- +title: Improving Multi-candidate Speculative Decoding +section: Inference +abstract: Speculative Decoding (SD) is a technique to accelerate the inference of + Large Language Models (LLMs) by using a lower complexity draft model to propose + candidate tokens verified by a larger target model. To further improve efficiency, + Multi-Candidate Speculative Decoding (MCSD) improves upon this by sampling multiple + candidate tokens from the draft model at each step and verifying them in parallel, + thus increasing the chances of accepting a token and reducing generation time. Existing + MCSD methods rely on the draft model to initialize the multi-candidate sequences + and use static length and tree attention structure for draft generation. However, + such an approach suffers from the draft and target model’s output distribution differences, + especially in a dynamic generation context. In this work, we introduce a new version + of MCSD that includes a target model initialized multi-candidate generation, a dynamic + sliced topology-aware causal mask for dynamic length adjustment, and decision models + to optimize early stopping. We experimented with our method on Llama 2-7B and its + variants and observed a maximum 27.5% speedup compared to our MCSD baseline across + three benchmarks with Llama 2-7B as the target model and JackFram 68M as the draft + model. Additionally, we evaluate the effects of using the target model initialized + multi-candidate process with different draft models on output quality. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: lu24a +month: 0 +tex_title: Improving Multi-candidate Speculative Decoding +firstpage: 382 +lastpage: 394 +page: 382-394 +order: 382 +cycles: false +bibtex_author: Lu, XiaoFan and Zeng, Yixiao and Levorato, Marco and Ma, FeiYang and + Yu, ZiXu +author: +- given: XiaoFan + family: Lu +- given: Yixiao + family: Zeng +- given: Marco + family: Levorato +- given: FeiYang + family: Ma +- given: ZiXu + family: Yu +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/lu24a/lu24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-mamou24a.md b/_posts/2024-12-10-mamou24a.md new file mode 100644 index 0000000..56f1fdb --- /dev/null +++ b/_posts/2024-12-10-mamou24a.md @@ -0,0 +1,57 @@ +--- +title: Dynamic Speculation Lookahead Accelerates Speculative Decoding of Large Language + Models +section: Inference +abstract: Speculative decoding is commonly used for reducing the inference latency + of large language models. Its effectiveness depends highly on the speculation lookahead + (SL)-the number of tokens generated by the draft model at each iteration. In this + work we show that the common practice of using the same SL for all iterations (static + SL) is suboptimal. We introduce DISCO (DynamIc SpeCulation lookahead Optimization), + a novel method for dynamically selecting the SL. Our experiments with four datasets + show that DISCO reaches an average speedup of 10% compared to the best static SL + baseline, while generating the exact same text. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: mamou24a +month: 0 +tex_title: Dynamic Speculation Lookahead Accelerates Speculative Decoding of Large + Language Models +firstpage: 456 +lastpage: 467 +page: 456-467 +order: 456 +cycles: false +bibtex_author: Mamou, Jonathan and Pereg, Oren and Korat, Daniel and Berchansky, Moshe + and Timor, Nadav and Wasserblat, Moshe and Schwartz, Roy +author: +- given: Jonathan + family: Mamou +- given: Oren + family: Pereg +- given: Daniel + family: Korat +- given: Moshe + family: Berchansky +- given: Nadav + family: Timor +- given: Moshe + family: Wasserblat +- given: Roy + family: Schwartz +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/mamou24a/mamou24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-panda24a.md b/_posts/2024-12-10-panda24a.md new file mode 100644 index 0000000..e820632 --- /dev/null +++ b/_posts/2024-12-10-panda24a.md @@ -0,0 +1,59 @@ +--- +title: Dense Backpropagation Improves Routing for Sparsely-Gated Mixture-of-Experts +section: Model Design \& Architecture +abstract: Sparsely-gated Mixture-of-Experts (MoEs) such as Gemini have proven to be + more efficient than dense Transformers because they can dynamically activate a subset + of their overall parameters by \emph{routing} tokens to selected “experts”, allowing + practitioners to scale up model parameter counts without significantly increasing + total compute. However, current MoE training approaches only update the router with + a sparse gradient and suffer from issues such as load imbalance. We propose a new + router that can receive a dense gradient update from a sparse forward pass. Our + method adds minimal overhead, but improves on the common Top-K routing in both performance + and load balance. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: panda24a +month: 0 +tex_title: Dense Backpropagation Improves Routing for Sparsely-Gated Mixture-of-Experts +firstpage: 81 +lastpage: 101 +page: 81-101 +order: 81 +cycles: false +bibtex_author: Panda, Ashwinee and Baherwani, Vatsal and Sarwar, Zain and Therien, + Benjamin and Sahu, Sambit and Rawls, Stephen and Chakraborty, Supriyo and Goldstein, + Tom +author: +- given: Ashwinee + family: Panda +- given: Vatsal + family: Baherwani +- given: Zain + family: Sarwar +- given: Benjamin + family: Therien +- given: Sambit + family: Sahu +- given: Stephen + family: Rawls +- given: Supriyo + family: Chakraborty +- given: Tom + family: Goldstein +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/panda24a/panda24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-pieler24a.md b/_posts/2024-12-10-pieler24a.md new file mode 100644 index 0000000..2917e99 --- /dev/null +++ b/_posts/2024-12-10-pieler24a.md @@ -0,0 +1,79 @@ +--- +title: Rephrasing natural text data with different languages and quality levels for + Large Language Model pre-training +section: Benchmark \& Evaluation +abstract: Recently published work on rephrasing natural text data for pre-training + LLMs has shown promising results when combining the original dataset with the synthetically + rephrased data. We build upon previous work by replicating existing results on C4 + and extending them with our optimized rephrasing pipeline to the English, German, + Italian, and Spanish Oscar subsets of CulturaX. Our pipeline leads to increased + performance on standard evaluation benchmarks in both the mono- and multilingual + setup. In addition, we provide a detailed study of our pipeline, investigating the + choice of the base dataset and LLM for the rephrasing, as well as the relationship + between the model size and the performance after pre-training. By exploring data + with different perceived quality levels, we show that gains decrease with higher + quality. Furthermore, we find the difference in performance between model families + to be bigger than between different model sizes. This highlights the necessity for + detailed tests before choosing an LLM to rephrase large amounts of data. Moreover, + we investigate the effect of pre-training with synthetic data on supervised fine-tuning. + Here, we find increasing but inconclusive results that highly depend on the used + benchmark. These results (again) highlight the need for better benchmarking setups. + In summary, we show that rephrasing multilingual and low-quality data is a very + promising direction to extend LLM pre-training data. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: pieler24a +month: 0 +tex_title: Rephrasing natural text data with different languages and quality levels + for Large Language Model pre-training +firstpage: 491 +lastpage: 511 +page: 491-511 +order: 491 +cycles: false +bibtex_author: Pieler, Michael and Bellagente, Marco and Teufel, Hannah and Phung, + Duy and Cooper, Nathan and Tow, Jonathan and Rocha, Paulo and Adithyan, Reshinth + and Alyafeai, Zaid and Pinnaparaju, Nikhil and Zhuravinskyi, Maksym and Riquelme, + Carlos +author: +- given: Michael + family: Pieler +- given: Marco + family: Bellagente +- given: Hannah + family: Teufel +- given: Duy + family: Phung +- given: Nathan + family: Cooper +- given: Jonathan + family: Tow +- given: Paulo + family: Rocha +- given: Reshinth + family: Adithyan +- given: Zaid + family: Alyafeai +- given: Nikhil + family: Pinnaparaju +- given: Maksym + family: Zhuravinskyi +- given: Carlos + family: Riquelme +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/pieler24a/pieler24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-praveen-rajasekhar24a.md b/_posts/2024-12-10-praveen-rajasekhar24a.md new file mode 100644 index 0000000..b440c01 --- /dev/null +++ b/_posts/2024-12-10-praveen-rajasekhar24a.md @@ -0,0 +1,55 @@ +--- +title: 'Less is Enough: Adapting Pre-trained Vision Transformers for Audio-Visual + Speaker Verification' +section: Applications +abstract: Speaker Verification has achieved significant improvement in performance + using sophisticated deep learning architectures, specialized for speech signals + as well as robust loss functions. Recently, the fusion of faces and voices received + a lot of attention as they offer complementary relationship with each other, which + has the potential to outperform systems with only speech signals. Inspired by the + massive success of Vision Transformers (ViTs) in computer vision, ViTs have also + been explored for multimodal learning. In this work, we have investigated the potential + of ViTs, pre-trained on visual data, for audio-visual speaker verification. To cope + with the challenges of large-scale training, we introduce the Latent Audio-Visual + Vision Transformer (LAVViT) adapters, where we exploit the existing pre-trained + models on visual data by training only the parameters of LAVViT adapters, without + fine-tuning the original parameters of the pre-trained models. The LAVViT adapters + are injected into every layer of the ViT architecture to effectively fuse the audio + and visual modalities using a small set of latent tokens, thereby avoiding the quadratic + computational cost of cross-attention across the modalities. The proposed approach + has been evaluated on the Voxceleb1 dataset and shows promising performance using + only a few trainable parameters. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: praveen-rajasekhar24a +month: 0 +tex_title: 'Less is Enough: Adapting Pre-trained Vision Transformers for Audio-Visual + Speaker Verification' +firstpage: 554 +lastpage: 563 +page: 554-563 +order: 554 +cycles: false +bibtex_author: Praveen Rajasekhar, Gnana and Alam, Jahangir +author: +- given: Gnana + family: Praveen Rajasekhar +- given: Jahangir + family: Alam +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/praveen-rajasekhar24a/praveen-rajasekhar24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-qiao24a.md b/_posts/2024-12-10-qiao24a.md new file mode 100644 index 0000000..82984d3 --- /dev/null +++ b/_posts/2024-12-10-qiao24a.md @@ -0,0 +1,69 @@ +--- +title: 'VL-Mamba: Exploring State Space Models for Multimodal Learning' +section: Model Design \& Architecture +abstract: Multimodal large language models (MLLMs) have gained considerable attention + due to their ability to integrate visual and textual information, enhancing understanding + and providing context for complex tasks. While Transformer-based architectures have + been the dominant framework for MLLMs, recent studies suggest that state space models + (SSMs) like Mamba can achieve competitive or even superior performance. However, + no prior research has investigated the potential of SSMs to replace Transformers + in multimodal tasks, which are inherently more challenging due to the heterogeneity + of visual and language data and the complexities of aligning these modalities. In + this paper, we introduce VL-Mamba, the first study to explore the application of + state space models in multimodal learning tasks. VL-Mamba leverages a pretrained + Mamba language model as its core, and we propose a novel MultiModal Connector (MMC) + that incorporates a Vision Selective Scan (VSS) module to improve visual sequence + modeling. We empirically explore how to effectively apply the 2D vision selective + scan mechanism for multimodal learning and the combinations of different vision + encoders and variants of pretrained Mamba language models. Our experiments across + multiple multimodal benchmarks demonstrate that VL-Mamba achieves competitive performance + against small MLLMs of similar size, and in some cases, surpasses larger models + such as the 7B and 13B versions of LLaVA-1.5. These results suggest that state space + models have the potential to serve as an alternative to Transformers in multimodal + learning tasks. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: qiao24a +month: 0 +tex_title: "{VL-Mamba}: Exploring State Space Models for Multimodal Learning" +firstpage: 102 +lastpage: 113 +page: 102-113 +order: 102 +cycles: false +bibtex_author: Qiao, Yanyuan and Yu, Zheng and Zhao, Zijia and Chen, Sihan and Sun, + Mingzhen and Guo, Longteng and Wu, Qi and Liu, Jing +author: +- given: Yanyuan + family: Qiao +- given: Zheng + family: Yu +- given: Zijia + family: Zhao +- given: Sihan + family: Chen +- given: Mingzhen + family: Sun +- given: Longteng + family: Guo +- given: Qi + family: Wu +- given: Jing + family: Liu +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/qiao24a/qiao24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-rajabzadeh24a.md b/_posts/2024-12-10-rajabzadeh24a.md new file mode 100644 index 0000000..317d20d --- /dev/null +++ b/_posts/2024-12-10-rajabzadeh24a.md @@ -0,0 +1,68 @@ +--- +title: 'EchoAtt: Attend, Copy, then Adjust for More Efficient Large Language Models' +section: Model Efficiency \& Compression +abstract: Large Language Models (LLMs), with their increasing depth and number of + parameters, have demonstrated outstanding performance across a variety of natural + language processing tasks. However, this growth in scale leads to increased computational + demands, particularly during inference and fine-tuning. To address these challenges, + we introduce \textbf{EchoAtt}, a novel framework aimed at optimizing transformer-based + models by analyzing and leveraging the similarity of attention patterns across layers. + Our analysis reveals that many inner layers in LLMs, especially larger ones, exhibit + highly similar attention matrices. By exploiting this similarity, \textbf{EchoAtt} + enables the sharing of attention matrices in less critical layers, significantly + reducing computational requirements without compromising performance. We incorporate + this approach within a knowledge distillation setup, where a pre-trained teacher + model guides the training of a smaller student model. The student model selectively + shares attention matrices in layers with high similarity while inheriting key parameters + from the teacher. Our best results with TinyLLaMA-1.1B demonstrate that \textbf{EchoAtt} + improves inference speed by 15%, training speed by 25%, and reduces the number of + parameters by approximately 4%, all while improving zero-shot performance. These + findings highlight the potential of attention matrix sharing to enhance the efficiency + of LLMs, making them more practical for real-time and resource-limited applications. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: rajabzadeh24a +month: 0 +tex_title: "{EchoAtt}: Attend, Copy, then Adjust for More Efficient Large Language + Models" +firstpage: 259 +lastpage: 269 +page: 259-269 +order: 259 +cycles: false +bibtex_author: Rajabzadeh, Hossein and Jafari, Aref and Sharma, Aman and Jami, Benyamin + and Ju Hj Kwon, Hyock and Ghodsi, Ali and Chen, Boxing and Rezagholizadeh, Mehdi +author: +- given: Hossein + family: Rajabzadeh +- given: Aref + family: Jafari +- given: Aman + family: Sharma +- given: Benyamin + family: Jami +- given: Hyock + family: Ju Hj Kwon +- given: Ali + family: Ghodsi +- given: Boxing + family: Chen +- given: Mehdi + family: Rezagholizadeh +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/rajabzadeh24a/rajabzadeh24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-rajput24a.md b/_posts/2024-12-10-rajput24a.md new file mode 100644 index 0000000..b420144 --- /dev/null +++ b/_posts/2024-12-10-rajput24a.md @@ -0,0 +1,53 @@ +--- +title: Inference-Friendly Models With MixAttention +section: Inference +abstract: The size of the key-value (KV) cache plays a critical role in determining + both the maximum context length and the number of concurrent requests supported + during inference in modern language models. The KV cache size grows proportionally + with the number of attention heads and the tokens processed, leading to increased + memory consumption and slower inference for long inputs. In this work, we explore + the use of MixAttention, a model architecture modification closely related to a + blog published by Character.AI. MixAttention combines sliding window attention, + where only a small subset of recent tokens is stored in the KV cache, with KV cache + sharing across layers. Our experiments demonstrate that MixAttention significantly + reduces memory usage and improves inference speed without sacrificing model performance + in both short and long-context tasks. We also explore various configurations of + this architecture, identifying those that maintain quality across evaluation metrics + while optimizing resource efficiency. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: rajput24a +month: 0 +tex_title: Inference-Friendly Models With {MixAttention} +firstpage: 370 +lastpage: 381 +page: 370-381 +order: 370 +cycles: false +bibtex_author: Rajput, Shashank and Sheng, Ying and Owen, Sean and Chiley, Vitaliy +author: +- given: Shashank + family: Rajput +- given: Ying + family: Sheng +- given: Sean + family: Owen +- given: Vitaliy + family: Chiley +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/rajput24a/rajput24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-saheb-pasand24a.md b/_posts/2024-12-10-saheb-pasand24a.md new file mode 100644 index 0000000..c4e727a --- /dev/null +++ b/_posts/2024-12-10-saheb-pasand24a.md @@ -0,0 +1,50 @@ +--- +title: 'RGP: Achieving Memory-Efficient Model Fine-tuning Via Randomized Gradient + Projection' +section: Training +abstract: Training and fine-tuning Large Language Models (LLMs) require significant + memory due to the substantial growth in the size of weight parameters and optimizer + states. While methods like low-rank adaptation (LoRA), which introduce low-rank + trainable modules in parallel to frozen pre-trained weights, effectively reduce + memory usage, they often fail to preserve the optimization trajectory and are generally + less effective for pre-training models. On the other hand, approaches, such as GaLore, + that project gradients onto lower-dimensional spaces maintain the training trajectory + and perform well in pre-training but suffer from high computational complexity, + as they require repeated singular value decomposition on large matrices. In this + work, we propose Randomized Gradient Projection (RGP), which outperforms GaLore, + the current state-of-the-art in efficient fine-tuning, on the GLUE task suite, while + being 74% faster on average and requiring similar memory. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: saheb-pasand24a +month: 0 +tex_title: "{RGP}: Achieving Memory-Efficient Model Fine-tuning Via Randomized Gradient + Projection" +firstpage: 47 +lastpage: 54 +page: 47-54 +order: 47 +cycles: false +bibtex_author: Saheb Pasand, Ali and Bashivan, Pouya +author: +- given: Ali + family: Saheb Pasand +- given: Pouya + family: Bashivan +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/saheb-pasand24a/saheb-pasand24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-samragh24a.md b/_posts/2024-12-10-samragh24a.md new file mode 100644 index 0000000..0ebef5c --- /dev/null +++ b/_posts/2024-12-10-samragh24a.md @@ -0,0 +1,67 @@ +--- +title: 'Scaling Smart: Accelerating Large Language Model Pre-Training with Small Model + Initialization' +section: Training +abstract: 'The pre-training phase of language models often begins with randomly initialized + parameters. With the current trends in scaling models, training their large number + of parameters can be extremely slow and costly. In contrast, small language models + are less expensive to train, but they often cannot achieve the accuracy of large + models. In this paper, we explore an intriguing idea to connect these two different + regimes: Can we develop a method to initialize large language models using smaller + pre-trained models? Will such initialization bring any benefits in terms of training + time and final accuracy? In this paper, we introduce HyperCloning, a method that + can expand the parameters of a pre-trained language model to those of a larger model + with increased hidden dimensions. Our method ensures that the larger model retains + the functionality of the smaller model. As a result, the larger model already inherits + the predictive power and accuracy of the smaller model before the training starts. + We demonstrate that training such an initialized model results in significant savings + in terms of GPU hours required for pre-training large language models. Implementation + of HyperCloning is available at https://github.com/apple/ml-hypercloning/tree/main.' 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: samragh24a +month: 0 +tex_title: 'Scaling Smart: Accelerating Large Language Model Pre-Training with Small + Model Initialization' +firstpage: 1 +lastpage: 13 +page: 1-13 +order: 1 +cycles: false +bibtex_author: Samragh, Mohammad and Mirzadeh, Seyed Iman and Alizadeh-Vahid, Keivan + and Faghri, Fartash and Cho, Minsik and Nabi, Moin and Naik, Devang and Farajtabar, + Mehrdad +author: +- given: Mohammad + family: Samragh +- given: Seyed Iman + family: Mirzadeh +- given: Keivan + family: Alizadeh-Vahid +- given: Fartash + family: Faghri +- given: Minsik + family: Cho +- given: Moin + family: Nabi +- given: Devang + family: Naik +- given: Mehrdad + family: Farajtabar +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/samragh24a/samragh24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-sarkar24a.md b/_posts/2024-12-10-sarkar24a.md new file mode 100644 index 0000000..bc1efbb --- /dev/null +++ b/_posts/2024-12-10-sarkar24a.md @@ -0,0 +1,65 @@ +--- +title: Revisiting SMoE Language Models by Evaluating Inefficiencies with Task Specific + Expert Pruning +section: Model Design \& Architecture +abstract: 'Sparse Mixture of Expert (SMoE) models have emerged as a scalable alternative + to dense models in language modeling. These models use conditionally activated feedforward + subnetworks in transformer blocks, allowing for a separation between total model + parameters and per-example computation. However, large token-routed SMoE models + face a significant challenge: during inference, the entire model must be used for + a sequence or a batch, resulting in high latencies in a distributed setting that + offsets the advantages of per-token sparse activation. Our research explores task-specific + model pruning to inform decisions about designing SMoE architectures, mainly modulating + the choice of expert counts in pretraining. We investigate whether such pruned models + offer advantages over smaller SMoE models trained from scratch, when evaluating + and comparing them individually on tasks. To that end, we introduce an adaptive + task-aware pruning technique {\tt UNCURL} to reduce the number of experts per MoE + layer in an offline manner post-training. Our findings reveal a threshold pruning + factor for the reduction that depends on the number of experts used in pretraining, + above which, the reduction starts to degrade model performance. These insights contribute + to our understanding of model design choices when pretraining with SMoE architectures, + particularly useful when considering task-specific inference optimization for later + stages.' 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: sarkar24a +month: 0 +tex_title: Revisiting {SMoE} Language Models by Evaluating Inefficiencies with Task + Specific Expert Pruning +firstpage: 165 +lastpage: 181 +page: 165-181 +order: 165 +cycles: false +bibtex_author: Sarkar, Soumajyoti and Lausen, Leonard and Cevher, Volkan and Brox, + Thomas and Zha, Sheng and Karypis, George +author: +- given: Soumajyoti + family: Sarkar +- given: Leonard + family: Lausen +- given: Volkan + family: Cevher +- given: Thomas + family: Brox +- given: Sheng + family: Zha +- given: George + family: Karypis +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/sarkar24a/sarkar24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-sarwar24a.md b/_posts/2024-12-10-sarwar24a.md new file mode 100644 index 0000000..991e3f3 --- /dev/null +++ b/_posts/2024-12-10-sarwar24a.md @@ -0,0 +1,62 @@ +--- +title: 'StructMoE: Structured Mixture of Experts Using Low Rank Experts' +section: Model Design \& Architecture +abstract: We introduce StructMoE, a method to scale MoE architectures by augmenting + experts with dynamic capacity using structured matrices we call Low Rank Experts + (LoRE). These LoREs are selected on a per-expert and per-token basis using a secondary + router specific to every expert and are entangled with the main expert in the up-projection + phase of the expert before the activation function. Empirically, we find this approach + to outperform an MoE baseline in terms of loss on a held out validation set. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: sarwar24a +month: 0 +tex_title: "{StructMoE}: Structured Mixture of Experts Using Low Rank Experts" +firstpage: 182 +lastpage: 193 +page: 182-193 +order: 182 +cycles: false +bibtex_author: Sarwar, Zain and Panda, Ashwinee and Th\'erien, Benjamin and Rawls, + Stephen and Das, Anirban and Balasubramaniam, Kartik and Kapusuzoglu, Berkcan and + Zhang, Shixiong and Sahu, Sambit and Naphade, Milind and Chakraborty, Supriyo +author: +- given: Zain + family: Sarwar +- given: Ashwinee + family: Panda +- given: Benjamin + family: Thérien +- given: Stephen + family: Rawls +- given: Anirban + family: Das +- given: Kartik + family: Balasubramaniam +- given: Berkcan + family: Kapusuzoglu +- given: Shixiong + family: Zhang +- given: Sambit + family: Sahu +- given: Milind + family: Naphade +- given: Supriyo + family: Chakraborty +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/sarwar24a/sarwar24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-seng-chua24a.md b/_posts/2024-12-10-seng-chua24a.md new file mode 100644 index 0000000..74550a9 --- /dev/null +++ b/_posts/2024-12-10-seng-chua24a.md @@ -0,0 +1,50 @@ +--- +title: Post-Training Statistical Calibration for Higher Activation Sparsity +section: Model Efficiency \& Compression +abstract: We present Statistical Calibrated Activation Pruning (SCAP), a post-training + activation pruning framework that (1) generalizes sparsification by input activations + of Fully-Connected layers for generic and flexible application across Transformers, + and (2) features a simple Mode-Centering technique to pre-calibrate activation distributions + for maximizing post-training sparsity. Our results demonstrate robust Pareto efficiency + compared to prior methods, translating to a 1.5× additional LLM decoding speedup + against CATS[12] at iso model quality. SCAP effectiveness is empirically verified + across a wide range of models, including recent Transformer Decoders, MoE, Mamba2, + Encoding Transformer, and pre-quantized models, highlighting its practicality and + scalability. The code is available at https://github.com/IntelLabs/SCAP. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: seng-chua24a +month: 0 +tex_title: Post-Training Statistical Calibration for Higher Activation Sparsity +firstpage: 206 +lastpage: 221 +page: 206-221 +order: 206 +cycles: false +bibtex_author: Seng Chua, Vui and Pan, Yujie and Jain, Nilesh +author: +- given: Vui + family: Seng Chua +- given: Yujie + family: Pan +- given: Nilesh + family: Jain +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/seng-chua24a/seng-chua24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-sharify24a.md b/_posts/2024-12-10-sharify24a.md new file mode 100644 index 0000000..9512d1a --- /dev/null +++ b/_posts/2024-12-10-sharify24a.md @@ -0,0 +1,56 @@ +--- +title: Post Training Quantization of Large Language Models with Microscaling Formats +section: Model Efficiency \& Compression +abstract: 'Large Language Models (LLMs) have distinguished themselves with outstanding + performance in complex language modeling tasks, yet they come with significant computational + and storage challenges. This paper explores the potential of quantization to mitigate + these challenges. We systematically study the combined application of three well-known + post-training techniques, SmoothQuant, AWQ, and GPTQ, and provide a comprehensive + analysis of their interactions and implications for advancing LLM quantization. + We enhance the versatility of these methods by enabling quantization to microscaling + (MX) formats, extending the applicability of these PTQ algorithms beyond their original + fixed-point format targets. We show that combining different PTQ methods enables + us to quantize models to 4-bit weights and 8-bit activations using the MXINT format + with negligible accuracy loss compared to the uncompressed baseline. 
' +layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: sharify24a +month: 0 +tex_title: Post Training Quantization of Large Language Models with Microscaling Formats +firstpage: 241 +lastpage: 258 +page: 241-258 +order: 241 +cycles: false +bibtex_author: Sharify, Sayeh and Saxena, Utkarsh and Xu, Zifei and Yazar, Wanzin + and Soloveychik, Ilya and Wang, Xin +author: +- given: Sayeh + family: Sharify +- given: Utkarsh + family: Saxena +- given: Zifei + family: Xu +- given: Wanzin + family: Yazar +- given: Ilya + family: Soloveychik +- given: Xin + family: Wang +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/sharify24a/sharify24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-sharma24a.md b/_posts/2024-12-10-sharma24a.md new file mode 100644 index 0000000..353b008 --- /dev/null +++ b/_posts/2024-12-10-sharma24a.md @@ -0,0 +1,49 @@ +--- +title: Efficiently Dispatching Flash Attention For Partially Filled Attention Masks +section: Inference +abstract: 'Transformers are widely used across various applications, many of which + yield sparse or partially filled attention matrices. Examples include attention + masks designed to reduce the quadratic complexity of attention, sequence packing + techniques, and recent innovations like tree masking for fast validation in MEDUSA. + Despite the inherent sparsity in these matrices, the state-of-the-art algorithm + Flash Attention still processes them with quadratic complexity as though they were + dense. In this paper, we introduce \textbf{Binary Block Masking}, a highly efficient + modification that enhances Flash Attention by making it mask-aware. We further propose + two optimizations: one tailored for masks with contiguous non-zero patterns and + another for extremely sparse masks. Our experiments on attention masks derived from + real-world scenarios demonstrate up to a 9x runtime improvement. The implementation + will be publicly released to foster further research and application.' +layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: sharma24a +month: 0 +tex_title: Efficiently Dispatching Flash Attention For Partially Filled Attention + Masks +firstpage: 423 +lastpage: 442 +page: 423-442 +order: 423 +cycles: false +bibtex_author: Sharma, Agniv and A. Geiping, Jonas +author: +- given: Agniv + family: Sharma +- given: Jonas + family: A. 
Geiping +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/sharma24a/sharma24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-shinde24a.md b/_posts/2024-12-10-shinde24a.md new file mode 100644 index 0000000..4223df8 --- /dev/null +++ b/_posts/2024-12-10-shinde24a.md @@ -0,0 +1,60 @@ +--- +title: Lightweight Neural Networks for Speech Emotion Recognition using Layer-wise + Adaptive Quantization +section: Applications +abstract: Speech Emotion Recognition (SER) systems are essential in advancing human-machine + interaction. While deep learning models have shown substantial success in SER by + eliminating the need for handcrafted features, their high computational and memory + requirements, alongside intensive hyper-parameter optimization, limit their deployment + on resource-constrained edge devices. To address these challenges, we introduce + an optimized and computationally efficient Multilayer Perceptron (MLP)-based classifier + within a custom SER framework. We further propose a novel, layer-wise adaptive quantization + scheme that compresses the model by adjusting bit-width precision according to layer + importance. This layer importance is calculated based on statistical measures such + as parameter proportion, entropy, and weight variance within each layer. Our approach + achieves an optimal balance between model size reduction and performance retention, + ensuring that the quantized model maintains accuracy within acceptable limits. Traditional + fixed-precision methods, while computationally simple, are less effective at reducing + model size without compromising performance. In contrast, our scheme provides a + more interpretable and computationally efficient solution. We evaluate the proposed + model on standard SER datasets using features such as Mel-Frequency Cepstral Coefficients + (MFCC), Chroma, and Mel-spectrogram. Experimental results demonstrate that our adaptive + quantization method achieves performance competitive with state-of-the-art models + while significantly reducing model size, making it highly suitable for deployment + on edge devices. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: shinde24a +month: 0 +tex_title: Lightweight Neural Networks for Speech Emotion Recognition using Layer-wise + Adaptive Quantization +firstpage: 584 +lastpage: 595 +page: 584-595 +order: 584 +cycles: false +bibtex_author: Shinde, Tushar and Jain, Ritika and Kumar Sharma, Avinash +author: +- given: Tushar + family: Shinde +- given: Ritika + family: Jain +- given: Avinash + family: Kumar Sharma +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/shinde24a/shinde24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-shiraee-kasmaee24a.md b/_posts/2024-12-10-shiraee-kasmaee24a.md new file mode 100644 index 0000000..4d60240 --- /dev/null +++ b/_posts/2024-12-10-shiraee-kasmaee24a.md @@ -0,0 +1,66 @@ +--- +title: 'ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models + Performance & Efficiency on a Specific Domain' +section: Benchmark \& Evaluation +abstract: Recent advancements in language models have started a new era of superior + information retrieval and content generation, with embedding models playing an important + role in optimizing data representation efficiency and performance. While benchmarks + like the Massive Text Embedding Benchmark (MTEB) have standardized the evaluation + of general domain embedding models, a gap remains in specialized fields such as + chemistry, which require tailored approaches due to domain-specific challenges. + This paper introduces a novel benchmark, the Chemical Text Embedding Benchmark (ChemTEB), + designed specifically for the chemical sciences. ChemTEB addresses the unique linguistic + and semantic complexities of chemical literature and data, offering a comprehensive + suite of tasks on chemical domain data. Through the evaluation of 34 open-source + and proprietary models using this benchmark, we illuminate the strengths and weaknesses + of current methodologies in processing and understanding chemical information. Our + work aims to equip the research community with a standardized, domain-specific evaluation + framework, promoting the development of more precise and efficient NLP models for + chemistry-related applications. Furthermore, it provides insights into the performance + of generic models in a domain-specific context. ChemTEB comes with open-source code + and data, contributing further to its accessibility and utility. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: shiraee-kasmaee24a +month: 0 +tex_title: "{ChemTEB}: Chemical Text Embedding Benchmark, an Overview of Embedding + Models Performance & Efficiency on a Specific Domain" +firstpage: 512 +lastpage: 531 +page: 512-531 +order: 512 +cycles: false +bibtex_author: Shiraee Kasmaee, Ali and Khodadad, Mohammad and Arshi Saloot, Mohammad + and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila +author: +- given: Ali + family: Shiraee Kasmaee +- given: Mohammad + family: Khodadad +- given: Mohammad + family: Arshi Saloot +- given: Nick + family: Sherck +- given: Stephen + family: Dokas +- given: Hamidreza + family: Mahyar +- given: Soheila + family: Samiee +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/shiraee-kasmaee24a/shiraee-kasmaee24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-stewart24a.md b/_posts/2024-12-10-stewart24a.md new file mode 100644 index 0000000..ec7b99b --- /dev/null +++ b/_posts/2024-12-10-stewart24a.md @@ -0,0 +1,54 @@ +--- +title: 'The N-Grammys: Accelerating Autoregressive Inference with Learning-Free Batched + Speculation' +section: Inference +abstract: Speculative decoding aims to speed up autoregressive generation of a language + model by verifying in parallel the tokens generated by a smaller draft model. In + this work, we explore the effectiveness of learning-free, negligible-cost draft + strategies, namely $N$-grams obtained from the model weights and the context. While + the predicted next token of the base model is rarely the top prediction of these + simple strategies, we observe that it is often within their top-$k$ predictions + for small $k$. Based on this, we show that combinations of simple strategies can + achieve significant inference speedups over different tasks. The overall performance + is comparable to more complex methods, yet does not require expensive preprocessing + or modification of the base model, and allows for seamless ‘plug-and-play’ integration + into pipelines. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: stewart24a +month: 0 +tex_title: 'The {N-Grammys}: Accelerating Autoregressive Inference with Learning-Free + Batched Speculation' +firstpage: 322 +lastpage: 335 +page: 322-335 +order: 322 +cycles: false +bibtex_author: Stewart, Lawrence and Trager, Matthew and Gonugondla, Sujan and Soatto, + Stefano +author: +- given: Lawrence + family: Stewart +- given: Matthew + family: Trager +- given: Sujan + family: Gonugondla +- given: Stefano + family: Soatto +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/stewart24a/stewart24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-timor24a.md b/_posts/2024-12-10-timor24a.md new file mode 100644 index 0000000..8761bb1 --- /dev/null +++ b/_posts/2024-12-10-timor24a.md @@ -0,0 +1,70 @@ +--- +title: Distributed Speculative Inference of Large Language Models is Provably Faster +section: Inference +abstract: 'Accelerating the inference of large language models (LLMs) is an important + challenge in artificial intelligence. This paper introduces Distributed Speculative + Inference (DSI), a novel distributed inference algorithm that is provably faster + than speculative inference (SI) [leviathan2023fast, chen2023accelerating, miao2023specinfer] + and traditional autoregressive inference (non-SI). Like other SI algorithms, DSI + works on frozen LLMs, requiring no training or architectural modifications, and + it preserves the target distribution. Prior studies on SI have demonstrated empirical + speedups (compared to non-SI) but require fast and accurate drafters, which are + often unavailable in practice. We identify a gap where SI can be slower than non-SI + given slower or less accurate drafters. We close this gap by proving that DSI is + faster than both SI and non-SI—given any drafters. DSI introduces a novel type of + task parallelism called Speculation Parallelism (SP), which orchestrates target + and drafter instances to overlap in time, creating a new foundational tradeoff between + computational resources and latency. DSI is not only faster than SI but also supports + LLMs that cannot be accelerated with SI. Our simulations show speedups of off-the-shelf + LLMs in realistic single-node settings where DSI is 1.29-1.92x faster than SI. 
Our + code is open-sourced: github.com/keyboardAnt/distributed-speculative-inference' +layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: timor24a +month: 0 +tex_title: Distributed Speculative Inference of Large Language Models is Provably + Faster +firstpage: 336 +lastpage: 354 +page: 336-354 +order: 336 +cycles: false +bibtex_author: Timor, Nadav and Mamou, Jonathan and Pereg, Oren and Berchansky, Moshe + and Korat, Daniel and Wasserblat, Moshe and Galanti, Tomer and Gordon, Michal and + Harel, David +author: +- given: Nadav + family: Timor +- given: Jonathan + family: Mamou +- given: Oren + family: Pereg +- given: Moshe + family: Berchansky +- given: Daniel + family: Korat +- given: Moshe + family: Wasserblat +- given: Tomer + family: Galanti +- given: Michal + family: Gordon +- given: David + family: Harel +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/timor24a/timor24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-wang24a.md b/_posts/2024-12-10-wang24a.md new file mode 100644 index 0000000..33f0d9a --- /dev/null +++ b/_posts/2024-12-10-wang24a.md @@ -0,0 +1,69 @@ +--- +title: 'CSKV: Training-Efficient Channel Shrinking for KV Cache in Long-Context Scenarios' +section: Inference +abstract: 'Large Language Models (LLMs) have been widely adopted to process long-context + tasks. However, the large memory overhead of the key-value (KV) cache poses significant + challenges in long-context scenarios. Existing training-free KV cache compression + methods typically focus on quantization and token pruning, which have compression + limits, and excessive sparsity can lead to severe performance degradation. Other + methods design new architectures with less KV overhead but require significant training + overhead. To address the above two drawbacks, we further explore the redundancy + in the channel dimension and apply an architecture-level design with minor training + costs. Therefore, we introduce CSKV, a training-efficient Channel Shrinking technique + for KV cache compression: (1) We first analyze the singular value distribution of + the KV cache, revealing significant redundancy and compression potential along the + channel dimension. Based on this observation, we propose using low-rank decomposition + for key and value layers and storing the low-dimension features. (2) To preserve + model performance, we introduce a bi-branch KV cache, including a window-based full-precision + KV cache and a low-precision compressed KV cache. (3) To reduce the training costs, + we minimize the layer-wise reconstruction loss for the compressed KV cache instead + of retraining the entire LLMs. Extensive experiments show that CSKV can reduce the + memory overhead of the KV cache by 80% while maintaining the model’s long-context + capability. Moreover, we show that our method can be seamlessly combined with quantization + to further reduce the memory overhead, achieving a compression ratio of up to 95%. + Code is available at https://github.com/wln20/CSKV.' 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: wang24a +month: 0 +tex_title: "{CSKV}: Training-Efficient Channel Shrinking for {KV} Cache in Long-Context + Scenarios" +firstpage: 468 +lastpage: 484 +page: 468-484 +order: 468 +cycles: false +bibtex_author: Wang, Luning and Li, Shiyao and Ning, Xuefei and Yuan, Zhihang and + Yan, Shengen and Dai, Guohao and Wang, Yu +author: +- given: Luning + family: Wang +- given: Shiyao + family: Li +- given: Xuefei + family: Ning +- given: Zhihang + family: Yuan +- given: Shengen + family: Yan +- given: Guohao + family: Dai +- given: Yu + family: Wang +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/wang24a/wang24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-wu24a.md b/_posts/2024-12-10-wu24a.md new file mode 100644 index 0000000..cbb1895 --- /dev/null +++ b/_posts/2024-12-10-wu24a.md @@ -0,0 +1,64 @@ +--- +title: 'Snakes and Ladders: Accelerating SSM Inference with Speculative Decoding' +section: Inference +abstract: 'Speculative decoding is a method for accelerating inference in large language + models (LLMs) by predicting multiple tokens using a smaller ‘draft model’ and validating + them against the larger ‘base model.’ If a draft token is inconsistent with what + the base model would have generated, speculative decoding ‘backtracks’ to the last + consistent token before resuming generation. This is straightforward in autoregressive + Transformer architectures since their state is a sliding window of past tokens. + However, their baseline inference complexity is quadratic in the number of input + tokens. State Space Models (SSMs) have linear inference complexity, but they maintain + a separate Markov state that makes backtracking non-trivial. We propose two methods + to perform speculative decoding in SSMs: “Joint Attainment and Advancement” and + “Activation Replay.” Both methods utilize idle computational resources to speculate + and verify multiple tokens, allowing us to produce 6 tokens for 1.47$\times$ the + cost of one, corresponding to an average 1.82$\times$ wall-clock speed-up on three + different benchmarks using a simple $n$-gram for drafting. Furthermore, as model + size increases, relative overhead of speculation and verification decreases: Scaling + from 1.3B parameters to 13B reduces relative overhead from 1.98$\times$ to 1.22$\times$. + Unlike Transformers, speculative decoding in SSMs can be easily applied to batches + of sequences, allowing dynamic allocation of resources to fill gaps in compute utilization + and thereby improving efficiency and throughput with variable inference traffic.' 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: wu24a +month: 0 +tex_title: 'Snakes and Ladders: Accelerating {SSM} Inference with Speculative Decoding' +firstpage: 292 +lastpage: 304 +page: 292-304 +order: 292 +cycles: false +bibtex_author: Wu, Yangchao and Dukler, Yonatan and Trager, Matthew and Achille, Alessandro + and Xia, Wei and Soatto, Stefano +author: +- given: Yangchao + family: Wu +- given: Yonatan + family: Dukler +- given: Matthew + family: Trager +- given: Alessandro + family: Achille +- given: Wei + family: Xia +- given: Stefano + family: Soatto +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/wu24a/wu24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-xu24a.md b/_posts/2024-12-10-xu24a.md new file mode 100644 index 0000000..95c765c --- /dev/null +++ b/_posts/2024-12-10-xu24a.md @@ -0,0 +1,55 @@ +--- +title: Scaling laws for post-training quantized large language models +section: Model Efficiency \& Compression +abstract: 'Generalization abilities of well-trained large language models (LLMs) are + known to scale predictably as a function of model size. In contrast to the existence + of practical scaling laws governing pre-training, the quality of LLMs after post-training + compression remains highly unpredictable, often requiring case-by-case validation + in practice. In this work, we attempted to close this gap for post-training weight + quantization of LLMs by conducting a systematic empirical study on multiple LLM + families quantized to numerous low-precision tensor data types using popular weight + quantization techniques. We identified key scaling factors pertaining to characteristics + of the local loss landscape, based on which the performance of quantized LLMs can + be reasonably well predicted by a statistical model. 
' +layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: xu24a +month: 0 +tex_title: Scaling laws for post-training quantized large language models +firstpage: 270 +lastpage: 285 +page: 270-285 +order: 270 +cycles: false +bibtex_author: Xu, Zifei and Y Lan, Alexander and Yazar, Wanzin and Webb, Tristan + and Sharify, Sayeh and Wang, Xin +author: +- given: Zifei + family: Xu +- given: Alexander + family: Y Lan +- given: Wanzin + family: Yazar +- given: Tristan + family: Webb +- given: Sayeh + family: Sharify +- given: Xin + family: Wang +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/xu24a/xu24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-yang24a.md b/_posts/2024-12-10-yang24a.md new file mode 100644 index 0000000..049f5fa --- /dev/null +++ b/_posts/2024-12-10-yang24a.md @@ -0,0 +1,53 @@ +--- +title: Partially Shared Query-Key for Lightweight Language Models +section: Model Efficiency \& Compression +abstract: Lightweight language models, such as TinyBERT 14.5M, have emerged as a critical + area of research because of their implementation on resource-constrained hardware. + These transformer models include significantly smaller parameter size, reduced memory + and computational requirements. These features make such models highly suitable + for deployment on small devices. We explore the concept of parameter sharing between + the key and query weight matrices of a transformer model. The full query-key sharing + which has already been proposed in the literature introduces a fully-quadratic attention + matrix, oversimplifies directional dependencies and degrades pre-training loss. + In contrast, partial parameter sharing balances complexity reduction and performance + retention. Partial parameter sharing effectively addresses over-fitting while maintaining + strong performance even with a high degree of shared parameters up to 95%. This + provides a promising strategy for enhancing language models, specifically targeting + small models. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: yang24a +month: 0 +tex_title: Partially Shared Query-Key for Lightweight Language Models +firstpage: 286 +lastpage: 291 +page: 286-291 +order: 286 +cycles: false +bibtex_author: Yang, Kai and Partovi Nia, Vahid and Chen, Boxing and Asgharian, Masoud +author: +- given: Kai + family: Yang +- given: Vahid + family: Partovi Nia +- given: Boxing + family: Chen +- given: Masoud + family: Asgharian +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/yang24a/yang24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/_posts/2024-12-10-zayats24a.md b/_posts/2024-12-10-zayats24a.md new file mode 100644 index 0000000..bdcdcc5 --- /dev/null +++ b/_posts/2024-12-10-zayats24a.md @@ -0,0 +1,58 @@ +--- +title: 'Zipper: A Multi-Tower Decoder Architecture for Fusing Modalities' +section: Model Design \& Architecture +abstract: Integrating multiple generative foundation models, especially those trained + on different modalities, into something greater than the sum of its parts poses + significant challenges. Two key hurdles are the availability of aligned data (concepts + that contain similar meaning but is expressed differently in different modalities), + and effectively leveraging unimodal representations in cross-domain generative tasks, + without compromising their original unimodal capabilities. We propose Zipper, a + multi-tower decoder architecture that addresses these concerns by using cross-attention + to flexibly compose multimodal generative models from independently pre-trained + unimodal decoders. In our experiments fusing speech and text modalities, we show + the proposed architecture performs very competitively in scenarios with limited + aligned text-speech data. We also showcase the flexibility of our model to selectively + maintain unimodal (e.g., text-to-text generation) generation performance by freezing + the corresponding modal tower (e.g. text). In cross-modal tasks such as automatic + speech recognition (ASR) where the output modality is text, we show that freezing + the text backbone results in negligible performance degradation. In cross-modal + tasks such as text-to-speech generation (TTS) where the output modality is speech, + we show that using a pre-trained speech backbone results in superior performance + to the baseline. 
+layout: inproceedings +series: Proceedings of Machine Learning Research +publisher: PMLR +issn: 2640-3498 +id: zayats24a +month: 0 +tex_title: 'Zipper: A Multi-Tower Decoder Architecture for Fusing Modalities' +firstpage: 123 +lastpage: 135 +page: 123-135 +order: 123 +cycles: false +bibtex_author: Zayats, Vicky and Chen, Peter and Ferrari, Melissa and Padfield, Dirk +author: +- given: Vicky + family: Zayats +- given: Peter + family: Chen +- given: Melissa + family: Ferrari +- given: Dirk + family: Padfield +date: 2024-12-10 +address: +container-title: Proceedings of The 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop +volume: '262' +genre: inproceedings +issued: + date-parts: + - 2024 + - 12 + - 10 +pdf: https://raw.githubusercontent.com/mlresearch/v262/main/assets/zayats24a/zayats24a.pdf +extras: [] +# Format based on Martin Fenner's citeproc: https://blog.front-matter.io/posts/citeproc-yaml-for-bibliographies/ +--- diff --git a/enlsp24.bib b/enlsp24.bib new file mode 100644 index 0000000..465ed12 --- /dev/null +++ b/enlsp24.bib @@ -0,0 +1,390 @@ +@Proceedings{ENLSP-2024, + booktitle = {Proceedings of The 4th NeurIPS Efficient Natural Language and Speech Processing Workshop}, + name = {NeurIPS Efficient Natural Language and Speech Processing Workshop}, + shortname = {ENLSP-IV 2024}, + sections = {Training|Model Design \& Architecture|Model Efficiency \& Compression|Inference| Benchmark \& Evaluation|Applications }, + editor = {Rezagholizadeh, Mehdi and Passban, Peyman and Samiee, Soheila and Partovi Nia, Vahid and Cheng, Yu and Deng, Yue and Liu, Qun and Chen, Boxing}, + volume = {262}, + year = {2024}, + start = {2024-12-14}, + end = {2024-12-14}, + published = {2024-12-10}, + conference_url = {https://neurips2024-enlsp.github.io/}, + address = {Vancouver, British Columbia, Canada} +} + +% Training +@InProceedings{samragh2024scaling, + title = {Scaling Smart: Accelerating Large Language Model Pre-Training with Small Model Initialization}, + section = {Training}, + author = {Samragh, Mohammad and Mirzadeh, Seyed Iman and Alizadeh-Vahid, Keivan and Faghri, Fartash and Cho, Minsik and Nabi, Moin and Naik, Devang and Farajtabar, Mehrdad}, + pages = {1-13}, + abstract = {The pre-training phase of language models often begins with randomly initialized parameters. With the current trends in scaling models, training their large number of parameters can be extremely slow and costly. In contrast, small language models are less expensive to train, but they often cannot achieve the accuracy of large models. In this paper, we explore an intriguing idea to connect these two different regimes: Can we develop a method to initialize large language models using smaller pre-trained models? Will such initialization bring any benefits in terms of training time and final accuracy? In this paper, we introduce HyperCloning, a method that can expand the parameters of a pre-trained language model to those of a larger model with increased hidden dimensions. Our method ensures that the larger model retains the functionality of the smaller model. As a result, the larger model already inherits the predictive power and accuracy of the smaller model before the training starts. We demonstrate that training such an initialized model results in significant savings in terms of GPU hours required for pre-training large language models. 
Implementation of HyperCloning is available at https://github.com/apple/ml-hypercloning/tree/main.} +} + + +@InProceedings{ashkboos2024computational, + title = {Computational Bottlenecks of Training Small-scale Large Language Models}, + section = {Training}, + author = {Ashkboos, Saleh and Iman Mirzadeh, Seyed and Alizadeh-Vahid, Keivan and Hossein Sekhavat, Mohammad and Nabi, Moin and Farajtabar, Mehrdad and Faghri, Fartash}, + pages = {14-21}, + abstract = {While large language models (LLMs) dominate the AI landscape, Small-scale large Language Models (SLMs) are gaining attention due to cost and efficiency demands from consumers. However, there is limited research on the training behavior and computational requirements of SLMs. In this study, we explore the computational bottlenecks of training SLMs (up to 2B parameters) by examining the effects of various hyperparameters and configurations, including GPU type, batch size, model size, communication protocol, attention type, and the number of GPUs. We assess these factors on popular cloud services using metrics such as loss per dollar and tokens per second. Our findings aim to support the broader adoption and optimization of language model training for low-resource AI research institutes.} +} + +@InProceedings{lawton2024quailora, + title = {{QuAILoRA}: Quantization-Aware Initialization for {LoRA}}, + section = {Training}, + author = {G Lawton, Neal and Padmakumar, Aishwarya and Gaspers, Judith and FitzGerald, Jack and Kumar, Anoop and Ver Steeg, Greg and Galstyan, Aram}, + pages = {22-33}, + abstract = {QLoRA reduces the memory-cost of fine-tuning a large language model (LLM) with LoRA by quantizing the base LLM. However, quantization introduces quantization errors that negatively impact model performance after fine-tuning. In this paper we introduce QuAILoRA, a quantization-aware initialization for LoRA that mitigates this negative impact by decreasing quantization errors at initialization. Our method spends a small amount of computational overhead to compute this quantization-aware initialization, without increasing the memory-cost of fine-tuning. We evaluate our method on several causal language modeling and downstream evaluation tasks using several different model sizes and families. We observe that almost all LLMs fine-tuned with QuAILoRA achieve better validation perplexity. When evaluated on downstream tasks, we find that QuAILoRA yields improvements proportional to the negative effect of quantization error. On average, applying QuAILoRA to 4-bit QLoRA models yields 75\% of the validation perplexity decrease and 86\% of the downstream task accuracy increase as doubling the quantization precision to 8-bit, without increasing GPU memory utilization during fine-tuning.} +} + + +@InProceedings{javaheri2024superpos, + title = {{SuperPos-Prompt}: Enhancing Soft Prompt Tuning of Language Models with Superposition of Multi Token Embeddings}, + section = {Training}, + author = {Ali Sadraei Javaheri, Mohammad and Asgari, Ehsaneddin and C. McHardy, Alice and R. Rabiee, Hamid}, + pages = {34-46}, + abstract = {Soft prompt tuning techniques have recently gained traction as an effective strategy for the parameter-efficient tuning of pre-trained language models, particularly minimizing the required adjustment of model parameters. Despite their growing use, achieving optimal tuning with soft prompts, especially with smaller datasets, remains a substantial challenge.
This study makes two contributions in this domain: (i) we introduce SuperPos-Prompt, a new reparameterization technique employing the superposition of multiple pre-trained vocabulary embeddings to improve the learning of soft prompts. Our experiments across several GLUE and SuperGLUE benchmarks consistently highlight SuperPos-Prompt's superiority over Residual Prompt tuning, exhibiting an average score increase of +6.4 in T5-Small and +5.0 in T5-Base along with a faster convergence. Remarkably, SuperPos-Prompt occasionally outperforms even full fine-tuning methods. (ii) Additionally, we demonstrate enhanced performance and rapid convergence by omitting dropouts from the frozen network, yielding consistent improvements across various scenarios and tuning methods.} +} + + +@InProceedings{pasand2024rgp, + title = {{RGP}: Achieving Memory-Efficient Model Fine-tuning Via Randomized Gradient Projection}, + section = {Training}, + author = {Saheb Pasand, Ali and Bashivan, Pouya}, + pages = {47-54}, + abstract = {Training and fine-tuning Large Language Models (LLMs) require significant memory due to the substantial growth in the size of weight parameters and optimizer states. While methods like low-rank adaptation (LoRA), which introduce low-rank trainable modules in parallel to frozen pre-trained weights, effectively reduce memory usage, they often fail to preserve the optimization trajectory and are generally less effective for pre-training models. On the other hand, approaches, such as GaLore, that project gradients onto lower-dimensional spaces maintain the training trajectory and perform well in pre-training but suffer from high computational complexity, as they require repeated singular value decomposition on large matrices. In this work, we propose Randomized Gradient Projection (RGP), which outperforms GaLore, the current state-of-the-art in efficient fine-tuning, on the GLUE task suite, while being 74\% faster on average and requiring similar memory.} +} + +@InProceedings{khera2024efficient, + title = {Efficient Alignment of Large Language Models via Data Sampling}, + section = {Training}, + author = {Khera, Amrit and Ghosh, Rajat and Dutta, Debojyoti}, + pages = {55-72}, + abstract = {Despite the capabilities of Large Language Models (LLMs), the output is not always safe or desirable. Aligning the models to human values is a critical step for the safe adoption of these models. Aligning LLMs employ huge amounts of data, computation, and time. Moreover, curating data with human feedback is expensive and takes time. Recent research depicts the benefit of data engineering in the fine-tuning and pre-training paradigms to bring down such costs. However, alignment differs from the afore-mentioned paradigms and it is unclear if data efficient alignment is feasible. In this work, we first aim to understand how the performance of LLM alignment scales with data. We find out that LLM alignment performance follows an exponential plateau pattern which tapers off post a rapid initial increase. We identify data subsampling as a viable method to reduce resources required for alignment. Further, we propose a methodology for efficient alignment by identifying a small high quality subset thereby reducing the computation and time required by alignment. We evaluate the proposed methodology over multiple datasets and compare the results. 
We find that the model aligned using our proposed methodology outperforms other sampling methods and performs comparable to the model aligned with the full dataset while using a fraction of the resources.} +} + +@InProceedings{azimi2024kd-lora, + title = {{KD-LoRA}: A Hybrid Approach to Efficient Fine-Tuning with LoRA and Knowledge Distillation}, + section = {Training}, + author = {Azimi, Rambod and Rishav, Rishav and Teichmann, Marek and Ebrahimi Kahou, Samira}, + pages = {73-80}, + abstract = {Large language models (LLMs) have demonstrated remarkable performance across various downstream tasks. However, the high computational and memory requirements of LLMs are a major bottleneck. To address this, parameter-efficient fine-tuning (PEFT) methods such as low-rank adaptation (LoRA) have been proposed to reduce computational costs while ensuring minimal loss in performance. Additionally, knowledge distillation (KD) has been a popular choice for obtaining compact student models from teacher models. In this work, we present KD-LoRA, a novel fine-tuning method that combines LoRA with KD. Our results demonstrate that KD-LoRA achieves performance comparable to full fine-tuning (FFT) and LoRA while significantly reducing resource requirements. Specifically, KD-LoRA retains 98\% of LoRA’s performance on the GLUE benchmark, while being 40\% more compact. Additionally, KD-LoRA reduces GPU memory usage by 30\% compared to LoRA, while decreasing inference time by 30\% compared to both FFT and LoRA. We evaluate KD-LoRA across three encoder-only models: BERT, RoBERTa, and DeBERTaV3. Code is available at https://github.com/rambodazimi/KD-LoRA.} +} + +% Model Design & Architecture +@InProceedings{panda2024dense, + title = {Dense Backpropagation Improves Routing for Sparsely-Gated Mixture-of-Experts}, + section = {Model Design \& Architecture}, + author = {Panda, Ashwinee and Baherwani, Vatsal and Sarwar, Zain and Therien, Benjamin and Sahu, Sambit and Rawls, Stephen and Chakraborty, Supriyo and Goldstein, Tom}, + pages = {81-101}, + abstract = {Sparsely-gated Mixture-of-Experts (MoEs) such as Gemini have proven to be more efficient than dense Transformers because they can dynamically activate a subset of their overall parameters by \emph{routing} tokens to selected ``experts'', allowing practitioners to scale up model parameter counts without significantly increasing total compute. +However, current MoE training approaches only update the router with a sparse gradient and suffer from issues such as load imbalance. We propose a new router that can receive a dense gradient update from a sparse forward pass. Our method adds minimal overhead, but improves on the common Top-K routing in both performance and load balance.} +} + +@InProceedings{qiao2024vl-mamba, + title = {{VL-Mamba}: Exploring State Space Models for Multimodal Learning}, + section = {Model Design \& Architecture}, + author = {Qiao, Yanyuan and Yu, Zheng and Zhao, Zijia and Chen, Sihan and Sun, Mingzhen and Guo, Longteng and Wu, Qi and Liu, Jing}, + pages = {102-113}, + abstract = {Multimodal large language models (MLLMs) have gained considerable attention due to their ability to integrate visual and textual information, enhancing understanding and providing context for complex tasks. While Transformer-based architectures have been the dominant framework for MLLMs, recent studies suggest that state space models (SSMs) like Mamba can achieve competitive or even superior performance. 
However, no prior research has investigated the potential of SSMs to replace Transformers in multimodal tasks, which are inherently more challenging due to the heterogeneity of visual and language data and the complexities of aligning these modalities. In this paper, we introduce VL-Mamba, the first study to explore the application of state space models in multimodal learning tasks. VL-Mamba leverages a pretrained Mamba language model as its core, and we propose a novel MultiModal Connector (MMC) that incorporates a Vision Selective Scan (VSS) module to improve visual sequence modeling. We empirically explore how to effectively apply the 2D vision selective scan mechanism for multimodal learning and the combinations of different vision encoders and variants of pretrained Mamba language models. Our experiments across multiple multimodal benchmarks demonstrate that VL-Mamba achieves competitive performance against small MLLMs of similar size, and in some cases, surpasses larger models such as the 7B and 13B versions of LLaVA-1.5. These results suggest that state space models have the potential to serve as an alternative to Transformers in multimodal learning tasks.} +} +@InProceedings{liu2024misdmoe, + title = {{MisD-MoE}: A Multimodal Misinformation Detection Framework with Adaptive Feature Selection}, + section = {Model Design \& Architecture}, + author = {Liu, Moyang and Yan, Kaiying and Liu, Yukun and Fu, Ruibo and Wen, Zhengqi and Liu, Xuefei and Li, Chenxing}, + pages = {114-122}, + abstract = {The rapid growth of social media has led to the widespread dissemination of misinformation across multiple content forms, including text, images, audio, and video. Compared to unimodal misinformation detection, multimodal misinformation detection benefits from the increased availability of information across multiple modalities. However, these additional features may introduce redundancy, where overlapping or irrelevant information is included, potentially disrupting the feature space and consequently impairing the model's performance. To address the issue, we propose a novel framework, Misinformation Detection Mixture of Experts (MisD-MoE), which employs distinct expert models for each modality and incorporates an adaptive feature selection mechanism using top-k gating and Gumbel-Sigmoid. This approach dynamically filters relevant features, reducing redundancy and improving detection accuracy. Extensive experiments on the FakeSV and FVC-2018 datasets demonstrate that MisD-MoE significantly outperforms state-of-the-art methods, with accuracy improvements of 3.45\% and 3.71\% on the respective datasets compared to baseline models.} +} + +@InProceedings{zayats2024zipper, + title = {Zipper: A Multi-Tower Decoder Architecture for Fusing Modalities}, + section = {Model Design \& Architecture}, + author = {Zayats, Vicky and Chen, Peter and Ferrari, Melissa and Padfield, Dirk}, + pages = {123-135}, + abstract = {Integrating multiple generative foundation models, especially those trained on different modalities, into something greater than the sum of its parts poses significant challenges. Two key hurdles are the availability of aligned data (concepts that contain similar meaning but is expressed differently in different modalities), and effectively leveraging unimodal representations in cross-domain generative tasks, without compromising their original unimodal capabilities. 
+ +We propose Zipper, a multi-tower decoder architecture that addresses these concerns by using cross-attention to flexibly compose multimodal generative models from independently pre-trained unimodal decoders. In our experiments fusing speech and text modalities, we show the proposed architecture performs very competitively in scenarios with limited aligned text-speech data. We also showcase the flexibility of our model to selectively maintain unimodal (e.g., text-to-text generation) generation performance by freezing the corresponding modal tower (e.g. text). In cross-modal tasks such as automatic speech recognition (ASR) where the output modality is text, we show that freezing the text backbone results in negligible performance degradation. In cross-modal tasks such as text-to-speech generation (TTS) where the output modality is speech, we show that using a pre-trained speech backbone results in superior performance to the baseline.} +} + +@InProceedings{hajimolahoseini2024is, + title = {Is {3D} Convolution with {5D} Tensors Really Necessary for Video Analysis?}, + section = {Model Design \& Architecture}, + author = {Hajimolahoseini, Habib and Ahmed, Walid and Wen, Shuangyue and Liu, Yang}, + pages = {136-144}, + abstract = {In this paper, we present a comprehensive study and propose several novel techniques for implementing 3D convolutional blocks using 2D and/or 1D convolutions with only 4D and/or 3D tensors. Our motivation is that 3D convolutions with 5D tensors are computationally very expensive and they may not be supported by some of the edge devices used in real-time applications such as robots. The existing approaches mitigate this by splitting the 3D kernels into spatial and temporal domains, but they still use 3D convolutions with 5D tensors in their implementations. We resolve this issue by introducing some appropriate 4D/3D tensor reshaping as well as new combination techniques for spatial and temporal splits. The proposed implementation methods show significant improvement both in terms of efficiency and accuracy. The experimental results confirm that the proposed spatio-temporal processing +structure outperforms the original model in terms of speed and accuracy using only 4D tensors with fewer parameters.} +} +@InProceedings{chung2024beyond, + title = {Beyond Parameter Count: Implicit Bias in Soft Mixture of Experts}, + section = {Model Design \& Architecture}, + author = {Chung, Youngseog and Malik, Dhruv and Schneider, Jeff and Li, Yuanzhi and Singh, Aarti}, + pages = {145-164}, + abstract = {The traditional viewpoint on Sparse Mixture of Experts (MoE) models is that instead of training a single large expert, which is computationally expensive, we can train many small experts. The hope is that if the total parameter count of the small experts equals that of the singular large expert, then we retain the representation power of the large expert while gaining computational tractability and promoting expert specialization. The recently introduced Soft MoE replaces the Sparse MoE's discrete routing mechanism with a differentiable gating function that smoothly mixes tokens. While this smooth gating function successfully mitigates the various training instabilities associated with Sparse MoE, it is unclear whether it induces implicit biases that affect Soft MoE's representation power or potential for expert specialization. We prove that Soft MoE with a single arbitrarily powerful expert cannot represent simple convex functions. 
This justifies that Soft MoE's success cannot be explained by the traditional viewpoint of many small experts collectively mimicking the representation power of a single large expert, and that multiple experts are actually necessary to achieve good representation power (even for a fixed total parameter count). Continuing along this line of investigation, we introduce a notion of expert specialization for Soft MoE, and while varying the number of experts yet fixing the total parameter count, we consider the following (computationally intractable) task. Given any input, how can we discover the expert subset that is specialized to predict this input's label? We empirically show that when there are many small experts, the architecture is implicitly biased in a fashion that allows us to efficiently approximate the specialized expert subset. Our method can be easily implemented to potentially reduce computation during inference.} +} + +@InProceedings{sarkar2024revisiting, + title = {Revisiting {SMoE} Language Models by Evaluating Inefficiencies with Task Specific Expert Pruning}, + section = {Model Design \& Architecture}, + author = {Sarkar, Soumajyoti and Lausen, Leonard and Cevher, Volkan and Brox, Thomas and Zha, Sheng and Karypis, George}, + pages = {165-181}, + abstract = {Sparse Mixture of Expert (SMoE) models have emerged as a scalable alternative to dense models in language modeling. These models use conditionally activated feedforward subnetworks in transformer blocks, allowing for a separation between total model parameters and per-example computation. However, large token-routed SMoE models face a significant challenge: during inference, the entire model must be used for a sequence or a batch, resulting in high latencies in a distributed setting that offsets the advantages of per-token sparse activation. +Our research explores task-specific model pruning to inform decisions about designing SMoE architectures, mainly modulating the choice of expert counts in pretraining. We investigate whether such pruned models offer advantages over smaller SMoE models trained from scratch, when evaluating and comparing them individually on tasks. To that end, we introduce an adaptive task-aware pruning technique {\tt UNCURL} to reduce the number of experts per MoE layer in an offline manner post-training. +Our findings reveal a threshold pruning factor for the reduction that depends on the number of experts used in pretraining, above which, the reduction starts to degrade model performance. These insights contribute to our understanding of model design choices when pretraining with SMoE architectures, particularly useful when considering task-specific inference optimization for later stages.} +} + +@InProceedings{sarwar2024structmoe, + title = {{StructMoE}: Structured Mixture of Experts Using Low Rank Experts}, + section = {Model Design \& Architecture}, + author = {Sarwar, Zain and Panda, Ashwinee and Th\'erien, Benjamin and Rawls, Stephen and Das, Anirban and Balasubramaniam, Kartik and Kapusuzoglu, Berkcan and Zhang, Shixiong and Sahu, Sambit and Naphade, Milind and Chakraborty, Supriyo}, + pages = {182-193}, + abstract = {We introduce StructMoE, a method to scale MoE architectures by augmenting experts with dynamic capacity using structured matrices we call Low Rank Experts (LoRE). 
These LoREs are selected on a per-expert and per-token basis using a secondary router specific to every expert and are entangled with the main expert in the up-projection phase of the expert before the activation function. Empirically, we find this approach to outperform an MoE baseline in terms of loss on a held out validation set.} +} + +@InProceedings{doubov2024sparse, + title = {Sparse Upcycling: Inference Inefficient Finetuning}, + section = {Model Design \& Architecture}, + author = {Doubov, Sasha and Sardana, Nikhil and Chiley, Vitaliy}, + pages = {194-205}, + abstract = {Small, highly trained, open-source LLMs are widely used due to their inference efficiency, but further improving their quality remains a challenge. Sparse upcycling is a promising approach that transforms a pretrained dense model into a Mixture-of-Experts (MoE) architecture, increasing the model’s parameter count and potential quality. In this work, we compare the effectiveness of sparse upcycling against continued pretraining (CPT) across different model sizes, FLOP budgets, and pretraining durations. Our experiments show that sparse upcycling can achieve better quality, with improvements of over 20\% relative to CPT in certain scenarios. However, this comes with a significant inference cost, leading to 40\% slowdowns in high-demand inference settings for larger models. These results highlight the trade-off between model quality and inference efficiency, offering insights for practitioners seeking to balance performance with practical deployment costs.} +} + + +% Model Efficiency & Compression + +@InProceedings{chua2024post-training, + title = {Post-Training Statistical Calibration for Higher Activation Sparsity}, + section = {Model Efficiency \& Compression}, + author = {Seng Chua, Vui and Pan, Yujie and Jain, Nilesh}, + pages = {206-221}, + abstract = {We present Statistical Calibrated Activation Pruning (SCAP), a post-training activation pruning framework that (1) generalizes sparsification by input activations of Fully-Connected layers for generic and flexible application across Transformers, and (2) features a simple Mode-Centering technique to pre-calibrate activation distributions for maximizing post-training sparsity. Our results demonstrate robust Pareto efficiency compared to prior methods, translating to a 1.5× additional LLM decoding speedup against CATS[12] at iso model quality. SCAP effectiveness is empirically verified across a wide range of models, including recent Transformer Decoders, MoE, Mamba2, Encoding Transformer, and pre-quantized models, highlighting its practicality and scalability. The code is available at https://github.com/IntelLabs/SCAP.} +} +@InProceedings{hajimolahoseini2024accelerating, + title = {Accelerating the Low-Rank Decomposed Models}, + section = {Model Efficiency \& Compression}, + author = {Hajimolahoseini, Habib and Ahmed, Walid and Wen, Shuangyue and Liu, Yang}, + pages = {222-231}, + abstract = {Tensor decomposition is a mathematically supported technique for data compression. It consists of applying some kind of a Low Rank Decomposition technique on the tensors or matrices in order to reduce the redundancy of the data. + + +However, it is not a popular technique for compressing AI models due to the high number of new layers added to the architecture after decomposition. Although the number of parameters could shrink significantly, it could result in the model being more than twice as deep, which could add some latency to the training or inference.
In this paper, we present a comprehensive study about how to modify low rank decomposition technique in AI models so that we could benefit from both high accuracy and low memory consumption as well as speeding up the training and inference. } +} +@InProceedings{vasudev2024the, + title = {The {EarlyBird} Gets the {WORM}: Heuristically Accelerating {EarlyBird} Convergence}, + section = {Model Efficiency \& Compression}, + author = {G Vasudev, Adithya}, + pages = {232-240}, + abstract = {The Lottery Ticket hypothesis proposes that ideal, sparse subnetworks, called lottery tickets, exist in untrained dense neural networks. The Early Bird hypothesis proposes an efficient algorithm to find these winning lottery tickets in convolutional neural networks, using the novel concept of distance between subnetworks to detect convergence in the subnetworks of a model. However, this approach overlooks unchanging groups of unimportant neurons near the search's end. We propose WORM, a method that exploits these static groups by truncating their gradients, forcing the model to rely on other neurons. Experiments show WORM achieves faster ticket identification during training on convolutional neural networks, despite the additional computational overhead, when compared to EarlyBird Search. Additionally, WORM-pruned models lose less accuracy during pruning and recover accuracy faster, improving the robustness of a given model. Furthermore, WORM is also able to generalize the Early Bird hypothesis reasonably well to larger models, such as transformers, displaying its flexibility to adapt to more complex architectures.} +} +%#73 no correction +@InProceedings{sharify2024post, + title = {Post Training Quantization of Large Language Models with Microscaling Formats}, + section = {Model Efficiency \& Compression}, + author = {Sharify, Sayeh and Saxena, Utkarsh and Xu, Zifei and Yazar, Wanzin and Soloveychik, Ilya and Wang, Xin}, + pages = {241-258}, + abstract = {Large Language Models (LLMs) have distinguished themselves with outstanding performance in complex language modeling tasks, yet they come with significant computational and storage challenges. This paper explores the potential of quantization to mitigate these challenges. We systematically study the combined application of three well-known post-training techniques, SmoothQuant, AWQ, and GPTQ, and provide a comprehensive analysis of their interactions and implications for advancing LLM quantization. We enhance the versatility of these methods by enabling quantization to microscaling (MX) formats, extending the applicability of these PTQ algorithms beyond their original fixed-point format targets. We show that combining different PTQ methods enables us to quantize models to 4-bit weights and 8-bit activations using the MXINT format with negligible accuracy loss compared to the uncompressed baseline. } +} +@InProceedings{rajabzadeh2024echoatt, + title = {{EchoAtt}: Attend, Copy, then Adjust for More Efficient Large Language Models}, + section = {Model Efficiency \& Compression}, + author = {Rajabzadeh, Hossein and Jafari, Aref and Sharma, Aman and Jami, Benyamin and Ju Hj Kwon, Hyock and Ghodsi, Ali and Chen, Boxing and Rezagholizadeh, Mehdi}, + pages = {259-269}, + abstract = {Large Language Models (LLMs), with their increasing depth and number of parameters, have demonstrated outstanding performance across a variety of natural language processing tasks. 
However, this growth in scale leads to increased computational demands, particularly during inference and fine-tuning. To address these challenges, we introduce \textbf{EchoAtt}, a novel framework aimed at optimizing transformer-based models by analyzing and leveraging the similarity of attention patterns across layers. Our analysis reveals that many inner layers in LLMs, especially larger ones, exhibit highly similar attention matrices. By exploiting this similarity, \textbf{EchoAtt} enables the sharing of attention matrices in less critical layers, significantly reducing computational requirements without compromising performance. We incorporate this approach within a knowledge distillation setup, where a pre-trained teacher model guides the training of a smaller student model. The student model selectively shares attention matrices in layers with high similarity while inheriting key parameters from the teacher. Our best results with TinyLLaMA-1.1B demonstrate that \textbf{EchoAtt} improves inference speed by 15\%, training speed by 25\%, and reduces the number of parameters by approximately 4\%, all while improving zero-shot performance. These findings highlight the potential of attention matrix sharing to enhance the efficiency of LLMs, making them more practical for real-time and resource-limited applications.} +} + +@InProceedings{xu2024scaling, + title = {Scaling laws for post-training quantized large language models}, + section = {Model Efficiency \& Compression}, + author = {Xu, Zifei and Y Lan, Alexander and Yazar, Wanzin and Webb, Tristan and Sharify, Sayeh and Wang, Xin}, + pages = {270-285}, + abstract = {Generalization abilities of well-trained large language models (LLMs) are known to scale predictably as a function of model size. In contrast to the existence of practical scaling laws governing pre-training, the quality of LLMs after post-training compression remains highly unpredictable, often requiring case-by-case validation in practice. In this work, we attempted to close this gap for post-training weight quantization of LLMs by conducting a systematic empirical study on multiple LLM families quantized to numerous low-precision tensor data types using popular weight quantization techniques. We identified key scaling factors pertaining to characteristics of the local loss landscape, based on which the performance of quantized LLMs can be reasonably well predicted by a statistical model. } +} + +@InProceedings{yang2024partially, + title = {Partially Shared Query-Key for Lightweight Language Models}, + section = {Model Efficiency \& Compression}, + author = {Yang, Kai and Partovi Nia, Vahid and Chen, Boxing and Asgharian, Masoud}, + pages = {286-291}, + abstract = {Lightweight language models, such as TinyBERT 14.5M, have emerged as a critical area of research because of their implementation on resource-constrained hardware. These transformer models include significantly smaller parameter size, reduced memory and computational requirements. These features make such models highly suitable for deployment on small devices. We explore the concept of parameter sharing between the key and query weight matrices of a transformer model. The full query-key sharing which has already been proposed in the literature introduces a fully-quadratic attention matrix, oversimplifies directional dependencies and degrades pre-training loss. In contrast, partial parameter sharing balances complexity reduction and performance retention. 
Partial parameter sharing effectively addresses over-fitting while maintaining strong performance even with a high degree of shared parameters up to 95\%. This provides a promising strategy for enhancing language models, specifically targeting small models.} +} + +% Inference +@InProceedings{wu2024snakes, + title = {Snakes and Ladders: Accelerating {SSM} Inference with Speculative Decoding}, + section = {Inference}, + author = {Wu, Yangchao and Dukler, Yonatan and Trager, Matthew and Achille, Alessandro and Xia, Wei and Soatto, Stefano}, + pages = {292-304}, + abstract = {Speculative decoding is a method for accelerating inference in large language models (LLMs) by predicting multiple tokens using a smaller `draft model' and validating them against the larger `base model.' If a draft token is inconsistent with what the base model would have generated, speculative decoding `backtracks' to the last consistent token before resuming generation. This is straightforward in autoregressive Transformer architectures since their state is a sliding window of past tokens. However, their baseline inference complexity is quadratic in the number of input tokens. State Space Models (SSMs) have linear inference complexity, but they maintain a separate Markov state that makes backtracking non-trivial. We propose two methods to perform speculative decoding in SSMs: ``Joint Attainment and Advancement'' and ``Activation Replay.'' Both methods utilize idle computational resources to speculate and verify multiple tokens, allowing us to produce 6 tokens for 1.47$\times$ the cost of one, corresponding to an average 1.82$\times$ wall-clock speed-up on three different benchmarks using a simple $n$-gram for drafting. Furthermore, as model size increases, relative overhead of speculation and verification decreases: Scaling from 1.3B parameters to 13B reduces relative overhead from 1.98$\times$ to 1.22$\times$. Unlike Transformers, speculative decoding in SSMs can be easily applied to batches of sequences, allowing dynamic allocation of resources to fill gaps in compute utilization and thereby improving efficiency and throughput with variable inference traffic.} +} +@InProceedings{kang2024gear, + title = {{GEAR}: An Efficient Error Reduction Framework for {KV} Cache Compression in {LLM} Inference}, + section = {Inference}, + author = {Kang, Hao and Zhang, Qingru and Kundu, Souvik and Jeong, Geonhwa and Liu, Zaoxing and Krishna, Tushar and Zhao, Tuo}, + pages = {305-321}, + abstract = {Key-value (KV) caching has become the de-facto technique to accelerate generation speed for large language models (LLMs) inference. However, the growing cache demand with increasing sequence length has transformed LLM inference to be a memory bound problem, significantly constraining the system throughput. Existing methods rely on dropping unimportant tokens or quantizing entries group-wise. Such methods, however, often incur high approximation errors to represent the compressed matrices. The autoregressive decoding process further compounds the error of each step, resulting in critical deviation in model generation and deterioration of performance. To tackle this challenge, we propose GEAR, an efficient error reduction framework that augments a quantization scheme with two error reduction components and achieves near-lossless performance at high compression ratios. GEAR first applies quantization to majority of entries of similar magnitudes to ultra-low precision. 
It then employs a low-rank matrix to approximate the quantization error, and a sparse matrix to remedy individual errors from outlier entries. By adeptly integrating three techniques, GEAR is able to fully exploit their synergistic potentials. Our experiments show that GEAR can maintain similar accuracy to that of FP16 cache with improvement up to 24.42\% over the SOTA baselines at 2-bit compression. Additionally, compared to LLM inference with FP16 KV cache, GEAR can reduce peak-memory of up to $2.39\times$, bringing $2.1\times\sim 5.07\times$ throughput improvement. Our code will be publicly available. } +} +@InProceedings{stewart2024the, + title = {The {N-Grammys}: Accelerating Autoregressive Inference with Learning-Free Batched Speculation}, + section = {Inference}, + author = {Stewart, Lawrence and Trager, Matthew and Gonugondla, Sujan and Soatto, Stefano}, + pages = {322-335}, + abstract = {Speculative decoding aims to speed up autoregressive generation of a language model by verifying in parallel the tokens generated by a smaller draft model. +In this work, we explore the effectiveness of learning-free, negligible-cost draft strategies, namely $N$-grams obtained from the model weights and the context. While the predicted next token of the base model is rarely the top prediction of these simple strategies, we observe that it is often within their top-$k$ predictions for small $k$. Based on this, we show that combinations of simple strategies can achieve significant inference speedups over different tasks. The overall performance is comparable to more complex methods, yet does not require expensive preprocessing or modification of the base model, and allows for seamless `plug-and-play' integration into pipelines.} +} +@InProceedings{timor2024distributed, + title = {Distributed Speculative Inference of Large Language Models is Provably Faster}, + section = {Inference}, + author = {Timor, Nadav and Mamou, Jonathan and Pereg, Oren and Berchansky, Moshe and Korat, Daniel and Wasserblat, Moshe and Galanti, Tomer and Gordon, Michal and Harel, David}, + pages = {336-354}, + abstract = {Accelerating the inference of large language models (LLMs) is an important challenge in artificial intelligence. This paper introduces Distributed Speculative Inference (DSI), a novel distributed inference algorithm that is provably faster than speculative inference (SI) [leviathan2023fast, chen2023accelerating, miao2023specinfer] and traditional autoregressive inference (non-SI). Like other SI algorithms, DSI works on frozen LLMs, requiring no training or architectural modifications, and it preserves the target distribution. Prior studies on SI have demonstrated empirical speedups (compared to non-SI) but require fast and accurate drafters, which are often unavailable in practice. We identify a gap where SI can be slower than non-SI given slower or less accurate drafters. We close this gap by proving that DSI is faster than both SI and non-SI—given any drafters. DSI introduces a novel type of task parallelism called Speculation Parallelism (SP), which orchestrates target and drafter instances to overlap in time, creating a new foundational tradeoff between computational resources and latency. DSI is not only faster than SI but also supports LLMs that cannot be accelerated with SI. Our simulations show speedups of off-the-shelf LLMs in realistic single-node settings where DSI is 1.29-1.92x faster than SI. 
Our code is open-sourced: github.com/keyboardAnt/distributed-speculative-inference} +} +@InProceedings{agrawal2024adaedl, + title = {{AdaEDL}: Early Draft Stopping for Speculative Decoding of Large Language Models via an Entropy-based Lower Bound on Token Acceptance Probability}, + section = {Inference}, + author = {Agrawal, Sudhanshu and Jeon, Wonseok and Lee, Mingu}, + pages = {355-369}, + abstract = {Speculative decoding is a powerful technique that attempts to circumvent the autoregressive constraint of modern Large Language Models (LLMs). The aim of speculative decoding techniques is to improve the average inference time of a large, target model without sacrificing its accuracy, by using a more efficient draft model to propose draft tokens which are then verified in parallel. The number of draft tokens produced in each drafting round is referred to as the draft length and is often a static hyperparameter chosen based on the acceptance rate statistics of the draft tokens. However, setting a static draft length can negatively impact performance, especially in scenarios where drafting is expensive and there is a high variance in the number of tokens accepted. Adaptive Entropy-based Draft Length (AdaEDL) is a simple, training and parameter-free criteria which allows for early stopping of the token drafting process by approximating a lower bound on the expected acceptance probability of the drafted token based on the currently observed entropy of the drafted logits. We show that AdaEDL consistently outperforms static draft-length speculative decoding by 10\%-57\% as well as other training-free draft-stopping techniques by upto 10\% in a variety of settings and datasets. At the same time, we show that AdaEDL is more robust than these techniques and preserves performance in high-sampling-temperature scenarios. Since it is training-free, in contrast to techniques that rely on the training of dataset-specific draft-stopping predictors, AdaEDL can seamlessly be integrated into a variety of pre-existing LLM systems. } +} +@InProceedings{rajput2024inference-friendly, + title = {Inference-Friendly Models With {MixAttention}}, + section = {Inference}, + author = {Rajput, Shashank and Sheng, Ying and Owen, Sean and Chiley, Vitaliy}, + pages = {370-381}, + abstract = {The size of the key-value (KV) cache plays a critical role in determining both the maximum context length and the number of concurrent requests supported during inference in modern language models. The KV cache size grows proportionally with the number of attention heads and the tokens processed, leading to increased memory consumption and slower inference for long inputs. In this work, we explore the use of MixAttention, a model architecture modification closely related to a blog published by Character.AI. MixAttention combines sliding window attention, where only a small subset of recent tokens is stored in the KV cache, with KV cache sharing across layers. Our experiments demonstrate that MixAttention significantly reduces memory usage and improves inference speed without sacrificing model performance in both short and long-context tasks. 
We also explore various configurations of this architecture, identifying those that maintain quality across evaluation metrics while optimizing resource efficiency.} +} +@InProceedings{lu2024improving, + title = {Improving Multi-candidate Speculative Decoding}, + section = {Inference}, + author = {Lu, XiaoFan and Zeng, Yixiao and Levorato, Marco and Ma, FeiYang and Yu, ZiXu}, + pages = {382-394}, + abstract = {Speculative Decoding (SD) is a technique to accelerate the inference of Large Language Models (LLMs) by using a lower complexity draft model to propose candidate tokens verified by a larger target model. To further improve efficiency, Multi-Candidate Speculative Decoding (MCSD) improves upon this by sampling multiple candidate tokens from the draft model at each step and verifying them in parallel, thus increasing the chances of accepting a token and reducing generation time. Existing MCSD methods rely on the draft model to initialize the multi-candidate sequences and use static length and tree attention structure for draft generation. However, such an approach suffers from the draft and target model's output distribution differences, especially in a dynamic generation context. In this work, we introduce a new version of MCSD that includes a target model initialized multi-candidate generation, a dynamic sliced topology-aware causal mask for dynamic length adjustment, and decision models to optimize early stopping. We experimented with our method on Llama 2-7B and its variants and observed a maximum 27.5\% speedup compared to our MCSD baseline across three benchmarks with Llama 2-7B as the target model and JackFram 68M as the draft model. Additionally, we evaluate the effects of using the target model initialized multi-candidate process with different draft models on output quality.} +} + +@InProceedings{bhendawade2024speculative, + title = {Speculative Streaming: Fast {LLM} Inference without Auxiliary Models}, + section = {Inference}, + author = {Bhendawade, Nikhil and Belousova, Irina and Fu, Qichen and Mason, Henry and Rastegari, Mohammad and Najibi, Mahyar}, + pages = {395-413}, + abstract = {Speculative decoding is a prominent technique to accelerate large language model inference by leveraging predictions from an auxiliary draft model. While effective, in application-specific settings, it often involves fine-tuning both draft and target models to achieve high acceptance rates. As the number of downstream tasks grows, draft models add significant complexity to inference systems. Recently several single model architectures viz. Medusa have been proposed to speculate tokens in non-autoregressive manner, however, their effectiveness is limited due to lack of dependency between speculated tokens. We introduce a novel speculative decoding method that integrates drafting within the target model by using Multi-stream attention and incorporates future token planning into supervised fine-tuning objective. To the best of our knowledge, it is the first parameter-efficient approach that scales well with number of downstream tasks while improving downstream metrics. Speculative Streaming speeds up decoding by 1.9 - 3X in a diverse set of tasks, such as Summarization, Structured Queries, and Meaning Representation, while improving generation quality and using 10000X fewer extra parameters than alternative architectures, making it ideal for resource-constrained devices. 
Our approach can also be effectively deployed in lossless settings for generic chatbot applications that do not necessitate fine-tuning. In such setups, we achieve a 2.9 - 3.2X speedup while maintaining the integrity of the base model's output.} +} +@InProceedings{kimhi2024hysteresis, + title = {Hysteresis Activation Function for Efficient Inference}, + section = {Inference}, + author = {Kimhi, Moshe and Kashani, Idan and Baskin, Chaim and Mendelson, Avi}, + pages = {414-422}, + abstract = {The widely used ReLU is favored for its hardware efficiency yet suffers from issues such as the ``dying ReLU'' problem, where during training, neurons fail to activate and constantly remain at zero, as highlighted by Lu et al.~\citep{lu2018collapse}. Traditional approaches to mitigate this issue often introduce more complex and less hardware-friendly activation functions. In this work, we propose a Hysteresis Rectified Linear Unit (HeLU), an efficient activation function designed to address the ``dying ReLU'' problem with minimal complexity. Unlike traditional activation functions with fixed thresholds for training and inference, HeLU employs a variable threshold that refines the backpropagation. This refined mechanism allows simpler activation functions to achieve performance comparable to that of their more complex counterparts without introducing unnecessary complexity or requiring inductive biases. Empirical evaluations demonstrate that HeLU enhances model generalization across diverse datasets, offering a promising solution for efficient and effective inference suitable for a wide range of neural network architectures.} +} +@InProceedings{sharma2024efficiently, + title = {Efficiently Dispatching Flash Attention For Partially Filled Attention Masks}, + section = {Inference}, + author = {Sharma, Agniv and A. Geiping, Jonas}, + pages = {423-442}, + abstract = {Transformers are widely used across various applications, many of which yield sparse or partially filled attention matrices. Examples include attention masks designed to reduce the quadratic complexity of attention, sequence packing techniques, and recent innovations like tree masking for fast validation in MEDUSA. Despite the inherent sparsity in these matrices, the state-of-the-art algorithm Flash Attention still processes them with quadratic complexity as though they were dense. In this paper, we introduce \textbf{Binary Block Masking}, a highly efficient modification that enhances Flash Attention by making it mask-aware. We further propose two optimizations: one tailored for masks with contiguous non-zero patterns and another for extremely sparse masks. Our experiments on attention masks derived from real-world scenarios demonstrate up to a 9x runtime improvement. The implementation will be publicly released to foster further research and application.} +} +@InProceedings{alizadeh-vahid2024duo-llm, + title = {{Duo-LLM}: A Framework for Studying Adaptive Computation in Large Language Models}, + section = {Inference}, + author = {Alizadeh-Vahid, Keivan and Iman Mirzadeh, Seyed and Shahrkokhi, Hooman and Belenko, Dmitry and Sun, Frank and Cho, Minsik and Hossein Sekhavat, Mohammad and Nabi, Moin and Farajtabar, Mehrdad}, + pages = {443-455}, + abstract = {Large Language Models (LLMs) typically generate outputs token by token using a fixed compute budget, leading to inefficient resource utilization.
To address this shortcoming, recent advancements in mixture-of-experts (MoE) models, speculative decoding, and early exit strategies leverage the insight that computational demands can vary significantly based on the complexity and nature of the input. However, identifying optimal routing patterns for dynamic execution remains an open challenge, limiting the full potential of these adaptive methods. Motivated by this, we study adaptive computation in LLMs more systematically. We propose a novel framework that integrates smaller auxiliary modules within each Feed-Forward Network layer of the LLM. This design enables dynamic routing of tokens based on task complexity: tokens can be processed by either the small or big modules at each layer, or even bypass certain layers entirely. This allows us to introduce a novel notion of a token's difficulty, defined by its potential to benefit from additional computational resources. Importantly, by employing oracles to identify optimal patterns of adaptive computations, we gain valuable insights into the internal workings of LLMs and the routing processes in a simplified heterogeneous MoE setup. We show that trained routers operate differently from oracles and often yield suboptimal solutions. Notably, activating a large module in just one layer outperforms models that use large modules across all layers, underscoring the gap between practical implementations of routing in MoE models and theoretical optima for adaptive computation.} +} +@InProceedings{mamou2024dynamic, + title = {Dynamic Speculation Lookahead Accelerates Speculative Decoding of Large Language Models}, + section = {Inference}, + author = {Mamou, Jonathan and Pereg, Oren and Korat, Daniel and Berchansky, Moshe and Timor, Nadav and Wasserblat, Moshe and Schwartz, Roy}, + pages = {456-467}, + abstract = {Speculative decoding is commonly used for reducing the inference latency of large language models. Its effectiveness depends highly on the speculation lookahead (SL), the number of tokens generated by the draft model at each iteration. In this work, we show that the common practice of using the same SL for all iterations (static SL) is suboptimal. We introduce DISCO (DynamIc SpeCulation lookahead Optimization), a novel method for dynamically selecting the SL. Our experiments with four datasets show that DISCO reaches an average speedup of 10\% compared to the best static SL baseline, while generating the exact same text.} +} +@InProceedings{wang2024cskv, + title = {{CSKV}: Training-Efficient Channel Shrinking for {KV} Cache in Long-Context Scenarios}, + section = {Inference}, + author = {Wang, Luning and Li, Shiyao and Ning, Xuefei and Yuan, Zhihang and Yan, Shengen and Dai, Guohao and Wang, Yu}, + pages = {468-484}, + abstract = {Large Language Models (LLMs) have been widely adopted to process long-context tasks. However, the large memory overhead of the key-value (KV) cache poses significant challenges in long-context scenarios. Existing training-free KV cache compression methods typically focus on quantization and token pruning, which have compression limits, and excessive sparsity can lead to severe performance degradation. Other methods design new architectures with less KV overhead but require significant training overhead. To address the above two drawbacks, we further explore the redundancy in the channel dimension and apply an architecture-level design with minor training costs.
Therefore, we introduce CSKV, a training-efficient Channel Shrinking technique for KV cache compression: (1) We first analyze the singular value distribution of the KV cache, revealing significant redundancy and compression potential along the channel dimension. Based on this observation, we propose using low-rank decomposition for key and value layers and storing the low-dimension features. (2) To preserve model performance, we introduce a bi-branch KV cache, including a window-based full-precision KV cache and a low-precision compressed KV cache. (3) To reduce the training costs, we minimize the layer-wise reconstruction loss for the compressed KV cache instead of retraining the entire LLM. Extensive experiments show that CSKV can reduce the memory overhead of the KV cache by 80\% while maintaining the model's long-context capability. Moreover, we show that our method can be seamlessly combined with quantization to further reduce the memory overhead, achieving a compression ratio of up to 95\%. Code is available at https://github.com/wln20/CSKV.} +} +@InProceedings{kumar2024residual, + title = {Residual vector quantization for {KV} cache compression in large language model}, + section = {Inference}, + author = {Kumar, Ankur}, + pages = {485-490}, + abstract = {KV cache compression methods have mainly relied on scalar quantization techniques to reduce the memory requirements during decoding. In this work, we apply residual vector quantization, which has been widely used for high-fidelity audio compression, to compress the KV cache in large language models (LLMs). We adapt the standard recipe with minimal changes to compress the output of any key or value projection matrix in a pretrained LLM: we scale the vector by its standard deviation, divide channels into groups, and then quantize each group with the same residual vector quantizer. We learn the codebook using an exponential moving average, and there are no other learnable parameters, including the input and output projections normally used in a vector quantization setup. We find that a residual depth of 8 recovers most of the performance of the unquantized model. We also find that grouping non-contiguous channels together works better than grouping contiguous channels for compressing the key matrix, and the method further benefits from a lightweight finetuning of the LLM together with the quantization. Overall, the proposed technique is competitive with existing quantization methods while being much simpler and results in $\sim$5.5x compression compared to half precision.} +} + +% Benchmark & Evaluation +@InProceedings{pieler2024rephrasing, + title = {Rephrasing natural text data with different languages and quality levels for Large Language Model pre-training}, + section = {Benchmark \& Evaluation}, + author = {Pieler, Michael and Bellagente, Marco and Teufel, Hannah and Phung, Duy and Cooper, Nathan and Tow, Jonathan and Rocha, Paulo and Adithyan, Reshinth and Alyafeai, Zaid and Pinnaparaju, Nikhil and Zhuravinskyi, Maksym and Riquelme, Carlos}, + pages = {491-511}, + abstract = {Recently published work on rephrasing natural text data for pre-training LLMs has shown promising results when combining the original dataset with the synthetically rephrased data. We build upon previous work by replicating existing results on C4 and extending them with our optimized rephrasing pipeline to the English, German, Italian, and Spanish Oscar subsets of CulturaX.
Our pipeline leads to increased performance on standard evaluation benchmarks in both the mono- and multilingual setups. In addition, we provide a detailed study of our pipeline, investigating the choice of the base dataset and LLM for the rephrasing, as well as the relationship between the model size and the performance after pre-training. By exploring data with different perceived quality levels, we show that gains decrease with higher quality. Furthermore, we find the difference in performance between model families to be larger than that between different model sizes. This highlights the necessity for detailed tests before choosing an LLM to rephrase large amounts of data. Moreover, we investigate the effect of pre-training with synthetic data on supervised fine-tuning. Here, we find increasing but inconclusive results that depend strongly on the benchmark used. These results (again) highlight the need for better benchmarking setups. In summary, we show that rephrasing multilingual and low-quality data is a very promising direction to extend LLM pre-training data.} +} +@InProceedings{kasmaee2024chemteb, + title = {{ChemTEB}: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + section = {Benchmark \& Evaluation}, + author = {Shiraee Kasmaee, Ali and Khodadad, Mohammad and Arshi Saloot, Mohammad and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + pages = {512-531}, + abstract = {Recent advancements in language models have started a new era of superior information retrieval and content generation, with embedding models playing an important role in optimizing data representation efficiency and performance. While benchmarks like the Massive Text Embedding Benchmark (MTEB) have standardized the evaluation of general domain embedding models, a gap remains in specialized fields such as chemistry, which require tailored approaches due to domain-specific challenges. +This paper introduces a novel benchmark, the Chemical Text Embedding Benchmark (ChemTEB), designed specifically for the chemical sciences. ChemTEB addresses the unique linguistic and semantic complexities of chemical literature and data, offering a comprehensive suite of tasks on chemical domain data. +Through the evaluation of 34 open-source and proprietary models using this benchmark, we illuminate the strengths and weaknesses of current methodologies in processing and understanding chemical information. Our work aims to equip the research community with a standardized, domain-specific evaluation framework, promoting the development of more precise and efficient NLP models for chemistry-related applications. Furthermore, it provides insights into the performance of generic models in a domain-specific context. +ChemTEB comes with open-source code and data, contributing further to its accessibility and utility.} +} +@InProceedings{thielmann2024on, + title = {On the Efficiency of {NLP}-Inspired Methods for Tabular Deep Learning}, + section = {Benchmark \& Evaluation}, + author = {F Thielmann, Anton and Samiee, Soheila}, + pages = {532-539}, + abstract = {Recent advancements in tabular deep learning (DL) have led to substantial performance improvements, surpassing the capabilities of traditional models. +With the adoption of techniques from natural language processing (NLP), such as language model-based approaches, DL models for tabular data have also grown in complexity and size.
+Although tabular datasets do not typically pose scalability issues, the escalating size of these models has raised efficiency concerns. Despite its importance, efficiency has been relatively underexplored in tabular DL research. This paper critically examines the latest innovations in tabular DL, with a dual focus on performance and computational efficiency. +The source code is available at https://github.com/basf/mamba-tabular.} +} + +% Applications +@InProceedings{ardestani2024text, + title = {Text Summarization With Graph Attention Networks}, + section = {Applications}, + author = {Ardestani, Mohammadreza and Chali, Yllias}, + pages = {540-553}, + abstract = {This study aimed to leverage graph information, particularly Rhetorical Structure Theory (RST) and Co-reference (Coref) graphs, to enhance the performance of our baseline summarization models. Specifically, we experimented with a Graph Attention Network architecture to incorporate graph information. However, this architecture did not enhance the performance. Subsequently, we used a simple Multi-layer Perceptron architecture, which improved the results in our proposed model on our primary dataset, CNN/DM. Additionally, we annotated the XSum dataset with RST graph information, establishing a benchmark for future graph-based summarization models. This secondary dataset posed multiple challenges, revealing both the merits and limitations of our models.} +} + +@InProceedings{rajasekhar2024less, + title = {Less is Enough: Adapting Pre-trained Vision Transformers for Audio-Visual Speaker Verification}, + section = {Applications}, + author = {Praveen Rajasekhar, Gnana and Alam, Jahangir}, + pages = {554-563}, + abstract = {Speaker Verification has achieved significant improvements in performance using sophisticated deep learning architectures specialized for speech signals, as well as robust loss functions. Recently, the fusion of faces and voices has received a lot of attention, as they offer a complementary relationship with each other, which has the potential to outperform systems that use only speech signals. Inspired by the massive success of Vision Transformers (ViTs) in computer vision, ViTs have also been explored for multimodal learning. In this work, we have investigated the potential of ViTs, pre-trained on visual data, for audio-visual speaker verification. To cope with the challenges of large-scale training, we introduce the Latent Audio-Visual Vision Transformer (LAVViT) adapters, where we exploit the existing pre-trained models on visual data by training only the parameters of LAVViT adapters, without fine-tuning the original parameters of the pre-trained models. The LAVViT adapters are injected into every layer of the ViT architecture to effectively fuse the audio and visual modalities using a small set of latent tokens, thereby avoiding the quadratic computational cost of cross-attention across the modalities. The proposed approach has been evaluated on the VoxCeleb1 dataset and shows promising performance using only a few trainable parameters.} +} + +@InProceedings{fathan2024enhanced, + title = {Enhanced label noise robustness through early adaptive filtering for the self-supervised speaker verification task}, + section = {Applications}, + author = {Fathan, Abderrahim and Zhu, Xiaolin and Alam, Jahangir}, + pages = {564-575}, + abstract = {Using clustering-driven annotations to train a neural network can be a tricky task because of label noise.
In this paper, we propose a dynamic and adaptive label noise filtering method, called AdaptiveDrop, which applies both label noise cleansing and correction simultaneously in cascade to combine their advantages. Contrary to other label noise filtering approaches, our method filters noisy samples on the fly from an early stage of training. We also provide a variant that incorporates sub-centers per class for enhanced robustness to label noise by continuously tracking the dominant sub-centers via a dictionary table. AdaptiveDrop is a simple, general-purpose method: it is performed end-to-end in only one stage of training, can be integrated with any loss function, and does not require training from scratch on the cleansed dataset. We show through extensive ablation studies for the self-supervised speaker verification task that our method is effective, benefits from long epochs of iterative filtering, and provides consistent performance gains across various loss functions and real-world pseudo-labels.} +} + +@InProceedings{chaparala2024mai, + title = {{Mai Ho`omāuna i ka `Ai}: Language Models Improve Automatic Speech Recognition in Hawaiian}, + section = {Applications}, + author = {D Chaparala, Kaavya and Zarrella, Guido and Torres Fischer, Bruce and Kimura, Larry and Parker Jones, Oiwi}, + pages = {576-583}, + abstract = {In this paper, we address the challenge of improving Automatic Speech Recognition (ASR) for a low-resource language, Hawaiian, by incorporating large amounts of independent text data into an ASR foundation model, Whisper. To do this, we train an external language model (LM) on $\sim$1.5M words of Hawaiian text. We then use the LM to rescore Whisper and compute word error rates (WERs) on a manually curated test set of labeled Hawaiian data. As a baseline, we use Whisper without an external LM. Experimental results reveal a small but significant improvement in WER when ASR outputs are rescored with a Hawaiian LM. The results support leveraging all available data in the development of ASR systems for underrepresented languages.} +} + +@InProceedings{shinde2024lightweight, + title = {Lightweight Neural Networks for Speech Emotion Recognition using Layer-wise Adaptive Quantization}, + section = {Applications}, + author = {Shinde, Tushar and Jain, Ritika and Kumar Sharma, Avinash}, + pages = {584-595}, + abstract = {Speech Emotion Recognition (SER) systems are essential in advancing human-machine interaction. While deep learning models have shown substantial success in SER by eliminating the need for handcrafted features, their high computational and memory requirements, alongside intensive hyper-parameter optimization, limit their deployment on resource-constrained edge devices. To address these challenges, we introduce an optimized and computationally efficient Multilayer Perceptron (MLP)-based classifier within a custom SER framework. We further propose a novel, layer-wise adaptive quantization scheme that compresses the model by adjusting bit-width precision according to layer importance. This layer importance is calculated based on statistical measures such as parameter proportion, entropy, and weight variance within each layer. Our approach achieves an optimal balance between model size reduction and performance retention, ensuring that the quantized model maintains accuracy within acceptable limits. Traditional fixed-precision methods, while computationally simple, are less effective at reducing model size without compromising performance.
In contrast, our scheme provides a more interpretable and computationally efficient solution. We evaluate the proposed model on standard SER datasets using features such as Mel-Frequency Cepstral Coefficients (MFCC), Chroma, and Mel-spectrogram. Experimental results demonstrate that our adaptive quantization method achieves performance competitive with state-of-the-art models while significantly reducing model size, making it highly suitable for deployment on edge devices.} +} + +@InProceedings{chen2024onlysportslm, + title = {{OnlySportsLM}: Optimizing Sports-Domain Language Models with {SOTA} Performance under Billion Parameters}, + section = {Applications}, + author = {Chen, Zexin and Li, Chengxi and Xie, Xiangyu and Dube, Parijat}, + pages = {596-610}, + abstract = {This paper explores the potential of a small, domain-specific language model trained exclusively on sports-related data. We investigate whether extensive training data with specially designed small model structures can overcome model size constraints. The study introduces the OnlySports collection, comprising OnlySportsLM, the OnlySports Dataset, and the OnlySports Benchmark. Our approach involves: 1) creating a massive 600-billion-token OnlySports Dataset from FineWeb, 2) optimizing the RWKV architecture for sports-related tasks, resulting in a 196M-parameter model with a 20-layer, 640-dimension structure, 3) training OnlySportsLM on part of the OnlySports Dataset, and 4) testing the resulting model on the OnlySports Benchmark. OnlySportsLM achieves a 37.62\%/34.08\% accuracy improvement over previous 135M/360M state-of-the-art models and matches the performance of larger models such as SmolLM 1.7B and Qwen 1.5B in the sports domain. Additionally, the OnlySports collection presents a comprehensive workflow for building high-quality, domain-specific language models, providing a replicable blueprint for efficient AI development across various specialized fields.} +} diff --git a/index.html b/index.html new file mode 100644 index 0000000..e4d427d --- /dev/null +++ b/index.html @@ -0,0 +1,3 @@ +--- +layout: home +---