<!DOCTYPE html>
<!-- index.html -->

<html lang="en">

<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	<script src='https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.4/MathJax.js?config=default'></script>
	<!-- <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css"> -->
	<!-- <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/atom-one-dark.min.css"> -->
	<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/github.min.css">
	<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>	
	<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/languages/python.min.js"></script>
	<script>hljs.highlightAll();</script>
	<script>
		/**
		 * Copy the text of the first `<pre><code>` element on the page to the
		 * system clipboard.
		 *
		 * Does nothing when the page has no such element or when the async
		 * Clipboard API is unavailable (navigator.clipboard is undefined outside
		 * secure contexts). A rejected write is logged rather than left as an
		 * unhandled promise rejection.
		 */
		function copyCode() {
		  const codeElement = document.querySelector('pre code');
		  if (!codeElement || !navigator.clipboard) {
		    return;
		  }
		  navigator.clipboard.writeText(codeElement.innerText).catch(function (err) {
		    console.error('Failed to copy code to the clipboard:', err);
		  });
		  // Optionally provide user feedback, e.g., changing the button text
		}
	  </script>
	  <script>
        // Wire the "+" / "-" button so it shows or hides the raw-data table
        // (#toggle-content) and updates the hint text next to it (#toggle-text).
        document.addEventListener("DOMContentLoaded", function() {
            var button = document.getElementById("toggle-button");
            var content = document.getElementById("toggle-content");
            var toggle_text = document.getElementById("toggle-text");

            // Without all three elements there is nothing to toggle; return
            // instead of throwing on a null dereference.
            if (!button || !content || !toggle_text) {
                return;
            }

            button.addEventListener("click", function() {
                // An empty inline `display` is treated the same as "none", so the
                // first click always expands the table.
                // NOTE(review): this presumes the stylesheet hides .toggle-content
                // initially -- confirm in styling.css.
                var isHidden = content.style.display === "none" || content.style.display === "";

                if (isHidden) {
                    content.style.display = "block";
                    button.textContent = "-";
                    toggle_text.textContent = "Press the - sign to hide the raw datapoints for the above graph.";
                } else {
                    content.style.display = "none";
                    button.textContent = "+";
                    // innerHTML (not textContent) because the collapsed hint contains <b> markup.
                    toggle_text.innerHTML = "<b>Tabular Data:</b> Press the + sign  to see  the raw datapoints for the above graph.";
                }
            });
        });
    	</script>
		<!-- <script>
			function toggleContent() {
				var content = document.getElementById("content");
				var button = document.getElementById("toggle-button");

				button.addEventListener("click", function() {
					if (content.style.display === "none" || content.style.display === "") {
						content.style.display = "block";
						button.textContent = "-";
					} else {
						content.style.display = "none";
						button.textContent = "+";
					}
				});
			}
			</script> -->

	<!-- <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/3.2.2/es5/latest.min.js"></script> -->

	<title>Faster and Smaller Whisper: A Deep Dive into Quantization and Torch Compilation</title>
	<link rel="stylesheet" type="text/css" href="styling.css">
	<link rel="icon" type="image/png" href="figs/aana_logo.png">	
	<link rel="stylesheet" href="https://use.typekit.net/pnf5khj.css">
	<!-- <link href='https://fonts.googleapis.com/css?family=Poppins' rel='stylesheet'> -->
	<link href="https://fonts.googleapis.com/css2?family=Merriweather:ital,wght@0,300;0,400;0,700;0,900;1,300;1,400;1,700;1,900&family=Poppins:ital,wght@0,100;0,200;0,300;0,400;0,500;0,600;0,700;0,800;0,900;1,100;1,200;1,300;1,400;1,500;1,600;1,700;1,800;1,900&display=swap" rel="stylesheet">


	<!-- <link rel="preconnect" href="https://fonts.googleapis.com">
	<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
	<link href="https://fonts.googleapis.com/css2?family=Oxygen&family=Source+Serif+4:ital,opsz,wght@0,8..60,200..900;1,8..60,200..900&display=swap" rel="stylesheet"> -->


	<meta name="description" content="Faster and Smaller Whisper: A Deep Dive into Quantization and Torch Compilation">

	<meta name="keywords"
		content="Whisper, ASR, Automatic Speech Recognition, Machine Learning, Transformer Models, Neural Networks, AI Optimization, Faster Whisper, Quantization, HQQ">


	<!-- Specific tags for Open Graph / social media sharing -->
	<meta property="og:title" content="Faster and Smaller Whisper: A Deep Dive into Quantization and Torch Compilation">
	<meta property="og:description"
		content="A support blog for speeding up whisper by batch processing.">
	<meta property="og:image" content="https://mobiusml.github.io/batched_whisper_blog/figs/aana_whisper_hqq_compile.png">
	<meta property="og:url" content="https://mobiusml.github.io/batched_whisper_blog/">
	<meta property="og:type" content="article">

	<!-- Twitter Card data -->
	<meta name="twitter:card" content="summary_large_image">
	<meta name="twitter:title" content="Speeding up Whisper (ASR)">
	<meta name="twitter:description"
		content="A support blog for the release of fast and small whisper model.">
	<meta name="twitter:image" content="https://mobiusml.github.io/batched_whisper_blog/figs/aana_whisper_hqq_compile.png">
	<meta name="twitter:creator" content="@appughar">

	<!-- Meta tags for article publishing date and modification date -->
	<meta name="article:published_time" content="2024-03-27T08:00:00+00:00">
	<meta name="article:modified_time" content="2024-03-27T09:00:00+00:00">
</head>

<body>
	<article id="speeding up whisper" class="page sans">
		<header>
			<h1 class="page-title">Faster and Smaller Whisper: A Deep Dive into Quantization and Torch Compilation</h1>

		</header>
		<div class="page-body">
			<p><a href="https://scholar.google.de/citations?user=KcoTPWoAAAAJ&hl=en"><mark
						class="highlight-gray">Jilt Sebastian</mark></a><mark class="highlight-gray"></mark>,
						<a href="https://github.com/huseinzol05"><mark
							class="highlight-gray">Husein Zolkepli</mark></a><mark class="highlight-gray"></mark>,
						<a href="https://scholar.google.com/citations?user=LxweMX4AAAAJ&hl=en"><mark
						class="highlight-gray">Hicham Badri</mark></a><mark class="highlight-gray"></mark>,
						<a
					href="https://scholar.google.com/citations?user=HxZDDzUAAAAJ&hl=en"><mark
						class="highlight-gray">Appu Shaji</mark></a><mark class="highlight-gray"></mark></p>
			<p><mark class="highlight-gray"><a href="https://www.mobiuslabs.com/"><mark class="highlight-gray">Mobius
							Labs GmbH</mark></a> and <a href="https://mesolitica.com/">Mesolitica</a></mark></p>
			<hr />
			<h2 id="intro" class="">Introduction</h2>

			<p>Recent years have witnessed remarkable advancements in artificial intelligence, propelling rapid growth in automatic speech recognition (ASR) technologies. Soon after its release, OpenAI's <a href="https://github.com/openai/whisper">Whisper</a> model quickly gained prominence due to its open licensing, competitive performance against proprietary models, and strong generalization capabilities. Despite being in the field for over two years, Whisper models continue to be highly relevant and are the go-to workhorse for many large-scale ASR systems deployed worldwide.</p>

			<div class="column-list">
				<div style="width:49%" class="column">
					<div class="caption">Baseline</div>
					<img src="figs/transcription_speed_default_opt.gif" />
					
				</div>
				<div style="width:49%" class="column">
					<div class="caption">Torch Compile + HQQ</div> 
					<img src="figs/transcription_compile_hqq_opt.gif" />					
				</div>

			</div>
		
			<p>In this blog post, we explain the techniques we used to enhance the performance of the PyTorch-based Whisper models. By leveraging transformers, implementing a <a href="https://github.com/huggingface/transformers/pull/30760">static cache</a>, and utilizing <a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html">torch.compile</a>, we significantly accelerated the model's inference speed. Additionally, we employed <a href="https://github.com/mobiusml/hqq">HQQ</a> to quantize the Whisper models to 4 bits, preserving transcription quality with minimal degradation, as evaluated by Word Error Rate (WER) benchmarks. Our optimizations resulted in a <b>4.5x</b> speedup for non-quantized models and an impressive <b>6x</b> speedup for quantized models. Moreover, we report detailed quantization results and found that Whisper can be run in extremely low-bit configurations.</p>

			
			<p>Furthermore, we provide benchmarks across various Automatic Speech Recognition (ASR) datasets to demonstrate the effectiveness of our optimizations. This post will delve into the methods and processes behind these improvements, providing insights into the power of optimized kernels and quantization.</p>


			<hr id="header_seperator" />
			<div class="column-list">
				<div style="width:32%" class="column">
					<!-- <p class="page-description"><img src="./baby_aana.png" /></p> -->
					<figure class="image" style="text-align:left"><a href="figs/aana_whisper_hqq_compile.png"><img style="width:240px"
								src="figs/aana_whisper_hqq_compile.png" /></a>
					</figure>
					<p>
						<strong>Table of Contents</strong>
					</p>
					<nav class="block-color-gray table_of_contents">
						<div class="table_of_contents-item table_of_contents-indent-0"><a class="table_of_contents-link"
								href="#intro">Introduction</a></div>
						<div class="table_of_contents-item table_of_contents-indent-0"><a class="table_of_contents-link"
								href="#speed-optimization">Speed Optimization</a>	
						</div>						
						<div class="table_of_contents-item table_of_contents-indent-0"><a class="table_of_contents-link"
							href="#quantization">Quantization</a>	
					</div>						
						<div class="table_of_contents-item table_of_contents-indent-0"><a
							class="table_of_contents-link" href="#benchmarks">Benchmarks</a>
						</div>						
						<div class="table_of_contents-item table_of_contents-indent-0"><a
								class="table_of_contents-link" href="#low-bit">Extreme Quantization</a>
						</div>

						<hr />

						<!-- <p><strong> Test it out at </strong></p>
						<ul style="list-style-type:none;"> 
						 <p><a target="_blank" href="https://github.com/mobiusml/faster-whisper">
							<img src="https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo-with-title.svg" alt="Batched Faster Whisper" width="120"></a></p>
						<p><a target="_blank" href="https://colab.research.google.com/drive/1ywpZVj1NwZ2Tre0KOb_a2J_2S1hbzx0P?usp=sharing">
							<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
							</a></p>							
						

						<hr />
						 -->
						<p><strong>Code</strong></p>
						<a target="_blank" href="https://colab.research.google.com/drive/18Zs-oG1Ztco3cfnNexcHDi-Zn9vk2RJ5?usp=sharing">
							<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
						  </a>

						<p><strong> Talk to us at </strong></p>
						<a href="https://discord.gg/du5KZ66JSM"><img
							src="https://icons.iconarchive.com/icons/bootstrap/bootstrap/48/Bootstrap-discord-icon.png"
							width="24"></a>
						<a href="https://twitter.com/Mobius_Labs"><img
							src="https://upload.wikimedia.org/wikipedia/commons/thumb/c/ce/X_logo_2023.svg/450px-X_logo_2023.svg.png"
							width="24"></a>
					
						<hr />
							
						
					</nav>

				</div>
				<div style="width:75%" class="column">
			
					
					<h2 id="speed-optimization" class="">Speed Optimization </h2>
					<p><a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html">torch.compile</a> compiles PyTorch models into optimized Triton kernels and can often result in significant speedups for various PyTorch-based models. One of the prerequisites and critical steps toward achieving this is managing a static KV-cache, as detailed in <a href="https://pytorch.org/blog/accelerating-generative-ai-2/">https://pytorch.org/blog/accelerating-generative-ai-2/</a>. During inference, the KV-cache stores intermediate outputs of the model in the attention modules. However, due to varying prompts and generation lengths, these caches are implemented via <a href="https://github.com/openai/whisper/blob/ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab/whisper/model.py#L301">dynamic allocation</a>, which creates overhead. Instead, torch.compile requires the KV-cache to be maintained statically by pre-allocating the maximum size. </p>
					
					<p>The first step we took was to implement a static cache for the Whisper model to make it fully compatible with torch.compile. A gist of the implementation is available <a href="https://gist.github.com/huseinzol05/9aff34ec1427ee8c92240cb4f3cc0c88">here</a>. As far as we know, this is the first implementation of a <a href="https://github.com/huggingface/transformers/pull/30760#discussion_r1606974803">static cache for encoder-decoder models in the Transformers library</a>, and hopefully, it will be useful for other encoder-decoder architectures. </p>

					
					<h2 id="quantization" class="">Quantization</h2>
					<p>We further quantized models to 4 bits using <a href="https://github.com/mobiusml/hqq">Half-Quadratic Quantization (HQQ)</a>. HQQ is a fast and accurate model quantizer that does not need any calibration data. Additionally, it fully supports torch.compile. Though quantization techniques can reduce VRAM requirements, they require optimized fused kernels to run faster. Recent developments have introduced low-bit matmul kernels that offer significant speed-ups, such as TorchAO's <a href="https://pytorch.org/cppdocs/api/function_namespaceat_1adeda9630914278ac02d7fd758da19e3d.html">tiny_gemm</a>, and HQQ <a href="https://github.com/mobiusml/hqq/blob/aad68687e042ed628b5a655969406d501a203949/examples/backends/torchao_int4_demo.py">can leverage these kernels</a>.</p>

					<p>Our previous experience with quantizing LLMs has shown that there is only a very marginal drop in quality (measured as word error rate) when quantizing to 4 bits, while achieving further speedup compared to vanilla PyTorch compiled models. Moreover, the GPU VRAM requirements for the decoder can be reduced by a factor of 3 to 4.</p>

					<p>We only quantize the decoder linear layers of Whisper for the following reason: the encoder linear layers involve mainly matrix-matrix computations (GEMM), whereas the decoder layers primarily involve matrix-vector multiplications (GEMV) during the decoding phase. The TorchAO kernel is optimized to speed up GEMV operations with 4-bit quantized weights.</p>

					<p>Even if the encoder performance is optimized, its impact on the overall runtime is marginal compared to the decoder. The encoder is only run once, whereas the decoder must be run many times, once per token generated.</p>

					
					<h2 id="benchmarks">Benchmarks</h2>
					<p>We conducted two sets of benchmarking experiments to validate the speed-up using torch.compile and HQQ. The <a href="https://huggingface.co/datasets/open-asr-leaderboard/datasets">Open ASR eval dataset</a> is used in the first experiment to measure the speed improvement and effect on the WER, and a test set of audio extracts from YouTube videos is used in the second benchmarking. We find that in real-life use cases, transcripts are generally long, hence the second dataset is included. We used the 'large-v2' model with a batch size of 1 and greedy decoding (no sampling) for the experiments.</p>
						
					<p>The <a href="https://openvoice-tech.net/index.php/Real-time-factor">Real Time Factor (RTF)</a> metric, commonly used in open-source benchmarks like <a href="https://huggingface.co/spaces/hf-audio/open_asr_leaderboard">Open ASR Leaderboard</a> evaluation, measures the speed of offline ASR systems by comparing total processing time to audio duration. We use 1/RTF in our benchmarking experiments as it is more interpretable once the speed becomes faster than the real-time speed, which is a common offline ASR requirement. We used <code>EnglishTextNormalizer()</code> from <a href="https://github.com/huggingface/open_asr_leaderboard">Open ASR Eval</a> as the text normalizer to compute WER. Experiments use torch nightly build as of 28 May 2024. The experiments are run on a RTX 4090 GPU.
					</p>


					<h3>Short-Form Audio</h3>
					<p>The first dataset represents short-form audio, as the average duration of each sample in the dataset is less than 10 seconds. Four Open ASR eval datasets with audios less than 30 seconds are used for the experiments. The baseline system is the current implementation in <a href="https://github.com/huggingface/transformers">transformers</a>, which does not use torch.compile.</p>
						<table>
						<caption> Speed up and WER (short form audio). The speed metric indicates how many times faster than the real-time speed. </caption>
						<tr>
							<th>Dataset</th>
							<th>Avg. Duration</th>						
							<th colspan="2" >Baseline</th>
							<th colspan="2" >Torch Compile</th>
							<th colspan="2" >Torch Compile + HQQ</th>
						</tr>
						<tr>
							<th></th>
							<th></th>						
							<th>Speed</th>
							<th>WER</th>
							<th>Speed</th>
							<th>WER</th>
							<th>Speed</th>
							<th>WER</th>
							
						</tr>				
						<tr>
							<td><a href="https://huggingface.co/datasets/LIUM/tedlium">TEDLIUM</a></td>
							<td>8 secs</td>							
							<td>11.6x</td>
							<td>4.06%</td>
							<td>35.3x </td>
							<td>4.06%</td>
							<td>37.3x</td>							
							<td>4.0%</td>
						</tr>
						<tr>
							<td><a href="https://huggingface.co/datasets/facebook/voxpopuli">Voxpopuli</a></td>
							<td>10 secs</td>							
							<td>14.2x</td>
							<td>7.84%</td>
							<td>42.6x</td>
							<td>7.84%</td>
							<td>47.9x</td>
							<td>7.91%</td>
						</tr>
						<tr>
							<td><a href="https://huggingface.co/datasets/distil-whisper/earnings22">Earnings22</a></td>
							<td>7 secs</td>														
							<td>12.8x</td>
							<td>12.13%</td>
							<td>36.4x</td>
							<td>12.15%</td>
							<td>42.9x</td>
							<td>12.83%</td>
						</tr>
						<tr>
							<td><a href="https://huggingface.co/datasets/edinburghcstr/ami">AMI</a></td>
							<td>2 secs</td>														
							<td>9.8x</td>
							<td>16.67%</td>
							<td>20.8x </td>
							<td>16.71%</td>
							<td>20.4x </td>
							<td>16.69%</td>
						</tr>
					</table>

				<h3>Real-World Scenario: Long Audios</h3>
				<p> In order to check the effect of these decoding methods in real-world scenarios with long audios, we used an internal test dataset with 84 minutes duration and verified ground truth. The test set contains 9 audios ranging from 3 minutes to 13 minutes and covers various audio types.</p>
				<table>
					<caption>Speed up and WER (long form audio). We also compare the VRAM usage of the models.</caption>
		
				<tr>
					<th>System</th>					
					<th>Baseline </th>
					<th>Torch Compile</th>
					<th>Torch Compile + HQQ</th>
				</tr>
				<tr>
					<td>Speed</td>
					<td>15.8x </td>
					<td>57.6x </td>
					<td>72.8x </td>			
				</tr>
				<tr>
					<td>WER (%)</td>
					<td>11.45</td>
					<td>11.47</td>
					<td>11.49</td>
				</tr>
				<!-- <tr>
					<td> Tokens/sec </td>
					<td>49.5</td>
					<td>243.4</td>
					<td>316.4</td>
				</tr> -->
				<tr>
					<td> VRAM usage </td>
					<td>3576 MiB</td>
					<td>3576 MiB</td>
					<td>2454 MiB</td>
				</tr>
			</table>

			<p>The speed-up in the actual processing time depends on the audio length. For datasets with larger audio lengths, the speed-up is higher as there are more tokens to decode at a higher pace, and the impact of optimization and quantization is more profound.</p> 
			
			<p>For example, in the AMI corpus, which consists of very short audio clips with an average length of 2 seconds, the speed-up is around <b>2x</b> compared to the baseline, versus the long-form dataset, where the speed-up is <b>4.6x</b> for the HQQ-based approach and <b>3.6x</b> for the PyTorch-compiled one. This speed-up can be achieved on top of the fastest sequential/batched execution PyTorch-based models, resulting in ultra-fast ASR inference.</p>

			<h4>Tokens per Second</h4>

		
			<p>The significant speed-up and impact of optimization and quantization happens in the decoder layers. The following table shows the tokens per second processed by the decoder.</p>

			<table>
				<caption>Decoding Speed (tokens/sec)</caption>
				<tr>
					<td>Baseline</td>
					<td>49.5 it/sec</td>
				</tr>
				<tr>
					<td>Torch Compile</td>
					<td>243.4 it/sec</td>
				</tr>
				<tr>
					<td>Torch Compile + HQQ</td>
					<td>316.4 it/sec</td>
				</tr>
			</table>


			<h2 id="low-bit">Whisper in Extreme Low-Bit Configurations</h2>

			<p>Additionally, we tested Whisper in extremely low-bit configurations (i.e., 3, 2, and 1.58 (ternary) bits) with different quantization group sizes. Quantization parameters (namely scale and zero point) are set and computed for a group of entries in a matrix. A lower group size requires more metadata regarding quantization parameters to be stored. Therefore, finding an optimal balance between a large group size and preserving quality is crucial. However, for lower bit sizes, a smaller group size is required (refer to this <a href="https://mobiusml.github.io/hqq_blog/">blog</a>).</p>


			<p>The default group size for 4-bit models is 64, so we report results for lower bit sizes at 32 and 16 group sizes. To our surprise, Whisper's performance only degrades slightly and remains quite useful even in extremely low-bit configurations. For 2-bit and 1.58-bit configurations, a group size of 16 is required to be useful. For larger bit sizes, a group size of 64 exhibits comparable accuracy to the non-quantized model.</p>

			<p>Ternary weights (1.58 bit) are quite promising since matrix multiplications can be reformulated as additions, potentially resulting in a 70x improvement in efficiency (refer to this <a href="https://arxiv.org/abs/2402.17764">paper</a>). We also conducted experiments with pure 1-bit quantization. However, even with a low group size, the Word Error Rate (WER) was very high, and the output was not meaningful.</p>


			<p>Note that the PyTorch acceleration kernels we use are optimized for 4 bits, so the speed decreases when the bit size is less than 4 bits. Below, we report the Word Error Rate (WER) along with speed and quantization bitrate and group sizes for these configurations.</p>


			<p>The interactive graph below summarizes the performance with different datasets into a scatter plot. Hover or click on a bubble to display the details.</p>

			<!-- Load d3.js -->
			<script src="https://d3js.org/d3.v4.js"></script>
			<script src="https://d3js.org/d3-scale-chromatic.v1.min.js"></script>

			<!-- Create a div where the graph will take place -->
			<div id="my_dataviz"></div>

			<style>
				/* Scatter-plot bubbles: white outline by default, black while hovered. */
				.bubbles {
					stroke: white;
					stroke-width: 2px;
				}

				.bubbles:hover {
					stroke: black;
				}

				/* Floating tooltip positioned by the chart script; it ignores pointer
				   events so it never intercepts the pointer meant for the bubbles. */
				.tooltip {
					position: absolute;
					pointer-events: none;
					padding: 10px;
					border: 1px solid;
					border-radius: 2px;
					color: black;
					background-color: white;
				}

				/* Boxes holding the dataset radio buttons and the group-size checkboxes. */
				.radio-buttons,
				.checkboxes {
					display: inline-block;
					width: auto;
					margin: 10px;
					border: 1px solid;
					border-radius: 5px;
					background-color: #fff;
					text-align: center;
				}

				/* Labels inside those boxes. */
				.radio-buttons label,
				.checkboxes label {
					margin-right: 10px;
					font-size: 12px;
					font-weight: bold;
				}
			</style>

			<script>
				// Plot geometry: the outer SVG is 500 x 420 px; `width` and `height`
				// describe the inner drawing area left once the margins are removed.
				var margin = { top: 10, right: 20, bottom: 30, left: 50 };
				var width = 500 - margin.left - margin.right;
				var height = 420 - margin.top - margin.bottom;

				// Root <svg> inside #my_dataviz. `svg` refers to the inner <g>, shifted
				// by the left/top margins, so later drawing uses plot-area coordinates.
				var outerWidth = width + margin.left + margin.right;
				var outerHeight = height + margin.top + margin.bottom;
				var svg = d3.select("#my_dataviz")
					.append("svg")
					.attr("width", outerWidth)
					.attr("height", outerHeight)
					.append("g")
					.attr("transform", `translate(${margin.left},${margin.top})`);

				// Load the benchmark CSV and render the interactive WER-vs-bits bubble chart.
				// Rows are read through the fields `Dataset`, `nbits`, `group_size` and `WER`.
				d3.csv("https://gist.githubusercontent.com/Jiltseb/2fe646d71053b4d462f00959b0587f84/raw/7ab4a299579fc1949b4e4260c0bcae2d7f992780/whisper_hqq_low_bit_chart.csv", function (data) {

					// d3 v4 passes `null` to a single-argument callback when the request
					// fails; without this guard the `data.map` below would throw.
					if (!data) {
						console.error("Could not load the low-bit quantization chart data.");
						d3.select("#my_dataviz").append("p")
							.text("The chart data could not be loaded.");
						return;
					}

					var datasets = ["TEDLIUM", "Voxpopuli", "Earnings22", "AMI", "Long-form Audio"];
					// Distinct group sizes, in the order they first appear in the CSV.
					var groupSizes = [...new Set(data.map(item => item.group_size))];

					// Create div for radio buttons (one per dataset; TEDLIUM is pre-selected)
					var radioButtonsDiv = d3.select("#my_dataviz").append("div")
						.attr("class", "radio-buttons");

					radioButtonsDiv.append("label").text("Select Dataset: ");
					datasets.forEach(function (dataset, i) {
						// id/for pair ties each label to its input so clicking the text works too.
						var inputId = "dataset-option-" + i;
						radioButtonsDiv.append("input")
							.attr("type", "radio")
							.attr("id", inputId)
							.attr("name", "datasetGroup")
							.attr("value", dataset)
							.property("checked", dataset === "TEDLIUM")
							.on("change", updatePlot);
						radioButtonsDiv.append("label")
							.attr("for", inputId)
							.text(dataset);
					});

					// Create div for checkboxes (one per group size; all start checked)
					var checkboxesDiv = d3.select("#my_dataviz").append("div")
						.attr("class", "checkboxes");

					checkboxesDiv.append("label").text("Configuration: ");
					groupSizes.forEach(function (groupSize, i) {
						// A group size of '-' is labelled as the baseline system.
						var labelText = groupSize === '-' ? 'Baseline System' : "Group size " + groupSize;
						var inputId = "group-size-option-" + i;
						checkboxesDiv.append("input")
							.attr("type", "checkbox")
							.attr("id", inputId)
							.attr("value", groupSize)
							.attr("checked", true)
							.on("change", updatePlot);
						checkboxesDiv.append("label")
							.attr("for", inputId)
							.text(labelText);
					});

					// X axis: number of bits on a log scale with hand-picked ticks.
					var x = d3.scaleLog()
						.base(10)
						.domain([1, 16])
						.range([0, width]);

					svg.append("g")
						.attr("transform", "translate(0," + height + ")")
						.call(d3.axisBottom(x)
							.tickValues([1, 1.58, 2, 3, 4, 16])
							.tickFormat(function (d) {
								// Keep the fractional 1.58 tick as-is; show the others as integers.
								if (d === 1.58) {
									return d;
								} else {
									return d3.format(",.0f")(d);
								}
							})
						);

					// Y axis: WER from 0 up to the largest value in the data.
					var y = d3.scaleLinear()
						.domain([0, d3.max(data, function (d) { return +d.WER; })])
						.range([height, 0]);

					svg.append("g")
						.call(d3.axisLeft(y));

					// Bubble radius as a function of the number of bits.
					var z = d3.scaleLinear()
						.domain([2, 16])
						.range([8, 16]);

					// Bubbles are coloured by group size (see the `fill` below), so the
					// scale's domain is the list of group sizes. The palette is reused
					// cyclically when there are more group sizes than colours.
					var myColor = d3.scaleOrdinal()
						.domain(groupSizes)
						.range(["#3d85c6", "#f44336", "#ce7e00", "#777777", "#8fce00"]);

					var tooltip = d3.select("#my_dataviz")
						.append("div")
						.attr("class", "tooltip")
						.style("opacity", 0);

					// Fill the tooltip with the hovered row's details and fade it in next to the pointer.
					var showTooltip = function (d) {
						var content = "<ul>" +
							"<li><strong>Group size:</strong> " + d.group_size + "</li>" +
							"<li><strong>Number of Bits:</strong> " + d.nbits + "</li>" +
							"<li><strong>Dataset:</strong> " + d.Dataset + "</li>" +
							"<li><strong>WER:</strong> " + d.WER + "</li>" +
							"</ul>";
						tooltip.transition().duration(200).style("opacity", 0.9);
						tooltip.html(content)
							.style("left", (d3.event.pageX + 10) + "px")
							.style("top", (d3.event.pageY + 10) + "px");
					};

					// Keep the tooltip 10px to the right of and below the pointer.
					var moveTooltip = function () {
						tooltip.style("left", (d3.event.pageX + 10) + "px").style("top", (d3.event.pageY + 10) + "px");
					};

					var hideTooltip = function () {
						tooltip.transition().duration(200).style("opacity", 0);
					};

					// Axis titles
					svg.append("text")
						.attr("class", "x label")
						.attr("text-anchor", "end")
						.attr("x", width - width / 2 + 60)
						.attr("y", height + 30)
						.style("font", "12px sans-serif")
						.style("font-weight", "bold")
						.text("Number of bits");

					svg.append("text")
						.attr("class", "y label")
						.attr("text-anchor", "end")
						.attr("y", -50)
						.attr("dy", ".75em")
						.attr("x", -height / 2 + 30)
						.attr("transform", "rotate(-90)")
						.style("font", "12px sans-serif")
						.style("font-weight", "bold")
						.text("WER");

					// One circle per CSV row; updatePlot() decides which ones are visible.
					var bubble = svg.append('g')
						.selectAll("circle")
						.data(data)
						.enter()
						.append("circle")
						.style('opacity', 0.5)
						.attr("class", "bubbles")
						.attr("cx", function (d) { return x(d.nbits); })
						.attr("cy", function (d) { return y(d.WER); })
						.attr("r", function (d) { return z(d.nbits); })
						.style("fill", function (d) { return myColor(d.group_size); })
						.on("mouseover", showTooltip)
						.on("mousemove", moveTooltip)
						.on("mouseleave", hideTooltip);

					// Show only the bubbles that match the selected dataset and one of
					// the checked group sizes; everything else gets `display: none`.
					function updatePlot() {
						var selectedDataset = document.querySelector('input[name="datasetGroup"]:checked').value;
						var selectedGroupSizes = Array.from(document.querySelectorAll('.checkboxes input:checked')).map(d => d.value);

						bubble.style('display', function (d) {
							var datasetMatch = d.Dataset === selectedDataset;
							var groupMatch = selectedGroupSizes.includes(d.group_size);
							return datasetMatch && groupMatch ? null : 'none';
						});
					}

					updatePlot(); // Initial call to update the plot based on default selections
				});
			</script>

			<div class="table-data" style="border: 1px solid #d6d6d6; border-radius: 8px; padding-left: 20px;">
				<div class="toggle-container">
					<div class="toggle-text" id="toggle-text"><b>Tabular Data:</b> Press the + sign  to see  the raw datapoints for the above graph.</div>
				<div><button class="toggle-button" id="toggle-button">+</button></div>
				</div>
			

			<table class="toggle-content" id="toggle-content">
				<caption> Whisper Low-Bit Quantization WER. </caption>
				<tr>
					<th>Dataset</th>
					<th>#bits</th>
					<th>Group Size</th>
					<th>WER</th>						
					<th>Speed</th>					
				</tr>
				<tr>
					<td rowspan="9">TEDLIUM</td>
					<td class="group2">4 bit</td>
					<td class="group2">64</td>
					<td class="group2">4.06</td>
					<td class="group2">37.3x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>
					<td>16</td>
					<td>4.94</td>
					<td>17.9x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>
					<td>32</td>
					<td>4.14</td>
					<td>18.0x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>
					<td>64</td>
					<td>4.72</td>
					<td>15.9x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>16</td>
					<td>4.61</td>
					<td>26.2x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>32</td>
					<td>4.77</td>
					<td>23.2x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>64</td>
					<td>7.26</td>
					<td>24.4x</td>
				</tr>
				<tr class="group1">
					<td>1.58 bit</td>
					<td>16</td>
					<td>5.30</td>
					<td>26.3x</td>
				</tr>
				<tr class="group1">
					<td>1.58 bit</td>
					<td>32</td>
					<td>23.94</td>
					<td>25.7x</td>
				</tr>
				<tr>
					<td rowspan="9">Voxpopuli</td>
					<td class="group2">4 bit</td>
					<td class="group2">64</td>
					<td class="group2">7.84</td>
					<td class="group2">47.9x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>
					<td>16</td>
					<td>9.03</td>
					<td>20.7x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>
					<td>32</td>
					<td>8.40</td>
					<td>21.3x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>
					<td>64</td>
					<td>7.82</td>
					<td>21.8x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>16</td>
					<td>8.05</td>
					<td>29.6x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>32</td>
					<td>8.74</td>
					<td>30.0x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>64</td>
					<td>8.97</td>
					<td>29.4x</td>
				</tr>
				<tr class="group1">
					<td>1.58 bit</td>
					<td>16</td>
					<td>9.64</td>
					<td>27.8x</td>
				</tr>
				<tr class="group1">
					<td>1.58 bit</td>
					<td>32</td>
					<td>30.42</td>
					<td>25.3x</td>
				</tr>
				<tr>
					<td rowspan="9">Earnings22</td>
					<td class="group2">4 bit</td>
					<td class="group2">64</td>
					<td class="group2">12.15</td>
					<td class="group2">42.9x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>
					<td>16</td>
					<td>12.3</td>
					<td>17.7x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>
					<td>32</td>
					<td>12.41</td>
					<td>16.3x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>
					<td>64</td>
					<td>12.26</td>
					<td>17.5x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>16</td>
					<td>13.60</td>
					<td>25.9x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>32</td>
					<td>13.54</td>
					<td>25.9x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>64</td>
					<td>16.03</td>
					<td>23.0x</td>
				</tr>
				<tr class="group1">
					<td>1.58 bit</td>
					<td>16</td>
					<td>12.40</td>
					<td>20.5x</td>
				</tr>
				<tr class="group1">
					<td>1.58 bit</td>
					<td>32</td>
					<td>38.50</td>
					<td>23x</td>
				</tr>
				<tr>
					<td rowspan="9">AMI</td>
					<td class="group2">4 bit</td>
					<td class="group2">64</td>
					<td class="group2">16.71</td>
					<td class="group2">20.4x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>
					<td>16</td>
					<td>16.63</td>
					<td>9.0x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>
					<td>32</td>
					<td>16.64</td>
					<td>8.6x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>
					<td>64</td>
					<td>16.72</td>
					<td>8.7x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>16</td>
					<td>17.30</td>
					<td>13.4x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>32</td>
					<td>17.80</td>
					<td>13.7x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>64</td>
					<td>19.52</td>
					<td>12.9x</td>
				</tr>
				<tr class="group1">
					<td>1.58 bit</td>
					<td>16</td>
					<td>19.15</td>
					<td>13.3x</td>
				</tr>
				<tr class="group1">
					<td>1.58 bit</td>
					<td>32</td>
					<td>31.22</td>
					<td>12.7x</td>
				</tr>
				<tr>
					<td rowspan="10">Long-form Audio</td>
					<td class="group2" >4 bit</td>
					<td class="group2">64</td>
					<td class="group2">11.49</td>
					<td class="group2">72.8x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>					
					<td>64</td>
					<td>11.56</td>
					<td>34.6x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>
					<td>32</td>
					<td>11.40</td>
					<td>33.9x</td>
				</tr>
				<tr class="group1">					
					<td>3 bit</td>
					<td>16</td>
					<td>11.56</td>
					<td>34.6x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>64</td>
					<td>12.74</td>
					<td>46.0x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>32</td>
					<td>11.75</td>
					<td>44.7x</td>
				</tr>
				<tr class="group2">
					<td>2 bit</td>
					<td>16</td>
					<td>11.58</td>
					<td>45.3x</td>
				</tr>

				<tr class="group1">
					<td>1.58 bit</td>
					<td>64</td>
					<td>96.66</td>
					<td>- <small>(highly erroneous)</small></td>
				</tr>
				<tr class="group1">
					<td>1.58 bit</td>
					<td>32</td>
					<td>41.8</td>
					<td>38.7x <small>(high hallucinations)</small></td>
				</tr>
				<tr class="group1">
					<td>1.58 bit</td>
					<td>16</td>
					<td>12.48</td>
					<td>42.9x</td>
				</tr>
			</table>
			</div>


			<!-- <p>To our surprise, Whisper's performance only degrades slightly and remains quite useful even in extremely low-bit configurations. For 2-bit and 1.58-bit configurations, a group size of 16 is required to be useful. For larger bit sizes, a group size of 64 exhibits comparable accuracy with the non quantized model.</p> -->

			<p>This warrants two future directions we will be pursuing: the development of more optimized kernels for lower bits and the use of post-quantization training, proposed as one of the methods in our <a href="https://mobiusml.github.io/hqq_blog/">1-bit blog</a>, to recover from low-bit performance degradation.</p>

			<h3>Caveats</h3>

			<p>A few caveats to note:</p>
			<ol>
				<li>The TorchAO kernels require modern GPUs.</li>
				<li>Benchmarking of ASR systems <a href="https://x.com/reach_vb/status/1789020406709416111?s=46&amp;t=FMqc_pzqAD4bhPuXQjLpKA">is not fully mature yet</a>. Therefore, these results need to be further battle-tested. However, our internal benchmark with long-form videos is representative of a production scenario, especially for English.</li>
			</ol>


					<h2 id="citations">Citation</h2>
					<div>								
					<pre><code style="background-color: #fff; color: #777;" >
@misc{sebastian2024whisper1,
title = {Faster and Smaller Whisper: A Deep Dive into Quantization and Torch Compilation},
url = {https://mobiusml.github.io/whisper-static-cache-blog/},
author = {Jilt Sebastian and Husein Zolkepli and Hicham Badri and Appu Shaji},
month = {May},
year = {2024}
}
					</code></pre>
							</div>


							<div>
								<p style="text-align: center;">Please feel free to <a
										href="mailto:jilt@mobiuslabs.com">contact us.</a></p>
								<!--                             <p style="text-align: center; color:hotpink;">Check out our other blog post</p> -->

							</div>

				</div>


			</div>
			<p id="d9be7859-86c8-4e9e-8957-b0127ad9431d" class="">
			<div class="indented">
				<p id="7b0d7f13-0909-4e80-97fe-e0102053cc62" class="">
				</p>
			</div>
			</p>
		</div>
	</article>
</body>

</html>