feat: re-allocate pages dynamically #2024

Closed · wants to merge 19 commits
7 changes: 5 additions & 2 deletions .github/workflows/build.yaml
```diff
@@ -60,10 +60,13 @@ jobs:
           authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
           slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
           slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
-      - name: Initialize Docker Buildx
-        uses: docker/setup-buildx-action@v2.0.0
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
         with:
           install: true
+          config-inline: |
+            [registry."docker.io"]
+              mirrors = ["registry.github-runners.huggingface.tech"]
       - name: Login to GitHub Container Registry
         if: github.event_name != 'pull_request'
         uses: docker/login-action@v2
```
4 changes: 0 additions & 4 deletions Cargo.toml
```diff
@@ -20,10 +20,6 @@ tokenizers = { version = "0.19.1", features = ["http"] }
 hf-hub = { version = "0.3.1", features = ["tokio"] }
 
 [profile.release]
-incremental = true
-
-[profile.release-binary]
-inherits = "release"
 debug = 1
 incremental = true
 panic = "abort"
```
1 change: 0 additions & 1 deletion benchmark/src/generation.rs
```diff
@@ -156,7 +156,6 @@ async fn prefill(
                 }),
                 top_n_tokens: top_n_tokens.unwrap_or(0),
                 blocks: vec![],
-                slots: vec![],
             })
             .collect();
```
30 changes: 27 additions & 3 deletions proto/v3/generate.proto
```diff
@@ -132,8 +132,6 @@ message Request {
   uint32 top_n_tokens = 7;
   /// Paged attention blocks
   repeated uint32 blocks = 9;
-  /// Paged attention slots
-  repeated uint32 slots = 10;
 }
 
 message Batch {
@@ -164,6 +162,7 @@ enum FinishReason {
   FINISH_REASON_LENGTH = 0;
   FINISH_REASON_EOS_TOKEN = 1;
   FINISH_REASON_STOP_SEQUENCE = 2;
+  FINISH_REASON_TERMINATED = 3;
 }
 
 message GeneratedText {
@@ -198,18 +197,43 @@ message Generation {
   optional GeneratedText generated_text = 4;
   /// Top tokens
   repeated Tokens top_tokens = 5;
+  /// Current length of the cache: prompt tokens + number of generated tokens until this point
+  uint32 cache_length = 6;
 }
 
+message KeptRequest {
+  /// Request ID
+  uint64 id = 1;
+  /// Paged attention blocks
+  repeated uint32 blocks = 2;
+  /// Paged attention blocks padded to max blocks for this batch
+  repeated uint32 padded_blocks = 3;
+}
+
+/// kept_requests + terminated_request_ids might not cover all requests from the
+/// cached batch as some requests can be filtered out without requiring to generate text
+/// for example if the client dropped its connection to the router
 message FilterBatchRequest {
   /// Batch ID
   uint64 batch_id = 1;
   /// Requests to keep
-  repeated uint64 request_ids = 2;
+  repeated KeptRequest kept_requests = 2;
+  /// Requests to terminate and generate text for
+  repeated uint64 terminated_request_ids = 3;
 }
 
+message TerminatedGeneration {
+  // Request ID
+  uint64 id = 1;
+  // Generated text
+  GeneratedText generated_text = 2;
+}
+
 message FilterBatchResponse {
   /// Filtered Batch (cached)
   CachedBatch batch = 1;
+  /// Terminated generations
+  repeated TerminatedGeneration terminated_generations = 2;
 }
```
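The proto changes above redefine the FilterBatch exchange: surviving requests now travel as `KeptRequest` entries carrying their re-allocated page lists, and dropped requests can be terminated server-side, coming back as `TerminatedGeneration` entries finished with `FINISH_REASON_TERMINATED`. A minimal sketch of building the new request, assuming the tonic-generated types are reachable as `pb::generate::v3` (all ids and block values below are made up):

```rust
use pb::generate::v3::{FilterBatchRequest, KeptRequest};

fn build_filter_request() -> FilterBatchRequest {
    FilterBatchRequest {
        batch_id: 7,
        // Request 3 survives: it keeps pages 0, 4 and 9, padded to the
        // batch-wide maximum number of blocks.
        kept_requests: vec![KeptRequest {
            id: 3,
            blocks: vec![0, 4, 9],
            padded_blocks: vec![0, 4, 9, 0],
        }],
        // Request 5 is cut off; its final text comes back in
        // FilterBatchResponse.terminated_generations.
        terminated_request_ids: vec![5],
    }
}
```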
11 changes: 6 additions & 5 deletions router/client/src/v3/client.rs
```diff
@@ -90,15 +90,17 @@ impl Client {
     pub async fn filter_batch(
         &mut self,
         batch_id: u64,
-        request_ids: Vec<u64>,
-    ) -> Result<Option<CachedBatch>> {
+        kept_requests: Vec<KeptRequest>,
+        terminated_request_ids: Vec<u64>,
+    ) -> Result<(Option<CachedBatch>, Vec<TerminatedGeneration>)> {
         let request = tonic::Request::new(FilterBatchRequest {
             batch_id,
-            request_ids,
+            kept_requests,
+            terminated_request_ids,
         })
         .inject_context();
         let filtered_batch = self.stub.filter_batch(request).await?.into_inner();
-        Ok(filtered_batch.batch)
+        Ok((filtered_batch.batch, filtered_batch.terminated_generations))
     }
 
     /// Warmup on a max size batch
@@ -155,7 +157,6 @@ impl Client {
             truncate,
             // Blocks and slots will be set on the server side if we use paged attention
             blocks: vec![],
-            slots: vec![],
             // Set sampling parameters to also take these ops into account in the max memory
             parameters: Some(NextTokenChooserParameters {
                 temperature: 0.9,
```
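A hedged call-site sketch for the updated `filter_batch` signature; the connected `Client`, the batch id, and the request ids are all assumed for illustration and are not part of this PR:

```rust
use crate::v3::{Client, KeptRequest};
use crate::Result;

async fn shrink_batch(client: &mut Client) -> Result<()> {
    // Keep request 0 on a smaller page set; terminate request 9.
    let kept = vec![KeptRequest {
        id: 0,
        blocks: vec![2, 3],
        padded_blocks: vec![2, 3, 0],
    }];
    let (batch, terminated) = client.filter_batch(1, kept, vec![9]).await?;
    for generation in terminated {
        // Each entry carries the final GeneratedText of a terminated request.
        println!("request {} terminated: {:?}", generation.id, generation.generated_text);
    }
    // `batch` is the filtered CachedBatch to keep decoding with (None if empty).
    println!("remaining cached batch: {:?}", batch);
    Ok(())
}
```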
4 changes: 2 additions & 2 deletions router/client/src/v3/mod.rs
```diff
@@ -7,7 +7,7 @@ mod sharded_client;
 pub use client::Client;
 pub use pb::generate::v3::{
     input_chunk::Chunk, Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType,
-    HealthResponse, Image, InfoResponse, Input, InputChunk, NextTokenChooserParameters, Request,
-    StoppingCriteriaParameters, Tokens,
+    HealthResponse, Image, InfoResponse, Input, InputChunk, KeptRequest,
+    NextTokenChooserParameters, Request, StoppingCriteriaParameters, TerminatedGeneration, Tokens,
 };
 pub use sharded_client::ShardedClient;
```
77 changes: 52 additions & 25 deletions router/client/src/v3/sharded_client.rs
```diff
@@ -2,14 +2,15 @@
 use crate::{v3, Health, ShardInfo};
 use crate::{ClientError, Result};
 
-use crate::v3::{Chunk, InfoResponse, Input};
+use crate::v3::{Chunk, InfoResponse, Input, TerminatedGeneration};
 use async_trait::async_trait;
-use futures::future::join_all;
+use futures::stream::FuturesUnordered;
+use futures::stream::StreamExt;
 use tonic::transport::Uri;
 use tracing::instrument;
 use v3::client::{DecodeTimings, PrefillTimings};
 use v3::{
-    Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
+    Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse, KeptRequest,
     NextTokenChooserParameters, Request, StoppingCriteriaParameters,
 };
@@ -29,8 +30,12 @@ impl ShardedClient {
     async fn from_master_client(mut master_client: Client) -> Result<Self> {
         // Get all uris/unix sockets from the master client
         let uris = master_client.service_discovery().await?;
-        let futures = uris.into_iter().map(Client::connect_uds);
-        let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
+        let futures: FuturesUnordered<_> = uris.into_iter().map(Client::connect_uds).collect();
+        let clients: Result<Vec<Client>> = futures
+            .collect::<Vec<Result<_>>>()
+            .await
+            .into_iter()
+            .collect();
         Ok(Self::new(clients?))
     }
@@ -49,50 +54,66 @@ impl ShardedClient {
     /// Get the model info
     #[instrument(skip(self))]
     pub async fn info(&mut self) -> Result<ShardInfo> {
-        let futures: Vec<_> = self
+        let futures: FuturesUnordered<_> = self
             .clients
             .iter_mut()
             .map(|client| client.info())
             .collect();
-        join_all(futures).await.pop().unwrap().map(ShardInfo::from)
+        futures
+            .collect::<Vec<Result<_>>>()
+            .await
+            .pop()
+            .unwrap()
+            .map(ShardInfo::from)
     }
 
     /// GRPC health check
     #[instrument(skip(self))]
     pub async fn health(&mut self) -> Result<HealthResponse> {
-        let futures: Vec<_> = self
+        let futures: FuturesUnordered<_> = self
             .clients
             .iter_mut()
             .map(|client| client.health())
             .collect();
-        join_all(futures).await.pop().unwrap()
+        futures.collect::<Vec<Result<_>>>().await.pop().unwrap()
     }
 
     /// Clear the past generations cache
     #[instrument(skip(self))]
     pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
-        let futures: Vec<_> = self
+        let futures: FuturesUnordered<_> = self
             .clients
             .iter_mut()
             .map(|client| client.clear_cache(batch_id))
             .collect();
-        join_all(futures).await.into_iter().collect()
+        futures
+            .collect::<Vec<Result<_>>>()
+            .await
+            .into_iter()
+            .collect()
     }
 
     /// Filter a cached batch
     #[instrument(skip(self))]
     pub async fn filter_batch(
         &mut self,
         batch_id: u64,
-        request_ids: Vec<u64>,
-    ) -> Result<Option<CachedBatch>> {
-        let futures: Vec<_> = self
+        kept_requests: Vec<KeptRequest>,
+        terminated_request_ids: Vec<u64>,
+    ) -> Result<(Option<CachedBatch>, Vec<TerminatedGeneration>)> {
+        let futures: FuturesUnordered<_> = self
             .clients
             .iter_mut()
-            .map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
+            .map(|client| {
+                Box::pin(client.filter_batch(
+                    batch_id,
+                    kept_requests.clone(),
+                    terminated_request_ids.clone(),
+                ))
+            })
             .collect();
         // all shards return the same message
-        join_all(futures).await.pop().unwrap()
+        futures.collect::<Vec<Result<_>>>().await.pop().unwrap()
     }
 
     /// Warmup on a max size batch
@@ -106,7 +127,7 @@ impl ShardedClient {
         max_total_tokens: u32,
         max_batch_size: Option<usize>,
     ) -> Result<Option<u32>> {
-        let futures: Vec<_> = self
+        let futures: FuturesUnordered<_> = self
             .clients
             .iter_mut()
             .map(|client| {
@@ -119,7 +140,8 @@ impl ShardedClient {
             })
             .collect();
         // Take the minimum value
-        let results = join_all(futures)
+        let results = futures
+            .collect::<Vec<Result<_>>>()
             .await
             .into_iter()
             .collect::<Result<Vec<Option<u32>>>>()?;
@@ -135,14 +157,17 @@ impl ShardedClient {
         &mut self,
         batch: Batch,
     ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
-        let futures: Vec<_> = self
+        let futures: FuturesUnordered<_> = self
             .clients
             .iter_mut()
             .map(|client| Box::pin(client.prefill(batch.clone())))
             .collect();
         #[allow(clippy::type_complexity)]
-        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
-            join_all(futures).await.into_iter().collect();
+        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> = futures
+            .collect::<Vec<Result<_>>>()
+            .await
+            .into_iter()
+            .collect();
         let mut results = results?;
 
         let (mut generations, next_batch, mut timings) =
@@ -168,14 +193,17 @@ impl ShardedClient {
         &mut self,
         batches: Vec<CachedBatch>,
     ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
-        let futures: Vec<_> = self
+        let futures: FuturesUnordered<_> = self
             .clients
             .iter_mut()
             .map(|client| Box::pin(client.decode(batches.clone())))
             .collect();
         #[allow(clippy::type_complexity)]
-        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
-            join_all(futures).await.into_iter().collect();
+        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> = futures
+            .collect::<Vec<Result<_>>>()
+            .await
+            .into_iter()
+            .collect();
         let mut results = results?;
 
         let (mut generations, next_batch, mut timings) =
@@ -243,7 +271,6 @@ impl Health for ShardedClient {
             top_n_tokens: 0,
             // Block 0 is reserved for health checks
             blocks: vec![0],
-            slots: (0..16).collect(),
         };
         let batch = Batch {
             id: u64::MAX,
```
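The recurring edit in this file replaces `futures::future::join_all` with a `FuturesUnordered` stream. One behavioral note: `FuturesUnordered` yields results in completion order rather than submission order, which is harmless here because every shard returns an equivalent message and callers either collect into a single `Result` or pop an arbitrary element. A standalone sketch of the pattern (names are illustrative):

```rust
use futures::stream::{FuturesUnordered, StreamExt};

/// Await all shard calls concurrently and take any one response.
async fn any_response<F, T>(calls: Vec<F>) -> T
where
    F: std::future::Future<Output = T>,
{
    let futures: FuturesUnordered<F> = calls.into_iter().collect();
    // Results arrive in completion order, not submission order.
    let mut results = futures.collect::<Vec<T>>().await;
    results.pop().expect("at least one shard")
}
```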
3 changes: 3 additions & 0 deletions router/src/infer/mod.rs
```diff
@@ -503,6 +503,8 @@ pub enum InferError {
     TemplateError(#[from] minijinja::Error),
     #[error("Tool error: {0}")]
     ToolError(String),
+    #[error("Request could not be re-allocated: out of pages")]
+    OutOfPages,
 }
 
 impl InferError {
@@ -514,6 +516,7 @@ impl InferError {
             InferError::IncompleteGeneration => "incomplete_generation",
             InferError::TemplateError(_) => "template_error",
             InferError::ToolError(_) => "tool_error",
+            InferError::OutOfPages => "out_of_pages",
         }
     }
 }
```
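The new `OutOfPages` variant signals that a request's pages could not be re-allocated. How the router surfaces it is outside this diff; a sketch under the assumption that an axum handler translates the error (the status-code choice is illustrative; only the `out_of_pages` string comes from the PR):

```rust
use axum::http::StatusCode;

fn status_for(err: &InferError) -> StatusCode {
    match err {
        // Page re-allocation failed; the client may retry once capacity frees up.
        InferError::OutOfPages => StatusCode::TOO_MANY_REQUESTS,
        _ => StatusCode::INTERNAL_SERVER_ERROR,
    }
}
```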