Merge branch 'master' into openvla
EricLBuehler committed Jul 15, 2024
2 parents 99005db + 7c50b68 · commit ae43fbf
Showing 20 changed files with 247 additions and 111 deletions.
6 changes: 3 additions & 3 deletions docs/PAGED_ATTENTION.md
@@ -6,7 +6,7 @@ Our Paged Attention implementation has 2 inputs: GPU KV cache memory size, and block size

> Note: The default block size if not specified is 32.
- > Warning: When using dynamic adapter activation or sending re-ISQ requests, it may trigger OOM because the Paged Attention KV cache has already been allocated. To counter this, either set the KV cache memory to a lower amount (recommended) or disable paged attention.
+ > Note: if OOM happens (this can be caused by a variety of factors including adapter activation, re-ISQ, and others), it happens because the Paged Attention KV cache has already been allocated. To counter this, either set the KV cache memory to a lower amount or usage percentage (recommended) or disable paged attention entirely for a dynamically allocated cache.
**There are more features being added to this:**
- GGML model support
@@ -23,14 +23,14 @@ Our Paged Attention implementation has 2 inputs: GPU KV cache memory size, and block size
## Using the CLI

- Add the `--pa-gpu-mem` and `--pa-blk-size` parameters before the model kind selector. The GPU memory is in MBs and the block size means the number of tokens per block. These parameters may be passed on any supported model type.
+ Add the `--pa-gpu-mem`/`--pa-gpu-mem-usage` and `--pa-blk-size` parameters before the model kind selector. The GPU memory is in MBs and the block size means the number of tokens per block. These parameters may be passed on any supported model type.

```
cargo run --release --features cuda -- -i --pa-gpu-mem 8192 --pa-blk-size 32 --isq Q4K plain -m microsoft/Phi-3-mini-128k-instruct -a phi3
```

```
- cargo run --release --features cuda -- -i --pa-gpu-mem 8192 --pa-blk-size 32 gguf -t mistralai/Mistral-7B-Instruct-v0.1 -m TheBloke/Mistral-7B-Instruct-v0.1-GGUF -f mistral-7b-instruct-v0.1.Q4_K_M.gguf
+ cargo run --release --features cuda -- -i --pa-gpu-mem-usage .95 --pa-blk-size 32 gguf -t mistralai/Mistral-7B-Instruct-v0.1 -m TheBloke/Mistral-7B-Instruct-v0.1-GGUF -f mistral-7b-instruct-v0.1.Q4_K_M.gguf
```

## Using the Rust API
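The two sizing flags can also be combined; per the flag documentation in `mistralrs-bench/src/main.rs` below, `--pa-gpu-mem-usage` takes precedence when both are given. A hypothetical invocation (not from the changed docs) passing both, in which the `.95` usage value wins:

```
cargo run --release --features cuda -- -i --pa-gpu-mem 8192 --pa-gpu-mem-usage .95 --pa-blk-size 32 plain -m microsoft/Phi-3-mini-128k-instruct -a phi3
```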
37 changes: 30 additions & 7 deletions mistralrs-bench/src/main.rs
@@ -1,6 +1,7 @@
use candle_core::Device;
use clap::Parser;
use cli_table::{format::Justify, print_stdout, Cell, CellStruct, Style, Table};
+ use either::Either;
use mistralrs_core::{
initialize_logging, Constraint, DefaultSchedulerMethod, DeviceLayerMapMetadata,
DeviceMapMetadata, Loader, LoaderBuilder, MistralRs, MistralRsBuilder, ModelDType,
@@ -278,11 +279,17 @@ struct Args {
#[arg(short, long, value_parser, value_delimiter = ';')]
num_device_layers: Option<Vec<String>>,

- /// GPU memory to allocate for KV cache with Paged Attention in MBs. If this is not set and the device is CUDA, it will default to to the
- /// available GPU memory. Paged Attention is only supported on CUDA and is always automatically activated.
+ /// GPU memory to allocate for KV cache with Paged Attention in MBs. If this is not set and the device is CUDA, it will default to
+ /// using `pa-gpu-mem-usage` set to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated.
#[arg(long = "pa-gpu-mem")]
paged_attn_gpu_mem: Option<usize>,

+ /// Percentage of GPU memory to utilize after allocation of KV cache with Paged Attention, from 0 to 1.
+ /// If this is not set and the device is CUDA, it will default to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated.
+ /// This is always used over `pa-gpu-mem` if both are specified.
+ #[arg(long = "pa-gpu-mem-usage")]
+ paged_attn_gpu_mem_usage: Option<f32>,

/// Block size (number of tokens per block) for Paged Attention. If this is not set and the device is CUDA, it will default to 32.
/// Paged Attention is only supported on CUDA and is always automatically activated.
#[arg(long = "pa-blk-size")]
@@ -373,16 +380,32 @@ fn main() -> anyhow::Result<()> {
let cache_config = match (
args.paged_attn_block_size,
args.paged_attn_gpu_mem,
+ args.paged_attn_gpu_mem_usage,
device.is_cuda(),
args.no_paged_attn,
) {
- (block_size, None, true, false) => Some(PagedAttentionConfig::new(
- block_size, 512, None, // Autodetermine KV cache size
+ (block_size, None, None, true, false) => Some(PagedAttentionConfig::new(
+ block_size,
+ 512,
+ Either::Right(0.9), // NOTE(EricLBuehler): default is to use 90% of memory
)?),
+ (block_size, Some(m), None, true, false) => {
+ Some(PagedAttentionConfig::new(block_size, 512, Either::Left(m))?)
+ }
+ (block_size, None, Some(f), true, false) => Some(PagedAttentionConfig::new(
+ block_size,
+ 512,
+ Either::Right(f),
+ )?),
- (block_size, Some(gpu_mem), _, false) => {
- Some(PagedAttentionConfig::new(block_size, 512, Some(gpu_mem))?)
+ (block_size, Some(_m), Some(f), true, false) => {
+ info!("Both memory size and usage were specified, defaulting to the usage value.");
+ Some(PagedAttentionConfig::new(
+ block_size,
+ 512,
+ Either::Right(f),
+ )?)
}
- (_, _, _, _) => None,
+ (_, _, _, _, _) => None,
};

let pipeline = loader.load_model_from_hf(
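The match above encodes the flag precedence: a usage fraction always wins when both flags are given, an absolute size is used when it is the only flag set, and plain CUDA with neither flag defaults to 90% utilization. A minimal sketch of that resolution logic, assuming only the `either` crate (the `resolve_kv_mem` helper is hypothetical, not part of this commit):

```
use either::Either;

// Hypothetical helper mirroring the match arms above: pick the KV cache
// sizing strategy from the two optional CLI flags.
fn resolve_kv_mem(mem_mb: Option<usize>, usage: Option<f32>) -> Either<usize, f32> {
    match (mem_mb, usage) {
        // `--pa-gpu-mem-usage` always wins if both flags are set.
        (_, Some(f)) => Either::Right(f),
        // `--pa-gpu-mem` alone: an absolute KV cache size in MB.
        (Some(m), None) => Either::Left(m),
        // Neither flag: default to 90% memory utilization.
        (None, None) => Either::Right(0.9),
    }
}

fn main() {
    assert_eq!(resolve_kv_mem(Some(8192), None), Either::Left(8192));
    assert_eq!(resolve_kv_mem(Some(8192), Some(0.95)), Either::Right(0.95));
    assert_eq!(resolve_kv_mem(None, None), Either::Right(0.9));
}
```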
25 changes: 13 additions & 12 deletions mistralrs-core/src/dummy_paged_attention/mod.rs
@@ -16,6 +16,7 @@ pub use block_engine_sequence::BlockEngineSequence;
pub use cache_engine::{CacheConfig, CacheEngine};
use candle_core::{DType, Device};
pub use config::{ModelConfigLike, ModelConfigMetadata};
+ use either::Either;
pub use layers::PagedAttention;
pub use scheduler::{
PagedAttentionScheduler, PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput,
@@ -29,14 +30,14 @@ use tracing::info;
pub struct PagedAttentionConfig {
pub(crate) block_size: Option<usize>,
pub(crate) mem_cpu: usize,
- pub(crate) mem_gpu: Option<usize>,
+ pub(crate) mem_gpu: Either<usize, f32>,
}

impl PagedAttentionConfig {
pub fn new(
_block_size: Option<usize>,
_mem_cpu: usize,
- _mem_gpu: Option<usize>,
+ _mem_gpu: Either<usize, f32>,
) -> anyhow::Result<Self> {
anyhow::bail!("PagedAttention is only supported for CUDA, compile with feature `cuda`.")
}
@@ -64,9 +65,9 @@ macro_rules! mb_to_blocks {
};
}

- /// Memory values are in MBs. Specify block size or the default is 32.
+ /// Memory values are in MBs or a percentage in [0,1]. Specify block size or the default is 32.
pub fn calculate_cache_config(
- mem_gpu: Option<usize>,
+ mem_gpu: Either<usize, f32>,
mem_cpu: usize,
block_size: Option<usize>,
dtype: DType,
@@ -79,15 +80,15 @@ pub fn calculate_cache_config(
}
let dtype_size = dtype.size_in_bytes();

+ #[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
let mem_gpu = match mem_gpu {
- Some(v) => v,
- None => {
- let free = MemoryUsage.get_memory_available(device)? / SIZE_IN_MB;
- info!(
- "Automatically using {} MB for Paged Attention KV cache",
- free - 512
- );
- free - 512
+ Either::Left(v) => v,
+ Either::Right(f) => {
+ let free = MemoryUsage.get_memory_available(device)? as f32 / SIZE_IN_MB as f32;
+ let total = MemoryUsage.get_total_memory(device)? as f32 / SIZE_IN_MB as f32 * f;
+ let size = (total - free) as usize;
+ info!("Allocating {size} MB for Paged Attention KV cache");
+ size
}
};

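Both the real and dummy modules convert the resolved MB figure into a block count via `mb_to_blocks!`, whose body is collapsed in this view. A rough sketch of the standard paged-attention conversion; the parameter names are illustrative assumptions, not the macro's actual inputs:

```
// Sketch of an MB -> KV-cache-block conversion like the one `mb_to_blocks!`
// performs; the macro body is collapsed in this diff, so these parameter
// names are assumptions.
fn mb_to_blocks(
    mem_mb: usize,
    dtype_size: usize,   // bytes per element, e.g. 2 for f16
    block_size: usize,   // tokens per block, default 32
    num_layers: usize,
    num_kv_heads: usize,
    head_dim: usize,
) -> usize {
    let bytes = mem_mb * 1024 * 1024;
    // Every cached token stores a key and a value vector per layer (factor 2).
    let bytes_per_block = block_size * num_kv_heads * head_dim * num_layers * 2 * dtype_size;
    bytes / bytes_per_block
}
```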
3 changes: 3 additions & 0 deletions mistralrs-core/src/dummy_paged_attention/scheduler.rs
@@ -345,4 +345,7 @@ impl Scheduler for PagedAttentionScheduler {
fn free_finished_sequence_groups(&mut self) {
self.free_finished_sequence_groups()
}
+ fn block_engine(&mut self) -> Option<&mut BlockEngine> {
+ Some(&mut self.block_engine)
+ }
}
83 changes: 56 additions & 27 deletions mistralrs-core/src/engine/mod.rs
@@ -257,50 +257,79 @@ impl Engine {
}
SchedulerOutput::PagedAttention { mut output } => {
if !output.scheduled.is_empty() {
- let mut pipeline = get_mut_arcmutex!(self.pipeline);

let is_prompt = get_mut_arcmutex!(output.scheduled[0]).is_prompt();

- let block_tables = self.scheduler.block_tables().unwrap();
- let block_size = self.scheduler.block_size().unwrap();
-
- let metadata = PagedAttentionMeta {
- block_tables,
- block_size,
- sliding_window: pipeline.get_metadata().sliding_window,
- };

let mut guards = output
.scheduled
.iter_mut()
.map(|seq| seq.lock().unwrap())
.collect::<Vec<_>>();

- let res = pipeline
- .step(
- &mut guards.iter_mut().map(|seq| &mut **seq).collect::<Vec<_>>(),
- is_prompt,
- &mut self.prefix_cacher,
- self.disable_eos_stop,
- rng.clone(),
- CacheBackendMetadata::PagedAttention {
- metadata,
- blocks_to_copy: output.blocks_to_copy,
- blocks_to_swap_in: output.blocks_to_swap_in,
- blocks_to_swap_out: output.blocks_to_swap_out,
- },
- )
- .await;
+ let mut guards_mut =
+ guards.iter_mut().map(|seq| &mut **seq).collect::<Vec<_>>();
+
+ let res = {
+ let mut pipeline = get_mut_arcmutex!(self.pipeline);
+
+ let block_size = self.scheduler.block_size().unwrap();
+
+ let metadata = PagedAttentionMeta {
+ block_size,
+ sliding_window: pipeline.get_metadata().sliding_window,
+ block_engine: self.scheduler.block_engine().unwrap(),
+ };
+
+ pipeline
+ .step(
+ &mut guards_mut,
+ is_prompt,
+ &mut self.prefix_cacher,
+ self.disable_eos_stop,
+ rng.clone(),
+ CacheBackendMetadata::PagedAttention {
+ metadata,
+ blocks_to_copy: output.blocks_to_copy,
+ blocks_to_swap_in: output.blocks_to_swap_in,
+ blocks_to_swap_out: output.blocks_to_swap_out,
+ },
+ )
+ .await
+ };

handle_pipeline_forward_error!(
"step",
res,
- &mut guards.iter_mut().map(|seq| &mut **seq).collect::<Vec<_>>(),
+ &mut guards_mut,
self.pipeline,
'lp,
self.prefix_cacher
);

+ if self.is_debug {
+ let ms_from_last_run = run_start.elapsed().as_secs_f64();
+ let total_len = guards.len();
+ if total_len > 0 {
+ let lengths = guards
+ .iter()
+ .map(|seq| seq.len().to_string())
+ .collect::<Vec<_>>()
+ .join(", ");
+
+ let (prompt_lengths, completion_lengths) = if is_prompt {
+ (lengths, "".to_string())
+ } else {
+ ("".to_string(), lengths)
+ };
+
+ tracing::info!(
+ "Prompt[{}] Completion[{}] - {}ms",
+ prompt_lengths,
+ completion_lengths,
+ ms_from_last_run * 1000.,
+ );
+ }
+ }

if is_prompt {
for mut seq in guards {
let now = SystemTime::now()
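The restructuring above moves the `get_mut_arcmutex!(self.pipeline)` lock into a block expression, so the pipeline guard is dropped before `handle_pipeline_forward_error!` runs and takes `self.pipeline` again. A minimal sketch of that pattern with a plain `std::sync::Mutex` (the names here are illustrative, not from the engine):

```
use std::sync::{Arc, Mutex};

// Minimal sketch: acquire the lock inside a block expression so the guard is
// released before later code re-locks the same mutex.
fn step_then_handle(pipeline: &Arc<Mutex<Vec<u32>>>) {
    let res: Result<usize, &str> = {
        let mut guard = pipeline.lock().unwrap(); // guard lives only in this block
        guard.push(1);
        Ok(guard.len())
    }; // guard dropped here, mutex released

    if res.is_err() {
        // Safe to lock again: the earlier guard is already gone.
        pipeline.lock().unwrap().clear();
    }
}
```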
25 changes: 13 additions & 12 deletions mistralrs-core/src/paged_attention/mod.rs
@@ -16,6 +16,7 @@ pub use block_engine_sequence::BlockEngineSequence;
pub use cache_engine::{CacheConfig, CacheEngine};
use candle_core::{DType, Device};
pub use config::{ModelConfigLike, ModelConfigMetadata};
+ use either::Either;
pub use layers::PagedAttention;
pub use scheduler::{
PagedAttentionScheduler, PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput,
@@ -29,14 +30,14 @@ use tracing::info;
pub struct PagedAttentionConfig {
pub(crate) block_size: Option<usize>,
pub(crate) mem_cpu: usize,
- pub(crate) mem_gpu: Option<usize>,
+ pub(crate) mem_gpu: Either<usize, f32>,
}

impl PagedAttentionConfig {
pub fn new(
block_size: Option<usize>,
mem_cpu: usize,
- mem_gpu: Option<usize>,
+ mem_gpu: Either<usize, f32>,
) -> anyhow::Result<Self> {
Ok(Self {
block_size,
@@ -68,9 +69,9 @@ macro_rules! mb_to_blocks {
};
}

- /// Memory values are in MBs. Specify block size or the default is 32.
+ /// Memory values are in MBs or a percentage in [0,1]. Specify block size or the default is 32.
pub fn calculate_cache_config(
- mem_gpu: Option<usize>,
+ mem_gpu: Either<usize, f32>,
mem_cpu: usize,
block_size: Option<usize>,
dtype: DType,
@@ -83,15 +84,15 @@ pub fn calculate_cache_config(
}
let dtype_size = dtype.size_in_bytes();

+ #[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
let mem_gpu = match mem_gpu {
- Some(v) => v,
- None => {
- let free = MemoryUsage.get_memory_available(device)? / SIZE_IN_MB;
- info!(
- "Automatically using {} MB for Paged Attention KV cache",
- free - 512
- );
- free - 512
+ Either::Left(v) => v,
+ Either::Right(f) => {
+ let free = MemoryUsage.get_memory_available(device)? as f32 / SIZE_IN_MB as f32;
+ let total = MemoryUsage.get_total_memory(device)? as f32 / SIZE_IN_MB as f32 * f;
+ let size = (total - free) as usize;
+ info!("Allocating {size} MB for Paged Attention KV cache");
+ size
}
};

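With `mem_gpu` now an `Either<usize, f32>`, call sites choose between an absolute size and a utilization fraction at construction time. A hedged sketch of the two call shapes, assuming `PagedAttentionConfig` is in scope (e.g. re-exported from `mistralrs_core`) and the signature shown in this diff:

```
use either::Either;

fn build_configs() -> anyhow::Result<()> {
    // Absolute KV cache size: 8192 MB on GPU, 32-token blocks, 512 MB on CPU.
    let _by_size = PagedAttentionConfig::new(Some(32), 512, Either::Left(8192))?;
    // Utilization target: size the cache toward 90% of total GPU memory.
    let _by_usage = PagedAttentionConfig::new(Some(32), 512, Either::Right(0.9))?;
    Ok(())
}
```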
3 changes: 3 additions & 0 deletions mistralrs-core/src/paged_attention/scheduler.rs
@@ -345,4 +345,7 @@ impl Scheduler for PagedAttentionScheduler {
fn free_finished_sequence_groups(&mut self) {
self.free_finished_sequence_groups()
}
+ fn block_engine(&mut self) -> Option<&mut BlockEngine> {
+ Some(&mut self.block_engine)
+ }
}