Merge branch 'master' into openvla
EricLBuehler committed Jul 15, 2024
2 parents 99005db + 7c50b68 · commit ae43fbf
Showing 20 changed files with 247 additions and 111 deletions.
6 changes: 3 additions & 3 deletions docs/PAGED_ATTENTION.md
@@ -6,7 +6,7 @@ Our Paged Attention implementation has 2 inputs: GPU KV cache memory size, and block size

> Note: The default block size if not specified is 32.
- > Warning: When using dynamic adapter activation or sending re-ISQ requests, it may trigger OOM because the Paged Attention KV cache has already been allocated. To counter this, either set the KV cache memory to a lower amount (recommended) or disable paged attention.
+ > Note: if OOM happens (this can be caused by a variety of factors including adapter activation, re-ISQ, and others), it happens because the Paged Attention KV cache has already been allocated. To counter this, either set the KV cache memory to a lower amount or usage percentage (recommended) or disable paged attention entirely for a dynamically allocated cache.
**There are more features being added to this:**
- GGML model support
@@ -23,14 +23,14 @@ Our Paged Attention implementation has 2 inputs: GPU KV cache memory size, and block size
## Using the CLI

- Add the `--pa-gpu-mem` and `--pa-blk-size` parameters before the model kind selector. The GPU memory is in MBs and the block size means the number of tokens per block. These parameters may be passed on any supported model type.
+ Add the `--pa-gpu-mem`/`--pa-gpu-mem-usage` and `--pa-blk-size` parameters before the model kind selector. The GPU memory is in MBs and the block size means the number of tokens per block. These parameters may be passed on any supported model type.

```
cargo run --release --features cuda -- -i --pa-gpu-mem 8192 --pa-blk-size 32 --isq Q4K plain -m microsoft/Phi-3-mini-128k-instruct -a phi3
```

```
- cargo run --release --features cuda -- -i --pa-gpu-mem 8192 --pa-blk-size 32 gguf -t mistralai/Mistral-7B-Instruct-v0.1 -m TheBloke/Mistral-7B-Instruct-v0.1-GGUF -f mistral-7b-instruct-v0.1.Q4_K_M.gguf
+ cargo run --release --features cuda -- -i --pa-gpu-mem-usage .95 --pa-blk-size 32 gguf -t mistralai/Mistral-7B-Instruct-v0.1 -m TheBloke/Mistral-7B-Instruct-v0.1-GGUF -f mistral-7b-instruct-v0.1.Q4_K_M.gguf
```

## Using the Rust API
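The two sizing flags can also be combined; per the flag documentation in `mistralrs-bench/src/main.rs` below, `--pa-gpu-mem-usage` takes precedence when both are given. A hypothetical invocation (not from the changed docs) passing both, in which the `.95` usage value wins:

```
cargo run --release --features cuda -- -i --pa-gpu-mem 8192 --pa-gpu-mem-usage .95 --pa-blk-size 32 plain -m microsoft/Phi-3-mini-128k-instruct -a phi3
```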
37 changes: 30 additions & 7 deletions mistralrs-bench/src/main.rs
@@ -1,6 +1,7 @@
use candle_core::Device;
use clap::Parser;
use cli_table::{format::Justify, print_stdout, Cell, CellStruct, Style, Table};
+ use either::Either;
use mistralrs_core::{
initialize_logging, Constraint, DefaultSchedulerMethod, DeviceLayerMapMetadata,
DeviceMapMetadata, Loader, LoaderBuilder, MistralRs, MistralRsBuilder, ModelDType,
@@ -278,11 +279,17 @@ struct Args {
#[arg(short, long, value_parser, value_delimiter = ';')]
num_device_layers: Option<Vec<String>>,

- /// GPU memory to allocate for KV cache with Paged Attention in MBs. If this is not set and the device is CUDA, it will default to to the
- /// available GPU memory. Paged Attention is only supported on CUDA and is always automatically activated.
+ /// GPU memory to allocate for KV cache with Paged Attention in MBs. If this is not set and the device is CUDA, it will default to
+ /// using `pa-gpu-mem-usage` set to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated.
#[arg(long = "pa-gpu-mem")]
paged_attn_gpu_mem: Option<usize>,

+ /// Percentage of GPU memory to utilize after allocation of KV cache with Paged Attention, from 0 to 1.
+ /// If this is not set and the device is CUDA, it will default to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated.
+ /// This is always used over `pa-gpu-mem` if both are specified.
+ #[arg(long = "pa-gpu-mem-usage")]
+ paged_attn_gpu_mem_usage: Option<f32>,

/// Block size (number of tokens per block) for Paged Attention. If this is not set and the device is CUDA, it will default to 32.
/// Paged Attention is only supported on CUDA and is always automatically activated.
#[arg(long = "pa-blk-size")]
@@ -373,16 +380,32 @@ fn main() -> anyhow::Result<()> {
let cache_config = match (
args.paged_attn_block_size,
args.paged_attn_gpu_mem,
+ args.paged_attn_gpu_mem_usage,
device.is_cuda(),
args.no_paged_attn,
) {
- (block_size, None, true, false) => Some(PagedAttentionConfig::new(
- block_size, 512, None, // Autodetermine KV cache size
+ (block_size, None, None, true, false) => Some(PagedAttentionConfig::new(
+ block_size,
+ 512,
+ Either::Right(0.9), // NOTE(EricLBuehler): default is to use 90% of memory
)?),
+ (block_size, Some(m), None, true, false) => {
+ Some(PagedAttentionConfig::new(block_size, 512, Either::Left(m))?)
+ }
+ (block_size, None, Some(f), true, false) => Some(PagedAttentionConfig::new(
+ block_size,
+ 512,
+ Either::Right(f),
+ )?),
- (block_size, Some(gpu_mem), _, false) => {
- Some(PagedAttentionConfig::new(block_size, 512, Some(gpu_mem))?)
+ (block_size, Some(_m), Some(f), true, false) => {
+ info!("Both memory size and usage were specified, defaulting to the usage value.");
+ Some(PagedAttentionConfig::new(
+ block_size,
+ 512,
+ Either::Right(f),
+ )?)
}
- (_, _, _, _) => None,
+ (_, _, _, _, _) => None,
};

let pipeline = loader.load_model_from_hf(
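The match above encodes the flag precedence: a usage fraction always wins when both flags are given, an absolute size is used when it is the only flag set, and plain CUDA with neither flag defaults to 90% utilization. A minimal sketch of that resolution logic, assuming only the `either` crate (the `resolve_kv_mem` helper is hypothetical, not part of this commit):

```
use either::Either;

// Hypothetical helper mirroring the match arms above: pick the KV cache
// sizing strategy from the two optional CLI flags.
fn resolve_kv_mem(mem_mb: Option<usize>, usage: Option<f32>) -> Either<usize, f32> {
    match (mem_mb, usage) {
        // `--pa-gpu-mem-usage` always wins if both flags are set.
        (_, Some(f)) => Either::Right(f),
        // `--pa-gpu-mem` alone: an absolute KV cache size in MB.
        (Some(m), None) => Either::Left(m),
        // Neither flag: default to 90% memory utilization.
        (None, None) => Either::Right(0.9),
    }
}

fn main() {
    assert_eq!(resolve_kv_mem(Some(8192), None), Either::Left(8192));
    assert_eq!(resolve_kv_mem(Some(8192), Some(0.95)), Either::Right(0.95));
    assert_eq!(resolve_kv_mem(None, None), Either::Right(0.9));
}
```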
25 changes: 13 additions & 12 deletions mistralrs-core/src/dummy_paged_attention/mod.rs
@@ -16,6 +16,7 @@ pub use block_engine_sequence::BlockEngineSequence;
pub use cache_engine::{CacheConfig, CacheEngine};
use candle_core::{DType, Device};
pub use config::{ModelConfigLike, ModelConfigMetadata};
+ use either::Either;
pub use layers::PagedAttention;
pub use scheduler::{
PagedAttentionScheduler, PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput,
@@ -29,14 +30,14 @@ use tracing::info;
pub struct PagedAttentionConfig {
pub(crate) block_size: Option<usize>,
pub(crate) mem_cpu: usize,
- pub(crate) mem_gpu: Option<usize>,
+ pub(crate) mem_gpu: Either<usize, f32>,
}

impl PagedAttentionConfig {
pub fn new(
_block_size: Option<usize>,
_mem_cpu: usize,
- _mem_gpu: Option<usize>,
+ _mem_gpu: Either<usize, f32>,
) -> anyhow::Result<Self> {
anyhow::bail!("PagedAttention is only supported for CUDA, compile with feature `cuda`.")
}
@@ -64,9 +65,9 @@ macro_rules! mb_to_blocks {
};
}

- /// Memory values are in MBs. Specify block size or the default is 32.
+ /// Memory values are in MBs or a percentage in [0,1]. Specify block size or the default is 32.
pub fn calculate_cache_config(
- mem_gpu: Option<usize>,
+ mem_gpu: Either<usize, f32>,
mem_cpu: usize,
block_size: Option<usize>,
dtype: DType,
@@ -79,15 +80,15 @@ pub fn calculate_cache_config(
}
let dtype_size = dtype.size_in_bytes();

+ #[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
let mem_gpu = match mem_gpu {
- Some(v) => v,
- None => {
- let free = MemoryUsage.get_memory_available(device)? / SIZE_IN_MB;
- info!(
- "Automatically using {} MB for Paged Attention KV cache",
- free - 512
- );
- free - 512
+ Either::Left(v) => v,
+ Either::Right(f) => {
+ let free = MemoryUsage.get_memory_available(device)? as f32 / SIZE_IN_MB as f32;
+ let total = MemoryUsage.get_total_memory(device)? as f32 / SIZE_IN_MB as f32 * f;
+ let size = (total - free) as usize;
+ info!("Allocating {size} MB for Paged Attention KV cache");
+ size
}
};

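Both the real and dummy modules convert the resolved MB figure into a block count via `mb_to_blocks!`, whose body is collapsed in this view. A rough sketch of the standard paged-attention conversion; the parameter names are illustrative assumptions, not the macro's actual inputs:

```
// Sketch of an MB -> KV-cache-block conversion like the one `mb_to_blocks!`
// performs; the macro body is collapsed in this diff, so these parameter
// names are assumptions.
fn mb_to_blocks(
    mem_mb: usize,
    dtype_size: usize,   // bytes per element, e.g. 2 for f16
    block_size: usize,   // tokens per block, default 32
    num_layers: usize,
    num_kv_heads: usize,
    head_dim: usize,
) -> usize {
    let bytes = mem_mb * 1024 * 1024;
    // Every cached token stores a key and a value vector per layer (factor 2).
    let bytes_per_block = block_size * num_kv_heads * head_dim * num_layers * 2 * dtype_size;
    bytes / bytes_per_block
}
```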
3 changes: 3 additions & 0 deletions mistralrs-core/src/dummy_paged_attention/scheduler.rs
@@ -345,4 +345,7 @@ impl Scheduler for PagedAttentionScheduler {
fn free_finished_sequence_groups(&mut self) {
self.free_finished_sequence_groups()
}
+ fn block_engine(&mut self) -> Option<&mut BlockEngine> {
+ Some(&mut self.block_engine)
+ }
}
83 changes: 56 additions & 27 deletions mistralrs-core/src/engine/mod.rs
@@ -257,50 +257,79 @@ impl Engine {
}
SchedulerOutput::PagedAttention { mut output } => {
if !output.scheduled.is_empty() {
- let mut pipeline = get_mut_arcmutex!(self.pipeline);

let is_prompt = get_mut_arcmutex!(output.scheduled[0]).is_prompt();

- let block_tables = self.scheduler.block_tables().unwrap();
- let block_size = self.scheduler.block_size().unwrap();
-
- let metadata = PagedAttentionMeta {
- block_tables,
- block_size,
- sliding_window: pipeline.get_metadata().sliding_window,
- };

let mut guards = output
.scheduled
.iter_mut()
.map(|seq| seq.lock().unwrap())
.collect::<Vec<_>>();

- let res = pipeline
- .step(
- &mut guards.iter_mut().map(|seq| &mut **seq).collect::<Vec<_>>(),
- is_prompt,
- &mut self.prefix_cacher,
- self.disable_eos_stop,
- rng.clone(),
- CacheBackendMetadata::PagedAttention {
- metadata,
- blocks_to_copy: output.blocks_to_copy,
- blocks_to_swap_in: output.blocks_to_swap_in,
- blocks_to_swap_out: output.blocks_to_swap_out,
- },
- )
- .await;
+ let mut guards_mut =
+ guards.iter_mut().map(|seq| &mut **seq).collect::<Vec<_>>();
+
+ let res = {
+ let mut pipeline = get_mut_arcmutex!(self.pipeline);
+
+ let block_size = self.scheduler.block_size().unwrap();
+
+ let metadata = PagedAttentionMeta {
+ block_size,
+ sliding_window: pipeline.get_metadata().sliding_window,
+ block_engine: self.scheduler.block_engine().unwrap(),
+ };
+
+ pipeline
+ .step(
+ &mut guards_mut,
+ is_prompt,
+ &mut self.prefix_cacher,
+ self.disable_eos_stop,
+ rng.clone(),
+ CacheBackendMetadata::PagedAttention {
+ metadata,
+ blocks_to_copy: output.blocks_to_copy,
+ blocks_to_swap_in: output.blocks_to_swap_in,
+ blocks_to_swap_out: output.blocks_to_swap_out,
+ },
+ )
+ .await
+ };

handle_pipeline_forward_error!(
"step",
res,
- &mut guards.iter_mut().map(|seq| &mut **seq).collect::<Vec<_>>(),
+ &mut guards_mut,
self.pipeline,
'lp,
self.prefix_cacher
);

+ if self.is_debug {
+ let ms_from_last_run = run_start.elapsed().as_secs_f64();
+ let total_len = guards.len();
+ if total_len > 0 {
+ let lengths = guards
+ .iter()
+ .map(|seq| seq.len().to_string())
+ .collect::<Vec<_>>()
+ .join(", ");
+
+ let (prompt_lengths, completion_lengths) = if is_prompt {
+ (lengths, "".to_string())
+ } else {
+ ("".to_string(), lengths)
+ };
+
+ tracing::info!(
+ "Prompt[{}] Completion[{}] - {}ms",
+ prompt_lengths,
+ completion_lengths,
+ ms_from_last_run * 1000.,
+ );
+ }
+ }

if is_prompt {
for mut seq in guards {
let now = SystemTime::now()
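The restructuring above moves the `get_mut_arcmutex!(self.pipeline)` lock into a block expression, so the pipeline guard is dropped before `handle_pipeline_forward_error!` runs and takes `self.pipeline` again. A minimal sketch of that pattern with a plain `std::sync::Mutex` (the names here are illustrative, not from the engine):

```
use std::sync::{Arc, Mutex};

// Minimal sketch: acquire the lock inside a block expression so the guard is
// released before later code re-locks the same mutex.
fn step_then_handle(pipeline: &Arc<Mutex<Vec<u32>>>) {
    let res: Result<usize, &str> = {
        let mut guard = pipeline.lock().unwrap(); // guard lives only in this block
        guard.push(1);
        Ok(guard.len())
    }; // guard dropped here, mutex released

    if res.is_err() {
        // Safe to lock again: the earlier guard is already gone.
        pipeline.lock().unwrap().clear();
    }
}
```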
25 changes: 13 additions & 12 deletions mistralrs-core/src/paged_attention/mod.rs
@@ -16,6 +16,7 @@ pub use block_engine_sequence::BlockEngineSequence;
pub use cache_engine::{CacheConfig, CacheEngine};
use candle_core::{DType, Device};
pub use config::{ModelConfigLike, ModelConfigMetadata};
+ use either::Either;
pub use layers::PagedAttention;
pub use scheduler::{
PagedAttentionScheduler, PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput,
@@ -29,14 +30,14 @@ use tracing::info;
pub struct PagedAttentionConfig {
pub(crate) block_size: Option<usize>,
pub(crate) mem_cpu: usize,
- pub(crate) mem_gpu: Option<usize>,
+ pub(crate) mem_gpu: Either<usize, f32>,
}

impl PagedAttentionConfig {
pub fn new(
block_size: Option<usize>,
mem_cpu: usize,
- mem_gpu: Option<usize>,
+ mem_gpu: Either<usize, f32>,
) -> anyhow::Result<Self> {
Ok(Self {
block_size,
@@ -68,9 +69,9 @@ macro_rules! mb_to_blocks {
};
}

- /// Memory values are in MBs. Specify block size or the default is 32.
+ /// Memory values are in MBs or a percentage in [0,1]. Specify block size or the default is 32.
pub fn calculate_cache_config(
- mem_gpu: Option<usize>,
+ mem_gpu: Either<usize, f32>,
mem_cpu: usize,
block_size: Option<usize>,
dtype: DType,
@@ -83,15 +84,15 @@ pub fn calculate_cache_config(
}
let dtype_size = dtype.size_in_bytes();

+ #[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
let mem_gpu = match mem_gpu {
- Some(v) => v,
- None => {
- let free = MemoryUsage.get_memory_available(device)? / SIZE_IN_MB;
- info!(
- "Automatically using {} MB for Paged Attention KV cache",
- free - 512
- );
- free - 512
+ Either::Left(v) => v,
+ Either::Right(f) => {
+ let free = MemoryUsage.get_memory_available(device)? as f32 / SIZE_IN_MB as f32;
+ let total = MemoryUsage.get_total_memory(device)? as f32 / SIZE_IN_MB as f32 * f;
+ let size = (total - free) as usize;
+ info!("Allocating {size} MB for Paged Attention KV cache");
+ size
}
};

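With `mem_gpu` now an `Either<usize, f32>`, call sites choose between an absolute size and a utilization fraction at construction time. A hedged sketch of the two call shapes, assuming `PagedAttentionConfig` is in scope (e.g. re-exported from `mistralrs_core`) and the signature shown in this diff:

```
use either::Either;

fn build_configs() -> anyhow::Result<()> {
    // Absolute KV cache size: 8192 MB on GPU, 32-token blocks, 512 MB on CPU.
    let _by_size = PagedAttentionConfig::new(Some(32), 512, Either::Left(8192))?;
    // Utilization target: size the cache toward 90% of total GPU memory.
    let _by_usage = PagedAttentionConfig::new(Some(32), 512, Either::Right(0.9))?;
    Ok(())
}
```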
3 changes: 3 additions & 0 deletions mistralrs-core/src/paged_attention/scheduler.rs
@@ -345,4 +345,7 @@ impl Scheduler for PagedAttentionScheduler {
fn free_finished_sequence_groups(&mut self) {
self.free_finished_sequence_groups()
}
+ fn block_engine(&mut self) -> Option<&mut BlockEngine> {
+ Some(&mut self.block_engine)
+ }
}