v0.3.4 (#942)
* v0.3.3

* General Metal bf16 support

* Clippy

* Update example

* Update example

* Cfg

* Export diffusion arch

* Cfg

* Bump
EricLBuehler authored Nov 28, 2024
1 parent d5cc451 commit 68c078f
Showing 24 changed files with 166 additions and 59 deletions.
28 changes: 14 additions & 14 deletions Cargo.lock

Some generated files are not rendered by default.

6 changes: 3 additions & 3 deletions Cargo.toml
@@ -14,7 +14,7 @@ exclude = [
resolver = "2"

[workspace.package]
-version = "0.3.2"
+version = "0.3.4"
edition = "2021"
description = "Fast and easy LLM serving."
homepage = "https://github.com/EricLBuehler/mistral.rs"
@@ -25,8 +25,8 @@ license = "MIT"

[workspace.dependencies]
anyhow = "1.0.80"
-candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "823a83a" }
-candle-nn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "823a83a" }
+candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "8742354" }
+candle-nn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "8742354" }
serde = "1.0.197"
serde_json = "1.0.114"
indexmap = { version = "2.2.5", features = ["serde"] }
2 changes: 1 addition & 1 deletion docs/VLLAMA.md
@@ -229,7 +229,7 @@ res = runner.send_chat_completion_request(
},
{
"type": "text",
-"text": "What is shown in this image? Write a detailed response analyzing the scene.",
+"text": "<|image|>What is shown in this image? Write a detailed response analyzing the scene.",
},
],
}
2 changes: 1 addition & 1 deletion examples/python/llama_vision.py
@@ -25,7 +25,7 @@
},
{
"type": "text",
-"text": "What is shown in this image? Write a detailed response analyzing the scene.",
+"text": "<|image|>What is shown in this image? Write a detailed response analyzing the scene.",
},
],
}
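Both example updates make the same fix: the text portion of the prompt now leads with the `<|image|>` token, which the Llama 3.2 Vision prompt format uses to mark where the image features are spliced into the sequence; without it the model may effectively ignore the attached image.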
37 changes: 37 additions & 0 deletions examples/python/smolvlm.py
@@ -0,0 +1,37 @@
from mistralrs import Runner, Which, ChatCompletionRequest, VisionArchitecture

runner = Runner(
    which=Which.VisionPlain(
        model_id="HuggingFaceTB/SmolVLM-Instruct",
        arch=VisionArchitecture.Idefics3,
    ),
)

res = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="idefics3",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/45/5645-050-B9EC0205/head-treasure-flower-disk-flowers-inflorescence-ray.jpg"
                        },
                    },
                    {
                        "type": "text",
                        "text": "What is shown in this image?",
                    },
                ],
            },
        ],
        max_tokens=256,
        presence_penalty=1.0,
        top_p=0.1,
        temperature=0.1,
    )
)
print(res.choices[0].message.content)
print(res.usage)
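A brief usage note: SmolVLM reuses the Idefics3 architecture, which is why `arch=VisionArchitecture.Idefics3` is selected; the script should run as-is (`python examples/python/smolvlm.py`) once a mistralrs Python wheel is installed.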
2 changes: 1 addition & 1 deletion mistralrs-bench/Cargo.toml
@@ -18,7 +18,7 @@ candle-core.workspace = true
serde.workspace = true
serde_json.workspace = true
clap.workspace = true
-mistralrs-core = { version = "0.3.2", path = "../mistralrs-core" }
+mistralrs-core = { version = "0.3.4", path = "../mistralrs-core" }
tracing.workspace = true
tokio.workspace = true
cli-table = "0.4.7"
8 changes: 4 additions & 4 deletions mistralrs-core/Cargo.toml
@@ -17,7 +17,7 @@ candle-core.workspace = true
candle-nn.workspace = true
serde.workspace = true
serde_json.workspace = true
-candle-flash-attn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "823a83a", optional = true }
+candle-flash-attn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "8742354", optional = true }
dirs = "5.0.1"
hf-hub = { version = "0.3.3", package = "candle-hf-hub" }
thiserror = "1.0.57"
@@ -64,13 +64,13 @@ tracing-subscriber.workspace = true
derive-new = "0.7.0"
itertools = "0.13.0"
sysinfo = "0.30.12"
-mistralrs-vision = { version = "0.3.2", path = "../mistralrs-vision" }
+mistralrs-vision = { version = "0.3.4", path = "../mistralrs-vision" }
csv = "1.3.0"
reqwest.workspace = true
base64.workspace = true
bytemuck_derive = "1.7.0"
-mistralrs-paged-attn = { version = "0.3.2", path = "../mistralrs-paged-attn", optional = true }
-mistralrs-quant = { version = "0.3.2", path = "../mistralrs-quant" }
+mistralrs-paged-attn = { version = "0.3.4", path = "../mistralrs-paged-attn", optional = true }
+mistralrs-quant = { version = "0.3.4", path = "../mistralrs-quant" }
uuid = { version = "1.10.0", features = ["v4"] }
schemars = "0.8.21"
serde_yaml = "0.9.34"
2 changes: 1 addition & 1 deletion mistralrs-core/src/aici/svob.rs
@@ -337,7 +337,7 @@ pub struct SimpleVobIter<'a> {
    idx: usize,

-impl<'a> Iterator for SimpleVobIter<'a> {
+impl Iterator for SimpleVobIter<'_> {
    type Item = u32;

    #[inline(always)]
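This is clippy's `needless_lifetimes` fix: the `'a` parameter is never referred to by name inside the impl, so it can be elided with `'_`. A minimal sketch of the same pattern on a hypothetical iterator type (not the mistral.rs code):

```rust
struct Bits<'a> {
    data: &'a [u32],
    idx: usize,
}

// Before: impl<'a> Iterator for Bits<'a> { ... }
// After: the lifetime is unused by name, so clippy suggests eliding it.
impl Iterator for Bits<'_> {
    type Item = u32;

    fn next(&mut self) -> Option<u32> {
        let item = self.data.get(self.idx).copied();
        self.idx += 1;
        item
    }
}

fn main() {
    let data: &[u32] = &[1, 2, 3];
    let total: u32 = Bits { data, idx: 0 }.sum();
    assert_eq!(total, 6);
}
```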
68 changes: 67 additions & 1 deletion mistralrs-core/src/attention.rs
@@ -1,5 +1,8 @@
#![allow(clippy::cast_precision_loss)]

+#[cfg(feature = "metal")]
+use std::sync::atomic::AtomicUsize;

use crate::{
cublaslt::CUBLASLT_HANDLE,
layers::{get_use_matmul_via_f16, MatMul},
@@ -8,6 +11,10 @@ use crate::{

use candle_core::{Device, Result, Tensor};

+#[cfg(feature = "metal")]
+/// Initial, sentinel value is usize::MAX
+static METAL_VERSION_CACHE: AtomicUsize = AtomicUsize::new(usize::MAX);

#[cfg(feature = "flash-attn")]
fn flash_attn(
q: &Tensor,
@@ -92,7 +99,66 @@ fn naive_sdpa(
head_dim: usize,
sdpa_params: &SdpaParams,
) -> Result<Tensor> {
-    if mask.is_some_and(|mask| mask.rank() == 2) {
+    #[cfg(feature = "metal")]
+    let supports_attn_softmax = {
+        use std::sync::atomic::Ordering;
+        let cache = METAL_VERSION_CACHE.load(Ordering::Relaxed);
+
+        let version = if cache != usize::MAX {
+            cache
+        } else {
+            // echo "__METAL_VERSION__" | xcrun -sdk macosx metal -E -x metal -P -
+
+            use std::process::{Command, Stdio};
+
+            // Create the `echo` command and pipe its output into `xcrun`
+            let mut echo = Command::new("echo")
+                .arg("__METAL_VERSION__")
+                .stdout(Stdio::piped())
+                .spawn()
+                .expect("Failed to start echo command");
+
+            echo.wait()?;
+
+            // Run the `xcrun` command, taking input from the `echo` command's output
+            let output = Command::new("xcrun")
+                .arg("-sdk")
+                .arg("macosx")
+                .arg("metal")
+                .arg("-E")
+                .arg("-x")
+                .arg("metal")
+                .arg("-P")
+                .arg("-")
+                .stdin(echo.stdout.unwrap())
+                .output()
+                .expect("Failed to run xcrun command");
+
+            // Handle the output
+            if output.status.success() {
+                let version = String::from_utf8_lossy(&output.stdout)
+                    .split('\n')
+                    .nth(1)
+                    .unwrap()
+                    .trim()
+                    .to_string()
+                    .parse::<usize>()
+                    .unwrap();
+                METAL_VERSION_CACHE.store(version, Ordering::Relaxed);
+                version
+            } else {
+                let stderr = String::from_utf8_lossy(&output.stderr);
+                panic!("Error:\n{}", stderr);
+            }
+        };
+        // Attn softmax is only supported for metal >= 310
+        version >= 310
+    };
+
+    #[cfg(not(feature = "metal"))]
+    let supports_attn_softmax = true;
+
+    if mask.is_some_and(|mask| mask.rank() == 2) && supports_attn_softmax {
        let mut att = MatMul.matmul(q, &k.t()?)?;
        if let Some(softcap) = sdpa_params.softcap {
            att = (att / softcap as f64)?;
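The probe spawns `echo` only to feed the `__METAL_VERSION__` macro into the Metal compiler's preprocessor. A standalone sketch of the same idea using a single child process, writing to `xcrun`'s stdin directly; this is an illustration under the assumption that `xcrun` and the macOS SDK are installed, not the committed code:

```rust
use std::io::Write;
use std::process::{Command, Stdio};

/// Ask the Metal compiler's preprocessor to expand `__METAL_VERSION__`
/// (e.g. 310 for Metal 3.1) without spawning a separate `echo` process.
fn metal_version() -> std::io::Result<usize> {
    let mut child = Command::new("xcrun")
        .args(["-sdk", "macosx", "metal", "-E", "-x", "metal", "-P", "-"])
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()?;

    // Feed the macro through stdin instead of piping from `echo`.
    child
        .stdin
        .take()
        .expect("stdin was requested above")
        .write_all(b"__METAL_VERSION__")?;

    let output = child.wait_with_output()?;
    let text = String::from_utf8_lossy(&output.stdout);
    // The preprocessor echoes back the expanded integer; take the first numeric line.
    text.lines()
        .filter_map(|l| l.trim().parse::<usize>().ok())
        .next()
        .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::Other, "no version in output"))
}

fn main() -> std::io::Result<()> {
    let v = metal_version()?;
    println!("Metal version: {v} (attn softmax supported: {})", v >= 310);
    Ok(())
}
```

Note that the committed code additionally caches the result in `METAL_VERSION_CACHE`, so the compiler is invoked at most once per process.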
1 change: 1 addition & 0 deletions mistralrs-core/src/cublaslt/mod.rs
@@ -46,6 +46,7 @@ pub fn setup_cublas_lt_wrapper() {
        tracing::info!("Initialized cuBLASlt handle");
    }
});
+#[allow(static_mut_refs)]
let cublaslt: Option<&'static CublasLtWrapper> = CUBLASLT.as_ref();
*CUBLASLT_HANDLE.lock().unwrap() = cublaslt;
}
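A note on the new attribute: `CUBLASLT.as_ref()` takes a shared reference to what appears to be a `static mut`, which recent Rust flags with the `static_mut_refs` lint (and rejects outright in the 2024 edition); `#[allow(static_mut_refs)]` silences the lint while keeping the existing once-guarded initialization pattern unchanged.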
3 changes: 1 addition & 2 deletions mistralrs-core/src/engine/mod.rs
@@ -829,8 +829,7 @@ impl Engine {
    .as_ref()
    .expect("If a model has a NormalCache it must have a model metadata");
let n_tokens = prompt_tokens.len();
-let required_blocks =
-    (n_tokens + NormalCache::CACHE_GROW_SIZE - 1) / NormalCache::CACHE_GROW_SIZE;
+let required_blocks = n_tokens.div_ceil(NormalCache::CACHE_GROW_SIZE);
let max_seq_len = required_blocks * NormalCache::CACHE_GROW_SIZE;
let kv_shape = (
    1usize,
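`div_ceil` is the standard library's ceiling division (stabilized for the integer types in Rust 1.73); clippy's `manual_div_ceil` lint suggests it in place of the hand-rolled `(n + d - 1) / d`, which computes the same result but can overflow when `n` is near `usize::MAX`. A quick sanity check of the rewrite, with an illustrative block size (the real constant is `NormalCache::CACHE_GROW_SIZE`):

```rust
const CACHE_GROW_SIZE: usize = 512; // illustrative; not the real value

fn main() {
    for n_tokens in [1usize, 511, 512, 513, 2000] {
        let manual = (n_tokens + CACHE_GROW_SIZE - 1) / CACHE_GROW_SIZE;
        let builtin = n_tokens.div_ceil(CACHE_GROW_SIZE);
        assert_eq!(manual, builtin);
        // Round up to whole cache blocks, then pad to the block boundary.
        println!(
            "{n_tokens} tokens -> {builtin} block(s) -> max_seq_len {}",
            builtin * CACHE_GROW_SIZE
        );
    }
}
```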
(Remaining file diffs not loaded.)
