v0.3.4 (#942)
* v0.3.3

* General Metal bf16 support

* Clippy

* Update example

* Update example

* Cfg

* Export diffusion arch

* Cfg

* Bump
EricLBuehler authored Nov 28, 2024
1 parent d5cc451 commit 68c078f
Showing 24 changed files with 166 additions and 59 deletions.
28 changes: 14 additions & 14 deletions Cargo.lock

Some generated files are not rendered by default.

6 changes: 3 additions & 3 deletions Cargo.toml
@@ -14,7 +14,7 @@ exclude = [
resolver = "2"

[workspace.package]
-version = "0.3.2"
+version = "0.3.4"
edition = "2021"
description = "Fast and easy LLM serving."
homepage = "https://github.com/EricLBuehler/mistral.rs"
@@ -25,8 +25,8 @@ license = "MIT"

[workspace.dependencies]
anyhow = "1.0.80"
-candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "823a83a" }
-candle-nn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "823a83a" }
+candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "8742354" }
+candle-nn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "8742354" }
serde = "1.0.197"
serde_json = "1.0.114"
indexmap = { version = "2.2.5", features = ["serde"] }
2 changes: 1 addition & 1 deletion docs/VLLAMA.md
@@ -229,7 +229,7 @@ res = runner.send_chat_completion_request(
},
{
"type": "text",
-"text": "What is shown in this image? Write a detailed response analyzing the scene.",
+"text": "<|image|>What is shown in this image? Write a detailed response analyzing the scene.",
},
],
}
2 changes: 1 addition & 1 deletion examples/python/llama_vision.py
@@ -25,7 +25,7 @@
},
{
"type": "text",
-"text": "What is shown in this image? Write a detailed response analyzing the scene.",
+"text": "<|image|>What is shown in this image? Write a detailed response analyzing the scene.",
},
],
}
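Both example updates make the same fix: the text portion of the prompt now leads with the `<|image|>` token, which the Llama 3.2 Vision prompt format uses to mark where the image features are spliced into the sequence; without it the model may effectively ignore the attached image.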
37 changes: 37 additions & 0 deletions examples/python/smolvlm.py
@@ -0,0 +1,37 @@
from mistralrs import Runner, Which, ChatCompletionRequest, VisionArchitecture

runner = Runner(
    which=Which.VisionPlain(
        model_id="HuggingFaceTB/SmolVLM-Instruct",
        arch=VisionArchitecture.Idefics3,
    ),
)

res = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="idefics3",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/45/5645-050-B9EC0205/head-treasure-flower-disk-flowers-inflorescence-ray.jpg"
                        },
                    },
                    {
                        "type": "text",
                        "text": "What is shown in this image?",
                    },
                ],
            },
        ],
        max_tokens=256,
        presence_penalty=1.0,
        top_p=0.1,
        temperature=0.1,
    )
)
print(res.choices[0].message.content)
print(res.usage)
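A brief usage note: SmolVLM reuses the Idefics3 architecture, which is why `arch=VisionArchitecture.Idefics3` is selected; the script should run as-is (`python examples/python/smolvlm.py`) once a mistralrs Python wheel is installed.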
2 changes: 1 addition & 1 deletion mistralrs-bench/Cargo.toml
@@ -18,7 +18,7 @@ candle-core.workspace = true
serde.workspace = true
serde_json.workspace = true
clap.workspace = true
-mistralrs-core = { version = "0.3.2", path = "../mistralrs-core" }
+mistralrs-core = { version = "0.3.4", path = "../mistralrs-core" }
tracing.workspace = true
tokio.workspace = true
cli-table = "0.4.7"
8 changes: 4 additions & 4 deletions mistralrs-core/Cargo.toml
@@ -17,7 +17,7 @@ candle-core.workspace = true
candle-nn.workspace = true
serde.workspace = true
serde_json.workspace = true
-candle-flash-attn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "823a83a", optional = true }
+candle-flash-attn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "8742354", optional = true }
dirs = "5.0.1"
hf-hub = { version = "0.3.3", package = "candle-hf-hub" }
thiserror = "1.0.57"
@@ -64,13 +64,13 @@ tracing-subscriber.workspace = true
derive-new = "0.7.0"
itertools = "0.13.0"
sysinfo = "0.30.12"
-mistralrs-vision = { version = "0.3.2", path = "../mistralrs-vision" }
+mistralrs-vision = { version = "0.3.4", path = "../mistralrs-vision" }
csv = "1.3.0"
reqwest.workspace = true
base64.workspace = true
bytemuck_derive = "1.7.0"
-mistralrs-paged-attn = { version = "0.3.2", path = "../mistralrs-paged-attn", optional = true }
-mistralrs-quant = { version = "0.3.2", path = "../mistralrs-quant" }
+mistralrs-paged-attn = { version = "0.3.4", path = "../mistralrs-paged-attn", optional = true }
+mistralrs-quant = { version = "0.3.4", path = "../mistralrs-quant" }
uuid = { version = "1.10.0", features = ["v4"] }
schemars = "0.8.21"
serde_yaml = "0.9.34"
2 changes: 1 addition & 1 deletion mistralrs-core/src/aici/svob.rs
@@ -337,7 +337,7 @@ pub struct SimpleVobIter<'a> {
    idx: usize,

-impl<'a> Iterator for SimpleVobIter<'a> {
+impl Iterator for SimpleVobIter<'_> {
    type Item = u32;

    #[inline(always)]
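This is clippy's `needless_lifetimes` fix: the `'a` parameter is never referred to by name inside the impl, so it can be elided with `'_`. A minimal sketch of the same pattern on a hypothetical iterator type (not the mistral.rs code):

```rust
struct Bits<'a> {
    data: &'a [u32],
    idx: usize,
}

// Before: impl<'a> Iterator for Bits<'a> { ... }
// After: the lifetime is unused by name, so clippy suggests eliding it.
impl Iterator for Bits<'_> {
    type Item = u32;

    fn next(&mut self) -> Option<u32> {
        let item = self.data.get(self.idx).copied();
        self.idx += 1;
        item
    }
}

fn main() {
    let data: &[u32] = &[1, 2, 3];
    let total: u32 = Bits { data, idx: 0 }.sum();
    assert_eq!(total, 6);
}
```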
68 changes: 67 additions & 1 deletion mistralrs-core/src/attention.rs
@@ -1,5 +1,8 @@
#![allow(clippy::cast_precision_loss)]

+#[cfg(feature = "metal")]
+use std::sync::atomic::AtomicUsize;

use crate::{
cublaslt::CUBLASLT_HANDLE,
layers::{get_use_matmul_via_f16, MatMul},
@@ -8,6 +11,10 @@ use crate::{

use candle_core::{Device, Result, Tensor};

+#[cfg(feature = "metal")]
+/// Initial, sentinel value is usize::MAX
+static METAL_VERSION_CACHE: AtomicUsize = AtomicUsize::new(usize::MAX);

#[cfg(feature = "flash-attn")]
fn flash_attn(
q: &Tensor,
@@ -92,7 +99,66 @@ fn naive_sdpa(
head_dim: usize,
sdpa_params: &SdpaParams,
) -> Result<Tensor> {
-    if mask.is_some_and(|mask| mask.rank() == 2) {
+    #[cfg(feature = "metal")]
+    let supports_attn_softmax = {
+        use std::sync::atomic::Ordering;
+        let cache = METAL_VERSION_CACHE.load(Ordering::Relaxed);
+
+        let version = if cache != usize::MAX {
+            cache
+        } else {
+            // echo "__METAL_VERSION__" | xcrun -sdk macosx metal -E -x metal -P -
+
+            use std::process::{Command, Stdio};
+
+            // Create the `echo` command and pipe its output into `xcrun`
+            let mut echo = Command::new("echo")
+                .arg("__METAL_VERSION__")
+                .stdout(Stdio::piped())
+                .spawn()
+                .expect("Failed to start echo command");
+
+            echo.wait()?;
+
+            // Run the `xcrun` command, taking input from the `echo` command's output
+            let output = Command::new("xcrun")
+                .arg("-sdk")
+                .arg("macosx")
+                .arg("metal")
+                .arg("-E")
+                .arg("-x")
+                .arg("metal")
+                .arg("-P")
+                .arg("-")
+                .stdin(echo.stdout.unwrap())
+                .output()
+                .expect("Failed to run xcrun command");
+
+            // Handle the output
+            if output.status.success() {
+                let version = String::from_utf8_lossy(&output.stdout)
+                    .split('\n')
+                    .nth(1)
+                    .unwrap()
+                    .trim()
+                    .to_string()
+                    .parse::<usize>()
+                    .unwrap();
+                METAL_VERSION_CACHE.store(version, Ordering::Relaxed);
+                version
+            } else {
+                let stderr = String::from_utf8_lossy(&output.stderr);
+                panic!("Error:\n{}", stderr);
+            }
+        };
+        // Attn softmax is only supported for metal >= 310
+        version >= 310
+    };
+
+    #[cfg(not(feature = "metal"))]
+    let supports_attn_softmax = true;
+
+    if mask.is_some_and(|mask| mask.rank() == 2) && supports_attn_softmax {
        let mut att = MatMul.matmul(q, &k.t()?)?;
        if let Some(softcap) = sdpa_params.softcap {
            att = (att / softcap as f64)?;
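The probe spawns `echo` only to feed the `__METAL_VERSION__` macro into the Metal compiler's preprocessor. A standalone sketch of the same idea using a single child process, writing to `xcrun`'s stdin directly; this is an illustration under the assumption that `xcrun` and the macOS SDK are installed, not the committed code:

```rust
use std::io::Write;
use std::process::{Command, Stdio};

/// Ask the Metal compiler's preprocessor to expand `__METAL_VERSION__`
/// (e.g. 310 for Metal 3.1) without spawning a separate `echo` process.
fn metal_version() -> std::io::Result<usize> {
    let mut child = Command::new("xcrun")
        .args(["-sdk", "macosx", "metal", "-E", "-x", "metal", "-P", "-"])
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()?;

    // Feed the macro through stdin instead of piping from `echo`.
    child
        .stdin
        .take()
        .expect("stdin was requested above")
        .write_all(b"__METAL_VERSION__")?;

    let output = child.wait_with_output()?;
    let text = String::from_utf8_lossy(&output.stdout);
    // The preprocessor echoes back the expanded integer; take the first numeric line.
    text.lines()
        .filter_map(|l| l.trim().parse::<usize>().ok())
        .next()
        .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::Other, "no version in output"))
}

fn main() -> std::io::Result<()> {
    let v = metal_version()?;
    println!("Metal version: {v} (attn softmax supported: {})", v >= 310);
    Ok(())
}
```

Note that the committed code additionally caches the result in `METAL_VERSION_CACHE`, so the compiler is invoked at most once per process.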
1 change: 1 addition & 0 deletions mistralrs-core/src/cublaslt/mod.rs
@@ -46,6 +46,7 @@ pub fn setup_cublas_lt_wrapper() {
        tracing::info!("Initialized cuBLASlt handle");
    }
});
+#[allow(static_mut_refs)]
let cublaslt: Option<&'static CublasLtWrapper> = CUBLASLT.as_ref();
*CUBLASLT_HANDLE.lock().unwrap() = cublaslt;
}
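A note on the new attribute: `CUBLASLT.as_ref()` takes a shared reference to what appears to be a `static mut`, which recent Rust flags with the `static_mut_refs` lint (and rejects outright in the 2024 edition); `#[allow(static_mut_refs)]` silences the lint while keeping the existing once-guarded initialization pattern unchanged.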
3 changes: 1 addition & 2 deletions mistralrs-core/src/engine/mod.rs
@@ -829,8 +829,7 @@ impl Engine {
    .as_ref()
    .expect("If a model has a NormalCache it must have a model metadata");
let n_tokens = prompt_tokens.len();
-let required_blocks =
-    (n_tokens + NormalCache::CACHE_GROW_SIZE - 1) / NormalCache::CACHE_GROW_SIZE;
+let required_blocks = n_tokens.div_ceil(NormalCache::CACHE_GROW_SIZE);
let max_seq_len = required_blocks * NormalCache::CACHE_GROW_SIZE;
let kv_shape = (
    1usize,
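`div_ceil` is the standard library's ceiling division (stabilized for the integer types in Rust 1.73); clippy's `manual_div_ceil` lint suggests it in place of the hand-rolled `(n + d - 1) / d`, which computes the same result but can overflow when `n` is near `usize::MAX`. A quick sanity check of the rewrite, with an illustrative block size (the real constant is `NormalCache::CACHE_GROW_SIZE`):

```rust
const CACHE_GROW_SIZE: usize = 512; // illustrative; not the real value

fn main() {
    for n_tokens in [1usize, 511, 512, 513, 2000] {
        let manual = (n_tokens + CACHE_GROW_SIZE - 1) / CACHE_GROW_SIZE;
        let builtin = n_tokens.div_ceil(CACHE_GROW_SIZE);
        assert_eq!(manual, builtin);
        // Round up to whole cache blocks, then pad to the block boundary.
        println!(
            "{n_tokens} tokens -> {builtin} block(s) -> max_seq_len {}",
            builtin * CACHE_GROW_SIZE
        );
    }
}
```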
(Remaining file diffs not loaded.)
