EricLBuehler · joshpopelka20 · Aug 12, 2024 · Aug 12, 2024 · Aug 14, 2024 · Aug 14, 2024
diff --git a/mistralrs-core/src/models/llama.rs b/mistralrs-core/src/models/llama.rs
@@ -405,7 +405,9 @@ pub struct Llama {
     blocks: Vec<Block>,
     ln_f: RmsNorm,
     lm_head: QMatMul,
-    pub kv_cache: crate::pipeline::Cache,
+    /// Per-device KV caches; `NormalModel::cache` hands out the entry at index 0.
+    pub kv_caches: Vec<crate::pipeline::Cache>,
+    cuda_devices: Vec<candle_core::Device>,
     pub device: Device,
     mapper: Box<dyn DeviceMapper + Send + Sync>,
     cfg: ModelConfigMetadata,
@@ -421,18 +423,35 @@ impl Llama {
         mut metadata: Option<(Vec<(Tensor, Tensor)>, &mut PagedAttentionInputMetadata)>,
     ) -> Result<Tensor> {
         let mut x = self.wte.forward(input_ids)?;
-        let mut cache = self.kv_cache.lock();
+        let (_batch_size, seq_len, _hidden_size) = x.dims3()?;
+
+        let num_devices = 1;
+        let _chunk_size = seq_len / num_devices;
+
+        // Single-device scaffold for sequence chunking: the whole embedded input is one chunk.
+        // `chunks[0]` is a copy of `x`; it is only read for its dtype and device below.
+        // TODO: remove the `println!` device diagnostics before merging; they run on every
+        // call of this function.
+        let mut chunks: Vec<Tensor> = Vec::with_capacity(num_devices);
+        println!("x device {:?}", x.device());
+        chunks.push(x.copy()?);
+        println!("chunk device {:?}", chunks[0].device());
+
+        let mut cache = self.kv_caches[0].lock();
         let mask = CausalMasker.make_causal_mask_as_attn_bias(
             input_ids,
             metadata
                 .as_ref()
                 .map(|(_, _)| &seqlen_offsets as &dyn PastKvLenCache)
                 .unwrap_or(&*cache as &dyn PastKvLenCache),
-            x.dtype(),
+            // The mask dtype follows the chunk, which is a copy of the embedding output.
+            chunks[0].dtype(),
             self.blocks[0].attn.num_attention_heads,
         )?;
         for (block_idx, block) in self.blocks.iter().enumerate() {
-            x = self.mapper.map(x, block_idx)?;
+            // Chain the layers: each block consumes the previous block's output. Re-copying
+            // `chunks[0]` here would restart every layer from the raw embeddings.
+            x = self.mapper.map(x, block_idx)?;
             x = block.forward(
                 &x,
                 &mask.clone().map(|m| m.to_device(x.device()).unwrap()),
@@ -468,6 +487,9 @@ impl Llama {
                 quant_cfg.bits
             );
         }
+        // `cuda_devices` is filled below with each distinct `device` seen while blocks load.
+        let num_devices = 1;
+        let mut cuda_devices = Vec::with_capacity(num_devices);
         let mapper = normal_loading_metadata
             .mapper
             .into_mapper(cfg.num_hidden_layers, &normal_loading_metadata.real_device)?;
@@ -514,6 +536,9 @@ impl Llama {
                             .expect("Failed to create PagedAttention"),
                         ),
                     };
+                    if !cuda_devices.iter().any(|d| Device::same_device(d, &device)) {
+                        cuda_devices.push(device.clone()); // first time this device is seen
+                    }
                     Block::load(
                         vb.pp(&format!("model.layers.{i}")),
                         cfg,
@@ -527,12 +552,21 @@ impl Llama {
                 })
                 .collect();
 
+        // Build one KV cache per device (`num_devices` entries). Each is constructed exactly
+        // like the single `kv_cache` this replaces: `cfg.num_hidden_layers` as the first
+        // argument and `false` as the second.
+        let kv_caches: Vec<crate::pipeline::Cache> = (0..num_devices)
+            .map(|_| crate::pipeline::Cache::new(cfg.num_hidden_layers, false))
+            .collect();
+
         Ok(Self {
             wte,
             blocks,
             ln_f,
             lm_head: QMatMul::Tensor(lm_head.weight().clone()),
-            kv_cache: crate::pipeline::Cache::new(cfg.num_hidden_layers, false),
+            // `kv_caches` replaces the former single `kv_cache` field.
+            kv_caches,
+            cuda_devices,
             device: normal_loading_metadata.real_device,
             mapper,
             cfg: ModelConfigMetadata {
@@ -623,7 +657,8 @@ impl NormalModel for Llama {
         unimplemented!()
     }
     fn cache(&self) -> &crate::pipeline::Cache {
-        &self.kv_cache
+        // Only the first entry of `kv_caches` is exposed through this accessor.
+        &self.kv_caches[0]
     }
     fn device(&self) -> &Device {
         &self.device