EricLBuehler · guoqingbao · Jul 8, 2024 · Jul 8, 2024 · Jul 8, 2024
diff --git a/README.md b/README.md
@@ -133,7 +133,7 @@ For model-specific help, run `cargo run -- --port 2000 <MODEL_TYPE> --help`
 
 For local model weights, run `cargo run --release -- --port 2000 --weight-path /home/llama2_7b/ llama --repeat-last-n 64`, change the path when needed.
 
-`MODEL_TYPE` = ["llama", "phi3", "qwen2"]
+`MODEL_TYPE` = ["llama", "phi3", "qwen2", "gemma"]
 
 `WEIGHT_FILE_PATH` = Corresponding weight path for the given model type
 

diff --git a/src/openai/models/gemma.rs b/src/openai/models/gemma.rs
@@ -75,17 +75,17 @@ struct RotaryEmbedding {
 }
 
 impl RotaryEmbedding {
-    fn new(dtype: DType, cfg: &Config, dev: &Device) -> Result<Self> {
+    fn new(_dtype: DType, cfg: &Config, dev: &Device) -> Result<Self> {
         let dim = cfg.hidden_size / cfg.num_attention_heads;
         let max_seq_len = cfg.max_seq_len;
         let inv_freq: Vec<_> = (0..dim)
             .step_by(2)
             .map(|i| 1f32 / cfg.rope_theta.powf(i as f64 / dim as f64) as f32)
             .collect();
         let inv_freq_len = inv_freq.len();
-        let inv_freq = Tensor::from_vec(inv_freq, (1, inv_freq_len), dev)?.to_dtype(dtype)?;
+        let inv_freq = Tensor::from_vec(inv_freq, (1, inv_freq_len), dev)?.to_dtype(DType::F32)?;
         let t = Tensor::arange(0u32, max_seq_len as u32, dev)?
-            .to_dtype(dtype)?
+            .to_dtype(DType::F32)?
             .reshape((max_seq_len, 1))?;
         let freqs = t.matmul(&inv_freq)?;
         Ok(Self {
@@ -218,12 +218,17 @@ impl Attention {
             let v = value_states
                 .reshape((b_sz, seq_len, self.num_kv_heads, self.head_dim))?
                 .transpose(1, 2)?;
-            (q.contiguous()?, k.contiguous()?, v.contiguous()?)
+            (q, k, v.contiguous()?)
         };
 
-        let (q, k) = self
-            .rotary_emb
-            .apply_rotary_emb_qkv(&q, &k, seqlen_offset)?;
+        let (q, k) = self.rotary_emb.apply_rotary_emb_qkv(
+            &q.to_dtype(DType::F32)?,
+            &k.to_dtype(DType::F32)?,
+            seqlen_offset,
+        )?;
+
+        let q = q.to_dtype(v.dtype())?;
+        let k = k.to_dtype(v.dtype())?;
 
         // No need repeat_kv since we performed broadcasted matmul in the prefiling stage
         // while, the decoding stage used paged-attention which also does not need kv stacking (to match query dim)

diff --git a/src/openai/models/llama.rs b/src/openai/models/llama.rs
@@ -157,13 +157,6 @@ impl CausalSelfAttention {
         Ok(y)
     }
 
-    fn repeat_kv(&self, x: Tensor) -> Result<Tensor> {
-        candle_transformers::utils::repeat_kv(
-            x,
-            self.num_attention_heads / self.num_key_value_heads,
-        )
-    }
-
     fn load(vb: VarBuilder, cfg: &Config, dtype: DType, device: &Device) -> Result<Self> {
         let span = tracing::span!(tracing::Level::TRACE, "attn");
         let span_rot = tracing::span!(tracing::Level::TRACE, "attn-rot");

diff --git a/src/openai/models/phi3.rs b/src/openai/models/phi3.rs
@@ -205,7 +205,6 @@ struct Attention {
     o_proj: Linear,
     num_heads: usize,
     num_kv_heads: usize,
-    num_kv_groups: usize,
     hidden_size: usize,
     head_dim: usize,
     rotary_emb: Arc<RotaryEmbedding>,
@@ -226,7 +225,6 @@ impl Attention {
             rotary_emb,
             num_heads,
             num_kv_heads,
-            num_kv_groups: num_heads / num_kv_heads,
             head_dim,
             hidden_size: cfg.hidden_size,
             attn: PagedAttention::new(

diff --git a/src/openai/models/qwen2.rs b/src/openai/models/qwen2.rs
@@ -134,7 +134,6 @@ struct Attention {
     o_proj: Linear,
     num_heads: usize,
     num_kv_heads: usize,
-    num_kv_groups: usize,
     head_dim: usize,
     hidden_size: usize,
     rotary_emb: Arc<RotaryEmbedding>,
@@ -146,7 +145,6 @@ impl Attention {
         let hidden_sz = cfg.hidden_size;
         let num_heads = cfg.num_attention_heads;
         let num_kv_heads = cfg.num_key_value_heads;
-        let num_kv_groups = num_heads / num_kv_heads;
         let head_dim = hidden_sz / num_heads;
         let q_proj = linear(hidden_sz, num_heads * head_dim, vb.pp("q_proj"))?;
         let k_proj = linear(hidden_sz, num_kv_heads * head_dim, vb.pp("k_proj"))?;
@@ -159,7 +157,6 @@ impl Attention {
             o_proj,
             num_heads,
             num_kv_heads,
-            num_kv_groups,
             head_dim,
             hidden_size: hidden_sz,
             rotary_emb,

diff --git a/src/paged_attention/mod.rs b/src/paged_attention/mod.rs
@@ -84,25 +84,24 @@ impl PagedAttention {
             None => None,
             Some(mask) => {
                 let att = if key_value_heads != attention_heads {
-                    (query.matmul(&key.t()?.broadcast_as((
-                        batch_size,
-                        attention_heads,
-                        head_size,
-                        seq_len,
-                    ))?)?
-                        * self.scale as f64)?
+                    (query.matmul(
+                        &key.t()?
+                            .broadcast_as((batch_size, attention_heads, head_size, seq_len))?
+                            .contiguous()?,
+                    )? * self.scale as f64)?
                 } else {
                     (query.matmul(&key.t()?)? * self.scale as f64)?
                 };
                 let att = att.broadcast_add(mask)?;
                 let att = candle_nn::ops::softmax_last_dim(&att)?;
                 if key_value_heads != attention_heads {
-                    Some(att.matmul(&value.broadcast_as((
-                        batch_size,
-                        attention_heads,
-                        seq_len,
-                        head_size,
-                    ))?)?)
+                    Some(
+                        att.matmul(
+                            &value
+                                .broadcast_as((batch_size, attention_heads, seq_len, head_size))?
+                                .contiguous()?,
+                        )?,
+                    )
                 } else {
                     Some(att.matmul(&value)?)
                 }