diff --git a/docs/ANYMOE.md b/docs/ANYMOE.md index fbe5bfab4..fb4206c69 100644 --- a/docs/ANYMOE.md +++ b/docs/ANYMOE.md @@ -171,7 +171,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/docs/IDEFICS2.md b/docs/IDEFICS2.md index b7798036a..ad6089eca 100644 --- a/docs/IDEFICS2.md +++ b/docs/IDEFICS2.md @@ -113,7 +113,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = VisionLoaderBuilder::new( VisionSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/docs/LLaVA.md b/docs/LLaVA.md index 47ff661a3..a8a0cefa4 100644 --- a/docs/LLaVA.md +++ b/docs/LLaVA.md @@ -110,7 +110,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = VisionLoaderBuilder::new( VisionSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/docs/PAGED_ATTENTION.md b/docs/PAGED_ATTENTION.md index 8ee028e4c..f61dc0808 100644 --- a/docs/PAGED_ATTENTION.md +++ b/docs/PAGED_ATTENTION.md @@ -68,7 +68,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/docs/PHI3V.md b/docs/PHI3V.md index 0a535cf23..376419922 100644 --- a/docs/PHI3V.md +++ b/docs/PHI3V.md @@ -121,7 +121,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = VisionLoaderBuilder::new( VisionSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs-core/src/engine/mod.rs b/mistralrs-core/src/engine/mod.rs index 912434536..a622a02b1 100644 --- a/mistralrs-core/src/engine/mod.rs +++ b/mistralrs-core/src/engine/mod.rs @@ -1,5 +1,4 @@ use std::{ - collections::HashMap, sync::{ atomic::{AtomicBool, Ordering}, Arc, @@ -19,7 +18,6 @@ use crate::{ scheduler::{Scheduler, SchedulerOutput}, CompletionResponse, RequestMessage, Response, SchedulerConfig, DEBUG, }; -use candle_core::{Device, Result, Tensor}; use rand::SeedableRng; use rand_isaac::Isaac64Rng; use tracing::{info, warn}; @@ -430,26 +428,6 @@ impl Engine { Ok(recognizer) } - fn alloc_logits_bias(&self, logits_bias: Option<HashMap<u32, f32>>) -> Result<Option<Tensor>> { - let tokenizer = get_mut_arcmutex!(self.pipeline).tokenizer(); - let vocab_size = tokenizer.get_vocab_size(true); - - match logits_bias { - Some(bias) => { - let mut logits_bias = vec![0.0; vocab_size]; - for (k, v) in bias { - logits_bias[k as usize] = v; - } - Ok(Some(Tensor::from_vec( - logits_bias, - vocab_size, - &Device::Cpu, - )?)) - } - None => Ok(None), - } - } - async fn handle_request(&mut self, request: Request) { match request { Request::ActivateAdapters(adapters) => { @@ -644,19 +622,6 @@ impl Engine { .duration_since(UNIX_EPOCH) .expect("Time travel has occurred!"); - let logits_bias = match self.alloc_logits_bias(request.sampling_params.logits_bias) { - Ok(logits_bias) => logits_bias, - Err(err) => { - request - .response - .send(Response::ValidationError( - format!("Failed creation of logits bias. 
{}", err).into(), - )) - .await - .expect("Expected receiver."); - return; - } - }; let tokenizer = get_mut_arcmutex!(self.pipeline).tokenizer(); let sampler = Sampler::new( @@ -665,7 +630,6 @@ impl Engine { tokenizer, request.sampling_params.frequency_penalty, request.sampling_params.presence_penalty, - logits_bias, topk, topp, minp, @@ -703,6 +667,7 @@ impl Engine { .cache_config .clone() .map(|conf| conf.block_size); + let trie = (*get_mut_arcmutex!(self.pipeline).get_metadata().tok_trie).clone(); let seq = Sequence::new_waiting( prompt.clone(), self.id, @@ -733,6 +698,7 @@ impl Engine { request.adapters.clone(), images.clone(), block_size, + trie, ); let seq = if let Some(prefill_cache) = prefill_cache.clone() { seq.prefill( diff --git a/mistralrs-core/src/lib.rs b/mistralrs-core/src/lib.rs index ca7bffd53..31ebaec58 100644 --- a/mistralrs-core/src/lib.rs +++ b/mistralrs-core/src/lib.rs @@ -62,13 +62,12 @@ pub use device_map::{DeviceLayerMapMetadata, DeviceMapMetadata, LayerDeviceMappe pub use paged_attention::PagedAttentionConfig; pub use pipeline::{ chat_template::ChatTemplate, AnyMoeLoader, AnyMoePipeline, GGMLLoader, GGMLLoaderBuilder, - GGMLSpecificConfig, GGUFArchitecture, GGUFLoader, GGUFLoaderBuilder, GGUFSpecificConfig, - GemmaLoader, Idefics2Loader, LLaVALoader, LLaVANextLoader, LlamaLoader, Loader, - LocalModelPaths, MistralLoader, MixtralLoader, ModelKind, ModelPaths, NormalLoader, - NormalLoaderBuilder, NormalLoaderType, NormalSpecificConfig, Phi2Loader, Phi3Loader, - Phi3VLoader, Qwen2Loader, SpeculativeConfig, SpeculativeLoader, SpeculativePipeline, - Starcoder2Loader, TokenSource, VisionLoader, VisionLoaderBuilder, VisionLoaderType, - VisionModelLoader, VisionSpecificConfig, + GGMLSpecificConfig, GGUFArchitecture, GGUFLoader, GGUFLoaderBuilder, GemmaLoader, + Idefics2Loader, LLaVALoader, LLaVANextLoader, LlamaLoader, Loader, LocalModelPaths, + MistralLoader, MixtralLoader, ModelKind, ModelPaths, NormalLoader, NormalLoaderBuilder, + NormalLoaderType, NormalSpecificConfig, Phi2Loader, Phi3Loader, Phi3VLoader, Qwen2Loader, + SpeculativeConfig, SpeculativeLoader, SpeculativePipeline, Starcoder2Loader, TokenSource, + VisionLoader, VisionLoaderBuilder, VisionLoaderType, VisionModelLoader, VisionSpecificConfig, }; pub use request::{Constraint, MessageContent, NormalRequest, Request, RequestMessage}; pub use response::Response; diff --git a/mistralrs-core/src/model_loader.rs b/mistralrs-core/src/model_loader.rs index 1bd5994b4..84ad38c6d 100644 --- a/mistralrs-core/src/model_loader.rs +++ b/mistralrs-core/src/model_loader.rs @@ -2,10 +2,7 @@ use std::fs::{self, File}; use crate::{ get_toml_selected_model_dtype, - pipeline::{ - GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, GGUFSpecificConfig, - NormalSpecificConfig, - }, + pipeline::{GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, NormalSpecificConfig}, Loader, ModelDType, ModelSelected, NormalLoaderBuilder, TomlLoaderArgs, TomlSelector, VisionLoaderBuilder, VisionSpecificConfig, }; @@ -110,15 +107,11 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, args.chat_template, tokenizer_json, Some(model_id), @@ -127,17 +120,13 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, args.chat_template, 
tokenizer_json, model_id, @@ -156,15 +145,11 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, args.chat_template, tokenizer_json, model_id, @@ -181,9 +166,7 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n }, args.chat_template, tok_model_id, quantized_model_id, @@ -194,12 +177,10 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n }, args.chat_template, tok_model_id, quantized_model_id, @@ -219,11 +200,9 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n }, args.chat_template, tok_model_id, quantized_model_id, @@ -242,10 +221,9 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result GGMLLoaderBuilder::new( - GGMLSpecificConfig { repeat_last_n, gqa }, + GGMLSpecificConfig { gqa }, args.chat_template, tokenizer_json, Some(tok_model_id), @@ -258,13 +236,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result GGMLLoaderBuilder::new( - GGMLSpecificConfig { repeat_last_n, gqa }, + GGMLSpecificConfig { gqa }, args.chat_template, tokenizer_json, tok_model_id, @@ -286,12 +263,11 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result GGMLLoaderBuilder::new( - GGMLSpecificConfig { repeat_last_n, gqa }, + GGMLSpecificConfig { gqa }, args.chat_template, tokenizer_json, tok_model_id, @@ -308,15 +284,11 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result VisionLoaderBuilder::new( - VisionSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + VisionSpecificConfig { use_flash_attn }, args.chat_template, tokenizer_json, Some(model_id), diff --git a/mistralrs-core/src/model_selected.rs b/mistralrs-core/src/model_selected.rs index 1a6e06bbf..32dd350ad 100644 --- a/mistralrs-core/src/model_selected.rs +++ b/mistralrs-core/src/model_selected.rs @@ -36,10 +36,6 @@ pub enum ModelSelected { #[arg(short, long)] tokenizer_json: Option, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// The architecture of the model. #[arg(short, long, value_parser = parse_arch)] arch: NormalLoaderType, @@ -63,10 +59,6 @@ pub enum ModelSelected { #[arg(short, long)] xlora_model_id: String, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// Ordering JSON file #[arg(short, long)] order: String, @@ -99,10 +91,6 @@ pub enum ModelSelected { #[arg(short, long)] adapters_model_id: String, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// Ordering JSON file #[arg(short, long)] order: String, @@ -132,10 +120,6 @@ pub enum ModelSelected { /// Quantized filename, only applicable if `quantized` is set. #[arg(short = 'f', long)] quantized_filename: String, - - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, }, /// Select a GGUF model with X-LoRA. 
@@ -155,10 +139,6 @@ pub enum ModelSelected { #[arg(short = 'f', long)] quantized_filename: String, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// Model ID to load X-LoRA from. This may be a HF hub repo or a local path. #[arg(short, long)] xlora_model_id: String, @@ -190,10 +170,6 @@ pub enum ModelSelected { #[arg(short = 'f', long)] quantized_filename: String, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// Model ID to load LoRA from. This may be a HF hub repo or a local path. #[arg(short, long)] adapters_model_id: String, @@ -222,10 +198,6 @@ pub enum ModelSelected { #[arg(short = 'f', long)] quantized_filename: String, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// GQA value #[arg(short, long, default_value_t = 1)] gqa: usize, @@ -250,10 +222,6 @@ pub enum ModelSelected { #[arg(short = 'f', long)] quantized_filename: String, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// Model ID to load X-LoRA from. This may be a HF hub repo or a local path. #[arg(short, long)] xlora_model_id: String, @@ -291,10 +259,6 @@ pub enum ModelSelected { #[arg(short = 'f', long)] quantized_filename: String, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// Model ID to load LoRA from. This may be a HF hub repo or a local path. #[arg(short, long)] adapters_model_id: String, @@ -318,10 +282,6 @@ pub enum ModelSelected { #[arg(short, long)] tokenizer_json: Option, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// The architecture of the model. #[arg(short, long, value_parser = parse_vision_arch)] arch: VisionLoaderType, diff --git a/mistralrs-core/src/pipeline/amoe.rs b/mistralrs-core/src/pipeline/amoe.rs index f6f4adb40..3f02e6cb8 100644 --- a/mistralrs-core/src/pipeline/amoe.rs +++ b/mistralrs-core/src/pipeline/amoe.rs @@ -19,6 +19,7 @@ use rand_isaac::Isaac64Rng; use tracing::{info, warn}; use crate::{ + aici::toktree::TokTrie, amoe::{AnyMoeConfig, AnyMoeTrainingInputRow, AnyMoeTrainingInputs, AnyMoeTrainingResult}, get_mut_arcmutex, prefix_cacher::PrefixCacheManager, @@ -352,8 +353,7 @@ impl AnyMoePipelineMixin for AnyMoePipeline { // Create several dummy objects for the sequences. 
let (dummy_sender, _) = tokio::sync::mpsc::channel(10000); - let dummy_sampler = - Sampler::new(None, 0, tokenizer.clone(), None, None, None, -1, 0.0, 0.0); + let dummy_sampler = Sampler::new(None, 0, tokenizer.clone(), None, None, -1, 0.0, 0.0); let dummy_group = Arc::new(tokio::sync::Mutex::new(SequenceGroup::new( 1, false, false, 0, @@ -425,6 +425,7 @@ impl AnyMoePipelineMixin for AnyMoePipeline { dummy_sampler.clone(), dummy_group.clone(), images, + (*self.get_metadata().tok_trie).clone(), )); } let mut input_seqs = seqs.iter_mut().collect::<Vec<_>>(); @@ -539,6 +540,7 @@ fn new_dummy_seq( dummy_sampler: Sampler, dummy_group: Arc<tokio::sync::Mutex<SequenceGroup>>, images: Option<Vec<DynamicImage>>, + trie: TokTrie, ) -> Sequence { Sequence::new_waiting( tokens, @@ -561,5 +563,6 @@ fn new_dummy_seq( None, images, None, // TODO incorrect for PagedAttention + trie, ) } diff --git a/mistralrs-core/src/pipeline/ggml.rs b/mistralrs-core/src/pipeline/ggml.rs index 42f95951c..fa5e656e2 100644 --- a/mistralrs-core/src/pipeline/ggml.rs +++ b/mistralrs-core/src/pipeline/ggml.rs @@ -76,7 +76,6 @@ pub struct GGMLLoader { #[derive(Clone, Copy, Default)] /// Config for a GGML loader. pub struct GGMLSpecificConfig { - pub repeat_last_n: usize, pub gqa: usize, } @@ -341,7 +340,6 @@ impl Loader for GGMLLoader { }), metadata: Arc::new(GeneralMetadata { max_seq_len, - repeat_last_n: self.config.repeat_last_n, tok_trie, has_no_kv_cache: self.no_kv_cache, num_hidden_layers, diff --git a/mistralrs-core/src/pipeline/gguf.rs b/mistralrs-core/src/pipeline/gguf.rs index 21a5267c9..8379b94e8 100644 --- a/mistralrs-core/src/pipeline/gguf.rs +++ b/mistralrs-core/src/pipeline/gguf.rs @@ -81,7 +81,6 @@ pub struct GGUFPipeline { /// Loader for a GGUF model. pub struct GGUFLoader { model_id: Option<String>, - config: GGUFSpecificConfig, quantized_model_id: String, quantized_filename: String, xlora_model_id: Option<String>, @@ -120,17 +119,10 @@ impl GGUFArchitecture { } } -#[derive(Clone, Copy, Default)] -/// A config for a GGUF loader. -pub struct GGUFSpecificConfig { - pub repeat_last_n: usize, -} - #[derive(Default)] /// A builder for a GGUF loader. pub struct GGUFLoaderBuilder { model_id: Option<String>, - config: GGUFSpecificConfig, quantized_model_id: String, quantized_filename: String, xlora_model_id: Option<String>, @@ -146,7 +138,6 @@ impl GGUFLoaderBuilder { /// `tokenizer_config.json` file. If the `chat_template` is specified, then it will be treated as a /// path and used over remote files, removing all remote accesses.
pub fn new( - config: GGUFSpecificConfig, chat_template: Option<String>, tok_model_id: Option<String>, quantized_model_id: String, @@ -157,7 +148,6 @@ impl GGUFLoaderBuilder { }; Self { - config, chat_template, model_id: tok_model_id, kind, @@ -216,7 +206,6 @@ impl GGUFLoaderBuilder { pub fn build(self) -> Box<dyn Loader> { Box::new(GGUFLoader { model_id: self.model_id, - config: self.config, xlora_model_id: self.xlora_model_id, kind: self.kind, xlora_order: self.xlora_order, @@ -233,7 +222,6 @@ impl GGUFLoader { #[allow(clippy::too_many_arguments)] pub fn new( model_id: Option<String>, - config: GGUFSpecificConfig, quantized_model_id: String, quantized_filename: String, xlora_model_id: Option<String>, @@ -256,7 +244,6 @@ impl GGUFLoader { }; Self { model_id, - config, quantized_model_id, quantized_filename, xlora_model_id, @@ -578,7 +565,6 @@ impl Loader for GGUFLoader { }), metadata: Arc::new(GeneralMetadata { max_seq_len, - repeat_last_n: self.config.repeat_last_n, tok_trie, has_no_kv_cache: self.no_kv_cache, num_hidden_layers, diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs index 9fb3ce708..848eb0458 100644 --- a/mistralrs-core/src/pipeline/mod.rs +++ b/mistralrs-core/src/pipeline/mod.rs @@ -28,7 +28,7 @@ use candle_core::quantized::GgmlDType; use chat_template::ChatTemplate; use core::fmt; pub use ggml::{GGMLLoader, GGMLLoaderBuilder, GGMLSpecificConfig}; -pub use gguf::{GGUFArchitecture, GGUFLoader, GGUFLoaderBuilder, GGUFSpecificConfig}; +pub use gguf::{GGUFArchitecture, GGUFLoader, GGUFLoaderBuilder}; pub use isq::IsqModel; pub use normal::{NormalLoader, NormalLoaderBuilder, NormalSpecificConfig}; pub use normal_loaders::{ @@ -433,7 +433,6 @@ pub trait Loader { pub struct GeneralMetadata { pub max_seq_len: usize, - pub repeat_last_n: usize, pub tok_trie: Arc<TokTrie>, pub has_no_kv_cache: bool, pub num_hidden_layers: usize, diff --git a/mistralrs-core/src/pipeline/normal.rs b/mistralrs-core/src/pipeline/normal.rs index edc18b0a0..a0c5ea1e3 100644 --- a/mistralrs-core/src/pipeline/normal.rs +++ b/mistralrs-core/src/pipeline/normal.rs @@ -89,7 +89,6 @@ pub struct NormalLoaderBuilder { /// Config specific to loading a normal model.
pub struct NormalSpecificConfig { pub use_flash_attn: bool, - pub repeat_last_n: usize, } impl NormalLoaderBuilder { @@ -355,7 +354,6 @@ impl Loader for NormalLoader { model_id: self.model_id.clone(), metadata: Arc::new(GeneralMetadata { max_seq_len, - repeat_last_n: self.config.repeat_last_n, tok_trie, has_no_kv_cache: self.no_kv_cache, num_hidden_layers, diff --git a/mistralrs-core/src/pipeline/sampling.rs b/mistralrs-core/src/pipeline/sampling.rs index b9bee4898..7c33d3e40 100644 --- a/mistralrs-core/src/pipeline/sampling.rs +++ b/mistralrs-core/src/pipeline/sampling.rs @@ -4,7 +4,6 @@ use candle_core::{DType, Device, Result, Tensor}; use rand_isaac::Isaac64Rng; use crate::{ - aici::toktree::TokTrie, get_bias_if_not_allowed, prefix_cacher::PrefixCacheManager, sampler::Logprobs, @@ -238,8 +237,6 @@ pub async fn sample_and_add_toks( logits_per_seq, seq, return_logprobs, - this.get_metadata().repeat_last_n, - this.get_metadata().tok_trie.clone(), rng.clone(), use_async_pool, true, // Append result to trie @@ -271,18 +268,15 @@ pub async fn sample_sequence( logits: Tensor, seq: &mut Sequence, return_logprobs: bool, - repeat_last_n: usize, - tok_trie: Arc<TokTrie>, rng: Arc<std::sync::Mutex<Isaac64Rng>>, use_async_pool: bool, add_to_trie: bool, sample_speculative: bool, ) -> Result<Logprobs> { let logits = logits.squeeze(0)?.squeeze(0)?.to_dtype(DType::F32)?; - let start_at = seq.get_toks().len().saturating_sub(repeat_last_n); let sampler = seq.sampler(); - let ctx_clone = seq.get_toks()[start_at..].to_vec(); + let ctx_clone = seq.get_toks()[seq.prompt_tokens()..].to_vec(); let rng_clone = rng.clone(); let logits_clone = logits.clone(); let first_lobprobs_response = if use_async_pool { @@ -308,20 +302,20 @@ pub async fn sample_sequence( let bias_if_not_allowed = match &mut seq.recognizer { SequenceRecognizer::Regex(ref mut rx) => { - get_bias_if_not_allowed!(tok_trie, rx.as_mut(), first_lobprobs_response.token) + get_bias_if_not_allowed!(seq.tok_trie, rx.as_mut(), first_lobprobs_response.token) } SequenceRecognizer::Cfg(ref mut cfg) => { - get_bias_if_not_allowed!(tok_trie, cfg.as_mut(), first_lobprobs_response.token) + get_bias_if_not_allowed!(seq.tok_trie, cfg.as_mut(), first_lobprobs_response.token) } SequenceRecognizer::None => None, }; let second_logprobs_response = match bias_if_not_allowed { Some(token_set) => { - let mut acc = vec![-f32::INFINITY; tok_trie.vocab_size()]; + let mut acc = vec![-f32::INFINITY; seq.tok_trie.vocab_size()]; token_set.apply_to(&mut acc); let new_logits = (logits + Tensor::from_slice(&acc, acc.len(), &Device::Cpu)?)?; - let ctx_clone = seq.get_toks()[start_at..].to_vec(); + let ctx_clone = seq.get_toks()[seq.prompt_tokens()..].to_vec(); let rng_clone = rng.clone(); let sampler = seq.sampler(); if use_async_pool { @@ -351,12 +345,12 @@ pub async fn sample_sequence( if add_to_trie { match seq.recognizer { SequenceRecognizer::Regex(ref mut rx) => { - tok_trie + seq.tok_trie .append_token(rx.as_mut(), second_logprobs_response.token) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; } SequenceRecognizer::Cfg(ref mut cfg) => { - tok_trie + seq.tok_trie .append_token(cfg.as_mut(), second_logprobs_response.token) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; } @@ -376,8 +370,6 @@ pub async fn sample_target_sequence_speculative( logits: Tensor, seq: &mut Sequence, return_logprobs: bool, - repeat_last_n: usize, - tok_trie: Arc<TokTrie>, rng: Arc<std::sync::Mutex<Isaac64Rng>>, n_toks: usize, ) -> Result<Vec<Logprobs>> { @@ -388,8 +380,6 @@ pub async fn sample_target_sequence_speculative( chunk, seq, return_logprobs, - repeat_last_n, - tok_trie.clone(),
rng.clone(), true, // TODO(EricLBuehler): does this hurt perf? false, // Do not append to trie (yet) diff --git a/mistralrs-core/src/pipeline/speculative.rs b/mistralrs-core/src/pipeline/speculative.rs index 7b543be10..be221ca14 100644 --- a/mistralrs-core/src/pipeline/speculative.rs +++ b/mistralrs-core/src/pipeline/speculative.rs @@ -374,7 +374,6 @@ impl Pipeline for SpeculativePipeline { // ======================= Run draft model gamma times producing tokens ============================ // ======================= Sample the `gamma` logits. ============================ let mut draft_samples = Vec::new(); - let repeat_last_n = get_mut_arcmutex!(self.draft).get_metadata().repeat_last_n; for i in 0..self.gamma { let is_xlora = get_mut_arcmutex!(self.draft).get_metadata().is_xlora; let device = get_mut_arcmutex!(self.draft).device(); @@ -401,11 +400,6 @@ impl Pipeline for SpeculativePipeline { logits.clone(), seq, seq.return_logprobs(), - repeat_last_n, - get_mut_arcmutex!(self.draft) - .get_metadata() - .tok_trie - .clone(), rng.clone(), false, // todo tune false, // do not add to tok trie yet @@ -471,11 +465,6 @@ impl Pipeline for SpeculativePipeline { logits.clone(), seq, seq.return_logprobs(), - repeat_last_n, - get_mut_arcmutex!(self.draft) - .get_metadata() - .tok_trie - .clone(), rng.clone(), self.gamma, ) @@ -579,11 +568,6 @@ impl Pipeline for SpeculativePipeline { logits.clone(), seq, seq.return_logprobs(), - repeat_last_n, - get_mut_arcmutex!(self.draft) - .get_metadata() - .tok_trie - .clone(), rng.clone(), false, // todo tune true, // do not add to tok trie yet diff --git a/mistralrs-core/src/pipeline/vision.rs b/mistralrs-core/src/pipeline/vision.rs index 6eca26ac9..a01b17d6b 100644 --- a/mistralrs-core/src/pipeline/vision.rs +++ b/mistralrs-core/src/pipeline/vision.rs @@ -77,7 +77,6 @@ pub struct VisionLoaderBuilder { /// Config specific to loading a vision model. 
pub struct VisionSpecificConfig { pub use_flash_attn: bool, - pub repeat_last_n: usize, } impl VisionLoaderBuilder { @@ -270,7 +269,6 @@ impl Loader for VisionLoader { model_id: self.model_id.clone(), metadata: Arc::new(GeneralMetadata { max_seq_len, - repeat_last_n: self.config.repeat_last_n, tok_trie, is_xlora: false, num_hidden_layers, diff --git a/mistralrs-core/src/sampler.rs b/mistralrs-core/src/sampler.rs index 15abd25e0..d1894f91f 100644 --- a/mistralrs-core/src/sampler.rs +++ b/mistralrs-core/src/sampler.rs @@ -64,7 +64,6 @@ pub struct Sampler { tokenizer: Arc<Tokenizer>, frequency_penalty: Option<f32>, presence_penalty: Option<f32>, - logits_bias: Option<Tensor>, top_k: i64, top_p: f64, min_p: f64, @@ -100,7 +99,6 @@ impl Sampler { tokenizer: Arc<Tokenizer>, frequency_penalty: Option<f32>, presence_penalty: Option<f32>, - logits_bias: Option<Tensor>, top_k: i64, top_p: f64, min_p: f64, @@ -116,7 +114,6 @@ impl Sampler { tokenizer, frequency_penalty, presence_penalty, - logits_bias, top_k, top_p, min_p, @@ -400,10 +397,6 @@ impl Sampler { sample_speculative: bool, ) -> Result<Logprobs> { let logits = self.apply_penalties(logits.to_vec1()?, penalty_ctxt)?; - let logits = match self.logits_bias { - Some(ref bias) => (logits + bias)?, - None => logits, - }; let next_token = if sample_speculative { match self.temperature { None => self.sample_speculative_top_kp_min_p( @@ -475,17 +468,7 @@ mod tests { use std::sync::Arc; use std::sync::Mutex; - let sampler = Sampler::new( - None, - 10, - get_tokenizer().into(), - None, - None, - None, - 32, - 0.1, - 0.05, - ); + let sampler = Sampler::new(None, 10, get_tokenizer().into(), None, None, 32, 0.1, 0.05); let logits = Tensor::arange(0f32, 1024f32, &Device::Cpu).unwrap(); let rng = Arc::new(Mutex::new(Isaac64Rng::seed_from_u64(42))); let res = sampler.sample(logits, None, false, rng, false).unwrap(); @@ -503,17 +486,7 @@ mod tests { use std::sync::Arc; use std::sync::Mutex; - let sampler = Sampler::new( - None, - 10, - get_tokenizer().into(), - None, - None, - None, - 32, - 0.1, - 0.05, - ); + let sampler = Sampler::new(None, 10, get_tokenizer().into(), None, None, 32, 0.1, 0.05); let logits = Tensor::arange(0f32, 1024f32, &Device::Cpu).unwrap(); let rng = Arc::new(Mutex::new(Isaac64Rng::seed_from_u64(42))); let res = sampler.sample(logits, None, false, rng, true).unwrap(); diff --git a/mistralrs-core/src/sequence.rs b/mistralrs-core/src/sequence.rs index 378e86e8e..93bc69b1d 100644 --- a/mistralrs-core/src/sequence.rs +++ b/mistralrs-core/src/sequence.rs @@ -9,7 +9,7 @@ use tokio::sync::{ }; use crate::{ - aici::{cfg::CfgParser, recognizer::StackRecognizer, rx::RecRx}, + aici::{cfg::CfgParser, recognizer::StackRecognizer, rx::RecRx, toktree::TokTrie}, paged_attention::{BlockEngineSequence, LogicalTokenBlock}, response::CompletionChoice, CompletionChunkChoice, CompletionChunkResponse, CompletionResponse, @@ -160,6 +160,7 @@ pub struct Sequence { prefix: Option, is_tmp: bool, adapters: Option<Vec<String>>, + pub(crate) tok_trie: TokTrie, // Cache scaling_cache: Option<Tensor>, @@ -242,6 +243,7 @@ impl Sequence { input_images: Option<Vec<image::DynamicImage>>, // Paged attention block_size: Option<usize>, + tok_trie: TokTrie, ) -> Self { let prompt_len = tokens.len(); let mut custom_metadata = if let Some(block_size) = block_size { @@ -295,6 +297,7 @@ impl Sequence { adapters, input_images, custom_metadata, + tok_trie, } } diff --git a/mistralrs-core/src/toml_selector.rs b/mistralrs-core/src/toml_selector.rs index 90d195d64..0ffcee93a 100644 --- a/mistralrs-core/src/toml_selector.rs +++ b/mistralrs-core/src/toml_selector.rs @@ -4,15 +4,11 @@ use
serde::Deserialize; use crate::{ amoe::AnyMoeConfig, AnyMoeLoader, GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, - GGUFSpecificConfig, Loader, ModelDType, NormalLoaderBuilder, NormalLoaderType, - NormalSpecificConfig, SpeculativeConfig, SpeculativeLoader, VisionLoaderBuilder, - VisionLoaderType, VisionSpecificConfig, + Loader, ModelDType, NormalLoaderBuilder, NormalLoaderType, NormalSpecificConfig, + SpeculativeConfig, SpeculativeLoader, VisionLoaderBuilder, VisionLoaderType, + VisionSpecificConfig, }; -fn default_repeat_last_n() -> usize { - 64 -} - fn default_one() -> usize { 1 } @@ -263,10 +259,6 @@ pub struct TomlSelector { /// Path to local tokenizer.json file. If this is specified it is used over any remote file. tokenizer_json: Option, - /// Control the application of repeat penalty for the last n tokens - #[serde(default = "default_repeat_last_n")] - repeat_last_n: usize, - /// Selected model model: TomlModelSelected, @@ -283,7 +275,6 @@ struct TomlLoaderInnerParams { chat_template: Option, no_kv_cache: bool, tokenizer_json: Option, - repeat_last_n: usize, } pub struct TomlLoaderArgs { @@ -318,10 +309,7 @@ fn loader_from_selected( arch, dtype: _, } => NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n: args.repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, args.chat_template, args.tokenizer_json, Some(model_id), @@ -335,10 +323,7 @@ fn loader_from_selected( arch, dtype: _, } => NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n: args.repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, args.chat_template, args.tokenizer_json, model_id, @@ -360,10 +345,7 @@ fn loader_from_selected( arch, dtype: _, } => NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n: args.repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, args.chat_template, args.tokenizer_json, model_id, @@ -381,9 +363,6 @@ fn loader_from_selected( quantized_model_id, quantized_filename, } => GGUFLoaderBuilder::new( - GGUFSpecificConfig { - repeat_last_n: args.repeat_last_n, - }, args.chat_template, Some(tok_model_id), quantized_model_id, @@ -398,9 +377,6 @@ fn loader_from_selected( order, tgt_non_granular_index, } => GGUFLoaderBuilder::new( - GGUFSpecificConfig { - repeat_last_n: args.repeat_last_n, - }, args.chat_template, tok_model_id, quantized_model_id, @@ -423,9 +399,6 @@ fn loader_from_selected( adapters_model_id, order, } => GGUFLoaderBuilder::new( - GGUFSpecificConfig { - repeat_last_n: args.repeat_last_n, - }, args.chat_template, tok_model_id, quantized_model_id, @@ -445,10 +418,7 @@ fn loader_from_selected( quantized_filename, gqa, } => GGMLLoaderBuilder::new( - GGMLSpecificConfig { - repeat_last_n: args.repeat_last_n, - gqa, - }, + GGMLSpecificConfig { gqa }, args.chat_template, args.tokenizer_json, Some(tok_model_id), @@ -465,10 +435,7 @@ fn loader_from_selected( tgt_non_granular_index, gqa, } => GGMLLoaderBuilder::new( - GGMLSpecificConfig { - repeat_last_n: args.repeat_last_n, - gqa, - }, + GGMLSpecificConfig { gqa }, args.chat_template, args.tokenizer_json, tok_model_id, @@ -493,10 +460,7 @@ fn loader_from_selected( order, gqa, } => GGMLLoaderBuilder::new( - GGMLSpecificConfig { - repeat_last_n: args.repeat_last_n, - gqa, - }, + GGMLSpecificConfig { gqa }, args.chat_template, args.tokenizer_json, tok_model_id, @@ -516,10 +480,7 @@ fn loader_from_selected( arch, dtype: _, } => VisionLoaderBuilder::new( - VisionSpecificConfig { - use_flash_attn, - repeat_last_n: 
args.repeat_last_n, - }, + VisionSpecificConfig { use_flash_attn }, args.chat_template, args.tokenizer_json, Some(model_id), @@ -538,7 +499,6 @@ impl TryInto> for (TomlSelector, TomlLoaderArgs) { chat_template: args.chat_template, no_kv_cache: args.no_kv_cache, tokenizer_json: selector.tokenizer_json, - repeat_last_n: selector.repeat_last_n, }; let loader = loader_from_selected(args.clone(), selector.model)?; let loader = if let Some(speculative) = selector.speculative { diff --git a/mistralrs-pyo3/API.md b/mistralrs-pyo3/API.md index 63e754860..cdca30e29 100644 --- a/mistralrs-pyo3/API.md +++ b/mistralrs-pyo3/API.md @@ -42,7 +42,6 @@ class Which(Enum): model_id: str arch: Architecture tokenizer_json: str | None = None - repeat_last_n: int = 64 @dataclass class XLora: @@ -51,7 +50,6 @@ class Which(Enum): arch: Architecture model_id: str | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 tgt_non_granular_index: int | None = None @dataclass @@ -61,14 +59,12 @@ class Which(Enum): arch: Architecture model_id: str | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 @dataclass class GGUF: quantized_model_id: str quantized_filename: str tok_model_id: str | None = None - repeat_last_n: int = 64 @dataclass class XLoraGGUF: @@ -77,7 +73,6 @@ class Which(Enum): xlora_model_id: str order: str tok_model_id: str | None = None - repeat_last_n: int = 64 tgt_non_granular_index: int | None = None @dataclass @@ -87,7 +82,6 @@ class Which(Enum): adapters_model_id: str order: str tok_model_id: str | None = None - repeat_last_n: int = 64 @dataclass class GGML: @@ -95,7 +89,6 @@ class Which(Enum): quantized_filename: str tok_model_id: str | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 gqa: int | None = None @dataclass @@ -107,7 +100,6 @@ class Which(Enum): tok_model_id: str | None = None tgt_non_granular_index: int | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 gqa: int | None = None @dataclass @@ -118,14 +110,12 @@ class Which(Enum): order: str tok_model_id: str | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 @dataclass class VisionPlain: model_id: str arch: VisionArchitecture tokenizer_json: str | None = None - repeat_last_n: int = 64 ``` diff --git a/mistralrs-pyo3/mistralrs.pyi b/mistralrs-pyo3/mistralrs.pyi index e827e2824..7d392c433 100644 --- a/mistralrs-pyo3/mistralrs.pyi +++ b/mistralrs-pyo3/mistralrs.pyi @@ -89,7 +89,6 @@ class Which(Enum): model_id: str arch: Architecture tokenizer_json: str | None = None - repeat_last_n: int = 64 @dataclass class XLora: @@ -98,7 +97,6 @@ class Which(Enum): arch: Architecture model_id: str | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 tgt_non_granular_index: int | None = None @dataclass @@ -108,14 +106,12 @@ class Which(Enum): arch: Architecture model_id: str | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 @dataclass class GGUF: quantized_model_id: str quantized_filename: str tok_model_id: str | None = None - repeat_last_n: int = 64 @dataclass class XLoraGGUF: @@ -124,7 +120,6 @@ class Which(Enum): xlora_model_id: str order: str tok_model_id: str | None = None - repeat_last_n: int = 64 tgt_non_granular_index: int | None = None @dataclass @@ -134,7 +129,6 @@ class Which(Enum): adapters_model_id: str order: str tok_model_id: str | None = None - repeat_last_n: int = 64 @dataclass class GGML: @@ -142,7 +136,6 @@ class Which(Enum): quantized_filename: str tok_model_id: str | None = None 
tokenizer_json: str | None = None - repeat_last_n: int = 64 gqa: int | None = None @dataclass @@ -154,7 +147,6 @@ class Which(Enum): tok_model_id: str | None = None tgt_non_granular_index: int | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 gqa: int | None = None @dataclass @@ -165,14 +157,12 @@ class Which(Enum): order: str tok_model_id: str | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 @dataclass class VisionPlain: model_id: str arch: VisionArchitecture tokenizer_json: str | None = None - repeat_last_n: int = 64 class Runner: def __init__( diff --git a/mistralrs-pyo3/src/lib.rs b/mistralrs-pyo3/src/lib.rs index 74478b893..8ff8ad95a 100644 --- a/mistralrs-pyo3/src/lib.rs +++ b/mistralrs-pyo3/src/lib.rs @@ -21,11 +21,11 @@ use candle_core::Device; use mistralrs_core::{ initialize_logging, paged_attn_supported, AnyMoeLoader, ChatCompletionResponse, CompletionResponse, Constraint, DefaultSchedulerMethod, DeviceLayerMapMetadata, - DeviceMapMetadata, GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, - GGUFSpecificConfig, Loader, MistralRs, MistralRsBuilder, ModelDType, NormalLoaderBuilder, - NormalRequest, NormalSpecificConfig, PagedAttentionConfig, Request as _Request, RequestMessage, - Response, SamplingParams, SchedulerConfig, SpeculativeConfig, SpeculativeLoader, StopTokens, - TokenSource, VisionLoaderBuilder, VisionSpecificConfig, + DeviceMapMetadata, GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, Loader, MistralRs, + MistralRsBuilder, ModelDType, NormalLoaderBuilder, NormalRequest, NormalSpecificConfig, + PagedAttentionConfig, Request as _Request, RequestMessage, Response, SamplingParams, + SchedulerConfig, SpeculativeConfig, SpeculativeLoader, StopTokens, TokenSource, + VisionLoaderBuilder, VisionSpecificConfig, }; use pyo3::{ exceptions::{PyTypeError, PyValueError}, @@ -92,14 +92,10 @@ fn parse_which( Ok(match which { Which::Plain { model_id, - repeat_last_n, tokenizer_json, arch, } => NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, chat_template, tokenizer_json, Some(model_id), @@ -108,16 +104,12 @@ fn parse_which( Which::XLora { model_id, xlora_model_id, - repeat_last_n, order, tokenizer_json, tgt_non_granular_index, arch, } => NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, chat_template, tokenizer_json, model_id, @@ -137,14 +129,10 @@ fn parse_which( model_id, tokenizer_json, adapters_model_id, - repeat_last_n, order, arch, } => NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, chat_template, tokenizer_json, model_id, @@ -162,9 +150,7 @@ fn parse_which( tok_model_id, quantized_model_id, quantized_filename, - repeat_last_n, } => GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n }, chat_template, tok_model_id, quantized_model_id, @@ -175,12 +161,10 @@ fn parse_which( tok_model_id, quantized_model_id, quantized_filename, - repeat_last_n, xlora_model_id, order, tgt_non_granular_index, } => GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n }, chat_template, tok_model_id, quantized_model_id, @@ -201,11 +185,9 @@ fn parse_which( tok_model_id, quantized_model_id, quantized_filename, - repeat_last_n, adapters_model_id, order, } => GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n }, chat_template, tok_model_id, quantized_model_id, @@ 
-225,10 +207,9 @@ fn parse_which( tokenizer_json, quantized_model_id, quantized_filename, - repeat_last_n, gqa, } => GGMLLoaderBuilder::new( - GGMLSpecificConfig { repeat_last_n, gqa }, + GGMLSpecificConfig { gqa }, chat_template, tokenizer_json, Some(tok_model_id), @@ -241,13 +222,12 @@ fn parse_which( tokenizer_json, quantized_model_id, quantized_filename, - repeat_last_n, xlora_model_id, order, tgt_non_granular_index, gqa, } => GGMLLoaderBuilder::new( - GGMLSpecificConfig { repeat_last_n, gqa }, + GGMLSpecificConfig { gqa }, chat_template, tokenizer_json, tok_model_id, @@ -270,12 +250,11 @@ fn parse_which( tokenizer_json, quantized_model_id, quantized_filename, - repeat_last_n, adapters_model_id, order, gqa, } => GGMLLoaderBuilder::new( - GGMLSpecificConfig { repeat_last_n, gqa }, + GGMLSpecificConfig { gqa }, chat_template, tokenizer_json, tok_model_id, @@ -293,14 +272,10 @@ fn parse_which( .build(), Which::VisionPlain { model_id, - repeat_last_n, tokenizer_json, arch, } => VisionLoaderBuilder::new( - VisionSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + VisionSpecificConfig { use_flash_attn }, chat_template, tokenizer_json, Some(model_id), diff --git a/mistralrs-pyo3/src/which.rs b/mistralrs-pyo3/src/which.rs index 2081efea8..c78c34e6f 100644 --- a/mistralrs-pyo3/src/which.rs +++ b/mistralrs-pyo3/src/which.rs @@ -57,14 +57,12 @@ pub enum Which { #[pyo3(constructor = ( model_id, arch, - tokenizer_json = None, - repeat_last_n = 64 + tokenizer_json = None ))] Plain { model_id: String, arch: Architecture, tokenizer_json: Option, - repeat_last_n: usize, }, #[pyo3(constructor = ( @@ -73,7 +71,6 @@ pub enum Which { arch, model_id = None, tokenizer_json = None, - repeat_last_n = 64, tgt_non_granular_index = None ))] XLora { @@ -82,7 +79,6 @@ pub enum Which { arch: Architecture, model_id: Option, tokenizer_json: Option, - repeat_last_n: usize, tgt_non_granular_index: Option, }, @@ -91,8 +87,7 @@ pub enum Which { order, arch, model_id = None, - tokenizer_json = None, - repeat_last_n = 64 + tokenizer_json = None ))] Lora { adapters_model_id: String, @@ -100,21 +95,18 @@ pub enum Which { arch: Architecture, model_id: Option, tokenizer_json: Option, - repeat_last_n: usize, }, #[pyo3(constructor = ( quantized_model_id, quantized_filename, tok_model_id = None, - repeat_last_n = 64 ))] #[allow(clippy::upper_case_acronyms)] GGUF { quantized_model_id: String, quantized_filename: String, tok_model_id: Option, - repeat_last_n: usize, }, #[pyo3(constructor = ( @@ -123,7 +115,6 @@ pub enum Which { xlora_model_id, order, tok_model_id = None, - repeat_last_n = 64, tgt_non_granular_index = None, ))] XLoraGGUF { @@ -132,7 +123,6 @@ pub enum Which { xlora_model_id: String, order: String, tok_model_id: Option, - repeat_last_n: usize, tgt_non_granular_index: Option, }, @@ -142,7 +132,6 @@ pub enum Which { adapters_model_id, order, tok_model_id = None, - repeat_last_n = 64 ))] LoraGGUF { quantized_model_id: String, @@ -150,7 +139,6 @@ pub enum Which { adapters_model_id: String, order: String, tok_model_id: Option, - repeat_last_n: usize, }, #[pyo3(constructor = ( @@ -158,7 +146,6 @@ pub enum Which { quantized_filename, tok_model_id, tokenizer_json = None, - repeat_last_n = 64, gqa = 1, ))] #[allow(clippy::upper_case_acronyms)] @@ -167,7 +154,6 @@ pub enum Which { quantized_filename: String, tok_model_id: String, tokenizer_json: Option, - repeat_last_n: usize, gqa: usize, }, @@ -178,7 +164,6 @@ pub enum Which { order, tok_model_id = None, tokenizer_json = None, - repeat_last_n = 64, 
tgt_non_granular_index = None, gqa = 1, ))] @@ -189,7 +174,6 @@ pub enum Which { order: String, tok_model_id: Option, tokenizer_json: Option, - repeat_last_n: usize, tgt_non_granular_index: Option, gqa: usize, }, @@ -201,7 +185,6 @@ pub enum Which { order, tok_model_id = None, tokenizer_json = None, - repeat_last_n = 64, gqa = 1, ))] LoraGGML { @@ -211,7 +194,6 @@ pub enum Which { order: String, tok_model_id: Option, tokenizer_json: Option, - repeat_last_n: usize, gqa: usize, }, @@ -219,12 +201,10 @@ pub enum Which { model_id, arch, tokenizer_json = None, - repeat_last_n = 64 ))] VisionPlain { model_id: String, arch: VisionArchitecture, tokenizer_json: Option, - repeat_last_n: usize, }, } diff --git a/mistralrs/examples/anymoe/main.rs b/mistralrs/examples/anymoe/main.rs index 708b175de..17c27e552 100644 --- a/mistralrs/examples/anymoe/main.rs +++ b/mistralrs/examples/anymoe/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/anymoe_lora/main.rs b/mistralrs/examples/anymoe_lora/main.rs index 8ed21aa45..2a3a2bd2b 100644 --- a/mistralrs/examples/anymoe_lora/main.rs +++ b/mistralrs/examples/anymoe_lora/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/gemma2/main.rs b/mistralrs/examples/gemma2/main.rs index fa03a6978..12b806bab 100644 --- a/mistralrs/examples/gemma2/main.rs +++ b/mistralrs/examples/gemma2/main.rs @@ -26,7 +26,6 @@ fn setup() -> anyhow::Result> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/gguf_locally/main.rs b/mistralrs/examples/gguf_locally/main.rs index 93e0d55e5..bc0456b72 100644 --- a/mistralrs/examples/gguf_locally/main.rs +++ b/mistralrs/examples/gguf_locally/main.rs @@ -4,9 +4,9 @@ use std::sync::Arc; use tokio::sync::mpsc::channel; use mistralrs::{ - Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, GGUFLoaderBuilder, - GGUFSpecificConfig, MistralRs, MistralRsBuilder, ModelDType, NormalRequest, Request, - RequestMessage, Response, Result, SamplingParams, SchedulerConfig, TokenSource, + Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, GGUFLoaderBuilder, MistralRs, + MistralRsBuilder, ModelDType, NormalRequest, Request, RequestMessage, Response, Result, + SamplingParams, SchedulerConfig, TokenSource, }; /// Gets the best device, cpu, cuda if compiled with CUDA @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result> { // chat template from the specified file, and the tokenizer and model from a // local GGUF file at the path `.` let loader = GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n: 64 }, Some("chat_templates/mistral.json".to_string()), None, ".".to_string(), diff --git a/mistralrs/examples/grammar/main.rs b/mistralrs/examples/grammar/main.rs index c864c69f0..33e109588 100644 --- a/mistralrs/examples/grammar/main.rs +++ b/mistralrs/examples/grammar/main.rs @@ -26,7 +26,6 @@ fn setup() -> anyhow::Result> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/idefics2/main.rs b/mistralrs/examples/idefics2/main.rs index a5f700837..424bc65e4 100644 --- a/mistralrs/examples/idefics2/main.rs +++ 
b/mistralrs/examples/idefics2/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = VisionLoaderBuilder::new( VisionSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/isq/main.rs b/mistralrs/examples/isq/main.rs index 3d308f6a0..26e44a5a7 100644 --- a/mistralrs/examples/isq/main.rs +++ b/mistralrs/examples/isq/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/llava/main.rs b/mistralrs/examples/llava/main.rs index 6300c216a..d5a102edd 100644 --- a/mistralrs/examples/llava/main.rs +++ b/mistralrs/examples/llava/main.rs @@ -14,7 +14,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = VisionLoaderBuilder::new( VisionSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, Some("chat_templates/vicuna.json".to_string()), None, diff --git a/mistralrs/examples/llava_next/main.rs b/mistralrs/examples/llava_next/main.rs index 5b190d855..7cea0b625 100644 --- a/mistralrs/examples/llava_next/main.rs +++ b/mistralrs/examples/llava_next/main.rs @@ -15,7 +15,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = VisionLoaderBuilder::new( VisionSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/lora/main.rs b/mistralrs/examples/lora/main.rs index 96a700ae3..6007a9038 100644 --- a/mistralrs/examples/lora/main.rs +++ b/mistralrs/examples/lora/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/lora_activation/main.rs b/mistralrs/examples/lora_activation/main.rs index d5e01e5ba..72988b66d 100644 --- a/mistralrs/examples/lora_activation/main.rs +++ b/mistralrs/examples/lora_activation/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/paged_attn/main.rs b/mistralrs/examples/paged_attn/main.rs index dafb8e0e9..a6fb014a1 100644 --- a/mistralrs/examples/paged_attn/main.rs +++ b/mistralrs/examples/paged_attn/main.rs @@ -33,7 +33,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/phi3v/main.rs b/mistralrs/examples/phi3v/main.rs index a6edc0c5d..4623ef63c 100644 --- a/mistralrs/examples/phi3v/main.rs +++ b/mistralrs/examples/phi3v/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = VisionLoaderBuilder::new( VisionSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/quantized/main.rs b/mistralrs/examples/quantized/main.rs index eb2e1185d..594625f1e 100644 --- a/mistralrs/examples/quantized/main.rs +++ b/mistralrs/examples/quantized/main.rs @@ -4,9 +4,9 @@ use std::sync::Arc; use tokio::sync::mpsc::channel; use mistralrs::{ - Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, GGUFLoaderBuilder, - GGUFSpecificConfig, MistralRs, MistralRsBuilder, ModelDType, NormalRequest, Request, - RequestMessage, Response, Result, SamplingParams, SchedulerConfig, TokenSource, + Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, GGUFLoaderBuilder, MistralRs, + MistralRsBuilder, ModelDType, NormalRequest, Request, RequestMessage,
Response, Result, + SamplingParams, SchedulerConfig, TokenSource, }; /// Gets the best device, cpu, cuda if compiled with CUDA @@ -25,7 +25,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { // Select a Mistral model // This uses a model, tokenizer, and chat template, from HF hub. let loader = GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n: 64 }, None, Some("mistralai/Mistral-7B-Instruct-v0.1".to_string()), "TheBloke/Mistral-7B-Instruct-v0.1-GGUF".to_string(), diff --git a/mistralrs/examples/simple/main.rs b/mistralrs/examples/simple/main.rs index 610e14825..afdb29dc3 100644 --- a/mistralrs/examples/simple/main.rs +++ b/mistralrs/examples/simple/main.rs @@ -26,7 +26,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/xlora/main.rs b/mistralrs/examples/xlora/main.rs index 22e5ef1af..d2ed716bb 100644 --- a/mistralrs/examples/xlora/main.rs +++ b/mistralrs/examples/xlora/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None,
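For orientation, a minimal sketch of what a GGUF caller writes after this patch: GGUFLoaderBuilder::new no longer takes a GGUFSpecificConfig (the struct is deleted), and Sampler::new loses its logits_bias argument, with the repeat-penalty context now derived from the sequence itself. The tokenizer repo and the GGUF filename below are illustrative placeholders, not values taken from this diff:

// Sketch only: assumes the mistralrs crate re-exports GGUFLoaderBuilder and
// Loader as shown in the import hunks above; IDs/filename are placeholders.
use mistralrs::{GGUFLoaderBuilder, Loader};

fn build_gguf_loader() -> Box<dyn Loader> {
    // No leading GGUFSpecificConfig { repeat_last_n: ... } argument anymore:
    GGUFLoaderBuilder::new(
        None, // chat_template: fall back to the tokenizer's own template
        Some("mistralai/Mistral-7B-Instruct-v0.1".to_string()), // tok_model_id
        "TheBloke/Mistral-7B-Instruct-v0.1-GGUF".to_string(), // quantized_model_id
        "mistral-7b-instruct-v0.1.Q4_K_M.gguf".to_string(), // quantized_filename (placeholder)
    )
    .build()
}

The returned Box<dyn Loader> is then loaded and wrapped via MistralRsBuilder exactly as in the examples above.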