diff --git a/docs/ANYMOE.md b/docs/ANYMOE.md index fbe5bfab4..fb4206c69 100644 --- a/docs/ANYMOE.md +++ b/docs/ANYMOE.md @@ -171,7 +171,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/docs/IDEFICS2.md b/docs/IDEFICS2.md index b7798036a..ad6089eca 100644 --- a/docs/IDEFICS2.md +++ b/docs/IDEFICS2.md @@ -113,7 +113,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = VisionLoaderBuilder::new( VisionSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/docs/LLaVA.md b/docs/LLaVA.md index 47ff661a3..a8a0cefa4 100644 --- a/docs/LLaVA.md +++ b/docs/LLaVA.md @@ -110,7 +110,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = VisionLoaderBuilder::new( VisionSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/docs/PAGED_ATTENTION.md b/docs/PAGED_ATTENTION.md index 8ee028e4c..f61dc0808 100644 --- a/docs/PAGED_ATTENTION.md +++ b/docs/PAGED_ATTENTION.md @@ -68,7 +68,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/docs/PHI3V.md b/docs/PHI3V.md index 0a535cf23..376419922 100644 --- a/docs/PHI3V.md +++ b/docs/PHI3V.md @@ -121,7 +121,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = VisionLoaderBuilder::new( VisionSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs-core/src/engine/mod.rs b/mistralrs-core/src/engine/mod.rs index 912434536..a622a02b1 100644 --- a/mistralrs-core/src/engine/mod.rs +++ b/mistralrs-core/src/engine/mod.rs @@ -1,5 +1,4 @@ use std::{ - collections::HashMap, sync::{ atomic::{AtomicBool, Ordering}, Arc, @@ -19,7 +18,6 @@ use crate::{ scheduler::{Scheduler, SchedulerOutput}, CompletionResponse, RequestMessage, Response, SchedulerConfig, DEBUG, }; -use candle_core::{Device, Result, Tensor}; use rand::SeedableRng; use rand_isaac::Isaac64Rng; use tracing::{info, warn}; @@ -430,26 +428,6 @@ impl Engine { Ok(recognizer) } - fn alloc_logits_bias(&self, logits_bias: Option<HashMap<u32, f32>>) -> Result<Option<Tensor>> { - let tokenizer = get_mut_arcmutex!(self.pipeline).tokenizer(); - let vocab_size = tokenizer.get_vocab_size(true); - - match logits_bias { - Some(bias) => { - let mut logits_bias = vec![0.0; vocab_size]; - for (k, v) in bias { - logits_bias[k as usize] = v; - } - Ok(Some(Tensor::from_vec( - logits_bias, - vocab_size, - &Device::Cpu, - )?)) - } - None => Ok(None), - } - } - async fn handle_request(&mut self, request: Request) { match request { Request::ActivateAdapters(adapters) => { @@ -644,19 +622,6 @@ impl Engine { .duration_since(UNIX_EPOCH) .expect("Time travel has occurred!"); - let logits_bias = match self.alloc_logits_bias(request.sampling_params.logits_bias) { - Ok(logits_bias) => logits_bias, - Err(err) => { - request - .response - .send(Response::ValidationError( - format!("Failed creation of logits bias. 
{}", err).into(), - )) - .await - .expect("Expected receiver."); - return; - } - }; let tokenizer = get_mut_arcmutex!(self.pipeline).tokenizer(); let sampler = Sampler::new( @@ -665,7 +630,6 @@ impl Engine { tokenizer, request.sampling_params.frequency_penalty, request.sampling_params.presence_penalty, - logits_bias, topk, topp, minp, @@ -703,6 +667,7 @@ impl Engine { .cache_config .clone() .map(|conf| conf.block_size); + let trie = (*get_mut_arcmutex!(self.pipeline).get_metadata().tok_trie).clone(); let seq = Sequence::new_waiting( prompt.clone(), self.id, @@ -733,6 +698,7 @@ impl Engine { request.adapters.clone(), images.clone(), block_size, + trie, ); let seq = if let Some(prefill_cache) = prefill_cache.clone() { seq.prefill( diff --git a/mistralrs-core/src/lib.rs b/mistralrs-core/src/lib.rs index ca7bffd53..31ebaec58 100644 --- a/mistralrs-core/src/lib.rs +++ b/mistralrs-core/src/lib.rs @@ -62,13 +62,12 @@ pub use device_map::{DeviceLayerMapMetadata, DeviceMapMetadata, LayerDeviceMappe pub use paged_attention::PagedAttentionConfig; pub use pipeline::{ chat_template::ChatTemplate, AnyMoeLoader, AnyMoePipeline, GGMLLoader, GGMLLoaderBuilder, - GGMLSpecificConfig, GGUFArchitecture, GGUFLoader, GGUFLoaderBuilder, GGUFSpecificConfig, - GemmaLoader, Idefics2Loader, LLaVALoader, LLaVANextLoader, LlamaLoader, Loader, - LocalModelPaths, MistralLoader, MixtralLoader, ModelKind, ModelPaths, NormalLoader, - NormalLoaderBuilder, NormalLoaderType, NormalSpecificConfig, Phi2Loader, Phi3Loader, - Phi3VLoader, Qwen2Loader, SpeculativeConfig, SpeculativeLoader, SpeculativePipeline, - Starcoder2Loader, TokenSource, VisionLoader, VisionLoaderBuilder, VisionLoaderType, - VisionModelLoader, VisionSpecificConfig, + GGMLSpecificConfig, GGUFArchitecture, GGUFLoader, GGUFLoaderBuilder, GemmaLoader, + Idefics2Loader, LLaVALoader, LLaVANextLoader, LlamaLoader, Loader, LocalModelPaths, + MistralLoader, MixtralLoader, ModelKind, ModelPaths, NormalLoader, NormalLoaderBuilder, + NormalLoaderType, NormalSpecificConfig, Phi2Loader, Phi3Loader, Phi3VLoader, Qwen2Loader, + SpeculativeConfig, SpeculativeLoader, SpeculativePipeline, Starcoder2Loader, TokenSource, + VisionLoader, VisionLoaderBuilder, VisionLoaderType, VisionModelLoader, VisionSpecificConfig, }; pub use request::{Constraint, MessageContent, NormalRequest, Request, RequestMessage}; pub use response::Response; diff --git a/mistralrs-core/src/model_loader.rs b/mistralrs-core/src/model_loader.rs index 1bd5994b4..84ad38c6d 100644 --- a/mistralrs-core/src/model_loader.rs +++ b/mistralrs-core/src/model_loader.rs @@ -2,10 +2,7 @@ use std::fs::{self, File}; use crate::{ get_toml_selected_model_dtype, - pipeline::{ - GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, GGUFSpecificConfig, - NormalSpecificConfig, - }, + pipeline::{GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, NormalSpecificConfig}, Loader, ModelDType, ModelSelected, NormalLoaderBuilder, TomlLoaderArgs, TomlSelector, VisionLoaderBuilder, VisionSpecificConfig, }; @@ -110,15 +107,11 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, args.chat_template, tokenizer_json, Some(model_id), @@ -127,17 +120,13 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, args.chat_template, 
tokenizer_json, model_id, @@ -156,15 +145,11 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, args.chat_template, tokenizer_json, model_id, @@ -181,9 +166,7 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n }, args.chat_template, tok_model_id, quantized_model_id, @@ -194,12 +177,10 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n }, args.chat_template, tok_model_id, quantized_model_id, @@ -219,11 +200,9 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n }, args.chat_template, tok_model_id, quantized_model_id, @@ -242,10 +221,9 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result GGMLLoaderBuilder::new( - GGMLSpecificConfig { repeat_last_n, gqa }, + GGMLSpecificConfig { gqa }, args.chat_template, tokenizer_json, Some(tok_model_id), @@ -258,13 +236,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result GGMLLoaderBuilder::new( - GGMLSpecificConfig { repeat_last_n, gqa }, + GGMLSpecificConfig { gqa }, args.chat_template, tokenizer_json, tok_model_id, @@ -286,12 +263,11 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result GGMLLoaderBuilder::new( - GGMLSpecificConfig { repeat_last_n, gqa }, + GGMLSpecificConfig { gqa }, args.chat_template, tokenizer_json, tok_model_id, @@ -308,15 +284,11 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result VisionLoaderBuilder::new( - VisionSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + VisionSpecificConfig { use_flash_attn }, args.chat_template, tokenizer_json, Some(model_id), diff --git a/mistralrs-core/src/model_selected.rs b/mistralrs-core/src/model_selected.rs index 1a6e06bbf..32dd350ad 100644 --- a/mistralrs-core/src/model_selected.rs +++ b/mistralrs-core/src/model_selected.rs @@ -36,10 +36,6 @@ pub enum ModelSelected { #[arg(short, long)] tokenizer_json: Option, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// The architecture of the model. #[arg(short, long, value_parser = parse_arch)] arch: NormalLoaderType, @@ -63,10 +59,6 @@ pub enum ModelSelected { #[arg(short, long)] xlora_model_id: String, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// Ordering JSON file #[arg(short, long)] order: String, @@ -99,10 +91,6 @@ pub enum ModelSelected { #[arg(short, long)] adapters_model_id: String, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// Ordering JSON file #[arg(short, long)] order: String, @@ -132,10 +120,6 @@ pub enum ModelSelected { /// Quantized filename, only applicable if `quantized` is set. #[arg(short = 'f', long)] quantized_filename: String, - - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, }, /// Select a GGUF model with X-LoRA. 
@@ -155,10 +139,6 @@ pub enum ModelSelected { #[arg(short = 'f', long)] quantized_filename: String, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// Model ID to load X-LoRA from. This may be a HF hub repo or a local path. #[arg(short, long)] xlora_model_id: String, @@ -190,10 +170,6 @@ pub enum ModelSelected { #[arg(short = 'f', long)] quantized_filename: String, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// Model ID to load LoRA from. This may be a HF hub repo or a local path. #[arg(short, long)] adapters_model_id: String, @@ -222,10 +198,6 @@ pub enum ModelSelected { #[arg(short = 'f', long)] quantized_filename: String, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// GQA value #[arg(short, long, default_value_t = 1)] gqa: usize, @@ -250,10 +222,6 @@ pub enum ModelSelected { #[arg(short = 'f', long)] quantized_filename: String, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// Model ID to load X-LoRA from. This may be a HF hub repo or a local path. #[arg(short, long)] xlora_model_id: String, @@ -291,10 +259,6 @@ pub enum ModelSelected { #[arg(short = 'f', long)] quantized_filename: String, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// Model ID to load LoRA from. This may be a HF hub repo or a local path. #[arg(short, long)] adapters_model_id: String, @@ -318,10 +282,6 @@ pub enum ModelSelected { #[arg(short, long)] tokenizer_json: Option, - /// Control the application of repeat penalty for the last n tokens - #[arg(long, default_value_t = 64)] - repeat_last_n: usize, - /// The architecture of the model. #[arg(short, long, value_parser = parse_vision_arch)] arch: VisionLoaderType, diff --git a/mistralrs-core/src/pipeline/amoe.rs b/mistralrs-core/src/pipeline/amoe.rs index f6f4adb40..3f02e6cb8 100644 --- a/mistralrs-core/src/pipeline/amoe.rs +++ b/mistralrs-core/src/pipeline/amoe.rs @@ -19,6 +19,7 @@ use rand_isaac::Isaac64Rng; use tracing::{info, warn}; use crate::{ + aici::toktree::TokTrie, amoe::{AnyMoeConfig, AnyMoeTrainingInputRow, AnyMoeTrainingInputs, AnyMoeTrainingResult}, get_mut_arcmutex, prefix_cacher::PrefixCacheManager, @@ -352,8 +353,7 @@ impl AnyMoePipelineMixin for AnyMoePipeline { // Create several dummy objects for the sequences. 
let (dummy_sender, _) = tokio::sync::mpsc::channel(10000); - let dummy_sampler = - Sampler::new(None, 0, tokenizer.clone(), None, None, None, -1, 0.0, 0.0); + let dummy_sampler = Sampler::new(None, 0, tokenizer.clone(), None, None, -1, 0.0, 0.0); let dummy_group = Arc::new(tokio::sync::Mutex::new(SequenceGroup::new( 1, false, false, 0, @@ -425,6 +425,7 @@ impl AnyMoePipelineMixin for AnyMoePipeline { dummy_sampler.clone(), dummy_group.clone(), images, + (*self.get_metadata().tok_trie).clone(), )); } let mut input_seqs = seqs.iter_mut().collect::<Vec<_>>(); @@ -539,6 +540,7 @@ fn new_dummy_seq( dummy_sampler: Sampler, dummy_group: Arc<tokio::sync::Mutex<SequenceGroup>>, images: Option<Vec<DynamicImage>>, + trie: TokTrie, ) -> Sequence { Sequence::new_waiting( tokens, @@ -561,5 +563,6 @@ fn new_dummy_seq( None, images, None, // TODO incorrect for PagedAttention + trie, ) } diff --git a/mistralrs-core/src/pipeline/ggml.rs b/mistralrs-core/src/pipeline/ggml.rs index 42f95951c..fa5e656e2 100644 --- a/mistralrs-core/src/pipeline/ggml.rs +++ b/mistralrs-core/src/pipeline/ggml.rs @@ -76,7 +76,6 @@ pub struct GGMLLoader { #[derive(Clone, Copy, Default)] /// Config for a GGML loader. pub struct GGMLSpecificConfig { - pub repeat_last_n: usize, pub gqa: usize, } @@ -341,7 +340,6 @@ impl Loader for GGMLLoader { }), metadata: Arc::new(GeneralMetadata { max_seq_len, - repeat_last_n: self.config.repeat_last_n, tok_trie, has_no_kv_cache: self.no_kv_cache, num_hidden_layers, diff --git a/mistralrs-core/src/pipeline/gguf.rs b/mistralrs-core/src/pipeline/gguf.rs index 21a5267c9..8379b94e8 100644 --- a/mistralrs-core/src/pipeline/gguf.rs +++ b/mistralrs-core/src/pipeline/gguf.rs @@ -81,7 +81,6 @@ pub struct GGUFPipeline { /// Loader for a GGUF model. pub struct GGUFLoader { model_id: Option<String>, - config: GGUFSpecificConfig, quantized_model_id: String, quantized_filename: String, xlora_model_id: Option<String>, @@ -120,17 +119,10 @@ impl GGUFArchitecture { } } -#[derive(Clone, Copy, Default)] -/// A config for a GGUF loader. -pub struct GGUFSpecificConfig { - pub repeat_last_n: usize, -} - #[derive(Default)] /// A builder for a GGUF loader. pub struct GGUFLoaderBuilder { model_id: Option<String>, - config: GGUFSpecificConfig, quantized_model_id: String, quantized_filename: String, xlora_model_id: Option<String>, @@ -146,7 +138,6 @@ impl GGUFLoaderBuilder { /// `tokenizer_config.json` file. If the `chat_template` is specified, then it will be treated as a /// path and used over remote files, removing all remote accesses.
pub fn new( - config: GGUFSpecificConfig, chat_template: Option<String>, tok_model_id: Option<String>, quantized_model_id: String, @@ -157,7 +148,6 @@ impl GGUFLoaderBuilder { }; Self { - config, chat_template, model_id: tok_model_id, kind, @@ -216,7 +206,6 @@ impl GGUFLoaderBuilder { pub fn build(self) -> Box<dyn Loader> { Box::new(GGUFLoader { model_id: self.model_id, - config: self.config, xlora_model_id: self.xlora_model_id, kind: self.kind, xlora_order: self.xlora_order, @@ -233,7 +222,6 @@ impl GGUFLoader { #[allow(clippy::too_many_arguments)] pub fn new( model_id: Option<String>, - config: GGUFSpecificConfig, quantized_model_id: String, quantized_filename: String, xlora_model_id: Option<String>, @@ -256,7 +244,6 @@ impl GGUFLoader { }; Self { model_id, - config, quantized_model_id, quantized_filename, xlora_model_id, @@ -578,7 +565,6 @@ impl Loader for GGUFLoader { }), metadata: Arc::new(GeneralMetadata { max_seq_len, - repeat_last_n: self.config.repeat_last_n, tok_trie, has_no_kv_cache: self.no_kv_cache, num_hidden_layers, diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs index 9fb3ce708..848eb0458 100644 --- a/mistralrs-core/src/pipeline/mod.rs +++ b/mistralrs-core/src/pipeline/mod.rs @@ -28,7 +28,7 @@ use candle_core::quantized::GgmlDType; use chat_template::ChatTemplate; use core::fmt; pub use ggml::{GGMLLoader, GGMLLoaderBuilder, GGMLSpecificConfig}; -pub use gguf::{GGUFArchitecture, GGUFLoader, GGUFLoaderBuilder, GGUFSpecificConfig}; +pub use gguf::{GGUFArchitecture, GGUFLoader, GGUFLoaderBuilder}; pub use isq::IsqModel; pub use normal::{NormalLoader, NormalLoaderBuilder, NormalSpecificConfig}; pub use normal_loaders::{ @@ -433,7 +433,6 @@ pub trait Loader { pub struct GeneralMetadata { pub max_seq_len: usize, - pub repeat_last_n: usize, pub tok_trie: Arc<TokTrie>, pub has_no_kv_cache: bool, pub num_hidden_layers: usize, diff --git a/mistralrs-core/src/pipeline/normal.rs b/mistralrs-core/src/pipeline/normal.rs index edc18b0a0..a0c5ea1e3 100644 --- a/mistralrs-core/src/pipeline/normal.rs +++ b/mistralrs-core/src/pipeline/normal.rs @@ -89,7 +89,6 @@ pub struct NormalLoaderBuilder { /// Config specific to loading a normal model.
pub struct NormalSpecificConfig { pub use_flash_attn: bool, - pub repeat_last_n: usize, } impl NormalLoaderBuilder { @@ -355,7 +354,6 @@ impl Loader for NormalLoader { model_id: self.model_id.clone(), metadata: Arc::new(GeneralMetadata { max_seq_len, - repeat_last_n: self.config.repeat_last_n, tok_trie, has_no_kv_cache: self.no_kv_cache, num_hidden_layers, diff --git a/mistralrs-core/src/pipeline/sampling.rs b/mistralrs-core/src/pipeline/sampling.rs index b9bee4898..7c33d3e40 100644 --- a/mistralrs-core/src/pipeline/sampling.rs +++ b/mistralrs-core/src/pipeline/sampling.rs @@ -4,7 +4,6 @@ use candle_core::{DType, Device, Result, Tensor}; use rand_isaac::Isaac64Rng; use crate::{ - aici::toktree::TokTrie, get_bias_if_not_allowed, prefix_cacher::PrefixCacheManager, sampler::Logprobs, @@ -238,8 +237,6 @@ pub async fn sample_and_add_toks( logits_per_seq, seq, return_logprobs, - this.get_metadata().repeat_last_n, - this.get_metadata().tok_trie.clone(), rng.clone(), use_async_pool, true, // Append result to trie @@ -271,18 +268,15 @@ pub async fn sample_sequence( logits: Tensor, seq: &mut Sequence, return_logprobs: bool, - repeat_last_n: usize, - tok_trie: Arc<TokTrie>, rng: Arc<std::sync::Mutex<Isaac64Rng>>, use_async_pool: bool, add_to_trie: bool, sample_speculative: bool, ) -> Result<Logprobs> { let logits = logits.squeeze(0)?.squeeze(0)?.to_dtype(DType::F32)?; - let start_at = seq.get_toks().len().saturating_sub(repeat_last_n); let sampler = seq.sampler(); - let ctx_clone = seq.get_toks()[start_at..].to_vec(); + let ctx_clone = seq.get_toks()[seq.prompt_tokens()..].to_vec(); let rng_clone = rng.clone(); let logits_clone = logits.clone(); let first_lobprobs_response = if use_async_pool { @@ -308,20 +302,20 @@ pub async fn sample_sequence( let bias_if_not_allowed = match &mut seq.recognizer { SequenceRecognizer::Regex(ref mut rx) => { - get_bias_if_not_allowed!(tok_trie, rx.as_mut(), first_lobprobs_response.token) + get_bias_if_not_allowed!(seq.tok_trie, rx.as_mut(), first_lobprobs_response.token) } SequenceRecognizer::Cfg(ref mut cfg) => { - get_bias_if_not_allowed!(tok_trie, cfg.as_mut(), first_lobprobs_response.token) + get_bias_if_not_allowed!(seq.tok_trie, cfg.as_mut(), first_lobprobs_response.token) } SequenceRecognizer::None => None, }; let second_logprobs_response = match bias_if_not_allowed { Some(token_set) => { - let mut acc = vec![-f32::INFINITY; tok_trie.vocab_size()]; + let mut acc = vec![-f32::INFINITY; seq.tok_trie.vocab_size()]; token_set.apply_to(&mut acc); let new_logits = (logits + Tensor::from_slice(&acc, acc.len(), &Device::Cpu)?)?; - let ctx_clone = seq.get_toks()[start_at..].to_vec(); + let ctx_clone = seq.get_toks()[seq.prompt_tokens()..].to_vec(); let rng_clone = rng.clone(); let sampler = seq.sampler(); if use_async_pool { @@ -351,12 +345,12 @@ pub async fn sample_sequence( if add_to_trie { match seq.recognizer { SequenceRecognizer::Regex(ref mut rx) => { - tok_trie + seq.tok_trie .append_token(rx.as_mut(), second_logprobs_response.token) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; } SequenceRecognizer::Cfg(ref mut cfg) => { - tok_trie + seq.tok_trie .append_token(cfg.as_mut(), second_logprobs_response.token) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; } @@ -376,8 +370,6 @@ pub async fn sample_target_sequence_speculative( logits: Tensor, seq: &mut Sequence, return_logprobs: bool, - repeat_last_n: usize, - tok_trie: Arc<TokTrie>, rng: Arc<std::sync::Mutex<Isaac64Rng>>, n_toks: usize, ) -> Result<Vec<Logprobs>> { @@ -388,8 +380,6 @@ pub async fn sample_target_sequence_speculative( chunk, seq, return_logprobs, - repeat_last_n, - tok_trie.clone(),
rng.clone(), true, // TODO(EricLBuehler): does this hurt perf? false, // Do not append to trie (yet) diff --git a/mistralrs-core/src/pipeline/speculative.rs b/mistralrs-core/src/pipeline/speculative.rs index 7b543be10..be221ca14 100644 --- a/mistralrs-core/src/pipeline/speculative.rs +++ b/mistralrs-core/src/pipeline/speculative.rs @@ -374,7 +374,6 @@ impl Pipeline for SpeculativePipeline { // ======================= Run draft model gamma times producing tokens ============================ // ======================= Sample the `gamma` logits. ============================ let mut draft_samples = Vec::new(); - let repeat_last_n = get_mut_arcmutex!(self.draft).get_metadata().repeat_last_n; for i in 0..self.gamma { let is_xlora = get_mut_arcmutex!(self.draft).get_metadata().is_xlora; let device = get_mut_arcmutex!(self.draft).device(); @@ -401,11 +400,6 @@ impl Pipeline for SpeculativePipeline { logits.clone(), seq, seq.return_logprobs(), - repeat_last_n, - get_mut_arcmutex!(self.draft) - .get_metadata() - .tok_trie - .clone(), rng.clone(), false, // todo tune false, // do not add to tok trie yet @@ -471,11 +465,6 @@ impl Pipeline for SpeculativePipeline { logits.clone(), seq, seq.return_logprobs(), - repeat_last_n, - get_mut_arcmutex!(self.draft) - .get_metadata() - .tok_trie - .clone(), rng.clone(), self.gamma, ) @@ -579,11 +568,6 @@ impl Pipeline for SpeculativePipeline { logits.clone(), seq, seq.return_logprobs(), - repeat_last_n, - get_mut_arcmutex!(self.draft) - .get_metadata() - .tok_trie - .clone(), rng.clone(), false, // todo tune true, // do not add to tok trie yet diff --git a/mistralrs-core/src/pipeline/vision.rs b/mistralrs-core/src/pipeline/vision.rs index 6eca26ac9..a01b17d6b 100644 --- a/mistralrs-core/src/pipeline/vision.rs +++ b/mistralrs-core/src/pipeline/vision.rs @@ -77,7 +77,6 @@ pub struct VisionLoaderBuilder { /// Config specific to loading a vision model. 
pub struct VisionSpecificConfig { pub use_flash_attn: bool, - pub repeat_last_n: usize, } impl VisionLoaderBuilder { @@ -270,7 +269,6 @@ impl Loader for VisionLoader { model_id: self.model_id.clone(), metadata: Arc::new(GeneralMetadata { max_seq_len, - repeat_last_n: self.config.repeat_last_n, tok_trie, is_xlora: false, num_hidden_layers, diff --git a/mistralrs-core/src/sampler.rs b/mistralrs-core/src/sampler.rs index 15abd25e0..d1894f91f 100644 --- a/mistralrs-core/src/sampler.rs +++ b/mistralrs-core/src/sampler.rs @@ -64,7 +64,6 @@ pub struct Sampler { tokenizer: Arc<Tokenizer>, frequency_penalty: Option<f32>, presence_penalty: Option<f32>, - logits_bias: Option<Tensor>, top_k: i64, top_p: f64, min_p: f64, @@ -100,7 +99,6 @@ impl Sampler { tokenizer: Arc<Tokenizer>, frequency_penalty: Option<f32>, presence_penalty: Option<f32>, - logits_bias: Option<Tensor>, top_k: i64, top_p: f64, min_p: f64, @@ -116,7 +114,6 @@ impl Sampler { tokenizer, frequency_penalty, presence_penalty, - logits_bias, top_k, top_p, min_p, @@ -400,10 +397,6 @@ impl Sampler { sample_speculative: bool, ) -> Result<Logprobs> { let logits = self.apply_penalties(logits.to_vec1()?, penalty_ctxt)?; - let logits = match self.logits_bias { - Some(ref bias) => (logits + bias)?, - None => logits, - }; let next_token = if sample_speculative { match self.temperature { None => self.sample_speculative_top_kp_min_p( @@ -475,17 +468,7 @@ mod tests { use std::sync::Arc; use std::sync::Mutex; - let sampler = Sampler::new( - None, - 10, - get_tokenizer().into(), - None, - None, - None, - 32, - 0.1, - 0.05, - ); + let sampler = Sampler::new(None, 10, get_tokenizer().into(), None, None, 32, 0.1, 0.05); let logits = Tensor::arange(0f32, 1024f32, &Device::Cpu).unwrap(); let rng = Arc::new(Mutex::new(Isaac64Rng::seed_from_u64(42))); let res = sampler.sample(logits, None, false, rng, false).unwrap(); @@ -503,17 +486,7 @@ mod tests { use std::sync::Arc; use std::sync::Mutex; - let sampler = Sampler::new( - None, - 10, - get_tokenizer().into(), - None, - None, - None, - 32, - 0.1, - 0.05, - ); + let sampler = Sampler::new(None, 10, get_tokenizer().into(), None, None, 32, 0.1, 0.05); let logits = Tensor::arange(0f32, 1024f32, &Device::Cpu).unwrap(); let rng = Arc::new(Mutex::new(Isaac64Rng::seed_from_u64(42))); let res = sampler.sample(logits, None, false, rng, true).unwrap(); diff --git a/mistralrs-core/src/sequence.rs b/mistralrs-core/src/sequence.rs index 378e86e8e..93bc69b1d 100644 --- a/mistralrs-core/src/sequence.rs +++ b/mistralrs-core/src/sequence.rs @@ -9,7 +9,7 @@ use tokio::sync::{ }; use crate::{ - aici::{cfg::CfgParser, recognizer::StackRecognizer, rx::RecRx}, + aici::{cfg::CfgParser, recognizer::StackRecognizer, rx::RecRx, toktree::TokTrie}, paged_attention::{BlockEngineSequence, LogicalTokenBlock}, response::CompletionChoice, CompletionChunkChoice, CompletionChunkResponse, CompletionResponse, @@ -160,6 +160,7 @@ pub struct Sequence { prefix: Option, is_tmp: bool, adapters: Option<Vec<String>>, + pub(crate) tok_trie: TokTrie, // Cache scaling_cache: Option<Tensor>, @@ -242,6 +243,7 @@ impl Sequence { input_images: Option<Vec<image::DynamicImage>>, // Paged attention block_size: Option<usize>, + tok_trie: TokTrie, ) -> Self { let prompt_len = tokens.len(); let mut custom_metadata = if let Some(block_size) = block_size { @@ -295,6 +297,7 @@ impl Sequence { adapters, input_images, custom_metadata, + tok_trie, } } diff --git a/mistralrs-core/src/toml_selector.rs b/mistralrs-core/src/toml_selector.rs index 90d195d64..0ffcee93a 100644 --- a/mistralrs-core/src/toml_selector.rs +++ b/mistralrs-core/src/toml_selector.rs @@ -4,15 +4,11 @@ use
serde::Deserialize; use crate::{ amoe::AnyMoeConfig, AnyMoeLoader, GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, - GGUFSpecificConfig, Loader, ModelDType, NormalLoaderBuilder, NormalLoaderType, - NormalSpecificConfig, SpeculativeConfig, SpeculativeLoader, VisionLoaderBuilder, - VisionLoaderType, VisionSpecificConfig, + Loader, ModelDType, NormalLoaderBuilder, NormalLoaderType, NormalSpecificConfig, + SpeculativeConfig, SpeculativeLoader, VisionLoaderBuilder, VisionLoaderType, + VisionSpecificConfig, }; -fn default_repeat_last_n() -> usize { - 64 -} - fn default_one() -> usize { 1 } @@ -263,10 +259,6 @@ pub struct TomlSelector { /// Path to local tokenizer.json file. If this is specified it is used over any remote file. tokenizer_json: Option, - /// Control the application of repeat penalty for the last n tokens - #[serde(default = "default_repeat_last_n")] - repeat_last_n: usize, - /// Selected model model: TomlModelSelected, @@ -283,7 +275,6 @@ struct TomlLoaderInnerParams { chat_template: Option, no_kv_cache: bool, tokenizer_json: Option, - repeat_last_n: usize, } pub struct TomlLoaderArgs { @@ -318,10 +309,7 @@ fn loader_from_selected( arch, dtype: _, } => NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n: args.repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, args.chat_template, args.tokenizer_json, Some(model_id), @@ -335,10 +323,7 @@ fn loader_from_selected( arch, dtype: _, } => NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n: args.repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, args.chat_template, args.tokenizer_json, model_id, @@ -360,10 +345,7 @@ fn loader_from_selected( arch, dtype: _, } => NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n: args.repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, args.chat_template, args.tokenizer_json, model_id, @@ -381,9 +363,6 @@ fn loader_from_selected( quantized_model_id, quantized_filename, } => GGUFLoaderBuilder::new( - GGUFSpecificConfig { - repeat_last_n: args.repeat_last_n, - }, args.chat_template, Some(tok_model_id), quantized_model_id, @@ -398,9 +377,6 @@ fn loader_from_selected( order, tgt_non_granular_index, } => GGUFLoaderBuilder::new( - GGUFSpecificConfig { - repeat_last_n: args.repeat_last_n, - }, args.chat_template, tok_model_id, quantized_model_id, @@ -423,9 +399,6 @@ fn loader_from_selected( adapters_model_id, order, } => GGUFLoaderBuilder::new( - GGUFSpecificConfig { - repeat_last_n: args.repeat_last_n, - }, args.chat_template, tok_model_id, quantized_model_id, @@ -445,10 +418,7 @@ fn loader_from_selected( quantized_filename, gqa, } => GGMLLoaderBuilder::new( - GGMLSpecificConfig { - repeat_last_n: args.repeat_last_n, - gqa, - }, + GGMLSpecificConfig { gqa }, args.chat_template, args.tokenizer_json, Some(tok_model_id), @@ -465,10 +435,7 @@ fn loader_from_selected( tgt_non_granular_index, gqa, } => GGMLLoaderBuilder::new( - GGMLSpecificConfig { - repeat_last_n: args.repeat_last_n, - gqa, - }, + GGMLSpecificConfig { gqa }, args.chat_template, args.tokenizer_json, tok_model_id, @@ -493,10 +460,7 @@ fn loader_from_selected( order, gqa, } => GGMLLoaderBuilder::new( - GGMLSpecificConfig { - repeat_last_n: args.repeat_last_n, - gqa, - }, + GGMLSpecificConfig { gqa }, args.chat_template, args.tokenizer_json, tok_model_id, @@ -516,10 +480,7 @@ fn loader_from_selected( arch, dtype: _, } => VisionLoaderBuilder::new( - VisionSpecificConfig { - use_flash_attn, - repeat_last_n: 
args.repeat_last_n, - }, + VisionSpecificConfig { use_flash_attn }, args.chat_template, args.tokenizer_json, Some(model_id), @@ -538,7 +499,6 @@ impl TryInto> for (TomlSelector, TomlLoaderArgs) { chat_template: args.chat_template, no_kv_cache: args.no_kv_cache, tokenizer_json: selector.tokenizer_json, - repeat_last_n: selector.repeat_last_n, }; let loader = loader_from_selected(args.clone(), selector.model)?; let loader = if let Some(speculative) = selector.speculative { diff --git a/mistralrs-pyo3/API.md b/mistralrs-pyo3/API.md index 63e754860..cdca30e29 100644 --- a/mistralrs-pyo3/API.md +++ b/mistralrs-pyo3/API.md @@ -42,7 +42,6 @@ class Which(Enum): model_id: str arch: Architecture tokenizer_json: str | None = None - repeat_last_n: int = 64 @dataclass class XLora: @@ -51,7 +50,6 @@ class Which(Enum): arch: Architecture model_id: str | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 tgt_non_granular_index: int | None = None @dataclass @@ -61,14 +59,12 @@ class Which(Enum): arch: Architecture model_id: str | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 @dataclass class GGUF: quantized_model_id: str quantized_filename: str tok_model_id: str | None = None - repeat_last_n: int = 64 @dataclass class XLoraGGUF: @@ -77,7 +73,6 @@ class Which(Enum): xlora_model_id: str order: str tok_model_id: str | None = None - repeat_last_n: int = 64 tgt_non_granular_index: int | None = None @dataclass @@ -87,7 +82,6 @@ class Which(Enum): adapters_model_id: str order: str tok_model_id: str | None = None - repeat_last_n: int = 64 @dataclass class GGML: @@ -95,7 +89,6 @@ class Which(Enum): quantized_filename: str tok_model_id: str | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 gqa: int | None = None @dataclass @@ -107,7 +100,6 @@ class Which(Enum): tok_model_id: str | None = None tgt_non_granular_index: int | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 gqa: int | None = None @dataclass @@ -118,14 +110,12 @@ class Which(Enum): order: str tok_model_id: str | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 @dataclass class VisionPlain: model_id: str arch: VisionArchitecture tokenizer_json: str | None = None - repeat_last_n: int = 64 ``` diff --git a/mistralrs-pyo3/mistralrs.pyi b/mistralrs-pyo3/mistralrs.pyi index e827e2824..7d392c433 100644 --- a/mistralrs-pyo3/mistralrs.pyi +++ b/mistralrs-pyo3/mistralrs.pyi @@ -89,7 +89,6 @@ class Which(Enum): model_id: str arch: Architecture tokenizer_json: str | None = None - repeat_last_n: int = 64 @dataclass class XLora: @@ -98,7 +97,6 @@ class Which(Enum): arch: Architecture model_id: str | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 tgt_non_granular_index: int | None = None @dataclass @@ -108,14 +106,12 @@ class Which(Enum): arch: Architecture model_id: str | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 @dataclass class GGUF: quantized_model_id: str quantized_filename: str tok_model_id: str | None = None - repeat_last_n: int = 64 @dataclass class XLoraGGUF: @@ -124,7 +120,6 @@ class Which(Enum): xlora_model_id: str order: str tok_model_id: str | None = None - repeat_last_n: int = 64 tgt_non_granular_index: int | None = None @dataclass @@ -134,7 +129,6 @@ class Which(Enum): adapters_model_id: str order: str tok_model_id: str | None = None - repeat_last_n: int = 64 @dataclass class GGML: @@ -142,7 +136,6 @@ class Which(Enum): quantized_filename: str tok_model_id: str | None = None 
tokenizer_json: str | None = None - repeat_last_n: int = 64 gqa: int | None = None @dataclass @@ -154,7 +147,6 @@ class Which(Enum): tok_model_id: str | None = None tgt_non_granular_index: int | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 gqa: int | None = None @dataclass @@ -165,14 +157,12 @@ class Which(Enum): order: str tok_model_id: str | None = None tokenizer_json: str | None = None - repeat_last_n: int = 64 @dataclass class VisionPlain: model_id: str arch: VisionArchitecture tokenizer_json: str | None = None - repeat_last_n: int = 64 class Runner: def __init__( diff --git a/mistralrs-pyo3/src/lib.rs b/mistralrs-pyo3/src/lib.rs index 74478b893..8ff8ad95a 100644 --- a/mistralrs-pyo3/src/lib.rs +++ b/mistralrs-pyo3/src/lib.rs @@ -21,11 +21,11 @@ use candle_core::Device; use mistralrs_core::{ initialize_logging, paged_attn_supported, AnyMoeLoader, ChatCompletionResponse, CompletionResponse, Constraint, DefaultSchedulerMethod, DeviceLayerMapMetadata, - DeviceMapMetadata, GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, - GGUFSpecificConfig, Loader, MistralRs, MistralRsBuilder, ModelDType, NormalLoaderBuilder, - NormalRequest, NormalSpecificConfig, PagedAttentionConfig, Request as _Request, RequestMessage, - Response, SamplingParams, SchedulerConfig, SpeculativeConfig, SpeculativeLoader, StopTokens, - TokenSource, VisionLoaderBuilder, VisionSpecificConfig, + DeviceMapMetadata, GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, Loader, MistralRs, + MistralRsBuilder, ModelDType, NormalLoaderBuilder, NormalRequest, NormalSpecificConfig, + PagedAttentionConfig, Request as _Request, RequestMessage, Response, SamplingParams, + SchedulerConfig, SpeculativeConfig, SpeculativeLoader, StopTokens, TokenSource, + VisionLoaderBuilder, VisionSpecificConfig, }; use pyo3::{ exceptions::{PyTypeError, PyValueError}, @@ -92,14 +92,10 @@ fn parse_which( Ok(match which { Which::Plain { model_id, - repeat_last_n, tokenizer_json, arch, } => NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, chat_template, tokenizer_json, Some(model_id), @@ -108,16 +104,12 @@ fn parse_which( Which::XLora { model_id, xlora_model_id, - repeat_last_n, order, tokenizer_json, tgt_non_granular_index, arch, } => NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, chat_template, tokenizer_json, model_id, @@ -137,14 +129,10 @@ fn parse_which( model_id, tokenizer_json, adapters_model_id, - repeat_last_n, order, arch, } => NormalLoaderBuilder::new( - NormalSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + NormalSpecificConfig { use_flash_attn }, chat_template, tokenizer_json, model_id, @@ -162,9 +150,7 @@ fn parse_which( tok_model_id, quantized_model_id, quantized_filename, - repeat_last_n, } => GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n }, chat_template, tok_model_id, quantized_model_id, @@ -175,12 +161,10 @@ fn parse_which( tok_model_id, quantized_model_id, quantized_filename, - repeat_last_n, xlora_model_id, order, tgt_non_granular_index, } => GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n }, chat_template, tok_model_id, quantized_model_id, @@ -201,11 +185,9 @@ fn parse_which( tok_model_id, quantized_model_id, quantized_filename, - repeat_last_n, adapters_model_id, order, } => GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n }, chat_template, tok_model_id, quantized_model_id, @@ 
-225,10 +207,9 @@ fn parse_which( tokenizer_json, quantized_model_id, quantized_filename, - repeat_last_n, gqa, } => GGMLLoaderBuilder::new( - GGMLSpecificConfig { repeat_last_n, gqa }, + GGMLSpecificConfig { gqa }, chat_template, tokenizer_json, Some(tok_model_id), @@ -241,13 +222,12 @@ fn parse_which( tokenizer_json, quantized_model_id, quantized_filename, - repeat_last_n, xlora_model_id, order, tgt_non_granular_index, gqa, } => GGMLLoaderBuilder::new( - GGMLSpecificConfig { repeat_last_n, gqa }, + GGMLSpecificConfig { gqa }, chat_template, tokenizer_json, tok_model_id, @@ -270,12 +250,11 @@ fn parse_which( tokenizer_json, quantized_model_id, quantized_filename, - repeat_last_n, adapters_model_id, order, gqa, } => GGMLLoaderBuilder::new( - GGMLSpecificConfig { repeat_last_n, gqa }, + GGMLSpecificConfig { gqa }, chat_template, tokenizer_json, tok_model_id, @@ -293,14 +272,10 @@ fn parse_which( .build(), Which::VisionPlain { model_id, - repeat_last_n, tokenizer_json, arch, } => VisionLoaderBuilder::new( - VisionSpecificConfig { - use_flash_attn, - repeat_last_n, - }, + VisionSpecificConfig { use_flash_attn }, chat_template, tokenizer_json, Some(model_id), diff --git a/mistralrs-pyo3/src/which.rs b/mistralrs-pyo3/src/which.rs index 2081efea8..c78c34e6f 100644 --- a/mistralrs-pyo3/src/which.rs +++ b/mistralrs-pyo3/src/which.rs @@ -57,14 +57,12 @@ pub enum Which { #[pyo3(constructor = ( model_id, arch, - tokenizer_json = None, - repeat_last_n = 64 + tokenizer_json = None ))] Plain { model_id: String, arch: Architecture, tokenizer_json: Option, - repeat_last_n: usize, }, #[pyo3(constructor = ( @@ -73,7 +71,6 @@ pub enum Which { arch, model_id = None, tokenizer_json = None, - repeat_last_n = 64, tgt_non_granular_index = None ))] XLora { @@ -82,7 +79,6 @@ pub enum Which { arch: Architecture, model_id: Option, tokenizer_json: Option, - repeat_last_n: usize, tgt_non_granular_index: Option, }, @@ -91,8 +87,7 @@ pub enum Which { order, arch, model_id = None, - tokenizer_json = None, - repeat_last_n = 64 + tokenizer_json = None ))] Lora { adapters_model_id: String, @@ -100,21 +95,18 @@ pub enum Which { arch: Architecture, model_id: Option, tokenizer_json: Option, - repeat_last_n: usize, }, #[pyo3(constructor = ( quantized_model_id, quantized_filename, tok_model_id = None, - repeat_last_n = 64 ))] #[allow(clippy::upper_case_acronyms)] GGUF { quantized_model_id: String, quantized_filename: String, tok_model_id: Option, - repeat_last_n: usize, }, #[pyo3(constructor = ( @@ -123,7 +115,6 @@ pub enum Which { xlora_model_id, order, tok_model_id = None, - repeat_last_n = 64, tgt_non_granular_index = None, ))] XLoraGGUF { @@ -132,7 +123,6 @@ pub enum Which { xlora_model_id: String, order: String, tok_model_id: Option, - repeat_last_n: usize, tgt_non_granular_index: Option, }, @@ -142,7 +132,6 @@ pub enum Which { adapters_model_id, order, tok_model_id = None, - repeat_last_n = 64 ))] LoraGGUF { quantized_model_id: String, @@ -150,7 +139,6 @@ pub enum Which { adapters_model_id: String, order: String, tok_model_id: Option, - repeat_last_n: usize, }, #[pyo3(constructor = ( @@ -158,7 +146,6 @@ pub enum Which { quantized_filename, tok_model_id, tokenizer_json = None, - repeat_last_n = 64, gqa = 1, ))] #[allow(clippy::upper_case_acronyms)] @@ -167,7 +154,6 @@ pub enum Which { quantized_filename: String, tok_model_id: String, tokenizer_json: Option, - repeat_last_n: usize, gqa: usize, }, @@ -178,7 +164,6 @@ pub enum Which { order, tok_model_id = None, tokenizer_json = None, - repeat_last_n = 64, 
tgt_non_granular_index = None, gqa = 1, ))] @@ -189,7 +174,6 @@ pub enum Which { order: String, tok_model_id: Option, tokenizer_json: Option, - repeat_last_n: usize, tgt_non_granular_index: Option, gqa: usize, }, @@ -201,7 +185,6 @@ pub enum Which { order, tok_model_id = None, tokenizer_json = None, - repeat_last_n = 64, gqa = 1, ))] LoraGGML { @@ -211,7 +194,6 @@ pub enum Which { order: String, tok_model_id: Option, tokenizer_json: Option, - repeat_last_n: usize, gqa: usize, }, @@ -219,12 +201,10 @@ pub enum Which { model_id, arch, tokenizer_json = None, - repeat_last_n = 64 ))] VisionPlain { model_id: String, arch: VisionArchitecture, tokenizer_json: Option, - repeat_last_n: usize, }, } diff --git a/mistralrs/examples/anymoe/main.rs b/mistralrs/examples/anymoe/main.rs index 708b175de..17c27e552 100644 --- a/mistralrs/examples/anymoe/main.rs +++ b/mistralrs/examples/anymoe/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/anymoe_lora/main.rs b/mistralrs/examples/anymoe_lora/main.rs index 8ed21aa45..2a3a2bd2b 100644 --- a/mistralrs/examples/anymoe_lora/main.rs +++ b/mistralrs/examples/anymoe_lora/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/gemma2/main.rs b/mistralrs/examples/gemma2/main.rs index fa03a6978..12b806bab 100644 --- a/mistralrs/examples/gemma2/main.rs +++ b/mistralrs/examples/gemma2/main.rs @@ -26,7 +26,6 @@ fn setup() -> anyhow::Result> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/gguf_locally/main.rs b/mistralrs/examples/gguf_locally/main.rs index 93e0d55e5..bc0456b72 100644 --- a/mistralrs/examples/gguf_locally/main.rs +++ b/mistralrs/examples/gguf_locally/main.rs @@ -4,9 +4,9 @@ use std::sync::Arc; use tokio::sync::mpsc::channel; use mistralrs::{ - Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, GGUFLoaderBuilder, - GGUFSpecificConfig, MistralRs, MistralRsBuilder, ModelDType, NormalRequest, Request, - RequestMessage, Response, Result, SamplingParams, SchedulerConfig, TokenSource, + Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, GGUFLoaderBuilder, MistralRs, + MistralRsBuilder, ModelDType, NormalRequest, Request, RequestMessage, Response, Result, + SamplingParams, SchedulerConfig, TokenSource, }; /// Gets the best device, cpu, cuda if compiled with CUDA @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result> { // chat template from the specified file, and the tokenizer and model from a // local GGUF file at the path `.` let loader = GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n: 64 }, Some("chat_templates/mistral.json".to_string()), None, ".".to_string(), diff --git a/mistralrs/examples/grammar/main.rs b/mistralrs/examples/grammar/main.rs index c864c69f0..33e109588 100644 --- a/mistralrs/examples/grammar/main.rs +++ b/mistralrs/examples/grammar/main.rs @@ -26,7 +26,6 @@ fn setup() -> anyhow::Result> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/idefics2/main.rs b/mistralrs/examples/idefics2/main.rs index a5f700837..424bc65e4 100644 --- a/mistralrs/examples/idefics2/main.rs +++ 
b/mistralrs/examples/idefics2/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = VisionLoaderBuilder::new( VisionSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/isq/main.rs b/mistralrs/examples/isq/main.rs index 3d308f6a0..26e44a5a7 100644 --- a/mistralrs/examples/isq/main.rs +++ b/mistralrs/examples/isq/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/llava/main.rs b/mistralrs/examples/llava/main.rs index 6300c216a..d5a102edd 100644 --- a/mistralrs/examples/llava/main.rs +++ b/mistralrs/examples/llava/main.rs @@ -14,7 +14,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = VisionLoaderBuilder::new( VisionSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, Some("chat_templates/vicuna.json".to_string()), None, diff --git a/mistralrs/examples/llava_next/main.rs b/mistralrs/examples/llava_next/main.rs index 5b190d855..7cea0b625 100644 --- a/mistralrs/examples/llava_next/main.rs +++ b/mistralrs/examples/llava_next/main.rs @@ -15,7 +15,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = VisionLoaderBuilder::new( VisionSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/lora/main.rs b/mistralrs/examples/lora/main.rs index 96a700ae3..6007a9038 100644 --- a/mistralrs/examples/lora/main.rs +++ b/mistralrs/examples/lora/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/lora_activation/main.rs b/mistralrs/examples/lora_activation/main.rs index d5e01e5ba..72988b66d 100644 --- a/mistralrs/examples/lora_activation/main.rs +++ b/mistralrs/examples/lora_activation/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/paged_attn/main.rs b/mistralrs/examples/paged_attn/main.rs index dafb8e0e9..a6fb014a1 100644 --- a/mistralrs/examples/paged_attn/main.rs +++ b/mistralrs/examples/paged_attn/main.rs @@ -33,7 +33,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/phi3v/main.rs b/mistralrs/examples/phi3v/main.rs index a6edc0c5d..4623ef63c 100644 --- a/mistralrs/examples/phi3v/main.rs +++ b/mistralrs/examples/phi3v/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = VisionLoaderBuilder::new( VisionSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/quantized/main.rs b/mistralrs/examples/quantized/main.rs index eb2e1185d..594625f1e 100644 --- a/mistralrs/examples/quantized/main.rs +++ b/mistralrs/examples/quantized/main.rs @@ -4,9 +4,9 @@ use std::sync::Arc; use tokio::sync::mpsc::channel; use mistralrs::{ - Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, GGUFLoaderBuilder, - GGUFSpecificConfig, MistralRs, MistralRsBuilder, ModelDType, NormalRequest, Request, - RequestMessage, Response, Result, SamplingParams, SchedulerConfig, TokenSource, + Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, GGUFLoaderBuilder, MistralRs, + MistralRsBuilder, ModelDType, NormalRequest, Request, RequestMessage,
Response, Result, + SamplingParams, SchedulerConfig, TokenSource, }; /// Gets the best device, cpu, cuda if compiled with CUDA @@ -25,7 +25,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { // Select a Mistral model // This uses a model, tokenizer, and chat template, from HF hub. let loader = GGUFLoaderBuilder::new( - GGUFSpecificConfig { repeat_last_n: 64 }, None, Some("mistralai/Mistral-7B-Instruct-v0.1".to_string()), "TheBloke/Mistral-7B-Instruct-v0.1-GGUF".to_string(), diff --git a/mistralrs/examples/simple/main.rs b/mistralrs/examples/simple/main.rs index 610e14825..afdb29dc3 100644 --- a/mistralrs/examples/simple/main.rs +++ b/mistralrs/examples/simple/main.rs @@ -26,7 +26,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None, diff --git a/mistralrs/examples/xlora/main.rs b/mistralrs/examples/xlora/main.rs index 22e5ef1af..d2ed716bb 100644 --- a/mistralrs/examples/xlora/main.rs +++ b/mistralrs/examples/xlora/main.rs @@ -27,7 +27,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { NormalLoaderBuilder::new( NormalSpecificConfig { use_flash_attn: false, - repeat_last_n: 64, }, None, None,
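For orientation, a minimal sketch of what a GGUF caller writes after this patch: GGUFLoaderBuilder::new no longer takes a GGUFSpecificConfig (the struct is deleted), and Sampler::new loses its logits_bias argument, with the repeat-penalty context now derived from the sequence itself. The tokenizer repo and the GGUF filename below are illustrative placeholders, not values taken from this diff:

// Sketch only: assumes the mistralrs crate re-exports GGUFLoaderBuilder and
// Loader as shown in the import hunks above; IDs/filename are placeholders.
use mistralrs::{GGUFLoaderBuilder, Loader};

fn build_gguf_loader() -> Box<dyn Loader> {
    // No leading GGUFSpecificConfig { repeat_last_n: ... } argument anymore:
    GGUFLoaderBuilder::new(
        None, // chat_template: fall back to the tokenizer's own template
        Some("mistralai/Mistral-7B-Instruct-v0.1".to_string()), // tok_model_id
        "TheBloke/Mistral-7B-Instruct-v0.1-GGUF".to_string(), // quantized_model_id
        "mistral-7b-instruct-v0.1.Q4_K_M.gguf".to_string(), // quantized_filename (placeholder)
    )
    .build()
}

The returned Box<dyn Loader> is then loaded and wrapped via MistralRsBuilder exactly as in the examples above.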