Updated Binaries December 2023 #361

Merged 9 commits on Dec 15, 2023
2 changes: 2 additions & 0 deletions LLama.Examples/Examples/BatchedDecoding.cs
@@ -1,5 +1,6 @@
 using System.Diagnostics;
 using System.Text;
+using LLama.Abstractions;
 using LLama.Common;
 using LLama.Native;

@@ -30,6 +31,7 @@ public static async Task Run()

 // Load model
 var parameters = new ModelParams(modelPath);
+
 using var model = LLamaWeights.LoadFromFile(parameters);

 // Tokenize prompt
2 changes: 1 addition & 1 deletion LLama.Examples/LLama.Examples.csproj
@@ -2,7 +2,7 @@
 <Import Project="..\LLama\LLamaSharp.Runtime.targets" />
 <PropertyGroup>
   <OutputType>Exe</OutputType>
-  <TargetFrameworks>net6.0;net7.0;net8.0</TargetFrameworks>
+  <TargetFrameworks>net6.0;net8.0</TargetFrameworks>
   <ImplicitUsings>enable</ImplicitUsings>
   <Nullable>enable</Nullable>
   <Platforms>AnyCPU;x64</Platforms>
5 changes: 4 additions & 1 deletion LLama.Examples/Program.cs
@@ -7,7 +7,10 @@

 Console.WriteLine("======================================================================================================");

-NativeLibraryConfig.Instance.WithCuda().WithLogs();
+NativeLibraryConfig
+    .Instance
+    .WithCuda()
+    .WithLogs();

 NativeApi.llama_empty_call();
 Console.WriteLine();
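A note on ordering here, since it matters: as I understand it, `NativeLibraryConfig` must be fully configured before the first native call, because the settings are read once when the native library is loaded. A minimal sketch under that assumption, using only the calls shown in this diff:

    // Sketch: configure the loader, then force the native library to load.
    // Assumption: NativeLibraryConfig settings are locked in at first native call.
    NativeLibraryConfig
        .Instance
        .WithCuda()   // prefer the CUDA backend when available
        .WithLogs();  // emit native llama.cpp log output

    NativeApi.llama_empty_call(); // does no work; forces the library to load now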
55 changes: 21 additions & 34 deletions LLama.Unittest/ModelsParamsTests.cs
@@ -1,5 +1,6 @@
 using LLama.Common;
 using System.Text.Json;
+using LLama.Abstractions;

 namespace LLama.Unittest
 {
@@ -8,59 +9,45 @@ public class ModelsParamsTests
     [Fact]
     public void SerializeRoundTripSystemTextJson()
     {
+        var options = new JsonSerializerOptions()
+        {
+            WriteIndented = true,
+        };
+
         var expected = new ModelParams("abc/123")
         {
             BatchSize = 17,
             ContextSize = 42,
             Seed = 42,
             GpuLayerCount = 111,
-            TensorSplits = { [0] = 3 }
+            TensorSplits = { [0] = 3 },
+            MetadataOverrides =
+            {
+                new MetadataOverride("hello", true),
+                new MetadataOverride("world", 17),
+                new MetadataOverride("cats", 17f),
+            }
         };

-        var json = JsonSerializer.Serialize(expected);
-        var actual = JsonSerializer.Deserialize<ModelParams>(json)!;
+        var json = JsonSerializer.Serialize(expected, options);
+        var actual = JsonSerializer.Deserialize<ModelParams>(json, options)!;

         // Cannot compare splits with default equality, check they are sequence equal and then set to null
-        Assert.Equal((IEnumerable<float>)expected.TensorSplits, expected.TensorSplits);
+        Assert.True(expected.TensorSplits.SequenceEqual(actual.TensorSplits));
         actual.TensorSplits = null!;
         expected.TensorSplits = null!;

+        // Cannot compare overrides with default equality, check they are sequence equal and then set to null
+        Assert.True(expected.MetadataOverrides.SequenceEqual(actual.MetadataOverrides));
+        actual.MetadataOverrides = null!;
+        expected.MetadataOverrides = null!;
+
+        // Check encoding is the same
+        var b1 = expected.Encoding.GetBytes("Hello");
+        var b2 = actual.Encoding.GetBytes("Hello");
+        Assert.True(b1.SequenceEqual(b2));
+
         Assert.Equal(expected, actual);
     }
-
-        //[Fact]
-        //public void SerializeRoundTripNewtonsoft()
-        //{
-        //    var expected = new ModelParams("abc/123")
-        //    {
-        //        BatchSize = 17,
-        //        ContextSize = 42,
-        //        Seed = 42,
-        //        GpuLayerCount = 111,
-        //        LoraAdapters =
-        //        {
-        //            new("abc", 1),
-        //            new("def", 0)
-        //        },
-        //        TensorSplits = { [0] = 3 }
-        //    };
-
-        //    var settings = new Newtonsoft.Json.JsonSerializerSettings();
-
-        //    var json = Newtonsoft.Json.JsonConvert.SerializeObject(expected, settings);
-        //    var actual = Newtonsoft.Json.JsonConvert.DeserializeObject<ModelParams>(json, settings)!;
-
-        //    // Cannot compare splits with default equality, check they are sequence equal and then set to null
-        //    Assert.Equal((IEnumerable<float>)expected.TensorSplits, expected.TensorSplits);
-        //    actual.TensorSplits = null!;
-        //    expected.TensorSplits = null!;
-
-        //    Assert.Equal(expected, actual);
-        //}
     }
 }
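The "cannot compare with default equality" comments are worth a gloss: `TensorSplits` and `MetadataOverrides` are collection-typed properties, and .NET collections compare by reference, so a deserialized copy is never `Equals` to the original even when the contents match. Hence the explicit `SequenceEqual` checks before the properties are nulled out and the rest of the object is compared. A tiny, runnable illustration of the underlying behaviour:

    using System;
    using System.Collections.Generic;
    using System.Linq;

    var a = new List<float> { 3f };
    var b = new List<float> { 3f };

    Console.WriteLine(a.Equals(b));        // False - List<T> compares by reference
    Console.WriteLine(a.SequenceEqual(b)); // True  - element-wise comparison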
109 changes: 29 additions & 80 deletions LLama.Web/Common/ModelOptions.cs
@@ -17,106 +17,55 @@ public class ModelOptions
 /// </summary>
 public int MaxInstances { get; set; }

-/// <summary>
-/// Model context size (n_ctx)
-/// </summary>
+/// <inheritdoc />
 public uint? ContextSize { get; set; }

-/// <summary>
-/// the GPU that is used for scratch and small tensors
-/// </summary>
+/// <inheritdoc />
 public int MainGpu { get; set; } = 0;

-/// <summary>
-/// if true, reduce VRAM usage at the cost of performance
-/// </summary>
-public bool LowVram { get; set; } = false;
-
-/// <summary>
-/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
-/// </summary>
+/// <inheritdoc />
 public int GpuLayerCount { get; set; } = 20;

-/// <summary>
-/// Seed for the random number generator (seed)
-/// </summary>
+/// <inheritdoc />
 public uint Seed { get; set; } = 1686349486;

-/// <summary>
-/// Use f16 instead of f32 for memory kv (memory_f16)
-/// </summary>
-public bool UseFp16Memory { get; set; } = true;
-
-/// <summary>
-/// Use mmap for faster loads (use_mmap)
-/// </summary>
+/// <inheritdoc />
 public bool UseMemorymap { get; set; } = true;

-/// <summary>
-/// Use mlock to keep model in memory (use_mlock)
-/// </summary>
+/// <inheritdoc />
 public bool UseMemoryLock { get; set; } = false;

-/// <summary>
-/// Compute perplexity over the prompt (perplexity)
-/// </summary>
-public bool Perplexity { get; set; } = false;
-
-/// <summary>
-/// Model path (model)
-/// </summary>
+/// <inheritdoc />
 public string ModelPath { get; set; }

-/// <summary>
-/// List of LoRAs to apply
-/// </summary>
+/// <inheritdoc />
 public AdapterCollection LoraAdapters { get; set; } = new();

-/// <summary>
-/// base model path for the lora adapter (lora_base)
-/// </summary>
+/// <inheritdoc />
 public string LoraBase { get; set; } = string.Empty;

-/// <summary>
-/// Number of threads (null = autodetect) (n_threads)
-/// </summary>
+/// <inheritdoc />
 public uint? Threads { get; set; }

-/// <summary>
-/// Number of threads to use for batch processing (null = autodetect) (n_threads)
-/// </summary>
+/// <inheritdoc />
 public uint? BatchThreads { get; set; }

-/// <summary>
-/// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
-/// </summary>
+/// <inheritdoc />
 public uint BatchSize { get; set; } = 512;

-/// <summary>
-/// Whether to convert eos to newline during the inference.
-/// </summary>
-public bool ConvertEosToNewLine { get; set; } = false;
-
-/// <summary>
-/// Whether to use embedding mode. (embedding) Note that if this is set to true,
-/// The LLamaModel won't produce text response anymore.
-/// </summary>
+/// <inheritdoc />
 public bool EmbeddingMode { get; set; } = false;

-/// <summary>
-/// how split tensors should be distributed across GPUs
-/// </summary>
+/// <inheritdoc />
 public TensorSplitsCollection TensorSplits { get; set; } = new();

-/// <summary>
-/// RoPE base frequency
-/// </summary>
+/// <inheritdoc />
+public List<MetadataOverride> MetadataOverrides { get; } = new();
+
+/// <inheritdoc />
 public float? RopeFrequencyBase { get; set; }

-/// <summary>
-/// RoPE frequency scaling factor
-/// </summary>
+/// <inheritdoc />
 public float? RopeFrequencyScale { get; set; }

 /// <inheritdoc />
@@ -137,19 +86,19 @@ public class ModelOptions
 /// <inheritdoc />
 public RopeScalingType? YarnScalingType { get; set; }

-/// <summary>
-/// Use experimental mul_mat_q kernels
-/// </summary>
-public bool MulMatQ { get; set; }
+/// <inheritdoc />
+public GGMLType? TypeK { get; set; }

-/// <summary>
-/// The encoding to use for models
-/// </summary>
+/// <inheritdoc />
+public GGMLType? TypeV { get; set; }
+
+/// <inheritdoc />
+public bool NoKqvOffload { get; set; }
+
+/// <inheritdoc />
 public Encoding Encoding { get; set; } = Encoding.UTF8;

-/// <summary>
-/// Load vocab only (no weights)
-/// </summary>
+/// <inheritdoc />
 public bool VocabOnly { get; set; }
 }
}
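For anyone unfamiliar with the pattern used throughout this file: `/// <inheritdoc />` makes the XML documentation tooling pull a member's docs from the corresponding base/interface member, so the summaries now live in one place (presumably the params interfaces that `ModelOptions` implements) and can't drift out of sync with duplicated copies here. A minimal sketch of the mechanism, with hypothetical names rather than the real interfaces:

    /// <summary>Canonical home of the documentation.</summary>
    public interface IExampleParams
    {
        /// <summary>Model context size (n_ctx)</summary>
        uint? ContextSize { get; set; }
    }

    public class ExampleOptions : IExampleParams
    {
        // Docs are inherited from IExampleParams.ContextSize,
        // so the summary is maintained only on the interface.
        /// <inheritdoc />
        public uint? ContextSize { get; set; }
    }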
40 changes: 20 additions & 20 deletions LLama/Abstractions/IContextParams.cs
@@ -23,16 +23,6 @@ public interface IContextParams
 /// </summary>
 uint Seed { get; set; }

-/// <summary>
-/// Use f16 instead of f32 for memory kv (memory_f16)
-/// </summary>
-bool UseFp16Memory { get; set; }
-
-/// <summary>
-/// Compute perplexity over the prompt (perplexity)
-/// </summary>
-bool Perplexity { get; set; }
-
 /// <summary>
 /// Whether to use embedding mode. (embedding) Note that if this is set to true,
 /// The LLamaModel won't produce text response anymore.
@@ -49,11 +39,6 @@ public interface IContextParams
 /// </summary>
 float? RopeFrequencyScale { get; set; }

-/// <summary>
-/// Use experimental mul_mat_q kernels
-/// </summary>
-bool MulMatQ { get; set; }
-
 /// <summary>
 /// The encoding to use for models
 /// </summary>
@@ -70,32 +55,47 @@ public interface IContextParams
 uint? BatchThreads { get; set; }

 /// <summary>
-/// YaRN extrapolation mix factor
+/// YaRN extrapolation mix factor (null = from model)
 /// </summary>
 float? YarnExtrapolationFactor { get; set; }

 /// <summary>
-/// YaRN magnitude scaling factor
+/// YaRN magnitude scaling factor (null = from model)
 /// </summary>
 float? YarnAttentionFactor { get; set; }

 /// <summary>
-/// YaRN low correction dim
+/// YaRN low correction dim (null = from model)
 /// </summary>
 float? YarnBetaFast { get; set; }

 /// <summary>
-/// YaRN high correction dim
+/// YaRN high correction dim (null = from model)
 /// </summary>
 float? YarnBetaSlow { get; set; }

 /// <summary>
-/// YaRN original context length
+/// YaRN original context length (null = from model)
 /// </summary>
 uint? YarnOriginalContext { get; set; }

 /// <summary>
 /// YaRN scaling method to use.
 /// </summary>
 RopeScalingType? YarnScalingType { get; set; }
+
+/// <summary>
+/// Override the type of the K cache
+/// </summary>
+GGMLType? TypeK { get; set; }
+
+/// <summary>
+/// Override the type of the V cache
+/// </summary>
+GGMLType? TypeV { get; set; }
+
+/// <summary>
+/// Whether to disable offloading the KQV cache to the GPU
+/// </summary>
+bool NoKqvOffload { get; set; }
 }
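The three new members round out context configuration: `TypeK`/`TypeV` override the GGML tensor type used for the K and V halves of the KV cache (for example, to quantize it and reduce memory use), and `NoKqvOffload` keeps the KQV cache off the GPU. A hedged sketch of how a caller might use them, assuming `ModelParams` implements `IContextParams`; the enum member name below is an assumption, not verified against the real `GGMLType` definition:

    // Sketch: store the KV cache as f16 and keep KQV off the GPU.
    // Assumptions: ModelParams implements IContextParams, and GGMLType
    // exposes an f16 member with the name used here (hypothetical).
    var parameters = new ModelParams("path/to/model.gguf")
    {
        ContextSize = 4096,
        TypeK = GGMLType.GGML_TYPE_F16, // K cache element type
        TypeV = GGMLType.GGML_TYPE_F16, // V cache element type
        NoKqvOffload = true,            // disable KQV cache offload to the GPU
    };

    using var model = LLamaWeights.LoadFromFile(parameters);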