Major llama.cpp API Change #185

Merged: 12 commits, Oct 18, 2023

LLama.Examples/NewVersion/LoadAndSaveSession.cs (4 changes: 2 additions & 2 deletions)

@@ -8,7 +8,7 @@ public static async Task Run()
{
Console.Write("Please input your model path: ");
var modelPath = Console.ReadLine();
- var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
+ var prompt = (await File.ReadAllTextAsync("Assets/chat-with-bob.txt")).Trim();

var parameters = new ModelParams(modelPath)
{
@@ -50,7 +50,7 @@ public static async Task Run()
Console.ForegroundColor = ConsoleColor.White;

ex.Context.Dispose();
- ex = new(new LLamaContext(parameters));
+ ex = new(new LLamaContext(model, parameters));
session = new ChatSession(ex);
session.LoadSession(statePath);

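These example changes all follow the new two-step loading pattern introduced by this PR: load the weights once as a LLamaWeights, then build contexts and executors on top of them. A minimal sketch of the pattern (the model path is a placeholder):

```csharp
using LLama;
using LLama.Common;

var parameters = new ModelParams("path/to/model.gguf"); // placeholder path

// Load the (large) weights once...
using var model = LLamaWeights.LoadFromFile(parameters);

// ...then create comparatively cheap contexts over those weights,
// either directly or via model.CreateContext(parameters).
using var context = new LLamaContext(model, parameters);
var session = new ChatSession(new InteractiveExecutor(context));
```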

LLama.Examples/NewVersion/SemanticKernelChat.cs (10 changes: 2 additions & 8 deletions)

@@ -1,13 +1,7 @@
- using System.Reflection.Metadata;
- using System.Security.Cryptography;
- using System.Text;
- using LLama.Abstractions;
+ using System.Security.Cryptography;
using LLama.Common;
- using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.AI.ChatCompletion;
- using Microsoft.SemanticKernel.AI.TextCompletion;
using LLamaSharp.SemanticKernel.ChatCompletion;
- using LLamaSharp.SemanticKernel.TextCompletion;

namespace LLama.Examples.NewVersion
{
@@ -22,7 +16,7 @@ public static async Task Run()
// Load weights into memory
var parameters = new ModelParams(modelPath)
{
- Seed = RandomNumberGenerator.GetInt32(int.MaxValue),
+ Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue)),
};
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
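The Seed edits here and in the next few examples are the same migration: Seed is now unsigned. The cast is safe because RandomNumberGenerator.GetInt32(int.MaxValue) returns a value in [0, int.MaxValue), so it is never negative; unchecked() simply states that no overflow check is wanted:

```csharp
using System.Security.Cryptography;

// Always non-negative, so the conversion to uint cannot wrap.
uint seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue));
```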

LLama.Examples/NewVersion/SemanticKernelMemory.cs (2 changes: 1 addition & 1 deletion)

@@ -22,7 +22,7 @@ public static async Task Run()
Console.Write("Please input your model path: ");
var modelPath = Console.ReadLine();

- var seed = 1337;
+ var seed = 1337u;
// Load weights into memory
var parameters = new ModelParams(modelPath)
{

LLama.Examples/NewVersion/SemanticKernelPrompt.cs (2 changes: 1 addition & 1 deletion)

@@ -21,7 +21,7 @@ public static async Task Run()
// Load weights into memory
var parameters = new ModelParams(modelPath)
{
- Seed = RandomNumberGenerator.GetInt32(int.MaxValue),
+ Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue))
};
using var model = LLamaWeights.LoadFromFile(parameters);
var ex = new StatelessExecutor(model, parameters);

LLama.Examples/NewVersion/TalkToYourself.cs (2 changes: 1 addition & 1 deletion)

@@ -15,7 +15,7 @@ public static async Task Run()
// Load weights into memory
var @params = new ModelParams(modelPath)
{
- Seed = RandomNumberGenerator.GetInt32(int.MaxValue)
+ Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue))
};
using var weights = LLamaWeights.LoadFromFile(@params);


LLama.Examples/Program.cs (3 changes: 2 additions & 1 deletion)

@@ -1,4 +1,5 @@
using LLama.Examples.NewVersion;
+ using LLama.Native;

Console.WriteLine("======================================================================================================");

@@ -7,7 +8,7 @@
Console.WriteLine("======================================================================================================");


-
+ NativeApi.llama_empty_call();
Console.WriteLine();

await NewVersionTestRunner.Run();
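llama_empty_call is, as the name suggests, a do-nothing native call, so this line appears to be a warm-up: it forces the native llama.cpp library to load before any example runs, surfacing a missing or incompatible binary immediately. A sketch of the same trick in another startup path:

```csharp
using LLama.Native;

// Touch the native library once at startup so a bad llama.cpp binary
// fails fast here rather than in the middle of the first inference.
NativeApi.llama_empty_call();
Console.WriteLine("native library loaded");
```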

LLama.Unittest/BasicTest.cs (30 changes: 1 addition & 29 deletions)

@@ -27,36 +27,8 @@ public void Dispose()
public void BasicModelProperties()
{
Assert.Equal(32000, _model.VocabCount);
- Assert.Equal(2048, _model.ContextSize);
+ Assert.Equal(4096, _model.ContextSize);
Assert.Equal(4096, _model.EmbeddingSize);
- Assert.Equal(Encoding.UTF8, _model.Encoding);
}
-
- [Fact]
- public void CloneContext()
- {
- var original = _model.CreateContext(_params);
-
- // Evaluate something (doesn't matter what, as long as it begins with token 1)
- original.Eval(new[] { 1, 42, 321 }, 0);
-
- // Clone current state
- var clone = original.Clone();
-
- // Now evaluate something more
- var reply1a = original.Eval(new[] { 4, 5, 6 }, 3);
- var reply2a = original.Eval(new[] { 7, 8, 9 }, 6);
-
- // Assert that the context replied differently each time
- Assert.NotEqual(reply1a, reply2a);
-
- // Give the same prompts to the cloned state
- var reply1b = clone.Eval(new[] { 4, 5, 6 }, 3);
- var reply2b = clone.Eval(new[] { 7, 8, 9 }, 6);
-
- // Assert that the cloned context replied in the same way as originally
- Assert.Equal(reply1a, reply1b);
- Assert.Equal(reply2a, reply2b);
- }
}
}

LLama.Unittest/LLamaContextTests.cs (3 changes: 1 addition & 2 deletions)

@@ -2,7 +2,7 @@

namespace LLama.Unittest
{
- public class LLamaContextTests
+ public sealed class LLamaContextTests
: IDisposable
{
private readonly LLamaWeights _weights;
@@ -30,7 +30,6 @@ public void CheckProperties()
Assert.Equal(768, _context.ContextSize);
Assert.Equal(4096, _context.EmbeddingSize);
Assert.Equal(32000, _context.VocabCount);
- Assert.Equal(0, _context.KVCacheTokenCount);
}

[Fact]

LLama.Unittest/ModelsParamsTests.cs (9 changes: 6 additions & 3 deletions)

@@ -13,7 +13,6 @@ public void SerializeRoundTripSystemTextJson()
{
BatchSize = 17,
ContextSize = 42,
- LoraAdapter = "adapter",
Seed = 42,
GpuLayerCount = 111
};
@@ -31,9 +30,13 @@ public void SerializeRoundTripNewtonsoft()
{
BatchSize = 17,
ContextSize = 42,
- LoraAdapter = "adapter",
Seed = 42,
- GpuLayerCount = 111
+ GpuLayerCount = 111,
+ LoraAdapters =
+ {
+ new("abc", 1),
+ new("def", 0)
+ }
};

var settings = new Newtonsoft.Json.JsonSerializerSettings();

LLama.Unittest/StatelessExecutorTest.cs (6 changes: 3 additions & 3 deletions)

@@ -16,7 +16,7 @@ public StatelessExecutorTest(ITestOutputHelper testOutputHelper)
_params = new ModelParams(Constants.ModelPath)
{
ContextSize = 60,
- Seed = 1754
+ Seed = 1754,
};
_weights = LLamaWeights.LoadFromFile(_params);
}
@@ -48,13 +48,13 @@ public async Task OutOfContext()
{
var executor = new StatelessExecutor(_weights, _params);

- const string question = " Question. why is a cat the best pet?\nAnswer: ";
+ const string question = " Question. cats or dogs?\nAnswer: ";

// The context size is set to 60. Generate more than that, forcing it to generate a coherent response
// with a modified context
var @params = new InferenceParams()
{
- MaxTokens = 100,
+ MaxTokens = 65,
TokensKeep = question.Length,
};


LLama.Unittest/TokenTests.cs (8 changes: 4 additions & 4 deletions)

@@ -27,7 +27,7 @@ public void Dispose()
[Fact]
public void TokensEndWith()
{
- var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, Encoding.UTF8);
+ var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, true, Encoding.UTF8);

var result = tokens.TokensEndsWithAnyString(new[]
{
@@ -41,7 +41,7 @@ public void TokensEndWith()
[Fact]
public void TokensEndSubstring()
{
- var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, Encoding.UTF8);
+ var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, true, Encoding.UTF8);

var result = tokens.TokensEndsWithAnyString((IList<string>)new[]
{
@@ -53,7 +53,7 @@ public void TokensEndSubstring()
[Fact]
public void TokensNotEndWith()
{
- var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, Encoding.UTF8);
+ var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, true, Encoding.UTF8);

var result = tokens.TokensEndsWithAnyString((IList<string>)new[]
{
@@ -67,7 +67,7 @@ public void TokensNotEndWith()
[Fact]
public void TokensNotEndWithNothing()
{
- var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, Encoding.UTF8);
+ var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, true, Encoding.UTF8);

var result = tokens.TokensEndsWithAnyString((IList<string>)Array.Empty<string>(), _model.NativeHandle, Encoding.UTF8);
Assert.False(result);
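Each Tokenize call site gains a third boolean. Judging from the call shape and the llama.cpp tokenizer it wraps, the arguments are the text, an add-BOS flag, a new special-tokens flag, and the encoding; the parameter comments below are an inference from these tests, not names confirmed by this diff:

```csharp
using System.Text;

var tokens = model.NativeHandle.Tokenize(
    "The cat sat on the edge of the mat",
    false,         // presumably add_bos: do not prepend the BOS token
    true,          // presumably special: allow special/control tokens (new in this PR)
    Encoding.UTF8);
```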

LLama.Web/Common/ModelOptions.cs (48 changes: 29 additions & 19 deletions)

@@ -4,7 +4,7 @@
namespace LLama.Web.Common
{
public class ModelOptions
- : IModelParams
+ : ILLamaParams
{

public string Name { get; set; }
@@ -14,7 +14,7 @@ public class ModelOptions
/// <summary>
/// Model context size (n_ctx)
/// </summary>
- public int ContextSize { get; set; } = 512;
+ public uint ContextSize { get; set; } = 512;
/// <summary>
/// the GPU that is used for scratch and small tensors
/// </summary>
@@ -30,7 +30,7 @@ public class ModelOptions
/// <summary>
/// Seed for the random number generator (seed)
/// </summary>
- public int Seed { get; set; } = 1686349486;
+ public uint Seed { get; set; } = 1686349486;
/// <summary>
/// Use f16 instead of f32 for memory kv (memory_f16)
/// </summary>
@@ -51,26 +51,31 @@
/// Model path (model)
/// </summary>
public string ModelPath { get; set; }

- /// <summary>
- /// model alias
- /// </summary>
- public string ModelAlias { get; set; } = "unknown";
- /// <summary>
- /// lora adapter path (lora_adapter)
- /// </summary>
- public string LoraAdapter { get; set; } = string.Empty;
- /// <summary>
- /// base model path for the lora adapter (lora_base)
- /// </summary>
- public string LoraBase { get; set; } = string.Empty;
/// <summary>
- /// Number of threads (-1 = autodetect) (n_threads)
+ /// List of LoRAs to apply
/// </summary>
- public int Threads { get; set; } = Math.Max(Environment.ProcessorCount / 2, 1);
+ public AdapterCollection LoraAdapters { get; set; } = new();
+
+ /// <summary>
+ /// base model path for the lora adapter (lora_base)
+ /// </summary>
+ public string LoraBase { get; set; } = string.Empty;
+
/// <summary>
- /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
+ /// Number of threads (null = autodetect) (n_threads)
/// </summary>
- public int BatchSize { get; set; } = 512;
+ public uint? Threads { get; set; }
+
+ /// <summary>
+ /// Number of threads to use for batch processing (null = autodetect) (n_threads)
+ /// </summary>
+ public uint? BatchThreads { get; set; }
+
+ /// <summary>
+ /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
+ /// </summary>
+ public uint BatchSize { get; set; } = 512;
+
/// <summary>
/// Whether to convert eos to newline during the inference.
@@ -107,5 +112,10 @@ public class ModelOptions
/// The encoding to use for models
/// </summary>
public Encoding Encoding { get; set; } = Encoding.UTF8;
+
+ /// <summary>
+ /// Load vocab only (no weights)
+ /// </summary>
+ public bool VocabOnly { get; set; }
}
}
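Taken together, ModelOptions now implements ILLamaParams with unsigned sizes, nullable thread counts (null meaning autodetect), and a LoRA collection replacing the single LoraAdapter string. A configuration sketch; the paths are placeholders and the (path, scale) adapter constructor is inferred from the tests above:

```csharp
var options = new ModelOptions
{
    Name = "example",                  // placeholder
    ModelPath = "models/example.gguf", // placeholder
    ContextSize = 2048,
    Seed = 42,
    Threads = null,                    // autodetect
    BatchThreads = null,               // autodetect
    BatchSize = 512,
    LoraAdapters = { new("adapters/example.bin", 0.8f) },
    VocabOnly = false,
};
```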

LLama.Web/Services/ConnectionSessionService.cs (12 changes: 6 additions & 6 deletions)

@@ -3,7 +3,6 @@
using LLama.Web.Models;
using Microsoft.Extensions.Options;
using System.Collections.Concurrent;
- using System.Drawing;

namespace LLama.Web.Services
{
@@ -50,15 +49,16 @@ public Task<IServiceResult<ModelSession>> CreateAsync(LLamaExecutorType executor
if (modelOption.MaxInstances > -1 && currentInstances >= modelOption.MaxInstances)
return Task.FromResult(ServiceResult.FromError<ModelSession>("Maximum model instances reached"));

- // Create model
- var llamaModel = new LLamaContext(modelOption);
+ // Load weights
+ // todo: it would be better to have a central service which loads weights and shares them between all contexts that need them!
+ using var weights = LLamaWeights.LoadFromFile(modelOption);

// Create executor
ILLamaExecutor executor = executorType switch
{
- LLamaExecutorType.Interactive => new InteractiveExecutor(llamaModel),
- LLamaExecutorType.Instruct => new InstructExecutor(llamaModel),
- LLamaExecutorType.Stateless => new StatelessExecutor(llamaModel),
+ LLamaExecutorType.Interactive => new InteractiveExecutor(new LLamaContext(weights, modelOption)), //todo: properly dispose of LLamaContext
+ LLamaExecutorType.Instruct => new InstructExecutor(new LLamaContext(weights, modelOption)), //todo: properly dispose of LLamaContext
+ LLamaExecutorType.Stateless => new StatelessExecutor(weights, modelOption),
_ => default
};

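The todo comment above asks for a central service that loads weights once and shares them between contexts. One possible shape, as a hypothetical sketch (the class and method names here are invented, not part of this PR):

```csharp
using System.Collections.Concurrent;
using LLama;
using LLama.Abstractions;

public sealed class SharedWeightsService : IDisposable
{
    // One LLamaWeights per model file, shared by every session that needs it.
    private readonly ConcurrentDictionary<string, LLamaWeights> _cache = new();

    public LLamaWeights GetOrLoad(IModelParams parameters)
        => _cache.GetOrAdd(parameters.ModelPath, _ => LLamaWeights.LoadFromFile(parameters));

    public void Dispose()
    {
        foreach (var weights in _cache.Values)
            weights.Dispose();
        _cache.Clear();
    }
}
```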

LLama.WebAPI/Services/StatefulChatService.cs (11 changes: 8 additions & 3 deletions)

@@ -16,10 +16,15 @@ public class StatefulChatService : IDisposable

public StatefulChatService(IConfiguration configuration)
{
- _context = new LLamaContext(new Common.ModelParams(configuration["ModelPath"])
+ var @params = new Common.ModelParams(configuration["ModelPath"])
{
- ContextSize = 512
- });
+ ContextSize = 512,
+ };
+
+ // todo: share weights from a central service
+ using var weights = LLamaWeights.LoadFromFile(@params);
+
+ _context = new LLamaContext(weights, @params);
_session = new ChatSession(new InteractiveExecutor(_context));
}


LLama.WebAPI/Services/StatelessChatService.cs (10 changes: 8 additions & 2 deletions)

@@ -12,10 +12,16 @@ public class StatelessChatService

public StatelessChatService(IConfiguration configuration)
{
- _context = new LLamaContext(new ModelParams(configuration["ModelPath"])
+ var @params = new Common.ModelParams(configuration["ModelPath"])
{
ContextSize = 512,
- });
+ };
+
+ // todo: share weights from a central service
+ using var weights = LLamaWeights.LoadFromFile(@params);
+
+ _context = new LLamaContext(weights, @params);
+
// TODO: replace with a stateless executor
_session = new ChatSession(new InteractiveExecutor(_context))
.WithOutputTransform(new LLamaTransforms.KeywordTextOutputStreamTransform(new string[] { "User:", "Assistant:" }, redundancyLength: 8))