Major llama.cpp API Change #185

Merged: 12 commits, Oct 18, 2023

LLama.Examples/NewVersion/LoadAndSaveSession.cs (4 changes: 2 additions & 2 deletions)

@@ -8,7 +8,7 @@ public static async Task Run()
{
Console.Write("Please input your model path: ");
var modelPath = Console.ReadLine();
- var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
+ var prompt = (await File.ReadAllTextAsync("Assets/chat-with-bob.txt")).Trim();

var parameters = new ModelParams(modelPath)
{
@@ -50,7 +50,7 @@ public static async Task Run()
Console.ForegroundColor = ConsoleColor.White;

ex.Context.Dispose();
- ex = new(new LLamaContext(parameters));
+ ex = new(new LLamaContext(model, parameters));
session = new ChatSession(ex);
session.LoadSession(statePath);

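These example changes all follow the new two-step loading pattern introduced by this PR: load the weights once as a LLamaWeights, then build contexts and executors on top of them. A minimal sketch of the pattern (the model path is a placeholder):

```csharp
using LLama;
using LLama.Common;

var parameters = new ModelParams("path/to/model.gguf"); // placeholder path

// Load the (large) weights once...
using var model = LLamaWeights.LoadFromFile(parameters);

// ...then create comparatively cheap contexts over those weights,
// either directly or via model.CreateContext(parameters).
using var context = new LLamaContext(model, parameters);
var session = new ChatSession(new InteractiveExecutor(context));
```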

LLama.Examples/NewVersion/SemanticKernelChat.cs (10 changes: 2 additions & 8 deletions)

@@ -1,13 +1,7 @@
- using System.Reflection.Metadata;
- using System.Security.Cryptography;
- using System.Text;
- using LLama.Abstractions;
+ using System.Security.Cryptography;
using LLama.Common;
- using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.AI.ChatCompletion;
- using Microsoft.SemanticKernel.AI.TextCompletion;
using LLamaSharp.SemanticKernel.ChatCompletion;
- using LLamaSharp.SemanticKernel.TextCompletion;

namespace LLama.Examples.NewVersion
{
@@ -22,7 +16,7 @@ public static async Task Run()
// Load weights into memory
var parameters = new ModelParams(modelPath)
{
- Seed = RandomNumberGenerator.GetInt32(int.MaxValue),
+ Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue)),
};
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
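The Seed edits here and in the next few examples are the same migration: Seed is now unsigned. The cast is safe because RandomNumberGenerator.GetInt32(int.MaxValue) returns a value in [0, int.MaxValue), so it is never negative; unchecked() simply states that no overflow check is wanted:

```csharp
using System.Security.Cryptography;

// Always non-negative, so the conversion to uint cannot wrap.
uint seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue));
```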

LLama.Examples/NewVersion/SemanticKernelMemory.cs (2 changes: 1 addition & 1 deletion)

@@ -22,7 +22,7 @@ public static async Task Run()
Console.Write("Please input your model path: ");
var modelPath = Console.ReadLine();

- var seed = 1337;
+ var seed = 1337u;
// Load weights into memory
var parameters = new ModelParams(modelPath)
{

LLama.Examples/NewVersion/SemanticKernelPrompt.cs (2 changes: 1 addition & 1 deletion)

@@ -21,7 +21,7 @@ public static async Task Run()
// Load weights into memory
var parameters = new ModelParams(modelPath)
{
- Seed = RandomNumberGenerator.GetInt32(int.MaxValue),
+ Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue))
};
using var model = LLamaWeights.LoadFromFile(parameters);
var ex = new StatelessExecutor(model, parameters);

LLama.Examples/NewVersion/TalkToYourself.cs (2 changes: 1 addition & 1 deletion)

@@ -15,7 +15,7 @@ public static async Task Run()
// Load weights into memory
var @params = new ModelParams(modelPath)
{
- Seed = RandomNumberGenerator.GetInt32(int.MaxValue)
+ Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue))
};
using var weights = LLamaWeights.LoadFromFile(@params);


LLama.Examples/Program.cs (3 changes: 2 additions & 1 deletion)

@@ -1,4 +1,5 @@
using LLama.Examples.NewVersion;
+ using LLama.Native;

Console.WriteLine("======================================================================================================");

@@ -7,7 +8,7 @@
Console.WriteLine("======================================================================================================");


-
+ NativeApi.llama_empty_call();
Console.WriteLine();

await NewVersionTestRunner.Run();
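llama_empty_call is, as the name suggests, a do-nothing native call, so this line appears to be a warm-up: it forces the native llama.cpp library to load before any example runs, surfacing a missing or incompatible binary immediately. A sketch of the same trick in another startup path:

```csharp
using LLama.Native;

// Touch the native library once at startup so a bad llama.cpp binary
// fails fast here rather than in the middle of the first inference.
NativeApi.llama_empty_call();
Console.WriteLine("native library loaded");
```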

LLama.Unittest/BasicTest.cs (30 changes: 1 addition & 29 deletions)

@@ -27,36 +27,8 @@ public void Dispose()
public void BasicModelProperties()
{
Assert.Equal(32000, _model.VocabCount);
- Assert.Equal(2048, _model.ContextSize);
+ Assert.Equal(4096, _model.ContextSize);
Assert.Equal(4096, _model.EmbeddingSize);
- Assert.Equal(Encoding.UTF8, _model.Encoding);
}
-
- [Fact]
- public void CloneContext()
- {
- var original = _model.CreateContext(_params);
-
- // Evaluate something (doesn't matter what, as long as it begins with token 1)
- original.Eval(new[] { 1, 42, 321 }, 0);
-
- // Clone current state
- var clone = original.Clone();
-
- // Now evaluate something more
- var reply1a = original.Eval(new[] { 4, 5, 6 }, 3);
- var reply2a = original.Eval(new[] { 7, 8, 9 }, 6);
-
- // Assert that the context replied differently each time
- Assert.NotEqual(reply1a, reply2a);
-
- // Give the same prompts to the cloned state
- var reply1b = clone.Eval(new[] { 4, 5, 6 }, 3);
- var reply2b = clone.Eval(new[] { 7, 8, 9 }, 6);
-
- // Assert that the cloned context replied in the same way as originally
- Assert.Equal(reply1a, reply1b);
- Assert.Equal(reply2a, reply2b);
- }
}
}

LLama.Unittest/LLamaContextTests.cs (3 changes: 1 addition & 2 deletions)

@@ -2,7 +2,7 @@

namespace LLama.Unittest
{
- public class LLamaContextTests
+ public sealed class LLamaContextTests
: IDisposable
{
private readonly LLamaWeights _weights;
@@ -30,7 +30,6 @@ public void CheckProperties()
Assert.Equal(768, _context.ContextSize);
Assert.Equal(4096, _context.EmbeddingSize);
Assert.Equal(32000, _context.VocabCount);
- Assert.Equal(0, _context.KVCacheTokenCount);
}

[Fact]

LLama.Unittest/ModelsParamsTests.cs (9 changes: 6 additions & 3 deletions)

@@ -13,7 +13,6 @@ public void SerializeRoundTripSystemTextJson()
{
BatchSize = 17,
ContextSize = 42,
- LoraAdapter = "adapter",
Seed = 42,
GpuLayerCount = 111
};
@@ -31,9 +30,13 @@ public void SerializeRoundTripNewtonsoft()
{
BatchSize = 17,
ContextSize = 42,
- LoraAdapter = "adapter",
Seed = 42,
- GpuLayerCount = 111
+ GpuLayerCount = 111,
+ LoraAdapters =
+ {
+ new("abc", 1),
+ new("def", 0)
+ }
};

var settings = new Newtonsoft.Json.JsonSerializerSettings();

LLama.Unittest/StatelessExecutorTest.cs (6 changes: 3 additions & 3 deletions)

@@ -16,7 +16,7 @@ public StatelessExecutorTest(ITestOutputHelper testOutputHelper)
_params = new ModelParams(Constants.ModelPath)
{
ContextSize = 60,
- Seed = 1754
+ Seed = 1754,
};
_weights = LLamaWeights.LoadFromFile(_params);
}
@@ -48,13 +48,13 @@ public async Task OutOfContext()
{
var executor = new StatelessExecutor(_weights, _params);

- const string question = " Question. why is a cat the best pet?\nAnswer: ";
+ const string question = " Question. cats or dogs?\nAnswer: ";

// The context size is set to 60. Generate more than that, forcing it to generate a coherent response
// with a modified context
var @params = new InferenceParams()
{
- MaxTokens = 100,
+ MaxTokens = 65,
TokensKeep = question.Length,
};


LLama.Unittest/TokenTests.cs (8 changes: 4 additions & 4 deletions)

@@ -27,7 +27,7 @@ public void Dispose()
[Fact]
public void TokensEndWith()
{
- var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, Encoding.UTF8);
+ var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, true, Encoding.UTF8);

var result = tokens.TokensEndsWithAnyString(new[]
{
@@ -41,7 +41,7 @@ public void TokensEndWith()
[Fact]
public void TokensEndSubstring()
{
- var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, Encoding.UTF8);
+ var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, true, Encoding.UTF8);

var result = tokens.TokensEndsWithAnyString((IList<string>)new[]
{
@@ -53,7 +53,7 @@ public void TokensEndSubstring()
[Fact]
public void TokensNotEndWith()
{
- var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, Encoding.UTF8);
+ var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, true, Encoding.UTF8);

var result = tokens.TokensEndsWithAnyString((IList<string>)new[]
{
@@ -67,7 +67,7 @@ public void TokensNotEndWith()
[Fact]
public void TokensNotEndWithNothing()
{
- var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, Encoding.UTF8);
+ var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, true, Encoding.UTF8);

var result = tokens.TokensEndsWithAnyString((IList<string>)Array.Empty<string>(), _model.NativeHandle, Encoding.UTF8);
Assert.False(result);
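Each Tokenize call site gains a third boolean. Judging from the call shape and the llama.cpp tokenizer it wraps, the arguments are the text, an add-BOS flag, a new special-tokens flag, and the encoding; the parameter comments below are an inference from these tests, not names confirmed by this diff:

```csharp
using System.Text;

var tokens = model.NativeHandle.Tokenize(
    "The cat sat on the edge of the mat",
    false,         // presumably add_bos: do not prepend the BOS token
    true,          // presumably special: allow special/control tokens (new in this PR)
    Encoding.UTF8);
```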

LLama.Web/Common/ModelOptions.cs (48 changes: 29 additions & 19 deletions)

@@ -4,7 +4,7 @@
namespace LLama.Web.Common
{
public class ModelOptions
- : IModelParams
+ : ILLamaParams
{

public string Name { get; set; }
@@ -14,7 +14,7 @@ public class ModelOptions
/// <summary>
/// Model context size (n_ctx)
/// </summary>
- public int ContextSize { get; set; } = 512;
+ public uint ContextSize { get; set; } = 512;
/// <summary>
/// the GPU that is used for scratch and small tensors
/// </summary>
@@ -30,7 +30,7 @@ public class ModelOptions
/// <summary>
/// Seed for the random number generator (seed)
/// </summary>
- public int Seed { get; set; } = 1686349486;
+ public uint Seed { get; set; } = 1686349486;
/// <summary>
/// Use f16 instead of f32 for memory kv (memory_f16)
/// </summary>
@@ -51,26 +51,31 @@
/// Model path (model)
/// </summary>
public string ModelPath { get; set; }

- /// <summary>
- /// model alias
- /// </summary>
- public string ModelAlias { get; set; } = "unknown";
- /// <summary>
- /// lora adapter path (lora_adapter)
- /// </summary>
- public string LoraAdapter { get; set; } = string.Empty;
- /// <summary>
- /// base model path for the lora adapter (lora_base)
- /// </summary>
- public string LoraBase { get; set; } = string.Empty;
/// <summary>
- /// Number of threads (-1 = autodetect) (n_threads)
+ /// List of LoRAs to apply
/// </summary>
- public int Threads { get; set; } = Math.Max(Environment.ProcessorCount / 2, 1);
+ public AdapterCollection LoraAdapters { get; set; } = new();
+
+ /// <summary>
+ /// base model path for the lora adapter (lora_base)
+ /// </summary>
+ public string LoraBase { get; set; } = string.Empty;
+
/// <summary>
- /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
+ /// Number of threads (null = autodetect) (n_threads)
/// </summary>
- public int BatchSize { get; set; } = 512;
+ public uint? Threads { get; set; }
+
+ /// <summary>
+ /// Number of threads to use for batch processing (null = autodetect) (n_threads)
+ /// </summary>
+ public uint? BatchThreads { get; set; }
+
+ /// <summary>
+ /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
+ /// </summary>
+ public uint BatchSize { get; set; } = 512;
+
/// <summary>
/// Whether to convert eos to newline during the inference.
@@ -107,5 +112,10 @@ public class ModelOptions
/// The encoding to use for models
/// </summary>
public Encoding Encoding { get; set; } = Encoding.UTF8;
+
+ /// <summary>
+ /// Load vocab only (no weights)
+ /// </summary>
+ public bool VocabOnly { get; set; }
}
}
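Taken together, ModelOptions now implements ILLamaParams with unsigned sizes, nullable thread counts (null meaning autodetect), and a LoRA collection replacing the single LoraAdapter string. A configuration sketch; the paths are placeholders and the (path, scale) adapter constructor is inferred from the tests above:

```csharp
var options = new ModelOptions
{
    Name = "example",                  // placeholder
    ModelPath = "models/example.gguf", // placeholder
    ContextSize = 2048,
    Seed = 42,
    Threads = null,                    // autodetect
    BatchThreads = null,               // autodetect
    BatchSize = 512,
    LoraAdapters = { new("adapters/example.bin", 0.8f) },
    VocabOnly = false,
};
```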

LLama.Web/Services/ConnectionSessionService.cs (12 changes: 6 additions & 6 deletions)

@@ -3,7 +3,6 @@
using LLama.Web.Models;
using Microsoft.Extensions.Options;
using System.Collections.Concurrent;
- using System.Drawing;

namespace LLama.Web.Services
{
@@ -50,15 +49,16 @@ public Task<IServiceResult<ModelSession>> CreateAsync(LLamaExecutorType executor
if (modelOption.MaxInstances > -1 && currentInstances >= modelOption.MaxInstances)
return Task.FromResult(ServiceResult.FromError<ModelSession>("Maximum model instances reached"));

- // Create model
- var llamaModel = new LLamaContext(modelOption);
+ // Load weights
+ // todo: it would be better to have a central service which loads weights and shares them between all contexts that need them!
+ using var weights = LLamaWeights.LoadFromFile(modelOption);

// Create executor
ILLamaExecutor executor = executorType switch
{
- LLamaExecutorType.Interactive => new InteractiveExecutor(llamaModel),
- LLamaExecutorType.Instruct => new InstructExecutor(llamaModel),
- LLamaExecutorType.Stateless => new StatelessExecutor(llamaModel),
+ LLamaExecutorType.Interactive => new InteractiveExecutor(new LLamaContext(weights, modelOption)), //todo: properly dispose of LLamaContext
+ LLamaExecutorType.Instruct => new InstructExecutor(new LLamaContext(weights, modelOption)), //todo: properly dispose of LLamaContext
+ LLamaExecutorType.Stateless => new StatelessExecutor(weights, modelOption),
_ => default
};

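The todo comment above asks for a central service that loads weights once and shares them between contexts. One possible shape, as a hypothetical sketch (the class and method names here are invented, not part of this PR):

```csharp
using System.Collections.Concurrent;
using LLama;
using LLama.Abstractions;

public sealed class SharedWeightsService : IDisposable
{
    // One LLamaWeights per model file, shared by every session that needs it.
    private readonly ConcurrentDictionary<string, LLamaWeights> _cache = new();

    public LLamaWeights GetOrLoad(IModelParams parameters)
        => _cache.GetOrAdd(parameters.ModelPath, _ => LLamaWeights.LoadFromFile(parameters));

    public void Dispose()
    {
        foreach (var weights in _cache.Values)
            weights.Dispose();
        _cache.Clear();
    }
}
```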

LLama.WebAPI/Services/StatefulChatService.cs (11 changes: 8 additions & 3 deletions)

@@ -16,10 +16,15 @@ public class StatefulChatService : IDisposable

public StatefulChatService(IConfiguration configuration)
{
- _context = new LLamaContext(new Common.ModelParams(configuration["ModelPath"])
+ var @params = new Common.ModelParams(configuration["ModelPath"])
{
- ContextSize = 512
- });
+ ContextSize = 512,
+ };
+
+ // todo: share weights from a central service
+ using var weights = LLamaWeights.LoadFromFile(@params);
+
+ _context = new LLamaContext(weights, @params);
_session = new ChatSession(new InteractiveExecutor(_context));
}


LLama.WebAPI/Services/StatelessChatService.cs (10 changes: 8 additions & 2 deletions)

@@ -12,10 +12,16 @@ public class StatelessChatService

public StatelessChatService(IConfiguration configuration)
{
- _context = new LLamaContext(new ModelParams(configuration["ModelPath"])
+ var @params = new Common.ModelParams(configuration["ModelPath"])
{
ContextSize = 512,
- });
+ };
+
+ // todo: share weights from a central service
+ using var weights = LLamaWeights.LoadFromFile(@params);
+
+ _context = new LLamaContext(weights, @params);
+
// TODO: replace with a stateless executor
_session = new ChatSession(new InteractiveExecutor(_context))
.WithOutputTransform(new LLamaTransforms.KeywordTextOutputStreamTransform(new string[] { "User:", "Assistant:" }, redundancyLength: 8))