Updated Binaries December 2023 #361

Merged 9 commits on Dec 15, 2023
2 changes: 2 additions & 0 deletions LLama.Examples/Examples/BatchedDecoding.cs
@@ -1,5 +1,6 @@
 using System.Diagnostics;
 using System.Text;
+using LLama.Abstractions;
 using LLama.Common;
 using LLama.Native;

@@ -30,6 +31,7 @@ public static async Task Run()

 // Load model
 var parameters = new ModelParams(modelPath);
+
 using var model = LLamaWeights.LoadFromFile(parameters);

 // Tokenize prompt
2 changes: 1 addition & 1 deletion LLama.Examples/LLama.Examples.csproj
@@ -2,7 +2,7 @@
 <Import Project="..\LLama\LLamaSharp.Runtime.targets" />
 <PropertyGroup>
   <OutputType>Exe</OutputType>
-  <TargetFrameworks>net6.0;net7.0;net8.0</TargetFrameworks>
+  <TargetFrameworks>net6.0;net8.0</TargetFrameworks>
   <ImplicitUsings>enable</ImplicitUsings>
   <Nullable>enable</Nullable>
   <Platforms>AnyCPU;x64</Platforms>
5 changes: 4 additions & 1 deletion LLama.Examples/Program.cs
@@ -7,7 +7,10 @@

 Console.WriteLine("======================================================================================================");

-NativeLibraryConfig.Instance.WithCuda().WithLogs();
+NativeLibraryConfig
+    .Instance
+    .WithCuda()
+    .WithLogs();

 NativeApi.llama_empty_call();
 Console.WriteLine();
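A note on ordering here, since it matters: as I understand it, `NativeLibraryConfig` must be fully configured before the first native call, because the settings are read once when the native library is loaded. A minimal sketch under that assumption, using only the calls shown in this diff:

    // Sketch: configure the loader, then force the native library to load.
    // Assumption: NativeLibraryConfig settings are locked in at first native call.
    NativeLibraryConfig
        .Instance
        .WithCuda()   // prefer the CUDA backend when available
        .WithLogs();  // emit native llama.cpp log output

    NativeApi.llama_empty_call(); // does no work; forces the library to load now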
55 changes: 21 additions & 34 deletions LLama.Unittest/ModelsParamsTests.cs
@@ -1,5 +1,6 @@
 using LLama.Common;
 using System.Text.Json;
+using LLama.Abstractions;

 namespace LLama.Unittest
 {
@@ -8,59 +9,45 @@ public class ModelsParamsTests
     [Fact]
     public void SerializeRoundTripSystemTextJson()
     {
+        var options = new JsonSerializerOptions()
+        {
+            WriteIndented = true,
+        };
+
         var expected = new ModelParams("abc/123")
         {
             BatchSize = 17,
             ContextSize = 42,
             Seed = 42,
             GpuLayerCount = 111,
-            TensorSplits = { [0] = 3 }
+            TensorSplits = { [0] = 3 },
+            MetadataOverrides =
+            {
+                new MetadataOverride("hello", true),
+                new MetadataOverride("world", 17),
+                new MetadataOverride("cats", 17f),
+            }
         };

-        var json = JsonSerializer.Serialize(expected);
-        var actual = JsonSerializer.Deserialize<ModelParams>(json)!;
+        var json = JsonSerializer.Serialize(expected, options);
+        var actual = JsonSerializer.Deserialize<ModelParams>(json, options)!;

         // Cannot compare splits with default equality, check they are sequence equal and then set to null
-        Assert.Equal((IEnumerable<float>)expected.TensorSplits, expected.TensorSplits);
+        Assert.True(expected.TensorSplits.SequenceEqual(actual.TensorSplits));
         actual.TensorSplits = null!;
         expected.TensorSplits = null!;

+        // Cannot compare overrides with default equality, check they are sequence equal and then set to null
+        Assert.True(expected.MetadataOverrides.SequenceEqual(actual.MetadataOverrides));
+        actual.MetadataOverrides = null!;
+        expected.MetadataOverrides = null!;
+
+        // Check encoding is the same
+        var b1 = expected.Encoding.GetBytes("Hello");
+        var b2 = actual.Encoding.GetBytes("Hello");
+        Assert.True(b1.SequenceEqual(b2));
+
         Assert.Equal(expected, actual);
     }
-
-        //[Fact]
-        //public void SerializeRoundTripNewtonsoft()
-        //{
-        //    var expected = new ModelParams("abc/123")
-        //    {
-        //        BatchSize = 17,
-        //        ContextSize = 42,
-        //        Seed = 42,
-        //        GpuLayerCount = 111,
-        //        LoraAdapters =
-        //        {
-        //            new("abc", 1),
-        //            new("def", 0)
-        //        },
-        //        TensorSplits = { [0] = 3 }
-        //    };
-
-        //    var settings = new Newtonsoft.Json.JsonSerializerSettings();
-
-        //    var json = Newtonsoft.Json.JsonConvert.SerializeObject(expected, settings);
-        //    var actual = Newtonsoft.Json.JsonConvert.DeserializeObject<ModelParams>(json, settings)!;
-
-        //    // Cannot compare splits with default equality, check they are sequence equal and then set to null
-        //    Assert.Equal((IEnumerable<float>)expected.TensorSplits, expected.TensorSplits);
-        //    actual.TensorSplits = null!;
-        //    expected.TensorSplits = null!;
-
-        //    Assert.Equal(expected, actual);
-        //}
     }
 }
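The "cannot compare with default equality" comments are worth a gloss: `TensorSplits` and `MetadataOverrides` are collection-typed properties, and .NET collections compare by reference, so a deserialized copy is never `Equals` to the original even when the contents match. Hence the explicit `SequenceEqual` checks before the properties are nulled out and the rest of the object is compared. A tiny, runnable illustration of the underlying behaviour:

    using System;
    using System.Collections.Generic;
    using System.Linq;

    var a = new List<float> { 3f };
    var b = new List<float> { 3f };

    Console.WriteLine(a.Equals(b));        // False - List<T> compares by reference
    Console.WriteLine(a.SequenceEqual(b)); // True  - element-wise comparison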
109 changes: 29 additions & 80 deletions LLama.Web/Common/ModelOptions.cs
@@ -17,106 +17,55 @@ public class ModelOptions
 /// </summary>
 public int MaxInstances { get; set; }

-/// <summary>
-/// Model context size (n_ctx)
-/// </summary>
+/// <inheritdoc />
 public uint? ContextSize { get; set; }

-/// <summary>
-/// the GPU that is used for scratch and small tensors
-/// </summary>
+/// <inheritdoc />
 public int MainGpu { get; set; } = 0;

-/// <summary>
-/// if true, reduce VRAM usage at the cost of performance
-/// </summary>
-public bool LowVram { get; set; } = false;
-
-/// <summary>
-/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
-/// </summary>
+/// <inheritdoc />
 public int GpuLayerCount { get; set; } = 20;

-/// <summary>
-/// Seed for the random number generator (seed)
-/// </summary>
+/// <inheritdoc />
 public uint Seed { get; set; } = 1686349486;

-/// <summary>
-/// Use f16 instead of f32 for memory kv (memory_f16)
-/// </summary>
-public bool UseFp16Memory { get; set; } = true;
-
-/// <summary>
-/// Use mmap for faster loads (use_mmap)
-/// </summary>
+/// <inheritdoc />
 public bool UseMemorymap { get; set; } = true;

-/// <summary>
-/// Use mlock to keep model in memory (use_mlock)
-/// </summary>
+/// <inheritdoc />
 public bool UseMemoryLock { get; set; } = false;

-/// <summary>
-/// Compute perplexity over the prompt (perplexity)
-/// </summary>
-public bool Perplexity { get; set; } = false;
-
-/// <summary>
-/// Model path (model)
-/// </summary>
+/// <inheritdoc />
 public string ModelPath { get; set; }

-/// <summary>
-/// List of LoRAs to apply
-/// </summary>
+/// <inheritdoc />
 public AdapterCollection LoraAdapters { get; set; } = new();

-/// <summary>
-/// base model path for the lora adapter (lora_base)
-/// </summary>
+/// <inheritdoc />
 public string LoraBase { get; set; } = string.Empty;

-/// <summary>
-/// Number of threads (null = autodetect) (n_threads)
-/// </summary>
+/// <inheritdoc />
 public uint? Threads { get; set; }

-/// <summary>
-/// Number of threads to use for batch processing (null = autodetect) (n_threads)
-/// </summary>
+/// <inheritdoc />
 public uint? BatchThreads { get; set; }

-/// <summary>
-/// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
-/// </summary>
+/// <inheritdoc />
 public uint BatchSize { get; set; } = 512;

-/// <summary>
-/// Whether to convert eos to newline during the inference.
-/// </summary>
-public bool ConvertEosToNewLine { get; set; } = false;
-
-/// <summary>
-/// Whether to use embedding mode. (embedding) Note that if this is set to true,
-/// The LLamaModel won't produce text response anymore.
-/// </summary>
+/// <inheritdoc />
 public bool EmbeddingMode { get; set; } = false;

-/// <summary>
-/// how split tensors should be distributed across GPUs
-/// </summary>
+/// <inheritdoc />
 public TensorSplitsCollection TensorSplits { get; set; } = new();

-/// <summary>
-/// RoPE base frequency
-/// </summary>
+/// <inheritdoc />
+public List<MetadataOverride> MetadataOverrides { get; } = new();
+
+/// <inheritdoc />
 public float? RopeFrequencyBase { get; set; }

-/// <summary>
-/// RoPE frequency scaling factor
-/// </summary>
+/// <inheritdoc />
 public float? RopeFrequencyScale { get; set; }

 /// <inheritdoc />
@@ -137,19 +86,19 @@ public class ModelOptions
 /// <inheritdoc />
 public RopeScalingType? YarnScalingType { get; set; }

-/// <summary>
-/// Use experimental mul_mat_q kernels
-/// </summary>
-public bool MulMatQ { get; set; }
+/// <inheritdoc />
+public GGMLType? TypeK { get; set; }

-/// <summary>
-/// The encoding to use for models
-/// </summary>
+/// <inheritdoc />
+public GGMLType? TypeV { get; set; }
+
+/// <inheritdoc />
+public bool NoKqvOffload { get; set; }
+
+/// <inheritdoc />
 public Encoding Encoding { get; set; } = Encoding.UTF8;

-/// <summary>
-/// Load vocab only (no weights)
-/// </summary>
+/// <inheritdoc />
 public bool VocabOnly { get; set; }
 }
}
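For anyone unfamiliar with the pattern used throughout this file: `/// <inheritdoc />` makes the XML documentation tooling pull a member's docs from the corresponding base/interface member, so the summaries now live in one place (presumably the params interfaces that `ModelOptions` implements) and can't drift out of sync with duplicated copies here. A minimal sketch of the mechanism, with hypothetical names rather than the real interfaces:

    /// <summary>Canonical home of the documentation.</summary>
    public interface IExampleParams
    {
        /// <summary>Model context size (n_ctx)</summary>
        uint? ContextSize { get; set; }
    }

    public class ExampleOptions : IExampleParams
    {
        // Docs are inherited from IExampleParams.ContextSize,
        // so the summary is maintained only on the interface.
        /// <inheritdoc />
        public uint? ContextSize { get; set; }
    }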
40 changes: 20 additions & 20 deletions LLama/Abstractions/IContextParams.cs
@@ -23,16 +23,6 @@ public interface IContextParams
 /// </summary>
 uint Seed { get; set; }

-/// <summary>
-/// Use f16 instead of f32 for memory kv (memory_f16)
-/// </summary>
-bool UseFp16Memory { get; set; }
-
-/// <summary>
-/// Compute perplexity over the prompt (perplexity)
-/// </summary>
-bool Perplexity { get; set; }
-
 /// <summary>
 /// Whether to use embedding mode. (embedding) Note that if this is set to true,
 /// The LLamaModel won't produce text response anymore.
@@ -49,11 +39,6 @@ public interface IContextParams
 /// </summary>
 float? RopeFrequencyScale { get; set; }

-/// <summary>
-/// Use experimental mul_mat_q kernels
-/// </summary>
-bool MulMatQ { get; set; }
-
 /// <summary>
 /// The encoding to use for models
 /// </summary>
@@ -70,32 +55,47 @@ public interface IContextParams
 uint? BatchThreads { get; set; }

 /// <summary>
-/// YaRN extrapolation mix factor
+/// YaRN extrapolation mix factor (null = from model)
 /// </summary>
 float? YarnExtrapolationFactor { get; set; }

 /// <summary>
-/// YaRN magnitude scaling factor
+/// YaRN magnitude scaling factor (null = from model)
 /// </summary>
 float? YarnAttentionFactor { get; set; }

 /// <summary>
-/// YaRN low correction dim
+/// YaRN low correction dim (null = from model)
 /// </summary>
 float? YarnBetaFast { get; set; }

 /// <summary>
-/// YaRN high correction dim
+/// YaRN high correction dim (null = from model)
 /// </summary>
 float? YarnBetaSlow { get; set; }

 /// <summary>
-/// YaRN original context length
+/// YaRN original context length (null = from model)
 /// </summary>
 uint? YarnOriginalContext { get; set; }

 /// <summary>
 /// YaRN scaling method to use.
 /// </summary>
 RopeScalingType? YarnScalingType { get; set; }
+
+/// <summary>
+/// Override the type of the K cache
+/// </summary>
+GGMLType? TypeK { get; set; }
+
+/// <summary>
+/// Override the type of the V cache
+/// </summary>
+GGMLType? TypeV { get; set; }
+
+/// <summary>
+/// Whether to disable offloading the KQV cache to the GPU
+/// </summary>
+bool NoKqvOffload { get; set; }
 }
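The three new members round out context configuration: `TypeK`/`TypeV` override the GGML tensor type used for the K and V halves of the KV cache (for example, to quantize it and reduce memory use), and `NoKqvOffload` keeps the KQV cache off the GPU. A hedged sketch of how a caller might use them, assuming `ModelParams` implements `IContextParams`; the enum member name below is an assumption, not verified against the real `GGMLType` definition:

    // Sketch: store the KV cache as f16 and keep KQV off the GPU.
    // Assumptions: ModelParams implements IContextParams, and GGMLType
    // exposes an f16 member with the name used here (hypothetical).
    var parameters = new ModelParams("path/to/model.gguf")
    {
        ContextSize = 4096,
        TypeK = GGMLType.GGML_TYPE_F16, // K cache element type
        TypeV = GGMLType.GGML_TYPE_F16, // V cache element type
        NoKqvOffload = true,            // disable KQV cache offload to the GPU
    };

    using var model = LLamaWeights.LoadFromFile(parameters);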