From a018ea424840d5b31a4888d70d619c0bfc88abd6 Mon Sep 17 00:00:00 2001
From: Ares Lazarus <cernunos_kav@msn.com>
Date: Mon, 22 Jul 2024 11:27:20 +0300
Subject: [PATCH 1/8] added ITextTokenizer.GetTokens implementation to affected
 generators

---
 .../LLamaSharpTextEmbeddingGenerator.cs       | 20 +++++++++++++++++++
 LLama.KernelMemory/LlamaSharpTextGenerator.cs | 18 +++++++++++++++++
 2 files changed, 38 insertions(+)
diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
index 42ba6dbc5..469ee9794 100644
--- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
+++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -3,6 +3,7 @@
 using LLama.Native;
 using Microsoft.KernelMemory;
 using Microsoft.KernelMemory.AI;
+using Microsoft.KernelMemory.Context;
 
 namespace LLamaSharp.KernelMemory
 {
@@ -112,5 +113,24 @@ public async Task<Embedding> GenerateEmbeddingAsync(string text, CancellationTok
 
         /// <inheritdoc/>
         public int CountTokens(string text) => _embedder.Context.Tokenize(text, special: true).Length;
+
+        /// <summary>
+        /// Get the list of tokens for the input text
+        /// </summary>
+        /// <param name="text">Input string to be tokenized</param>
+        /// <returns>Read-only list of tokens for the input text</returns>
+        /// <remarks>
+        /// Throws if text is null. Includes an empty start (BOS) token because addBos is left true, to be consistent with the CountTokens implementation.</remarks>
+        /// <see cref="CountTokens(string)"/>
+        public IReadOnlyList<string> GetTokens(string text)
+        {
+            var context = _embedder.Context;
+            var embeddings = context.Tokenize(text, special: true);
+            var decoder = new StreamingTokenDecoder(context);
+            return embeddings
+                .Select(x => { decoder.Add(x); return decoder.Read(); })
+                .ToList()
+                .AsReadOnly();
+        }
     }
 }
diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
index 43b9bed8b..00819592b 100644
--- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs
+++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -106,5 +106,23 @@ private static InferenceParams OptionsToParams(TextGenerationOptions options, In
 
         /// <inheritdoc/>
         public int CountTokens(string text) => _context.Tokenize(text, special: true).Length;
+
+        /// <summary>
+        /// Get the list of tokens for the input text
+        /// </summary>
+        /// <param name="text">Input string to be tokenized</param>
+        /// <returns>Read-only list of tokens for the input text</returns>
+        /// <remarks>
+        /// It throws if text is null and Includes empty stop token because addBos is left true to be consistent with the CountTokens implementation.</remarks>
+        /// <see cref="CountTokens(string)"/>
+        public IReadOnlyList<string> GetTokens(string text)
+        {            
+            var embeddings = _context.Tokenize(text, special: true);
+            var decoder = new StreamingTokenDecoder(_context);
+            return embeddings
+                .Select(x => { decoder.Add(x); return decoder.Read(); })
+                .ToList()
+                .AsReadOnly();
+        }
     }
 }

From a2ff5fad89e6b61016dfae10d7394f3fca9d72fe Mon Sep 17 00:00:00 2001
From: Ares Lazarus <cernunos_kav@msn.com>
Date: Mon, 22 Jul 2024 11:28:10 +0300
Subject: [PATCH 2/8] updated LLama.KernelMemory to use
 Microsoft.KernelMemory.Abstractions 0.68

---
 LLama.KernelMemory/LLamaSharp.KernelMemory.csproj | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
index cad114341..f62622a47 100644
--- a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
+++ b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
@@ -27,7 +27,7 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="Microsoft.KernelMemory.Abstractions" Version="0.66.240709.1" />
+    <PackageReference Include="Microsoft.KernelMemory.Abstractions" Version="0.68.240716.1" />
   </ItemGroup>
 
   <ItemGroup>

From 578bfa78e8e052bfea3d45d7e58a66338b4656dd Mon Sep 17 00:00:00 2001
From: Ares Lazarus <cernunos_kav@msn.com>
Date: Mon, 22 Jul 2024 11:28:44 +0300
Subject: [PATCH 3/8] updated LLama.Unittest with reference to
 LLama.KernelMemory

---
 LLama.Unittest/LLama.Unittest.csproj | 30 ++++++----------------------
 1 file changed, 6 insertions(+), 24 deletions(-)

diff --git a/LLama.Unittest/LLama.Unittest.csproj b/LLama.Unittest/LLama.Unittest.csproj
index 4ddbd1974..95d4cbc5e 100644
--- a/LLama.Unittest/LLama.Unittest.csproj
+++ b/LLama.Unittest/LLama.Unittest.csproj
@@ -1,4 +1,4 @@
-﻿<Project Sdk="Microsoft.NET.Sdk">
+<Project Sdk="Microsoft.NET.Sdk">
   <Import Project="..\LLama\LLamaSharp.Runtime.targets" />
   <PropertyGroup>
     <TargetFramework>net8.0</TargetFramework>
@@ -29,31 +29,16 @@
 
   <Target Name="DownloadContentFilesInner">
   
-    <DownloadFile
-		SourceUrl="https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q3_K_S.gguf"
-		DestinationFolder="Models"
-		DestinationFileName="llama-2-7b-chat.Q3_K_S.gguf"
-		SkipUnchangedFiles="true">
+    <DownloadFile SourceUrl="https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q3_K_S.gguf" DestinationFolder="Models" DestinationFileName="llama-2-7b-chat.Q3_K_S.gguf" SkipUnchangedFiles="true">
 	</DownloadFile>
     
-	<DownloadFile
-		SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/llava-v1.6-mistral-7b.Q3_K_XS.gguf"
-		DestinationFolder="Models" DestinationFileName="llava-v1.6-mistral-7b.Q3_K_XS.gguf"
-		SkipUnchangedFiles="true">
+	<DownloadFile SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/llava-v1.6-mistral-7b.Q3_K_XS.gguf" DestinationFolder="Models" DestinationFileName="llava-v1.6-mistral-7b.Q3_K_XS.gguf" SkipUnchangedFiles="true">
 	</DownloadFile>
     
-	<DownloadFile
-		SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/mmproj-model-f16.gguf"
-		DestinationFolder="Models"
-		DestinationFileName="mmproj-model-f16.gguf"
-		SkipUnchangedFiles="true">
+	<DownloadFile SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/mmproj-model-f16.gguf" DestinationFolder="Models" DestinationFileName="mmproj-model-f16.gguf" SkipUnchangedFiles="true">
 	</DownloadFile>
     
-	<DownloadFile
-		SourceUrl="https://huggingface.co/leliuga/all-MiniLM-L12-v2-GGUF/resolve/main/all-MiniLM-L12-v2.Q8_0.gguf"
-		DestinationFolder="Models"
-		DestinationFileName="all-MiniLM-L12-v2.Q8_0.gguf"
-		SkipUnchangedFiles="true">
+	<DownloadFile SourceUrl="https://huggingface.co/leliuga/all-MiniLM-L12-v2-GGUF/resolve/main/all-MiniLM-L12-v2.Q8_0.gguf" DestinationFolder="Models" DestinationFileName="all-MiniLM-L12-v2.Q8_0.gguf" SkipUnchangedFiles="true">
 	</DownloadFile>
 
   </Target>
@@ -63,14 +48,11 @@
   </Target>
 
   <ItemGroup>
+    <ProjectReference Include="..\LLama.KernelMemory\LLamaSharp.KernelMemory.csproj" />
     <ProjectReference Include="..\LLama.SemanticKernel\LLamaSharp.SemanticKernel.csproj" />
     <ProjectReference Include="..\LLama\LLamaSharp.csproj" />
   </ItemGroup>
 
-  <ItemGroup>
-    <Folder Include="Models\" />
-  </ItemGroup>
-
   <ItemGroup>
     <None Update="Models\all-MiniLM-L12-v2.Q8_0.gguf">
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>

From 4a9b8229390e09297495ee740d6f38f8428a7c72 Mon Sep 17 00:00:00 2001
From: Ares Lazarus <cernunos_kav@msn.com>
Date: Mon, 22 Jul 2024 11:29:17 +0300
Subject: [PATCH 4/8] added some unit tests for ITextTokenizer.GetTokens
 implementation

---
 .../KernelMemory/ITextTokenizerTests.cs       | 81 +++++++++++++++++++
 .../LLamaSharpTextEmbeddingGeneratorTests.cs  | 30 +++++++
 .../LlamaSharpTextGeneratorTests.cs           | 34 ++++++++
 3 files changed, 145 insertions(+)
 create mode 100644 LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
 create mode 100644 LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
 create mode 100644 LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs

diff --git a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
new file mode 100644
index 000000000..08d8b8132
--- /dev/null
+++ b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
@@ -0,0 +1,81 @@
+using LLama.Common;
+using LLamaSharp.KernelMemory;
+using Microsoft.KernelMemory.AI;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Runtime.CompilerServices;
+using System.Text;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+using Xunit.Abstractions;
+
+namespace LLama.Unittest.KernelMemory
+{
+    public abstract class ITextTokenizerTests
+    {
+        private readonly ITestOutputHelper _testOutputHelper;
+
+#pragma warning disable KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
+        protected ITextTokenizer? _generator;
+#pragma warning restore KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
+
+        protected InferenceParams _infParams;
+        protected LLamaSharpConfig _lsConfig;
+
+        public ITextTokenizerTests(ITestOutputHelper testOutputHelper)
+        {
+            _testOutputHelper = testOutputHelper;
+
+            _infParams = new() { AntiPrompts = ["\n\n"] };
+            _lsConfig = new(Constants.GenerativeModelPath) { DefaultInferenceParams = _infParams };
+
+            testOutputHelper.WriteLine($"Using model {Path.GetFileName(_lsConfig.ModelPath)}");
+        }
+
+        [Theory]
+        [InlineData("The quick brown fox jumps over the lazy dog")]
+        [InlineData("Well, here're some special characters!!!")]
+        [InlineData("And a little bit of unicode για να κρατήσουμε τα πράγματα ενδιαφέροντα")]
+        [InlineData("  \n  \r\n  \t   ")]
+        public void GetTokens_ShouldReturnListOfTokensForInputString(string? text)
+        {
+            var tokens = _generator!.GetTokens(text);
+            var tokensCount = _generator.CountTokens(text);
+
+            var expected = " " + text; // the placement of the space corresponding to BOS will vary by model
+            var actual = string.Join("", tokens);
+
+            _testOutputHelper.WriteLine($"Tokens for '{text}':");
+            _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));
+
+            Assert.Equal(expected, actual);
+            Assert.Equal(tokensCount, tokens.Count);
+        }
+
+        [Fact]
+        public void GetToken_ShouldThrowForNull()
+        {
+            string? text = null;
+
+            Assert.Throws<ArgumentNullException>(() => { _generator!.GetTokens(text!); });
+        }
+
+        [Fact]
+        public void GetToken_EmptyStringYieldsOneEmptyToken()
+        {
+            var text = "";
+            var expected = "";
+
+            var tokens = _generator!.GetTokens(text);
+            var tokensCount = _generator.CountTokens(text);
+            var actual = tokens.Single();
+
+            _testOutputHelper.WriteLine($"Tokens for '{text}':");
+            _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));
+
+            Assert.Equal(expected, actual);
+            Assert.Equal(tokensCount, tokens.Count);
+        }
+    }
+}
diff --git a/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
new file mode 100644
index 000000000..91161b72c
--- /dev/null
+++ b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
@@ -0,0 +1,30 @@
+using LLama.Common;
+using LLamaSharp.KernelMemory;
+using Microsoft.KernelMemory.AI;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+using Xunit.Abstractions;
+
+namespace LLama.Unittest.KernelMemory
+{
+    public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable
+    {
+        private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator;
+
+        public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+        {
+            _embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig);
+            
+            _generator = _embeddingGenerator;
+        }
+
+        public void Dispose()
+        {
+            _embeddingGenerator.Dispose();
+        }       
+    }
+}
diff --git a/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
new file mode 100644
index 000000000..02001f8cf
--- /dev/null
+++ b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
@@ -0,0 +1,34 @@
+using LLama.Common;
+using LLamaSharp.KernelMemory;
+using Microsoft.KernelMemory.AI;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+using System.Reflection.Emit;
+using System.Text;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+using Xunit.Abstractions;
+using Xunit.Sdk;
+using static System.Net.Mime.MediaTypeNames;
+
+namespace LLama.Unittest.KernelMemory
+{
+    public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable
+    {        
+        private readonly LlamaSharpTextGenerator _textGenerator;
+
+        public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+        {            
+            _textGenerator = new LlamaSharpTextGenerator(_lsConfig);
+
+            _generator = _textGenerator;
+        }
+
+        public void Dispose()
+        {
+            _textGenerator.Dispose();
+        }       
+    }
+}

From 2532afd3541cfcd72a2c415ec02c490a7a0daa3b Mon Sep 17 00:00:00 2001
From: Ares Lazarus <cernunos_kav@msn.com>
Date: Wed, 24 Jul 2024 13:48:43 +0300
Subject: [PATCH 5/8] removed redundant .AsReadOnly, cleaned up usings

---
 LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs | 5 +----
 LLama.KernelMemory/LlamaSharpTextGenerator.cs          | 3 +--
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
index 469ee9794..e5b193c9b 100644
--- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
+++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -1,9 +1,7 @@
 using LLama;
 using LLama.Common;
-using LLama.Native;
 using Microsoft.KernelMemory;
 using Microsoft.KernelMemory.AI;
-using Microsoft.KernelMemory.Context;
 
 namespace LLamaSharp.KernelMemory
 {
@@ -129,8 +127,7 @@ public IReadOnlyList<string> GetTokens(string text)
             var decoder = new StreamingTokenDecoder(context);
             return embeddings
                 .Select(x => { decoder.Add(x); return decoder.Read(); })
-                .ToList()
-                .AsReadOnly();
+                .ToList();
         }
     }
 }
diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
index 00819592b..c52aeee90 100644
--- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs
+++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -121,8 +121,7 @@ public IReadOnlyList<string> GetTokens(string text)
             var decoder = new StreamingTokenDecoder(_context);
             return embeddings
                 .Select(x => { decoder.Add(x); return decoder.Read(); })
-                .ToList()
-                .AsReadOnly();
+                .ToList();
         }
     }
 }

From dd5ffa1788737636e87188ba7817bf679a9c6b5c Mon Sep 17 00:00:00 2001
From: Ares Lazarus <cernunos_kav@msn.com>
Date: Wed, 24 Jul 2024 23:01:51 +0300
Subject: [PATCH 6/8] changed misleading variable name

---
 LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs | 4 ++--
 LLama.KernelMemory/LlamaSharpTextGenerator.cs          | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
index e5b193c9b..309030eda 100644
--- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
+++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -123,9 +123,9 @@ public async Task<Embedding> GenerateEmbeddingAsync(string text, CancellationTok
         public IReadOnlyList<string> GetTokens(string text)
         {
             var context = _embedder.Context;
-            var embeddings = context.Tokenize(text, special: true);
+            var numericTokens = context.Tokenize(text, special: true);
             var decoder = new StreamingTokenDecoder(context);
-            return embeddings
+            return numericTokens
                 .Select(x => { decoder.Add(x); return decoder.Read(); })
                 .ToList();
         }
diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
index c52aeee90..00605e479 100644
--- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs
+++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -117,9 +117,9 @@ private static InferenceParams OptionsToParams(TextGenerationOptions options, In
         /// <see cref="CountTokens(string)"/>
         public IReadOnlyList<string> GetTokens(string text)
         {            
-            var embeddings = _context.Tokenize(text, special: true);
+            var numericTokens = _context.Tokenize(text, special: true);
             var decoder = new StreamingTokenDecoder(_context);
-            return embeddings
+            return numericTokens
                 .Select(x => { decoder.Add(x); return decoder.Read(); })
                 .ToList();
         }

From 63b50f5fba207035bf07327b94e9cc83be0ef03a Mon Sep 17 00:00:00 2001
From: Ares Lazarus <cernunos_kav@msn.com>
Date: Wed, 24 Jul 2024 23:10:42 +0300
Subject: [PATCH 7/8] spun off unicode test cases and added short explanation
 of the issue of redundant tokens resulting from multi-token characters with
 ref to PR #862

---
 .../LLamaSharpTextEmbeddingGenerator.cs       |  1 +
 LLama.KernelMemory/LlamaSharpTextGenerator.cs |  3 +-
 .../KernelMemory/ITextTokenizerTests.cs       | 56 +++++++++++++++----
 3 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
index 309030eda..543f61b63 100644
--- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
+++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -122,6 +122,7 @@ public async Task<Embedding> GenerateEmbeddingAsync(string text, CancellationTok
         /// <see cref="CountTokens(string)"/>
         public IReadOnlyList<string> GetTokens(string text)
         {
+            /* see relevant unit tests for important implementation notes regading unicode */
             var context = _embedder.Context;
             var numericTokens = context.Tokenize(text, special: true);
             var decoder = new StreamingTokenDecoder(context);
diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
index 00605e479..b2b64d046 100644
--- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs
+++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -116,7 +116,8 @@ private static InferenceParams OptionsToParams(TextGenerationOptions options, In
         /// It throws if text is null and Includes empty stop token because addBos is left true to be consistent with the CountTokens implementation.</remarks>
         /// <see cref="CountTokens(string)"/>
         public IReadOnlyList<string> GetTokens(string text)
-        {            
+        {
+            /* see relevant unit tests for important implementation notes regading unicode */
             var numericTokens = _context.Tokenize(text, special: true);
             var decoder = new StreamingTokenDecoder(_context);
             return numericTokens
diff --git a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
index 08d8b8132..6f4ed33a2 100644
--- a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
+++ b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
@@ -1,17 +1,11 @@
 using LLama.Common;
 using LLamaSharp.KernelMemory;
 using Microsoft.KernelMemory.AI;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Runtime.CompilerServices;
-using System.Text;
-using System.Text.RegularExpressions;
-using System.Threading.Tasks;
 using Xunit.Abstractions;
 
 namespace LLama.Unittest.KernelMemory
 {
+    
     public abstract class ITextTokenizerTests
     {
         private readonly ITestOutputHelper _testOutputHelper;
@@ -31,19 +25,61 @@ public ITextTokenizerTests(ITestOutputHelper testOutputHelper)
             _lsConfig = new(Constants.GenerativeModelPath) { DefaultInferenceParams = _infParams };
 
             testOutputHelper.WriteLine($"Using model {Path.GetFileName(_lsConfig.ModelPath)}");
-        }
+        }        
+
 
         [Theory]
         [InlineData("The quick brown fox jumps over the lazy dog")]
         [InlineData("Well, here're some special characters!!!")]
-        [InlineData("And a little bit of unicode για να κρατήσουμε τα πράγματα ενδιαφέροντα")]
+        [InlineData("...___---")]
+        [InlineData("15 + 6 = 21 && 68 * 75 = 5100")]
         [InlineData("  \n  \r\n  \t   ")]
         public void GetTokens_ShouldReturnListOfTokensForInputString(string? text)
         {
             var tokens = _generator!.GetTokens(text);
             var tokensCount = _generator.CountTokens(text);
 
-            var expected = " " + text; // the placement of the space corresponding to BOS will vary by model
+            var expected = " " + text; // the placement of the space corresponding to BOS will vary by model tokenizer
+            var actual = string.Join("", tokens);
+
+            _testOutputHelper.WriteLine($"Tokens for '{text}':");
+            _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));
+
+            Assert.Equal(expected, actual);
+            Assert.Equal(tokensCount, tokens.Count);
+        }
+
+        /* This is exactly the same test as the non-unicode cases. However, there are reasons why this
+         * should be made a special case and may deviate in the future:
+         * 
+         * As of now there appears to be no final word as to how characters that consist of more than one 
+         * numeric token should correspond to textual tokens, and results vary according to different 
+         * models' tokenizers. For example, given a character 'Z' that corresponds to the numeric tokens {1,2,3} 
+         * some (llama-2) will pad the length of the total number of tokens by returning spaces as tokens 
+         * (i.e. ' ', ' ', 'Z') while others (GPT4Tokenizer) will pad with the character itself (i.e. 'Z','Z','Z').
+         * 
+         * This is very evident when tokenizing ideograms and emojis, but can arise with various unicode characters 
+         * as well. See pull request for more relevant discussion https://github.com/SciSharp/LLamaSharp/pull/862
+         *
+         * Currently the method will remain consistent with the output of ITextTokenizer.CountTokens, meaning
+         * any redundant tokens will not be ommited as long as they are counted by CountTokens.
+         * 
+         * StreamingTokenDecoder, while sufficiently useful for this task, was not designed with producing
+         * output for one numeric token at a time in mind, so ITextTokenizer.GetTokens should not be considered 
+         * an example of proper use.
+         * 
+         * Note: if this message is removed, also remove references to it in LLamaSharpTextEmbeddingGenerator.GetTokens
+         * and LlamaSharpTextGenerator.GetTokens
+         */
+        [Theory]
+        [InlineData("And a little bit of unicode για να κρατήσουμε τα πράγματα ενδιαφέροντα")]
+        [InlineData("猫坐在垫子上 😀🤨🤐😏")]
+        public void GetTokens_Unicode_ShouldReturnListOfTokensForInputString(string? text)
+        {
+            var tokens = _generator!.GetTokens(text);
+            var tokensCount = _generator.CountTokens(text);
+
+            var expected = " " + text; // the placement of the space corresponding to BOS will vary by model tokenizer
             var actual = string.Join("", tokens);
 
             _testOutputHelper.WriteLine($"Tokens for '{text}':");

From 939d2b1995dbb28afe8988271fb763d0ca3256c0 Mon Sep 17 00:00:00 2001
From: Ares Lazarus <cernunos_kav@msn.com>
Date: Wed, 24 Jul 2024 23:18:40 +0300
Subject: [PATCH 8/8] fixed spelling errors in comments

---
 LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs | 2 +-
 LLama.KernelMemory/LlamaSharpTextGenerator.cs          | 2 +-
 LLama.Unittest/KernelMemory/ITextTokenizerTests.cs     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
index 543f61b63..a608c6571 100644
--- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
+++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -122,7 +122,7 @@ public async Task<Embedding> GenerateEmbeddingAsync(string text, CancellationTok
         /// <see cref="CountTokens(string)"/>
         public IReadOnlyList<string> GetTokens(string text)
         {
-            /* see relevant unit tests for important implementation notes regading unicode */
+            /* see relevant unit tests for important implementation notes regarding unicode */
             var context = _embedder.Context;
             var numericTokens = context.Tokenize(text, special: true);
             var decoder = new StreamingTokenDecoder(context);
diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
index b2b64d046..e13e634b3 100644
--- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs
+++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -117,7 +117,7 @@ private static InferenceParams OptionsToParams(TextGenerationOptions options, In
         /// <see cref="CountTokens(string)"/>
         public IReadOnlyList<string> GetTokens(string text)
         {
-            /* see relevant unit tests for important implementation notes regading unicode */
+            /* see relevant unit tests for important implementation notes regarding unicode */
             var numericTokens = _context.Tokenize(text, special: true);
             var decoder = new StreamingTokenDecoder(_context);
             return numericTokens
diff --git a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
index 6f4ed33a2..4000525cc 100644
--- a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
+++ b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
@@ -62,7 +62,7 @@ public void GetTokens_ShouldReturnListOfTokensForInputString(string? text)
          * as well. See pull request for more relevant discussion https://github.com/SciSharp/LLamaSharp/pull/862
          *
          * Currently the method will remain consistent with the output of ITextTokenizer.CountTokens, meaning
-         * any redundant tokens will not be ommited as long as they are counted by CountTokens.
+         * any redundant tokens will not be omitted as long as they are counted by CountTokens.
          * 
          * StreamingTokenDecoder, while sufficiently useful for this task, was not designed with producing
          * output for one numeric token at a time in mind, so ITextTokenizer.GetTokens should not be considered