From a018ea424840d5b31a4888d70d619c0bfc88abd6 Mon Sep 17 00:00:00 2001 From: Ares Lazarus Date: Mon, 22 Jul 2024 11:27:20 +0300 Subject: [PATCH 1/8] added ITextTokenizer.GetTokens implementation to affected generators --- .../LLamaSharpTextEmbeddingGenerator.cs | 20 +++++++++++++++++++ LLama.KernelMemory/LlamaSharpTextGenerator.cs | 18 +++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs index 42ba6dbc5..469ee9794 100644 --- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs +++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs @@ -3,6 +3,7 @@ using LLama.Native; using Microsoft.KernelMemory; using Microsoft.KernelMemory.AI; +using Microsoft.KernelMemory.Context; namespace LLamaSharp.KernelMemory { @@ -112,5 +113,24 @@ public async Task GenerateEmbeddingAsync(string text, CancellationTok /// public int CountTokens(string text) => _embedder.Context.Tokenize(text, special: true).Length; + + /// + /// Get the list of tokens for the input text + /// + /// Input string to be tokenized + /// Read-only list of tokens for the input text + /// + /// It throws if text is null and Includes empty stop token because addBos is left true to be consistent with the CountTokens implementation. 
+ /// + public IReadOnlyList GetTokens(string text) + { + var context = _embedder.Context; + var embeddings = context.Tokenize(text, special: true); + var decoder = new StreamingTokenDecoder(context); + return embeddings + .Select(x => { decoder.Add(x); return decoder.Read(); }) + .ToList() + .AsReadOnly(); + } } } diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs index 43b9bed8b..00819592b 100644 --- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs +++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs @@ -106,5 +106,23 @@ private static InferenceParams OptionsToParams(TextGenerationOptions options, In /// public int CountTokens(string text) => _context.Tokenize(text, special: true).Length; + + /// + /// Get the list of tokens for the input text + /// + /// Input string to be tokenized + /// Read-only list of tokens for the input text + /// + /// It throws if text is null and Includes empty stop token because addBos is left true to be consistent with the CountTokens implementation. 
+ /// + public IReadOnlyList GetTokens(string text) + { + var embeddings = _context.Tokenize(text, special: true); + var decoder = new StreamingTokenDecoder(_context); + return embeddings + .Select(x => { decoder.Add(x); return decoder.Read(); }) + .ToList() + .AsReadOnly(); + } } } From a2ff5fad89e6b61016dfae10d7394f3fca9d72fe Mon Sep 17 00:00:00 2001 From: Ares Lazarus Date: Mon, 22 Jul 2024 11:28:10 +0300 Subject: [PATCH 2/8] updated LLama.KernelMemory to use Microsoft.KernelMemory.Abstractions 0.68 --- LLama.KernelMemory/LLamaSharp.KernelMemory.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj index cad114341..f62622a47 100644 --- a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj +++ b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj @@ -27,7 +27,7 @@ - + From 578bfa78e8e052bfea3d45d7e58a66338b4656dd Mon Sep 17 00:00:00 2001 From: Ares Lazarus Date: Mon, 22 Jul 2024 11:28:44 +0300 Subject: [PATCH 3/8] updated LLama.Unittest with reference to LLama.KernelMemory --- LLama.Unittest/LLama.Unittest.csproj | 30 ++++++---------------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/LLama.Unittest/LLama.Unittest.csproj b/LLama.Unittest/LLama.Unittest.csproj index 4ddbd1974..95d4cbc5e 100644 --- a/LLama.Unittest/LLama.Unittest.csproj +++ b/LLama.Unittest/LLama.Unittest.csproj @@ -1,4 +1,4 @@ - + net8.0 @@ -29,31 +29,16 @@ - + - + - + - + @@ -63,14 +48,11 @@ + - - - - PreserveNewest From 4a9b8229390e09297495ee740d6f38f8428a7c72 Mon Sep 17 00:00:00 2001 From: Ares Lazarus Date: Mon, 22 Jul 2024 11:29:17 +0300 Subject: [PATCH 4/8] added some unit tests for ITextTokenizer.GetTokens implementation --- .../KernelMemory/ITextTokenizerTests.cs | 81 +++++++++++++++++++ .../LLamaSharpTextEmbeddingGeneratorTests.cs | 30 +++++++ .../LlamaSharpTextGeneratorTests.cs | 34 ++++++++ 3 files changed, 145 insertions(+) create mode 100644 
LLama.Unittest/KernelMemory/ITextTokenizerTests.cs create mode 100644 LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs create mode 100644 LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs diff --git a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs new file mode 100644 index 000000000..08d8b8132 --- /dev/null +++ b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs @@ -0,0 +1,81 @@ +using LLama.Common; +using LLamaSharp.KernelMemory; +using Microsoft.KernelMemory.AI; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading.Tasks; +using Xunit.Abstractions; + +namespace LLama.Unittest.KernelMemory +{ + public abstract class ITextTokenizerTests + { + private readonly ITestOutputHelper _testOutputHelper; + +#pragma warning disable KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. + protected ITextTokenizer? _generator; +#pragma warning restore KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. 
+ + protected InferenceParams _infParams; + protected LLamaSharpConfig _lsConfig; + + public ITextTokenizerTests(ITestOutputHelper testOutputHelper) + { + _testOutputHelper = testOutputHelper; + + _infParams = new() { AntiPrompts = ["\n\n"] }; + _lsConfig = new(Constants.GenerativeModelPath) { DefaultInferenceParams = _infParams }; + + testOutputHelper.WriteLine($"Using model {Path.GetFileName(_lsConfig.ModelPath)}"); + } + + [Theory] + [InlineData("The quick brown fox jumps over the lazy dog")] + [InlineData("Well, here're some special characters!!!")] + [InlineData("And a little bit of unicode για να κρατήσουμε τα πράγματα ενδιαφέροντα")] + [InlineData(" \n \r\n \t ")] + public void GetTokens_ShouldReturnListOfTokensForInputString(string? text) + { + var tokens = _generator!.GetTokens(text); + var tokensCount = _generator.CountTokens(text); + + var expected = " " + text; // the placement of the space corresponding to BOS will vary by model + var actual = string.Join("", tokens); + + _testOutputHelper.WriteLine($"Tokens for '{text}':"); + _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})"))); + + Assert.Equal(expected, actual); + Assert.Equal(tokensCount, tokens.Count); + } + + [Fact] + public void GetToken_ShouldThrowForNull() + { + string? 
text = null; + + Assert.Throws(() => { _generator!.GetTokens(text!); }); + } + + [Fact] + public void GetToken_EmptyStringYieldsOneEmptyToken() + { + var text = ""; + var expected = ""; + + var tokens = _generator!.GetTokens(text); + var tokensCount = _generator.CountTokens(text); + var actual = tokens.Single(); + + _testOutputHelper.WriteLine($"Tokens for '{text}':"); + _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})"))); + + Assert.Equal(expected, actual); + Assert.Equal(tokensCount, tokens.Count); + } + } +} diff --git a/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs new file mode 100644 index 000000000..91161b72c --- /dev/null +++ b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs @@ -0,0 +1,30 @@ +using LLama.Common; +using LLamaSharp.KernelMemory; +using Microsoft.KernelMemory.AI; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading.Tasks; +using Xunit.Abstractions; + +namespace LLama.Unittest.KernelMemory +{ + public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable + { + private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator; + + public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper) + { + _embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig); + + _generator = _embeddingGenerator; + } + + public void Dispose() + { + _embeddingGenerator.Dispose(); + } + } +} diff --git a/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs new file mode 100644 index 000000000..02001f8cf --- /dev/null +++ b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs @@ -0,0 +1,34 @@ +using LLama.Common; +using LLamaSharp.KernelMemory; +using Microsoft.KernelMemory.AI; 
+using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Reflection.Emit; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading.Tasks; +using Xunit.Abstractions; +using Xunit.Sdk; +using static System.Net.Mime.MediaTypeNames; + +namespace LLama.Unittest.KernelMemory +{ + public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable + { + private readonly LlamaSharpTextGenerator _textGenerator; + + public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper) + { + _textGenerator = new LlamaSharpTextGenerator(_lsConfig); + + _generator = _textGenerator; + } + + public void Dispose() + { + _textGenerator.Dispose(); + } + } +} From 2532afd3541cfcd72a2c415ec02c490a7a0daa3b Mon Sep 17 00:00:00 2001 From: Ares Lazarus Date: Wed, 24 Jul 2024 13:48:43 +0300 Subject: [PATCH 5/8] removed redundant .AsReadOnly, cleaned up usings --- LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs | 5 +---- LLama.KernelMemory/LlamaSharpTextGenerator.cs | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs index 469ee9794..e5b193c9b 100644 --- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs +++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs @@ -1,9 +1,7 @@ using LLama; using LLama.Common; -using LLama.Native; using Microsoft.KernelMemory; using Microsoft.KernelMemory.AI; -using Microsoft.KernelMemory.Context; namespace LLamaSharp.KernelMemory { @@ -129,8 +127,7 @@ public IReadOnlyList GetTokens(string text) var decoder = new StreamingTokenDecoder(context); return embeddings .Select(x => { decoder.Add(x); return decoder.Read(); }) - .ToList() - .AsReadOnly(); + .ToList(); } } } diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs index 00819592b..c52aeee90 
100644 --- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs +++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs @@ -121,8 +121,7 @@ public IReadOnlyList GetTokens(string text) var decoder = new StreamingTokenDecoder(_context); return embeddings .Select(x => { decoder.Add(x); return decoder.Read(); }) - .ToList() - .AsReadOnly(); + .ToList(); } } } From dd5ffa1788737636e87188ba7817bf679a9c6b5c Mon Sep 17 00:00:00 2001 From: Ares Lazarus Date: Wed, 24 Jul 2024 23:01:51 +0300 Subject: [PATCH 6/8] changed misleading variable name --- LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs | 4 ++-- LLama.KernelMemory/LlamaSharpTextGenerator.cs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs index e5b193c9b..309030eda 100644 --- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs +++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs @@ -123,9 +123,9 @@ public async Task GenerateEmbeddingAsync(string text, CancellationTok public IReadOnlyList GetTokens(string text) { var context = _embedder.Context; - var embeddings = context.Tokenize(text, special: true); + var numericTokens = context.Tokenize(text, special: true); var decoder = new StreamingTokenDecoder(context); - return embeddings + return numericTokens .Select(x => { decoder.Add(x); return decoder.Read(); }) .ToList(); } diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs index c52aeee90..00605e479 100644 --- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs +++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs @@ -117,9 +117,9 @@ private static InferenceParams OptionsToParams(TextGenerationOptions options, In /// public IReadOnlyList GetTokens(string text) { - var embeddings = _context.Tokenize(text, special: true); + var numericTokens = _context.Tokenize(text, special: true); var decoder = new 
StreamingTokenDecoder(_context); - return embeddings + return numericTokens .Select(x => { decoder.Add(x); return decoder.Read(); }) .ToList(); } From 63b50f5fba207035bf07327b94e9cc83be0ef03a Mon Sep 17 00:00:00 2001 From: Ares Lazarus Date: Wed, 24 Jul 2024 23:10:42 +0300 Subject: [PATCH 7/8] spun off unicode test cases and added short explanation of the issue of redundant tokens resulting from multi-token characters with ref to PR #862 --- .../LLamaSharpTextEmbeddingGenerator.cs | 1 + LLama.KernelMemory/LlamaSharpTextGenerator.cs | 3 +- .../KernelMemory/ITextTokenizerTests.cs | 56 +++++++++++++++---- 3 files changed, 49 insertions(+), 11 deletions(-) diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs index 309030eda..543f61b63 100644 --- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs +++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs @@ -122,6 +122,7 @@ public async Task GenerateEmbeddingAsync(string text, CancellationTok /// public IReadOnlyList GetTokens(string text) { + /* see relevant unit tests for important implementation notes regading unicode */ var context = _embedder.Context; var numericTokens = context.Tokenize(text, special: true); var decoder = new StreamingTokenDecoder(context); diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs index 00605e479..b2b64d046 100644 --- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs +++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs @@ -116,7 +116,8 @@ private static InferenceParams OptionsToParams(TextGenerationOptions options, In /// It throws if text is null and Includes empty stop token because addBos is left true to be consistent with the CountTokens implementation. 
/// public IReadOnlyList GetTokens(string text) - { + { + /* see relevant unit tests for important implementation notes regading unicode */ var numericTokens = _context.Tokenize(text, special: true); var decoder = new StreamingTokenDecoder(_context); return numericTokens diff --git a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs index 08d8b8132..6f4ed33a2 100644 --- a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs +++ b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs @@ -1,17 +1,11 @@ using LLama.Common; using LLamaSharp.KernelMemory; using Microsoft.KernelMemory.AI; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Runtime.CompilerServices; -using System.Text; -using System.Text.RegularExpressions; -using System.Threading.Tasks; using Xunit.Abstractions; namespace LLama.Unittest.KernelMemory { + public abstract class ITextTokenizerTests { private readonly ITestOutputHelper _testOutputHelper; @@ -31,19 +25,61 @@ public ITextTokenizerTests(ITestOutputHelper testOutputHelper) _lsConfig = new(Constants.GenerativeModelPath) { DefaultInferenceParams = _infParams }; testOutputHelper.WriteLine($"Using model {Path.GetFileName(_lsConfig.ModelPath)}"); - } + } + [Theory] [InlineData("The quick brown fox jumps over the lazy dog")] [InlineData("Well, here're some special characters!!!")] - [InlineData("And a little bit of unicode για να κρατήσουμε τα πράγματα ενδιαφέροντα")] + [InlineData("...___---")] + [InlineData("15 + 6 = 21 && 68 * 75 = 5100")] [InlineData(" \n \r\n \t ")] public void GetTokens_ShouldReturnListOfTokensForInputString(string? 
text) { var tokens = _generator!.GetTokens(text); var tokensCount = _generator.CountTokens(text); - var expected = " " + text; // the placement of the space corresponding to BOS will vary by model + var expected = " " + text; // the placement of the space corresponding to BOS will vary by model tokenizer + var actual = string.Join("", tokens); + + _testOutputHelper.WriteLine($"Tokens for '{text}':"); + _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})"))); + + Assert.Equal(expected, actual); + Assert.Equal(tokensCount, tokens.Count); + } + + /* This is exactly the same test as the non-unicode cases. However, there are reasons why this + * should be made a special case and may deviate in the future: + * + * As of now there appears to be no final word as to how characters that consist of more than one + * numeric token should correspond to textual tokens, and results vary according to different + * models' tokenizers. For example, given a character 'Z' that corresponds to the numeric tokens {1,2,3} + * some (llama-2) will pad the length of the total number of tokens by returning spaces as tokens + * (i.e. ' ', ' ', 'Z') while others (GPT4Tokenizer) will pad with the character itself (i.e. 'Z','Z','Z'). + * + * This is very evident when tokenizing ideograms and emojis, but can arise with various unicode characters + * as well. See pull request for more relevant discussion https://github.com/SciSharp/LLamaSharp/pull/862 + * + * Currently the method will remain consistent with the output of ITextTokenizer.CountTokens, meaning + * any redundant tokens will not be ommited as long as they are counted by CountTokens. + * + * StreamingTokenDecoder, while sufficiently useful for this task, was not designed with producing + * output for one numeric token at a time in mind, so ITextTokenizer.GetTokens should not be considered + * an example of proper use. 
+ * + * Note: if this message is removed, also remove references to it in LLamaSharpTextEmbeddingGenerator.GetTokens + * and LLamaSharpTextGenerator.GetTokens + */ + [Theory] + [InlineData("And a little bit of unicode για να κρατήσουμε τα πράγματα ενδιαφέροντα")] + [InlineData("猫坐在垫子上 😀🤨🤐😏")] + public void GetTokens_Unicode_ShouldReturnListOfTokensForInputString(string? text) + { + var tokens = _generator!.GetTokens(text); + var tokensCount = _generator.CountTokens(text); + + var expected = " " + text; // the placement of the space corresponding to BOS will vary by model tokenizer var actual = string.Join("", tokens); _testOutputHelper.WriteLine($"Tokens for '{text}':"); From 939d2b1995dbb28afe8988271fb763d0ca3256c0 Mon Sep 17 00:00:00 2001 From: Ares Lazarus Date: Wed, 24 Jul 2024 23:18:40 +0300 Subject: [PATCH 8/8] fixed spelling errors in comments --- LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs | 2 +- LLama.KernelMemory/LlamaSharpTextGenerator.cs | 2 +- LLama.Unittest/KernelMemory/ITextTokenizerTests.cs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs index 543f61b63..a608c6571 100644 --- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs +++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs @@ -122,7 +122,7 @@ public async Task GenerateEmbeddingAsync(string text, CancellationTok /// public IReadOnlyList GetTokens(string text) { - /* see relevant unit tests for important implementation notes regading unicode */ + /* see relevant unit tests for important implementation notes regarding unicode */ var context = _embedder.Context; var numericTokens = context.Tokenize(text, special: true); var decoder = new StreamingTokenDecoder(context); diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs index b2b64d046..e13e634b3 100644 --- 
a/LLama.KernelMemory/LlamaSharpTextGenerator.cs +++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs @@ -117,7 +117,7 @@ private static InferenceParams OptionsToParams(TextGenerationOptions options, In /// public IReadOnlyList GetTokens(string text) { - /* see relevant unit tests for important implementation notes regading unicode */ + /* see relevant unit tests for important implementation notes regarding unicode */ var numericTokens = _context.Tokenize(text, special: true); var decoder = new StreamingTokenDecoder(_context); return numericTokens diff --git a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs index 6f4ed33a2..4000525cc 100644 --- a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs +++ b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs @@ -62,7 +62,7 @@ public void GetTokens_ShouldReturnListOfTokensForInputString(string? text) * as well. See pull request for more relevant discussion https://github.com/SciSharp/LLamaSharp/pull/862 * * Currently the method will remain consistent with the output of ITextTokenizer.CountTokens, meaning - * any redundant tokens will not be ommited as long as they are counted by CountTokens. + * any redundant tokens will not be omitted as long as they are counted by CountTokens. * * StreamingTokenDecoder, while sufficiently useful for this task, was not designed with producing * output for one numeric token at a time in mind, so ITextTokenizer.GetTokens should not be considered