Commit ab7de4b

Add maxTokenCount parameter to CountTokens method
Updated the CountTokens method in Tokenizer.cs to include a maxTokenCount parameter for limiting token counts. Added tests in TiktokenTests.cs and TokenizerTests.cs to verify the new functionality and ensure correct behavior with the maximum token count.
1 parent e3219a9 commit ab7de4b
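
To make the change concrete, a minimal usage sketch of the new parameter follows (not part of the commit). The TiktokenTokenizer.CreateForModel factory and the "gpt-4" model name are assumptions about the package surface; any Tokenizer instance would do. The capped call is expected to return at most the requested maximum, matching the tests added below.

    using System;
    using Microsoft.ML.Tokenizers;

    class CountTokensSketch
    {
        static void Main()
        {
            // Assumption: this factory and model name are available in the version of
            // Microsoft.ML.Tokenizers being built here; any Tokenizer instance works.
            Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");

            string text = "The quick brown fox jumps over the lazy dog.";

            // Unlimited count: same behavior as before this commit.
            int fullCount = tokenizer.CountTokens(text);

            // Capped count: the result never exceeds maxTokenCount, so this is at most 5.
            int cappedCount = tokenizer.CountTokens(text, maxTokenCount: 5);

            Console.WriteLine($"full={fullCount}, capped={cappedCount}");
        }
    }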

3 files changed, +28 -4 lines changed

src/Microsoft.ML.Tokenizers/Tokenizer.cs

Lines changed: 6 additions & 4 deletions
@@ -189,19 +189,21 @@ protected virtual int CountTokens(string? text, ReadOnlySpan<char> textSpan, Enc
         /// <param name="text">The text to encode.</param>
         /// <param name="considerPreTokenization">Indicate whether to consider pre-tokenization before tokenization.</param>
         /// <param name="considerNormalization">Indicate whether to consider normalization before tokenization.</param>
+        /// <param name="maxTokenCount">Indicate whether to consider a max token count for counting tokens.</param>
         /// <returns>The number of token Ids that the input text will be encoded to.</returns>
-        public int CountTokens(string text, bool considerPreTokenization = true, bool considerNormalization = true)
-            => CountTokens(text, text.AsSpan(), new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization });
+        public int CountTokens(string text, bool considerPreTokenization = true, bool considerNormalization = true, int maxTokenCount = int.MaxValue)
+            => CountTokens(text, text.AsSpan(), new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization, MaxTokenCount = maxTokenCount });

         /// <summary>
         /// Get the number of tokens that the input text will be encoded to.
         /// </summary>
         /// <param name="text">The text to encode.</param>
         /// <param name="considerPreTokenization">Indicate whether to consider pre-tokenization before tokenization.</param>
         /// <param name="considerNormalization">Indicate whether to consider normalization before tokenization.</param>
+        /// <param name="maxTokenCount">Indicate whether to consider a max token count for counting tokens.</param>
         /// <returns>The number of token Ids that the input text will be encoded to.</returns>
-        public int CountTokens(ReadOnlySpan<char> text, bool considerPreTokenization = true, bool considerNormalization = true)
-            => CountTokens(null, text, new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization });
+        public int CountTokens(ReadOnlySpan<char> text, bool considerPreTokenization = true, bool considerNormalization = true, int maxTokenCount = int.MaxValue)
+            => CountTokens(null, text, new EncodeSettings { ConsiderPreTokenization = considerPreTokenization, ConsiderNormalization = considerNormalization, MaxTokenCount = maxTokenCount });

         /// <summary>
         /// Find the index of the maximum encoding capacity without surpassing the token limit.

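Because the parameter is added to both overloads above, the span-based form can cap a count over a window of an existing buffer without allocating a substring. A small sketch, reusing the same assumed tokenizer setup as in the earlier example:

    using System;
    using Microsoft.ML.Tokenizers;

    class SpanCountSketch
    {
        static void Main()
        {
            // Assumption: hypothetical factory/model name, as in the earlier sketch.
            Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");

            // Count only a window of an existing buffer; no substring allocation needed.
            char[] buffer = "An already-buffered document that we only want to sample.".ToCharArray();
            ReadOnlySpan<char> window = buffer.AsSpan(0, 21);

            // The cap applies exactly as in the string overload: the result is at most 4.
            int count = tokenizer.CountTokens(window, maxTokenCount: 4);
            Console.WriteLine(count);
        }
    }
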
test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs

Lines changed: 14 additions & 0 deletions
@@ -391,6 +391,20 @@ public void TestEncodeR50kBase()
             TestDecodingWithSpan((R50kBase as TiktokenTokenizer)!, encoded.ToArray(), text);
         }

+        [Fact]
+        public void TestCountingTokens()
+        {
+            string text = ReadAndSanitizeFile("./Data/lib.rs.txt");
+            IReadOnlyList<int> encoded = R50kBase.EncodeToIds(text);
+            int idsCount = R50kBase.CountTokens(text);
+            Assert.Equal(11378, encoded.Count);
+            Assert.Equal(encoded.Count, idsCount);
+
+            // count with max tokens to encode
+            int idsCountMax1000 = R50kBase.CountTokens(text, maxTokenCount: 1000);
+            Assert.Equal(1000, idsCountMax1000);
+        }
+
         [Theory]
         [InlineData("o1")]
         [InlineData("o1-")]

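The test above pins down the semantics: the sample file encodes to 11378 tokens, yet CountTokens(text, maxTokenCount: 1000) returns 1000. Assuming counting stops once the cap is reached (the apparent point of the parameter), this enables a cheap token-budget check. The FitsWithinBudget helper below is a hypothetical illustration, not part of the commit:

    using Microsoft.ML.Tokenizers;

    static class TokenBudget
    {
        // Hypothetical helper: returns true when text encodes to no more than budget tokens.
        public static bool FitsWithinBudget(Tokenizer tokenizer, string text, int budget)
        {
            // Ask for at most budget + 1 tokens: if the capped count exceeds the budget,
            // the text is too long, and counting did not have to walk the whole input.
            int count = tokenizer.CountTokens(text, maxTokenCount: budget + 1);
            return count <= budget;
        }
    }
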
test/Microsoft.ML.Tokenizers.Tests/TokenizerTests.cs

Lines changed: 8 additions & 0 deletions
@@ -48,6 +48,14 @@ public void CountTokens_DefaultImplementation()
             Assert.Equal(5, tokenizer.CountTokens("hello"));
         }

+        [Fact]
+        public void CountTokens_WithMaxTokenCount()
+        {
+            var tokenizer = new EnglishAlphabetTokenizer();
+
+            Assert.Equal(3, tokenizer.CountTokens("hello", maxTokenCount: 3));
+        }
+
         [Fact]
         public void GetIndexByTokenCount_DefaultImplementation()
         {

0 commit comments
