diff --git a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs index b89606ba8d..ae73baa35c 100644 --- a/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/SentencePieceTokenizer.cs @@ -86,6 +86,7 @@ private SentencePieceTokenizer(ModelProto modelProto, IReadOnlyDictionary Regex.Escape(s))), RegexOptions.Compiled); } } diff --git a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs index 67dec82979..2b584824e7 100644 --- a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs @@ -1175,23 +1175,23 @@ private static (Dictionary SpecialTokens, Regex Regex, string Vocab internal const string R50kBaseTypeName = "Microsoft.ML.Tokenizers.R50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.R50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51"; #if NET7_0_OR_GREATER - [GeneratedRegex(Cl100kBaseRegexPattern)] + [GeneratedRegex(Cl100kBaseRegexPattern, RegexOptions.None, PreTokenizer.DefaultTimeOutInMilliseconds)] private static partial Regex Cl100kBaseRegex(); - [GeneratedRegex(P50kBaseRegexPattern)] + [GeneratedRegex(P50kBaseRegexPattern, RegexOptions.None, PreTokenizer.DefaultTimeOutInMilliseconds)] internal static partial Regex P50kBaseRegex(); - [GeneratedRegex(O200kBaseRegexPattern)] + [GeneratedRegex(O200kBaseRegexPattern, RegexOptions.None, PreTokenizer.DefaultTimeOutInMilliseconds)] internal static partial Regex O200kBaseRegex(); #else private static Regex? _cl100kBaseRegex; - private static Regex Cl100kBaseRegex() => _cl100kBaseRegex ??= new Regex(Cl100kBaseRegexPattern, RegexOptions.Compiled); + private static Regex Cl100kBaseRegex() => _cl100kBaseRegex ??= new Regex(Cl100kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds)); private static Regex? 
_p50kBaseRegex; - internal static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled); + internal static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds)); private static Regex? _o200kBaseRegex; - internal static Regex O200kBaseRegex() => _o200kBaseRegex ??= new Regex(O200kBaseRegexPattern, RegexOptions.Compiled); + internal static Regex O200kBaseRegex() => _o200kBaseRegex ??= new Regex(O200kBaseRegexPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(PreTokenizer.DefaultTimeOutInMilliseconds)); #endif private static readonly ConcurrentDictionary, int> encoder, Dictionary vocab, Dictionary> decoder)> _tiktokenCache = new(StringComparer.OrdinalIgnoreCase); diff --git a/src/Microsoft.ML.Tokenizers/PreTokenizer/PreTokenizer.cs b/src/Microsoft.ML.Tokenizers/PreTokenizer/PreTokenizer.cs index 7036d46e50..97b1605a08 100644 --- a/src/Microsoft.ML.Tokenizers/PreTokenizer/PreTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/PreTokenizer/PreTokenizer.cs @@ -40,13 +40,16 @@ public abstract partial class PreTokenizer } } + // 30 seconds is a reasonable time to process any text and find the match. + internal const int DefaultTimeOutInMilliseconds = 30_000; + private const string WhiteSpaceOrPunctuationPattern = @"\w+|[\p{P}]"; private static PreTokenizer? 
_whiteSpaceOrPunctuationPreTokenizer; #if NET7_0_OR_GREATER - [GeneratedRegex(WhiteSpaceOrPunctuationPattern)] + [GeneratedRegex(WhiteSpaceOrPunctuationPattern, RegexOptions.None, DefaultTimeOutInMilliseconds)] private static partial Regex WhiteSpaceOrPunctuationRegex(); #else - private static Regex WhiteSpaceOrPunctuationRegex() => new Regex(WhiteSpaceOrPunctuationPattern, RegexOptions.Compiled); + private static Regex WhiteSpaceOrPunctuationRegex() => new Regex(WhiteSpaceOrPunctuationPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(DefaultTimeOutInMilliseconds)); #endif /// @@ -69,10 +72,10 @@ public static PreTokenizer CreateWhiteSpaceOrPunctuationPreTokenizer(IReadOnlyDi private static PreTokenizer? _wordOrNonWordPreTokenizer; #if NET7_0_OR_GREATER - [GeneratedRegex(WordOrNonWordPattern)] + [GeneratedRegex(WordOrNonWordPattern, RegexOptions.None, DefaultTimeOutInMilliseconds)] private static partial Regex WordOrNonWordRegex(); #else - private static Regex WordOrNonWordRegex() => new Regex(WordOrNonWordPattern, RegexOptions.Compiled); + private static Regex WordOrNonWordRegex() => new Regex(WordOrNonWordPattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(DefaultTimeOutInMilliseconds)); #endif /// @@ -96,10 +99,10 @@ public static PreTokenizer CreateWordOrNonWordPreTokenizer(IReadOnlyDictionary new Regex(WhiteSpacePattern, RegexOptions.Compiled); + private static Regex WhiteSpaceRegex() => new Regex(WhiteSpacePattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(DefaultTimeOutInMilliseconds)); #endif /// diff --git a/src/Microsoft.ML.Tokenizers/PreTokenizer/RegexPreTokenizer.cs b/src/Microsoft.ML.Tokenizers/PreTokenizer/RegexPreTokenizer.cs index 9685e370b7..b5a994b7b3 100644 --- a/src/Microsoft.ML.Tokenizers/PreTokenizer/RegexPreTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/PreTokenizer/RegexPreTokenizer.cs @@ -35,6 +35,7 @@ public RegexPreTokenizer(Regex regex, IReadOnlyDictionary? 
specialT if (specialTokensEncoder is { Count: > 0 }) { + // We create this Regex object without a timeout, as we expect the match operation to complete in O(N) time complexity. Note that `specialTokensEncoder` is treated as constant after the pre-tokenizer is created. _specialTokensRegex = new Regex(string.Join("|", specialTokensEncoder.Keys.Select(s => Regex.Escape(s))), RegexOptions.Compiled); } }