Skip to content

Commit 4167468

Browse files
authored
Update pattern key for split pretokenizer (#38)
* Split change key
* Fix test
1 parent 612d383 commit 4167468

File tree

2 files changed

+3
-2
lines changed

2 files changed

+3
-2
lines changed

Diff for: src/pre_tokenizer.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ PreTokenizerConfig& PreTokenizerConfig::parse_json(const json& json_config) {
78 78   type = json_config.at("type");
79 79   if (type == "Split") {
80 80   try {
81    - pattern = json_config.at("pattern");
   81 + pattern = json_config.at("pattern").at("Regex");
82 82   } catch (json::out_of_range&) {
83 83   }
84 84   } else if (type == "Digits") {

Diff for: test/test_pre_tokenizer.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,8 @@ TEST_F(PreTokenizerConfigTest, Split) {
221 221   .parse_json(json{
222 222   {"type", "Split"},
223 223   {"pattern",
224     - R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)"},
    224 + {{"Regex",
    225 + R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)"}}},
225 226   })
226 227   .create();
227 228   assert_split_match(*ptok, "Hello World", {"Hello", " World"});

0 commit comments

Comments
 (0)