From d698d9b4a61049e36bca6857759c4d8aba29b3e4 Mon Sep 17 00:00:00 2001
From: Anton Oresten
Date: Sun, 24 Nov 2024 05:51:00 +0100
Subject: [PATCH] Propagate args and kwargs, add tests

---
 Project.toml                 |  2 +-
 README.md                    |  2 +-
 src/HuggingFaceTokenizers.jl |  7 ++++
 src/Tokenizer.jl             | 43 ++++++++++-------------
 test/runtests.jl             | 68 ++++++++++++++++++++++++++++++++++++
 5 files changed, 96 insertions(+), 26 deletions(-)

diff --git a/Project.toml b/Project.toml
index 76c666b..bef8419 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "HuggingFaceTokenizers"
 uuid = "a6888d44-1185-43bb-bd0f-7806f9976d18"
 authors = ["AntonOresten and contributors"]
-version = "1.0.0-DEV"
+version = "1.1.0-DEV"
 
 [deps]
 PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
diff --git a/README.md b/README.md
index 6ad4359..886496d 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ Rudimentary Julia bindings for [🤗 Tokenizers](https://github.com/huggingface/
 
 From the Julia REPL, enter Pkg mode with `]` and add the package using the URL:
 
-```julia
+```
 add https://github.com/MurrellGroup/HuggingFaceTokenizers.jl
 ```
 
diff --git a/src/HuggingFaceTokenizers.jl b/src/HuggingFaceTokenizers.jl
index 6d6d097..0944777 100644
--- a/src/HuggingFaceTokenizers.jl
+++ b/src/HuggingFaceTokenizers.jl
@@ -1,3 +1,10 @@
+"""
+    HuggingFaceTokenizers
+
+A Julia wrapper around HuggingFace's Tokenizers Python library.
+
+See https://huggingface.co/docs/tokenizers/en/index for official documentation.
+"""
 module HuggingFaceTokenizers
 
 using PythonCall
diff --git a/src/Tokenizer.jl b/src/Tokenizer.jl
index 50cb249..3032e82 100644
--- a/src/Tokenizer.jl
+++ b/src/Tokenizer.jl
@@ -8,7 +8,7 @@ struct Tokenizer
 end
 
 """
-    from_file(::Type{Tokenizer}, path::String)
+    from_file(::Type{Tokenizer}, path) -> Tokenizer
 
 Create a tokenizer from a saved tokenizer file.
 
@@ -16,13 +16,10 @@ Create a tokenizer from a saved tokenizer file.
 tokenizer = from_file(Tokenizer, "path/to/tokenizer.json")
 ```
 """
-function from_file(::Type{Tokenizer}, path::String)
-    py_tokenizer = tokenizers.Tokenizer.from_file(path)
-    return Tokenizer(py_tokenizer)
-end
+from_file(::Type{Tokenizer}, args...; kwargs...) = Tokenizer(tokenizers.Tokenizer.from_file(args...; kwargs...))
 
 """
-    from_pretrained(::Type{Tokenizer}, name::String)
+    from_pretrained(::Type{Tokenizer}, name::String, revision="main", token=nothing) -> Tokenizer
 
 Create a tokenizer from a pretrained tokenizer.
 
@@ -30,28 +27,25 @@ Create a tokenizer from a pretrained tokenizer.
 tokenizer = from_pretrained(Tokenizer, "bert-base-uncased")
 ```
 """
-function from_pretrained(::Type{Tokenizer}, name::String)
-    py_tokenizer = tokenizers.Tokenizer.from_pretrained(name)
-    return Tokenizer(py_tokenizer)
-end
+from_pretrained(::Type{Tokenizer}, args...; kwargs...) = Tokenizer(tokenizers.Tokenizer.from_pretrained(args...; kwargs...))
 
 """
-    save(tokenizer::Tokenizer, path::String)
+    save(tokenizer::Tokenizer, path::String, pretty=true) -> Nothing
 
 Save the tokenizer to a file.
 """
-function save(tokenizer::Tokenizer, path::String)
-    tokenizer.py_tokenizer.save(path)
+function save(tokenizer::Tokenizer, args...; kwargs...)
+    pycall(tokenizer.py_tokenizer.save, args...; kwargs...)
     return nothing
 end
 
 """
-    encode(tokenizer::Tokenizer, text::String) -> (; tokens::Vector{String}, ids::Vector{Int})
+    encode(tokenizer::Tokenizer, text::String) -> @NamedTuple{tokens, ids}
 
 Encode a single text string into tokens and their corresponding IDs.
""" -function encode(tokenizer::Tokenizer, text::String) - output = tokenizer.py_tokenizer.encode(text) +function encode(tokenizer::Tokenizer, args...; kwargs...) + output = tokenizer.py_tokenizer.encode(args...; kwargs...) tokens = pyconvert(Vector{String}, output.tokens) ids = pyconvert(Vector{Int}, output.ids) return (; tokens, ids) @@ -62,26 +56,27 @@ end Decode a sequence of token IDs back into text. """ -function decode(tokenizer::Tokenizer, ids::Vector{Int}) - return pyconvert(String, tokenizer.py_tokenizer.decode(ids)) +function decode(tokenizer::Tokenizer, args...; kwargs...) + return pyconvert(String, tokenizer.py_tokenizer.decode(args...; kwargs...)) end """ - encode_batch(tokenizer::Tokenizer, texts::Vector{String}) -> Vector{Tuple{Vector{String}, Vector{Int}}} + encode_batch(tokenizer::Tokenizer, text_batch::Vector{String}) -> Vector{@NamedTuple{tokens, ids}} Encode multiple texts in batch. """ -function encode_batch(tokenizer::Tokenizer, texts::Vector{String}) - return map(tokenizer.py_tokenizer.encode_batch(texts)) do output +function encode_batch(tokenizer::Tokenizer, args...; kwargs...) + outputs = tokenizer.py_tokenizer.encode_batch(args...; kwargs...) + return map(outputs) do output (; tokens = pyconvert(Vector{String}, output.tokens), ids = pyconvert(Vector{Int}, output.ids)) end end """ - decode_batch(tokenizer::Tokenizer, batch_ids::Vector{Vector{Int}}) -> Vector{String} + decode_batch(tokenizer::Tokenizer, ids_batch::Vector{Vector{Int}}) -> Vector{String} Decode multiple sequences of token IDs in batch. """ -function decode_batch(tokenizer::Tokenizer, batch_ids::Vector{Vector{Int}}) - pyconvert(Vector{String}, tokenizer.py_tokenizer.decode_batch(batch_ids)) +function decode_batch(tokenizer::Tokenizer, args...; kwargs...) + return pyconvert(Vector{String}, tokenizer.py_tokenizer.decode_batch(args...; kwargs...)) end diff --git a/test/runtests.jl b/test/runtests.jl index 009aea0..609ac45 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,6 +3,74 @@ using Test @testset "HuggingFaceTokenizers.jl" begin + @testset "Basic Operations" begin + # Load pretrained tokenizer + tokenizer = from_pretrained(Tokenizer, "bert-base-uncased") + + # Test single text encoding/decoding + text = "Hello, how are you?" 
+        result = encode(tokenizer, text)
+
+        @test result.tokens isa Vector{String}
+        @test result.ids isa Vector{Int}
+        @test !isempty(result.tokens)
+        @test !isempty(result.ids)
+        @test length(result.tokens) == length(result.ids)
+
+        decoded_text = decode(tokenizer, result.ids)
+        @test decoded_text isa String
+        @test !isempty(decoded_text)
+        # Note: The decoded text might not match exactly due to tokenizer behavior
+        @test lowercase(decoded_text) == lowercase(text)
+
+        # Test batch processing
+        texts = ["Hello, how are you?", "I'm doing great!"]
+        batch_results = encode_batch(tokenizer, texts)
+
+        @test batch_results isa Vector
+        @test length(batch_results) == length(texts)
+
+        for result in batch_results
+            @test result.tokens isa Vector{String}
+            @test result.ids isa Vector{Int}
+            @test !isempty(result.tokens)
+            @test !isempty(result.ids)
+            @test length(result.tokens) == length(result.ids)
+        end
+
+        # Test batch decoding
+        ids_batch = [result.ids for result in batch_results]
+        decoded_texts = decode_batch(tokenizer, ids_batch)
+
+        @test decoded_texts isa Vector{String}
+        @test length(decoded_texts) == length(texts)
+        @test all(!isempty, decoded_texts)
+    end
+    @testset "File Operations" begin
+        tokenizer = from_pretrained(Tokenizer, "bert-base-uncased")
+
+        mktempdir() do temp_dir
+            # Test save and load
+            temp_path = joinpath(temp_dir, "tokenizer.json")
+
+            # Test saving
+            save(tokenizer, temp_path)
+            @test isfile(temp_path)
+
+            # Test loading from file
+            loaded_tokenizer = from_file(Tokenizer, temp_path)
+            @test loaded_tokenizer isa Tokenizer
+
+            # Verify the loaded tokenizer works
+            text = "Hello, world!"
+            result1 = encode(tokenizer, text)
+            result2 = encode(loaded_tokenizer, text)
+
+            @test result1.tokens == result2.tokens
+            @test result1.ids == result2.ids
+        end
+        # No need for explicit cleanup - mktempdir handles it
+    end
 end