Propagate args and kwargs, add tests
AntonOresten committed Nov 24, 2024
1 parent 521aa9f commit d698d9b
Showing 5 changed files with 96 additions and 26 deletions.
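The gist of the change: each wrapper in src/Tokenizer.jl now accepts `args...; kwargs...` and forwards them to the corresponding Python call, so keyword options from the underlying 🤗 Tokenizers API pass straight through. A minimal sketch of what this enables, assuming keyword names such as `revision` and `pretty` (which come from the Python library, per the updated docstrings below) are forwarded unchanged:

```julia
using HuggingFaceTokenizers

# Keywords go straight to tokenizers.Tokenizer.from_pretrained on the Python side.
tokenizer = from_pretrained(Tokenizer, "bert-base-uncased"; revision="main")

# Likewise for save: pretty is a keyword of the Python save method.
save(tokenizer, "tokenizer.json"; pretty=false)
```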
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "HuggingFaceTokenizers"
uuid = "a6888d44-1185-43bb-bd0f-7806f9976d18"
authors = ["AntonOresten <[email protected]> and contributors"]
version = "1.0.0-DEV"
version = "1.1.0-DEV"

[deps]
PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
2 changes: 1 addition & 1 deletion README.md
@@ -8,7 +8,7 @@ Rudimentary Julia bindings for [🤗 Tokenizers](https://github.com/huggingface/

From the Julia REPL, enter Pkg mode with `]` and add the package using the URL:

```julia
```
add https://github.com/MurrellGroup/HuggingFaceTokenizers.jl
```
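
The same installation can be done non-interactively through the Pkg API; a small equivalent snippet:

```julia
using Pkg
Pkg.add(url="https://github.com/MurrellGroup/HuggingFaceTokenizers.jl")
```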

7 changes: 7 additions & 0 deletions src/HuggingFaceTokenizers.jl
@@ -1,3 +1,10 @@
"""
HuggingFaceTokenizers
A Julia wrapper around HuggingFace's Tokenizers Python library.
See https://huggingface.co/docs/tokenizers/en/index for official documentation.
"""
module HuggingFaceTokenizers

using PythonCall
43 changes: 19 additions & 24 deletions src/Tokenizer.jl
@@ -8,50 +8,44 @@ struct Tokenizer
end

"""
from_file(::Type{Tokenizer}, path::String)
from_file(::Type{Tokenizer}, path) -> Tokenizer
Create a tokenizer from a saved tokenizer file.
```julia
tokenizer = from_file(Tokenizer, "path/to/tokenizer.json")
```
"""
function from_file(::Type{Tokenizer}, path::String)
py_tokenizer = tokenizers.Tokenizer.from_file(path)
return Tokenizer(py_tokenizer)
end
from_file(::Type{Tokenizer}, args...; kwargs...) = Tokenizer(tokenizers.Tokenizer.from_file(args...; kwargs...))

"""
from_pretrained(::Type{Tokenizer}, name::String)
from_pretrained(::Type{Tokenizer}, name::String, revision="main", token=nothing) -> Tokenizer
Create a tokenizer from a pretrained tokenizer.
```julia
tokenizer = from_pretrained(Tokenizer, "bert-base-uncased")
```
"""
function from_pretrained(::Type{Tokenizer}, name::String)
py_tokenizer = tokenizers.Tokenizer.from_pretrained(name)
return Tokenizer(py_tokenizer)
end
from_pretrained(::Type{Tokenizer}, args...; kwargs...) = Tokenizer(tokenizers.Tokenizer.from_pretrained(args...; kwargs...))

"""
save(tokenizer::Tokenizer, path::String)
save(tokenizer::Tokenizer, path::String, pretty=true) -> Nothing
Save the tokenizer to a file.
"""
function save(tokenizer::Tokenizer, path::String)
tokenizer.py_tokenizer.save(path)
function save(tokenizer::Tokenizer, args...; kwargs...)
pycall(tokenizer.py_tokenizer.save, args...; kwargs...)
return nothing
end

"""
encode(tokenizer::Tokenizer, text::String) -> (tokens::Vector{String}, ids::Vector{Int})
encode(tokenizer::Tokenizer, text::String) -> @NamedTuple{tokens::Vector{String}, ids::Vector{Int}}
Encode a single text string into tokens and their corresponding IDs.
"""
function encode(tokenizer::Tokenizer, text::String)
output = tokenizer.py_tokenizer.encode(text)
function encode(tokenizer::Tokenizer, args...; kwargs...)
output = tokenizer.py_tokenizer.encode(args...; kwargs...)
tokens = pyconvert(Vector{String}, output.tokens)
ids = pyconvert(Vector{Int}, output.ids)
return (; tokens, ids)
@@ -62,26 +56,27 @@ end
Decode a sequence of token IDs back into text.
"""
function decode(tokenizer::Tokenizer, ids::Vector{Int})
return pyconvert(String, tokenizer.py_tokenizer.decode(ids))
function decode(tokenizer::Tokenizer, args...; kwargs...)
return pyconvert(String, tokenizer.py_tokenizer.decode(args...; kwargs...))
end

"""
encode_batch(tokenizer::Tokenizer, texts::Vector{String}) -> Vector{Tuple{Vector{String}, Vector{Int}}}
encode_batch(tokenizer::Tokenizer, text_batch::Vector{String}) -> Vector{@NamedTuple{tokens, ids}}
Encode multiple texts in batch.
"""
function encode_batch(tokenizer::Tokenizer, texts::Vector{String})
return map(tokenizer.py_tokenizer.encode_batch(texts)) do output
function encode_batch(tokenizer::Tokenizer, args...; kwargs...)
outputs = tokenizer.py_tokenizer.encode_batch(args...; kwargs...)
return map(outputs) do output
(; tokens = pyconvert(Vector{String}, output.tokens), ids = pyconvert(Vector{Int}, output.ids))
end
end

"""
decode_batch(tokenizer::Tokenizer, batch_ids::Vector{Vector{Int}}) -> Vector{String}
decode_batch(tokenizer::Tokenizer, ids_batch::Vector{Vector{Int}}) -> Vector{String}
Decode multiple sequences of token IDs in batch.
"""
function decode_batch(tokenizer::Tokenizer, batch_ids::Vector{Vector{Int}})
pyconvert(Vector{String}, tokenizer.py_tokenizer.decode_batch(batch_ids))
function decode_batch(tokenizer::Tokenizer, args...; kwargs...)
return pyconvert(Vector{String}, tokenizer.py_tokenizer.decode_batch(args...; kwargs...))
end
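Because the wrappers now forward keyword arguments, options documented for the Python `encode` and `decode` methods can be supplied from Julia. A short sketch, assuming the 🤗 Tokenizers keyword names `add_special_tokens` and `skip_special_tokens` pass through as-is:

```julia
using HuggingFaceTokenizers

tokenizer = from_pretrained(Tokenizer, "bert-base-uncased")

# encode returns a named tuple, so it can be destructured directly.
(; tokens, ids) = encode(tokenizer, "Hello, world!"; add_special_tokens=false)
@assert "[CLS]" ∉ tokens  # no special tokens were inserted

# skip_special_tokens is forwarded to the Python decode call.
text = decode(tokenizer, ids; skip_special_tokens=true)
```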
68 changes: 68 additions & 0 deletions test/runtests.jl
@@ -3,6 +3,74 @@ using Test

@testset "HuggingFaceTokenizers.jl" begin

@testset "Basic Operations" begin
# Load pretrained tokenizer
tokenizer = from_pretrained(Tokenizer, "bert-base-uncased")

# Test single text encoding/decoding
text = "Hello, how are you?"
result = encode(tokenizer, text)

@test result.tokens isa Vector{String}
@test result.ids isa Vector{Int}
@test !isempty(result.tokens)
@test !isempty(result.ids)
@test length(result.tokens) == length(result.ids)

decoded_text = decode(tokenizer, result.ids)
@test decoded_text isa String
@test !isempty(decoded_text)
# Note: The decoded text might not match exactly due to tokenizer behavior
@test lowercase(decoded_text) == lowercase(text)

# Test batch processing
texts = ["Hello, how are you?", "I'm doing great!"]
batch_results = encode_batch(tokenizer, texts)

@test batch_results isa Vector
@test length(batch_results) == length(texts)

for result in batch_results
@test result.tokens isa Vector{String}
@test result.ids isa Vector{Int}
@test !isempty(result.tokens)
@test !isempty(result.ids)
@test length(result.tokens) == length(result.ids)
end

# Test batch decoding
ids_batch = [result.ids for result in batch_results]
decoded_texts = decode_batch(tokenizer, ids_batch)

@test decoded_texts isa Vector{String}
@test length(decoded_texts) == length(texts)
@test all(!isempty, decoded_texts)
end

@testset "File Operations" begin
tokenizer = from_pretrained(Tokenizer, "bert-base-uncased")

mktempdir() do temp_dir
# Test save and load
temp_path = joinpath(temp_dir, "tokenizer.json")

# Test saving
save(tokenizer, temp_path)
@test isfile(temp_path)

# Test loading from file
loaded_tokenizer = from_file(Tokenizer, temp_path)
@test loaded_tokenizer isa Tokenizer

# Verify the loaded tokenizer works
text = "Hello, world!"
result1 = encode(tokenizer, text)
result2 = encode(loaded_tokenizer, text)

@test result1.tokens == result2.tokens
@test result1.ids == result2.ids
end
# No need for explicit cleanup - mktempdir handles it
end

end
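The new tests exercise the default call paths; a hedged sketch of an additional test for the keyword forwarding itself (not in this commit, with keyword names taken from the Python API) could look like:

```julia
@testset "Keyword forwarding" begin
    tokenizer = from_pretrained(Tokenizer, "bert-base-uncased")

    # add_special_tokens should reach the Python encode call.
    with_special = encode(tokenizer, "Hello, world!")
    without_special = encode(tokenizer, "Hello, world!"; add_special_tokens=false)
    @test length(without_special.tokens) < length(with_special.tokens)
    @test "[CLS]" ∉ without_special.tokens

    # skip_special_tokens should reach the Python decode call.
    decoded = decode(tokenizer, with_special.ids; skip_special_tokens=true)
    @test !occursin("[CLS]", decoded)
end
```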
