Fix typos and improve language (#59)
preciz authored Jul 31, 2024
1 parent 09f3f25 commit 3961417
Showing 15 changed files with 30 additions and 30 deletions.
2 changes: 1 addition & 1 deletion lib/tokenizers/added_token.ex
@@ -65,6 +65,6 @@ defimpl Inspect, for: Tokenizers.AddedToken do
|> Tokenizers.Native.added_token_info()
|> Keyword.new(fn {k, v} -> {String.to_atom(k), v} end)

concat(["#Tokenizers.PreTokenizer<", to_doc(attrs, opts), ">"])
concat(["#Tokenizers.AddedToken<", to_doc(attrs, opts), ">"])
end
end
14 changes: 7 additions & 7 deletions lib/tokenizers/decoder.ex
@@ -24,7 +24,7 @@ defmodule Tokenizers.Decoder do
## Options
- * `suffix` - the suffix to add to the end of each word. Defaults
+ * `:suffix` - the suffix to add to the end of each word. Defaults
to `</w>`
"""
@@ -48,12 +48,12 @@ defmodule Tokenizers.Decoder do
## Options
- * `pad_token` - the token used for padding. Defaults to `<pad>`
+ * `:pad_token` - the token used for padding. Defaults to `<pad>`
- * `word_delimiter_token` - the token used for word delimiter.
+ * `:word_delimiter_token` - the token used for word delimiter.
Defaults to `|`
- * `cleanup` - whether to cleanup tokenization artifacts, defaults
+ * `:cleanup` - whether to cleanup tokenization artifacts, defaults
to `true`
"""
@@ -71,7 +71,7 @@ defmodule Tokenizers.Decoder do
## Options
- * `replacement` - the replacement character. Defaults to `▁`
+ * `:replacement` - the replacement character. Defaults to `▁`
(as char)
* `:prepend_scheme` - whether to add a space to the first word if there
@@ -112,9 +112,9 @@ defmodule Tokenizers.Decoder do
## Options
- * `prefix` - The prefix to use for subwords. Defaults to `##`
+ * `:prefix` - The prefix to use for subwords. Defaults to `##`
- * `cleanup` - Whether to cleanup tokenization artifacts. Defaults
+ * `:cleanup` - Whether to cleanup tokenization artifacts. Defaults
to `true`
"""
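For orientation, a minimal sketch of how the decoder options documented above might be used. The constructor names (`word_piece/1`, `ctc/1`, `bpe/1`) are assumed to mirror the option docs, and `Tokenizers.Decoder.decode/2` is an assumption, not something shown in this diff.

```elixir
# Sketch only: constructor names and decode/2 are assumptions based on the docs above.
word_piece = Tokenizers.Decoder.word_piece(prefix: "##", cleanup: true)
ctc = Tokenizers.Decoder.ctc(pad_token: "<pad>", word_delimiter_token: "|", cleanup: true)
bpe = Tokenizers.Decoder.bpe(suffix: "</w>")

# Join WordPiece sub-tokens back into plain text.
{:ok, text} = Tokenizers.Decoder.decode(word_piece, ["un", "##believ", "##able"])
```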
2 changes: 1 addition & 1 deletion lib/tokenizers/encoding/transformation.ex
@@ -2,7 +2,7 @@ defmodule Tokenizers.Encoding.Transformation do
@moduledoc """
Module containing handy functions to build the transformations list.
- This list is aplied to an encoding using `Tokenizers.Encoding.transform/2`.
+ This list is applied to an encoding using `Tokenizers.Encoding.transform/2`.
"""

@type t :: [
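As a rough illustration of the corrected sentence, a transformations list might be built and applied as below. Only `Tokenizers.Encoding.transform/2` is named in the moduledoc; the builder functions and `Tokenizers.Tokenizer.encode/2` used here are assumptions.

```elixir
# Sketch under assumptions: only transform/2 is confirmed by the moduledoc above.
{:ok, tokenizer} = Tokenizers.Tokenizer.from_pretrained("bert-base-uncased")
{:ok, encoding} = Tokenizers.Tokenizer.encode(tokenizer, "Hello world")

# Build the list of transformations, then apply them in one pass.
transformations = [
  Tokenizers.Encoding.Transformation.truncate(128),
  Tokenizers.Encoding.Transformation.pad(128)
]

encoding = Tokenizers.Encoding.transform(encoding, transformations)
```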
2 changes: 1 addition & 1 deletion lib/tokenizers/model/bpe.ex
@@ -9,7 +9,7 @@ defmodule Tokenizers.Model.BPE do
the result of the merge operations for a number of words.
Defaults to `10_000`
- * `:dropout` - The BPE dropout to use. Must be an float between
+ * `:dropout` - The BPE dropout to use. Must be a float between
0 and 1
* `:unk_token` - The unknown token to be used by the model
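For reference, loading a BPE model with the documented options might look like the sketch below. `from_file/2` appears in the tests later in this diff; accepting an options list as a third argument is an assumption.

```elixir
# Sketch: the options argument is an assumption; the option names come from the docs above.
{:ok, model} =
  Tokenizers.Model.BPE.from_file("vocab.json", "merges.txt",
    dropout: 0.1,
    unk_token: "<unk>"
  )
```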
2 changes: 1 addition & 1 deletion lib/tokenizers/model/unigram.ex
@@ -8,7 +8,7 @@ defmodule Tokenizers.Model.Unigram do
"""
@type options() :: [
byte_fallback: boolean(),
- unk_id: float()
+ unk_id: integer()
]

@doc """
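A hypothetical Unigram construction using the corrected typespec (`unk_id` as an integer). The `init/2` constructor and the `{token, score}` vocab shape are assumptions; only the option names come from the typespec above.

```elixir
# Hypothetical: init/2 and the vocab shape are assumptions; unk_id is an integer index.
vocab = [{"<unk>", 0.0}, {"hello", -1.5}, {"world", -2.25}]
{:ok, model} = Tokenizers.Model.Unigram.init(vocab, unk_id: 0, byte_fallback: false)
```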
8 changes: 4 additions & 4 deletions lib/tokenizers/model/wordpiece.ex
@@ -2,14 +2,14 @@ defmodule Tokenizers.Model.WordPiece do
@typedoc """
Options for model initialisation.
- * `:unk_token` - the unknown token to be used by the model.
+ * `:unk_token` - the unknown token to be used by the model.
Defaults to `"[UNK]"`
* `:max_input_chars_per_word` - the maximum number of characters
- to authorize in a single word. Defaults to `100`
+ to allow in a single word. Defaults to `100`
- * `:continuing_subword_prefix` - the prefix to attach to subword
- units that don't represent a beginning of word Defaults to `"##"`
+ * `:continuing_subword_prefix` - the prefix to attach to subword
+ units that don't represent a beginning of word. Defaults to `"##"`.
"""
@type options() :: [
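For context, a sketch of how the documented WordPiece options might be passed. `from_file/1` appears in the tests later in this diff; the options argument shown here is an assumption.

```elixir
# Sketch: option names come from the docs above; passing them to from_file is assumed.
{:ok, model} =
  Tokenizers.Model.WordPiece.from_file("vocab.txt",
    unk_token: "[UNK]",
    max_input_chars_per_word: 100,
    continuing_subword_prefix: "##"
  )
```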
4 changes: 2 additions & 2 deletions lib/tokenizers/normalizer.ex
@@ -23,9 +23,9 @@ defmodule Tokenizers.Normalizer do
defdelegate normalize(normalizer, input), to: Tokenizers.Native, as: :normalizers_normalize

@doc """
- Takes care of normalizing raw text before giving it to a Bert model.
+ Takes care of normalizing raw text before giving it to a BERT model.
- This includes cleaning the text, handling accents, chinese chars and
+ This includes cleaning the text, handling accents, Chinese chars and
lowercasing.
## Options
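A small sketch of the BERT normalizer described above. `normalize/2` comes from the defdelegate shown in the hunk; `bert_normalizer/1` and the `:lowercase` option are assumptions, since the option list is cut off in this view.

```elixir
# Sketch: bert_normalizer/1 and :lowercase are assumptions; normalize/2 is shown above.
normalizer = Tokenizers.Normalizer.bert_normalizer(lowercase: true)
{:ok, normalized} = Tokenizers.Normalizer.normalize(normalizer, "Héllo Wörld")
```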
4 changes: 2 additions & 2 deletions lib/tokenizers/post_processor.ex
@@ -22,7 +22,7 @@ defmodule Tokenizers.PostProcessor do
## Options
- * `:trim_offest` - whether to trim the whitespaces in the produced
+ * `:trim_offsets` - whether to trim the whitespaces in the produced
offsets. Defaults to `true`
* `:add_prefix_space` - whether add_prefix_space was ON during the
@@ -47,7 +47,7 @@ defmodule Tokenizers.PostProcessor do
@doc """
Creates a Template post-processor.
- Let’s you easily template the post processing, adding special tokens
+ Lets you easily template the post processing, adding special tokens
and specifying the type id for each sequence/special token. The
template is given two strings representing the single sequence and
the pair of sequences, as well as a set of special tokens to use.
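As an illustration of the Template post-processor described above, a hypothetical BERT-style template. `template/1` and its option names (`:single`, `:pair`, `:special_tokens`) are assumptions mirroring the underlying TemplateProcessing API; the token ids are illustrative.

```elixir
# Hypothetical: option names and token ids are illustrative assumptions.
post_processor =
  Tokenizers.PostProcessor.template(
    single: "[CLS] $A [SEP]",
    pair: "[CLS] $A [SEP] $B:1 [SEP]",
    special_tokens: [{"[CLS]", 101}, {"[SEP]", 102}]
  )
```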
2 changes: 1 addition & 1 deletion lib/tokenizers/pre_tokenizer.ex
@@ -88,7 +88,7 @@ defmodule Tokenizers.PreTokenizer do
@doc """
Creates a BertPreTokenizer pre-tokenizer.
- Splits for use in Bert models.
+ Splits for use in BERT models.
"""
@spec bert_pre_tokenizer() :: t()
defdelegate bert_pre_tokenizer(), to: Tokenizers.Native, as: :pre_tokenizers_bert
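`bert_pre_tokenizer/0` is shown in the hunk above; the sketch below pairs it with a `pre_tokenize/2` call, which is an assumption.

```elixir
# Sketch: pre_tokenize/2 and its return shape are assumptions.
pre_tokenizer = Tokenizers.PreTokenizer.bert_pre_tokenizer()
{:ok, pieces} = Tokenizers.PreTokenizer.pre_tokenize(pre_tokenizer, "Hello, world!")
```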
2 changes: 1 addition & 1 deletion native/ex_tokenizers/src/models.rs
@@ -133,7 +133,7 @@ pub fn models_save(
.iter()
.map(|path| {
path.to_str()
- // Unwraping here, because we are sure that pathes are valid
+ // Unwraping here, because we are sure that paths are valid
.unwrap()
.to_owned()
})
2 changes: 1 addition & 1 deletion test/tokenizers/decoder_test.exs
@@ -67,7 +67,7 @@ defmodule Tokenizers.DecoderTest do
assert %Tokenizers.Decoder{} = Tokenizers.Decoder.strip(?_, 0, 0)
end

test "cant be initialized with invalid char" do
test "can't be initialized with invalid char" do
assert_raise ArgumentError, fn ->
Tokenizers.Decoder.strip(61_126_999, 0, 0)
end
6 changes: 3 additions & 3 deletions test/tokenizers/model/bpe_test.exs
@@ -25,23 +25,23 @@ defmodule Tokenizers.Model.BPETest do
end

describe "loaded from file" do
test "Good initialized with valid pathes" do
test "Good initialization with valid paths" do
assert {:ok, %Tokenizers.Model{}} =
Tokenizers.Model.BPE.from_file(
"test/fixtures/vocab.json",
"test/fixtures/merges.txt"
)
end

test "bad initialized with invalid pathes" do
test "bad initialization with invalid paths" do
assert {:error, _} =
Tokenizers.Model.BPE.from_file(
"test/fixtures/not_found_vocab.json",
"test/fixtures/merges.txt"
)
end

test "bad initialized with good pathes but invalid data" do
test "bad initialization with good paths but invalid data" do
assert {:error, _} =
Tokenizers.Model.BPE.from_file(
"test/fixtures/vocab.txt",
4 changes: 2 additions & 2 deletions test/tokenizers/model/wordlevel_test.exs
@@ -25,12 +25,12 @@ defmodule Tokenizers.Model.WordLevelTest do
end

describe "loaded from file" do
test "good initialized with valid pathes" do
test "good initialization with valid paths" do
assert {:ok, %Tokenizers.Model{}} =
Tokenizers.Model.WordLevel.from_file("test/fixtures/vocab.json")
end

test "bad initialized with invalid pathes" do
test "bad initialization with invalid paths" do
assert {:error, _} =
Tokenizers.Model.WordLevel.from_file("test/fixtures/not_found_vocab.json")
end
4 changes: 2 additions & 2 deletions test/tokenizers/model/wordpiece_test.exs
@@ -25,12 +25,12 @@ defmodule Tokenizers.Model.WordPieceTest do
end

describe "loaded from file" do
test "good initialized with valid pathes" do
test "good initialization with valid paths" do
assert {:ok, %Tokenizers.Model{}} =
Tokenizers.Model.WordPiece.from_file("test/fixtures/vocab.txt")
end

test "bad initialized with invalid pathes" do
test "bad initialization with invalid paths" do
assert {:error, _} =
Tokenizers.Model.WordPiece.from_file("test/fixtures/not_found_vocab.json")
end
2 changes: 1 addition & 1 deletion test/tokenizers/normalizer_test.exs
@@ -1,6 +1,6 @@
defmodule Tokenizers.NormalizerTest do
use ExUnit.Case, async: true
- doctest Tokenizers.Decoder
+ doctest Tokenizers.Normalizer

describe "Bert" do
test "accepts no parameters" do
