Skip to content

Commit

Permalink
Update tokenizer dependency to new minor and chain changes (#62)
Browse files Browse the repository at this point in the history
  • Loading branch information
Virviil authored Oct 2, 2024
1 parent 0e4f8c4 commit b3cd84b
Show file tree
Hide file tree
Showing 18 changed files with 261 additions and 184 deletions.
2 changes: 2 additions & 0 deletions lib/tokenizers/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ defmodule Tokenizers.Native do
# Normalizer entry points. Every clause here simply calls err/0, which is
# defined outside this excerpt.
# NOTE(review): presumably these are placeholders that the native
# implementation overrides once it is loaded — confirm against the module header.
def normalizers_replace(_pattern, _content), do: err()
def normalizers_nmt(), do: err()
def normalizers_precompiled(_data), do: err()
def normalizers_byte_level(), do: err()
def normalizers_byte_level_alphabet(), do: err()

# PreTokenizers
def pre_tokenizers_pre_tokenize(_pre_tokenizer, _input), do: err()
Expand Down
87 changes: 50 additions & 37 deletions lib/tokenizers/normalizer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ defmodule Tokenizers.Normalizer do
@spec normalize(t(), String.t()) :: {:ok, String.t()}
defdelegate normalize(normalizer, input), to: Tokenizers.Native, as: :normalizers_normalize

# Normalizer entities, listed in the same order as in https://docs.rs/tokenizers/0.20.0/src/tokenizers/normalizers/mod.rs.html#24

@doc """
Takes care of normalizing raw text before giving it to a BERT model.
Expand Down Expand Up @@ -49,30 +51,6 @@ defmodule Tokenizers.Normalizer do
to: Tokenizers.Native,
as: :normalizers_bert_normalizer

@doc """
Creates an NFD Unicode normalizer.

NFD is Unicode Normalization Form D (canonical decomposition).
Delegates to `Tokenizers.Native.normalizers_nfd/0`.
"""
@spec nfd :: t()
defdelegate nfd(), to: Tokenizers.Native, as: :normalizers_nfd

@doc """
Creates an NFKD Unicode normalizer.

NFKD is Unicode Normalization Form KD (compatibility decomposition).
Delegates to `Tokenizers.Native.normalizers_nfkd/0`.
"""
@spec nfkd :: t()
defdelegate nfkd(), to: Tokenizers.Native, as: :normalizers_nfkd

@doc """
Creates an NFC Unicode normalizer.

NFC is Unicode Normalization Form C (canonical decomposition followed by
canonical composition).
Delegates to `Tokenizers.Native.normalizers_nfc/0`.
"""
@spec nfc :: t()
defdelegate nfc(), to: Tokenizers.Native, as: :normalizers_nfc

@doc """
Creates an NFKC Unicode normalizer.

NFKC is Unicode Normalization Form KC (compatibility decomposition followed
by canonical composition).
Delegates to `Tokenizers.Native.normalizers_nfkc/0`.
"""
@spec nfkc :: t()
defdelegate nfkc(), to: Tokenizers.Native, as: :normalizers_nfkc

@doc """
Creates a Strip normalizer.
Expand All @@ -89,12 +67,6 @@ defmodule Tokenizers.Normalizer do
@spec strip(keyword()) :: t()
defdelegate strip(opts \\ []), to: Tokenizers.Native, as: :normalizers_strip

@doc """
Creates a Prepend normalizer.

The `prepend` string is passed unchanged to
`Tokenizers.Native.normalizers_prepend/1`.
"""
@spec prepend(prepend :: String.t()) :: t()
defdelegate prepend(prepend), to: Tokenizers.Native, as: :normalizers_prepend

@doc """
Creates a Strip Accent normalizer.
Expand All @@ -104,6 +76,30 @@ defmodule Tokenizers.Normalizer do
@spec strip_accents :: t()
defdelegate strip_accents(), to: Tokenizers.Native, as: :normalizers_strip_accents

@doc """
Creates an NFC Unicode normalizer.

NFC is Unicode Normalization Form C (canonical decomposition followed by
canonical composition).
Delegates to `Tokenizers.Native.normalizers_nfc/0`.
"""
@spec nfc :: t()
defdelegate nfc(), to: Tokenizers.Native, as: :normalizers_nfc

@doc """
Creates an NFD Unicode normalizer.

NFD is Unicode Normalization Form D (canonical decomposition).
Delegates to `Tokenizers.Native.normalizers_nfd/0`.
"""
@spec nfd :: t()
defdelegate nfd(), to: Tokenizers.Native, as: :normalizers_nfd

@doc """
Creates an NFKC Unicode normalizer.

NFKC is Unicode Normalization Form KC (compatibility decomposition followed
by canonical composition).
Delegates to `Tokenizers.Native.normalizers_nfkc/0`.
"""
@spec nfkc :: t()
defdelegate nfkc(), to: Tokenizers.Native, as: :normalizers_nfkc

@doc """
Creates an NFKD Unicode normalizer.

NFKD is Unicode Normalization Form KD (compatibility decomposition).
Delegates to `Tokenizers.Native.normalizers_nfkd/0`.
"""
@spec nfkd :: t()
defdelegate nfkd(), to: Tokenizers.Native, as: :normalizers_nfkd

@doc """
Composes multiple normalizers that will run in the provided order.
"""
Expand All @@ -116,6 +112,20 @@ defmodule Tokenizers.Normalizer do
@spec lowercase :: t()
defdelegate lowercase(), to: Tokenizers.Native, as: :normalizers_lowercase

@doc """
Creates an Nmt normalizer.

Delegates to `Tokenizers.Native.normalizers_nmt/0`.
"""
@spec nmt :: t()
defdelegate nmt(), to: Tokenizers.Native, as: :normalizers_nmt

@doc """
Creates a Precompiled normalizer from the given binary `data`.

Don't use this manually; it exists for compatibility with SentencePiece.

Returns `{:ok, normalizer}` on success or `{:error, reason}` otherwise.
Delegates to `Tokenizers.Native.normalizers_precompiled/1`.
"""
@spec precompiled(binary()) :: {:ok, t()} | {:error, any()}
defdelegate precompiled(data), to: Tokenizers.Native, as: :normalizers_precompiled

@doc """
Replaces a custom `search` string with the given `content`.
"""
Expand All @@ -136,18 +146,21 @@ defmodule Tokenizers.Normalizer do
end

@doc """
Creates a Nmt normalizer.
Creates a Prepend normalizer.
"""
@spec nmt :: t()
defdelegate nmt(), to: Tokenizers.Native, as: :normalizers_nmt
@spec prepend(prepend :: String.t()) :: t()
defdelegate prepend(prepend), to: Tokenizers.Native, as: :normalizers_prepend

@doc """
Creates a ByteLevel normalizer.

Delegates to `Tokenizers.Native.normalizers_byte_level/0`.
"""
@spec byte_level :: t()
defdelegate byte_level(), to: Tokenizers.Native, as: :normalizers_byte_level

Don’t use manually it is used for compatibility with SentencePiece.
@doc """
Gets ByteLevel normalizer's alphabet.
"""
@spec precompiled(binary()) :: {:ok, t()} | {:error, any()}
defdelegate precompiled(data), to: Tokenizers.Native, as: :normalizers_precompiled
defdelegate byte_level_alphabet(), to: Tokenizers.Native, as: :normalizers_byte_level_alphabet
end

defimpl Inspect, for: Tokenizers.Normalizer do
Expand Down
2 changes: 1 addition & 1 deletion mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ defmodule Tokenizers.MixProject do
use Mix.Project

@source_url "https://github.com/elixir-nx/tokenizers"
@version "0.5.0-dev"
@version "0.6.0-dev"

def project do
[
Expand Down
8 changes: 4 additions & 4 deletions mix.lock
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
%{
"castore": {:hex, :castore, "1.0.8", "dedcf20ea746694647f883590b82d9e96014057aff1d44d03ec90f36a5c0dc6e", [:mix], [], "hexpm", "0b2b66d2ee742cb1d9cb8c8be3b43c3a70ee8651f37b75a8b982e036752983f1"},
"castore": {:hex, :castore, "1.0.9", "5cc77474afadf02c7c017823f460a17daa7908e991b0cc917febc90e466a375c", [:mix], [], "hexpm", "5ea956504f1ba6f2b4eb707061d8e17870de2bee95fb59d512872c2ef06925e7"},
"earmark_parser": {:hex, :earmark_parser, "1.4.41", "ab34711c9dc6212dda44fcd20ecb87ac3f3fce6f0ca2f28d4a00e4154f8cd599", [:mix], [], "hexpm", "a81a04c7e34b6617c2792e291b5a2e57ab316365c2644ddc553bb9ed863ebefa"},
"ex_doc": {:hex, :ex_doc, "0.34.2", "13eedf3844ccdce25cfd837b99bea9ad92c4e511233199440488d217c92571e8", [:mix], [{:earmark_parser, "~> 1.4.39", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "5ce5f16b41208a50106afed3de6a2ed34f4acfd65715b82a0b84b49d995f95c1"},
"finch": {:hex, :finch, "0.18.0", "944ac7d34d0bd2ac8998f79f7a811b21d87d911e77a786bc5810adb75632ada4", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.3", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 0.2.6 or ~> 1.0", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "69f5045b042e531e53edc2574f15e25e735b522c37e2ddb766e15b979e03aa65"},
"finch": {:hex, :finch, "0.19.0", "c644641491ea854fc5c1bbaef36bfc764e3f08e7185e1f084e35e0672241b76d", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "fc5324ce209125d1e2fa0fcd2634601c52a787aff1cd33ee833664a5af4ea2b6"},
"hpax": {:hex, :hpax, "1.0.0", "28dcf54509fe2152a3d040e4e3df5b265dcb6cb532029ecbacf4ce52caea3fd2", [:mix], [], "hexpm", "7f1314731d711e2ca5fdc7fd361296593fc2542570b3105595bb0bc6d0fad601"},
"jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"},
"makeup": {:hex, :makeup, "1.1.2", "9ba8837913bdf757787e71c1581c21f9d2455f4dd04cfca785c70bbfff1a76a3", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cce1566b81fbcbd21eca8ffe808f33b221f9eee2cbc7a1706fc3da9ff18e6cac"},
Expand All @@ -15,7 +15,7 @@
"nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"},
"req": {:hex, :req, "0.5.6", "8fe1eead4a085510fe3d51ad854ca8f20a622aae46e97b302f499dfb84f726ac", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "cfaa8e720945d46654853de39d368f40362c2641c4b2153c886418914b372185"},
"rustler": {:hex, :rustler, "0.34.0", "e9a73ee419fc296a10e49b415a2eb87a88c9217aa0275ec9f383d37eed290c1c", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:req, "~> 0.5", [hex: :req, repo: "hexpm", optional: false]}, {:toml, "~> 0.6", [hex: :toml, repo: "hexpm", optional: false]}], "hexpm", "1d0c7449482b459513003230c0e2422b0252245776fe6fd6e41cb2b11bd8e628"},
"rustler_precompiled": {:hex, :rustler_precompiled, "0.6.2", "d2218ba08a43fa331957f30481d00b666664d7e3861431b02bd3f4f30eec8e5b", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "b9048eaed8d7d14a53f758c91865cc616608a438d2595f621f6a4b32a5511709"},
"telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"},
"rustler_precompiled": {:hex, :rustler_precompiled, "0.8.2", "5f25cbe220a8fac3e7ad62e6f950fcdca5a5a5f8501835d2823e8c74bf4268d5", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "63d1bd5f8e23096d1ff851839923162096364bac8656a4a3c00d1fff8e83ee0a"},
"telemetry": {:hex, :telemetry, "1.3.0", "fedebbae410d715cf8e7062c96a1ef32ec22e764197f70cda73d82778d61e7a2", [:rebar3], [], "hexpm", "7015fc8919dbe63764f4b4b87a95b7c0996bd539e0d499be6ec9d7f3875b79e6"},
"toml": {:hex, :toml, "0.7.0", "fbcd773caa937d0c7a02c301a1feea25612720ac3fa1ccb8bfd9d30d822911de", [:mix], [], "hexpm", "0690246a2478c1defd100b0c9b89b4ea280a22be9a7b313a8a058a2408a2fa70"},
}
File renamed without changes.
Loading

0 comments on commit b3cd84b

Please sign in to comment.