Skip to content

Commit

Permalink
add support for regular expressions in Tokenizers.Normalizer.replace/2
Browse files Browse the repository at this point in the history
  • Loading branch information
mruoss committed Apr 21, 2024
1 parent f560e8f commit 87addab
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 7 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased

### Added

- Support for regular expressions via the new `Tokenizers.Normalizer.replace_regex/2`
- Support for regular expressions in `Tokenizers.PreTokenizer.split/3`

## [v0.4.0] - 2023-08-09

### Added
Expand Down
19 changes: 15 additions & 4 deletions lib/tokenizers/normalizer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,23 @@ defmodule Tokenizers.Normalizer do
defdelegate lowercase(), to: Tokenizers.Native, as: :normalizers_lowercase

@doc """
Replaces a custom string or regexp and changes it with given content.
Replaces a custom `search` string with the given `content`.
"""
@spec replace(String.t(), String.t()) :: t()
defdelegate replace(pattern, content),
to: Tokenizers.Native,
as: :normalizers_replace
def replace(search, content) do
Tokenizers.Native.normalizers_replace({:string, search}, content)
end

@doc """
Replaces occurrences of a custom regexp `pattern` with the given `content`.
The `pattern` should be a string representing a regular expression
according to the [Oniguruma Regex Engine](https://github.com/kkos/oniguruma).
"""
@spec replace_regex(String.t(), String.t()) :: t()
def replace_regex(pattern, content) do
Tokenizers.Native.normalizers_replace({:regex, pattern}, content)
end

@doc """
Creates a Nmt normalizer.
Expand Down
18 changes: 15 additions & 3 deletions native/ex_tokenizers/src/normalizers.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::{new_info, util::Info, ExTokenizersError};
use rustler::NifTaggedEnum;
use serde::{Deserialize, Serialize};
use tokenizers::{
    normalizers::replace::ReplacePattern, NormalizedString, Normalizer, NormalizerWrapper,
};

/// Newtype wrapper around the tokenizers crate's normalizer so it can be
/// stored in a rustler resource and handed back to Elixir.
pub struct ExTokenizersNormalizerRef(pub NormalizerWrapper);

Expand Down Expand Up @@ -241,13 +241,25 @@ pub fn normalizers_lowercase() -> ExTokenizersNormalizer {
ExTokenizersNormalizer::new(tokenizers::normalizers::utils::Lowercase)
}

/// Local mirror of `tokenizers::normalizers::replace::ReplacePattern` for
/// the NIF boundary: `NifTaggedEnum` decodes the Elixir tagged tuples
/// `{:string, s}` / `{:regex, s}` into the matching variant.
#[derive(NifTaggedEnum)]
pub enum LocalReplacePattern {
    /// Literal string to be matched verbatim.
    String(String),
    /// Regular-expression source text (compiled by the tokenizers crate).
    Regex(String),
}

/// NIF: builds a `Replace` normalizer from an Elixir pattern tuple.
///
/// `pattern` arrives as `{:string, s}` or `{:regex, s}` and is converted
/// to the tokenizers crate's `ReplacePattern` before constructing the
/// normalizer. Returns `BadArg` when `Replace::new` rejects the input
/// (e.g. an invalid regular expression).
#[rustler::nif]
pub fn normalizers_replace(
    pattern: LocalReplacePattern,
    content: String,
) -> Result<ExTokenizersNormalizer, rustler::Error> {
    let final_pattern = match pattern {
        LocalReplacePattern::String(pattern) => ReplacePattern::String(pattern),
        LocalReplacePattern::Regex(pattern) => ReplacePattern::Regex(pattern),
    };

    Ok(ExTokenizersNormalizer::new(
        tokenizers::normalizers::replace::Replace::new(final_pattern, content)
            .map_err(|_| rustler::Error::BadArg)?,
    ))
}
Expand Down
24 changes: 24 additions & 0 deletions test/tokenizers/normalizer_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,28 @@ defmodule Tokenizers.NormalizerTest do
{:ok, "▁Hello"}
end
end

describe "Replace" do
test "can be initialized" do
assert %Tokenizers.Normalizer{} = Tokenizers.Normalizer.replace("find", "replace")
end

test "can normalize strings" do
assert Tokenizers.Normalizer.replace("Hello", "World")
|> Tokenizers.Normalizer.normalize("Hello") ==
{:ok, "World"}
end
end

describe "Replace Regex" do
test "can be initialized" do
assert %Tokenizers.Normalizer{} = Tokenizers.Normalizer.replace_regex("\\d*", "")
end

test "can normalize strings" do
assert Tokenizers.Normalizer.replace_regex("\\d*", "")
|> Tokenizers.Normalizer.normalize("1Hel2lo3") ==
{:ok, "Hello"}
end
end
end

0 comments on commit 87addab

Please sign in to comment.