From c9469311c35f1cd24871682d343a86bea7491962 Mon Sep 17 00:00:00 2001 From: Paul Guyot Date: Sun, 29 Sep 2024 12:21:18 +0200 Subject: [PATCH] Add `binary:split/3` and `string:find/2,3` Also update `string:split/2,3` to work with unicode binaries and fix specs. Signed-off-by: Paul Guyot --- CHANGELOG.md | 1 + libs/estdlib/src/binary.erl | 18 ++++- libs/estdlib/src/string.erl | 102 +++++++++++++++++++++++++--- src/libAtomVM/defaultatoms.c | 4 ++ src/libAtomVM/defaultatoms.h | 6 +- src/libAtomVM/nifs.c | 104 +++++++++++++++++++++-------- src/libAtomVM/nifs.gperf | 1 + tests/libs/estdlib/CMakeLists.txt | 1 + tests/libs/estdlib/test_binary.erl | 36 ++++++++++ tests/libs/estdlib/test_string.erl | 31 +++++++++ 10 files changed, 266 insertions(+), 38 deletions(-) create mode 100644 tests/libs/estdlib/test_binary.erl diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bffe7a1d..9ee4586a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ also non string parameters (e.g. `Enum.join([1, 2], ",")` - Support for `code:ensure_loaded/1` - Support for `io_lib:latin1_char_list/1` - Add support to Elixir for `Keyword.split/2` +- Support for `binary:split/3` and `string:find/2,3` ### Changed diff --git a/libs/estdlib/src/binary.erl b/libs/estdlib/src/binary.erl index cb93f9044..2254232c4 100644 --- a/libs/estdlib/src/binary.erl +++ b/libs/estdlib/src/binary.erl @@ -24,7 +24,7 @@ %%----------------------------------------------------------------------------- -module(binary). --export([at/2, part/3, split/2]). +-export([at/2, part/3, split/2, split/3]). %%----------------------------------------------------------------------------- %% @param Binary binary to get a byte from @@ -51,6 +51,7 @@ part(_Binary, _Pos, _Len) -> erlang:nif_error(undefined). %%----------------------------------------------------------------------------- +%% @equiv split(Binary, Pattern, []) %% @param Binary binary to split %% @param Pattern pattern to perform the split %% @return a list composed of one or two binaries @@ -62,3 +63,18 @@ part(_Binary, _Pos, _Len) -> -spec split(Binary :: binary(), Pattern :: binary()) -> [binary()]. split(_Binary, _Pattern) -> erlang:nif_error(undefined). + +%%----------------------------------------------------------------------------- +%% @param Binary binary to split +%% @param Pattern pattern to perform the split +%% @param Options options for the split +%% @return a list composed of one or two binaries +%% @doc Split a binary according to pattern. +%% If pattern is not found, returns a singleton list with the passed binary. +%% Unlike Erlang/OTP, pattern must be a binary. +%% Only implemented option is `global' +%% @end +%%----------------------------------------------------------------------------- +-spec split(Binary :: binary(), Pattern :: binary(), Option :: [global]) -> [binary()]. +split(_Binary, _Pattern, _Option) -> + erlang:nif_error(undefined). diff --git a/libs/estdlib/src/string.erl b/libs/estdlib/src/string.erl index ac701f207..63ee7a896 100644 --- a/libs/estdlib/src/string.erl +++ b/libs/estdlib/src/string.erl @@ -27,7 +27,7 @@ %%----------------------------------------------------------------------------- -module(string). --export([to_upper/1, to_lower/1, split/2, split/3, trim/1, trim/2]). +-export([to_upper/1, to_lower/1, split/2, split/3, trim/1, trim/2, find/2, find/3]). %%----------------------------------------------------------------------------- %% @param Input a string or character to convert @@ -76,7 +76,7 @@ lower_char(C) when is_integer(C) -> %% @returns chardata %% @end %%----------------------------------------------------------------------------- --spec split(String :: string(), Pattern :: string()) -> string() | char(). +-spec split(String :: unicode:chardata(), Pattern :: unicode:chardata()) -> [unicode:chardata()]. split(String, Pattern) -> split(String, Pattern, leading). @@ -98,25 +98,50 @@ split(String, Pattern) -> %% [<<"ab">>,<<"bc">>,<<>>,<<"cd">>]''' %% @end %%----------------------------------------------------------------------------- --spec split(String :: string(), Pattern :: string() | char(), Where :: atom()) -> string() | char(). -split(String, Pattern, Where) -> - split(String, Pattern, Where, [], []). +-spec split( + String :: unicode:chardata(), Pattern :: unicode:chardata(), Where :: leading | trailing | all +) -> [unicode:chardata()]. +split(String, Pattern, Where) when is_binary(String) andalso is_list(Pattern) -> + split_binary(String, unicode:characters_to_binary(Pattern), Where); +split(String, Pattern, Where) when is_binary(String) andalso is_binary(Pattern) -> + split_binary(String, Pattern, Where); +split(String, Pattern, Where) when is_list(String) andalso is_binary(Pattern) -> + split_list(String, unicode:characters_to_list(Pattern), Where); +split(String, Pattern, Where) when is_list(String) andalso is_list(Pattern) -> + split_list(String, Pattern, Where). %% @private -split([], _Pattern, _Where, Token, Accum) -> +split_binary(String, Pattern, leading) -> + binary:split(String, Pattern); +split_binary(String, Pattern, all) -> + binary:split(String, Pattern, [global]); +split_binary(String, Pattern, trailing) -> + case find_binary(String, Pattern, trailing) of + nomatch -> + [String]; + Rest -> + [binary:part(String, 0, byte_size(String) - byte_size(Rest) - byte_size(Pattern)), Rest] + end. + +%% @private +split_list(String, Pattern, Where) -> + split_list(String, Pattern, Where, [], []). + +%% @private +split_list([], _Pattern, _Where, Token, Accum) -> lists:reverse([lists:reverse(Token) | Accum]); -split(String, Pattern, Where, Token, Accum) -> +split_list(String, Pattern, Where, Token, Accum) -> case prefix_match(String, Pattern) of {ok, Rest} -> case Where of leading -> [lists:reverse(Token), Rest]; all -> - split(Rest, Pattern, Where, [], [lists:reverse(Token) | Accum]) + split_list(Rest, Pattern, Where, [], [lists:reverse(Token) | Accum]) end; no -> [Char | Rest] = String, - split(Rest, Pattern, Where, [Char | Token], Accum) + split_list(Rest, Pattern, Where, [Char | Token], Accum) end. %% @private @@ -167,3 +192,62 @@ triml([$\s | R]) -> triml(R); triml(R) -> R. + +%%----------------------------------------------------------------------------- +%% @equiv find(String, SearchPattern, leading) +%% @param String string to search in +%% @param SearchPattern pattern to search +%% @returns remainder of String starting from first occurrence of SearchPattern +%% or `nomatch' if SearchPattern cannot be found in String +%% @end +%%----------------------------------------------------------------------------- +-spec find(String :: unicode:chardata(), SearchPattern :: unicode:chardata()) -> + unicode:chardata() | nomatch. +find(String, SearchPattern) -> + find(String, SearchPattern, leading). + +%%----------------------------------------------------------------------------- +%% @param String string to search in +%% @param SearchPattern pattern to search +%% @param Direction direction to search, `leading' or `trailing' +%% @returns remainder of String starting from first or last occurrence of +%% SearchPattern or `nomatch' if SearchPattern cannot be found in String +%% @end +%%----------------------------------------------------------------------------- +-spec find( + String :: unicode:chardata(), + SearchPattern :: unicode:chardata(), + Direction :: leading | trailing +) -> unicode:chardata() | nomatch. +find(String, "", _Direction) -> + String; +find(String, <<>>, _Direction) -> + String; +find(String, SearchPattern, Direction) when is_binary(String) andalso is_list(SearchPattern) -> + find_binary(String, unicode:characters_to_binary(SearchPattern), Direction); +find(String, SearchPattern, Direction) when is_binary(String) andalso is_binary(SearchPattern) -> + find_binary(String, SearchPattern, Direction); +find(String, SearchPattern, Direction) when is_list(String) andalso is_binary(SearchPattern) -> + find_list(String, unicode:characters_to_list(SearchPattern), Direction); +find(String, SearchPattern, Direction) when is_list(String) andalso is_list(SearchPattern) -> + find_list(String, SearchPattern, Direction). + +%% @private +find_binary(<<_C, Rest/binary>> = String, SearchPattern, leading) when + byte_size(String) >= byte_size(SearchPattern) +-> + case binary:part(String, 0, byte_size(SearchPattern)) =:= SearchPattern of + true -> String; + false -> find_binary(Rest, SearchPattern, leading) + end; +find_binary(_Sring, _SearchPattern, leading) -> + nomatch. + +%% @private +find_list([_C | Rest] = String, SearchPattern, leading) -> + case prefix_match(String, SearchPattern) of + {ok, _Rest} -> String; + no -> find_list(Rest, SearchPattern, leading) + end; +find_list([], _SearchPattern, leading) -> + nomatch. diff --git a/src/libAtomVM/defaultatoms.c b/src/libAtomVM/defaultatoms.c index a8645adfd..f10d24293 100644 --- a/src/libAtomVM/defaultatoms.c +++ b/src/libAtomVM/defaultatoms.c @@ -160,6 +160,8 @@ static const char *const cast_atom = "\x5" "$cast"; static const char *const unicode_atom = "\x7" "unicode"; +static const char *const global_atom = "\x6" "global"; + void defaultatoms_init(GlobalContext *glb) { int ok = 1; @@ -304,6 +306,8 @@ void defaultatoms_init(GlobalContext *glb) ok &= globalcontext_insert_atom(glb, unicode_atom) == UNICODE_ATOM_INDEX; + ok &= globalcontext_insert_atom(glb, global_atom) == GLOBAL_ATOM_INDEX; + if (!ok) { AVM_ABORT(); } diff --git a/src/libAtomVM/defaultatoms.h b/src/libAtomVM/defaultatoms.h index eb0a4ff2c..285ee4f0e 100644 --- a/src/libAtomVM/defaultatoms.h +++ b/src/libAtomVM/defaultatoms.h @@ -169,7 +169,9 @@ extern "C" { #define UNICODE_ATOM_INDEX 110 -#define PLATFORM_ATOMS_BASE_INDEX 111 +#define GLOBAL_ATOM_INDEX 111 + +#define PLATFORM_ATOMS_BASE_INDEX 112 #define FALSE_ATOM TERM_FROM_ATOM_INDEX(FALSE_ATOM_INDEX) #define TRUE_ATOM TERM_FROM_ATOM_INDEX(TRUE_ATOM_INDEX) @@ -313,6 +315,8 @@ extern "C" { #define UNICODE_ATOM TERM_FROM_ATOM_INDEX(UNICODE_ATOM_INDEX) +#define GLOBAL_ATOM TERM_FROM_ATOM_INDEX(GLOBAL_ATOM_INDEX) + void defaultatoms_init(GlobalContext *glb); void platform_defaultatoms_init(GlobalContext *glb); diff --git a/src/libAtomVM/nifs.c b/src/libAtomVM/nifs.c index 9a4732254..ea202c77f 100644 --- a/src/libAtomVM/nifs.c +++ b/src/libAtomVM/nifs.c @@ -91,7 +91,7 @@ static term nif_binary_at_2(Context *ctx, int argc, term argv[]); static term nif_binary_first_1(Context *ctx, int argc, term argv[]); static term nif_binary_last_1(Context *ctx, int argc, term argv[]); static term nif_binary_part_3(Context *ctx, int argc, term argv[]); -static term nif_binary_split_2(Context *ctx, int argc, term argv[]); +static term nif_binary_split(Context *ctx, int argc, term argv[]); static term nif_calendar_system_time_to_universal_time_2(Context *ctx, int argc, term argv[]); static term nif_erlang_delete_element_2(Context *ctx, int argc, term argv[]); static term nif_erlang_atom_to_binary(Context *ctx, int argc, term argv[]); @@ -232,7 +232,7 @@ static const struct Nif binary_part_nif = static const struct Nif binary_split_nif = { .base.type = NIFFunctionType, - .nif_ptr = nif_binary_split_2 + .nif_ptr = nif_binary_split }; static const struct Nif make_ref_nif = @@ -3007,16 +3007,33 @@ static term nif_binary_part_3(Context *ctx, int argc, term argv[]) return term_maybe_create_sub_binary(bin_term, pos, len, &ctx->heap, ctx->global); } -static term nif_binary_split_2(Context *ctx, int argc, term argv[]) +static term nif_binary_split(Context *ctx, int argc, term argv[]) { - UNUSED(argc); - term bin_term = argv[0]; term pattern_term = argv[1]; VALIDATE_VALUE(bin_term, term_is_binary); VALIDATE_VALUE(pattern_term, term_is_binary); + bool global = false; + if (argc == 3) { + term options = argv[2]; + if (UNLIKELY(!term_is_list(options))) { + RAISE_ERROR(BADARG_ATOM); + } + if (term_is_nonempty_list(options)) { + term head = term_get_list_head(options); + term tail = term_get_list_tail(options); + if (UNLIKELY(head != GLOBAL_ATOM)) { + RAISE_ERROR(BADARG_ATOM); + } + if (UNLIKELY(!term_is_nil(tail))) { + RAISE_ERROR(BADARG_ATOM); + } + global = true; + } + } + int bin_size = term_binary_size(bin_term); int pattern_size = term_binary_size(pattern_term); @@ -3027,38 +3044,71 @@ static term nif_binary_split_2(Context *ctx, int argc, term argv[]) const char *bin_data = term_binary_data(bin_term); const char *pattern_data = term_binary_data(pattern_term); - const char *found = (const char *) memmem(bin_data, bin_size, pattern_data, pattern_size); + // Count segments first to allocate memory once. + size_t num_segments = 1; + const char *temp_bin_data = bin_data; + int temp_bin_size = bin_size; + do { + const char *found = (const char *) memmem(temp_bin_data, temp_bin_size, pattern_data, pattern_size); + if (!found) break; + num_segments++; + int next_search_offset = found - temp_bin_data + pattern_size; + temp_bin_data += next_search_offset; + temp_bin_size -= next_search_offset; + } while (global && temp_bin_size >= pattern_size); + + term result_list = term_nil(); + + if (num_segments == 1) { + // not found + if (UNLIKELY(memory_ensure_free_with_roots(ctx, 2, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) { + RAISE_ERROR(OUT_OF_MEMORY_ATOM); + } - int offset = found - bin_data; + return term_list_prepend(argv[0], result_list, &ctx->heap); + } - if (found) { - int tok_size = offset; - size_t tok_size_in_terms = term_sub_binary_heap_size(bin_term, tok_size); + // binary:split/2,3 always return sub binaries, except when copied binaries are as small as sub-binaries. + if (UNLIKELY(memory_ensure_free_with_roots(ctx, LIST_SIZE(num_segments, TERM_BOXED_SUB_BINARY_SIZE), 2, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) { + RAISE_ERROR(OUT_OF_MEMORY_ATOM); + } - int rest_size = bin_size - offset - pattern_size; - size_t rest_size_in_terms = term_sub_binary_heap_size(bin_term, rest_size); + // Allocate list first + for (size_t index_segments = 0; index_segments < num_segments; index_segments++) { + result_list = term_list_prepend(term_nil(), result_list, &ctx->heap); + } - // + 4 which is the result cons - if (UNLIKELY(memory_ensure_free_with_roots(ctx, tok_size_in_terms + rest_size_in_terms + 4, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) { - RAISE_ERROR(OUT_OF_MEMORY_ATOM); - } + // Reset pointers after allocation + bin_data = term_binary_data(argv[0]); + pattern_data = term_binary_data(argv[1]); + + term list_cursor = result_list; + temp_bin_data = bin_data; + temp_bin_size = bin_size; + term *list_ptr = term_get_list_ptr(list_cursor); + do { + const char *found = (const char *) memmem(temp_bin_data, temp_bin_size, pattern_data, pattern_size); - bin_term = argv[0]; - term tok = term_maybe_create_sub_binary(bin_term, 0, tok_size, &ctx->heap, ctx->global); - term rest = term_maybe_create_sub_binary(bin_term, offset + pattern_size, rest_size, &ctx->heap, ctx->global); + if (found) { + term tok = term_maybe_create_sub_binary(argv[0], temp_bin_data - bin_data, found - temp_bin_data, &ctx->heap, ctx->global); + list_ptr[LIST_HEAD_INDEX] = tok; - term result_list = term_list_prepend(rest, term_nil(), &ctx->heap); - result_list = term_list_prepend(tok, result_list, &ctx->heap); + list_cursor = list_ptr[LIST_TAIL_INDEX]; + list_ptr = term_get_list_ptr(list_cursor); - return result_list; + int next_search_offset = found - temp_bin_data + pattern_size; + temp_bin_data += next_search_offset; + temp_bin_size -= next_search_offset; + } - } else { - if (UNLIKELY(memory_ensure_free_with_roots(ctx, 2, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) { - RAISE_ERROR(OUT_OF_MEMORY_ATOM); + if (!found || !global) { + term rest = term_maybe_create_sub_binary(argv[0], temp_bin_data - bin_data, temp_bin_size, &ctx->heap, ctx->global); + list_ptr[LIST_HEAD_INDEX] = rest; + break; } + } while (!term_is_nil(list_cursor)); - return term_list_prepend(argv[0], term_nil(), &ctx->heap); - } + return result_list; } static term nif_erlang_throw(Context *ctx, int argc, term argv[]) diff --git a/src/libAtomVM/nifs.gperf b/src/libAtomVM/nifs.gperf index 49b1530e1..5852e86bf 100644 --- a/src/libAtomVM/nifs.gperf +++ b/src/libAtomVM/nifs.gperf @@ -36,6 +36,7 @@ binary:first/1, &binary_first_nif binary:last/1, &binary_last_nif binary:part/3, &binary_part_nif binary:split/2, &binary_split_nif +binary:split/3, &binary_split_nif calendar:system_time_to_universal_time/2, &system_time_to_universal_time_nif erlang:atom_to_binary/1, &atom_to_binary_nif erlang:atom_to_binary/2, &atom_to_binary_nif diff --git a/tests/libs/estdlib/CMakeLists.txt b/tests/libs/estdlib/CMakeLists.txt index 2dfe988bc..ebc4754f4 100644 --- a/tests/libs/estdlib/CMakeLists.txt +++ b/tests/libs/estdlib/CMakeLists.txt @@ -24,6 +24,7 @@ include(BuildErlang) set(ERLANG_MODULES test_apply + test_binary test_calendar test_gen_event test_gen_server diff --git a/tests/libs/estdlib/test_binary.erl b/tests/libs/estdlib/test_binary.erl new file mode 100644 index 000000000..2549e045d --- /dev/null +++ b/tests/libs/estdlib/test_binary.erl @@ -0,0 +1,36 @@ +% +% This file is part of AtomVM. +% +% Copyright 2024 Paul Guyot +% +% Licensed under the Apache License, Version 2.0 (the "License"); +% you may not use this file except in compliance with the License. +% You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +% See the License for the specific language governing permissions and +% limitations under the License. +% +% SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later +% + +-module(test_binary). + +-export([test/0]). + +-include("etest.hrl"). + +test() -> + ok = test_split(), + ok. + +test_split() -> + ?ASSERT_MATCH(binary:split(<<"foobar">>, <<"oo">>), [<<"f">>, <<"bar">>]), + ?ASSERT_MATCH(binary:split(<<"foobar">>, <<"ooz">>), [<<"foobar">>]), + ?ASSERT_MATCH(binary:split(<<"foobar">>, <<"o">>), [<<"f">>, <<"obar">>]), + ?ASSERT_MATCH(binary:split(<<"foobar">>, <<"o">>, [global]), [<<"f">>, <<>>, <<"bar">>]), + ok. diff --git a/tests/libs/estdlib/test_string.erl b/tests/libs/estdlib/test_string.erl index 178df5562..3006dab40 100644 --- a/tests/libs/estdlib/test_string.erl +++ b/tests/libs/estdlib/test_string.erl @@ -28,6 +28,7 @@ test() -> ok = test_to_upper(), ok = test_split(), ok = test_trim(), + ok = test_find(), ok. test_to_upper() -> @@ -47,6 +48,18 @@ test_split() -> ?ASSERT_MATCH(string:split("fooXXXbar", "XXX"), ["foo", "bar"]), ?ASSERT_MATCH(string:split("foo barXXXtapas", "XXX"), ["foo bar", "tapas"]), ?ASSERT_MATCH(string:split("foo barXXXXXXtapas", "XXX", all), ["foo bar", [], "tapas"]), + + ?ASSERT_MATCH(string:split("ab..bc..cd", ".."), ["ab", "bc..cd"]), + ?ASSERT_MATCH(string:split(<<"ab..bc..cd">>, ".."), [<<"ab">>, <<"bc..cd">>]), + ?ASSERT_MATCH(string:split(<<"ab..bc..cd">>, "..", leading), [<<"ab">>, <<"bc..cd">>]), + % ?ASSERT_MATCH(string:split(<<"ab..bc..cd">>, "..", trailing), [<<"ab..bc">>, <<"cd">>]), + ?ASSERT_MATCH(string:split(<<"ab..bc..cd">>, "..", all), [<<"ab">>, <<"bc">>, <<"cd">>]), + + ?ASSERT_MATCH(string:split("ab..bc..cd", <<"..">>), ["ab", "bc..cd"]), + ?ASSERT_MATCH(string:split(<<"ab..bc..cd">>, <<"..">>, leading), [<<"ab">>, <<"bc..cd">>]), + % ?ASSERT_MATCH(string:split(<<"ab..bc..cd">>, <<"..">>, trailing), [<<"ab..bc">>, <<"cd">>]), + ?ASSERT_MATCH(string:split(<<"ab..bc..cd">>, <<"..">>, all), [<<"ab">>, <<"bc">>, <<"cd">>]), + ok. test_trim() -> @@ -60,4 +73,22 @@ test_trim() -> ?ASSERT_MATCH(string:trim(" foo bar ", both), "foo bar"), ok. +test_find() -> + ?ASSERT_MATCH(string:find("", ""), ""), + ?ASSERT_MATCH(string:find("foo", ""), "foo"), + ?ASSERT_MATCH(string:find("", "foo"), nomatch), + ?ASSERT_MATCH(string:find(<<>>, <<>>), <<>>), + ?ASSERT_MATCH(string:find(<<>>, ""), <<>>), + ?ASSERT_MATCH(string:find(<<"foo">>, <<"">>), <<"foo">>), + ?ASSERT_MATCH(string:find(<<"foo">>, ""), <<"foo">>), + ?ASSERT_MATCH(string:find(<<"">>, <<"foo">>), nomatch), + ?ASSERT_MATCH(string:find(<<"">>, "foo"), nomatch), + + ?ASSERT_MATCH(string:find("foobar", "ba"), "bar"), + ?ASSERT_MATCH(string:find(<<"foobar">>, "ba"), <<"bar">>), + ?ASSERT_MATCH(string:find("foobar", <<"ba">>), "bar"), + ?ASSERT_MATCH(string:find(<<"foobar">>, <<"ba">>), <<"bar">>), + + ok. + id(X) -> X.