Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add binary:split/3 and string:find/2,3 #1293

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ also non string parameters (e.g. `Enum.join([1, 2], ",")`
- Support for `code:ensure_loaded/1`
- Support for `io_lib:latin1_char_list/1`
- Add support to Elixir for `Keyword.split/2`
- Support for `binary:split/3` and `string:find/2,3`

### Changed

Expand Down
18 changes: 17 additions & 1 deletion libs/estdlib/src/binary.erl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
%%-----------------------------------------------------------------------------
-module(binary).

-export([at/2, part/3, split/2]).
-export([at/2, part/3, split/2, split/3]).

%%-----------------------------------------------------------------------------
%% @param Binary binary to get a byte from
Expand All @@ -51,6 +51,7 @@ part(_Binary, _Pos, _Len) ->
erlang:nif_error(undefined).

%%-----------------------------------------------------------------------------
%% @equiv split(Binary, Pattern, [])
%% @param Binary binary to split
%% @param Pattern pattern to perform the split
%% @return a list composed of one or two binaries
Expand All @@ -62,3 +63,18 @@ part(_Binary, _Pos, _Len) ->
-spec split(Binary :: binary(), Pattern :: binary()) -> [binary()].
split(_Binary, _Pattern) ->
erlang:nif_error(undefined).

%%-----------------------------------------------------------------------------
%% @param   Binary binary to split
%% @param   Pattern pattern to perform the split
%% @param   Options options for the split, either `[]' or `[global]'
%% @return  a list of binaries: one or two binaries when `Options' is `[]',
%%          one binary per segment when `Options' is `[global]'
%% @doc     Split a binary according to pattern.
%%          If pattern is not found, returns a singleton list with the passed binary.
%%          Unlike Erlang/OTP, pattern must be a binary.
%%          The only implemented option is `global', which splits at every
%%          occurrence of the pattern instead of only the first one.
%%          Any other option list raises `badarg'.
%%          This function is implemented as a NIF; the Erlang body below is
%%          only a placeholder that fails if the NIF is not available.
%% @end
%%-----------------------------------------------------------------------------
-spec split(Binary :: binary(), Pattern :: binary(), Option :: [global]) -> [binary()].
split(_Binary, _Pattern, _Option) ->
erlang:nif_error(undefined).
102 changes: 93 additions & 9 deletions libs/estdlib/src/string.erl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
%%-----------------------------------------------------------------------------
-module(string).

-export([to_upper/1, to_lower/1, split/2, split/3, trim/1, trim/2]).
-export([to_upper/1, to_lower/1, split/2, split/3, trim/1, trim/2, find/2, find/3]).

%%-----------------------------------------------------------------------------
%% @param Input a string or character to convert
Expand Down Expand Up @@ -76,7 +76,7 @@ lower_char(C) when is_integer(C) ->
%% @returns chardata
%% @end
%%-----------------------------------------------------------------------------
-spec split(String :: string(), Pattern :: string()) -> string() | char().
-spec split(String :: unicode:chardata(), Pattern :: unicode:chardata()) -> [unicode:chardata()].
split(String, Pattern) ->
split(String, Pattern, leading).

Expand All @@ -98,25 +98,50 @@ split(String, Pattern) ->
%% [<<"ab">>,<<"bc">>,<<>>,<<"cd">>]'''
%% @end
%%-----------------------------------------------------------------------------
-spec split(String :: string(), Pattern :: string() | char(), Where :: atom()) -> string() | char().
split(String, Pattern, Where) ->
split(String, Pattern, Where, [], []).
-spec split(
    String :: unicode:chardata(), Pattern :: unicode:chardata(), Where :: leading | trailing | all
) -> [unicode:chardata()].
%% Binary input: the pattern is brought to binary form before splitting.
split(String, Pattern, Where) when is_binary(String), is_binary(Pattern) ->
    split_binary(String, Pattern, Where);
split(String, Pattern, Where) when is_binary(String), is_list(Pattern) ->
    BinPattern = unicode:characters_to_binary(Pattern),
    split_binary(String, BinPattern, Where);
%% List input: the pattern is brought to list form before splitting.
split(String, Pattern, Where) when is_list(String), is_list(Pattern) ->
    split_list(String, Pattern, Where);
split(String, Pattern, Where) when is_list(String), is_binary(Pattern) ->
    ListPattern = unicode:characters_to_list(Pattern),
    split_list(String, ListPattern, Where).

%% @private
split([], _Pattern, _Where, Token, Accum) ->
%% Split a binary string around a binary pattern.
%% `leading' splits at the first occurrence and `all' at every occurrence,
%% both through binary:split; `trailing' splits at the last occurrence.
%% Returns `[String]' when the pattern does not occur.
split_binary(String, Pattern, leading) ->
    binary:split(String, Pattern);
split_binary(String, Pattern, all) ->
    binary:split(String, Pattern, [global]);
split_binary(String, Pattern, trailing) ->
    case find_binary(String, Pattern, trailing) of
        nomatch ->
            [String];
        Rest ->
            %% find_binary/3 returns the remainder of String *starting at* the
            %% matched pattern, so Rest still begins with Pattern: everything
            %% before Rest is the first part, and Pattern must be dropped from
            %% the front of Rest to obtain the second part.
            PrefixSize = byte_size(String) - byte_size(Rest),
            PatternSize = byte_size(Pattern),
            [
                binary:part(String, 0, PrefixSize),
                binary:part(Rest, PatternSize, byte_size(Rest) - PatternSize)
            ]
    end.

%% @private
%% @doc Entry point of the list-based split: starts the scan with an empty
%% current token and an empty accumulator of completed tokens.
split_list(String, Pattern, Where) ->
split_list(String, Pattern, Where, [], []).

%% @private
split_list([], _Pattern, _Where, Token, Accum) ->
lists:reverse([lists:reverse(Token) | Accum]);
split(String, Pattern, Where, Token, Accum) ->
split_list(String, Pattern, Where, Token, Accum) ->
case prefix_match(String, Pattern) of
{ok, Rest} ->
case Where of
leading ->
[lists:reverse(Token), Rest];
all ->
split(Rest, Pattern, Where, [], [lists:reverse(Token) | Accum])
split_list(Rest, Pattern, Where, [], [lists:reverse(Token) | Accum])
end;
no ->
[Char | Rest] = String,
split(Rest, Pattern, Where, [Char | Token], Accum)
split_list(Rest, Pattern, Where, [Char | Token], Accum)
end.

%% @private
Expand Down Expand Up @@ -167,3 +192,62 @@ triml([$\s | R]) ->
triml(R);
triml(R) ->
R.

%%-----------------------------------------------------------------------------
%% @equiv   find(String, SearchPattern, leading)
%% @param   String string to search in
%% @param   SearchPattern pattern to search for
%% @returns remainder of String starting from the first occurrence of
%%          SearchPattern, or `nomatch' if SearchPattern cannot be found in
%%          String
%% @end
%%-----------------------------------------------------------------------------
-spec find(String :: unicode:chardata(), SearchPattern :: unicode:chardata()) ->
unicode:chardata() | nomatch.
find(String, SearchPattern) ->
find(String, SearchPattern, leading).

%%-----------------------------------------------------------------------------
%% @param   String string to search in (binary or list)
%% @param   SearchPattern pattern to look for (binary or list)
%% @param   Direction `leading' to locate the first occurrence, `trailing' to
%%          locate the last one
%% @returns the part of String that begins at the located occurrence of
%%          SearchPattern, or `nomatch' if SearchPattern does not occur in
%%          String
%% @doc     Find a pattern inside a string.
%%          An empty pattern always matches and String is returned unchanged.
%%          The search is carried out in the representation of String: the
%%          pattern is converted to a binary when String is a binary and to a
%%          list when String is a list.
%% @end
%%-----------------------------------------------------------------------------
-spec find(
    String :: unicode:chardata(),
    SearchPattern :: unicode:chardata(),
    Direction :: leading | trailing
) -> unicode:chardata() | nomatch.
find(String, [], _Direction) ->
    String;
find(String, <<>>, _Direction) ->
    String;
find(String, SearchPattern, Direction) when is_binary(String), is_binary(SearchPattern) ->
    find_binary(String, SearchPattern, Direction);
find(String, SearchPattern, Direction) when is_binary(String), is_list(SearchPattern) ->
    BinPattern = unicode:characters_to_binary(SearchPattern),
    find_binary(String, BinPattern, Direction);
find(String, SearchPattern, Direction) when is_list(String), is_list(SearchPattern) ->
    find_list(String, SearchPattern, Direction);
find(String, SearchPattern, Direction) when is_list(String), is_binary(SearchPattern) ->
    ListPattern = unicode:characters_to_list(SearchPattern),
    find_list(String, ListPattern, Direction).

%% @private
%% @doc Byte-wise search for the binary SearchPattern inside the binary String.
%% With `leading' the first occurrence is located, with `trailing' the last
%% one. Returns the remainder of String starting at the located occurrence
%% (the pattern itself included), or `nomatch'.
find_binary(<<_C, Rest/binary>> = String, SearchPattern, leading) when
    byte_size(String) >= byte_size(SearchPattern)
->
    case binary:part(String, 0, byte_size(SearchPattern)) =:= SearchPattern of
        true -> String;
        false -> find_binary(Rest, SearchPattern, leading)
    end;
find_binary(_String, _SearchPattern, leading) ->
    nomatch;
find_binary(String, SearchPattern, trailing) when
    byte_size(String) >= byte_size(SearchPattern)
->
    %% Start at the last offset where the pattern can still fit and walk
    %% backwards, so the first hit is the last occurrence.
    find_binary_trailing(String, SearchPattern, byte_size(String) - byte_size(SearchPattern));
find_binary(_String, _SearchPattern, trailing) ->
    nomatch.

%% @private
%% @doc Backwards scan used by find_binary/3: tries Offset, then Offset - 1,
%% and so on down to 0.
find_binary_trailing(String, SearchPattern, Offset) when Offset >= 0 ->
    case binary:part(String, Offset, byte_size(SearchPattern)) =:= SearchPattern of
        true -> binary:part(String, Offset, byte_size(String) - Offset);
        false -> find_binary_trailing(String, SearchPattern, Offset - 1)
    end;
find_binary_trailing(_String, _SearchPattern, _Offset) ->
    nomatch.

%% @private
%% @doc Search for the list SearchPattern inside the list String.
%% With `leading' the first occurrence is located, with `trailing' the last
%% one. Returns the remainder of String starting at the located occurrence
%% (the pattern itself included), or `nomatch'.
%% Relies on prefix_match/2 from this module, which is used here as returning
%% `{ok, Rest}' when String starts with SearchPattern and `no' otherwise.
find_list([_C | Rest] = String, SearchPattern, leading) ->
    case prefix_match(String, SearchPattern) of
        {ok, _Rest} -> String;
        no -> find_list(Rest, SearchPattern, leading)
    end;
find_list([], _SearchPattern, leading) ->
    nomatch;
find_list(String, SearchPattern, trailing) ->
    find_list_trailing(String, SearchPattern, nomatch).

%% @private
%% @doc Forward scan used by find_list/3 that remembers the most recent match,
%% so that the value left at the end of the list is the last occurrence.
find_list_trailing([_C | Rest] = String, SearchPattern, LastMatch) ->
    case prefix_match(String, SearchPattern) of
        {ok, _Rest} -> find_list_trailing(Rest, SearchPattern, String);
        no -> find_list_trailing(Rest, SearchPattern, LastMatch)
    end;
find_list_trailing([], _SearchPattern, LastMatch) ->
    LastMatch.
4 changes: 4 additions & 0 deletions src/libAtomVM/defaultatoms.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ static const char *const cast_atom = "\x5" "$cast";

static const char *const unicode_atom = "\x7" "unicode";

static const char *const global_atom = "\x6" "global";

void defaultatoms_init(GlobalContext *glb)
{
int ok = 1;
Expand Down Expand Up @@ -304,6 +306,8 @@ void defaultatoms_init(GlobalContext *glb)

ok &= globalcontext_insert_atom(glb, unicode_atom) == UNICODE_ATOM_INDEX;

ok &= globalcontext_insert_atom(glb, global_atom) == GLOBAL_ATOM_INDEX;

if (!ok) {
AVM_ABORT();
}
Expand Down
6 changes: 5 additions & 1 deletion src/libAtomVM/defaultatoms.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,9 @@ extern "C" {

#define UNICODE_ATOM_INDEX 110

#define PLATFORM_ATOMS_BASE_INDEX 111
#define GLOBAL_ATOM_INDEX 111

#define PLATFORM_ATOMS_BASE_INDEX 112

#define FALSE_ATOM TERM_FROM_ATOM_INDEX(FALSE_ATOM_INDEX)
#define TRUE_ATOM TERM_FROM_ATOM_INDEX(TRUE_ATOM_INDEX)
Expand Down Expand Up @@ -313,6 +315,8 @@ extern "C" {

#define UNICODE_ATOM TERM_FROM_ATOM_INDEX(UNICODE_ATOM_INDEX)

#define GLOBAL_ATOM TERM_FROM_ATOM_INDEX(GLOBAL_ATOM_INDEX)

void defaultatoms_init(GlobalContext *glb);

void platform_defaultatoms_init(GlobalContext *glb);
Expand Down
104 changes: 77 additions & 27 deletions src/libAtomVM/nifs.c
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ static term nif_binary_at_2(Context *ctx, int argc, term argv[]);
static term nif_binary_first_1(Context *ctx, int argc, term argv[]);
static term nif_binary_last_1(Context *ctx, int argc, term argv[]);
static term nif_binary_part_3(Context *ctx, int argc, term argv[]);
static term nif_binary_split_2(Context *ctx, int argc, term argv[]);
static term nif_binary_split(Context *ctx, int argc, term argv[]);
static term nif_calendar_system_time_to_universal_time_2(Context *ctx, int argc, term argv[]);
static term nif_erlang_delete_element_2(Context *ctx, int argc, term argv[]);
static term nif_erlang_atom_to_binary(Context *ctx, int argc, term argv[]);
Expand Down Expand Up @@ -232,7 +232,7 @@ static const struct Nif binary_part_nif =
static const struct Nif binary_split_nif =
{
.base.type = NIFFunctionType,
.nif_ptr = nif_binary_split_2
.nif_ptr = nif_binary_split
};

static const struct Nif make_ref_nif =
Expand Down Expand Up @@ -3007,16 +3007,33 @@ static term nif_binary_part_3(Context *ctx, int argc, term argv[])
return term_maybe_create_sub_binary(bin_term, pos, len, &ctx->heap, ctx->global);
}

static term nif_binary_split_2(Context *ctx, int argc, term argv[])
static term nif_binary_split(Context *ctx, int argc, term argv[])
{
UNUSED(argc);

term bin_term = argv[0];
term pattern_term = argv[1];

VALIDATE_VALUE(bin_term, term_is_binary);
VALIDATE_VALUE(pattern_term, term_is_binary);

bool global = false;
if (argc == 3) {
term options = argv[2];
if (UNLIKELY(!term_is_list(options))) {
RAISE_ERROR(BADARG_ATOM);
}
if (term_is_nonempty_list(options)) {
term head = term_get_list_head(options);
term tail = term_get_list_tail(options);
if (UNLIKELY(head != GLOBAL_ATOM)) {
RAISE_ERROR(BADARG_ATOM);
}
if (UNLIKELY(!term_is_nil(tail))) {
RAISE_ERROR(BADARG_ATOM);
}
global = true;
}
}

int bin_size = term_binary_size(bin_term);
int pattern_size = term_binary_size(pattern_term);

Expand All @@ -3027,38 +3044,71 @@ static term nif_binary_split_2(Context *ctx, int argc, term argv[])
const char *bin_data = term_binary_data(bin_term);
const char *pattern_data = term_binary_data(pattern_term);

const char *found = (const char *) memmem(bin_data, bin_size, pattern_data, pattern_size);
// Count segments first to allocate memory once.
size_t num_segments = 1;
const char *temp_bin_data = bin_data;
int temp_bin_size = bin_size;
do {
const char *found = (const char *) memmem(temp_bin_data, temp_bin_size, pattern_data, pattern_size);
if (!found) break;
num_segments++;
int next_search_offset = found - temp_bin_data + pattern_size;
temp_bin_data += next_search_offset;
temp_bin_size -= next_search_offset;
} while (global && temp_bin_size >= pattern_size);

term result_list = term_nil();

if (num_segments == 1) {
// not found
if (UNLIKELY(memory_ensure_free_with_roots(ctx, 2, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
}

int offset = found - bin_data;
return term_list_prepend(argv[0], result_list, &ctx->heap);
}

if (found) {
int tok_size = offset;
size_t tok_size_in_terms = term_sub_binary_heap_size(bin_term, tok_size);
// binary:split/2,3 always return sub binaries, except when copied binaries are as small as sub-binaries.
if (UNLIKELY(memory_ensure_free_with_roots(ctx, LIST_SIZE(num_segments, TERM_BOXED_SUB_BINARY_SIZE), 2, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
}

int rest_size = bin_size - offset - pattern_size;
size_t rest_size_in_terms = term_sub_binary_heap_size(bin_term, rest_size);
// Allocate list first
for (size_t index_segments = 0; index_segments < num_segments; index_segments++) {
result_list = term_list_prepend(term_nil(), result_list, &ctx->heap);
}

// + 4 which is the result cons
if (UNLIKELY(memory_ensure_free_with_roots(ctx, tok_size_in_terms + rest_size_in_terms + 4, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
}
// Reset pointers after allocation
bin_data = term_binary_data(argv[0]);
pattern_data = term_binary_data(argv[1]);

term list_cursor = result_list;
temp_bin_data = bin_data;
temp_bin_size = bin_size;
term *list_ptr = term_get_list_ptr(list_cursor);
do {
const char *found = (const char *) memmem(temp_bin_data, temp_bin_size, pattern_data, pattern_size);

bin_term = argv[0];
term tok = term_maybe_create_sub_binary(bin_term, 0, tok_size, &ctx->heap, ctx->global);
term rest = term_maybe_create_sub_binary(bin_term, offset + pattern_size, rest_size, &ctx->heap, ctx->global);
if (found) {
term tok = term_maybe_create_sub_binary(argv[0], temp_bin_data - bin_data, found - temp_bin_data, &ctx->heap, ctx->global);
list_ptr[LIST_HEAD_INDEX] = tok;

term result_list = term_list_prepend(rest, term_nil(), &ctx->heap);
result_list = term_list_prepend(tok, result_list, &ctx->heap);
list_cursor = list_ptr[LIST_TAIL_INDEX];
list_ptr = term_get_list_ptr(list_cursor);

return result_list;
int next_search_offset = found - temp_bin_data + pattern_size;
temp_bin_data += next_search_offset;
temp_bin_size -= next_search_offset;
}

} else {
if (UNLIKELY(memory_ensure_free_with_roots(ctx, 2, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
if (!found || !global) {
term rest = term_maybe_create_sub_binary(argv[0], temp_bin_data - bin_data, temp_bin_size, &ctx->heap, ctx->global);
list_ptr[LIST_HEAD_INDEX] = rest;
break;
}
} while (!term_is_nil(list_cursor));

return term_list_prepend(argv[0], term_nil(), &ctx->heap);
}
return result_list;
}

static term nif_erlang_throw(Context *ctx, int argc, term argv[])
Expand Down
1 change: 1 addition & 0 deletions src/libAtomVM/nifs.gperf
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ binary:first/1, &binary_first_nif
binary:last/1, &binary_last_nif
binary:part/3, &binary_part_nif
binary:split/2, &binary_split_nif
binary:split/3, &binary_split_nif
calendar:system_time_to_universal_time/2, &system_time_to_universal_time_nif
erlang:atom_to_binary/1, &atom_to_binary_nif
erlang:atom_to_binary/2, &atom_to_binary_nif
Expand Down
1 change: 1 addition & 0 deletions tests/libs/estdlib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ include(BuildErlang)

set(ERLANG_MODULES
test_apply
test_binary
test_calendar
test_gen_event
test_gen_server
Expand Down
Loading
Loading