From 2e95b67c0ab719a961e86c905393033854b68704 Mon Sep 17 00:00:00 2001 From: Yoan Picchi Date: Mon, 21 Oct 2024 17:08:56 +0000 Subject: [PATCH] Draft of the extended vectorscan API This commit extends the vectorscan API with access to low level algorithms. This would enable a developer to bypass most of the overhead of the regular vectorscan scan when the pattern to check is simple. Currently it only targets pure literal patterns. Signed-off-by: Yoan Picchi --- CMakeLists.txt | 1 + benchmarks/CMakeLists.txt | 5 + benchmarks/test_extended_api.cpp | 165 +++++++++++++++++++++ src/hs_common.h | 150 +++++++++++++++++++ src/hs_compile.h | 189 ++++++++++++++++++++++++ src/hs_extended_api.cpp | 146 +++++++++++++++++++ src/hs_runtime.h | 238 +++++++++++++++++++++++++++++++ 7 files changed, 894 insertions(+) create mode 100644 benchmarks/test_extended_api.cpp create mode 100644 src/hs_extended_api.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 928b6cb13..80f5cb3e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -291,6 +291,7 @@ set (hs_exec_SRCS src/crc32.h src/report.h src/runtime.c + src/hs_extended_api.cpp src/stream_compress.c src/stream_compress.h src/stream_compress_impl.h diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 63391a68c..b5a8ff528 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -6,4 +6,9 @@ if (NOT FAT_RUNTIME AND (BUILD_SHARED_LIBS OR BUILD_STATIC_LIBS)) set_source_files_properties(benchmarks.cpp PROPERTIES COMPILE_FLAGS "-Wall -Wno-unused-variable") target_link_libraries(benchmarks hs) + + add_executable(test_api test_extended_api.cpp) + set_source_files_properties(test_extended_api.cpp PROPERTIES COMPILE_FLAGS + "-Wall -Wno-unused-variable") + target_link_libraries(test_api hs) endif() diff --git a/benchmarks/test_extended_api.cpp b/benchmarks/test_extended_api.cpp new file mode 100644 index 000000000..d3d3aab13 --- /dev/null +++ b/benchmarks/test_extended_api.cpp @@ -0,0 +1,165 @@ +/* + * 
Copyright (c) 2024-2025, Arm ltd + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include <cstddef>
+#include <cstdio>
+#include <cstring>
+
+#include "hs_compile.h"
+#include "hs_runtime.h"
+
+#include "hwlm/hwlm_literal.h"
+#include "hwlm/noodle_build.h"
+#include "hwlm/noodle_engine.h"
+#include "hwlm/noodle_internal.h"
+
+/* Haystacks scanned by the tests; the lengths exclude the terminating NUL. */
+static const char buf1[] = "azertyuioperty";
+static const size_t buf1_len = sizeof(buf1) - 1;
+static const char buf2[] = "AZERTYUIOPERTY";
+static const size_t buf2_len = sizeof(buf2) - 1;
+
+typedef struct context {
+    /* array of indices in the string where we expect match to be reported */
+    size_t *expected_array;
+    size_t array_size;
+    /* counter of matches happening at a position in expected_array */
+    size_t number_matched;
+    /* counter of matches happening at a position NOT in expected_array */
+    size_t number_wrong;
+} context_t;
+
+/**
+ * Match callback handed to the search functions.
+ *
+ * Compares the reported @p end_offset against the expected offsets stored in
+ * the context_t pointed to by @p raw_context and bumps either number_matched
+ * or number_wrong. Always asks the engine to keep scanning.
+ */
+int callback(unsigned int id, unsigned long long start,
+             unsigned long long end_offset, unsigned int flags,
+             void *raw_context) {
+    (void)id;
+    (void)start;
+    (void)flags;
+    context_t *context = static_cast<context_t *>(raw_context);
+    bool matched = false;
+    // Check if the match is expected
+    for (size_t i = 0; i < context->array_size; i++) {
+        if (end_offset == context->expected_array[i]) {
+            matched = true;
+            break;
+        }
+    }
+    // Tally the right counter whether the match was expected or not
+    if (matched) {
+        context->number_matched += 1;
+    } else {
+        context->number_wrong += 1;
+        printf("unplanned match at index %llu\n", end_offset);
+    }
+
+    return CB_CONTINUE_MATCHING;
+}
+
+/**
+ * Run one scan of @p data with @p database and compare the reported end
+ * offsets with @p expected.
+ *
+ * @return 0 when the scan succeeded, every expected offset was reported and
+ *         no unexpected offset was reported; 1 otherwise.
+ */
+static int run_search_case(int case_id,
+                           const hs_short_literal_compiled_pattern_t *database,
+                           const char *data, size_t length, size_t *expected,
+                           size_t expected_count) {
+    context_t context = {expected, expected_count, 0, 0};
+    hs_error_t ret = hs_short_literal_search(database, data, length, nullptr,
+                                             callback, &context);
+    if (ret != HS_SUCCESS) {
+        printf("Fail to run noodle\n");
+        return 1;
+    }
+
+    int failed = 0;
+    if (context.number_matched != context.array_size) {
+        /* size_t is printed with %zu; no cast is needed */
+        printf("%d- missed some matches. Expected: %zu, got %zu\n", case_id,
+               context.array_size, context.number_matched);
+        failed = 1;
+    }
+    if (context.number_wrong != 0) {
+        printf("%d- got %zu unexpected matches\n", case_id,
+               context.number_wrong);
+        failed = 1;
+    }
+    return failed;
+}
+
+/**
+ * Exercise hs_compile_short_literal_search() / hs_short_literal_search() with
+ * a lower-case and an upper-case pattern, on a full buffer and on a buffer
+ * that starts in the middle of the first occurrence.
+ *
+ * @return 0 when every case passed, 1 otherwise.
+ */
+int test_noodle() {
+    hs_short_literal_compiled_pattern_t noodle_database;
+    /* expected end offsets when scanning the whole buffer */
+    size_t expected_full[2] = {4, 12};
+    /* expected end offset when scanning from buffer + 4 */
+    size_t expected_tail[1] = {8};
+    int failures = 0;
+
+    const char *pattern = "ert";
+    hs_error_t ret = hs_compile_short_literal_search(pattern, strlen(pattern),
+                                                     &noodle_database);
+    if (ret != HS_SUCCESS) {
+        printf("Fail to build the pattern\n");
+        return 1;
+    }
+    failures += run_search_case(1, &noodle_database, buf1, buf1_len,
+                                expected_full, 2);
+    failures += run_search_case(2, &noodle_database, buf1 + 4, buf1_len - 4,
+                                expected_tail, 1);
+
+    pattern = "ERT";
+    ret = hs_compile_short_literal_search(pattern, strlen(pattern),
+                                          &noodle_database);
+    if (ret != HS_SUCCESS) {
+        printf("Fail to build the pattern\n");
+        return 1;
+    }
+    failures += run_search_case(3, &noodle_database, buf2, buf2_len,
+                                expected_full, 2);
+    failures += run_search_case(4, &noodle_database, buf2 + 4, buf2_len - 4,
+                                expected_tail, 1);
+
+    return failures != 0 ? 1 : 0;
+}
+
+int main() {
+    /* propagate the result so that a failing test fails the process */
+    if (test_noodle() != 0) {
+        printf("some tests failed\n");
+        return 1;
+    }
+    printf("all test passed\n");
+
+    return 0;
+}
diff --git a/src/hs_common.h b/src/hs_common.h
index 3078ad7bb..cdeb6af4f 100644
--- a/src/hs_common.h
+++ b/src/hs_common.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2019, Intel Corporation
+ * Copyright (c) 2024-2025, Arm ltd
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -583,6 +584,155 @@ hs_error_t HS_CDECL hs_valid_platform(void);
  */
 #define HS_UNKNOWN_ERROR (-13)
 
+/**
+ * The following functions and types are part of the extended API and are not
+ * cross compatible with hyperscan. This extension intends on providing the
+ * developer with minimal overhead search functions.
+ */
+
+/**
+ * The size threshold after which a pattern is considered long and must be fed
+ * to @ref hs_compile_long_literal_search(). Patterns up to this length may be
+ * fed to @ref hs_compile_short_literal_search() instead.
+ */
+#define HS_SHORT_PATTERN_THRESHOLD 16
+
+/**
+ * The compiled pattern type for searching short literals.
+ * Generated by @ref hs_compile_short_literal_search()
+ */
+typedef struct {
+    unsigned char data[32];
+} hs_short_literal_compiled_pattern_t;
+
+/**
+ * The compiled pattern type for searching long literals
+ * Generated by @ref hs_compile_long_literal_search()
+ * Note that given the unbounded nature of the pattern size, it is impossible to
+ * give a constant size to the structure. You'll need to use @ref
+ * hs_get_long_literal_search_database_size() and allocate enough memory to
+ * store the compiled pattern.
+ */ +typedef struct hs_long_literal_compiled_pattern hs_long_literal_compiled_pattern_t; + +/** + * RFC: hs_compile_long_literal_search() and a few others don't have any obvious + * bounds on the size of the pattern. To make it generic we need to get the size + * at runtime. The issue is that it is slow. I'd like to also provide an + * optional static size in some way when the backing algorithm allows it. It + * might be in the form of a polynomial function in a #define that takes in the + * expected max size of the pattern and the number of patterns. This would allow + * the user to use a static size in their own algo if they know in advance what + * kind of pattern they'll use. The problem with this though is that I'm not + * sure how often such a feature would be used and I don't want to clutter the + * API. Any feedback? + */ + +/** + * This function calculates the size needed to store the compiled version of the + * given @p expression . + * + * @param expression + * The expression to parse. Note that this string must represent ONLY the + * pattern to be matched, with no delimiters. Null characters are accepted as + * part of the expression. + * + * @param pattern_len + * The length of the expression in bytes. + * + * @return + * On success, the size in bytes that needs to be allocated to store the + * given pattern. Otherwise returns 0. + */ +size_t HS_CDECL hs_get_long_literal_search_database_size(const char *expression, + size_t pattern_len); + +/** + * The compiled pattern type for searching several long literals. + * Generated by @ref hs_compile_multi_literal_search() + * Note that given the unbounded nature of the pattern size, it is impossible to + * give a constant size to the structure. You'll need to use @ref + * hs_get_multi_literal_search_database_size() and allocate enough memory to + * store the compiled pattern. 
+ */ +typedef struct hs_multi_literal_compiled_pattern hs_multi_literal_compiled_pattern_t; + +/** + * This function calculates the size needed to store the compiled version of the + * given @p expression . + * + * @param expression + * The array of expressions to parse. Note that the strings must represent + * ONLY the patterns to be matched, with no delimiters. Null characters are + * accepted as part of the expression. + * + * @param pattern_count + * The number of expressions in the @p expression array. + * + * @param pattern_len + * The array of lengths of each expression in the @p expression array. + * Expressed in bytes. + * + * @return + * On success, the size in bytes that needs to be allocated to store the + * given pattern. Otherwise returns 0. + */ +size_t HS_CDECL hs_get_multi_literal_search_database_size( + const char **expression, size_t pattern_count, size_t *pattern_len); + +/** + * The compiled pattern type for searching a single character. + * Generated by @ref hs_compile_single_char_search() + */ +typedef struct { + unsigned char data[1]; +} hs_single_char_compiled_pattern_t; + +/** + * The compiled pattern type for searching a character set. + * Generated by @ref hs_compile_multi_char_search() + */ +typedef struct { + unsigned char data[32]; +} hs_multi_char_compiled_pattern_t; + +/** + * The compiled pattern type for searching a character pair. + * Generated by @ref hs_compile_single_char_pair_search() + */ +typedef struct { + unsigned char data[32]; +} hs_single_char_pair_compiled_pattern_t; + +/** + * The compiled pattern type for searching a set of character pairs. + * Generated by @ref hs_compile_multi_char_pair_search() + * Note that given the unbounded maximum number of pairs, it is impossible to + * give a constant size to the structure. You'll need to use @ref + * hs_get_multi_char_pair_search_database_size() and allocate enough memory to + * store the compiled pattern. 
+ */ +typedef struct hs_multi_char_pair_compiled_pattern hs_multi_char_pair_compiled_pattern_t; + +/** + * This function calculates the size needed to store the compiled version of the + * given @p expression . + * + * @param expression + * The concatenation of all pairs to be parsed. If one wants to search for + * "ab" or "Cd", then @p expression would be ['a','b','C','d']. Null terminator + * is optional. + * + * @param pair_count + * The number of character pairs in @p expression + * + * @return + * On success, the size in bytes that needs to be allocated to store the + * given pattern. Otherwise returns 0. + */ +size_t HS_CDECL hs_get_multi_char_pair_search_database_size( + const char *expression, size_t pair_count); + /** @} */ #ifdef __cplusplus diff --git a/src/hs_compile.h b/src/hs_compile.h index 5aa241886..242ae7174 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2021, Intel Corporation + * Copyright (c) 2024-2025, Arm ltd * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -1209,6 +1210,194 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform); */ #define HS_MODE_SOM_HORIZON_SMALL (1U << 26) +/** + * The following functions are part of the extended API and are not cross + * compatible with hyperscan. This extension intends on providing the developer + * with minimal overhead search functions. + * + * All search functions handle a limited kind of patterns. For more generic + * patterns, use @ref hs_compile() + * All search functions are considered case-sensitive. + */ + +/** + * This function compiles a short literal expression to be then searched for in + * @ref hs_short_literal_search(). The expression must be at most @ref + * HS_SHORT_PATTERN_THRESHOLD characters long. For longer expressions, use @ref + * hs_compile_long_literal_search() and @ref hs_long_literal_search() instead. 
+ * Potentially faster searches exist for character pairs and sets. + * + * @param expression + * The expression to parse. Note that this string must represent ONLY the + * pattern to be matched, with no delimiters. Null characters are accepted as + * part of the expression. + * + * @param expression_length + * The length of the expression in bytes. + * + * @param output_database + * A pointer to the user-allocated @ref hs_short_literal_compiled_pattern_t + * structure. On success, it is filled with the data required to run @ref + * hs_short_literal_search() with the given expression. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref HS_INVALID + * if @p expression_length exceeds @ref HS_SHORT_PATTERN_THRESHOLD; another + * error code otherwise. + */ +hs_error_t HS_CDECL hs_compile_short_literal_search( + const char *expression, size_t expression_length, + hs_short_literal_compiled_pattern_t *output_database); + +/** + * This function compiles a literal expression to be then searched for in @ref + * hs_long_literal_search(). There is no size limit. For expressions up to @ref + * HS_SHORT_PATTERN_THRESHOLD characters long, @ref + * hs_compile_short_literal_search() and @ref hs_short_literal_search() might be + * faster. + * + * @param expression + * The expression to parse. Note that this string must represent ONLY the + * pattern to be matched, with no delimiters. Null characters are accepted as + * part of the expression. + * + * @param expression_length + * The length of the expression in bytes. + * + * @param output_database + * A pointer to the user-allocated @ref hs_long_literal_compiled_pattern_t + * structure. On success, it is filled with the data required to run @ref + * hs_long_literal_search() with the given expression. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR otherwise. 
+ */ +hs_error_t HS_CDECL hs_compile_long_literal_search( + const char *expression, size_t expression_length, + hs_long_literal_compiled_pattern_t *output_database); + +/** + * This function compiles several literal expressions to be then searched for in + * @ref hs_multi_literal_search(). There is no size limit. + * + * @param expression + * The array of expressions to parse. Note that the strings must represent + * ONLY the patterns to be matched, with no delimiters. Null characters are + * accepted as part of the expression. + * + * @param pattern_count + * The number of expressions in the @p expression array. + * + * @param expression_length + * The array of lengths of each expression in the @p expression array. + * Expressed in bytes. + * + * @param output_database + * A pointer to the user-allocated @ref hs_multi_literal_compiled_pattern_t + * structure. On success, it is filled with the data required to run @ref + * hs_multi_literal_search() with the given expression. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR otherwise. + */ +hs_error_t HS_CDECL hs_compile_multi_literal_search( + const char **expression, size_t pattern_count, size_t *expression_length, + hs_multi_literal_compiled_pattern_t *output_database); + +/** + * This function compiles a single character to be then searched for in @ref + * hs_single_char_search(). + * + * @param character + * The single character to be searched. It is case sensitive. + * + * @param output_database + * A pointer to the user-allocated @ref hs_single_char_compiled_pattern_t + * structure. On success, it is filled with the data required to run @ref + * hs_single_char_search() with the given expression. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR otherwise. 
+ */ +hs_error_t HS_CDECL hs_compile_single_char_search( + const char character, hs_single_char_compiled_pattern_t *output_database); + +// -- find char from a char set -- +/** + * This function compiles a set of characters to be then searched for in @ref + * hs_multi_char_search(). + * + * @param character_array + * The string or character array containing all the characters in the set. + * It is case sensitive. Null terminator is optional. + * + * @param character_count + * The number of characters in @p character_array + * + * @param output_database + * A pointer to the user-allocated @ref hs_multi_char_compiled_pattern_t + * structure. On success, it is filled with the data required to run @ref + * hs_multi_char_search() with the given expression. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR otherwise. + */ +hs_error_t HS_CDECL hs_compile_multi_char_search( + const char *character_array, size_t character_count, + hs_multi_char_compiled_pattern_t *output_database); + +/** + * This function compiles a pair of characters to be then searched for in @ref + * hs_single_char_pair_search(). Note that the characters are ordered in the + * pair. "Aj" won't match "jA". + * + * @param pair + * The string or character array containing the pair. Null terminator is + * optional. + * + * @param output_database + * A pointer to the user-allocated @ref + * hs_single_char_pair_compiled_pattern_t structure. On success, it is filled + * with the data required to run @ref hs_single_char_pair_search() with the + * given expression. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR otherwise. + */ +hs_error_t HS_CDECL hs_compile_single_char_pair_search( + const char *pair, hs_single_char_pair_compiled_pattern_t *output_database); + +/** + * This function compiles several pairs to be then searched for in @ref + * hs_multi_char_pair_search(). 
Note that the characters are ordered in each + * pair. "Aj" won't match "jA". + * + * @param expression + * The concatenation of all pairs to be parsed. If one wants to search for + * "ab" or "Cd", then @p expression would be ['a','b','C','d']. Null terminator + * is optional. + * + * @param pair_count + * The number of character pairs in @p expression + * + * @param output_database + * A pointer to the user-allocated @ref + * hs_multi_char_pair_compiled_pattern_t structure. On success, it is filled + * with the data required to run @ref hs_multi_char_pair_search() with the given + * expression. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR otherwise. + */ +hs_error_t HS_CDECL hs_compile_multi_char_pair_search( + const char *expression, size_t pair_count, + hs_multi_char_pair_compiled_pattern_t *output_database); + /** @} */ #ifdef __cplusplus diff --git a/src/hs_extended_api.cpp b/src/hs_extended_api.cpp new file mode 100644 index 000000000..4e56326ae --- /dev/null +++ b/src/hs_extended_api.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2024-2025, Arm ltd + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <cstdint>
+#include <cstring>
+#include <new>
+#include <string>
+
+#include "hs_common.h"
+#include "hs_compile.h"
+#include "hs_runtime.h"
+#include "scratch.h"
+
+#include "hwlm/hwlm.h"
+#include "hwlm/hwlm_literal.h"
+#include "hwlm/noodle_build.h"
+#include "hwlm/noodle_engine.h"
+#include "hwlm/noodle_internal.h"
+
+/*
+ * The user callback's return value is forwarded to hwlm unchanged (see
+ * noodle_to_hs_callback), so both sets of constants must agree.
+ */
+static_assert((uint64_t)CB_CONTINUE_MATCHING == HWLM_CONTINUE_MATCHING,
+              "CB_CONTINUE_MATCHING doesn't match HWLM_CONTINUE_MATCHING");
+static_assert((uint64_t)CB_TERMINATE_MATCHING == HWLM_TERMINATE_MATCHING,
+              "CB_TERMINATE_MATCHING doesn't match HWLM_TERMINATE_MATCHING");
+
+/** Translate an hwlm status code into the public hs_error_t space. */
+static inline hs_error_t hwlm_to_hs_error(const hwlm_error_t error) {
+    switch (error) {
+    case HWLM_SUCCESS:
+        return HS_SUCCESS;
+    case HWLM_TERMINATED:
+        return HS_SCAN_TERMINATED;
+    case HWLM_ERROR_UNKNOWN:
+        return HS_UNKNOWN_ERROR;
+    /*
+     * NOTE(review): HWLM_LITERAL_MAX_LEN reads like a length limit rather
+     * than a status code -- confirm that it belongs in this switch.
+     */
+    case HWLM_LITERAL_MAX_LEN:
+        return HS_COMPILER_ERROR;
+    default:
+        return HS_UNKNOWN_ERROR;
+    }
+}
+
+// Some algorithms don't use the scratch at all so we can save on memory
+typedef struct scratchless_call_ctx {
+    void *ctx_ptr;
+    match_event_handler usr_cb;
+} hs_scratchless_call_ctx_t;
+
+// --- short_literal ---
+static_assert(sizeof(hs_short_literal_compiled_pattern_t) >= sizeof(noodTable),
+              "Short_literal_compiled_pattern_t is too small to fit the "
+              "underlying type's data");
+
+/**
+ * Adapter between the hwlm callback prototype and the user's
+ * match_event_handler.
+ *
+ * @p scratch is not a real scratch: hs_short_literal_search() passes a pointer
+ * to its hs_scratchless_call_ctx_t through that parameter. The user callback
+ * receives @p id, a start offset of 0, @p end and flags of 0; its return value
+ * is handed back to the engine as-is.
+ */
+static hwlmcb_rv_t HS_CDECL noodle_to_hs_callback(size_t end, u32 id,
+                                                  struct hs_scratch *scratch) {
+    const hs_scratchless_call_ctx_t *light_scratch =
+        reinterpret_cast<const hs_scratchless_call_ctx_t *>(scratch);
+    return (hwlmcb_rv_t)(light_scratch->usr_cb(id, 0, end, 0,
+                                               light_scratch->ctx_ptr));
+}
+
+/**
+ * Compile a short literal into @p output_database (see hs_compile.h).
+ *
+ * @return HS_SUCCESS on success; HS_INVALID when a pointer is NULL or when
+ *         @p expression_length is 0 or above HS_SHORT_PATTERN_THRESHOLD;
+ *         HS_NOMEM when an allocation fails; HS_COMPILER_ERROR when the
+ *         table cannot be built.
+ */
+HS_PUBLIC_API
+hs_error_t HS_CDECL hs_compile_short_literal_search(
+    const char *expression, size_t expression_length,
+    hs_short_literal_compiled_pattern_t *output_database) {
+    if (!expression || !output_database || expression_length == 0 ||
+        expression_length > HS_SHORT_PATTERN_THRESHOLD) {
+        return HS_INVALID;
+    }
+    /*
+     * Exposing caseness at the api level may restrict our ability to change
+     * the backing algorithm, so we decided to make all algo case sensitive
+     */
+    const bool is_case_insensitive = false;
+    const bool only_need_first_match = false;
+
+    /* C++ exceptions must not escape through the C entry point */
+    try {
+        ue2::hwlmLiteral lit(std::string(expression, expression_length),
+                             is_case_insensitive, only_need_first_match, 0,
+                             HWLM_ALL_GROUPS, {}, {});
+
+        auto table = ue2::noodBuildTable(lit);
+        if (!table) {
+            return HS_COMPILER_ERROR;
+        }
+
+        /*
+         * The public type is a plain byte array with no alignment guarantee,
+         * so the table is copied bytewise instead of being assigned through
+         * a casted pointer.
+         */
+        memset(output_database->data, 0, sizeof(output_database->data));
+        memcpy(output_database->data, table.get(), sizeof(noodTable));
+    } catch (const std::bad_alloc &) {
+        return HS_NOMEM;
+    } catch (...) {
+        return HS_COMPILER_ERROR;
+    }
+
+    return HS_SUCCESS;
+}
+
+/**
+ * Scan @p data for the literal compiled into @p database (see hs_runtime.h).
+ *
+ * @p scratch is accepted for API symmetry but is not used. A NULL @p onEvent
+ * or an empty buffer cannot report anything, so both return HS_SUCCESS
+ * without scanning.
+ *
+ * @return HS_SUCCESS, HS_SCAN_TERMINATED when the callback stopped the scan,
+ *         HS_INVALID when @p database is NULL or @p data is NULL with a
+ *         non-zero @p length, or another error code on engine failure.
+ */
+HS_PUBLIC_API
+hs_error_t HS_CDECL hs_short_literal_search(
+    const hs_short_literal_compiled_pattern_t *database,
+    const char *data, size_t length,
+    struct hs_scratch *scratch, match_event_handler onEvent,
+    void *context) {
+    (void)scratch;
+
+    if (!database || (!data && length != 0)) {
+        return HS_INVALID;
+    }
+    if (!onEvent || length == 0) {
+        return HS_SUCCESS;
+    }
+
+    /* bytewise copy into a properly aligned table; see the compile side */
+    noodTable table;
+    memcpy(&table, database->data, sizeof(table));
+
+    hs_scratchless_call_ctx_t noodle_context;
+    noodle_context.ctx_ptr = context;
+    noodle_context.usr_cb = onEvent;
+    /*
+     * although noodle require a scratch, it never actually use it and just
+     * pass it down to the callback, so we can pass our context instead
+     */
+    hwlm_error_t error =
+        noodExec(&table, reinterpret_cast<const u8 *>(data), length, 0,
+                 noodle_to_hs_callback,
+                 reinterpret_cast<struct hs_scratch *>(&noodle_context));
+    // TODO The above is a hack. We need a clean solution like changing noodle's
+    // prototype, or finding where in hs_scratch we can add user data. (I
+    // didn't find it, but I'm sure there's some place for it)
+    return hwlm_to_hs_error(error);
+}
+
+/**
+ * Some useful algorithms to implement the API:
+ * noodle: find a pair of chars and then do a vect compare to find up to 16
+ *     char long patterns
+ * fdr: find a matching string of any size. The first stage checks up to 8 char
+ *     long patterns, then runs a slower search on the result. fdr is optimized
+ *     to match against many strings at once
+ * shufti: the simple version seems similar to truffle. find char of a charset.
+ *     There's limitation on the charset (up to 8 different (char%16))
+ *     The double version is looking for a set of two characters following. If
+ *     we have "ab" and "cd", does it match for "ad"? No. It rejects ad. So
+ *     this could be used to make a generalized noodle that can match more
+ *     than one pattern
+ * truffle: find a char among charset
+ */
diff --git a/src/hs_runtime.h b/src/hs_runtime.h
index 6d34b6c48..30ea171b7 100644
--- a/src/hs_runtime.h
+++ b/src/hs_runtime.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2018, Intel Corporation
+ * Copyright (c) 2024-2025, Arm ltd
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -614,6 +615,243 @@ hs_error_t HS_CDECL hs_free_scratch(hs_scratch_t *scratch);
  */
 #define HS_OFFSET_PAST_HORIZON (~0ULL)
 
+/**
+ * The following functions are part of the extended API and are not cross
+ * compatible with hyperscan. This extension intends on providing the developer
+ * with minimal overhead search functions.
+ *
+ * All search functions handle a limited kind of patterns. For more generic
+ * patterns, use @ref hs_scan()
+ * All search functions are considered case-sensitive.
+ */
+
+/** Callback return value indicating that we should continue matching.
*/ +#define CB_CONTINUE_MATCHING (int)(~0U) + +/** + * Callback return value indicating that we should halt matching. + * NOTE(review): a zero return stopping the scan looks like the opposite of the + * @ref match_event_handler convention used by @ref hs_scan() -- confirm that + * this inversion is intended. + */ +#define CB_TERMINATE_MATCHING (int)0 + +/** + * Search the given data for the short literal pattern up to + * @ref HS_SHORT_PATTERN_THRESHOLD chars long. For longer patterns, use @ref + * hs_long_literal_search(). Other options exist for character pairs or sets. + * + * @param database + * The compiled pattern returned by @ref hs_compile_short_literal_search() + * @param data + * Pointer to the data to be scanned. + * @param length + * The number of bytes to scan. + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch() for this + * database. If a NULL pointer is given a scratch will be created and freed + * as needed + * @param onEvent + * Pointer to a @ref match_event_handler callback function. If a NULL + * pointer is given, no matches will be returned. + * @param context + * The user defined pointer which will be passed to the callback function. + * + * @return + * Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the + * match callback indicated that scanning should stop; other values on + * error. + */ +hs_error_t HS_CDECL hs_short_literal_search( + const hs_short_literal_compiled_pattern_t *database, const char *data, + size_t length, struct hs_scratch *scratch, match_event_handler onEvent, + void *context); + +/** + * Search the given data for the long literal pattern. If the pattern length is + * less than or equal to @ref HS_SHORT_PATTERN_THRESHOLD, @ref + * hs_short_literal_search() may be faster. + * + * @param database + * The compiled pattern returned by @ref hs_compile_long_literal_search() + * @param data + * Pointer to the data to be scanned. + * @param length + * The number of bytes to scan. + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch() for this + * database. 
If a NULL pointer is given a scratch will be created and freed + * as needed + * @param onEvent + * Pointer to a @ref match_event_handler callback function. If a NULL + * pointer is given, no matches will be returned. + * @param context + * The user defined pointer which will be passed to the callback function. + * + * @return + * Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the + * match callback indicated that scanning should stop; other values on + * error. + */ +hs_error_t HS_CDECL hs_long_literal_search( + const hs_long_literal_compiled_pattern_t *database, const char *data, + size_t length, struct hs_scratch *scratch, match_event_handler onEvent, + void *context); + +/** + * Search the given data for several long literal patterns at once. + * + * @param database + * The compiled pattern returned by @ref hs_compile_multi_literal_search() + * @param data + * Pointer to the data to be scanned. + * @param length + * The number of bytes to scan. + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch() for this + * database. If a NULL pointer is given a scratch will be created and freed + * as needed + * @param onEvent + * Pointer to a @ref match_event_handler callback function. If a NULL + * pointer is given, no matches will be returned. + * @param context + * The user defined pointer which will be passed to the callback function. + * + * @return + * Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the + * match callback indicated that scanning should stop; other values on + * error. + */ +hs_error_t HS_CDECL hs_multi_literal_search( + const hs_multi_literal_compiled_pattern_t *database, const char *data, + size_t length, struct hs_scratch *scratch, match_event_handler onEvent, + void *context); + +/** + * Search the given data for any occurrence of the given character. 
+ * + * @param database + * The compiled pattern returned by @ref hs_compile_single_char_search() + * @param data + * Pointer to the data to be scanned. + * @param length + * The number of bytes to scan. + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch() for this + * database. If a NULL pointer is given a scratch will be created and freed + * as needed + * @param onEvent + * Pointer to a @ref match_event_handler callback function. If a NULL + * pointer is given, no matches will be returned. + * @param context + * The user defined pointer which will be passed to the callback function. + * + * @return + * Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the + * match callback indicated that scanning should stop; other values on + * error. + */ +hs_error_t HS_CDECL hs_single_char_search( + const hs_single_char_compiled_pattern_t *database, const char *data, + size_t length, struct hs_scratch *scratch, match_event_handler onEvent, + void *context); + +/** + * Search the given data for occurrences of any character from the given + * character set. + * + * @param database + * The compiled pattern returned by @ref hs_compile_multi_char_search() + * @param data + * Pointer to the data to be scanned. + * @param length + * The number of bytes to scan. + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch() for this + * database. If a NULL pointer is given a scratch will be created and freed + * as needed + * @param onEvent + * Pointer to a @ref match_event_handler callback function. If a NULL + * pointer is given, no matches will be returned. + * @param context + * The user defined pointer which will be passed to the callback function. 
+ * + * @return + * Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the + * match callback indicated that scanning should stop; other values on + * error. + */ +hs_error_t HS_CDECL hs_multi_char_search( + const hs_multi_char_compiled_pattern_t *database, const char *data, + size_t length, struct hs_scratch *scratch, match_event_handler onEvent, + void *context); + +/** + * Search the given data for occurrences of the given ordered character pair + * ("Aj" won't match "jA"). + * + * @param database + * The compiled pattern returned by @ref hs_compile_char_pair_search() + * @param data + * Pointer to the data to be scanned. + * @param length + * The number of bytes to scan. + * @param onEvent + * Pointer to a @ref match_event_handler callback function. If a NULL + * pointer is given, no matches will be returned. + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch() for this + * database. If a NULL pointer is given, a scratch will be created and freed + * as needed. + * @param context + * The user defined pointer which will be passed to the callback function. + * + * @return + * Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the + * match callback indicated that scanning should stop; other values on + * error. + */ +hs_error_t HS_CDECL hs_single_char_pair_search( + const hs_single_char_pair_compiled_pattern_t *database, const char *data, + size_t length, struct hs_scratch *scratch, match_event_handler onEvent, + void *context); + +/** + * Search the given data for occurrences of any of the ordered character pairs + * from the given set ("Aj" won't match "jA"). + * + * @param database + * The compiled pattern returned by @ref + * hs_compile_multi_char_pair_search() + * @param data + * Pointer to the data to be scanned. + * @param length + * The number of bytes to scan.
+ * @param onEvent + * Pointer to a @ref match_event_handler callback function. If a NULL + * pointer is given, no matches will be returned. + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch() for this + * database. If a NULL pointer is given, a scratch will be created and freed + * as needed. + * @param context + * The user defined pointer which will be passed to the callback function. + * + * @return + * Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the + * match callback indicated that scanning should stop; other values on + * error. + */ +hs_error_t HS_CDECL hs_multi_char_pair_search( + const hs_multi_char_pair_compiled_pattern_t *database, const char *data, + size_t length, struct hs_scratch *scratch, match_event_handler onEvent, + void *context); + #ifdef __cplusplus } /* extern "C" */ #endif