-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: new operator for matching strings against regular expressions
- Loading branch information
Showing
12 changed files
with
288 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
17 changes: 17 additions & 0 deletions
17
endToEndTests/test/invalidQueries/stringSearch_nonExistingColumn.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
{ | ||
"testCaseName": "StringSearch that wants to match a non-existing column", | ||
"query": { | ||
"action": { | ||
"type": "Aggregated" | ||
}, | ||
"filterExpression": { | ||
"type": "StringSearch", | ||
"column": "this_column_does_not_exist", | ||
"searchExpression": "test" | ||
} | ||
}, | ||
"expectedError": { | ||
"error": "Bad request", | ||
"message": "The database does not contain the string column 'this_column_does_not_exist'" | ||
} | ||
} |
17 changes: 17 additions & 0 deletions
17
endToEndTests/test/invalidQueries/stringSearch_nonStringColumn.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
{ | ||
"testCaseName": "StringSearch that wants to match a non-string column", | ||
"query": { | ||
"action": { | ||
"type": "Aggregated" | ||
}, | ||
"filterExpression": { | ||
"type": "StringSearch", | ||
"column": "age", | ||
"searchExpression": "test" | ||
} | ||
}, | ||
"expectedError": { | ||
"error": "Bad request", | ||
"message": "The database does not contain the string column 'age'" | ||
} | ||
} |
17 changes: 17 additions & 0 deletions
17
endToEndTests/test/invalidQueries/stringSearch_withInvalidRegex.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
{ | ||
"testCaseName": "StringSearch that contains an invalid regex", | ||
"query": { | ||
"action": { | ||
"type": "Aggregated" | ||
}, | ||
"filterExpression": { | ||
"type": "StringSearch", | ||
"column": "gisaid_epi_isl", | ||
"searchExpression": "\\" | ||
} | ||
}, | ||
"expectedError": { | ||
"error": "Bad request", | ||
"message": "Invalid Regular Expression. The parsing of the regular expression failed with the error 'trailing \\'. See https://github.com/google/re2/wiki/Syntax for a Syntax specification." | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
{ | ||
"testCaseName": "StringSearch with a basic regex", | ||
"query": { | ||
"action": { | ||
"type": "Aggregated" | ||
}, | ||
"filterExpression": { | ||
"type": "Not", | ||
"child": { | ||
"type": "StringSearch", | ||
"column": "gisaid_epi_isl", | ||
"searchExpression": ".*EPI.*" | ||
} | ||
} | ||
}, | ||
"expectedQueryResult": [ | ||
{ | ||
"count": 0 | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
{ | ||
"testCaseName": "StringSearch that matches the primary key to end with exactly six digits", | ||
"query": { | ||
"action": { | ||
"type": "Aggregated" | ||
}, | ||
"filterExpression": { | ||
"type": "StringSearch", | ||
"column": "gisaid_epi_isl", | ||
"searchExpression": "\\D*\\d{6}" | ||
} | ||
}, | ||
"expectedQueryResult": [ | ||
{ | ||
"count": 12 | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
{ | ||
"testCaseName": "StringSearch that matches exactly a string", | ||
"query": { | ||
"action": { | ||
"type": "Aggregated" | ||
}, | ||
"filterExpression": { | ||
"type": "StringSearch", | ||
"column": "division", | ||
"searchExpression": "Aargau" | ||
} | ||
}, | ||
"expectedQueryResult": [ | ||
{ | ||
"count": 6 | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
46 changes: 46 additions & 0 deletions
46
include/silo/query_engine/filter_expressions/string_search.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
#pragma once | ||
|
||
#include <memory> | ||
#include <string> | ||
|
||
#include <nlohmann/json_fwd.hpp> | ||
|
||
#include "silo/query_engine/filter_expressions/expression.h" | ||
|
||
namespace re2 { | ||
class RE2; | ||
} | ||
|
||
namespace silo { | ||
class Database; | ||
class DatabasePartition; | ||
namespace query_engine { | ||
namespace operators { | ||
class Operator; | ||
} // namespace operators | ||
} // namespace query_engine | ||
} // namespace silo | ||
|
||
namespace silo::query_engine::filter_expressions { | ||
|
||
class StringSearch : public Expression { | ||
private: | ||
std::string column; | ||
std::unique_ptr<re2::RE2> search_expression; | ||
|
||
public: | ||
explicit StringSearch(std::string column, std::unique_ptr<re2::RE2> search_expression); | ||
|
||
std::string toString() const override; | ||
|
||
[[nodiscard]] std::unique_ptr<silo::query_engine::operators::Operator> compile( | ||
const Database& database, | ||
const DatabasePartition& database_partition, | ||
AmbiguityMode mode | ||
) const override; | ||
}; | ||
|
||
// NOLINTNEXTLINE(readability-identifier-naming) | ||
void from_json(const nlohmann::json& json, std::unique_ptr<StringSearch>& filter); | ||
|
||
} // namespace silo::query_engine::filter_expressions |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
113 changes: 113 additions & 0 deletions
113
src/silo/query_engine/filter_expressions/string_search.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
#include "silo/query_engine/filter_expressions/string_search.h" | ||
|
||
#include <optional> | ||
#include <utility> | ||
|
||
#include <fmt/format.h> | ||
#include <re2/re2.h> | ||
#include <nlohmann/json.hpp> | ||
|
||
#include "silo/common/string.h" | ||
#include "silo/query_engine/filter_expressions/expression.h" | ||
#include "silo/query_engine/operators/bitmap_producer.h" | ||
#include "silo/query_engine/query_parse_exception.h" | ||
#include "silo/storage/database_partition.h" | ||
|
||
namespace silo { | ||
class Database; | ||
namespace query_engine::operators { | ||
class Operator; | ||
} // namespace query_engine::operators | ||
} // namespace silo | ||
|
||
namespace silo::query_engine::filter_expressions { | ||
|
||
StringSearch::StringSearch(std::string column, std::unique_ptr<re2::RE2> search_expression) | ||
: column(std::move(column)), | ||
search_expression(std::move(search_expression)) {} | ||
|
||
std::string StringSearch::toString() const { | ||
return fmt::format("column {} regex_matches \"{}\"", column, search_expression->pattern()); | ||
} | ||
|
||
namespace { | ||
template <typename GenericStringColumn> | ||
std::unique_ptr<silo::query_engine::operators::Operator> createMatchingBitmap( | ||
const GenericStringColumn& string_column, | ||
const RE2& search_expression, | ||
size_t row_count | ||
) { | ||
return std::make_unique<operators::BitmapProducer>( | ||
[&, row_count]() { | ||
roaring::Roaring result_bitmap; | ||
for (size_t row_idx = 0; row_idx < row_count; ++row_idx) { | ||
const auto& embedded_value = string_column.getValues().at(row_idx); | ||
const auto& string_value = string_column.lookupValue(embedded_value); | ||
if (re2::RE2::FullMatch(string_value, search_expression)) { | ||
result_bitmap.add(row_idx); | ||
} | ||
} | ||
return OperatorResult(std::move(result_bitmap)); | ||
}, | ||
row_count | ||
); | ||
} | ||
|
||
} // namespace | ||
|
||
std::unique_ptr<silo::query_engine::operators::Operator> StringSearch::compile( | ||
const silo::Database& /*database*/, | ||
const silo::DatabasePartition& database_partition, | ||
Expression::AmbiguityMode /*mode*/ | ||
) const { | ||
CHECK_SILO_QUERY( | ||
database_partition.columns.string_columns.contains(column) || | ||
database_partition.columns.indexed_string_columns.contains(column), | ||
fmt::format("The database does not contain the string column '{}'", column) | ||
) | ||
|
||
if (database_partition.columns.indexed_string_columns.contains(column)) { | ||
const auto& string_column = database_partition.columns.indexed_string_columns.at(column); | ||
return createMatchingBitmap( | ||
string_column, *search_expression, database_partition.sequence_count | ||
); | ||
} | ||
assert(database_partition.columns.string_columns.contains(column)); | ||
const auto& string_column = database_partition.columns.string_columns.at(column); | ||
return createMatchingBitmap( | ||
string_column, *search_expression, database_partition.sequence_count | ||
); | ||
} | ||
|
||
/// Builds a StringSearch filter from its JSON description.
///
/// The object must carry two string fields: "column" (the column to search)
/// and "searchExpression" (the pattern, compiled with RE2). Each missing or
/// non-string field, and a pattern that RE2 fails to parse, is rejected
/// through CHECK_SILO_QUERY with the corresponding message; for a bad
/// pattern the message embeds RE2's own error text.
///
/// @param json    JSON object describing the filter
/// @param filter  receives the newly constructed expression
// NOLINTNEXTLINE(readability-identifier-naming)
void from_json(const nlohmann::json& json, std::unique_ptr<StringSearch>& filter) {
   CHECK_SILO_QUERY(
      json.contains("column"), "The field 'column' is required in an StringSearch expression"
   )
   CHECK_SILO_QUERY(
      json["column"].is_string(),
      "The field 'column' in an StringSearch expression needs to be a string"
   )
   CHECK_SILO_QUERY(
      json.contains("searchExpression"),
      "The field 'searchExpression' is required in an StringSearch expression"
   )
   CHECK_SILO_QUERY(
      json["searchExpression"].is_string(),
      "The field 'searchExpression' in an StringSearch expression needs to be a string"
   )
   // Both fields are read with an explicit get<>(): nlohmann's implicit value
   // conversion is discouraged and unavailable when the library is built with
   // JSON_USE_IMPLICIT_CONVERSIONS=0.
   auto column = json["column"].get<std::string>();
   const auto search_expression_string = json["searchExpression"].get<std::string>();

   auto search_expression = std::make_unique<re2::RE2>(search_expression_string);
   // RE2 does not throw on a malformed pattern; ok()/error() report the outcome.
   CHECK_SILO_QUERY(
      search_expression->ok(),
      fmt::format(
         "Invalid Regular Expression. The parsing of the regular expression failed with the error "
         "'{}'. See https://github.com/google/re2/wiki/Syntax for a Syntax specification.",
         search_expression->error()
      )
   )
   filter = std::make_unique<StringSearch>(std::move(column), std::move(search_expression));
}
|
||
} // namespace silo::query_engine::filter_expressions |