Skip to content

Commit

Permalink
feat: new operator for matching strings against regular expressions
Browse files Browse the repository at this point in the history
  • Loading branch information
Taepper committed Aug 7, 2024
1 parent 0099471 commit ae5cac4
Show file tree
Hide file tree
Showing 12 changed files with 288 additions and 7 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ find_package(spdlog REQUIRED)
find_package(TBB REQUIRED)
find_package(yaml-cpp REQUIRED)
find_package(zstd REQUIRED)
find_package(re2 REQUIRED)

# ---------------------------------------------------------------------------
# Includes
Expand Down Expand Up @@ -97,6 +98,7 @@ target_link_libraries(
Poco::Net
Poco::Util
Poco::JSON
re2::re2
)

add_executable(siloApi "${CMAKE_SOURCE_DIR}/src/silo_api/api.cpp" $<TARGET_OBJECTS:silolib>)
Expand Down
8 changes: 8 additions & 0 deletions conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ class SiloRecipe(ConanFile):
"spdlog/1.14.1",
"yaml-cpp/0.7.0",
"zstd/1.5.5",
"re2/20240702",
"abseil/20240116.1",
]

default_options = {
Expand Down Expand Up @@ -90,6 +92,10 @@ class SiloRecipe(ConanFile):
"poco/*:enable_redis": False,
"poco/*:enable_xml": False,
"poco/*:enable_zip": False,

"re2/*:shared": False,

"absl/*:shared": False,
}

def generate(self):
Expand All @@ -107,4 +113,6 @@ def generate(self):
deps.set_property("spdlog", "cmake_find_mode", "both")
deps.set_property("yaml-cpp", "cmake_find_mode", "both")
deps.set_property("zstd", "cmake_find_mode", "both")
deps.set_property("re2", "cmake_find_mode", "both")
deps.set_property("abseil", "cmake_find_mode", "both")
deps.generate()
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"testCaseName": "StringSearch that wants to match a non-existing column",
"query": {
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "StringSearch",
"column": "this_column_does_not_exist",
"searchExpression": "test"
}
},
"expectedError": {
"error": "Bad request",
"message": "The database does not contain the string column 'this_column_does_not_exist'"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"testCaseName": "StringSearch that wants to match a non-string column",
"query": {
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "StringSearch",
"column": "age",
"searchExpression": "test"
}
},
"expectedError": {
"error": "Bad request",
"message": "The database does not contain the string column 'age'"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"testCaseName": "StringSearch that contains an invalid regex",
"query": {
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "StringSearch",
"column": "gisaid_epi_isl",
"searchExpression": "\\"
}
},
"expectedError": {
"error": "Bad request",
"message": "Invalid Regular Expression. The parsing of the regular expression failed with the error 'trailing \\'. See https://github.com/google/re2/wiki/Syntax for a Syntax specification."
}
}
21 changes: 21 additions & 0 deletions endToEndTests/test/queries/stringSearch_basic_regex.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"testCaseName": "StringSearch with a basic regex",
"query": {
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "Not",
"child": {
"type": "StringSearch",
"column": "gisaid_epi_isl",
"searchExpression": ".*EPI.*"
}
}
},
"expectedQueryResult": [
{
"count": 0
}
]
}
18 changes: 18 additions & 0 deletions endToEndTests/test/queries/stringSearch_digitAmount.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"testCaseName": "StringSearch that matches the primary key to end with exactly six digits",
"query": {
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "StringSearch",
"column": "gisaid_epi_isl",
"searchExpression": "\\D*\\d{6}"
}
},
"expectedQueryResult": [
{
"count": 12
}
]
}
18 changes: 18 additions & 0 deletions endToEndTests/test/queries/stringSearch_justAString.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"testCaseName": "StringSearch that matches exactly a string",
"query": {
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "StringSearch",
"column": "division",
"searchExpression": "Aargau"
}
},
"expectedQueryResult": [
{
"count": 6
}
]
}
12 changes: 6 additions & 6 deletions endToEndTests/test/query.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,12 @@ describe('The /query endpoint', () => {
);
invalidQueryTestCases.forEach(testCase =>
it('should return the expected error for the test case ' + testCase.testCaseName, async () => {
const response = await server
.post('/query')
.send(testCase.query)
.expect(400)
.expect('Content-Type', 'application/json');
return expect(response.body).to.deep.equal(testCase.expectedError);
const response = await server.post('/query').send(testCase.query);

const errorMessage = 'Actual result is:\n' + response.text + '\n';
expect(response.status, errorMessage).to.equal(400);
expect(response.header['content-type'], errorMessage).to.equal('application/json');
return expect(response.body, errorMessage).to.deep.equal(testCase.expectedError);
})
);

Expand Down
46 changes: 46 additions & 0 deletions include/silo/query_engine/filter_expressions/string_search.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#pragma once

#include <memory>
#include <string>

#include <nlohmann/json_fwd.hpp>

#include "silo/query_engine/filter_expressions/expression.h"

namespace re2 {
class RE2;
}

// Forward declarations of the project types that this header only mentions in
// function signatures.
namespace silo {
class Database;
class DatabasePartition;
}  // namespace silo

namespace silo::query_engine::operators {
class Operator;
}  // namespace silo::query_engine::operators

namespace silo::query_engine::filter_expressions {

/// Filter expression that pairs the name of a string column with a compiled
/// regular expression that is applied to the values of that column.
///
/// NOTE(review): `re2::RE2` is only forward-declared in this header while the
/// class owns it through a `std::unique_ptr`; translation units that create or
/// destroy a `StringSearch` presumably need the complete type from
/// `<re2/re2.h>` -- confirm before removing such includes.
class StringSearch : public Expression {
  public:
   /// Takes ownership of both the column name and the compiled expression.
   explicit StringSearch(std::string column, std::unique_ptr<re2::RE2> search_expression);

   /// Returns a textual description of this filter.
   std::string toString() const override;

   /// Builds the operator that evaluates this filter on `database_partition`.
   [[nodiscard]] std::unique_ptr<silo::query_engine::operators::Operator> compile(
      const Database& database,
      const DatabasePartition& database_partition,
      AmbiguityMode mode
   ) const override;

  private:
   /// Name of the column whose values are matched.
   std::string column;
   /// Compiled regular expression; owned by this expression.
   std::unique_ptr<re2::RE2> search_expression;
};

/// Deserializes a `StringSearch` filter from `json` and stores it in `filter`.
/// The snake_case name deviates from the project naming scheme because it is
/// the hook nlohmann::json looks up for `json.get<std::unique_ptr<StringSearch>>()`.
// NOLINTNEXTLINE(readability-identifier-naming)
void from_json(const nlohmann::json& json, std::unique_ptr<StringSearch>& filter);

} // namespace silo::query_engine::filter_expressions
6 changes: 5 additions & 1 deletion src/silo/query_engine/filter_expressions/expression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <string>

#include <re2/re2.h>
#include <nlohmann/json.hpp>

#include "silo/query_engine/filter_expressions/and.h"
Expand All @@ -21,6 +22,7 @@
#include "silo/query_engine/filter_expressions/or.h"
#include "silo/query_engine/filter_expressions/pango_lineage_filter.h"
#include "silo/query_engine/filter_expressions/string_equals.h"
#include "silo/query_engine/filter_expressions/string_search.h"
#include "silo/query_engine/filter_expressions/symbol_equals.h"
#include "silo/query_engine/filter_expressions/true.h"
#include "silo/query_engine/query_parse_exception.h"
Expand All @@ -44,7 +46,7 @@ Expression::AmbiguityMode invertMode(Expression::AmbiguityMode mode) {
return mode;
}

// NOLINTNEXTLINE(readability-identifier-naming)
// NOLINTNEXTLINE(readability-identifier-naming,readability-function-cognitive-complexity)
void from_json(const nlohmann::json& json, std::unique_ptr<Expression>& filter) {
CHECK_SILO_QUERY(json.contains("type"), "The field 'type' is required in any filter expression");
CHECK_SILO_QUERY(
Expand Down Expand Up @@ -79,6 +81,8 @@ void from_json(const nlohmann::json& json, std::unique_ptr<Expression>& filter)
filter = json.get<std::unique_ptr<PangoLineageFilter>>();
} else if (expression_type == "StringEquals") {
filter = json.get<std::unique_ptr<StringEquals>>();
} else if (expression_type == "StringSearch") {
filter = json.get<std::unique_ptr<StringSearch>>();
} else if (expression_type == "BooleanEquals") {
filter = json.get<std::unique_ptr<BoolEquals>>();
} else if (expression_type == "IntEquals") {
Expand Down
113 changes: 113 additions & 0 deletions src/silo/query_engine/filter_expressions/string_search.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#include "silo/query_engine/filter_expressions/string_search.h"

#include <optional>
#include <utility>

#include <fmt/format.h>
#include <re2/re2.h>
#include <nlohmann/json.hpp>

#include "silo/common/string.h"
#include "silo/query_engine/filter_expressions/expression.h"
#include "silo/query_engine/operators/bitmap_producer.h"
#include "silo/query_engine/query_parse_exception.h"
#include "silo/storage/database_partition.h"

namespace silo {
class Database;
namespace query_engine::operators {
class Operator;
} // namespace query_engine::operators
} // namespace silo

namespace silo::query_engine::filter_expressions {

StringSearch::StringSearch(std::string column, std::unique_ptr<re2::RE2> search_expression)
: column(std::move(column)),
search_expression(std::move(search_expression)) {}

/// Describes the filter as text made of the column name and the regex pattern.
std::string StringSearch::toString() const {
   const auto& pattern = search_expression->pattern();
   return fmt::format("column {} regex_matches \"{}\"", column, pattern);
}

namespace {

/// Creates a `BitmapProducer` whose callable collects every row index in
/// `[0, row_count)` whose string value is matched in full by `search_expression`.
///
/// @param string_column     column to scan; must provide `getValues()` and `lookupValue()`
/// @param search_expression compiled regular expression applied with `RE2::FullMatch`
/// @param row_count         number of rows to scan, also forwarded to the producer
///
/// Both references are captured by reference in the returned operator, so the
/// referenced objects have to stay alive for as long as that operator is used.
template <typename GenericStringColumn>
std::unique_ptr<silo::query_engine::operators::Operator> createMatchingBitmap(
   const GenericStringColumn& string_column,
   const RE2& search_expression,
   size_t row_count
) {
   // The callable is handed to the producer and leaves this scope, therefore the
   // captures are listed explicitly instead of relying on the `[&]` default.
   return std::make_unique<operators::BitmapProducer>(
      [&string_column, &search_expression, row_count]() {
         roaring::Roaring result_bitmap;
         // The stored values do not change while scanning; fetch them once.
         const auto& embedded_values = string_column.getValues();
         for (size_t row_idx = 0; row_idx < row_count; ++row_idx) {
            const auto& string_value = string_column.lookupValue(embedded_values.at(row_idx));
            if (re2::RE2::FullMatch(string_value, search_expression)) {
               result_bitmap.add(row_idx);
            }
         }
         return OperatorResult(std::move(result_bitmap));
      },
      row_count
   );
}

}  // namespace

/// Compiles this filter for a single partition.
///
/// The query is rejected with the message below when `column` is neither a
/// plain nor an indexed string column of the partition. Otherwise the matching
/// column is handed to `createMatchingBitmap` together with the compiled
/// expression and the partition's sequence count.
std::unique_ptr<silo::query_engine::operators::Operator> StringSearch::compile(
   const silo::Database& /*database*/,
   const silo::DatabasePartition& database_partition,
   Expression::AmbiguityMode /*mode*/
) const {
   const auto& partition_columns = database_partition.columns;
   const bool is_indexed = partition_columns.indexed_string_columns.contains(column);
   CHECK_SILO_QUERY(
      is_indexed || partition_columns.string_columns.contains(column),
      fmt::format("The database does not contain the string column '{}'", column)
   )

   if (!is_indexed) {
      // Guaranteed by the check above: the column must be a plain string column.
      assert(partition_columns.string_columns.contains(column));
      return createMatchingBitmap(
         partition_columns.string_columns.at(column),
         *search_expression,
         database_partition.sequence_count
      );
   }
   return createMatchingBitmap(
      partition_columns.indexed_string_columns.at(column),
      *search_expression,
      database_partition.sequence_count
   );
}

/// Deserializes a `StringSearch` filter from `json` into `filter`.
///
/// Requires the string fields "column" and "searchExpression". The search
/// expression is compiled with RE2; when any field is missing or has the wrong
/// type, or when the pattern does not compile, the corresponding
/// `CHECK_SILO_QUERY` fails with the message given below.
// NOLINTNEXTLINE(readability-identifier-naming)
void from_json(const nlohmann::json& json, std::unique_ptr<StringSearch>& filter) {
   CHECK_SILO_QUERY(
      json.contains("column"), "The field 'column' is required in an StringSearch expression"
   )
   CHECK_SILO_QUERY(
      json["column"].is_string(),
      "The field 'column' in an StringSearch expression needs to be a string"
   )
   CHECK_SILO_QUERY(
      json.contains("searchExpression"),
      "The field 'searchExpression' is required in an StringSearch expression"
   )
   CHECK_SILO_QUERY(
      json["searchExpression"].is_string(),
      "The field 'searchExpression' in an StringSearch expression needs to be a string"
   )
   std::string column = json["column"].get<std::string>();
   const std::string search_expression_string = json["searchExpression"].get<std::string>();
   // The pattern is user input. RE2::Quiet only disables RE2's own logging of
   // parse errors; the failure is still reported to the client by the check below.
   auto search_expression =
      std::make_unique<re2::RE2>(search_expression_string, re2::RE2::Quiet);
   CHECK_SILO_QUERY(
      search_expression->ok(),
      fmt::format(
         "Invalid Regular Expression. The parsing of the regular expression failed with the error "
         "'{}'. See https://github.com/google/re2/wiki/Syntax for a Syntax specification.",
         search_expression->error()
      )
   )
   // The name is not needed afterwards, so it is moved instead of copied.
   filter = std::make_unique<StringSearch>(std::move(column), std::move(search_expression));
}

} // namespace silo::query_engine::filter_expressions

0 comments on commit ae5cac4

Please sign in to comment.