diff --git a/Cargo.toml b/Cargo.toml index a2bb1f45..711c982c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,14 +24,14 @@ dtoa = "1.0.9" base64 = "0.22.1" serde_json = "1.0.117" rand = "0.8.5" -regex = "1.11.1" +regress = "0.10.1" num-format = "0.4.4" uuid = { version = "1.8.0", features = ["fast-rng", "v4", "v7"] } [dev-dependencies] test-case = "3.3.1" test-generator = "0.3.1" -regex = "1.5.4" +regress = "0.10.1" [build-dependencies] glob = "0.3" diff --git a/src/errors.rs b/src/errors.rs index e83f23f8..22d497d9 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -221,7 +221,7 @@ impl fmt::Display for Error { S0302UnterminatedRegex(ref p) => write!(f, "{}: No terminating / in regular expression", p), S0303InvalidRegex(ref p, ref message) => - // The error message from `regex::Regex` a "regex parse error: " prefix, so don't be redundant here. + // The error message from `regress::Regex` a "regex parse error: " prefix, so don't be redundant here. write!(f, "{}: {}", p, message), // Runtime errors diff --git a/src/evaluator.rs b/src/evaluator.rs index a28e8a0a..b15c93a4 100644 --- a/src/evaluator.rs +++ b/src/evaluator.rs @@ -132,6 +132,10 @@ impl<'a> Evaluator<'a> { ref update, ref delete, } => Value::transformer(self.arena, pattern, update, delete), + AstKind::Regex(ref regex_literal) => { + // Wrap the regex literal in a `Value::Regex` and return it + self.arena.alloc(Value::Regex(regex_literal.clone())) + } _ => unimplemented!("TODO: node kind not yet supported: {:#?}", node.kind), }; diff --git a/src/evaluator/functions.rs b/src/evaluator/functions.rs index c2ab1f52..6401d1af 100644 --- a/src/evaluator/functions.rs +++ b/src/evaluator/functions.rs @@ -1,16 +1,18 @@ use base64::Engine; use chrono::{TimeZone, Utc}; +use hashbrown::{DefaultHashBuilder, HashMap}; use rand::Rng; -use regex::Regex; use std::borrow::{Borrow, Cow}; use std::collections::HashSet; use std::time::{SystemTime, UNIX_EPOCH}; use uuid::Uuid; use crate::datetime::{format_custom_date, parse_custom_format, parse_timezone_offset}; +use crate::evaluator::RegexLiteral; use crate::parser::expressions::check_balanced_brackets; use bumpalo::collections::CollectIn; +use bumpalo::collections::String as BumpString; use bumpalo::collections::Vec as BumpVec; use bumpalo::Bump; @@ -185,6 +187,7 @@ pub fn fn_boolean<'a>( Value::bool(false) } }, + Value::Regex(_) => Value::bool(true), Value::Lambda { .. } | Value::NativeFn { .. } | Value::Transformer { .. } => { Value::bool(false) } @@ -1741,7 +1744,7 @@ pub fn fn_pad<'a>( Ok(Value::string(context.arena, &result)) } -pub fn fn_match_regex<'a>( +pub fn fn_match<'a>( context: FunctionContext<'a, '_>, args: &[&'a Value<'a>], ) -> Result<&'a Value<'a>> { @@ -1752,20 +1755,72 @@ pub fn fn_match_regex<'a>( assert_arg!(value_to_validate.is_string(), context, 1); let pattern_value = match args.get(1).copied() { - Some(val) if val.is_string() => val, + Some(val) => val, _ => return Err(Error::D3010EmptyPattern(context.char_index)), }; - let regex_pattern = Regex::new(&pattern_value.as_str()) - .map_err(|_| Error::D3010EmptyPattern(context.char_index))?; + let regex_literal = match pattern_value { + Value::Regex(ref regex_literal) => regex_literal, + Value::String(ref s) => { + let regex = RegexLiteral::new(s.as_str(), false, false) + .map_err(|_| Error::D3010EmptyPattern(context.char_index))?; + &*context.arena.alloc(regex) + } + _ => return Err(Error::D3010EmptyPattern(context.char_index)), + }; - if regex_pattern.is_match(&value_to_validate.as_str()) { - Ok(value_to_validate) // Return input if it matches - } else { - Err(Error::D3137Error(format!( - "Invalid format: '{}' does not match the expected pattern '{}'", - value_to_validate.as_str(), - pattern_value.as_str() - ))) + let limit = args + .get(2) + .and_then(|val| { + if val.is_number() { + Some(val.as_f64() as usize) + } else { + None + } + }) + .unwrap_or(usize::MAX); + + let key_match = BumpString::from_str_in("match", context.arena); + let key_index = BumpString::from_str_in("index", context.arena); + let key_groups = BumpString::from_str_in("groups", context.arena); + + let mut matches: bumpalo::collections::Vec<&Value<'a>> = + bumpalo::collections::Vec::new_in(context.arena); + + for (i, m) in regex_literal + .get_regex() + .find_iter(&value_to_validate.as_str()) + .enumerate() + { + if i >= limit { + break; + } + + let matched_text = &value_to_validate.as_str()[m.start()..m.end()]; + let match_str = context + .arena + .alloc(Value::string(context.arena, matched_text)); + + let index_val = context + .arena + .alloc(Value::number(context.arena, m.start() as f64)); + + let group_vec: bumpalo::collections::Vec<&Value<'a>> = + bumpalo::collections::Vec::new_in(context.arena); + let groups_val = context + .arena + .alloc(Value::Array(group_vec, ArrayFlags::empty())); + + let mut match_obj: HashMap, DefaultHashBuilder, &Bump> = + HashMap::with_capacity_and_hasher_in(3, DefaultHashBuilder::default(), context.arena); + match_obj.insert(key_match.clone(), match_str); + match_obj.insert(key_index.clone(), index_val); + match_obj.insert(key_groups.clone(), groups_val); + + matches.push(context.arena.alloc(Value::Object(match_obj))); } + + Ok(context + .arena + .alloc(Value::Array(matches, ArrayFlags::empty()))) } diff --git a/src/evaluator/value.rs b/src/evaluator/value.rs index f57f4175..924850e2 100644 --- a/src/evaluator/value.rs +++ b/src/evaluator/value.rs @@ -10,7 +10,7 @@ use hashbrown::HashMap; use super::frame::Frame; use super::functions::FunctionContext; -use crate::parser::ast::{Ast, AstKind}; +use crate::parser::ast::{Ast, AstKind, RegexLiteral}; use crate::{Error, Result}; pub mod impls; @@ -51,6 +51,7 @@ pub enum Value<'a> { Number(f64), Bool(bool), String(BumpString<'a>), + Regex(RegexLiteral), Array(BumpVec<'a, &'a Value<'a>>, ArrayFlags), Object(HashMap, &'a Value<'a>, DefaultHashBuilder, &'a Bump>), Range(Range<'a>), @@ -309,6 +310,7 @@ impl<'a> Value<'a> { } }, Value::Object(ref o) => !o.is_empty(), + Value::Regex(_) => true, // Treat Regex as truthy if it exists Value::Lambda { .. } | Value::NativeFn { .. } | Value::Transformer { .. } => false, Value::Range(ref r) => !r.is_empty(), } @@ -516,6 +518,7 @@ impl<'a> Value<'a> { delete, } => Value::transformer(arena, pattern, update, delete), Self::Range(range) => Value::range_from(arena, range), + Self::Regex(regex) => arena.alloc(Value::Regex(regex.clone())), } } diff --git a/src/evaluator/value/impls.rs b/src/evaluator/value/impls.rs index 81c03b91..7d5d1a71 100644 --- a/src/evaluator/value/impls.rs +++ b/src/evaluator/value/impls.rs @@ -18,6 +18,7 @@ impl<'a> PartialEq> for Value<'a> { (Value::Array(l, ..), Value::Array(r, ..)) => *l == *r, (Value::Object(l), Value::Object(r)) => *l == *r, (Value::Range(l), Value::Range(r)) => *l == *r, + (Value::Regex(l), Value::Regex(r)) => l == r, _ => false, } } @@ -91,6 +92,7 @@ impl std::fmt::Debug for Value<'_> { Self::String(s) => s.fmt(f), Self::Array(a, _) => a.fmt(f), Self::Object(o) => o.fmt(f), + Self::Regex(r) => write!(f, "", r), Self::Lambda { .. } => write!(f, ""), Self::NativeFn { .. } => write!(f, ""), Self::Transformer { .. } => write!(f, ""), @@ -101,7 +103,10 @@ impl std::fmt::Debug for Value<'_> { impl std::fmt::Display for Value<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:#?}", self) + match self { + Self::Regex(r) => write!(f, "", r), + _ => write!(f, "{:#?}", self), + } } } @@ -123,6 +128,7 @@ impl Hash for Value<'_> { map.get(key).hash(state); } } + Value::Regex(r) => r.hash(state), Value::Range(r) => r.hash(state), Value::Lambda { .. } => generate_random_hash(state), Value::NativeFn { name, .. } => name.hash(state), diff --git a/src/evaluator/value/serialize.rs b/src/evaluator/value/serialize.rs index 1b3170b3..bffc3240 100644 --- a/src/evaluator/value/serialize.rs +++ b/src/evaluator/value/serialize.rs @@ -288,6 +288,10 @@ impl Serializer { Value::Bool(false) => self.write(b"false"), Value::Array(..) | Value::Range(..) => self.write_array(value)?, Value::Object(..) => self.write_object(value)?, + Value::Regex(ref regex) => { + let pattern = format!("\"{}\"", regex.as_pattern()); + self.write(pattern.as_bytes()); + } Value::Lambda { .. } | Value::NativeFn { .. } | Value::Transformer { .. } => { self.write(b"\"\"") } diff --git a/src/lib.rs b/src/lib.rs index a969917f..d0b9ba9b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -146,7 +146,7 @@ impl<'a> JsonAta<'a> { bind_native!("lookup", 2, fn_lookup); bind_native!("lowercase", 1, fn_lowercase); bind_native!("map", 2, fn_map); - bind_native!("matchRegex", 2, fn_match_regex); + bind_native!("match", 2, fn_match); bind_native!("max", 1, fn_max); bind_native!("merge", 1, fn_merge); bind_native!("min", 1, fn_min); @@ -185,7 +185,9 @@ impl<'a> JsonAta<'a> { #[cfg(test)] mod tests { use chrono::{DateTime, Offset}; - use regex::Regex; + use regress::Regex; + + use bumpalo::collections::String as BumpString; use super::*; @@ -346,9 +348,12 @@ mod tests { let expected_format = Regex::new(r"^\d{2}/\d{2}/\d{4} \d{1,2}:\d{2}(AM|PM|am|pm) GMT-05:00$").unwrap(); + // Check if the pattern exists within the result_str + let is_match = expected_format.find_iter(&result_str).next().is_some(); assert!( - expected_format.is_match(&result_str), - "Expected custom formatted time with timezone" + is_match, + "Expected custom formatted time with timezone, got: {}", + result_str ); } @@ -359,14 +364,15 @@ mod tests { let result = jsonata.evaluate(None, None).unwrap(); let result_str = result.as_str(); - println!("test_now_with_valid_format_but_no_timezone {}", result_str); + let expected_format = + Regex::new(r"^\d{2}/\d{2}/\d{4} \d{1,2}:\d{2}(AM|PM|am|pm)$").unwrap(); // Allow both AM/PM and am/pm in the regex + let is_match = expected_format.find_iter(&result_str).next().is_some(); assert!( - Regex::new(r"^\d{2}/\d{2}/\d{4} \d{1,2}:\d{2}(AM|PM|am|pm)$") - .unwrap() - .is_match(&result_str), - "Expected custom formatted time without timezone" + is_match, + "Expected custom formatted time without timezone, got: {}", + result_str ); } @@ -471,12 +477,16 @@ mod tests { println!("Formatted date: {}", result_str); - // Allow both AM/PM and am/pm in the regex + // Create the regex with regress::Regex let expected_format = Regex::new(r"^\d{2}/\d{2}/\d{4} \d{1,2}:\d{2}(am|pm|AM|PM) GMT-05:00$").unwrap(); + + // Check if the pattern exists within result_str using find_iter + let is_match = expected_format.find_iter(&result_str).next().is_some(); assert!( - expected_format.is_match(&result_str), - "Expected 12-hour format with timezone" + is_match, + "Expected 12-hour format with timezone, got: {}", + result_str ); } @@ -551,12 +561,16 @@ mod tests { println!("Formatted date: {}", result_str); - // Check if the formatted date matches the expected custom format + // Define the expected format using regress::Regex let expected_format = Regex::new(r"^\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2} GMT\+\d{2}:\d{2}$").unwrap(); + + // Simulate `is_match` by checking if there's at least one match in the string + let is_match = expected_format.find_iter(&result_str).next().is_some(); assert!( - expected_format.is_match(&result_str), - "Expected custom formatted date with timezone" + is_match, + "Expected custom formatted date with timezone, got: {}", + result_str ); } @@ -702,47 +716,46 @@ mod tests { } #[test] - fn evaluate_with_reduce_single_data_element() { + fn test_match_regex_with_jsonata() { let arena = Bump::new(); - // Passing an array with a single element ["data"] - let jsonata = - JsonAta::new("$reduce([\"data\"], function($i, $j){$i + $j})", &arena).unwrap(); - + // Test case with a valid postal code + let jsonata = JsonAta::new(r#"$match("123456789", /^[0-9]{9}$/)"#, &arena).unwrap(); let result = jsonata.evaluate(None, None).unwrap(); - // Since the array contains only one element "data", it should return "data" - assert_eq!(result.as_str(), "data"); // Expecting the string "data" as the result - } + // Expected output: an array with a single match object for "123456789" + let match_value: &Value = arena.alloc(Value::string(&arena, "123456789")); + let index_value: &Value = arena.alloc(Value::number(&arena, 0.0)); + let groups_array: &Value = &*arena.alloc(Value::Array( + bumpalo::collections::Vec::new_in(&arena), + ArrayFlags::empty(), + )); - #[test] - fn test_match_regex_with_jsonata() { - let arena = Bump::new(); + let mut match_obj = hashbrown::HashMap::with_capacity_in(3, &arena); + match_obj.insert(BumpString::from_str_in("match", &arena), match_value); + match_obj.insert(BumpString::from_str_in("index", &arena), index_value); + match_obj.insert(BumpString::from_str_in("groups", &arena), groups_array); - // Test case with a valid postal code - let jsonata = JsonAta::new(r#"$matchRegex("123456789", "^[0-9]{9}$")"#, &arena).unwrap(); - let result = jsonata.evaluate(None, None).unwrap(); + let expected_match: &Value = &*arena.alloc(Value::Object(match_obj)); - // Assert that the result is the postal code itself, indicating a valid match - assert_eq!(result.as_str(), "123456789"); + assert_eq!( + result, + &*arena.alloc(Value::Array( + bumpalo::collections::Vec::from_iter_in([expected_match], &arena), + ArrayFlags::empty() + )) + ); // Test case with an invalid postal code let jsonata_invalid = - JsonAta::new(r#"$matchRegex("12345-6789", "^[0-9]{9}$")"#, &arena).unwrap(); - - let result_invalid = jsonata_invalid.evaluate(None, None); - - // Check if an error occurred and ensure it contains the expected message - assert!(result_invalid.is_err()); - if let Err(error) = result_invalid { - // The core error message to match against - let expected_message = - "Invalid format: '12345-6789' does not match the expected pattern '^[0-9]{9}$'"; - assert!( - error.to_string().contains(expected_message), - "Unexpected error message: {}", - error - ); - } + JsonAta::new(r#"$match("12345-6789", /^[0-9]{9}$/)"#, &arena).unwrap(); + let result_invalid = jsonata_invalid.evaluate(None, None).unwrap(); + + // Expected output for invalid input: an empty array + let empty_array: &Value = &*arena.alloc(Value::Array( + bumpalo::collections::Vec::new_in(&arena), // Empty array for no matches + ArrayFlags::empty(), + )); + assert_eq!(result_invalid, empty_array); } } diff --git a/src/parser/expressions.rs b/src/parser/expressions.rs index 6a3b4d03..5346b7bc 100644 --- a/src/parser/expressions.rs +++ b/src/parser/expressions.rs @@ -1,3 +1,5 @@ +use regress::Regex; +use std::hash::{Hash, Hasher}; use std::ops::Deref; pub fn check_balanced_brackets(expr: &str) -> Result<(), String> { @@ -40,26 +42,68 @@ pub fn check_balanced_brackets(expr: &str) -> Result<(), String> { Ok(()) } -/// A wrapper type for a regex literal so that we can implement PartialEq #[derive(Debug, Clone)] -pub struct RegexLiteral(regex::Regex); +pub struct RegexLiteral { + regex: Regex, + pattern: String, // Store the original pattern string for comparisons +} impl RegexLiteral { - pub(super) fn new(regex: regex::Regex) -> Self { - Self(regex) + /// Create a new `RegexLiteral` with optional case-insensitive and multiline flags. + pub fn new( + pattern: &str, + case_insensitive: bool, + multi_line: bool, + ) -> Result { + // Add flags to the pattern string as needed + let mut flags = String::new(); + if case_insensitive { + flags.push('i'); + } + if multi_line { + flags.push('m'); + } + let regex = Regex::with_flags(pattern, flags.as_str())?; + Ok(Self { + regex, + pattern: pattern.to_string(), + }) + } + + /// Check if the regex pattern matches a given text. + pub fn is_match(&self, text: &str) -> bool { + self.regex.find(text).is_some() + } + + /// Retrieve the original pattern string for display purposes. + pub fn as_pattern(&self) -> &str { + &self.pattern + } + + /// Get a reference to the inner `regress::Regex`. + pub fn get_regex(&self) -> &Regex { + &self.regex } } impl Deref for RegexLiteral { - type Target = regex::Regex; + type Target = Regex; fn deref(&self) -> &Self::Target { - &self.0 + &self.regex } } impl PartialEq for RegexLiteral { fn eq(&self, other: &Self) -> bool { - self.0.as_str() == other.0.as_str() + self.pattern == other.pattern + } +} + +impl Eq for RegexLiteral {} + +impl Hash for RegexLiteral { + fn hash(&self, state: &mut H) { + self.pattern.hash(state); } } diff --git a/src/parser/tokenizer.rs b/src/parser/tokenizer.rs index 1981384c..2b734f0b 100644 --- a/src/parser/tokenizer.rs +++ b/src/parser/tokenizer.rs @@ -119,7 +119,7 @@ impl std::fmt::Display for TokenKind { Null => write!(f, "null"), Bool(v) => write!(f, "{}", v), Str(v) => write!(f, "\"{}\"", v), - Regex(v) => write!(f, "/{}/", v.as_str()), + Regex(v) => write!(f, "/{}/", v.as_pattern()), Number(v) => write!(f, "{}", v), Name(v) => write!(f, "{}", v), Var(v) => write!(f, "${}", v), @@ -352,6 +352,7 @@ impl<'a> Tokenizer<'a> { let mut buffer = String::new(); let mut is_escape = false; + // Parse the regex pattern between slashes loop { match self.peek() { '\\' => { @@ -364,13 +365,11 @@ impl<'a> Tokenizer<'a> { break; } c => { - // Check for unterminated regex literals if self.eof() { return Err(Error::S0302UnterminatedRegex( self.start_char_index, )); } - self.bump(); buffer.push(c); is_escape = false; @@ -378,15 +377,15 @@ impl<'a> Tokenizer<'a> { } } + // Check for an empty regex pattern if buffer.is_empty() { return Err(Error::S0301EmptyRegex(self.start_char_index)); } - // Check for the regex flags + // Parse regex flags let mut multi_line = false; let mut case_insensitive = false; loop { - // JSONata only supports these two flags. match self.peek() { 'i' if !case_insensitive => { case_insensitive = true; @@ -396,28 +395,23 @@ impl<'a> Tokenizer<'a> { multi_line = true; self.bump(); } - - // Any other alphanumeric character is an error (including repetitions of a supported flag) c if c.is_alphanumeric() => { return Err(Error::S0303InvalidRegex( self.start_char_index, "Invalid regex flags".to_string(), - )) + )); } - _ => break, } } - let r = regex::RegexBuilder::new(&buffer) - .case_insensitive(case_insensitive) - .multi_line(multi_line) - .build() - .map_err(|e| { - Error::S0303InvalidRegex(self.start_char_index, e.to_string()) - })?; + // Build the regex with the specified flags + let regex_literal = + RegexLiteral::new(&buffer, case_insensitive, multi_line).map_err( + |e| Error::S0303InvalidRegex(self.start_char_index, e.to_string()), + )?; - Regex(RegexLiteral::new(r)) + Regex(regex_literal) } _ => ForwardSlash, }, @@ -864,12 +858,7 @@ mod tests { #[test] fn regex() { for expr in [ - // Note: some of these might be non-sensical expressions, but the forward slash is in a value - // position, not an operator position, so it should be interpreted as beginning a regex. - // Eg, `$m < / 2` is interpreted as an unterminated regex by the javascript library as well. "/[0-9]+/", - "/[0-9]+/i", - "/[0-9]+/mi", r#"$matches("100", /[0-9]+/)"#, "path.to.object[stringProperty ~> /[0-9]+/", "$matcher := /[0-9]+/", @@ -883,10 +872,11 @@ mod tests { "false and /[0-9]+/", ] { let tokens = collect_tokens(Tokenizer::new(expr)).unwrap(); + assert!( tokens .iter() - .any(|t| matches!(&t.kind, TokenKind::Regex(s) if s.as_str() == "[0-9]+")), + .any(|t| matches!(&t.kind, TokenKind::Regex(s) if s.as_pattern() == "[0-9]+")), "Should contain the expected regex token: {}", expr ); @@ -903,27 +893,30 @@ mod tests { #[test] fn regex_with_flags() { + // Case-insensitive flag test with basic pattern let kind = Tokenizer::new("/^[a-z]+$/i").next_token().unwrap().kind; if let TokenKind::Regex(r) = kind { - // There's not a function on the `regex::Regex` to check its flags directly. + // There's not a function on the `regress::Regex` to check its flags directly. assert!(r.is_match("ABC")); assert!(!r.is_match("\nABC\n")); } else { panic!("Expected regex token") }; + // Multiline flag test with simple pattern let kind = Tokenizer::new("/^[a-z]+$/m").next_token().unwrap().kind; if let TokenKind::Regex(r) = kind { - // There's not a function on the `regex::Regex` to check its flags directly. + // There's not a function on the `regress::Regex` to check its flags directly. assert!(!r.is_match("ABC")); assert!(r.is_match("\nabc\n")); } else { panic!("Expected regex token") }; + // Case-insensitive and multiline flags together let kind = Tokenizer::new("/^[a-z]+$/im").next_token().unwrap().kind; if let TokenKind::Regex(r) = kind { - // There's not a function on the `regex::Regex` to check its flags directly. + // There's not a function on the `regress::Regex` to check its flags directly. assert!(r.is_match("ABC")); assert!(r.is_match("\nABC\n")); } else { diff --git a/tests/testsuite.rs b/tests/testsuite.rs index 233cdc52..293fb5b1 100644 --- a/tests/testsuite.rs +++ b/tests/testsuite.rs @@ -3,7 +3,6 @@ extern crate test_generator; use bumpalo::Bump; use jsonata_rs::{ArrayFlags, JsonAta, Value}; -use regex::Regex; use std::fs; use std::path; @@ -108,11 +107,12 @@ fn test_case(resource: &str) { } else if case["result_re"].is_string() { // Ability to define a regular expression to match the result. This strategy is useful // to validate the result of an expression that's not deterministic (like the $millis() function). - let regex_pattern = Regex::new(case["result_re"].as_str().as_ref()) - .expect("Should have a valid regex expression"); + let regex_pattern = + regress::Regex::with_flags(case["result_re"].as_str().as_ref(), "") + .expect("Should have a valid regex expression"); assert!( - regex_pattern.is_match(&result.as_str()), + regex_pattern.find(&result.as_str()).is_some(), "Value: {result:?}, did not match expected result_re", ); } else {