From 3157af01291311f9399b75f4ff78d9d835e108f8 Mon Sep 17 00:00:00 2001 From: "Joseph T. Lyons" Date: Wed, 16 Oct 2024 23:01:31 -0400 Subject: [PATCH] Implement custom parser (#4) * WIP * WIP * Use startest * WIP * WIP * WIP * WIP * WIP * Document custom error types * Use `result.replace()` instead of `result.map()` * Update examples * Refactor * Refactor * Use `result.lazy_or`, as it is a bit cleaner if error is not needed --- src/coerce.gleam | 198 +++++++++++++++++++++++++ src/lenient_parse.gleam | 80 +++------- test/coerce_test.gleam | 102 +++++++++++++ test/is_valid_number_string_test.gleam | 29 ---- test/to_float_parse_test.gleam | 20 ++- test/to_int_parse_test.gleam | 16 +- 6 files changed, 344 insertions(+), 101 deletions(-) create mode 100644 src/coerce.gleam create mode 100644 test/coerce_test.gleam delete mode 100644 test/is_valid_number_string_test.gleam diff --git a/src/coerce.gleam b/src/coerce.gleam new file mode 100644 index 0000000..371ba6e --- /dev/null +++ b/src/coerce.gleam @@ -0,0 +1,198 @@ +import gleam/bool +import gleam/list +import gleam/option.{type Option, None, Some} +import gleam/result +import gleam/set.{type Set} +import gleam/string + +pub type ParseError { + /// Represents an error when an invalid character is encountered during + /// parsing. The `String` parameter contains the invalid character. + InvalidCharacter(String) + + /// Represents an error when the input string is empty or contains only + /// whitespace. + WhitespaceOnlyOrEmptyString + + /// Represents an error when an underscore is in an invalid position within + /// the number string. + InvalidUnderscorePosition + + /// Represents an error when a decimal point is in an invalid position within + /// the number string. + InvalidDecimalPosition + + /// Represents an error when Gleam's `float.parse` fails after custom parsing + /// and coercion. Indicates the string couldn't be converted to a float even + /// with more permissive rules. 
+ GleamFloatParseError + + /// Represents an error when Gleam's `int.parse` fails after custom parsing + /// and coercion. Indicates the string couldn't be converted to an integer + /// even with more permissive rules. + GleamIntParseError +} + +@internal +pub fn coerce_into_valid_number_string( + text: String, +) -> Result(String, ParseError) { + let text = text |> string.trim + use <- bool.guard(text |> string.is_empty, Error(WhitespaceOnlyOrEmptyString)) + use _ <- result.try(text |> has_valid_characters()) + use text <- result.try(text |> coerce_into_valid_underscore_string) + text |> coerce_into_valid_decimal_string +} + +fn digit_set() -> Set(String) { + ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"] |> set.from_list +} + +fn sign_set() -> Set(String) { + ["+", "-"] |> set.from_list +} + +fn separator_set() -> Set(String) { + [".", "_"] |> set.from_list +} + +fn valid_character_set() -> Set(String) { + let digits = digit_set() + let signs = sign_set() + let separators = separator_set() + + digits |> set.union(signs) |> set.union(separators) +} + +@internal +pub fn coerce_into_valid_underscore_string( + text: String, +) -> Result(String, ParseError) { + text + |> string.to_graphemes + |> do_coerce_into_valid_underscore_string( + previous: None, + digits: digit_set(), + acc: "", + ) +} + +fn do_coerce_into_valid_underscore_string( + characters: List(String), + previous previous: Option(String), + digits digits: Set(String), + acc acc: String, +) -> Result(String, ParseError) { + case characters { + [] -> { + use <- bool.guard(previous == Some("_"), Error(InvalidUnderscorePosition)) + Ok(acc |> string.reverse) + } + [first, ..rest] -> { + case first, previous { + "_", None -> Error(InvalidUnderscorePosition) + a, Some("_") -> + case digits |> set.contains(a) { + True -> + do_coerce_into_valid_underscore_string( + rest, + previous: Some(first), + digits: digits, + acc: first <> acc, + ) + False -> Error(InvalidUnderscorePosition) + } + "_", Some(a) -> + case
digits |> set.contains(a) { + True -> + do_coerce_into_valid_underscore_string( + rest, + previous: Some(first), + digits: digits, + acc: acc, + ) + False -> Error(InvalidUnderscorePosition) + } + _, _ -> + do_coerce_into_valid_underscore_string( + rest, + previous: Some(first), + digits: digits, + acc: first <> acc, + ) + } + } + } +} + +@internal +pub fn has_valid_characters(text: String) -> Result(Nil, ParseError) { + let graphemes = text |> string.to_graphemes + list.try_map(graphemes, fn(grapheme) { + case valid_character_set() |> set.contains(grapheme) { + True -> Ok(Nil) + False -> Error(InvalidCharacter(grapheme)) + } + }) + |> result.replace(Nil) +} + +@internal +pub fn coerce_into_valid_decimal_string( + text: String, +) -> Result(String, ParseError) { + let text_length = text |> string.length + + text + |> string.to_graphemes + |> do_coerce_into_valid_decimal_string( + text_length: text_length, + previous: None, + seen_decimal: False, + acc: "", + ) +} + +fn do_coerce_into_valid_decimal_string( + characters: List(String), + text_length text_length: Int, + previous previous: Option(String), + seen_decimal seen_decimal: Bool, + acc acc: String, +) -> Result(String, ParseError) { + case characters { + [] -> { + case previous { + Some(".") -> Ok("0" <> acc) + _ -> Ok(acc) + } + |> result.map(string.reverse) + } + [first, ..rest] -> { + case first, previous { + ".", None -> + case text_length == 1 { + True -> Error(InvalidDecimalPosition) + False -> + rest + |> do_coerce_into_valid_decimal_string( + text_length: text_length, + previous: Some(first), + seen_decimal: True, + acc: acc <> ".0", + ) + } + ".", Some(_) if seen_decimal -> Error(InvalidDecimalPosition) + a, _ -> { + rest + |> do_coerce_into_valid_decimal_string( + text_length: text_length, + previous: Some(first), + seen_decimal: a == "." 
|| seen_decimal, + acc: first <> acc, + ) + } + } + } + } +} diff --git a/src/lenient_parse.gleam b/src/lenient_parse.gleam index a9cd281..77f7273 100644 --- a/src/lenient_parse.gleam +++ b/src/lenient_parse.gleam @@ -1,9 +1,10 @@ -import gleam/bool +import coerce.{ + type ParseError, GleamFloatParseError, GleamIntParseError, + coerce_into_valid_number_string, +} import gleam/float import gleam/int -import gleam/regex import gleam/result -import gleam/string /// Converts a string to a float using a more lenient parsing method than gleam's `float.parse()`. It behaves similarly to Python's `float()` built-in function. /// @@ -19,27 +20,21 @@ import gleam/string /// lenient_parse.to_float("+123.321") // -> Ok(123.321) /// lenient_parse.to_float("-123.321") // -> Ok(-123.321) /// lenient_parse.to_float(" 1.0 ") // -> Ok(1.0) -/// lenient_parse.to_float("1_000.0") // -> Ok(1000.0) -/// lenient_parse.to_float(" ") // -> Error(Nil) -/// lenient_parse.to_float("") // -> Error(Nil) -/// lenient_parse.to_float("abc") // -> Error(Nil) +/// lenient_parse.to_float("1_000.0") // -> Ok(1.0e3) +/// lenient_parse.to_float(" ") // -> Error(WhitespaceOnlyOrEmptyString) +/// lenient_parse.to_float("") // -> Error(WhitespaceOnlyOrEmptyString) +/// lenient_parse.to_float("abc") // -> Error(InvalidCharacter("a")) /// ``` -pub fn to_float(text: String) -> Result(Float, Nil) { - use text <- result.try(text |> common_sanitize) - use _ <- result.try_recover(text |> float.parse) - use _ <- result.try_recover(text |> int.parse |> result.map(int.to_float)) - - let res = case string.first(text) { - Ok(".") -> float.parse("0" <> text) - _ -> Error(Nil) - } - +pub fn to_float(text: String) -> Result(Float, ParseError) { + let text = text |> coerce_into_valid_number_string + use text <- result.try(text) + let res = text |> float.parse |> result.replace_error(GleamFloatParseError) use <- result.lazy_or(res) - case string.last(text) { - Ok(".") -> float.parse(text <> "0") - _ -> Error(Nil) - } + text 
+ |> int.parse + |> result.replace_error(GleamIntParseError) + |> result.map(int.to_float) } /// Converts a string to an integer using a more lenient parsing method than gleam's `int.parse()`. @@ -54,42 +49,11 @@ pub fn to_float(text: String) -> Result(Float, Nil) { /// lenient_parse.to_int("0123") // -> Ok(123) /// lenient_parse.to_int(" 123 ") // -> Ok(123) /// lenient_parse.to_int("1_000") // -> Ok(1000) -/// lenient_parse.to_int("") // -> Error(Nil) -/// lenient_parse.to_int("1.0") // -> Error(Nil) -/// lenient_parse.to_int("abc") // -> Error(Nil) +/// lenient_parse.to_int("") // -> Error(WhitespaceOnlyOrEmptyString) +/// lenient_parse.to_int("1.0") // -> Error(GleamIntParseError) +/// lenient_parse.to_int("abc") // -> Error(InvalidCharacter("a")) /// ``` -pub fn to_int(text: String) -> Result(Int, Nil) { - text |> common_sanitize |> result.try(int.parse) -} - -fn common_sanitize(text: String) -> Result(String, Nil) { - use <- bool.guard(!is_valid_number_string(text), Error(Nil)) - let text = text |> string.trim |> string.replace("_", "") - use <- bool.guard(text |> string.is_empty, Error(Nil)) - text |> Ok -} - -@internal -pub fn is_valid_number_string(text: String) -> Bool { - // ^ - Start of string - // \s* - Optional whitespace at the beginning - // [+-]? - Optional plus or minus sign - // (?!.*__) - Negative lookahead to prevent double underscores - // (?!_) - Negative lookahead to prevent leading underscore - // (?!^\s*[+-]?_\s*$) - Negative lookahead to prevent just an underscore - // [0-9_]* - Zero or more digits or underscores - // (? 
regex.from_string - |> result.map(regex.check(with: _, content: text)) - |> result.unwrap(False) +pub fn to_int(text: String) -> Result(Int, ParseError) { + use text <- result.try(text |> coerce_into_valid_number_string) + text |> int.parse |> result.replace_error(GleamIntParseError) } diff --git a/test/coerce_test.gleam b/test/coerce_test.gleam new file mode 100644 index 0000000..4980ea8 --- /dev/null +++ b/test/coerce_test.gleam @@ -0,0 +1,102 @@ +import coerce.{ + InvalidCharacter, InvalidDecimalPosition, InvalidUnderscorePosition, + WhitespaceOnlyOrEmptyString, coerce_into_valid_number_string, + coerce_into_valid_underscore_string, +} +import gleam/list +import startest/expect + +pub fn coerce_into_valid_number_string_test() { + "" + |> coerce_into_valid_number_string + |> expect.to_equal(Error(WhitespaceOnlyOrEmptyString)) + + " " + |> coerce_into_valid_number_string + |> expect.to_equal(Error(WhitespaceOnlyOrEmptyString)) + + "\t\n\r" + |> coerce_into_valid_number_string + |> expect.to_equal(Error(WhitespaceOnlyOrEmptyString)) + + "a" + |> coerce_into_valid_number_string + |> expect.to_equal(Error(InvalidCharacter("a"))) + + "1a1" + |> coerce_into_valid_number_string + |> expect.to_equal(Error(InvalidCharacter("a"))) +} + +pub fn coerce_into_valid_underscore_string_test() { + "0" + |> coerce_into_valid_underscore_string + |> expect.to_equal(Ok("0")) + + "0.0" + |> coerce_into_valid_underscore_string + |> expect.to_equal(Ok("0.0")) + + "+1000" + |> coerce_into_valid_underscore_string + |> expect.to_equal(Ok("+1000")) + + "-1000" + |> coerce_into_valid_underscore_string + |> expect.to_equal(Ok("-1000")) + + " 1000 " + |> coerce_into_valid_underscore_string + |> expect.to_equal(Ok(" 1000 ")) + + " -1000 " + |> coerce_into_valid_underscore_string + |> expect.to_equal(Ok(" -1000 ")) + + "1_000" + |> coerce_into_valid_underscore_string + |> expect.to_equal(Ok("1000")) + + "1_000_000" + |> coerce_into_valid_underscore_string + |> expect.to_equal(Ok("1000000")) + + 
"1_000_000.0" + |> coerce_into_valid_underscore_string + |> expect.to_equal(Ok("1000000.0")) + + "1_000_000.000_1" + |> coerce_into_valid_underscore_string + |> expect.to_equal(Ok("1000000.0001")) + + "1000.000_000" + |> coerce_into_valid_underscore_string + |> expect.to_equal(Ok("1000.000000")) + + [ + "_", "_1000", "1000_", "+_1000", "-_1000", "1__000", "1_.000", "1._000", + "_1000.0", "1000.0_", "1000._0", "1000_.0", "1000_.", + ] + |> list.each(fn(text) { + text + |> coerce_into_valid_underscore_string + |> expect.to_equal(Error(InvalidUnderscorePosition)) + }) +} + +pub fn check_for_valid_decimal_positions_test() { + ".1" + |> coerce_into_valid_number_string() + |> expect.to_equal(Ok("0.1")) + + "1." + |> coerce_into_valid_number_string() + |> expect.to_equal(Ok("1.0")) + + [".", "..", "0.0.", ".0.0"] + |> list.each(fn(text) { + text + |> coerce_into_valid_number_string() + |> expect.to_equal(Error(InvalidDecimalPosition)) + }) +} diff --git a/test/is_valid_number_string_test.gleam b/test/is_valid_number_string_test.gleam deleted file mode 100644 index ed2f1aa..0000000 --- a/test/is_valid_number_string_test.gleam +++ /dev/null @@ -1,29 +0,0 @@ -import lenient_parse -import startest/expect - -pub fn is_valid_number_string_true_test() { - lenient_parse.is_valid_number_string("0") |> expect.to_be_true - lenient_parse.is_valid_number_string("0.0") |> expect.to_be_true - lenient_parse.is_valid_number_string("0.0.") |> expect.to_be_false - lenient_parse.is_valid_number_string("1_000") |> expect.to_be_true - lenient_parse.is_valid_number_string("1000.000_000") - |> expect.to_be_true - lenient_parse.is_valid_number_string("+1000") |> expect.to_be_true - lenient_parse.is_valid_number_string("-1000") |> expect.to_be_true - - lenient_parse.is_valid_number_string(" 1000 ") |> expect.to_be_true - lenient_parse.is_valid_number_string(" -1000 ") |> expect.to_be_true -} - -pub fn is_valid_number_string_false_test() { - lenient_parse.is_valid_number_string("a") |> 
expect.to_be_false - lenient_parse.is_valid_number_string("1__000") |> expect.to_be_false - lenient_parse.is_valid_number_string("1000_") |> expect.to_be_false - lenient_parse.is_valid_number_string("_1000") |> expect.to_be_false - lenient_parse.is_valid_number_string("1000_.") |> expect.to_be_false - lenient_parse.is_valid_number_string("1000_.0") |> expect.to_be_false - lenient_parse.is_valid_number_string("1000._0") |> expect.to_be_false - lenient_parse.is_valid_number_string("1000.0_") |> expect.to_be_false - lenient_parse.is_valid_number_string("_1000.0") |> expect.to_be_false - lenient_parse.is_valid_number_string("_") |> expect.to_be_false -} diff --git a/test/to_float_parse_test.gleam b/test/to_float_parse_test.gleam index 7ab9951..6c1c533 100644 --- a/test/to_float_parse_test.gleam +++ b/test/to_float_parse_test.gleam @@ -1,3 +1,7 @@ +import coerce.{ + InvalidCharacter, InvalidDecimalPosition, InvalidUnderscorePosition, + WhitespaceOnlyOrEmptyString, +} import lenient_parse import startest/expect @@ -64,7 +68,7 @@ pub fn to_float_underscores_test() { pub fn to_float_invalid_underscores_test() { "1_000__000.0" |> lenient_parse.to_float - |> expect.to_equal(Error(Nil)) + |> expect.to_equal(Error(InvalidUnderscorePosition)) } pub fn to_float_with_surrounding_whitespace_integer_test() { @@ -92,41 +96,41 @@ pub fn to_float_with_surrounding_whitespace_float_test() { pub fn to_float_with_double_leading_dot_test() { "..1" |> lenient_parse.to_float - |> expect.to_equal(Error(Nil)) + |> expect.to_equal(Error(InvalidDecimalPosition)) } pub fn to_float_with_double_trailing_dot_test() { "1.." |> lenient_parse.to_float - |> expect.to_equal(Error(Nil)) + |> expect.to_equal(Error(InvalidDecimalPosition)) } pub fn to_float_with_sandwich_dot_test() { ".1." |> lenient_parse.to_float - |> expect.to_equal(Error(Nil)) + |> expect.to_equal(Error(InvalidDecimalPosition)) } pub fn to_float_with_single_dot_test() { "." 
|> lenient_parse.to_float - |> expect.to_equal(Error(Nil)) + |> expect.to_equal(Error(InvalidDecimalPosition)) } pub fn to_float_with_only_whitespace_test() { " " |> lenient_parse.to_float - |> expect.to_equal(Error(Nil)) + |> expect.to_equal(Error(WhitespaceOnlyOrEmptyString)) } pub fn to_float_with_empty_string_test() { "" |> lenient_parse.to_float - |> expect.to_equal(Error(Nil)) + |> expect.to_equal(Error(WhitespaceOnlyOrEmptyString)) } pub fn to_float_with_non_numeric_string_test() { "abc" |> lenient_parse.to_float - |> expect.to_equal(Error(Nil)) + |> expect.to_equal(Error(InvalidCharacter("a"))) } diff --git a/test/to_int_parse_test.gleam b/test/to_int_parse_test.gleam index 779a9c7..007278c 100644 --- a/test/to_int_parse_test.gleam +++ b/test/to_int_parse_test.gleam @@ -1,3 +1,7 @@ +import coerce.{ + GleamIntParseError, InvalidCharacter, InvalidUnderscorePosition, + WhitespaceOnlyOrEmptyString, +} import lenient_parse import startest/expect @@ -34,7 +38,7 @@ pub fn to_int_underscores_test() { pub fn to_int_invalid_underscores_test() { "1_000__000" |> lenient_parse.to_int - |> expect.to_equal(Error(Nil)) + |> expect.to_equal(Error(InvalidUnderscorePosition)) } pub fn to_int_with_surrounding_whitespace_test() { @@ -46,29 +50,29 @@ pub fn to_int_with_surrounding_whitespace_test() { pub fn to_int_with_decimal_point_test() { "1." 
|> lenient_parse.to_int - |> expect.to_equal(Error(Nil)) + |> expect.to_equal(Error(GleamIntParseError)) } pub fn to_int_with_decimal_number_test() { "1.0" |> lenient_parse.to_int - |> expect.to_equal(Error(Nil)) + |> expect.to_equal(Error(GleamIntParseError)) } pub fn to_int_with_only_whitespace_test() { " " |> lenient_parse.to_int - |> expect.to_equal(Error(Nil)) + |> expect.to_equal(Error(WhitespaceOnlyOrEmptyString)) } pub fn to_int_with_empty_string_test() { "" |> lenient_parse.to_int - |> expect.to_equal(Error(Nil)) + |> expect.to_equal(Error(WhitespaceOnlyOrEmptyString)) } pub fn to_int_with_non_numeric_string_test() { "abc" |> lenient_parse.to_int - |> expect.to_equal(Error(Nil)) + |> expect.to_equal(Error(InvalidCharacter("a"))) }