Do not rely on gleam parse functions (#37)

JosephTLyons · Nov 4, 2024 · 72eab98
1 parent 45aa8c2
commit 72eab98
Show file tree

Hide file tree

Showing 6 changed files with 118 additions and 89 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## v1.3.0 - xxx
 
 Breaking changes:
+    - `GleamIntParseError` and `GleamIntParseErrorReason` have been removed.
     - `InvalidCharacter` has been renamed to `UnknownCharacter`.
     - A new `InvalidDigitPosition` error has been introduced.
 

diff --git a/src/lenient_parse.gleam b/src/lenient_parse.gleam
@@ -1,8 +1,5 @@
-import gleam/float
-import gleam/int
-import gleam/result
 import lenient_parse/internal/parse
-import parse_error.{type ParseError, GleamFloatParseError, GleamIntParseError}
+import parse_error.{type ParseError}
 
 /// Converts a string to a float using a more lenient parsing method than gleam's `float.parse()`. It behaves similarly to Python's `float()` built-in function.
 ///
@@ -24,15 +21,7 @@ import parse_error.{type ParseError, GleamFloatParseError, GleamIntParseError}
 /// lenient_parse.to_float("abc")      // -> Error(InvalidCharacter("a", 0))
 /// ```
 pub fn to_float(text: String) -> Result(Float, ParseError) {
-  let text = text |> parse.parse_float
-  use text <- result.try(text)
-  let res = text |> float.parse |> result.replace_error(GleamFloatParseError)
-  use <- result.lazy_or(res)
-
-  text
-  |> int.parse
-  |> result.map(int.to_float)
-  |> result.replace_error(GleamIntParseError)
+  text |> parse.parse_float
 }
 
 /// Converts a string to an integer using a more lenient parsing method than gleam's `int.parse()`.
@@ -52,6 +41,5 @@ pub fn to_float(text: String) -> Result(Float, ParseError) {
 /// lenient_parse.to_int("abc")   // -> Error(InvalidCharacter("a", 0))
 /// ```
 pub fn to_int(text: String) -> Result(Int, ParseError) {
-  use text <- result.try(text |> parse.parse_int)
-  text |> int.parse |> result.replace_error(GleamIntParseError)
+  text |> parse.parse_int
 }
diff --git a/src/lenient_parse/internal/parse.gleam b/src/lenient_parse/internal/parse.gleam
@@ -1,16 +1,17 @@
 import gleam/bool
+import gleam/int
 import gleam/list
 import gleam/option.{type Option, None, Some}
 import gleam/result
 import lenient_parse/internal/tokenizer.{
   type Token, DecimalPoint, Digit, Sign, Underscore, Unknown, Whitespace,
 }
 import parse_error.{
-  type ParseError, EmptyString, InvalidDecimalPosition, UnknownCharacter,
-  WhitespaceOnlyString,
+  type ParseError, EmptyString, InvalidDecimalPosition,
+  InvalidUnderscorePosition, UnknownCharacter, WhitespaceOnlyString,
 }
 
-pub fn parse_float(input: String) -> Result(String, ParseError) {
+pub fn parse_float(input: String) -> Result(Float, ParseError) {
   let tokens = input |> tokenizer.tokenize
   let index = 0
   let empty_string = ""
@@ -19,28 +20,35 @@ pub fn parse_float(input: String) -> Result(String, ParseError) {
   use #(leading_whitespace, tokens, index) <- result.try(pre_whitespace_result)
 
   let sign_result = parse_sign(tokens, index)
-  use #(sign, tokens, index) <- result.try(sign_result)
+  use #(is_positive, tokens, index) <- result.try(sign_result)
 
-  let digit_pre_decimal = parse_digit(tokens, empty_string, index, index)
-  use #(digit_pre_decimal, tokens, index) <- result.try(digit_pre_decimal)
+  let whole_digit_result = parse_digit(tokens, 0, index, index, 0)
+  use #(whole_digit, _, tokens, index) <- result.try(whole_digit_result)
 
-  let decimal_point_result = parse_decimal_point(tokens, index)
-  use #(decimal_specified, tokens, index) <- result.try(decimal_point_result)
+  let decimal_result = parse_decimal_point(tokens, index)
+  use #(decimal_specified, tokens, index) <- result.try(decimal_result)
 
-  let digit_post_decimal = parse_digit(tokens, empty_string, index, index)
-  use #(digit_post_decimal, tokens, index) <- result.try(digit_post_decimal)
+  let fractional_digit_result = parse_digit(tokens, 0, index, index, 0)
+  use #(fractional_digit, fractional_digit_length, tokens, index) <- result.try(
+    fractional_digit_result,
+  )
 
   let post_whitespace_result = parse_whitespace(tokens, empty_string, index)
   use #(_, tokens, index) <- result.try(post_whitespace_result)
 
   case tokens |> list.first {
-    Ok(token) -> Error(tokenizer.error_for_token(token, index))
+    Ok(token) -> Error(tokenizer.to_error(token, index))
     _ -> {
-      case digit_pre_decimal, digit_post_decimal {
-        Some(pre), Some(post) -> Ok(sign <> pre <> "." <> post)
-        Some(pre), None -> Ok(sign <> pre <> ".0")
-        None, Some(post) -> Ok(sign <> "0." <> post)
+      case whole_digit, fractional_digit {
+        Some(whole), Some(fractional) ->
+          Ok(form_float(is_positive, whole, fractional, fractional_digit_length))
+        Some(whole), None ->
+          Ok(form_float(is_positive, whole, 0, fractional_digit_length))
+        None, Some(fractional) ->
+          Ok(form_float(is_positive, 0, fractional, fractional_digit_length))
         _, _ -> {
+          // TODO: This sucks - hardcoded to take care of one specific test case during the rewrite: "."
+          // There is likely a better way to handle this.
           use <- bool.guard(
             decimal_specified,
             Error(InvalidDecimalPosition(index - 1)),
@@ -56,7 +64,7 @@ pub fn parse_float(input: String) -> Result(String, ParseError) {
   }
 }
 
-pub fn parse_int(input: String) -> Result(String, ParseError) {
+pub fn parse_int(input: String) -> Result(Int, ParseError) {
   let tokens = input |> tokenizer.tokenize
   let index = 0
   let empty_string = ""
@@ -65,19 +73,23 @@ pub fn parse_int(input: String) -> Result(String, ParseError) {
   use #(leading_whitespace, tokens, index) <- result.try(pre_whitespace_result)
 
   let sign_result = parse_sign(tokens, index)
-  use #(sign, tokens, index) <- result.try(sign_result)
+  use #(is_positive, tokens, index) <- result.try(sign_result)
 
-  let digit_result = parse_digit(tokens, empty_string, index, index)
-  use #(digit, tokens, index) <- result.try(digit_result)
+  let digit_result = parse_digit(tokens, 0, index, index, 0)
+  use #(digit, _, tokens, index) <- result.try(digit_result)
 
   let post_whitespace_result = parse_whitespace(tokens, empty_string, index)
   use #(_, tokens, index) <- result.try(post_whitespace_result)
 
   case tokens |> list.first {
-    Ok(token) -> Error(tokenizer.error_for_token(token, index))
+    Ok(token) -> Error(tokenizer.to_error(token, index))
     _ -> {
       case leading_whitespace, digit {
-        Some(_), Some(digit) | None, Some(digit) -> Ok(sign <> digit)
+        Some(_), Some(digit) | None, Some(digit) ->
+          case is_positive {
+            True -> Ok(digit)
+            False -> Ok(-digit)
+          }
         Some(_), None -> Error(WhitespaceOnlyString)
         _, _ -> Error(EmptyString)
       }
@@ -115,14 +127,14 @@ fn parse_whitespace(
 fn parse_sign(
   tokens: List(Token),
   index: Int,
-) -> Result(#(String, List(Token), Int), ParseError) {
+) -> Result(#(Bool, List(Token), Int), ParseError) {
   case tokens {
-    [] -> Ok(#("+", tokens, index))
+    [] -> Ok(#(True, tokens, index))
     [first, ..rest] -> {
       case first {
         Unknown(character) -> Error(UnknownCharacter(character, index))
-        Sign(a) -> Ok(#(a, rest, index + 1))
-        _ -> Ok(#("+", tokens, index))
+        Sign(is_positive) -> Ok(#(is_positive, rest, index + 1))
+        _ -> Ok(#(True, tokens, index))
       }
     }
   }
@@ -146,17 +158,18 @@ fn parse_decimal_point(
 
 fn parse_digit(
   tokens: List(Token),
-  acc: String,
+  acc: Int,
   index: Int,
   beginning_index: Int,
-) -> Result(#(Option(String), List(Token), Int), ParseError) {
+  digit_length: Int,
+) -> Result(#(Option(Int), Int, List(Token), Int), ParseError) {
   let at_beginning = index == beginning_index
 
   case tokens {
     [] ->
-      case acc {
-        "" -> Ok(#(None, tokens, index))
-        _ -> Ok(#(Some(acc), tokens, index))
+      case digit_length > 0 {
+        True -> Ok(#(Some(acc), digit_length, tokens, index))
+        False -> Ok(#(None, digit_length, tokens, index))
       }
     [first, ..rest] -> {
       let lookahead = rest |> list.first
@@ -170,23 +183,52 @@ fn parse_digit(
       }
 
       case first {
-        Digit(digit) ->
-          parse_digit(rest, acc <> digit, index + 1, beginning_index)
+        Digit(digit) -> {
+          let acc = acc * 10 + digit
+          parse_digit(rest, acc, index + 1, beginning_index, digit_length + 1)
+        }
         Underscore if next_is_underscore ->
-          Error(parse_error.InvalidUnderscorePosition(index + 1))
+          Error(InvalidUnderscorePosition(index + 1))
         Underscore if at_beginning || is_end ->
-          Error(parse_error.InvalidUnderscorePosition(index))
-        Underscore -> parse_digit(rest, acc, index + 1, beginning_index)
+          Error(InvalidUnderscorePosition(index))
+        Underscore -> {
+          parse_digit(rest, acc, index + 1, beginning_index, digit_length)
+        }
         Whitespace(whitespace) if at_beginning ->
           Error(UnknownCharacter(whitespace, index))
         Unknown(character) -> Error(UnknownCharacter(character, index))
         _ -> {
-          case acc {
-            "" -> Ok(#(None, tokens, index))
-            _ -> Ok(#(Some(acc), tokens, index))
+          case digit_length > 0 {
+            True -> Ok(#(Some(acc), digit_length, tokens, index))
+            False -> Ok(#(None, digit_length, tokens, index))
           }
         }
       }
     }
   }
 }
+
+fn form_float(
+  is_positive: Bool,
+  whole_digit: Int,
+  fractional_digit: Int,
+  fractional_length: Int,
+) -> Float {
+  let whole_float = whole_digit |> int.to_float
+  let fractional_float =
+    fractional_digit
+    |> int.to_float
+    |> normalize_fractional_part(fractional_length)
+  let float_value = whole_float +. fractional_float
+  case is_positive {
+    True -> float_value
+    False -> float_value *. -1.0
+  }
+}
+
+fn normalize_fractional_part(value: Float, fractional_length: Int) -> Float {
+  case fractional_length <= 0 {
+    True -> value
+    False -> normalize_fractional_part(value /. 10.0, fractional_length - 1)
+  }
+}
diff --git a/src/lenient_parse/internal/tokenizer.gleam b/src/lenient_parse/internal/tokenizer.gleam
@@ -1,3 +1,4 @@
+import gleam/int
 import gleam/list
 import gleam/string
 import parse_error.{
@@ -6,8 +7,8 @@ import parse_error.{
 }
 
 pub type Token {
-  Sign(String)
-  Digit(String)
+  Sign(Bool)
+  Digit(Int)
   Underscore
   DecimalPoint
   Whitespace(String)
@@ -23,9 +24,18 @@ fn do_tokenize(characters: List(String), acc: List(Token)) -> List(Token) {
     [] -> acc |> list.reverse
     [first, ..rest] -> {
       let token = case first {
-        "-" | "+" -> Sign(first)
-        "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ->
-          Digit(first)
+        "-" -> Sign(False)
+        "+" -> Sign(True)
+        "0" -> Digit(0)
+        "1" -> Digit(1)
+        "2" -> Digit(2)
+        "3" -> Digit(3)
+        "4" -> Digit(4)
+        "5" -> Digit(5)
+        "6" -> Digit(6)
+        "7" -> Digit(7)
+        "8" -> Digit(8)
+        "9" -> Digit(9)
         "." -> DecimalPoint
         "_" -> Underscore
         " " | "\n" | "\t" | "\r" | "\f" | "\r\n" -> Whitespace(first)
@@ -37,10 +47,14 @@ fn do_tokenize(characters: List(String), acc: List(Token)) -> List(Token) {
   }
 }
 
-pub fn error_for_token(token: Token, index) -> ParseError {
+pub fn to_error(token: Token, index) -> ParseError {
   case token {
-    Digit(digit) -> InvalidDigitPosition(digit, index)
-    Sign(sign) -> InvalidSignPosition(sign, index)
+    Digit(digit) -> {
+      let digit = digit |> int.to_string
+      InvalidDigitPosition(digit, index)
+    }
+    Sign(True) -> InvalidSignPosition("+", index)
+    Sign(False) -> InvalidSignPosition("-", index)
     Underscore -> InvalidUnderscorePosition(index)
     Unknown(character) -> UnknownCharacter(character, index)
     Whitespace(whitespace) -> UnknownCharacter(whitespace, index)

diff --git a/src/parse_error.gleam b/src/parse_error.gleam
@@ -40,26 +40,11 @@ pub type ParseError {
   /// - `character`: The invalid character as a `String`.
   /// - `index`: The position of the invalid character in the input string.
   UnknownCharacter(character: String, index: Int)
-
-  /// Represents an error when Gleam's `float.parse` fails after custom parsing
-  /// and coercion.
-  ///
-  /// This indicates that the string couldn't be converted to a float even with
-  /// more permissive rules.
-  GleamFloatParseError
-
-  /// Represents an error when Gleam's `int.parse` fails after custom parsing
-  /// and coercion.
-  ///
-  /// This indicates that the string couldn't be converted to an integer even
-  /// with more permissive rules.
-  GleamIntParseError
 }
 
 @internal
 pub fn to_string(error: ParseError) -> String {
   case error {
-    GleamIntParseError -> "gleam integer parse error"
     UnknownCharacter(character, index) ->
       "unknown character \""
       <> character
@@ -69,7 +54,6 @@ pub fn to_string(error: ParseError) -> String {
       "invalid underscore at position: " <> index |> int.to_string
     EmptyString -> "empty string"
     WhitespaceOnlyString -> "whitespace only string"
-    GleamFloatParseError -> "gleam float parse error"
     InvalidDecimalPosition(index) ->
       "invalid decimal at position: " <> index |> int.to_string
     InvalidSignPosition(sign, index) ->

diff --git a/test/tokenizer_test.gleam b/test/tokenizer_test.gleam
@@ -13,18 +13,18 @@ pub fn tokenize_test() {
     Whitespace("\r"),
     Whitespace("\f"),
     Whitespace("\r\n"),
-    Sign("+"),
-    Sign("-"),
-    Digit("0"),
-    Digit("1"),
-    Digit("2"),
-    Digit("3"),
-    Digit("4"),
-    Digit("5"),
-    Digit("6"),
-    Digit("7"),
-    Digit("8"),
-    Digit("9"),
+    Sign(True),
+    Sign(False),
+    Digit(0),
+    Digit(1),
+    Digit(2),
+    Digit(3),
+    Digit(4),
+    Digit(5),
+    Digit(6),
+    Digit(7),
+    Digit(8),
+    Digit(9),
     DecimalPoint,
     Underscore,
     Unknown("a"),