Skip to content

Commit

Permalink
Do not rely on gleam parse functions (#37)
Browse files Browse the repository at this point in the history
  • Loading branch information
JosephTLyons authored Nov 4, 2024
1 parent 45aa8c2 commit 72eab98
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 89 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## v1.3.0 - xxx

Breaking changes:
- `GleamIntParseError` and `GleamIntParseErrorReason` have been removed.
- `InvalidCharacter` has been renamed to `UnknownCharacter`.
- A new `InvalidDigitPosition` error has been introduced.

Expand Down
18 changes: 3 additions & 15 deletions src/lenient_parse.gleam
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import gleam/float
import gleam/int
import gleam/result
import lenient_parse/internal/parse
import parse_error.{type ParseError, GleamFloatParseError, GleamIntParseError}
import parse_error.{type ParseError}

/// Converts a string to a float using a more lenient parsing method than gleam's `float.parse()`. It behaves similarly to Python's `float()` built-in function.
///
Expand All @@ -24,15 +21,7 @@ import parse_error.{type ParseError, GleamFloatParseError, GleamIntParseError}
/// lenient_parse.to_float("abc") // -> Error(UnknownCharacter("a", 0))
/// ```
pub fn to_float(text: String) -> Result(Float, ParseError) {
let text = text |> parse.parse_float
use text <- result.try(text)
let res = text |> float.parse |> result.replace_error(GleamFloatParseError)
use <- result.lazy_or(res)

text
|> int.parse
|> result.map(int.to_float)
|> result.replace_error(GleamIntParseError)
text |> parse.parse_float
}

/// Converts a string to an integer using a more lenient parsing method than gleam's `int.parse()`.
Expand All @@ -52,6 +41,5 @@ pub fn to_float(text: String) -> Result(Float, ParseError) {
/// lenient_parse.to_int("abc") // -> Error(UnknownCharacter("a", 0))
/// ```
pub fn to_int(text: String) -> Result(Int, ParseError) {
use text <- result.try(text |> parse.parse_int)
text |> int.parse |> result.replace_error(GleamIntParseError)
text |> parse.parse_int
}
118 changes: 80 additions & 38 deletions src/lenient_parse/internal/parse.gleam
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
import gleam/bool
import gleam/int
import gleam/list
import gleam/option.{type Option, None, Some}
import gleam/result
import lenient_parse/internal/tokenizer.{
type Token, DecimalPoint, Digit, Sign, Underscore, Unknown, Whitespace,
}
import parse_error.{
type ParseError, EmptyString, InvalidDecimalPosition, UnknownCharacter,
WhitespaceOnlyString,
type ParseError, EmptyString, InvalidDecimalPosition,
InvalidUnderscorePosition, UnknownCharacter, WhitespaceOnlyString,
}

pub fn parse_float(input: String) -> Result(String, ParseError) {
pub fn parse_float(input: String) -> Result(Float, ParseError) {
let tokens = input |> tokenizer.tokenize
let index = 0
let empty_string = ""
Expand All @@ -19,28 +20,35 @@ pub fn parse_float(input: String) -> Result(String, ParseError) {
use #(leading_whitespace, tokens, index) <- result.try(pre_whitespace_result)

let sign_result = parse_sign(tokens, index)
use #(sign, tokens, index) <- result.try(sign_result)
use #(is_positive, tokens, index) <- result.try(sign_result)

let digit_pre_decimal = parse_digit(tokens, empty_string, index, index)
use #(digit_pre_decimal, tokens, index) <- result.try(digit_pre_decimal)
let whole_digit_result = parse_digit(tokens, 0, index, index, 0)
use #(whole_digit, _, tokens, index) <- result.try(whole_digit_result)

let decimal_point_result = parse_decimal_point(tokens, index)
use #(decimal_specified, tokens, index) <- result.try(decimal_point_result)
let decimal_result = parse_decimal_point(tokens, index)
use #(decimal_specified, tokens, index) <- result.try(decimal_result)

let digit_post_decimal = parse_digit(tokens, empty_string, index, index)
use #(digit_post_decimal, tokens, index) <- result.try(digit_post_decimal)
let fractional_digit_result = parse_digit(tokens, 0, index, index, 0)
use #(fractional_digit, fractional_digit_length, tokens, index) <- result.try(
fractional_digit_result,
)

let post_whitespace_result = parse_whitespace(tokens, empty_string, index)
use #(_, tokens, index) <- result.try(post_whitespace_result)

case tokens |> list.first {
Ok(token) -> Error(tokenizer.error_for_token(token, index))
Ok(token) -> Error(tokenizer.to_error(token, index))
_ -> {
case digit_pre_decimal, digit_post_decimal {
Some(pre), Some(post) -> Ok(sign <> pre <> "." <> post)
Some(pre), None -> Ok(sign <> pre <> ".0")
None, Some(post) -> Ok(sign <> "0." <> post)
case whole_digit, fractional_digit {
Some(whole), Some(fractional) ->
Ok(form_float(is_positive, whole, fractional, fractional_digit_length))
Some(whole), None ->
Ok(form_float(is_positive, whole, 0, fractional_digit_length))
None, Some(fractional) ->
Ok(form_float(is_positive, 0, fractional, fractional_digit_length))
_, _ -> {
// TODO: This branch is hardcoded to handle one specific test case from the rewrite: ".".
// There is likely a better way to handle this.
use <- bool.guard(
decimal_specified,
Error(InvalidDecimalPosition(index - 1)),
Expand All @@ -56,7 +64,7 @@ pub fn parse_float(input: String) -> Result(String, ParseError) {
}
}

pub fn parse_int(input: String) -> Result(String, ParseError) {
pub fn parse_int(input: String) -> Result(Int, ParseError) {
let tokens = input |> tokenizer.tokenize
let index = 0
let empty_string = ""
Expand All @@ -65,19 +73,23 @@ pub fn parse_int(input: String) -> Result(String, ParseError) {
use #(leading_whitespace, tokens, index) <- result.try(pre_whitespace_result)

let sign_result = parse_sign(tokens, index)
use #(sign, tokens, index) <- result.try(sign_result)
use #(is_positive, tokens, index) <- result.try(sign_result)

let digit_result = parse_digit(tokens, empty_string, index, index)
use #(digit, tokens, index) <- result.try(digit_result)
let digit_result = parse_digit(tokens, 0, index, index, 0)
use #(digit, _, tokens, index) <- result.try(digit_result)

let post_whitespace_result = parse_whitespace(tokens, empty_string, index)
use #(_, tokens, index) <- result.try(post_whitespace_result)

case tokens |> list.first {
Ok(token) -> Error(tokenizer.error_for_token(token, index))
Ok(token) -> Error(tokenizer.to_error(token, index))
_ -> {
case leading_whitespace, digit {
Some(_), Some(digit) | None, Some(digit) -> Ok(sign <> digit)
Some(_), Some(digit) | None, Some(digit) ->
case is_positive {
True -> Ok(digit)
False -> Ok(-digit)
}
Some(_), None -> Error(WhitespaceOnlyString)
_, _ -> Error(EmptyString)
}
Expand Down Expand Up @@ -115,14 +127,14 @@ fn parse_whitespace(
fn parse_sign(
tokens: List(Token),
index: Int,
) -> Result(#(String, List(Token), Int), ParseError) {
) -> Result(#(Bool, List(Token), Int), ParseError) {
case tokens {
[] -> Ok(#("+", tokens, index))
[] -> Ok(#(True, tokens, index))
[first, ..rest] -> {
case first {
Unknown(character) -> Error(UnknownCharacter(character, index))
Sign(a) -> Ok(#(a, rest, index + 1))
_ -> Ok(#("+", tokens, index))
Sign(is_positive) -> Ok(#(is_positive, rest, index + 1))
_ -> Ok(#(True, tokens, index))
}
}
}
Expand All @@ -146,17 +158,18 @@ fn parse_decimal_point(

fn parse_digit(
tokens: List(Token),
acc: String,
acc: Int,
index: Int,
beginning_index: Int,
) -> Result(#(Option(String), List(Token), Int), ParseError) {
digit_length: Int,
) -> Result(#(Option(Int), Int, List(Token), Int), ParseError) {
let at_beginning = index == beginning_index

case tokens {
[] ->
case acc {
"" -> Ok(#(None, tokens, index))
_ -> Ok(#(Some(acc), tokens, index))
case digit_length > 0 {
True -> Ok(#(Some(acc), digit_length, tokens, index))
False -> Ok(#(None, digit_length, tokens, index))
}
[first, ..rest] -> {
let lookahead = rest |> list.first
Expand All @@ -170,23 +183,52 @@ fn parse_digit(
}

case first {
Digit(digit) ->
parse_digit(rest, acc <> digit, index + 1, beginning_index)
Digit(digit) -> {
let acc = acc * 10 + digit
parse_digit(rest, acc, index + 1, beginning_index, digit_length + 1)
}
Underscore if next_is_underscore ->
Error(parse_error.InvalidUnderscorePosition(index + 1))
Error(InvalidUnderscorePosition(index + 1))
Underscore if at_beginning || is_end ->
Error(parse_error.InvalidUnderscorePosition(index))
Underscore -> parse_digit(rest, acc, index + 1, beginning_index)
Error(InvalidUnderscorePosition(index))
Underscore -> {
parse_digit(rest, acc, index + 1, beginning_index, digit_length)
}
Whitespace(whitespace) if at_beginning ->
Error(UnknownCharacter(whitespace, index))
Unknown(character) -> Error(UnknownCharacter(character, index))
_ -> {
case acc {
"" -> Ok(#(None, tokens, index))
_ -> Ok(#(Some(acc), tokens, index))
case digit_length > 0 {
True -> Ok(#(Some(acc), digit_length, tokens, index))
False -> Ok(#(None, digit_length, tokens, index))
}
}
}
}
}
}

/// Combines the separately parsed parts of a number into a single signed float.
///
/// - `is_positive`: `False` when the result should be negated.
/// - `whole_digit`: integer value of the digits before the decimal point.
/// - `fractional_digit`: integer value of the digits after the decimal point.
/// - `fractional_length`: how many digits `fractional_digit` was built from,
///   used to scale it back below 1.
fn form_float(
  is_positive: Bool,
  whole_digit: Int,
  fractional_digit: Int,
  fractional_length: Int,
) -> Float {
  let fraction =
    normalize_fractional_part(int.to_float(fractional_digit), fractional_length)
  let magnitude = int.to_float(whole_digit) +. fraction

  // Positive values are returned untouched; only negatives are multiplied.
  use <- bool.guard(is_positive, magnitude)
  magnitude *. -1.0
}

/// Scales `value` down by `10^fractional_length`, turning the integer value of
/// the fractional digits into the fraction it represents (e.g. `7.0` with a
/// length of `2` becomes `0.07`). A length of zero or less returns `value`
/// unchanged.
///
/// Dividing by `10.0` once per digit rounds at every step and drifts
/// (`7.0 /. 10.0 /. 10.0` yields `0.06999999999999999`). Instead we divide by a
/// whole power of ten. Powers up to `10^22` are exactly representable as
/// floats, so each division rounds only once; longer fractions are handled in
/// chunks of 22 digits so the divisor can never overflow.
fn normalize_fractional_part(value: Float, fractional_length: Int) -> Float {
  case fractional_length <= 0 {
    True -> value
    False -> {
      let step = int.min(fractional_length, 22)
      let scaled = value /. power_of_ten(step, 1.0)
      normalize_fractional_part(scaled, fractional_length - step)
    }
  }
}

/// Returns `acc *. 10^exponent` by repeated multiplication.
fn power_of_ten(exponent: Int, acc: Float) -> Float {
  case exponent <= 0 {
    True -> acc
    False -> power_of_ten(exponent - 1, acc *. 10.0)
  }
}
30 changes: 22 additions & 8 deletions src/lenient_parse/internal/tokenizer.gleam
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import gleam/int
import gleam/list
import gleam/string
import parse_error.{
Expand All @@ -6,8 +7,8 @@ import parse_error.{
}

pub type Token {
Sign(String)
Digit(String)
Sign(Bool)
Digit(Int)
Underscore
DecimalPoint
Whitespace(String)
Expand All @@ -23,9 +24,18 @@ fn do_tokenize(characters: List(String), acc: List(Token)) -> List(Token) {
[] -> acc |> list.reverse
[first, ..rest] -> {
let token = case first {
"-" | "+" -> Sign(first)
"0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ->
Digit(first)
"-" -> Sign(False)
"+" -> Sign(True)
"0" -> Digit(0)
"1" -> Digit(1)
"2" -> Digit(2)
"3" -> Digit(3)
"4" -> Digit(4)
"5" -> Digit(5)
"6" -> Digit(6)
"7" -> Digit(7)
"8" -> Digit(8)
"9" -> Digit(9)
"." -> DecimalPoint
"_" -> Underscore
" " | "\n" | "\t" | "\r" | "\f" | "\r\n" -> Whitespace(first)
Expand All @@ -37,10 +47,14 @@ fn do_tokenize(characters: List(String), acc: List(Token)) -> List(Token) {
}
}

pub fn error_for_token(token: Token, index) -> ParseError {
pub fn to_error(token: Token, index) -> ParseError {
case token {
Digit(digit) -> InvalidDigitPosition(digit, index)
Sign(sign) -> InvalidSignPosition(sign, index)
Digit(digit) -> {
let digit = digit |> int.to_string
InvalidDigitPosition(digit, index)
}
Sign(True) -> InvalidSignPosition("+", index)
Sign(False) -> InvalidSignPosition("-", index)
Underscore -> InvalidUnderscorePosition(index)
Unknown(character) -> UnknownCharacter(character, index)
Whitespace(whitespace) -> UnknownCharacter(whitespace, index)
Expand Down
16 changes: 0 additions & 16 deletions src/parse_error.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -40,26 +40,11 @@ pub type ParseError {
/// - `character`: The invalid character as a `String`.
/// - `index`: The position of the invalid character in the input string.
UnknownCharacter(character: String, index: Int)

/// Represents an error when Gleam's `float.parse` fails after custom parsing
/// and coercion.
///
/// This indicates that the string couldn't be converted to a float even with
/// more permissive rules.
GleamFloatParseError

/// Represents an error when Gleam's `int.parse` fails after custom parsing
/// and coercion.
///
/// This indicates that the string couldn't be converted to an integer even
/// with more permissive rules.
GleamIntParseError
}

@internal
pub fn to_string(error: ParseError) -> String {
case error {
GleamIntParseError -> "gleam integer parse error"
UnknownCharacter(character, index) ->
"unknown character \""
<> character
Expand All @@ -69,7 +54,6 @@ pub fn to_string(error: ParseError) -> String {
"invalid underscore at position: " <> index |> int.to_string
EmptyString -> "empty string"
WhitespaceOnlyString -> "whitespace only string"
GleamFloatParseError -> "gleam float parse error"
InvalidDecimalPosition(index) ->
"invalid decimal at position: " <> index |> int.to_string
InvalidSignPosition(sign, index) ->
Expand Down
24 changes: 12 additions & 12 deletions test/tokenizer_test.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,18 @@ pub fn tokenize_test() {
Whitespace("\r"),
Whitespace("\f"),
Whitespace("\r\n"),
Sign("+"),
Sign("-"),
Digit("0"),
Digit("1"),
Digit("2"),
Digit("3"),
Digit("4"),
Digit("5"),
Digit("6"),
Digit("7"),
Digit("8"),
Digit("9"),
Sign(True),
Sign(False),
Digit(0),
Digit(1),
Digit(2),
Digit(3),
Digit(4),
Digit(5),
Digit(6),
Digit(7),
Digit(8),
Digit(9),
DecimalPoint,
Underscore,
Unknown("a"),
Expand Down

0 comments on commit 72eab98

Please sign in to comment.