Skip to content

Commit

Permalink
Merge pull request #211 from yoavg/dev
Browse files Browse the repository at this point in the history
support negative spelled-out numbers
  • Loading branch information
nielstron authored Jan 16, 2023
2 parents d08adba + 1be1b2c commit 6672f17
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 3 deletions.
17 changes: 14 additions & 3 deletions quantulum3/_lang/en_US/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def clean_surface(surface, span):

###############################################################################
def split_spellout_sequence(text, span):
negatives = reg.negatives(lang)
units = reg.units(lang)
tens = reg.tens(lang)
scales = reg.scales(lang)
Expand All @@ -78,19 +79,22 @@ def split_spellout_sequence(text, span):
)
# if should start a new sequence
# split on:
# a minus word (minus ten)
# unit -> unit (one two three)
# unit -> tens (five twenty)
# tens -> tens (twenty thirty)
# same scale starts (hundred and one hundred and two)
should_split = False
if prev_word_rank == 1 and rank in [1, 2]:
if word in negatives:
should_split = True
elif prev_word_rank == 1 and rank in [1, 2]:
should_split = True
elif prev_word_rank == 2 and rank == 2:
should_split = True
elif rank >= 3 and rank == prev_scale:
should_split = True
prev_scale = rank
if should_split:
if should_split and last_word_end > 0:
# yield up to here
adjust = 0
if prev_word.lower() in [
Expand Down Expand Up @@ -141,6 +145,7 @@ def extract_spellout_values(text):
):
continue
try:
is_negative = False
surface, span = clean_surface(seq, span)
if not surface:
continue
Expand All @@ -160,6 +165,9 @@ def extract_spellout_values(text):
except ValueError:
match = re.search(reg.numberwords_regex(), word)
scale, increment = reg.numberwords(lang)[match.group(0)]
if scale < 0: # negative, must be the first word in the sequence
is_negative = True
continue
if (
scale > 0
and increment == 0
Expand All @@ -178,11 +186,14 @@ def extract_spellout_values(text):
if scale > 100 or word == "and":
result += curr
curr = 0.0
value = result + curr
if is_negative:
value = -value
values.append(
{
"old_surface": surface,
"old_span": span,
"new_surface": str(result + curr),
"new_surface": str(value),
}
)
except (KeyError, AttributeError):
Expand Down
2 changes: 2 additions & 0 deletions quantulum3/_lang/en_US/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@

# NOTE(review): presumably the surface strings that express division between
# units (as in "metres per second") -- confirm against the code that reads it.
DIVISION_OPERATORS = {" per ", " a "}

# Words that mark a spelled-out number as negative, e.g. "minus ten".
NEGATIVES = {"minus", "negative"}

# NOTE(review): presumably the characters accepted as digit-group separators
# and as the decimal mark when parsing numerals -- confirm against the callers.
GROUPING_OPERATORS = {",", " "}
DECIMAL_OPERATORS = {"."}

Expand Down
8 changes: 8 additions & 0 deletions quantulum3/_lang/en_US/tests/extract_spellout_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@
("hundred and five hundred and six", ["105.0", "106.0"]), # this is ambiguous..
("hundred and five twenty two", ["105.0", "22.0"]),
("hundred and five twenty two million", ["105.0", "22000000.0"]),
## negatives
("minus ten", ["-10.0"]),
("minus a million and a half", ["-1000000.5"]),
("negative million and a half", ["-1000000.5"]),
## negative splitting
("minus twenty five and thirty six", ["-25.0", "36.0"]),
("twenty five and minus thirty six", ["25.0", "-36.0"]),
("negative twenty five and minus thirty six", ["-25.0", "-36.0"]),
]


Expand Down
6 changes: 6 additions & 0 deletions quantulum3/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ def powers(lang="en_US"):
return _get_regex(lang).POWERS


def negatives(lang="en_US"):
    """Return the ``NEGATIVES`` attribute of the object ``_get_regex(lang)`` yields.

    :param lang: language code handed to ``_get_regex``; defaults to ``"en_US"``.
    """
    lang_regex = _get_regex(lang)
    return lang_regex.NEGATIVES


def exponents_regex(lang="en_US"):
    """Return the ``EXPONENTS_REGEX`` attribute of the object ``_get_regex(lang)`` yields.

    :param lang: language code handed to ``_get_regex``; defaults to ``"en_US"``.
    """
    lang_regex = _get_regex(lang)
    return lang_regex.EXPONENTS_REGEX

Expand Down Expand Up @@ -75,6 +79,8 @@ def numberwords(lang="en_US"):

numwords.update(miscnum(lang))

for word in negatives(lang):
numwords[word] = (-1, 0)
for idx, word in enumerate(units(lang)):
numwords[word] = (1, idx)
for idx, word in enumerate(tens(lang)):
Expand Down

0 comments on commit 6672f17

Please sign in to comment.