Skip to content

Commit

Permalink
FIX: Improve unpreceded fractions
Browse files (browse the repository at this point in the history)
See: #23
  • Loading branch information
jaidevd committed Sep 25, 2024
1 parent 49f2692 commit a0b053a
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 6 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,6 @@ script:
- python -m spacy download en_core_web_sm
- python -m spacy download en_core_web_md
- python -m spacy download en_core_web_lg
- python -m spacy download en_core_web_trf
- coverage run -m pytest
- coverage report -m
8 changes: 2 additions & 6 deletions numerizer/numerizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@ def _repl_all_fractions(m):
return f'<num>{m.group(1)}' + str(consts.ALL_FRACTIONS[m.group(1).lower()])


# Public

def preprocess(s):
s = re.sub(HYPHENATED, r'\1 \2', s)
s = re.sub(r'\ba$', '', s)
Expand Down Expand Up @@ -81,7 +79,6 @@ def numerize_numerals(s, ignore=None, bias=None):
if m is not None:
s = re.sub(pat, lambda m: f'{m.group(1)}{m.group(2)} hundred{m.group(3)}', s)

#
pat = re.compile(r'(^|\W)({0})(?=$|\W)'.format(dir_single_nums), flags=re.IGNORECASE)
m = re.search(pat, s)
if m is not None:
Expand Down Expand Up @@ -135,13 +132,11 @@ def numerize_fractions(s, ignore=None, bias=None):
ignore=ignore + ['quarter', 'quarters'])
quarters = regexify(['quarter', 'quarters'], ignore=ignore)

#
pat = re.compile(r'a ({})(?=$|\W)'.format(fractionals), flags=re.IGNORECASE)
m = re.search(pat, s)
if m is not None:
s = re.sub(pat, _repl_all_fractions, s)

#
if bias == 'fractional':
pat = re.compile(r'(^|\W)({})(?=$|\W)'.format(fractionals), flags=re.IGNORECASE)
m = re.search(pat, s)
Expand Down Expand Up @@ -274,7 +269,8 @@ def _repl_frac_cleanup(m):

# fix unpreceded fractions
s = re.sub(r'(?:(?<=^)|(?<=[^\w)]))\/(\d+)', r'1/\1', s)
s = re.sub(r'(?<=[a-zA-Z])\/(\d+)', r'1/\1', s)
# Following not needed yet, see: https://github.com/jaidevd/numerizer/issues/23
# s = re.sub(r'(?<=[a-zA-Z])\/(\d+)', r'1/\1', s)
return s


Expand Down
6 changes: 6 additions & 0 deletions test_numerize.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def test_fraction():
assert numerize("two and a half") == "2.5"
assert numerize("three quarters") == "3/4"
assert numerize("two and three eighths") == "2.375"
assert numerize("2B/2B") == "2B/2B"


def test_straight_parsing():
Expand Down Expand Up @@ -314,6 +315,11 @@ def test_span_token_extensions(self):
assert doc[-4:-2]._.numerize() == "2000000"
assert doc[6]._.numerized == "1/4"

def test_article(self):
# Regression test, see: https://github.com/jaidevd/numerizer/issues/24
# The input sentence uses the articles "A"/"a" before ordinary nouns
# ("cat", "baby") and contains one number phrase, "a hundred".
# popitem() takes a single (key, value) pair from the object returned by
# numerize() -- presumably a dict mapping spans to numerized text; confirm
# against the extension's definition -- and that value must be "100".
_, val = nlp("A cat, a baby and a hundred puppies.")._.numerize().popitem()
assert val == "100"

@skipUnless(TRF_INSTALLED, "python -m spacy download en_core_web_trf")
def test_whitespace(self):
# See https://github.com/jaidevd/numerizer/issues/25
Expand Down

0 comments on commit a0b053a

Please sign in to comment.