FIX: Improve unpreceded fractions

See: #23
jaidevd · Sep 25, 2024 · a0b053a · a0b053a
1 parent 49f2692
commit a0b053a
Show file tree

Hide file tree

Showing 3 changed files with 9 additions and 6 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -21,5 +21,6 @@ script:
   - python -m spacy download en_core_web_sm
   - python -m spacy download en_core_web_md
   - python -m spacy download en_core_web_lg
+  - python -m spacy download en_core_web_trf
   - coverage run -m pytest
   - coverage report -m
diff --git a/numerizer/numerizer.py b/numerizer/numerizer.py
@@ -42,8 +42,6 @@ def _repl_all_fractions(m):
     return f'<num>{m.group(1)}' + str(consts.ALL_FRACTIONS[m.group(1).lower()])
 
 
-# Public
-
 def preprocess(s):
     s = re.sub(HYPHENATED, r'\1 \2', s)
     s = re.sub(r'\ba$', '', s)
@@ -81,7 +79,6 @@ def numerize_numerals(s, ignore=None, bias=None):
     if m is not None:
         s = re.sub(pat, lambda m: f'{m.group(1)}{m.group(2)} hundred{m.group(3)}', s)
 
-    #
     pat = re.compile(r'(^|\W)({0})(?=$|\W)'.format(dir_single_nums), flags=re.IGNORECASE)
     m = re.search(pat, s)
     if m is not None:
@@ -135,13 +132,11 @@ def numerize_fractions(s, ignore=None, bias=None):
                                ignore=ignore + ['quarter', 'quarters'])
     quarters = regexify(['quarter', 'quarters'], ignore=ignore)
 
-    #
     pat = re.compile(r'a ({})(?=$|\W)'.format(fractionals), flags=re.IGNORECASE)
     m = re.search(pat, s)
     if m is not None:
         s = re.sub(pat, _repl_all_fractions, s)
 
-    #
     if bias == 'fractional':
         pat = re.compile(r'(^|\W)({})(?=$|\W)'.format(fractionals), flags=re.IGNORECASE)
         m = re.search(pat, s)
@@ -274,7 +269,8 @@ def _repl_frac_cleanup(m):
 
     # fix unpreceded fractions
     s = re.sub(r'(?:(?<=^)|(?<=[^\w)]))\/(\d+)', r'1/\1', s)
-    s = re.sub(r'(?<=[a-zA-Z])\/(\d+)', r'1/\1', s)
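+    # The substitution below is disabled for now: it inserts a "1" before any
+    # letter-preceded "/<digits>", which mangles inputs such as "2B/2B".
+    # See: https://github.com/jaidevd/numerizer/issues/23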
+    # s = re.sub(r'(?<=[a-zA-Z])\/(\d+)', r'1/\1', s)
     return s
 
 

diff --git a/test_numerize.py b/test_numerize.py
@@ -43,6 +43,7 @@ def test_fraction():
     assert numerize("two and a half") == "2.5"
     assert numerize("three quarters") == "3/4"
     assert numerize("two and three eighths") == "2.375"
+    assert numerize("2B/2B") == "2B/2B"
 
 
 def test_straight_parsing():
@@ -314,6 +315,11 @@ def test_span_token_extensions(self):
         assert doc[-4:-2]._.numerize() == "2000000"
         assert doc[6]._.numerized == "1/4"
 
+    def test_article(self):
+        # See: https://github.com/jaidevd/numerizer/issues/24
+        _, val = nlp("A cat, a baby and a hundred puppies.")._.numerize().popitem()
+        assert val == "100"
+
     @skipUnless(TRF_INSTALLED, "python -m spacy download en_core_web_trf")
     def test_whitespace(self):
         # See https://github.com/jaidevd/numerizer/issues/25