Skip to content

Commit

Permalink
ENH: Numerizer as a spaCy extension (#10)
Browse files Browse the repository at this point in the history
  • Loading branch information
jaidevd authored Apr 3, 2021
1 parent b179988 commit 3caeadf
Show file tree
Hide file tree
Showing 6 changed files with 161 additions and 11 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ install:
- hash -r
- conda config --set always_yes yes --set changeps1 no
# Install pip modules
- pip install flake8 nose coverage
- pip install flake8 nose coverage spacy
- python -m spacy download en_core_web_sm
# Set up variables
- export BRANCH=$TRAVIS_BRANCH

Expand Down
37 changes: 36 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ This is a port of the Ruby gem `numerizer
Installation
------------

The NLG library can be installed from PyPI as follows:
The numerizer library can be installed from PyPI as follows:

.. code:: bash
Expand Down Expand Up @@ -52,6 +52,41 @@ Usage
'platform 9.75'
Using the SpaCy extension
^^^^^^^^^^^^^^^^^^^^^^^^^

Since version 0.2, numerizer is available as a `SpaCy extension <https://spacy.io/usage/processing-pipelines#custom-components-attributes>`_.

Any named entities of a quantitative nature within a spaCy document can be numerized as follows:

.. code:: python

    >>> from spacy import load
    >>> nlp = load('en_core_web_sm') # or load any other spaCy model
    >>> doc = nlp('The projected revenue for the next quarter is over two million dollars.')
    >>> doc._.numerize()
    {the next quarter: 'the next 1/4', over two million dollars: 'over 2000000 dollars'}

Users can restrict which entity types are numerized by passing the ``labels`` argument to the extension method, as follows:

.. code:: python

    >>> doc._.numerize(labels=['MONEY']) # only numerize entities of type 'MONEY'
    {over two million dollars: 'over 2000000 dollars'}

The extension is also available on spans (as the ``numerize()`` method) and on tokens (as the ``numerized`` attribute):

.. code:: python

    >>> two_million = doc[-4:-2] # span corresponding to "two million"
    >>> two_million._.numerize()
    '2000000'
    >>> quarter = doc[6] # token corresponding to "quarter"
    >>> quarter._.numerized
    '1/4'

Extras
------

Expand Down
2 changes: 1 addition & 1 deletion numerizer/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .numerizer import numerize # NOQA: F401
from .numerizer import numerize, spacy_numerize # NOQA: F401
88 changes: 81 additions & 7 deletions numerizer/numerizer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import re
from . import consts
import sys
# spaCy is treated as optional: when it cannot be imported the module still
# loads and SPACY_INSTALLED is False, which disables the spaCy extensions.
try:
    import spacy
except ImportError:
    SPACY_INSTALLED = False
else:
    SPACY_INSTALLED = True
    try:
        nlp = spacy.load('en_core_web_sm')
    except OSError:
        # spaCy is importable but the 'en_core_web_sm' model has not been
        # downloaded. Registering the extensions only needs spaCy itself, so
        # a missing model must not make importing this module fail.
        nlp = None


# Matches either a run of one or more spaces, or a hyphen that has a non-digit
# character on each side (those two characters are captured as groups 1 and 2).
HYPHENATED = re.compile(r' +|([^\d])-([^\d])')
# Case-insensitive `re.sub`: replace pattern `x` with `y` in string `s`.
isub = lambda x, y, s: re.sub(x, y, s, flags=re.IGNORECASE) # noqa: E731
# Entity labels that `spacy_numerize` processes when called with labels='all'.
SPACY_ENT_LABELS = ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'CARDINAL', 'ORDINAL']


# Replacement regular expressions - to be used only in `re.sub`
Expand Down Expand Up @@ -259,9 +265,77 @@ def _repl_frac_cleanup(m):
return s


def main():
    """Numerize the first command-line argument and print the result."""
    text = sys.argv[1]
    print(numerize(text))  # NOQA


if __name__ == "__main__":
    main()
def _span_numerize(span):
    """Return the numerized form of the text of a spaCy span or token.

    Registered both as the ``numerize`` method on spans and as the getter of
    the ``numerized`` attribute on tokens, so `span` may be either type; only
    its ``text`` attribute is read.
    """
    text = span.text
    return numerize(text)


def spacy_numerize(doc, labels='all', retokenize=False):
    """Numerize the named entities of a spaCy document.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        The spaCy document to be numerized.
    labels : str / list, optional
        The entity label(s) to be processed for numerization.
        By default ('all'), entities carrying any of the labels
        ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'CARDINAL', 'ORDINAL']
        are numerized. Any subset of this list can be specified to restrict
        the types of entities to be numerized; a single label may also be
        given as a plain string (e.g. ``labels='MONEY'``).
        If `labels` is empty or None, entities are ignored and the text of the
        whole document is numerized instead.
    retokenize : bool, optional
        If True, the original document is retokenized such that the span
        corresponding to each numerized entity becomes a single token.

    Returns
    -------
    dict
        Mapping of each selected entity span to its numerized text, when
        `retokenize` is False.
    spacy.tokens.Doc
        The same `doc`, retokenized in place, when `retokenize` is True.
    str
        The result of ``numerize(doc.text)`` when `labels` is empty or None.
    None
        When spaCy is not installed (a warning is issued).

    Examples
    --------
    >>> from spacy import load
    >>> nlp = load('en_core_web_sm')
    >>> spacy_numerize(nlp('The Hogwarts Express is at platform nine and three quarters.'))
    {nine and three quarters: '9.75'}
    >>> spacy_numerize(
    ...     nlp('Their revenue has been a billion dollars, as of six months ago.'),
    ...     labels=['MONEY']
    ... )
    {a billion dollars: '1000000000 dollars'}
    >>> doc = nlp('The Hogwarts Express is at platform nine and three quarters.')
    >>> doc = spacy_numerize(doc, retokenize=True)
    >>> [(c.text, c._.numerized) for c in doc]
    [('The', 'The'),
     ('Hogwarts', 'Hogwarts'),
     ('Express', 'Express'),
     ('is', 'is'),
     ('at', 'at'),
     ('platform', 'platform'),
     ('nine and three quarters', '9.75'),
     ('.', '.')]
    """
    if not SPACY_INSTALLED:
        import warnings
        warnings.warn('SpaCy is not installed. Please pip install spacy.')
        return
    if labels == 'all':
        labels = SPACY_ENT_LABELS
    elif not labels:
        return numerize(doc.text)
    elif isinstance(labels, str):
        # A bare string would otherwise be tested with substring semantics
        # (`span.label_ in 'MONEY'`); wrap it so labels are compared whole.
        labels = [labels]
    numerized_spans = {
        span: span._.numerize()
        for span in doc.ents
        if span.label_ in labels
    }
    if not retokenize:
        return numerized_spans
    with doc.retokenize() as retokenizer:
        for span, numerized in numerized_spans.items():
            # Merge each entity into one token and pass its numerized text as
            # the value of the token's `numerized` extension attribute.
            retokenizer.merge(span, attrs={'_': {'numerized': numerized}})
    return doc


def _span_setter(token, numerized): return # NOQA: E704


def register_extension():
    """Register the numerizer extensions on spaCy's Token, Span and Doc.

    Does nothing when spaCy is not installed. Otherwise registers:

    * ``Token._.numerized`` - attribute computed by `_span_numerize`
      (assignments go through the no-op `_span_setter`),
    * ``Span._.numerize()`` - method backed by `_span_numerize`,
    * ``Doc._.numerize()`` - method backed by `spacy_numerize`.

    An extension name that is already registered is left untouched, so
    calling this function more than once (e.g. when the module is reloaded)
    does not raise.
    """
    if not SPACY_INSTALLED:
        return
    tokens = spacy.tokens
    if not tokens.Token.has_extension('numerized'):
        tokens.Token.set_extension(
            'numerized', getter=_span_numerize,
            setter=_span_setter)
    if not tokens.Span.has_extension('numerize'):
        tokens.Span.set_extension('numerize', method=_span_numerize)
    if not tokens.Doc.has_extension('numerize'):
        tokens.Doc.set_extension('numerize', method=spacy_numerize)


# Register at import time so the extensions are available as soon as the
# package has been imported.
register_extension()
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
'Documentation': 'https://github.com/jaidevd/numerizer/tree/master/README.rst',
'Source Code': 'https://github.com/jaidevd/numerizer'
}
VERSION = '0.1.5'
VERSION = '0.2.0'

# Requirements
install_requires = []
Expand Down
40 changes: 40 additions & 0 deletions test_numerize.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from numerizer import numerizer as num
from spacy import load
from spacy.tokens import Token
numerize = num.numerize
nlp = load('en_core_web_sm')


def test_init():
Expand Down Expand Up @@ -222,3 +225,40 @@ def test_numerize_big_prefixes():
def test_misc():
    """A long cardinal with 'and' and a hyphenated tail becomes one number."""
    phrase = 'two hundred twenty five thousand seven hundred and fifty-five'
    assert numerize(phrase) == '225755'


# Test the spacy extensions

def test_spacy_default():
    """With default arguments the Doc extension returns a span -> text dict."""
    doc = nlp('The Hogwarts Express is at platform nine and three quarters.')
    result = doc._.numerize()
    assert isinstance(result, dict)
    assert len(result) == 1
    span, value = result.popitem()
    assert span.text == 'nine and three quarters'
    assert value == '9.75'


def test_entity_filters():
    """Only entities whose label is listed in `labels` are numerized."""
    text = """
Their revenue has been a billion dollars, as of six months ago.
The next quarter is not so promising."""
    doc = nlp(text)
    result = doc._.numerize(labels=['MONEY'])
    assert len(result) == 1
    span, value = result.popitem()
    assert span.text == 'a billion dollars'
    assert value == '1000000000 dollars'


def test_retokenize():
    """With retokenize=True the numerized entity is merged into one token."""
    doc = nlp('The Hogwarts Express is at platform nine and three quarters.')
    doc._.numerize(retokenize=True)
    merged = doc[-2]
    assert isinstance(merged, Token)
    assert merged.text == 'nine and three quarters'
    assert merged._.numerized == '9.75'


def test_span_token_extensions():
    """Spans expose `numerize()` and tokens expose the `numerized` attribute."""
    doc = nlp('The projected revenue for the next quarter is over two million dollars.')
    two_million = doc[-4:-2]
    quarter = doc[6]
    assert two_million._.numerize() == '2000000'
    assert quarter._.numerized == '1/4'

0 comments on commit 3caeadf

Please sign in to comment.