diff --git a/.travis.yml b/.travis.yml index 2403591..f93d82b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,8 @@ install: - hash -r - conda config --set always_yes yes --set changeps1 no # Install pip modules - - pip install flake8 nose coverage + - pip install flake8 nose coverage spacy + - python -m spacy download en_core_web_sm # Set up variables - export BRANCH=$TRAVIS_BRANCH diff --git a/README.rst b/README.rst index c6f4483..340e33e 100644 --- a/README.rst +++ b/README.rst @@ -10,7 +10,7 @@ This is a port of the Ruby gem `numerizer Installation ------------ -The NLG library can be installed from PyPI as follows: +The numerizer library can be installed from PyPI as follows: .. code:: bash @@ -52,6 +52,41 @@ Usage 'platform 9.75' +Using the spaCy extension +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Since version 0.2, numerizer is available as a `spaCy extension <https://spacy.io/usage/processing-pipelines#custom-components-attributes>`_. + +Any named entities of a quantitative nature within a spaCy document can be numerized as follows: + +.. code:: python + + >>> from spacy import load + >>> nlp = load('en_core_web_sm') # or load any other spaCy model + >>> doc = nlp('The projected revenue for the next quarter is over two million dollars.') + >>> doc._.numerize() + {the next quarter: 'the next 1/4', over two million dollars: 'over 2000000 dollars'} + +Users can specify which entity types are to be numerized by using the ``labels`` argument in the extension function, as follows: + +.. code:: python + + >>> doc._.numerize(labels=['MONEY']) # only numerize entities of type 'MONEY' + {over two million dollars: 'over 2000000 dollars'} + + +The extension is available for tokens and spans as well. + +.. 
code:: python + + >>> two_million = doc[-4:-2] # span corresponding to "two million" + >>> two_million._.numerize() + '2000000' + >>> quarter = doc[6] # token corresponding to "quarter" + >>> quarter._.numerized + '1/4' + + Extras ------ diff --git a/numerizer/__init__.py b/numerizer/__init__.py index 39dda3a..22b1952 100644 --- a/numerizer/__init__.py +++ b/numerizer/__init__.py @@ -1 +1 @@ -from .numerizer import numerize # NOQA: F401 +from .numerizer import numerize, spacy_numerize # NOQA: F401 diff --git a/numerizer/numerizer.py b/numerizer/numerizer.py index 652bd6f..c5f9603 100644 --- a/numerizer/numerizer.py +++ b/numerizer/numerizer.py @@ -1,10 +1,16 @@ import re from . import consts -import sys +try: + import spacy + nlp = spacy.load('en_core_web_sm') + SPACY_INSTALLED = True +except ImportError: + SPACY_INSTALLED = False HYPHENATED = re.compile(r' +|([^\d])-([^\d])') isub = lambda x, y, s: re.sub(x, y, s, flags=re.IGNORECASE) # noqa: E731 +SPACY_ENT_LABELS = ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'CARDINAL', 'ORDINAL'] # Replacement regular expressions - to be used only in `re.sub` @@ -259,9 +265,77 @@ def _repl_frac_cleanup(m): return s -def main(): - print(numerize(sys.argv[1])) # NOQA - - -if __name__ == "__main__": - main() +def _span_numerize(span): + return numerize(span.text) + + +def spacy_numerize(doc, labels='all', retokenize=False): + """Numerize a spacy document. + + Parameters + ---------- + doc : spacy.tokens.Doc + The SpaCy document to be numerized + labels : str / list, optional + The list of entity labels to be processed for numerization. + By default, all numeric tokens + (['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'CARDINAL', 'ORDINAL']) + are numerized. Any subset of this list can be specified to restrict + the types of entities to be numerized. + retokenize: bool, optional + If True, the original document is retokenized such that the span corresponding + to each numerized entity becomes a single token. 
+ + Examples + -------- + >>> from spacy import load + >>> nlp = load('en_core_web_sm') + >>> spacy_numerize(nlp('The Hogwarts Express is at platform nine and three quarters.')) + {nine and three quarters: '9.75'} + >>> spacy_numerize( + ... nlp('Their revenue has been a billion dollars, as of six months ago.'), + ... labels=['MONEY'] + ... ) + {a billion dollars: '1000000000 dollars'} + >>> doc = nlp('The Hogwarts Express is at platform nine and three quarters.') + >>> spacy_numerize(doc, retokenize=True) + >>> [(c.text, c._.numerized) for c in doc] + [('The', 'The'), + ('Hogwarts', 'Hogwarts'), + ('Express', 'Express'), + ('is', 'is'), + ('at', 'at'), + ('platform', 'platform'), + ('nine and three quarters', '9.75'), + ('.', '.')] + """ + if not SPACY_INSTALLED: + import warnings + warnings.warn('SpaCy is not installed. Please pip install spacy.') + return + if labels == 'all': + labels = SPACY_ENT_LABELS + elif not labels: + return numerize(doc.text) + numerized_spans = {span: span._.numerize() for span in doc.ents if span.label_ in labels} + if not retokenize: + return numerized_spans + with doc.retokenize() as retokenizer: + for span, numerized in numerized_spans.items(): + retokenizer.merge(span, attrs={'_': {'numerized': numerized}}) + return doc + + +def _span_setter(token, numerized): return # NOQA: E704 + + +def register_extension(): + if SPACY_INSTALLED: + spacy.tokens.Token.set_extension( + 'numerized', getter=_span_numerize, + setter=_span_setter) + spacy.tokens.Span.set_extension('numerize', method=_span_numerize) + spacy.tokens.Doc.set_extension('numerize', method=spacy_numerize) + + +register_extension() diff --git a/setup.py b/setup.py index ad48ffc..5f1d8a9 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ 'Documentation': 'https://github.com/jaidevd/numerizer/tree/master/README.rst', 'Source Code': 'https://github.com/jaidevd/numerizer' } -VERSION = '0.1.5' +VERSION = '0.2.0' # Requirements install_requires = [] diff --git a/test_numerize.py 
b/test_numerize.py index 83ee3d4..01b061e 100644 --- a/test_numerize.py +++ b/test_numerize.py @@ -1,5 +1,8 @@ from numerizer import numerizer as num +from spacy import load +from spacy.tokens import Token numerize = num.numerize +nlp = load('en_core_web_sm') def test_init(): @@ -222,3 +225,40 @@ def test_numerize_big_prefixes(): def test_misc(): assert numerize( 'two hundred twenty five thousand seven hundred and fifty-five') == '225755' + + +# Test the spacy extensions + +def test_spacy_default(): + doc = nlp('The Hogwarts Express is at platform nine and three quarters.') + numerized = doc._.numerize() + assert isinstance(numerized, dict) + assert len(numerized) == 1 + key, val = numerized.popitem() + assert key.text == 'nine and three quarters' + assert val == '9.75' + + +def test_entity_filters(): + doc = nlp(""" + Their revenue has been a billion dollars, as of six months ago. + The next quarter is not so promising.""") + numerized = doc._.numerize(labels=['MONEY']) + assert len(numerized) == 1 + key, val = numerized.popitem() + assert key.text == 'a billion dollars' + assert val == '1000000000 dollars' + + +def test_retokenize(): + doc = nlp('The Hogwarts Express is at platform nine and three quarters.') + doc._.numerize(retokenize=True) + assert isinstance(doc[-2], Token) + assert doc[-2].text == 'nine and three quarters' + assert doc[-2]._.numerized == '9.75' + + +def test_span_token_extensions(): + doc = nlp('The projected revenue for the next quarter is over two million dollars.') + assert doc[-4:-2]._.numerize() == '2000000' + assert doc[6]._.numerized == '1/4'