diff --git a/.gitignore b/.gitignore index 7f253e9..0f65a6b 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,5 @@ _build # mypy .mypy_cache/ + +.DS_Store \ No newline at end of file diff --git a/README.rst b/README.rst index c477f5d..af5ed35 100644 --- a/README.rst +++ b/README.rst @@ -6,8 +6,8 @@ text2num ``text2num`` is a python package that provides functions and parser classes for: -- parsing numbers expressed as words in French, English and Spanish and convert them to integer values; -- detect ordinal, cardinal and decimal numbers in a stream of French, English or Spanish words and get their decimal digit representations. Spanish does not support ordinal numbers yet. +- parsing numbers expressed as words in French, English, Spanish and Portuguese and convert them to integer values; +- detect ordinal, cardinal and decimal numbers in a stream of French, English, Spanish and Portuguese words and get their decimal digit representations. Spanish does not support ordinal numbers yet. Compatibility ------------- @@ -88,6 +88,20 @@ Spanish examples: 53243724 +Portuguese examples: + +.. code-block:: python + + >>> from text_to_num import text2num + >>> text2num("trinta e dois", "pt") + 32 + + >>> text2num("mil novecentos e seis", "pt") + 1906 + + >>> text2num("vinte e quatro milhões duzentos mil quarenta e sete", "pt") + 24200047 + Find and transcribe ~~~~~~~~~~~~~~~~~~~ @@ -142,6 +156,27 @@ Spanish (ordinals not supported): >>> alpha2digit(text, "es") 'Tenemos +20 grados dentro y -15 fuera.' + +Portuguese: + +.. code-block:: python + + >>> from text_to_num import alpha2digit + + >>> text = "Comprámos vinte e cinco vacas, doze galinhas e cento vinte e cinco vírgula quarenta kg de batatas." + >>> alpha2digit(text, "pt") + 'Comprámos 25 vacas, 12 galinhas e 125,40 kg de batatas.' + + >>> text = "Temos mais vinte graus dentro e menos quinze fora." + >>> alpha2digit(text, "pt") + 'Temos +20 graus dentro e -15 fora.' 
+ + >>> text = "Ordinais: quinto, terceiro, vigésima, vigésimo primeiro, centésimo quarto" + >>> alpha2digit(text, "pt") + 'Ordinais: 5º, terceiro, 20ª, 21º, 104º' + + + Read the complete documentation on `ReadTheDocs `_. Contribute diff --git a/setup.py b/setup.py index 102ca57..6739062 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages -VERSION = "2.2.1" +VERSION = "2.3.0" def readme(): @@ -12,7 +12,7 @@ def readme(): setup( name="text2num", version=VERSION, - description="Parse and convert numbers written in French, Spanish or English into their digit representation.", + description="Parse and convert numbers written in French, Spanish, English or Portuguese into their digit representation.", long_description=readme(), classifiers=[ "Development Status :: 5 - Production/Stable", @@ -22,9 +22,10 @@ def readme(): "Topic :: Text Processing :: Filters", "Natural Language :: French", "Natural Language :: English", - "Natural Language :: Spanish" + "Natural Language :: Spanish", + "Natural Language :: Portuguese" ], - keywords="French, Spanish and English NLP words-to-numbers", + keywords="French, Spanish, English and Portuguese NLP words-to-numbers", url="https://github.com/allo-media/text2num", author="Allo-Media", author_email="contact@allo-media.fr", diff --git a/tests/test_text_to_num_es.py b/tests/test_text_to_num_es.py index 4b6567d..7633952 100644 --- a/tests/test_text_to_num_es.py +++ b/tests/test_text_to_num_es.py @@ -52,7 +52,8 @@ def test_text2num(self): self.assertEqual(text2num("dos mil", "es"), 2000) self.assertEqual(text2num("dos mil noventa y nueve", "es"), 2099) self.assertEqual(text2num("nueve mil novecientos noventa y nueve", "es"), 9999) - self.assertEqual(text2num("novecientos noventa y nueve mil novecientos noventa y nueve", "es"), 999999) + self.assertEqual(text2num("novecientos noventa y nueve mil novecientos noventa y nueve", "es"), + 999999) long_text = "novecientos noventa y nueve mil 
novecientos noventa y nueve millones novecientos noventa y nueve mil novecientos noventa y nueve" self.assertEqual(text2num(long_text, "es"), 999999999999) @@ -154,7 +155,7 @@ def test_alpha2digit_decimals(self): self.assertEqual(alpha2digit(source, "es"), expected) self.assertEqual(alpha2digit("coma quince", "es"), "0.15") - #self.assertEqual(alpha2digit("cero coma quince", "es"), "0.15") # TODO + # self.assertEqual(alpha2digit("cero coma quince", "es"), "0.15") # TODO def test_alpha2digit_signed(self): source = "Tenemos mas veinte grados dentro y menos quince fuera." @@ -176,7 +177,6 @@ def test_accent(self): self.assertEqual(alpha2digit("Un millon", "es"), "1000000") self.assertEqual(alpha2digit("Un millón", "es"), "1000000") - # ord2card NOT implemented in Spanish """ def test_second_as_time_unit_vs_ordinal(self): diff --git a/tests/test_text_to_num_pt.py b/tests/test_text_to_num_pt.py new file mode 100644 index 0000000..d16996e --- /dev/null +++ b/tests/test_text_to_num_pt.py @@ -0,0 +1,219 @@ +# MIT License + +# Copyright (c) 2018-2019 Groupe Allo-Media + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +""" +Test the ``text_to_num`` library. +""" +from unittest import TestCase +from text_to_num import alpha2digit, text2num + + +class TestTextToNumPT(TestCase): + def test_text2num(self): + self.assertEqual(text2num("zero", "pt"), 0) + self.assertEqual(text2num("um", "pt"), 1) + self.assertEqual(text2num("oito", "pt"), 8) + self.assertEqual(text2num("dez", "pt"), 10) + self.assertEqual(text2num("onze", "pt"), 11) + self.assertEqual(text2num("dezanove", "pt"), 19) + self.assertEqual(text2num("vinte", "pt"), 20) + self.assertEqual(text2num("vinte e um", "pt"), 21) + self.assertEqual(text2num("trinta", "pt"), 30) + self.assertEqual(text2num("trinta e um", "pt"), 31) + self.assertEqual(text2num("trinta e dois", "pt"), 32) + self.assertEqual(text2num("trinta e três", "pt"), 33) + self.assertEqual(text2num("trinta e nove", "pt"), 39) + self.assertEqual(text2num("noventa e nove", "pt"), 99) + self.assertEqual(text2num("cem", "pt"), 100) + self.assertEqual(text2num("cento e um", "pt"), 101) + self.assertEqual(text2num("duzentos", "pt"), 200) + self.assertEqual(text2num("duzentos e um", "pt"), 201) + self.assertEqual(text2num("mil", "pt"), 1000) + self.assertEqual(text2num("mil e um", "pt"), 1001) + self.assertEqual(text2num("dois mil", "pt"), 2000) + self.assertEqual(text2num("dois mil noventa e nove", "pt"), 2099) + self.assertEqual( + text2num("nove mil novecentos noventa e nove", "pt"), 9999) + self.assertEqual(text2num( + "novecentos noventa e nove mil novecentos noventa e nove", "pt"), 999999) + + self.assertEqual(alpha2digit("um vírgula um", "pt"), "1,1") + self.assertEqual(alpha2digit( + "um vírgula quatrocentos e um", "pt"), "1,401") + + # fail +# self.assertEqual(alpha2digit("zero vírgula 
cinco", "pt"), "0,5") + + # test1 = "cincuenta y tres mil veinte millones doscientos cuarenta y tres mil setecientos veinticuatro" + # self.assertEqual(text2num(test1, "pt"), 53_020_243_724) + + # test2 = ( + # "cincuenta y un millones quinientos setenta y ocho mil trescientos dos" + # ) + # self.assertEqual(text2num(test2, "pt"), 51_578_302) + + test3 = "oitenta e cinco" + self.assertEqual(text2num(test3, "pt"), 85) + + test4 = "oitenta e um" + self.assertEqual(text2num(test4, "pt"), 81) + + self.assertEqual(text2num("quinze", "pt"), 15) + self.assertEqual(text2num("cento quinze", "pt"), 115) + self.assertEqual(text2num("setenta e cinco mil", "pt"), 75000) + self.assertEqual(text2num("mil novecentos vinte", "pt"), 1920) + + def test_text2num_exc(self): + self.assertRaises(ValueError, text2num, "mil mil duzentos", "pt") + self.assertRaises(ValueError, text2num, "sessenta quinze", "pt") + self.assertRaises(ValueError, text2num, "sessenta cem", "pt") + + def test_text2num_zeroes(self): + self.assertEqual(text2num("zero", "pt"), 0) + self.assertEqual(text2num("zero oito", "pt"), 8) + self.assertEqual(text2num("zero zero cento vinte e cinco", "pt"), 125) + self.assertRaises(ValueError, text2num, "cinco zero", "pt") + self.assertRaises(ValueError, text2num, "cinquenta zero três", "pt") + self.assertRaises(ValueError, text2num, "cinquenta e três zero", "pt") + + def test_alpha2digit_integers(self): + source = "vinte cinco vacas, doze galinhas e cento vinte e cinco kg de batatas." + expected = "25 vacas, 12 galinhas e 125 kg de batatas." + self.assertEqual(alpha2digit(source, "pt"), expected) + + source = "mil duzentos sessenta e seis dólares." + expected = "1266 dólares." + self.assertEqual(alpha2digit(source, "pt"), expected) + + source = "um dois três quatro vinte quinze" + expected = "1 2 3 4 20 15" + self.assertEqual(alpha2digit(source, "pt"), expected) + + source = "vinte e um, trinta e um." + expected = "21, 31." 
+ self.assertEqual(alpha2digit(source, "pt"), expected) + + def test_relaxed(self): + source = "um dois três quatro trinta e cinco." + expected = "1 2 3 4 35." + self.assertEqual(alpha2digit(source, "pt", relaxed=True), expected) + + source = "um dois três quatro vinte, cinco." + expected = "1 2 3 4 20, 5." + self.assertEqual(alpha2digit(source, "pt", relaxed=True), expected) + + source = "trinta e quatro = trinta quatro" + expected = "34 = 34" + self.assertEqual(alpha2digit(source, "pt", relaxed=True), expected) + + def test_alpha2digit_formal(self): + source = "mais trinta e três nove sessenta zero seis doze vinte e um" + expected = "+33 9 60 06 12 21" + self.assertEqual(alpha2digit(source, "pt"), expected) + + source = "zero nove sessenta zero seis doze vinte e um" + expected = "09 60 06 12 21" + self.assertEqual(alpha2digit(source, "pt"), expected) + + def test_and(self): + source = "cinquenta sessenta trinta onze" + expected = "50 60 30 11" + self.assertEqual(alpha2digit(source, "pt"), expected) + + def test_pt_conjunction(self): + source = "duzentos e quarenta e quatro" + expected = "244" + self.assertEqual(alpha2digit(source, "pt"), expected) + + source = "dois mil e vinte" + expected = "2020" + self.assertEqual(alpha2digit(source, "pt"), expected) + + source = "mil novecentos e oitenta e quatro" + expected = "1984" + self.assertEqual(alpha2digit(source, "pt"), expected) + + source = "mil e novecentos" + expected = "1900" + self.assertEqual(alpha2digit(source, "pt"), expected) + + source = "dois mil cento e vinte cinco" + expected = "2125" + self.assertEqual(alpha2digit(source, "pt"), expected) + + source = "Trezentos e setenta e oito milhões vinte e sete mil trezentos e doze" + expected = "378027312" + self.assertEqual(alpha2digit(source, "pt"), expected) + + def test_alpha2digit_zero(self): + source = "treze mil zero noventa" + expected = "13000 090" + self.assertEqual(alpha2digit(source, "pt"), expected) + + self.assertEqual(alpha2digit("zero", "pt"), 
"0") + + def test_alpha2digit_decimals(self): + source = ( + "doze vírgula noventa e nove, cento e vinte vírgula zero cinco, " + "um vírgula duzentos e trinta e seis, um vírgula dois três seis." + ) + expected = "12,99, 120,05, 1,236, 1,2 3 6." + self.assertEqual(alpha2digit(source, "pt"), expected) + + self.assertEqual(alpha2digit("vírgula quinze", "pt"), "0,15") + # self.assertEqual(alpha2digit("zero vírgula quinze", "pt"), "0,15") # TODO + + def test_alpha2digit_signed(self): + source = "Temos mais vinte graus dentro e menos quinze fora." + expected = "Temos +20 graus dentro e -15 fora." + self.assertEqual(alpha2digit(source, "pt"), expected) + + def test_one_as_noun_or_article(self): + source = "Um momento por favor! trinta e um gatos. Um dois três quatro!" + expected = "Um momento por favor! 31 gatos. 1 2 3 4!" + self.assertEqual(alpha2digit(source, "pt"), expected) + # End of segment + source = "Nem um. Um um. Trinta e um" + expected = "Nem um. 1 1. 31" + self.assertEqual(alpha2digit(source, "pt"), expected) + + def test_accent(self): + self.assertEqual(text2num("um milhao", "pt"), 1000000) + self.assertEqual(text2num("um milhão", "pt"), 1000000) + self.assertEqual(alpha2digit("Um milhao", "pt"), "1000000") + self.assertEqual(alpha2digit("Um milhão", "pt"), "1000000") + + def test_second_as_time_unit_vs_ordinal(self): + source = "Um segundo por favor! Vigésimo segundo é diferente de vinte segundos." + expected = "Um segundo por favor! 22º é diferente de 20 segundos." 
+ self.assertEqual(alpha2digit(source, "pt"), expected) + + def test_alpha2digit_ordinals(self): + source = "Ordinais: primeiro, quinto, terceiro, vigésima, vigésimo primeiro, centésimo quadragésimo quinto" + expected = "Ordinais: primeiro, 5º, terceiro, 20ª, 21º, 145º" + self.assertEqual(alpha2digit(source, "pt"), expected) + + def test_alpha2digit_ordinals_more(self): + source = "A décima quarta brigada do exército português, juntamento com o nonagésimo sexto regimento britânico, bateu o centésimo vigésimo sétimo regimento de infantaria de Napoleão" + expected = "A 14ª brigada do exército português, juntamento com o 96º regimento britânico, bateu o 127º regimento de infantaria de Napoleão" + self.assertEqual(alpha2digit(source, "pt"), expected) diff --git a/text_to_num/lang/__init__.py b/text_to_num/lang/__init__.py index 84f90cb..ba58ef2 100644 --- a/text_to_num/lang/__init__.py +++ b/text_to_num/lang/__init__.py @@ -28,6 +28,6 @@ from .french import French from .english import English from .spanish import Spanish +from .portuguese import Portuguese - -LANG = {"fr": French(), "en": English(), "es": Spanish()} +LANG = {"fr": French(), "en": English(), "es": Spanish(), "pt": Portuguese()} diff --git a/text_to_num/lang/english.py b/text_to_num/lang/english.py index a9dcfa4..613eee1 100644 --- a/text_to_num/lang/english.py +++ b/text_to_num/lang/english.py @@ -146,12 +146,12 @@ def ord2card(self, word: str) -> Optional[str]: source = RAD_MAP[source] elif source.endswith("ie"): source = source[:-2] + "y" - elif source.endswith('fif'): # fifth -> five - source = source[:-1] + 've' - elif source.endswith('eigh'): # eighth -> eight - source = source + 't' - elif source.endswith('nin'): # ninth -> nine - source = source + 'e' + elif source.endswith("fif"): # fifth -> five + source = source[:-1] + "ve" + elif source.endswith("eigh"): # eighth -> eight + source = source + "t" + elif source.endswith("nin"): # ninth -> nine + source = source + "e" if source not in 
self.NUMBERS: return None return source diff --git a/text_to_num/lang/portuguese.py b/text_to_num/lang/portuguese.py new file mode 100644 index 0000000..b3c73f2 --- /dev/null +++ b/text_to_num/lang/portuguese.py @@ -0,0 +1,352 @@ +# MIT License + +# Copyright (c) 2018-2019 Groupe Allo-Media + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import re +from typing import Dict, Optional, Set, Tuple, List + +from .base import Language + +# +# CONSTANTS +# Built once on import. 
+# + +# Those words multiplies lesser numbers (see Rules) +# Exception: "(de) milliards" that can multiply bigger numbers ("milliards de milliards") +MULTIPLIERS = { + "mil": 1000, + "milhar": 1000, + "milhares": 1000, + "milhão": 1000000, + "milhao": 1000000, + "milhões": 1000000, +} + + +# Units are terminals (see Rules) +UNITS: Dict[str, int] = { + word: value + for value, word in enumerate( + "um dois três quatro cinco seis sete oito nove".split(), 1 + ) +} +# Unit variants +UNITS["uma"] = 1 +UNITS["duas"] = 2 + +# Single tens are terminals (see Rules) +# exact find +STENS: Dict[str, int] = { + word: value + for value, word in enumerate( + "dez onze doze treze quatorze quinze dezasseis dezassete dezoito dezanove".split(), + 10, + ) +} + +# Ten multiples +# Ten multiples may be followed by a unit only; +# the number is the multiplier of the first token +MTENS: Dict[str, int] = { + word: value * 10 + for value, word in enumerate( + "vinte trinta quarenta cinquenta sessenta setenta oitenta noventa".split(), 2 + ) +} + +# Ten multiples that can be combined with STENS +MTENS_WSTENS: Set[str] = set() + +HUNDRED = { + "cem": 100, + "centena": 100, + "cento": 100, + "centenas": 100, + "duzentos": 200, + "duzentas": 200, + "trezentos": 300, + "trezentas": 300, + "quatrocentos": 400, + "quatrocentas": 400, + "quinhentos": 500, + "quinhentas": 500, + "seiscentos": 600, + "seiscentas": 600, + "setecentos": 700, + "setecentas": 700, + "oitocentos": 800, + "oitocentas": 800, + "novecentos": 900, + "novecentas": 900, +} + +# Composites are tens already composed with terminals in one word. +# Composites are terminals. 
+ +COMPOSITES: Dict[str, int] = {} + +# All number words +NUMBERS = MULTIPLIERS.copy() +NUMBERS.update(UNITS) +NUMBERS.update(STENS) +NUMBERS.update(MTENS) +NUMBERS.update(HUNDRED) +NUMBERS.update(COMPOSITES) + + +class Portuguese(Language): + + ISO_CODE = "pt" + MULTIPLIERS = MULTIPLIERS + UNITS = UNITS + STENS = STENS + MTENS = MTENS + MTENS_WSTENS = MTENS_WSTENS + HUNDRED = HUNDRED + NUMBERS = NUMBERS + SIGN = {"mais": "+", "menos": "-"} + ZERO = {"zero"} + DECIMAL_SEP = "vírgula" + DECIMAL_SYM = "," + + # pt conjunction rules are complex + # https://duvidas.dicio.com.br/como-escrever-numeros-por-extenso/ + AND_NUMS = { + "um", + "uma", + "duas", + "dois", + "três", + "quatro", + "cinco", + "seis", + "sete", + "oito", + "nove", + "dez", + "onze", + "doze", + "treze", + "quatorze", + "quinze", + "dezasseis", + "dezassete", + "dezoito", + "dezanove", + "vinte", + "trinta", + "quarenta", + "cinquenta", + "sessenta", + "setenta", + "oitenta", + "noventa", + "cem", + "duzentos", + "trezentos", + "quatrocentos", + "quinhentos", + "seiscentos", + "setecentos", + "oitocentos", + "novecentos", + } + + AND = "e" + NEVER_IF_ALONE = {"um", "uma"} + + # Relaxed composed numbers (two-words only) + # start => (next, target) + RELAXED: Dict[str, Tuple[str, str]] = {} + + PT_ORDINALS = { + "primeir": "um", + "segund": "dois", + "terceir": "três", + "quart": "quatro", + "quint": "cinco", + "sext": "seis", + "sétim": "sete", + "oitav": "oito", + "non": "nove", + "décim": "dez", + "vigésim": "vinte", + "trigésim": "trinta", + "quadragésim": "quarenta", + "quinquagésim": "cinquenta", + "sexagésim": "sessenta", + "septagésim": "setenta", "septuagésim": "setenta", # legacy key kept next to the standard spelling + "octagésim": "oitenta", "octogésim": "oitenta", # legacy key kept next to the standard spelling + "nonagésim": "noventa", + "centésim": "cem", + "ducentésim": "duzentos", # each hundreds ordinal maps to its own cardinal found in HUNDRED + "trecentésim": "trezentos", + "quadrigentésim": "quatrocentos", + "quingentésim": "quinhentos", + "sexgentésim": "seiscentos", + "setingentésim": "setecentos", + "octigentésim": "oitocentos", + "nonigentésim": "novecentos", "milésim": "mil", # 900th is not 1000th; 1000th gets its own stem + "milionésim": "milhão", + } + + def ord2card(self, word: str) -> 
Optional[str]: + """Convert ordinal number to cardinal. + + The final letter of ``word`` (its o/a ending) is dropped and the stem is looked up + in ``PT_ORDINALS``; return None when the stem is not a known ordinal. + """ + + ord_ = self.PT_ORDINALS.get(word[:-1], None) + return ord_ + + def num_ord(self, digits: str, original_word: str) -> str: + """Add suffix to number in digits to make an ordinal + + Portuguese language: 22° : vigésimo segundo: 20 + 2 ° + so if there is a couple of ordinals found, only add suffix to the last one + """ + + return f"{digits}º" if original_word.endswith("o") else f"{digits}ª" + + def normalize(self, word: str) -> str: + return word + + +SEGMENT_BREAK = re.compile(r"\s*[\.,;\(\)…\[\]:!\?]+\s*") + +SUB_REGEXES = [ + (re.compile(r"\b1\s"), "um "), + (re.compile(r"\b2\s"), "dois "), + (re.compile(r"\b1[\º\°]\b"), "primeiro"), + (re.compile(r"\b2[\º\°]\b"), "segundo"), + (re.compile(r"\b3[\º\°]\b"), "terceiro"), + (re.compile(r"\b1\ª\b"), "primeira"), + (re.compile(r"\b2\ª\b"), "segunda"), + (re.compile(r"\b3\ª\b"), "terceira"), +] + + +class OrdinalsMerger: + def merge_compound_ordinals_pt(self, text: str) -> str: + """join compound ordinal cases created by a text2num 1st pass + + Example: + 20° 7° -> 27° + + Greedy pusher: push along the token stream, + create a new ordinal sequence if an ordinal is found + stop sequence when no more ordinals are found + sum ordinal sequence + + """ + + segments = re.split(SEGMENT_BREAK, text) + punct = re.findall(SEGMENT_BREAK, text) + if len(punct) < len(segments): + punct.append("") + out_segments = [] + for segment, sep in zip(segments, punct): # loop over segments + tokens = [t for t in segment.split(" ") if len(t) > 0] + + pointer = 0 + tokens_ = [] + current_is_ordinal = False + seq = [] + + while pointer < len(tokens): + token = tokens[pointer] + if self.is_ordinal(token): # found an ordinal, push into new seq + current_is_ordinal = True + seq.append(self.get_cardinal(token)) + gender = self.get_gender(token) + else: + if current_is_ordinal is 
False: # add standard token + tokens_.append(token) + else: # close seq + ordinal = sum(seq) + tokens_.append(str(ordinal) + gender) + tokens_.append(token) + seq = [] + current_is_ordinal = False + pointer += 1 + + if current_is_ordinal is True: # close seq for single token expressions + ordinal = sum(seq) + tokens_.append(str(ordinal) + gender) + + tokens_ = self.text2num_style(tokens_) + segment = " ".join(tokens_) + sep + out_segments.append(segment) + + text = "".join(out_segments) + + return text + + @staticmethod + def is_ordinal(token: str) -> bool: + out = False + if re.fullmatch(r"\d+[º°ª]", token): # only digits followed by one ordinal mark, so "30°C" or "nº" are left alone + out = True + + if token in [ + "primeiro", + "primeira", + "segundo", + "segunda", + "terceiro", + "terceira", + ]: + out = True + return out + + @staticmethod + def get_cardinal(token: str) -> int: + out = 0 + try: + out = int(token[:-1]) + except ValueError: + if token[:-1] == "primeir": + out = 1 + elif token[:-1] == "segund": + out = 2 + elif token[:-1] == "terceir": + out = 3 + return out + + @staticmethod + def get_gender(token: str) -> str: + gender = token[-1] + if gender == "a": + gender = "ª" + if gender == "o": + gender = "º" + return gender + + @staticmethod + def text2num_style(tokens: List[str]) -> List[str]: + """convert a list of tokens to text2num_style, i.e. : 1 -> un/one/uno/um""" + + for regex in SUB_REGEXES: + tokens = [re.sub(regex[0], regex[1], token) for token in tokens] + + return tokens diff --git a/text_to_num/transforms.py b/text_to_num/transforms.py index ee4cf8f..2a52238 100644 --- a/text_to_num/transforms.py +++ b/text_to_num/transforms.py @@ -27,6 +27,11 @@ from .lang import LANG from .parsers import WordStreamValueParser, WordToDigitParser +from .lang.portuguese import OrdinalsMerger + +omg = OrdinalsMerger() +USE_PT_ORDINALS_MERGER = True + def look_ahead(sequence: Sequence[Any]) -> Iterator[Tuple[Any, Any]]: """Look-ahead iterator. 
@@ -67,12 +72,15 @@ def text2num(text: str, lang: str, relaxed: bool = False) -> int: def alpha2digit( text: str, lang: str, relaxed: bool = False, signed: bool = True ) -> str: - """Return the text of ``text`` with all the French spelled numbers converted to digits. + """Return the text of ``text`` with all the ``lang`` spelled numbers converted to digits. Takes care of punctuation. Set ``relaxed`` to True if you want to accept some disjoint numbers as compounds. Set ``signed`` to False if you don't want to produce signed numbers, that is, for example, if you prefer to get « moins 2 » instead of « -2 ». + """ + if lang not in LANG: + raise ValueError(f"Language not supported: {lang!r}") language = LANG[lang] segments = re.split(r"\s*[\.,;\(\)…\[\]:!\?]+\s*", text) punct = re.findall(r"\s*[\.,;\(\)…\[\]:!\?]+\s*", text) @@ -89,7 +97,9 @@ def alpha2digit( in_number = True elif in_number: out_tokens.append(num_builder.value) - num_builder = WordToDigitParser(language, relaxed=relaxed) + num_builder = WordToDigitParser( + language, relaxed=relaxed, signed=signed + ) in_number = num_builder.push(word.lower(), ahead and ahead.lower()) if not in_number: out_tokens.append(word) @@ -99,4 +109,7 @@ def alpha2digit( out_tokens.append(num_builder.value) out_segments.append(" ".join(out_tokens)) out_segments.append(sep) - return "".join(out_segments) + text = "".join(out_segments) + if lang == "pt" and USE_PT_ORDINALS_MERGER: + text = omg.merge_compound_ordinals_pt(text) + return text