diff --git a/requirements.txt b/requirements.txt
index 638e2f6..3af5ea0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,6 +7,7 @@ lxml==5.1.0
 numpy==1.22.4; python_version == '3.8'
 numpy==1.26.4; python_version >= '3.9'
 optbinning==0.19.0
 pandas==1.5.3
+pcre2==0.4.0
 scikit-learn
 scikit-lego==0.7.4
diff --git a/sklearn2pmml/preprocessing/regex.py b/sklearn2pmml/preprocessing/regex.py
index a6337db..74bde44 100644
--- a/sklearn2pmml/preprocessing/regex.py
+++ b/sklearn2pmml/preprocessing/regex.py
@@ -39,6 +39,33 @@ def matches(self, x):
 	def replace(self, replacement, x):
 		return self.pattern_.sub(replacement, x)
 
+class PCRE2Engine(RegExEngine):
+	"""Regular expression engine backed by the ``pcre2`` package.
+
+	Replacement strings use PCRE2 substitution syntax: ``$1`` refers to the
+	first capture group; backslash-digit sequences are copied verbatim.
+	"""
+
+	def __init__(self, pattern):
+		"""Compile ``pattern`` with ``pcre2``.
+
+		The import is local so that ``pcre2`` is only required when this
+		engine is actually instantiated.
+		"""
+		import pcre2
+
+		super(PCRE2Engine, self).__init__(pattern)
+		self.pattern_ = pcre2.compile(pattern)
+
+	def matches(self, x):
+		"""Return True if the pattern occurs anywhere in ``x``."""
+		# Fetching the first scan result is enough to decide; None means no match.
+		return next(self.pattern_.scan(x), None) is not None
+
+	def replace(self, replacement, x):
+		"""Return ``x`` with ``replacement`` substituted via PCRE2."""
+		return self.pattern_.substitute(replacement, x)
+
 def make_regex_engine(pattern):
 	try:
 		return PCREEngine(pattern)
diff --git a/sklearn2pmml/preprocessing/tests/test_regex.py b/sklearn2pmml/preprocessing/tests/test_regex.py
index 846db05..74fc798 100644
--- a/sklearn2pmml/preprocessing/tests/test_regex.py
+++ b/sklearn2pmml/preprocessing/tests/test_regex.py
@@ -1,6 +1,20 @@
-from sklearn2pmml.preprocessing.regex import REEngine
+from sklearn2pmml.preprocessing.regex import PCRE2Engine, REEngine
 from unittest import TestCase
 
+class PCRE2EngineTest(TestCase):
+
+	def test_matches(self):
+		engine = PCRE2Engine("ar?y")
+		self.assertTrue(engine.matches("January"))
+		self.assertFalse(engine.matches("March"))
+		self.assertTrue(engine.matches("May"))
+
+	def test_replace(self):
+		engine = PCRE2Engine(r"(\w)")
+		# PCRE2 replacement syntax: "$1" is a group reference, "\1" is literal text
+		self.assertEqual("P u p p y", engine.replace(r"$1 ", "Puppy").rstrip())
+		self.assertEqual(r"\1 \1 \1 \1 \1", engine.replace(r"\1 ", "Puppy").rstrip())
+
 class REEngineTest(TestCase):
 
 	def test_matches(self):
@@ -14,4 +28,6 @@ def test_matches(self):
 
 	def test_replace(self):
 		engine = REEngine(r"(\w)")
-		self.assertEqual("P u p p y", engine.replace(r"\1 ", "Puppy").strip())
+		# Python re replacement syntax: "\1" is a group reference, "$1" is literal text
+		self.assertEqual("$1 $1 $1 $1 $1", engine.replace(r"$1 ", "Puppy").rstrip())
+		self.assertEqual("P u p p y", engine.replace(r"\1 ", "Puppy").rstrip())