From 50c7dc8806a070eb015a00acae670cbd17bd1150 Mon Sep 17 00:00:00 2001
From: Kevin Klein <7267523+kklein@users.noreply.github.com>
Date: Mon, 6 May 2024 10:40:09 +0200
Subject: [PATCH] Implement function returning a MetaLearner factory based on
 prefix, e.g. mapping `"T"` to `TLearner` (#61)

Co-authored-by: kklein </>
---
 conda.recipe/recipe.yaml |   1 +
 metalearners/utils.py    |  15 +++++
 tests/test__utils.py     | 127 ++++++++++++++++++++++++++++++++++++++
 tests/test_utils.py      | 130 ++++-----------------------------------
 4 files changed, 155 insertions(+), 118 deletions(-)
 create mode 100644 metalearners/utils.py
 create mode 100644 tests/test__utils.py

diff --git a/conda.recipe/recipe.yaml b/conda.recipe/recipe.yaml
index 6a354e19..b21f4fa8 100644
--- a/conda.recipe/recipe.yaml
+++ b/conda.recipe/recipe.yaml
@@ -33,6 +33,7 @@ tests:
   - python:
       imports:
         - metalearners
+        - metalearners.utils
         - metalearners.cross_fit_estimator
         - metalearners.data_generation
         - metalearners.outcome_functions
diff --git a/metalearners/utils.py b/metalearners/utils.py
new file mode 100644
index 00000000..0076b870
--- /dev/null
+++ b/metalearners/utils.py
@@ -0,0 +1,15 @@
+# Copyright (c) QuantCo 2024-2024
+# SPDX-License-Identifier: LicenseRef-QuantCo
+
+from metalearners.metalearner import MetaLearner
+from metalearners.tlearner import TLearner
+
+
+def metalearner_factory(metalearner_prefix: str) -> type[MetaLearner]:
+    """Map a prefix such as ``"T"`` to its MetaLearner class, else raise ValueError."""
+    match metalearner_prefix:
+        case "T":
+            return TLearner
+    raise ValueError(  # repr keeps an empty or ``None`` prefix visible in the message
+        f"No MetaLearner implementation found for prefix {metalearner_prefix!r}."
+    )
diff --git a/tests/test__utils.py b/tests/test__utils.py
new file mode 100644
index 00000000..7b1211de
--- /dev/null
+++ b/tests/test__utils.py
@@ -0,0 +1,127 @@
+# Copyright (c) QuantCo 2024-2024
+# SPDX-License-Identifier: LicenseRef-QuantCo
+
+from contextlib import nullcontext as does_not_raise
+
+import numpy as np
+import pytest
+
+from metalearners._utils import (
+    check_probability,
+    check_propensity_score,
+    get_linear_dimension,
+)
+from metalearners.data_generation import generate_covariates
+
+
+@pytest.mark.parametrize("n_numericals", [0, 5, 10])
+@pytest.mark.parametrize("n_categoricals, n_categories", [(5, 5), (3, [3, 4, 10])])
+def test_get_linear_dimension(n_numericals, n_categoricals, n_categories, rng):
+    """Expect one dimension per numerical feature plus one per category."""
+    covariates, _, _ = generate_covariates(
+        1000,
+        n_numericals + n_categoricals,
+        n_categoricals=n_categoricals,
+        n_categories=n_categories,
+        rng=rng,
+    )
+    if isinstance(n_categories, int):
+        n_levels = n_categoricals * n_categories
+    else:
+        n_levels = sum(n_categories)
+    assert get_linear_dimension(covariates) == n_numericals + n_levels
+
+
+@pytest.mark.parametrize(
+    "p, expected",
+    [(np.array([2, 3]), (2,)), (np.array([[2, 3, 4], [34, 35, 66]]), (2, 3))],
+)
+def test_check_propensity_score_shape(p, expected):
+    """The shape error must quote the shape of the rejected array."""
+    # The expected text hard-codes two variants ("There are 2").
+    prefix = "One propensity score must be provided for each variant. There are 2 but "
+    suffix = f"the shape of the propensity scores is {expected}."
+    with pytest.raises(ValueError) as excinfo:
+        check_propensity_score(p)
+    assert excinfo.value.args[0] == prefix + suffix
+
+
+@pytest.mark.parametrize("check_kwargs", [None, {"force_all_finite": "allow-nan"}])
+def test_check_propensity_score_handle_nan(check_kwargs):
+    """Check how NaN in the second positional argument is treated.
+
+    With ``check_kwargs=None`` a ``ValueError`` mentioning "contains NaN" is
+    expected; with ``force_all_finite="allow-nan"`` the call must succeed.
+    """
+    scores = np.array([[0.2, 0.8], [0.4, 0.6]])
+    with_nan = np.array([[np.nan, 1], [2.0, 1]])
+    if check_kwargs is not None:
+        # Passing simply means that no exception escapes the call.
+        check_propensity_score(scores, with_nan, check_kwargs=check_kwargs)
+        return
+    with pytest.raises(ValueError) as excinfo:
+        check_propensity_score(scores, with_nan, check_kwargs=check_kwargs)
+    assert "contains NaN" in excinfo.value.args[0]
+
+
+@pytest.mark.parametrize(
+    "p, expected",
+    [
+        (np.array([[-0.2, 0.4], [0.4, 0.6], [0.9, 0.1]]), (-0.2, 0.9)),
+        (np.array([[0.2, 0.4], [0.4, 0.6], [0.9, 1.1]]), (0.2, 1.1)),
+    ],
+)
+def test_check_propensity_score_min_max(p, expected):
+    """Out-of-range scores raise, reporting the observed minimum and maximum."""
+    message = "Propensity scores have to be between 0 and 1. Minimum is "
+    message += f"{expected[0]:.4f} and maximum is {expected[1]:.4f}."
+    with pytest.raises(ValueError) as excinfo:
+        check_propensity_score(p)
+    assert excinfo.value.args[0] == message
+
+
+@pytest.mark.parametrize(
+    "p, expected",
+    [
+        (np.array([[0.2, 0.4], [0.4, 0.6], [0.9, 0.1]]), (0.6, 1)),
+        (np.array([[0.2, 0.8], [0.4, 0.6], [0.9, 0.4]]), (1, 1.3)),
+    ],
+)
+def test_check_propensity_score_sum_to_one(p, expected):
+    """Rows not summing to 1 raise, reporting the smallest and largest row sum."""
+    low, high = expected
+    message = "Propensity scores for all observations must sum to 1. Minimum is "
+    message += f"{low:.4f} and maximum is {high:.4f}."
+    with pytest.raises(ValueError) as excinfo:
+        check_propensity_score(p, sum_to_one=True)
+    assert excinfo.value.args[0] == message
+
+
+@pytest.mark.parametrize("value", [np.nan, -0.5, 0, 0.5, 1, 1.5])
+@pytest.mark.parametrize("zero_included", [False, True])
+@pytest.mark.parametrize("one_included", [False, True])
+def test_check_probability(value, zero_included, one_included):
+    """Exercise ``check_probability`` on NaN, interior and boundary values.
+
+    ``zero_included`` / ``one_included`` decide whether exactly 0 / exactly 1
+    pass; the same error text is expected for boundary and out-of-range values.
+    """
+    # NaN compares False against any bound, so both flags are False for it.
+    too_low = value < 0 if zero_included else value <= 0
+    too_high = value > 1 if one_included else value >= 1
+
+    if np.isnan(value):
+        expected_error = "Invalid input! Probability p should not be NaN."
+    elif too_low:
+        expected_error = "Probability p must be greater than or equal to 0."
+    elif too_high:
+        expected_error = "Probability p must be less than or equal to 1."
+    else:
+        expected_error = None
+
+    if expected_error is None:
+        context = does_not_raise()
+    else:
+        context = pytest.raises(ValueError, match=expected_error)  # type: ignore
+    with context:
+        check_probability(value, zero_included, one_included)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 7b1211de..ec480762 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,127 +1,21 @@
 # Copyright (c) QuantCo 2024-2024
 # SPDX-License-Identifier: LicenseRef-QuantCo
 
-from contextlib import nullcontext as does_not_raise
-
-import numpy as np
 import pytest
+from lightgbm import LGBMRegressor
 
-from metalearners._utils import (
-    check_probability,
-    check_propensity_score,
-    get_linear_dimension,
-)
-from metalearners.data_generation import generate_covariates
-
-
-@pytest.mark.parametrize("n_numericals", [0, 5, 10])
-@pytest.mark.parametrize("n_categoricals, n_categories", [(5, 5), (3, [3, 4, 10])])
-def test_get_linear_dimension(n_numericals, n_categoricals, n_categories, rng):
-    features, _, _ = generate_covariates(
-        1000,
-        n_numericals + n_categoricals,
-        n_categoricals=n_categoricals,
-        n_categories=n_categories,
-        rng=rng,
-    )
-    dim = get_linear_dimension(features)
-    if isinstance(n_categories, int):
-        total_categories = n_categoricals * n_categories
-    else:
-        total_categories = sum(n_categories)
-    assert dim == n_numericals + total_categories
-
-
-@pytest.mark.parametrize(
-    "p, expected",
-    [(np.array([2, 3]), (2,)), (np.array([[2, 3, 4], [34, 35, 66]]), (2, 3))],
-)
-def test_check_propensity_score_shape(p, expected):
-    with pytest.raises(ValueError) as e:
-        check_propensity_score(p)
-    assert (
-        e.value.args[0]
-        == f"One propensity score must be provided for each variant. There are 2 but "
-        f"the shape of the propensity scores is {expected}."
-    )
-
-
-@pytest.mark.parametrize("check_kwargs", [None, {"force_all_finite": "allow-nan"}])
-def test_check_propensity_score_handle_nan(check_kwargs):
-    if check_kwargs is None:
-        with pytest.raises(ValueError) as e:
-            check_propensity_score(
-                np.array([[0.2, 0.8], [0.4, 0.6]]),
-                np.array([[np.nan, 1], [2.0, 1]]),
-                check_kwargs=check_kwargs,
-            )
-        assert "contains NaN" in e.value.args[0]
-    else:
-        check_propensity_score(
-            np.array([[0.2, 0.8], [0.4, 0.6]]),
-            np.array([[np.nan, 1], [2.0, 1]]),
-            check_kwargs=check_kwargs,
-        )
-
-
-@pytest.mark.parametrize(
-    "p, expected",
-    [
-        (np.array([[-0.2, 0.4], [0.4, 0.6], [0.9, 0.1]]), (-0.2, 0.9)),
-        (np.array([[0.2, 0.4], [0.4, 0.6], [0.9, 1.1]]), (0.2, 1.1)),
-    ],
-)
-def test_check_propensity_score_min_max(p, expected):
-    with pytest.raises(ValueError) as e:
-        check_propensity_score(p)
-    assert (
-        e.value.args[0] == f"Propensity scores have to be between 0 and 1. Minimum is "
-        f"{expected[0]:.4f} and maximum is {expected[1]:.4f}."
-    )
+from metalearners.metalearner import MetaLearner
+from metalearners.utils import metalearner_factory
 
 
-@pytest.mark.parametrize(
-    "p, expected",
-    [
-        (np.array([[0.2, 0.4], [0.4, 0.6], [0.9, 0.1]]), (0.6, 1)),
-        (np.array([[0.2, 0.8], [0.4, 0.6], [0.9, 0.4]]), (1, 1.3)),
-    ],
-)
-def test_check_propensity_score_sum_to_one(p, expected):
-    with pytest.raises(ValueError) as e:
-        check_propensity_score(p, sum_to_one=True)
-    assert (
-        e.value.args[0]
-        == f"Propensity scores for all observations must sum to 1. Minimum is "
-        f"{expected[0]:.4f} and maximum is {expected[1]:.4f}."
-    )
+@pytest.mark.parametrize("prefix", ["T"])
+def test_metalearner_factory_smoke(prefix):
+    """A known prefix yields a class that instantiates to a ``MetaLearner``."""
+    init_kwargs = {"nuisance_model_factory": LGBMRegressor, "is_classification": False}
+    assert isinstance(metalearner_factory(prefix)(**init_kwargs), MetaLearner)
 
 
-@pytest.mark.parametrize("value", [np.nan, -0.5, 0, 0.5, 1, 1.5])
-@pytest.mark.parametrize("zero_included", [False, True])
-@pytest.mark.parametrize("one_included", [False, True])
-def test_check_probability(value, zero_included, one_included):
-    if np.isnan(value):
-        context = pytest.raises(
-            ValueError, match="Invalid input! Probability p should not be NaN."
-        )
-    elif zero_included and value < 0:
-        context = pytest.raises(
-            ValueError, match="Probability p must be greater than or equal to 0."
-        )
-    elif not zero_included and value <= 0:
-        context = pytest.raises(
-            ValueError, match="Probability p must be greater than or equal to 0."
-        )
-    elif one_included and value > 1:
-        context = pytest.raises(
-            ValueError, match="Probability p must be less than or equal to 1."
-        )
-    elif not one_included and value >= 1:
-        context = pytest.raises(
-            ValueError, match="Probability p must be less than or equal to 1."
-        )
-    else:
-        context = does_not_raise()  # type: ignore
-    with context:
-        check_probability(value, zero_included, one_included)
+@pytest.mark.parametrize("prefix", ["", "H", None])
+def test_metalearner_factory_raises(prefix):
+    with pytest.raises(ValueError, match="No MetaLearner implementation found"):
+        metalearner_factory(prefix)  # empty, unknown and None prefixes must all fail