From 1d06386d986eae227ed8ac442078c45b57da67d5 Mon Sep 17 00:00:00 2001
From: Kevin Klein <7267523+kklein@users.noreply.github.com>
Date: Fri, 10 May 2024 12:29:49 +0200
Subject: [PATCH] Reduce test size (#73)

---
 .github/workflows/ci.yml      |  2 +-
 benchmarks/benchmark.py       |  2 +-
 benchmarks/readme.md          | 12 ++++++------
 tests/conftest.py             |  2 +-
 tests/test_data_generation.py |  2 +-
 tests/test_learner.py         | 28 +++++++++++++++-------------
 6 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 535d6f8..e3c02b6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -78,7 +78,7 @@ jobs:
         uses: quantco/pytest-action@v2
         with:
           report-title: Unit Tests - ${{ matrix.os == 'ubuntu-latest' && 'Linux' || 'Windows' }} - Python ${{ matrix.python-version }}
-          custom-arguments: -n auto --cov=metalearners --cov-report=xml --cov-report term-missing --color=yes
+          custom-arguments: --cov=metalearners --cov-report=xml --cov-report term-missing --color=yes
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v4.3.1
         with:
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
index 81bf8eb..5972719 100644
--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -38,7 +38,7 @@
 def _synthetic_data(
     is_classification,
     rng,
-    sample_size=1_000_000,
+    sample_size=100_000,
     n_numericals=25,
     test_fraction=0.2,
     propensity_score=0.3,
diff --git a/benchmarks/readme.md b/benchmarks/readme.md
index c79ae67..2cbc554 100644
--- a/benchmarks/readme.md
+++ b/benchmarks/readme.md
@@ -25,16 +25,16 @@ on ground truth CATEs:
 
 | T-learner                                                    | causalml_in_sample | causalml_oos | econml_in_sample | econml_oos | metalearners_in_sample | metalearners_oos |
 | :----------------------------------------------------------- | -----------------: | -----------: | ---------------: | ---------: | ---------------------: | ---------------: |
-| synthetic_data_continuous_outcome_binary_treatment_linear_te |          0.0121381 |      0.01212 |        0.0121381 |    0.01212 |              0.0124729 |          0.01212 |
-| synthetic_data_binary_outcome_binary_treatment_linear_te     |          0.0149216 |    0.0148903 |              nan |        nan |              0.0149779 |        0.0148903 |
+| synthetic_data_continuous_outcome_binary_treatment_linear_te |          0.0458966 |    0.0456347 |        0.0458966 |  0.0456347 |              0.0467864 |        0.0456347 |
+| synthetic_data_binary_outcome_binary_treatment_linear_te     |          0.0212419 |    0.0215154 |              nan |        nan |               0.021512 |        0.0215154 |
 | twins_pandas                                                 |            0.34843 |     0.362315 |              nan |        nan |               0.354783 |         0.348551 |
 | twins_numpy                                                  |           0.308362 |     0.345602 |              nan |        nan |               0.349543 |         0.345602 |
 
 | S-learner                                                     | causalml_in_sample | causalml_oos | econml_in_sample | econml_oos | metalearners_in_sample | metalearners_oos |
 | :------------------------------------------------------------ | -----------------: | -----------: | ---------------: | ---------: | ---------------------: | ---------------: |
-| synthetic_data_continuous_outcome_binary_treatment_linear_te  |            11.6777 |      11.6584 |          11.6777 |    11.6585 |                11.6778 |          11.6584 |
-| synthetic_data_binary_outcome_binary_treatment_linear_te      |           0.256848 |     0.256304 |              nan |        nan |               0.256859 |         0.256304 |
+| synthetic_data_continuous_outcome_binary_treatment_linear_te  |            14.5706 |      14.6248 |          14.5706 |    14.6248 |                14.5729 |          14.6248 |
+| synthetic_data_binary_outcome_binary_treatment_linear_te      |           0.229101 |     0.228616 |              nan |        nan |               0.229231 |           0.2286 |
 | twins_pandas                                                  |           0.314253 |     0.318554 |              nan |        nan |               0.371613 |         0.319028 |
 | twins_numpy                                                   |           0.314253 |     0.318554 |              nan |        nan |               0.361345 |         0.318554 |
-| synthetic_data_continuous_outcome_multi_treatment_linear_te   |                nan |          nan |          11.6025 |    11.5957 |                11.6025 |          11.5957 |
-| synthetic_data_continuous_outcome_multi_treatment_constant_te |                nan |          nan |       0.00499728 | 0.00499728 |             0.00384154 |       0.00349068 |
+| synthetic_data_continuous_outcome_multi_treatment_linear_te   |                nan |          nan |          14.1468 |     14.185 |                14.1478 |          14.1853 |
+| synthetic_data_continuous_outcome_multi_treatment_constant_te |                nan |          nan |        0.0110779 |  0.0110778 |              0.0104649 |       0.00897915 |
diff --git a/tests/conftest.py b/tests/conftest.py
index de3beec..1d02f81 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -180,7 +180,7 @@ def n_categoricals():
 
 @pytest.fixture(scope="module")
 def sample_size():
-    return 1_000_000
+    return 100_000
 
 
 @pytest.fixture(scope="function")
diff --git a/tests/test_data_generation.py b/tests/test_data_generation.py
index 9be477c..2ce272a 100644
--- a/tests/test_data_generation.py
+++ b/tests/test_data_generation.py
@@ -208,7 +208,7 @@ def test_compute_experiment_outputs(
     expected_std = np.sqrt(sigma_y**2 + sigma_tau**2)
     for k in range(1, n_variants):
         actual_std = (y - mu[:, k])[treatment == k].std()
-        assert actual_std == pytest.approx(expected_std, abs=1e-2)
+        assert actual_std == pytest.approx(expected_std, abs=1e-2, rel=1e-1)
 
 
 @pytest.mark.parametrize("dataset", ["numerical_covariates", "mixed_covariates"])
diff --git a/tests/test_learner.py b/tests/test_learner.py
index 2378382..ed89fdf 100644
--- a/tests/test_learner.py
+++ b/tests/test_learner.py
@@ -43,10 +43,10 @@ def _linear_base_learner_params(
 @pytest.mark.parametrize(
     "metalearner, outcome_kind, reference_value, treatment_kind, te_kind",
     [
-        ("T", "binary", 0.0149, "binary", "linear"),
-        ("T", "continuous", 0.0121, "binary", "linear"),
-        ("S", "binary", 0.2568, "binary", "linear"),
-        ("S", "continuous", 11.6777, "binary", "linear"),
+        ("T", "binary", 0.0212, "binary", "linear"),
+        ("T", "continuous", 0.0456, "binary", "linear"),
+        ("S", "binary", 0.2290, "binary", "linear"),
+        ("S", "continuous", 14.5706, "binary", "linear"),
     ],
 )
 def test_learner_synthetic_in_sample(
@@ -84,18 +84,20 @@ def test_learner_synthetic_in_sample(
     rmse = root_mean_squared_error(true_cate, cate_estimates)
     assert rmse < reference_value * (1 + _REFERENCE_VALUE_TOLERANCE)
     if metalearner == "T":
-        np.testing.assert_allclose(cate_estimates, true_cate.reshape(-1), atol=0.15)
+        np.testing.assert_allclose(
+            cate_estimates, true_cate.reshape(-1), atol=0.3, rtol=0.3
+        )
 
 
 @pytest.mark.parametrize(
     "metalearner, outcome_kind, reference_value, treatment_kind, te_kind",
     [
-        ("T", "binary", 0.0149, "binary", "linear"),
-        ("T", "continuous", 0.0121, "binary", "linear"),
-        ("S", "binary", 0.2563, "binary", "linear"),
-        ("S", "continuous", 11.6584, "binary", "linear"),
-        ("S", "continuous", 11.5957, "multi", "linear"),
-        ("S", "continuous", 0.004997, "multi", "constant"),
+        ("T", "binary", 0.0215, "binary", "linear"),
+        ("T", "continuous", 0.0456, "binary", "linear"),
+        ("S", "binary", 0.2286, "binary", "linear"),
+        ("S", "continuous", 14.6248, "binary", "linear"),
+        ("S", "continuous", 14.185, "multi", "linear"),
+        ("S", "continuous", 0.0111, "multi", "constant"),
     ],
 )
 @pytest.mark.parametrize("oos_method", ["overall", "mean", "median"])
@@ -161,7 +163,7 @@ def test_learner_synthetic_oos(
     assert rmse < reference_value * (1 + _REFERENCE_VALUE_TOLERANCE)
     if metalearner == "T":
         np.testing.assert_allclose(
-            cate_estimates, true_cate_test.reshape(-1), atol=0.15
+            cate_estimates, true_cate_test.reshape(-1), atol=0.3, rtol=0.3
         )
 
 
@@ -220,7 +222,7 @@ def test_learner_synthetic_oos_ate(metalearner, treatment_kind, oos_method, requ
     )
     actual_ate_estimate = np.mean(cate_estimates)
     target_ate_estimate = np.mean(true_cate_test)
-    assert actual_ate_estimate == pytest.approx(target_ate_estimate, abs=1e-2, rel=1e-2)
+    assert actual_ate_estimate == pytest.approx(target_ate_estimate, abs=1e-2, rel=1e-1)
 
 
 @pytest.mark.parametrize("metalearner, reference_value", [("T", 0.3623), ("S", 0.3186)])