Fix various pipeline bugs with the new SMD (standardized mean difference) function

owkin · Aug 19, 2024 · 3b4802b · 3b4802b
1 parent 4cf7a88
commit 3b4802b
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 19 deletions.
diff --git a/experiments/config/experiment/smd_cov_shift.yaml b/experiments/config/experiment/smd_cov_shift.yaml
@@ -26,7 +26,6 @@ models:
       robust: True
   FedECA:
     ndim: ${data.ndim}
-    fedeca_path: "/home/owkin/federated-eca-code/"
 
 # config fit FedECA
 fit_fedeca:
@@ -50,7 +49,7 @@ parameters:
 
 hydra:
   sweep:
-    dir: "/home/owkin/project/results_experiments/smd_cov_shift"
+    dir: "./results_experiments/smd_cov_shift"
   sweeper:
     params:
       data.overlap: range(-1,4,1)
diff --git a/experiments/synthetic.py b/experiments/synthetic.py
@@ -94,14 +94,14 @@ def single_experiment(
     covariates = [x for x in data.columns if x not in non_cov]
 
     mask_treated = data[treated_col].eq(1)
-    smd_true_ps = standardized_mean_diff(data[ps_col], mask_treated)
+    smd_true_ps = standardized_mean_diff(data[ps_col], mask_treated).to_frame().T
+
     df_smd_raw = (
-        data[covariates]
-        .apply(lambda s: standardized_mean_diff(s, mask_treated))
+        standardized_mean_diff(data[covariates], mask_treated)
         .to_frame()
-        .transpose()
-        .add_prefix("smd_raw_")
+        .T.add_prefix("smd_raw_")
     )
+
     ate_true = data_gen.average_treatment_effect_
     percent_ties = data_gen.percent_ties
     models_fit_times: dict[str, Optional[float]] = {
@@ -156,25 +156,28 @@ def single_experiment(
             smd_estim_ps = None
             ess = None
             if model.propensity_scores_ is not None:
-                smd_estim_ps = standardized_mean_diff(
-                    model.propensity_scores_,
-                    mask_treated,
+                dummy_df = pd.DataFrame(
+                    model.propensity_scores_, columns=["empirical propensity"]
+                )
+                smd_estim_ps = (
+                    standardized_mean_diff(
+                        dummy_df,
+                        mask_treated,
+                    )
+                    .to_frame()
+                    .T
                 )
 
             if model.weights_ is not None:
                 ess = effective_sample_size(model.weights_[mask_treated])
                 df_smd_weighted = (
-                    data[covariates]
-                    .apply(
-                        lambda s: standardized_mean_diff(
-                            s,
-                            mask_treated,
-                            weights=model.weights_.detach().cpu().numpy(),
-                        )
+                    standardized_mean_diff(
+                        data[covariates],
+                        mask_treated,
+                        weights=model.weights_,
                     )
                     .to_frame()
-                    .transpose()
-                    .add_prefix("smd_weighted_")
+                    .T.add_prefix("smd_weighted_")
                 )
 
             log_likelihood = model.log_likelihood_

diff --git a/fedeca/metrics/metrics.py b/fedeca/metrics/metrics.py
@@ -30,6 +30,12 @@ def standardized_mean_diff(
     smd: np.ndarray
         standardized mean differences of the confounders.
     """
+    assert isinstance(confounders, pd.DataFrame) or isinstance(
+        confounders, pd.Series
+    ), "confounders type is not supported"  # noqa: E501
+    if isinstance(confounders, pd.Series):
+        confounders = pd.DataFrame(confounders, columns=[confounders.name])
+
     if weights is None:
         weights = np.ones((len(confounders.index)))