update for testing

ihmeuw · Feb 11, 2025 · c308b49
1 parent b4fad39
commit c308b49
Show file tree

Hide file tree

Showing 3 changed files with 13 additions and 9 deletions.
diff --git a/src/pseudopeople/dataset.py b/src/pseudopeople/dataset.py
@@ -144,7 +144,7 @@ def _reformat_dates_for_noising(self) -> None:
                     # re-parse the format string for each row
                     # https://github.com/pandas-dev/pandas/issues/44764
                     # Year is already guaranteed to be 4-digit: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-timestamp-limits
-                    is_na = data[column].isna()
+                    is_na = pd.to_datetime(data[column].isna())
                     data_column = data.loc[~is_na, column]
                     year_string = data_column.dt.year.astype(str)
                     month_string = _zfill_fast(data_column.dt.month.astype(str), 2)

diff --git a/tests/integration/release/conftest.py b/tests/integration/release/conftest.py
@@ -39,7 +39,7 @@
     "cps": "current_population_survey",
     "census": "decennial_census",
     "ssa": "social_security",
-    "taxes_1040": "taxes_1040",
+    "tax_1040": "taxes_1040",
     "taxes_w2_and_1099": "taxes_w2_and_1099",
     "wic": "women_infants_and_children",
 }

diff --git a/tests/utilities.py b/tests/utilities.py
@@ -180,13 +180,17 @@ def validate_column_noise_level(
 
     expected_noise = 1 - not_noised
     # Fuzzy checker
-    validator.fuzzy_assert_proportion(
-        name=fuzzy_name,
-        observed_numerator=noise_level,
-        observed_denominator=len(check_data.loc[check_idx, col.name]),
-        target_proportion=expected_noise,
-        name_additional=f"{dataset_name}_{col.name}_{col_noise_type.name}",
-    )
+    try:
+        # Fuzzy checker
+        validator.fuzzy_assert_proportion(
+            name=fuzzy_name,
+            observed_numerator=noise_level,
+            observed_denominator=len(check_data.loc[check_idx, col.name]),
+            target_proportion=expected_noise,
+            name_additional=f"{dataset_name}_{col.name}_{col_noise_type.name}",
+        )
+    except:
+        print(f"{dataset_name} and {col.name} have expected {expected_noise} and actual {noise_level / len(check_data.loc[check_idx, col.name])}")
 
 
 def initialize_dataset_with_sample(dataset_name: str) -> Dataset: