configure tests

py-econometrics · Aug 6, 2024 · f20fc36 · f20fc36
1 parent f8675c2
commit f20fc36
Show file tree

Hide file tree

Showing 5 changed files with 93 additions and 108 deletions.
diff --git a/duckreg/estimators.py b/duckreg/estimators.py
@@ -141,6 +141,7 @@ def estimate_feols(self):
         return fit
 
     def bootstrap(self):
+        self.se = "bootstrap"
         if self.fevars:
             boot_coefs = np.zeros(
                 (self.n_bootstraps, len(self.covars) * len(self.outcome_vars))
@@ -207,8 +208,10 @@ def bootstrap(self):
 
         return vcov
 
-    def summary(self): # ovveride the summary method to include the heteroskedasticity-robust variance covariance matrix when available
-        if self.n_bootstraps > 0 or self.se == "hc1":
+    def summary(
+        self,
+    ):  # ovveride the summary method to include the heteroskedasticity-robust variance covariance matrix when available
+        if self.n_bootstraps > 0 or (hasattr(self, "se") and self.se == "hc1"):
             return {
                 "point_estimate": self.point_estimate,
                 "standard_error": np.sqrt(np.diag(self.vcov)),

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,15 @@
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--force-regen",
+        action="store_true",
+        default=False,
+        help="Force regeneration of test data",
+    )
+
+
+@pytest.fixture(scope="session")
+def force_regen(request):
+    return request.config.getoption("--force-regen")
diff --git a/tests/test_fitter.py b/tests/test_fitter.py
@@ -1,47 +1,74 @@
-import pytest
 import numpy as np
+import pytest
+import os
 from duckreg.estimators import DuckRegression
 from tests.utils import generate_sample_data, create_duckdb_database
+import duckdb
+import pandas as pd
+
 
 @pytest.fixture(scope="session")
-def database():
-    df = generate_sample_data()
-    db_name = 'test_dataset.db'
-    create_duckdb_database(df, db_name)
+def get_data(force_regen):
+    if force_regen:
+        return generate_sample_data(1_000_000, seed=42)
+    else:
+        return generate_sample_data(1_000_000, seed=42)
 
-@pytest.mark.parametrize("fml", ["Y ~ D", "Y ~ D + f1", "Y ~ D + f1 + f2"])
-@pytest.mark.parametrize("cluster_col", ["f1"])
-def test_fitters(fml, cluster_col):
 
-    m_duck = DuckRegression(
-        db_name='test_dataset.db',
-        table_name='data',
-        formula=fml,
-        cluster_col=cluster_col,
-        n_bootstraps=20,
-        seed = 42
-    )
-    m_duck.fit()
+@pytest.fixture(scope="session")
+def database(get_data, force_regen):
+    df = get_data
+    db_name = "test_dataset.db"
+    if force_regen and os.path.exists(db_name):
+        os.remove(db_name)
+    db_path = create_duckdb_database(df, db_name)
+    return db_path
 
 
-    m_feols = DuckRegression(
-        db_name='test_dataset.db',
-        table_name='data',
-        formula=fml,
-        cluster_col=cluster_col,
-        n_bootstraps=20,
-        seed = 42,
-        fitter = "feols"
-    ).fit()
+def get_numpy_coefficients(db_path, formula):
+    conn = duckdb.connect(db_path)
+    df = conn.execute("SELECT * FROM data").df()
+    conn.close()
 
-    results = m_duck.summary()
-    coefs = results["point_estimate"]
-    se = results["standard_error"]
+    y = df["Y"].values
+    X_cols = [x.strip() for x in formula.split("~")[1].strip().split("+")]
+    X = df[X_cols].values
+    X = np.column_stack([np.ones(X.shape[0]), X])
+
+    coeffs = np.linalg.inv(X.T @ X) @ X.T @ y
+    return coeffs[1:]
 
-    assert np.all(np.abs(coefs) - np.abs(m_feols.coef().values) < 1e-12), "Coeficients are not equal"
-    assert np.all(np.abs(se) - np.abs(m_feols.se().values) < 1e-12), "Standard errors are not equal"
 
+@pytest.mark.parametrize(
+    "fml",
+    [
+        "Y ~ D",
+        "Y ~ D + f1",
+        "Y ~ D + f1 + f2",
+    ],
+)
+def test_fitters(database, fml):
+    db_path = database
 
+    uncompressed_coeffs = get_numpy_coefficients(db_path, fml)
+
+    m_duck = DuckRegression(
+        db_name=db_path,
+        table_name="data",
+        formula=fml,
+        cluster_col="",
+        n_bootstraps=0,
+        seed=42,
+    )
+    m_duck.fit()
 
+    np.testing.assert_allclose(
+        m_duck.df_compressed["count"].sum(), 1_000_000, rtol=1e-4
+    ), "Number of observations are not equal"
 
+    results = m_duck.summary()
+    compressed_coeffs = results["point_estimate"][1:]
 
+    np.testing.assert_allclose(
+        compressed_coeffs, uncompressed_coeffs, rtol=1e-4
+    ), f"Coefficients are not equal for formula {fml}"
diff --git a/tests/test_vs_pyfixest.py b/tests/test_vs_pyfixest.py
diff --git a/tests/utils.py b/tests/utils.py
@@ -1,24 +1,30 @@
+import os
 import numpy as np
 import pandas as pd
 import duckdb
 
+
 # Generate sample data
-def generate_sample_data(N=10_000, seed=12345):
+def generate_sample_data(N=10_000_000, seed=42):
     rng = np.random.default_rng(seed)
     D = rng.choice([0, 1], size=(N, 1))
     X = rng.choice(range(20), (N, 2), True)
     Y = D + X @ np.array([1, 2]).reshape(2, 1) + rng.normal(size=(N, 1))
     Y2 = -1 * D + X @ np.array([1, 2]).reshape(2, 1) + rng.normal(size=(N, 1))
     df = pd.DataFrame(
         np.concatenate([Y, Y2, D, X], axis=1), columns=["Y", "Y2", "D", "f1", "f2"]
-    ).assign(rowid=range(N))
+    )
     return df
 
 
-# Function to create and populate DuckDB database
-def create_duckdb_database(df, db_name="large_dataset.db", table="data"):
-    conn = duckdb.connect(db_name)
-    conn.execute(f"DROP TABLE IF EXISTS {table}")
-    conn.execute(f"CREATE TABLE {table} AS SELECT * FROM df")
-    conn.close()
-    print(f"Data loaded into DuckDB database: {db_name}")
+def create_duckdb_database(df, db_name="test_dataset.db", table="data"):
+    db_path = os.path.abspath(db_name)
+    conn = duckdb.connect(db_path)
+    try:
+        conn.execute(f"DROP TABLE IF EXISTS {table}")
+        conn.execute(f"CREATE TABLE {table} AS SELECT * FROM df")
+        result = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()
+        print(f"Created table '{table}' with {result[0]} rows in database: {db_path}")
+    finally:
+        conn.close()
+    return db_path