diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,39 @@
+name: Testing duckreg
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    # Inputs under `with:` are scalars, so the version list lives in a job
+    # matrix and every matrix job installs exactly one interpreter.
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ['3.9', '3.10', '3.11', '3.12']
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .
+
+      - name: Install test dependencies
+        run: |
+          pip install pytest
+
+      - name: Run tests
+        run: |
+          pytest
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_fitter.py b/tests/test_fitter.py
new file mode 100644
--- /dev/null
+++ b/tests/test_fitter.py
@@ -0,0 +1,53 @@
+import pytest
+import numpy as np
+from duckreg.estimators import DuckRegression
+from tests.utils import generate_sample_data, create_duckdb_database
+
+
+@pytest.fixture(scope="session")
+def database():
+    """Build the DuckDB file used by the tests and return its file name."""
+    df = generate_sample_data()
+    db_name = 'test_dataset.db'
+    create_duckdb_database(df, db_name)
+    return db_name
+
+
+@pytest.mark.parametrize("fml", ["Y ~ D", "Y ~ D + f1", "Y ~ D + f1 + f2"])
+@pytest.mark.parametrize("cluster_col", ["f1"])
+def test_fitters(database, fml, cluster_col):
+    """Check that the default fitter and the feols fitter give equal results."""
+    # Requesting `database` is what makes pytest create the database file
+    # before the estimators open it.
+    m_duck = DuckRegression(
+        db_name=database,
+        table_name='data',
+        formula=fml,
+        cluster_col=cluster_col,
+        n_bootstraps=20,
+        seed=42
+    )
+    m_duck.fit()
+
+    m_feols = DuckRegression(
+        db_name=database,
+        table_name='data',
+        formula=fml,
+        cluster_col=cluster_col,
+        n_bootstraps=20,
+        seed=42,
+        fitter="feols"
+    ).fit()
+
+    results = m_duck.summary()
+    # Flatten both sides so a column vector and a 1-D array are compared
+    # element-wise instead of being broadcast against each other.
+    coefs = np.ravel(results["point_estimate"])
+    se = np.ravel(results["standard_error"])
+    coefs_feols = np.ravel(m_feols.coef().values)
+    se_feols = np.ravel(m_feols.se().values)
+
+    # Bound the absolute difference: a difference of absolute values is only
+    # bounded from above and passes whenever the first operand is smaller.
+    assert np.all(np.abs(coefs - coefs_feols) < 1e-12), "Coeficients are not equal"
+    assert np.all(np.abs(se - se_feols) < 1e-12), "Standard errors are not equal"
diff --git a/tests/test_vs_pyfixest.py b/tests/test_vs_pyfixest.py
new file mode 100644
--- /dev/null
+++ b/tests/test_vs_pyfixest.py
@@ -0,0 +1,78 @@
+import pytest
+import numpy as np
+from duckreg.estimators import DuckRegression
+from tests.utils import generate_sample_data, create_duckdb_database
+import pyfixest as pf
+
+
+@pytest.fixture(scope="session")
+def get_data():
+    """Return the simulated data frame shared by the tests."""
+    return generate_sample_data()
+
+
+@pytest.fixture(scope="session")
+def database(get_data):
+    """Load the shared data frame into the DuckDB file and return the frame."""
+    df = get_data
+    db_name = 'test_dataset.db'
+    create_duckdb_database(df, db_name)
+    return df
+
+
+@pytest.mark.parametrize("fml", ["Y ~ D", "Y ~ D + f1", "Y ~ D + f1 + f2"])
+@pytest.mark.parametrize("cluster_col", [""])
+def test_vs_pyfixest_deterministic(get_data, database, fml, cluster_col):
+    """Compare DuckRegression estimates with pyfixest on the same data."""
+    # Requesting `database` is what makes pytest create the database file
+    # before DuckRegression opens it.
+    m_duck = DuckRegression(
+        db_name='test_dataset.db',
+        table_name='data',
+        formula=fml,
+        cluster_col=cluster_col,
+        n_bootstraps=0,
+        seed=42
+    )
+    m_duck.fit()
+    m_duck.fit_vcov()
+
+    m_feols = pf.feols(
+        fml,
+        data=get_data,
+        vcov="hetero" if cluster_col == "" else {"CRV1": cluster_col},
+        ssc=pf.ssc(adj=False, cluster_adj=True)
+    )
+
+    results = m_duck.summary()
+    # Flatten both sides so a column vector and a 1-D array are compared
+    # element-wise instead of being broadcast against each other.
+    coefs = np.ravel(results["point_estimate"])
+    se = np.ravel(results["standard_error"])
+    coefs_feols = np.ravel(m_feols.coef().values)
+    se_feols = np.ravel(m_feols.se().values)
+
+    # Bound the absolute difference: a difference of absolute values is only
+    # bounded from above and passes whenever the first operand is smaller.
+    assert np.all(np.abs(coefs - coefs_feols) < 1e-8), "Coeficients are not equal"
+    assert np.all(np.abs(se - se_feols) < 1e-4), "Standard errors are not equal"
+
+
+@pytest.mark.skip(reason="not implemented yet")
+def test_multiple_estimation_stochastic():
+    pass
+
+
+@pytest.mark.skip(reason="not implemented yet")
+def test_vs_pyfixest_stochastic():
+    pass
+
+
+@pytest.mark.skip(reason="not implemented yet")
+def test_mundlak():
+    pass
+
+
+@pytest.mark.skip(reason="not implemented yet")
+def test_double_demeaning():
+    pass
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,38 @@
+import numpy as np
+import pandas as pd
+import duckdb
+
+
+def generate_sample_data(N=10_000, seed=12345):
+    """Simulate a data frame with two outcomes, a treatment and two covariates.
+
+    Columns are ``Y`` and ``Y2`` (outcomes), ``D`` (drawn from 0/1), ``f1``
+    and ``f2`` (drawn from ``range(20)``) and ``rowid`` (0..N-1). ``seed``
+    feeds the random generator, so equal arguments give equal data.
+    """
+    rng = np.random.default_rng(seed)
+    D = rng.choice([0, 1], size=(N, 1))
+    X = rng.choice(range(20), (N, 2), True)
+    Y = D + X @ np.array([1, 2]).reshape(2, 1) + rng.normal(size=(N, 1))
+    Y2 = -1 * D + X @ np.array([1, 2]).reshape(2, 1) + rng.normal(size=(N, 1))
+    df = pd.DataFrame(
+        np.concatenate([Y, Y2, D, X], axis=1), columns=["Y", "Y2", "D", "f1", "f2"]
+    ).assign(rowid=range(N))
+    return df
+
+
+def create_duckdb_database(df, db_name="large_dataset.db", table="data"):
+    """Store ``df`` as table ``table`` in the DuckDB file ``db_name``.
+
+    An existing table of that name is dropped first.
+    """
+    conn = duckdb.connect(db_name)
+    try:
+        conn.execute(f"DROP TABLE IF EXISTS {table}")
+        # DuckDB looks up the bare name `df` among the Python variables in
+        # scope, so the parameter must keep this name.
+        conn.execute(f"CREATE TABLE {table} AS SELECT * FROM df")
+    finally:
+        # Close the connection even when a statement raises.
+        conn.close()
+    print(f"Data loaded into DuckDB database: {db_name}")