Skip to content

Commit

Permalink
Basic Unit Test Framework (#6)
Browse files Browse the repository at this point in the history
* add github actions to install and test package

* add tests

* small tweaks

* update tests

* stricter internal equivalence tests

* also test on py 3.12
  • Loading branch information
s3alfisc authored Aug 6, 2024
1 parent 2b25a6e commit ee558db
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 0 deletions.
33 changes: 33 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# CI workflow: install the package and run the pytest suite on every push /
# pull request targeting master, once per supported Python version.
name: Testing duckreg

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:
    runs-on: ubuntu-latest
    # A list of versions must live in a matrix; action inputs under `with:`
    # are scalar strings, so each matrix entry is passed to setup-python
    # individually and spawns its own job.
    strategy:
      matrix:
        python-version: ['3.9', '3.10', '3.11', '3.12']

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .
      - name: Install test dependencies
        run: |
          pip install pytest
      - name: Run tests
        run: |
          pytest
Empty file added tests/__init__.py
Empty file.
47 changes: 47 additions & 0 deletions tests/test_fitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pytest
import numpy as np
from duckreg.estimators import DuckRegression
from tests.utils import generate_sample_data, create_duckdb_database

@pytest.fixture(scope="session", autouse=True)
def database():
    """Create the DuckDB file that the tests in this module read from.

    Generates the synthetic sample data and writes it to the ``data`` table
    of ``test_dataset.db``. The fixture is ``autouse`` so the database is
    built once per session even for tests that do not request it by name;
    without that, nothing would trigger its creation.
    """
    df = generate_sample_data()
    db_name = 'test_dataset.db'
    create_duckdb_database(df, db_name)

@pytest.mark.parametrize("fml", ["Y ~ D", "Y ~ D + f1", "Y ~ D + f1 + f2"])
@pytest.mark.parametrize("cluster_col", ["f1"])
def test_fitters(database, fml, cluster_col):
    """Check that the default fitter and the ``"feols"`` fitter agree.

    Both models are built with identical arguments (same formula, cluster
    column, bootstrap count and seed); only ``fitter`` differs. Coefficients
    and standard errors must match to within 1e-12.

    ``database`` is requested so the DuckDB file exists before either model
    is constructed.
    """
    m_duck = DuckRegression(
        db_name='test_dataset.db',
        table_name='data',
        formula=fml,
        cluster_col=cluster_col,
        n_bootstraps=20,
        seed=42,
    )
    m_duck.fit()

    m_feols = DuckRegression(
        db_name='test_dataset.db',
        table_name='data',
        formula=fml,
        cluster_col=cluster_col,
        n_bootstraps=20,
        seed=42,
        fitter="feols",
    ).fit()

    results = m_duck.summary()
    coefs = np.ravel(results["point_estimate"])
    se = np.ravel(results["standard_error"])

    # Compare the absolute *difference*: ``abs(a) - abs(b) < tol`` is
    # one-sided and passes whenever ``a`` is smaller in magnitude than ``b``,
    # no matter how far apart the two values are.
    coef_gap = np.abs(coefs - np.ravel(m_feols.coef().values))
    se_gap = np.abs(se - np.ravel(m_feols.se().values))

    assert np.all(coef_gap < 1e-12), "Coeficients are not equal"
    assert np.all(se_gap < 1e-12), "Standard errors are not equal"





66 changes: 66 additions & 0 deletions tests/test_vs_pyfixest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import pytest
import numpy as np
from duckreg.estimators import DuckRegression
from tests.utils import generate_sample_data, create_duckdb_database
import pyfixest as pf

@pytest.fixture(scope="session")
def get_data():
    """Provide the synthetic sample DataFrame, built once per test session."""
    sample_frame = generate_sample_data()
    return sample_frame

@pytest.fixture(scope="session", autouse=True)
def database(get_data):
    """Write the shared sample data to DuckDB and return the DataFrame.

    The data from the ``get_data`` fixture is stored in the ``data`` table of
    ``test_dataset.db``. The fixture is ``autouse`` so the database is built
    once per session even for tests that only request ``get_data``; without
    that, nothing would trigger its creation.
    """
    df = get_data
    db_name = 'test_dataset.db'
    create_duckdb_database(df, db_name)
    return df


@pytest.mark.parametrize("fml", ["Y ~ D", "Y ~ D + f1", "Y ~ D + f1 + f2"])
@pytest.mark.parametrize("cluster_col", [""])
def test_vs_pyfixest_deterministic(database, get_data, fml, cluster_col):
    """Compare non-bootstrapped DuckRegression results against pyfixest.

    The DuckDB-backed model is fitted with ``n_bootstraps=0`` and its
    analytic variance is computed via ``fit_vcov()``. The reference is
    ``pf.feols`` on the same DataFrame, using heteroskedastic errors when
    ``cluster_col`` is empty and CRV1 clustering otherwise.

    ``database`` is requested so the DuckDB file exists before the model is
    constructed.
    """
    m_duck = DuckRegression(
        db_name='test_dataset.db',
        table_name='data',
        formula=fml,
        cluster_col=cluster_col,
        n_bootstraps=0,
        seed=42,
    )
    m_duck.fit()
    m_duck.fit_vcov()

    m_feols = pf.feols(
        fml,
        data=get_data,
        vcov="hetero" if cluster_col == "" else {"CRV1": cluster_col},
        ssc=pf.ssc(adj=False, cluster_adj=True),
    )

    results = m_duck.summary()
    coefs = np.ravel(results["point_estimate"])
    se = np.ravel(results["standard_error"])

    # Compare the absolute *difference*: ``abs(a) - abs(b) < tol`` is
    # one-sided and passes whenever ``a`` is smaller in magnitude than ``b``,
    # no matter how far apart the two values are.
    coef_gap = np.abs(coefs - np.ravel(m_feols.coef().values))
    se_gap = np.abs(se - np.ravel(m_feols.se().values))

    assert np.all(coef_gap < 1e-8), "Coeficients are not equal"
    assert np.all(se_gap < 1e-4), "Standard errors are not equal"

def test_multiple_estimation_stochastic():
    """Placeholder: no checks implemented yet, so this test always passes."""


def test_vs_pyfixest_stochastic():
    """Placeholder: no checks implemented yet, so this test always passes."""


def test_mundlak():
    """Placeholder: no checks implemented yet, so this test always passes."""


def test_double_demeaning():
    """Placeholder: no checks implemented yet, so this test always passes."""
24 changes: 24 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import numpy as np
import pandas as pd
import duckdb

# Generate sample data
def generate_sample_data(N=10_000, seed=12345):
    """Build a reproducible synthetic regression dataset.

    Parameters
    ----------
    N : int
        Number of rows to generate.
    seed : int
        Seed for ``numpy.random.default_rng``; the same seed and ``N`` give
        the same frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``Y``, ``Y2``, ``D``, ``f1``, ``f2`` and ``rowid``. ``D`` is
        drawn from {0, 1}; ``f1``/``f2`` are drawn from 0..19;
        ``Y = D + f1 + 2*f2 + noise`` and ``Y2 = -D + f1 + 2*f2 + noise``
        with independent standard-normal noise; ``rowid`` is ``0..N-1``.
    """
    rng = np.random.default_rng(seed)

    # The order of the draws below determines the generated values.
    treatment = rng.choice([0, 1], size=(N, 1))
    covariates = rng.choice(range(20), size=(N, 2), replace=True)

    slopes = np.array([1, 2]).reshape(2, 1)
    outcome = treatment + covariates @ slopes + rng.normal(size=(N, 1))
    outcome_neg = -1 * treatment + covariates @ slopes + rng.normal(size=(N, 1))

    stacked = np.concatenate(
        [outcome, outcome_neg, treatment, covariates], axis=1
    )
    frame = pd.DataFrame(stacked, columns=["Y", "Y2", "D", "f1", "f2"])
    return frame.assign(rowid=range(N))


# Function to create and populate DuckDB database
def create_duckdb_database(df, db_name="large_dataset.db", table="data"):
    """Write ``df`` into a DuckDB database file, replacing any existing table.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to store.
    db_name : str
        Path of the DuckDB database file to connect to.
    table : str
        Name of the table to (re)create. It is interpolated directly into
        the SQL text, so it must be a trusted identifier.
    """
    conn = duckdb.connect(db_name)
    try:
        conn.execute(f"DROP TABLE IF EXISTS {table}")
        # The SQL refers to ``df`` by name; this relies on DuckDB resolving
        # it to the local DataFrame, so the parameter must keep this name.
        conn.execute(f"CREATE TABLE {table} AS SELECT * FROM df")
    finally:
        # Always release the connection, even if one of the statements
        # above fails.
        conn.close()
    print(f"Data loaded into DuckDB database: {db_name}")

0 comments on commit ee558db

Please sign in to comment.