diff --git a/.github/workflows/edgetest.yml b/.github/workflows/edgetest.yml index ee0760f..588b5d1 100644 --- a/.github/workflows/edgetest.yml +++ b/.github/workflows/edgetest.yml @@ -13,9 +13,10 @@ jobs: - uses: actions/checkout@v2 with: ref: develop + - id: run-edgetest - uses: fdosani/run-edgetest-action@v1.0 + uses: edgetest-dev/run-edgetest-action@v1.4 with: - edgetest-flags: '-c setup.cfg -r requirements.txt --export' + edgetest-flags: '-c pyproject.toml -r requirements.txt --export' base-branch: 'develop' skip-pr: 'false' diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 44036c6..679c758 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -18,7 +18,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.8' + python-version: '3.9' - name: Install dependencies run: python -m pip install -r requirements.txt .[dev] - name: Build and publish diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index fecc1f6..55991eb 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -18,29 +18,40 @@ jobs: strategy: matrix: python_version: - - 3.7 - 3.8 - 3.9 - '3.10' + - '3.11' steps: - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} + + - name: Install setuptools and upgrade pip + run: python -m pip install 'setuptools>=64.0.0' 'pip>=22.3' + - name: Install dependencies - run: python -m pip install -r requirements.txt .[dev] - - name: Check docstrings - run: python -m pydocstyle bayte --convention=numpy + run: python -m pip install -r requirements.txt .[plots,tests,qa] + + - name: Run ruff QA checks + run: python -m ruff check . + + - name: Check formatting + run: python -m ruff format . --check + - name: Check static typing run: python -m mypy bayte - - name: Run flake8 - run: python -m flake8 bayte + - name: Run unit testing run: python -m pytest tests --cov=./bayte --cov-report=xml + - name: Run Codecov - uses: codecov/codecov-action@v2 + uses: codecov/codecov-action@v3 with: + token: ${{ secrets.CODECOV_TOKEN }} fail_ci_if_error: true files: ./coverage.xml verbose: true diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4073e31 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Akshay Gupta + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
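The QA gate in `test-package.yml` above can be reproduced locally before pushing. A minimal sketch, assuming the `plots`, `tests`, and `qa` extras declared in `pyproject.toml` install cleanly in your environment; the commands mirror the workflow steps verbatim:

```console
$ python -m pip install -r requirements.txt .[plots,tests,qa]
$ python -m ruff check .
$ python -m ruff format . --check
$ python -m mypy bayte
$ python -m pytest tests --cov=./bayte --cov-report=xml
```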
diff --git a/bayte/__init__.py b/bayte/__init__.py index f5065b9..e426095 100644 --- a/bayte/__init__.py +++ b/bayte/__init__.py @@ -1,14 +1,13 @@ """Import path.""" -from ._meta import __version__ # noqa: F401 - from typing import List -from .encoder import BayesianTargetEncoder -from .ensemble import BayesianTargetClassifier, BayesianTargetRegressor +from bayte._meta import __version__ # noqa: F401 +from bayte.encoder import BayesianTargetEncoder +from bayte.ensemble import BayesianTargetClassifier, BayesianTargetRegressor __all__: List[str] = [ - "BayesianTargetEncoder", "BayesianTargetClassifier", + "BayesianTargetEncoder", "BayesianTargetRegressor", ] diff --git a/bayte/_meta.py b/bayte/_meta.py index ae4f5f0..2ac109b 100644 --- a/bayte/_meta.py +++ b/bayte/_meta.py @@ -1,3 +1,3 @@ """Package metadata.""" -__version__ = "0.1.1" +__version__ = "0.2.1" diff --git a/bayte/encoder.py b/bayte/encoder.py index 8bc8167..9acebdf 100644 --- a/bayte/encoder.py +++ b/bayte/encoder.py @@ -1,11 +1,11 @@ """Bayesian target encoder.""" import logging -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, ClassVar, List, Optional, Tuple, Union -from joblib import Parallel, effective_n_jobs import numpy as np import scipy.stats +from joblib import Parallel, effective_n_jobs from sklearn.preprocessing._encoders import _BaseEncoder from sklearn.utils.fixes import delayed from sklearn.utils.validation import check_is_fitted @@ -242,7 +242,7 @@ class BayesianTargetEncoder(_BaseEncoder): the parameters for the posterior distribution for the given level. """ - _required_parameters = ["dist"] + _required_parameters: ClassVar[List[str]] = ["dist"] def __init__( self, @@ -283,8 +283,15 @@ def fit(self, X, y): self : object Fitted encoder. """ - X, y = self._validate_data(X, y, dtype=None) - self._fit(X, handle_unknown=self.handle_unknown, force_all_finite=True) + tags = self._get_tags() + X, y = self._validate_data( + X, y, dtype=None, force_all_finite=not tags.get("allow_nan", True) + ) + self._fit( + X, + handle_unknown=self.handle_unknown, + force_all_finite=not tags.get("allow_nan", True), + ) # Initialize the prior distribution parameters initializer_ = self.initializer or _init_prior self.prior_params_ = initializer_(self.dist, y) @@ -322,10 +329,11 @@ def transform(self, X): """ check_is_fitted(self) + tags = self._get_tags() X_int, X_mask = self._transform( X, handle_unknown=self.handle_unknown, - force_all_finite=True, + force_all_finite=not tags.get("allow_nan", True), ) if effective_n_jobs(self.n_jobs) == 1: @@ -376,14 +384,17 @@ def transform(self, X): n_chunks = np.ceil(len(varencoded) / self.chunksize) chunks = np.array_split(np.arange(len(varencoded)), n_chunks) - varencoded = list( + varencoded = [ np.ma.stack(varencoded[chunk[0] : chunk[-1] + 1], axis=2).sum( axis=2 ) for chunk in chunks - ) + ] combined = np.ma.stack(varencoded, axis=2).sum(axis=2) encoded.append(combined.data) return np.hstack(encoded) + + def _more_tags(self): + return {"allow_nan": False} diff --git a/bayte/ensemble.py b/bayte/ensemble.py index a2229d2..5169d8d 100644 --- a/bayte/ensemble.py +++ b/bayte/ensemble.py @@ -3,12 +3,12 @@ Ensemble estimator that creates multiple models through sampling. 
""" -from copy import deepcopy import logging -from typing import List, Optional, Union +from copy import deepcopy +from typing import ClassVar, List, Literal, Optional, Union -from joblib import Parallel, effective_n_jobs import numpy as np +from joblib import Parallel, effective_n_jobs from pandas.api.types import is_categorical_dtype from sklearn.base import ( ClassifierMixin, @@ -18,14 +18,34 @@ ) from sklearn.ensemble._base import BaseEnsemble from sklearn.utils import check_random_state +from sklearn.utils._available_if import available_if from sklearn.utils.fixes import delayed -from sklearn.utils.metaestimators import if_delegate_has_method from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import check_array, check_is_fitted LOG = logging.getLogger(__name__) +def _available_if_estimator_has(attr: str): + """Return a function to check if the estimator has ``attr``. + + Parameters + ---------- + attr : str + The attribute to look for. + + Returns + ------- + Any + The output of ``available_if`` + """ + + def _check(self): + return hasattr(self.estimator, attr) + + return available_if(_check) + + def _sample_and_fit( estimator, encoder, X, y, categorical_feature, random_state, **fit_params ): @@ -72,7 +92,7 @@ class BaseSamplingEstimator(BaseEnsemble): Parameters ---------- - base_estimator : object + estimator : object The base estimator from which the ensemble is built. encoder : BayesianTargetEncoder A bayesian target encoder object. @@ -84,40 +104,44 @@ class BaseSamplingEstimator(BaseEnsemble): ``-1`` means using all processors. random_state : int, optional (default None) Random seed used for generating random seeds for sampling. + base_estimator : {"deprecated"} + Use ``estimator`` instead. Attributes ---------- categorical_ : np.ndarray A boolean mask indicating which columns are categorical and which are continuous. - base_estimator_ : estimator object + estimator_ : estimator object The base estimator from which the ensemble is grown. estimators_ : list The collection of fitted base estimators. """ - _required_parameters = ["base_estimator", "encoder"] + _required_parameters: ClassVar[List[str]] = ["estimator", "encoder"] def __init__( self, - base_estimator, + estimator, encoder, n_estimators: int = 10, n_jobs: Optional[int] = None, random_state: Optional[int] = None, + base_estimator: Literal["deprecated"] = "deprecated", ): """Init method.""" - self.base_estimator = base_estimator + self.estimator = estimator self.n_estimators = n_estimators self.encoder = encoder self.n_jobs = n_jobs self.random_state = random_state + self.base_estimator = base_estimator def fit( self, X, y, categorical_feature: Union[List[str], List[int], str] = "auto", - **fit_params + **fit_params, ): """Fit the estimator. 
@@ -152,7 +176,7 @@ def fit( self.rstates_ = rng.randint(self.n_estimators * 10, size=self.n_estimators) # Get the categorical columns if hasattr(X, "columns"): - self.categorical_ = np.zeros(X.shape[1], dtype=bool) + self.categorical_: np.ndarray = np.zeros(X.shape[1], dtype=bool) for idx, col in enumerate(X.columns): if categorical_feature == "auto": if is_categorical_dtype(X[col]): @@ -160,7 +184,7 @@ def fit( elif col in categorical_feature: self.categorical_[idx] = True - if is_classifier(self.base_estimator): + if is_classifier(self.estimator): check_classification_targets(y) self.classes_ = np.unique(y) @@ -200,13 +224,13 @@ def fit( LOG.info("Training the estimator(s).") self.estimators_ = parallel( fn( - clone(self.base_estimator), + clone(self.estimator), deepcopy(self.encoder_), X, y, self.categorical_, self.rstates_[idx], - **fit_params + **fit_params, ) for idx in range(self.n_estimators) ) @@ -224,7 +248,7 @@ class BayesianTargetRegressor(RegressorMixin, BaseSamplingEstimator): Parameters ---------- - base_estimator : object + estimator : object The base estimator from which the ensemble is built. encoder : BayesianTargetEncoder A bayesian target encoder object. @@ -234,18 +258,20 @@ class BayesianTargetRegressor(RegressorMixin, BaseSamplingEstimator): The number of cores to run in parallel when fitting the encoder. ``None`` means 1 unless in a ``joblib.parallel_backend`` context. ``-1`` means using all processors. + base_estimator : {"deprecated"} + Use ``estimator`` instead. Attributes ---------- categorical_ : np.ndarray A boolean mask indicating which columns are categorical and which are continuous. - base_estimator_ : estimator object + estimator_ : estimator object The base estimator from which the ensemble is grown. estimators_ : list The collection of fitted base estimators. """ - @if_delegate_has_method(delegate="base_estimator") + @_available_if_estimator_has("predict") def predict(self, X): """Call predict on the estimators. @@ -292,7 +318,7 @@ class BayesianTargetClassifier(ClassifierMixin, BaseSamplingEstimator): Parameters ---------- - base_estimator : object + estimator : object The base estimator from which the ensemble is built. encoder : BayesianTargetEncoder A bayesian target encoder object. @@ -309,37 +335,41 @@ class BayesianTargetClassifier(ClassifierMixin, BaseSamplingEstimator): ``-1`` means using all processors. random_state : int, optional (default None) Random seed used for generating random seeds for sampling. + base_estimator : {"deprecated"} + Use ``estimator`` instead. Attributes ---------- categorical_ : np.ndarray A boolean mask indicating which columns are categorical and which are continuous. - base_estimator_ : estimator object + estimator_ : estimator object The base estimator from which the ensemble is grown. estimators_ : list The collection of fitted base estimators. 
""" - _required_parameters = ["base_estimator", "encoder"] + _required_parameters: ClassVar[List[str]] = ["estimator", "encoder"] def __init__( self, - base_estimator, + estimator, encoder, n_estimators: int = 10, voting: str = "hard", n_jobs: Optional[int] = None, random_state: Optional[int] = None, + base_estimator: Literal["deprecated"] = "deprecated", ): """Init method.""" - self.base_estimator = base_estimator + self.estimator = estimator self.n_estimators = n_estimators self.voting = voting self.encoder = encoder self.n_jobs = n_jobs self.random_state = random_state + self.base_estimator = base_estimator - @if_delegate_has_method(delegate="base_estimator") + @_available_if_estimator_has("predict") def predict(self, X): """Predict class labels for X. @@ -382,7 +412,7 @@ def predict(self, X): return vote - @if_delegate_has_method(delegate="base_estimator") + @_available_if_estimator_has("predict_proba") def predict_proba(self, X): """Call predict_proba on the estimators. diff --git a/bayte/plots.py b/bayte/plots.py index 7d19c3b..076e6b3 100644 --- a/bayte/plots.py +++ b/bayte/plots.py @@ -1,14 +1,13 @@ """Helpful visualizations for target encoding.""" -from typing import Dict, List +from typing import Dict, List, Optional -from matplotlib.figure import Figure import matplotlib.pyplot as plt import numpy as np import pandas as pd import scipy.stats import seaborn as sns - +from matplotlib.figure import Figure DIST_MAPPING: Dict = { "exponential": "expon", @@ -17,10 +16,12 @@ "normal": "norm", } +DEFAULT_CANDIDATES: List[str] = ["exponential", "gamma", "invgamma", "normal"] + def visualize_target_dist( y: np.ndarray, - candidates: List[str] = ["exponential", "gamma", "invgamma", "normal"], + candidates: Optional[List[str]] = None, ) -> Figure: """Produce a histogram for the target variable with traces. @@ -32,8 +33,14 @@ def visualize_target_dist( ---------- y : array-like of shape (n_samples,) Target values. - candidates : list, optional (default ["exponential", "gamma", "invgamma", "normal"]) - The candidate likelihoods to consider. + candidates : list, optional (default None) + The candidate likelihoods to consider. By default, the following distributions + will be visualized: + + * ``exponential``, + * ``gamma``, + * ``invgamma``, and + * ``normal``. 
Returns ------- @@ -44,7 +51,8 @@ def visualize_target_dist( # Clip target values above the 99th percentile of the data extremes = np.quantile(y, q=[0.01, 0.99]) target = y[(y > extremes[0]) & (y < extremes[1])] - for label in candidates: + cand_ = candidates or DEFAULT_CANDIDATES + for label in cand_: params = getattr(scipy.stats, DIST_MAPPING[label]).fit(target) rv = getattr(scipy.stats, DIST_MAPPING[label])(*params) x = np.linspace(rv.ppf(0.01), rv.ppf(0.99)) diff --git a/dev-requirements.txt b/dev-requirements.txt index 952bc26..6a77096 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,63 +1,71 @@ -# -# This file is autogenerated by pip-compile with python 3.9 -# To update, run: -# -# pip-compile --extra=dev --output-file=dev-requirements.txt setup.cfg -# -alabaster==0.7.12 +# This file was autogenerated by uv via the following command: +# uv pip compile -o dev-requirements.txt --extra dev pyproject.toml +alabaster==0.7.16 # via sphinx -asttokens==2.1.0 +asttokens==2.4.1 # via stack-data -attrs==22.1.0 +attrs==23.2.0 # via # jsonschema # jupyter-cache - # pytest -babel==2.11.0 + # lazyscribe + # referencing +babel==2.14.0 # via sphinx -backcall==0.2.0 - # via ipython -beautifulsoup4==4.11.1 +backports-tarfile==1.1.0 + # via jaraco-context +beautifulsoup4==4.12.3 # via furo -black==22.10.0 - # via bayte (setup.cfg) -bleach==5.0.1 - # via readme-renderer -build==0.9.0 - # via - # bayte (setup.cfg) - # pip-tools -bumpver==2022.1119 - # via bayte (setup.cfg) -certifi==2022.9.24 +build==1.2.1 + # via pip-tools +bumpver==2023.1129 +category-encoders==2.3.0 +certifi==2024.2.2 # via requests -cffi==1.15.1 +cffi==1.16.0 # via cryptography -charset-normalizer==2.1.1 +charset-normalizer==2.0.12 # via requests -click==8.1.3 +click==8.1.7 # via - # black # bumpver + # dask + # distributed # jupyter-cache # pip-tools + # prefect +cloudpickle==3.0.0 + # via + # dask + # distributed + # prefect colorama==0.4.6 # via bumpver -commonmark==0.9.1 - # via rich -contourpy==1.0.6 +comm==0.2.2 + # via ipykernel +contourpy==1.2.1 # via matplotlib -coverage[toml]==6.5.0 +coverage==7.4.4 # via pytest-cov -cryptography==38.0.3 +croniter==1.4.1 + # via prefect +cryptography==42.0.5 # via secretstorage -cycler==0.11.0 +cycler==0.12.1 # via matplotlib -debugpy==1.6.3 +dask==2024.4.1 + # via + # distributed + # prefect +debugpy==1.8.1 # via ipykernel decorator==5.1.1 # via ipython -docutils==0.18.1 +distributed==2024.4.1 + # via prefect +docker==7.0.0 + # via prefect +docutils==0.20.1 # via # myst-parser # pybtex-docutils @@ -65,167 +73,210 @@ docutils==0.18.1 # sphinx # sphinx-tabs # sphinxcontrib-bibtex -entrypoints==0.4 - # via jupyter-client -exceptiongroup==1.0.4 - # via pytest -executing==1.2.0 +exceptiongroup==1.2.1 + # via + # ipython + # pytest +executing==2.0.1 # via stack-data -fastjsonschema==2.16.2 +fastjsonschema==2.19.1 # via nbformat -flake8==6.0.0 - # via bayte (setup.cfg) -fonttools==4.38.0 +fonttools==4.51.0 # via matplotlib -furo==2022.9.29 - # via bayte (setup.cfg) -greenlet==2.0.1 +fsspec==2024.3.1 + # via dask +furo==2024.1.29 +greenlet==3.0.3 # via sqlalchemy -idna==3.4 +idna==3.7 # via requests imagesize==1.4.1 # via sphinx -importlib-metadata==5.0.0 +importlib-metadata==7.1.0 # via + # build + # dask # jupyter-cache + # jupyter-client # keyring # myst-nb # sphinx # sphinxcontrib-bibtex # twine -iniconfig==1.1.1 +importlib-resources==6.4.0 + # via matplotlib +iniconfig==2.0.0 # via pytest -ipykernel==6.17.1 +ipykernel==6.29.4 # via myst-nb -ipython==8.6.0 +ipython==8.18.1 # 
via # ipykernel # myst-nb -jaraco-classes==3.2.3 +jaraco-classes==3.4.0 + # via keyring +jaraco-context==5.3.0 # via keyring -jedi==0.18.2 +jaraco-functools==4.0.1 + # via keyring +jedi==0.19.1 # via ipython jeepney==0.8.0 # via # keyring # secretstorage -jinja2==3.1.2 +jinja2==3.1.3 # via + # distributed # myst-parser # sphinx -joblib==1.2.0 +joblib==1.4.0 # via scikit-learn -jsonschema==4.17.1 +jsonschema==4.21.1 # via nbformat -jupyter-cache==0.5.0 +jsonschema-specifications==2023.12.1 + # via jsonschema +jupyter-cache==1.0.0 # via myst-nb -jupyter-client==7.4.7 +jupyter-client==8.6.1 # via # ipykernel # nbclient -jupyter-core==5.0.0 +jupyter-core==5.7.2 # via + # ipykernel # jupyter-client + # nbclient # nbformat -keyring==23.11.0 +keyring==25.1.0 # via twine -kiwisolver==1.4.4 +kiwisolver==1.4.5 # via matplotlib -latexcodec==2.0.1 +latexcodec==3.0.0 # via pybtex +lazyscribe==0.3.0 lexid==2021.1006 # via bumpver -markdown-it-py==2.1.0 +lightgbm==3.3.1 +locket==1.0.0 + # via + # distributed + # partd +looseversion==1.3.0 + # via bumpver +markdown-it-py==3.0.0 # via # mdit-py-plugins # myst-parser -markupsafe==2.1.1 + # rich +markupsafe==2.1.5 # via jinja2 -matplotlib==3.6.2 +marshmallow==3.21.1 + # via + # marshmallow-oneofschema + # prefect +marshmallow-oneofschema==3.1.1 + # via prefect +matplotlib==3.8.4 # via seaborn -matplotlib-inline==0.1.6 +matplotlib-inline==0.1.7 # via # ipykernel # ipython -mccabe==0.7.0 - # via flake8 -mdit-py-plugins==0.3.1 +mdit-py-plugins==0.4.0 # via myst-parser mdurl==0.1.2 # via markdown-it-py -more-itertools==9.0.0 - # via jaraco-classes -mypy==0.991 - # via bayte (setup.cfg) -mypy-extensions==0.4.3 +more-itertools==10.2.0 + # via + # jaraco-classes + # jaraco-functools +msgpack==1.0.8 + # via + # distributed + # prefect +mypy==1.9.0 +mypy-extensions==1.0.0 # via - # black # mypy -myst-nb==0.17.1 - # via bayte (setup.cfg) -myst-parser==0.18.1 + # prefect +myst-nb==1.1.0 +myst-parser==2.0.0 # via myst-nb -nbclient==0.5.13 +nbclient==0.10.0 # via # jupyter-cache # myst-nb -nbformat==5.7.0 +nbformat==5.10.4 # via # jupyter-cache # myst-nb # nbclient -nest-asyncio==1.5.6 - # via - # ipykernel - # jupyter-client - # nbclient -numpy==1.23.5 +nest-asyncio==1.6.0 + # via ipykernel +nh3==0.2.17 + # via readme-renderer +numpy==1.26.4 # via + # category-encoders # contourpy + # lightgbm # matplotlib # pandas + # patsy # scikit-learn # scipy # seaborn + # statsmodels + # xgboost packaging==21.3 # via # build + # dask + # distributed + # docker # ipykernel + # marshmallow # matplotlib + # prefect # pytest # sphinx -pandas==1.5.2 + # statsmodels +pandas==2.2.2 # via - # bayte (setup.cfg) + # category-encoders # seaborn -parso==0.8.3 + # statsmodels +parso==0.8.4 # via jedi -pathlib2==2.3.7.post1 - # via bumpver -pathspec==0.10.2 - # via black -pep517==0.13.0 - # via build -pexpect==4.8.0 +partd==1.4.1 + # via dask +patsy==0.5.6 + # via + # category-encoders + # statsmodels +pendulum==3.0.0 + # via prefect +pexpect==4.9.0 # via ipython -pickleshare==0.7.5 - # via ipython -pillow==9.3.0 +pillow==10.3.0 # via matplotlib -pip-tools==6.10.0 - # via bayte (setup.cfg) -pkginfo==1.8.3 +pip==24.0 + # via pip-tools +pip-tools==7.4.1 +pkginfo==1.10.0 # via twine -platformdirs==2.5.4 - # via - # black - # jupyter-core -pluggy==1.0.0 +platformdirs==4.2.0 + # via jupyter-core +pluggy==1.4.0 # via pytest -prompt-toolkit==3.0.33 +prefect==0.15.10 +prompt-toolkit==3.0.43 # via ipython -psutil==5.9.4 - # via ipykernel +psutil==5.9.8 + # via + # distributed + # ipykernel 
ptyprocess==0.7.0 # via pexpect pure-eval==0.2.2 @@ -234,17 +285,11 @@ pybtex==0.24.0 # via # pybtex-docutils # sphinxcontrib-bibtex -pybtex-docutils==1.0.2 +pybtex-docutils==1.0.3 # via sphinxcontrib-bibtex -pycodestyle==2.10.0 - # via flake8 -pycparser==2.21 +pycparser==2.22 # via cffi -pydocstyle==6.1.1 - # via bayte (setup.cfg) -pyflakes==3.0.0 - # via flake8 -pygments==2.13.0 +pygments==2.17.2 # via # furo # ipython @@ -252,123 +297,169 @@ pygments==2.13.0 # rich # sphinx # sphinx-tabs -pyparsing==3.0.9 +pyparsing==3.1.2 # via # matplotlib # packaging -pyrsistent==0.19.2 - # via jsonschema -pytest==7.2.0 +pyproject-hooks==1.0.0 # via - # bayte (setup.cfg) - # pytest-cov -pytest-cov==4.0.0 - # via bayte (setup.cfg) -python-dateutil==2.8.2 + # build + # pip-tools +pytest==8.1.1 + # via pytest-cov +pytest-cov==5.0.0 +python-box==7.1.1 + # via prefect +python-dateutil==2.9.0.post0 # via + # croniter # jupyter-client # matplotlib # pandas -pytz==2022.6 + # pendulum + # prefect + # time-machine +python-slugify==8.0.4 + # via + # lazyscribe + # prefect +pytz==2024.1 # via - # babel # pandas -pyyaml==6.0 + # prefect +pyyaml==6.0.1 # via + # dask + # distributed # jupyter-cache # myst-nb # myst-parser + # prefect # pybtex -pyzmq==24.0.1 +pyzmq==26.0.1 # via # ipykernel # jupyter-client -readme-renderer==37.3 +readme-renderer==43.0 # via twine -requests==2.28.1 +referencing==0.34.0 # via + # jsonschema + # jsonschema-specifications +requests==2.26.0 + # via + # docker + # prefect # requests-toolbelt # sphinx # twine -requests-toolbelt==0.10.1 +requests-toolbelt==1.0.0 # via twine rfc3986==2.0.0 # via twine -rich==12.6.0 +rich==13.7.1 # via twine -scikit-learn==1.1.3 - # via bayte (setup.cfg) -scipy==1.9.3 - # via scikit-learn -seaborn==0.12.1 - # via bayte (setup.cfg) +rpds-py==0.18.0 + # via + # jsonschema + # referencing +ruff==0.3.7 +scikit-learn==1.4.2 + # via + # category-encoders + # lightgbm +scipy==1.13.0 + # via + # category-encoders + # lightgbm + # scikit-learn + # statsmodels + # xgboost +seaborn==0.13.2 secretstorage==3.3.3 # via keyring +setuptools==69.5.1 + # via pip-tools six==1.16.0 # via # asttokens - # bleach - # latexcodec - # pathlib2 + # patsy # pybtex # python-dateutil snowballstemmer==2.2.0 - # via - # pydocstyle - # sphinx -soupsieve==2.3.2.post1 + # via sphinx +sortedcontainers==2.4.0 + # via distributed +soupsieve==2.5 # via beautifulsoup4 -sphinx==5.3.0 +sphinx==7.3.7 # via - # bayte (setup.cfg) # furo # myst-nb # myst-parser # sphinx-basic-ng # sphinx-tabs # sphinxcontrib-bibtex -sphinx-basic-ng==1.0.0b1 +sphinx-basic-ng==1.0.0b2 # via furo -sphinx-tabs==3.4.1 - # via bayte (setup.cfg) -sphinxcontrib-applehelp==1.0.2 +sphinx-tabs==3.4.5 +sphinxcontrib-applehelp==1.0.8 # via sphinx -sphinxcontrib-bibtex==2.5.0 - # via bayte (setup.cfg) -sphinxcontrib-devhelp==1.0.2 +sphinxcontrib-bibtex==2.6.2 +sphinxcontrib-devhelp==1.0.6 # via sphinx -sphinxcontrib-htmlhelp==2.0.0 +sphinxcontrib-htmlhelp==2.0.5 # via sphinx sphinxcontrib-jsmath==1.0.1 # via sphinx -sphinxcontrib-qthelp==1.0.3 +sphinxcontrib-qthelp==1.0.7 # via sphinx -sphinxcontrib-serializinghtml==1.1.5 +sphinxcontrib-serializinghtml==1.1.10 # via sphinx -sqlalchemy==1.4.44 +sqlalchemy==2.0.29 # via jupyter-cache -stack-data==0.6.1 +stack-data==0.6.3 # via ipython +statsmodels==0.14.2 + # via category-encoders tabulate==0.9.0 - # via jupyter-cache -threadpoolctl==3.1.0 + # via + # jupyter-cache + # prefect +tblib==3.0.0 + # via distributed +text-unidecode==1.3 + # via python-slugify +threadpoolctl==3.4.0 # via 
scikit-learn +time-machine==2.14.1 + # via pendulum toml==0.10.2 - # via bumpver + # via + # bumpver + # prefect tomli==2.0.1 # via - # black # build # coverage # mypy - # pep517 + # pip-tools + # pyproject-hooks # pytest -tornado==6.2 + # sphinx +toolz==0.12.1 # via + # dask + # distributed + # partd +tornado==6.4 + # via + # distributed # ipykernel # jupyter-client -traitlets==5.5.0 +traitlets==5.14.3 # via + # comm # ipykernel # ipython # jupyter-client @@ -376,29 +467,34 @@ traitlets==5.5.0 # matplotlib-inline # nbclient # nbformat -twine==4.0.1 - # via bayte (setup.cfg) -typing-extensions==4.4.0 +twine==5.0.0 +typing-extensions==4.11.0 # via - # black + # ipython # mypy # myst-nb - # myst-parser -urllib3==1.26.12 + # sqlalchemy +tzdata==2024.1 + # via + # pandas + # pendulum +urllib3==1.26.18 # via + # distributed + # docker + # prefect # requests # twine -wcwidth==0.2.5 +wcwidth==0.2.13 # via prompt-toolkit -webencodings==0.5.1 - # via bleach -wheel==0.38.4 +wheel==0.43.0 # via - # bayte (setup.cfg) + # lightgbm # pip-tools -zipp==3.10.0 - # via importlib-metadata - -# The following packages are considered to be unsafe in a requirements file: -# pip -# setuptools +xgboost==1.5.1 +zict==3.0.0 + # via distributed +zipp==3.18.1 + # via + # importlib-metadata + # importlib-resources diff --git a/docs/conf.py b/docs/conf.py index a40be94..9ea9eb0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -19,7 +19,8 @@ # import os import sys -sys.path.insert(0, os.path.abspath('..')) + +sys.path.insert(0, os.path.abspath("..")) # -- General configuration --------------------------------------------- @@ -30,28 +31,28 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.viewcode', - 'myst_nb', - 'sphinx.ext.mathjax', - 'sphinxcontrib.bibtex', - 'sphinx_tabs.tabs', + "sphinx.ext.autodoc", + "sphinx.ext.viewcode", + "myst_nb", + "sphinx.ext.mathjax", + "sphinxcontrib.bibtex", + "sphinx_tabs.tabs", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'Bayesian target encoding' +project = "Bayesian target encoding" copyright = "2021, Akshay Gupta" author = "Akshay Gupta" @@ -60,7 +61,7 @@ # the built documents. # # The short X.Y version. -version = "0.1.1" +version = "0.2.1" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -72,21 +73,21 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False jupyter_execute_notebooks = "off" -bibtex_bibfiles = ['refs.bib'] +bibtex_bibfiles = ["refs.bib"] # -- Options for HTML output ------------------------------------------- # The theme to use for HTML and HTML Help pages. 
See the documentation for # a list of builtin themes. # -html_theme = 'furo' +html_theme = "furo" # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the @@ -97,13 +98,13 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # -- Options for HTMLHelp output --------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'baytedoc' +htmlhelp_basename = "baytedoc" # -- Options for LaTeX output ------------------------------------------ @@ -112,15 +113,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -130,9 +128,13 @@ # (source start file, target name, title, author, documentclass # [howto, manual, or own class]). latex_documents = [ - (master_doc, 'bayte.tex', - 'Bayesian target encoding Documentation', - 'Akshay Gupta', 'manual'), + ( + master_doc, + "bayte.tex", + "Bayesian target encoding Documentation", + "Akshay Gupta", + "manual", + ), ] @@ -141,9 +143,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, 'bayte', - 'Bayesian target encoding Documentation', - [author], 1) + (master_doc, "bayte", "Bayesian target encoding Documentation", [author], 1) ] @@ -153,10 +153,13 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'bayte', - 'Bayesian target encoding Documentation', - author, - 'bayte', - 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "bayte", + "Bayesian target encoding Documentation", + author, + "bayte", + "One line description of project.", + "Miscellaneous", + ), ] diff --git a/experiments/flow.py b/experiments/flow.py index 67402a9..02d611e 100644 --- a/experiments/flow.py +++ b/experiments/flow.py @@ -1,10 +1,10 @@ """Basic model fit and score flow.""" from lazyscribe.prefect import LazyProject -from prefect import Flow, Parameter, case +from prefect import Flow, case from prefect.tasks.control_flow import merge -from . import tasks, OUTPUT_DIR +from experiments import OUTPUT_DIR, tasks def gen_flow( @@ -15,7 +15,7 @@ def gen_flow( marginal: bool = False, residual: bool = False, seed: int = 42, - n_estimators: int = 0 + n_estimators: int = 0, ) -> Flow: """Create a model fit and score flow. 
@@ -63,12 +63,13 @@ def gen_flow( meta = tasks.read_metadata(dataset=dataset) data = tasks.read_data(metadata=meta) - estimator = tasks.init_model(algorithm=algorithm, metadata=meta, seed=seed) # Split and encode supervised = tasks.check_supervised(algorithm=encoder) - encoder_object = tasks.init_encoder(algorithm=encoder, metadata=meta, residual=residual) + encoder_object = tasks.init_encoder( + algorithm=encoder, metadata=meta, residual=residual + ) train, test = tasks.split(data=data, metadata=meta, seed=seed) with case(supervised, True): fitted_encoder_super = tasks.fit_encoder( @@ -77,7 +78,7 @@ def gen_flow( encoder=encoder_object, estimator=estimator, marginal=marginal, - residual=residual + residual=residual, ) with case(supervised, False): fitted_encoder_unsup = tasks.fit_encoder( @@ -99,10 +100,7 @@ def gen_flow( finaltest = tasks.drop_nulls(data=test_transformed) # Fit and score fitted_estimator_std = tasks.train( - data=finaltrain, - metadata=meta, - estimator=estimator, - seed=seed + data=finaltrain, metadata=meta, estimator=estimator, seed=seed ) score_std = tasks.score_model( data=finaltest, metadata=meta, estimator=fitted_estimator_std diff --git a/experiments/tasks/__init__.py b/experiments/tasks/__init__.py index d764315..bff1a59 100644 --- a/experiments/tasks/__init__.py +++ b/experiments/tasks/__init__.py @@ -2,28 +2,33 @@ from typing import List -from .data import drop_nulls, split -from .encode import check_supervised, init_encoder, fit_encoder, transform -from .io import read_data, read_metadata -from .model import check_ensemble, init_model, train -from .plots import render_sample_perf_plot, render_comparison_perf_plot -from .projects import project_to_df -from .scoring import score_model +from experiments.tasks.data import drop_nulls, split +from experiments.tasks.encode import ( + check_supervised, + fit_encoder, + init_encoder, + transform, +) +from experiments.tasks.io import read_data, read_metadata +from experiments.tasks.model import check_ensemble, init_model, train +from experiments.tasks.plots import render_comparison_perf_plot, render_sample_perf_plot +from experiments.tasks.projects import project_to_df +from experiments.tasks.scoring import score_model __all__: List[str] = [ - "drop_nulls", - "split", + "check_ensemble", "check_supervised", - "init_encoder", + "drop_nulls", "fit_encoder", - "transform", + "init_encoder", + "init_model", + "project_to_df", "read_data", "read_metadata", - "check_ensemble", - "init_model", - "train", - "render_sample_perf_plot", "render_comparison_perf_plot", - "project_to_df", - "score_model" + "render_sample_perf_plot", + "score_model", + "split", + "train", + "transform", ] diff --git a/experiments/tasks/data.py b/experiments/tasks/data.py index b4b65ef..6afa3a8 100644 --- a/experiments/tasks/data.py +++ b/experiments/tasks/data.py @@ -51,5 +51,5 @@ def split( return train_test_split( data[metadata["numeric"] + metadata["nominal"] + [metadata["target"]]], test_size=0.2, - random_state=seed + random_state=seed, ) diff --git a/experiments/tasks/encode.py b/experiments/tasks/encode.py index 397831d..2d20d46 100644 --- a/experiments/tasks/encode.py +++ b/experiments/tasks/encode.py @@ -2,19 +2,21 @@ from typing import Dict -from bayte import BayesianTargetEncoder +import numpy as np +import pandas as pd +import prefect from category_encoders import ( CountEncoder, GLMMEncoder, JamesSteinEncoder, TargetEncoder, ) -import numpy as np -import pandas as pd -import prefect from prefect import task from sklearn.preprocessing 
import OrdinalEncoder + +from bayte import BayesianTargetEncoder + + @task(name="Check supervised") def check_supervised(algorithm: str) -> bool: """Check if the encoder is supervised. @@ -76,7 +78,7 @@ def fit_encoder( encoder, estimator=None, marginal: bool = False, - residual: bool = False + residual: bool = False, ): """Fit the encoder. diff --git a/experiments/tasks/io.py b/experiments/tasks/io.py index db6be5e..86963e7 100644 --- a/experiments/tasks/io.py +++ b/experiments/tasks/io.py @@ -7,7 +7,7 @@ from prefect import task from scipy.io.arff import loadarff -from .. import DATA_DIR, METADATA_DIR +from experiments import DATA_DIR, METADATA_DIR @task(name="Read Metadata") @@ -54,8 +54,10 @@ def read_data(metadata: Dict) -> pd.DataFrame: else: raise NotImplementedError(f"File type `{fpath.suffix}` not supported.") - if metadata["dataset_type"] == "classification": - if pd.api.types.infer_dtype(data[metadata["target"]]) == "bytes": - data[metadata["target"]] = data[metadata["target"]].astype(int) + if ( + metadata["dataset_type"] == "classification" + and pd.api.types.infer_dtype(data[metadata["target"]]) == "bytes" + ): + data[metadata["target"]] = data[metadata["target"]].astype(int) return data diff --git a/experiments/tasks/model.py b/experiments/tasks/model.py index ceb6d1d..36b7485 100644 --- a/experiments/tasks/model.py +++ b/experiments/tasks/model.py @@ -3,15 +3,13 @@ from typing import Dict import pandas as pd -from prefect import task from lightgbm import LGBMClassifier, LGBMRegressor -from sklearn.ensemble import ( - GradientBoostingClassifier, - GradientBoostingRegressor -) +from prefect import task +from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor from xgboost import XGBClassifier, XGBRegressor -from bayte import BayesianTargetRegressor, BayesianTargetClassifier +from bayte import BayesianTargetClassifier, BayesianTargetRegressor + @task(name="Check if ensemble") def check_ensemble(n_estimators: int) -> bool: @@ -77,7 +75,7 @@ def train( estimator, encoder=None, n_estimators: int = 0, - seed: int = 42 + seed: int = 42, ): """Fit the estimator. @@ -108,20 +106,20 @@ def train( - base_estimator=estimator, + estimator=estimator, encoder=encoder, n_estimators=n_estimators, - random_state=seed + random_state=seed, ) else: estimator_ = BayesianTargetClassifier( - base_estimator=estimator, + estimator=estimator, encoder=encoder, n_estimators=n_estimators, - random_state=seed + random_state=seed, ) estimator_.fit( data[features], data[metadata["target"]], - categorical_feature=metadata["nominal"] + categorical_feature=metadata["nominal"], ) else: estimator_ = estimator.fit(data[features], data[metadata["target"]]) diff --git a/experiments/tasks/plots.py b/experiments/tasks/plots.py index 6992425..f8bb83d 100644 --- a/experiments/tasks/plots.py +++ b/experiments/tasks/plots.py @@ -5,12 +5,11 @@ import matplotlib.pyplot as plt import matplotlib.ticker as ticker import pandas as pd -from prefect import task import seaborn as sns +from prefect import task from bayte.plots import visualize_target_dist - -from .. import OUTPUT_DIR +from experiments import OUTPUT_DIR @task(name="Visualize target distribution") @@ -44,6 +43,7 @@ def render_sample_perf_plot(data: pd.DataFrame): data : pd.DataFrame The output from ``create_plot_df``.
""" + def _single_plot(name: str, data: pd.DataFrame): """Create a single plot.""" with sns.axes_style("dark"): @@ -51,7 +51,9 @@ def _single_plot(name: str, data: pd.DataFrame): non_sample = data.loc[ data[("parameters", "n_estimators")] == 0, ("metrics", "score") ].mean() - data["score-change"] = (data[("metrics", "score")] - non_sample) / abs(non_sample) + data["score-change"] = (data[("metrics", "score")] - non_sample) / abs( + non_sample + ) fig, ax = plt.subplots(figsize=(12, 8)) sns.violinplot( x=("parameters", "n_estimators"), @@ -60,11 +62,11 @@ def _single_plot(name: str, data: pd.DataFrame): data=data[data[("parameters", "n_estimators")] > 0], palette="flare", inner="quartile", - ax=ax + ax=ax, ).set( title=f"Effect of number of samples on performance for {name}", xlabel="Number of estimators", - ylabel="Score change vs. no sampling (higher is better)" + ylabel="Score change vs. no sampling (higher is better)", ) plt.legend(title="Model") ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1)) @@ -72,7 +74,6 @@ def _single_plot(name: str, data: pd.DataFrame): return fig - grouped = data.groupby(("parameters", "dataset")) for name, group in grouped: fig = _single_plot(name, group) @@ -86,29 +87,32 @@ def _single_plot(name: str, data: pd.DataFrame): @task(name="Visualize comparison performance") def render_comparison_perf_plot(data: pd.DataFrame): """Render performance plots for the comparison experiment.""" + def _single_plot(name: str, data: pd.DataFrame): with sns.axes_style("dark"): # Get the baseline performance mask = ( (data[("parameters", "n_estimators")] == 0) - & (data[("parameters", "marginal")] == False) - & (data[("parameters", "residual")] == False) + & (not data[("parameters", "marginal")]) + & (not data[("parameters", "residual")]) & (data[("parameters", "encoder")] == "bayes") ) baseline = data.loc[mask, ("metrics", "score")].mean() - data["score-change"] = (data[("metrics", "score")] - baseline) / abs(baseline) - data.loc[ - data[("parameters", "marginal")] == True, ("parameters", "encoder") - ] += " (m)" - data.loc[ - data[("parameters", "residual")] == True, ("parameters", "encoder") - ] += " (r)" + data["score-change"] = (data[("metrics", "score")] - baseline) / abs( + baseline + ) + data.loc[data[("parameters", "marginal")], ("parameters", "encoder")] += ( + " (m)" + ) + data.loc[data[("parameters", "residual")], ("parameters", "encoder")] += ( + " (r)" + ) data.loc[ ( (data[("parameters", "n_estimators")] > 0) & (data[("parameters", "encoder")].str.startswith("bayes")) ), - ("parameters", "encoder") + ("parameters", "encoder"), ] += " (" + data[("parameters", "n_estimators")].astype(str) + ")" fig, ax = plt.subplots(figsize=(12, 8)) @@ -119,11 +123,11 @@ def _single_plot(name: str, data: pd.DataFrame): data=data[~mask], palette="flare", inner="quartile", - ax=ax + ax=ax, ).set( title=f"Effect of encoder on performance for {name}", xlabel="Encoder (number of samples)", - ylabel="Score change vs. standard bayes (higher is better)" + ylabel="Score change vs. 
standard bayes (higher is better)", ) plt.legend(title="Model") ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1)) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..130edb3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,174 @@ +[project] +name = "bayte" +requires-python = ">=3.8.0" +description = "Bayesian target encoding with scikit-learn and scipy" +authors = [ + { name = "Akshay Gupta", email="akgcodes@gmail.com" } +] +license = { file = "LICENSE" } +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", +] +dependencies = ["pandas<=2.2.2,>=1.0.0", "scikit-learn<=1.4.2,>=1.0.0"] + +dynamic = ["readme", "version"] + +[project.optional-dependencies] +build = [ + "build", + "bumpver", + "twine", + "wheel", +] +docs = [ + "furo", + "myst-nb", + "sphinx", + "sphinxcontrib-bibtex", + "sphinx-tabs", +] +experiments = [ + "category_encoders==2.3.0", + "lazyscribe==0.3.0", + "lightgbm==3.3.1", + "seaborn<=0.13.2,>=0.11.0", + "prefect==0.15.10", + "xgboost==1.5.1", +] +plots = [ + "seaborn<=0.13.2,>=0.11.0" +] +qa = [ + "ruff==0.3.7", + "mypy", + "pip-tools", +] +tests = [ + "pytest", + "pytest-cov", +] +dev = [ + "bayte[build]", + "bayte[docs]", + "bayte[experiments]", + "bayte[plots]", + "bayte[qa]", + "bayte[tests]", +] + +[project.urls] +documentation = "https://bayte.readthedocs.io/" +repository = "https://github.com/ak-gupta/bayte" + +# Build system +[build-system] +requires = ["setuptools>=64.0.0"] +build-backend = "setuptools.build_meta" + +############################################################################## +# Setuptools configuration +############################################################################## + +[tool.setuptools] +include-package-data = true +zip-safe = false +packages = ["bayte"] + +[tool.setuptools.dynamic] +version = { attr = "bayte._meta.__version__" } +readme = { file = ["README.md"], content-type = "text/markdown" } + +############################################################################## +# Tooling +############################################################################## + +# BUMPVER -------------------------------------------------------------------- + +[bumpver] +current_version = "0.2.1" +version_pattern = "MAJOR.MINOR.PATCH" + +[bumpver.file_patterns] +"pyproject.toml" = [ + 'current_version = "{version}"', +] +"bayte/_meta.py" = [ + '__version__ = "{version}"', +] +"docs/conf.py" = [ + 'version = "{version}"', +] + +# EDGETEST ------------------------------------------------------------------- + +[edgetest.envs.core] +python_version = "3.9" +extras = ["plots", "tests"] +upgrade = ["pandas", "scikit-learn", "seaborn"] +command = "pytest tests" + +# RUFF ----------------------------------------------------------------------- + +[tool.ruff] +extend-include = ["*.ipynb"] +target-version = "py38" + +[tool.ruff.lint] +preview = true +ignore-init-module-imports = true +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "D", # pydocstyle + "I", # isort + "UP", # pyupgrade + "B", # flake8-bugbear + "C", # flake8-comprehensions + "T20", # flake8-print + "TID252", # flake8-tidy-imports ban relative imports + "SIM", # flake8-simplify 
+ "LOG", # flake8-logging + "RUF", # Ruff errors +] +ignore = [ + "C901", # Add back in later + "E111", # Check indentation level. Using formatter instead. + "E114", # Check indentation level. Using formatter instead. + "E117", # Check indentation level. Using formatter instead. + "E203", # Check whitespace. Using formatter instead. + "E501", # Line too long. Using formatter instead. + "D206", # Docstring indentation. Using formatter instead. + "D300", # Use triple single quotes. Using formatter instead. + "SIM108", # Use ternary operator instead of if-else blocks. + "SIM105", # Use ``contextlib.suppress(FileNotFoundError)`` insetad of try - execpt - pass. + "UP035", # ``typing.x`` is deprecated, use ``x`` instead + "UP006", # ``typing.x`` is deprecated, use ``x`` instead +] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402"] +"**/{tests,docs}/*" = ["E402", "D", "F841", "ARG"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.lint.pydocstyle] +convention = "numpy" + +# MYPY ----------------------------------------------------------------------- + +[tool.mypy] +python_version = 3.9 +warn_return_any = true +warn_unused_configs = true +ignore_missing_imports = true +allow_redefinition = true diff --git a/requirements.txt b/requirements.txt index 77abb09..5edec84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,27 +1,23 @@ -# -# This file is autogenerated by pip-compile with python 3.9 -# To update, run: -# -# pip-compile setup.cfg -# -joblib==1.2.0 +# This file was autogenerated by uv via the following command: +# uv pip compile -o requirements.txt pyproject.toml +joblib==1.4.0 # via scikit-learn -numpy==1.23.5 +numpy==1.26.4 # via # pandas # scikit-learn # scipy -pandas==1.5.2 - # via bayte (setup.cfg) -python-dateutil==2.8.2 +pandas==2.2.2 +python-dateutil==2.9.0.post0 # via pandas -pytz==2022.6 +pytz==2024.1 # via pandas -scikit-learn==1.1.3 - # via bayte (setup.cfg) -scipy==1.9.3 +scikit-learn==1.4.2 +scipy==1.13.0 # via scikit-learn six==1.16.0 # via python-dateutil -threadpoolctl==3.1.0 +threadpoolctl==3.4.0 # via scikit-learn +tzdata==2024.1 + # via pandas diff --git a/run_experiments.py b/run_experiments.py index b2fe351..13b37d3 100644 --- a/run_experiments.py +++ b/run_experiments.py @@ -1,6 +1,5 @@ """Run experiments. - I recommend suppressing logging from Prefect. 
```console @@ -12,6 +11,7 @@ from experiments.flow import gen_flow + @click.group() def cli(): """CLI group.""" @@ -29,28 +29,23 @@ def cli(): "flight-delay-usa-dec-2017", "particulate-matter-ukair-2017", "churn", - "click_prediction_small" + "click_prediction_small", ] ), - help="The dataset" + help="The dataset", ) @click.option( "--algorithm", type=click.Choice(["xgboost", "lightgbm", "gbm"]), - help="The algorithm" + help="The algorithm", ) @click.option( "--n-estimators", type=click.INT, multiple=True, - default=[0, 25, 50, 75, 100, 125, 150, 175, 200] -) -@click.option( - "--seeds", - type=click.INT, - multiple=True, - default=[5, 10, 16, 42, 44] + default=[0, 25, 50, 75, 100, 125, 150, 175, 200], ) +@click.option("--seeds", type=click.INT, multiple=True, default=[5, 10, 16, 42, 44]) def sample(dataset, algorithm, n_estimators, seeds): """Run the sampling experiment.""" for n_est in n_estimators: @@ -61,7 +56,7 @@ def sample(dataset, algorithm, n_estimators, seeds): f"Running experiment for {dataset} with algorithm {algorithm}, " f"{n_est} estimators, and seed {seed}" ), - fg="green" + fg="green", ) ) flow = gen_flow( @@ -70,7 +65,7 @@ def sample(dataset, algorithm, n_estimators, seeds): encoder="bayes", algorithm=algorithm, seed=seed, - n_estimators=n_est + n_estimators=n_est, ) _ = flow.run() if not _.is_successful(): @@ -90,41 +85,30 @@ def sample(dataset, algorithm, n_estimators, seeds): "flight-delay-usa-dec-2017", "particulate-matter-ukair-2017", "churn", - "click_prediction_small" + "click_prediction_small", ] ), - help="The dataset" + help="The dataset", ) @click.option( "--algorithm", type=click.Choice(["xgboost", "lightgbm", "gbm"]), - help="The algorithm" + help="The algorithm", ) @click.option( "--encoder", - type=click.Choice(["frequency", "glmm", "james-stein", "integer", "target", "bayes"]), - help="Categorical encoder" -) -@click.option( - "--n-estimators", - type=click.INT, - default=0 -) -@click.option( - "--seeds", - type=click.INT, - multiple=True, - default=[5, 10, 16, 42, 44] + type=click.Choice( + ["frequency", "glmm", "james-stein", "integer", "target", "bayes"] + ), + help="Categorical encoder", ) +@click.option("--n-estimators", type=click.INT, default=0) +@click.option("--seeds", type=click.INT, multiple=True, default=[5, 10, 16, 42, 44]) @click.option( - "--marginal", - is_flag=True, - help="Whether or not to use marginal encoding" + "--marginal", is_flag=True, help="Whether or not to use marginal encoding" ) @click.option( - "--residual", - is_flag=True, - help="Whether or not to use residual encoding" + "--residual", is_flag=True, help="Whether or not to use residual encoding" ) def compare(dataset, algorithm, encoder, n_estimators, seeds, marginal, residual): """Run the comparison experiment.""" @@ -135,7 +119,7 @@ def compare(dataset, algorithm, encoder, n_estimators, seeds, marginal, residual f"Running experiment for {dataset} with algorithm {algorithm}, " f"encoder {encoder}, {n_estimators} estimators, and seed {seed}." 
), - fg="green" + fg="green", ) ) flow = gen_flow( diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 893b121..0000000 --- a/setup.cfg +++ /dev/null @@ -1,110 +0,0 @@ -[metadata] -name = bayte -version = attr: bayte._meta.__version__ -description = Bayesian target encoding with scikit-learn and scipy -long_description = file: README.md -long_description_content_type = text/markdown -author = Akshay Gupta -author_email = akgcodes@gmail.com -url = https://github.com/ak-gupta/bayte -python_requires = - >=3.7 -license = MIT license -classifiers = - Development Status :: 3 - Alpha - License :: OSI Approved :: MIT License - Natural Language :: English - Programming Language :: Python :: 3 - Programming Language :: Python :: 3 :: Only - Programming Language :: Python :: 3.7 - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Programming Language :: Python :: 3.10 - -[options] -zip_safe = False -include_package_data = True -packages = find: -install_requires = - pandas<=1.5.2,>=1.0.0 - scikit-learn<=1.1.3,>=1.0.0 - -[options.extras_require] -build = - build - bumpver - twine - wheel -docs = - furo - myst-nb - sphinx - sphinxcontrib-bibtex - sphinx-tabs -experiments = - category_encoders==2.3.0 - lazyscribe==0.3.0 - lightgbm==3.3.1 - %(plots)s - prefect==0.15.10 - xgboost==1.5.1 -plots = - seaborn<=0.12.1,>=0.11.0 -qa = - black - flake8 - mypy - pip-tools - pydocstyle -tests = - pytest - pytest-cov -dev = - %(build)s - %(plots)s - %(docs)s - %(qa)s - %(tests)s - - -[options.packages.find] -exclude = - experiments - experiments.* - tests - tests.* - -[bumpver] -current_version = "0.1.1" -version_pattern = "MAJOR.MINOR.PATCH" - -[bumpver:file_patterns] -setup.cfg = - current_version = "{version}" -bayte/_meta.py = - __version__ = "{version}" -docs/conf.py = - version = "{version}" - -[flake8] -max-line-length = 100 -ignore = E203,W503 - -[mypy] -python_version = 3.8 -warn_return_any = True -warn_unused_configs = True -ignore_missing_imports = True -allow_redefinition = True - -[edgetest.envs.core] -python_version = 3.9 -extras = - plots - tests -command = - pytest tests -upgrade = - pandas - scikit-learn - seaborn diff --git a/setup.py b/setup.py deleted file mode 100644 index fca15eb..0000000 --- a/setup.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Package installation. 
Only included to allow editable installs."""
-
-from setuptools import setup
-
-setup()
diff --git a/tests/test_encoder.py b/tests/test_encoder.py
index 48ac39f..26aaed9 100644
--- a/tests/test_encoder.py
+++ b/tests/test_encoder.py
@@ -1,20 +1,22 @@
 """Test the encoder."""

 import numpy as np
-from numpy.testing import assert_allclose, assert_array_equal
 import pandas as pd
 import pytest
 import scipy.stats
+from numpy.testing import assert_allclose, assert_array_equal
 from sklearn.utils.estimator_checks import check_estimator

-from bayte.encoder import (
-    BayesianTargetEncoder,
-    _init_prior
-)
+from bayte.encoder import BayesianTargetEncoder, _init_prior

-def test_encoder_validity():
+
+@pytest.mark.parametrize(
+    "estimator,check",
+    list(check_estimator(BayesianTargetEncoder(dist="bernoulli"), generate_only=True)),
+)
+def test_encoder_validity(estimator, check):
     """Test the validity against the scikit-learn API."""
-    check_estimator(BayesianTargetEncoder(dist="bernoulli"))
+    check(estimator)


 def test_init_prior_bernoulli():
@@ -55,7 +57,7 @@ def test_init_prior_gamma():
     y = scipy.stats.gamma(3).rvs(size=1000)

     out = _init_prior("gamma", y)
-    assert np.abs(out[0]/1000 - 3) <= 1
+    assert np.abs(out[0] / 1000 - 3) <= 1
     assert out[1] == 0
     assert out[2] == np.sum(y)

@@ -66,7 +68,7 @@ def test_init_prior_invgamma():
     y = scipy.stats.invgamma(5).rvs(size=1000)

     out = _init_prior("invgamma", y)
-    assert np.abs(out[0]/1000 - 5) <= 1
+    assert np.abs(out[0] / 1000 - 5) <= 1
     assert out[1] == 0
     assert out[2] == np.sum(y)

@@ -74,23 +76,18 @@ def test_init_prior_invgamma():
 def test_fit_invalid_dist():
     """Test raising an error with an invalid likelihood."""
     df = pd.DataFrame(
-        {
-            "x1": [0, 1, 2, 1, 0, 1, 2, 3, 3, 2],
-            "y": [0, 1, 1, 0, 1, 0, 0, 1, 1, 0]
-        }
+        {"x1": [0, 1, 2, 1, 0, 1, 2, 3, 3, 2], "y": [0, 1, 1, 0, 1, 0, 0, 1, 1, 0]}
     )

     encoder = BayesianTargetEncoder(dist="fake")
     with pytest.raises(NotImplementedError):
         encoder.fit(df[["x1"]], df["y"])

+
 def test_bernoulli_fit():
     """Test fitting the encoder with a binary classification task."""
     df = pd.DataFrame(
-        {
-            "x1": [0, 1, 2, 1, 0, 1, 2, 3, 3, 2],
-            "y": [0, 1, 1, 0, 1, 0, 0, 1, 1, 0]
-        }
+        {"x1": [0, 1, 2, 1, 0, 1, 2, 3, 3, 2], "y": [0, 1, 1, 0, 1, 0, 0, 1, 1, 0]}
     )

     encoder = BayesianTargetEncoder(dist="bernoulli")
@@ -103,13 +100,8 @@ def test_bernoulli_fit():
     assert_array_equal(
         encoder.posterior_params_[0],
         np.array(
-            [
-                (1.5, 1.5, 0, 1),
-                (1.5, 2.5, 0, 1),
-                (1.5, 2.5, 0, 1),
-                (2.5, 0.5, 0, 1)
-            ]
-        )
+            [(1.5, 1.5, 0, 1), (1.5, 2.5, 0, 1), (1.5, 2.5, 0, 1), (2.5, 0.5, 0, 1)]
+        ),
     )

     # Test parallel
@@ -124,7 +116,7 @@ def test_multinomial_fit():
     df = pd.DataFrame(
         {
             "x1": [0, 1, 2, 2, 2, 1, 1, 3, 3, 1, 2, 3, 3, 2, 0, 0, 0, 1, 0, 1],
-            "y": [0, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 2, 1, 1, 0, 1, 1, 0, 2]
+            "y": [0, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 2, 1, 1, 0, 1, 1, 0, 2],
         }
     )

@@ -135,14 +127,7 @@ def test_multinomial_fit():
     assert len(encoder.posterior_params_) == 1
     assert_array_equal(
         encoder.posterior_params_[0],
-        np.array(
-            [
-                (9, 11, 5),
-                (7, 11, 8),
-                (7, 13, 5),
-                (7, 10, 7)
-            ]
-        )
+        np.array([(9, 11, 5), (7, 11, 8), (7, 13, 5), (7, 10, 7)]),
     )

     # Test parallel
@@ -155,10 +140,7 @@ def test_multinomial_fit():
 def test_multinomial_fit_missing_classes():
     """Test multinomial fit with missing target levels in categorical."""
     df = pd.DataFrame(
-        {
-            "x1": [0, 1, 0, 1, 1, 1, 0, 2, 1, 2],
-            "y": [0, 0, 1, 1, 0, 2, 2, 1, 0, 2]
-        }
+        {"x1": [0, 1, 0, 1, 1, 1, 0, 2, 1, 2], "y": [0, 0, 1, 1, 0, 2, 2, 1, 0, 2]}
     )

     encoder = BayesianTargetEncoder(dist="multinomial")
@@ -167,24 +149,14 @@ def test_multinomial_fit_missing_classes():
     assert encoder.prior_params_ == (4, 3, 3)
     assert len(encoder.posterior_params_) == 1
     assert_array_equal(
-        encoder.posterior_params_[0],
-        np.array(
-            [
-                (5, 4, 4),
-                (7, 4, 4),
-                (4, 4, 4)
-            ]
-        )
+        encoder.posterior_params_[0], np.array([(5, 4, 4), (7, 4, 4), (4, 4, 4)])
     )


 def test_transform_bernoulli():
     """Test transforming with a bernoulli likelihood."""
     df = pd.DataFrame(
-        {
-            "x1": [0, 1, 2, 1, 0, 1, 2, 3, 3, 2],
-            "y": [0, 1, 1, 0, 1, 0, 0, 1, 1, 0]
-        }
+        {"x1": [0, 1, 2, 1, 0, 1, 2, 3, 3, 2], "y": [0, 1, 1, 0, 1, 0, 0, 1, 1, 0]}
     )

     encoder = BayesianTargetEncoder(dist="bernoulli")
@@ -197,7 +169,6 @@ def test_transform_bernoulli():

     assert_allclose(out.ravel(), expected, rtol=1e-5)

-
     # Test parallel transform
     encoder.set_params(n_jobs=2)
     out = encoder.transform(df[["x1"]])
@@ -208,10 +179,7 @@ def test_transform_bernoulli():
 def test_transform_bernoulli_new_level():
     """Test transforming with a bernoulli likelihood and new levels."""
     df = pd.DataFrame(
-        {
-            "x1": [0, 1, 2, 1, 0, 1, 2, 3, 3, 2],
-            "y": [0, 1, 1, 0, 1, 0, 0, 1, 1, 0]
-        }
+        {"x1": [0, 1, 2, 1, 0, 1, 2, 3, 3, 2], "y": [0, 1, 1, 0, 1, 0, 0, 1, 1, 0]}
     )

     encoder = BayesianTargetEncoder(dist="bernoulli")
@@ -229,7 +197,7 @@ def test_transform_multinomial():
     df = pd.DataFrame(
         {
             "x1": [0, 1, 2, 2, 2, 1, 1, 3, 3, 1, 2, 3, 3, 2, 0, 0, 0, 1, 0, 1],
-            "y": [0, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 2, 1, 1, 0, 1, 1, 0, 2]
+            "y": [0, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 2, 1, 1, 0, 1, 1, 0, 2],
         }
     )

@@ -285,7 +253,7 @@ def test_transform_exponential(toy_regression_dataset):

     for index, params in enumerate(encoder.posterior_params_[0]):
         assert params[1] == 0
-        assert params[2] == (np.sum(y)/(1 + np.sum(y) * np.sum(y[X[:, 9] == index])))
+        assert params[2] == (np.sum(y) / (1 + np.sum(y) * np.sum(y[X[:, 9] == index])))
         # Mean of posterior is params[0] * params[2]
         assert np.unique(out[X[:, 9] == index]) == np.array([params[0] * params[2]])

@@ -309,15 +277,15 @@ def test_transform_normal(toy_regression_dataset):

     var = np.var(y)
     mean = np.mean(y)
-    hypervar = 1/np.sum(1/np.square(y - mean))
+    hypervar = 1 / np.sum(1 / np.square(y - mean))
     for index, params in enumerate(encoder.posterior_params_[0]):
         nlevel = np.sum(X[:, 9] == index)
         lvlsum = np.sum(y[X[:, 9] == index])
-        assert params[0] == 1/((1/hypervar) + (nlevel/var)) * ((mean/hypervar) + (lvlsum/var))
-        assert params[1] == np.sqrt(1/((1/hypervar) + (nlevel/var)))
-        assert_allclose(
-            np.unique(out[X[:, 9] == index]), np.array(params[0])
+        assert params[0] == 1 / ((1 / hypervar) + (nlevel / var)) * (
+            (mean / hypervar) + (lvlsum / var)
         )
+        assert params[1] == np.sqrt(1 / ((1 / hypervar) + (nlevel / var)))
+        assert_allclose(np.unique(out[X[:, 9] == index]), np.array(params[0]))


 def test_transform_gamma(toy_regression_dataset):
diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py
index 458a502..48a6993 100644
--- a/tests/test_ensemble.py
+++ b/tests/test_ensemble.py
@@ -6,10 +6,10 @@
 """

 import numpy as np
-from numpy.testing import assert_array_equal
 import pandas as pd
-from sklearn.svm import SVR
+from numpy.testing import assert_array_equal
 from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVR
 from sklearn.utils.validation import check_is_fitted

 import bayte as bt
@@ -19,59 +19,84 @@ def test_estimator_reg_fit(toy_regression_dataset):
     """Test a basic fit."""
     X, y = toy_regression_dataset
     estimator = bt.BayesianTargetRegressor(
-        base_estimator=SVR(kernel="linear"),
+        estimator=SVR(kernel="linear"),
         encoder=bt.BayesianTargetEncoder(dist="normal"),
-        n_estimators=2
+        n_estimators=2,
+    )
+    estimator.fit(
+        X,
+        y,
+        categorical_feature=[
+            9,
+        ],
     )
-    estimator.fit(X, y, categorical_feature=[9,])

     assert hasattr(estimator, "estimators_")
     assert len(estimator.estimators_) == 2
-    assert not np.array_equal(estimator.estimators_[0].coef_, estimator.estimators_[1].coef_)
+    assert not np.array_equal(
+        estimator.estimators_[0].coef_, estimator.estimators_[1].coef_
+    )


 def test_estimator_parallel_fit(toy_regression_dataset):
     """Test a parallel fit."""
     X, y = toy_regression_dataset
     estimator = bt.BayesianTargetRegressor(
-        base_estimator=SVR(kernel="linear"),
+        estimator=SVR(kernel="linear"),
         encoder=bt.BayesianTargetEncoder(dist="normal"),
         n_estimators=2,
-        n_jobs=2
+        n_jobs=2,
+    )
+    estimator.fit(
+        X,
+        y,
+        categorical_feature=[
+            9,
+        ],
     )
-    estimator.fit(X, y, categorical_feature=[9,])

     assert hasattr(estimator, "estimators_")
     assert len(estimator.estimators_) == 2
     for est in estimator.estimators_:
         check_is_fitted(est)
-    assert not np.array_equal(estimator.estimators_[0].coef_, estimator.estimators_[1].coef_)
+    assert not np.array_equal(
+        estimator.estimators_[0].coef_, estimator.estimators_[1].coef_
+    )


 def test_estimator_clf_fit(toy_classification_dataset):
     """Test a basic fit with a classification task."""
     X, y = toy_classification_dataset
     estimator = bt.BayesianTargetClassifier(
-        base_estimator=LogisticRegression(),
+        estimator=LogisticRegression(),
         encoder=bt.BayesianTargetEncoder(dist="bernoulli"),
-        n_estimators=2
+        n_estimators=2,
+    )
+    estimator.fit(
+        X,
+        y,
+        categorical_feature=[
+            9,
+        ],
     )
-    estimator.fit(X, y, categorical_feature=[9,])

     assert hasattr(estimator, "estimators_")
     assert len(estimator.estimators_) == 2
     for est in estimator.estimators_:
         check_is_fitted(est)
-    assert not np.array_equal(estimator.estimators_[0].coef_, estimator.estimators_[1].coef_)
+    assert not np.array_equal(
+        estimator.estimators_[0].coef_, estimator.estimators_[1].coef_
+    )
+

 def test_estimator_fit_pandas(toy_regression_dataset):
     """Test a basic fit with a pandas DataFrame."""
     X, y = toy_regression_dataset
     estimator = bt.BayesianTargetRegressor(
-        base_estimator=SVR(kernel="linear"),
+        estimator=SVR(kernel="linear"),
         encoder=bt.BayesianTargetEncoder(dist="normal"),
-        n_estimators=2
+        n_estimators=2,
     )
     X = pd.DataFrame(X)
     X[9] = X[9].astype("category")
@@ -79,14 +104,13 @@ def test_estimator_fit_pandas(toy_regression_dataset):
     estimator.fit(X, y)

     assert_array_equal(
-        estimator.categorical_,
-        np.array([False, False, False, False, False, False, False, False, False, True])
+        estimator.categorical_,
+        np.array([False, False, False, False, False, False, False, False, False, True]),
     )
     assert hasattr(estimator, "estimators_")
     assert len(estimator.estimators_) == 2
     assert not np.array_equal(
-        estimator.estimators_[0].coef_,
-        estimator.estimators_[1].coef_
+        estimator.estimators_[0].coef_, estimator.estimators_[1].coef_
     )


@@ -94,24 +118,29 @@ def test_estimator_fit_pandas_manual(toy_regression_dataset):
     """Test a basic fit with a pandas DataFrame and no automatic detection."""
     X, y = toy_regression_dataset
     estimator = bt.BayesianTargetRegressor(
-        base_estimator=SVR(kernel="linear"),
+        estimator=SVR(kernel="linear"),
         encoder=bt.BayesianTargetEncoder(dist="normal"),
-        n_estimators=2
+        n_estimators=2,
     )
     X = pd.DataFrame(X)
     X.columns = X.columns.astype(str)

-    estimator.fit(X, y, categorical_feature=["9",])
+    estimator.fit(
+        X,
+        y,
+        categorical_feature=[
+            "9",
+        ],
+    )

     assert_array_equal(
-        estimator.categorical_,
-        np.array([False, False, False, False, False, False, False, False, False, True])
+        estimator.categorical_,
+        np.array([False, False, False, False, False, False, False, False, False, True]),
     )
     assert hasattr(estimator, "estimators_")
     assert len(estimator.estimators_) == 2
     assert not np.array_equal(
-        estimator.estimators_[0].coef_,
-        estimator.estimators_[1].coef_
+        estimator.estimators_[0].coef_, estimator.estimators_[1].coef_
     )


@@ -122,17 +151,22 @@ def test_estimator_reg_prefit(toy_regression_dataset):
     encoder.fit(X[:, [9]], y)

     estimator = bt.BayesianTargetRegressor(
-        base_estimator=SVR(kernel="linear"),
+        estimator=SVR(kernel="linear"),
         encoder=encoder,
         n_estimators=2,
     )
-    estimator.fit(X, y, categorical_feature=[9,])
+    estimator.fit(
+        X,
+        y,
+        categorical_feature=[
+            9,
+        ],
+    )

     assert hasattr(estimator, "estimators_")
     assert len(estimator.estimators_) == 2
     assert not np.array_equal(
-        estimator.estimators_[0].coef_,
-        estimator.estimators_[1].coef_
+        estimator.estimators_[0].coef_, estimator.estimators_[1].coef_
     )


@@ -140,11 +174,17 @@ def test_estimator_reg_predict(toy_regression_dataset):
     """Test basic prediction with a regression dataset."""
     X, y = toy_regression_dataset
     estimator = bt.BayesianTargetRegressor(
-        base_estimator=SVR(kernel="linear"),
+        estimator=SVR(kernel="linear"),
         encoder=bt.BayesianTargetEncoder(dist="normal"),
-        n_estimators=2
+        n_estimators=2,
+    )
+    estimator.fit(
+        X,
+        y,
+        categorical_feature=[
+            9,
+        ],
     )
-    estimator.fit(X, y, categorical_feature=[9,])

     y = estimator.predict(X)

@@ -155,18 +195,24 @@ def test_estimator_clf_predict(toy_classification_dataset):
     """Test basic prediction with a classification target."""
     X, y = toy_classification_dataset
     estimator = bt.BayesianTargetClassifier(
-        base_estimator=LogisticRegression(),
+        estimator=LogisticRegression(),
         encoder=bt.BayesianTargetEncoder(dist="bernoulli"),
-        n_estimators=10
+        n_estimators=10,
+    )
+    estimator.fit(
+        X,
+        y,
+        categorical_feature=[
+            9,
+        ],
     )
-    estimator.fit(X, y, categorical_feature=[9,])

     y = estimator.predict(X)
     yprob = estimator.predict_proba(X)

     assert y.shape == (1000,)
     assert_array_equal(np.unique(y), np.arange(2))
-    assert yprob.shape == (1000,2)
+    assert yprob.shape == (1000, 2)
     assert ((yprob > 1) & (yprob < 0)).sum() == 0

     estimator.set_params(voting="soft")
diff --git a/tests/test_plots.py b/tests/test_plots.py
index 3c23160..38b6c8b 100644
--- a/tests/test_plots.py
+++ b/tests/test_plots.py
@@ -2,6 +2,7 @@

 from bayte.plots import visualize_target_dist

+
 def test_plot(toy_regression_dataset):
     """Test visualizing the target distribution."""
     _, y = toy_regression_dataset
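Two recurring patterns in the test changes above may be worth a short illustration. Both sketches below are illustrative only and are not part of the patch.

First, tests/test_encoder.py replaces the single monolithic check_estimator(...) call with a parametrized run: check_estimator(..., generate_only=True) yields (estimator, check) pairs, so each scikit-learn conformance check reports as its own pytest case. scikit-learn also ships a decorator with the same effect, which would be an equivalent spelling here:

    # Equivalent spelling using scikit-learn's built-in decorator.
    from sklearn.utils.estimator_checks import parametrize_with_checks

    from bayte.encoder import BayesianTargetEncoder


    @parametrize_with_checks([BayesianTargetEncoder(dist="bernoulli")])
    def test_encoder_validity(estimator, check):
        """Each conformance check passes or fails independently."""
        check(estimator)

Second, tests/test_ensemble.py consistently renames the base_estimator constructor argument to estimator, presumably tracking scikit-learn's own base_estimator-to-estimator rename in its ensemble meta-estimators. A minimal usage sketch of the new argument follows; the toy data is invented for illustration and does not come from the test suite:

    import numpy as np
    from sklearn.svm import SVR

    import bayte as bt

    # Toy data: three numeric features plus a categorical column at index 3.
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 4))
    X[:, 3] = rng.integers(0, 4, size=200)
    y = X[:, 3] + rng.normal(size=200)

    model = bt.BayesianTargetRegressor(
        estimator=SVR(kernel="linear"),  # renamed from base_estimator=
        encoder=bt.BayesianTargetEncoder(dist="normal"),
        n_estimators=2,
    )
    model.fit(X, y, categorical_feature=[3])
    predictions = model.predict(X)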