diff --git a/404.html b/404.html index c79b86c..ee97512 100644 --- a/404.html +++ b/404.html @@ -362,6 +362,27 @@ +
  • Callbacks
  • diff --git a/index.html b/index.html index 2faa0a6..df2c790 100644 --- a/index.html +++ b/index.html @@ -491,6 +491,27 @@ +
  • diff --git a/objects.inv b/objects.inv index d88ab58..41272db 100644 Binary files a/objects.inv and b/objects.inv differ diff --git a/reference/RFE/index.html b/reference/RFE/index.html index bb4e294..4f1cb3c 100644 --- a/reference/RFE/index.html +++ b/reference/RFE/index.html @@ -14,7 +14,7 @@ - + @@ -509,6 +509,27 @@ +
  • diff --git a/reference/callbacks/index.html b/reference/callbacks/index.html new file mode 100644 index 0000000..9e061e7 --- /dev/null +++ b/reference/callbacks/index.html @@ -0,0 +1,810 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Callbacks - felimination + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Callbacks

Callbacks for feature selection algorithms.

plot_progress_callback(selector, *args, **kwargs)

    Plot the feature selection progress during the algorithm execution.

Parameters:

  • selector (object) –

    The feature selector object.
Source code in felimination/callbacks.py
    def plot_progress_callback(selector, *args, **kwargs):
    +    """Plot the feature selection progress during the algorithm execution.
    +
    +    Parameters
    +    ----------
    +    selector : object
    +        The feature selector object.
    +    """
    +    from IPython import display
    +    from matplotlib import pyplot as plt
    +
    +    display.clear_output(wait=True)
    +    selector.plot()
    +    plt.show()
    +
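A minimal usage sketch, assuming a Jupyter notebook (the callback relies on IPython.display and matplotlib): pass it to a selector through the callbacks parameter and it will be invoked after every elimination round, so the progress plot is redrawn live while fit runs.

from felimination.callbacks import plot_progress_callback
from felimination.rfe import PermutationImportanceRFECV
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=1000, n_features=20, n_informative=6, random_state=42)

# Each callback is called as callback(selector, ...) after every elimination round.
selector = PermutationImportanceRFECV(
    LogisticRegression(),
    step=0.3,
    callbacks=[plot_progress_callback],
)
selector.fit(X, y)  # in a notebook, the progress plot refreshes at each step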
    + + + + + + + + + + \ No newline at end of file diff --git a/reference/drift/index.html b/reference/drift/index.html index 507b80a..5a947a0 100644 --- a/reference/drift/index.html +++ b/reference/drift/index.html @@ -11,7 +11,7 @@ - + @@ -378,6 +378,27 @@ + + +
  • + + + + + + + + diff --git a/reference/genetic_algorithms/index.html b/reference/genetic_algorithms/index.html index 2ae5a9d..0761eb3 100644 --- a/reference/genetic_algorithms/index.html +++ b/reference/genetic_algorithms/index.html @@ -380,6 +380,27 @@ +
  • diff --git a/reference/importance/index.html b/reference/importance/index.html index b42d052..ebee64c 100644 --- a/reference/importance/index.html +++ b/reference/importance/index.html @@ -380,6 +380,27 @@ +
  • diff --git a/search/search_index.json b/search/search_index.json index 7b3ab7f..138d730 100644 --- a/search/search_index.json +++ b/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Homepage","text":"

    This library contains some useful scikit-learn compatible classes for feature selection.

    "},{"location":"#features","title":"Features","text":""},{"location":"#requirements","title":"Requirements","text":""},{"location":"#installation","title":"Installation","text":"

In a terminal shell, run the following command:

    pip install felimination\n

    "},{"location":"#usage","title":"Usage","text":""},{"location":"#recursive-feature-elimination","title":"Recursive Feature Elimination","text":"

This section illustrates how to use the PermutationImportanceRFECV class.

    from felimination.rfe import PermutationImportanceRFECV\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import make_classification\nimport numpy as np\n\n\nX, y = make_classification(\n    n_samples=1000,\n    n_features=20,\n    n_informative=6,\n    n_redundant=10,\n    n_clusters_per_class=1,\n    random_state=42,\n)\n\nselector = PermutationImportanceRFECV(LogisticRegression(), step=0.3)\n\nselector.fit(X, y)\n\nselector.support_\n# array([False, False, False, False, False, False, False, False, False,\n#        False, False,  True, False, False, False, False, False, False,\n#        False, False])\n\nselector.ranking_\n# array([9, 3, 8, 9, 7, 8, 5, 6, 9, 6, 8, 1, 9, 7, 8, 9, 9, 2, 4, 7])\nselector.plot()\n

It looks like 5 is a good number of features. We can set the number of features to select to 5 without needing to retrain:

    selector.set_n_features_to_select(5)\nselector.support_\n# array([False,  True, False, False, False, False,  True, False, False,\n#        False, False,  True, False, False, False, False, False,  True,\n#         True, False])\n
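Because the selectors follow the scikit-learn transformer API, they can also be used inside a Pipeline. A brief illustrative sketch, reusing the objects from the example above (the downstream classifier is an arbitrary choice):

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

model = Pipeline([
    ("feature_selection", PermutationImportanceRFECV(LogisticRegression(), step=0.3)),
    ("classifier", RandomForestClassifier(random_state=42)),
])

# The pipeline first reduces X to the selected features, then trains the classifier on them.
model.fit(X, y)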
    "},{"location":"#genetic-algorithms","title":"Genetic Algorithms","text":"

This section illustrates how to use the HybridImportanceGACVFeatureSelector class.

    from felimination.ga import HybridImportanceGACVFeatureSelector\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import make_classification\nimport numpy as np\n\n# Create dummy dataset\nX, y = make_classification(\n    n_samples=1000,\n    n_features=20,\n    n_informative=6,\n    n_redundant=10,\n    n_clusters_per_class=1,\n    random_state=42,\n)\n\n# Initialize selector\nselector = HybridImportanceGACVFeatureSelector(\n    LogisticRegression(random_state=42),\n    random_state=42,\n    pool_size=5,\n    patience=5\n)\n\n# Run optimisation\nselector.fit(X, y)\n\n# Show selected features\nselector.support_\n#array([False,  True, False,  True,  True, False, False, False,  True,\n#       False, False, False,  True,  True,  True,  True, False,  True,\n#        True, False])\n\n# Show best solution\nselector.best_solution_\n# {'features': [1, 12, 13, 8, 17, 15, 18, 4, 3, 14],\n#  'train_scores_per_fold': [0.88625, 0.89, 0.8825, 0.8925, 0.88625],\n#  'test_scores_per_fold': [0.895, 0.885, 0.885, 0.89, 0.89],\n#  'cv_importances': [array([[ 1.09135972,  1.13502636,  1.12100231,  0.38285736,  0.28944072,\n#            0.04688614,  0.44259813,  0.09832365,  0.10190421, -0.48101593]]),\n#   array([[ 1.17345812,  1.29375208,  1.2065342 ,  0.40418709,  0.41839714,\n#            0.00447802,  0.466717  ,  0.21733829, -0.00842075, -0.50078996]]),\n#   array([[ 1.15416104,  1.18458564,  1.18083266,  0.37071253,  0.22842685,\n#            0.1087814 ,  0.44446793,  0.12740545,  0.00621562, -0.54064287]]),\n#   array([[ 1.26011643,  1.36996058,  1.30481424,  0.48183549,  0.40589887,\n#           -0.01849671,  0.45606913,  0.18330816,  0.03667055, -0.50869557]]),\n#   array([[ 1.18227123,  1.28988253,  1.2496398 ,  0.50754295,  0.38942303,\n#           -0.01725074,  0.4481891 ,  0.19472963,  0.10034316, -0.50131192]])],\n#  'mean_train_score': 0.8875,\n#  'mean_test_score': 0.889,\n#  'mean_cv_importances': array([ 1.17227331,  1.25464144,  1.21256464,  0.42942709,  0.34631732,\n#          0.02487962,  0.45160826,  0.16422104,  0.04734256, -0.50649125])}\n\n# Show progress as a plot\nselector.plot()\n

It looks like the optimisation process converged after 2 steps. Since the best score did not improve for 5 (= patience) consecutive steps, the optimisation stopped early.

    "},{"location":"#license","title":"License","text":"

This project is licensed under the BSD 3-Clause License - see the LICENSE.md file for details.

    "},{"location":"#acknowledgments","title":"Acknowledgments","text":""},{"location":"reference/RFE/","title":"RFE","text":"

    Module with tools to perform feature selection.

    This module contains the following classes:

    "},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV","title":"FeliminationRFECV(estimator, *, step=1, n_features_to_select=1, cv=None, scoring=None, random_state=None, verbose=0, n_jobs=None, importance_getter='auto', callbacks=None)","text":"

    Bases: RFE

    Perform recursive feature elimination with cross-validation following scikit-learn standards.

It differs from scikit-learn's RFECV in the following ways:

Apart from that, it is a copy-paste of RFE, so credit goes to scikit-learn.

    The algorithm of feature selection goes as follows:

    while n_features > n_features_to_select:\n    - The estimator is trained on the selected features and the score is\n      computed using cross validation.\n    - feature importance is computed for each validation fold on the validation\n      set and then averaged.\n    - The least important features are pruned.\n    - The pruned features are removed from the dataset.\n

    Parameters:

    Attributes:

    Examples:

    The following example shows how to retrieve the 5 most informative features in the Friedman #1 dataset.

>>> from felimination.rfe import FeliminationRFECV\n>>> from felimination.importance import PermutationImportance\n>>> from sklearn.datasets import make_friedman1\n>>> from sklearn.svm import SVR\n>>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n>>> estimator = SVR(kernel=\"linear\")\n>>> selector = FeliminationRFECV(\n    estimator,\n    step=1,\n    cv=5,\n    n_features_to_select=5,\n    importance_getter=PermutationImportance()\n)\n>>> selector = selector.fit(X, y)\n>>> selector.support_\narray([ True,  True,  True,  True,  True, False, False, False, False,\n       False])\n>>> selector.ranking_\narray([1, 1, 1, 1, 1, 6, 3, 4, 2, 5])\n
    Source code in felimination/rfe.py
    def __init__(\n    self,\n    estimator: BaseEstimator | LogisticRegression,\n    *,\n    step=1,\n    n_features_to_select=1,\n    cv=None,\n    scoring=None,\n    random_state=None,\n    verbose=0,\n    n_jobs=None,\n    importance_getter=\"auto\",\n    callbacks=None,\n) -> None:\n    self.cv = cv\n    self.scoring = scoring\n    self.n_jobs = n_jobs\n    self.random_state = random_state\n    self.callbacks = callbacks\n    super().__init__(\n        estimator,\n        n_features_to_select=n_features_to_select,\n        step=step,\n        verbose=verbose,\n        importance_getter=importance_getter,\n    )\n
    "},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV.fit","title":"fit(X, y, groups=None, **fit_params)","text":"

    Fit the RFE model and then the underlying estimator on the selected features.

    Parameters:

    Returns:

    Source code in felimination/rfe.py
    def fit(self, X, y, groups=None, **fit_params):\n    \"\"\"Fit the RFE model and then the underlying estimator on the selected features.\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix} of shape (n_samples, n_features)\n        The training input samples.\n    y : array-like of shape (n_samples,)\n        The target values.\n    **fit_params : dict\n        Additional parameters passed to the `fit` method of the underlying\n        estimator.\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n    \"\"\"\n    self._validate_params()\n    tags = self._get_tags()\n    self._validate_data(\n        X,\n        y,\n        accept_sparse=\"csc\",\n        ensure_min_features=2,\n        force_all_finite=not tags.get(\"allow_nan\", True),\n        multi_output=True,\n        dtype=None,\n    )\n\n    # Initialization\n    cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n    scorer = check_scoring(self.estimator, scoring=self.scoring)\n    n_features = X.shape[1]\n\n    if self.n_features_to_select is None:\n        n_features_to_select = n_features // 2\n    elif isinstance(self.n_features_to_select, Integral):  # int\n        n_features_to_select = self.n_features_to_select\n    else:  # float\n        n_features_to_select = int(n_features * self.n_features_to_select)\n\n    support_ = np.ones(n_features, dtype=bool)\n    ranking_ = np.ones(n_features, dtype=int)\n\n    current_number_of_features = n_features\n    self.cv_results_ = defaultdict(list)\n\n    # Elimination\n    while current_number_of_features > n_features_to_select:\n        # Select remaining features\n        X_remaining_features, features = self._select_X_with_remaining_features(\n            X, support=support_, n_features=n_features\n        )\n\n        if self.verbose > 0:\n            print(\n                \"Fitting estimator with %d features.\" % current_number_of_features\n            )\n\n        # Train model, score it and get importances\n        if effective_n_jobs(self.n_jobs) == 1:\n            parallel, func = list, _train_score_get_importance\n        else:\n            parallel = Parallel(n_jobs=self.n_jobs)\n            func = delayed(_train_score_get_importance)\n\n        scores_importances = parallel(\n            func(\n                self.estimator,\n                X_remaining_features,\n                y,\n                train,\n                test,\n                scorer,\n                self.importance_getter,\n            )\n            for train, test in cv.split(X_remaining_features, y, groups)\n        )\n        train_scores_per_fold = [\n            score_importance[0] for score_importance in scores_importances\n        ]\n        test_scores_per_fold = [\n            score_importance[1] for score_importance in scores_importances\n        ]\n        cv_importances = [\n            score_importance[2] for score_importance in scores_importances\n        ]\n        mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n        ranks = np.argsort(mean_importances)\n\n        # for sparse case ranks is matrix\n        ranks = np.ravel(ranks)\n\n        if 0.0 < self.step < 1.0:\n            step = int(max(1, self.step * current_number_of_features))\n        else:\n            step = int(self.step)\n\n        # Eliminate the worst features\n        threshold = min(step, current_number_of_features - n_features_to_select)\n\n        support_[features[ranks][:threshold]] = False\n        
ranking_[np.logical_not(support_)] += 1\n\n        # Update cv scores\n        for train_or_test, scores_per_fold in zip(\n            [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n        ):\n            for i, score in enumerate(scores_per_fold):\n                self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n            self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n                np.mean(scores_per_fold)\n            )\n            self.cv_results_[f\"std_{train_or_test}_score\"].append(\n                np.std(scores_per_fold)\n            )\n        self.cv_results_[\"n_features\"].append(current_number_of_features)\n        if self.callbacks:\n            for callback in self.callbacks:\n                callback(self, cv_importances)\n\n        current_number_of_features = np.sum(support_)\n    # Set final attributes\n\n    # Estimate performances of final model\n    X_remaining_features, features = self._select_X_with_remaining_features(\n        X, support=support_, n_features=n_features\n    )\n\n    cv_scores = cross_validate(\n        self.estimator,\n        X_remaining_features,\n        y,\n        groups=groups,\n        scoring=scorer,\n        cv=cv,\n        n_jobs=self.n_jobs,\n        fit_params=fit_params,\n        return_train_score=True,\n    )\n    self.cv_results_[\"n_features\"].append(current_number_of_features)\n    # Update cv scores\n    for train_or_test in [\"train\", \"test\"]:\n        scores_per_fold = cv_scores[f\"{train_or_test}_score\"]\n        for i, score in enumerate(scores_per_fold):\n            self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n        self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n            np.mean(scores_per_fold)\n        )\n        self.cv_results_[f\"std_{train_or_test}_score\"].append(\n            np.std(scores_per_fold)\n        )\n\n    if self.callbacks:\n        for callback in self.callbacks:\n            callback(self, cv_importances)\n\n    X_remaining_features, features = self._select_X_with_remaining_features(\n        X, support=support_, n_features=n_features\n    )\n\n    self.estimator_ = clone(self.estimator)\n    self.estimator_.fit(X_remaining_features, y, **fit_params)\n\n    self.n_features_ = support_.sum()\n    self.support_ = support_\n    self.ranking_ = ranking_\n    self.cv_results_ = dict(self.cv_results_)\n    return self\n
    "},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV.plot","title":"plot(**kwargs)","text":"

    Plot a feature selection plot with number of features

    Parameters:

    Returns:

    Source code in felimination/rfe.py
    def plot(self, **kwargs):\n    \"\"\"Plot a feature selection plot with number of features\n\n    Parameters\n    ----------\n    **kwargs : dict\n        Additional parameters passed to seaborn.lineplot. For a list\n        of possible options, please visit\n        [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html)  # noqa\n\n    Returns\n    -------\n    matplotlib.axes.Axes\n        The axis where the plot has been plotted.\n    \"\"\"\n    check_is_fitted(self)\n    df = pd.DataFrame(self.cv_results_)\n    split_score_cols = [col for col in df if \"split\" in col]\n    df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n        id_vars=[\"n_features\"],\n        value_vars=split_score_cols,\n        var_name=\"split\",\n        value_name=\"score\",\n    )\n    df_long_form[\"set\"] = np.where(\n        df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n    )\n    lineplot_kwargs = dict(\n        x=\"n_features\",\n        y=\"score\",\n        hue=\"set\",\n        markers=True,\n        style=\"set\",\n        hue_order=[\"validation\", \"train\"],\n        style_order=[\"validation\", \"train\"],\n        seed=self.random_state,\n    )\n    lineplot_kwargs.update(**kwargs)\n    ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n    ax.set_xticks(df.n_features)\n    return ax\n
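As an illustration (assuming a fitted selector from the examples above), any seaborn.lineplot keyword can be forwarded, and the returned Axes can be customised further:

ax = selector.plot(palette="colorblind")  # extra keyword arguments go straight to seaborn.lineplot
ax.set_ylabel("cross-validation score")   # the return value is a regular matplotlib Axes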
    "},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)","text":"

    Changes the number of features to select after fitting.

    The underlying estimator will not be retrained. So this method will not alter the behavior of predict/predict_proba but it will change the behavior of transform and get_feature_names_out.

    Parameters:

    Returns:

    Raises:

    Source code in felimination/rfe.py
    def set_n_features_to_select(self, n_features_to_select):\n    \"\"\"Changes the number of features to select after fitting.\n\n    The underlying estimator **will not be retrained**. So this method will not\n    alter the behavior of predict/predict_proba but it will change the behavior\n    of transform and get_feature_names_out.\n\n    Parameters\n    ----------\n    n_features_to_select : int\n        The number of features to select. Must be a value among\n        `cv_results_[\"n_features\"]`\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n\n    Raises\n    ------\n    ValueError\n        When the number of features to select has not been tried during the\n        feature selection procedure.\n    \"\"\"\n    check_is_fitted(self)\n    if n_features_to_select not in self.cv_results_[\"n_features\"]:\n        raise ValueError(\n            f\"This selector has not been fitted up with {n_features_to_select}, \"\n            f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n            \"refit the selector changing the step parameter of the n_features_to_select\"\n        )\n    support_ = np.zeros_like(self.support_, dtype=bool)\n    support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n    self.support_ = support_\n    return self\n
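A small sketch of the effect (assuming a fitted selector, and that 5 is among the feature counts recorded in cv_results_["n_features"]):

X_selected = selector.set_n_features_to_select(5).transform(X)  # the method returns self, so calls can be chained
X_selected.shape  # (n_samples, 5): only transform/get_feature_names_out change, predictions do not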
    "},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV","title":"PermutationImportanceRFECV(estimator, *, step=1, n_features_to_select=1, cv=None, scoring=None, verbose=0, n_jobs=None, n_repeats=5, random_state=None, sample_weight=None, max_samples=1.0, callbacks=None)","text":"

    Bases: FeliminationRFECV

    Preset of FeliminationRFECV using permutation importance as importance getter.

It differs from scikit-learn's RFECV in the following ways:

Apart from that, it is a copy-paste of RFE, so credit goes to scikit-learn.

    The algorithm of feature selection goes as follows:

    while n_features > n_features_to_select:\n    - The estimator is trained on the selected features and the score is\n      computed using cross validation.\n    - feature importance is computed for each validation fold on the validation\n      set and then averaged.\n    - The least important features are pruned.\n    - The pruned features are removed from the dataset.\n

    Parameters:

    Attributes:

    Examples:

    The following example shows how to retrieve the 5 most informative features in the Friedman #1 dataset.

>>> from felimination.rfe import PermutationImportanceRFECV\n>>> from sklearn.datasets import make_friedman1\n>>> from sklearn.svm import SVR\n>>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n>>> estimator = SVR(kernel=\"linear\")\n>>> selector = PermutationImportanceRFECV(\n        estimator,\n        step=1,\n        cv=5,\n        n_features_to_select=5,\n    )\n>>> selector = selector.fit(X, y)\n>>> selector.support_\narray([ True,  True,  True,  True,  True, False, False, False, False,\n       False])\n>>> selector.ranking_\narray([1, 1, 1, 1, 1, 6, 3, 4, 2, 5])\n
    Source code in felimination/rfe.py
    def __init__(\n    self,\n    estimator: BaseEstimator | LogisticRegression,\n    *,\n    step=1,\n    n_features_to_select=1,\n    cv=None,\n    scoring=None,\n    verbose=0,\n    n_jobs=None,\n    n_repeats=5,\n    random_state=None,\n    sample_weight=None,\n    max_samples=1.0,\n    callbacks=None,\n) -> None:\n    self.n_repeats = n_repeats\n    self.sample_weight = sample_weight\n    self.max_samples = max_samples\n    super().__init__(\n        estimator,\n        step=step,\n        n_features_to_select=n_features_to_select,\n        cv=cv,\n        random_state=random_state,\n        scoring=scoring,\n        verbose=verbose,\n        n_jobs=n_jobs,\n        callbacks=callbacks,\n        importance_getter=PermutationImportance(\n            scoring=scoring,\n            n_repeats=n_repeats,\n            # Better not to do double parallelization\n            n_jobs=1,\n            random_state=random_state,\n            sample_weight=sample_weight,\n            max_samples=max_samples,\n        ),\n    )\n
    "},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV.fit","title":"fit(X, y, groups=None, **fit_params)","text":"

    Fit the RFE model and then the underlying estimator on the selected features.

    Parameters:

    Returns:

    Source code in felimination/rfe.py
    def fit(self, X, y, groups=None, **fit_params):\n    \"\"\"Fit the RFE model and then the underlying estimator on the selected features.\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix} of shape (n_samples, n_features)\n        The training input samples.\n    y : array-like of shape (n_samples,)\n        The target values.\n    **fit_params : dict\n        Additional parameters passed to the `fit` method of the underlying\n        estimator.\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n    \"\"\"\n    self._validate_params()\n    tags = self._get_tags()\n    self._validate_data(\n        X,\n        y,\n        accept_sparse=\"csc\",\n        ensure_min_features=2,\n        force_all_finite=not tags.get(\"allow_nan\", True),\n        multi_output=True,\n        dtype=None,\n    )\n\n    # Initialization\n    cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n    scorer = check_scoring(self.estimator, scoring=self.scoring)\n    n_features = X.shape[1]\n\n    if self.n_features_to_select is None:\n        n_features_to_select = n_features // 2\n    elif isinstance(self.n_features_to_select, Integral):  # int\n        n_features_to_select = self.n_features_to_select\n    else:  # float\n        n_features_to_select = int(n_features * self.n_features_to_select)\n\n    support_ = np.ones(n_features, dtype=bool)\n    ranking_ = np.ones(n_features, dtype=int)\n\n    current_number_of_features = n_features\n    self.cv_results_ = defaultdict(list)\n\n    # Elimination\n    while current_number_of_features > n_features_to_select:\n        # Select remaining features\n        X_remaining_features, features = self._select_X_with_remaining_features(\n            X, support=support_, n_features=n_features\n        )\n\n        if self.verbose > 0:\n            print(\n                \"Fitting estimator with %d features.\" % current_number_of_features\n            )\n\n        # Train model, score it and get importances\n        if effective_n_jobs(self.n_jobs) == 1:\n            parallel, func = list, _train_score_get_importance\n        else:\n            parallel = Parallel(n_jobs=self.n_jobs)\n            func = delayed(_train_score_get_importance)\n\n        scores_importances = parallel(\n            func(\n                self.estimator,\n                X_remaining_features,\n                y,\n                train,\n                test,\n                scorer,\n                self.importance_getter,\n            )\n            for train, test in cv.split(X_remaining_features, y, groups)\n        )\n        train_scores_per_fold = [\n            score_importance[0] for score_importance in scores_importances\n        ]\n        test_scores_per_fold = [\n            score_importance[1] for score_importance in scores_importances\n        ]\n        cv_importances = [\n            score_importance[2] for score_importance in scores_importances\n        ]\n        mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n        ranks = np.argsort(mean_importances)\n\n        # for sparse case ranks is matrix\n        ranks = np.ravel(ranks)\n\n        if 0.0 < self.step < 1.0:\n            step = int(max(1, self.step * current_number_of_features))\n        else:\n            step = int(self.step)\n\n        # Eliminate the worst features\n        threshold = min(step, current_number_of_features - n_features_to_select)\n\n        support_[features[ranks][:threshold]] = False\n        
ranking_[np.logical_not(support_)] += 1\n\n        # Update cv scores\n        for train_or_test, scores_per_fold in zip(\n            [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n        ):\n            for i, score in enumerate(scores_per_fold):\n                self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n            self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n                np.mean(scores_per_fold)\n            )\n            self.cv_results_[f\"std_{train_or_test}_score\"].append(\n                np.std(scores_per_fold)\n            )\n        self.cv_results_[\"n_features\"].append(current_number_of_features)\n        if self.callbacks:\n            for callback in self.callbacks:\n                callback(self, cv_importances)\n\n        current_number_of_features = np.sum(support_)\n    # Set final attributes\n\n    # Estimate performances of final model\n    X_remaining_features, features = self._select_X_with_remaining_features(\n        X, support=support_, n_features=n_features\n    )\n\n    cv_scores = cross_validate(\n        self.estimator,\n        X_remaining_features,\n        y,\n        groups=groups,\n        scoring=scorer,\n        cv=cv,\n        n_jobs=self.n_jobs,\n        fit_params=fit_params,\n        return_train_score=True,\n    )\n    self.cv_results_[\"n_features\"].append(current_number_of_features)\n    # Update cv scores\n    for train_or_test in [\"train\", \"test\"]:\n        scores_per_fold = cv_scores[f\"{train_or_test}_score\"]\n        for i, score in enumerate(scores_per_fold):\n            self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n        self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n            np.mean(scores_per_fold)\n        )\n        self.cv_results_[f\"std_{train_or_test}_score\"].append(\n            np.std(scores_per_fold)\n        )\n\n    if self.callbacks:\n        for callback in self.callbacks:\n            callback(self, cv_importances)\n\n    X_remaining_features, features = self._select_X_with_remaining_features(\n        X, support=support_, n_features=n_features\n    )\n\n    self.estimator_ = clone(self.estimator)\n    self.estimator_.fit(X_remaining_features, y, **fit_params)\n\n    self.n_features_ = support_.sum()\n    self.support_ = support_\n    self.ranking_ = ranking_\n    self.cv_results_ = dict(self.cv_results_)\n    return self\n
    "},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV.plot","title":"plot(**kwargs)","text":"

    Plot a feature selection plot with number of features

    Parameters:

    Returns:

    Source code in felimination/rfe.py
    def plot(self, **kwargs):\n    \"\"\"Plot a feature selection plot with number of features\n\n    Parameters\n    ----------\n    **kwargs : dict\n        Additional parameters passed to seaborn.lineplot. For a list\n        of possible options, please visit\n        [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html)  # noqa\n\n    Returns\n    -------\n    matplotlib.axes.Axes\n        The axis where the plot has been plotted.\n    \"\"\"\n    check_is_fitted(self)\n    df = pd.DataFrame(self.cv_results_)\n    split_score_cols = [col for col in df if \"split\" in col]\n    df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n        id_vars=[\"n_features\"],\n        value_vars=split_score_cols,\n        var_name=\"split\",\n        value_name=\"score\",\n    )\n    df_long_form[\"set\"] = np.where(\n        df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n    )\n    lineplot_kwargs = dict(\n        x=\"n_features\",\n        y=\"score\",\n        hue=\"set\",\n        markers=True,\n        style=\"set\",\n        hue_order=[\"validation\", \"train\"],\n        style_order=[\"validation\", \"train\"],\n        seed=self.random_state,\n    )\n    lineplot_kwargs.update(**kwargs)\n    ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n    ax.set_xticks(df.n_features)\n    return ax\n
    "},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)","text":"

    Changes the number of features to select after fitting.

    The underlying estimator will not be retrained. So this method will not alter the behavior of predict/predict_proba but it will change the behavior of transform and get_feature_names_out.

    Parameters:

    Returns:

    Raises:

    Source code in felimination/rfe.py
    def set_n_features_to_select(self, n_features_to_select):\n    \"\"\"Changes the number of features to select after fitting.\n\n    The underlying estimator **will not be retrained**. So this method will not\n    alter the behavior of predict/predict_proba but it will change the behavior\n    of transform and get_feature_names_out.\n\n    Parameters\n    ----------\n    n_features_to_select : int\n        The number of features to select. Must be a value among\n        `cv_results_[\"n_features\"]`\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n\n    Raises\n    ------\n    ValueError\n        When the number of features to select has not been tried during the\n        feature selection procedure.\n    \"\"\"\n    check_is_fitted(self)\n    if n_features_to_select not in self.cv_results_[\"n_features\"]:\n        raise ValueError(\n            f\"This selector has not been fitted up with {n_features_to_select}, \"\n            f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n            \"refit the selector changing the step parameter of the n_features_to_select\"\n        )\n    support_ = np.zeros_like(self.support_, dtype=bool)\n    support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n    self.support_ = support_\n    return self\n
    "},{"location":"reference/drift/","title":"Drift","text":"

    The idea behind this module comes from the conjunction of two concepts:

In [1], classifier performance is used to determine how similar two samples are. More specifically, imagine having two samples: reference and test. To assess whether reference and test have been drawn from the same distribution, we could train a classifier to predict which sample each instance belongs to. If the model easily distinguishes instances from the two samples, then the two samples have probably been drawn from two different distributions. Conversely, if the classifier struggles to distinguish them, then it is likely that the samples have been drawn from the same distribution.

    In the context of drift detection, the classifier two-sample test can be used to assess whether drift has happened between the reference and the test set and to which degree.

The classes of this module take this idea one step further and attempt to reduce the drift using recursive feature selection. After a classifier is trained to distinguish between reference and test, its feature importances are used to determine which features contribute the most to distinguishing between the two sets. The most important features are then eliminated, and the procedure is repeated until the classifier is no longer able to distinguish between the two samples, or until a certain number of features has been removed.

This module contains the following classes:

- SampleSimilarityDriftRFE: base class for drift-based sample similarity feature selection.
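A minimal sketch of this procedure using SampleSimilarityDriftRFE (assumptions: a pandas DataFrame df whose "period" column marks which sample each row belongs to; the column name and the classifier are illustrative choices):

from felimination.drift import SampleSimilarityDriftRFE
from sklearn.linear_model import LogisticRegression

selector = SampleSimilarityDriftRFE(
    LogisticRegression(),
    split_col="period",  # column used to split the data into the two samples
    max_score=0.55,      # stop once the classifier can barely tell the samples apart
    step=1,
)

# y is not required: the sample-similarity target is built internally from split_col.
selector.fit(df)
selector.support_  # boolean mask of the features that do not introduce drift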

    "},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE","title":"PermImpSampleSimilarityDriftRFE(clf, *, step=1, max_score=0.55, min_n_features_to_select=1, split_col=0, split_value=None, split_frac=0.5, split_unique_values=True, cv=None, scoring=None, verbose=0, n_jobs=None, n_repeats=5, random_state=None, sample_weight=None, max_samples=1.0)","text":"

    Bases: SampleSimilarityDriftRFE

    Preset of SampleSimilarityDriftRFE using permutation importance as importance getter.

It differs from scikit-learn's RFECV in the following ways:

Apart from that, it is a copy-paste of RFE, so credit goes to scikit-learn.

    The algorithm of feature selection goes as follows:

    while n_features > n_features_to_select:\n    - The estimator is trained on the selected features and the score is\n      computed using cross validation.\n    - feature importance is computed for each validation fold on the validation\n      set and then averaged.\n    - The least important features are pruned.\n    - The pruned features are removed from the dataset.\n

    Parameters:

    Attributes:

    Source code in felimination/drift.py
    def __init__(\n    self,\n    clf: ClassifierMixin,\n    *,\n    step=1,\n    max_score=0.55,\n    min_n_features_to_select=1,\n    split_col=0,\n    split_value=None,\n    split_frac=0.5,\n    split_unique_values=True,\n    cv=None,\n    scoring=None,\n    verbose=0,\n    n_jobs=None,\n    n_repeats=5,\n    random_state=None,\n    sample_weight=None,\n    max_samples=1.0,\n) -> None:\n    self.n_repeats = n_repeats\n    self.sample_weight = sample_weight\n    self.max_samples = max_samples\n    super().__init__(\n        clf=clf,\n        max_score=max_score,\n        min_n_features_to_select=min_n_features_to_select,\n        split_col=split_col,\n        split_value=split_value,\n        split_frac=split_frac,\n        split_unique_values=split_unique_values,\n        step=step,\n        cv=cv,\n        scoring=scoring,\n        random_state=random_state,\n        verbose=verbose,\n        n_jobs=n_jobs,\n        importance_getter=PermutationImportance(\n            scoring=scoring,\n            n_repeats=n_repeats,\n            # Better not to do double parallelization\n            n_jobs=1,\n            random_state=random_state,\n            sample_weight=sample_weight,\n            max_samples=max_samples,\n        ),\n    )\n
    "},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE.fit","title":"fit(X, y=None, groups=None, **fit_params)","text":"

    Fit the RFE model and then the underlying clf on the selected features.

    Parameters:

    Returns:

    Source code in felimination/drift.py
    def fit(self, X, y=None, groups=None, **fit_params):\n    \"\"\"Fit the RFE model and then the underlying clf on the selected features.\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix} of shape (n_samples, n_features)\n        The training input samples.\n    y : array-like of shape (n_samples,)\n        The target values. Not used, kept for compatibility.\n    groups : array-like of shape (n_samples,), default=None\n        Group labels for the samples used while splitting the dataset into\n        train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n        instance.\n    **fit_params : dict\n        Additional parameters passed to the `fit` method of the underlying\n        clf.\n\n    Returns\n    -------\n    self : object\n        Fitted selector.\n    \"\"\"\n    self._validate_params()\n    tags = self._get_tags()\n    X = self._validate_data(\n        X,\n        y,\n        accept_sparse=\"csc\",\n        ensure_min_features=2,\n        force_all_finite=not tags.get(\"allow_nan\", True),\n        dtype=None,\n    )\n    if isinstance(self.split_col, str):\n        split_col_idx = list(self.feature_names_in_).index(self.split_col)\n    else:\n        split_col_idx = self.split_col\n    split_col_values = X[:, split_col_idx]\n    X, y = self._build_sample_similarity_x_y(X, split_col_values=split_col_values)\n\n    # Initialization\n    cv = check_cv(self.cv, y, classifier=True)\n    scorer = check_scoring(self.clf, scoring=self.scoring)\n    n_features = X.shape[1]\n\n    if self.min_n_features_to_select is None:\n        min_n_features_to_select = n_features // 2\n    elif isinstance(self.min_n_features_to_select, Integral):  # int\n        min_n_features_to_select = self.min_n_features_to_select\n    else:  # float\n        min_n_features_to_select = int(n_features * self.min_n_features_to_select)\n\n    support_ = np.ones(n_features, dtype=bool)\n    support_[split_col_idx] = False\n    ranking_ = np.ones(n_features, dtype=int)\n\n    current_number_of_features = support_.sum()\n    self.cv_results_ = defaultdict(list)\n\n    if self.verbose > 0:\n        print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n    # Train model, score it and get importances\n    if effective_n_jobs(self.n_jobs) == 1:\n        parallel, func = list, _train_score_get_importance\n    else:\n        parallel = Parallel(n_jobs=self.n_jobs)\n        func = delayed(_train_score_get_importance)\n\n    features = np.arange(n_features)[support_]\n    X_remaining_features = X[:, features]\n\n    scores_importances = parallel(\n        func(\n            self.clf,\n            X_remaining_features,\n            y,\n            train,\n            test,\n            scorer,\n            self.importance_getter,\n        )\n        for train, test in cv.split(X_remaining_features, y, groups)\n    )\n\n    test_scores_per_fold = [\n        score_importance[1] for score_importance in scores_importances\n    ]\n    train_scores_per_fold = [\n        score_importance[0] for score_importance in scores_importances\n    ]\n\n    # Update cv scores\n    for train_or_test, scores_per_fold in zip(\n        [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n    ):\n        for i, score in enumerate(scores_per_fold):\n            self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n        self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n            np.mean(scores_per_fold)\n        )\n        
self.cv_results_[f\"std_{train_or_test}_score\"].append(\n            np.std(scores_per_fold)\n        )\n    self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n    # Elimination\n    while (\n        np.mean(test_scores_per_fold) > self.max_score\n        and current_number_of_features > min_n_features_to_select\n    ):\n        features = np.arange(n_features)[support_]\n        if 0.0 < self.step < 1.0:\n            step = int(max(1, self.step * current_number_of_features))\n        else:\n            step = int(self.step)\n        # Eliminate most important features\n        threshold = min(step, current_number_of_features - min_n_features_to_select)\n        cv_importances = [\n            score_importance[2] for score_importance in scores_importances\n        ]\n        mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n        ranks = np.argsort(-mean_importances)\n        ranks = np.ravel(ranks)\n        support_[features[ranks][:threshold]] = False\n        ranking_[np.logical_not(support_)] += 1\n        current_number_of_features = np.sum(support_)\n        # Select remaining features\n        features = np.arange(n_features)[support_]\n        X_remaining_features = X[:, features]\n\n        if self.verbose > 0:\n            print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n        # Train model, score it and get importances\n        if effective_n_jobs(self.n_jobs) == 1:\n            parallel, func = list, _train_score_get_importance\n        else:\n            parallel = Parallel(n_jobs=self.n_jobs)\n            func = delayed(_train_score_get_importance)\n\n        scores_importances = parallel(\n            func(\n                self.clf,\n                X_remaining_features,\n                y,\n                train,\n                test,\n                scorer,\n                self.importance_getter,\n            )\n            for train, test in cv.split(X_remaining_features, y, groups)\n        )\n        train_scores_per_fold = [\n            score_importance[0] for score_importance in scores_importances\n        ]\n        test_scores_per_fold = [\n            score_importance[1] for score_importance in scores_importances\n        ]\n\n        # Update cv scores\n        for train_or_test, scores_per_fold in zip(\n            [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n        ):\n            for i, score in enumerate(scores_per_fold):\n                self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n            self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n                np.mean(scores_per_fold)\n            )\n            self.cv_results_[f\"std_{train_or_test}_score\"].append(\n                np.std(scores_per_fold)\n            )\n        self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n    features = np.arange(n_features)[support_]\n    self.clf_ = clone(self.clf)\n    self.clf_.fit(X[:, features], y, **fit_params)\n\n    self.n_features_ = support_.sum()\n    self.support_ = support_\n    self.ranking_ = ranking_\n    self.cv_results_ = dict(self.cv_results_)\n    return self\n
    "},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE.plot","title":"plot(**kwargs)","text":"

    Plot a feature selection plot with number of features

    Parameters:

    Returns:

    Source code in felimination/rfe.py
    def plot(self, **kwargs):\n    \"\"\"Plot a feature selection plot with number of features\n\n    Parameters\n    ----------\n    **kwargs : dict\n        Additional parameters passed to seaborn.lineplot. For a list\n        of possible options, please visit\n        [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html)  # noqa\n\n    Returns\n    -------\n    matplotlib.axes.Axes\n        The axis where the plot has been plotted.\n    \"\"\"\n    check_is_fitted(self)\n    df = pd.DataFrame(self.cv_results_)\n    split_score_cols = [col for col in df if \"split\" in col]\n    df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n        id_vars=[\"n_features\"],\n        value_vars=split_score_cols,\n        var_name=\"split\",\n        value_name=\"score\",\n    )\n    df_long_form[\"set\"] = np.where(\n        df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n    )\n    lineplot_kwargs = dict(\n        x=\"n_features\",\n        y=\"score\",\n        hue=\"set\",\n        markers=True,\n        style=\"set\",\n        hue_order=[\"validation\", \"train\"],\n        style_order=[\"validation\", \"train\"],\n        seed=self.random_state,\n    )\n    lineplot_kwargs.update(**kwargs)\n    ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n    ax.set_xticks(df.n_features)\n    return ax\n
    "},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)","text":"

    Changes the number of features to select after fitting.

    The underlying estimator will not be retrained. So this method will not alter the behavior of predict/predict_proba but it will change the behavior of transform and get_feature_names_out.

    Parameters:

    Returns:

    Raises:

    Source code in felimination/rfe.py
    def set_n_features_to_select(self, n_features_to_select):\n    \"\"\"Changes the number of features to select after fitting.\n\n    The underlying estimator **will not be retrained**. So this method will not\n    alter the behavior of predict/predict_proba but it will change the behavior\n    of transform and get_feature_names_out.\n\n    Parameters\n    ----------\n    n_features_to_select : int\n        The number of features to select. Must be a value among\n        `cv_results_[\"n_features\"]`\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n\n    Raises\n    ------\n    ValueError\n        When the number of features to select has not been tried during the\n        feature selection procedure.\n    \"\"\"\n    check_is_fitted(self)\n    if n_features_to_select not in self.cv_results_[\"n_features\"]:\n        raise ValueError(\n            f\"This selector has not been fitted up with {n_features_to_select}, \"\n            f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n            \"refit the selector changing the step parameter of the n_features_to_select\"\n        )\n    support_ = np.zeros_like(self.support_, dtype=bool)\n    support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n    self.support_ = support_\n    return self\n
    "},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE","title":"SampleSimilarityDriftRFE(clf, *, step=1, max_score=0.55, min_n_features_to_select=1, split_col=0, split_value=None, split_frac=0.5, split_unique_values=True, cv=None, scoring=None, random_state=None, verbose=0, n_jobs=None, importance_getter='auto')","text":"

    Bases: FeliminationRFECV

    Recursively discards the features that introduce the highest drift.

    The algorithm of feature selection goes as follows:

    Split X into two sets using the `split_column`: X1 and X2\ncreate target array y1 for X1 as an array of zeroes\ncreate target array y2 for X2 as an array of ones\nvertically concatenate X1, X2 and y1 and y2, obtaining X_ss and y_ss\nCalculate Cross-validation performances of the estimator on X_ss and y_ss.\nwhile cross-validation-performances > max_score and n_features > min_n_features_to_select:\n    Discard most important features\n    Calculate Cross-validation performances of the estimator on X_ss and y_ss using the new feature set.\n

    Parameters:

    Attributes:

    Source code in felimination/drift.py
    def __init__(\n    self,\n    clf: ClassifierMixin,\n    *,\n    step=1,\n    max_score=0.55,\n    min_n_features_to_select=1,\n    split_col=0,\n    split_value=None,\n    split_frac=0.5,\n    split_unique_values=True,\n    cv=None,\n    scoring=None,\n    random_state=None,\n    verbose=0,\n    n_jobs=None,\n    importance_getter=\"auto\",\n) -> None:\n    self.max_score = max_score\n    self.split_col = split_col\n    self.split_value = split_value\n    self.split_unique_values = split_unique_values\n    self.split_frac = split_frac\n    self.min_n_features_to_select = min_n_features_to_select\n    self.clf = clf\n    super().__init__(\n        estimator=clf,\n        n_features_to_select=min_n_features_to_select,\n        step=step,\n        cv=cv,\n        scoring=scoring,\n        random_state=random_state,\n        verbose=verbose,\n        n_jobs=n_jobs,\n        importance_getter=importance_getter,\n    )\n
    "},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE.fit","title":"fit(X, y=None, groups=None, **fit_params)","text":"

    Fit the RFE model and then the underlying clf on the selected features.

    Parameters:

    Returns:

    Source code in felimination/drift.py
    def fit(self, X, y=None, groups=None, **fit_params):\n    \"\"\"Fit the RFE model and then the underlying clf on the selected features.\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix} of shape (n_samples, n_features)\n        The training input samples.\n    y : array-like of shape (n_samples,)\n        The target values. Not used, kept for compatibility.\n    groups : array-like of shape (n_samples,), default=None\n        Group labels for the samples used while splitting the dataset into\n        train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n        instance.\n    **fit_params : dict\n        Additional parameters passed to the `fit` method of the underlying\n        clf.\n\n    Returns\n    -------\n    self : object\n        Fitted selector.\n    \"\"\"\n    self._validate_params()\n    tags = self._get_tags()\n    X = self._validate_data(\n        X,\n        y,\n        accept_sparse=\"csc\",\n        ensure_min_features=2,\n        force_all_finite=not tags.get(\"allow_nan\", True),\n        dtype=None,\n    )\n    if isinstance(self.split_col, str):\n        split_col_idx = list(self.feature_names_in_).index(self.split_col)\n    else:\n        split_col_idx = self.split_col\n    split_col_values = X[:, split_col_idx]\n    X, y = self._build_sample_similarity_x_y(X, split_col_values=split_col_values)\n\n    # Initialization\n    cv = check_cv(self.cv, y, classifier=True)\n    scorer = check_scoring(self.clf, scoring=self.scoring)\n    n_features = X.shape[1]\n\n    if self.min_n_features_to_select is None:\n        min_n_features_to_select = n_features // 2\n    elif isinstance(self.min_n_features_to_select, Integral):  # int\n        min_n_features_to_select = self.min_n_features_to_select\n    else:  # float\n        min_n_features_to_select = int(n_features * self.min_n_features_to_select)\n\n    support_ = np.ones(n_features, dtype=bool)\n    support_[split_col_idx] = False\n    ranking_ = np.ones(n_features, dtype=int)\n\n    current_number_of_features = support_.sum()\n    self.cv_results_ = defaultdict(list)\n\n    if self.verbose > 0:\n        print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n    # Train model, score it and get importances\n    if effective_n_jobs(self.n_jobs) == 1:\n        parallel, func = list, _train_score_get_importance\n    else:\n        parallel = Parallel(n_jobs=self.n_jobs)\n        func = delayed(_train_score_get_importance)\n\n    features = np.arange(n_features)[support_]\n    X_remaining_features = X[:, features]\n\n    scores_importances = parallel(\n        func(\n            self.clf,\n            X_remaining_features,\n            y,\n            train,\n            test,\n            scorer,\n            self.importance_getter,\n        )\n        for train, test in cv.split(X_remaining_features, y, groups)\n    )\n\n    test_scores_per_fold = [\n        score_importance[1] for score_importance in scores_importances\n    ]\n    train_scores_per_fold = [\n        score_importance[0] for score_importance in scores_importances\n    ]\n\n    # Update cv scores\n    for train_or_test, scores_per_fold in zip(\n        [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n    ):\n        for i, score in enumerate(scores_per_fold):\n            self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n        self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n            np.mean(scores_per_fold)\n        )\n        
self.cv_results_[f\"std_{train_or_test}_score\"].append(\n            np.std(scores_per_fold)\n        )\n    self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n    # Elimination\n    while (\n        np.mean(test_scores_per_fold) > self.max_score\n        and current_number_of_features > min_n_features_to_select\n    ):\n        features = np.arange(n_features)[support_]\n        if 0.0 < self.step < 1.0:\n            step = int(max(1, self.step * current_number_of_features))\n        else:\n            step = int(self.step)\n        # Eliminate most important features\n        threshold = min(step, current_number_of_features - min_n_features_to_select)\n        cv_importances = [\n            score_importance[2] for score_importance in scores_importances\n        ]\n        mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n        ranks = np.argsort(-mean_importances)\n        ranks = np.ravel(ranks)\n        support_[features[ranks][:threshold]] = False\n        ranking_[np.logical_not(support_)] += 1\n        current_number_of_features = np.sum(support_)\n        # Select remaining features\n        features = np.arange(n_features)[support_]\n        X_remaining_features = X[:, features]\n\n        if self.verbose > 0:\n            print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n        # Train model, score it and get importances\n        if effective_n_jobs(self.n_jobs) == 1:\n            parallel, func = list, _train_score_get_importance\n        else:\n            parallel = Parallel(n_jobs=self.n_jobs)\n            func = delayed(_train_score_get_importance)\n\n        scores_importances = parallel(\n            func(\n                self.clf,\n                X_remaining_features,\n                y,\n                train,\n                test,\n                scorer,\n                self.importance_getter,\n            )\n            for train, test in cv.split(X_remaining_features, y, groups)\n        )\n        train_scores_per_fold = [\n            score_importance[0] for score_importance in scores_importances\n        ]\n        test_scores_per_fold = [\n            score_importance[1] for score_importance in scores_importances\n        ]\n\n        # Update cv scores\n        for train_or_test, scores_per_fold in zip(\n            [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n        ):\n            for i, score in enumerate(scores_per_fold):\n                self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n            self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n                np.mean(scores_per_fold)\n            )\n            self.cv_results_[f\"std_{train_or_test}_score\"].append(\n                np.std(scores_per_fold)\n            )\n        self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n    features = np.arange(n_features)[support_]\n    self.clf_ = clone(self.clf)\n    self.clf_.fit(X[:, features], y, **fit_params)\n\n    self.n_features_ = support_.sum()\n    self.support_ = support_\n    self.ranking_ = ranking_\n    self.cv_results_ = dict(self.cv_results_)\n    return self\n
    "},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE.plot","title":"plot(**kwargs)","text":"

    Plot a feature selection plot with number of features

    Parameters:

    Returns:

    Source code in felimination/rfe.py
    def plot(self, **kwargs):\n    \"\"\"Plot a feature selection plot with number of features\n\n    Parameters\n    ----------\n    **kwargs : dict\n        Additional parameters passed to seaborn.lineplot. For a list\n        of possible options, please visit\n        [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html)  # noqa\n\n    Returns\n    -------\n    matplotlib.axes.Axes\n        The axis where the plot has been plotted.\n    \"\"\"\n    check_is_fitted(self)\n    df = pd.DataFrame(self.cv_results_)\n    split_score_cols = [col for col in df if \"split\" in col]\n    df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n        id_vars=[\"n_features\"],\n        value_vars=split_score_cols,\n        var_name=\"split\",\n        value_name=\"score\",\n    )\n    df_long_form[\"set\"] = np.where(\n        df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n    )\n    lineplot_kwargs = dict(\n        x=\"n_features\",\n        y=\"score\",\n        hue=\"set\",\n        markers=True,\n        style=\"set\",\n        hue_order=[\"validation\", \"train\"],\n        style_order=[\"validation\", \"train\"],\n        seed=self.random_state,\n    )\n    lineplot_kwargs.update(**kwargs)\n    ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n    ax.set_xticks(df.n_features)\n    return ax\n
    "},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)","text":"

    Changes the number of features to select after fitting.

    The underlying estimator will not be retrained. So this method will not alter the behavior of predict/predict_proba but it will change the behavior of transform and get_feature_names_out.

    Parameters:

    Returns:

    Raises:

    Source code in felimination/rfe.py
    def set_n_features_to_select(self, n_features_to_select):\n    \"\"\"Changes the number of features to select after fitting.\n\n    The underlying estimator **will not be retrained**. So this method will not\n    alter the behavior of predict/predict_proba but it will change the behavior\n    of transform and get_feature_names_out.\n\n    Parameters\n    ----------\n    n_features_to_select : int\n        The number of features to select. Must be a value among\n        `cv_results_[\"n_features\"]`\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n\n    Raises\n    ------\n    ValueError\n        When the number of features to select has not been tried during the\n        feature selection procedure.\n    \"\"\"\n    check_is_fitted(self)\n    if n_features_to_select not in self.cv_results_[\"n_features\"]:\n        raise ValueError(\n            f\"This selector has not been fitted up with {n_features_to_select}, \"\n            f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n            \"refit the selector changing the step parameter of the n_features_to_select\"\n        )\n    support_ = np.zeros_like(self.support_, dtype=bool)\n    support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n    self.support_ = support_\n    return self\n
    "},{"location":"reference/genetic_algorithms/","title":"Genetic algorithms","text":"

    This module contains the implementation of the Hybrid Genetic Algorithm-Importance with Cross-Validation. The algorithm is implemented in the HybridImportanceGACVFeatureSelector class.

    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector","title":"HybridImportanceGACVFeatureSelector(estimator, *, cv=5, scoring=None, random_state=None, n_jobs=None, importance_getter='auto', min_n_features_to_select=1, init_avg_features_num=15, init_std_features_num=5, pool_size=20, is_parent_selection_chance_proportional_to_fitness=True, n_children_cross_over=5, n_parents_cross_over=2, n_mutations=5, range_change_n_features_mutation=(-2, 3), range_randomly_swapped_features_mutation=(1, 4), max_generations=100, patience=5, callbacks=None, fitness_function=rank_mean_test_score_overfit_fitness)","text":"

    Bases: SelectorMixin, MetaEstimatorMixin, BaseEstimator

    Feature selection using Hybrid Genetic Algorithm-Importance with Cross-Validation.

    This feature selector uses a genetic algorithm hybridized with feature importance, where the feature importance is calculated using a cross-validation scheme. The algorithm works as follows:

    Pool initialization: The pool is initialized with random feature subsets. The size of each subset is drawn from a normal distribution whose parameters are the average number of features to select and the standard deviation of the number of features to select, and it is clipped between the minimum number of features to select and the total number of features in the dataset.

    Cross Over: The cross over is done by combining the features of the parents. The features are sorted by importance and the children are created by combining the features of the parents in a round-robin fashion. The number of features of the children is the average of the number of features of the parents. In this way, the children will have the most important features of the parents.

    Mutation: The mutation is done by randomly changing the number of features and replacing the least important features with random features.

    Selection: The selection is done by selecting the top pool_size solutions based on the fitness function.
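    To make the four steps above more concrete, here is a minimal, self-contained sketch of one generation of such a hybrid genetic algorithm. All names, the toy fitness function, and the simplified cross over/mutation logic are illustrative assumptions, not the actual implementation of HybridImportanceGACVFeatureSelector (which uses cross-validated scores and feature importances instead).

    import numpy as np

    rng = np.random.default_rng(42)
    n_features, pool_size, min_n_features = 20, 6, 2

    def random_solution(avg=5, std=2):
        # Pool initialization: subset size drawn from a clipped normal distribution.
        size = int(np.clip(rng.normal(avg, std), min_n_features, n_features))
        return {"features": sorted(rng.choice(n_features, size=size, replace=False))}

    def toy_fitness(solution):
        # Stand-in for the cross-validated fitness used by the real selector.
        return -abs(len(solution["features"]) - 5)

    def cross_over(parent_a, parent_b):
        # The real algorithm interleaves the parents' features by importance;
        # here we simply take the union truncated to the parents' average size.
        merged = sorted(set(parent_a["features"]) | set(parent_b["features"]))
        size = (len(parent_a["features"]) + len(parent_b["features"])) // 2
        return {"features": merged[:size]}

    def mutate(solution):
        # Much simplified mutation: add one random feature to the subset.
        features = set(solution["features"])
        features.add(int(rng.integers(n_features)))
        return {"features": sorted(features)}

    pool = [random_solution() for _ in range(pool_size)]
    for _ in range(10):  # a few generations
        i, j = rng.choice(pool_size, size=2, replace=False)
        children = [cross_over(pool[i], pool[j])]
        mutations = [mutate(pool[int(rng.integers(pool_size))])]
        # Selection: keep the best pool_size candidates according to the fitness.
        pool = sorted(pool + children + mutations, key=toy_fitness, reverse=True)[:pool_size]
    print(pool[0]["features"])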

    Parameters:

    Attributes:

    Examples:

    >>> from felimination.ga import HybridImportanceGACVFeatureSelector\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.linear_model import LogisticRegression\n>>> sample_size, random_state = 1000, 42  # assumed values, not defined in the original example\n>>> X, y = make_classification(\n    n_samples=sample_size,\n    n_features=2,\n    n_classes=2,\n    n_redundant=0,\n    n_clusters_per_class=1,\n    random_state=random_state,\n)\n>>> estimator = LogisticRegression(random_state=42)\n>>> selector = HybridImportanceGACVFeatureSelector(\n    random_state=random_state,\n    init_avg_features_num=2,\n    init_std_features_num=1,\n)\n>>> selector = selector.fit(X, y)\n>>> selector.support_\narray([ True,  True,  True,  True,  True, False, False, False, False,\n       False])\n
    Source code in felimination/ga.py
    def __init__(\n    self,\n    estimator: BaseEstimator | LogisticRegression,\n    *,\n    cv=5,\n    scoring=None,\n    random_state=None,\n    n_jobs=None,\n    importance_getter=\"auto\",\n    min_n_features_to_select=1,\n    init_avg_features_num=15,\n    init_std_features_num=5,\n    pool_size=20,\n    is_parent_selection_chance_proportional_to_fitness=True,\n    n_children_cross_over=5,\n    n_parents_cross_over=2,\n    n_mutations=5,\n    range_change_n_features_mutation=(-2, 3),\n    range_randomly_swapped_features_mutation=(1, 4),\n    max_generations=100,\n    patience=5,\n    callbacks=None,\n    fitness_function=rank_mean_test_score_overfit_fitness,\n) -> None:\n    self.estimator = estimator\n    self.cv = cv\n    self.scoring = scoring\n    self.random_state = random_state\n    self.n_jobs = n_jobs\n    self.importance_getter = importance_getter\n    self.min_n_features_to_select = min_n_features_to_select\n    self.init_avg_features_num = init_avg_features_num\n    self.init_std_features_num = init_std_features_num\n    self.pool_size = pool_size\n    self.n_children_cross_over = n_children_cross_over\n    self.is_parent_selection_chance_proportional_to_fitness = (\n        is_parent_selection_chance_proportional_to_fitness\n    )\n    self.n_parents_cross_over = n_parents_cross_over\n    self.n_mutations = n_mutations\n    self.range_change_n_features_mutation = range_change_n_features_mutation\n    self.range_randomly_swapped_features_mutation = (\n        range_randomly_swapped_features_mutation\n    )\n    self.max_generations = max_generations\n    self.patience = patience\n    self.callbacks = callbacks\n    self.fitness_function = fitness_function\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.decision_function","title":"decision_function(X)","text":"

    Compute the decision function of X.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    @available_if(_estimator_has(\"decision_function\"))\ndef decision_function(self, X):\n    \"\"\"Compute the decision function of ``X``.\n\n    Parameters\n    ----------\n    X : {array-like or sparse matrix} of shape (n_samples, n_features)\n        The input samples. Internally, it will be converted to\n        ``dtype=np.float32`` and if a sparse matrix is provided\n        to a sparse ``csr_matrix``.\n\n    Returns\n    -------\n    score : array, shape = [n_samples, n_classes] or [n_samples]\n        The decision function of the input samples. The order of the\n        classes corresponds to that in the attribute :term:`classes_`.\n        Regression and binary classification produce an array of shape\n        [n_samples].\n    \"\"\"\n    check_is_fitted(self)\n    return self.estimator_.decision_function(self.transform(X))\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.fit","title":"fit(X, y, groups=None, **fit_params)","text":"

    Fit the selector and then the underlying estimator on the selected features.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    def fit(self, X, y, groups=None, **fit_params):\n    \"\"\"Fit the selector and then the underlying estimator on the selected features.\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix} of shape (n_samples, n_features)\n        The training input samples.\n    y : array-like of shape (n_samples,)\n        The target values.\n    **fit_params : dict\n        Additional parameters passed to the `fit` method of the underlying\n        estimator.\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n    \"\"\"\n    self._validate_params()\n    tags = self._get_tags()\n    self._validate_data(\n        X,\n        y,\n        accept_sparse=\"csc\",\n        ensure_min_features=2,\n        force_all_finite=not tags.get(\"allow_nan\", True),\n        multi_output=True,\n        dtype=None,\n    )\n\n    # Initialization\n    cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n    scorer = check_scoring(self.estimator, scoring=self.scoring)\n    n_features = X.shape[1]\n    if self.min_n_features_to_select is None:\n        min_n_features_to_select = n_features // 2\n    elif isinstance(self.min_n_features_to_select, Integral):  # int\n        min_n_features_to_select = self.min_n_features_to_select\n    else:  # float\n        min_n_features_to_select = int(n_features * self.min_n_features_to_select)\n\n    if isinstance(X, pd.DataFrame):\n        all_features = X.columns.to_list()\n    else:\n        all_features = list(range(n_features))\n\n    np.random.seed(self.random_state)\n\n    # Create the initial pool of solutions\n    pool = [\n        {\n            \"features\": list(\n                np.random.choice(\n                    all_features,\n                    min(\n                        max(\n                            int(\n                                np.random.normal(\n                                    self.init_avg_features_num,\n                                    self.init_std_features_num,\n                                )\n                            ),\n                            min_n_features_to_select,\n                        ),\n                        n_features,\n                    ),\n                    replace=False,\n                )\n            ),\n        }\n        for _ in range(self.pool_size)\n    ]\n\n    # Evaluate the initial pool of solutions\n    pool = self._evaluate_calculate_importances(\n        pool, X, y, groups, cv, scorer, **fit_params\n    )\n    self.best_solutions_ = []\n    for _ in range(1, self.max_generations):\n        children = self._cross_over(pool)\n        children = self._evaluate_calculate_importances(\n            children, X, y, groups, cv, scorer, **fit_params\n        )\n        pool.extend(children)\n        mutations = self._mutate(pool, all_features)\n        mutations = self._evaluate_calculate_importances(\n            mutations, X, y, groups, cv, scorer, **fit_params\n        )\n        pool.extend(mutations)\n        pool_sorted = [\n            element\n            for _, element in sorted(\n                zip(self._calculate_fitness(pool), pool),\n                reverse=True,\n                key=itemgetter(0),\n            )\n        ]\n        pool = pool_sorted[: self.pool_size]\n        self.best_solutions_.append(pool[0])\n\n        if self.callbacks:\n            for callback in self.callbacks:\n                callback(self, pool)\n\n        if len(self.best_solutions_) > self.patience:\n            if all(\n                [\n         
           self.best_solutions_[-1][\"features\"] == solution[\"features\"]\n                    for solution in self.best_solutions_[-self.patience :]\n                ]\n            ):\n                break\n\n    self.estimator_ = clone(self.estimator)\n    X_remaining_features = _select_X_with_features(\n        X, self.best_solution_[\"features\"]\n    )\n    self.estimator_.fit(X_remaining_features, y, **fit_params)\n    self.support_ = np.array(\n        [\n            True if feature in self.best_solution_[\"features\"] else False\n            for feature in all_features\n        ]\n    )\n\n    return self\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.plot","title":"plot(**kwargs)","text":"

    Plot the mean test score and mean train score of the best solution at each generation.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    def plot(self, **kwargs):\n    \"\"\"Plot the mean test score and mean train score of the best solution at each generation.\n\n    Parameters\n    ----------\n    **kwargs : dict\n        Additional parameters passed to seaborn.lineplot. For a list\n        of possible options, please visit\n        [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html)  # noqa\n\n    Returns\n    -------\n    matplotlib.axes.Axes\n        The axis where the plot has been plotted.\n    \"\"\"\n    data_points_to_plot_long_form = []\n    for generation, best_solution in enumerate(self.best_solutions_, start=1):\n        for set, scores in zip(\n            [\"validation\", \"train\"],\n            [\n                best_solution[\"test_scores_per_fold\"],\n                best_solution[\"train_scores_per_fold\"],\n            ],\n        ):\n            for score in scores:\n                data_points_to_plot_long_form.append(\n                    {\"generation\": generation, \"score\": score, \"set\": set}\n                )\n    df_plot = pd.DataFrame(data_points_to_plot_long_form)\n    lineplot_kwargs = dict(\n        x=\"generation\",\n        y=\"score\",\n        hue=\"set\",\n        markers=True,\n        style=\"set\",\n        hue_order=[\"validation\", \"train\"],\n        style_order=[\"validation\", \"train\"],\n        seed=self.random_state,\n    )\n    lineplot_kwargs.update(**kwargs)\n    return sns.lineplot(data=df_plot, **lineplot_kwargs)\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.predict","title":"predict(X)","text":"

    Reduce X to the selected features and predict using the estimator.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    @available_if(_estimator_has(\"predict\"))\ndef predict(self, X):\n    \"\"\"Reduce X to the selected features and predict using the estimator.\n\n    Parameters\n    ----------\n    X : array of shape [n_samples, n_features]\n        The input samples.\n\n    Returns\n    -------\n    y : array of shape [n_samples]\n        The predicted target values.\n    \"\"\"\n    check_is_fitted(self)\n    return self.estimator_.predict(self.transform(X))\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.predict_log_proba","title":"predict_log_proba(X)","text":"

    Predict class log-probabilities for X.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    @available_if(_estimator_has(\"predict_log_proba\"))\ndef predict_log_proba(self, X):\n    \"\"\"Predict class log-probabilities for X.\n\n    Parameters\n    ----------\n    X : array of shape [n_samples, n_features]\n        The input samples.\n\n    Returns\n    -------\n    p : array of shape (n_samples, n_classes)\n        The class log-probabilities of the input samples. The order of the\n        classes corresponds to that in the attribute :term:`classes_`.\n    \"\"\"\n    check_is_fitted(self)\n    return self.estimator_.predict_log_proba(self.transform(X))\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.predict_proba","title":"predict_proba(X)","text":"

    Predict class probabilities for X.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    @available_if(_estimator_has(\"predict_proba\"))\ndef predict_proba(self, X):\n    \"\"\"Predict class probabilities for X.\n\n    Parameters\n    ----------\n    X : {array-like or sparse matrix} of shape (n_samples, n_features)\n        The input samples. Internally, it will be converted to\n        ``dtype=np.float32`` and if a sparse matrix is provided\n        to a sparse ``csr_matrix``.\n\n    Returns\n    -------\n    p : array of shape (n_samples, n_classes)\n        The class probabilities of the input samples. The order of the\n        classes corresponds to that in the attribute :term:`classes_`.\n    \"\"\"\n    check_is_fitted(self)\n    return self.estimator_.predict_proba(self.transform(X))\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.score","title":"score(X, y, **fit_params)","text":"

    Reduce X to the selected features and return the score of the estimator.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    @available_if(_estimator_has(\"score\"))\ndef score(self, X, y, **fit_params):\n    \"\"\"Reduce X to the selected features and return the score of the estimator.\n\n    Parameters\n    ----------\n    X : array of shape [n_samples, n_features]\n        The input samples.\n\n    y : array of shape [n_samples]\n        The target values.\n\n    **fit_params : dict\n        Parameters to pass to the `score` method of the underlying\n        estimator.\n\n        .. versionadded:: 1.0\n\n    Returns\n    -------\n    score : float\n        Score of the underlying base estimator computed with the selected\n        features returned by `rfe.transform(X)` and `y`.\n    \"\"\"\n    check_is_fitted(self)\n    return self.estimator_.score(self.transform(X), y, **fit_params)\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.rank_mean_test_score_fitness","title":"rank_mean_test_score_fitness(pool)","text":"

    Define the fitness function as the rank of the mean test score.

    The rank of the mean test score is calculated by ranking the mean test score in ascending order.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    def rank_mean_test_score_fitness(pool):\n    \"\"\"Define the fitness function as the rank of the mean test score.\n\n    The rank of the mean test score is calculated by ranking the mean test score in ascending order.\n\n    Parameters\n    ----------\n\n    pool : list of dict\n        Each element in the list is a dictionary with the following keys:\n        - features: list of int\n            The features selected for this element.\n        - mean_test_score: float\n            The mean test score of the element.\n        - mean_train_score: float\n            The mean train score of the element.\n\n    Returns\n    -------\n    fitness : list of float\n        The fitness of each element in the pool.\n    \"\"\"\n    pool_df = pd.DataFrame(pool)\n    pool_df[\"rank_mean_test_score\"] = pool_df[\"mean_test_score\"].rank(ascending=True)\n    return pool_df[\"rank_mean_test_score\"].to_list()\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.rank_mean_test_score_overfit_fitness","title":"rank_mean_test_score_overfit_fitness(pool)","text":"

    Define the fitness function as the sum of the rank of the mean test score and the rank of the overfit.

    The rank of the mean test score is calculated by ranking the mean test score in ascending order. The rank of the overfit is calculated by ranking the overfit in ascending order. The overfit is calculated as the difference between the mean train score and the mean test score. The fitness is the sum of the rank of the mean test score and the rank of the overfit.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    def rank_mean_test_score_overfit_fitness(pool):\n    \"\"\"Define the fitness function as the sum of the rank of the mean test score and the rank of the\n    overfit.\n\n    The rank of the mean test score is calculated by ranking the mean test score in ascending order.\n    The rank of the overfit is calculated by ranking the overfit in ascending order.\n    The overfit is calculated as the difference between the mean train score and the mean test score.\n    The fitness is the sum of the rank of the mean test score and the rank of the overfit.\n\n    Parameters\n    ----------\n    pool : list of dict\n        Each element in the list is a dictionary with the following keys:\n        - features: list of int\n            The features selected for this element.\n        - mean_test_score: float\n            The mean test score of the element.\n        - mean_train_score: float\n            The mean train score of the element.\n\n    Returns\n    -------\n    fitness : list of float\n        The fitness of each element in the pool.\n    \"\"\"\n\n    pool_df = pd.DataFrame(pool)\n    pool_df[\"rank_mean_test_score\"] = pool_df[\"mean_test_score\"].rank(ascending=False)\n    pool_df[\"overfit\"] = pool_df[\"mean_train_score\"] - pool_df[\"mean_test_score\"]\n    pool_df[\"rank_overfit\"] = pool_df[\"overfit\"].rank(ascending=True)\n    pool_df[\"rank_sum\"] = pool_df[\"rank_mean_test_score\"] + pool_df[\"rank_overfit\"]\n\n    pool_df[\"rank_sum_rank\"] = pool_df[\"rank_sum\"].rank(ascending=False)\n    return pool_df[\"rank_sum_rank\"].to_list()\n
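    As a quick illustration of the expected input format, the fitness functions can be called on a hand-built pool. The keys below follow the docstring above, while the feature indices and score values are made up for the example.

    from felimination.ga import rank_mean_test_score_overfit_fitness

    # Two hypothetical candidates: the second one has a better test score and
    # overfits less (smaller train/test gap), so it receives the higher fitness.
    pool = [
        {"features": [0, 1], "mean_train_score": 0.95, "mean_test_score": 0.80},
        {"features": [2, 3], "mean_train_score": 0.85, "mean_test_score": 0.82},
    ]
    print(rank_mean_test_score_overfit_fitness(pool))  # expected: [1.0, 2.0]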
    "},{"location":"reference/importance/","title":"Importance","text":""},{"location":"reference/importance/#felimination.importance.PermutationImportance","title":"PermutationImportance(scoring=None, n_repeats=5, n_jobs=None, random_state=None, sample_weight=None, max_samples=1.0)","text":"

    Wrapper around sklearn.inspection.permutation_importance.

    Parameters:

    Source code in felimination/importance.py
    def __init__(\n    self,\n    scoring=None,\n    n_repeats=5,\n    n_jobs=None,\n    random_state=None,\n    sample_weight=None,\n    max_samples=1.0,\n):\n    self.scoring = scoring\n    self.n_repeats = n_repeats\n    self.n_jobs = n_jobs\n    self.random_state = random_state\n    self.sample_weight = sample_weight\n    self.max_samples = max_samples\n
    "},{"location":"reference/importance/#felimination.importance.PermutationImportance.__call__","title":"__call__(estimator, X, y)","text":"

    Computes the permutation importance.

    Parameters:

    Returns:

    Source code in felimination/importance.py
    def __call__(self, estimator, X, y) -> Any:\n    \"\"\"Computes the permutation importance.\n\n    Parameters\n    ----------\n    estimator : object\n        An estimator that has already been fitted and is compatible\n        with scorer.\n    X : ndarray or DataFrame, shape (n_samples, n_features)\n        Data on which permutation importance will be computed.\n    y : array-like or None, shape (n_samples, ) or (n_samples, n_classes)\n        Targets for supervised or `None` for unsupervised.\n\n    Returns\n    -------\n    importances_mean : ndarray of shape (n_features, )\n        Mean of feature importance over `n_repeats`.\n    \"\"\"\n    return permutation_importance(\n        estimator,\n        X,\n        y,\n        scoring=self.scoring,\n        n_repeats=self.n_repeats,\n        n_jobs=self.n_jobs,\n        random_state=self.random_state,\n        sample_weight=self.sample_weight,\n        max_samples=self.max_samples,\n    ).importances_mean\n
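    Since PermutationImportance is a callable with the (estimator, X, y) signature documented above, it can also be used on its own with an already fitted estimator. The dataset and model below are placeholders chosen for the example, not part of the library.

    from felimination.importance import PermutationImportance
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=200, n_features=5, n_informative=3, random_state=0)
    model = LogisticRegression().fit(X, y)

    importance_getter = PermutationImportance(n_repeats=5, random_state=0)
    importances = importance_getter(model, X, y)
    print(importances.shape)  # (5,) -> one mean importance per feature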
    "},{"location":"tutorials/genetic_algorithms_x_feature_selection/","title":"Genetic Algorithms x Feature Selection","text":"In\u00a0[\u00a0]: Copied!
    # Install felimination\n! pip install felimination\n
    # Install felimination ! pip install felimination In\u00a0[2]: Copied!
    from sklearn.datasets import make_classification\n\nX, y = make_classification(\n    n_samples=1000,\n    n_features=200,\n    n_informative=6,\n    n_redundant=10,\n    n_clusters_per_class=1,\n    random_state=42,\n    shuffle=False\n)\n
    from sklearn.datasets import make_classification X, y = make_classification( n_samples=1000, n_features=200, n_informative=6, n_redundant=10, n_clusters_per_class=1, random_state=42, shuffle=False ) In\u00a0[3]: Copied!
    from sklearn.model_selection import cross_validate, StratifiedKFold\nfrom sklearn.linear_model import LogisticRegression\n\n\n# Define a simple logistic regression model\nmodel = LogisticRegression(random_state=42)\n\n# Perform cross-validation\ncv_results = cross_validate(\n    model,\n    X,\n    y,\n    cv=StratifiedKFold(random_state=42, shuffle=True),\n    scoring=\"roc_auc\",\n    return_train_score=True,\n)\n\ncv_results[\"test_score\"].mean()\n
    from sklearn.model_selection import cross_validate, StratifiedKFold from sklearn.linear_model import LogisticRegression # Define a simple logistic regression model model = LogisticRegression(random_state=42) # Perform cross-validation cv_results = cross_validate( model, X, y, cv=StratifiedKFold(random_state=42, shuffle=True), scoring=\"roc_auc\", return_train_score=True, ) cv_results[\"test_score\"].mean() Out[3]:
    0.8561362716271628
    In\u00a0[4]: Copied!
    from felimination.ga import HybridImportanceGACVFeatureSelector\nfrom felimination.callbacks import plot_progress_callback\n\n\nselector = HybridImportanceGACVFeatureSelector(\n    model,\n    callbacks=[plot_progress_callback],\n    scoring=\"roc_auc\",\n    cv=StratifiedKFold(random_state=42, shuffle=True),\n    init_avg_features_num=5,\n    min_n_features_to_select=3,\n    pool_size=20,\n    n_children_cross_over=20,\n    n_mutations=20,\n    random_state=42,\n)\nselector.fit(X, y)\n
    from felimination.ga import HybridImportanceGACVFeatureSelector from felimination.callbacks import plot_progress_callback selector = HybridImportanceGACVFeatureSelector( model, callbacks=[plot_progress_callback], scoring=\"roc_auc\", cv=StratifiedKFold(random_state=42, shuffle=True), init_avg_features_num=5, min_n_features_to_select=3, pool_size=20, n_children_cross_over=20, n_mutations=20, random_state=42, ) selector.fit(X, y) Out[4]:
    HybridImportanceGACVFeatureSelector(callbacks=[<function plot_progress_callback at 0x31aaa4fe0>],\n                                    cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n                                    estimator=LogisticRegression(random_state=42),\n                                    init_avg_features_num=5,\n                                    min_n_features_to_select=3,\n                                    n_children_cross_over=20, n_mutations=20,\n                                    random_state=42, scoring='roc_auc')

    Notice how model performance increases with the progressive elimination of features.

    This is because models with many non-predictive features tend to find patterns even in random noise and end up overfitting; notice how the train score and the validation score get closer with the progressive elimination of features.

    In\u00a0[5]: Copied!
    sorted(selector.best_solution_['features'])\n
    sorted(selector.best_solution_['features']) Out[5]:
    [6, 10, 82, 93, 168]

    The features with index <= 15 are relevant; the others are random noise. Only some of the relevant features are selected, yet we still obtain a good improvement in the AUC score:

    In\u00a0[6]: Copied!
    selector.best_solution_['mean_test_score']\n
    selector.best_solution_['mean_test_score'] Out[6]:
    0.9197176917691768

    The best AUC score obtained with feature elimination is now 0.92: that's ~0.06 AUC points gained by removing useless features.

    In\u00a0[8]: Copied!
    selector.transform(X).shape\n
    selector.transform(X).shape Out[8]:
    (1000, 5)
    "},{"location":"tutorials/genetic_algorithms_x_feature_selection/#genetic-algorithms-x-feature-selection","title":"Genetic Algorithms x Feature Selection\u00b6","text":"

    This tutorial will show an example of how we can use genetic algorithms applied to feature selection to improve our model performance.

    More specifically, this tutorial will illustrate how to perform feature selection using a genetic algorithm as implemented in the class felimination.ga.HybridImportanceGACVFeatureSelector.

    "},{"location":"tutorials/genetic_algorithms_x_feature_selection/#create-a-dummy-dataset","title":"Create a dummy Dataset\u00b6","text":"

    For this tutorial we will use a dummy classification dataset created using sklearn.datasets.make_classification. For this dataset we will have 6 predictive features, 10 redundant and 184 random features.

    "},{"location":"tutorials/genetic_algorithms_x_feature_selection/#evaluate-performances-without-feature-elimination","title":"Evaluate performances without feature elimination\u00b6","text":""},{"location":"tutorials/genetic_algorithms_x_feature_selection/#perform-now-feature-elimination","title":"Perform now feature elimination\u00b6","text":""},{"location":"tutorials/recursive_feature_elimination/","title":"Recursive Feature Elimination (RFE)","text":"In\u00a0[\u00a0]: Copied!
    # Install felimination\n! pip install felimination\n
    # Install felimination ! pip install felimination In\u00a0[2]: Copied!
    from sklearn.datasets import make_classification\n\nX, y = make_classification(\n    n_samples=1000,\n    n_features=200,\n    n_informative=6,\n    n_redundant=10,\n    n_clusters_per_class=1,\n    random_state=42,\n    shuffle=False\n)\n
    from sklearn.datasets import make_classification X, y = make_classification( n_samples=1000, n_features=200, n_informative=6, n_redundant=10, n_clusters_per_class=1, random_state=42, shuffle=False ) In\u00a0[3]: Copied!
    from sklearn.model_selection import cross_validate, StratifiedKFold\nfrom sklearn.linear_model import LogisticRegression\n\n\n# Define a simple logistic regression model\nmodel = LogisticRegression(random_state=42)\n\n# Perform cross-validation\ncv_results = cross_validate(\n    model,\n    X,\n    y,\n    cv=StratifiedKFold(random_state=42, shuffle=True),\n    scoring=\"roc_auc\",\n    return_train_score=True,\n)\n\ncv_results[\"test_score\"].mean()\n
    from sklearn.model_selection import cross_validate, StratifiedKFold from sklearn.linear_model import LogisticRegression # Define a simple logistic regression model model = LogisticRegression(random_state=42) # Perform cross-validation cv_results = cross_validate( model, X, y, cv=StratifiedKFold(random_state=42, shuffle=True), scoring=\"roc_auc\", return_train_score=True, ) cv_results[\"test_score\"].mean() Out[3]:
    0.8561362716271628
    In\u00a0[4]: Copied!
    from felimination.rfe import PermutationImportanceRFECV\nfrom felimination.callbacks import plot_progress_callback\n\n\nselector = PermutationImportanceRFECV(\n    model,\n    step=0.2,\n    callbacks=[plot_progress_callback],\n    scoring=\"roc_auc\",\n    cv=StratifiedKFold(random_state=42, shuffle=True),\n)\nselector.fit(X, y)\n
    from felimination.rfe import PermutationImportanceRFECV from felimination.callbacks import plot_progress_callback selector = PermutationImportanceRFECV( model, step=0.2, callbacks=[plot_progress_callback], scoring=\"roc_auc\", cv=StratifiedKFold(random_state=42, shuffle=True), ) selector.fit(X, y) Out[4]:
    PermutationImportanceRFECV(callbacks=[<function plot_progress_callback at 0x103583d80>],\n                           cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n                           estimator=LogisticRegression(random_state=42),\n                           scoring='roc_auc', step=0.2)

    Notice how model performance increases with the progressive elimination of features.

    This is because models with many non-predictive features tend to find patterns even in random noise and end up overfitting; notice how the train score and the validation score get closer with the progressive elimination of features.

    In\u00a0[5]: Copied!
    import pandas as pd\n\ncv_results_df = pd.DataFrame(selector.cv_results_)\n\ncv_results_df[[\"mean_test_score\", \"n_features\"]].sort_values(\n    \"mean_test_score\", ascending=False\n).head(10)\n
    import pandas as pd cv_results_df = pd.DataFrame(selector.cv_results_) cv_results_df[[\"mean_test_score\", \"n_features\"]].sort_values( \"mean_test_score\", ascending=False ).head(10) Out[5]:
        mean_test_score  n_features
    7          0.944138          44
    6          0.943558          54
    8          0.943018          36
    9          0.942478          29
    5          0.942438          67
    4          0.942058          83
    10         0.939718          24
    11         0.937578          20
    12         0.935838          16
    13         0.935698          13

    The best AUC score obtained with feature elimination is now 0.94: that's ~0.08 AUC points gained while also using fewer features.

    If I had to choose a number of features, I would probably go for 13, because there the validation score is very close to the train score.

    We can do this using the method set_n_features_to_select. This will change the support of the selector as well as the behavior of the transform method.

    In\u00a0[6]: Copied!
    selector.set_n_features_to_select(13)\nselector.transform(X).shape\n
    selector.set_n_features_to_select(13) selector.transform(X).shape Out[6]:
    (1000, 13)
    In\u00a0[7]: Copied!
    import numpy as np\n\n# Show the index of the selected features, index <= 15 are relevant\nnp.arange(0, X.shape[1])[selector.support_]\n
    import numpy as np # Show the index of the selected features, index <= 15 are relevant np.arange(0, X.shape[1])[selector.support_] Out[7]:
    array([  1,   2,   3,   7,   8,   9,  10,  69,  80,  82, 155, 186, 197])

    We can see from the indices of the selected features that most of them are informative (index <= 15), although some random features are still being selected and some of the selected features are still redundant.

    "},{"location":"tutorials/recursive_feature_elimination/#recursive-feature-elimination-rfe","title":"Recursive Feature Elimination (RFE)\u00b6","text":"

    This tutorial will show an example of how we can use recursive feature elimination to improve our model performance. More specifically, this tutorial will illustrate how to perform backward recursive feature elimination based on permutation importance using the class felimination.rfe.PermutationImportanceRFECV.

    "},{"location":"tutorials/recursive_feature_elimination/#create-a-dummy-dataset","title":"Create a dummy Dataset\u00b6","text":"

    For this tutorial we will use a dummy classification dataset created using sklearn.datasets.make_classification. For this dataset we will have 6 predictive features, 10 redundant and 184 random features.

    "},{"location":"tutorials/recursive_feature_elimination/#evaluate-performances-without-feature-elimination","title":"Evaluate performances without feature elimination\u00b6","text":""},{"location":"tutorials/recursive_feature_elimination/#perform-now-feature-elimination","title":"Perform now feature elimination\u00b6","text":""}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Homepage","text":"

    This library contains some useful scikit-learn compatible classes for feature selection.

    "},{"location":"#features","title":"Features","text":""},{"location":"#requirements","title":"Requirements","text":""},{"location":"#installation","title":"Installation","text":"

    In a terminal shell run the following command

    pip install felimination\n

    "},{"location":"#usage","title":"Usage","text":""},{"location":"#recursive-feature-elimination","title":"Recursive Feature Elimination","text":"

    This section illustrates how to use the PermutationImportanceRFECV class.

    from felimination.rfe import PermutationImportanceRFECV\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import make_classification\nimport numpy as np\n\n\nX, y = make_classification(\n    n_samples=1000,\n    n_features=20,\n    n_informative=6,\n    n_redundant=10,\n    n_clusters_per_class=1,\n    random_state=42,\n)\n\nselector = PermutationImportanceRFECV(LogisticRegression(), step=0.3)\n\nselector.fit(X, y)\n\nselector.support_\n# array([False, False, False, False, False, False, False, False, False,\n#        False, False,  True, False, False, False, False, False, False,\n#        False, False])\n\nselector.ranking_\n# array([9, 3, 8, 9, 7, 8, 5, 6, 9, 6, 8, 1, 9, 7, 8, 9, 9, 2, 4, 7])\nselector.plot()\n

    It looks like 5 is a good number of features; we can set the number of features to select to 5 without needing to retrain.

    selector.set_n_features_to_select(5)\nselector.support_\n# array([False,  True, False, False, False, False,  True, False, False,\n#        False, False,  True, False, False, False, False, False,  True,\n#         True, False])\n
    "},{"location":"#genetic-algorithms","title":"Genetic Algorithms","text":"

    This section illustrates how to use the HybridImportanceGACVFeatureSelector class.

    from felimination.ga import HybridImportanceGACVFeatureSelector\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import make_classification\nimport numpy as np\n\n# Create dummy dataset\nX, y = make_classification(\n    n_samples=1000,\n    n_features=20,\n    n_informative=6,\n    n_redundant=10,\n    n_clusters_per_class=1,\n    random_state=42,\n)\n\n# Initialize selector\nselector = HybridImportanceGACVFeatureSelector(\n    LogisticRegression(random_state=42),\n    random_state=42,\n    pool_size=5,\n    patience=5\n)\n\n# Run optimisation\nselector.fit(X, y)\n\n# Show selected features\nselector.support_\n#array([False,  True, False,  True,  True, False, False, False,  True,\n#       False, False, False,  True,  True,  True,  True, False,  True,\n#        True, False])\n\n# Show best solution\nselector.best_solution_\n# {'features': [1, 12, 13, 8, 17, 15, 18, 4, 3, 14],\n#  'train_scores_per_fold': [0.88625, 0.89, 0.8825, 0.8925, 0.88625],\n#  'test_scores_per_fold': [0.895, 0.885, 0.885, 0.89, 0.89],\n#  'cv_importances': [array([[ 1.09135972,  1.13502636,  1.12100231,  0.38285736,  0.28944072,\n#            0.04688614,  0.44259813,  0.09832365,  0.10190421, -0.48101593]]),\n#   array([[ 1.17345812,  1.29375208,  1.2065342 ,  0.40418709,  0.41839714,\n#            0.00447802,  0.466717  ,  0.21733829, -0.00842075, -0.50078996]]),\n#   array([[ 1.15416104,  1.18458564,  1.18083266,  0.37071253,  0.22842685,\n#            0.1087814 ,  0.44446793,  0.12740545,  0.00621562, -0.54064287]]),\n#   array([[ 1.26011643,  1.36996058,  1.30481424,  0.48183549,  0.40589887,\n#           -0.01849671,  0.45606913,  0.18330816,  0.03667055, -0.50869557]]),\n#   array([[ 1.18227123,  1.28988253,  1.2496398 ,  0.50754295,  0.38942303,\n#           -0.01725074,  0.4481891 ,  0.19472963,  0.10034316, -0.50131192]])],\n#  'mean_train_score': 0.8875,\n#  'mean_test_score': 0.889,\n#  'mean_cv_importances': array([ 1.17227331,  1.25464144,  1.21256464,  0.42942709,  0.34631732,\n#          0.02487962,  0.45160826,  0.16422104,  0.04734256, -0.50649125])}\n\n# Show progress as a plot\nselector.plot()\n

    It looks like the optimisation process converged after 2 steps: since the best score did not improve for 5 (= patience) consecutive steps, the optimisation process stopped early.

    "},{"location":"#license","title":"License","text":"

    This project is licensed under the BSD 3-Clause License - see the LICENSE.md file for details

    "},{"location":"#acknowledgments","title":"Acknowledgments","text":""},{"location":"reference/RFE/","title":"RFE","text":"

    Module with tools to perform feature selection.

    This module contains the following classes:

    "},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV","title":"FeliminationRFECV(estimator, *, step=1, n_features_to_select=1, cv=None, scoring=None, random_state=None, verbose=0, n_jobs=None, importance_getter='auto', callbacks=None)","text":"

    Bases: RFE

    Perform recursive feature elimination with cross-validation following scikit-learn standards.

    It has the following differences from scikit-learn's RFECV:

    Other than that, it is a copy-paste of RFE, so credit goes to scikit-learn.

    The algorithm of feature selection goes as follows:

    while n_features > n_features_to_select:\n    - The estimator is trained on the selected features and the score is\n      computed using cross validation.\n    - feature importance is computed for each validation fold on the validation\n      set and then averaged.\n    - The least important features are pruned.\n    - The pruned features are removed from the dataset.\n

    Parameters:

    Attributes:

    Examples:

    The following example shows how to retrieve the 5 most informative features in the Friedman #1 dataset.

    >>> from felimination.rfe import FeliminationRFECV\n>>> from felimination.importance import PermutationImportance\n>>> from sklearn.datasets import make_friedman1\n>>> from sklearn.svm import SVR\n>>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n>>> estimator = SVR(kernel=\"linear\")\n>>> selector = selector = FeliminationRFECV(\n    estimator,\n    step=1,\n    cv=5,\n    n_features_to_select=5,\n    importance_getter=PermutationImportance()\n)\n>>> selector = selector.fit(X, y)\n>>> selector.support_\narray([ True,  True,  True,  True,  True, False, False, False, False,\n       False])\n>>> selector.ranking_\narray([1, 1, 1, 1, 1, 6, 3, 4, 2, 5])\n
    Source code in felimination/rfe.py
    def __init__(\n    self,\n    estimator: BaseEstimator | LogisticRegression,\n    *,\n    step=1,\n    n_features_to_select=1,\n    cv=None,\n    scoring=None,\n    random_state=None,\n    verbose=0,\n    n_jobs=None,\n    importance_getter=\"auto\",\n    callbacks=None,\n) -> None:\n    self.cv = cv\n    self.scoring = scoring\n    self.n_jobs = n_jobs\n    self.random_state = random_state\n    self.callbacks = callbacks\n    super().__init__(\n        estimator,\n        n_features_to_select=n_features_to_select,\n        step=step,\n        verbose=verbose,\n        importance_getter=importance_getter,\n    )\n
    "},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV.fit","title":"fit(X, y, groups=None, **fit_params)","text":"

    Fit the RFE model and then the underlying estimator on the selected features.

    Parameters:

    Returns:

    Source code in felimination/rfe.py
    def fit(self, X, y, groups=None, **fit_params):\n    \"\"\"Fit the RFE model and then the underlying estimator on the selected features.\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix} of shape (n_samples, n_features)\n        The training input samples.\n    y : array-like of shape (n_samples,)\n        The target values.\n    **fit_params : dict\n        Additional parameters passed to the `fit` method of the underlying\n        estimator.\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n    \"\"\"\n    self._validate_params()\n    tags = self._get_tags()\n    self._validate_data(\n        X,\n        y,\n        accept_sparse=\"csc\",\n        ensure_min_features=2,\n        force_all_finite=not tags.get(\"allow_nan\", True),\n        multi_output=True,\n        dtype=None,\n    )\n\n    # Initialization\n    cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n    scorer = check_scoring(self.estimator, scoring=self.scoring)\n    n_features = X.shape[1]\n\n    if self.n_features_to_select is None:\n        n_features_to_select = n_features // 2\n    elif isinstance(self.n_features_to_select, Integral):  # int\n        n_features_to_select = self.n_features_to_select\n    else:  # float\n        n_features_to_select = int(n_features * self.n_features_to_select)\n\n    support_ = np.ones(n_features, dtype=bool)\n    ranking_ = np.ones(n_features, dtype=int)\n\n    current_number_of_features = n_features\n    self.cv_results_ = defaultdict(list)\n\n    # Elimination\n    while current_number_of_features > n_features_to_select:\n        # Select remaining features\n        X_remaining_features, features = self._select_X_with_remaining_features(\n            X, support=support_, n_features=n_features\n        )\n\n        if self.verbose > 0:\n            print(\n                \"Fitting estimator with %d features.\" % current_number_of_features\n            )\n\n        # Train model, score it and get importances\n        if effective_n_jobs(self.n_jobs) == 1:\n            parallel, func = list, _train_score_get_importance\n        else:\n            parallel = Parallel(n_jobs=self.n_jobs)\n            func = delayed(_train_score_get_importance)\n\n        scores_importances = parallel(\n            func(\n                self.estimator,\n                X_remaining_features,\n                y,\n                train,\n                test,\n                scorer,\n                self.importance_getter,\n            )\n            for train, test in cv.split(X_remaining_features, y, groups)\n        )\n        train_scores_per_fold = [\n            score_importance[0] for score_importance in scores_importances\n        ]\n        test_scores_per_fold = [\n            score_importance[1] for score_importance in scores_importances\n        ]\n        cv_importances = [\n            score_importance[2] for score_importance in scores_importances\n        ]\n        mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n        ranks = np.argsort(mean_importances)\n\n        # for sparse case ranks is matrix\n        ranks = np.ravel(ranks)\n\n        if 0.0 < self.step < 1.0:\n            step = int(max(1, self.step * current_number_of_features))\n        else:\n            step = int(self.step)\n\n        # Eliminate the worst features\n        threshold = min(step, current_number_of_features - n_features_to_select)\n\n        support_[features[ranks][:threshold]] = False\n        
ranking_[np.logical_not(support_)] += 1\n\n        # Update cv scores\n        for train_or_test, scores_per_fold in zip(\n            [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n        ):\n            for i, score in enumerate(scores_per_fold):\n                self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n            self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n                np.mean(scores_per_fold)\n            )\n            self.cv_results_[f\"std_{train_or_test}_score\"].append(\n                np.std(scores_per_fold)\n            )\n        self.cv_results_[\"n_features\"].append(current_number_of_features)\n        if self.callbacks:\n            for callback in self.callbacks:\n                callback(self, cv_importances)\n\n        current_number_of_features = np.sum(support_)\n    # Set final attributes\n\n    # Estimate performances of final model\n    X_remaining_features, features = self._select_X_with_remaining_features(\n        X, support=support_, n_features=n_features\n    )\n\n    cv_scores = cross_validate(\n        self.estimator,\n        X_remaining_features,\n        y,\n        groups=groups,\n        scoring=scorer,\n        cv=cv,\n        n_jobs=self.n_jobs,\n        fit_params=fit_params,\n        return_train_score=True,\n    )\n    self.cv_results_[\"n_features\"].append(current_number_of_features)\n    # Update cv scores\n    for train_or_test in [\"train\", \"test\"]:\n        scores_per_fold = cv_scores[f\"{train_or_test}_score\"]\n        for i, score in enumerate(scores_per_fold):\n            self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n        self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n            np.mean(scores_per_fold)\n        )\n        self.cv_results_[f\"std_{train_or_test}_score\"].append(\n            np.std(scores_per_fold)\n        )\n\n    if self.callbacks:\n        for callback in self.callbacks:\n            callback(self, cv_importances)\n\n    X_remaining_features, features = self._select_X_with_remaining_features(\n        X, support=support_, n_features=n_features\n    )\n\n    self.estimator_ = clone(self.estimator)\n    self.estimator_.fit(X_remaining_features, y, **fit_params)\n\n    self.n_features_ = support_.sum()\n    self.support_ = support_\n    self.ranking_ = ranking_\n    self.cv_results_ = dict(self.cv_results_)\n    return self\n
    "},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV.plot","title":"plot(**kwargs)","text":"

    Plot a feature selection plot with number of features

    Parameters:

    Returns:

    Source code in felimination/rfe.py
    def plot(self, **kwargs):\n    \"\"\"Plot a feature selection plot with number of features\n\n    Parameters\n    ----------\n    **kwargs : dict\n        Additional parameters passed to seaborn.lineplot. For a list\n        of possible options, please visit\n        [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html)  # noqa\n\n    Returns\n    -------\n    matplotlib.axes.Axes\n        The axis where the plot has been plotted.\n    \"\"\"\n    check_is_fitted(self)\n    df = pd.DataFrame(self.cv_results_)\n    split_score_cols = [col for col in df if \"split\" in col]\n    df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n        id_vars=[\"n_features\"],\n        value_vars=split_score_cols,\n        var_name=\"split\",\n        value_name=\"score\",\n    )\n    df_long_form[\"set\"] = np.where(\n        df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n    )\n    lineplot_kwargs = dict(\n        x=\"n_features\",\n        y=\"score\",\n        hue=\"set\",\n        markers=True,\n        style=\"set\",\n        hue_order=[\"validation\", \"train\"],\n        style_order=[\"validation\", \"train\"],\n        seed=self.random_state,\n    )\n    lineplot_kwargs.update(**kwargs)\n    ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n    ax.set_xticks(df.n_features)\n    return ax\n
    "},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)","text":"

    Changes the number of features to select after fitting.

    The underlying estimator will not be retrained. So this method will not alter the behavior of predict/predict_proba but it will change the behavior of transform and get_feature_names_out.

    Parameters:

    Returns:

    Raises:

    Source code in felimination/rfe.py
    def set_n_features_to_select(self, n_features_to_select):\n    \"\"\"Changes the number of features to select after fitting.\n\n    The underlying estimator **will not be retrained**. So this method will not\n    alter the behavior of predict/predict_proba but it will change the behavior\n    of transform and get_feature_names_out.\n\n    Parameters\n    ----------\n    n_features_to_select : int\n        The number of features to select. Must be a value among\n        `cv_results_[\"n_features\"]`\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n\n    Raises\n    ------\n    ValueError\n        When the number of features to select has not been tried during the\n        feature selection procedure.\n    \"\"\"\n    check_is_fitted(self)\n    if n_features_to_select not in self.cv_results_[\"n_features\"]:\n        raise ValueError(\n            f\"This selector has not been fitted up with {n_features_to_select}, \"\n            f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n            \"refit the selector changing the step parameter of the n_features_to_select\"\n        )\n    support_ = np.zeros_like(self.support_, dtype=bool)\n    support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n    self.support_ = support_\n    return self\n
    "},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV","title":"PermutationImportanceRFECV(estimator, *, step=1, n_features_to_select=1, cv=None, scoring=None, verbose=0, n_jobs=None, n_repeats=5, random_state=None, sample_weight=None, max_samples=1.0, callbacks=None)","text":"

    Bases: FeliminationRFECV

    Preset of FeliminationRFECV using permutation importance as importance getter.

    It has the following differences from scikit-learn's RFECV:

    Other than that, it is a copy-paste of RFE, so credit goes to scikit-learn.

    The algorithm of feature selection goes as follows:

    while n_features > n_features_to_select:\n    - The estimator is trained on the selected features and the score is\n      computed using cross validation.\n    - feature importance is computed for each validation fold on the validation\n      set and then averaged.\n    - The least important features are pruned.\n    - The pruned features are removed from the dataset.\n

    Parameters:

    Attributes:

    Examples:

    The following example shows how to retrieve the 5 most informative features in the Friedman #1 dataset.

    >>> from felimination.rfe import PermutationImportanceRFECV\n>>> from sklearn.datasets import make_friedman1\n>>> from sklearn.svm import SVR\n>>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n>>> estimator = SVR(kernel=\"linear\")\n>>> selector = selector = PermutationImportanceRFECV(\n        estimator,\n        step=1,\n        cv=5,\n        n_features_to_select=5,\n    )\n>>> selector = selector.fit(X, y)\n>>> selector.support_\narray([ True,  True,  True,  True,  True, False, False, False, False,\n       False])\n>>> selector.ranking_\narray([1, 1, 1, 1, 1, 6, 3, 4, 2, 5])\n
    Source code in felimination/rfe.py
    def __init__(\n    self,\n    estimator: BaseEstimator | LogisticRegression,\n    *,\n    step=1,\n    n_features_to_select=1,\n    cv=None,\n    scoring=None,\n    verbose=0,\n    n_jobs=None,\n    n_repeats=5,\n    random_state=None,\n    sample_weight=None,\n    max_samples=1.0,\n    callbacks=None,\n) -> None:\n    self.n_repeats = n_repeats\n    self.sample_weight = sample_weight\n    self.max_samples = max_samples\n    super().__init__(\n        estimator,\n        step=step,\n        n_features_to_select=n_features_to_select,\n        cv=cv,\n        random_state=random_state,\n        scoring=scoring,\n        verbose=verbose,\n        n_jobs=n_jobs,\n        callbacks=callbacks,\n        importance_getter=PermutationImportance(\n            scoring=scoring,\n            n_repeats=n_repeats,\n            # Better not to do double parallelization\n            n_jobs=1,\n            random_state=random_state,\n            sample_weight=sample_weight,\n            max_samples=max_samples,\n        ),\n    )\n
    "},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV.fit","title":"fit(X, y, groups=None, **fit_params)","text":"

    Fit the RFE model and then the underlying estimator on the selected features.

    Parameters:

    Returns:

    Source code in felimination/rfe.py
    def fit(self, X, y, groups=None, **fit_params):\n    \"\"\"Fit the RFE model and then the underlying estimator on the selected features.\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix} of shape (n_samples, n_features)\n        The training input samples.\n    y : array-like of shape (n_samples,)\n        The target values.\n    **fit_params : dict\n        Additional parameters passed to the `fit` method of the underlying\n        estimator.\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n    \"\"\"\n    self._validate_params()\n    tags = self._get_tags()\n    self._validate_data(\n        X,\n        y,\n        accept_sparse=\"csc\",\n        ensure_min_features=2,\n        force_all_finite=not tags.get(\"allow_nan\", True),\n        multi_output=True,\n        dtype=None,\n    )\n\n    # Initialization\n    cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n    scorer = check_scoring(self.estimator, scoring=self.scoring)\n    n_features = X.shape[1]\n\n    if self.n_features_to_select is None:\n        n_features_to_select = n_features // 2\n    elif isinstance(self.n_features_to_select, Integral):  # int\n        n_features_to_select = self.n_features_to_select\n    else:  # float\n        n_features_to_select = int(n_features * self.n_features_to_select)\n\n    support_ = np.ones(n_features, dtype=bool)\n    ranking_ = np.ones(n_features, dtype=int)\n\n    current_number_of_features = n_features\n    self.cv_results_ = defaultdict(list)\n\n    # Elimination\n    while current_number_of_features > n_features_to_select:\n        # Select remaining features\n        X_remaining_features, features = self._select_X_with_remaining_features(\n            X, support=support_, n_features=n_features\n        )\n\n        if self.verbose > 0:\n            print(\n                \"Fitting estimator with %d features.\" % current_number_of_features\n            )\n\n        # Train model, score it and get importances\n        if effective_n_jobs(self.n_jobs) == 1:\n            parallel, func = list, _train_score_get_importance\n        else:\n            parallel = Parallel(n_jobs=self.n_jobs)\n            func = delayed(_train_score_get_importance)\n\n        scores_importances = parallel(\n            func(\n                self.estimator,\n                X_remaining_features,\n                y,\n                train,\n                test,\n                scorer,\n                self.importance_getter,\n            )\n            for train, test in cv.split(X_remaining_features, y, groups)\n        )\n        train_scores_per_fold = [\n            score_importance[0] for score_importance in scores_importances\n        ]\n        test_scores_per_fold = [\n            score_importance[1] for score_importance in scores_importances\n        ]\n        cv_importances = [\n            score_importance[2] for score_importance in scores_importances\n        ]\n        mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n        ranks = np.argsort(mean_importances)\n\n        # for sparse case ranks is matrix\n        ranks = np.ravel(ranks)\n\n        if 0.0 < self.step < 1.0:\n            step = int(max(1, self.step * current_number_of_features))\n        else:\n            step = int(self.step)\n\n        # Eliminate the worst features\n        threshold = min(step, current_number_of_features - n_features_to_select)\n\n        support_[features[ranks][:threshold]] = False\n        
ranking_[np.logical_not(support_)] += 1\n\n        # Update cv scores\n        for train_or_test, scores_per_fold in zip(\n            [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n        ):\n            for i, score in enumerate(scores_per_fold):\n                self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n            self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n                np.mean(scores_per_fold)\n            )\n            self.cv_results_[f\"std_{train_or_test}_score\"].append(\n                np.std(scores_per_fold)\n            )\n        self.cv_results_[\"n_features\"].append(current_number_of_features)\n        if self.callbacks:\n            for callback in self.callbacks:\n                callback(self, cv_importances)\n\n        current_number_of_features = np.sum(support_)\n    # Set final attributes\n\n    # Estimate performances of final model\n    X_remaining_features, features = self._select_X_with_remaining_features(\n        X, support=support_, n_features=n_features\n    )\n\n    cv_scores = cross_validate(\n        self.estimator,\n        X_remaining_features,\n        y,\n        groups=groups,\n        scoring=scorer,\n        cv=cv,\n        n_jobs=self.n_jobs,\n        fit_params=fit_params,\n        return_train_score=True,\n    )\n    self.cv_results_[\"n_features\"].append(current_number_of_features)\n    # Update cv scores\n    for train_or_test in [\"train\", \"test\"]:\n        scores_per_fold = cv_scores[f\"{train_or_test}_score\"]\n        for i, score in enumerate(scores_per_fold):\n            self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n        self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n            np.mean(scores_per_fold)\n        )\n        self.cv_results_[f\"std_{train_or_test}_score\"].append(\n            np.std(scores_per_fold)\n        )\n\n    if self.callbacks:\n        for callback in self.callbacks:\n            callback(self, cv_importances)\n\n    X_remaining_features, features = self._select_X_with_remaining_features(\n        X, support=support_, n_features=n_features\n    )\n\n    self.estimator_ = clone(self.estimator)\n    self.estimator_.fit(X_remaining_features, y, **fit_params)\n\n    self.n_features_ = support_.sum()\n    self.support_ = support_\n    self.ranking_ = ranking_\n    self.cv_results_ = dict(self.cv_results_)\n    return self\n
    "},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV.plot","title":"plot(**kwargs)","text":"

    Plot train and validation scores as a function of the number of features.

    Parameters:

    Returns:

    Source code in felimination/rfe.py
    def plot(self, **kwargs):\n    \"\"\"Plot a feature selection plot with number of features\n\n    Parameters\n    ----------\n    **kwargs : dict\n        Additional parameters passed to seaborn.lineplot. For a list\n        of possible options, please visit\n        [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html)  # noqa\n\n    Returns\n    -------\n    matplotlib.axes.Axes\n        The axis where the plot has been plotted.\n    \"\"\"\n    check_is_fitted(self)\n    df = pd.DataFrame(self.cv_results_)\n    split_score_cols = [col for col in df if \"split\" in col]\n    df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n        id_vars=[\"n_features\"],\n        value_vars=split_score_cols,\n        var_name=\"split\",\n        value_name=\"score\",\n    )\n    df_long_form[\"set\"] = np.where(\n        df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n    )\n    lineplot_kwargs = dict(\n        x=\"n_features\",\n        y=\"score\",\n        hue=\"set\",\n        markers=True,\n        style=\"set\",\n        hue_order=[\"validation\", \"train\"],\n        style_order=[\"validation\", \"train\"],\n        seed=self.random_state,\n    )\n    lineplot_kwargs.update(**kwargs)\n    ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n    ax.set_xticks(df.n_features)\n    return ax\n
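
    As an illustrative sketch, any extra keyword arguments are forwarded to seaborn.lineplot; the estimator, dataset and the palette choice below are assumptions for demonstration only:

    >>> from felimination.rfe import PermutationImportanceRFECV\n>>> from sklearn.datasets import make_friedman1\n>>> from sklearn.svm import SVR\n>>> from matplotlib import pyplot as plt\n>>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n>>> selector = PermutationImportanceRFECV(SVR(kernel=\"linear\"), step=1, cv=5, n_features_to_select=5)\n>>> selector = selector.fit(X, y)\n>>> ax = selector.plot(palette=\"colorblind\")  # kwargs are forwarded to seaborn.lineplot\n>>> plt.show()\n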
    "},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)","text":"

    Changes the number of features to select after fitting.

    The underlying estimator will not be retrained. So this method will not alter the behavior of predict/predict_proba but it will change the behavior of transform and get_feature_names_out.

    Parameters:

    Returns:

    Raises:

    Source code in felimination/rfe.py
    def set_n_features_to_select(self, n_features_to_select):\n    \"\"\"Changes the number of features to select after fitting.\n\n    The underlying estimator **will not be retrained**. So this method will not\n    alter the behavior of predict/predict_proba but it will change the behavior\n    of transform and get_feature_names_out.\n\n    Parameters\n    ----------\n    n_features_to_select : int\n        The number of features to select. Must be a value among\n        `cv_results_[\"n_features\"]`\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n\n    Raises\n    ------\n    ValueError\n        When the number of features to select has not been tried during the\n        feature selection procedure.\n    \"\"\"\n    check_is_fitted(self)\n    if n_features_to_select not in self.cv_results_[\"n_features\"]:\n        raise ValueError(\n            f\"This selector has not been fitted up with {n_features_to_select}, \"\n            f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n            \"refit the selector changing the step parameter of the n_features_to_select\"\n        )\n    support_ = np.zeros_like(self.support_, dtype=bool)\n    support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n    self.support_ = support_\n    return self\n
    "},{"location":"reference/callbacks/","title":"Callbacks","text":"

    Callbacks for feature selection algorithms.

    "},{"location":"reference/callbacks/#felimination.callbacks.plot_progress_callback","title":"plot_progress_callback(selector, *args, **kwargs)","text":"

    Plot the feature selection progress during the algorithm execution.

    Parameters:

    Source code in felimination/callbacks.py
    def plot_progress_callback(selector, *args, **kwargs):\n    \"\"\"Plot the feature selection progress during the algorithm execution.\n\n    Parameters\n    ----------\n    selector : object\n        The feature selector object.\n    \"\"\"\n    from IPython import display\n    from matplotlib import pyplot as plt\n\n    display.clear_output(wait=True)\n    selector.plot()\n    plt.show()\n
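
    A minimal usage sketch (the estimator and data are assumptions, and the callback is intended for notebook environments since it relies on IPython and matplotlib): pass the callback through the callbacks parameter of a selector so the progress plot is refreshed after every elimination round.

    >>> from felimination.callbacks import plot_progress_callback\n>>> from felimination.rfe import PermutationImportanceRFECV\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.linear_model import LogisticRegression\n>>> X, y = make_classification(n_samples=200, n_features=20, random_state=0)\n>>> selector = PermutationImportanceRFECV(\n        LogisticRegression(),\n        step=1,\n        cv=3,\n        n_features_to_select=5,\n        callbacks=[plot_progress_callback],  # called after each elimination round\n    )\n>>> selector = selector.fit(X, y)  # in a notebook, the plot refreshes as features are removed\n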
    "},{"location":"reference/drift/","title":"Drift","text":"

    The idea behind this module comes from the conjunction of two concepts:

    In [1], classifier performance is used to determine how similar two samples are. More specifically, imagine having two samples: reference and test. To assess whether reference and test have been drawn from the same distribution, we could train a classifier to predict which sample each instance belongs to. If the model easily distinguishes instances from the two samples, then the two samples have probably been drawn from different distributions. Conversely, if the classifier struggles to distinguish them, the samples have likely been drawn from the same distribution.

    In the context of drift detection, the classifier two-sample test can be used to assess whether drift has occurred between the reference set and the test set, and to what degree.

    The classes in this module take this idea one step further and attempt to reduce drift using recursive feature elimination. After a classifier is trained to distinguish between reference and test, its feature importances are used to determine which features contribute the most to telling the two sets apart. The most important features are then eliminated, and the procedure is repeated until the classifier can no longer distinguish between the two samples, or until a certain number of features has been removed.

    This module contains the following classes: - SampleSimilarityDriftRFE: base class for drift-based sample similarity feature selection.
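
    To make the procedure concrete, here is an illustrative sketch; the toy DataFrame, the LogisticRegression classifier and the column used as split_col are assumptions for this example only:

    >>> import numpy as np\n>>> import pandas as pd\n>>> from felimination.drift import SampleSimilarityDriftRFE\n>>> from sklearn.linear_model import LogisticRegression\n>>> rng = np.random.default_rng(0)\n>>> X = pd.DataFrame({\n        \"stable\": rng.normal(size=400),\n        \"drifting\": np.concatenate([rng.normal(0, 1, 200), rng.normal(3, 1, 200)]),\n        \"period\": [0] * 200 + [1] * 200,  # reference (0) vs test (1) indicator\n    })\n>>> selector = SampleSimilarityDriftRFE(\n        LogisticRegression(),\n        split_col=\"period\",\n        max_score=0.55,\n        cv=3,\n        random_state=0,\n    )\n>>> selector = selector.fit(X)\n>>> # selector.support_ now flags the features that do not separate the two periods\n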

    "},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE","title":"PermImpSampleSimilarityDriftRFE(clf, *, step=1, max_score=0.55, min_n_features_to_select=1, split_col=0, split_value=None, split_frac=0.5, split_unique_values=True, cv=None, scoring=None, verbose=0, n_jobs=None, n_repeats=5, random_state=None, sample_weight=None, max_samples=1.0)","text":"

    Bases: SampleSimilarityDriftRFE

    Preset of SampleSimilarityDriftRFE using permutation importance as importance getter.

    It has the following differences from scikit-learn's RFECV:

    Other than that, it is a copy-paste of scikit-learn's RFE, so credit goes to scikit-learn.

    The feature selection algorithm proceeds as follows:

    while n_features > n_features_to_select:\n    - The estimator is trained on the selected features and the score is\n      computed using cross validation.\n    - feature importance is computed for each validation fold on the validation\n      set and then averaged.\n    - The least important features are pruned.\n    - The pruned features are removed from the dataset.\n

    Parameters:

    Attributes:

    Source code in felimination/drift.py
    def __init__(\n    self,\n    clf: ClassifierMixin,\n    *,\n    step=1,\n    max_score=0.55,\n    min_n_features_to_select=1,\n    split_col=0,\n    split_value=None,\n    split_frac=0.5,\n    split_unique_values=True,\n    cv=None,\n    scoring=None,\n    verbose=0,\n    n_jobs=None,\n    n_repeats=5,\n    random_state=None,\n    sample_weight=None,\n    max_samples=1.0,\n) -> None:\n    self.n_repeats = n_repeats\n    self.sample_weight = sample_weight\n    self.max_samples = max_samples\n    super().__init__(\n        clf=clf,\n        max_score=max_score,\n        min_n_features_to_select=min_n_features_to_select,\n        split_col=split_col,\n        split_value=split_value,\n        split_frac=split_frac,\n        split_unique_values=split_unique_values,\n        step=step,\n        cv=cv,\n        scoring=scoring,\n        random_state=random_state,\n        verbose=verbose,\n        n_jobs=n_jobs,\n        importance_getter=PermutationImportance(\n            scoring=scoring,\n            n_repeats=n_repeats,\n            # Better not to do double parallelization\n            n_jobs=1,\n            random_state=random_state,\n            sample_weight=sample_weight,\n            max_samples=max_samples,\n        ),\n    )\n
    "},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE.fit","title":"fit(X, y=None, groups=None, **fit_params)","text":"

    Fit the RFE model and then the underlying clf on the selected features.

    Parameters:

    Returns:

    Source code in felimination/drift.py
    def fit(self, X, y=None, groups=None, **fit_params):\n    \"\"\"Fit the RFE model and then the underlying clf on the selected features.\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix} of shape (n_samples, n_features)\n        The training input samples.\n    y : array-like of shape (n_samples,)\n        The target values. Not used, kept for compatibility.\n    groups : array-like of shape (n_samples,), default=None\n        Group labels for the samples used while splitting the dataset into\n        train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n        instance.\n    **fit_params : dict\n        Additional parameters passed to the `fit` method of the underlying\n        clf.\n\n    Returns\n    -------\n    self : object\n        Fitted selector.\n    \"\"\"\n    self._validate_params()\n    tags = self._get_tags()\n    X = self._validate_data(\n        X,\n        y,\n        accept_sparse=\"csc\",\n        ensure_min_features=2,\n        force_all_finite=not tags.get(\"allow_nan\", True),\n        dtype=None,\n    )\n    if isinstance(self.split_col, str):\n        split_col_idx = list(self.feature_names_in_).index(self.split_col)\n    else:\n        split_col_idx = self.split_col\n    split_col_values = X[:, split_col_idx]\n    X, y = self._build_sample_similarity_x_y(X, split_col_values=split_col_values)\n\n    # Initialization\n    cv = check_cv(self.cv, y, classifier=True)\n    scorer = check_scoring(self.clf, scoring=self.scoring)\n    n_features = X.shape[1]\n\n    if self.min_n_features_to_select is None:\n        min_n_features_to_select = n_features // 2\n    elif isinstance(self.min_n_features_to_select, Integral):  # int\n        min_n_features_to_select = self.min_n_features_to_select\n    else:  # float\n        min_n_features_to_select = int(n_features * self.min_n_features_to_select)\n\n    support_ = np.ones(n_features, dtype=bool)\n    support_[split_col_idx] = False\n    ranking_ = np.ones(n_features, dtype=int)\n\n    current_number_of_features = support_.sum()\n    self.cv_results_ = defaultdict(list)\n\n    if self.verbose > 0:\n        print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n    # Train model, score it and get importances\n    if effective_n_jobs(self.n_jobs) == 1:\n        parallel, func = list, _train_score_get_importance\n    else:\n        parallel = Parallel(n_jobs=self.n_jobs)\n        func = delayed(_train_score_get_importance)\n\n    features = np.arange(n_features)[support_]\n    X_remaining_features = X[:, features]\n\n    scores_importances = parallel(\n        func(\n            self.clf,\n            X_remaining_features,\n            y,\n            train,\n            test,\n            scorer,\n            self.importance_getter,\n        )\n        for train, test in cv.split(X_remaining_features, y, groups)\n    )\n\n    test_scores_per_fold = [\n        score_importance[1] for score_importance in scores_importances\n    ]\n    train_scores_per_fold = [\n        score_importance[0] for score_importance in scores_importances\n    ]\n\n    # Update cv scores\n    for train_or_test, scores_per_fold in zip(\n        [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n    ):\n        for i, score in enumerate(scores_per_fold):\n            self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n        self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n            np.mean(scores_per_fold)\n        )\n        
self.cv_results_[f\"std_{train_or_test}_score\"].append(\n            np.std(scores_per_fold)\n        )\n    self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n    # Elimination\n    while (\n        np.mean(test_scores_per_fold) > self.max_score\n        and current_number_of_features > min_n_features_to_select\n    ):\n        features = np.arange(n_features)[support_]\n        if 0.0 < self.step < 1.0:\n            step = int(max(1, self.step * current_number_of_features))\n        else:\n            step = int(self.step)\n        # Eliminate most important features\n        threshold = min(step, current_number_of_features - min_n_features_to_select)\n        cv_importances = [\n            score_importance[2] for score_importance in scores_importances\n        ]\n        mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n        ranks = np.argsort(-mean_importances)\n        ranks = np.ravel(ranks)\n        support_[features[ranks][:threshold]] = False\n        ranking_[np.logical_not(support_)] += 1\n        current_number_of_features = np.sum(support_)\n        # Select remaining features\n        features = np.arange(n_features)[support_]\n        X_remaining_features = X[:, features]\n\n        if self.verbose > 0:\n            print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n        # Train model, score it and get importances\n        if effective_n_jobs(self.n_jobs) == 1:\n            parallel, func = list, _train_score_get_importance\n        else:\n            parallel = Parallel(n_jobs=self.n_jobs)\n            func = delayed(_train_score_get_importance)\n\n        scores_importances = parallel(\n            func(\n                self.clf,\n                X_remaining_features,\n                y,\n                train,\n                test,\n                scorer,\n                self.importance_getter,\n            )\n            for train, test in cv.split(X_remaining_features, y, groups)\n        )\n        train_scores_per_fold = [\n            score_importance[0] for score_importance in scores_importances\n        ]\n        test_scores_per_fold = [\n            score_importance[1] for score_importance in scores_importances\n        ]\n\n        # Update cv scores\n        for train_or_test, scores_per_fold in zip(\n            [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n        ):\n            for i, score in enumerate(scores_per_fold):\n                self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n            self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n                np.mean(scores_per_fold)\n            )\n            self.cv_results_[f\"std_{train_or_test}_score\"].append(\n                np.std(scores_per_fold)\n            )\n        self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n    features = np.arange(n_features)[support_]\n    self.clf_ = clone(self.clf)\n    self.clf_.fit(X[:, features], y, **fit_params)\n\n    self.n_features_ = support_.sum()\n    self.support_ = support_\n    self.ranking_ = ranking_\n    self.cv_results_ = dict(self.cv_results_)\n    return self\n
    "},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE.plot","title":"plot(**kwargs)","text":"

    Plot train and validation scores as a function of the number of features.

    Parameters:

    Returns:

    Source code in felimination/rfe.py
    def plot(self, **kwargs):\n    \"\"\"Plot a feature selection plot with number of features\n\n    Parameters\n    ----------\n    **kwargs : dict\n        Additional parameters passed to seaborn.lineplot. For a list\n        of possible options, please visit\n        [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html)  # noqa\n\n    Returns\n    -------\n    matplotlib.axes.Axes\n        The axis where the plot has been plotted.\n    \"\"\"\n    check_is_fitted(self)\n    df = pd.DataFrame(self.cv_results_)\n    split_score_cols = [col for col in df if \"split\" in col]\n    df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n        id_vars=[\"n_features\"],\n        value_vars=split_score_cols,\n        var_name=\"split\",\n        value_name=\"score\",\n    )\n    df_long_form[\"set\"] = np.where(\n        df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n    )\n    lineplot_kwargs = dict(\n        x=\"n_features\",\n        y=\"score\",\n        hue=\"set\",\n        markers=True,\n        style=\"set\",\n        hue_order=[\"validation\", \"train\"],\n        style_order=[\"validation\", \"train\"],\n        seed=self.random_state,\n    )\n    lineplot_kwargs.update(**kwargs)\n    ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n    ax.set_xticks(df.n_features)\n    return ax\n
    "},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)","text":"

    Changes the number of features to select after fitting.

    The underlying estimator will not be retrained. So this method will not alter the behavior of predict/predict_proba but it will change the behavior of transform and get_feature_names_out.

    Parameters:

    Returns:

    Raises:

    Source code in felimination/rfe.py
    def set_n_features_to_select(self, n_features_to_select):\n    \"\"\"Changes the number of features to select after fitting.\n\n    The underlying estimator **will not be retrained**. So this method will not\n    alter the behavior of predict/predict_proba but it will change the behavior\n    of transform and get_feature_names_out.\n\n    Parameters\n    ----------\n    n_features_to_select : int\n        The number of features to select. Must be a value among\n        `cv_results_[\"n_features\"]`\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n\n    Raises\n    ------\n    ValueError\n        When the number of features to select has not been tried during the\n        feature selection procedure.\n    \"\"\"\n    check_is_fitted(self)\n    if n_features_to_select not in self.cv_results_[\"n_features\"]:\n        raise ValueError(\n            f\"This selector has not been fitted up with {n_features_to_select}, \"\n            f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n            \"refit the selector changing the step parameter of the n_features_to_select\"\n        )\n    support_ = np.zeros_like(self.support_, dtype=bool)\n    support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n    self.support_ = support_\n    return self\n
    "},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE","title":"SampleSimilarityDriftRFE(clf, *, step=1, max_score=0.55, min_n_features_to_select=1, split_col=0, split_value=None, split_frac=0.5, split_unique_values=True, cv=None, scoring=None, random_state=None, verbose=0, n_jobs=None, importance_getter='auto')","text":"

    Bases: FeliminationRFECV

    Recursively discards the features that introduce the highest drift.

    The feature selection algorithm proceeds as follows:

    Split X into two sets using the `split_col` column: X1 and X2\nCreate target array y1 for X1 as an array of zeroes\nCreate target array y2 for X2 as an array of ones\nVertically concatenate X1, X2 and y1, y2, obtaining X_ss and y_ss\nCalculate cross-validation performance of the estimator on X_ss and y_ss\nwhile cross-validation performance > max_score and n_features > min_n_features_to_select:\n    Discard the most important features\n    Calculate cross-validation performance of the estimator on X_ss and y_ss using the new feature set\n

    Parameters:

    Attributes:

    Source code in felimination/drift.py
    def __init__(\n    self,\n    clf: ClassifierMixin,\n    *,\n    step=1,\n    max_score=0.55,\n    min_n_features_to_select=1,\n    split_col=0,\n    split_value=None,\n    split_frac=0.5,\n    split_unique_values=True,\n    cv=None,\n    scoring=None,\n    random_state=None,\n    verbose=0,\n    n_jobs=None,\n    importance_getter=\"auto\",\n) -> None:\n    self.max_score = max_score\n    self.split_col = split_col\n    self.split_value = split_value\n    self.split_unique_values = split_unique_values\n    self.split_frac = split_frac\n    self.min_n_features_to_select = min_n_features_to_select\n    self.clf = clf\n    super().__init__(\n        estimator=clf,\n        n_features_to_select=min_n_features_to_select,\n        step=step,\n        cv=cv,\n        scoring=scoring,\n        random_state=random_state,\n        verbose=verbose,\n        n_jobs=n_jobs,\n        importance_getter=importance_getter,\n    )\n
    "},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE.fit","title":"fit(X, y=None, groups=None, **fit_params)","text":"

    Fit the RFE model and then the underlying clf on the selected features.

    Parameters:

    Returns:

    Source code in felimination/drift.py
    def fit(self, X, y=None, groups=None, **fit_params):\n    \"\"\"Fit the RFE model and then the underlying clf on the selected features.\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix} of shape (n_samples, n_features)\n        The training input samples.\n    y : array-like of shape (n_samples,)\n        The target values. Not used, kept for compatibility.\n    groups : array-like of shape (n_samples,), default=None\n        Group labels for the samples used while splitting the dataset into\n        train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n        instance.\n    **fit_params : dict\n        Additional parameters passed to the `fit` method of the underlying\n        clf.\n\n    Returns\n    -------\n    self : object\n        Fitted selector.\n    \"\"\"\n    self._validate_params()\n    tags = self._get_tags()\n    X = self._validate_data(\n        X,\n        y,\n        accept_sparse=\"csc\",\n        ensure_min_features=2,\n        force_all_finite=not tags.get(\"allow_nan\", True),\n        dtype=None,\n    )\n    if isinstance(self.split_col, str):\n        split_col_idx = list(self.feature_names_in_).index(self.split_col)\n    else:\n        split_col_idx = self.split_col\n    split_col_values = X[:, split_col_idx]\n    X, y = self._build_sample_similarity_x_y(X, split_col_values=split_col_values)\n\n    # Initialization\n    cv = check_cv(self.cv, y, classifier=True)\n    scorer = check_scoring(self.clf, scoring=self.scoring)\n    n_features = X.shape[1]\n\n    if self.min_n_features_to_select is None:\n        min_n_features_to_select = n_features // 2\n    elif isinstance(self.min_n_features_to_select, Integral):  # int\n        min_n_features_to_select = self.min_n_features_to_select\n    else:  # float\n        min_n_features_to_select = int(n_features * self.min_n_features_to_select)\n\n    support_ = np.ones(n_features, dtype=bool)\n    support_[split_col_idx] = False\n    ranking_ = np.ones(n_features, dtype=int)\n\n    current_number_of_features = support_.sum()\n    self.cv_results_ = defaultdict(list)\n\n    if self.verbose > 0:\n        print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n    # Train model, score it and get importances\n    if effective_n_jobs(self.n_jobs) == 1:\n        parallel, func = list, _train_score_get_importance\n    else:\n        parallel = Parallel(n_jobs=self.n_jobs)\n        func = delayed(_train_score_get_importance)\n\n    features = np.arange(n_features)[support_]\n    X_remaining_features = X[:, features]\n\n    scores_importances = parallel(\n        func(\n            self.clf,\n            X_remaining_features,\n            y,\n            train,\n            test,\n            scorer,\n            self.importance_getter,\n        )\n        for train, test in cv.split(X_remaining_features, y, groups)\n    )\n\n    test_scores_per_fold = [\n        score_importance[1] for score_importance in scores_importances\n    ]\n    train_scores_per_fold = [\n        score_importance[0] for score_importance in scores_importances\n    ]\n\n    # Update cv scores\n    for train_or_test, scores_per_fold in zip(\n        [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n    ):\n        for i, score in enumerate(scores_per_fold):\n            self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n        self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n            np.mean(scores_per_fold)\n        )\n        
self.cv_results_[f\"std_{train_or_test}_score\"].append(\n            np.std(scores_per_fold)\n        )\n    self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n    # Elimination\n    while (\n        np.mean(test_scores_per_fold) > self.max_score\n        and current_number_of_features > min_n_features_to_select\n    ):\n        features = np.arange(n_features)[support_]\n        if 0.0 < self.step < 1.0:\n            step = int(max(1, self.step * current_number_of_features))\n        else:\n            step = int(self.step)\n        # Eliminate most important features\n        threshold = min(step, current_number_of_features - min_n_features_to_select)\n        cv_importances = [\n            score_importance[2] for score_importance in scores_importances\n        ]\n        mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n        ranks = np.argsort(-mean_importances)\n        ranks = np.ravel(ranks)\n        support_[features[ranks][:threshold]] = False\n        ranking_[np.logical_not(support_)] += 1\n        current_number_of_features = np.sum(support_)\n        # Select remaining features\n        features = np.arange(n_features)[support_]\n        X_remaining_features = X[:, features]\n\n        if self.verbose > 0:\n            print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n        # Train model, score it and get importances\n        if effective_n_jobs(self.n_jobs) == 1:\n            parallel, func = list, _train_score_get_importance\n        else:\n            parallel = Parallel(n_jobs=self.n_jobs)\n            func = delayed(_train_score_get_importance)\n\n        scores_importances = parallel(\n            func(\n                self.clf,\n                X_remaining_features,\n                y,\n                train,\n                test,\n                scorer,\n                self.importance_getter,\n            )\n            for train, test in cv.split(X_remaining_features, y, groups)\n        )\n        train_scores_per_fold = [\n            score_importance[0] for score_importance in scores_importances\n        ]\n        test_scores_per_fold = [\n            score_importance[1] for score_importance in scores_importances\n        ]\n\n        # Update cv scores\n        for train_or_test, scores_per_fold in zip(\n            [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n        ):\n            for i, score in enumerate(scores_per_fold):\n                self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n            self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n                np.mean(scores_per_fold)\n            )\n            self.cv_results_[f\"std_{train_or_test}_score\"].append(\n                np.std(scores_per_fold)\n            )\n        self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n    features = np.arange(n_features)[support_]\n    self.clf_ = clone(self.clf)\n    self.clf_.fit(X[:, features], y, **fit_params)\n\n    self.n_features_ = support_.sum()\n    self.support_ = support_\n    self.ranking_ = ranking_\n    self.cv_results_ = dict(self.cv_results_)\n    return self\n
    "},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE.plot","title":"plot(**kwargs)","text":"

    Plot train and validation scores as a function of the number of features.

    Parameters:

    Returns:

    Source code in felimination/rfe.py
    def plot(self, **kwargs):\n    \"\"\"Plot a feature selection plot with number of features\n\n    Parameters\n    ----------\n    **kwargs : dict\n        Additional parameters passed to seaborn.lineplot. For a list\n        of possible options, please visit\n        [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html)  # noqa\n\n    Returns\n    -------\n    matplotlib.axes.Axes\n        The axis where the plot has been plotted.\n    \"\"\"\n    check_is_fitted(self)\n    df = pd.DataFrame(self.cv_results_)\n    split_score_cols = [col for col in df if \"split\" in col]\n    df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n        id_vars=[\"n_features\"],\n        value_vars=split_score_cols,\n        var_name=\"split\",\n        value_name=\"score\",\n    )\n    df_long_form[\"set\"] = np.where(\n        df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n    )\n    lineplot_kwargs = dict(\n        x=\"n_features\",\n        y=\"score\",\n        hue=\"set\",\n        markers=True,\n        style=\"set\",\n        hue_order=[\"validation\", \"train\"],\n        style_order=[\"validation\", \"train\"],\n        seed=self.random_state,\n    )\n    lineplot_kwargs.update(**kwargs)\n    ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n    ax.set_xticks(df.n_features)\n    return ax\n
    "},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)","text":"

    Changes the number of features to select after fitting.

    The underlying estimator will not be retrained. So this method will not alter the behavior of predict/predict_proba but it will change the behavior of transform and get_feature_names_out.

    Parameters:

    Returns:

    Raises:

    Source code in felimination/rfe.py
    def set_n_features_to_select(self, n_features_to_select):\n    \"\"\"Changes the number of features to select after fitting.\n\n    The underlying estimator **will not be retrained**. So this method will not\n    alter the behavior of predict/predict_proba but it will change the behavior\n    of transform and get_feature_names_out.\n\n    Parameters\n    ----------\n    n_features_to_select : int\n        The number of features to select. Must be a value among\n        `cv_results_[\"n_features\"]`\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n\n    Raises\n    ------\n    ValueError\n        When the number of features to select has not been tried during the\n        feature selection procedure.\n    \"\"\"\n    check_is_fitted(self)\n    if n_features_to_select not in self.cv_results_[\"n_features\"]:\n        raise ValueError(\n            f\"This selector has not been fitted up with {n_features_to_select}, \"\n            f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n            \"refit the selector changing the step parameter of the n_features_to_select\"\n        )\n    support_ = np.zeros_like(self.support_, dtype=bool)\n    support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n    self.support_ = support_\n    return self\n
    "},{"location":"reference/genetic_algorithms/","title":"Genetic algorithms","text":"

    This module implements the Hybrid Genetic Algorithm-Importance with Cross-Validation, available through the HybridImportanceGACVFeatureSelector class.

    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector","title":"HybridImportanceGACVFeatureSelector(estimator, *, cv=5, scoring=None, random_state=None, n_jobs=None, importance_getter='auto', min_n_features_to_select=1, init_avg_features_num=15, init_std_features_num=5, pool_size=20, is_parent_selection_chance_proportional_to_fitness=True, n_children_cross_over=5, n_parents_cross_over=2, n_mutations=5, range_change_n_features_mutation=(-2, 3), range_randomly_swapped_features_mutation=(1, 4), max_generations=100, patience=5, callbacks=None, fitness_function=rank_mean_test_score_overfit_fitness)","text":"

    Bases: SelectorMixin, MetaEstimatorMixin, BaseEstimator

    Feature selection using Hybrid Genetic Algorithm-Importance with Cross-Validation.

    This feature selector uses a genetic algorithm hybridized with feature importance, where the importance is calculated using a cross-validation scheme. The algorithm works as follows:

    Pool initialization: The pool is initialized with random feature subsets. The size of each subset is drawn from a normal distribution whose mean and standard deviation are the initial average and standard deviation of the number of features to select, and it is clipped to lie between the minimum number of features to select and the number of features in the dataset.

    Crossover: Children are created by combining the features of their parents. The features are sorted by importance and combined in a round-robin fashion, and the number of features of each child is the average of the number of features of its parents. In this way, children inherit the most important features of their parents.

    Mutation: The number of features is randomly changed and the least important features are replaced with random features.

    Selection: The top pool_size solutions are kept according to the fitness function.

    Parameters:

    Attributes:

    Examples:

    >>> from felimination.ga import HybridImportanceGACVFeatureSelector\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.linear_model import LogisticRegression\n>>> X, y = make_classification(\n    n_samples=1000,\n    n_features=10,\n    n_classes=2,\n    n_redundant=0,\n    n_clusters_per_class=1,\n    random_state=42,\n)\n>>> estimator = LogisticRegression(random_state=42)\n>>> selector = HybridImportanceGACVFeatureSelector(\n    estimator,\n    random_state=42,\n    init_avg_features_num=2,\n    init_std_features_num=1,\n)\n>>> selector = selector.fit(X, y)\n>>> selector.support_.shape\n(10,)\n
    Source code in felimination/ga.py
    def __init__(\n    self,\n    estimator: BaseEstimator | LogisticRegression,\n    *,\n    cv=5,\n    scoring=None,\n    random_state=None,\n    n_jobs=None,\n    importance_getter=\"auto\",\n    min_n_features_to_select=1,\n    init_avg_features_num=15,\n    init_std_features_num=5,\n    pool_size=20,\n    is_parent_selection_chance_proportional_to_fitness=True,\n    n_children_cross_over=5,\n    n_parents_cross_over=2,\n    n_mutations=5,\n    range_change_n_features_mutation=(-2, 3),\n    range_randomly_swapped_features_mutation=(1, 4),\n    max_generations=100,\n    patience=5,\n    callbacks=None,\n    fitness_function=rank_mean_test_score_overfit_fitness,\n) -> None:\n    self.estimator = estimator\n    self.cv = cv\n    self.scoring = scoring\n    self.random_state = random_state\n    self.n_jobs = n_jobs\n    self.importance_getter = importance_getter\n    self.min_n_features_to_select = min_n_features_to_select\n    self.init_avg_features_num = init_avg_features_num\n    self.init_std_features_num = init_std_features_num\n    self.pool_size = pool_size\n    self.n_children_cross_over = n_children_cross_over\n    self.is_parent_selection_chance_proportional_to_fitness = (\n        is_parent_selection_chance_proportional_to_fitness\n    )\n    self.n_parents_cross_over = n_parents_cross_over\n    self.n_mutations = n_mutations\n    self.range_change_n_features_mutation = range_change_n_features_mutation\n    self.range_randomly_swapped_features_mutation = (\n        range_randomly_swapped_features_mutation\n    )\n    self.max_generations = max_generations\n    self.patience = patience\n    self.callbacks = callbacks\n    self.fitness_function = fitness_function\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.decision_function","title":"decision_function(X)","text":"

    Compute the decision function of X.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    @available_if(_estimator_has(\"decision_function\"))\ndef decision_function(self, X):\n    \"\"\"Compute the decision function of ``X``.\n\n    Parameters\n    ----------\n    X : {array-like or sparse matrix} of shape (n_samples, n_features)\n        The input samples. Internally, it will be converted to\n        ``dtype=np.float32`` and if a sparse matrix is provided\n        to a sparse ``csr_matrix``.\n\n    Returns\n    -------\n    score : array, shape = [n_samples, n_classes] or [n_samples]\n        The decision function of the input samples. The order of the\n        classes corresponds to that in the attribute :term:`classes_`.\n        Regression and binary classification produce an array of shape\n        [n_samples].\n    \"\"\"\n    check_is_fitted(self)\n    return self.estimator_.decision_function(self.transform(X))\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.fit","title":"fit(X, y, groups=None, **fit_params)","text":"

    Fit the selector and then the underlying estimator on the selected features.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    def fit(self, X, y, groups=None, **fit_params):\n    \"\"\"Fit the selector and then the underlying estimator on the selected features.\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix} of shape (n_samples, n_features)\n        The training input samples.\n    y : array-like of shape (n_samples,)\n        The target values.\n    **fit_params : dict\n        Additional parameters passed to the `fit` method of the underlying\n        estimator.\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n    \"\"\"\n    self._validate_params()\n    tags = self._get_tags()\n    self._validate_data(\n        X,\n        y,\n        accept_sparse=\"csc\",\n        ensure_min_features=2,\n        force_all_finite=not tags.get(\"allow_nan\", True),\n        multi_output=True,\n        dtype=None,\n    )\n\n    # Initialization\n    cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n    scorer = check_scoring(self.estimator, scoring=self.scoring)\n    n_features = X.shape[1]\n    if self.min_n_features_to_select is None:\n        min_n_features_to_select = n_features // 2\n    elif isinstance(self.min_n_features_to_select, Integral):  # int\n        min_n_features_to_select = self.min_n_features_to_select\n    else:  # float\n        min_n_features_to_select = int(n_features * self.min_n_features_to_select)\n\n    if isinstance(X, pd.DataFrame):\n        all_features = X.columns.to_list()\n    else:\n        all_features = list(range(n_features))\n\n    np.random.seed(self.random_state)\n\n    # Create the initial pool of solutions\n    pool = [\n        {\n            \"features\": list(\n                np.random.choice(\n                    all_features,\n                    min(\n                        max(\n                            int(\n                                np.random.normal(\n                                    self.init_avg_features_num,\n                                    self.init_std_features_num,\n                                )\n                            ),\n                            min_n_features_to_select,\n                        ),\n                        n_features,\n                    ),\n                    replace=False,\n                )\n            ),\n        }\n        for _ in range(self.pool_size)\n    ]\n\n    # Evaluate the initial pool of solutions\n    pool = self._evaluate_calculate_importances(\n        pool, X, y, groups, cv, scorer, **fit_params\n    )\n    self.best_solutions_ = []\n    for _ in range(1, self.max_generations):\n        children = self._cross_over(pool)\n        children = self._evaluate_calculate_importances(\n            children, X, y, groups, cv, scorer, **fit_params\n        )\n        pool.extend(children)\n        mutations = self._mutate(pool, all_features)\n        mutations = self._evaluate_calculate_importances(\n            mutations, X, y, groups, cv, scorer, **fit_params\n        )\n        pool.extend(mutations)\n        pool_sorted = [\n            element\n            for _, element in sorted(\n                zip(self._calculate_fitness(pool), pool),\n                reverse=True,\n                key=itemgetter(0),\n            )\n        ]\n        pool = pool_sorted[: self.pool_size]\n        self.best_solutions_.append(pool[0])\n\n        if self.callbacks:\n            for callback in self.callbacks:\n                callback(self, pool)\n\n        if len(self.best_solutions_) > self.patience:\n            if all(\n                [\n         
           self.best_solutions_[-1][\"features\"] == solution[\"features\"]\n                    for solution in self.best_solutions_[-self.patience :]\n                ]\n            ):\n                break\n\n    self.estimator_ = clone(self.estimator)\n    X_remaining_features = _select_X_with_features(\n        X, self.best_solution_[\"features\"]\n    )\n    self.estimator_.fit(X_remaining_features, y, **fit_params)\n    self.support_ = np.array(\n        [\n            True if feature in self.best_solution_[\"features\"] else False\n            for feature in all_features\n        ]\n    )\n\n    return self\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.plot","title":"plot(**kwargs)","text":"

    Plot the mean test score and mean train score of the best solution at each generation.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    def plot(self, **kwargs):\n    \"\"\"Plot the mean test score and mean train score of the best solution at each generation.\n\n    Parameters\n    ----------\n    **kwargs : dict\n        Additional parameters passed to seaborn.lineplot. For a list\n        of possible options, please visit\n        [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html)  # noqa\n\n    Returns\n    -------\n    matplotlib.axes.Axes\n        The axis where the plot has been plotted.\n    \"\"\"\n    data_points_to_plot_long_form = []\n    for generation, best_solution in enumerate(self.best_solutions_, start=1):\n        for set, scores in zip(\n            [\"validation\", \"train\"],\n            [\n                best_solution[\"test_scores_per_fold\"],\n                best_solution[\"train_scores_per_fold\"],\n            ],\n        ):\n            for score in scores:\n                data_points_to_plot_long_form.append(\n                    {\"generation\": generation, \"score\": score, \"set\": set}\n                )\n    df_plot = pd.DataFrame(data_points_to_plot_long_form)\n    lineplot_kwargs = dict(\n        x=\"generation\",\n        y=\"score\",\n        hue=\"set\",\n        markers=True,\n        style=\"set\",\n        hue_order=[\"validation\", \"train\"],\n        style_order=[\"validation\", \"train\"],\n        seed=self.random_state,\n    )\n    lineplot_kwargs.update(**kwargs)\n    return sns.lineplot(data=df_plot, **lineplot_kwargs)\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.predict","title":"predict(X)","text":"

    Reduce X to the selected features and predict using the estimator.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    @available_if(_estimator_has(\"predict\"))\ndef predict(self, X):\n    \"\"\"Reduce X to the selected features and predict using the estimator.\n\n    Parameters\n    ----------\n    X : array of shape [n_samples, n_features]\n        The input samples.\n\n    Returns\n    -------\n    y : array of shape [n_samples]\n        The predicted target values.\n    \"\"\"\n    check_is_fitted(self)\n    return self.estimator_.predict(self.transform(X))\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.predict_log_proba","title":"predict_log_proba(X)","text":"

    Predict class log-probabilities for X.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    @available_if(_estimator_has(\"predict_log_proba\"))\ndef predict_log_proba(self, X):\n    \"\"\"Predict class log-probabilities for X.\n\n    Parameters\n    ----------\n    X : array of shape [n_samples, n_features]\n        The input samples.\n\n    Returns\n    -------\n    p : array of shape (n_samples, n_classes)\n        The class log-probabilities of the input samples. The order of the\n        classes corresponds to that in the attribute :term:`classes_`.\n    \"\"\"\n    check_is_fitted(self)\n    return self.estimator_.predict_log_proba(self.transform(X))\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.predict_proba","title":"predict_proba(X)","text":"

    Predict class probabilities for X.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    @available_if(_estimator_has(\"predict_proba\"))\ndef predict_proba(self, X):\n    \"\"\"Predict class probabilities for X.\n\n    Parameters\n    ----------\n    X : {array-like or sparse matrix} of shape (n_samples, n_features)\n        The input samples. Internally, it will be converted to\n        ``dtype=np.float32`` and if a sparse matrix is provided\n        to a sparse ``csr_matrix``.\n\n    Returns\n    -------\n    p : array of shape (n_samples, n_classes)\n        The class probabilities of the input samples. The order of the\n        classes corresponds to that in the attribute :term:`classes_`.\n    \"\"\"\n    check_is_fitted(self)\n    return self.estimator_.predict_proba(self.transform(X))\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.score","title":"score(X, y, **fit_params)","text":"

    Reduce X to the selected features and return the score of the estimator.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    @available_if(_estimator_has(\"score\"))\ndef score(self, X, y, **fit_params):\n    \"\"\"Reduce X to the selected features and return the score of the estimator.\n\n    Parameters\n    ----------\n    X : array of shape [n_samples, n_features]\n        The input samples.\n\n    y : array of shape [n_samples]\n        The target values.\n\n    **fit_params : dict\n        Parameters to pass to the `score` method of the underlying\n        estimator.\n\n        .. versionadded:: 1.0\n\n    Returns\n    -------\n    score : float\n        Score of the underlying base estimator computed with the selected\n        features returned by `rfe.transform(X)` and `y`.\n    \"\"\"\n    check_is_fitted(self)\n    return self.estimator_.score(self.transform(X), y, **fit_params)\n
    "},{"location":"reference/genetic_algorithms/#felimination.ga.rank_mean_test_score_fitness","title":"rank_mean_test_score_fitness(pool)","text":"

    Define the fitness function as the rank of the mean test score.

    The rank of the mean test score is calculated by ranking the mean test score in ascending order.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    def rank_mean_test_score_fitness(pool):\n    \"\"\"Define the fitness function as the rank of the mean test score.\n\n    The rank of the mean test score is calculated by ranking the mean test score in ascending order.\n\n    Parameters\n    ----------\n\n    pool : list of dict\n        Each element in the list is a dictionary with the following keys:\n        - features: list of int\n            The features selected for this element.\n        - mean_test_score: float\n            The mean test score of the element.\n        - mean_train_score: float\n            The mean train score of the element.\n\n    Returns\n    -------\n    fitness : list of float\n        The fitness of each element in the pool.\n    \"\"\"\n    pool_df = pd.DataFrame(pool)\n    pool_df[\"rank_mean_test_score\"] = pool_df[\"mean_test_score\"].rank(ascending=True)\n    return pool_df[\"rank_mean_test_score\"].to_list()\n
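
    A small worked example of this helper; the two pool entries below are made up purely for illustration:

    >>> from felimination.ga import rank_mean_test_score_fitness\n>>> pool = [\n        {\"features\": [0, 1], \"mean_test_score\": 0.80, \"mean_train_score\": 0.82},\n        {\"features\": [2, 3, 4], \"mean_test_score\": 0.75, \"mean_train_score\": 0.90},\n    ]\n>>> rank_mean_test_score_fitness(pool)  # higher test score -> higher rank\n[2.0, 1.0]\n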
    "},{"location":"reference/genetic_algorithms/#felimination.ga.rank_mean_test_score_overfit_fitness","title":"rank_mean_test_score_overfit_fitness(pool)","text":"

    Define the fitness function as the sum of the rank of the mean test score and the rank of the overfit.

    The rank of the mean test score is calculated by ranking the mean test score in ascending order. The rank of the overfit is calculated by ranking the overfit in ascending order. The overfit is calculated as the difference between the mean train score and the mean test score. The fitness is the sum of the rank of the mean test score and the rank of the overfit.

    Parameters:

    Returns:

    Source code in felimination/ga.py
    def rank_mean_test_score_overfit_fitness(pool):\n    \"\"\"Define the fitness function as the sum of the rank of the mean test score and the rank of the\n    overfit.\n\n    The rank of the mean test score is calculated by ranking the mean test score in ascending order.\n    The rank of the overfit is calculated by ranking the overfit in ascending order.\n    The overfit is calculated as the difference between the mean train score and the mean test score.\n    The fitness is the sum of the rank of the mean test score and the rank of the overfit.\n\n    Parameters\n    ----------\n    pool : list of dict\n        Each element in the list is a dictionary with the following keys:\n        - features: list of int\n            The features selected for this element.\n        - mean_test_score: float\n            The mean test score of the element.\n        - mean_train_score: float\n            The mean train score of the element.\n\n    Returns\n    -------\n    fitness : list of float\n        The fitness of each element in the pool.\n    \"\"\"\n\n    pool_df = pd.DataFrame(pool)\n    pool_df[\"rank_mean_test_score\"] = pool_df[\"mean_test_score\"].rank(ascending=False)\n    pool_df[\"overfit\"] = pool_df[\"mean_train_score\"] - pool_df[\"mean_test_score\"]\n    pool_df[\"rank_overfit\"] = pool_df[\"overfit\"].rank(ascending=True)\n    pool_df[\"rank_sum\"] = pool_df[\"rank_mean_test_score\"] + pool_df[\"rank_overfit\"]\n\n    pool_df[\"rank_sum_rank\"] = pool_df[\"rank_sum\"].rank(ascending=False)\n    return pool_df[\"rank_sum_rank\"].to_list()\n
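
    As a usage sketch, a different fitness function can be passed through the fitness_function parameter of HybridImportanceGACVFeatureSelector; the dataset and estimator below are assumptions for illustration:

    >>> from felimination.ga import (\n        HybridImportanceGACVFeatureSelector,\n        rank_mean_test_score_fitness,\n    )\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.linear_model import LogisticRegression\n>>> X, y = make_classification(n_samples=300, n_features=20, random_state=0)\n>>> selector = HybridImportanceGACVFeatureSelector(\n        LogisticRegression(),\n        random_state=0,\n        max_generations=10,\n        fitness_function=rank_mean_test_score_fitness,  # rank by test score only\n    )\n>>> selector = selector.fit(X, y)\n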
    "},{"location":"reference/importance/","title":"Importance","text":""},{"location":"reference/importance/#felimination.importance.PermutationImportance","title":"PermutationImportance(scoring=None, n_repeats=5, n_jobs=None, random_state=None, sample_weight=None, max_samples=1.0)","text":"

    Wrapper around sklearn.inspection.permutation_importance.

    Parameters:

    Source code in felimination/importance.py
    def __init__(\n    self,\n    scoring=None,\n    n_repeats=5,\n    n_jobs=None,\n    random_state=None,\n    sample_weight=None,\n    max_samples=1.0,\n):\n    self.scoring = scoring\n    self.n_repeats = n_repeats\n    self.n_jobs = n_jobs\n    self.random_state = random_state\n    self.sample_weight = sample_weight\n    self.max_samples = max_samples\n
    "},{"location":"reference/importance/#felimination.importance.PermutationImportance.__call__","title":"__call__(estimator, X, y)","text":"

    Computes the permutation importance.

    Parameters:

    Returns:

    Source code in felimination/importance.py
    def __call__(self, estimator, X, y) -> Any:\n    \"\"\"Computes the permutation importance.\n\n    Parameters\n    ----------\n    estimator : object\n        An estimator that has already been fitted and is compatible\n        with scorer.\n    X : ndarray or DataFrame, shape (n_samples, n_features)\n        Data on which permutation importance will be computed.\n    y : array-like or None, shape (n_samples, ) or (n_samples, n_classes)\n        Targets for supervised or `None` for unsupervised.\n\n    Returns\n    -------\n    importances_mean : ndarray of shape (n_features, )\n        Mean of feature importance over `n_repeats`.\n    \"\"\"\n    return permutation_importance(\n        estimator,\n        X,\n        y,\n        scoring=self.scoring,\n        n_repeats=self.n_repeats,\n        n_jobs=self.n_jobs,\n        random_state=self.random_state,\n        sample_weight=self.sample_weight,\n        max_samples=self.max_samples,\n    ).importances_mean\n
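A minimal sketch of how the class can be used as a standalone importance getter on an already fitted estimator (toy data, for illustration only):

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from felimination.importance import PermutationImportance

X, y = make_classification(n_samples=200, n_features=10, random_state=42)
model = LogisticRegression(max_iter=1000, random_state=42).fit(X, y)

# The instance is callable and returns one mean importance per feature.
importance_getter = PermutationImportance(scoring="roc_auc", n_repeats=5, random_state=42)
importances = importance_getter(model, X, y)
print(importances.shape)  # (10,)
```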
    "},{"location":"tutorials/genetic_algorithms_x_feature_selection/","title":"Genetic Algorithms x Feature Selection","text":"In\u00a0[\u00a0]: Copied!
    # Install felimination\n! pip install felimination\n
    # Install felimination ! pip install felimination In\u00a0[2]: Copied!
    from sklearn.datasets import make_classification\n\nX, y = make_classification(\n    n_samples=1000,\n    n_features=200,\n    n_informative=6,\n    n_redundant=10,\n    n_clusters_per_class=1,\n    random_state=42,\n    shuffle=False\n)\n
    from sklearn.datasets import make_classification X, y = make_classification( n_samples=1000, n_features=200, n_informative=6, n_redundant=10, n_clusters_per_class=1, random_state=42, shuffle=False ) In\u00a0[3]: Copied!
    from sklearn.model_selection import cross_validate, StratifiedKFold\nfrom sklearn.linear_model import LogisticRegression\n\n\n# Define a simple logistic regression model\nmodel = LogisticRegression(random_state=42)\n\n# Perform cross-validation\ncv_results = cross_validate(\n    model,\n    X,\n    y,\n    cv=StratifiedKFold(random_state=42, shuffle=True),\n    scoring=\"roc_auc\",\n    return_train_score=True,\n)\n\ncv_results[\"test_score\"].mean()\n
    from sklearn.model_selection import cross_validate, StratifiedKFold from sklearn.linear_model import LogisticRegression # Define a simple logistic regression model model = LogisticRegression(random_state=42) # Perform cross-validation cv_results = cross_validate( model, X, y, cv=StratifiedKFold(random_state=42, shuffle=True), scoring=\"roc_auc\", return_train_score=True, ) cv_results[\"test_score\"].mean() Out[3]:
    0.8561362716271628
    In\u00a0[4]: Copied!
    from felimination.ga import HybridImportanceGACVFeatureSelector\nfrom felimination.callbacks import plot_progress_callback\n\n\nselector = HybridImportanceGACVFeatureSelector(\n    model,\n    callbacks=[plot_progress_callback],\n    scoring=\"roc_auc\",\n    cv=StratifiedKFold(random_state=42, shuffle=True),\n    init_avg_features_num=5,\n    min_n_features_to_select=3,\n    pool_size=20,\n    n_children_cross_over=20,\n    n_mutations=20,\n    random_state=42,\n)\nselector.fit(X, y)\n
    from felimination.ga import HybridImportanceGACVFeatureSelector from felimination.callbacks import plot_progress_callback selector = HybridImportanceGACVFeatureSelector( model, callbacks=[plot_progress_callback], scoring=\"roc_auc\", cv=StratifiedKFold(random_state=42, shuffle=True), init_avg_features_num=5, min_n_features_to_select=3, pool_size=20, n_children_cross_over=20, n_mutations=20, random_state=42, ) selector.fit(X, y) Out[4]:
    HybridImportanceGACVFeatureSelector(callbacks=[<function plot_progress_callback at 0x31aaa4fe0>],\n                                    cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n                                    estimator=LogisticRegression(random_state=42),\n                                    init_avg_features_num=5,\n                                    min_n_features_to_select=3,\n                                    n_children_cross_over=20, n_mutations=20,\n                                    random_state=42, scoring='roc_auc')

    Notice how model performance increases as the feature selection progresses.

    This is because models trained on many non-predictive features tend to find patterns even in random noise and end up overfitting; see how the train score and the validation score get closer as the selection progresses.

    In\u00a0[5]: Copied!
    sorted(selector.best_solution_['features'])\n
    sorted(selector.best_solution_['features']) Out[5]:
    [6, 10, 82, 93, 168]

    The features with index <= 15 are relevant; the others are random noise. We can see that only some of the relevant features have been selected. Nevertheless, the AUC score still improves considerably.
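    As a quick sanity check (not part of the original notebook), we can count how many of the selected indices fall in the relevant range; indices 0-15 hold the informative and redundant features because shuffle=False was passed to make_classification.

```python
selected = selector.best_solution_["features"]

# Features with index <= 15 are the informative/redundant ones.
n_relevant = sum(idx <= 15 for idx in selected)
print(f"{n_relevant} of {len(selected)} selected features are relevant")
```

    The mean test score of the best solution shows the improvement: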

    In\u00a0[6]: Copied!
    selector.best_solution_['mean_test_score']\n
    selector.best_solution_['mean_test_score'] Out[6]:
    0.9197176917691768

    The best AUC score obtained with feature selection is now 0.92: that's ~0.06 AUC points gained by removing useless features.
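    The gain can also be computed directly from the objects created above (a small sketch reusing the earlier cross-validation results):

```python
baseline_auc = cv_results["test_score"].mean()         # from the first cross-validation cell
best_auc = selector.best_solution_["mean_test_score"]  # best subset found by the GA

print(f"AUC improvement: {best_auc - baseline_auc:.3f}")
```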

    In\u00a0[8]: Copied!
    selector.transform(X).shape\n
    selector.transform(X).shape Out[8]:
    (1000, 5)
    "},{"location":"tutorials/genetic_algorithms_x_feature_selection/#genetic-algorithms-x-feature-selection","title":"Genetic Algorithms x Feature Selection\u00b6","text":"

    This tutorial shows an example of how genetic algorithms can be applied to feature selection to improve model performance.

    More specifically, it illustrates how to perform feature selection using the genetic algorithm implemented in the class felimination.ga.HybridImportanceGACVFeatureSelector.

    "},{"location":"tutorials/genetic_algorithms_x_feature_selection/#create-a-dummy-dataset","title":"Create a dummy Dataset\u00b6","text":"

    For this tutorial we will use a dummy classification dataset created with sklearn.datasets.make_classification. The dataset has 6 predictive features, 10 redundant features and 184 random features.

    "},{"location":"tutorials/genetic_algorithms_x_feature_selection/#evaluate-performances-without-feature-elimination","title":"Evaluate performances without feature elimination\u00b6","text":""},{"location":"tutorials/genetic_algorithms_x_feature_selection/#perform-now-feature-elimination","title":"Perform now feature elimination\u00b6","text":""},{"location":"tutorials/recursive_feature_elimination/","title":"Recursive Feature Elimination (RFE)","text":"In\u00a0[\u00a0]: Copied!
    # Install felimination\n! pip install felimination\n
    # Install felimination ! pip install felimination In\u00a0[2]: Copied!
    from sklearn.datasets import make_classification\n\nX, y = make_classification(\n    n_samples=1000,\n    n_features=200,\n    n_informative=6,\n    n_redundant=10,\n    n_clusters_per_class=1,\n    random_state=42,\n    shuffle=False\n)\n
    from sklearn.datasets import make_classification X, y = make_classification( n_samples=1000, n_features=200, n_informative=6, n_redundant=10, n_clusters_per_class=1, random_state=42, shuffle=False ) In\u00a0[3]: Copied!
    from sklearn.model_selection import cross_validate, StratifiedKFold\nfrom sklearn.linear_model import LogisticRegression\n\n\n# Define a simple logistic regression model\nmodel = LogisticRegression(random_state=42)\n\n# Perform cross-validation\ncv_results = cross_validate(\n    model,\n    X,\n    y,\n    cv=StratifiedKFold(random_state=42, shuffle=True),\n    scoring=\"roc_auc\",\n    return_train_score=True,\n)\n\ncv_results[\"test_score\"].mean()\n
    from sklearn.model_selection import cross_validate, StratifiedKFold from sklearn.linear_model import LogisticRegression # Define a simple logistic regression model model = LogisticRegression(random_state=42) # Perform cross-validation cv_results = cross_validate( model, X, y, cv=StratifiedKFold(random_state=42, shuffle=True), scoring=\"roc_auc\", return_train_score=True, ) cv_results[\"test_score\"].mean() Out[3]:
    0.8561362716271628
    In\u00a0[4]: Copied!
    from felimination.rfe import PermutationImportanceRFECV\nfrom felimination.callbacks import plot_progress_callback\n\n\nselector = PermutationImportanceRFECV(\n    model,\n    step=0.2,\n    callbacks=[plot_progress_callback],\n    scoring=\"roc_auc\",\n    cv=StratifiedKFold(random_state=42, shuffle=True),\n)\nselector.fit(X, y)\n
    from felimination.rfe import PermutationImportanceRFECV from felimination.callbacks import plot_progress_callback selector = PermutationImportanceRFECV( model, step=0.2, callbacks=[plot_progress_callback], scoring=\"roc_auc\", cv=StratifiedKFold(random_state=42, shuffle=True), ) selector.fit(X, y) Out[4]:
    PermutationImportanceRFECV(callbacks=[<function plot_progress_callback at 0x103583d80>],\n                           cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n                           estimator=LogisticRegression(random_state=42),\n                           scoring='roc_auc', step=0.2)

    Notice how model performance increases with the progressive elimination of features.

    This is because models trained on many non-predictive features tend to find patterns even in random noise and end up overfitting; see how the train score and the validation score get closer as features are eliminated.

    In\u00a0[5]: Copied!
    import pandas as pd\n\ncv_results_df = pd.DataFrame(selector.cv_results_)\n\ncv_results_df[[\"mean_test_score\", \"n_features\"]].sort_values(\n    \"mean_test_score\", ascending=False\n).head(10)\n
    import pandas as pd cv_results_df = pd.DataFrame(selector.cv_results_) cv_results_df[[\"mean_test_score\", \"n_features\"]].sort_values( \"mean_test_score\", ascending=False ).head(10) Out[5]: mean_test_score n_features 7 0.944138 44 6 0.943558 54 8 0.943018 36 9 0.942478 29 5 0.942438 67 4 0.942058 83 10 0.939718 24 11 0.937578 20 12 0.935838 16 13 0.935698 13

    The best AUC score obtained with feature elimination is now 0.94: that's roughly 0.09 AUC points gained while using far fewer features.

    If I had to choose a number of features, I would probably go for 13, because there the validation score is very close to the train score.

    We can do this using the method set_n_features_to_select. This will change the support of the selector as well as the behavior of the transform method.

    In\u00a0[6]: Copied!
    selector.set_n_features_to_select(13)\nselector.transform(X).shape\n
    selector.set_n_features_to_select(13) selector.transform(X).shape Out[6]:
    (1000, 13)
    In\u00a0[7]: Copied!
    import numpy as np\n\n# Show the index of the selected features, index <= 15 are relevant\nnp.arange(0, X.shape[1])[selector.support_]\n
    import numpy as np # Show the index of the selected features, index <= 15 are relevant np.arange(0, X.shape[1])[selector.support_] Out[7]:
    array([  1,   2,   3,   7,   8,   9,  10,  69,  80,  82, 155, 186, 197])

    We can see from the indices of the selected features that most of them are informative (index <= 15), although a few random features are still being selected and some of the selected features remain redundant.
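    One way to illustrate the remaining redundancy is to look at pairwise correlations between the selected columns. This is a small sketch that is not part of the original notebook, and the 0.8 threshold is an arbitrary choice:

```python
import numpy as np

# Indices of the features kept by the selector (after set_n_features_to_select(13)).
selected_idx = np.arange(X.shape[1])[selector.support_]

# Absolute pairwise correlations between the selected columns.
corr = np.abs(np.corrcoef(X[:, selected_idx], rowvar=False))
np.fill_diagonal(corr, 0.0)

# Report pairs of selected features that are strongly correlated, i.e. still redundant.
rows, cols = np.where(np.triu(corr) > 0.8)
for r, c in zip(rows, cols):
    print(selected_idx[r], selected_idx[c], round(corr[r, c], 2))
```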

    "},{"location":"tutorials/recursive_feature_elimination/#recursive-feature-elimination-rfe","title":"Recursive Feature Elimination (RFE)\u00b6","text":"

    This tutorial shows an example of how recursive feature elimination can be used to improve model performance. More specifically, it illustrates how to perform backward recursive feature elimination based on permutation importance using the class felimination.rfe.PermutationImportanceRFECV.

    "},{"location":"tutorials/recursive_feature_elimination/#create-a-dummy-dataset","title":"Create a dummy Dataset\u00b6","text":"

    For this tutorial we will use a dummy classification dataset created with sklearn.datasets.make_classification. The dataset has 6 predictive features, 10 redundant features and 184 random features.

    "},{"location":"tutorials/recursive_feature_elimination/#evaluate-performances-without-feature-elimination","title":"Evaluate performances without feature elimination\u00b6","text":""},{"location":"tutorials/recursive_feature_elimination/#perform-now-feature-elimination","title":"Perform now feature elimination\u00b6","text":""}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index f2d8dde..a795e62 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -10,6 +10,11 @@ 2024-07-31 daily + + https://claudiosalvatorearcidiacono.github.io/felimination/reference/callbacks/ + 2024-07-31 + daily + https://claudiosalvatorearcidiacono.github.io/felimination/reference/drift/ 2024-07-31 diff --git a/sitemap.xml.gz b/sitemap.xml.gz index b06abde..b755ec8 100644 Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ diff --git a/tutorials/genetic_algorithms_x_feature_selection/index.html b/tutorials/genetic_algorithms_x_feature_selection/index.html index 79247df..ddfad87 100644 --- a/tutorials/genetic_algorithms_x_feature_selection/index.html +++ b/tutorials/genetic_algorithms_x_feature_selection/index.html @@ -375,6 +375,27 @@ +
  • + + + + + Callbacks + + + + +
  • + + + + + + + + + +
  • diff --git a/tutorials/recursive_feature_elimination/index.html b/tutorials/recursive_feature_elimination/index.html index b9cebcf..e57d8a5 100644 --- a/tutorials/recursive_feature_elimination/index.html +++ b/tutorials/recursive_feature_elimination/index.html @@ -373,6 +373,27 @@ +
  • + + + + + Callbacks + + + + +
  • + + + + + + + + + +