diff --git a/search/search_index.json b/search/search_index.json
index 7b3ab7f..138d730 100644
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Homepage","text":"This library contains some useful scikit-learn compatible classes for feature selection.
"},{"location":"#features","title":"Features","text":" - Recursive Feature Elimination with Cross Validation using Permutation Importance
- Hybrid Genetic Algorithms x Feature Importance selection
"},{"location":"#requirements","title":"Requirements","text":" - Python 3.7+
- NumPy
- Scikit-learn
- Pandas
"},{"location":"#installation","title":"Installation","text":"In a terminal shell run the following command
pip install felimination\n
"},{"location":"#usage","title":"Usage","text":""},{"location":"#recursive-feature-elimination","title":"Recursive Feature Elimination","text":"In this section it will be illustrated how to use the PermutationImportanceRFECV
class.
from felimination.rfe import PermutationImportanceRFECV\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import make_classification\nimport numpy as np\n\n\nX, y = make_classification(\n n_samples=1000,\n n_features=20,\n n_informative=6,\n n_redundant=10,\n n_clusters_per_class=1,\n random_state=42,\n)\n\nselector = PermutationImportanceRFECV(LogisticRegression(), step=0.3)\n\nselector.fit(X, y)\n\nselector.support_\n# array([False, False, False, False, False, False, False, False, False,\n# False, False, True, False, False, False, False, False, False,\n# False, False])\n\nselector.ranking_\n# array([9, 3, 8, 9, 7, 8, 5, 6, 9, 6, 8, 1, 9, 7, 8, 9, 9, 2, 4, 7])\nselector.plot()\n
It looks like 5 is a good number of features. We can set the number of features to select to 5 without retraining:
selector.set_n_features_to_select(5)\nselector.support_\n# array([False, True, False, False, False, False, True, False, False,\n# False, False, True, False, False, False, False, False, True,\n# True, False])\n
"},{"location":"#genetic-algorithms","title":"Genetic Algorithms","text":"In this section it will be illustrated how to use the HybridImportanceGACVFeatureSelector
class.
from felimination.ga import HybridImportanceGACVFeatureSelector\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import make_classification\nimport numpy as np\n\n# Create dummy dataset\nX, y = make_classification(\n n_samples=1000,\n n_features=20,\n n_informative=6,\n n_redundant=10,\n n_clusters_per_class=1,\n random_state=42,\n)\n\n# Initialize selector\nselector = HybridImportanceGACVFeatureSelector(\n LogisticRegression(random_state=42),\n random_state=42,\n pool_size=5,\n patience=5\n)\n\n# Run optimisation\nselector.fit(X, y)\n\n# Show selected features\nselector.support_\n#array([False, True, False, True, True, False, False, False, True,\n# False, False, False, True, True, True, True, False, True,\n# True, False])\n\n# Show best solution\nselector.best_solution_\n# {'features': [1, 12, 13, 8, 17, 15, 18, 4, 3, 14],\n# 'train_scores_per_fold': [0.88625, 0.89, 0.8825, 0.8925, 0.88625],\n# 'test_scores_per_fold': [0.895, 0.885, 0.885, 0.89, 0.89],\n# 'cv_importances': [array([[ 1.09135972, 1.13502636, 1.12100231, 0.38285736, 0.28944072,\n# 0.04688614, 0.44259813, 0.09832365, 0.10190421, -0.48101593]]),\n# array([[ 1.17345812, 1.29375208, 1.2065342 , 0.40418709, 0.41839714,\n# 0.00447802, 0.466717 , 0.21733829, -0.00842075, -0.50078996]]),\n# array([[ 1.15416104, 1.18458564, 1.18083266, 0.37071253, 0.22842685,\n# 0.1087814 , 0.44446793, 0.12740545, 0.00621562, -0.54064287]]),\n# array([[ 1.26011643, 1.36996058, 1.30481424, 0.48183549, 0.40589887,\n# -0.01849671, 0.45606913, 0.18330816, 0.03667055, -0.50869557]]),\n# array([[ 1.18227123, 1.28988253, 1.2496398 , 0.50754295, 0.38942303,\n# -0.01725074, 0.4481891 , 0.19472963, 0.10034316, -0.50131192]])],\n# 'mean_train_score': 0.8875,\n# 'mean_test_score': 0.889,\n# 'mean_cv_importances': array([ 1.17227331, 1.25464144, 1.21256464, 0.42942709, 0.34631732,\n# 0.02487962, 0.45160826, 0.16422104, 0.04734256, -0.50649125])}\n\n# Show progress as a plot\nselector.plot()\n
It looks like the optimisation process converged after 2 steps: since the best score did not improve for 5 (=patience) consecutive steps, the optimisation stopped early.
"},{"location":"#license","title":"License","text":"This project is licensed under the BSD 3-Clause License - see the LICENSE.md file for details
"},{"location":"#acknowledgments","title":"Acknowledgments","text":""},{"location":"reference/RFE/","title":"RFE","text":"Module with tools to perform feature selection.
This module contains the following classes:
FeliminationRFECV
: base class for feature selection. PermutationImportanceRFECV
: recursive feature elimination with cross-validation based on permutation importance.
"},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV","title":"FeliminationRFECV(estimator, *, step=1, n_features_to_select=1, cv=None, scoring=None, random_state=None, verbose=0, n_jobs=None, importance_getter='auto', callbacks=None)
","text":" Bases: RFE
Perform recursive feature elimination with cross-validation following scikit-learn standards.
It differs from scikit-learn's RFECV in the following ways:
- It supports an
importance_getter
function that also uses a validation set to compute the feature importances. This allows the use of importance measures like permutation importance or SHAP. - Instead of using cross-validation to select the number of features, it uses cross-validation to get a more accurate estimate of the feature importances. This means that the number of features to select has to be set during initialization, similarly to RFE.
- When
step
is a float value, it removes a percentage of the remaining features rather than of the total number of features as in RFE/RFECV. This allows dropping big chunks of features at the beginning of the RFE process and slowing down towards the end of the process. - Has a plotting function
- Adds information about the number of features selected at each step in the attribute
cv_results_
- Allows changing the number of features to select after fitting.
Other than that, it is a copy-paste of RFE, so credit goes to scikit-learn.
The algorithm of feature selection goes as follows:
while n_features > n_features_to_select:\n - The estimator is trained on the selected features and the score is\n computed using cross validation.\n - feature importance is computed for each validation fold on the validation\n set and then averaged.\n - The least important features are pruned.\n - The pruned features are removed from the dataset.\n
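When step is a float, the number of features removed shrinks together with the remaining feature set. A minimal sketch of the implied schedule, assuming 20 starting features, step=0.3 and n_features_to_select=5 (illustrative numbers only):
n_features, n_features_to_select, step = 20, 5, 0.3\nschedule = [n_features]\nwhile n_features > n_features_to_select:\n    n_to_remove = max(1, int(step * n_features))  # a fraction of the *remaining* features\n    n_features = max(n_features_to_select, n_features - n_to_remove)\n    schedule.append(n_features)\nprint(schedule)  # [20, 14, 10, 7, 5]\n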
Parameters:
-
estimator
(``Estimator`` instance
) \u2013 A supervised learning estimator with a fit
method.
-
step
(int or float
, default: 1
) \u2013 If greater than or equal to 1, then step
corresponds to the (integer) number of features to remove at each iteration. If within (0.0, 1.0), then step
corresponds to the percentage (rounded down) of remaining features to remove at each iteration. Note that the last iteration may remove fewer than step
features in order to reach n_features_to_select
.
-
n_features_to_select
(int or float
, default: None
) \u2013 The number of features to select. If None
, half of the features are selected. If integer, the parameter is the absolute number of features to select. If float between 0 and 1, it is the fraction of the features to select.
-
cv
(int, cross-validation generator or an iterable
, default: None
) \u2013 Determines the cross-validation splitting strategy. Possible inputs for cv are:
- None, to use the default 5-fold cross-validation,\n- integer, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n
For integer/None inputs, if y
is binary or multiclass, ~sklearn.model_selection.StratifiedKFold
is used. If the estimator is not a classifier or if y
is neither binary nor multiclass, ~sklearn.model_selection.KFold
is used.
Refer to the :ref:User Guide <cross_validation>
for the various cross-validation strategies that can be used here.
-
scoring
((str, callable or None)
, default: None
) \u2013 A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)
.
-
verbose
(int
, default: 0
) \u2013 Controls verbosity of output.
-
n_jobs
(int or None
, default: None
) \u2013 Number of cores to run in parallel while fitting across folds. None
means 1 unless in a :obj:joblib.parallel_backend
context. -1
means using all processors.
-
importance_getter
(str or callable
, default: 'auto'
) \u2013 If 'auto', uses the feature importance either through a coef_
or feature_importances_
attributes of estimator.
Also accepts a string that specifies an attribute name/path for extracting feature importance. For example, give regressor_.coef_
in case of ~sklearn.compose.TransformedTargetRegressor
or named_steps.clf.feature_importances_
in case of ~sklearn.pipeline.Pipeline
with its last step named clf
.
If callable
, it overrides the default feature importance getter. The callable is passed the fitted estimator and the validation set (X_val, y_val, estimator) and should return an importance value for each feature.
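As an illustration, a custom importance getter might look like the following sketch. The function name and the mean-ablation strategy are invented for this example, X_val is assumed to be a NumPy array, and the (estimator, X_val, y_val) argument order follows the convention of the built-in PermutationImportance; check your installed version if in doubt.
from felimination.rfe import FeliminationRFECV\nfrom sklearn.linear_model import LogisticRegression\n\n# Hypothetical custom importance getter: scores each feature by the drop in\n# validation score when that feature is replaced by its column mean.\ndef mean_ablation_importance(estimator, X_val, y_val):\n    baseline = estimator.score(X_val, y_val)\n    importances = []\n    for j in range(X_val.shape[1]):\n        X_ablated = X_val.copy()\n        X_ablated[:, j] = X_ablated[:, j].mean()\n        importances.append(baseline - estimator.score(X_ablated, y_val))\n    return importances\n\nselector = FeliminationRFECV(LogisticRegression(), importance_getter=mean_ablation_importance)\n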
Attributes:
-
classes_
(ndarray of shape (n_classes,)
) \u2013 The class labels. Only available when estimator
is a classifier.
-
estimator_
(``Estimator`` instance
) \u2013 The fitted estimator used to select features.
-
cv_results_
(dict of ndarrays
) \u2013 A dict with keys: - n_features : ndarray of shape (n_subsets_of_features,) The number of features used at that step. - split(k)_test_score : ndarray of shape (n_subsets_of_features,) The cross-validation test scores across the k-th fold. - mean_test_score : ndarray of shape (n_subsets_of_features,) Mean of test scores over the folds. - std_test_score : ndarray of shape (n_subsets_of_features,) Standard deviation of test scores over the folds. - split(k)_train_score : ndarray of shape (n_subsets_of_features,) The cross-validation train scores across the k-th fold. - mean_train_score : ndarray of shape (n_subsets_of_features,) Mean of train scores over the folds. - std_train_score : ndarray of shape (n_subsets_of_features,) Standard deviation of train scores over the folds.
-
n_features_
(int
) \u2013 The number of selected features.
-
n_features_in_
(int
) \u2013 Number of features seen during :term:fit
. Only defined if the underlying estimator exposes such an attribute when fit.
-
feature_names_in_
(ndarray of shape (`n_features_in_`,)
) \u2013 Names of features seen during :term:fit
. Defined only when X
has feature names that are all strings.
-
ranking_
(ndarray of shape (n_features,)
) \u2013 The feature ranking, such that ranking_[i]
corresponds to the ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1.
-
support_
(ndarray of shape (n_features,)
) \u2013 The mask of selected features.
-
callbacks
(list of callable, default=None
) \u2013 List of callables to be called at the end of each step of the feature selection. Each callable should accept two parameters: the selector and the importances computed at that step.
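For instance, a minimal logging callback might look like this sketch (the printed format is arbitrary; the cv_results_ keys are documented above):
from felimination.rfe import FeliminationRFECV\nfrom sklearn.linear_model import LogisticRegression\n\n# Sketch of a callback: log progress at the end of each elimination step.\ndef log_step(selector, importances):\n    n_features = selector.cv_results_[\"n_features\"][-1]\n    mean_test_score = selector.cv_results_[\"mean_test_score\"][-1]\n    print(f\"{n_features} features, mean CV test score: {mean_test_score:.3f}\")\n\nselector = FeliminationRFECV(LogisticRegression(), callbacks=[log_step])\n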
Examples:
The following example shows how to retrieve the 5 most informative features in the Friedman #1 dataset.
>>> from felimination.rfe import FeliminationRFECV\n>>> from felimination.importance import PermutationImportance\n>>> from sklearn.datasets import make_friedman1\n>>> from sklearn.svm import SVR\n>>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n>>> estimator = SVR(kernel=\"linear\")\n>>> selector = FeliminationRFECV(\n estimator,\n step=1,\n cv=5,\n n_features_to_select=5,\n importance_getter=PermutationImportance()\n)\n>>> selector = selector.fit(X, y)\n>>> selector.support_\narray([ True, True, True, True, True, False, False, False, False,\n False])\n>>> selector.ranking_\narray([1, 1, 1, 1, 1, 6, 3, 4, 2, 5])\n
Source code in felimination/rfe.py
def __init__(\n self,\n estimator: BaseEstimator | LogisticRegression,\n *,\n step=1,\n n_features_to_select=1,\n cv=None,\n scoring=None,\n random_state=None,\n verbose=0,\n n_jobs=None,\n importance_getter=\"auto\",\n callbacks=None,\n) -> None:\n self.cv = cv\n self.scoring = scoring\n self.n_jobs = n_jobs\n self.random_state = random_state\n self.callbacks = callbacks\n super().__init__(\n estimator,\n n_features_to_select=n_features_to_select,\n step=step,\n verbose=verbose,\n importance_getter=importance_getter,\n )\n
"},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV.fit","title":"fit(X, y, groups=None, **fit_params)
","text":"Fit the RFE model and then the underlying estimator on the selected features.
Parameters:
Returns:
-
self
( object
) \u2013 Fitted estimator.
Source code in felimination/rfe.py
def fit(self, X, y, groups=None, **fit_params):\n \"\"\"Fit the RFE model and then the underlying estimator on the selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n y : array-like of shape (n_samples,)\n The target values.\n **fit_params : dict\n Additional parameters passed to the `fit` method of the underlying\n estimator.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._validate_params()\n tags = self._get_tags()\n self._validate_data(\n X,\n y,\n accept_sparse=\"csc\",\n ensure_min_features=2,\n force_all_finite=not tags.get(\"allow_nan\", True),\n multi_output=True,\n dtype=None,\n )\n\n # Initialization\n cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n scorer = check_scoring(self.estimator, scoring=self.scoring)\n n_features = X.shape[1]\n\n if self.n_features_to_select is None:\n n_features_to_select = n_features // 2\n elif isinstance(self.n_features_to_select, Integral): # int\n n_features_to_select = self.n_features_to_select\n else: # float\n n_features_to_select = int(n_features * self.n_features_to_select)\n\n support_ = np.ones(n_features, dtype=bool)\n ranking_ = np.ones(n_features, dtype=int)\n\n current_number_of_features = n_features\n self.cv_results_ = defaultdict(list)\n\n # Elimination\n while current_number_of_features > n_features_to_select:\n # Select remaining features\n X_remaining_features, features = self._select_X_with_remaining_features(\n X, support=support_, n_features=n_features\n )\n\n if self.verbose > 0:\n print(\n \"Fitting estimator with %d features.\" % current_number_of_features\n )\n\n # Train model, score it and get importances\n if effective_n_jobs(self.n_jobs) == 1:\n parallel, func = list, _train_score_get_importance\n else:\n parallel = Parallel(n_jobs=self.n_jobs)\n func = delayed(_train_score_get_importance)\n\n scores_importances = parallel(\n func(\n self.estimator,\n X_remaining_features,\n y,\n train,\n test,\n scorer,\n self.importance_getter,\n )\n for train, test in cv.split(X_remaining_features, y, groups)\n )\n train_scores_per_fold = [\n score_importance[0] for score_importance in scores_importances\n ]\n test_scores_per_fold = [\n score_importance[1] for score_importance in scores_importances\n ]\n cv_importances = [\n score_importance[2] for score_importance in scores_importances\n ]\n mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n ranks = np.argsort(mean_importances)\n\n # for sparse case ranks is matrix\n ranks = np.ravel(ranks)\n\n if 0.0 < self.step < 1.0:\n step = int(max(1, self.step * current_number_of_features))\n else:\n step = int(self.step)\n\n # Eliminate the worst features\n threshold = min(step, current_number_of_features - n_features_to_select)\n\n support_[features[ranks][:threshold]] = False\n ranking_[np.logical_not(support_)] += 1\n\n # Update cv scores\n for train_or_test, scores_per_fold in zip(\n [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n ):\n for i, score in enumerate(scores_per_fold):\n self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n np.mean(scores_per_fold)\n )\n self.cv_results_[f\"std_{train_or_test}_score\"].append(\n np.std(scores_per_fold)\n )\n self.cv_results_[\"n_features\"].append(current_number_of_features)\n if self.callbacks:\n for callback in self.callbacks:\n callback(self, cv_importances)\n\n current_number_of_features = 
np.sum(support_)\n # Set final attributes\n\n # Estimate performances of final model\n X_remaining_features, features = self._select_X_with_remaining_features(\n X, support=support_, n_features=n_features\n )\n\n cv_scores = cross_validate(\n self.estimator,\n X_remaining_features,\n y,\n groups=groups,\n scoring=scorer,\n cv=cv,\n n_jobs=self.n_jobs,\n fit_params=fit_params,\n return_train_score=True,\n )\n self.cv_results_[\"n_features\"].append(current_number_of_features)\n # Update cv scores\n for train_or_test in [\"train\", \"test\"]:\n scores_per_fold = cv_scores[f\"{train_or_test}_score\"]\n for i, score in enumerate(scores_per_fold):\n self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n np.mean(scores_per_fold)\n )\n self.cv_results_[f\"std_{train_or_test}_score\"].append(\n np.std(scores_per_fold)\n )\n\n if self.callbacks:\n for callback in self.callbacks:\n callback(self, cv_importances)\n\n X_remaining_features, features = self._select_X_with_remaining_features(\n X, support=support_, n_features=n_features\n )\n\n self.estimator_ = clone(self.estimator)\n self.estimator_.fit(X_remaining_features, y, **fit_params)\n\n self.n_features_ = support_.sum()\n self.support_ = support_\n self.ranking_ = ranking_\n self.cv_results_ = dict(self.cv_results_)\n return self\n
"},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV.plot","title":"plot(**kwargs)
","text":"Plot a feature selection plot with number of features
Parameters:
Returns:
Source code in felimination/rfe.py
def plot(self, **kwargs):\n \"\"\"Plot a feature selection plot with number of features\n\n Parameters\n ----------\n **kwargs : dict\n Additional parameters passed to seaborn.lineplot. For a list\n of possible options, please visit\n [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html) # noqa\n\n Returns\n -------\n matplotlib.axes.Axes\n The axis where the plot has been plotted.\n \"\"\"\n check_is_fitted(self)\n df = pd.DataFrame(self.cv_results_)\n split_score_cols = [col for col in df if \"split\" in col]\n df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n id_vars=[\"n_features\"],\n value_vars=split_score_cols,\n var_name=\"split\",\n value_name=\"score\",\n )\n df_long_form[\"set\"] = np.where(\n df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n )\n lineplot_kwargs = dict(\n x=\"n_features\",\n y=\"score\",\n hue=\"set\",\n markers=True,\n style=\"set\",\n hue_order=[\"validation\", \"train\"],\n style_order=[\"validation\", \"train\"],\n seed=self.random_state,\n )\n lineplot_kwargs.update(**kwargs)\n ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n ax.set_xticks(df.n_features)\n return ax\n
"},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)
","text":"Changes the number of features to select after fitting.
The underlying estimator will not be retrained, so this method will not alter the behavior of predict/predict_proba, but it will change the behavior of transform and get_feature_names_out.
Parameters:
Returns:
-
self
( object
) \u2013 Fitted estimator.
Raises:
Source code in felimination/rfe.py
def set_n_features_to_select(self, n_features_to_select):\n \"\"\"Changes the number of features to select after fitting.\n\n The underlying estimator **will not be retrained**. So this method will not\n alter the behavior of predict/predict_proba but it will change the behavior\n of transform and get_feature_names_out.\n\n Parameters\n ----------\n n_features_to_select : int\n The number of features to select. Must be a value among\n `cv_results_[\"n_features\"]`\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Raises\n ------\n ValueError\n When the number of features to select has not been tried during the\n feature selection procedure.\n \"\"\"\n check_is_fitted(self)\n if n_features_to_select not in self.cv_results_[\"n_features\"]:\n raise ValueError(\n f\"This selector has not been fitted up with {n_features_to_select}, \"\n f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n \"refit the selector changing the step parameter of the n_features_to_select\"\n )\n support_ = np.zeros_like(self.support_, dtype=bool)\n support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n self.support_ = support_\n return self\n
"},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV","title":"PermutationImportanceRFECV(estimator, *, step=1, n_features_to_select=1, cv=None, scoring=None, verbose=0, n_jobs=None, n_repeats=5, random_state=None, sample_weight=None, max_samples=1.0, callbacks=None)
","text":" Bases: FeliminationRFECV
Preset of FeliminationRFECV using permutation importance as importance getter.
It differs from scikit-learn's RFECV in the following ways:
- It supports an
importance_getter
function that also uses a validation set to compute the feature importances. This allows the use of importance measures like permutation importance or SHAP. - Instead of using cross-validation to select the number of features, it uses cross-validation to get a more accurate estimate of the feature importances. This means that the number of features to select has to be set during initialization, similarly to RFE.
- When
step
is a float value, it removes a percentage of the remaining features rather than of the total number of features as in RFE/RFECV. This allows dropping big chunks of features at the beginning of the RFE process and slowing down towards the end of the process. - Has a plotting function
- Adds information about the number of features selected at each step in the attribute
cv_results_
- Allows changing the number of features to select after fitting.
Other than that, it is a copy-paste of RFE, so credit goes to scikit-learn.
The algorithm of feature selection goes as follows:
while n_features > n_features_to_select:\n - The estimator is trained on the selected features and the score is\n computed using cross validation.\n - feature importance is computed for each validation fold on the validation\n set and then averaged.\n - The least important features are pruned.\n - The pruned features are removed from the dataset.\n
Parameters:
-
estimator
(``Estimator`` instance
) \u2013 A supervised learning estimator with a fit
method.
-
step
(int or float
, default: 1
) \u2013 If greater than or equal to 1, then step
corresponds to the (integer) number of features to remove at each iteration. If within (0.0, 1.0), then step
corresponds to the percentage (rounded down) of remaining features to remove at each iteration. Note that the last iteration may remove fewer than step
features in order to reach n_features_to_select
.
-
n_features_to_select
(int or float
, default: None
) \u2013 The number of features to select. If None
, half of the features are selected. If integer, the parameter is the absolute number of features to select. If float between 0 and 1, it is the fraction of the features to select.
-
cv
(int, cross-validation generator or an iterable
, default: None
) \u2013 Determines the cross-validation splitting strategy. Possible inputs for cv are:
- None, to use the default 5-fold cross-validation,
- integer, to specify the number of folds.
- :term:
CV splitter
, - An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if y
is binary or multiclass, ~sklearn.model_selection.StratifiedKFold
is used. If the estimator is not a classifier or if y
is neither binary nor multiclass, ~sklearn.model_selection.KFold
is used.
Refer to the :ref:User Guide <cross_validation>
for the various cross-validation strategies that can be used here.
-
scoring
((str, callable or None)
, default: None
) \u2013 A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)
.
-
verbose
(int
, default: 0
) \u2013 Controls verbosity of output.
-
n_jobs
(int or None
, default: None
) \u2013 Number of cores to run in parallel while fitting across folds. None
means 1 unless in a :obj:joblib.parallel_backend
context. -1
means using all processors.
-
n_repeats
(int
, default: 5
) \u2013 Number of times to permute a feature.
-
random_state
(int, RandomState instance
, default: None
) \u2013 Pseudo-random number generator to control the permutations of each feature. Pass an int to get reproducible results across function calls.
-
sample_weight
(array-like of shape (n_samples,)
, default: None
) \u2013 Sample weights used in scoring.
-
max_samples
(int or float
, default: 1.0
) \u2013 The number of samples to draw from X to compute feature importance in each repeat (without replacement). - If int, then draw max_samples
samples. - If float, then draw max_samples * X.shape[0]
samples. - If max_samples
is equal to 1.0
or X.shape[0]
, all samples will be used. While using this option may provide less accurate importance estimates, it keeps the method tractable when evaluating feature importance on large datasets. In combination with n_repeats
, this allows controlling the trade-off between computational speed and statistical accuracy of this method (a configuration sketch follows this parameter list).
-
callbacks
(list of callable
, default: None
) \u2013 List of callables to be called at the end of each step of the feature selection. Each callable should accept two parameters: the selector and the importances computed at that step.
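As a sketch of the speed versus accuracy trade-off mentioned for max_samples above (the numbers are arbitrary):
from felimination.rfe import PermutationImportanceRFECV\nfrom sklearn.linear_model import LogisticRegression\n\n# Cheaper permutation importance on a large dataset: fewer repeats,\n# permuting on a 20% subsample of each validation fold.\nselector = PermutationImportanceRFECV(\n    LogisticRegression(),\n    n_repeats=3,\n    max_samples=0.2,\n)\n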
Attributes:
-
classes_
(ndarray of shape (n_classes,)
) \u2013 The class labels. Only available when estimator
is a classifier.
-
estimator_
(``Estimator`` instance
) \u2013 The fitted estimator used to select features.
-
cv_results_
(dict of ndarrays
) \u2013 A dict with keys: - n_features : ndarray of shape (n_subsets_of_features,) The number of features used at that step. - split(k)_test_score : ndarray of shape (n_subsets_of_features,) The cross-validation test scores across the k-th fold. - mean_test_score : ndarray of shape (n_subsets_of_features,) Mean of test scores over the folds. - std_test_score : ndarray of shape (n_subsets_of_features,) Standard deviation of test scores over the folds. - split(k)_train_score : ndarray of shape (n_subsets_of_features,) The cross-validation train scores across the k-th fold. - mean_train_score : ndarray of shape (n_subsets_of_features,) Mean of train scores over the folds. - std_train_score : ndarray of shape (n_subsets_of_features,) Standard deviation of train scores over the folds.
-
n_features_
(int
) \u2013 The number of selected features.
-
n_features_in_
(int
) \u2013 Number of features seen during :term:fit
. Only defined if the underlying estimator exposes such an attribute when fit.
-
feature_names_in_
(ndarray of shape (`n_features_in_`,)
) \u2013 Names of features seen during :term:fit
. Defined only when X
has feature names that are all strings.
-
ranking_
(ndarray of shape (n_features,)
) \u2013 The feature ranking, such that ranking_[i]
corresponds to the ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1.
-
support_
(ndarray of shape (n_features,)
) \u2013 The mask of selected features.
Examples:
The following example shows how to retrieve the 5 most informative features in the Friedman #1 dataset.
>>> from felimination.rfe import PermutationImportanceRFECV\n>>> from sklearn.datasets import make_friedman1\n>>> from sklearn.svm import SVR\n>>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n>>> estimator = SVR(kernel=\"linear\")\n>>> selector = PermutationImportanceRFECV(\n estimator,\n step=1,\n cv=5,\n n_features_to_select=5,\n )\n>>> selector = selector.fit(X, y)\n>>> selector.support_\narray([ True, True, True, True, True, False, False, False, False,\n False])\n>>> selector.ranking_\narray([1, 1, 1, 1, 1, 6, 3, 4, 2, 5])\n
Source code in felimination/rfe.py
def __init__(\n self,\n estimator: BaseEstimator | LogisticRegression,\n *,\n step=1,\n n_features_to_select=1,\n cv=None,\n scoring=None,\n verbose=0,\n n_jobs=None,\n n_repeats=5,\n random_state=None,\n sample_weight=None,\n max_samples=1.0,\n callbacks=None,\n) -> None:\n self.n_repeats = n_repeats\n self.sample_weight = sample_weight\n self.max_samples = max_samples\n super().__init__(\n estimator,\n step=step,\n n_features_to_select=n_features_to_select,\n cv=cv,\n random_state=random_state,\n scoring=scoring,\n verbose=verbose,\n n_jobs=n_jobs,\n callbacks=callbacks,\n importance_getter=PermutationImportance(\n scoring=scoring,\n n_repeats=n_repeats,\n # Better not to do double parallelization\n n_jobs=1,\n random_state=random_state,\n sample_weight=sample_weight,\n max_samples=max_samples,\n ),\n )\n
"},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV.fit","title":"fit(X, y, groups=None, **fit_params)
","text":"Fit the RFE model and then the underlying estimator on the selected features.
Parameters:
Returns:
-
self
( object
) \u2013 Fitted estimator.
Source code in felimination/rfe.py
def fit(self, X, y, groups=None, **fit_params):\n \"\"\"Fit the RFE model and then the underlying estimator on the selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n y : array-like of shape (n_samples,)\n The target values.\n **fit_params : dict\n Additional parameters passed to the `fit` method of the underlying\n estimator.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._validate_params()\n tags = self._get_tags()\n self._validate_data(\n X,\n y,\n accept_sparse=\"csc\",\n ensure_min_features=2,\n force_all_finite=not tags.get(\"allow_nan\", True),\n multi_output=True,\n dtype=None,\n )\n\n # Initialization\n cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n scorer = check_scoring(self.estimator, scoring=self.scoring)\n n_features = X.shape[1]\n\n if self.n_features_to_select is None:\n n_features_to_select = n_features // 2\n elif isinstance(self.n_features_to_select, Integral): # int\n n_features_to_select = self.n_features_to_select\n else: # float\n n_features_to_select = int(n_features * self.n_features_to_select)\n\n support_ = np.ones(n_features, dtype=bool)\n ranking_ = np.ones(n_features, dtype=int)\n\n current_number_of_features = n_features\n self.cv_results_ = defaultdict(list)\n\n # Elimination\n while current_number_of_features > n_features_to_select:\n # Select remaining features\n X_remaining_features, features = self._select_X_with_remaining_features(\n X, support=support_, n_features=n_features\n )\n\n if self.verbose > 0:\n print(\n \"Fitting estimator with %d features.\" % current_number_of_features\n )\n\n # Train model, score it and get importances\n if effective_n_jobs(self.n_jobs) == 1:\n parallel, func = list, _train_score_get_importance\n else:\n parallel = Parallel(n_jobs=self.n_jobs)\n func = delayed(_train_score_get_importance)\n\n scores_importances = parallel(\n func(\n self.estimator,\n X_remaining_features,\n y,\n train,\n test,\n scorer,\n self.importance_getter,\n )\n for train, test in cv.split(X_remaining_features, y, groups)\n )\n train_scores_per_fold = [\n score_importance[0] for score_importance in scores_importances\n ]\n test_scores_per_fold = [\n score_importance[1] for score_importance in scores_importances\n ]\n cv_importances = [\n score_importance[2] for score_importance in scores_importances\n ]\n mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n ranks = np.argsort(mean_importances)\n\n # for sparse case ranks is matrix\n ranks = np.ravel(ranks)\n\n if 0.0 < self.step < 1.0:\n step = int(max(1, self.step * current_number_of_features))\n else:\n step = int(self.step)\n\n # Eliminate the worst features\n threshold = min(step, current_number_of_features - n_features_to_select)\n\n support_[features[ranks][:threshold]] = False\n ranking_[np.logical_not(support_)] += 1\n\n # Update cv scores\n for train_or_test, scores_per_fold in zip(\n [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n ):\n for i, score in enumerate(scores_per_fold):\n self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n np.mean(scores_per_fold)\n )\n self.cv_results_[f\"std_{train_or_test}_score\"].append(\n np.std(scores_per_fold)\n )\n self.cv_results_[\"n_features\"].append(current_number_of_features)\n if self.callbacks:\n for callback in self.callbacks:\n callback(self, cv_importances)\n\n current_number_of_features = 
np.sum(support_)\n # Set final attributes\n\n # Estimate performances of final model\n X_remaining_features, features = self._select_X_with_remaining_features(\n X, support=support_, n_features=n_features\n )\n\n cv_scores = cross_validate(\n self.estimator,\n X_remaining_features,\n y,\n groups=groups,\n scoring=scorer,\n cv=cv,\n n_jobs=self.n_jobs,\n fit_params=fit_params,\n return_train_score=True,\n )\n self.cv_results_[\"n_features\"].append(current_number_of_features)\n # Update cv scores\n for train_or_test in [\"train\", \"test\"]:\n scores_per_fold = cv_scores[f\"{train_or_test}_score\"]\n for i, score in enumerate(scores_per_fold):\n self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n np.mean(scores_per_fold)\n )\n self.cv_results_[f\"std_{train_or_test}_score\"].append(\n np.std(scores_per_fold)\n )\n\n if self.callbacks:\n for callback in self.callbacks:\n callback(self, cv_importances)\n\n X_remaining_features, features = self._select_X_with_remaining_features(\n X, support=support_, n_features=n_features\n )\n\n self.estimator_ = clone(self.estimator)\n self.estimator_.fit(X_remaining_features, y, **fit_params)\n\n self.n_features_ = support_.sum()\n self.support_ = support_\n self.ranking_ = ranking_\n self.cv_results_ = dict(self.cv_results_)\n return self\n
"},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV.plot","title":"plot(**kwargs)
","text":"Plot a feature selection plot with number of features
Parameters:
Returns:
Source code in felimination/rfe.py
def plot(self, **kwargs):\n \"\"\"Plot a feature selection plot with number of features\n\n Parameters\n ----------\n **kwargs : dict\n Additional parameters passed to seaborn.lineplot. For a list\n of possible options, please visit\n [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html) # noqa\n\n Returns\n -------\n matplotlib.axes.Axes\n The axis where the plot has been plotted.\n \"\"\"\n check_is_fitted(self)\n df = pd.DataFrame(self.cv_results_)\n split_score_cols = [col for col in df if \"split\" in col]\n df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n id_vars=[\"n_features\"],\n value_vars=split_score_cols,\n var_name=\"split\",\n value_name=\"score\",\n )\n df_long_form[\"set\"] = np.where(\n df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n )\n lineplot_kwargs = dict(\n x=\"n_features\",\n y=\"score\",\n hue=\"set\",\n markers=True,\n style=\"set\",\n hue_order=[\"validation\", \"train\"],\n style_order=[\"validation\", \"train\"],\n seed=self.random_state,\n )\n lineplot_kwargs.update(**kwargs)\n ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n ax.set_xticks(df.n_features)\n return ax\n
"},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)
","text":"Changes the number of features to select after fitting.
The underlying estimator will not be retrained, so this method will not alter the behavior of predict/predict_proba, but it will change the behavior of transform and get_feature_names_out.
Parameters:
Returns:
-
self
( object
) \u2013 Fitted estimator.
Raises:
Source code in felimination/rfe.py
def set_n_features_to_select(self, n_features_to_select):\n \"\"\"Changes the number of features to select after fitting.\n\n The underlying estimator **will not be retrained**. So this method will not\n alter the behavior of predict/predict_proba but it will change the behavior\n of transform and get_feature_names_out.\n\n Parameters\n ----------\n n_features_to_select : int\n The number of features to select. Must be a value among\n `cv_results_[\"n_features\"]`\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Raises\n ------\n ValueError\n When the number of features to select has not been tried during the\n feature selection procedure.\n \"\"\"\n check_is_fitted(self)\n if n_features_to_select not in self.cv_results_[\"n_features\"]:\n raise ValueError(\n f\"This selector has not been fitted up with {n_features_to_select}, \"\n f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n \"refit the selector changing the step parameter of the n_features_to_select\"\n )\n support_ = np.zeros_like(self.support_, dtype=bool)\n support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n self.support_ = support_\n return self\n
"},{"location":"reference/drift/","title":"Drift","text":"The idea behind this module comes from the conjunction of two concepts:
- [1] Classifier Two-Sample Test
- [2] Recursive Feature Elimination
In [1], classifier performance is used to determine how similar two samples are. More specifically, imagine having two samples: reference
and test
. In order to assess whether reference
and test
have been drawn from the same distribution, we could train a classifier to predict which sample each instance belongs to. If the model easily distinguishes instances from the two samples, then the two samples have probably been drawn from two different distributions. Conversely, if the classifier struggles to distinguish them, then it is likely that the samples have been drawn from the same distribution.
In the context of drift detection, the classifier two-sample test can be used to assess whether drift has happened between the reference and the test set and to which degree.
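A minimal sketch of this idea, assuming two NumPy arrays X_reference and X_test (the names and the scoring choice are illustrative):
import numpy as np\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import cross_val_score\n\n# Label reference rows 0 and test rows 1, then measure how well a\n# classifier separates the two samples.\nX_ss = np.vstack([X_reference, X_test])\ny_ss = np.concatenate([np.zeros(len(X_reference)), np.ones(len(X_test))])\nauc = cross_val_score(LogisticRegression(), X_ss, y_ss, cv=5, scoring=\"roc_auc\").mean()\n# AUC close to 0.5: the samples look alike. AUC close to 1.0: likely drift.\n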
The classes of this module take this idea one step further and attempt to reduce the drift using recursive feature selection. After a classifier is trained to distinguish between reference
and test
, the feature importance of the classifier is used to determine which features contribute the most to distinguishing between the two sets. The most important features are then eliminated, and the procedure is repeated until the classifier is no longer able to distinguish between the two samples, or until a certain number of features has been removed.
This module contains the following classes: - SampleSimilarityDriftRFE
: base class for drift-based sample similarity feature selection.
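A usage sketch might look like the following; the data is synthetic and the parameter values are illustrative (see the class reference below for the full parameter list):
from felimination.drift import SampleSimilarityDriftRFE\nfrom sklearn.linear_model import LogisticRegression\nimport numpy as np\n\n# Synthetic data: 20 features plus a time-like column at index 0 that is\n# used to split the rows into a reference set and a test set.\nX = np.random.default_rng(42).normal(size=(1000, 21))\n\nselector = SampleSimilarityDriftRFE(\n    LogisticRegression(),\n    split_col=0,     # column used to split the dataset in two\n    max_score=0.55,  # stop once the two samples are hard to tell apart\n    step=0.2,\n)\nselector.fit(X)\nselector.support_\n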
"},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE","title":"PermImpSampleSimilarityDriftRFE(clf, *, step=1, max_score=0.55, min_n_features_to_select=1, split_col=0, split_value=None, split_frac=0.5, split_unique_values=True, cv=None, scoring=None, verbose=0, n_jobs=None, n_repeats=5, random_state=None, sample_weight=None, max_samples=1.0)
","text":" Bases: SampleSimilarityDriftRFE
Preset of SampleSimilarityDriftRFE using permutation importance as importance getter.
It differs from scikit-learn's RFECV in the following ways:
- It supports an
importance_getter
function that also uses a validation set to compute the feature importances. This allows the use of importance measures like permutation importance or SHAP. - Instead of using cross-validation to select the number of features, it uses cross-validation to get a more accurate estimate of the feature importances. This means that the number of features to select has to be set during initialization, similarly to RFE.
- When
step
is a float value, it removes a percentage of the remaining features rather than of the total number of features as in RFE/RFECV. This allows dropping big chunks of features at the beginning of the RFE process and slowing down towards the end of the process. - Has a plotting function
- Adds information about the number of features selected at each step in the attribute
cv_results_
- Allows changing the number of features to select after fitting.
Other than that, it is a copy-paste of RFE, so credit goes to scikit-learn.
The algorithm of feature selection goes as follows:
while n_features > n_features_to_select:\n - The estimator is trained on the selected features and the score is\n computed using cross validation.\n - feature importance is computed for each validation fold on the validation\n set and then averaged.\n - The least important features are pruned.\n - The pruned features are removed from the dataset.\n
Parameters:
-
clf
(``Classifier`` instance
) \u2013 A Classifier with a fit
method.
-
step
(int or float
, default: 1
) \u2013 If greater than or equal to 1, then step
corresponds to the (integer) number of features to remove at each iteration. If within (0.0, 1.0), then step
corresponds to the percentage (rounded down) of remaining features to remove at each iteration. Note that the last iteration may remove fewer than step
features in order to reach min_n_features_to_select
.
-
max_score
(float
, default: 0.55
) \u2013 Stops the feature selection procedure when the cross-validation score of the sample similarity classifier is lower than max_score
.
-
min_n_features_to_select
(int or float
, default: 1
) \u2013 The minimum number of features to select. If None
, half of the features are selected. If integer, the parameter is the absolute number of features to select. If float between 0 and 1, it is the fraction of the features to select.
-
split_col
(str or int
, default: 0
) \u2013 The name or positional index of the column in the dataset that will be used to split the dataset into two sets.
-
split_value
(Any
, default: None
) \u2013 If defined, this value will be used to split the dataset into two sets.
-
split_frac
(float
, default: 0.5
) \u2013 If split_value is not defined, split_frac is used to determine the split_value. The split_frac corresponds to the quantile of the split_col to use as the split_value.
-
split_unique_values
\u2013 Whether to calculate the quantile of the split_col to use as the split_value based on the unique values of the split_col.
-
cv
(int, cross-validation generator or an iterable
, default: None
) \u2013 Determines the cross-validation splitting strategy. Possible inputs for cv are:
- None, to use the default 5-fold cross-validation,
- integer, to specify the number of folds.
- :term:
CV splitter
, - An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if y
is binary or multiclass, :class:~sklearn.model_selection.StratifiedKFold
is used. If the estimator is not a classifier or if y
is neither binary nor multiclass, :class:~sklearn.model_selection.KFold
is used.
Refer to the :ref:User Guide <cross_validation>
for the various cross-validation strategies that can be used here.
-
scoring
((str, callable or None)
, default: None
) \u2013 A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)
.
-
verbose
(int
, default: 0
) \u2013 Controls verbosity of output.
-
n_jobs
(int or None
, default: None
) \u2013 Number of cores to run in parallel while fitting across folds. None
means 1 unless in a :obj:joblib.parallel_backend
context. -1
means using all processors.
-
n_repeats
(int
, default: 5
) \u2013 Number of times to permute a feature.
-
random_state
(int, RandomState instance
, default: None
) \u2013 Pseudo-random number generator to control the permutations of each feature. Pass an int to get reproducible results across function calls.
-
sample_weight
(array-like of shape (n_samples,)
, default: None
) \u2013 Sample weights used in scoring.
-
max_samples
(int or float
, default: 1.0
) \u2013 The number of samples to draw from X to compute feature importance in each repeat (without replacement). - If int, then draw max_samples
samples. - If float, then draw max_samples * X.shape[0]
samples. - If max_samples
is equal to 1.0
or X.shape[0]
, all samples will be used. While using this option may provide less accurate importance estimates, it keeps the method tractable when evaluating feature importance on large datasets. In combination with n_repeats
, this allows controlling the trade-off between computational speed and statistical accuracy of this method.
Attributes:
-
classes_
(ndarray of shape (n_classes,)
) \u2013 The classes labels. Only available when estimator
is a classifier.
-
estimator_
(``Estimator`` instance
) \u2013 The fitted estimator used to select features.
-
cv_results_
(dict of ndarrays
) \u2013 A dict with keys: - n_features : ndarray of shape (n_subsets_of_features,) The number of features used at that step. - split(k)_test_score : ndarray of shape (n_subsets_of_features,) The cross-validation test scores across the k-th fold. - mean_test_score : ndarray of shape (n_subsets_of_features,) Mean of test scores over the folds. - std_test_score : ndarray of shape (n_subsets_of_features,) Standard deviation of test scores over the folds. - split(k)_train_score : ndarray of shape (n_subsets_of_features,) The cross-validation train scores across the k-th fold. - mean_train_score : ndarray of shape (n_subsets_of_features,) Mean of train scores over the folds. - std_train_score : ndarray of shape (n_subsets_of_features,) Standard deviation of train scores over the folds.
-
n_features_
(int
) \u2013 The number of selected features.
-
n_features_in_
(int
) \u2013 Number of features seen during :term:fit
. Only defined if the underlying estimator exposes such an attribute when fit.
-
feature_names_in_
(ndarray of shape (`n_features_in_`,)
) \u2013 Names of features seen during :term:fit
. Defined only when X
has feature names that are all strings.
-
ranking_
(ndarray of shape (n_features,)
) \u2013 The feature ranking, such that ranking_[i]
corresponds to the ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1.
-
support_
(ndarray of shape (n_features,)
) \u2013 The mask of selected features.
Source code in felimination/drift.py
def __init__(\n self,\n clf: ClassifierMixin,\n *,\n step=1,\n max_score=0.55,\n min_n_features_to_select=1,\n split_col=0,\n split_value=None,\n split_frac=0.5,\n split_unique_values=True,\n cv=None,\n scoring=None,\n verbose=0,\n n_jobs=None,\n n_repeats=5,\n random_state=None,\n sample_weight=None,\n max_samples=1.0,\n) -> None:\n self.n_repeats = n_repeats\n self.sample_weight = sample_weight\n self.max_samples = max_samples\n super().__init__(\n clf=clf,\n max_score=max_score,\n min_n_features_to_select=min_n_features_to_select,\n split_col=split_col,\n split_value=split_value,\n split_frac=split_frac,\n split_unique_values=split_unique_values,\n step=step,\n cv=cv,\n scoring=scoring,\n random_state=random_state,\n verbose=verbose,\n n_jobs=n_jobs,\n importance_getter=PermutationImportance(\n scoring=scoring,\n n_repeats=n_repeats,\n # Better not to do double parallelization\n n_jobs=1,\n random_state=random_state,\n sample_weight=sample_weight,\n max_samples=max_samples,\n ),\n )\n
"},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE.fit","title":"fit(X, y=None, groups=None, **fit_params)
","text":"Fit the RFE model and then the underlying clf on the selected features.
Parameters:
-
X
(array-like, sparse matrix
, default: array-like
) \u2013 The training input samples.
-
y
(array-like of shape (n_samples,)
, default: None
) \u2013 The target values. Not used, kept for compatibility.
-
groups
(array-like of shape (n_samples,)
, default: None
) \u2013 Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a \"Group\" :term:cv
instance.
-
**fit_params
(dict
, default: {}
) \u2013 Additional parameters passed to the fit
method of the underlying clf.
Returns:
-
self
( object
) \u2013 Fitted selector.
Source code in felimination/drift.py
def fit(self, X, y=None, groups=None, **fit_params):\n \"\"\"Fit the RFE model and then the underlying clf on the selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n y : array-like of shape (n_samples,)\n The target values. Not used, kept for compatibility.\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance.\n **fit_params : dict\n Additional parameters passed to the `fit` method of the underlying\n clf.\n\n Returns\n -------\n self : object\n Fitted selector.\n \"\"\"\n self._validate_params()\n tags = self._get_tags()\n X = self._validate_data(\n X,\n y,\n accept_sparse=\"csc\",\n ensure_min_features=2,\n force_all_finite=not tags.get(\"allow_nan\", True),\n dtype=None,\n )\n if isinstance(self.split_col, str):\n split_col_idx = list(self.feature_names_in_).index(self.split_col)\n else:\n split_col_idx = self.split_col\n split_col_values = X[:, split_col_idx]\n X, y = self._build_sample_similarity_x_y(X, split_col_values=split_col_values)\n\n # Initialization\n cv = check_cv(self.cv, y, classifier=True)\n scorer = check_scoring(self.clf, scoring=self.scoring)\n n_features = X.shape[1]\n\n if self.min_n_features_to_select is None:\n min_n_features_to_select = n_features // 2\n elif isinstance(self.min_n_features_to_select, Integral): # int\n min_n_features_to_select = self.min_n_features_to_select\n else: # float\n min_n_features_to_select = int(n_features * self.min_n_features_to_select)\n\n support_ = np.ones(n_features, dtype=bool)\n support_[split_col_idx] = False\n ranking_ = np.ones(n_features, dtype=int)\n\n current_number_of_features = support_.sum()\n self.cv_results_ = defaultdict(list)\n\n if self.verbose > 0:\n print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n # Train model, score it and get importances\n if effective_n_jobs(self.n_jobs) == 1:\n parallel, func = list, _train_score_get_importance\n else:\n parallel = Parallel(n_jobs=self.n_jobs)\n func = delayed(_train_score_get_importance)\n\n features = np.arange(n_features)[support_]\n X_remaining_features = X[:, features]\n\n scores_importances = parallel(\n func(\n self.clf,\n X_remaining_features,\n y,\n train,\n test,\n scorer,\n self.importance_getter,\n )\n for train, test in cv.split(X_remaining_features, y, groups)\n )\n\n test_scores_per_fold = [\n score_importance[1] for score_importance in scores_importances\n ]\n train_scores_per_fold = [\n score_importance[0] for score_importance in scores_importances\n ]\n\n # Update cv scores\n for train_or_test, scores_per_fold in zip(\n [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n ):\n for i, score in enumerate(scores_per_fold):\n self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n np.mean(scores_per_fold)\n )\n self.cv_results_[f\"std_{train_or_test}_score\"].append(\n np.std(scores_per_fold)\n )\n self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n # Elimination\n while (\n np.mean(test_scores_per_fold) > self.max_score\n and current_number_of_features > min_n_features_to_select\n ):\n features = np.arange(n_features)[support_]\n if 0.0 < self.step < 1.0:\n step = int(max(1, self.step * current_number_of_features))\n else:\n step = int(self.step)\n # Eliminate most important 
features\n threshold = min(step, current_number_of_features - min_n_features_to_select)\n cv_importances = [\n score_importance[2] for score_importance in scores_importances\n ]\n mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n ranks = np.argsort(-mean_importances)\n ranks = np.ravel(ranks)\n support_[features[ranks][:threshold]] = False\n ranking_[np.logical_not(support_)] += 1\n current_number_of_features = np.sum(support_)\n # Select remaining features\n features = np.arange(n_features)[support_]\n X_remaining_features = X[:, features]\n\n if self.verbose > 0:\n print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n # Train model, score it and get importances\n if effective_n_jobs(self.n_jobs) == 1:\n parallel, func = list, _train_score_get_importance\n else:\n parallel = Parallel(n_jobs=self.n_jobs)\n func = delayed(_train_score_get_importance)\n\n scores_importances = parallel(\n func(\n self.clf,\n X_remaining_features,\n y,\n train,\n test,\n scorer,\n self.importance_getter,\n )\n for train, test in cv.split(X_remaining_features, y, groups)\n )\n train_scores_per_fold = [\n score_importance[0] for score_importance in scores_importances\n ]\n test_scores_per_fold = [\n score_importance[1] for score_importance in scores_importances\n ]\n\n # Update cv scores\n for train_or_test, scores_per_fold in zip(\n [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n ):\n for i, score in enumerate(scores_per_fold):\n self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n np.mean(scores_per_fold)\n )\n self.cv_results_[f\"std_{train_or_test}_score\"].append(\n np.std(scores_per_fold)\n )\n self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n features = np.arange(n_features)[support_]\n self.clf_ = clone(self.clf)\n self.clf_.fit(X[:, features], y, **fit_params)\n\n self.n_features_ = support_.sum()\n self.support_ = support_\n self.ranking_ = ranking_\n self.cv_results_ = dict(self.cv_results_)\n return self\n
"},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE.plot","title":"plot(**kwargs)
","text":"Plot a feature selection plot with number of features
Parameters:
Returns:
Source code in felimination/rfe.py
def plot(self, **kwargs):\n \"\"\"Plot a feature selection plot with number of features\n\n Parameters\n ----------\n **kwargs : dict\n Additional parameters passed to seaborn.lineplot. For a list\n of possible options, please visit\n [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html) # noqa\n\n Returns\n -------\n matplotlib.axes.Axes\n The axis where the plot has been plotted.\n \"\"\"\n check_is_fitted(self)\n df = pd.DataFrame(self.cv_results_)\n split_score_cols = [col for col in df if \"split\" in col]\n df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n id_vars=[\"n_features\"],\n value_vars=split_score_cols,\n var_name=\"split\",\n value_name=\"score\",\n )\n df_long_form[\"set\"] = np.where(\n df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n )\n lineplot_kwargs = dict(\n x=\"n_features\",\n y=\"score\",\n hue=\"set\",\n markers=True,\n style=\"set\",\n hue_order=[\"validation\", \"train\"],\n style_order=[\"validation\", \"train\"],\n seed=self.random_state,\n )\n lineplot_kwargs.update(**kwargs)\n ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n ax.set_xticks(df.n_features)\n return ax\n
"},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)
","text":"Changes the number of features to select after fitting.
The underlying estimator will not be retrained, so this method will not alter the behavior of predict/predict_proba, but it will change the behavior of transform and get_feature_names_out.
Parameters:
Returns:
-
self
( object
) \u2013 Fitted estimator.
Raises:
Source code in felimination/rfe.py
def set_n_features_to_select(self, n_features_to_select):\n \"\"\"Changes the number of features to select after fitting.\n\n The underlying estimator **will not be retrained**. So this method will not\n alter the behavior of predict/predict_proba but it will change the behavior\n of transform and get_feature_names_out.\n\n Parameters\n ----------\n n_features_to_select : int\n The number of features to select. Must be a value among\n `cv_results_[\"n_features\"]`\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Raises\n ------\n ValueError\n When the number of features to select has not been tried during the\n feature selection procedure.\n \"\"\"\n check_is_fitted(self)\n if n_features_to_select not in self.cv_results_[\"n_features\"]:\n raise ValueError(\n f\"This selector has not been fitted up with {n_features_to_select}, \"\n f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n \"refit the selector changing the step parameter of the n_features_to_select\"\n )\n support_ = np.zeros_like(self.support_, dtype=bool)\n support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n self.support_ = support_\n return self\n
"},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE","title":"SampleSimilarityDriftRFE(clf, *, step=1, max_score=0.55, min_n_features_to_select=1, split_col=0, split_value=None, split_frac=0.5, split_unique_values=True, cv=None, scoring=None, random_state=None, verbose=0, n_jobs=None, importance_getter='auto')
","text":" Bases: FeliminationRFECV
Recursively discards the features that introduce the highest drift.
The feature selection algorithm works as follows:
Split X into two sets using the `split_col`: X1 and X2\ncreate target array y1 for X1 as an array of zeroes\ncreate target array y2 for X2 as an array of ones\nvertically concatenate X1, X2 and y1 and y2, obtaining X_ss and y_ss\nCalculate Cross-validation performances of the estimator on X_ss and y_ss.\nwhile cross-validation-performances > max_score and n_features > min_n_features_to_select:\n    Discard most important features\n    Calculate Cross-validation performances of the estimator on X_ss and y_ss using the new feature set.\n
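The split step above can be sketched in a few lines of NumPy (an illustration only; the library performs this internally, via the private method _build_sample_similarity_x_y referenced in the fit source below):
import numpy as np\n\ndef build_sample_similarity_xy(X1, X2):\n    # Label the two subsets and stack them: a classifier that can\n    # separate X1 from X2 reveals drift between the two samples\n    X_ss = np.vstack([X1, X2])\n    y_ss = np.concatenate([np.zeros(len(X1)), np.ones(len(X2))])\n    return X_ss, y_ss\n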
Parameters:
-
clf
(``Classifier`` instance
) \u2013 A Classifier with a fit
method.
-
step
(int or float
, default: 1
) \u2013 If greater than or equal to 1, then step
corresponds to the (integer) number of features to remove at each iteration. If within (0.0, 1.0), then step
corresponds to the percentage (rounded down) of remaining features to remove at each iteration. Note that the last iteration may remove fewer than step
features in order to reach min_n_features_to_select
.
-
max_score
(float
, default: 0.55
) \u2013 Stops the feature selection procedure when the cross-validation score of the sample similarity classifier is lower than max_score
.
-
min_n_features_to_select
(int or float
, default: 1
) \u2013 The minimum number of features to select. If None
, half of the features are selected. If integer, the parameter is the absolute number of features to select. If float between 0 and 1, it is the fraction of the features to select.
-
split_col
(int or str
, default: 0
) \u2013 The index (or the name, when X is a DataFrame) of the column used to split the dataset into two sets.
-
split_value
(Any
, default: None
) \u2013 If defined, this value will be used to split the dataset into two sets.
-
split_frac
(float
, default: 0.5
) \u2013 If split_value is not defined, split_frac is used to determine the split_value: it corresponds to the quantile of the split_col to use as the split_value.
-
split_unique_values
(bool
, default: True
) \u2013 Whether to calculate the quantile of the split_col to use as the split_value based on the unique values of the split_col.
-
cv
(int, cross-validation generator or an iterable
, default: None
) \u2013 Determines the cross-validation splitting strategy. Possible inputs for cv are:
- None, to use the default 5-fold cross-validation,\n- integer, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n
For integer/None inputs, if y
is binary or multiclass, :class:~sklearn.model_selection.StratifiedKFold
is used. If the estimator is not a classifier or if y
is neither binary nor multiclass, :class:~sklearn.model_selection.KFold
is used.
Refer :ref:User Guide <cross_validation>
for the various cross-validation strategies that can be used here.
-
scoring
((str, callable or None)
, default: None
) \u2013 A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)
.
-
verbose
(int
, default: 0
) \u2013 Controls verbosity of output.
-
n_jobs
(int or None
, default: None
) \u2013 Number of cores to run in parallel while fitting across folds. None
means 1 unless in a :obj:joblib.parallel_backend
context. -1
means using all processors.
-
importance_getter
(str or callable
, default: 'auto'
) \u2013 If 'auto', uses the feature importance either through a coef_
or feature_importances_
attributes of estimator.
Also accepts a string that specifies an attribute name/path for extracting feature importance. For example, give regressor_.coef_
in case of :class:~sklearn.compose.TransformedTargetRegressor
or named_steps.clf.feature_importances_
in case of :class:~sklearn.pipeline.Pipeline
with its last step named clf
.
If callable
, overrides the default feature importance getter. The callable is passed the fitted estimator and the validation set as (estimator, X_val, y_val) and should return an importance value for each feature.
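As an illustration, a custom importance getter could look like the following sketch (the (estimator, X_val, y_val) call signature is assumed here, matching felimination.importance.PermutationImportance.__call__ documented further below):
import numpy as np\n\ndef coefficient_importance(estimator, X_val, y_val):\n    # Illustrative getter: absolute coefficients of a fitted linear model,\n    # ignoring the validation data\n    return np.abs(estimator.coef_).ravel()\n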
Attributes:
-
classes_
(ndarray of shape (n_classes,)
) \u2013 The classes labels.
-
clf_
(``Classifier`` instance
) \u2013 The fitted classifier used to select features.
-
cv_results_
(dict of ndarrays
) \u2013 A dict with keys: n_features : ndarray of shape (n_subsets_of_features,) The number of features used at that step. split(k)_test_score : ndarray of shape (n_subsets_of_features,) The cross-validation test scores of the (k)th fold. mean_test_score : ndarray of shape (n_subsets_of_features,) Mean of test scores over the folds. std_test_score : ndarray of shape (n_subsets_of_features,) Standard deviation of test scores over the folds. split(k)_train_score : ndarray of shape (n_subsets_of_features,) The cross-validation train scores of the (k)th fold. mean_train_score : ndarray of shape (n_subsets_of_features,) Mean of train scores over the folds. std_train_score : ndarray of shape (n_subsets_of_features,) Standard deviation of train scores over the folds.
-
n_features_
(int
) \u2013 The number of selected features.
-
n_features_in_
(int
) \u2013 Number of features seen during :term:fit
. Only defined if the underlying estimator exposes such an attribute when fit.
-
feature_names_in_
(ndarray of shape (`n_features_in_`,)
) \u2013 Names of features seen during :term:fit
. Defined only when X
has feature names that are all strings.
-
ranking_
(ndarray of shape (n_features,)
) \u2013 The feature ranking, such that ranking_[i]
corresponds to the ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1.
-
support_
(ndarray of shape (n_features,)
) \u2013 The mask of selected features.
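Examples:
A minimal usage sketch (the data here is an illustrative assumption: 1000 random rows with no real drift, where column 0 encodes the two samples being compared):
>>> import numpy as np\n>>> from felimination.drift import SampleSimilarityDriftRFE\n>>> from sklearn.linear_model import LogisticRegression\n>>> X = np.random.RandomState(42).normal(size=(1000, 10))\n>>> X[:500, 0] = 0  # first sample\n>>> X[500:, 0] = 1  # second sample\n>>> selector = SampleSimilarityDriftRFE(LogisticRegression(), split_col=0)\n>>> selector = selector.fit(X)\n>>> selector.support_.shape\n(10,)\n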
Source code in felimination/drift.py
def __init__(\n self,\n clf: ClassifierMixin,\n *,\n step=1,\n max_score=0.55,\n min_n_features_to_select=1,\n split_col=0,\n split_value=None,\n split_frac=0.5,\n split_unique_values=True,\n cv=None,\n scoring=None,\n random_state=None,\n verbose=0,\n n_jobs=None,\n importance_getter=\"auto\",\n) -> None:\n self.max_score = max_score\n self.split_col = split_col\n self.split_value = split_value\n self.split_unique_values = split_unique_values\n self.split_frac = split_frac\n self.min_n_features_to_select = min_n_features_to_select\n self.clf = clf\n super().__init__(\n estimator=clf,\n n_features_to_select=min_n_features_to_select,\n step=step,\n cv=cv,\n scoring=scoring,\n random_state=random_state,\n verbose=verbose,\n n_jobs=n_jobs,\n importance_getter=importance_getter,\n )\n
"},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE.fit","title":"fit(X, y=None, groups=None, **fit_params)
","text":"Fit the RFE model and then the underlying clf on the selected features.
Parameters:
-
X
(array-like, sparse matrix
, default: array-like
) \u2013 The training input samples.
-
y
(array-like of shape (n_samples,)
, default: None
) \u2013 The target values. Not used, kept for compatibility.
-
groups
(array-like of shape (n_samples,)
, default: None
) \u2013 Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a \"Group\" :term:cv
instance.
-
**fit_params
(dict
, default: {}
) \u2013 Additional parameters passed to the fit
method of the underlying clf.
Returns:
-
self
( object
) \u2013 Fitted selector.
Source code in felimination/drift.py
def fit(self, X, y=None, groups=None, **fit_params):\n \"\"\"Fit the RFE model and then the underlying clf on the selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n y : array-like of shape (n_samples,)\n The target values. Not used, kept for compatibility.\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance.\n **fit_params : dict\n Additional parameters passed to the `fit` method of the underlying\n clf.\n\n Returns\n -------\n self : object\n Fitted selector.\n \"\"\"\n self._validate_params()\n tags = self._get_tags()\n X = self._validate_data(\n X,\n y,\n accept_sparse=\"csc\",\n ensure_min_features=2,\n force_all_finite=not tags.get(\"allow_nan\", True),\n dtype=None,\n )\n if isinstance(self.split_col, str):\n split_col_idx = list(self.feature_names_in_).index(self.split_col)\n else:\n split_col_idx = self.split_col\n split_col_values = X[:, split_col_idx]\n X, y = self._build_sample_similarity_x_y(X, split_col_values=split_col_values)\n\n # Initialization\n cv = check_cv(self.cv, y, classifier=True)\n scorer = check_scoring(self.clf, scoring=self.scoring)\n n_features = X.shape[1]\n\n if self.min_n_features_to_select is None:\n min_n_features_to_select = n_features // 2\n elif isinstance(self.min_n_features_to_select, Integral): # int\n min_n_features_to_select = self.min_n_features_to_select\n else: # float\n min_n_features_to_select = int(n_features * self.min_n_features_to_select)\n\n support_ = np.ones(n_features, dtype=bool)\n support_[split_col_idx] = False\n ranking_ = np.ones(n_features, dtype=int)\n\n current_number_of_features = support_.sum()\n self.cv_results_ = defaultdict(list)\n\n if self.verbose > 0:\n print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n # Train model, score it and get importances\n if effective_n_jobs(self.n_jobs) == 1:\n parallel, func = list, _train_score_get_importance\n else:\n parallel = Parallel(n_jobs=self.n_jobs)\n func = delayed(_train_score_get_importance)\n\n features = np.arange(n_features)[support_]\n X_remaining_features = X[:, features]\n\n scores_importances = parallel(\n func(\n self.clf,\n X_remaining_features,\n y,\n train,\n test,\n scorer,\n self.importance_getter,\n )\n for train, test in cv.split(X_remaining_features, y, groups)\n )\n\n test_scores_per_fold = [\n score_importance[1] for score_importance in scores_importances\n ]\n train_scores_per_fold = [\n score_importance[0] for score_importance in scores_importances\n ]\n\n # Update cv scores\n for train_or_test, scores_per_fold in zip(\n [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n ):\n for i, score in enumerate(scores_per_fold):\n self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n np.mean(scores_per_fold)\n )\n self.cv_results_[f\"std_{train_or_test}_score\"].append(\n np.std(scores_per_fold)\n )\n self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n # Elimination\n while (\n np.mean(test_scores_per_fold) > self.max_score\n and current_number_of_features > min_n_features_to_select\n ):\n features = np.arange(n_features)[support_]\n if 0.0 < self.step < 1.0:\n step = int(max(1, self.step * current_number_of_features))\n else:\n step = int(self.step)\n # Eliminate most important 
features\n threshold = min(step, current_number_of_features - min_n_features_to_select)\n cv_importances = [\n score_importance[2] for score_importance in scores_importances\n ]\n mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n ranks = np.argsort(-mean_importances)\n ranks = np.ravel(ranks)\n support_[features[ranks][:threshold]] = False\n ranking_[np.logical_not(support_)] += 1\n current_number_of_features = np.sum(support_)\n # Select remaining features\n features = np.arange(n_features)[support_]\n X_remaining_features = X[:, features]\n\n if self.verbose > 0:\n print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n # Train model, score it and get importances\n if effective_n_jobs(self.n_jobs) == 1:\n parallel, func = list, _train_score_get_importance\n else:\n parallel = Parallel(n_jobs=self.n_jobs)\n func = delayed(_train_score_get_importance)\n\n scores_importances = parallel(\n func(\n self.clf,\n X_remaining_features,\n y,\n train,\n test,\n scorer,\n self.importance_getter,\n )\n for train, test in cv.split(X_remaining_features, y, groups)\n )\n train_scores_per_fold = [\n score_importance[0] for score_importance in scores_importances\n ]\n test_scores_per_fold = [\n score_importance[1] for score_importance in scores_importances\n ]\n\n # Update cv scores\n for train_or_test, scores_per_fold in zip(\n [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n ):\n for i, score in enumerate(scores_per_fold):\n self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n np.mean(scores_per_fold)\n )\n self.cv_results_[f\"std_{train_or_test}_score\"].append(\n np.std(scores_per_fold)\n )\n self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n features = np.arange(n_features)[support_]\n self.clf_ = clone(self.clf)\n self.clf_.fit(X[:, features], y, **fit_params)\n\n self.n_features_ = support_.sum()\n self.support_ = support_\n self.ranking_ = ranking_\n self.cv_results_ = dict(self.cv_results_)\n return self\n
"},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE.plot","title":"plot(**kwargs)
","text":"Plot a feature selection plot with number of features
Parameters:
Returns:
Source code in felimination/rfe.py
def plot(self, **kwargs):\n \"\"\"Plot a feature selection plot with number of features\n\n Parameters\n ----------\n **kwargs : dict\n Additional parameters passed to seaborn.lineplot. For a list\n of possible options, please visit\n [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html) # noqa\n\n Returns\n -------\n matplotlib.axes.Axes\n The axis where the plot has been plotted.\n \"\"\"\n check_is_fitted(self)\n df = pd.DataFrame(self.cv_results_)\n split_score_cols = [col for col in df if \"split\" in col]\n df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n id_vars=[\"n_features\"],\n value_vars=split_score_cols,\n var_name=\"split\",\n value_name=\"score\",\n )\n df_long_form[\"set\"] = np.where(\n df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n )\n lineplot_kwargs = dict(\n x=\"n_features\",\n y=\"score\",\n hue=\"set\",\n markers=True,\n style=\"set\",\n hue_order=[\"validation\", \"train\"],\n style_order=[\"validation\", \"train\"],\n seed=self.random_state,\n )\n lineplot_kwargs.update(**kwargs)\n ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n ax.set_xticks(df.n_features)\n return ax\n
"},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)
","text":"Changes the number of features to select after fitting.
The underlying estimator will not be retrained. So this method will not alter the behavior of predict/predict_proba but it will change the behavior of transform and get_feature_names_out.
Parameters:
Returns:
-
self
( object
) \u2013 Fitted estimator.
Raises:
Source code in felimination/rfe.py
def set_n_features_to_select(self, n_features_to_select):\n    \"\"\"Changes the number of features to select after fitting.\n\n    The underlying estimator **will not be retrained**. So this method will not\n    alter the behavior of predict/predict_proba but it will change the behavior\n    of transform and get_feature_names_out.\n\n    Parameters\n    ----------\n    n_features_to_select : int\n        The number of features to select. Must be a value among\n        `cv_results_[\"n_features\"]`\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n\n    Raises\n    ------\n    ValueError\n        When the number of features to select has not been tried during the\n        feature selection procedure.\n    \"\"\"\n    check_is_fitted(self)\n    if n_features_to_select not in self.cv_results_[\"n_features\"]:\n        raise ValueError(\n            f\"This selector has not been fitted with {n_features_to_select} features, \"\n            f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n            \"refit the selector with a different step or n_features_to_select\"\n        )\n    support_ = np.zeros_like(self.support_, dtype=bool)\n    support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n    self.support_ = support_\n    return self\n
"},{"location":"reference/genetic_algorithms/","title":"Genetic algorithms","text":"This module contains the implementation of the Hybrid Genetic Algorithm-Importance with Cross-Validation. The algorithm is implemented in the HybridImportanceGACVFeatureSelector
class.
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector","title":"HybridImportanceGACVFeatureSelector(estimator, *, cv=5, scoring=None, random_state=None, n_jobs=None, importance_getter='auto', min_n_features_to_select=1, init_avg_features_num=15, init_std_features_num=5, pool_size=20, is_parent_selection_chance_proportional_to_fitness=True, n_children_cross_over=5, n_parents_cross_over=2, n_mutations=5, range_change_n_features_mutation=(-2, 3), range_randomly_swapped_features_mutation=(1, 4), max_generations=100, patience=5, callbacks=None, fitness_function=rank_mean_test_score_overfit_fitness)
","text":" Bases: SelectorMixin
, MetaEstimatorMixin
, BaseEstimator
Feature selection using Hybrid Genetic Algorithm-Importance with Cross-Validation.
This feature selector uses a genetic algorithm hybridized with feature importance: the importances are calculated using a cross-validation scheme and guide both cross-over and mutation. The algorithm works as follows:
Pool initialization: The pool is initialized with solutions made of random features. The number of features of each solution is drawn from a normal distribution parametrised by the average number of features to select and its standard deviation, then clipped between the minimum number of features to select and the number of features in the dataset.
Cross Over: The cross-over is done by combining the features of the parents. The features of each parent are sorted by importance and the children are created by combining the features of the parents in a round-robin fashion, so that the children inherit the most important features of the parents (see the sketch after this list). The number of features of each child is the average of the number of features of the parents.
Mutation: The mutation is done by randomly changing the number of features and replacing the least important features with random features.
Selection: The selection is done by selecting the top pool_size
solutions based on the fitness function.
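The cross-over step can be sketched as follows (a simplified illustration of the round-robin combination described above, not the library's exact implementation):
import numpy as np\n\ndef round_robin_cross_over(parents):\n    # Each parent: {\"features\": [...], \"mean_cv_importances\": [...]} with\n    # importances aligned to the features list\n    by_importance = [\n        [f for _, f in sorted(zip(p[\"mean_cv_importances\"], p[\"features\"]), reverse=True)]\n        for p in parents\n    ]\n    n_child_features = int(np.mean([len(p[\"features\"]) for p in parents]))\n    child_features = []\n    i = 0\n    # Take the next most important unseen feature from each parent in turn\n    while len(child_features) < n_child_features:\n        for feature in by_importance[i % len(parents)]:\n            if feature not in child_features:\n                child_features.append(feature)\n                break\n        i += 1\n    return {\"features\": child_features}\n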
Parameters:
-
estimator
(object
) \u2013 An estimator that follows the scikit-learn API and has a fit
method.
-
cv
(int, cross-validation generator or an iterable
, default: 5
) \u2013 Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross-validation, - int, to specify the number of folds in a (Stratified)KFold, - :term:CV splitter
, - An iterable yielding (train, test) splits as arrays of indices.
-
scoring
((str, callable or None)
, default: None
) \u2013 A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)
.
-
random_state
(int or None
, default: None
) \u2013 Controls the random seed given at the beginning of the algorithm.
-
n_jobs
(int or None
, default: None
) \u2013 The number of jobs to run in parallel. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors.
-
importance_getter
(str or callable
, default: 'auto'
) \u2013 If 'auto', uses the feature importance either through a coef_
or feature_importances_
attributes of estimator.
Also accepts a string that specifies an attribute name/path for extracting feature importance. For example, give regressor_.coef_
in case of ~sklearn.compose.TransformedTargetRegressor
or named_steps.clf.feature_importances_
in case of ~sklearn.pipeline.Pipeline
with its last step named clf
.
If callable
, overrides the default feature importance getter. The callable is passed with the fitted estimator and the validation set (X_val, y_val, estimator) and it should return importance for each feature.
-
min_n_features_to_select
(int or float
, default: 1
) \u2013 The minimum number of features to select. If float, it represents the fraction of features to select.
-
init_avg_features_num
(float
, default: 15
) \u2013 The average number of features to select in the initial pool of solutions.
-
init_std_features_num
(float
, default: 5
) \u2013 The standard deviation of the number of features to select in the initial pool of solutions.
-
pool_size
(int
, default: 20
) \u2013 The number of solutions in the pool.
-
n_children_cross_over
(int
, default: 5
) \u2013 The number of children to create by cross-over.
-
is_parent_selection_chance_proportional_to_fitness
(bool
, default: True
) \u2013 If True, the probability of selecting a parent is proportional to its fitness. This means that the fittest parents are more likely to be selected during crossover.
-
n_parents_cross_over
(int
, default: 2
) \u2013 The number of parents to select in each crossover. More than 2 parents can be selected during crossover. In that case, the top features of each parent are combined in a round-robin fashion to create a child. The number of features of the child is the average of the number of features of the parents.
-
n_mutations
(int
, default: 5
) \u2013 The number of mutations to apply to the pool.
-
range_change_n_features_mutation
(tuple
, default: (-2, 3)
) \u2013 The range of the number of features to change during mutation. The first element is the minimum number of features to change and the second element is the maximum number of features to change. The right limit is exclusive.
-
range_randomly_swapped_features_mutation
(tuple
, default: (1, 4)
) \u2013 The range of the number of features to replace during mutation. The first element is the minimum number of features to replace and the second element is the maximum number of features to replace. The right limit is exclusive.
-
max_generations
(int
, default: 100
) \u2013 The maximum number of generations to run the genetic algorithm.
-
patience
(int
, default: 5
) \u2013 The number of generations without improvement to wait before stopping the algorithm.
-
callbacks
(list of callable
, default: None
) \u2013 A list of callables that are called after each generation. Each callable should accept the selector and the pool as arguments.
-
fitness_function
(str or callable
, default: rank_mean_test_score_overfit_fitness
) \u2013 The fitness function to use. Possible string values are: 'mean_test_score'
, 'mean_train_score'
. If a callable is passed, it should accept a list of dictionaries, where each dictionary has the keys 'features', 'mean_test_score' and 'mean_train_score', and return a list of floats with the fitness of each element in the pool. Defaults to rank_mean_test_score_overfit_fitness.
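For example, a custom fitness function that only rewards the mean test score could be sketched as follows (illustrative, following the contract described above):
def mean_test_score_fitness(pool):\n    # Higher mean test score -> higher fitness\n    return [element[\"mean_test_score\"] for element in pool]\n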
Attributes:
-
estimator_
(object
) \u2013 The fitted estimator.
-
support_
(array of shape (n_features,)
) \u2013 The mask of selected features.
-
best_solution_
(dict
) \u2013 The best solution found by the genetic algorithm. It is a dictionary with the following keys - features: list of int The features selected for this element. - mean_test_score: float The mean test score of the element. - mean_train_score: float The mean train score of the element. - train_scores_per_fold: list of float The train score of each fold. - test_scores_per_fold: list of float The test score of each fold. - cv_importances: list of array The importances of each fold. - mean_cv_importances: array The mean importances of each fold.
-
best_solutions_
(list of dict
) \u2013 The best solutions found by the genetic algorithm at each generation. Each element is defined as in best_solution_
.
Examples:
>>> from felimination.ga import HybridImportanceGACVFeatureSelector\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.linear_model import LogisticRegression\n>>> X, y = make_classification(\n    n_samples=1000,\n    n_features=10,\n    n_classes=2,\n    n_redundant=0,\n    n_clusters_per_class=1,\n    random_state=42,\n)\n>>> estimator = LogisticRegression(random_state=42)\n>>> selector = HybridImportanceGACVFeatureSelector(\n    estimator,\n    random_state=42,\n    init_avg_features_num=2,\n    init_std_features_num=1,\n)\n>>> selector = selector.fit(X, y)\n>>> selector.support_\narray([ True, True, True, True, True, False, False, False, False,\n       False])\n
Source code in felimination/ga.py
def __init__(\n self,\n estimator: BaseEstimator | LogisticRegression,\n *,\n cv=5,\n scoring=None,\n random_state=None,\n n_jobs=None,\n importance_getter=\"auto\",\n min_n_features_to_select=1,\n init_avg_features_num=15,\n init_std_features_num=5,\n pool_size=20,\n is_parent_selection_chance_proportional_to_fitness=True,\n n_children_cross_over=5,\n n_parents_cross_over=2,\n n_mutations=5,\n range_change_n_features_mutation=(-2, 3),\n range_randomly_swapped_features_mutation=(1, 4),\n max_generations=100,\n patience=5,\n callbacks=None,\n fitness_function=rank_mean_test_score_overfit_fitness,\n) -> None:\n self.estimator = estimator\n self.cv = cv\n self.scoring = scoring\n self.random_state = random_state\n self.n_jobs = n_jobs\n self.importance_getter = importance_getter\n self.min_n_features_to_select = min_n_features_to_select\n self.init_avg_features_num = init_avg_features_num\n self.init_std_features_num = init_std_features_num\n self.pool_size = pool_size\n self.n_children_cross_over = n_children_cross_over\n self.is_parent_selection_chance_proportional_to_fitness = (\n is_parent_selection_chance_proportional_to_fitness\n )\n self.n_parents_cross_over = n_parents_cross_over\n self.n_mutations = n_mutations\n self.range_change_n_features_mutation = range_change_n_features_mutation\n self.range_randomly_swapped_features_mutation = (\n range_randomly_swapped_features_mutation\n )\n self.max_generations = max_generations\n self.patience = patience\n self.callbacks = callbacks\n self.fitness_function = fitness_function\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.decision_function","title":"decision_function(X)
","text":"Compute the decision function of X
.
Parameters:
Returns:
Source code in felimination/ga.py
@available_if(_estimator_has(\"decision_function\"))\ndef decision_function(self, X):\n \"\"\"Compute the decision function of ``X``.\n\n Parameters\n ----------\n X : {array-like or sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n score : array, shape = [n_samples, n_classes] or [n_samples]\n The decision function of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification produce an array of shape\n [n_samples].\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.decision_function(self.transform(X))\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.fit","title":"fit(X, y, groups=None, **fit_params)
","text":"Fit the selector and then the underlying estimator on the selected features.
Parameters:
Returns:
-
self
( object
) \u2013 Fitted estimator.
Source code in felimination/ga.py
def fit(self, X, y, groups=None, **fit_params):\n \"\"\"Fit the selector and then the underlying estimator on the selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n y : array-like of shape (n_samples,)\n The target values.\n **fit_params : dict\n Additional parameters passed to the `fit` method of the underlying\n estimator.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._validate_params()\n tags = self._get_tags()\n self._validate_data(\n X,\n y,\n accept_sparse=\"csc\",\n ensure_min_features=2,\n force_all_finite=not tags.get(\"allow_nan\", True),\n multi_output=True,\n dtype=None,\n )\n\n # Initialization\n cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n scorer = check_scoring(self.estimator, scoring=self.scoring)\n n_features = X.shape[1]\n if self.min_n_features_to_select is None:\n min_n_features_to_select = n_features // 2\n elif isinstance(self.min_n_features_to_select, Integral): # int\n min_n_features_to_select = self.min_n_features_to_select\n else: # float\n min_n_features_to_select = int(n_features * self.min_n_features_to_select)\n\n if isinstance(X, pd.DataFrame):\n all_features = X.columns.to_list()\n else:\n all_features = list(range(n_features))\n\n np.random.seed(self.random_state)\n\n # Create the initial pool of solutions\n pool = [\n {\n \"features\": list(\n np.random.choice(\n all_features,\n min(\n max(\n int(\n np.random.normal(\n self.init_avg_features_num,\n self.init_std_features_num,\n )\n ),\n min_n_features_to_select,\n ),\n n_features,\n ),\n replace=False,\n )\n ),\n }\n for _ in range(self.pool_size)\n ]\n\n # Evaluate the initial pool of solutions\n pool = self._evaluate_calculate_importances(\n pool, X, y, groups, cv, scorer, **fit_params\n )\n self.best_solutions_ = []\n for _ in range(1, self.max_generations):\n children = self._cross_over(pool)\n children = self._evaluate_calculate_importances(\n children, X, y, groups, cv, scorer, **fit_params\n )\n pool.extend(children)\n mutations = self._mutate(pool, all_features)\n mutations = self._evaluate_calculate_importances(\n mutations, X, y, groups, cv, scorer, **fit_params\n )\n pool.extend(mutations)\n pool_sorted = [\n element\n for _, element in sorted(\n zip(self._calculate_fitness(pool), pool),\n reverse=True,\n key=itemgetter(0),\n )\n ]\n pool = pool_sorted[: self.pool_size]\n self.best_solutions_.append(pool[0])\n\n if self.callbacks:\n for callback in self.callbacks:\n callback(self, pool)\n\n if len(self.best_solutions_) > self.patience:\n if all(\n [\n self.best_solutions_[-1][\"features\"] == solution[\"features\"]\n for solution in self.best_solutions_[-self.patience :]\n ]\n ):\n break\n\n self.estimator_ = clone(self.estimator)\n X_remaining_features = _select_X_with_features(\n X, self.best_solution_[\"features\"]\n )\n self.estimator_.fit(X_remaining_features, y, **fit_params)\n self.support_ = np.array(\n [\n True if feature in self.best_solution_[\"features\"] else False\n for feature in all_features\n ]\n )\n\n return self\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.plot","title":"plot(**kwargs)
","text":"Plot the mean test score and mean train score of the best solution at each generation.
Parameters:
Returns:
Source code in felimination/ga.py
def plot(self, **kwargs):\n \"\"\"Plot the mean test score and mean train score of the best solution at each generation.\n\n Parameters\n ----------\n **kwargs : dict\n Additional parameters passed to seaborn.lineplot. For a list\n of possible options, please visit\n [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html) # noqa\n\n Returns\n -------\n matplotlib.axes.Axes\n The axis where the plot has been plotted.\n \"\"\"\n data_points_to_plot_long_form = []\n for generation, best_solution in enumerate(self.best_solutions_, start=1):\n for set, scores in zip(\n [\"validation\", \"train\"],\n [\n best_solution[\"test_scores_per_fold\"],\n best_solution[\"train_scores_per_fold\"],\n ],\n ):\n for score in scores:\n data_points_to_plot_long_form.append(\n {\"generation\": generation, \"score\": score, \"set\": set}\n )\n df_plot = pd.DataFrame(data_points_to_plot_long_form)\n lineplot_kwargs = dict(\n x=\"generation\",\n y=\"score\",\n hue=\"set\",\n markers=True,\n style=\"set\",\n hue_order=[\"validation\", \"train\"],\n style_order=[\"validation\", \"train\"],\n seed=self.random_state,\n )\n lineplot_kwargs.update(**kwargs)\n return sns.lineplot(data=df_plot, **lineplot_kwargs)\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.predict","title":"predict(X)
","text":"Reduce X to the selected features and predict using the estimator.
Parameters:
-
X
(array of shape [n_samples, n_features]
) \u2013 The input samples.
Returns:
Source code in felimination/ga.py
@available_if(_estimator_has(\"predict\"))\ndef predict(self, X):\n \"\"\"Reduce X to the selected features and predict using the estimator.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n y : array of shape [n_samples]\n The predicted target values.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.predict(self.transform(X))\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.predict_log_proba","title":"predict_log_proba(X)
","text":"Predict class log-probabilities for X.
Parameters:
-
X
(array of shape [n_samples, n_features]
) \u2013 The input samples.
Returns:
Source code in felimination/ga.py
@available_if(_estimator_has(\"predict_log_proba\"))\ndef predict_log_proba(self, X):\n \"\"\"Predict class log-probabilities for X.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n p : array of shape (n_samples, n_classes)\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.predict_log_proba(self.transform(X))\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.predict_proba","title":"predict_proba(X)
","text":"Predict class probabilities for X.
Parameters:
Returns:
Source code in felimination/ga.py
@available_if(_estimator_has(\"predict_proba\"))\ndef predict_proba(self, X):\n \"\"\"Predict class probabilities for X.\n\n Parameters\n ----------\n X : {array-like or sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n p : array of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.predict_proba(self.transform(X))\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.score","title":"score(X, y, **fit_params)
","text":"Reduce X to the selected features and return the score of the estimator.
Parameters:
Returns:
Source code in felimination/ga.py
@available_if(_estimator_has(\"score\"))\ndef score(self, X, y, **fit_params):\n \"\"\"Reduce X to the selected features and return the score of the estimator.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n y : array of shape [n_samples]\n The target values.\n\n **fit_params : dict\n Parameters to pass to the `score` method of the underlying\n estimator.\n\n .. versionadded:: 1.0\n\n Returns\n -------\n score : float\n Score of the underlying base estimator computed with the selected\n features returned by `rfe.transform(X)` and `y`.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.score(self.transform(X), y, **fit_params)\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.rank_mean_test_score_fitness","title":"rank_mean_test_score_fitness(pool)
","text":"Define the fitness function as the rank of the mean test score.
The rank of the mean test score is calculated by ranking the mean test score in ascending order.
Parameters:
Returns:
Source code in felimination/ga.py
def rank_mean_test_score_fitness(pool):\n \"\"\"Define the fitness function as the rank of the mean test score.\n\n The rank of the mean test score is calculated by ranking the mean test score in ascending order.\n\n Parameters\n ----------\n\n pool : list of dict\n Each element in the list is a dictionary with the following keys:\n - features: list of int\n The features selected for this element.\n - mean_test_score: float\n The mean test score of the element.\n - mean_train_score: float\n The mean train score of the element.\n\n Returns\n -------\n fitness : list of float\n The fitness of each element in the pool.\n \"\"\"\n pool_df = pd.DataFrame(pool)\n pool_df[\"rank_mean_test_score\"] = pool_df[\"mean_test_score\"].rank(ascending=True)\n return pool_df[\"rank_mean_test_score\"].to_list()\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.rank_mean_test_score_overfit_fitness","title":"rank_mean_test_score_overfit_fitness(pool)
","text":"Define the fitness function as the sum of the rank of the mean test score and the rank of the overfit.
The mean test score is ranked in descending order (the highest score gets the best rank) and the overfit, calculated as the difference between the mean train score and the mean test score, is ranked in ascending order (the lowest overfit gets the best rank). The fitness is obtained by summing the two ranks and ranking the sum, so that elements with a high test score and a low overfit obtain the highest fitness.
Parameters:
Returns:
Source code in felimination/ga.py
def rank_mean_test_score_overfit_fitness(pool):\n \"\"\"Define the fitness function as the sum of the rank of the mean test score and the rank of the\n overfit.\n\n The rank of the mean test score is calculated by ranking the mean test score in ascending order.\n The rank of the overfit is calculated by ranking the overfit in ascending order.\n The overfit is calculated as the difference between the mean train score and the mean test score.\n The fitness is the sum of the rank of the mean test score and the rank of the overfit.\n\n Parameters\n ----------\n pool : list of dict\n Each element in the list is a dictionary with the following keys:\n - features: list of int\n The features selected for this element.\n - mean_test_score: float\n The mean test score of the element.\n - mean_train_score: float\n The mean train score of the element.\n\n Returns\n -------\n fitness : list of float\n The fitness of each element in the pool.\n \"\"\"\n\n pool_df = pd.DataFrame(pool)\n pool_df[\"rank_mean_test_score\"] = pool_df[\"mean_test_score\"].rank(ascending=False)\n pool_df[\"overfit\"] = pool_df[\"mean_train_score\"] - pool_df[\"mean_test_score\"]\n pool_df[\"rank_overfit\"] = pool_df[\"overfit\"].rank(ascending=True)\n pool_df[\"rank_sum\"] = pool_df[\"rank_mean_test_score\"] + pool_df[\"rank_overfit\"]\n\n pool_df[\"rank_sum_rank\"] = pool_df[\"rank_sum\"].rank(ascending=False)\n return pool_df[\"rank_sum_rank\"].to_list()\n
"},{"location":"reference/importance/","title":"Importance","text":""},{"location":"reference/importance/#felimination.importance.PermutationImportance","title":"PermutationImportance(scoring=None, n_repeats=5, n_jobs=None, random_state=None, sample_weight=None, max_samples=1.0)
","text":"Wrapper around sklearn.inspection.permutation_importance.
Parameters:
-
scoring
(str, callable, list, tuple, or dict
, default: None
) \u2013 Scorer to use. If scoring
represents a single score, one can use: - a single string; - a callable that returns a single value. If scoring
represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric names and the values are the metric scores; - a dictionary with metric names as keys and callables as values. Passing multiple scores to scoring
is more efficient than calling permutation_importance
for each of the scores as it reuses predictions to avoid redundant computation. If None, the estimator's default scorer is used.
-
n_repeats
(int
, default: 5
) \u2013 Number of times to permute a feature.
-
n_jobs
(int or None
, default: None
) \u2013 Number of jobs to run in parallel. The computation is done by computing permutation score for each columns and parallelized over the columns. None
means 1 unless in a :obj:joblib.parallel_backend
context. -1
means using all processors.
-
random_state
(int, RandomState instance
, default: None
) \u2013 Pseudo-random number generator to control the permutations of each feature. Pass an int to get reproducible results across function calls.
-
sample_weight
(array-like of shape (n_samples,)
, default: None
) \u2013 Sample weights used in scoring.
-
max_samples
(int or float
, default: 1.0
) \u2013 The number of samples to draw from X to compute feature importance in each repeat (without replacement). - If int, then draw max_samples
samples. - If float, then draw max_samples * X.shape[0]
samples. - If max_samples
is equal to 1.0
or X.shape[0]
, all samples will be used. While using this option may provide less accurate importance estimates, it keeps the method tractable when evaluating feature importance on large datasets. In combination with n_repeats
, this allows to control the computational speed vs statistical accuracy trade-off of this method.
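For instance, the wrapper can be passed as the importance getter of a selector (a sketch; FeliminationRFECV accepts a callable importance_getter, as documented in the RFE reference):
from felimination.importance import PermutationImportance\nfrom felimination.rfe import FeliminationRFECV\nfrom sklearn.linear_model import LogisticRegression\n\n# Use permutation importance instead of the default 'auto' getter\nselector = FeliminationRFECV(\n    LogisticRegression(),\n    importance_getter=PermutationImportance(n_repeats=5, random_state=42),\n)\n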
Source code in felimination/importance.py
def __init__(\n self,\n scoring=None,\n n_repeats=5,\n n_jobs=None,\n random_state=None,\n sample_weight=None,\n max_samples=1.0,\n):\n self.scoring = scoring\n self.n_repeats = n_repeats\n self.n_jobs = n_jobs\n self.random_state = random_state\n self.sample_weight = sample_weight\n self.max_samples = max_samples\n
"},{"location":"reference/importance/#felimination.importance.PermutationImportance.__call__","title":"__call__(estimator, X, y)
","text":"Computes the permutation importance.
Parameters:
-
estimator
(object
) \u2013 An estimator that has already been fitted and is compatible with scorer.
-
X
((ndarray or DataFrame, shape(n_samples, n_features))
) \u2013 Data on which permutation importance will be computed.
-
y
((array - like or None, shape(n_samples) or (n_samples, n_classes))
) \u2013 Targets for supervised or None
for unsupervised.
Returns:
Source code in felimination/importance.py
def __call__(self, estimator, X, y) -> Any:\n \"\"\"Computes the permutation importance.\n\n Parameters\n ----------\n estimator : object\n An estimator that has already been fitted and is compatible\n with scorer.\n X : ndarray or DataFrame, shape (n_samples, n_features)\n Data on which permutation importance will be computed.\n y : array-like or None, shape (n_samples, ) or (n_samples, n_classes)\n Targets for supervised or `None` for unsupervised.\n\n Returns\n -------\n importances_mean : ndarray of shape (n_features, )\n Mean of feature importance over `n_repeats`.\n \"\"\"\n return permutation_importance(\n estimator,\n X,\n y,\n scoring=self.scoring,\n n_repeats=self.n_repeats,\n n_jobs=self.n_jobs,\n random_state=self.random_state,\n sample_weight=self.sample_weight,\n max_samples=self.max_samples,\n ).importances_mean\n
"},{"location":"tutorials/genetic_algorithms_x_feature_selection/","title":"Genetic Algorithms x Feature Selection","text":"In\u00a0[\u00a0]: Copied! # Install felimination\n! pip install felimination\n
In\u00a0[2]: from sklearn.datasets import make_classification\n\nX, y = make_classification(\n    n_samples=1000,\n    n_features=200,\n    n_informative=6,\n    n_redundant=10,\n    n_clusters_per_class=1,\n    random_state=42,\n    shuffle=False\n)\n
In\u00a0[3]: from sklearn.model_selection import cross_validate, StratifiedKFold\nfrom sklearn.linear_model import LogisticRegression\n\n\n# Define a simple logistic regression model\nmodel = LogisticRegression(random_state=42)\n\n# Perform cross-validation\ncv_results = cross_validate(\n    model,\n    X,\n    y,\n    cv=StratifiedKFold(random_state=42, shuffle=True),\n    scoring=\"roc_auc\",\n    return_train_score=True,\n)\n\ncv_results[\"test_score\"].mean()\n
Out[3]: 0.8561362716271628
In\u00a0[4]: from felimination.ga import HybridImportanceGACVFeatureSelector\nfrom felimination.callbacks import plot_progress_callback\n\n\nselector = HybridImportanceGACVFeatureSelector(\n    model,\n    callbacks=[plot_progress_callback],\n    scoring=\"roc_auc\",\n    cv=StratifiedKFold(random_state=42, shuffle=True),\n    init_avg_features_num=5,\n    min_n_features_to_select=3,\n    pool_size=20,\n    n_children_cross_over=20,\n    n_mutations=20,\n    random_state=42,\n)\nselector.fit(X, y)\n
Out[4]: HybridImportanceGACVFeatureSelector(callbacks=[<function plot_progress_callback at 0x31aaa4fe0>],\n                                    cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n                                    estimator=LogisticRegression(random_state=42),\n                                    init_avg_features_num=5,\n                                    min_n_features_to_select=3,\n                                    n_children_cross_over=20, n_mutations=20,\n                                    random_state=42, scoring='roc_auc')
Notice how model performance increases with the progressive elimination of features.
This is because models with a lot of non-predictive features tend to find patterns even in random noise and end up overfitting; see how the train score and the validation score get closer with the progressive elimination of features.
In\u00a0[5]: sorted(selector.best_solution_['features'])\n
Out[5]: [6, 10, 82, 93, 168]
The features with index <= 15 are relevant, the others are random noise. We see that some of the relevant features are being selected. Nevertheless, we got a good improvement in AUC score:
In\u00a0[6]: selector.best_solution_['mean_test_score']\n
Out[6]: 0.9197176917691768
The best AUC score obtained with feature elimination is now 0.92, that's ~0.06 AUC points obtained from removing useless features.
In\u00a0[8]: selector.transform(X).shape\n
Out[8]: (1000, 5)
"},{"location":"tutorials/genetic_algorithms_x_feature_selection/#genetic-algorithms-x-feature-selection","title":"Genetic Algorithms x Feature Selection\u00b6","text":"This tutorial will show an example of how we can use genetic algorithms applied to feature selection to improve our model performances.
More specifically, this tutorial will illustrate how to perform feature selection using a genetic algorithm as implemented in the class felimination.ga.HybridImportanceGACVFeatureSelector
"},{"location":"tutorials/genetic_algorithms_x_feature_selection/#create-a-dummy-dataset","title":"Create a dummy Dataset\u00b6","text":"For this tutorial we will use a dummy classification dataset created using sklearn.datasets.make_classification
. For this dataset we will have 6
predictive features, 10
redundant and 184
random features.
"},{"location":"tutorials/genetic_algorithms_x_feature_selection/#evaluate-performances-without-feature-elimination","title":"Evaluate performances without feature elimination\u00b6","text":""},{"location":"tutorials/genetic_algorithms_x_feature_selection/#perform-now-feature-elimination","title":"Perform now feature elimination\u00b6","text":""},{"location":"tutorials/recursive_feature_elimination/","title":"Recursive Feature Elimination (RFE)","text":"In\u00a0[\u00a0]: Copied! # Install felimination\n! pip install felimination\n
In\u00a0[2]: from sklearn.datasets import make_classification\n\nX, y = make_classification(\n    n_samples=1000,\n    n_features=200,\n    n_informative=6,\n    n_redundant=10,\n    n_clusters_per_class=1,\n    random_state=42,\n    shuffle=False\n)\n
In\u00a0[3]: from sklearn.model_selection import cross_validate, StratifiedKFold\nfrom sklearn.linear_model import LogisticRegression\n\n\n# Define a simple logistic regression model\nmodel = LogisticRegression(random_state=42)\n\n# Perform cross-validation\ncv_results = cross_validate(\n    model,\n    X,\n    y,\n    cv=StratifiedKFold(random_state=42, shuffle=True),\n    scoring=\"roc_auc\",\n    return_train_score=True,\n)\n\ncv_results[\"test_score\"].mean()\n
Out[3]: 0.8561362716271628
In\u00a0[4]: from felimination.rfe import PermutationImportanceRFECV\nfrom felimination.callbacks import plot_progress_callback\n\n\nselector = PermutationImportanceRFECV(\n    model,\n    step=0.2,\n    callbacks=[plot_progress_callback],\n    scoring=\"roc_auc\",\n    cv=StratifiedKFold(random_state=42, shuffle=True),\n)\nselector.fit(X, y)\n
Out[4]: PermutationImportanceRFECV(callbacks=[<function plot_progress_callback at 0x103583d80>],\n                           cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n                           estimator=LogisticRegression(random_state=42),\n                           scoring='roc_auc', step=0.2)
Notice how model performance increases with the progressive elimination of features.
This is because models with a lot of non-predictive features tend to find patterns even in random noise and end up overfitting; see how the train score and the validation score get closer with the progressive elimination of features.
In\u00a0[5]: import pandas as pd\n\ncv_results_df = pd.DataFrame(selector.cv_results_)\n\ncv_results_df[[\"mean_test_score\", \"n_features\"]].sort_values(\n    \"mean_test_score\", ascending=False\n).head(10)\n
Out[5]:     mean_test_score  n_features\n7          0.944138          44\n6          0.943558          54\n8          0.943018          36\n9          0.942478          29\n5          0.942438          67\n4          0.942058          83\n10         0.939718          24\n11         0.937578          20\n12         0.935838          16\n13         0.935698          13\n The best AUC score obtained with feature elimination is now 0.94, that's 0.08 AUC points gained from fewer features.
If I had to choose a number of features, I would probably go for 13 features, because there the validation score is very close to the train score.
We can do this using the method set_n_features_to_select
. This will change the support of the selector as well as the behavior of the transform
method.
In\u00a0[6]: selector.set_n_features_to_select(13)\nselector.transform(X).shape\n
Out[6]: (1000, 13)
In\u00a0[7]: import numpy as np\n\n# Show the index of the selected features, index <= 15 are relevant\nnp.arange(0, X.shape[1])[selector.support_]\n
Out[7]: array([  1,   2,   3,   7,   8,   9,  10,  69,  80,  82, 155, 186, 197])
We can see from the indices of the selected features that most of the selected features are informative (index <= 15), while some random features are still being selected. Some of the selected features are also redundant with each other.
"},{"location":"tutorials/recursive_feature_elimination/#recursive-feature-elimination-rfe","title":"Recursive Feature Elimination (RFE)\u00b6","text":"This tutorial will show an example of how we can use recursive feature elimination to improve our model performances. More specifically, this tutorial will illustrate how to perform backward recursive feature elimination based on permutation importance using the class felimination.rfe.PermutationImportanceRFECV
"},{"location":"tutorials/recursive_feature_elimination/#create-a-dummy-dataset","title":"Create a dummy Dataset\u00b6","text":"For this tutorial we will use a dummy classification dataset created using sklearn.datasets.make_classification
. For this dataset we will have 6
predictive features, 10
redundant and 184
random features.
"},{"location":"tutorials/recursive_feature_elimination/#evaluate-performances-without-feature-elimination","title":"Evaluate performances without feature elimination\u00b6","text":""},{"location":"tutorials/recursive_feature_elimination/#perform-now-feature-elimination","title":"Perform now feature elimination\u00b6","text":""}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Homepage","text":"This library contains some useful scikit-learn compatible classes for feature selection.
"},{"location":"#features","title":"Features","text":" - Recursive Feature Elimination with Cross Validation using Permutation Importance
- Hybrid Genetic Algorithms x Feature Importance selection
"},{"location":"#requirements","title":"Requirements","text":" - Python 3.7+
- NumPy
- Scikit-learn
- Pandas
"},{"location":"#installation","title":"Installation","text":"In a terminal shell run the following command
pip install felimination\n
"},{"location":"#usage","title":"Usage","text":""},{"location":"#recursive-feature-elimination","title":"Recursive Feature Elimination","text":"In this section it will be illustrated how to use the PermutationImportanceRFECV
class.
from felimination.rfe import PermutationImportanceRFECV\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import make_classification\nimport numpy as np\n\n\nX, y = make_classification(\n n_samples=1000,\n n_features=20,\n n_informative=6,\n n_redundant=10,\n n_clusters_per_class=1,\n random_state=42,\n)\n\nselector = PermutationImportanceRFECV(LogisticRegression(), step=0.3)\n\nselector.fit(X, y)\n\nselector.support_\n# array([False, False, False, False, False, False, False, False, False,\n# False, False, True, False, False, False, False, False, False,\n# False, False])\n\nselector.ranking_\n# array([9, 3, 8, 9, 7, 8, 5, 6, 9, 6, 8, 1, 9, 7, 8, 9, 9, 2, 4, 7])\nselector.plot()\n
It looks like 5
is a good number of features. We can set the number of features to select to 5 without retraining.
selector.set_n_features_to_select(5)\nselector.support_\n# array([False, True, False, False, False, False, True, False, False,\n# False, False, True, False, False, False, False, False, True,\n# True, False])\n
"},{"location":"#genetic-algorithms","title":"Genetic Algorithms","text":"In this section it will be illustrated how to use the HybridImportanceGACVFeatureSelector
class.
from felimination.ga import HybridImportanceGACVFeatureSelector\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import make_classification\nimport numpy as np\n\n# Create dummy dataset\nX, y = make_classification(\n n_samples=1000,\n n_features=20,\n n_informative=6,\n n_redundant=10,\n n_clusters_per_class=1,\n random_state=42,\n)\n\n# Initialize selector\nselector = HybridImportanceGACVFeatureSelector(\n LogisticRegression(random_state=42),\n random_state=42,\n pool_size=5,\n patience=5\n)\n\n# Run optimisation\nselector.fit(X, y)\n\n# Show selected features\nselector.support_\n#array([False, True, False, True, True, False, False, False, True,\n# False, False, False, True, True, True, True, False, True,\n# True, False])\n\n# Show best solution\nselector.best_solution_\n# {'features': [1, 12, 13, 8, 17, 15, 18, 4, 3, 14],\n# 'train_scores_per_fold': [0.88625, 0.89, 0.8825, 0.8925, 0.88625],\n# 'test_scores_per_fold': [0.895, 0.885, 0.885, 0.89, 0.89],\n# 'cv_importances': [array([[ 1.09135972, 1.13502636, 1.12100231, 0.38285736, 0.28944072,\n# 0.04688614, 0.44259813, 0.09832365, 0.10190421, -0.48101593]]),\n# array([[ 1.17345812, 1.29375208, 1.2065342 , 0.40418709, 0.41839714,\n# 0.00447802, 0.466717 , 0.21733829, -0.00842075, -0.50078996]]),\n# array([[ 1.15416104, 1.18458564, 1.18083266, 0.37071253, 0.22842685,\n# 0.1087814 , 0.44446793, 0.12740545, 0.00621562, -0.54064287]]),\n# array([[ 1.26011643, 1.36996058, 1.30481424, 0.48183549, 0.40589887,\n# -0.01849671, 0.45606913, 0.18330816, 0.03667055, -0.50869557]]),\n# array([[ 1.18227123, 1.28988253, 1.2496398 , 0.50754295, 0.38942303,\n# -0.01725074, 0.4481891 , 0.19472963, 0.10034316, -0.50131192]])],\n# 'mean_train_score': 0.8875,\n# 'mean_test_score': 0.889,\n# 'mean_cv_importances': array([ 1.17227331, 1.25464144, 1.21256464, 0.42942709, 0.34631732,\n# 0.02487962, 0.45160826, 0.16422104, 0.04734256, -0.50649125])}\n\n# Show progress as a plot\nselector.plot()\n
It looks like the optimisation process converged after 2 steps: since the best score did not improve for 5 (=patience
) consecutive steps, the optimisation stopped early.
"},{"location":"#license","title":"License","text":"This project is licensed under the BSD 3-Clause License - see the LICENSE.md file for details
"},{"location":"#acknowledgments","title":"Acknowledgments","text":""},{"location":"reference/RFE/","title":"RFE","text":"Module with tools to perform feature selection.
This module contains the following classes:
FeliminationRFECV
: base class for feature selection. PermutationImportanceRFECV
: recursive feature elimination with cross-validation based on permutation importance.
"},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV","title":"FeliminationRFECV(estimator, *, step=1, n_features_to_select=1, cv=None, scoring=None, random_state=None, verbose=0, n_jobs=None, importance_getter='auto', callbacks=None)
","text":" Bases: RFE
Perform recursive feature elimination with cross-validation following scikit-learn standards.
It differs from scikit-learn's RFECV in the following ways:
- It supports an
importance_getter
function that also uses a validation set to compute the feature importances. This makes it possible to use importance measures like permutation importance or SHAP. - Instead of using cross-validation to select the number of features, it uses cross-validation to get a more accurate estimate of the feature importances. This means that the number of features to select has to be set during initialization, similarly to RFE.
- When
step
is a float value, it removes a percentage of the number of remaining features rather than of the total number as in RFE/RFECV. This makes it possible to drop big chunks of features at the beginning of the RFE process and to slow down towards the end (see the schedule sketch after the algorithm outline below). - Has a plotting function
- Adds information about the number of features selected at each step in the attribute
cv_results_
- Allows changing the number of features to select after fitting.
Other than that, it is a copy-paste of RFE, so credit goes to scikit-learn.
The feature selection algorithm proceeds as follows:
while n_features > n_features_to_select:\n - The estimator is trained on the selected features and the score is\n computed using cross validation.\n - feature importance is computed for each validation fold on the validation\n set and then averaged.\n - The least important features are pruned.\n - The pruned features are removed from the dataset.\n
Parameters:
-
estimator
(``Estimator`` instance
) \u2013 A supervised learning estimator with a fit
method.
-
step
(int or float
, default: 1
) \u2013 If greater than or equal to 1, then step
corresponds to the (integer) number of features to remove at each iteration. If within (0.0, 1.0), then step
corresponds to the percentage (rounded down) of remaining features to remove at each iteration. Note that the last iteration may remove fewer than step
features in order to reach min_features_to_select
.
-
n_features_to_select
(int or float
, default: None
) \u2013 The number of features to select. If None
, half of the features are selected. If integer, the parameter is the absolute number of features to select. If float between 0 and 1, it is the fraction of the features to select.
-
cv
(int, cross-validation generator or an iterable
, default: None
) \u2013 Determines the cross-validation splitting strategy. Possible inputs for cv are:
- None, to use the default 5-fold cross-validation,\n- integer, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n
For integer/None inputs, if the estimator is a classifier and y
is binary or multiclass, ~sklearn.model_selection.StratifiedKFold
is used. If the estimator is not a classifier or if y
is neither binary nor multiclass, ~sklearn.model_selection.KFold
is used.
Refer :ref:User Guide <cross_validation>
for the various cross-validation strategies that can be used here.
-
scoring
((str, callable or None)
, default: None
) \u2013 A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)
.
-
verbose
(int
, default: 0
) \u2013 Controls verbosity of output.
-
n_jobs
(int or None
, default: None
) \u2013 Number of cores to run in parallel while fitting across folds. None
means 1 unless in a :obj:joblib.parallel_backend
context. -1
means using all processors.
-
importance_getter
(str or callable
, default: 'auto'
) \u2013 If 'auto', uses the feature importance either through a coef_
or feature_importances_
attributes of estimator.
Also accepts a string that specifies an attribute name/path for extracting feature importance. For example, give regressor_.coef_
in case of ~sklearn.compose.TransformedTargetRegressor
or named_steps.clf.feature_importances_
in case of ~sklearn.pipeline.Pipeline
with its last step named clf
.
If callable
, overrides the default feature importance getter. The callable is passed the fitted estimator and the validation set (X_val, y_val, estimator) and should return an importance value for each feature.
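For illustration, a custom importance getter might look like the sketch below. The argument order is an assumption here (estimator first, mirroring sklearn.inspection.permutation_importance; the exact order should be checked against the felimination source), and the accuracy scorer is an illustrative choice. The only contract is that the callable returns one importance value per feature:
import numpy as np\nfrom sklearn.metrics import get_scorer\n\ndef single_shuffle_importance(estimator, X_val, y_val):\n    # Hypothetical getter: the importance of a feature is the drop in\n    # validation accuracy when that feature is shuffled once.\n    # Assumes X_val is a NumPy array.\n    scorer = get_scorer(\"accuracy\")\n    baseline = scorer(estimator, X_val, y_val)\n    rng = np.random.default_rng(0)\n    importances = []\n    for j in range(X_val.shape[1]):\n        X_perm = X_val.copy()\n        X_perm[:, j] = rng.permutation(X_perm[:, j])\n        importances.append(baseline - scorer(estimator, X_perm, y_val))\n    return np.array(importances)\n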
Attributes:
-
classes_
(ndarray of shape (n_classes,)
) \u2013 The classes labels. Only available when estimator
is a classifier.
-
estimator_
(``Estimator`` instance
) \u2013 The fitted estimator used to select features.
-
cv_results_
(dict of ndarrays
) \u2013 A dict with keys: n_features : ndarray of shape (n_subsets_of_features,) The number of features used at that step. split(k)_test_score : ndarray of shape (n_subsets_of_features,) The cross-validation scores across (k)th fold. mean_test_score : ndarray of shape (n_subsets_of_features,) Mean of scores over the folds. std_test_score : ndarray of shape (n_subsets_of_features,) Standard deviation of scores over the folds. split(k)_train_score : ndarray of shape (n_subsets_of_features,) The cross-validation scores across (k)th fold. mean_train_score : ndarray of shape (n_subsets_of_features,) Mean of scores over the folds. std_train_score : ndarray of shape (n_subsets_of_features,) Standard deviation of scores over the folds.
-
n_features_
(int
) \u2013 The number of selected features.
-
n_features_in_
(int
) \u2013 Number of features seen during :term:fit
. Only defined if the underlying estimator exposes such an attribute when fit.
-
feature_names_in_
(ndarray of shape (`n_features_in_`,)
) \u2013 Names of features seen during :term:fit
. Defined only when X
has feature names that are all strings.
-
ranking_
(ndarray of shape (n_features,)
) \u2013 The feature ranking, such that ranking_[i]
corresponds to the ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1.
-
support_
(ndarray of shape (n_features,)
) \u2013 The mask of selected features.
-
callbacks
(list of callable, default=None
) \u2013 List of callables to be called at the end of each step of the feature selection. Each callable should accept two parameters: the selector and the importances computed at that step.
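As a sketch of this callback contract (the function name is illustrative), a callable that logs progress at each step could look like this:
def log_step(selector, importances):\n    # Called at the end of each elimination step with the selector\n    # and the importances computed at that step\n    print(f\"{selector.cv_results_['n_features'][-1]} features remaining\")\n
It could then be passed at initialization with callbacks=[log_step].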
Examples:
The following example shows how to retrieve the 5 most informative features in the Friedman #1 dataset.
>>> from felimination.rfe import FeliminationRFECV\n>>> from felimination.importance import PermutationImportance\n>>> from sklearn.datasets import make_friedman1\n>>> from sklearn.svm import SVR\n>>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n>>> estimator = SVR(kernel=\"linear\")\n>>> selector = selector = FeliminationRFECV(\n estimator,\n step=1,\n cv=5,\n n_features_to_select=5,\n importance_getter=PermutationImportance()\n)\n>>> selector = selector.fit(X, y)\n>>> selector.support_\narray([ True, True, True, True, True, False, False, False, False,\n False])\n>>> selector.ranking_\narray([1, 1, 1, 1, 1, 6, 3, 4, 2, 5])\n
Source code in felimination/rfe.py
def __init__(\n self,\n estimator: BaseEstimator | LogisticRegression,\n *,\n step=1,\n n_features_to_select=1,\n cv=None,\n scoring=None,\n random_state=None,\n verbose=0,\n n_jobs=None,\n importance_getter=\"auto\",\n callbacks=None,\n) -> None:\n self.cv = cv\n self.scoring = scoring\n self.n_jobs = n_jobs\n self.random_state = random_state\n self.callbacks = callbacks\n super().__init__(\n estimator,\n n_features_to_select=n_features_to_select,\n step=step,\n verbose=verbose,\n importance_getter=importance_getter,\n )\n
"},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV.fit","title":"fit(X, y, groups=None, **fit_params)
","text":"Fit the RFE model and then the underlying estimator on the selected features.
Parameters:
Returns:
-
self
( object
) \u2013 Fitted estimator.
Source code in felimination/rfe.py
def fit(self, X, y, groups=None, **fit_params):\n \"\"\"Fit the RFE model and then the underlying estimator on the selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n y : array-like of shape (n_samples,)\n The target values.\n **fit_params : dict\n Additional parameters passed to the `fit` method of the underlying\n estimator.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._validate_params()\n tags = self._get_tags()\n self._validate_data(\n X,\n y,\n accept_sparse=\"csc\",\n ensure_min_features=2,\n force_all_finite=not tags.get(\"allow_nan\", True),\n multi_output=True,\n dtype=None,\n )\n\n # Initialization\n cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n scorer = check_scoring(self.estimator, scoring=self.scoring)\n n_features = X.shape[1]\n\n if self.n_features_to_select is None:\n n_features_to_select = n_features // 2\n elif isinstance(self.n_features_to_select, Integral): # int\n n_features_to_select = self.n_features_to_select\n else: # float\n n_features_to_select = int(n_features * self.n_features_to_select)\n\n support_ = np.ones(n_features, dtype=bool)\n ranking_ = np.ones(n_features, dtype=int)\n\n current_number_of_features = n_features\n self.cv_results_ = defaultdict(list)\n\n # Elimination\n while current_number_of_features > n_features_to_select:\n # Select remaining features\n X_remaining_features, features = self._select_X_with_remaining_features(\n X, support=support_, n_features=n_features\n )\n\n if self.verbose > 0:\n print(\n \"Fitting estimator with %d features.\" % current_number_of_features\n )\n\n # Train model, score it and get importances\n if effective_n_jobs(self.n_jobs) == 1:\n parallel, func = list, _train_score_get_importance\n else:\n parallel = Parallel(n_jobs=self.n_jobs)\n func = delayed(_train_score_get_importance)\n\n scores_importances = parallel(\n func(\n self.estimator,\n X_remaining_features,\n y,\n train,\n test,\n scorer,\n self.importance_getter,\n )\n for train, test in cv.split(X_remaining_features, y, groups)\n )\n train_scores_per_fold = [\n score_importance[0] for score_importance in scores_importances\n ]\n test_scores_per_fold = [\n score_importance[1] for score_importance in scores_importances\n ]\n cv_importances = [\n score_importance[2] for score_importance in scores_importances\n ]\n mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n ranks = np.argsort(mean_importances)\n\n # for sparse case ranks is matrix\n ranks = np.ravel(ranks)\n\n if 0.0 < self.step < 1.0:\n step = int(max(1, self.step * current_number_of_features))\n else:\n step = int(self.step)\n\n # Eliminate the worst features\n threshold = min(step, current_number_of_features - n_features_to_select)\n\n support_[features[ranks][:threshold]] = False\n ranking_[np.logical_not(support_)] += 1\n\n # Update cv scores\n for train_or_test, scores_per_fold in zip(\n [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n ):\n for i, score in enumerate(scores_per_fold):\n self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n np.mean(scores_per_fold)\n )\n self.cv_results_[f\"std_{train_or_test}_score\"].append(\n np.std(scores_per_fold)\n )\n self.cv_results_[\"n_features\"].append(current_number_of_features)\n if self.callbacks:\n for callback in self.callbacks:\n callback(self, cv_importances)\n\n current_number_of_features = 
np.sum(support_)\n # Set final attributes\n\n # Estimate performances of final model\n X_remaining_features, features = self._select_X_with_remaining_features(\n X, support=support_, n_features=n_features\n )\n\n cv_scores = cross_validate(\n self.estimator,\n X_remaining_features,\n y,\n groups=groups,\n scoring=scorer,\n cv=cv,\n n_jobs=self.n_jobs,\n fit_params=fit_params,\n return_train_score=True,\n )\n self.cv_results_[\"n_features\"].append(current_number_of_features)\n # Update cv scores\n for train_or_test in [\"train\", \"test\"]:\n scores_per_fold = cv_scores[f\"{train_or_test}_score\"]\n for i, score in enumerate(scores_per_fold):\n self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n np.mean(scores_per_fold)\n )\n self.cv_results_[f\"std_{train_or_test}_score\"].append(\n np.std(scores_per_fold)\n )\n\n if self.callbacks:\n for callback in self.callbacks:\n callback(self, cv_importances)\n\n X_remaining_features, features = self._select_X_with_remaining_features(\n X, support=support_, n_features=n_features\n )\n\n self.estimator_ = clone(self.estimator)\n self.estimator_.fit(X_remaining_features, y, **fit_params)\n\n self.n_features_ = support_.sum()\n self.support_ = support_\n self.ranking_ = ranking_\n self.cv_results_ = dict(self.cv_results_)\n return self\n
"},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV.plot","title":"plot(**kwargs)
","text":"Plot a feature selection plot with number of features
Parameters:
Returns:
Source code in felimination/rfe.py
def plot(self, **kwargs):\n \"\"\"Plot a feature selection plot with number of features\n\n Parameters\n ----------\n **kwargs : dict\n Additional parameters passed to seaborn.lineplot. For a list\n of possible options, please visit\n [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html) # noqa\n\n Returns\n -------\n matplotlib.axes.Axes\n The axis where the plot has been plotted.\n \"\"\"\n check_is_fitted(self)\n df = pd.DataFrame(self.cv_results_)\n split_score_cols = [col for col in df if \"split\" in col]\n df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n id_vars=[\"n_features\"],\n value_vars=split_score_cols,\n var_name=\"split\",\n value_name=\"score\",\n )\n df_long_form[\"set\"] = np.where(\n df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n )\n lineplot_kwargs = dict(\n x=\"n_features\",\n y=\"score\",\n hue=\"set\",\n markers=True,\n style=\"set\",\n hue_order=[\"validation\", \"train\"],\n style_order=[\"validation\", \"train\"],\n seed=self.random_state,\n )\n lineplot_kwargs.update(**kwargs)\n ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n ax.set_xticks(df.n_features)\n return ax\n
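Since plot returns a matplotlib Axes and forwards keyword arguments to seaborn.lineplot, the figure can be tweaked and saved like any other matplotlib plot. A usage sketch, assuming a fitted selector:
ax = selector.plot(palette=\"colorblind\")\nax.set_ylabel(\"score\")\nax.figure.savefig(\"rfe_progress.png\", dpi=150)\n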
"},{"location":"reference/RFE/#felimination.rfe.FeliminationRFECV.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)
","text":"Changes the number of features to select after fitting.
The underlying estimator will not be retrained. So this method will not alter the behavior of predict/predict_proba but it will change the behavior of transform and get_feature_names_out.
Parameters:
Returns:
-
self
( object
) \u2013 Fitted estimator.
Raises:
Source code in felimination/rfe.py
def set_n_features_to_select(self, n_features_to_select):\n \"\"\"Changes the number of features to select after fitting.\n\n The underlying estimator **will not be retrained**. So this method will not\n alter the behavior of predict/predict_proba but it will change the behavior\n of transform and get_feature_names_out.\n\n Parameters\n ----------\n n_features_to_select : int\n The number of features to select. Must be a value among\n `cv_results_[\"n_features\"]`\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Raises\n ------\n ValueError\n When the number of features to select has not been tried during the\n feature selection procedure.\n \"\"\"\n check_is_fitted(self)\n if n_features_to_select not in self.cv_results_[\"n_features\"]:\n raise ValueError(\n f\"This selector has not been fitted up with {n_features_to_select}, \"\n f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n \"refit the selector changing the step parameter of the n_features_to_select\"\n )\n support_ = np.zeros_like(self.support_, dtype=bool)\n support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n self.support_ = support_\n return self\n
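A usage sketch for this method, assuming a fitted selector and X being the training matrix: only feature counts that were actually visited during elimination, i.e. values present in cv_results_["n_features"], are accepted:
print(selector.cv_results_[\"n_features\"])  # e.g. [20, 14, 10, 7, 5]\nselector.set_n_features_to_select(7)  # 7 was visited, so this works\nselector.transform(X).shape  # transform now keeps 7 columns\n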
"},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV","title":"PermutationImportanceRFECV(estimator, *, step=1, n_features_to_select=1, cv=None, scoring=None, verbose=0, n_jobs=None, n_repeats=5, random_state=None, sample_weight=None, max_samples=1.0, callbacks=None)
","text":" Bases: FeliminationRFECV
Preset of FeliminationRFECV using permutation importance as importance getter.
It differs from scikit-learn's RFECV in the following ways:
- It supports an
importance_getter
function that also uses a validation set to compute the feature importances. This makes it possible to use importance measures like permutation importance or SHAP. - Instead of using cross-validation to select the number of features, it uses cross-validation to get a more accurate estimate of the feature importances. This means that the number of features to select has to be set during initialization, similarly to RFE.
- When
step
is a float value, it removes a percentage of the number of remaining features rather than of the total number as in RFE/RFECV. This makes it possible to drop big chunks of features at the beginning of the RFE process and to slow down towards the end of the process. - Has a plotting function
- Adds information about the number of features selected at each step in the attribute
cv_results_
- Allows changing the number of features to select after fitting.
Other than that, it is a copy-paste of RFE, so credit goes to scikit-learn.
The feature selection algorithm proceeds as follows:
while n_features > n_features_to_select:\n - The estimator is trained on the selected features and the score is\n computed using cross validation.\n - feature importance is computed for each validation fold on the validation\n set and then averaged.\n - The least important features are pruned.\n - The pruned features are removed from the dataset.\n
Parameters:
-
estimator
(``Estimator`` instance
) \u2013 A supervised learning estimator with a fit
method.
-
step
(int or float
, default: 1
) \u2013 If greater than or equal to 1, then step
corresponds to the (integer) number of features to remove at each iteration. If within (0.0, 1.0), then step
corresponds to the percentage (rounded down) of remaining features to remove at each iteration. Note that the last iteration may remove fewer than step
features in order to reach min_features_to_select
.
-
n_features_to_select
(int or float
, default: None
) \u2013 The number of features to select. If None
, half of the features are selected. If integer, the parameter is the absolute number of features to select. If float between 0 and 1, it is the fraction of the features to select.
-
cv
(int, cross-validation generator or an iterable
, default: None
) \u2013 Determines the cross-validation splitting strategy. Possible inputs for cv are:
- None, to use the default 5-fold cross-validation,
- integer, to specify the number of folds.
- :term:
CV splitter
, - An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if the estimator is a classifier and y
is binary or multiclass, ~sklearn.model_selection.StratifiedKFold
is used. If the estimator is not a classifier or if y
is neither binary nor multiclass, ~sklearn.model_selection.KFold
is used.
Refer :ref:User Guide <cross_validation>
for the various cross-validation strategies that can be used here.
-
scoring
((str, callable or None)
, default: None
) \u2013 A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)
.
-
verbose
(int
, default: 0
) \u2013 Controls verbosity of output.
-
n_jobs
(int or None
, default: None
) \u2013 Number of cores to run in parallel while fitting across folds. None
means 1 unless in a :obj:joblib.parallel_backend
context. -1
means using all processors.
-
n_repeats
(int
, default: 5
) \u2013 Number of times to permute a feature.
-
random_state
(int, RandomState instance
, default: None
) \u2013 Pseudo-random number generator to control the permutations of each feature. Pass an int to get reproducible results across function calls.
-
sample_weight
(array-like of shape (n_samples,)
, default: None
) \u2013 Sample weights used in scoring.
-
max_samples
(int or float
, default: 1.0
) \u2013 The number of samples to draw from X to compute feature importance in each repeat (without replacement). - If int, then draw max_samples
samples. - If float, then draw max_samples * X.shape[0]
samples. - If max_samples
is equal to 1.0
or X.shape[0]
, all samples will be used. While using this option may provide less accurate importance estimates, it keeps the method tractable when evaluating feature importance on large datasets. In combination with n_repeats
, this allows to control the computational speed vs statistical accuracy trade-off of this method.
-
callbacks
(list of callable
, default: None
) \u2013 List of callables to be called at the end of each step of the feature selection. Each callable should accept two parameters: the selector and the importances computed at that step.
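Putting n_repeats and max_samples together, a configuration sketch that trades some statistical accuracy for speed on a larger dataset might look like this (the parameter values are illustrative):
from felimination.rfe import PermutationImportanceRFECV\nfrom sklearn.linear_model import LogisticRegression\n\nselector = PermutationImportanceRFECV(\n    LogisticRegression(),\n    step=0.2,  # drop 20% of the remaining features per round\n    n_repeats=3,  # fewer permutations per feature\n    max_samples=0.5,  # compute importances on half of the samples\n    random_state=42,\n)\n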
Attributes:
-
classes_
(ndarray of shape (n_classes,)
) \u2013 The classes labels. Only available when estimator
is a classifier.
-
estimator_
(``Estimator`` instance
) \u2013 The fitted estimator used to select features.
-
cv_results_
(dict of ndarrays
) \u2013 A dict with keys: n_features : ndarray of shape (n_subsets_of_features,) The number of features used at that step. split(k)_test_score : ndarray of shape (n_subsets_of_features,) The cross-validation scores across (k)th fold. mean_test_score : ndarray of shape (n_subsets_of_features,) Mean of scores over the folds. std_test_score : ndarray of shape (n_subsets_of_features,) Standard deviation of scores over the folds. split(k)_train_score : ndarray of shape (n_subsets_of_features,) The cross-validation scores across (k)th fold. mean_train_score : ndarray of shape (n_subsets_of_features,) Mean of scores over the folds. std_train_score : ndarray of shape (n_subsets_of_features,) Standard deviation of scores over the folds.
-
n_features_
(int
) \u2013 The number of selected features.
-
n_features_in_
(int
) \u2013 Number of features seen during :term:fit
. Only defined if the underlying estimator exposes such an attribute when fit.
-
feature_names_in_
(ndarray of shape (`n_features_in_`,)
) \u2013 Names of features seen during :term:fit
. Defined only when X
has feature names that are all strings.
-
ranking_
(ndarray of shape (n_features,)
) \u2013 The feature ranking, such that ranking_[i]
corresponds to the ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1.
-
support_
(ndarray of shape (n_features,)
) \u2013 The mask of selected features.
Examples:
The following example shows how to retrieve the 5 most informative features in the Friedman #1 dataset.
>>> from felimination.rfe import PermutationImportanceRFECV\n>>> from sklearn.datasets import make_friedman1\n>>> from sklearn.svm import SVR\n>>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n>>> estimator = SVR(kernel=\"linear\")\n>>> selector = selector = PermutationImportanceRFECV(\n estimator,\n step=1,\n cv=5,\n n_features_to_select=5,\n )\n>>> selector = selector.fit(X, y)\n>>> selector.support_\narray([ True, True, True, True, True, False, False, False, False,\n False])\n>>> selector.ranking_\narray([1, 1, 1, 1, 1, 6, 3, 4, 2, 5])\n
Source code in felimination/rfe.py
def __init__(\n self,\n estimator: BaseEstimator | LogisticRegression,\n *,\n step=1,\n n_features_to_select=1,\n cv=None,\n scoring=None,\n verbose=0,\n n_jobs=None,\n n_repeats=5,\n random_state=None,\n sample_weight=None,\n max_samples=1.0,\n callbacks=None,\n) -> None:\n self.n_repeats = n_repeats\n self.sample_weight = sample_weight\n self.max_samples = max_samples\n super().__init__(\n estimator,\n step=step,\n n_features_to_select=n_features_to_select,\n cv=cv,\n random_state=random_state,\n scoring=scoring,\n verbose=verbose,\n n_jobs=n_jobs,\n callbacks=callbacks,\n importance_getter=PermutationImportance(\n scoring=scoring,\n n_repeats=n_repeats,\n # Better not to do double parallelization\n n_jobs=1,\n random_state=random_state,\n sample_weight=sample_weight,\n max_samples=max_samples,\n ),\n )\n
"},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV.fit","title":"fit(X, y, groups=None, **fit_params)
","text":"Fit the RFE model and then the underlying estimator on the selected features.
Parameters:
Returns:
-
self
( object
) \u2013 Fitted estimator.
Source code in felimination/rfe.py
def fit(self, X, y, groups=None, **fit_params):\n \"\"\"Fit the RFE model and then the underlying estimator on the selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n y : array-like of shape (n_samples,)\n The target values.\n **fit_params : dict\n Additional parameters passed to the `fit` method of the underlying\n estimator.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._validate_params()\n tags = self._get_tags()\n self._validate_data(\n X,\n y,\n accept_sparse=\"csc\",\n ensure_min_features=2,\n force_all_finite=not tags.get(\"allow_nan\", True),\n multi_output=True,\n dtype=None,\n )\n\n # Initialization\n cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n scorer = check_scoring(self.estimator, scoring=self.scoring)\n n_features = X.shape[1]\n\n if self.n_features_to_select is None:\n n_features_to_select = n_features // 2\n elif isinstance(self.n_features_to_select, Integral): # int\n n_features_to_select = self.n_features_to_select\n else: # float\n n_features_to_select = int(n_features * self.n_features_to_select)\n\n support_ = np.ones(n_features, dtype=bool)\n ranking_ = np.ones(n_features, dtype=int)\n\n current_number_of_features = n_features\n self.cv_results_ = defaultdict(list)\n\n # Elimination\n while current_number_of_features > n_features_to_select:\n # Select remaining features\n X_remaining_features, features = self._select_X_with_remaining_features(\n X, support=support_, n_features=n_features\n )\n\n if self.verbose > 0:\n print(\n \"Fitting estimator with %d features.\" % current_number_of_features\n )\n\n # Train model, score it and get importances\n if effective_n_jobs(self.n_jobs) == 1:\n parallel, func = list, _train_score_get_importance\n else:\n parallel = Parallel(n_jobs=self.n_jobs)\n func = delayed(_train_score_get_importance)\n\n scores_importances = parallel(\n func(\n self.estimator,\n X_remaining_features,\n y,\n train,\n test,\n scorer,\n self.importance_getter,\n )\n for train, test in cv.split(X_remaining_features, y, groups)\n )\n train_scores_per_fold = [\n score_importance[0] for score_importance in scores_importances\n ]\n test_scores_per_fold = [\n score_importance[1] for score_importance in scores_importances\n ]\n cv_importances = [\n score_importance[2] for score_importance in scores_importances\n ]\n mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n ranks = np.argsort(mean_importances)\n\n # for sparse case ranks is matrix\n ranks = np.ravel(ranks)\n\n if 0.0 < self.step < 1.0:\n step = int(max(1, self.step * current_number_of_features))\n else:\n step = int(self.step)\n\n # Eliminate the worst features\n threshold = min(step, current_number_of_features - n_features_to_select)\n\n support_[features[ranks][:threshold]] = False\n ranking_[np.logical_not(support_)] += 1\n\n # Update cv scores\n for train_or_test, scores_per_fold in zip(\n [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n ):\n for i, score in enumerate(scores_per_fold):\n self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n np.mean(scores_per_fold)\n )\n self.cv_results_[f\"std_{train_or_test}_score\"].append(\n np.std(scores_per_fold)\n )\n self.cv_results_[\"n_features\"].append(current_number_of_features)\n if self.callbacks:\n for callback in self.callbacks:\n callback(self, cv_importances)\n\n current_number_of_features = 
np.sum(support_)\n # Set final attributes\n\n # Estimate performances of final model\n X_remaining_features, features = self._select_X_with_remaining_features(\n X, support=support_, n_features=n_features\n )\n\n cv_scores = cross_validate(\n self.estimator,\n X_remaining_features,\n y,\n groups=groups,\n scoring=scorer,\n cv=cv,\n n_jobs=self.n_jobs,\n fit_params=fit_params,\n return_train_score=True,\n )\n self.cv_results_[\"n_features\"].append(current_number_of_features)\n # Update cv scores\n for train_or_test in [\"train\", \"test\"]:\n scores_per_fold = cv_scores[f\"{train_or_test}_score\"]\n for i, score in enumerate(scores_per_fold):\n self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n np.mean(scores_per_fold)\n )\n self.cv_results_[f\"std_{train_or_test}_score\"].append(\n np.std(scores_per_fold)\n )\n\n if self.callbacks:\n for callback in self.callbacks:\n callback(self, cv_importances)\n\n X_remaining_features, features = self._select_X_with_remaining_features(\n X, support=support_, n_features=n_features\n )\n\n self.estimator_ = clone(self.estimator)\n self.estimator_.fit(X_remaining_features, y, **fit_params)\n\n self.n_features_ = support_.sum()\n self.support_ = support_\n self.ranking_ = ranking_\n self.cv_results_ = dict(self.cv_results_)\n return self\n
"},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV.plot","title":"plot(**kwargs)
","text":"Plot a feature selection plot with number of features
Parameters:
Returns:
Source code in felimination/rfe.py
def plot(self, **kwargs):\n \"\"\"Plot a feature selection plot with number of features\n\n Parameters\n ----------\n **kwargs : dict\n Additional parameters passed to seaborn.lineplot. For a list\n of possible options, please visit\n [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html) # noqa\n\n Returns\n -------\n matplotlib.axes.Axes\n The axis where the plot has been plotted.\n \"\"\"\n check_is_fitted(self)\n df = pd.DataFrame(self.cv_results_)\n split_score_cols = [col for col in df if \"split\" in col]\n df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n id_vars=[\"n_features\"],\n value_vars=split_score_cols,\n var_name=\"split\",\n value_name=\"score\",\n )\n df_long_form[\"set\"] = np.where(\n df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n )\n lineplot_kwargs = dict(\n x=\"n_features\",\n y=\"score\",\n hue=\"set\",\n markers=True,\n style=\"set\",\n hue_order=[\"validation\", \"train\"],\n style_order=[\"validation\", \"train\"],\n seed=self.random_state,\n )\n lineplot_kwargs.update(**kwargs)\n ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n ax.set_xticks(df.n_features)\n return ax\n
"},{"location":"reference/RFE/#felimination.rfe.PermutationImportanceRFECV.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)
","text":"Changes the number of features to select after fitting.
The underlying estimator will not be retrained. So this method will not alter the behavior of predict/predict_proba but it will change the behavior of transform and get_feature_names_out.
Parameters:
Returns:
-
self
( object
) \u2013 Fitted estimator.
Raises:
Source code in felimination/rfe.py
def set_n_features_to_select(self, n_features_to_select):\n \"\"\"Changes the number of features to select after fitting.\n\n The underlying estimator **will not be retrained**. So this method will not\n alter the behavior of predict/predict_proba but it will change the behavior\n of transform and get_feature_names_out.\n\n Parameters\n ----------\n n_features_to_select : int\n The number of features to select. Must be a value among\n `cv_results_[\"n_features\"]`\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Raises\n ------\n ValueError\n When the number of features to select has not been tried during the\n feature selection procedure.\n \"\"\"\n check_is_fitted(self)\n if n_features_to_select not in self.cv_results_[\"n_features\"]:\n raise ValueError(\n f\"This selector has not been fitted up with {n_features_to_select}, \"\n f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n \"refit the selector changing the step parameter of the n_features_to_select\"\n )\n support_ = np.zeros_like(self.support_, dtype=bool)\n support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n self.support_ = support_\n return self\n
"},{"location":"reference/callbacks/","title":"Callbacks","text":"Callbacks for feature selection algorithms.
"},{"location":"reference/callbacks/#felimination.callbacks.plot_progress_callback","title":"plot_progress_callback(selector, *args, **kwargs)
","text":"Plot the feature selection progress during the algorithm execution.
Parameters:
Source code in felimination/callbacks.py
def plot_progress_callback(selector, *args, **kwargs):\n \"\"\"Plot the feature selection progress during the algorithm execution.\n\n Parameters\n ----------\n selector : object\n The feature selector object.\n \"\"\"\n from IPython import display\n from matplotlib import pyplot as plt\n\n display.clear_output(wait=True)\n selector.plot()\n plt.show()\n
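A usage sketch wiring this callback into a selector; since it relies on IPython.display, it assumes a notebook environment:
from felimination.callbacks import plot_progress_callback\nfrom felimination.rfe import PermutationImportanceRFECV\nfrom sklearn.linear_model import LogisticRegression\n\nselector = PermutationImportanceRFECV(\n    LogisticRegression(),\n    step=0.3,\n    callbacks=[plot_progress_callback],  # refresh the plot after each step\n)\n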
"},{"location":"reference/drift/","title":"Drift","text":"The idea behind this module comes from the conjunction of two concepts:
- [1] Classifier Two-Sample Test
- [2] Recursive Feature Elimination
In [1], classifier performance is used to determine how similar two samples are. More specifically, imagine having two samples: reference
and test
. In order to assess whether reference
and test
have been drawn from the same distribution, we could train a classifier to predict which instances belong to which sample. If the model easily distinguishes instances from the two samples, then the two samples have probably been drawn from two different distributions. Conversely, if the classifier struggles to distinguish them, then it is likely that the samples have been drawn from the same distribution.
In the context of drift detection, the classifier two-sample test can be used to assess whether drift has happened between the reference and the test set and to which degree.
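A minimal sketch of the classifier two-sample test using scikit-learn (the classifier choice and scoring are illustrative): an AUC close to 0.5 suggests the two samples come from the same distribution, while an AUC close to 1 indicates drift.
import numpy as np\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import cross_val_score\n\ndef c2st_auc(reference, test):\n    # Label reference rows 0 and test rows 1, then measure how well\n    # a classifier can tell them apart\n    X_ss = np.vstack([reference, test])\n    y_ss = np.concatenate([np.zeros(len(reference)), np.ones(len(test))])\n    clf = RandomForestClassifier(random_state=0)\n    return cross_val_score(clf, X_ss, y_ss, cv=5, scoring=\"roc_auc\").mean()\n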
The classes of this module take this idea one step further and attempt to reduce the drift using recursive feature selection. After a classifier is trained to distinguish between reference
and test
, the feature importance of the classifier is used to determine which features contribute the most to distinguishing between the two sets. The most important features are then eliminated and the procedure is repeated until the classifier is no longer able to distinguish between the two samples, or until a certain number of features has been removed.
This module contains the following classes: - SampleSimilarityDriftRFE
: base class for drift-based sample similarity feature selection.
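A hedged usage sketch of the base class (the split settings are illustrative; X is assumed to contain the split column):
from felimination.drift import SampleSimilarityDriftRFE\nfrom sklearn.linear_model import LogisticRegression\n\nselector = SampleSimilarityDriftRFE(\n    LogisticRegression(),\n    split_col=0,  # column used to split rows into reference/test\n    split_frac=0.5,  # split at the median of that column\n    max_score=0.55,  # stop once the two halves are hard to tell apart\n    step=0.1,\n)\nselector.fit(X)  # y is not needed: labels are derived from the split\n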
"},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE","title":"PermImpSampleSimilarityDriftRFE(clf, *, step=1, max_score=0.55, min_n_features_to_select=1, split_col=0, split_value=None, split_frac=0.5, split_unique_values=True, cv=None, scoring=None, verbose=0, n_jobs=None, n_repeats=5, random_state=None, sample_weight=None, max_samples=1.0)
","text":" Bases: SampleSimilarityDriftRFE
Preset of SampleSimilarityDriftRFE using permutation importance as importance getter.
It differs from scikit-learn's RFECV in the following ways:
- It supports an
importance_getter
function that also uses a validation set to compute the feature importances. This makes it possible to use importance measures like permutation importance or SHAP. - Instead of using cross-validation to select the number of features, it uses cross-validation to get a more accurate estimate of the feature importances. This means that the number of features to select has to be set during initialization, similarly to RFE.
- When
step
is a float value, it removes a percentage of the number of remaining features rather than of the total number as in RFE/RFECV. This makes it possible to drop big chunks of features at the beginning of the RFE process and to slow down towards the end of the process. - Has a plotting function
- Adds information about the number of features selected at each step in the attribute
cv_results_
- Allows changing the number of features to select after fitting.
Other than that, it is a copy-paste of RFE, so credit goes to scikit-learn.
The feature selection algorithm proceeds as follows:
while n_features > n_features_to_select:\n - The estimator is trained on the selected features and the score is\n computed using cross validation.\n - feature importance is computed for each validation fold on the validation\n set and then averaged.\n - The least important features are pruned.\n - The pruned features are removed from the dataset.\n
Parameters:
-
clf
(``Classifier`` instance
) \u2013 A Classifier with a fit
method.
-
step
(int or float
, default: 1
) \u2013 If greater than or equal to 1, then step
corresponds to the (integer) number of features to remove at each iteration. If within (0.0, 1.0), then step
corresponds to the percentage (rounded down) of remaining features to remove at each iteration. Note that the last iteration may remove fewer than step
features in order to reach min_features_to_select
.
-
max_score
(float
, default: 0.55
) \u2013 Stops the feature selection procedure when the cross-validation score of the sample similarity classifier is lower than max_score
.
-
min_n_features_to_select
(int or float
, default: 1
) \u2013 The minimum number of features to select. If None
, half of the features are selected. If integer, the parameter is the absolute number of features to select. If float between 0 and 1, it is the fraction of the features to select.
-
split_col
(str or int
, default: 0
) \u2013 The name or the positional index of the column in the dataset that will be used to split the dataset into two sets.
-
split_value
(Any
, default: None
) \u2013 If defined, this value will be used to split the dataset into two sets.
-
split_frac
(float
, default: 0.5
) \u2013 If split_value is not defined, split_frac is used to determine the split_value. It corresponds to the quantile of the split_col to use as the split_value.
-
split_unique_values
(bool, default: True) \u2013 Whether to calculate the quantile of the split_col to use as the split_value based on the unique values of the split_col.
-
cv
(int, cross-validation generator or an iterable
, default: None
) \u2013 Determines the cross-validation splitting strategy. Possible inputs for cv are:
- None, to use the default 5-fold cross-validation,
- integer, to specify the number of folds.
- :term:
CV splitter
, - An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if the estimator is a classifier and y
is binary or multiclass, :class:~sklearn.model_selection.StratifiedKFold
is used. If the estimator is not a classifier or if y
is neither binary nor multiclass, :class:~sklearn.model_selection.KFold
is used.
Refer :ref:User Guide <cross_validation>
for the various cross-validation strategies that can be used here.
-
scoring
((str, callable or None)
, default: None
) \u2013 A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)
.
-
verbose
(int
, default: 0
) \u2013 Controls verbosity of output.
-
n_jobs
(int or None
, default: None
) \u2013 Number of cores to run in parallel while fitting across folds. None
means 1 unless in a :obj:joblib.parallel_backend
context. -1
means using all processors.
-
n_repeats
(int
, default: 5
) \u2013 Number of times to permute a feature.
-
random_state
(int, RandomState instance
, default: None
) \u2013 Pseudo-random number generator to control the permutations of each feature. Pass an int to get reproducible results across function calls.
-
sample_weight
(array-like of shape (n_samples,)
, default: None
) \u2013 Sample weights used in scoring.
-
max_samples
(int or float
, default: 1.0
) \u2013 The number of samples to draw from X to compute feature importance in each repeat (without replacement). - If int, then draw max_samples
samples. - If float, then draw max_samples * X.shape[0]
samples. - If max_samples
is equal to 1.0
or X.shape[0]
, all samples will be used. While using this option may provide less accurate importance estimates, it keeps the method tractable when evaluating feature importance on large datasets. In combination with n_repeats
, this allows to control the computational speed vs statistical accuracy trade-off of this method.
Attributes:
-
classes_
(ndarray of shape (n_classes,)
) \u2013 The classes labels. Only available when estimator
is a classifier.
-
estimator_
(``Estimator`` instance
) \u2013 The fitted estimator used to select features.
-
cv_results_
(dict of ndarrays
) \u2013 A dict with keys: n_features : ndarray of shape (n_subsets_of_features,) The number of features used at that step. split(k)_test_score : ndarray of shape (n_subsets_of_features,) The cross-validation scores across (k)th fold. mean_test_score : ndarray of shape (n_subsets_of_features,) Mean of scores over the folds. std_test_score : ndarray of shape (n_subsets_of_features,) Standard deviation of scores over the folds. split(k)_train_score : ndarray of shape (n_subsets_of_features,) The cross-validation scores across (k)th fold. mean_train_score : ndarray of shape (n_subsets_of_features,) Mean of scores over the folds. std_train_score : ndarray of shape (n_subsets_of_features,) Standard deviation of scores over the folds.
-
n_features_
(int
) \u2013 The number of selected features.
-
n_features_in_
(int
) \u2013 Number of features seen during :term:fit
. Only defined if the underlying estimator exposes such an attribute when fit.
-
feature_names_in_
(ndarray of shape (`n_features_in_`,)
) \u2013 Names of features seen during :term:fit
. Defined only when X
has feature names that are all strings.
-
ranking_
(ndarray of shape (n_features,)
) \u2013 The feature ranking, such that ranking_[i]
corresponds to the ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1.
-
support_
(ndarray of shape (n_features,)
) \u2013 The mask of selected features.
Source code in felimination/drift.py
def __init__(\n self,\n clf: ClassifierMixin,\n *,\n step=1,\n max_score=0.55,\n min_n_features_to_select=1,\n split_col=0,\n split_value=None,\n split_frac=0.5,\n split_unique_values=True,\n cv=None,\n scoring=None,\n verbose=0,\n n_jobs=None,\n n_repeats=5,\n random_state=None,\n sample_weight=None,\n max_samples=1.0,\n) -> None:\n self.n_repeats = n_repeats\n self.sample_weight = sample_weight\n self.max_samples = max_samples\n super().__init__(\n clf=clf,\n max_score=max_score,\n min_n_features_to_select=min_n_features_to_select,\n split_col=split_col,\n split_value=split_value,\n split_frac=split_frac,\n split_unique_values=split_unique_values,\n step=step,\n cv=cv,\n scoring=scoring,\n random_state=random_state,\n verbose=verbose,\n n_jobs=n_jobs,\n importance_getter=PermutationImportance(\n scoring=scoring,\n n_repeats=n_repeats,\n # Better not to do double parallelization\n n_jobs=1,\n random_state=random_state,\n sample_weight=sample_weight,\n max_samples=max_samples,\n ),\n )\n
"},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE.fit","title":"fit(X, y=None, groups=None, **fit_params)
","text":"Fit the RFE model and then the underlying clf on the selected features.
Parameters:
-
X
(array-like, sparse matrix
, default: array-like
) \u2013 The training input samples.
-
y
(array-like of shape (n_samples,)
, default: None
) \u2013 The target values. Not used, kept for compatibility.
-
groups
(array-like of shape (n_samples,)
, default: None
) \u2013 Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a \"Group\" :term:cv
instance.
-
**fit_params
(dict
, default: {}
) \u2013 Additional parameters passed to the fit
method of the underlying clf.
Returns:
-
self
( object
) \u2013 Fitted selector.
Source code in felimination/drift.py
def fit(self, X, y=None, groups=None, **fit_params):\n \"\"\"Fit the RFE model and then the underlying clf on the selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n y : array-like of shape (n_samples,)\n The target values. Not used, kept for compatibility.\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance.\n **fit_params : dict\n Additional parameters passed to the `fit` method of the underlying\n clf.\n\n Returns\n -------\n self : object\n Fitted selector.\n \"\"\"\n self._validate_params()\n tags = self._get_tags()\n X = self._validate_data(\n X,\n y,\n accept_sparse=\"csc\",\n ensure_min_features=2,\n force_all_finite=not tags.get(\"allow_nan\", True),\n dtype=None,\n )\n if isinstance(self.split_col, str):\n split_col_idx = list(self.feature_names_in_).index(self.split_col)\n else:\n split_col_idx = self.split_col\n split_col_values = X[:, split_col_idx]\n X, y = self._build_sample_similarity_x_y(X, split_col_values=split_col_values)\n\n # Initialization\n cv = check_cv(self.cv, y, classifier=True)\n scorer = check_scoring(self.clf, scoring=self.scoring)\n n_features = X.shape[1]\n\n if self.min_n_features_to_select is None:\n min_n_features_to_select = n_features // 2\n elif isinstance(self.min_n_features_to_select, Integral): # int\n min_n_features_to_select = self.min_n_features_to_select\n else: # float\n min_n_features_to_select = int(n_features * self.min_n_features_to_select)\n\n support_ = np.ones(n_features, dtype=bool)\n support_[split_col_idx] = False\n ranking_ = np.ones(n_features, dtype=int)\n\n current_number_of_features = support_.sum()\n self.cv_results_ = defaultdict(list)\n\n if self.verbose > 0:\n print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n # Train model, score it and get importances\n if effective_n_jobs(self.n_jobs) == 1:\n parallel, func = list, _train_score_get_importance\n else:\n parallel = Parallel(n_jobs=self.n_jobs)\n func = delayed(_train_score_get_importance)\n\n features = np.arange(n_features)[support_]\n X_remaining_features = X[:, features]\n\n scores_importances = parallel(\n func(\n self.clf,\n X_remaining_features,\n y,\n train,\n test,\n scorer,\n self.importance_getter,\n )\n for train, test in cv.split(X_remaining_features, y, groups)\n )\n\n test_scores_per_fold = [\n score_importance[1] for score_importance in scores_importances\n ]\n train_scores_per_fold = [\n score_importance[0] for score_importance in scores_importances\n ]\n\n # Update cv scores\n for train_or_test, scores_per_fold in zip(\n [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n ):\n for i, score in enumerate(scores_per_fold):\n self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n np.mean(scores_per_fold)\n )\n self.cv_results_[f\"std_{train_or_test}_score\"].append(\n np.std(scores_per_fold)\n )\n self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n # Elimination\n while (\n np.mean(test_scores_per_fold) > self.max_score\n and current_number_of_features > min_n_features_to_select\n ):\n features = np.arange(n_features)[support_]\n if 0.0 < self.step < 1.0:\n step = int(max(1, self.step * current_number_of_features))\n else:\n step = int(self.step)\n # Eliminate most important 
features\n threshold = min(step, current_number_of_features - min_n_features_to_select)\n cv_importances = [\n score_importance[2] for score_importance in scores_importances\n ]\n mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n ranks = np.argsort(-mean_importances)\n ranks = np.ravel(ranks)\n support_[features[ranks][:threshold]] = False\n ranking_[np.logical_not(support_)] += 1\n current_number_of_features = np.sum(support_)\n # Select remaining features\n features = np.arange(n_features)[support_]\n X_remaining_features = X[:, features]\n\n if self.verbose > 0:\n print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n # Train model, score it and get importances\n if effective_n_jobs(self.n_jobs) == 1:\n parallel, func = list, _train_score_get_importance\n else:\n parallel = Parallel(n_jobs=self.n_jobs)\n func = delayed(_train_score_get_importance)\n\n scores_importances = parallel(\n func(\n self.clf,\n X_remaining_features,\n y,\n train,\n test,\n scorer,\n self.importance_getter,\n )\n for train, test in cv.split(X_remaining_features, y, groups)\n )\n train_scores_per_fold = [\n score_importance[0] for score_importance in scores_importances\n ]\n test_scores_per_fold = [\n score_importance[1] for score_importance in scores_importances\n ]\n\n # Update cv scores\n for train_or_test, scores_per_fold in zip(\n [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n ):\n for i, score in enumerate(scores_per_fold):\n self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n np.mean(scores_per_fold)\n )\n self.cv_results_[f\"std_{train_or_test}_score\"].append(\n np.std(scores_per_fold)\n )\n self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n features = np.arange(n_features)[support_]\n self.clf_ = clone(self.clf)\n self.clf_.fit(X[:, features], y, **fit_params)\n\n self.n_features_ = support_.sum()\n self.support_ = support_\n self.ranking_ = ranking_\n self.cv_results_ = dict(self.cv_results_)\n return self\n
"},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE.plot","title":"plot(**kwargs)
","text":"Plot a feature selection plot with number of features
Parameters:
Returns:
Source code in felimination/rfe.py
def plot(self, **kwargs):\n \"\"\"Plot a feature selection plot with number of features\n\n Parameters\n ----------\n **kwargs : dict\n Additional parameters passed to seaborn.lineplot. For a list\n of possible options, please visit\n [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html) # noqa\n\n Returns\n -------\n matplotlib.axes.Axes\n The axis where the plot has been plotted.\n \"\"\"\n check_is_fitted(self)\n df = pd.DataFrame(self.cv_results_)\n split_score_cols = [col for col in df if \"split\" in col]\n df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n id_vars=[\"n_features\"],\n value_vars=split_score_cols,\n var_name=\"split\",\n value_name=\"score\",\n )\n df_long_form[\"set\"] = np.where(\n df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n )\n lineplot_kwargs = dict(\n x=\"n_features\",\n y=\"score\",\n hue=\"set\",\n markers=True,\n style=\"set\",\n hue_order=[\"validation\", \"train\"],\n style_order=[\"validation\", \"train\"],\n seed=self.random_state,\n )\n lineplot_kwargs.update(**kwargs)\n ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n ax.set_xticks(df.n_features)\n return ax\n
"},{"location":"reference/drift/#felimination.drift.PermImpSampleSimilarityDriftRFE.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)
","text":"Changes the number of features to select after fitting.
The underlying estimator will not be retrained. So this method will not alter the behavior of predict/predict_proba but it will change the behavior of transform and get_feature_names_out.
Parameters:
Returns:
-
self
( object
) \u2013 Fitted estimator.
Raises:
Source code in felimination/rfe.py
def set_n_features_to_select(self, n_features_to_select):\n \"\"\"Changes the number of features to select after fitting.\n\n The underlying estimator **will not be retrained**. So this method will not\n alter the behavior of predict/predict_proba but it will change the behavior\n of transform and get_feature_names_out.\n\n Parameters\n ----------\n n_features_to_select : int\n The number of features to select. Must be a value among\n `cv_results_[\"n_features\"]`\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Raises\n ------\n ValueError\n When the number of features to select has not been tried during the\n feature selection procedure.\n \"\"\"\n check_is_fitted(self)\n if n_features_to_select not in self.cv_results_[\"n_features\"]:\n raise ValueError(\n f\"This selector has not been fitted up with {n_features_to_select}, \"\n f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n \"refit the selector changing the step parameter of the n_features_to_select\"\n )\n support_ = np.zeros_like(self.support_, dtype=bool)\n support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n self.support_ = support_\n return self\n
"},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE","title":"SampleSimilarityDriftRFE(clf, *, step=1, max_score=0.55, min_n_features_to_select=1, split_col=0, split_value=None, split_frac=0.5, split_unique_values=True, cv=None, scoring=None, random_state=None, verbose=0, n_jobs=None, importance_getter='auto')
","text":" Bases: FeliminationRFECV
Recursively discards the features that introduce the highest drift.
The feature selection algorithm proceeds as follows:
Split X into two sets using the `split_column`: X1 and X2\ncreate target array y1 for X1 as an array of zeroes\ncreate target array y2 for X2 as an array of ones\nvertically concatenate X1, X2 and y1 and y2, obtaining X_ss and y_ss\nCalculate Cross-validation performances of the estimator on X_ss and y_ss.\nwhile cross-validation-performances > max_score and n_features > min_n_features_to_select:\n Discard most important features\n Calculate Cross-validation performances of the estimator on X_ss and y_ss using the new feature set.\n
Parameters:
-
clf
(``Classifier`` instance
) \u2013 A Classifier with a fit
method.
-
step
(int or float
, default: 1
) \u2013 If greater than or equal to 1, then step
corresponds to the (integer) number of features to remove at each iteration. If within (0.0, 1.0), then step
corresponds to the percentage (rounded down) of remaining features to remove at each iteration. Note that the last iteration may remove fewer than step
features in order to reach min_n_features_to_select
.
-
max_score
(float
, default: 0.55
) \u2013 Stops the feature selection procedure when the cross-validation score of the sample similarity classifier is lower than max_score
.
-
min_n_features_to_select
(int or float
, default: 1
) \u2013 The minimum number of features to select. If None
, half of the features are selected. If integer, the parameter is the absolute number of features to select. If float between 0 and 1, it is the fraction of the features to select.
-
split_col
(str or int
, default: 0
) \u2013 The name (if str) or the positional index (if int) of the column used to split the dataset into two sets.
-
split_value
(Any
, default: None
) \u2013 If defined, this value will be used to split the dataset into two sets.
-
split_frac
(float
, default: 0.5
) \u2013 If split_value is None, split_frac is used to determine the split_value: it corresponds to the quantile of split_col used as the split_value (see the sketch after this parameter list).
-
split_unique_values
(bool
, default: True
) \u2013 Whether to compute the split_frac quantile over the unique values of split_col (rather than over all of its values) when determining the split_value.
-
cv
(int, cross-validation generator or an iterable
, default: None
) \u2013 Determines the cross-validation splitting strategy. Possible inputs for cv are:
- None, to use the default 5-fold cross-validation,\n- integer, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n
For integer/None inputs, if y
is binary or multiclass, :class:~sklearn.model_selection.StratifiedKFold
is used. If the estimator is not a classifier or if y
is neither binary nor multiclass, :class:~sklearn.model_selection.KFold
is used.
Refer to the :ref:User Guide <cross_validation>
for the various cross-validation strategies that can be used here.
-
scoring
((str, callable or None)
, default: None
) \u2013 A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)
.
-
random_state
(int, RandomState instance or None
, default: None
) \u2013 Controls the random seed given at the beginning of the feature selection procedure.
-
verbose
(int
, default: 0
) \u2013 Controls verbosity of output.
-
n_jobs
(int or None
, default: None
) \u2013 Number of cores to run in parallel while fitting across folds. None
means 1 unless in a :obj:joblib.parallel_backend
context. -1
means using all processors.
-
importance_getter
(str or callable
, default: 'auto'
) \u2013 If 'auto', uses the feature importance either through a coef_
or feature_importances_
attributes of estimator.
Also accepts a string that specifies an attribute name/path for extracting feature importance. For example, give regressor_.coef_
in case of :class:~sklearn.compose.TransformedTargetRegressor
or named_steps.clf.feature_importances_
in case of :class:~sklearn.pipeline.Pipeline
with its last step named clf
.
If callable
, overrides the default feature importance getter. The callable is passed the fitted estimator and the validation set (as estimator, X_val, y_val) and should return the importance of each feature.
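For illustration, the derivation of split_value from split_frac can be sketched as follows. This is a simplified stand-alone sketch of the mechanism described above, not the library's exact code; whether the boundary row falls into the first or the second set is an assumption here.
import numpy as np\n\nrng = np.random.default_rng(0)\nX = rng.normal(size=(100, 4))\nsplit_col_idx, split_frac, split_unique_values = 0, 0.5, True\n\nsplit_col_values = X[:, split_col_idx]\n# Optionally compute the quantile over unique values only\nbasis = np.unique(split_col_values) if split_unique_values else split_col_values\nsplit_value = np.quantile(basis, split_frac)\n# Rows at or below the quantile form X1, the rest form X2\nX1, X2 = X[split_col_values <= split_value], X[split_col_values > split_value]\n# Sample-similarity targets: zeroes for X1, ones for X2\ny1, y2 = np.zeros(len(X1)), np.ones(len(X2))\nX_ss, y_ss = np.vstack([X1, X2]), np.concatenate([y1, y2])\n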
Attributes:
-
classes_
(ndarray of shape (n_classes,)
) \u2013 The classes labels.
-
clf_
(``Classifier`` instance
) \u2013 The fitted classifier used to select features.
-
cv_results_
(dict of ndarrays
) \u2013 A dict with keys:\nn_features : ndarray of shape (n_subsets_of_features,)\n    The number of features used at that step.\nsplit(k)_test_score : ndarray of shape (n_subsets_of_features,)\n    The cross-validation test scores for the (k)th fold.\nmean_test_score : ndarray of shape (n_subsets_of_features,)\n    Mean of the test scores over the folds.\nstd_test_score : ndarray of shape (n_subsets_of_features,)\n    Standard deviation of the test scores over the folds.\nsplit(k)_train_score : ndarray of shape (n_subsets_of_features,)\n    The cross-validation train scores for the (k)th fold.\nmean_train_score : ndarray of shape (n_subsets_of_features,)\n    Mean of the train scores over the folds.\nstd_train_score : ndarray of shape (n_subsets_of_features,)\n    Standard deviation of the train scores over the folds.
-
n_features_
(int
) \u2013 The number of selected features.
-
n_features_in_
(int
) \u2013 Number of features seen during :term:fit
. Only defined if the underlying estimator exposes such an attribute when fit.
-
feature_names_in_
(ndarray of shape (`n_features_in_`,)
) \u2013 Names of features seen during :term:fit
. Defined only when X
has feature names that are all strings.
-
ranking_
(ndarray of shape (n_features,)
) \u2013 The feature ranking, such that ranking_[i]
corresponds to the ranking position of the i-th feature. Selected (i.e., estimated best) features are assigned rank 1.
-
support_
(ndarray of shape (n_features,)
) \u2013 The mask of selected features.
Examples:
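The following minimal example is only illustrative: the drifting column name 'day', the synthetic data and the choice of LogisticRegression are assumptions, not requirements.
>>> import numpy as np\n>>> import pandas as pd\n>>> from felimination.drift import SampleSimilarityDriftRFE\n>>> from sklearn.linear_model import LogisticRegression\n>>> rng = np.random.default_rng(42)\n>>> X = pd.DataFrame(rng.normal(size=(1000, 5)), columns=[f\"feat_{i}\" for i in range(5)])\n>>> X[\"day\"] = rng.integers(0, 10, size=1000)  # column used to split the dataset\n>>> X[\"feat_0\"] += X[\"day\"]  # feat_0 drifts over time, so it should be discarded first\n>>> selector = SampleSimilarityDriftRFE(\n    LogisticRegression(),\n    split_col=\"day\",\n    max_score=0.55,\n)\n>>> selector = selector.fit(X)\n>>> selector.support_.shape  # mask over all 6 columns, including the split column\n(6,)\n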
Source code in felimination/drift.py
def __init__(\n self,\n clf: ClassifierMixin,\n *,\n step=1,\n max_score=0.55,\n min_n_features_to_select=1,\n split_col=0,\n split_value=None,\n split_frac=0.5,\n split_unique_values=True,\n cv=None,\n scoring=None,\n random_state=None,\n verbose=0,\n n_jobs=None,\n importance_getter=\"auto\",\n) -> None:\n self.max_score = max_score\n self.split_col = split_col\n self.split_value = split_value\n self.split_unique_values = split_unique_values\n self.split_frac = split_frac\n self.min_n_features_to_select = min_n_features_to_select\n self.clf = clf\n super().__init__(\n estimator=clf,\n n_features_to_select=min_n_features_to_select,\n step=step,\n cv=cv,\n scoring=scoring,\n random_state=random_state,\n verbose=verbose,\n n_jobs=n_jobs,\n importance_getter=importance_getter,\n )\n
"},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE.fit","title":"fit(X, y=None, groups=None, **fit_params)
","text":"Fit the RFE model and then the underlying clf on the selected features.
Parameters:
-
X
({array-like, sparse matrix} of shape (n_samples, n_features)
) \u2013 The training input samples.
-
y
(array-like of shape (n_samples,)
, default: None
) \u2013 The target values. Not used, kept for compatibility.
-
groups
(array-like of shape (n_samples,)
, default: None
) \u2013 Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a \"Group\" :term:cv
instance.
-
**fit_params
(dict
, default: {}
) \u2013 Additional parameters passed to the fit
method of the underlying clf.
Returns:
-
self
( object
) \u2013 Fitted selector.
Source code in felimination/drift.py
def fit(self, X, y=None, groups=None, **fit_params):\n    \"\"\"Fit the RFE model and then the underlying clf on the selected features.\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix} of shape (n_samples, n_features)\n        The training input samples.\n    y : array-like of shape (n_samples,)\n        The target values. Not used, kept for compatibility.\n    groups : array-like of shape (n_samples,), default=None\n        Group labels for the samples used while splitting the dataset into\n        train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n        instance.\n    **fit_params : dict\n        Additional parameters passed to the `fit` method of the underlying\n        clf.\n\n    Returns\n    -------\n    self : object\n        Fitted selector.\n    \"\"\"\n    self._validate_params()\n    tags = self._get_tags()\n    X = self._validate_data(\n        X,\n        y,\n        accept_sparse=\"csc\",\n        ensure_min_features=2,\n        force_all_finite=not tags.get(\"allow_nan\", True),\n        dtype=None,\n    )\n    if isinstance(self.split_col, str):\n        split_col_idx = list(self.feature_names_in_).index(self.split_col)\n    else:\n        split_col_idx = self.split_col\n    split_col_values = X[:, split_col_idx]\n    X, y = self._build_sample_similarity_x_y(X, split_col_values=split_col_values)\n\n    # Initialization\n    cv = check_cv(self.cv, y, classifier=True)\n    scorer = check_scoring(self.clf, scoring=self.scoring)\n    n_features = X.shape[1]\n\n    if self.min_n_features_to_select is None:\n        min_n_features_to_select = n_features // 2\n    elif isinstance(self.min_n_features_to_select, Integral):  # int\n        min_n_features_to_select = self.min_n_features_to_select\n    else:  # float\n        min_n_features_to_select = int(n_features * self.min_n_features_to_select)\n\n    support_ = np.ones(n_features, dtype=bool)\n    support_[split_col_idx] = False\n    ranking_ = np.ones(n_features, dtype=int)\n\n    current_number_of_features = support_.sum()\n    self.cv_results_ = defaultdict(list)\n\n    if self.verbose > 0:\n        print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n    # Train model, score it and get importances\n    if effective_n_jobs(self.n_jobs) == 1:\n        parallel, func = list, _train_score_get_importance\n    else:\n        parallel = Parallel(n_jobs=self.n_jobs)\n        func = delayed(_train_score_get_importance)\n\n    features = np.arange(n_features)[support_]\n    X_remaining_features = X[:, features]\n\n    scores_importances = parallel(\n        func(\n            self.clf,\n            X_remaining_features,\n            y,\n            train,\n            test,\n            scorer,\n            self.importance_getter,\n        )\n        for train, test in cv.split(X_remaining_features, y, groups)\n    )\n\n    test_scores_per_fold = [\n        score_importance[1] for score_importance in scores_importances\n    ]\n    train_scores_per_fold = [\n        score_importance[0] for score_importance in scores_importances\n    ]\n\n    # Update cv scores\n    for train_or_test, scores_per_fold in zip(\n        [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n    ):\n        for i, score in enumerate(scores_per_fold):\n            self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n        self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n            np.mean(scores_per_fold)\n        )\n        self.cv_results_[f\"std_{train_or_test}_score\"].append(\n            np.std(scores_per_fold)\n        )\n    self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n    # Elimination\n    while (\n        np.mean(test_scores_per_fold) > self.max_score\n        and current_number_of_features > min_n_features_to_select\n    ):\n        features = np.arange(n_features)[support_]\n        if 0.0 < self.step < 1.0:\n            step = int(max(1, self.step * current_number_of_features))\n        else:\n            step = int(self.step)\n        # Eliminate most important features\n        threshold = min(step, current_number_of_features - min_n_features_to_select)\n        cv_importances = [\n            score_importance[2] for score_importance in scores_importances\n        ]\n        mean_importances = np.mean(np.vstack(cv_importances), axis=0)\n        ranks = np.argsort(-mean_importances)\n        ranks = np.ravel(ranks)\n        support_[features[ranks][:threshold]] = False\n        ranking_[np.logical_not(support_)] += 1\n        current_number_of_features = np.sum(support_)\n        # Select remaining features\n        features = np.arange(n_features)[support_]\n        X_remaining_features = X[:, features]\n\n        if self.verbose > 0:\n            print(\"Fitting clf with %d features.\" % current_number_of_features)\n\n        # Train model, score it and get importances\n        if effective_n_jobs(self.n_jobs) == 1:\n            parallel, func = list, _train_score_get_importance\n        else:\n            parallel = Parallel(n_jobs=self.n_jobs)\n            func = delayed(_train_score_get_importance)\n\n        scores_importances = parallel(\n            func(\n                self.clf,\n                X_remaining_features,\n                y,\n                train,\n                test,\n                scorer,\n                self.importance_getter,\n            )\n            for train, test in cv.split(X_remaining_features, y, groups)\n        )\n        train_scores_per_fold = [\n            score_importance[0] for score_importance in scores_importances\n        ]\n        test_scores_per_fold = [\n            score_importance[1] for score_importance in scores_importances\n        ]\n\n        # Update cv scores\n        for train_or_test, scores_per_fold in zip(\n            [\"train\", \"test\"], [train_scores_per_fold, test_scores_per_fold]\n        ):\n            for i, score in enumerate(scores_per_fold):\n                self.cv_results_[f\"split{i}_{train_or_test}_score\"].append(score)\n            self.cv_results_[f\"mean_{train_or_test}_score\"].append(\n                np.mean(scores_per_fold)\n            )\n            self.cv_results_[f\"std_{train_or_test}_score\"].append(\n                np.std(scores_per_fold)\n            )\n        self.cv_results_[\"n_features\"].append(current_number_of_features)\n\n    features = np.arange(n_features)[support_]\n    self.clf_ = clone(self.clf)\n    self.clf_.fit(X[:, features], y, **fit_params)\n\n    self.n_features_ = support_.sum()\n    self.support_ = support_\n    self.ranking_ = ranking_\n    self.cv_results_ = dict(self.cv_results_)\n    return self\n
"},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE.plot","title":"plot(**kwargs)
","text":"Plot a feature selection plot with number of features
Parameters:
Returns:
Source code in felimination/rfe.py
def plot(self, **kwargs):\n \"\"\"Plot a feature selection plot with number of features\n\n Parameters\n ----------\n **kwargs : dict\n Additional parameters passed to seaborn.lineplot. For a list\n of possible options, please visit\n [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html) # noqa\n\n Returns\n -------\n matplotlib.axes.Axes\n The axis where the plot has been plotted.\n \"\"\"\n check_is_fitted(self)\n df = pd.DataFrame(self.cv_results_)\n split_score_cols = [col for col in df if \"split\" in col]\n df_long_form = df[split_score_cols + [\"n_features\"]].melt(\n id_vars=[\"n_features\"],\n value_vars=split_score_cols,\n var_name=\"split\",\n value_name=\"score\",\n )\n df_long_form[\"set\"] = np.where(\n df_long_form[\"split\"].str.contains(\"train\"), \"train\", \"validation\"\n )\n lineplot_kwargs = dict(\n x=\"n_features\",\n y=\"score\",\n hue=\"set\",\n markers=True,\n style=\"set\",\n hue_order=[\"validation\", \"train\"],\n style_order=[\"validation\", \"train\"],\n seed=self.random_state,\n )\n lineplot_kwargs.update(**kwargs)\n ax = sns.lineplot(data=df_long_form, **lineplot_kwargs)\n ax.set_xticks(df.n_features)\n return ax\n
"},{"location":"reference/drift/#felimination.drift.SampleSimilarityDriftRFE.set_n_features_to_select","title":"set_n_features_to_select(n_features_to_select)
","text":"Changes the number of features to select after fitting.
The underlying estimator will not be retrained, so this method will not alter the behavior of predict/predict_proba, but it will change the behavior of transform and get_feature_names_out.
Parameters:
Returns:
-
self
( object
) \u2013 Fitted estimator.
Raises:
Source code in felimination/rfe.py
def set_n_features_to_select(self, n_features_to_select):\n    \"\"\"Changes the number of features to select after fitting.\n\n    The underlying estimator **will not be retrained**, so this method will not\n    alter the behavior of predict/predict_proba, but it will change the behavior\n    of transform and get_feature_names_out.\n\n    Parameters\n    ----------\n    n_features_to_select : int\n        The number of features to select. Must be a value among\n        `cv_results_[\"n_features\"]`\n\n    Returns\n    -------\n    self : object\n        Fitted estimator.\n\n    Raises\n    ------\n    ValueError\n        When the number of features to select has not been tried during the\n        feature selection procedure.\n    \"\"\"\n    check_is_fitted(self)\n    if n_features_to_select not in self.cv_results_[\"n_features\"]:\n        raise ValueError(\n            f\"This selector has not been fitted with {n_features_to_select} features, \"\n            f\"please select a value in {set(self.cv_results_['n_features'])} or \"\n            \"refit the selector with a different step parameter\"\n        )\n    support_ = np.zeros_like(self.support_, dtype=bool)\n    support_[np.argsort(self.ranking_)[:n_features_to_select]] = True\n    self.support_ = support_\n    return self\n
"},{"location":"reference/genetic_algorithms/","title":"Genetic algorithms","text":"This module contains the implementation of the Hybrid Genetic Algorithm-Importance with Cross-Validation. The algorithm is implemented in the HybridImportanceGACVFeatureSelector
class.
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector","title":"HybridImportanceGACVFeatureSelector(estimator, *, cv=5, scoring=None, random_state=None, n_jobs=None, importance_getter='auto', min_n_features_to_select=1, init_avg_features_num=15, init_std_features_num=5, pool_size=20, is_parent_selection_chance_proportional_to_fitness=True, n_children_cross_over=5, n_parents_cross_over=2, n_mutations=5, range_change_n_features_mutation=(-2, 3), range_randomly_swapped_features_mutation=(1, 4), max_generations=100, patience=5, callbacks=None, fitness_function=rank_mean_test_score_overfit_fitness)
","text":" Bases: SelectorMixin
, MetaEstimatorMixin
, BaseEstimator
Feature selection using Hybrid Genetic Algorithm-Importance with Cross-Validation.
This feature selector uses a genetic algorithm to select features. The genetic algorithm is hybridized with feature importance, which is calculated using a cross-validation scheme. The algorithm works as follows:
Pool initialization: The pool is initialized with random features. The number of features is randomly generated using a normal distribution with the average number of features to select and the standard deviation of the number of features to select as parameters. The number of features is clipped to be between the minimum number of features to select and the number of features in the dataset.
Cross Over: The cross over is done by combining the features of the parents. The features are sorted by importance and the children are created by combining the features of the parents in a round-robin fashion. The number of features of the children is the average of the number of features of the parents. In this way, the children will have the most important features of the parents (see the sketch below).
Mutation: The mutation is done by randomly changing the number of features and replacing the least important features with random features.
Selection: The selection is done by selecting the top pool_size
solutions based on the fitness function.
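To make the cross-over step concrete, the round-robin combination described above can be sketched as follows (a simplified stand-alone sketch, not the library's exact implementation):
import numpy as np\nfrom itertools import zip_longest\n\n\ndef cross_over_sketch(parents_features, parents_importances):\n    \"\"\"Combine the parents' features round-robin, most important first.\"\"\"\n    # Rank each parent's features from most to least important\n    ranked = [\n        [feature for _, feature in sorted(zip(importances, features), key=lambda pair: -pair[0])]\n        for features, importances in zip(parents_features, parents_importances)\n    ]\n    # The child gets the average number of features of its parents\n    n_child_features = int(np.mean([len(features) for features in parents_features]))\n    child = []\n    # Take the best remaining feature of each parent in turn\n    for nth_best_features in zip_longest(*ranked):\n        for feature in nth_best_features:\n            if feature is not None and feature not in child:\n                child.append(feature)\n    return child[:n_child_features]\n\n\n# Two parents with three features each, together with their importances\nprint(cross_over_sketch([[0, 1, 2], [2, 3, 4]], [[0.9, 0.5, 0.1], [0.8, 0.4, 0.2]]))\n# [0, 2, 1]\n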
Parameters:
-
estimator
(object
) \u2013 An estimator that follows the scikit-learn API and has a fit
method.
-
cv
(int, cross-validation generator or an iterable
, default: 5
) \u2013 Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross-validation, - int, to specify the number of folds in a (Stratified)KFold, - :term:CV splitter
, - An iterable yielding (train, test) splits as arrays of indices.
-
scoring
((str, callable or None)
, default: None
) \u2013 A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)
.
-
random_state
(int or None
, default: None
) \u2013 Controls the random seed given at the beginning of the algorithm.
-
n_jobs
(int or None
, default: None
) \u2013 The number of jobs to run in parallel. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors.
-
importance_getter
(str or callable
, default: 'auto'
) \u2013 If 'auto', uses the feature importance either through a coef_
or feature_importances_
attributes of estimator.
Also accepts a string that specifies an attribute name/path for extracting feature importance. For example, give regressor_.coef_
in case of ~sklearn.compose.TransformedTargetRegressor
or named_steps.clf.feature_importances_
in case of ~sklearn.pipeline.Pipeline
with its last step named clf
.
If callable
, overrides the default feature importance getter. The callable is passed the fitted estimator and the validation set (as estimator, X_val, y_val) and should return the importance of each feature.
-
min_n_features_to_select
(int or float
, default: 1
) \u2013 The minimum number of features to select. If float, it represents the fraction of features to select.
-
init_avg_features_num
(float
, default: 15
) \u2013 The average number of features to select in the initial pool of solutions.
-
init_std_features_num
(float
, default: 5
) \u2013 The standard deviation of the number of features to select in the initial pool of solutions.
-
pool_size
(int
, default: 20
) \u2013 The number of solutions in the pool.
-
n_children_cross_over
(int
, default: 5
) \u2013 The number of children to create by cross-over.
-
is_parent_selection_chance_proportional_to_fitness
(bool
, default: True
) \u2013 If True, the probability of selecting a parent is proportional to its fitness. This means that the fittest parents are more likely to be selected during crossover.
-
n_parents_cross_over
(int
, default: 2
) \u2013 The number of parents to select in each crossover. More than 2 parents can be selected during crossover. In that case, the top features of each parent are combined in a round-robin fashion to create a child. The number of features of the child is the average of the number of features of the parents.
-
n_mutations
(int
, default: 5
) \u2013 The number of mutations to apply to the pool.
-
range_change_n_features_mutation
(tuple
, default: (-2, 3)
) \u2013 The range of the number of features to change during mutation. The first element is the minimum number of features to change and the second element is the maximum number of features to change. The right limit is exclusive.
-
range_randomly_swapped_features_mutation
(tuple
, default: (1, 4)
) \u2013 The range of the number of features to replace during mutation. The first element is the minimum number of features to replace and the second element is the maximum number of features to replace. The right limit is exclusive.
-
max_generations
(int
, default: 100
) \u2013 The maximum number of generations to run the genetic algorithm.
-
patience
(int
, default: 5
) \u2013 The number of generations without improvement to wait before stopping the algorithm.
-
callbacks
(list of callable
, default: None
) \u2013 A list of callables that are called after each generation. Each callable should accept the selector and the pool as arguments.
-
fitness_function
(str or callable
, default: rank_mean_test_score_overfit_fitness
) \u2013 The fitness function to use. Possible string values are: 'mean_test_score'
, 'mean_train_score'
. If a callable is passed, it should accept a list of dictionaries, where each dictionary has the keys 'features', 'mean_test_score' and 'mean_train_score', and return a list of floats with the fitness of each element in the pool. Defaults to rank_mean_test_score_overfit_fitness
Attributes:
-
estimator_
(object
) \u2013 The fitted estimator.
-
support_
(array of shape (n_features,)
) \u2013 The mask of selected features.
-
best_solution_
(dict
) \u2013 The best solution found by the genetic algorithm. It is a dictionary with the following keys:\n- features: list of int. The features selected for this element.\n- mean_test_score: float. The mean test score of the element.\n- mean_train_score: float. The mean train score of the element.\n- train_scores_per_fold: list of float. The train score of each fold.\n- test_scores_per_fold: list of float. The test score of each fold.\n- cv_importances: list of array. The feature importances of each fold.\n- mean_cv_importances: array. The mean of the feature importances across the folds.
-
best_solutions_
(list of dict
) \u2013 The best solutions found by the genetic algorithm at each generation. Each element is defined as in best_solution_
.
Examples:
>>> from felimination.ga import HybridImportanceGACVFeatureSelector\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.linear_model import LogisticRegression\n>>> X, y = make_classification(\n    n_samples=1000,\n    n_features=10,\n    n_classes=2,\n    n_redundant=0,\n    n_clusters_per_class=1,\n    random_state=42,\n)\n>>> estimator = LogisticRegression(random_state=42)\n>>> selector = HybridImportanceGACVFeatureSelector(\n    estimator,\n    random_state=42,\n    init_avg_features_num=2,\n    init_std_features_num=1,\n)\n>>> selector = selector.fit(X, y)\n>>> selector.support_\narray([ True, True, True, True, True, False, False, False, False,\n False])\n
Source code in felimination/ga.py
def __init__(\n self,\n estimator: BaseEstimator | LogisticRegression,\n *,\n cv=5,\n scoring=None,\n random_state=None,\n n_jobs=None,\n importance_getter=\"auto\",\n min_n_features_to_select=1,\n init_avg_features_num=15,\n init_std_features_num=5,\n pool_size=20,\n is_parent_selection_chance_proportional_to_fitness=True,\n n_children_cross_over=5,\n n_parents_cross_over=2,\n n_mutations=5,\n range_change_n_features_mutation=(-2, 3),\n range_randomly_swapped_features_mutation=(1, 4),\n max_generations=100,\n patience=5,\n callbacks=None,\n fitness_function=rank_mean_test_score_overfit_fitness,\n) -> None:\n self.estimator = estimator\n self.cv = cv\n self.scoring = scoring\n self.random_state = random_state\n self.n_jobs = n_jobs\n self.importance_getter = importance_getter\n self.min_n_features_to_select = min_n_features_to_select\n self.init_avg_features_num = init_avg_features_num\n self.init_std_features_num = init_std_features_num\n self.pool_size = pool_size\n self.n_children_cross_over = n_children_cross_over\n self.is_parent_selection_chance_proportional_to_fitness = (\n is_parent_selection_chance_proportional_to_fitness\n )\n self.n_parents_cross_over = n_parents_cross_over\n self.n_mutations = n_mutations\n self.range_change_n_features_mutation = range_change_n_features_mutation\n self.range_randomly_swapped_features_mutation = (\n range_randomly_swapped_features_mutation\n )\n self.max_generations = max_generations\n self.patience = patience\n self.callbacks = callbacks\n self.fitness_function = fitness_function\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.decision_function","title":"decision_function(X)
","text":"Compute the decision function of X
.
Parameters:
Returns:
Source code in felimination/ga.py
@available_if(_estimator_has(\"decision_function\"))\ndef decision_function(self, X):\n \"\"\"Compute the decision function of ``X``.\n\n Parameters\n ----------\n X : {array-like or sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n score : array, shape = [n_samples, n_classes] or [n_samples]\n The decision function of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification produce an array of shape\n [n_samples].\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.decision_function(self.transform(X))\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.fit","title":"fit(X, y, groups=None, **fit_params)
","text":"Fit the selector and then the underlying estimator on the selected features.
Parameters:
Returns:
-
self
( object
) \u2013 Fitted estimator.
Source code in felimination/ga.py
def fit(self, X, y, groups=None, **fit_params):\n \"\"\"Fit the selector and then the underlying estimator on the selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n y : array-like of shape (n_samples,)\n The target values.\n **fit_params : dict\n Additional parameters passed to the `fit` method of the underlying\n estimator.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._validate_params()\n tags = self._get_tags()\n self._validate_data(\n X,\n y,\n accept_sparse=\"csc\",\n ensure_min_features=2,\n force_all_finite=not tags.get(\"allow_nan\", True),\n multi_output=True,\n dtype=None,\n )\n\n # Initialization\n cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n scorer = check_scoring(self.estimator, scoring=self.scoring)\n n_features = X.shape[1]\n if self.min_n_features_to_select is None:\n min_n_features_to_select = n_features // 2\n elif isinstance(self.min_n_features_to_select, Integral): # int\n min_n_features_to_select = self.min_n_features_to_select\n else: # float\n min_n_features_to_select = int(n_features * self.min_n_features_to_select)\n\n if isinstance(X, pd.DataFrame):\n all_features = X.columns.to_list()\n else:\n all_features = list(range(n_features))\n\n np.random.seed(self.random_state)\n\n # Create the initial pool of solutions\n pool = [\n {\n \"features\": list(\n np.random.choice(\n all_features,\n min(\n max(\n int(\n np.random.normal(\n self.init_avg_features_num,\n self.init_std_features_num,\n )\n ),\n min_n_features_to_select,\n ),\n n_features,\n ),\n replace=False,\n )\n ),\n }\n for _ in range(self.pool_size)\n ]\n\n # Evaluate the initial pool of solutions\n pool = self._evaluate_calculate_importances(\n pool, X, y, groups, cv, scorer, **fit_params\n )\n self.best_solutions_ = []\n for _ in range(1, self.max_generations):\n children = self._cross_over(pool)\n children = self._evaluate_calculate_importances(\n children, X, y, groups, cv, scorer, **fit_params\n )\n pool.extend(children)\n mutations = self._mutate(pool, all_features)\n mutations = self._evaluate_calculate_importances(\n mutations, X, y, groups, cv, scorer, **fit_params\n )\n pool.extend(mutations)\n pool_sorted = [\n element\n for _, element in sorted(\n zip(self._calculate_fitness(pool), pool),\n reverse=True,\n key=itemgetter(0),\n )\n ]\n pool = pool_sorted[: self.pool_size]\n self.best_solutions_.append(pool[0])\n\n if self.callbacks:\n for callback in self.callbacks:\n callback(self, pool)\n\n if len(self.best_solutions_) > self.patience:\n if all(\n [\n self.best_solutions_[-1][\"features\"] == solution[\"features\"]\n for solution in self.best_solutions_[-self.patience :]\n ]\n ):\n break\n\n self.estimator_ = clone(self.estimator)\n X_remaining_features = _select_X_with_features(\n X, self.best_solution_[\"features\"]\n )\n self.estimator_.fit(X_remaining_features, y, **fit_params)\n self.support_ = np.array(\n [\n True if feature in self.best_solution_[\"features\"] else False\n for feature in all_features\n ]\n )\n\n return self\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.plot","title":"plot(**kwargs)
","text":"Plot the mean test score and mean train score of the best solution at each generation.
Parameters:
Returns:
Source code in felimination/ga.py
def plot(self, **kwargs):\n \"\"\"Plot the mean test score and mean train score of the best solution at each generation.\n\n Parameters\n ----------\n **kwargs : dict\n Additional parameters passed to seaborn.lineplot. For a list\n of possible options, please visit\n [seaborn.lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html) # noqa\n\n Returns\n -------\n matplotlib.axes.Axes\n The axis where the plot has been plotted.\n \"\"\"\n data_points_to_plot_long_form = []\n for generation, best_solution in enumerate(self.best_solutions_, start=1):\n for set, scores in zip(\n [\"validation\", \"train\"],\n [\n best_solution[\"test_scores_per_fold\"],\n best_solution[\"train_scores_per_fold\"],\n ],\n ):\n for score in scores:\n data_points_to_plot_long_form.append(\n {\"generation\": generation, \"score\": score, \"set\": set}\n )\n df_plot = pd.DataFrame(data_points_to_plot_long_form)\n lineplot_kwargs = dict(\n x=\"generation\",\n y=\"score\",\n hue=\"set\",\n markers=True,\n style=\"set\",\n hue_order=[\"validation\", \"train\"],\n style_order=[\"validation\", \"train\"],\n seed=self.random_state,\n )\n lineplot_kwargs.update(**kwargs)\n return sns.lineplot(data=df_plot, **lineplot_kwargs)\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.predict","title":"predict(X)
","text":"Reduce X to the selected features and predict using the estimator.
Parameters:
-
X
(array of shape [n_samples, n_features]
) \u2013 The input samples.
Returns:
Source code in felimination/ga.py
@available_if(_estimator_has(\"predict\"))\ndef predict(self, X):\n \"\"\"Reduce X to the selected features and predict using the estimator.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n y : array of shape [n_samples]\n The predicted target values.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.predict(self.transform(X))\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.predict_log_proba","title":"predict_log_proba(X)
","text":"Predict class log-probabilities for X.
Parameters:
-
X
(array of shape [n_samples, n_features]
) \u2013 The input samples.
Returns:
Source code in felimination/ga.py
@available_if(_estimator_has(\"predict_log_proba\"))\ndef predict_log_proba(self, X):\n \"\"\"Predict class log-probabilities for X.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n p : array of shape (n_samples, n_classes)\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.predict_log_proba(self.transform(X))\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.predict_proba","title":"predict_proba(X)
","text":"Predict class probabilities for X.
Parameters:
Returns:
Source code in felimination/ga.py
@available_if(_estimator_has(\"predict_proba\"))\ndef predict_proba(self, X):\n \"\"\"Predict class probabilities for X.\n\n Parameters\n ----------\n X : {array-like or sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n p : array of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.predict_proba(self.transform(X))\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.HybridImportanceGACVFeatureSelector.score","title":"score(X, y, **fit_params)
","text":"Reduce X to the selected features and return the score of the estimator.
Parameters:
Returns:
Source code in felimination/ga.py
@available_if(_estimator_has(\"score\"))\ndef score(self, X, y, **fit_params):\n \"\"\"Reduce X to the selected features and return the score of the estimator.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n y : array of shape [n_samples]\n The target values.\n\n **fit_params : dict\n Parameters to pass to the `score` method of the underlying\n estimator.\n\n .. versionadded:: 1.0\n\n Returns\n -------\n score : float\n Score of the underlying base estimator computed with the selected\n features returned by `rfe.transform(X)` and `y`.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.score(self.transform(X), y, **fit_params)\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.rank_mean_test_score_fitness","title":"rank_mean_test_score_fitness(pool)
","text":"Define the fitness function as the rank of the mean test score.
The rank of the mean test score is calculated by ranking the mean test score in ascending order.
Parameters:
Returns:
Source code in felimination/ga.py
def rank_mean_test_score_fitness(pool):\n \"\"\"Define the fitness function as the rank of the mean test score.\n\n The rank of the mean test score is calculated by ranking the mean test score in ascending order.\n\n Parameters\n ----------\n\n pool : list of dict\n Each element in the list is a dictionary with the following keys:\n - features: list of int\n The features selected for this element.\n - mean_test_score: float\n The mean test score of the element.\n - mean_train_score: float\n The mean train score of the element.\n\n Returns\n -------\n fitness : list of float\n The fitness of each element in the pool.\n \"\"\"\n pool_df = pd.DataFrame(pool)\n pool_df[\"rank_mean_test_score\"] = pool_df[\"mean_test_score\"].rank(ascending=True)\n return pool_df[\"rank_mean_test_score\"].to_list()\n
"},{"location":"reference/genetic_algorithms/#felimination.ga.rank_mean_test_score_overfit_fitness","title":"rank_mean_test_score_overfit_fitness(pool)
","text":"Define the fitness function as the sum of the rank of the mean test score and the rank of the overfit.
The overfit is calculated as the difference between the mean train score and the mean test score. Solutions are ranked twice: by mean test score (higher is better) and by overfit (lower is better). The two ranks are summed, and the fitness is the rank of this sum, so that the highest fitness goes to solutions that combine a high test score with little overfit.
Parameters:
Returns:
Source code in felimination/ga.py
def rank_mean_test_score_overfit_fitness(pool):\n \"\"\"Define the fitness function as the sum of the rank of the mean test score and the rank of the\n overfit.\n\n The rank of the mean test score is calculated by ranking the mean test score in ascending order.\n The rank of the overfit is calculated by ranking the overfit in ascending order.\n The overfit is calculated as the difference between the mean train score and the mean test score.\n The fitness is the sum of the rank of the mean test score and the rank of the overfit.\n\n Parameters\n ----------\n pool : list of dict\n Each element in the list is a dictionary with the following keys:\n - features: list of int\n The features selected for this element.\n - mean_test_score: float\n The mean test score of the element.\n - mean_train_score: float\n The mean train score of the element.\n\n Returns\n -------\n fitness : list of float\n The fitness of each element in the pool.\n \"\"\"\n\n pool_df = pd.DataFrame(pool)\n pool_df[\"rank_mean_test_score\"] = pool_df[\"mean_test_score\"].rank(ascending=False)\n pool_df[\"overfit\"] = pool_df[\"mean_train_score\"] - pool_df[\"mean_test_score\"]\n pool_df[\"rank_overfit\"] = pool_df[\"overfit\"].rank(ascending=True)\n pool_df[\"rank_sum\"] = pool_df[\"rank_mean_test_score\"] + pool_df[\"rank_overfit\"]\n\n pool_df[\"rank_sum_rank\"] = pool_df[\"rank_sum\"].rank(ascending=False)\n return pool_df[\"rank_sum_rank\"].to_list()\n
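For illustration, a custom fitness function following this contract could look like the sketch below; the penalty on the number of selected features is an arbitrary example choice, not part of the library.
from felimination.ga import HybridImportanceGACVFeatureSelector\nfrom sklearn.linear_model import LogisticRegression\n\n\ndef parsimonious_fitness(pool):\n    \"\"\"Fitness: mean test score with a small penalty per selected feature.\"\"\"\n    return [\n        element[\"mean_test_score\"] - 0.001 * len(element[\"features\"])\n        for element in pool\n    ]\n\n\nselector = HybridImportanceGACVFeatureSelector(\n    LogisticRegression(),\n    fitness_function=parsimonious_fitness,\n)\n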
"},{"location":"reference/importance/","title":"Importance","text":""},{"location":"reference/importance/#felimination.importance.PermutationImportance","title":"PermutationImportance(scoring=None, n_repeats=5, n_jobs=None, random_state=None, sample_weight=None, max_samples=1.0)
","text":"Wrapper around sklearn.inspection.permutation_importance.
Parameters:
-
scoring
(str, callable, list, tuple, or dict
, default: None
) \u2013 Scorer to use. If scoring
represents a single score, one can use: - a single string; - a callable that returns a single value. If scoring
represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric names and the values are the metric scores; - a dictionary with metric names as keys and callables as values. Passing multiple scores to scoring
is more efficient than calling permutation_importance
for each of the scores as it reuses predictions to avoid redundant computation. If None, the estimator's default scorer is used.
-
n_repeats
(int
, default: 5
) \u2013 Number of times to permute a feature.
-
n_jobs
(int or None
, default: None
) \u2013 Number of jobs to run in parallel. The computation is done by computing permutation score for each columns and parallelized over the columns. None
means 1 unless in a :obj:joblib.parallel_backend
context. -1
means using all processors.
-
random_state
(int, RandomState instance
, default: None
) \u2013 Pseudo-random number generator to control the permutations of each feature. Pass an int to get reproducible results across function calls.
-
sample_weight
(array-like of shape (n_samples,)
, default: None
) \u2013 Sample weights used in scoring.
-
max_samples
(int or float
, default: 1.0
) \u2013 The number of samples to draw from X to compute feature importance in each repeat (without replacement). - If int, then draw max_samples
samples. - If float, then draw max_samples * X.shape[0]
samples. - If max_samples
is equal to 1.0
or X.shape[0]
, all samples will be used. While using this option may provide less accurate importance estimates, it keeps the method tractable when evaluating feature importance on large datasets. In combination with n_repeats
, this allows to control the computational speed vs statistical accuracy trade-off of this method.
Source code in felimination/importance.py
def __init__(\n self,\n scoring=None,\n n_repeats=5,\n n_jobs=None,\n random_state=None,\n sample_weight=None,\n max_samples=1.0,\n):\n self.scoring = scoring\n self.n_repeats = n_repeats\n self.n_jobs = n_jobs\n self.random_state = random_state\n self.sample_weight = sample_weight\n self.max_samples = max_samples\n
"},{"location":"reference/importance/#felimination.importance.PermutationImportance.__call__","title":"__call__(estimator, X, y)
","text":"Computes the permutation importance.
Parameters:
-
estimator
(object
) \u2013 An estimator that has already been fitted and is compatible with scorer.
-
X
((ndarray or DataFrame, shape(n_samples, n_features))
) \u2013 Data on which permutation importance will be computed.
-
y
((array - like or None, shape(n_samples) or (n_samples, n_classes))
) \u2013 Targets for supervised or None
for unsupervised.
Returns:
Source code in felimination/importance.py
def __call__(self, estimator, X, y) -> Any:\n \"\"\"Computes the permutation importance.\n\n Parameters\n ----------\n estimator : object\n An estimator that has already been fitted and is compatible\n with scorer.\n X : ndarray or DataFrame, shape (n_samples, n_features)\n Data on which permutation importance will be computed.\n y : array-like or None, shape (n_samples, ) or (n_samples, n_classes)\n Targets for supervised or `None` for unsupervised.\n\n Returns\n -------\n importances_mean : ndarray of shape (n_features, )\n Mean of feature importance over `n_repeats`.\n \"\"\"\n return permutation_importance(\n estimator,\n X,\n y,\n scoring=self.scoring,\n n_repeats=self.n_repeats,\n n_jobs=self.n_jobs,\n random_state=self.random_state,\n sample_weight=self.sample_weight,\n max_samples=self.max_samples,\n ).importances_mean\n
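A minimal usage sketch based on the call signature documented above (the dataset and the model are illustrative):
from felimination.importance import PermutationImportance\nfrom sklearn.datasets import make_classification\nfrom sklearn.linear_model import LogisticRegression\n\nX, y = make_classification(n_samples=500, n_features=10, random_state=42)\nmodel = LogisticRegression().fit(X, y)\nimportance_getter = PermutationImportance(scoring=\"roc_auc\", n_repeats=5, random_state=42)\n# Mean importance of each feature over the permutation repeats\nprint(importance_getter(model, X, y))\n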
"},{"location":"tutorials/genetic_algorithms_x_feature_selection/","title":"Genetic Algorithms x Feature Selection","text":"In\u00a0[\u00a0]: Copied! # Install felimination\n! pip install felimination\n
# Install felimination ! pip install felimination In\u00a0[2]: Copied! from sklearn.datasets import make_classification\n\nX, y = make_classification(\n n_samples=1000,\n n_features=200,\n n_informative=6,\n n_redundant=10,\n n_clusters_per_class=1,\n random_state=42,\n shuffle=False\n)\n
from sklearn.datasets import make_classification X, y = make_classification( n_samples=1000, n_features=200, n_informative=6, n_redundant=10, n_clusters_per_class=1, random_state=42, shuffle=False ) In\u00a0[3]: Copied! from sklearn.model_selection import cross_validate, StratifiedKFold\nfrom sklearn.linear_model import LogisticRegression\n\n\n# Define a simple logistic regression model\nmodel = LogisticRegression(random_state=42)\n\n# Perform cross-validation\ncv_results = cross_validate(\n model,\n X,\n y,\n cv=StratifiedKFold(random_state=42, shuffle=True),\n scoring=\"roc_auc\",\n return_train_score=True,\n)\n\ncv_results[\"test_score\"].mean()\n
from sklearn.model_selection import cross_validate, StratifiedKFold from sklearn.linear_model import LogisticRegression # Define a simple logistic regression model model = LogisticRegression(random_state=42) # Perform cross-validation cv_results = cross_validate( model, X, y, cv=StratifiedKFold(random_state=42, shuffle=True), scoring=\"roc_auc\", return_train_score=True, ) cv_results[\"test_score\"].mean() Out[3]: 0.8561362716271628
In\u00a0[4]: Copied! from felimination.ga import HybridImportanceGACVFeatureSelector\nfrom felimination.callbacks import plot_progress_callback\n\n\nselector = HybridImportanceGACVFeatureSelector(\n model,\n callbacks=[plot_progress_callback],\n scoring=\"roc_auc\",\n cv=StratifiedKFold(random_state=42, shuffle=True),\n init_avg_features_num=5,\n min_n_features_to_select=3,\n pool_size=20,\n n_children_cross_over=20,\n n_mutations=20,\n random_state=42,\n)\nselector.fit(X, y)\n
from felimination.ga import HybridImportanceGACVFeatureSelector from felimination.callbacks import plot_progress_callback selector = HybridImportanceGACVFeatureSelector( model, callbacks=[plot_progress_callback], scoring=\"roc_auc\", cv=StratifiedKFold(random_state=42, shuffle=True), init_avg_features_num=5, min_n_features_to_select=3, pool_size=20, n_children_cross_over=20, n_mutations=20, random_state=42, ) selector.fit(X, y) Out[4]: HybridImportanceGACVFeatureSelector(callbacks=[<function plot_progress_callback at 0x31aaa4fe0>],\n cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n estimator=LogisticRegression(random_state=42),\n init_avg_features_num=5,\n min_n_features_to_select=3,\n n_children_cross_over=20, n_mutations=20,\n random_state=42, scoring='roc_auc')
Notice how model performance increases with the progressive elimination of features.
This is because models with many non-predictive features tend to find patterns even in random noise and end up overfitting; see how the train score and the validation score get closer as features are progressively eliminated.
In\u00a0[5]: Copied! sorted(selector.best_solution_['features'])\n
sorted(selector.best_solution_['features']) Out[5]: [6, 10, 82, 93, 168]
The features with index <= 15 are relevant, the others are random noise. We see that some of the relevant features are being selected, together with a few noise features. Nevertheless, we got a good improvement in AUC score:
In\u00a0[6]: Copied! selector.best_solution_['mean_test_score']\n
selector.best_solution_['mean_test_score'] Out[6]: 0.9197176917691768
The best AUC score obtained with feature elimination is now 0.92, that's ~0.06 AUC points gained by removing useless features.
In\u00a0[8]: Copied! selector.transform(X).shape\n
selector.transform(X).shape Out[8]: (1000, 5)
"},{"location":"tutorials/genetic_algorithms_x_feature_selection/#genetic-algorithms-x-feature-selection","title":"Genetic Algorithms x Feature Selection\u00b6","text":"This tutorial will show an example of how we can use genetic algorithms applied to feature selection to improve our model performances.
More specifically, this tutorial will illustrate how to perform feature selection using a genetic algorithm, as implemented in the class felimination.ga.HybridImportanceGACVFeatureSelector
"},{"location":"tutorials/genetic_algorithms_x_feature_selection/#create-a-dummy-dataset","title":"Create a dummy Dataset\u00b6","text":"For this tutorial we will use a dummy classification dataset created using sklearn.datasets.make_classification
. For this dataset we will have 6
predictive features, 10
redundant and 184
random features.
"},{"location":"tutorials/genetic_algorithms_x_feature_selection/#evaluate-performances-without-feature-elimination","title":"Evaluate performances without feature elimination\u00b6","text":""},{"location":"tutorials/genetic_algorithms_x_feature_selection/#perform-now-feature-elimination","title":"Perform now feature elimination\u00b6","text":""},{"location":"tutorials/recursive_feature_elimination/","title":"Recursive Feature Elimination (RFE)","text":"In\u00a0[\u00a0]: Copied! # Install felimination\n! pip install felimination\n
# Install felimination ! pip install felimination In\u00a0[2]: Copied! from sklearn.datasets import make_classification\n\nX, y = make_classification(\n n_samples=1000,\n n_features=200,\n n_informative=6,\n n_redundant=10,\n n_clusters_per_class=1,\n random_state=42,\n shuffle=False\n)\n
from sklearn.datasets import make_classification X, y = make_classification( n_samples=1000, n_features=200, n_informative=6, n_redundant=10, n_clusters_per_class=1, random_state=42, shuffle=False ) In\u00a0[3]: Copied! from sklearn.model_selection import cross_validate, StratifiedKFold\nfrom sklearn.linear_model import LogisticRegression\n\n\n# Define a simple logistic regression model\nmodel = LogisticRegression(random_state=42)\n\n# Perform cross-validation\ncv_results = cross_validate(\n model,\n X,\n y,\n cv=StratifiedKFold(random_state=42, shuffle=True),\n scoring=\"roc_auc\",\n return_train_score=True,\n)\n\ncv_results[\"test_score\"].mean()\n
from sklearn.model_selection import cross_validate, StratifiedKFold from sklearn.linear_model import LogisticRegression # Define a simple logistic regression model model = LogisticRegression(random_state=42) # Perform cross-validation cv_results = cross_validate( model, X, y, cv=StratifiedKFold(random_state=42, shuffle=True), scoring=\"roc_auc\", return_train_score=True, ) cv_results[\"test_score\"].mean() Out[3]: 0.8561362716271628
In\u00a0[4]: Copied! from felimination.rfe import PermutationImportanceRFECV\nfrom felimination.callbacks import plot_progress_callback\n\n\nselector = PermutationImportanceRFECV(\n model,\n step=0.2,\n callbacks=[plot_progress_callback],\n scoring=\"roc_auc\",\n cv=StratifiedKFold(random_state=42, shuffle=True),\n)\nselector.fit(X, y)\n
from felimination.rfe import PermutationImportanceRFECV from felimination.callbacks import plot_progress_callback selector = PermutationImportanceRFECV( model, step=0.2, callbacks=[plot_progress_callback], scoring=\"roc_auc\", cv=StratifiedKFold(random_state=42, shuffle=True), ) selector.fit(X, y) Out[4]: PermutationImportanceRFECV(callbacks=[<function plot_progress_callback at 0x103583d80>],\n cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n estimator=LogisticRegression(random_state=42),\n scoring='roc_auc', step=0.2)
Notice how model performance increases with the progressive elimination of features.
This is because models with many non-predictive features tend to find patterns even in random noise and end up overfitting; see how the train score and the validation score get closer as features are progressively eliminated.
In\u00a0[5]: Copied! import pandas as pd\n\ncv_results_df = pd.DataFrame(selector.cv_results_)\n\ncv_results_df[[\"mean_test_score\", \"n_features\"]].sort_values(\n \"mean_test_score\", ascending=False\n).head(10)\n
import pandas as pd cv_results_df = pd.DataFrame(selector.cv_results_) cv_results_df[[\"mean_test_score\", \"n_features\"]].sort_values( \"mean_test_score\", ascending=False ).head(10) Out[5]:\n    mean_test_score  n_features\n7          0.944138          44\n6          0.943558          54\n8          0.943018          36\n9          0.942478          29\n5          0.942438          67\n4          0.942058          83\n10         0.939718          24\n11         0.937578          20\n12         0.935838          16\n13         0.935698          13\nThe best AUC score obtained with feature elimination is now 0.94, that's 0.08 AUC points gained from fewer features.
If I had to choose a number of features, I would probably go for 13, because there the validation score is very close to the train score.
We can do this using the method set_n_features_to_select
. This will change the support of the selector as well as the behavior of the transform
method.
In\u00a0[6]: Copied! selector.set_n_features_to_select(13)\nselector.transform(X).shape\n
selector.set_n_features_to_select(13) selector.transform(X).shape Out[6]: (1000, 13)
In\u00a0[7]: Copied! import numpy as np\n\n# Show the index of the selected features, index <= 15 are relevant\nnp.arange(0, X.shape[1])[selector.support_]\n
import numpy as np # Show the index of the selected features, index <= 15 are relevant np.arange(0, X.shape[1])[selector.support_] Out[7]: array([ 1, 2, 3, 7, 8, 9, 10, 69, 80, 82, 155, 186, 197])
We can see from the indices of the selected features that most of them are informative (index <= 15), while some random features are still being selected; some of the selected features are also still redundant.
"},{"location":"tutorials/recursive_feature_elimination/#recursive-feature-elimination-rfe","title":"Recursive Feature Elimination (RFE)\u00b6","text":"This tutorial will show an example of how we can use recursive feature elimination to improve our model performances. More specifically, this tutorial will illustrate how to perform backward recursive feature elimination based on permutation importance using the class felimination.rfe.PermutationImportanceRFECV
"},{"location":"tutorials/recursive_feature_elimination/#create-a-dummy-dataset","title":"Create a dummy Dataset\u00b6","text":"For this tutorial we will use a dummy classification dataset created using sklearn.datasets.make_classification
. For this dataset we will have 6
predictive features, 10
redundant and 184
random features.
"},{"location":"tutorials/recursive_feature_elimination/#evaluate-performances-without-feature-elimination","title":"Evaluate performances without feature elimination\u00b6","text":""},{"location":"tutorials/recursive_feature_elimination/#perform-now-feature-elimination","title":"Perform now feature elimination\u00b6","text":""}]}
\ No newline at end of file
diff --git a/sitemap.xml b/sitemap.xml
index f2d8dde..a795e62 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -10,6 +10,11 @@
2024-07-31
daily
+
+ https://claudiosalvatorearcidiacono.github.io/felimination/reference/callbacks/
+ 2024-07-31
+ daily
+
https://claudiosalvatorearcidiacono.github.io/felimination/reference/drift/
2024-07-31
diff --git a/sitemap.xml.gz b/sitemap.xml.gz
index b06abde..b755ec8 100644
Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ
diff --git a/tutorials/genetic_algorithms_x_feature_selection/index.html b/tutorials/genetic_algorithms_x_feature_selection/index.html
index 79247df..ddfad87 100644
--- a/tutorials/genetic_algorithms_x_feature_selection/index.html
+++ b/tutorials/genetic_algorithms_x_feature_selection/index.html
@@ -375,6 +375,27 @@
+
+
+
+
+
+ Callbacks
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tutorials/recursive_feature_elimination/index.html b/tutorials/recursive_feature_elimination/index.html
index b9cebcf..e57d8a5 100644
--- a/tutorials/recursive_feature_elimination/index.html
+++ b/tutorials/recursive_feature_elimination/index.html
@@ -373,6 +373,27 @@
+
+
+
+
+
+ Callbacks
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+