The file ml-helpper.py contains all the utility functions for the Machine Learning (Aprendizaje Automático) course.
The Preprocessing package organizes the functions from ml-helpper.py as follows:
Preprocessing\datachecking.py
- missig_values
- outliers
Preprocessing\discretization.py
- discretize
Preprocessing\featureselection.py
- select_features
- get_score_2_features_subset
Preprocessing\featureextraction.py
- get_PCA
Preprocessing\unbalnced.py
OverSampling
- Borderline_SMOTE
- SMOTE
- ADASYN
- Random
UnderSampling
- NearMiss
- ENN
- Tomeks_Link
- Random
Missing Values: Remove NaN
import pandas as pd
from Preprocessing.datachecking import DataChecking
nan_df = pd.read_csv("notebooks/nan_data.csv")
# Remove missing values; modifies nan_df in place
print(len(nan_df))  # row count before
DataChecking.missig_values(nan_df, action="remove", debug=True)
print(len(nan_df))  # row count after
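For reference, action="remove" presumably drops the rows that contain NaN (the example tracks the row count before and after); plain pandas does the same directly. A minimal standalone sketch:
import pandas as pd
nan_df = pd.read_csv("notebooks/nan_data.csv")
nan_df.dropna(inplace=True)  # drop every row with at least one NaN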
Missing Values: Impute NaN
import pandas as pd
from Preprocessing.datachecking import DataChecking
nan_df = pd.read_csv("notebooks/nan_data.csv")
# Impute missing values; modifies nan_df in place
print(len(nan_df))  # row count before
DataChecking.missig_values(nan_df, action="impute", debug=True)
print(len(nan_df))  # row count is unchanged: values are filled in, not dropped
Missing Values: Impute NaN using the median
import pandas as pd
from Preprocessing.datachecking import DataChecking
nan_df = pd.read_csv("notebooks/nan_data.csv")
# Impute missing values with the median; modifies nan_df in place
print(len(nan_df))
DataChecking.missig_values(nan_df, action="impute", debug=True, imputer="median")
print(len(nan_df))
Missing Values: Impute NaN using a constant (for example, 100)
import pandas as pd
from Preprocessing.datachecking import DataChecking
nan_df = pd.read_csv("notebooks/nan_data.csv")
# Impute missing values with a constant; modifies nan_df in place
print(len(nan_df))
DataChecking.missig_values(nan_df, action="impute", debug=True, imputer="constant", fill_value=100)
print(len(nan_df))
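The imputer and fill_value options mirror scikit-learn's SimpleImputer, which the helper presumably wraps; a standalone sketch of the same three imputations, assuming all columns are numeric:
import pandas as pd
from sklearn.impute import SimpleImputer
nan_df = pd.read_csv("notebooks/nan_data.csv")
mean_imputer = SimpleImputer(strategy="mean")                          # the likely default
median_imputer = SimpleImputer(strategy="median")                      # imputer="median"
constant_imputer = SimpleImputer(strategy="constant", fill_value=100)  # imputer="constant"
filled = pd.DataFrame(mean_imputer.fit_transform(nan_df), columns=nan_df.columns)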
Outliers: Handle individually
import pandas as pd
from Preprocessing.datachecking import DataChecking
df = pd.read_csv("notebooks/iris_data.csv")
# Positions of all outliers; modifies df in place
outlier_positions = DataChecking.outliers(df, action="individual", debug=True, remove=True)
print(outlier_positions)
df.describe()
# describe() now shows 4 fewer values, corresponding to the removed outliers
Outliers: Handle collectively
import pandas as pd
from Preprocessing.datachecking import DataChecking
df = pd.read_csv("notebooks/iris_data.csv")
# Positions of all outliers; modifies df in place
outlier_positions = DataChecking.outliers(df, action="colective", debug=True, remove=True)
print(outlier_positions)
df.describe()
# describe() now shows 15 fewer values, corresponding to the removed outliers
Outliers: Handle in parallel
import pandas as pd
from Preprocessing.datachecking import DataChecking
df = pd.read_csv("notebooks/iris_data.csv")
# Positions of all outliers; modifies df in place
outlier_positions = DataChecking.outliers(df, action="parallel", debug=True, remove=True)
print(outlier_positions)
df.describe()
# describe() now shows 19 fewer values (4 individual + 15 collective), corresponding to the removed outliers
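The section does not state which criterion outliers() applies; a common per-feature rule for individual outliers is Tukey's IQR fence. A minimal sketch, assuming that rule:
import pandas as pd
df = pd.read_csv("notebooks/iris_data.csv")
numeric = df.select_dtypes(include="number")
q1, q3 = numeric.quantile(0.25), numeric.quantile(0.75)
iqr = q3 - q1
# flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
mask = (numeric < q1 - 1.5 * iqr) | (numeric > q3 + 1.5 * iqr)
print(mask.sum())  # number of flagged values per column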
Discretize X
import pandas as pd
from Preprocessing.discretization import Discretization
from sklearn.preprocessing import KBinsDiscretizer
df = pd.read_csv("notebooks/iris_data.csv")
k3frequency = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
# Discretize with bins = 3 and strategy = quantile (equal-frequency bins)
df_discretized = Discretization.discretize(df, k3frequency)
print(df_discretized)
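Since discretize receives the configured KBinsDiscretizer itself, other scikit-learn binning strategies can presumably be swapped in:
import pandas as pd
from Preprocessing.discretization import Discretization
from sklearn.preprocessing import KBinsDiscretizer
df = pd.read_csv("notebooks/iris_data.csv")
k3width = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')  # equal-width bins
k3kmeans = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')  # bins from 1D k-means
df_discretized = Discretization.discretize(df, k3width)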
Select the most relevant features. Internally it uses ExtraTreesClassifier (https://scikit-learn.org/stable/modules/feature_selection.html).
import pandas as pd
from Preprocessing.featureselection import FeatureSelection
df = pd.read_csv("notebooks/iris_data.csv")
selected_features = FeatureSelection.select_features(df)
print(selected_features)
# The selected features are: ['petal_length', 'petal_width']
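Continuing the example, and assuming select_features returns a list of column names (as the printed result suggests), the DataFrame can be projected onto them; the "species" label column is hypothetical here:
df_reduced = df[selected_features + ["species"]]  # keep the (hypothetical) label column
print(df_reduced.head())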
Obtain a DataFrame that captures 0.95 of the variance using PCA
import pandas as pd
from Preprocessing.featureextraction import FeatureExtraction
df = pd.read_csv("notebooks/Thyroids.csv")
df_pca = FeatureExtraction.get_PCA(df, variance=0.95)
print(df_pca)
# It uses 4 principal components
Obtain a DataFrame with 40 principal components
import pandas as pd
from Preprocessing.featureextraction import FeatureExtraction
df = pd.read_csv("notebooks/Thyroids.csv")
df_pca = FeatureExtraction.get_PCA(df, variance=40)
print(df_pca)
# It uses 40 principal components
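The variance argument behaves like n_components in scikit-learn's PCA, which get_PCA presumably wraps: a float in (0, 1) sets a variance target, while an integer fixes the number of components. A standalone sketch, assuming the last column of Thyroids.csv is the class label:
import pandas as pd
from sklearn.decomposition import PCA
df = pd.read_csv("notebooks/Thyroids.csv")
X = df.iloc[:, :-1]  # assumption: last column is the class label
pca = PCA(n_components=0.95)  # or n_components=40 for a fixed component count
X_pca = pca.fit_transform(X)
print(pca.n_components_, pca.explained_variance_ratio_.sum())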
Obtain ALL the scores for every 2-feature subset of the DataFrame.
Note: get_score_2_features_subset uses Util.automatic_scoring, and automatic_scoring uses RandomForest. (Change automatic_scoring if you want to use a different classifier.)
import pandas as pd
from Preprocessing.featureselection import FeatureSelection
df = pd.read_csv("notebooks/iris_data.csv")
scores = FeatureSelection.get_score_2_features_subset(df)
print(scores)
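A rough sketch of what this presumably does internally, scoring each feature pair with a RandomForest; the cross-validation setup and the label-column position are assumptions:
import itertools
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
df = pd.read_csv("notebooks/iris_data.csv")
X, y = df.iloc[:, :-1], df.iloc[:, -1]  # assumption: last column is the class label
scores = {}
for pair in itertools.combinations(X.columns, 2):
    scores[pair] = cross_val_score(RandomForestClassifier(), X[list(pair)], y, cv=5).mean()
print(max(scores, key=scores.get))  # best-scoring pair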
Oversampling with SMOTE (uses k=5 by default)
import pandas as pd
from Preprocessing.unbalnced import Unbalanced
df = pd.read_csv("notebooks/Thyroids.csv")
df_oversampled = Unbalanced.OverSampling.SMOTE(df)
#df_oversampled = Unbalanced.OverSampling.SMOTE(df, k=5)
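These resampling wrappers presumably build on the imbalanced-learn package; a standalone sketch of the same oversampling, assuming the last column is the class label:
import pandas as pd
from imblearn.over_sampling import SMOTE
df = pd.read_csv("notebooks/Thyroids.csv")
X, y = df.iloc[:, :-1], df.iloc[:, -1]  # assumption: last column is the class label
X_res, y_res = SMOTE(k_neighbors=5).fit_resample(X, y)  # k_neighbors defaults to 5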
Oversampling with ADASYN (uses k=5 by default)
import pandas as pd
from Preprocessing.unbalnced import Unbalanced
df = pd.read_csv("notebooks/Thyroids.csv")
df_oversampled = Unbalanced.OverSampling.ADASYN(df)
#df_oversampled = Unbalanced.OverSampling.ADASYN(df, k=5)
Oversampling with Random
import pandas as pd
from Preprocessing.unbalnced import Unbalanced
df = pd.read_csv("notebooks/Thyroids.csv")
df_oversampled = Unbalanced.OverSampling.Random(df)
Oversampling with Borderline SMOTE version 1 (uses k=5 by default)
import pandas as pd
from Preprocessing.unbalnced import Unbalanced
df = pd.read_csv("notebooks/Thyroids.csv")
# Borderline_SMOTE has 2 versions: variant=1 and variant=2
df_oversampled = Unbalanced.OverSampling.Borderline_SMOTE(df, variant=1)
#df_oversampled = Unbalanced.OverSampling.Borderline_SMOTE(df, variant=1, k=5)
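In imbalanced-learn the two versions map to the kind parameter of BorderlineSMOTE; a hedged equivalent, under the same label-column assumption:
import pandas as pd
from imblearn.over_sampling import BorderlineSMOTE
df = pd.read_csv("notebooks/Thyroids.csv")
X, y = df.iloc[:, :-1], df.iloc[:, -1]  # assumption: last column is the class label
X_res, y_res = BorderlineSMOTE(kind="borderline-1").fit_resample(X, y)  # variant=2 -> kind="borderline-2"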
Undersampling with NearMiss 1
import pandas as pd
from Preprocessing.unbalnced import Unbalanced
df = pd.read_csv("notebooks/Thyroids.csv")
df_undersampled = Unbalanced.UnderSampling.NearMiss(df, variant=1)
Undersampling with NearMiss 2
import pandas as pd
from Preprocessing.unbalnced import Unbalanced
df = pd.read_csv("notebooks/Thyroids.csv")
df_undersampled = Unbalanced.UnderSampling.NearMiss(df, variant=2)
Undersampling with ENN
import pandas as pd
from Preprocessing.unbalnced import Unbalanced
df = pd.read_csv("notebooks/Thyroids.csv")
df_undersampled = Unbalanced.UnderSampling.ENN(df)
Undersampling with Tomek's Link
import pandas as pd
from Preprocessing.unbalnced import Unbalanced
df = pd.read_csv("notebooks/Thyroids.csv")
df_undersampled = Unbalanced.UnderSampling.Tomeks_Link(df)
Undersampling with Random
import pandas as pd
from Preprocessing.unbalnced import Unbalanced
df = pd.read_csv("notebooks/Thyroids.csv")
df_undersampled = Unbalanced.UnderSampling.Random(df)
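The undersamplers likewise have direct counterparts in imbalanced-learn (NearMiss, EditedNearestNeighbours, TomekLinks, RandomUnderSampler); a hedged sketch of two of them, under the same label-column assumption:
import pandas as pd
from imblearn.under_sampling import NearMiss, TomekLinks
df = pd.read_csv("notebooks/Thyroids.csv")
X, y = df.iloc[:, :-1], df.iloc[:, -1]  # assumption: last column is the class label
X_nm, y_nm = NearMiss(version=1).fit_resample(X, y)  # variant=1 -> version=1
X_tl, y_tl = TomekLinks().fit_resample(X, y)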
Oversampling (SMOTE) + undersampling (Tomek's Link)
import pandas as pd
from Preprocessing.unbalnced import Unbalanced
df = pd.read_csv("notebooks/Thyroids.csv")
df_oversampled = Unbalanced.OverSampling.SMOTE(df)
df_oversampled_undersampled = Unbalanced.UnderSampling.Tomeks_Link(df_oversampled)
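This SMOTE-then-Tomek's-Link combination is what imbalanced-learn packages as SMOTETomek; a one-step sketch under the same label-column assumption:
import pandas as pd
from imblearn.combine import SMOTETomek
df = pd.read_csv("notebooks/Thyroids.csv")
X, y = df.iloc[:, :-1], df.iloc[:, -1]  # assumption: last column is the class label
X_res, y_res = SMOTETomek().fit_resample(X, y)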