Skip to content

Commit

Permalink
Improved docs
Browse files Browse the repository at this point in the history
  • Loading branch information
sidchaini committed Oct 21, 2024
1 parent b0a013e commit cb918db
Showing 1 changed file with 96 additions and 24 deletions.
120 changes: 96 additions & 24 deletions distclassipy/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,8 +431,8 @@ def find_best_metrics(
y: np.ndarray,
feat_idx: int,
n_quantiles: int = 4,
metrics_to_consider: list = None,
) -> Tuple[pd.DataFrame, pd.Series]:
metrics_to_consider: list[str] = None,
) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
"""Evaluate and find the best distance metrics for a given feature.
This function evaluates different distance metrics to determine which
Expand All @@ -452,7 +452,7 @@ def find_best_metrics(
The index of the feature to be used for quantile splitting.
n_quantiles : int, default=4
The number of quantiles to split the data into.
metrics_to_consider : list, optional
metrics_to_consider : list of str, optional
A list of distance metrics to evaluate. If None, all available
metrics within DistClassiPy will be considered.
Expand All @@ -463,6 +463,8 @@ def find_best_metrics(
different quantiles.
best_metrics_per_quantile : pd.Series
A Series indicating the best-performing metric for each quantile.
group_bins : np.ndarray
The bins used for quantile splitting.
"""
X = check_array(X)
feature_labels = [f"Feature_{i}" for i in range(X.shape[1])]
Expand Down Expand Up @@ -501,15 +503,6 @@ def find_best_metrics(

best_metrics_per_quantile = quantile_scores_df.idxmax()

# todo for pred during best:
# loop through each metric, merge quantiles for each metric
# pred on this

# alt, but slower:
# loop through each quantile, and append pred
# group_bins = []
# for bins, group in grouped_test_data:
# group_bins.append(bins)
return quantile_scores_df, best_metrics_per_quantile, group_bins


Expand All @@ -522,32 +515,61 @@ def __init__(
scale: bool = True,
central_stat: str = "median",
dispersion_stat: str = "std",
metrics_to_consider: list = None,
metrics_to_consider: list[str] = None,
) -> None:
"""Initialize the classifier with specified parameters."""
"""Initialize the classifier with specified parameters.
Parameters
----------
feat_idx : int
The index of the feature to be used for quantile splitting.
scale : bool, default=True
Whether to scale the distance between the test object and the centroid.
central_stat : str, default="median"
The statistic used to calculate the central tendency of the data.
dispersion_stat : str, default="std"
The statistic used to calculate the dispersion of the data.
metrics_to_consider : list of str, optional
A list of distance metrics to evaluate. If None, all available
metrics within DistClassiPy will be considered.
"""
self.feat_idx = feat_idx
self.scale = scale
self.central_stat = central_stat
self.dispersion_stat = dispersion_stat
self.metrics_to_consider = metrics_to_consider
self.metrics_to_consider = metrics_to_consider or _ALL_METRICS

def fit(
self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
) -> "EnsembleDistanceMetricClassifier":
"""Fit the ensemble classifier using the best metrics for each quantile."""
"""Fit the ensemble classifier using the best metrics for each quantile.
Parameters
----------
X : np.ndarray
The input feature matrix.
y : np.ndarray
The target labels.
n_quantiles : int, default=4
The number of quantiles to split the data into.
Returns
-------
self : object
Fitted estimator.
"""
self.clf_ = DistanceMetricClassifier(
scale=self.scale,
central_stat=self.central_stat,
dispersion_stat=self.dispersion_stat,
)
(
self.quantile_scores_df_,
self.best_metrics_per_quantile_,
self.group_bins,
) = find_best_metrics(
self.clf_, X, y, self.feat_idx, n_quantiles, self.metrics_to_consider

# Find best metrics based on training set quantiles
self.quantile_scores_df_, self.best_metrics_per_quantile_, self.group_bins = (
self.evaluate_metrics(X, y, n_quantiles)
)
# To make sure the bins work with values outside of training data

# Ensure the bins work with values outside of training data
self.group_bins[0] = -np.inf
self.group_bins[-1] = np.inf

Expand All @@ -557,10 +579,28 @@ def fit(
return self

def predict(self, X: np.ndarray) -> np.ndarray:
"""Predict class labels using the best metric for each quantile."""
"""Predict class labels using the best metric for each quantile.
Parameters
----------
X : np.ndarray
The input samples.
Returns
-------
predictions : np.ndarray
The predicted class labels.
"""
check_is_fitted(self, "is_fitted_")
X = check_array(X)

# notes for pred during best:
# option 1:
# loop through each metric, merge quantiles for each metric
# pred on this
# option 2, easier, but slower:
# loop through each quantile, and append pred

quantiles = pd.cut(
X[:, self.feat_idx], bins=self.group_bins, labels=self.group_labels
)
Expand All @@ -572,3 +612,35 @@ def predict(self, X: np.ndarray) -> np.ndarray:
predictions[subdf.index] = preds

return predictions

def evaluate_metrics(
self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
"""Evaluate and find the best distance metrics for the specified feature.
This method uses the standalone `find_best_metrics` function to evaluate
different distance metrics and determine the best-performing ones for
each quantile.
Parameters
----------
X : np.ndarray
The input feature matrix.
y : np.ndarray
The target labels.
n_quantiles : int, default=4
The number of quantiles to split the data into.
Returns
-------
quantile_scores_df : pd.DataFrame
A DataFrame containing the accuracy scores for each metric across
different quantiles.
best_metrics_per_quantile : pd.Series
A Series indicating the best-performing metric for each quantile.
group_bins : np.ndarray
The bins used for quantile splitting.
"""
return find_best_metrics(
self.clf_, X, y, self.feat_idx, n_quantiles, self.metrics_to_consider
)

0 comments on commit cb918db

Please sign in to comment.