diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md new file mode 100644 index 0000000000..efdbb9dd40 --- /dev/null +++ b/docs/releases/unreleased.md @@ -0,0 +1,27 @@ +# Unreleased + +## cluster + +- Renamed the `sample_weight` parameter of `learn_one` and `predict_one` to `w`. + +## ensemble + +- Renamed the `sample_weight` parameter of `learn_one` to `w`. + +## facto + +- Renamed the `sample_weight` parameter of `learn_one` to `w`. + +## forest + +- Renamed the `sample_weight` parameter of `learn_one` to `w`. + +## metrics + +- Renamed the `sample_weight` parameter of `update` and `revert` to `w`. + +## tree + +- Renamed the `sample_weight` parameter of `learn_one` and of the splitters' `update` to `w`. + + diff --git a/river/cluster/dbstream.py b/river/cluster/dbstream.py index 10966aa284..88bdc0149f 100644 --- a/river/cluster/dbstream.py +++ b/river/cluster/dbstream.py @@ -389,7 +389,7 @@ def _recluster(self): self.clustering_is_up_to_date = True - def learn_one(self, x, sample_weight=None): + def learn_one(self, x, w=None): self._update(x) if self._time_stamp % self.cleanup_interval == 0: @@ -399,7 +399,7 @@ def learn_one(self, x, sample_weight=None): return self - def predict_one(self, x, sample_weight=None): + def predict_one(self, x, w=None): self._recluster() min_distance = math.inf diff --git a/river/cluster/denstream.py b/river/cluster/denstream.py index c6e16bbff2..e663cc1506 100644 --- a/river/cluster/denstream.py +++ b/river/cluster/denstream.py @@ -313,7 +313,7 @@ def _initial_dbscan(self): else: item.covered = False - def learn_one(self, x, sample_weight=None): + def learn_one(self, x, w=None): self._n_samples_seen += 1 # control the stream speed if self._n_samples_seen % self.stream_speed == 0: @@ -352,7 +352,7 @@ def learn_one(self, x, sample_weight=None): self.o_micro_clusters.pop(j) return self - def predict_one(self, x, sample_weight=None): + def predict_one(self, x, w=None): # This function handles the case when a clustering request arrives. # implementation of the DBSCAN algorithm proposed by Ester et al. if not self.initialized: diff --git a/river/cluster/streamkmeans.py b/river/cluster/streamkmeans.py index 083d24ae3e..76a02d9d45 100644 --- a/river/cluster/streamkmeans.py +++ b/river/cluster/streamkmeans.py @@ -84,7 +84,7 @@ def __init__(self, chunk_size=10, n_clusters=2, **kwargs): self._temp_chunk = {} self.centers = {} - def learn_one(self, x, sample_weight=None): + def learn_one(self, x, w=None): self.time_stamp += 1 index = self.time_stamp % self.chunk_size @@ -107,7 +107,7 @@ def learn_one(self, x, sample_weight=None): return self - def predict_one(self, x, sample_weight=None): + def predict_one(self, x, w=None): def get_distance(c): return utils.math.minkowski_distance(self.centers[c], x, 2) diff --git a/river/cluster/textclust.py b/river/cluster/textclust.py index aa42d81be1..0407991043 100644 --- a/river/cluster/textclust.py +++ b/river/cluster/textclust.py @@ -153,7 +153,7 @@ def __init__( self.micro_distance = self.distances(self.micro_distance) self.macro_distance = self.distances(self.macro_distance) - def learn_one(self, x, t=None, sample_weight=None): + def learn_one(self, x, t=None, w=None): localdict = {} for key in x.keys(): new_key = key @@ -213,7 +213,7 @@ def learn_one(self, x, t=None, sample_weight=None): ## predicts the cluster number.
The type specifies whether this should happen on micro-cluster ## or macro-cluster level - def predict_one(self, x, sample_weight=None, type="micro"): + def predict_one(self, x, w=None, type="micro"): localdict = {} for key in x.keys(): new_key = key diff --git a/river/ensemble/streaming_random_patches.py b/river/ensemble/streaming_random_patches.py index 07ee620c1a..7cd9f87992 100644 --- a/river/ensemble/streaming_random_patches.py +++ b/river/ensemble/streaming_random_patches.py @@ -109,7 +109,7 @@ def learn_one(self, x: dict, y: base.typing.Target, **kwargs): k = poisson(rate=self.lam, rng=self._rng) if k == 0: continue - model.learn_one(x=x, y=y, sample_weight=k, n_samples_seen=self._n_samples_seen) + model.learn_one(x=x, y=y, w=k, n_samples_seen=self._n_samples_seen) return self @@ -532,7 +532,7 @@ def learn_one( x: dict, y: base.typing.ClfTarget, *, - sample_weight: int, + w: int, n_samples_seen: int, **kwargs, ): @@ -543,8 +543,8 @@ def learn_one( # Use all features x_subset = x - # TODO Find a way to verify if the model natively supports sample_weight - for _ in range(int(sample_weight)): + # TODO Find a way to verify if the model natively supports sample_weight (w) + for _ in range(int(w)): self.model.learn_one(x=x_subset, y=y, **kwargs) if self._background_learner: @@ -552,7 +552,7 @@ def learn_one( # Note: Pass the original instance x so features are correctly # selected based on the corresponding subspace self._background_learner.learn_one( - x=x, y=y, sample_weight=sample_weight, n_samples_seen=n_samples_seen # type: ignore + x=x, y=y, w=w, n_samples_seen=n_samples_seen # type: ignore ) if not self.disable_drift_detector and not self.is_background_learner: @@ -830,7 +830,7 @@ def learn_one( x: dict, y: base.typing.RegTarget, *, - sample_weight: int, + w: int, n_samples_seen: int, **kwargs, ): @@ -842,8 +842,8 @@ def learn_one( # Use all features x_subset = x - # TODO Find a way to verify if the model natively supports sample_weight - for _ in range(int(sample_weight)): + # TODO Find a way to verify if the model natively supports sample_weight (w) + for _ in range(int(w)): self.model.learn_one(x=x_subset, y=y, **kwargs) # Drift detection input @@ -860,7 +860,7 @@ def learn_one( # Note: Pass the original instance x so features are correctly # selected based on the corresponding subspace self._background_learner.learn_one( - x=x, y=y, sample_weight=sample_weight, n_samples_seen=n_samples_seen # type: ignore + x=x, y=y, w=w, n_samples_seen=n_samples_seen # type: ignore ) if not self.disable_drift_detector and not self.is_background_learner: diff --git a/river/facto/base.py b/river/facto/base.py index 802e161d6f..46957e8933 100644 --- a/river/facto/base.py +++ b/river/facto/base.py @@ -65,20 +65,20 @@ def __init__( def _init_latents(self) -> collections.defaultdict: """Initializes latent weights dict.""" - def learn_one(self, x, y, sample_weight=1.0): + def learn_one(self, x, y, w=1.0): x = self._ohe_cat_features(x) if self.sample_normalization: x_l2_norm = sum(xj**2 for xj in x.values()) ** 0.5 x = {j: xj / x_l2_norm for j, xj in x.items()} - return self._learn_one(x, y, sample_weight=sample_weight) + return self._learn_one(x, y, w=w) def _ohe_cat_features(self, x): """One hot encodes string features considering them as categorical.""" return dict((f"{j}_{xj}", 1) if isinstance(xj, str) else (j, xj) for j, xj in x.items()) - def _learn_one(self, x, y, sample_weight=1.0): + def _learn_one(self, x, y, w=1.0): # Calculate the gradient of the loss with respect to the raw output g_loss 
= self.loss.gradient(y_true=y, y_pred=self._raw_dot(x)) @@ -86,7 +86,7 @@ def _learn_one(self, x, y, sample_weight=1.0): g_loss = utils.math.clamp(g_loss, minimum=-self.clip_gradient, maximum=self.clip_gradient) # Apply the sample weight - g_loss *= sample_weight + g_loss *= w # Update the intercept intercept_lr = self.intercept_lr.get(self.weight_optimizer.n_iterations) diff --git a/river/forest/adaptive_random_forest.py b/river/forest/adaptive_random_forest.py index c96d83e231..d432ac997a 100644 --- a/river/forest/adaptive_random_forest.py +++ b/river/forest/adaptive_random_forest.py @@ -169,9 +169,9 @@ def learn_one(self, x: dict, y: base.typing.Target, **kwargs): k = poisson(rate=self.lambda_value, rng=self._rng) if k > 0: if not self._warning_detection_disabled and self._background[i] is not None: - self._background[i].learn_one(x=x, y=y, sample_weight=k) # type: ignore + self._background[i].learn_one(x=x, y=y, w=k) # type: ignore - model.learn_one(x=x, y=y, sample_weight=k) + model.learn_one(x=x, y=y, w=k) drift_input = None if not self._warning_detection_disabled: diff --git a/river/forest/online_extra_trees.py b/river/forest/online_extra_trees.py index ce904c9b83..1e43b9e661 100644 --- a/river/forest/online_extra_trees.py +++ b/river/forest/online_extra_trees.py @@ -314,10 +314,10 @@ def learn_one(self, x, y): if w == 0: # Skip model update if w is zero continue - model.learn_one(x, y, sample_weight=w) + model.learn_one(x, y, w=w) if i in self._background_trees: - self._background_trees[i].learn_one(x, y, sample_weight=w) + self._background_trees[i].learn_one(x, y, w=w) trained.append(i) diff --git a/river/metrics/base.py b/river/metrics/base.py index b34b571cac..af9997a8e6 100644 --- a/river/metrics/base.py +++ b/river/metrics/base.py @@ -84,19 +84,19 @@ def __init__(self, cm: confusion.ConfusionMatrix | None = None): cm = confusion.ConfusionMatrix() self.cm = cm - def update(self, y_true, y_pred, sample_weight=1.0): + def update(self, y_true, y_pred, w=1.0): self.cm.update( y_true, y_pred, - sample_weight=sample_weight, + w=w, ) return self - def revert(self, y_true, y_pred, sample_weight=1.0): + def revert(self, y_true, y_pred, w=1.0): self.cm.revert( y_true, y_pred, - sample_weight=sample_weight, + w=w, ) return self @@ -148,21 +148,21 @@ def update( self, y_true: bool, y_pred: bool | float | dict[bool, float], - sample_weight=1.0, + w=1.0, ) -> BinaryMetric: if self.requires_labels: y_pred = y_pred == self.pos_val - return super().update(y_true == self.pos_val, y_pred, sample_weight) + return super().update(y_true == self.pos_val, y_pred, w) def revert( self, y_true: bool, y_pred: bool | float | dict[bool, float], - sample_weight=1.0, + w=1.0, ) -> BinaryMetric: if self.requires_labels: y_pred = y_pred == self.pos_val - return super().revert(y_true == self.pos_val, y_pred, sample_weight) + return super().revert(y_true == self.pos_val, y_pred, w) class MultiClassMetric(ClassificationMetric): @@ -224,7 +224,7 @@ def __init__(self, metrics, str_sep=", "): super().__init__(metrics) self.str_sep = str_sep - def update(self, y_true, y_pred, sample_weight=1.0): + def update(self, y_true, y_pred, w=1.0): # If the metrics are classification metrics, then we have to handle the case where some # of the metrics require labels, whilst others need to be fed probabilities if hasattr(self, "requires_labels") and not self.requires_labels: @@ -239,19 +239,19 @@ def update(self, y_true, y_pred, sample_weight=1.0): m.update(y_true, y_pred) return self - def revert(self, y_true, y_pred, 
sample_weight=1.0): + def revert(self, y_true, y_pred, w=1.0): # If the metrics are classification metrics, then we have to handle the case where some # of the metrics require labels, whilst others need to be fed probabilities if hasattr(self, "requires_labels") and not self.requires_labels: for m in self: if m.requires_labels: - m.revert(y_true, max(y_pred, key=y_pred.get), sample_weight) + m.revert(y_true, max(y_pred, key=y_pred.get), w) else: - m.revert(y_true, y_pred, sample_weight) + m.revert(y_true, y_pred, w) return self for m in self: - m.revert(y_true, y_pred, sample_weight) + m.revert(y_true, y_pred, w) return self def get(self): @@ -333,12 +333,12 @@ def __init__(self): def _eval(self, y_true, y_pred): pass - def update(self, y_true, y_pred, sample_weight=1.0): - self._mean.update(x=self._eval(y_true, y_pred), w=sample_weight) + def update(self, y_true, y_pred, w=1.0): + self._mean.update(x=self._eval(y_true, y_pred), w=w) return self - def revert(self, y_true, y_pred, sample_weight=1.0): - self._mean.revert(x=self._eval(y_true, y_pred), w=sample_weight) + def revert(self, y_true, y_pred, w=1.0): + self._mean.revert(x=self._eval(y_true, y_pred), w=w) return self def get(self): @@ -354,11 +354,11 @@ class ClusteringMetric(base.Base, abc.ABC): _fmt = ",.6f" # Use commas to separate big numbers and show 6 decimals @abc.abstractmethod - def update(self, x, y_pred, centers, sample_weight=1.0) -> ClusteringMetric: + def update(self, x, y_pred, centers, w=1.0) -> ClusteringMetric: """Update the metric.""" @abc.abstractmethod - def revert(self, x, y_pred, centers, sample_weight=1.0) -> ClusteringMetric: + def revert(self, x, y_pred, centers, w=1.0) -> ClusteringMetric: """Revert the metric.""" @abc.abstractmethod diff --git a/river/metrics/confusion.py b/river/metrics/confusion.py index 7c16809fa4..ccf321b592 100644 --- a/river/metrics/confusion.py +++ b/river/metrics/confusion.py @@ -62,22 +62,22 @@ def __getitem__(self, key): """Syntactic sugar for accessing the counts directly.""" return self.data[key] - def update(self, y_true, y_pred, sample_weight=1.0): + def update(self, y_true, y_pred, w=1.0): self.n_samples += 1 - self._update(y_true, y_pred, sample_weight) + self._update(y_true, y_pred, w) return self - def revert(self, y_true, y_pred, sample_weight=1.0): + def revert(self, y_true, y_pred, w=1.0): self.n_samples -= 1 - # Revert is equal to subtracting so we pass the negative sample_weight - # Reverting amounts to subtracting, so we pass the negative of the weight w + self._update(y_true, y_pred, -w) return self - def _update(self, y_true, y_pred, sample_weight): - self.data[y_true][y_pred] += sample_weight - self.total_weight += sample_weight - self.sum_row[y_true] += sample_weight - self.sum_col[y_pred] += sample_weight + def _update(self, y_true, y_pred, w): + self.data[y_true][y_pred] += w + self.total_weight += w + self.sum_row[y_true] += w + self.sum_col[y_pred] += w @property def classes(self): diff --git a/river/metrics/mse.py b/river/metrics/mse.py index 6fcd750e8a..526d2af933 100644 --- a/river/metrics/mse.py +++ b/river/metrics/mse.py @@ -81,5 +81,5 @@ class RMSLE(RMSE): """ - def update(self, y_true, y_pred, sample_weight=1.0): - return super().update(math.log(y_true + 1), math.log(y_pred + 1), sample_weight) + def update(self, y_true, y_pred, w=1.0): + return super().update(math.log(y_true + 1), math.log(y_pred + 1), w) diff --git a/river/metrics/multioutput/base.py b/river/metrics/multioutput/base.py index
9eee7b21b9..e37431e31e 100644 --- a/river/metrics/multioutput/base.py +++ b/river/metrics/multioutput/base.py @@ -37,10 +37,10 @@ def update( y_true: dict[str | int, base.typing.ClfTarget], y_pred: dict[str | int, base.typing.ClfTarget] | dict[str | int, dict[base.typing.ClfTarget, float]], - sample_weight=1.0, + w=1.0, ) -> MultiOutputClassificationMetric: """Update the metric.""" - self.cm.update(y_true, y_pred, sample_weight) + self.cm.update(y_true, y_pred, w) return self def revert( @@ -48,10 +48,10 @@ def revert( y_true: dict[str | int, base.typing.ClfTarget], y_pred: dict[str | int, base.typing.ClfTarget] | dict[str | int, dict[base.typing.ClfTarget, float]], - sample_weight=1.0, + w=1.0, ) -> MultiOutputClassificationMetric: """Revert the metric.""" - self.cm.revert(y_true, y_pred, sample_weight) + self.cm.revert(y_true, y_pred, w) return self def works_with(self, model) -> bool: diff --git a/river/metrics/multioutput/confusion.py b/river/metrics/multioutput/confusion.py index 7d20b613ac..f5c6b5b4c9 100644 --- a/river/metrics/multioutput/confusion.py +++ b/river/metrics/multioutput/confusion.py @@ -51,24 +51,24 @@ class MultiLabelConfusionMatrix: def __init__(self): self.data = dict() - def update(self, y_true, y_pred, sample_weight=1.0): + def update(self, y_true, y_pred, w=1.0): for label, yt in y_true.items(): try: cm = self.data[label] except KeyError: cm = metrics.ConfusionMatrix() self.data[label] = cm - cm.update(yt, y_pred[label], sample_weight) + cm.update(yt, y_pred[label], w) return self - def revert(self, y_true, y_pred, sample_weight=1.0): + def revert(self, y_true, y_pred, w=1.0): for label, yt in y_true.items(): try: cm = self.data[label] except KeyError: cm = metrics.ConfusionMatrix() self.data[label] = cm - cm.update(yt, y_pred[label], sample_weight) + cm.revert(yt, y_pred[label], w) return self def __repr__(self): diff --git a/river/metrics/multioutput/macro.py b/river/metrics/multioutput/macro.py index 45fbf0b1d7..cbd5699ea1 100644 --- a/river/metrics/multioutput/macro.py +++ b/river/metrics/multioutput/macro.py @@ -37,14 +37,14 @@ def works_with(self, model) -> bool: return utils.inspect.ismoclassifier(model) return utils.inspect.ismoregressor(model) - def update(self, y_true, y_pred, sample_weight=1.0): + def update(self, y_true, y_pred, w=1.0): for i in y_true: - self.metrics[i].update(y_true[i], y_pred[i], sample_weight) + self.metrics[i].update(y_true[i], y_pred[i], w) return self - def revert(self, y_true, y_pred, sample_weight=1.0): + def revert(self, y_true, y_pred, w=1.0): for i in y_true: - self.metrics[i].revert(y_true[i], y_pred[i], sample_weight) + self.metrics[i].revert(y_true[i], y_pred[i], w) return self def get(self): diff --git a/river/metrics/multioutput/micro.py b/river/metrics/multioutput/micro.py index 3449563929..079a613753 100644 --- a/river/metrics/multioutput/micro.py +++ b/river/metrics/multioutput/micro.py @@ -30,14 +30,14 @@ def works_with(self, model) -> bool: return utils.inspect.ismoclassifier(model) return utils.inspect.ismoregressor(model) - def update(self, y_true, y_pred, sample_weight=1.0): + def update(self, y_true, y_pred, w=1.0): for i in y_true: - self.metric.update(y_true[i], y_pred[i], sample_weight) + self.metric.update(y_true[i], y_pred[i], w) return self - def revert(self, y_true, y_pred, sample_weight=1.0): + def revert(self, y_true, y_pred, w=1.0): for i in y_true: - self.metric.revert(y_true[i], y_pred[i], sample_weight) + self.metric.revert(y_true[i], y_pred[i], w) return self def get(self): diff --git
a/river/metrics/multioutput/per_output.py b/river/metrics/multioutput/per_output.py index db75519add..0e810e8858 100644 --- a/river/metrics/multioutput/per_output.py +++ b/river/metrics/multioutput/per_output.py @@ -35,14 +35,14 @@ def works_with(self, model) -> bool: return utils.inspect.ismoclassifier(model) return utils.inspect.ismoregressor(model) - def update(self, y_true, y_pred, sample_weight=1.0): + def update(self, y_true, y_pred, w=1.0): for i in y_true: - self.metrics[i].update(y_true[i], y_pred[i], sample_weight) + self.metrics[i].update(y_true[i], y_pred[i], w) return self - def revert(self, y_true, y_pred, sample_weight=1.0): + def revert(self, y_true, y_pred, w=1.0): for i in y_true: - self.metrics[i].revert(y_true[i], y_pred[i], sample_weight) + self.metrics[i].revert(y_true[i], y_pred[i], w) return self def get(self): diff --git a/river/metrics/multioutput/sample_average.py b/river/metrics/multioutput/sample_average.py index 17278f0d12..66c522c62b 100644 --- a/river/metrics/multioutput/sample_average.py +++ b/river/metrics/multioutput/sample_average.py @@ -53,18 +53,18 @@ def works_with(self, model) -> bool: return utils.inspect.ismoclassifier(model) return utils.inspect.ismoregressor(model) - def update(self, y_true, y_pred, sample_weight=1.0): + def update(self, y_true, y_pred, w=1.0): metric = self.metric.clone() for i in y_true: metric.update(y_true[i], y_pred[i]) - self._avg.update(metric.get(), sample_weight) + self._avg.update(metric.get(), w) return self - def revert(self, y_true, y_pred, sample_weight=1.0): + def revert(self, y_true, y_pred, w=1.0): metric = self.metric.clone() for i in y_true: metric.update(y_true[i], y_pred[i]) - self._avg.revert(metric.get(), sample_weight) + self._avg.revert(metric.get(), w) return self def get(self): diff --git a/river/metrics/r2.py b/river/metrics/r2.py index 88685e4b28..479591fc2b 100644 --- a/river/metrics/r2.py +++ b/river/metrics/r2.py @@ -51,15 +51,15 @@ def __init__(self): def bigger_is_better(self): return True - def update(self, y_true, y_pred, sample_weight=1.0): - self._y_var.update(y_true, w=sample_weight) - squared_error = (y_true - y_pred) * (y_true - y_pred) * sample_weight + def update(self, y_true, y_pred, w=1.0): + self._y_var.update(y_true, w=w) + squared_error = (y_true - y_pred) * (y_true - y_pred) * w self._residual_sum_of_squares += squared_error return self - def revert(self, y_true, y_pred, sample_weight=1.0): - self._y_var.update(y_true, w=-sample_weight) - self._residual_sum_of_squares -= (y_true - y_pred) * (y_true - y_pred) * sample_weight + def revert(self, y_true, y_pred, w=1.0): + self._y_var.update(y_true, w=-w) + self._residual_sum_of_squares -= (y_true - y_pred) * (y_true - y_pred) * w return self def get(self): diff --git a/river/metrics/roc_auc.py b/river/metrics/roc_auc.py index 7f46432300..2e4a2d3c9f 100644 --- a/river/metrics/roc_auc.py +++ b/river/metrics/roc_auc.py @@ -67,16 +67,16 @@ def works_with(self, model) -> bool: or utils.inspect.isanomalyfilter(model) ) - def update(self, y_true, y_pred, sample_weight=1.0): + def update(self, y_true, y_pred, w=1.0): p_true = y_pred.get(True, 0.0) if isinstance(y_pred, dict) else y_pred for t, cm in zip(self.thresholds, self.cms): - cm.update(y_true == self.pos_val, p_true > t, sample_weight) + cm.update(y_true == self.pos_val, p_true > t, w) return self - def revert(self, y_true, y_pred, sample_weight=1.0): + def revert(self, y_true, y_pred, w=1.0): p_true = y_pred.get(True, 0.0) if isinstance(y_pred, dict) else y_pred for t, cm in 
zip(self.thresholds, self.cms): - cm.revert(y_true == self.pos_val, p_true > t, sample_weight) + cm.revert(y_true == self.pos_val, p_true > t, w) return self @property diff --git a/river/metrics/silhouette.py b/river/metrics/silhouette.py index 9d5e44e9ae..3866eb5f1f 100644 --- a/river/metrics/silhouette.py +++ b/river/metrics/silhouette.py @@ -68,7 +68,7 @@ def _find_distance_second_closest_center(centers, x): distances = {i: math.sqrt(utils.math.minkowski_distance(centers[i], x, 2)) for i in centers} return sorted(distances.values())[-2] - def update(self, x, y_pred, centers, sample_weight=1.0): + def update(self, x, y_pred, centers, w=1.0): distance_closest_centroid = math.sqrt(utils.math.minkowski_distance(centers[y_pred], x, 2)) self._sum_distance_closest_centroid += distance_closest_centroid @@ -77,7 +77,7 @@ def update(self, x, y_pred, centers, sample_weight=1.0): return self - def revert(self, x, y_pred, centers, sample_weight=1.0): + def revert(self, x, y_pred, centers, w=1.0): distance_closest_centroid = math.sqrt(utils.math.minkowski_distance(centers[y_pred], x, 2)) self._sum_distance_closest_centroid -= distance_closest_centroid diff --git a/river/metrics/test_metrics.py b/river/metrics/test_metrics.py index df8c07e942..fa66a9fc6b 100644 --- a/river/metrics/test_metrics.py +++ b/river/metrics/test_metrics.py @@ -236,7 +236,7 @@ def test_metric(metric, sk_metric): m = copy.deepcopy(metric) for i, (yt, yp, w) in enumerate(zip(y_true, y_pred, sample_weights)): if metric.works_with_weights: - m.update(y_true=yt, y_pred=yp, sample_weight=w) + m.update(y_true=yt, y_pred=yp, w=w) else: m.update(y_true=yt, y_pred=yp) diff --git a/river/tree/extremely_fast_decision_tree.py b/river/tree/extremely_fast_decision_tree.py index 9b948e356b..935f42b74b 100755 --- a/river/tree/extremely_fast_decision_tree.py +++ b/river/tree/extremely_fast_decision_tree.py @@ -204,7 +204,7 @@ def _branch_selector(self, numerical_feature=True, multiway_split=False) -> type else: return EFDTNominalMultiwayBranch - def learn_one(self, x, y, *, sample_weight=1.0): + def learn_one(self, x, y, *, w=1.0): """Incrementally train the model. Parameters ---------- x Instance attributes. y The label of the instance. - sample_weight + w The weight of the sample. Notes ----- @@ -234,20 +234,20 @@ def learn_one(self, x, y, *, sample_weight=1.0): # Updates the set of observed classes self.classes.add(y) - self._train_weight_seen_by_model += sample_weight + self._train_weight_seen_by_model += w if self._root is None: self._root = self._new_leaf() self._n_active_leaves = 1 # Sort instance X into a leaf - self._sort_to_leaf(x, y, sample_weight) + self._sort_to_leaf(x, y, w) # Process all nodes, starting from root to the leaf where the instance x belongs. - self._process_nodes(x, y, sample_weight, self._root, None, None) + self._process_nodes(x, y, w, self._root, None, None) return self - def _sort_to_leaf(self, x, y, sample_weight): + def _sort_to_leaf(self, x, y, w): """For a given instance, find the corresponding leaf and update it. Private function in which the leaf learns from the instance. @@ -262,7 +262,7 @@ Instance attributes. y The instance label. - sample_weight + w The weight of the sample.
""" @@ -287,12 +287,12 @@ def _sort_to_leaf(self, x, y, sample_weight): node = node.traverse(x, until_leaf=False) if isinstance(node, HTLeaf): break - node.learn_one(x, y, sample_weight=sample_weight, tree=self) + node.learn_one(x, y, w=w, tree=self) if self._train_weight_seen_by_model % self.memory_estimate_period == 0: self._estimate_model_size() - def _process_nodes(self, x, y, sample_weight, node, parent, branch_index): + def _process_nodes(self, x, y, w, node, parent, branch_index): """Process nodes from the root to the leaf where the instance belongs. 1. If the node is internal: @@ -306,7 +306,7 @@ def _process_nodes(self, x, y, sample_weight, node, parent, branch_index): Instance attributes. y The label of the instance. - sample_weight + w The weight of the sample. node The node to process. @@ -317,7 +317,7 @@ def _process_nodes(self, x, y, sample_weight, node, parent, branch_index): """ if isinstance(node, BaseEFDTBranch): # Update split nodes as the tree is traversed - node.learn_one(x, y, sample_weight=sample_weight, tree=self) + node.learn_one(x, y, w=w, tree=self) old_weight = node.last_split_reevaluation_at new_weight = node.total_weight @@ -342,7 +342,7 @@ def _process_nodes(self, x, y, sample_weight, node, parent, branch_index): child = node.children[child_index] except KeyError: child_index, child = node.most_common_path() - self._process_nodes(x, y, sample_weight, child, node, child_index) + self._process_nodes(x, y, w, child, node, child_index) elif self._growth_allowed and node.is_active(): if node.depth >= self.max_depth: # Max depth reached node.deactivate() diff --git a/river/tree/hoeffding_adaptive_tree_classifier.py b/river/tree/hoeffding_adaptive_tree_classifier.py index 98c3b06df2..05807fe031 100644 --- a/river/tree/hoeffding_adaptive_tree_classifier.py +++ b/river/tree/hoeffding_adaptive_tree_classifier.py @@ -217,17 +217,17 @@ def summary(self): ) return summ - def learn_one(self, x, y, *, sample_weight=1.0): + def learn_one(self, x, y, *, w=1.0): # Updates the set of observed classes self.classes.add(y) - self._train_weight_seen_by_model += sample_weight + self._train_weight_seen_by_model += w if self._root is None: self._root = self._new_leaf() self._n_active_leaves = 1 - self._root.learn_one(x, y, sample_weight=sample_weight, tree=self) + self._root.learn_one(x, y, w=w, tree=self) if self._train_weight_seen_by_model % self.memory_estimate_period == 0: self._estimate_model_size() diff --git a/river/tree/hoeffding_adaptive_tree_regressor.py b/river/tree/hoeffding_adaptive_tree_regressor.py index eeab272d0b..8b1193f274 100644 --- a/river/tree/hoeffding_adaptive_tree_regressor.py +++ b/river/tree/hoeffding_adaptive_tree_regressor.py @@ -227,13 +227,13 @@ def summary(self): ) return summ - def learn_one(self, x, y, *, sample_weight=1.0): - self._train_weight_seen_by_model += sample_weight + def learn_one(self, x, y, *, w=1.0): + self._train_weight_seen_by_model += w if self._root is None: self._root = self._new_leaf() self._n_active_leaves = 1 - self._root.learn_one(x, y, sample_weight=sample_weight, tree=self) + self._root.learn_one(x, y, w=w, tree=self) if self._train_weight_seen_by_model % self.memory_estimate_period == 0: self._estimate_model_size() diff --git a/river/tree/hoeffding_tree_classifier.py b/river/tree/hoeffding_tree_classifier.py index 91d09f79c2..7d7feb19bb 100755 --- a/river/tree/hoeffding_tree_classifier.py +++ b/river/tree/hoeffding_tree_classifier.py @@ -318,7 +318,7 @@ def _attempt_to_split(self, leaf: HTLeaf, parent: DTBranch, 
parent_branch: int, # Manage memory self._enforce_size_limit() - def learn_one(self, x, y, *, sample_weight=1.0): + def learn_one(self, x, y, *, w=1.0): """Train the model on instance x and corresponding target y. Parameters @@ -327,7 +327,7 @@ def learn_one(self, x, y, *, sample_weight=1.0): Instance attributes. y Class label for sample x. - sample_weight + w Sample weight. Returns @@ -349,7 +349,7 @@ def learn_one(self, x, y, *, sample_weight=1.0): # Updates the set of observed classes self.classes.add(y) - self._train_weight_seen_by_model += sample_weight + self._train_weight_seen_by_model += w if self._root is None: self._root = self._new_leaf() @@ -369,7 +369,7 @@ def learn_one(self, x, y, *, sample_weight=1.0): node = self._root if isinstance(node, HTLeaf): - node.learn_one(x, y, sample_weight=sample_weight, tree=self) + node.learn_one(x, y, w=w, tree=self) if self._growth_allowed and node.is_active(): if node.depth >= self.max_depth: # Max depth reached node.deactivate() @@ -403,7 +403,7 @@ def learn_one(self, x, y, *, sample_weight=1.0): if isinstance(node, HTLeaf): break # Learn from the sample - node.learn_one(x, y, sample_weight=sample_weight, tree=self) + node.learn_one(x, y, w=w, tree=self) if self._train_weight_seen_by_model % self.memory_estimate_period == 0: self._estimate_model_size() diff --git a/river/tree/hoeffding_tree_regressor.py b/river/tree/hoeffding_tree_regressor.py index f5a873f7a0..b7afa030d3 100644 --- a/river/tree/hoeffding_tree_regressor.py +++ b/river/tree/hoeffding_tree_regressor.py @@ -217,7 +217,7 @@ def _new_leaf(self, initial_stats=None, parent=None): return new_adaptive - def learn_one(self, x, y, *, sample_weight=1.0): + def learn_one(self, x, y, *, w=1.0): """Train the tree model on sample x and corresponding target y. Parameters @@ -226,7 +226,7 @@ def learn_one(self, x, y, *, sample_weight=1.0): Instance attributes. y Target value for sample x. - sample_weight + w The weight of the sample. Returns @@ -234,7 +234,7 @@ def learn_one(self, x, y, *, sample_weight=1.0): self """ - self._train_weight_seen_by_model += sample_weight + self._train_weight_seen_by_model += w if self._root is None: self._root = self._new_leaf() @@ -254,7 +254,7 @@ def learn_one(self, x, y, *, sample_weight=1.0): node = self._root if isinstance(node, HTLeaf): - node.learn_one(x, y, sample_weight=sample_weight, tree=self) + node.learn_one(x, y, w=w, tree=self) if self._growth_allowed and node.is_active(): if node.depth >= self.max_depth: # Max depth reached node.deactivate() @@ -288,7 +288,7 @@ def learn_one(self, x, y, *, sample_weight=1.0): if isinstance(node, HTLeaf): break # Learn from the sample - node.learn_one(x, y, sample_weight=sample_weight, tree=self) + node.learn_one(x, y, w=w, tree=self) if self._train_weight_seen_by_model % self.memory_estimate_period == 0: self._estimate_model_size() diff --git a/river/tree/isoup_tree_regressor.py b/river/tree/isoup_tree_regressor.py index c61a9c9f3a..d4413674e1 100644 --- a/river/tree/isoup_tree_regressor.py +++ b/river/tree/isoup_tree_regressor.py @@ -207,7 +207,7 @@ def _new_leaf(self, initial_stats=None, parent=None): return new_adaptive - def learn_one(self, x, y, *, sample_weight: float = 1.0) -> iSOUPTreeRegressor: # type: ignore + def learn_one(self, x, y, *, w: float = 1.0) -> iSOUPTreeRegressor: # type: ignore """Incrementally train the model with one sample. Training tasks: @@ -225,13 +225,13 @@ def learn_one(self, x, y, *, sample_weight: float = 1.0) -> iSOUPTreeRegressor: Instance attributes. y Target values. 
- sample_weight + w The weight of the passed sample. """ # Update target set self.targets.update(y.keys()) - super().learn_one(x, y, sample_weight=sample_weight) # type: ignore + super().learn_one(x, y, w=w) # type: ignore return self diff --git a/river/tree/nodes/efdtc_nodes.py b/river/tree/nodes/efdtc_nodes.py index 34c870da13..5fb611f034 100644 --- a/river/tree/nodes/efdtc_nodes.py +++ b/river/tree/nodes/efdtc_nodes.py @@ -111,13 +111,13 @@ def total_weight(self) -> float: def new_nominal_splitter(): return NominalSplitterClassif() - def update_stats(self, y, sample_weight): + def update_stats(self, y, w): try: - self.stats[y] += sample_weight + self.stats[y] += w except KeyError: - self.stats[y] = sample_weight + self.stats[y] = w - def update_splitters(self, x, y, sample_weight, nominal_attributes): + def update_splitters(self, x, y, w, nominal_attributes): for att_id, att_val in x.items(): if att_id in self._disabled_attrs: continue @@ -133,9 +133,9 @@ def update_splitters(self, x, y, sample_weight, nominal_attributes): splitter = copy.deepcopy(self.splitter) self.splitters[att_id] = splitter - splitter.update(att_val, y, sample_weight) + splitter.update(att_val, y, w) - def learn_one(self, x, y, *, sample_weight=1.0, tree=None): + def learn_one(self, x, y, *, w=1.0, tree=None): """Update branch with the provided sample. Parameters ---------- x Sample attributes for updating the node. y Target value. - sample_weight + w Sample weight. tree Tree to update. """ - self.update_stats(y, sample_weight) - self.update_splitters(x, y, sample_weight, tree.nominal_attributes) + self.update_stats(y, w) + self.update_splitters(x, y, w, tree.nominal_attributes) def prediction(self, x, *, tree=None): return normalize_values_in_dict(self.stats, inplace=False) diff --git a/river/tree/nodes/hatc_nodes.py b/river/tree/nodes/hatc_nodes.py index 0f9fa241e5..bab558192d 100644 --- a/river/tree/nodes/hatc_nodes.py +++ b/river/tree/nodes/hatc_nodes.py @@ -47,12 +47,12 @@ def __init__(self, stats, depth, splitter, drift_detector, rng, **kwargs): def kill_tree_children(self, hat): pass - def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_branch=None): + def learn_one(self, x, y, *, w=1.0, tree=None, parent=None, parent_branch=None): if tree.bootstrap_sampling: # Perform bootstrap-sampling k = poisson(rate=1, rng=self.rng) if k > 0: - sample_weight *= k + w *= k aux = self.prediction(x, tree=tree) y_pred = max(aux, key=aux.get) if aux else None @@ -71,7 +71,7 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_b self._mean_error = self._mean_error.clone() # Update statistics - super().learn_one(x, y, sample_weight=sample_weight, tree=tree) + super().learn_one(x, y, w=w, tree=tree) weight_seen = self.total_weight @@ -176,7 +176,7 @@ def iter_leaves(self): if isinstance(child, AdaBranchClassifier) and child._alternate_tree: yield from child._alternate_tree.iter_leaves() - def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_branch=None): + def learn_one(self, x, y, *, w=1.0, tree=None, parent=None, parent_branch=None): leaf = super().traverse(x, until_leaf=True) aux = leaf.prediction(x, tree=tree) y_pred = max(aux, key=aux.get) if aux else None @@ -185,9 +185,9 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_b # Update stats as we traverse the tree to improve predictions (in case split nodes are used # to provide responses) try: -
self.stats[y] += sample_weight + self.stats[y] += w except KeyError: - self.stats[y] = sample_weight + self.stats[y] = w old_error = self._mean_error.get() @@ -250,7 +250,7 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_b self._alternate_tree.learn_one( x, y, - sample_weight=sample_weight, + w=w, tree=tree, parent=parent, parent_branch=parent_branch, @@ -265,7 +265,7 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_b child.learn_one( x, y, - sample_weight=sample_weight, + w=w, tree=tree, parent=self, parent_branch=self.branch_no(x), @@ -280,7 +280,7 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_b leaf.learn_one( x, y, - sample_weight=sample_weight, + w=w, tree=tree, parent=self, parent_branch=self.branch_no(x), @@ -292,7 +292,7 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_b child.learn_one( x, y, - sample_weight=sample_weight, + w=w, tree=tree, parent=self, parent_branch=child_id, diff --git a/river/tree/nodes/hatr_nodes.py b/river/tree/nodes/hatr_nodes.py index a8e134ddd4..aa496c4f4d 100644 --- a/river/tree/nodes/hatr_nodes.py +++ b/river/tree/nodes/hatr_nodes.py @@ -47,14 +47,14 @@ def __init__(self, stats, depth, splitter, drift_detector, rng, **kwargs): def kill_tree_children(self, hatr): pass - def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_branch=None): + def learn_one(self, x, y, *, w=1.0, tree=None, parent=None, parent_branch=None): y_pred = self.prediction(x, tree=tree) if tree.bootstrap_sampling: # Perform bootstrap-sampling k = poisson(rate=1, rng=self.rng) if k > 0: - sample_weight *= k + w *= k drift_input = abs(y - y_pred) old_error = self._error_tracker.mean.get() @@ -69,7 +69,7 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_b self._error_tracker = self._error_tracker.clone() # Update learning model - super().learn_one(x, y, sample_weight=sample_weight, tree=tree) + super().learn_one(x, y, w=w, tree=tree) weight_seen = self.total_weight @@ -149,13 +149,13 @@ def iter_leaves(self): if isinstance(child, AdaBranchRegressor) and child._alternate_tree: yield from child._alternate_tree.iter_leaves() - def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_branch=None): + def learn_one(self, x, y, *, w=1.0, tree=None, parent=None, parent_branch=None): leaf = super().traverse(x, until_leaf=True) y_pred = leaf.prediction(x, tree=tree) # Update stats as we traverse the tree to improve predictions (in case split nodes are used # to provide responses) - self.stats.update(y, sample_weight) + self.stats.update(y, w) drift_input = abs(y - y_pred) old_error = self._error_tracker.mean.get() @@ -224,7 +224,7 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_b self._alternate_tree.learn_one( x, y, - sample_weight=sample_weight, + w=w, tree=tree, parent=parent, parent_branch=parent_branch, @@ -238,7 +238,7 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_b child.learn_one( x, y, - sample_weight=sample_weight, + w=w, tree=tree, parent=self, parent_branch=self.branch_no(x), @@ -253,7 +253,7 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_b leaf.learn_one( x, y, - sample_weight=sample_weight, + w=w, tree=tree, parent=self, parent_branch=self.branch_no(x), @@ -265,7 +265,7 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None, parent=None, parent_b child.learn_one( x,
y, - sample_weight=sample_weight, + w=w, tree=tree, parent=self, parent_branch=child_id, diff --git a/river/tree/nodes/htc_nodes.py b/river/tree/nodes/htc_nodes.py index 6063af90e3..d35549ee79 100644 --- a/river/tree/nodes/htc_nodes.py +++ b/river/tree/nodes/htc_nodes.py @@ -31,11 +31,11 @@ def __init__(self, stats, depth, splitter, **kwargs): def new_nominal_splitter(): return NominalSplitterClassif() - def update_stats(self, y, sample_weight): + def update_stats(self, y, w): try: - self.stats[y] += sample_weight + self.stats[y] += w except KeyError: - self.stats[y] = sample_weight + self.stats[y] = w def prediction(self, x, *, tree=None): return normalize_values_in_dict(self.stats, inplace=False) @@ -164,7 +164,7 @@ def __init__(self, stats, depth, splitter, **kwargs): self._mc_correct_weight = 0.0 self._nb_correct_weight = 0.0 - def learn_one(self, x, y, *, sample_weight=1.0, tree=None): + def learn_one(self, x, y, *, w=1.0, tree=None): """Update the node with the provided instance. Parameters @@ -173,7 +173,7 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None): Instance attributes for updating the node. y Instance class. - sample_weight + w The instance's weight. tree The Hoeffding Tree to update. @@ -184,13 +184,13 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None): # Empty node (assume the majority class will be the best option) or majority # class prediction is correct if len(self.stats) == 0 or max(mc_pred, key=mc_pred.get) == y: - self._mc_correct_weight += sample_weight + self._mc_correct_weight += w nb_pred = do_naive_bayes_prediction(x, self.stats, self.splitters) if len(nb_pred) > 0 and max(nb_pred, key=nb_pred.get) == y: - self._nb_correct_weight += sample_weight + self._nb_correct_weight += w - super().learn_one(x, y, sample_weight=sample_weight, tree=tree) + super().learn_one(x, y, w=w, tree=tree) def prediction(self, x, *, tree=None): """Get the probabilities per class for a given instance. 
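The adaptive nodes in `hatc_nodes.py` above and `hatr_nodes.py` below emulate online bagging: rather than feeding each tree a bootstrap resample, they multiply the incoming weight by a Poisson(1) draw, so a sample can count zero, one, or several times in a single pass. A minimal self-contained sketch of that pattern — the `poisson` helper is reimplemented here with Knuth's method purely for illustration, while river ships its own helper with the same `poisson(rate, rng)` signature:

```python
import math
import random


def poisson(rate: float, rng: random.Random) -> int:
    """Draw k ~ Poisson(rate) using Knuth's product-of-uniforms method."""
    limit = math.exp(-rate)
    k, p = 0, 1.0
    while p > limit:
        k += 1
        p *= rng.random()
    return k - 1


rng = random.Random(42)


def bagged_learn_one(model, x, y):
    # Each sample is seen k ~ Poisson(1) times; instead of calling
    # learn_one k times, the multiplicity is forwarded as the weight w.
    k = poisson(rate=1, rng=rng)
    if k > 0:
        model.learn_one(x, y, w=k)
```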
diff --git a/river/tree/nodes/htr_nodes.py b/river/tree/nodes/htr_nodes.py index 13a04d484a..248164399a 100644 --- a/river/tree/nodes/htr_nodes.py +++ b/river/tree/nodes/htr_nodes.py @@ -64,8 +64,8 @@ def manage_memory(self, criterion, last_check_ratio, last_check_vr, last_check_e pre_split_dist=self.stats, ) - def update_stats(self, y, sample_weight): - self.stats.update(y, sample_weight) + def update_stats(self, y, w): + self.stats.update(y, w) def prediction(self, x, *, tree=None): return self.stats.mean.get() @@ -133,13 +133,13 @@ def __init__(self, stats, depth, splitter, leaf_model, **kwargs): sign = inspect.signature(leaf_model.learn_one).parameters self._model_supports_weights = "sample_weight" in sign or "w" in sign - def learn_one(self, x, y, *, sample_weight=1.0, tree=None): - super().learn_one(x, y, sample_weight=sample_weight, tree=tree) + def learn_one(self, x, y, *, w=1.0, tree=None): + super().learn_one(x, y, w=w, tree=tree) if self._model_supports_weights: - self._leaf_model.learn_one(x, y, sample_weight) + self._leaf_model.learn_one(x, y, w) else: - for _ in range(int(sample_weight)): + for _ in range(int(w)): self._leaf_model.learn_one(x, y) def prediction(self, x, *, tree=None): @@ -173,14 +173,14 @@ def __init__(self, stats, depth, splitter, leaf_model, **kwargs): self._fmse_mean = 0.0 self._fmse_model = 0.0 - def learn_one(self, x, y, *, sample_weight=1.0, tree=None): + def learn_one(self, x, y, *, w=1.0, tree=None): pred_mean = self.stats.mean.get() pred_model = self._leaf_model.predict_one(x) self._fmse_mean = tree.model_selector_decay * self._fmse_mean + (y - pred_mean) ** 2 self._fmse_model = tree.model_selector_decay * self._fmse_model + (y - pred_model) ** 2 - super().learn_one(x, y, sample_weight=sample_weight, tree=tree) + super().learn_one(x, y, w=w, tree=tree) def prediction(self, x, *, tree=None): if self._fmse_mean < self._fmse_model: # Act as a regression tree diff --git a/river/tree/nodes/isouptr_nodes.py b/river/tree/nodes/isouptr_nodes.py index bfdb632094..4a20f6328f 100644 --- a/river/tree/nodes/isouptr_nodes.py +++ b/river/tree/nodes/isouptr_nodes.py @@ -33,9 +33,9 @@ def __init__(self, stats, depth, splitter, **kwargs): stats = stats if stats else VectorDict(default_factory=functools.partial(Var)) super().__init__(stats, depth, splitter, **kwargs) - def update_stats(self, y, sample_weight): + def update_stats(self, y, w): for t in y: - self.stats[t].update(y[t], sample_weight) + self.stats[t].update(y[t], w) def prediction(self, x, *, tree=None): return {t: self.stats[t].mean.get() if t in self.stats else 0.0 for t in tree.targets} @@ -82,8 +82,8 @@ def __init__(self, stats, depth, splitter, leaf_models, **kwargs): sign = inspect.signature(self._leaf_models[t].learn_one).parameters self._model_supports_weights[t] = "sample_weight" in sign or "w" in sign - def learn_one(self, x, y, *, sample_weight=1.0, tree=None): - super().learn_one(x, y, sample_weight=sample_weight, tree=tree) + def learn_one(self, x, y, *, w=1.0, tree=None): + super().learn_one(x, y, w=w, tree=tree) for target_id, y_ in y.items(): try: @@ -107,9 +107,9 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None): # Now the proper training if self._model_supports_weights[target_id]: - model.learn_one(x, y_, sample_weight) + model.learn_one(x, y_, w) else: - for _ in range(int(sample_weight)): + for _ in range(int(w)): model.learn_one(x, y_) def prediction(self, x, *, tree=None): @@ -145,7 +145,7 @@ def __init__(self, stats, depth, splitter, leaf_models, **kwargs): self._fmse_mean 
= collections.defaultdict(float) self._fmse_model = collections.defaultdict(float) - def learn_one(self, x, y, *, sample_weight=1.0, tree=None): + def learn_one(self, x, y, *, w=1.0, tree=None): pred_mean = {t: self.stats[t].mean.get() if t in self.stats else 0.0 for t in tree.targets} pred_model = super().prediction(x, tree=tree) @@ -157,7 +157,7 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None): tree.model_selector_decay * self._fmse_model[t] + (y[t] - pred_model[t]) ** 2 ) - super().learn_one(x, y, sample_weight=sample_weight, tree=tree) + super().learn_one(x, y, w=w, tree=tree) def prediction(self, x, *, tree=None): pred = {} diff --git a/river/tree/nodes/leaf.py b/river/tree/nodes/leaf.py index 0333815970..dfab307504 100644 --- a/river/tree/nodes/leaf.py +++ b/river/tree/nodes/leaf.py @@ -77,7 +77,7 @@ def new_nominal_splitter(): pass @abc.abstractmethod - def update_stats(self, y, sample_weight): + def update_stats(self, y, w): pass def _iter_features(self, x) -> typing.Iterable: @@ -90,7 +90,7 @@ def _iter_features(self, x) -> typing.Iterable: """ yield from x.items() - def update_splitters(self, x, y, sample_weight, nominal_attributes): + def update_splitters(self, x, y, w, nominal_attributes): for att_id, att_val in self._iter_features(x): if att_id in self._disabled_attrs: continue @@ -106,7 +106,7 @@ def update_splitters(self, x, y, sample_weight, nominal_attributes): splitter = self.splitter.clone() self.splitters[att_id] = splitter - splitter.update(att_val, y, sample_weight) + splitter.update(att_val, y, w) def best_split_suggestions(self, criterion, tree) -> list[BranchFactory]: """Find possible split candidates. @@ -149,7 +149,7 @@ def disable_attribute(self, att_id): del self.splitters[att_id] self._disabled_attrs.add(att_id) - def learn_one(self, x, y, *, sample_weight=1.0, tree=None): + def learn_one(self, x, y, *, w=1.0, tree=None): """Update the node with the provided sample. Parameters @@ -158,7 +158,7 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None): Sample attributes for updating the node. y Target value. - sample_weight + w Sample weight. tree Tree to update. @@ -169,9 +169,9 @@ def learn_one(self, x, y, *, sample_weight=1.0, tree=None): All classes overriding this method should include a call to `super().learn_one` to guarantee the learning process happens consistently. """ - self.update_stats(y, sample_weight) + self.update_stats(y, w) if self.is_active(): - self.update_splitters(x, y, sample_weight, tree.nominal_attributes) + self.update_splitters(x, y, w, tree.nominal_attributes) @abc.abstractmethod def prediction(self, x, *, tree=None) -> dict: diff --git a/river/tree/splitter/base.py b/river/tree/splitter/base.py index 98056dbffb..8524e14b8b 100644 --- a/river/tree/splitter/base.py +++ b/river/tree/splitter/base.py @@ -21,7 +21,7 @@ class Splitter(base.Estimator, abc.ABC): """ @abc.abstractmethod - def update(self, att_val, target_val: base.typing.Target, sample_weight: float): + def update(self, att_val, target_val: base.typing.Target, w: float): """Update statistics of this observer given an attribute value, its target value and the weight of the instance observed. @@ -31,7 +31,7 @@ def update(self, att_val, target_val: base.typing.Target, sample_weight: float): The value of the monitored attribute. target_val The target value. - sample_weight + w The weight of the instance. 
""" diff --git a/river/tree/splitter/ebst_splitter.py b/river/tree/splitter/ebst_splitter.py index 17bffc6269..f11bce48e4 100644 --- a/river/tree/splitter/ebst_splitter.py +++ b/river/tree/splitter/ebst_splitter.py @@ -43,14 +43,14 @@ def __init__(self): def is_target_class(self) -> bool: return False - def update(self, att_val, target_val, sample_weight): + def update(self, att_val, target_val, w): if att_val is None: return else: if self._root is None: - self._root = EBSTNode(att_val, target_val, sample_weight) + self._root = EBSTNode(att_val, target_val, w) else: - self._root.insert_value(att_val, target_val, sample_weight) + self._root.insert_value(att_val, target_val, w) def cond_proba(self, att_val, target_val): """Not implemented in regression splitters.""" @@ -237,7 +237,7 @@ def _remove_bad_split_nodes(self, current_node, parent=None, is_left_child=True) class EBSTNode: - def __init__(self, att_val, target_val, sample_weight): + def __init__(self, att_val, target_val, w): self.att_val = att_val if isinstance(target_val, dict): @@ -250,23 +250,23 @@ def __init__(self, att_val, target_val, sample_weight): self.estimator = Var() self._update_estimator = self._update_estimator_univariate - self._update_estimator(self, target_val, sample_weight) + self._update_estimator(self, target_val, w) self._left = None self._right = None @staticmethod - def _update_estimator_univariate(node, target, sample_weight): - node.estimator.update(target, sample_weight) + def _update_estimator_univariate(node, target, w): + node.estimator.update(target, w) @staticmethod - def _update_estimator_multivariate(node, target, sample_weight): + def _update_estimator_multivariate(node, target, w): for t in target: - node.estimator[t].update(target[t], sample_weight) + node.estimator[t].update(target[t], w) # Incremental implementation of the insert method. 
Avoiding unnecessary # stack growth reduces memory costs - def insert_value(self, att_val, target_val, sample_weight): + def insert_value(self, att_val, target_val, w): current = self antecedent = None is_right = False @@ -274,10 +274,10 @@ while current is not None: antecedent = current if att_val == current.att_val: - self._update_estimator(current, target_val, sample_weight) + self._update_estimator(current, target_val, w) return elif att_val < current.att_val: - self._update_estimator(current, target_val, sample_weight) + self._update_estimator(current, target_val, w) current = current._left is_right = False @@ -287,6 +287,6 @@ # Value was not yet added to the tree if is_right: - antecedent._right = EBSTNode(att_val, target_val, sample_weight) + antecedent._right = EBSTNode(att_val, target_val, w) else: - antecedent._left = EBSTNode(att_val, target_val, sample_weight) + antecedent._left = EBSTNode(att_val, target_val, w) diff --git a/river/tree/splitter/exhaustive_splitter.py b/river/tree/splitter/exhaustive_splitter.py index 7ba3b8ad83..945f00daea 100644 --- a/river/tree/splitter/exhaustive_splitter.py +++ b/river/tree/splitter/exhaustive_splitter.py @@ -30,14 +30,14 @@ def __init__(self): super().__init__() self._root = None - def update(self, att_val, target_val, sample_weight): + def update(self, att_val, target_val, w): if att_val is None: return else: if self._root is None: - self._root = ExhaustiveNode(att_val, target_val, sample_weight) + self._root = ExhaustiveNode(att_val, target_val, w) else: - self._root.insert_value(att_val, target_val, sample_weight) + self._root.insert_value(att_val, target_val, w) def cond_proba(self, att_val, target_val): """The underlying data structure used to monitor the input does not allow probability @@ -148,27 +148,27 @@ def _search_for_best_split_option( class ExhaustiveNode: - def __init__(self, att_val, target_val, sample_weight): + def __init__(self, att_val, target_val, w): self.class_count_left = defaultdict(float) self.class_count_right = defaultdict(float) self._left = None self._right = None self.cut_point = att_val - self.class_count_left[target_val] += sample_weight + self.class_count_left[target_val] += w - def insert_value(self, val, label, sample_weight): + def insert_value(self, val, label, w): if val == self.cut_point: - self.class_count_left[label] += sample_weight + self.class_count_left[label] += w elif val < self.cut_point: - self.class_count_left[label] += sample_weight + self.class_count_left[label] += w if self._left is None: - self._left = ExhaustiveNode(val, label, sample_weight) + self._left = ExhaustiveNode(val, label, w) else: - self._left.insert_value(val, label, sample_weight) + self._left.insert_value(val, label, w) else: - self.class_count_right[label] += sample_weight + self.class_count_right[label] += w if self._right is None: - self._right = ExhaustiveNode(val, label, sample_weight) + self._right = ExhaustiveNode(val, label, w) else: - self._right.insert_value(val, label, sample_weight) + self._right.insert_value(val, label, w) diff --git a/river/tree/splitter/gaussian_splitter.py b/river/tree/splitter/gaussian_splitter.py index b7edb9d4ea..cdf50f5cc5 100644 --- a/river/tree/splitter/gaussian_splitter.py +++ b/river/tree/splitter/gaussian_splitter.py @@ -30,7 +30,7 @@ def __init__(self, n_splits: int = 10): self._att_dist_per_class: dict[ClfTarget, Gaussian] = {} self.n_splits =
n_splits - def update(self, att_val, target_val, sample_weight): + def update(self, att_val, target_val, w): if att_val is None: return else: @@ -46,7 +46,7 @@ def update(self, att_val, target_val, sample_weight): self._min_per_class[target_val] = att_val self._max_per_class[target_val] = att_val - val_dist.update(att_val, sample_weight) + val_dist.update(att_val, w) def cond_proba(self, att_val, target_val): if target_val in self._att_dist_per_class: diff --git a/river/tree/splitter/histogram_splitter.py b/river/tree/splitter/histogram_splitter.py index 55a5f6bdf7..cde9d490c3 100644 --- a/river/tree/splitter/histogram_splitter.py +++ b/river/tree/splitter/histogram_splitter.py @@ -33,8 +33,8 @@ def __init__(self, n_bins: int = 256, n_splits: int = 32): functools.partial(sketch.Histogram, max_bins=self.n_bins) ) - def update(self, att_val, target_val, sample_weight): - for _ in range(int(sample_weight)): + def update(self, att_val, target_val, w): + for _ in range(int(w)): self.hists[target_val].update(att_val) def cond_proba(self, att_val, target_val): diff --git a/river/tree/splitter/nominal_splitter_classif.py b/river/tree/splitter/nominal_splitter_classif.py index f1fcf09c77..189d04b3d5 100644 --- a/river/tree/splitter/nominal_splitter_classif.py +++ b/river/tree/splitter/nominal_splitter_classif.py @@ -24,18 +24,18 @@ def __init__(self): def is_numeric(self): return False - def update(self, att_val, target_val, sample_weight): + def update(self, att_val, target_val, w): if att_val is None: - self._missing_weight_observed += sample_weight + self._missing_weight_observed += w else: self._att_values.add(att_val) try: - self._att_dist_per_class[target_val][att_val] += sample_weight + self._att_dist_per_class[target_val][att_val] += w except KeyError: - self._att_dist_per_class[target_val][att_val] = sample_weight + self._att_dist_per_class[target_val][att_val] = w - self._total_weight_observed += sample_weight + self._total_weight_observed += w def cond_proba(self, att_val, target_val): class_dist = self._att_dist_per_class[target_val] diff --git a/river/tree/splitter/nominal_splitter_reg.py b/river/tree/splitter/nominal_splitter_reg.py index ebe7bf7b5e..a2ebff001c 100644 --- a/river/tree/splitter/nominal_splitter_reg.py +++ b/river/tree/splitter/nominal_splitter_reg.py @@ -28,16 +28,16 @@ def is_numeric(self): return False @staticmethod - def _update_estimator_univariate(estimator, target, sample_weight): - estimator.update(target, sample_weight) + def _update_estimator_univariate(estimator, target, w): + estimator.update(target, w) @staticmethod - def _update_estimator_multivariate(estimator, target, sample_weight): + def _update_estimator_multivariate(estimator, target, w): for t in target: - estimator[t].update(target[t], sample_weight) + estimator[t].update(target[t], w) - def update(self, att_val, target_val, sample_weight): - if att_val is None or sample_weight is None: + def update(self, att_val, target_val, w): + if att_val is None or w is None: return else: try: @@ -49,7 +49,7 @@ def update(self, att_val, target_val, sample_weight): else: self._statistics[att_val] = Var() estimator = self._statistics[att_val] - self._update_estimator(estimator, target_val, sample_weight) + self._update_estimator(estimator, target_val, w) def cond_proba(self, att_val, target_val): """Not implemented in regression splitters.""" diff --git a/river/tree/splitter/qo_splitter.py b/river/tree/splitter/qo_splitter.py index 8e246e17a4..269249a19b 100644 --- a/river/tree/splitter/qo_splitter.py +++ 
b/river/tree/splitter/qo_splitter.py @@ -66,11 +66,11 @@ def __init__(self, radius: float = 0.25, allow_multiway_splits=False): self.allow_multiway_splits = allow_multiway_splits - def update(self, att_val, target_val, sample_weight): + def update(self, att_val, target_val, w): if att_val is None: return else: - self._quantizer.update(att_val, target_val, sample_weight) + self._quantizer.update(att_val, target_val, w) def cond_proba(self, att_val, target_val): raise NotImplementedError @@ -162,12 +162,12 @@ def _init_estimator(self, y): self.y_stats = stats.Var() self._update_estimator = self._update_estimator_univariate - def _update_estimator_univariate(self, target, sample_weight): - self.y_stats.update(target, sample_weight) + def _update_estimator_univariate(self, target, w): + self.y_stats.update(target, w) - def _update_estimator_multivariate(self, target, sample_weight): + def _update_estimator_multivariate(self, target, w): for t in target: - self.y_stats[t].update(target[t], sample_weight) + self.y_stats[t].update(target[t], w) def __iadd__(self, o): self.x_stats += o.x_stats @@ -175,9 +175,9 @@ def __iadd__(self, o): return self - def update(self, x, y, sample_weight): - self.x_stats.update(x, sample_weight) - self._update_estimator(y, sample_weight) + def update(self, x, y, w): + self.x_stats.update(x, w) + self._update_estimator(y, w) class FeatureQuantizer: diff --git a/river/tree/splitter/random_splitter.py b/river/tree/splitter/random_splitter.py index f198083414..4fd099f379 100644 --- a/river/tree/splitter/random_splitter.py +++ b/river/tree/splitter/random_splitter.py @@ -29,17 +29,17 @@ def clone(self, new_params: dict | None = None, include_attributes=False): return super().clone(new_params, include_attributes) @abc.abstractmethod - def _update_stats(self, branch, target_val, sample_weight): + def _update_stats(self, branch, target_val, w): pass def cond_proba(self, att_val, class_val) -> float: """This attribute observer does not support probability density estimation.""" raise NotImplementedError - def update(self, att_val, target_val, sample_weight) -> Splitter: + def update(self, att_val, target_val, w) -> Splitter: if self.threshold is None: if len(self._buffer) < self.buffer_size: - self._buffer.append((att_val, target_val, sample_weight)) + self._buffer.append((att_val, target_val, w)) return self mn = min(self._buffer, key=lambda t: t[0])[0] @@ -53,7 +53,7 @@ def update(self, att_val, target_val, sample_weight) -> Splitter: return self - self._update_stats(0 if att_val <= self.threshold else 1, target_val, sample_weight) + self._update_stats(0 if att_val <= self.threshold else 1, target_val, w) return self @@ -76,8 +76,8 @@ def __init__(self, seed, buffer_size): super().__init__(seed, buffer_size) self.stats = {0: stats.Var(), 1: stats.Var()} - def _update_stats(self, branch, target_val, sample_weight): - self.stats[branch].update(target_val, sample_weight) + def _update_stats(self, branch, target_val, w): + self.stats[branch].update(target_val, w) @property def is_target_class(self) -> bool: diff --git a/river/tree/splitter/tebst_splitter.py b/river/tree/splitter/tebst_splitter.py index f5b623060e..68e2212b82 100644 --- a/river/tree/splitter/tebst_splitter.py +++ b/river/tree/splitter/tebst_splitter.py @@ -22,10 +22,10 @@ def __init__(self, digits: int = 1): super().__init__() self.digits = digits - def update(self, att_val, target_val, sample_weight): + def update(self, att_val, target_val, w): try: att_val = round(att_val, self.digits) - 
super().update(att_val, target_val, sample_weight) + super().update(att_val, target_val, w) except TypeError: # feature value is None pass