Skip to content

Commit

Permalink
Fix stats for data with NaN (not a number) values (#2797)
Browse files Browse the repository at this point in the history
* replace NaN values with None (null) before computing stats

* add a column consisting entirely of float("nan") values to the test

* update docs
  • Loading branch information
polinaeterna authored Jun 6, 2024
1 parent 2d2172b commit 89a74df
Show file tree
Hide file tree
Showing 7 changed files with 162 additions and 134 deletions.
2 changes: 1 addition & 1 deletion docs/source/statistics.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ This type represents categorical data encoded as [`ClassLabel`](https://huggingf
The following measures are returned for float data types:

* minimum, maximum, mean, and standard deviation values
* number and proportion of `null` values
* number and proportion of `null` and `NaN` values (`NaN` values are treated as `null`)
* histogram with 10 bins

<details><summary>Example </summary>
Expand Down
1 change: 1 addition & 0 deletions services/worker/src/worker/statistics_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,7 @@ def _compute_statistics(
column_name: str,
n_samples: int,
) -> NumericalStatisticsItem:
data = data.fill_nan(None)
nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
if nan_count == n_samples: # all values are None
return all_nan_statistics_item(n_samples)
Expand Down
22 changes: 11 additions & 11 deletions services/worker/tests/fixtures/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
from datasets.features.features import FeatureType

from .statistics_dataset import (
all_nan_column,
audio_dataset,
image_dataset,
null_column,
statistics_dataset,
statistics_not_supported_dataset,
statistics_string_text_dataset,
Expand Down Expand Up @@ -180,7 +180,7 @@ def datasets() -> Mapping[str, Dataset]:
"duckdb_index": Dataset.from_dict(
{
"text": SEARCH_TEXT_CONTENT,
"text_all_nan": all_nan_column(5),
"text_all_null": null_column(5),
"column with spaces": [
"a",
"b",
Expand All @@ -195,15 +195,15 @@ def datasets() -> Mapping[str, Dataset]:
[1, 2, 3, 4],
[1, 2, 3, 4, 5],
],
"list_all_nan": all_nan_column(5),
"list_all_null": null_column(5),
"sequence_list": [
[1],
[1, 2],
None,
[1, 2, 3, 4],
[1, 2, 3, 4, 5],
],
"sequence_list_all_nan": all_nan_column(5),
"sequence_list_all_null": null_column(5),
"sequence_struct": [
[],
[{"author": "cat", "likes": 5}],
Expand All @@ -212,24 +212,24 @@ def datasets() -> Mapping[str, Dataset]:
None,
],
"audio": audio_dataset["audio"] + [None],
"audio_all_nan": all_nan_column(5),
"audio_all_null": null_column(5),
"image": image_dataset["image"] + [None],
"image_all_nan": all_nan_column(5),
"image_all_null": null_column(5),
},
features=Features(
{
"text": Value(dtype="string"),
"text_all_nan": Value(dtype="string"),
"text_all_null": Value(dtype="string"),
"column with spaces": Value(dtype="string"),
"list": [Value(dtype="int32")],
"list_all_nan": [Value(dtype="int32")],
"list_all_null": [Value(dtype="int32")],
"sequence_list": Sequence(Value(dtype="int32")),
"sequence_list_all_nan": Sequence(Value(dtype="int32")),
"sequence_list_all_null": Sequence(Value(dtype="int32")),
"sequence_struct": Sequence({"author": Value("string"), "likes": Value("int32")}),
"audio": Audio(sampling_rate=1600, decode=False),
"audio_all_nan": Audio(sampling_rate=1600, decode=False),
"audio_all_null": Audio(sampling_rate=1600, decode=False),
"image": Image(decode=False),
"image_all_nan": Image(decode=False),
"image_all_null": Image(decode=False),
}
),
),
Expand Down
Loading

0 comments on commit 89a74df

Please sign in to comment.