
Fixes for grouped dataset #631

Merged · 21 commits · Jan 13, 2025
8 changes: 6 additions & 2 deletions CHANGELOG.md
@@ -37,11 +37,11 @@
approximation
[PR #601](https://github.com/aai-institute/pyDVL/pull/601)

## Fixed
### Fixed

- Replace `np.float_` with `np.float64` and `np.alltrue` with `np.all`,
as the old aliases are removed in NumPy 2.0
[PR #604](https://github.com/aai-institute/pyDVL/pull/604)

- Fix a bug in `pydvl.utils.numeric.random_subset` where `1 - q` was used instead of `q`
  as the probability of an element being sampled
[PR #597](https://github.com/aai-institute/pyDVL/pull/597)
@@ -53,6 +53,10 @@

### Changed

- Introduced the concept of logical vs data indices for `Dataset` and
  `GroupedDataset`, fixing inconsistencies in how the latter operates on indices.
Also, both now return objects of the same type when slicing.
[PR #631](https://github.com/aai-institute/pyDVL/pull/631)
- Use tighter bounds for the calculation of the minimal sample size that guarantees
an epsilon-delta approximation in group testing (Jia et al. 2023)
[PR #602](https://github.com/aai-institute/pyDVL/pull/602)
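The two CHANGELOG entries under "Fixed" above can be sketched in a few lines. The alias replacements are exactly as stated (NumPy 2.0 removed `np.float_` and `np.alltrue`); the `random_subset` function below is a hypothetical reimplementation of the corrected semantics from PR #597, not pyDVL's actual code:

```python
import numpy as np

# NumPy 2.0 removed several long-deprecated aliases; the modern spellings
# below work on both NumPy 1.x and 2.x:
values = np.array([0.5, 1.5], dtype=np.float64)  # was: np.float_
all_positive = bool(np.all(values > 0))          # was: np.alltrue(values > 0)

# Sketch of the corrected random_subset semantics: each element is kept
# with probability q, not 1 - q as before the fix.
def random_subset(s: np.ndarray, q: float, seed: int = 42) -> np.ndarray:
    rng = np.random.default_rng(seed)
    return s[rng.random(len(s)) <= q]
```

With `q=1.0` every element is kept and with `q=0.0` (almost surely) none are, which matches the documented meaning of `q` as the sampling probability.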
26 changes: 9 additions & 17 deletions notebooks/least_core_basic_new.ipynb
@@ -216,13 +216,9 @@
"metadata": {},
"outputs": [],
"source": [
"model.fit(full_train_data.x, full_train_data.y)\n",
"print(\n",
" f\"Training accuracy: {100 * model.score(full_train_data.x, full_train_data.y):0.2f}%\"\n",
")\n",
"print(\n",
" f\"Testing accuracy: {100 * model.score(full_test_data.x, full_test_data.y):0.2f}%\"\n",
")"
"model.fit(*full_train_data.data())\n",
"print(f\"Training accuracy: {100 * model.score(*full_train_data.data()):0.2f}%\")\n",
"print(f\"Testing accuracy: {100 * model.score(*full_test_data.data()):0.2f}%\")"
]
},
{
@@ -232,21 +228,17 @@
"metadata": {},
"outputs": [],
"source": [
"model.fit(small_train_data.x, small_train_data.y)\n",
"print(\n",
" f\"Training accuracy: {100 * model.score(small_train_data.x, small_train_data.y):0.2f}%\"\n",
")\n",
"print(\n",
" f\"Testing accuracy: {100 * model.score(small_test_data.x, small_test_data.y):0.2f}%\"\n",
")"
"model.fit(*small_train_data.data())\n",
"print(f\"Training accuracy: {100 * model.score(*small_train_data.data()):0.2f}%\")\n",
"print(f\"Testing accuracy: {100 * model.score(*small_test_data.data()):0.2f}%\")"
]
},
{
"cell_type": "markdown",
"id": "92b4a2d1",
"metadata": {},
"source": [
"## Estimating Least Core Values\n",
"## Estimating Least-Core Values\n",
"\n",
"In this first section we will use a smaller subset of the dataset containing 10 samples in order to be able to compute exact values in a reasonable amount of time. Afterwards, we will use the Monte Carlo method with a limited budget (maximum number of subsets) to approximate these values."
]
@@ -495,9 +487,9 @@
" if np.any([x >= 1.0 or x < 0.0 for x in percentages]):\n",
" raise ValueError(\"All percentages should be in the range [0.0, 1.0)\")\n",
"\n",
" if len(values) != len(training_data.indices):\n",
" if len(values) != len(training_data):\n",
" raise ValueError(\n",
" f\"The number of values, {len(values)}, should be equal to the number of data indices, {len(training_data.indices)}\"\n",
" f\"The number of values, {len(values)}, should be equal to the number of data indices, {len(training_data)}\"\n",
" )\n",
"\n",
" scores = {}\n",
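The notebook diff above replaces attribute access (`dataset.x`, `dataset.y`, `len(dataset.indices)`) with the new `*dataset.data()` unpacking and `len(dataset)` calls. The class below is a hypothetical minimal stand-in illustrating that calling convention, not pyDVL's actual `Dataset`:

```python
import numpy as np

class Dataset:
    """Hypothetical stand-in for pyDVL's Dataset, for illustration only."""

    def __init__(self, x: np.ndarray, y: np.ndarray):
        self._x, self._y = x, y

    def data(self) -> tuple[np.ndarray, np.ndarray]:
        # Returns the (features, targets) pair so callers can unpack it
        # directly, e.g. model.fit(*dataset.data()).
        return self._x, self._y

    def __len__(self) -> int:
        # len(dataset) replaces len(dataset.indices) in validation code.
        return len(self._x)

train = Dataset(np.arange(6).reshape(3, 2), np.array([0, 1, 0]))
x, y = train.data()
```

Returning a tuple from `data()` keeps call sites short (`model.fit(*train.data())`) and hides whether the indices involved are logical or data indices, which is the point of the PR.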
4 changes: 2 additions & 2 deletions src/pydvl/reporting/plots.py
@@ -1,5 +1,5 @@
from functools import partial
from typing import Any, List, Literal, Optional, OrderedDict, Sequence
from typing import Any, List, Literal, Optional, OrderedDict, Sequence, cast

import matplotlib.pyplot as plt
import numpy as np
@@ -292,7 +292,7 @@ def plot_influence_distribution(
ax.set_xlabel("Influence values")
ax.set_ylabel("Number of samples")
ax.set_title(f"Distribution of influences {title_extra}")
return ax
return cast(plt.Axes, ax)


def plot_influence_distribution_by_label(
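The `plots.py` change wraps the return value in `typing.cast(plt.Axes, ax)`. `cast` only informs the static type checker; at runtime it returns its argument unchanged. The snippet below demonstrates this with standard-library types only (no matplotlib required), so the names here are illustrative rather than pyDVL's:

```python
from typing import Any, cast

def first_item(items: list[Any]) -> int:
    # Static checkers see items[0] as Any; cast narrows the declared type.
    # At runtime cast performs no check or conversion -- it simply returns
    # its argument, exactly like cast(plt.Axes, ax) in plots.py.
    return cast(int, items[0])

result = first_item([41, "ignored"])
```

Because `cast` is a runtime no-op, it is the idiomatic way to satisfy a declared return type (`plt.Axes` here) when the checker cannot infer it, without adding any runtime cost.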