feat: Add HBN dataset (#576)

aeye-lab · Sep 28, 2023 · c8ef04d · c8ef04d
1 parent 73793f3
commit c8ef04d
Show file tree

Hide file tree

Showing 6 changed files with 313 additions and 1 deletion.
diff --git a/docs/source/bibliography.bib b/docs/source/bibliography.bib
@@ -86,3 +86,80 @@ @inproceedings{SB-SAT
     publisher = {Association for Computing Machinery},
     address = "Stuttgart, Germany",
 }
+
+@article{HBN,
+  title={An open resource for transdiagnostic research in pediatric mental health and learning disorders},
+  author={Alexander, Lindsay M. and
+ Escalera, Jasmine and
+ Ai, Lei and
+ Andreotti, Charissa and
+ Febre, Karina and
+ Mangone, Alexander and
+ Vega-Potler, Natan and
+ Langer, Nicolas and
+ Alexander, Alexis and
+ Kovacs, Meagan and
+ Litke, Shannon and
+ O'Hagan, Bridget and
+ Andersen, Jennifer and
+ Bronstein, Batya and
+ Bui, Anastasia and
+ Bushey, Marijayne and
+ Butler, Henry and
+ Castagna, Victoria and
+ Camacho, Nicolas and
+ Chan, Elisha and
+ Citera, Danielle and
+ Clucas, Jon and
+ Cohen, Samantha and
+ Dufek, Sarah and
+ Eaves, Megan and
+ Fradera, Brian and
+ Gardner, Judith and
+ Grant-Villegas, Natalie and
+ Green, Gabriella and
+ Gregory, Camille and
+ Hart, Emily and
+ Harris, Shana and
+ Horton, Megan and
+ Kahn, Danielle and
+ Kabotyanski, Katherine and
+ Karmel, Bernard and
+ Kelly, Simon P. and
+ Kleinman, Kayla and
+ Koo, Bonhwang and
+ Kramer, Eliza and
+ Lennon, Elizabeth and
+ Lord, Catherine and
+ Mantello, Ginny and
+ Margolis, Amy and
+ Merikangas, Kathleen R. and
+ Milham, Judith and
+ Minniti, Giuseppe and
+ Neuhaus, Rebecca and
+ Levine, Alexandra and
+ Osman, Yael and
+ Parra, Lucas C. and
+ Pugh, Ken R. and
+ Racanello, Amy and
+ Restrepo, Anita and
+ Saltzman, Tian and
+ Septimus, Batya and
+ Tobe, Russell and
+ Waltz, Rachel and
+ Williams, Anna and
+ Yeo, Anna and
+ Castellanos, Francisco X. and
+ Klein, Arno and
+ Paus, Tomas and
+ Leventhal, Bennett L. and
+ Craddock, R. Cameron and
+ Koplewicz, Harold S. and
+ Milham, Michael P.},
+  journal={Scientific Data},
+  volume={4},
+  number={1},
+  pages={1--26},
+  year={2017},
+  publisher={Nature Publishing Group}
+}
diff --git a/src/pymovements/datasets/__init__.py b/src/pymovements/datasets/__init__.py
@@ -28,6 +28,7 @@
     pymovements.datasets.GazeBase
     pymovements.datasets.GazeBaseVR
     pymovements.datasets.GazeOnFaces
+    pymovements.datasets.HBN
     pymovements.datasets.JuDo1000
     pymovements.datasets.SBSAT
 
@@ -44,6 +45,7 @@
 from pymovements.datasets.gaze_on_faces import GazeOnFaces
 from pymovements.datasets.gazebase import GazeBase
 from pymovements.datasets.gazebasevr import GazeBaseVR
+from pymovements.datasets.hbn import HBN
 from pymovements.datasets.judo1000 import JuDo1000
 from pymovements.datasets.sb_sat import SBSAT
 from pymovements.datasets.toy_dataset import ToyDataset
@@ -54,6 +56,7 @@
     'GazeBase',
     'GazeBaseVR',
     'GazeOnFaces',
+    'HBN',
     'JuDo1000',
     'SBSAT',
     'ToyDataset',

diff --git a/src/pymovements/datasets/hbn.py b/src/pymovements/datasets/hbn.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2022-2023 The pymovements Project Authors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""This module provides an interface to the HBN dataset."""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from dataclasses import field
+from typing import Any
+
+import polars as pl
+
+from pymovements.dataset.dataset_definition import DatasetDefinition
+from pymovements.dataset.dataset_library import register_dataset
+from pymovements.gaze.experiment import Experiment
+
+
+@dataclass
+@register_dataset
+class HBN(DatasetDefinition):
+    """HBN dataset :cite:p:`HBN`.
+
+    This dataset consists of recordings from children
+    watching four different age-appropriate videos: (1) an
+    educational video clip (Fun with Fractals), (2) a short animated
+    film (The Present), (3) a short clip of an animated film (Despicable Me),
+    and (4) a trailer for a feature-length movie (Diary of a Wimpy Kid).
+    The eye gaze was recorded at a sampling rate of 120 Hz.
+
+    Check the respective paper for details :cite:p:`HBN`.
+
+    Attributes
+    ----------
+    name : str
+        The name of the dataset.
+
+    mirrors : tuple[str, ...]
+        A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
+
+    resources : tuple[dict[str, str], ...]
+        A tuple of dataset resources. Each entry must be a dictionary with the following keys:
+        - `resource`: The url suffix of the resource. This will be concatenated with the mirror.
+        - `filename`: The filename under which the file is saved as.
+        - `md5`: The MD5 checksum of the respective file.
+
+    experiment : Experiment
+        The experiment definition.
+
+    filename_format : str
+        Regular expression which will be matched before trying to load the file. Named groups will
+        appear in the `fileinfo` dataframe.
+
+    filename_format_dtypes : dict[str, type], optional
+        If named groups are present in the `filename_format`, this makes it possible to cast
+        specific named groups to a particular datatype.
+
+    column_map : dict[str, str]
+        The keys are the columns to read, the values are the names to which they should be renamed.
+
+    custom_read_kwargs : dict[str, Any], optional
+        If specified, these keyword arguments will be passed to the file reading function.
+
+    Examples
+    --------
+    Initialize your :py:class:`~pymovements.Dataset` object with the
+    :py:class:`~pymovements.datasets.HBN` definition:
+
+    >>> import pymovements as pm
+    >>>
+    >>> dataset = pm.Dataset("HBN", path='data/HBN')
+
+    Download the dataset resources:
+
+    >>> dataset.download()  # doctest: +SKIP
+
+    Load the data into memory:
+
+    >>> dataset.load()  # doctest: +SKIP
+    """
+
+    # pylint: disable=similarities
+    # The DatasetDefinition child classes potentially share code chunks for definitions.
+
+    name: str = 'HBN'
+
+    mirrors: tuple[str, ...] = (
+        'https://files.osf.io/v1/resources/qknuv/providers/osfstorage/',
+    )
+
+    resources: tuple[dict[str, str], ...] = (
+        {
+            'resource': '651190031e76a453918a9971',
+            'filename': 'data.zip',
+            'md5': '2c523e911022ffc0eab700e34e9f7f30',
+        },
+    )
+
+    experiment: Experiment = Experiment(
+        screen_width_px=800,
+        screen_height_px=600,
+        screen_width_cm=33.8,
+        screen_height_cm=27.0,
+        distance_cm=63.5,
+        origin='center',
+        sampling_rate=120,
+    )
+
+    filename_format: str = r'{subject_id:12}_{video_id}.csv'
+
+    filename_format_dtypes: dict[str, type] = field(
+        default_factory=lambda: {
+            'subject_id': str,
+            'video_id': str,
+        },
+    )
+
+    trial_columns: list[str] = field(default_factory=lambda: ['video_id'])
+
+    time_column: str = 'time'
+
+    pixel_columns: list[str] = field(default_factory=lambda: ['x_pix', 'y_pix'])
+
+    column_map: dict[str, str] = field(default_factory=lambda: {})
+
+    custom_read_kwargs: dict[str, Any] = field(
+        default_factory=lambda: {
+            'separator': ',',
+            'columns': [
+                'time', 'x_pix', 'y_pix',
+            ],
+            'dtypes': [
+                pl.Float64, pl.Float64, pl.Float64,
+            ],
+        },
+    )
diff --git a/src/pymovements/datasets/sb_sat.py b/src/pymovements/datasets/sb_sat.py
@@ -17,7 +17,7 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
-"""This module provides an interface to the GazeOnFaces dataset."""
+"""This module provides an interface to the SB-SAT dataset."""
 from __future__ import annotations
 
 from dataclasses import dataclass

diff --git a/tests/datasets/datasets_test.py b/tests/datasets/datasets_test.py
@@ -32,6 +32,7 @@
         pytest.param(pm.datasets.GazeBase, 'GazeBase', id='GazeBase'),
         pytest.param(pm.datasets.GazeBaseVR, 'GazeBaseVR', id='GazeBaseVR'),
         pytest.param(pm.datasets.GazeOnFaces, 'GazeOnFaces', id='GazeOnFaces'),
+        pytest.param(pm.datasets.HBN, 'HBN', id='HBN'),
         pytest.param(pm.datasets.JuDo1000, 'JuDo1000', id='JuDo1000'),
         pytest.param(pm.datasets.SBSAT, 'SBSAT', id='SBSAT'),
     ],
@@ -49,6 +50,7 @@ def test_public_dataset_registered(definition_class, dataset_name):
         pytest.param(pm.datasets.GazeBase, id='GazeBase'),
         pytest.param(pm.datasets.GazeBaseVR, id='GazeBaseVR'),
         pytest.param(pm.datasets.GazeOnFaces, id='GazeOnFaces'),
+        pytest.param(pm.datasets.HBN, id='HBN'),
         pytest.param(pm.datasets.JuDo1000, id='JuDo1000'),
         pytest.param(pm.datasets.SBSAT, id='SBSAT'),
     ],

diff --git a/tests/datasets/hbn_test.py b/tests/datasets/hbn_test.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2023 The pymovements Project Authors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""Test all functionality in pymovements.datasets.hbn."""
+from pathlib import Path
+
+import pytest
+
+import pymovements as pm
+
+
+@pytest.mark.parametrize(
+    'init_path, expected_paths',
+    [
+        pytest.param(
+            '/data/set/path',
+            {
+                'root': Path('/data/set/path/'),
+                'dataset': Path('/data/set/path/'),
+                'download': Path('/data/set/path/downloads'),
+            },
+        ),
+        pytest.param(
+            pm.DatasetPaths(root='/data/set/path'),
+            {
+                'root': Path('/data/set/path/'),
+                'dataset': Path('/data/set/path/HBN'),
+                'download': Path('/data/set/path/HBN/downloads'),
+            },
+        ),
+        pytest.param(
+            pm.DatasetPaths(root='/data/set/path', dataset='.'),
+            {
+                'root': Path('/data/set/path/'),
+                'dataset': Path('/data/set/path/'),
+                'download': Path('/data/set/path/downloads'),
+            },
+        ),
+        pytest.param(
+            pm.DatasetPaths(root='/data/set/path', dataset='dataset'),
+            {
+                'root': Path('/data/set/path/'),
+                'dataset': Path('/data/set/path/dataset'),
+                'download': Path('/data/set/path/dataset/downloads'),
+            },
+        ),
+        pytest.param(
+            pm.DatasetPaths(root='/data/set/path', downloads='custom_downloads'),
+            {
+                'root': Path('/data/set/path/'),
+                'dataset': Path('/data/set/path/HBN'),
+                'download': Path('/data/set/path/HBN/custom_downloads'),
+            },
+        ),
+    ],
+)
+def test_paths(init_path, expected_paths):
+    dataset = pm.Dataset(pm.datasets.HBN, path=init_path)
+
+    assert dataset.paths.root == expected_paths['root']
+    assert dataset.path == expected_paths['dataset']
+    assert dataset.paths.dataset == expected_paths['dataset']
+    assert dataset.paths.downloads == expected_paths['download']