From f89a4f4b25ff869e57a544e8fd9bef0cddb24da6 Mon Sep 17 00:00:00 2001
From: prassepaul <prasse.paul@googlemail.com>
Date: Tue, 26 Sep 2023 18:45:03 +0200
Subject: [PATCH] Feature/dataset sbsat (#575)

* added from_csv (#504)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix docstring error

* docstring error

* flake8 issue

* cyclic import

* added import to docstring

* added dataset gaze_on_faces

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* from_ipc

* error in path

* unused arguments

* bug fix

* fixed time column in feather file

* requested changes

* sbsat dataset

* flake, pylint, mypy

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* bib

---------

Co-authored-by: prassepaul <paul.prasse@uni-potsdam.de>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: David R. Reich <43832476+SiQube@users.noreply.github.com>
---
 docs/source/bibliography.bib              |  10 ++
 src/pymovements/datasets/__init__.py      |   3 +
 src/pymovements/datasets/gaze_on_faces.py |   2 +-
 src/pymovements/datasets/sb_sat.py        | 151 ++++++++++++++++++++++
 tests/datasets/datasets_test.py           |   2 +
 tests/datasets/sbsat_test.py              |  79 +++++++++++
 6 files changed, 246 insertions(+), 1 deletion(-)
 create mode 100644 src/pymovements/datasets/sb_sat.py
 create mode 100644 tests/datasets/sbsat_test.py

diff --git a/docs/source/bibliography.bib b/docs/source/bibliography.bib
index f240e2163..0679166a0 100644
--- a/docs/source/bibliography.bib
+++ b/docs/source/bibliography.bib
@@ -76,3 +76,13 @@ @article{GazeOnFaces
   year={2016},
   publisher={The Association for Research in Vision and Ophthalmology},
 }
+
+@inproceedings{SB-SAT,
+    title = {Towards predicting reading comprehension from gaze behavior},
+    year = {2020},
+    booktitle = {Proceedings of the ACM Symposium on Eye Tracking Research and Applications},
+    author = {Ahn, Seoyoung and Kelton, Conor and Balasubramanian, Aruna and Zelinsky, Greg},
+    pages = {1--5},
+    publisher = {Association for Computing Machinery},
+    address = "Stuttgart, Germany",
+}
diff --git a/src/pymovements/datasets/__init__.py b/src/pymovements/datasets/__init__.py
index 75597101c..b2f0a543e 100644
--- a/src/pymovements/datasets/__init__.py
+++ b/src/pymovements/datasets/__init__.py
@@ -29,6 +29,7 @@
     pymovements.datasets.GazeBaseVR
     pymovements.datasets.GazeOnFaces
     pymovements.datasets.JuDo1000
+    pymovements.datasets.SBSAT
 
 
 .. rubric:: Example Datasets
@@ -44,6 +45,7 @@
 from pymovements.datasets.gazebase import GazeBase
 from pymovements.datasets.gazebasevr import GazeBaseVR
 from pymovements.datasets.judo1000 import JuDo1000
+from pymovements.datasets.sb_sat import SBSAT
 from pymovements.datasets.toy_dataset import ToyDataset
 from pymovements.datasets.toy_dataset_eyelink import ToyDatasetEyeLink
 
@@ -53,6 +55,7 @@
     'GazeBaseVR',
     'GazeOnFaces',
     'JuDo1000',
+    'SBSAT',
     'ToyDataset',
     'ToyDatasetEyeLink',
 ]
diff --git a/src/pymovements/datasets/gaze_on_faces.py b/src/pymovements/datasets/gaze_on_faces.py
index 0a5795dfe..7ed3d125f 100644
--- a/src/pymovements/datasets/gaze_on_faces.py
+++ b/src/pymovements/datasets/gaze_on_faces.py
@@ -34,7 +34,7 @@
 @dataclass
 @register_dataset
 class GazeOnFaces(DatasetDefinition):
-    """GazeBaseVR dataset :cite:p:`GazeOnFaces`.
+    """GazeOnFaces dataset :cite:p:`GazeOnFaces`.
 
     This dataset includes monocular eye tracking data from single participants in a single
     session. Eye movements are recorded at a sampling frequency of 60 Hz
diff --git a/src/pymovements/datasets/sb_sat.py b/src/pymovements/datasets/sb_sat.py
new file mode 100644
index 000000000..f6e09f968
--- /dev/null
+++ b/src/pymovements/datasets/sb_sat.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2022-2023 The pymovements Project Authors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""This module provides an interface to the SB-SAT dataset."""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from dataclasses import field
+from typing import Any
+
+import polars as pl
+
+from pymovements.dataset.dataset_definition import DatasetDefinition
+from pymovements.dataset.dataset_library import register_dataset
+from pymovements.gaze.experiment import Experiment
+
+
+@dataclass
+@register_dataset
+class SBSAT(DatasetDefinition):
+    """SB-SAT dataset :cite:p:`SB-SAT`.
+
+    This dataset includes monocular eye tracking data from single participants in a single
+    session. Eye movements are recorded at a sampling frequency of 1,000 Hz using an EyeLink 1000
+    eye tracker and are provided as pixel coordinates.
+
+    The participant is instructed to read texts and answer questions.
+
+    Check the respective paper for details :cite:p:`SB-SAT`.
+
+    Attributes
+    ----------
+    name : str
+        The name of the dataset.
+
+    mirrors : tuple[str, ...]
+        A tuple of mirrors of the dataset. Each entry must be of type `str` and end with a '/'.
+
+    resources : tuple[dict[str, str], ...]
+        A tuple of dataset resources. Each entry must be a dictionary with the following keys:
+        - `resource`: The url suffix of the resource. This will be concatenated with the mirror.
+        - `filename`: The filename under which the file is saved as.
+        - `md5`: The MD5 checksum of the respective file.
+
+    experiment : Experiment
+        The experiment definition.
+
+    filename_format : str
+        Regular expression which will be matched before trying to load the file. Named groups will
+        appear in the `fileinfo` dataframe.
+
+    filename_format_dtypes : dict[str, type], optional
+        If named groups are present in the `filename_format`, this makes it possible to cast
+        specific named groups to a particular datatype.
+
+    column_map : dict[str, str]
+        The keys are the columns to read, the values are the names to which they should be renamed.
+
+    custom_read_kwargs : dict[str, Any], optional
+        If specified, these keyword arguments will be passed to the file reading function.
+
+    Examples
+    --------
+    Initialize your :py:class:`~pymovements.Dataset` object with the
+    :py:class:`~pymovements.datasets.SBSAT` definition:
+
+    >>> import pymovements as pm
+    >>>
+    >>> dataset = pm.Dataset("SBSAT", path='data/SBSAT')
+
+    Download the dataset resources:
+
+    >>> dataset.download()# doctest: +SKIP
+
+    Load the data into memory:
+
+    >>> dataset.load()# doctest: +SKIP
+    """
+
+    # pylint: disable=similarities
+    # The DatasetDefinition child classes potentially share code chunks for definitions.
+
+    name: str = 'SBSAT'
+
+    mirrors: tuple[str, ...] = (
+        'https://files.de-1.osf.io/v1/resources/cdx69/providers/osfstorage/',
+    )
+
+    resources: tuple[dict[str, str], ...] = (
+        {
+            'resource': '64525979230ea6163c031267/?zip=',
+            'filename': 'csvs.zip',
+            'md5': '3cf074c93266b723437cf887f948c993',
+        },
+    )
+
+    experiment: Experiment = Experiment(
+        screen_width_px=768,
+        screen_height_px=1024,
+        screen_width_cm=42.4,
+        screen_height_cm=44.5,
+        distance_cm=70,
+        origin='center',
+        sampling_rate=1000,
+    )
+
+    filename_format: str = r'msd{subject_id:d}.csv'
+
+    filename_format_dtypes: dict[str, type] = field(
+        default_factory=lambda: {
+            'subject_id': int,
+        },
+    )
+
+    trial_columns: list[str] = field(default_factory=lambda: ['book_name', 'screen_id'])
+
+    time_column: str = 'time'
+
+    pixel_columns: list[str] = field(default_factory=lambda: ['x_left', 'y_left'])
+
+    column_map: dict[str, str] = field(default_factory=lambda: {})
+
+    custom_read_kwargs: dict[str, Any] = field(
+        default_factory=lambda: {
+            'separator': '\t',
+            'columns': [
+                'time', 'book_name', 'screen_id',
+                'x_left', 'y_left', 'pupil_left',
+            ],
+            'dtypes': [
+                pl.Int64, pl.Utf8, pl.Int64,
+                pl.Float64, pl.Float64, pl.Float64,
+            ],
+        },
+    )
diff --git a/tests/datasets/datasets_test.py b/tests/datasets/datasets_test.py
index 074002c10..02065e943 100644
--- a/tests/datasets/datasets_test.py
+++ b/tests/datasets/datasets_test.py
@@ -33,6 +33,7 @@
         pytest.param(pm.datasets.GazeBaseVR, 'GazeBaseVR', id='GazeBaseVR'),
         pytest.param(pm.datasets.GazeOnFaces, 'GazeOnFaces', id='GazeOnFaces'),
         pytest.param(pm.datasets.JuDo1000, 'JuDo1000', id='JuDo1000'),
+        pytest.param(pm.datasets.SBSAT, 'SBSAT', id='SBSAT'),
     ],
 )
 def test_public_dataset_registered(definition_class, dataset_name):
@@ -49,6 +50,7 @@ def test_public_dataset_registered(definition_class, dataset_name):
         pytest.param(pm.datasets.GazeBaseVR, id='GazeBaseVR'),
         pytest.param(pm.datasets.GazeOnFaces, id='GazeOnFaces'),
         pytest.param(pm.datasets.JuDo1000, id='JuDo1000'),
+        pytest.param(pm.datasets.SBSAT, id='SBSAT'),
     ],
 )
 def test_public_dataset_registered_correct_attributes(dataset_definition_class):
diff --git a/tests/datasets/sbsat_test.py b/tests/datasets/sbsat_test.py
new file mode 100644
index 000000000..3fcd41b93
--- /dev/null
+++ b/tests/datasets/sbsat_test.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2023 The pymovements Project Authors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""Test all functionality in pymovements.datasets.sb_sat."""
+from pathlib import Path
+
+import pytest
+
+import pymovements as pm
+
+
+@pytest.mark.parametrize(
+    'init_path, expected_paths',
+    [
+        pytest.param(
+            '/data/set/path',
+            {
+                'root': Path('/data/set/path/'),
+                'dataset': Path('/data/set/path/'),
+                'download': Path('/data/set/path/downloads'),
+            },
+        ),
+        pytest.param(
+            pm.DatasetPaths(root='/data/set/path'),
+            {
+                'root': Path('/data/set/path/'),
+                'dataset': Path('/data/set/path/SBSAT'),
+                'download': Path('/data/set/path/SBSAT/downloads'),
+            },
+        ),
+        pytest.param(
+            pm.DatasetPaths(root='/data/set/path', dataset='.'),
+            {
+                'root': Path('/data/set/path/'),
+                'dataset': Path('/data/set/path/'),
+                'download': Path('/data/set/path/downloads'),
+            },
+        ),
+        pytest.param(
+            pm.DatasetPaths(root='/data/set/path', dataset='dataset'),
+            {
+                'root': Path('/data/set/path/'),
+                'dataset': Path('/data/set/path/dataset'),
+                'download': Path('/data/set/path/dataset/downloads'),
+            },
+        ),
+        pytest.param(
+            pm.DatasetPaths(root='/data/set/path', downloads='custom_downloads'),
+            {
+                'root': Path('/data/set/path/'),
+                'dataset': Path('/data/set/path/SBSAT'),
+                'download': Path('/data/set/path/SBSAT/custom_downloads'),
+            },
+        ),
+    ],
+)
+def test_paths(init_path, expected_paths):
+    dataset = pm.Dataset(pm.datasets.SBSAT, path=init_path)
+
+    assert dataset.paths.root == expected_paths['root']
+    assert dataset.path == expected_paths['dataset']
+    assert dataset.paths.dataset == expected_paths['dataset']
+    assert dataset.paths.downloads == expected_paths['download']