From 88ada3b4af13e89f66ad088d44129f5cdda02a7d Mon Sep 17 00:00:00 2001
From: VincentAuriau
Date: Wed, 20 Dec 2023 16:48:20 +0100
Subject: [PATCH] ADD: data files from repo

---
 lib/data/choice_dataset_v3.py | 854 ++++++++++++++++++++++++++++++++++
 lib/data/indexer.py           | 258 ++++++++++
 lib/data/store.py             | 268 +++++++++++
 3 files changed, 1380 insertions(+)
 create mode 100644 lib/data/choice_dataset_v3.py
 create mode 100644 lib/data/indexer.py
 create mode 100644 lib/data/store.py

diff --git a/lib/data/choice_dataset_v3.py b/lib/data/choice_dataset_v3.py
new file mode 100644
index 00000000..ea12ab5e
--- /dev/null
+++ b/lib/data/choice_dataset_v3.py
@@ -0,0 +1,854 @@
+"""Main classes to handle assortment data."""
+
+import numpy as np
+import pandas as pd
+from choice_modeling.data.indexer import ChoiceDatasetIndexer
+from choice_modeling.data.store import Store
+
+
+class ChoiceDataset(object):
+    """
+    Version of ChoiceDataset where the choices are given as a ragged list of choices for each session.
+    It is particularly useful when several (possibly many) choices happen during the same session.
+    For example, if the same customer buys several items during the same session, all their choices
+    can be grouped under the same session features, which limits data duplication in such cases.
+
+    The class has the same methods/arguments as ChoiceDataset, with the slight difference that
+    self.choices is a ragged list. The features returned by self.__getitem__ are the same as ChoiceDataset.
+    When calling __getitem__(index) we map index to a session index and a choice index within the session.
+    """
+
+    def __init__(
+        self,
+        items_features=None,
+        sessions_features=None,
+        sessions_items_features=None,
+        items_features_names=None,
+        sessions_features_names=None,
+        sessions_items_features_names=None,
+        sessions_items_availabilities=None,
+        choices=None,  # Kept as a keyword for a logical argument order; None raises a ValueError below
+        batch_size=16,
+        shuffle=False,
+    ):
+        """
+        Builds the ChoiceDataset.
+
+        Parameters
+        ----------
+        items_features : tuple of (array_like, )
+            matrix of shape (num_items, num_items_features) containing the features of the items, e.g. item color
+        sessions_features : tuple of (array_like, )
+            matrix of shape (num_sessions, num_sess_features) containing the features of the sessions, e.g. day of week
+        sessions_items_features : tuple of (array_like, )
+            matrix of shape (num_sessions, num_items, num_ses_items_features) containing the item features varying over sessions, e.g. prices
+        sessions_items_availabilities : array_like
+            binary matrix of shape (num_sessions, num_items) containing the availabilities of products (1. if present, 0. otherwise) over sessions
+        choices: list of list
+            for each session, the list of choices made during that session. The outer list has the same
+            length as sessions_features and sessions_items_features.
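+            As an illustration (made-up values), choices=[[0, 2], [1]] means that items 0 and 2
+            were chosen during the first session and item 1 during the second one.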
+ batch_size: int, optional + size of the batches to return in __iter__ method + suffle: bool, optional + whether to shuffle the dataset or not + + """ + + # --------- [ Handling features type given as tuples or not ] --------- # + # If items_features is not given as tuple, transform it internally as a tuple + # A bit longer because can be None and need to also handle names + if not isinstance(items_features, tuple) and items_features is not None: + items_features = (items_features,) + items_features_names = (items_features_names,) + self._return_items_features_tuple = False + # items_features is already a tuple, names are given, checking consistency + elif items_features is not None and items_features_names is not None: + assert len(items_features) == len(items_features_names) or items_features_names is None + self._return_items_features_tuple = True + # In this case names are missing, still transform it as a tuple + elif items_features is not None: + self._return_items_features_tuple = True + items_features_names = (None,) * len(items_features) + + # If sessions_features is not given as tuple, transform it internally as a tuple + # A bit longer because can be None and need to also handle names + if not isinstance(sessions_features, tuple) and sessions_features is not None: + sessions_features = (sessions_features,) + sessions_features_names = (sessions_features_names,) + self._return_sessions_features_tuple = False + # sessions_features is already a tuple, names are given, checking consistency + elif sessions_features is not None and sessions_features_names is not None: + assert ( + len(sessions_features) == len(sessions_features_names) + or sessions_features_names is None + ) + self._return_sessions_features_tuple = True + # In this case names are missing, still transform it as a tuple + elif sessions_features is not None: + self._return_sessions_features_tuple = True + sessions_features_names = (None,) * len(sessions_features) + + # If sessions_items_features is not given as tuple, transform it internally as a tuple + # A bit longer because can be None and need to also handle names + if not isinstance(sessions_items_features, tuple) and sessions_items_features is not None: + sessions_items_features = (sessions_items_features,) + sessions_items_features_names = (sessions_items_features_names,) + self._return_sessions_items_features_tuple = False + # sessions_items_features is already a tuple, names are given, checking consistency + elif sessions_items_features is not None and sessions_items_features_names is not None: + assert ( + len(sessions_items_features) == len(sessions_items_features_names) + or sessions_items_features_names is None + ) + self._return_sessions_items_features_tuple = True + # In this case names are missing, still transform it as a tuple + elif sessions_items_features is not None: + self._return_sessions_items_features_tuple = True + sessions_items_features_names = (None,) * len(sessions_items_features) + + # --------- [Normalizing features types (DataFrame, List, etc...) -> np.ndarray] --------- # + # + # Part of this code is for handling features given as pandas.DataFrame + # Basically it transforms them to be internally stocked as np.ndarray and keep columns names as features names + + # Handling items_features + for i, feature in enumerate(items_features): + if isinstance(feature, pd.DataFrame): + # Ordering items by id ? 
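+                # For illustration (made-up column names): a DataFrame with columns
+                # ["item_id", "color"] is re-indexed and sorted by "item_id" below, and the
+                # remaining columns are kept as this feature's names.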
+ if "item_id" in feature.columns: + feature = feature.set_index("item_id") + items_features = ( + items_features[:i] + + (feature.loc[np.sort(feature.index)].values,) + + items_features[i + 1 :] + ) + items_features_names = ( + items_features_names[:i] + + (feature.columns.tolist(),) + + items_features_names[i + 1 :] + ) + elif isinstance(feature, list): + items_features = items_features[:i] + (np.array(feature),) + items_features[i + 1 :] + + # Handling sessions_features + for i, feature in enumerate(sessions_features): + if isinstance(feature, pd.DataFrame): + # Ordering sessions by id ? + if "session_id" in feature.columns: + feature = feature.set_index("session_id") + sessions_features = ( + sessions_features[:i] + + (feature.loc[np.sort(feature.index)].values,) + + sessions_features[i + 1 :] + ) + sessions_features_names = ( + sessions_features_names[:i] + + (feature.columns.tolist(),) + + sessions_features_names[i + 1 :] + ) + elif isinstance(feature, list): + sessions_features = ( + sessions_features[:i] + (np.array(feature),) + sessions_features[i + 1 :] + ) + + # Handling sessions_items_features + for i, feature in enumerate(sessions_items_features): + if isinstance(feature, pd.DataFrame): + # Ordering sessions and items by id ? + if "session_id" not in feature.columns: + feature["session_id"] = feature.index + items_index = np.sort(feature.item_id.unique()) + sessions_index = np.sort(feature.session_id.unique()) + names = [f for f in feature.columns if f != "session_id" and f != "item_id"] + + ( + feature, + sessions_items_availabilities, + ) = self._sessions_items_features_df_to_np( + feature, items_index, sessions_index, feature.columns.tolist() + ) + + sessions_items_features = ( + sessions_items_features[:i] + feature + sessions_items_features[i + 1 :] + ) + + sessions_items_features_names = ( + sessions_items_features_names[:i] + + (names,) + + sessions_items_features_names[i + 1 :] + ) + elif isinstance(feature, list): + sessions_items_features = ( + sessions_items_features[:i] + + (np.array(feature),) + + sessions_items_features[i + 1 :] + ) + + if isinstance(sessions_items_availabilities, list): + sessions_items_availabilities = np.array(sessions_items_availabilities) + + # Handling choices + # Choices must then be given as the name of the chosen item + # Items are sorted by name and attributed an index + # Cannot be a list of choices yet + if isinstance(choices, pd.DataFrame): + # Ordering sessions by id + if "session_id" in choices.columns: + choices = choices.set_index("session_id") + choices = choices.loc[np.sort(choices.index)] + items = np.sort(np.unique(choices.choice)) + # items is the value (str) of the item + choices = [np.where(items == c)[0] for c in choices.choice] + + # Setting attributes of ChoiceDataset + self.items_features = items_features + self.sessions_features = sessions_features + self.sessions_items_features = sessions_items_features + self.sessions_items_availabilities = sessions_items_availabilities + + self.items_features_names = items_features_names + self.sessions_features_names = sessions_features_names + self.sessions_items_features_names = sessions_items_features_names + + self.batch_size = batch_size + self.shuffle = shuffle + + if choices is None: + # Done to keep a logical order of arguments, and has logic: choices have to be specified + raise ValueError("Choices must be specified, got None") + self.ragged_choices = choices + self.indexes, self.choices = self._build_indexes(choices) + self.n_choices = len(self.choices) + + # Different 
+        # consistency checks to ensure everything is coherent
+        self._check_dataset()  # Should handle squeezed np.arrays on its own
+        self._return_types = self._check_types()
+        self._check_names()
+
+        # Build .iloc method
+        self.indexer = ChoiceDatasetIndexer(self)
+
+    def _build_indexes(self, choices):
+        """
+        Builds the indexes dictionary from the choices.
+        In particular, creates a flattened version of the choices and associates an index so that we can
+        retrieve from this index the session and the corresponding choice.
+
+        Parameters
+        ----------
+        choices: list of list
+            ragged version of the choices
+
+        Returns
+        -------
+        indexes: dict
+            dictionary of indexes: {index: corresponding_session_index}
+        choices: np.ndarray
+            flattened (1D) version of the choices
+        """
+        try:  # 1 choice by session
+            squeezed_choices = np.squeeze(choices)
+            if len(squeezed_choices.shape) == 1:
+                indexes = {i: i for i in range(len(choices))}
+                flat_choices = squeezed_choices
+            elif len(squeezed_choices.shape) == 0:
+                indexes = {i: i for i in range(len(choices))}
+                flat_choices = np.array([squeezed_choices])
+            else:
+                # Several choices per session given as a regular nested list: treat as ragged
+                raise ValueError("Several choices per session")
+        except ValueError:  # Ragged sequence of choices
+            indexes = {}
+            flat_choices = []
+            total_count = 0
+            for sess_nb, sess in enumerate(choices):
+                for choice in sess:
+                    indexes[total_count] = sess_nb
+                    flat_choices.append(choice)
+                    total_count += 1
+        return indexes, np.array(flat_choices)
+
+    def _check_dataset(self):
+        """
+        Verifies that the shapes of the different features are consistent:
+        - over the number of items
+        - over the number of sessions
+        Also verifies that the choices have coherent values.
+        """
+        self._check_num_items_shapes()
+        self._check_num_sessions_shapes()
+        self._check_choices_coherence()
+
+    def _check_num_items_shapes(self):
+        """
+        Verifies that the shapes of the different features are consistent over the number of items:
+        - items_features
+        - sessions_items_features
+        - sessions_items_availabilities
+        Sets self.base_num_items.
+        """
+        if self.items_features is not None:
+            base_num_items = self.items_features[0].shape[0]
+        elif self.sessions_items_features is not None:
+            base_num_items = self.sessions_items_features[0].shape[1]
+        elif self.sessions_items_availabilities is not None:
+            base_num_items = self.sessions_items_availabilities.shape[1]
+        else:
+            raise ValueError(
+                "No items features, sessions items features or items availabilities are defined"
+            )
+        self.base_num_items = base_num_items
+
+        if self.items_features is not None:
+            for items_feature in self.items_features:
+                assert (
+                    items_feature.shape[0] == base_num_items
+                ), f"shapes are ({items_feature.shape[0]}, {base_num_items})"
+
+        if self.sessions_items_features is not None:
+            for sessions_items_feature in self.sessions_items_features:
+                assert (
+                    sessions_items_feature.shape[1] == base_num_items
+                ), f"shapes are ({sessions_items_feature.shape[1]}, {base_num_items})"
+        if self.sessions_items_availabilities is not None:
+            assert (
+                self.sessions_items_availabilities.shape[1] == base_num_items
+            ), f"shapes are ({self.sessions_items_availabilities.shape[1]}, {base_num_items})"
+
+    def _check_num_sessions_shapes(self):
+        """
+        Verifies that the shapes of the different features are consistent over the number of sessions:
+        - sessions_features
+        - sessions_items_features
+        - sessions_items_availabilities
+        Sets self.base_num_sessions.
+ """ + base_num_sessions = len(self.ragged_choices) + self.base_num_sessions = base_num_sessions + + if self.sessions_features is not None: + for sessions_feature in self.sessions_features: + assert ( + sessions_feature.shape[0] == base_num_sessions + ), f"shapes are: ({sessions_feature.shape[0]}, {base_num_sessions})" + + if self.sessions_items_features is not None: + for sessions_items_feature in self.sessions_items_features: + assert ( + sessions_items_feature.shape[0] == base_num_sessions + ), f"shapes are: ({sessions_items_feature.shape[0]}, {base_num_sessions})" + if self.sessions_items_availabilities is not None: + assert ( + self.sessions_items_availabilities.shape[0] == base_num_sessions + ), f"shapes are: ({self.sessions_items_availabilities.shape[0]}, {base_num_sessions})" + + def _check_choices_coherence(self): + """ + Verifies that the choices are coherent with the number of items present in other features. + Particularly: + - There is no choice index higher than detected number of items + - All items are present at least once in the choices + """ + msg = f"Choices values not coherent with number of items given in features. In particular, \ + max value of choices is {np.max(self.choices)} while number of items is {self.base_num_items}" + assert np.max(self.choices) < self.base_num_items, msg + + unique_choices = set(np.unique(self.choices).flatten()) + missing_choices = set(np.arange(start=0, stop=self.base_num_items, step=1)) - unique_choices + if len(missing_choices) > 0: + print(f"Some choices never happen in the dataset: {missing_choices}") + + def _check_types(self): + """ + Checks types of elements and store it in order to return right types. + - Either int32 or float32 consistently for features. + float32 is to be preferred unless One-Hot encoding is used. 
+ - float32 for sessions_items_availabilities + - int32 for choices + """ + return_types = [] + + item_types = [] + if self.items_features is not None: + for item_feat in self.items_features: + if np.issubdtype(item_feat[0].dtype, np.integer): + item_types.append(np.int32) + else: + item_types.append(np.float32) + return_types.append(tuple(item_types)) + + session_types = [] + if self.sessions_features is not None: + for sessions_feat in self.sessions_features: + if np.issubdtype(sessions_feat[0].dtype, np.integer): + session_types.append(np.int32) + else: + session_types.append(np.float32) + return_types.append(tuple(session_types)) + + session_item_types = [] + if self.sessions_items_features is not None: + for session_item_feat in self.sessions_items_features: + if np.issubdtype(session_item_feat[0].dtype, np.integer): + session_item_types.append(np.int32) + else: + session_item_types.append(np.float32) + return_types.append(tuple(session_item_types)) + return_types.append(np.float32) + return_types.append(np.int32) + + return return_types + + def _check_names(self): + if self.items_features_names is not None: + for name, features in zip(self.items_features_names, self.items_features): + if name is not None: + assert ( + len(name) == features.shape[1] + ), f"Specififed items_features_names has length {len(name)} while items_features has {features.shape[1]} elements" + + if self.sessions_features_names is not None: + for name, features in zip(self.sessions_features_names, self.sessions_features): + if name is not None: + assert ( + len(name) == features.shape[1] + ), f"Specififed sessions_features_names has length {len(name)} while sessions_features has {features.shape[1]} elements" + + if self.sessions_items_features_names is not None: + for ( + name, + features, + ) in zip(self.sessions_items_features_names, self.sessions_items_features): + if name is not None: + assert ( + len(name) == features.shape[1] + ), f"Specififed sessions_items_features_names has length {len(name)} while sessions_items_features has {features.shape[1]} elements" + + def __len__(self): + """Returns length of the dataset e.g. total number of sessions. + + Returns: + -------- + int + total number of sessions + """ + return self.base_num_sessions + + def get_num_items(self): + """ + Method to access the total number of different items + + Returns + ------- + int + total number of different items + """ + return self.base_num_items + + def get_num_sessions(self): + """ + Method to access the total number of different sessions. + Redundant with __len__ method. + + Returns + ------- + int + total number of different sessions + """ + return len(self) + + def get_num_choices(self): + """ + Method to access the total number of different sessions + + Returns + ------- + int + total number of different sessions + """ + return self.n_choices + + @classmethod + def _sessions_items_features_df_to_np( + cls, + df, + items_index, + sessions_index, + features, + items_id_column="item_id", + sessions_id_column="session_id", + ): + """Builds sessions_items_features and sessions_items_availabilities from dataframe. 
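+        For illustration (made-up values): a long-format dataframe with columns
+        ["session_id", "item_id", "price"] is turned into an array of shape (n_sessions, n_items, 1);
+        any (session, item) pair missing from the dataframe is marked as unavailable (0.) for that
+        session and its features are filled with zeros.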
+ + Parameters + ---------- + df : pandas.DataFrame + Dataframe containing all the features for each item and sessions + items_index : list + List of items + sessions_index : list + List of sessions + features : list + List of columns of df that represents the items_features (for sessions_items_features) + + Returns + ------- + np.ndarray of shape (n_sessions, n_items, n_features) + Corresponding sessions_items_features + np.ndarray of shape (n_sessions, n_items) + Corresponding availabilities + """ + try: + features.remove("session_id") + except ValueError: + pass + try: + features.remove("item_id") + except ValueError: + pass + + sessions_items_features = [] + sessions_items_availabilities = [] + for sess in sessions_index: + sess_df = df.loc[df[sessions_id_column] == sess] + + if len(sess_df) == len(items_index): + sess_df = sess_df.T + sess_df.columns = sess_df.loc[items_id_column] + if features is not None: + sessions_items_features.append(sess_df[items_index].loc[features].T.values) + sessions_items_availabilities.append(np.ones(len(items_index))) + else: + sess_feats = [] + sess_av = [] + for item in items_index: + item_df = sess_df.loc[sess_df[items_id_column] == item] + if len(item_df) > 0: + if features is not None: + sess_feats.append(item_df[features].values[0]) + sess_av.append(1) + else: + if features is not None: + sess_feats.append(np.zeros(len(features))) + sess_av.append(0) + sessions_items_features.append(sess_feats) + sessions_items_availabilities.append(sess_av) + + if features is not None: + sessions_items_features = (np.array(sessions_items_features),) + else: + sessions_items_features = None + return sessions_items_features, np.array(sessions_items_availabilities) + + @classmethod + def from_single_df( + cls, + df, + items_features_columns, + sessions_features_columns, + sessions_items_features_columns, + items_id_column="item_id", + sessions_id_column="session_id", + choices_column="choice", + choice_mode="items_name", + ): + """Builds numpy arrays for ChoiceDataset from a single dataframe. 
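+        Illustrative call (column names other than item_id, session_id and choice are made up):
+            ChoiceDataset.from_single_df(
+                df,
+                items_features_columns=["color"],
+                sessions_features_columns=["day_of_week"],
+                sessions_items_features_columns=["price"],
+                choice_mode="item_id",
+            )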
+ + Parameters + ---------- + df : pandas.DataFrame + dataframe in Long format + items_features_columns : list + Columns of the dataframe that are item features + sessions_features_columns : list + Columns of the dataframe that are session features + sessions_items_features_columns : list + Columns of the dataframe that are session-item features + items_id_column: str, optional + Name of the column containing the item ids, default is "items_id" + sessions_id_column: str, optional + Name of the column containing the sessions ids, default is "sessions_id" + choices_column: str, optional + Name of the column containing the choices, default is "choice" + + Returns + ------- + ChoiceDataset + corresponding ChoiceDataset + """ + + # Ordering items and sessions by id + items = np.sort(df[items_id_column].unique()) + sessions = np.sort(df[sessions_id_column].unique()) + + if items_features_columns is not None: + items_features = df[items_features_columns + [items_id_column]].drop_duplicates() + items_features = items_features.set_index(items_id_column) + items_features = (items_features.loc[items].values,) + + items_features_columns = (items_features_columns,) + else: + items_features = None + + if sessions_features_columns is not None: + sessions_features = df[ + sessions_features_columns + [sessions_id_column] + ].drop_duplicates() + sessions_features = sessions_features.set_index(sessions_id_column) + sessions_features = (sessions_features.loc[sessions].values,) + + sessions_features_columns = (sessions_features_columns,) + else: + sessions_features = None + + ( + sessions_items_features, + sessions_items_availabilities, + ) = cls._sessions_items_features_df_to_np( + df, + items_index=items, + sessions_index=sessions, + features=sessions_items_features_columns, + items_id_column=items_id_column, + sessions_id_column=sessions_id_column, + ) + sessions_items_features_columns = ( + (sessions_items_features_columns,) + if sessions_items_features_columns is not None + else None + ) + + if choice_mode == "item_id": + choices = df[[choices_column, sessions_id_column]].drop_duplicates(sessions_id_column) + choices = choices.set_index(sessions_id_column) + choices = choices.loc[sessions].values + # items is the value (str) of the item + choices = [np.where(items == c)[0] for c in choices] + elif choice_mode == "one_zero": + choices = df[[items_id_column, choices_column, sessions_id_column]] + choices = choices.loc[choices[choices_column] == 1] + choices = choices = choices.set_index(sessions_id_column) + choices = ( + choices.loc[sessions][items_id_column] + .map({k: v for v, k in enumerate(items)}) + .values + ) + else: + raise ValueError( + f"choice_mode {choice_mode} not recognized. Must be in ['item_id', 'one_zero']" + ) + + return ChoiceDataset( + items_features=items_features, + sessions_features=sessions_features, + sessions_items_features=sessions_items_features, + sessions_items_availabilities=sessions_items_availabilities, + choices=choices, + items_features_names=items_features_columns, + sessions_features_names=sessions_features_columns, + sessions_items_features_names=sessions_items_features_columns, + ) + + def save(self): + raise NotImplementedError + + def summary(self): + raise NotImplementedError + + def get_choice_batch(self, choice_index): + """ + Method to access data within the ListChoiceDataset from its index. + One index corresponds to a choice within a session. 
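+        For illustration, with ragged choices [[0, 2], [1]] (made-up values), choice index 2 maps to
+        the second session: its session features are returned together with choice 1.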
+ + Return order: + - Fixed item features + - Session features + - Session item features + - Items availabilities + - Choice + + Parameters + ---------- + index : int or list of int or slice + indexes of the choices (that will be mapped to choice & session indexes) to return + + """ + if isinstance(choice_index, list): + if self.items_features is None: + items_features = None + else: + items_features = tuple( + items_feature.astype(self._return_types[0][i]) + for i, items_feature in enumerate(self.items_features) + ) + # items_features were not given as a tuple, so we return do not return it as a tuple + if not self._return_items_features_tuple: + items_features = items_features[0] + + # Get the session indexes + sessions_indexes = [self.indexes[i] for i in choice_index] + + if self.sessions_features is None: + sessions_features = None + else: + sessions_features = tuple( + np.stack(sessions_feature[sessions_indexes], axis=0).astype( + self._return_types[1][i] + ) + if not isinstance(sessions_feature, Store) + else sessions_feature.iloc[sessions_indexes] + for i, sessions_feature in enumerate(self.sessions_features) + ) + # sessions_features were not given as a tuple, so we return do not return it as a tuple + if not self._return_sessions_features_tuple: + sessions_features = sessions_features[0] + + if self.sessions_items_features is None: + sessions_items_features = None + else: + sessions_items_features = tuple( + np.stack(sessions_items_feature[sessions_indexes], axis=0).astype( + self._return_types[2][i] + ) + if not isinstance(sessions_items_feature, Store) + else sessions_items_feature.iloc[sessions_indexes] + for i, sessions_items_feature in enumerate(self.sessions_items_features) + ) + # sessions_items_features were not given as a tuple, so we return do not return it as a tuple + if not self._return_sessions_items_features_tuple: + sessions_items_features = sessions_items_features[0] + + if self.sessions_items_availabilities is None: + sessions_items_availabilities = None + else: + sessions_items_availabilities = self.sessions_items_availabilities[ + sessions_indexes + ].astype(self._return_types[3]) + + choice = self.choices[choice_index].astype(self._return_types[4]) + + return ( + items_features, + sessions_features, + sessions_items_features, + sessions_items_availabilities, + choice, + ) + + elif isinstance(choice_index, slice): + return self.get_choice_batch(list(range(*choice_index.indices(self.choices.shape[0])))) + + session_index = self.indexes[choice_index] + choice = self.choices[choice_index] + + if self.items_features is None: + items_features = None + else: + items_features = tuple(items_feature for items_feature in self.items_features) + + if self.sessions_features is None: + sessions_features = None + else: + sessions_features = tuple( + sessions_feature[session_index] for sessions_feature in self.sessions_features + ) + + if self.sessions_items_features is None: + sessions_items_features = None + else: + sessions_items_features = tuple( + sessions_items_feature[session_index] + for sessions_items_feature in self.sessions_items_features + ) + + if self.sessions_items_availabilities is None: + sessions_items_availabilities = None + else: + sessions_items_availabilities = self.sessions_items_availabilities[session_index] + + return ( + items_features, + sessions_features, + sessions_items_features, + sessions_items_availabilities, + choice, + ) + + def __getitem__(self, session_indexes): + """Method to create a sub-ChoiceDataset with only a subset of sessions, from 
their indexes. + + Parameters + ---------- + indexes : np.ndarray + indexes of the sessions to keep, shape should be (num_sessions,) + + Returns + ------- + ChoiceDataset + ChoiceDataset with only the sessions indexed by indexes + """ + if isinstance(session_indexes, int): + session_indexes = [session_indexes] + elif isinstance(session_indexes, slice): + return self.__getitem__(list(range(*session_indexes.indices(len(self.ragged_choices))))) + + return ChoiceDataset( + items_features=self.items_features, + sessions_features=tuple( + self.sessions_features[i][session_indexes] + for i in range(len(self.sessions_features)) + ), + sessions_items_features=tuple( + self.sessions_items_features[i][session_indexes] + for i in range(len(self.sessions_items_features)) + ), + sessions_items_availabilities=self.sessions_items_availabilities[session_indexes], + choices=[self.ragged_choices[i] for i in session_indexes], + batch_size=self.batch_size, + items_features_names=self.items_features_names, + sessions_features_names=self.sessions_features_names, + sessions_items_features_names=self.sessions_items_features_names, + ) + + def batch(self, batch_size=None, shuffle=None, sample_weight=None): + """ + Iterates over dataset return batches of length self.batch_size + + Arguments + --------- + batch_size : int + batch size to set + shuffle: bool + Whether or not to shuffle the dataset + sample_weight : Iterable + list of weights to be returned with the right indexing during the shuffling + """ + if batch_size is None: + batch_size = self.batch_size + if shuffle is None: + shuffle = self.shuffle + if batch_size == -1: + batch_size = self.get_num_choices() + + # Get indexes for each choice + num_choices = self.get_num_choices() + indexes = np.arange(num_choices) + # Shuffle indexes + if shuffle and not batch_size == -1: + indexes = np.random.permutation(indexes) + + yielded_size = 0 + while yielded_size < num_choices: + # Return sample_weight if not None, for index matching + if sample_weight is not None: + yield self.get_choice_batch( + indexes[yielded_size : yielded_size + batch_size].tolist() + ), sample_weight[indexes[yielded_size : yielded_size + batch_size].tolist()] + else: + yield self.get_choice_batch( + indexes[yielded_size : yielded_size + batch_size].tolist() + ) + yielded_size += batch_size + + # Special exit strategy for batch_size = -1 + if batch_size == -1: + yielded_size += 2 * num_choices + + @property + def iloc(self): + return self.indexer diff --git a/lib/data/indexer.py b/lib/data/indexer.py new file mode 100644 index 00000000..87d1e24c --- /dev/null +++ b/lib/data/indexer.py @@ -0,0 +1,258 @@ +from abc import abstractmethod + +import numpy as np +import pandas as pd + + +class Indexer(object): + def __init__(self, indexed_object): + self.indexed_object = indexed_object + + @abstractmethod + def __getitem__(self, index): + pass + + +class StoreIndexer(Indexer): + """Class for Ilocing FeaturesStore + + Parameters + ---------- + TBD + """ + + def __init__(self, store): + self.store = store + + def __getitem__(self, sequence_index): + """ + Returns the features corresponding appearing at the sequence_index-th position of sequence + + Parameters + ---------- + sequence_index : (int, list, slice) + index position of the sequence + + Returns + ------- + array_like + features corresponding to the sequence_index-th position of sequence + """ + if isinstance(sequence_index, list): + return [self.store.store[self.store.sequence[i]] for i in sequence_index] + elif isinstance(sequence_index, 
slice):
+            return [
+                self.store.store[self.store.sequence[i]]
+                for i in range(*sequence_index.indices(len(self.store.sequence)))
+            ]
+        return self.store.store[self.store.sequence[sequence_index]]
+
+
+class OneHotStoreIndexer(Indexer):
+    """Class for indexing (.iloc) a OneHotStore
+
+    Parameters
+    ----------
+    store : OneHotStore
+        OneHotStore to be indexed
+    """
+
+    def __init__(self, store):
+        self.store = store
+
+        self.shape = (len(self.store.sequence), np.max(list(self.store.store.values())) + 1)
+
+    def __getitem__(self, sequence_index):
+        """Main method to get an element at sequence_index-th position of self.sequence.
+
+        Parameters
+        ----------
+        sequence_index : (int, list, slice)
+            index from sequence of element to get
+
+        Returns
+        -------
+        np.ndarray
+            OneHot features corresponding to the sequence_index-th position of sequence
+        """
+        if isinstance(sequence_index, list):
+            # Construction of the OneHot vectors from the index of the 1 value
+            one_hot = np.zeros((len(sequence_index), self.shape[1]))
+            for i, j in enumerate(sequence_index):
+                one_hot[i, self.store.store[self.store.sequence[j]]] = 1
+            return one_hot.astype(self.store.dtype)
+        else:
+            one_hot = np.zeros(self.shape[1])
+            one_hot[self.store.store[self.store.sequence[sequence_index]]] = 1
+            return one_hot.astype(self.store.dtype)
+
+
+class ChoiceDatasetIndexer(Indexer):
+    """Indexing class for ChoiceDataset
+
+    Parameters
+    ----------
+    choice_dataset : ChoiceDataset
+        dataset to be indexed
+    """
+
+    def __init__(self, choice_dataset):
+        self.choice_dataset = choice_dataset
+
+    def _get_items_features(self):
+        if self.choice_dataset.items_features is None:
+            items_features = None
+        else:
+            items_features = tuple(
+                items_feature.astype(self.choice_dataset._return_types[0][i])
+                for i, items_feature in enumerate(self.choice_dataset.items_features)
+            )
+            # If items_features were not given as a tuple, we do not return them as a tuple
+            if not self.choice_dataset._return_items_features_tuple:
+                items_features = items_features[0]
+
+        return items_features
+
+    def _get_sessions_features(self, sessions_indexes):
+        if self.choice_dataset.sessions_features is None:
+            sessions_features = None
+        else:
+            sessions_features = []
+            for i, sessions_feature in enumerate(self.choice_dataset.sessions_features):
+                if hasattr(sessions_feature, "iloc"):
+                    sessions_features.append(
+                        sessions_feature.iloc[sessions_indexes].astype(
+                            self.choice_dataset._return_types[1][i]
+                        )
+                    )
+                else:
+                    sessions_features.append(
+                        np.stack(sessions_feature[sessions_indexes], axis=0).astype(
+                            self.choice_dataset._return_types[1][i]
+                        )
+                    )
+            # If sessions_features were not given as a tuple, we do not return them as a tuple
+            if not self.choice_dataset._return_sessions_features_tuple:
+                sessions_features = sessions_features[0]
+            else:
+                sessions_features = tuple(sessions_features)
+        return sessions_features
+
+    def _get_sessions_items_features(self, sessions_indexes):
+        if self.choice_dataset.sessions_items_features is None:
+            sessions_items_features = None
+        else:
+            sessions_items_features = []
+            for i, sessions_items_feature in enumerate(self.choice_dataset.sessions_items_features):
+                if hasattr(sessions_items_feature, "iloc"):
+                    sessions_items_features.append(
+                        sessions_items_feature.iloc[sessions_indexes].astype(
+                            self.choice_dataset._return_types[2][i]
+                        )
+                    )
+                else:
+                    sessions_items_features.append(
+                        np.stack(sessions_items_feature[sessions_indexes], axis=0).astype(
+                            self.choice_dataset._return_types[2][i]
+                        )
+                    )
+            # If sessions_items_features were not given as a tuple, we do not return them as a tuple
+            if 
self.choice_dataset._return_sessions_items_features_tuple: + sessions_items_features = tuple(sessions_items_features) + else: + sessions_items_features = sessions_items_features[0] + return sessions_items_features + + def __getitem__(self, choice_index): + """ + Method to access data within the ChoiceDataset from its index. + One index corresponds to a choice within a session. + + Return order: + - Fixed item features + - Session features + - Session item features + - Items availabilities + - Choice + + Parameters + ---------- + index : int or list of int or slice + indexes of the choices (that will be mapped to choice & session indexes) to return + + """ + if isinstance(choice_index, list): + items_features = self._get_items_features() + # Get the session indexes + sessions_indexes = [self.choice_dataset.indexes[i] for i in choice_index] + + sessions_features = self._get_sessions_features(sessions_indexes) + sessions_items_features = self._get_sessions_items_features(sessions_indexes) + + if self.choice_dataset.sessions_items_availabilities is None: + sessions_items_availabilities = None + else: + if hasattr(self.choice_dataset.sessions_items_availabilities, "iloc"): + sessions_items_availabilities = ( + self.choice_dataset.sessions_items_availabilities.iloc[ + sessions_indexes + ].astype(self.choice_dataset._return_types[3]) + ) + else: + sessions_items_availabilities = ( + self.choice_dataset.sessions_items_availabilities[sessions_indexes].astype( + self.choice_dataset._return_types[3] + ) + ) + + choice = self.choice_dataset.choices[choice_index].astype( + self.choice_dataset._return_types[4] + ) + + return ( + items_features, + sessions_features, + sessions_items_features, + sessions_items_availabilities, + choice, + ) + + elif isinstance(choice_index, slice): + return self.__getitem__(list(range(*choice_index.indices(self.choices.shape[0])))) + + elif isinstance(choice_index, int): + items_features = self._get_items_features() + # Get the session indexes + sessions_indexes = self.choice_dataset.indexes[choice_index] + + sessions_features = self._get_sessions_features(sessions_indexes) + sessions_items_features = self._get_sessions_items_features(sessions_indexes) + + if self.choice_dataset.sessions_items_availabilities is None: + sessions_items_availabilities = None + else: + if hasattr(self.choice_dataset.sessions_items_availabilities, "iloc"): + sessions_items_availabilities = ( + self.choice_dataset.sessions_items_availabilities.iloc[ + sessions_indexes + ].astype(self.choice_dataset._return_types[3]) + ) + else: + sessions_items_availabilities = ( + self.choice_dataset.sessions_items_availabilities[sessions_indexes].astype( + self.choice_dataset._return_types[3] + ) + ) + + choice = self.choice_dataset.choices[choice_index].astype( + self.choice_dataset._return_types[4] + ) + + return ( + items_features, + sessions_features, + sessions_items_features, + sessions_items_availabilities, + choice, + ) + else: + raise NotImplementedError diff --git a/lib/data/store.py b/lib/data/store.py new file mode 100644 index 00000000..b84ab286 --- /dev/null +++ b/lib/data/store.py @@ -0,0 +1,268 @@ +import numpy as np +from choice_modeling.data.indexer import OneHotStoreIndexer, StoreIndexer + + +class Store(object): + """Class to keep OneHotStore and FeaturesStore with same parent""" + + def __init__(self, indexes=None, values=None, sequence=None, name=None, indexer=StoreIndexer): + """ + Builds the store + + Parameters + ---------- + indexes : array_like or None + list of indexes of features 
+            to store. If None is given, indexes are created from the apparition order of values
+        values : array_like
+            list of values of features to store
+        sequence : array_like
+            sequence of apparitions of the features
+        name: string, optional
+            name of the features store -- not used at the moment
+        """
+        if indexes is None:
+            indexes = list(range(len(values)))
+        self.store = {k: v for (k, v) in zip(indexes, values)}
+        self.sequence = np.array(sequence)
+        self.name = name
+
+        if sequence is not None and values is not None:
+            try:
+                width = len(values[0])
+            except TypeError:  # Scalar features have no len()
+                width = 1
+            self.shape = (len(sequence), width)
+
+        self.indexer = indexer(self)
+
+    def _get_store_element(self, index):
+        """
+        Returns the features stored at index index. Compared to __getitem__, it does not take the
+        index-th element of the sequence but the index-th element of the store.
+
+        Parameters
+        ----------
+        index : (int, list, slice)
+            index argument of the feature
+
+        Returns
+        -------
+        array_like
+            features corresponding to the index index in self.store
+        """
+        if isinstance(index, list):
+            return [self.store[i] for i in index]
+        else:
+            return self.store[index]
+
+    def __len__(self):
+        return len(self.sequence)
+
+    @property
+    def iloc(self):
+        return self.indexer
+
+
+class FeaturesStore(Store):
+    """
+    Base class to store features and a sequence of apparitions.
+    Mainly useful when features are repeated frequently over the sequence.
+    An example would be to store the features of customers (supposing that the same customer comes
+    several times over the whole sequence) and to save which customer is concerned by each choice.
+
+    Attributes
+    ----------
+    store : dict
+        Dictionary storing features that can be called from indexes: {index: features}
+    shape : tuple
+        shape of the features store: (sequence_length, features_number)
+    sequence : array_like
+        List of elements of indexes representing the sequence of apparitions of the features
+    name: string, optional
+        name of the features store -- not used at the moment
+    dtype: type
+        type of the features
+    """
+
+    @classmethod
+    def from_dict(cls, values_dict, sequence):
+        """
+        Instantiates the FeaturesStore from a dictionary of values
+
+        Parameters
+        ----------
+        values_dict : dict
+            dictionary of values to store, {index: value}
+        sequence : array_like
+            sequence of apparitions of the features
+
+        Returns
+        -------
+        FeaturesStore
+            created from the values in the dictionary
+        """
+        # Check uniform shape of values
+        return cls(
+            indexes=list(values_dict.keys()), values=list(values_dict.values()), sequence=sequence
+        )
+
+    @classmethod
+    def from_list(cls, values_list, sequence):
+        """
+        Instantiates the FeaturesStore from a list of values.
+        Creates indexes for each value.
+
+        Parameters
+        ----------
+        values_list : list
+            List of values to store
+        sequence : array_like
+            sequence of apparitions of the features
+
+        Returns
+        -------
+        FeaturesStore
+        """
+        # Check uniform shape of list
+        # Useful ? To rethink...
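+        # Illustrative usage (made-up values):
+        #   FeaturesStore.from_list([[1.0, 2.0], [3.0, 4.0]], sequence=[0, 1, 1, 0])
+        #   stores two feature vectors under indexes 0 and 1 and a sequence of length 4
+        #   referencing them.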
+        return cls(indexes=list(range(len(values_list))), values=values_list, sequence=sequence)
+
+    def __getitem__(self, sequence_index):
+        """
+        Subsets self with sequence_index
+
+        Parameters
+        ----------
+        sequence_index : (int, list, slice)
+            index position of the sequence
+
+        Returns
+        -------
+        FeaturesStore
+            FeaturesStore restricted to the sequence_index-th positions of the sequence
+        """
+        if isinstance(sequence_index, int):
+            sequence_index = [sequence_index]
+        new_sequence = self.sequence[sequence_index]
+        store = {}
+        for k, v in self.store.items():
+            if k in new_sequence:
+                store[k] = v
+            else:
+                print(f"Key {k} of store with value {v} not in sequence anymore")
+
+        return FeaturesStore.from_dict(store, new_sequence)
+
+    def astype(self, dtype):
+        """
+        Changes the dtype of the features. The type of the features should implement the astype method.
+        Typically, should work like np.ndarrays.
+
+        Parameters
+        ----------
+        dtype : str or type
+            type to set the features as
+        """
+        for k, v in self.store.items():
+            self.store[k] = v.astype(dtype)
+
+
+class OneHotStore(Store):
+    """
+    Specific Store for one-hot features storage. Inherits from Store.
+    For example, can be used to store a one-hot representation of the days of the week.
+
+    Has the same attributes as FeaturesStore, only differs with some one-hot optimized methods.
+    """
+
+    def __init__(
+        self,
+        indexes=None,
+        values=None,
+        sequence=None,
+        name=None,
+        dtype=np.float32,
+    ):
+        """
+        Builds the OneHot features store
+
+        Parameters
+        ----------
+        indexes : array_like or None
+            list of indexes of features to store. If None is given, indexes are created from the apparition order of values
+        values : array_like or None
+            list of values of features to store that must be one-hot. If None given, they are created from the order of apparition in sequence
+        sequence : array_like
+            sequence of apparitions of the features
+        name: string, optional
+            name of the features store -- not used at the moment
+        """
+        self.name = name
+        self.sequence = np.array(sequence)
+
+        if values is None:
+            # Create indexes and values from the order of apparition in the sequence
+            indexes = np.unique(sequence)
+            values = np.arange(len(indexes))
+        self.store = {k: v for (k, v) in zip(indexes, values)}
+        self.shape = (len(sequence), np.max(values) + 1)
+
+        self.dtype = dtype
+        self.indexer = OneHotStoreIndexer(self)
+
+    @classmethod
+    def from_sequence(cls, sequence):
+        """Creates a OneHotStore from a sequence of apparitions.
+        One-hot vectors are created from the order of apparition in the sequence: the feature vectors
+        created have a length equal to the number of different values in the sequence and the 1 is
+        positioned in order of first apparition in the sequence.
+
+        Parameters
+        ----------
+        sequence : array-like
+            Sequence of apparitions of values, or indexes. Will be used to index self.store
+
+        Returns
+        -------
+        OneHotStore
+            Created from the sequence.
+        """
+        all_indexes = np.unique(sequence)
+        values = np.arange(len(all_indexes))
+        return cls(indexes=all_indexes, values=values, sequence=sequence)
+
+    def __getitem__(self, sequence_index):
+        """Subsets the OneHotStore with sequence_index.
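+        For illustration (made-up values), for a store built with OneHotStore.from_sequence([3, 0, 3]),
+        store[[0, 2]] returns a new OneHotStore whose sequence is [3, 3] and whose store only keeps
+        the entry for value 3; the actual one-hot vectors are obtained through .iloc.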
+
+        Parameters
+        ----------
+        sequence_index : (int, list, slice)
+            position(s) of the sequence to keep
+
+        Returns
+        -------
+        OneHotStore
+            OneHotStore restricted to the sequence_index-th positions of the sequence
+        """
+        if isinstance(sequence_index, int):
+            sequence_index = [sequence_index]
+        new_sequence = self.sequence[sequence_index]
+        store = {}
+        for k, v in self.store.items():
+            if k in new_sequence:
+                store[k] = v
+            else:
+                print(f"Key {k} of store with value {v} not in sequence anymore")
+
+        return OneHotStore(
+            indexes=list(store.keys()), values=list(store.values()), sequence=new_sequence
+        )
+
+    def astype(self, dtype):
+        """Changes the (mainly int or float) dtype of the returned one-hot feature vectors.
+
+        Parameters
+        ----------
+        dtype : type
+            Type to set the features as
+        """
+        self.dtype = dtype
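
Usage sketch (illustrative, not part of the patch): a minimal end-to-end example of how the pieces
above fit together, assuming the modules are importable under the choice_modeling.data package used
by the imports; all array values are made up.

    import numpy as np
    from choice_modeling.data.choice_dataset_v3 import ChoiceDataset

    items_features = np.array([[1.0], [2.0], [3.0]])        # 3 items, 1 feature each
    sessions_features = np.array([[0.5], [0.7]])            # 2 sessions, 1 feature each
    sessions_items_features = np.array(
        [[[10.0], [12.0], [9.0]], [[11.0], [12.5], [8.5]]]  # e.g. prices, shape (2, 3, 1)
    )
    availabilities = np.ones((2, 3))                         # every item available in both sessions
    choices = [[0, 2], [1]]                                  # ragged: two choices, then one

    dataset = ChoiceDataset(
        items_features=items_features,
        sessions_features=sessions_features,
        sessions_items_features=sessions_items_features,
        sessions_items_availabilities=availabilities,
        choices=choices,
    )
    for items_f, sessions_f, sessions_items_f, avail, choice in dataset.batch(batch_size=2):
        ...  # each batch holds the features attached to the batched choices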