From 88ada3b4af13e89f66ad088d44129f5cdda02a7d Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Wed, 20 Dec 2023 16:48:20 +0100 Subject: [PATCH 1/8] ADD: data files from repo --- lib/data/choice_dataset_v3.py | 854 ++++++++++++++++++++++++++++++++++ lib/data/indexer.py | 258 ++++++++++ lib/data/store.py | 268 +++++++++++ 3 files changed, 1380 insertions(+) create mode 100644 lib/data/choice_dataset_v3.py create mode 100644 lib/data/indexer.py create mode 100644 lib/data/store.py diff --git a/lib/data/choice_dataset_v3.py b/lib/data/choice_dataset_v3.py new file mode 100644 index 00000000..ea12ab5e --- /dev/null +++ b/lib/data/choice_dataset_v3.py @@ -0,0 +1,854 @@ +"""Main classes to handle assortment data""" + +import numpy as np +import pandas as pd +from choice_modeling.data.indexer import ChoiceDatasetIndexer +from choice_modeling.data.store import Store + + +class ChoiceDataset(object): + """ + Version of ChoiceDataset where the choices are given as a ragged list of choices for each session. + It is particularly useful if several (a lot) of choices happen during the same session. + For example if we have the same customer buying several items during the same session, all its choices + can be regrouped under the same session_features. Limits data duplication in such cases. + + The class has same methods/arguments as ChoiceDatset with a slight difference with self.choices being + a ragged list. The returned features in self.__getitem__ are the same as ChoiceDataset. + When calling __getitem__(index) we map index to a session index and a choice index within the session. + """ + + def __init__( + self, + items_features=None, + sessions_features=None, + sessions_items_features=None, + items_features_names=None, + sessions_features_names=None, + sessions_items_features_names=None, + sessions_items_availabilities=None, + choices=None, # Should not have None as default value ? + batch_size=16, + shuffle=False, + ): + """ + Builds the ChoiceDataset + + Parameters + ---------- + items_features : tuple of (array_like, ) + matrix of shape (num_items, num_items_features) containing the features of the items e.g. item color + sessions_features : tuple of (array_like, ) + matrix of shape (num_sessions, num_sess_features) containing the features of the sessions e.g. day of week + sessions_items_features : tuple of (array_like, ) + matrix of shape (num_sessions, num_items, num_ses_items_features) containing the item features varying over sessions, e.g. prices + sessions_items_availabilities : array_like + binary matrix of shape (num_sessions, num_items) containing the availabitilies of products (1. if present 0. otherwise) over sessions + choices: list of list + for each sessions we have a list of related choices. Main list has same legnth as session_features and sessions_items_features. 
+ batch_size: int, optional + size of the batches to return in __iter__ method + suffle: bool, optional + whether to shuffle the dataset or not + + """ + + # --------- [ Handling features type given as tuples or not ] --------- # + # If items_features is not given as tuple, transform it internally as a tuple + # A bit longer because can be None and need to also handle names + if not isinstance(items_features, tuple) and items_features is not None: + items_features = (items_features,) + items_features_names = (items_features_names,) + self._return_items_features_tuple = False + # items_features is already a tuple, names are given, checking consistency + elif items_features is not None and items_features_names is not None: + assert len(items_features) == len(items_features_names) or items_features_names is None + self._return_items_features_tuple = True + # In this case names are missing, still transform it as a tuple + elif items_features is not None: + self._return_items_features_tuple = True + items_features_names = (None,) * len(items_features) + + # If sessions_features is not given as tuple, transform it internally as a tuple + # A bit longer because can be None and need to also handle names + if not isinstance(sessions_features, tuple) and sessions_features is not None: + sessions_features = (sessions_features,) + sessions_features_names = (sessions_features_names,) + self._return_sessions_features_tuple = False + # sessions_features is already a tuple, names are given, checking consistency + elif sessions_features is not None and sessions_features_names is not None: + assert ( + len(sessions_features) == len(sessions_features_names) + or sessions_features_names is None + ) + self._return_sessions_features_tuple = True + # In this case names are missing, still transform it as a tuple + elif sessions_features is not None: + self._return_sessions_features_tuple = True + sessions_features_names = (None,) * len(sessions_features) + + # If sessions_items_features is not given as tuple, transform it internally as a tuple + # A bit longer because can be None and need to also handle names + if not isinstance(sessions_items_features, tuple) and sessions_items_features is not None: + sessions_items_features = (sessions_items_features,) + sessions_items_features_names = (sessions_items_features_names,) + self._return_sessions_items_features_tuple = False + # sessions_items_features is already a tuple, names are given, checking consistency + elif sessions_items_features is not None and sessions_items_features_names is not None: + assert ( + len(sessions_items_features) == len(sessions_items_features_names) + or sessions_items_features_names is None + ) + self._return_sessions_items_features_tuple = True + # In this case names are missing, still transform it as a tuple + elif sessions_items_features is not None: + self._return_sessions_items_features_tuple = True + sessions_items_features_names = (None,) * len(sessions_items_features) + + # --------- [Normalizing features types (DataFrame, List, etc...) -> np.ndarray] --------- # + # + # Part of this code is for handling features given as pandas.DataFrame + # Basically it transforms them to be internally stocked as np.ndarray and keep columns names as features names + + # Handling items_features + for i, feature in enumerate(items_features): + if isinstance(feature, pd.DataFrame): + # Ordering items by id ? 
+ if "item_id" in feature.columns: + feature = feature.set_index("item_id") + items_features = ( + items_features[:i] + + (feature.loc[np.sort(feature.index)].values,) + + items_features[i + 1 :] + ) + items_features_names = ( + items_features_names[:i] + + (feature.columns.tolist(),) + + items_features_names[i + 1 :] + ) + elif isinstance(feature, list): + items_features = items_features[:i] + (np.array(feature),) + items_features[i + 1 :] + + # Handling sessions_features + for i, feature in enumerate(sessions_features): + if isinstance(feature, pd.DataFrame): + # Ordering sessions by id ? + if "session_id" in feature.columns: + feature = feature.set_index("session_id") + sessions_features = ( + sessions_features[:i] + + (feature.loc[np.sort(feature.index)].values,) + + sessions_features[i + 1 :] + ) + sessions_features_names = ( + sessions_features_names[:i] + + (feature.columns.tolist(),) + + sessions_features_names[i + 1 :] + ) + elif isinstance(feature, list): + sessions_features = ( + sessions_features[:i] + (np.array(feature),) + sessions_features[i + 1 :] + ) + + # Handling sessions_items_features + for i, feature in enumerate(sessions_items_features): + if isinstance(feature, pd.DataFrame): + # Ordering sessions and items by id ? + if "session_id" not in feature.columns: + feature["session_id"] = feature.index + items_index = np.sort(feature.item_id.unique()) + sessions_index = np.sort(feature.session_id.unique()) + names = [f for f in feature.columns if f != "session_id" and f != "item_id"] + + ( + feature, + sessions_items_availabilities, + ) = self._sessions_items_features_df_to_np( + feature, items_index, sessions_index, feature.columns.tolist() + ) + + sessions_items_features = ( + sessions_items_features[:i] + feature + sessions_items_features[i + 1 :] + ) + + sessions_items_features_names = ( + sessions_items_features_names[:i] + + (names,) + + sessions_items_features_names[i + 1 :] + ) + elif isinstance(feature, list): + sessions_items_features = ( + sessions_items_features[:i] + + (np.array(feature),) + + sessions_items_features[i + 1 :] + ) + + if isinstance(sessions_items_availabilities, list): + sessions_items_availabilities = np.array(sessions_items_availabilities) + + # Handling choices + # Choices must then be given as the name of the chosen item + # Items are sorted by name and attributed an index + # Cannot be a list of choices yet + if isinstance(choices, pd.DataFrame): + # Ordering sessions by id + if "session_id" in choices.columns: + choices = choices.set_index("session_id") + choices = choices.loc[np.sort(choices.index)] + items = np.sort(np.unique(choices.choice)) + # items is the value (str) of the item + choices = [np.where(items == c)[0] for c in choices.choice] + + # Setting attributes of ChoiceDataset + self.items_features = items_features + self.sessions_features = sessions_features + self.sessions_items_features = sessions_items_features + self.sessions_items_availabilities = sessions_items_availabilities + + self.items_features_names = items_features_names + self.sessions_features_names = sessions_features_names + self.sessions_items_features_names = sessions_items_features_names + + self.batch_size = batch_size + self.shuffle = shuffle + + if choices is None: + # Done to keep a logical order of arguments, and has logic: choices have to be specified + raise ValueError("Choices must be specified, got None") + self.ragged_choices = choices + self.indexes, self.choices = self._build_indexes(choices) + self.n_choices = len(self.choices) + + # Different 
consitency checks to ensure everythin is coherent + self._check_dataset() # Should handle alone if np.arrays are squeezed + self._return_types = self._check_types() + self._check_names() + + # Build .iloc method + self.indexer = ChoiceDatasetIndexer(self) + + def _build_indexes(self, choices): + """ + Builds the indexes dictionnary from the choices. + Particularly creates a flatten version of the choices and associates an index so that we can + retrieve from this index the session and the corresponding choice. + + Parameters: + ----------- + choices: list of list + raffed version of the choices + + Returns: + -------- + indexes: dict + dictionnary of indexes: {index: corresponding_session_index} + choices: np.ndarray + flattened (1D) version of the choices + """ + try: # 1 choice by session + if len(np.squeeze(choices).shape) == 1: + indexes = {i: i for i in range(len(choices))} + flat_choices = np.squeeze(self.ragged_choices) + elif len(np.squeeze(choices).shape) == 0: + indexes = {i: i for i in range(len(choices))} + flat_choices = np.array([np.squeeze(self.ragged_choices)]) + except: # Ragged sequence of choices + indexes = {} + flat_choices = [] + total_count = 0 + for sess_nb, sess in enumerate(choices): + for choice in sess: + indexes[total_count] = sess_nb + flat_choices.append(choice) + total_count += 1 + return indexes, np.array(flat_choices) + + def _check_dataset(self): + """ + Verifies that the shapes of the different features are consistent + - Over number of items + - Over number of sessions + Verifies that the choices have coherent values + """ + self._check_num_items_shapes() + self._check_num_sessions_shapes() + self._check_choices_coherence() + + def _check_num_items_shapes(self): + """ + Verifies that the shapes of the different features are consistent over number of items + - items_features + - sessions_items_features + - sessions_items_availabilities + Sets the argument base_num_items + """ + if self.items_features is not None: + base_num_items = self.items_features[0].shape[0] + elif self.sessions_items_features is not None: + base_num_items = self.sessions_items_features[0].shape[1] + elif self.sessions_items_availabilities is not None: + base_num_items = self.sessions_items_availabilities.shape[1] + else: + raise ValueError( + "No items features, sessions items features or items availabilities are defined" + ) + self.base_num_items = base_num_items + + if self.items_features is not None: + for items_feature in self.items_features: + assert ( + items_feature.shape[0] == base_num_items + ), f"shapes are (f{items_feature.shape[0]}, {base_num_items})" + + if self.sessions_items_features is not None: + for sessions_items_feature in self.sessions_items_features: + assert ( + sessions_items_feature.shape[1] == base_num_items + ), f"shapes are (f{sessions_items_feature.shape[1]}, {base_num_items})" + if self.sessions_items_availabilities is not None: + assert ( + self.sessions_items_availabilities.shape[1] == base_num_items + ), f"shapes are (f{self.sessions_items_availabilities.shape[1]}, {base_num_items})" + + def _check_num_sessions_shapes(self): + """ + Verifies that the shapes of the different features are consistent over number of sessions + - sessions_features + - sessions_items_features + - sessions_items_availabilities + Sets self.base_num_sessions argument. 
+ """ + base_num_sessions = len(self.ragged_choices) + self.base_num_sessions = base_num_sessions + + if self.sessions_features is not None: + for sessions_feature in self.sessions_features: + assert ( + sessions_feature.shape[0] == base_num_sessions + ), f"shapes are: ({sessions_feature.shape[0]}, {base_num_sessions})" + + if self.sessions_items_features is not None: + for sessions_items_feature in self.sessions_items_features: + assert ( + sessions_items_feature.shape[0] == base_num_sessions + ), f"shapes are: ({sessions_items_feature.shape[0]}, {base_num_sessions})" + if self.sessions_items_availabilities is not None: + assert ( + self.sessions_items_availabilities.shape[0] == base_num_sessions + ), f"shapes are: ({self.sessions_items_availabilities.shape[0]}, {base_num_sessions})" + + def _check_choices_coherence(self): + """ + Verifies that the choices are coherent with the number of items present in other features. + Particularly: + - There is no choice index higher than detected number of items + - All items are present at least once in the choices + """ + msg = f"Choices values not coherent with number of items given in features. In particular, \ + max value of choices is {np.max(self.choices)} while number of items is {self.base_num_items}" + assert np.max(self.choices) < self.base_num_items, msg + + unique_choices = set(np.unique(self.choices).flatten()) + missing_choices = set(np.arange(start=0, stop=self.base_num_items, step=1)) - unique_choices + if len(missing_choices) > 0: + print(f"Some choices never happen in the dataset: {missing_choices}") + + def _check_types(self): + """ + Checks types of elements and store it in order to return right types. + - Either int32 or float32 consistently for features. + float32 is to be preferred unless One-Hot encoding is used. 
+ - float32 for sessions_items_availabilities + - int32 for choices + """ + return_types = [] + + item_types = [] + if self.items_features is not None: + for item_feat in self.items_features: + if np.issubdtype(item_feat[0].dtype, np.integer): + item_types.append(np.int32) + else: + item_types.append(np.float32) + return_types.append(tuple(item_types)) + + session_types = [] + if self.sessions_features is not None: + for sessions_feat in self.sessions_features: + if np.issubdtype(sessions_feat[0].dtype, np.integer): + session_types.append(np.int32) + else: + session_types.append(np.float32) + return_types.append(tuple(session_types)) + + session_item_types = [] + if self.sessions_items_features is not None: + for session_item_feat in self.sessions_items_features: + if np.issubdtype(session_item_feat[0].dtype, np.integer): + session_item_types.append(np.int32) + else: + session_item_types.append(np.float32) + return_types.append(tuple(session_item_types)) + return_types.append(np.float32) + return_types.append(np.int32) + + return return_types + + def _check_names(self): + if self.items_features_names is not None: + for name, features in zip(self.items_features_names, self.items_features): + if name is not None: + assert ( + len(name) == features.shape[1] + ), f"Specififed items_features_names has length {len(name)} while items_features has {features.shape[1]} elements" + + if self.sessions_features_names is not None: + for name, features in zip(self.sessions_features_names, self.sessions_features): + if name is not None: + assert ( + len(name) == features.shape[1] + ), f"Specififed sessions_features_names has length {len(name)} while sessions_features has {features.shape[1]} elements" + + if self.sessions_items_features_names is not None: + for ( + name, + features, + ) in zip(self.sessions_items_features_names, self.sessions_items_features): + if name is not None: + assert ( + len(name) == features.shape[1] + ), f"Specififed sessions_items_features_names has length {len(name)} while sessions_items_features has {features.shape[1]} elements" + + def __len__(self): + """Returns length of the dataset e.g. total number of sessions. + + Returns: + -------- + int + total number of sessions + """ + return self.base_num_sessions + + def get_num_items(self): + """ + Method to access the total number of different items + + Returns + ------- + int + total number of different items + """ + return self.base_num_items + + def get_num_sessions(self): + """ + Method to access the total number of different sessions. + Redundant with __len__ method. + + Returns + ------- + int + total number of different sessions + """ + return len(self) + + def get_num_choices(self): + """ + Method to access the total number of different sessions + + Returns + ------- + int + total number of different sessions + """ + return self.n_choices + + @classmethod + def _sessions_items_features_df_to_np( + cls, + df, + items_index, + sessions_index, + features, + items_id_column="item_id", + sessions_id_column="session_id", + ): + """Builds sessions_items_features and sessions_items_availabilities from dataframe. 
+ + Parameters + ---------- + df : pandas.DataFrame + Dataframe containing all the features for each item and sessions + items_index : list + List of items + sessions_index : list + List of sessions + features : list + List of columns of df that represents the items_features (for sessions_items_features) + + Returns + ------- + np.ndarray of shape (n_sessions, n_items, n_features) + Corresponding sessions_items_features + np.ndarray of shape (n_sessions, n_items) + Corresponding availabilities + """ + try: + features.remove("session_id") + except ValueError: + pass + try: + features.remove("item_id") + except ValueError: + pass + + sessions_items_features = [] + sessions_items_availabilities = [] + for sess in sessions_index: + sess_df = df.loc[df[sessions_id_column] == sess] + + if len(sess_df) == len(items_index): + sess_df = sess_df.T + sess_df.columns = sess_df.loc[items_id_column] + if features is not None: + sessions_items_features.append(sess_df[items_index].loc[features].T.values) + sessions_items_availabilities.append(np.ones(len(items_index))) + else: + sess_feats = [] + sess_av = [] + for item in items_index: + item_df = sess_df.loc[sess_df[items_id_column] == item] + if len(item_df) > 0: + if features is not None: + sess_feats.append(item_df[features].values[0]) + sess_av.append(1) + else: + if features is not None: + sess_feats.append(np.zeros(len(features))) + sess_av.append(0) + sessions_items_features.append(sess_feats) + sessions_items_availabilities.append(sess_av) + + if features is not None: + sessions_items_features = (np.array(sessions_items_features),) + else: + sessions_items_features = None + return sessions_items_features, np.array(sessions_items_availabilities) + + @classmethod + def from_single_df( + cls, + df, + items_features_columns, + sessions_features_columns, + sessions_items_features_columns, + items_id_column="item_id", + sessions_id_column="session_id", + choices_column="choice", + choice_mode="items_name", + ): + """Builds numpy arrays for ChoiceDataset from a single dataframe. 
+ + Parameters + ---------- + df : pandas.DataFrame + dataframe in Long format + items_features_columns : list + Columns of the dataframe that are item features + sessions_features_columns : list + Columns of the dataframe that are session features + sessions_items_features_columns : list + Columns of the dataframe that are session-item features + items_id_column: str, optional + Name of the column containing the item ids, default is "items_id" + sessions_id_column: str, optional + Name of the column containing the sessions ids, default is "sessions_id" + choices_column: str, optional + Name of the column containing the choices, default is "choice" + + Returns + ------- + ChoiceDataset + corresponding ChoiceDataset + """ + + # Ordering items and sessions by id + items = np.sort(df[items_id_column].unique()) + sessions = np.sort(df[sessions_id_column].unique()) + + if items_features_columns is not None: + items_features = df[items_features_columns + [items_id_column]].drop_duplicates() + items_features = items_features.set_index(items_id_column) + items_features = (items_features.loc[items].values,) + + items_features_columns = (items_features_columns,) + else: + items_features = None + + if sessions_features_columns is not None: + sessions_features = df[ + sessions_features_columns + [sessions_id_column] + ].drop_duplicates() + sessions_features = sessions_features.set_index(sessions_id_column) + sessions_features = (sessions_features.loc[sessions].values,) + + sessions_features_columns = (sessions_features_columns,) + else: + sessions_features = None + + ( + sessions_items_features, + sessions_items_availabilities, + ) = cls._sessions_items_features_df_to_np( + df, + items_index=items, + sessions_index=sessions, + features=sessions_items_features_columns, + items_id_column=items_id_column, + sessions_id_column=sessions_id_column, + ) + sessions_items_features_columns = ( + (sessions_items_features_columns,) + if sessions_items_features_columns is not None + else None + ) + + if choice_mode == "item_id": + choices = df[[choices_column, sessions_id_column]].drop_duplicates(sessions_id_column) + choices = choices.set_index(sessions_id_column) + choices = choices.loc[sessions].values + # items is the value (str) of the item + choices = [np.where(items == c)[0] for c in choices] + elif choice_mode == "one_zero": + choices = df[[items_id_column, choices_column, sessions_id_column]] + choices = choices.loc[choices[choices_column] == 1] + choices = choices = choices.set_index(sessions_id_column) + choices = ( + choices.loc[sessions][items_id_column] + .map({k: v for v, k in enumerate(items)}) + .values + ) + else: + raise ValueError( + f"choice_mode {choice_mode} not recognized. Must be in ['item_id', 'one_zero']" + ) + + return ChoiceDataset( + items_features=items_features, + sessions_features=sessions_features, + sessions_items_features=sessions_items_features, + sessions_items_availabilities=sessions_items_availabilities, + choices=choices, + items_features_names=items_features_columns, + sessions_features_names=sessions_features_columns, + sessions_items_features_names=sessions_items_features_columns, + ) + + def save(self): + raise NotImplementedError + + def summary(self): + raise NotImplementedError + + def get_choice_batch(self, choice_index): + """ + Method to access data within the ListChoiceDataset from its index. + One index corresponds to a choice within a session. 
+ + Return order: + - Fixed item features + - Session features + - Session item features + - Items availabilities + - Choice + + Parameters + ---------- + index : int or list of int or slice + indexes of the choices (that will be mapped to choice & session indexes) to return + + """ + if isinstance(choice_index, list): + if self.items_features is None: + items_features = None + else: + items_features = tuple( + items_feature.astype(self._return_types[0][i]) + for i, items_feature in enumerate(self.items_features) + ) + # items_features were not given as a tuple, so we return do not return it as a tuple + if not self._return_items_features_tuple: + items_features = items_features[0] + + # Get the session indexes + sessions_indexes = [self.indexes[i] for i in choice_index] + + if self.sessions_features is None: + sessions_features = None + else: + sessions_features = tuple( + np.stack(sessions_feature[sessions_indexes], axis=0).astype( + self._return_types[1][i] + ) + if not isinstance(sessions_feature, Store) + else sessions_feature.iloc[sessions_indexes] + for i, sessions_feature in enumerate(self.sessions_features) + ) + # sessions_features were not given as a tuple, so we return do not return it as a tuple + if not self._return_sessions_features_tuple: + sessions_features = sessions_features[0] + + if self.sessions_items_features is None: + sessions_items_features = None + else: + sessions_items_features = tuple( + np.stack(sessions_items_feature[sessions_indexes], axis=0).astype( + self._return_types[2][i] + ) + if not isinstance(sessions_items_feature, Store) + else sessions_items_feature.iloc[sessions_indexes] + for i, sessions_items_feature in enumerate(self.sessions_items_features) + ) + # sessions_items_features were not given as a tuple, so we return do not return it as a tuple + if not self._return_sessions_items_features_tuple: + sessions_items_features = sessions_items_features[0] + + if self.sessions_items_availabilities is None: + sessions_items_availabilities = None + else: + sessions_items_availabilities = self.sessions_items_availabilities[ + sessions_indexes + ].astype(self._return_types[3]) + + choice = self.choices[choice_index].astype(self._return_types[4]) + + return ( + items_features, + sessions_features, + sessions_items_features, + sessions_items_availabilities, + choice, + ) + + elif isinstance(choice_index, slice): + return self.get_choice_batch(list(range(*choice_index.indices(self.choices.shape[0])))) + + session_index = self.indexes[choice_index] + choice = self.choices[choice_index] + + if self.items_features is None: + items_features = None + else: + items_features = tuple(items_feature for items_feature in self.items_features) + + if self.sessions_features is None: + sessions_features = None + else: + sessions_features = tuple( + sessions_feature[session_index] for sessions_feature in self.sessions_features + ) + + if self.sessions_items_features is None: + sessions_items_features = None + else: + sessions_items_features = tuple( + sessions_items_feature[session_index] + for sessions_items_feature in self.sessions_items_features + ) + + if self.sessions_items_availabilities is None: + sessions_items_availabilities = None + else: + sessions_items_availabilities = self.sessions_items_availabilities[session_index] + + return ( + items_features, + sessions_features, + sessions_items_features, + sessions_items_availabilities, + choice, + ) + + def __getitem__(self, session_indexes): + """Method to create a sub-ChoiceDataset with only a subset of sessions, from 
their indexes. + + Parameters + ---------- + indexes : np.ndarray + indexes of the sessions to keep, shape should be (num_sessions,) + + Returns + ------- + ChoiceDataset + ChoiceDataset with only the sessions indexed by indexes + """ + if isinstance(session_indexes, int): + session_indexes = [session_indexes] + elif isinstance(session_indexes, slice): + return self.__getitem__(list(range(*session_indexes.indices(len(self.ragged_choices))))) + + return ChoiceDataset( + items_features=self.items_features, + sessions_features=tuple( + self.sessions_features[i][session_indexes] + for i in range(len(self.sessions_features)) + ), + sessions_items_features=tuple( + self.sessions_items_features[i][session_indexes] + for i in range(len(self.sessions_items_features)) + ), + sessions_items_availabilities=self.sessions_items_availabilities[session_indexes], + choices=[self.ragged_choices[i] for i in session_indexes], + batch_size=self.batch_size, + items_features_names=self.items_features_names, + sessions_features_names=self.sessions_features_names, + sessions_items_features_names=self.sessions_items_features_names, + ) + + def batch(self, batch_size=None, shuffle=None, sample_weight=None): + """ + Iterates over dataset return batches of length self.batch_size + + Arguments + --------- + batch_size : int + batch size to set + shuffle: bool + Whether or not to shuffle the dataset + sample_weight : Iterable + list of weights to be returned with the right indexing during the shuffling + """ + if batch_size is None: + batch_size = self.batch_size + if shuffle is None: + shuffle = self.shuffle + if batch_size == -1: + batch_size = self.get_num_choices() + + # Get indexes for each choice + num_choices = self.get_num_choices() + indexes = np.arange(num_choices) + # Shuffle indexes + if shuffle and not batch_size == -1: + indexes = np.random.permutation(indexes) + + yielded_size = 0 + while yielded_size < num_choices: + # Return sample_weight if not None, for index matching + if sample_weight is not None: + yield self.get_choice_batch( + indexes[yielded_size : yielded_size + batch_size].tolist() + ), sample_weight[indexes[yielded_size : yielded_size + batch_size].tolist()] + else: + yield self.get_choice_batch( + indexes[yielded_size : yielded_size + batch_size].tolist() + ) + yielded_size += batch_size + + # Special exit strategy for batch_size = -1 + if batch_size == -1: + yielded_size += 2 * num_choices + + @property + def iloc(self): + return self.indexer diff --git a/lib/data/indexer.py b/lib/data/indexer.py new file mode 100644 index 00000000..87d1e24c --- /dev/null +++ b/lib/data/indexer.py @@ -0,0 +1,258 @@ +from abc import abstractmethod + +import numpy as np +import pandas as pd + + +class Indexer(object): + def __init__(self, indexed_object): + self.indexed_object = indexed_object + + @abstractmethod + def __getitem__(self, index): + pass + + +class StoreIndexer(Indexer): + """Class for Ilocing FeaturesStore + + Parameters + ---------- + TBD + """ + + def __init__(self, store): + self.store = store + + def __getitem__(self, sequence_index): + """ + Returns the features corresponding appearing at the sequence_index-th position of sequence + + Parameters + ---------- + sequence_index : (int, list, slice) + index position of the sequence + + Returns + ------- + array_like + features corresponding to the sequence_index-th position of sequence + """ + if isinstance(sequence_index, list): + return [self.store.store[self.store.sequence[i]] for i in sequence_index] + elif isinstance(sequence_index, 
slice):
+            return [
+                self.store.store[self.store.sequence[i]]
+                # The sequence lives on the wrapped Store, not on the indexer itself
+                for i in range(*sequence_index.indices(len(self.store.sequence)))
+            ]
+        return self.store.store[self.store.sequence[sequence_index]]
+
+
+class OneHotStoreIndexer(Indexer):
+    """Class for indexing (.iloc) a OneHotStore
+
+    Parameters
+    ----------
+    TBD
+    """
+
+    def __init__(self, store):
+        self.store = store
+
+        self.shape = (len(self.store.sequence), np.max(list(self.store.store.values())) + 1)
+
+    def __getitem__(self, sequence_index):
+        """Main method to get an element at sequence_index-th position of self.sequence.
+
+        Parameters
+        ----------
+        sequence_index : (int, list, slice)
+            index from sequence of element to get
+
+        Returns
+        -------
+        np.ndarray
+            OneHot features corresponding to the sequence_index-th position of sequence
+        """
+        if isinstance(sequence_index, list):
+            # Construction of the OneHot vectors from the index of the 1 value
+            one_hot = np.zeros((len(sequence_index), self.shape[1]))
+            for i, j in enumerate(sequence_index):
+                one_hot[i, self.store.store[self.store.sequence[j]]] = 1
+            return one_hot.astype(self.store.dtype)
+        else:
+            one_hot = np.zeros(self.shape[1])
+            one_hot[self.store.store[self.store.sequence[sequence_index]]] = 1
+            return one_hot.astype(self.store.dtype)
+
+
+class ChoiceDatasetIndexer(Indexer):
+    """Indexing class for ChoiceDataset
+
+    Parameters
+    ----------
+    choice_dataset : ChoiceDataset
+        dataset to be indexed
+    """
+
+    def __init__(self, choice_dataset):
+        self.choice_dataset = choice_dataset
+
+    def _get_items_features(self):
+        if self.choice_dataset.items_features is None:
+            items_features = None
+        else:
+            items_features = tuple(
+                items_feature.astype(self.choice_dataset._return_types[0][i])
+                for i, items_feature in enumerate(self.choice_dataset.items_features)
+            )
+            # items_features were not given as a tuple, so we do not return them as a tuple
+            if not self.choice_dataset._return_items_features_tuple:
+                items_features = items_features[0]
+
+        return items_features
+
+    def _get_sessions_features(self, sessions_indexes):
+        if self.choice_dataset.sessions_features is None:
+            sessions_features = None
+        else:
+            sessions_features = []
+            for i, sessions_feature in enumerate(self.choice_dataset.sessions_features):
+                if hasattr(sessions_feature, "iloc"):
+                    sessions_features.append(
+                        sessions_feature.iloc[sessions_indexes].astype(
+                            self.choice_dataset._return_types[1][i]
+                        )
+                    )
+                else:
+                    sessions_features.append(
+                        np.stack(sessions_feature[sessions_indexes], axis=0).astype(
+                            self.choice_dataset._return_types[1][i]
+                        )
+                    )
+            # sessions_features were not given as a tuple, so we do not return them as a tuple
+            if not self.choice_dataset._return_sessions_features_tuple:
+                sessions_features = sessions_features[0]
+            else:
+                sessions_features = tuple(sessions_features)
+        return sessions_features
+
+    def _get_sessions_items_features(self, sessions_indexes):
+        if self.choice_dataset.sessions_items_features is None:
+            sessions_items_features = None
+        else:
+            sessions_items_features = []
+            for i, sessions_items_feature in enumerate(self.choice_dataset.sessions_items_features):
+                if hasattr(sessions_items_feature, "iloc"):
+                    sessions_items_features.append(
+                        sessions_items_feature.iloc[sessions_indexes].astype(
+                            self.choice_dataset._return_types[2][i]
+                        )
+                    )
+                else:
+                    sessions_items_features.append(
+                        np.stack(sessions_items_feature[sessions_indexes], axis=0).astype(
+                            self.choice_dataset._return_types[2][i]
+                        )
+                    )
+            # sessions_items_features were not given as a tuple, so we do not return them as a tuple
+            if self.choice_dataset._return_sessions_items_features_tuple:
+                sessions_items_features = tuple(sessions_items_features)
+            else:
+                sessions_items_features = sessions_items_features[0]
+        return sessions_items_features
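+
+    # A small illustration of the intended flow (hypothetical values): if `dataset`
+    # is a built ChoiceDataset, `dataset.iloc[[0, 1]]` maps the two choice indexes
+    # to their session indexes, gathers each feature family with the helpers above
+    # and returns the 5-tuple (items_features, sessions_features,
+    # sessions_items_features, sessions_items_availabilities, choices).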
+
+    def __getitem__(self, choice_index):
+        """
+        Method to access data within the ChoiceDataset from its index.
+        One index corresponds to a choice within a session.
+
+        Return order:
+            - Fixed item features
+            - Session features
+            - Session item features
+            - Items availabilities
+            - Choice
+
+        Parameters
+        ----------
+        choice_index : int or list of int or slice
+            indexes of the choices (that will be mapped to choice & session indexes) to return
+        """
+        if isinstance(choice_index, list):
+            items_features = self._get_items_features()
+            # Get the session indexes
+            sessions_indexes = [self.choice_dataset.indexes[i] for i in choice_index]
+
+            sessions_features = self._get_sessions_features(sessions_indexes)
+            sessions_items_features = self._get_sessions_items_features(sessions_indexes)
+
+            if self.choice_dataset.sessions_items_availabilities is None:
+                sessions_items_availabilities = None
+            else:
+                if hasattr(self.choice_dataset.sessions_items_availabilities, "iloc"):
+                    sessions_items_availabilities = (
+                        self.choice_dataset.sessions_items_availabilities.iloc[
+                            sessions_indexes
+                        ].astype(self.choice_dataset._return_types[3])
+                    )
+                else:
+                    sessions_items_availabilities = (
+                        self.choice_dataset.sessions_items_availabilities[sessions_indexes].astype(
+                            self.choice_dataset._return_types[3]
+                        )
+                    )
+
+            choice = self.choice_dataset.choices[choice_index].astype(
+                self.choice_dataset._return_types[4]
+            )
+
+            return (
+                items_features,
+                sessions_features,
+                sessions_items_features,
+                sessions_items_availabilities,
+                choice,
+            )
+
+        elif isinstance(choice_index, slice):
+            # The number of choices is stored on the wrapped dataset, not on the indexer
+            return self.__getitem__(
+                list(range(*choice_index.indices(self.choice_dataset.choices.shape[0])))
+            )
+
+        elif isinstance(choice_index, int):
+            items_features = self._get_items_features()
+            # Get the session indexes
+            sessions_indexes = self.choice_dataset.indexes[choice_index]
+
+            sessions_features = self._get_sessions_features(sessions_indexes)
+            sessions_items_features = self._get_sessions_items_features(sessions_indexes)
+
+            if self.choice_dataset.sessions_items_availabilities is None:
+                sessions_items_availabilities = None
+            else:
+                if hasattr(self.choice_dataset.sessions_items_availabilities, "iloc"):
+                    sessions_items_availabilities = (
+                        self.choice_dataset.sessions_items_availabilities.iloc[
+                            sessions_indexes
+                        ].astype(self.choice_dataset._return_types[3])
+                    )
+                else:
+                    sessions_items_availabilities = (
+                        self.choice_dataset.sessions_items_availabilities[sessions_indexes].astype(
+                            self.choice_dataset._return_types[3]
+                        )
+                    )
+
+            choice = self.choice_dataset.choices[choice_index].astype(
+                self.choice_dataset._return_types[4]
+            )
+
+            return (
+                items_features,
+                sessions_features,
+                sessions_items_features,
+                sessions_items_availabilities,
+                choice,
+            )
+        else:
+            raise NotImplementedError
diff --git a/lib/data/store.py b/lib/data/store.py
new file mode 100644
index 00000000..b84ab286
--- /dev/null
+++ b/lib/data/store.py
@@ -0,0 +1,268 @@
+import numpy as np
+from choice_modeling.data.indexer import OneHotStoreIndexer, StoreIndexer
+
+
+class Store(object):
+    """Class to keep OneHotStore and FeaturesStore with the same parent"""
+    def __init__(self, indexes=None, values=None, sequence=None, name=None, indexer=StoreIndexer):
+        """
+        Builds the store
+
+        Parameters
+        ----------
+        indexes : array_like or None
+            list of indexes of features to store. If None is given, indexes are created from
+            apparition order of values
+        values : array_like
+            list of values of features to store
+        sequence : array_like
+            sequence of apparitions of the features
+        name: string, optional
+            name of the features store -- not used at the moment
+        """
+        if indexes is None:
+            # One index per value, following the order in which the values are given
+            indexes = list(range(len(values)))
+        self.store = {k: v for (k, v) in zip(indexes, values)}
+        self.sequence = np.array(sequence)
+        self.name = name
+
+        if sequence is not None and values is not None:
+            try:
+                width = len(values[0])
+            except TypeError:  # scalar values have no len()
+                width = 1
+            self.shape = (len(sequence), width)
+
+        self.indexer = indexer(self)
+
+    def _get_store_element(self, index):
+        """
+        Returns the features stored at index index. Compared to __getitem__, it does not take the
+        index-th element of sequence but the index-th element of the store.
+
+        Parameters
+        ----------
+        index : (int, list, slice)
+            index argument of the feature
+
+        Returns
+        -------
+        array_like
+            features corresponding to the index index in self.store
+        """
+        if isinstance(index, list):
+            return [self.store[i] for i in index]
+        else:
+            return self.store[index]
+
+    def __len__(self):
+        return len(self.sequence)
+
+    @property
+    def iloc(self):
+        return self.indexer
+
+
+class FeaturesStore(Store):
+    """
+    Base class to store features and a sequence of apparitions.
+    Mainly useful when features are repeated frequently over the sequence.
+    An example would be to store the features of a customer (supposing that the same customer
+    comes several times over the whole sequence) and to save which customer is concerned for
+    each choice.
+
+    Attributes
+    ----------
+    store : dict
+        Dictionary storing features that can be called from indexes: {index: features}
+    shape : tuple
+        shape of the features store: (sequence_length, features_number)
+    sequence : array_like
+        List of elements of indexes representing the sequence of apparitions of the features
+    name: string, optional
+        name of the features store -- not used at the moment
+    dtype: type
+        type of the features
+    """
+
+    @classmethod
+    def from_dict(cls, values_dict, sequence):
+        """
+        Instantiates the FeaturesStore from a dictionary of values
+
+        Parameters
+        ----------
+        values_dict : dict
+            dictionary of values to store, {index: value}
+        sequence : array_like
+            sequence of apparitions of the features
+
+        Returns
+        -------
+        FeaturesStore
+            created from the values in the dictionary
+        """
+        # Check uniform shape of values
+        return cls(
+            indexes=list(values_dict.keys()), values=list(values_dict.values()), sequence=sequence
+        )
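+
+    # Sketch of the intended use (illustrative, hypothetical values):
+    # store = FeaturesStore.from_dict(
+    #     {"c1": np.array([1.0, 2.0]), "c2": np.array([3.0, 4.0])},
+    #     sequence=["c1", "c2", "c1", "c1"],
+    # )
+    # store.iloc[[0, 2]]  # -> features of "c1" twice, while storing them only once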
+
+    @classmethod
+    def from_list(cls, values_list, sequence):
+        """
+        Instantiates the FeaturesStore from a list of values
+        Creates indexes for each value
+
+        Parameters
+        ----------
+        values_list : list
+            List of values to store
+        sequence : array_like
+            sequence of apparitions of the features
+
+        Returns
+        -------
+        FeaturesStore
+        """
+        # Check uniform shape of list
+        # Useful ? To rethink...
+        return cls(indexes=list(range(len(values_list))), values=values_list, sequence=sequence)
+
+    def __getitem__(self, sequence_index):
+        """
+        Subsets self with sequence_index
+
+        Parameters
+        ----------
+        sequence_index : (int, list, slice)
+            index position of the sequence
+
+        Returns
+        -------
+        array_like
+            features corresponding to the sequence_index-th position of sequence
+        """
+        if isinstance(sequence_index, int):
+            sequence_index = [sequence_index]
+        new_sequence = self.sequence[sequence_index]
+        store = {}
+        for k, v in self.store.items():
+            if k in new_sequence:
+                store[k] = v
+            else:
+                print(f"Key {k} of store with value {v} not in sequence anymore")
+
+        return FeaturesStore.from_dict(store, new_sequence)
+
+    def astype(self, dtype):
+        """
+        Changes the dtype of the features. The type of the features should implement the astype
+        method, as np.ndarray does.
+
+        Parameters
+        ----------
+        dtype : str or type
+            type to set the features as
+        """
+        for k, v in self.store.items():
+            self.store[k] = v.astype(dtype)
+
+
+class OneHotStore(Store):
+    """
+    Specific FeaturesStore for one hot features storage. Inherits from FeaturesStore.
+    For example can be used to store a OneHot representation of the days of week.
+
+    Has the same attributes as FeaturesStore, only differs with some One-Hot optimized methods.
+    """
+
+    def __init__(
+        self,
+        indexes=None,
+        values=None,
+        sequence=None,
+        name=None,
+        dtype=np.float32,
+    ):
+        """
+        Builds the OneHot features store
+
+        Parameters
+        ----------
+        indexes : array_like or None
+            list of indexes of features to store. If None is given, indexes are created from
+            apparition order of values
+        values : array_like or None
+            list of values of features to store that must be One-Hot. If None given they are
+            created from order of apparition in sequence
+        sequence : array_like
+            sequence of apparitions of the features
+        name: string, optional
+            name of the features store -- not used at the moment
+        """
+        self.name = name
+        self.sequence = np.array(sequence)
+
+        if values is None:
+            # Derive indexes and one-hot positions from the order of first
+            # apparition in the sequence (same logic as from_sequence)
+            indexes = np.unique(sequence)
+            values = np.arange(len(indexes))
+        self.store = {k: v for (k, v) in zip(indexes, values)}
+        self.shape = (len(sequence), np.max(values) + 1)
+
+        self.dtype = dtype
+        self.indexer = OneHotStoreIndexer(self)
+
+    @classmethod
+    def from_sequence(cls, sequence):
+        """Creates a OneHotFeatureStore from a sequence of apparitions.
+        One-Hot vectors are created from the order of apparition in the sequence: feature vectors
+        created have a length of the number of different values in the sequence and the 1 is
+        positioned in order of first apparitions in the sequence.
+
+        Parameters
+        ----------
+        sequence : array-like
+            Sequence of apparitions of values, or indexes. Will be used to index self.store
+
+        Returns
+        -------
+        OneHotStore
+            Created from the sequence.
+        """
+        all_indexes = np.unique(sequence)
+        values = np.arange(len(all_indexes))
+        return cls(indexes=all_indexes, values=values, sequence=sequence)
+
+    def __getitem__(self, sequence_index):
+        """Main method to get an element at sequence_index-th position of self.sequence.
+ + Parameters + ---------- + sequence_index : (int, list, slice) + index from sequence of element to get + + Returns + ------- + np.ndarray + OneHot features corresponding to the sequence_index-th position of sequence + """ + if isinstance(sequence_index, int): + sequence_index = [sequence_index] + new_sequence = self.sequence[sequence_index] + store = {} + for k, v in self.store.items(): + if k in new_sequence: + store[k] = v + else: + print(f"Key {k} of store with value {v} not in sequence anymore") + + return OneHotStore( + indexes=list(store.keys()), values=list(store.values()), sequence=new_sequence + ) + + def astype(self, dtype): + """Method to change (mainly int or float) type of returned OneHot features vectors. + + Parameters + ---------- + dtype : type + Type to set the features as + """ + self.dtype = dtype From 4f160dd95a37e0f49d13ccc72d873e500d5b6e7f Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Thu, 21 Dec 2023 16:27:52 +0100 Subject: [PATCH 2/8] format choice_dataset --- ...choice_dataset_v3.py => choice_dataset.py} | 251 ++++++++++-------- pyproject.toml | 2 +- 2 files changed, 148 insertions(+), 105 deletions(-) rename lib/data/{choice_dataset_v3.py => choice_dataset.py} (81%) diff --git a/lib/data/choice_dataset_v3.py b/lib/data/choice_dataset.py similarity index 81% rename from lib/data/choice_dataset_v3.py rename to lib/data/choice_dataset.py index ea12ab5e..a2fba478 100644 --- a/lib/data/choice_dataset_v3.py +++ b/lib/data/choice_dataset.py @@ -1,4 +1,4 @@ -"""Main classes to handle assortment data""" +"""Main classes to handle assortment data.""" import numpy as np import pandas as pd @@ -7,15 +7,18 @@ class ChoiceDataset(object): - """ - Version of ChoiceDataset where the choices are given as a ragged list of choices for each session. - It is particularly useful if several (a lot) of choices happen during the same session. - For example if we have the same customer buying several items during the same session, all its choices + """ChoiceDataset is the main class to handle assortment data minimizing RAM usage. + + The choices are given as a ragged list of choices + for each session. It is particularly useful if several (a lot) of choices happen + during the same session. For example if we have the same customer buying several + items during the same session, all its choices can be regrouped under the same session_features. Limits data duplication in such cases. - The class has same methods/arguments as ChoiceDatset with a slight difference with self.choices being - a ragged list. The returned features in self.__getitem__ are the same as ChoiceDataset. - When calling __getitem__(index) we map index to a session index and a choice index within the session. + The class has same methods/arguments as ChoiceDatset with a slight difference with + self.choices being a ragged list. The returned features in self.__getitem__ are the same + as ChoiceDataset. When calling __getitem__(index) we map index to a session index and a + choice index within the session. """ def __init__( @@ -31,28 +34,30 @@ def __init__( batch_size=16, shuffle=False, ): - """ - Builds the ChoiceDataset + """Builds the ChoiceDataset. Parameters ---------- items_features : tuple of (array_like, ) - matrix of shape (num_items, num_items_features) containing the features of the items e.g. item color + matrix of shape (num_items, num_items_features) containing the features of the items + e.g. 
item color sessions_features : tuple of (array_like, ) - matrix of shape (num_sessions, num_sess_features) containing the features of the sessions e.g. day of week + matrix of shape (num_sessions, num_sess_features) containing the features of the + sessions e.g. day of week sessions_items_features : tuple of (array_like, ) - matrix of shape (num_sessions, num_items, num_ses_items_features) containing the item features varying over sessions, e.g. prices + matrix of shape (num_sessions, num_items, num_ses_items_features) containing the item + features varying over sessions, e.g. prices sessions_items_availabilities : array_like - binary matrix of shape (num_sessions, num_items) containing the availabitilies of products (1. if present 0. otherwise) over sessions + binary matrix of shape (num_sessions, num_items) containing the availabitilies of + products (1. if present 0. otherwise) over sessions choices: list of list - for each sessions we have a list of related choices. Main list has same legnth as session_features and sessions_items_features. + for each sessions we have a list of related choices. Main list has same legnth as + session_features and sessions_items_features. batch_size: int, optional size of the batches to return in __iter__ method suffle: bool, optional whether to shuffle the dataset or not - """ - # --------- [ Handling features type given as tuples or not ] --------- # # If items_features is not given as tuple, transform it internally as a tuple # A bit longer because can be None and need to also handle names @@ -62,7 +67,11 @@ def __init__( self._return_items_features_tuple = False # items_features is already a tuple, names are given, checking consistency elif items_features is not None and items_features_names is not None: - assert len(items_features) == len(items_features_names) or items_features_names is None + if ( + len(items_features) != len(items_features_names) + and items_features_names is not None + ): + raise ValueError("items_features shape and items_features_names shape do not match") self._return_items_features_tuple = True # In this case names are missing, still transform it as a tuple elif items_features is not None: @@ -77,10 +86,14 @@ def __init__( self._return_sessions_features_tuple = False # sessions_features is already a tuple, names are given, checking consistency elif sessions_features is not None and sessions_features_names is not None: - assert ( - len(sessions_features) == len(sessions_features_names) - or sessions_features_names is None - ) + if ( + len(sessions_features) != len(sessions_features_names) + and sessions_features_names is not None + ): + raise ValueError( + "sessions_features shape and sessions_features_names shape \ + do not match" + ) self._return_sessions_features_tuple = True # In this case names are missing, still transform it as a tuple elif sessions_features is not None: @@ -95,10 +108,14 @@ def __init__( self._return_sessions_items_features_tuple = False # sessions_items_features is already a tuple, names are given, checking consistency elif sessions_items_features is not None and sessions_items_features_names is not None: - assert ( - len(sessions_items_features) == len(sessions_items_features_names) - or sessions_items_features_names is None - ) + if ( + len(sessions_items_features) != len(sessions_items_features_names) + and sessions_items_features_names is not None + ): + raise ValueError( + "sessions_items_features shape and \ + sessions_items_features_names shape do not match" + ) 
self._return_sessions_items_features_tuple = True # In this case names are missing, still transform it as a tuple elif sessions_items_features is not None: @@ -108,7 +125,8 @@ def __init__( # --------- [Normalizing features types (DataFrame, List, etc...) -> np.ndarray] --------- # # # Part of this code is for handling features given as pandas.DataFrame - # Basically it transforms them to be internally stocked as np.ndarray and keep columns names as features names + # Basically it transforms them to be internally stocked as np.ndarray and keep columns + # names as features names # Handling items_features for i, feature in enumerate(items_features): @@ -118,7 +136,7 @@ def __init__( feature = feature.set_index("item_id") items_features = ( items_features[:i] - + (feature.loc[np.sort(feature.index)].values,) + + (feature.loc[np.sort(feature.index)].to_numpy(),) + items_features[i + 1 :] ) items_features_names = ( @@ -137,7 +155,7 @@ def __init__( feature = feature.set_index("session_id") sessions_features = ( sessions_features[:i] - + (feature.loc[np.sort(feature.index)].values,) + + (feature.loc[np.sort(feature.index)].to_numpy(),) + sessions_features[i + 1 :] ) sessions_features_names = ( @@ -228,8 +246,8 @@ def __init__( self.indexer = ChoiceDatasetIndexer(self) def _build_indexes(self, choices): - """ - Builds the indexes dictionnary from the choices. + """Builds the indexes dictionnary from the choices. + Particularly creates a flatten version of the choices and associates an index so that we can retrieve from this index the session and the corresponding choice. @@ -238,7 +256,7 @@ def _build_indexes(self, choices): choices: list of list raffed version of the choices - Returns: + Returns:: -------- indexes: dict dictionnary of indexes: {index: corresponding_session_index} @@ -252,7 +270,7 @@ def _build_indexes(self, choices): elif len(np.squeeze(choices).shape) == 0: indexes = {i: i for i in range(len(choices))} flat_choices = np.array([np.squeeze(self.ragged_choices)]) - except: # Ragged sequence of choices + except ValueError: # Ragged sequence of choices indexes = {} flat_choices = [] total_count = 0 @@ -264,8 +282,9 @@ def _build_indexes(self, choices): return indexes, np.array(flat_choices) def _check_dataset(self): - """ - Verifies that the shapes of the different features are consistent + """Verifies that the shapes of the different features are consistent. + + Particularly: - Over number of items - Over number of sessions Verifies that the choices have coherent values @@ -275,8 +294,9 @@ def _check_dataset(self): self._check_choices_coherence() def _check_num_items_shapes(self): - """ - Verifies that the shapes of the different features are consistent over number of items + """Verifies that the shapes of the different features are consistent over number of items. 
+ + Particularly: - items_features - sessions_items_features - sessions_items_availabilities @@ -296,23 +316,26 @@ def _check_num_items_shapes(self): if self.items_features is not None: for items_feature in self.items_features: - assert ( - items_feature.shape[0] == base_num_items - ), f"shapes are (f{items_feature.shape[0]}, {base_num_items})" + if items_feature.shape[0] != base_num_items: + raise ValueError(f"shapes are (f{items_feature.shape[0]}, {base_num_items})") if self.sessions_items_features is not None: for sessions_items_feature in self.sessions_items_features: - assert ( - sessions_items_feature.shape[1] == base_num_items - ), f"shapes are (f{sessions_items_feature.shape[1]}, {base_num_items})" + if sessions_items_feature.shape[1] != base_num_items: + raise ValueError( + f"shapes are (f{sessions_items_feature.shape[1]}, {base_num_items})" + ) if self.sessions_items_availabilities is not None: - assert ( - self.sessions_items_availabilities.shape[1] == base_num_items - ), f"shapes are (f{self.sessions_items_availabilities.shape[1]}, {base_num_items})" + if self.sessions_items_availabilities.shape[1] != base_num_items: + raise ValueError( + f"shapes are (f{self.sessions_items_availabilities.shape[1]}, \ + {base_num_items})" + ) def _check_num_sessions_shapes(self): - """ - Verifies that the shapes of the different features are consistent over number of sessions + """Verifies that the shapes of the different features are consistent over nb of sessions. + + Particularly: - sessions_features - sessions_items_features - sessions_items_availabilities @@ -323,30 +346,37 @@ def _check_num_sessions_shapes(self): if self.sessions_features is not None: for sessions_feature in self.sessions_features: - assert ( - sessions_feature.shape[0] == base_num_sessions - ), f"shapes are: ({sessions_feature.shape[0]}, {base_num_sessions})" + if sessions_feature.shape[0] != base_num_sessions: + raise ValueError( + f"shapes are ({sessions_feature.shape[0]}, {base_num_sessions})" + ) if self.sessions_items_features is not None: for sessions_items_feature in self.sessions_items_features: - assert ( - sessions_items_feature.shape[0] == base_num_sessions - ), f"shapes are: ({sessions_items_feature.shape[0]}, {base_num_sessions})" + if sessions_items_feature.shape[0] != base_num_sessions: + raise ValueError( + f"shapes are: ({sessions_items_feature.shape[0]}, \ + {base_num_sessions})" + ) if self.sessions_items_availabilities is not None: - assert ( - self.sessions_items_availabilities.shape[0] == base_num_sessions - ), f"shapes are: ({self.sessions_items_availabilities.shape[0]}, {base_num_sessions})" + if self.sessions_items_availabilities.shape[0] != base_num_sessions: + raise ValueError( + f"shapes are: ({self.sessions_items_availabilities.shape[0]}, \ + {base_num_sessions})" + ) def _check_choices_coherence(self): - """ - Verifies that the choices are coherent with the number of items present in other features. + """Verifies that the choices are coherent with the nb of items present in other features. + Particularly: - There is no choice index higher than detected number of items - All items are present at least once in the choices """ - msg = f"Choices values not coherent with number of items given in features. 
In particular, \ - max value of choices is {np.max(self.choices)} while number of items is {self.base_num_items}" - assert np.max(self.choices) < self.base_num_items, msg + if np.max(self.choices) > self.base_num_items: + msg = f"Choices values not coherent with number of items given in features. \ + In particular, max value of choices is {np.max(self.choices)} while number of \ + items is {self.base_num_items}" + raise ValueError(msg) unique_choices = set(np.unique(self.choices).flatten()) missing_choices = set(np.arange(start=0, stop=self.base_num_items, step=1)) - unique_choices @@ -354,8 +384,9 @@ def _check_choices_coherence(self): print(f"Some choices never happen in the dataset: {missing_choices}") def _check_types(self): - """ - Checks types of elements and store it in order to return right types. + """Checks types of elements and store it in order to return right types. + + Particularly: - Either int32 or float32 consistently for features. float32 is to be preferred unless One-Hot encoding is used. - float32 for sessions_items_availabilities @@ -395,19 +426,24 @@ def _check_types(self): return return_types def _check_names(self): + """Verifies that the names given to features are consistent with the features themselves.""" if self.items_features_names is not None: for name, features in zip(self.items_features_names, self.items_features): if name is not None: - assert ( - len(name) == features.shape[1] - ), f"Specififed items_features_names has length {len(name)} while items_features has {features.shape[1]} elements" + if len(name) != features.shape[1]: + raise ValueError( + f"Specififed items_features_names has \ + length {len(name)} while items_features has {features.shape[1]} elements" + ) if self.sessions_features_names is not None: for name, features in zip(self.sessions_features_names, self.sessions_features): if name is not None: - assert ( - len(name) == features.shape[1] - ), f"Specififed sessions_features_names has length {len(name)} while sessions_features has {features.shape[1]} elements" + if len(name) != features.shape[1]: + raise ValueError( + f"Specified sessions_features_names has \ + length {len(name)} while sessions_features has {features.shape[1]} elements" + ) if self.sessions_items_features_names is not None: for ( @@ -415,25 +451,27 @@ def _check_names(self): features, ) in zip(self.sessions_items_features_names, self.sessions_items_features): if name is not None: - assert ( - len(name) == features.shape[1] - ), f"Specififed sessions_items_features_names has length {len(name)} while sessions_items_features has {features.shape[1]} elements" + if len(name) != features.shape[1]: + raise ValueError( + f"Specified \ + sessions_items_features_names has length {len(name)} while \ + sessions_items_features has {features.shape[1]} elements" + ) def __len__(self): """Returns length of the dataset e.g. total number of sessions. Returns: - -------- + ------- int total number of sessions """ return self.base_num_sessions def get_num_items(self): - """ - Method to access the total number of different items + """Method to access the total number of different items. - Returns + Returns: ------- int total number of different items @@ -441,11 +479,11 @@ def get_num_items(self): return self.base_num_items def get_num_sessions(self): - """ - Method to access the total number of different sessions. + """Method to access the total number of different sessions. + Redundant with __len__ method. 
- Returns + Returns: ------- int total number of different sessions @@ -453,10 +491,9 @@ def get_num_sessions(self): return len(self) def get_num_choices(self): - """ - Method to access the total number of different sessions + """Method to access the total number of different sessions. - Returns + Returns: ------- int total number of different sessions @@ -486,7 +523,7 @@ def _sessions_items_features_df_to_np( features : list List of columns of df that represents the items_features (for sessions_items_features) - Returns + Returns: ------- np.ndarray of shape (n_sessions, n_items, n_features) Corresponding sessions_items_features @@ -520,7 +557,7 @@ def _sessions_items_features_df_to_np( item_df = sess_df.loc[sess_df[items_id_column] == item] if len(item_df) > 0: if features is not None: - sess_feats.append(item_df[features].values[0]) + sess_feats.append(item_df[features].to_numpy()[0]) sess_av.append(1) else: if features is not None: @@ -566,12 +603,11 @@ def from_single_df( choices_column: str, optional Name of the column containing the choices, default is "choice" - Returns + Returns: ------- ChoiceDataset corresponding ChoiceDataset """ - # Ordering items and sessions by id items = np.sort(df[items_id_column].unique()) sessions = np.sort(df[sessions_id_column].unique()) @@ -579,7 +615,7 @@ def from_single_df( if items_features_columns is not None: items_features = df[items_features_columns + [items_id_column]].drop_duplicates() items_features = items_features.set_index(items_id_column) - items_features = (items_features.loc[items].values,) + items_features = (items_features.loc[items].to_numpy(),) items_features_columns = (items_features_columns,) else: @@ -590,7 +626,7 @@ def from_single_df( sessions_features_columns + [sessions_id_column] ].drop_duplicates() sessions_features = sessions_features.set_index(sessions_id_column) - sessions_features = (sessions_features.loc[sessions].values,) + sessions_features = (sessions_features.loc[sessions].to_numpy(),) sessions_features_columns = (sessions_features_columns,) else: @@ -616,7 +652,7 @@ def from_single_df( if choice_mode == "item_id": choices = df[[choices_column, sessions_id_column]].drop_duplicates(sessions_id_column) choices = choices.set_index(sessions_id_column) - choices = choices.loc[sessions].values + choices = choices.loc[sessions].to_numpy() # items is the value (str) of the item choices = [np.where(items == c)[0] for c in choices] elif choice_mode == "one_zero": @@ -626,7 +662,7 @@ def from_single_df( choices = ( choices.loc[sessions][items_id_column] .map({k: v for v, k in enumerate(items)}) - .values + .to_numpy() ) else: raise ValueError( @@ -645,14 +681,16 @@ def from_single_df( ) def save(self): + """Method to save the dataset.""" raise NotImplementedError def summary(self): + """Method to display a summary of the dataset.""" raise NotImplementedError def get_choice_batch(self, choice_index): - """ - Method to access data within the ListChoiceDataset from its index. + """Method to access data within the ListChoiceDataset from its index. + One index corresponds to a choice within a session. 
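For orientation, a hedged usage sketch of from_single_df on a long-format DataFrame; the keyword names are inferred from the body above and may differ from the final signature, and ChoiceDataset is the class defined in this file:

    import pandas as pd

    df = pd.DataFrame(
        {
            "item_id": ["A", "B", "A", "B"],
            "session_id": [0, 0, 1, 1],
            "choice": ["A", "A", "B", "B"],  # with choice_mode="item_id": chosen item id per row
        }
    )
    dataset = ChoiceDataset.from_single_df(
        df,
        items_features_columns=None,
        sessions_features_columns=None,
        items_id_column="item_id",
        sessions_id_column="session_id",
        choice_mode="item_id",
        choices_column="choice",
    )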
Return order: @@ -694,7 +732,8 @@ def get_choice_batch(self, choice_index): else sessions_feature.iloc[sessions_indexes] for i, sessions_feature in enumerate(self.sessions_features) ) - # sessions_features were not given as a tuple, so we return do not return it as a tuple + # sessions_features were not given as a tuple, so we return do not return it + # as a tuple if not self._return_sessions_features_tuple: sessions_features = sessions_features[0] @@ -709,7 +748,8 @@ def get_choice_batch(self, choice_index): else sessions_items_feature.iloc[sessions_indexes] for i, sessions_items_feature in enumerate(self.sessions_items_features) ) - # sessions_items_features were not given as a tuple, so we return do not return it as a tuple + # sessions_items_features were not given as a tuple, so we return do not return + # it as a tuple if not self._return_sessions_items_features_tuple: sessions_items_features = sessions_items_features[0] @@ -730,7 +770,7 @@ def get_choice_batch(self, choice_index): choice, ) - elif isinstance(choice_index, slice): + if isinstance(choice_index, slice): return self.get_choice_batch(list(range(*choice_index.indices(self.choices.shape[0])))) session_index = self.indexes[choice_index] @@ -777,7 +817,7 @@ def __getitem__(self, session_indexes): indexes : np.ndarray indexes of the sessions to keep, shape should be (num_sessions,) - Returns + Returns: ------- ChoiceDataset ChoiceDataset with only the sessions indexed by indexes @@ -806,11 +846,10 @@ def __getitem__(self, session_indexes): ) def batch(self, batch_size=None, shuffle=None, sample_weight=None): - """ - Iterates over dataset return batches of length self.batch_size + """Iterates over dataset return batches of length self.batch_size. - Arguments - --------- + Parameters + ---------- batch_size : int batch size to set shuffle: bool @@ -836,9 +875,12 @@ def batch(self, batch_size=None, shuffle=None, sample_weight=None): while yielded_size < num_choices: # Return sample_weight if not None, for index matching if sample_weight is not None: - yield self.get_choice_batch( - indexes[yielded_size : yielded_size + batch_size].tolist() - ), sample_weight[indexes[yielded_size : yielded_size + batch_size].tolist()] + yield ( + self.get_choice_batch( + indexes[yielded_size : yielded_size + batch_size].tolist() + ), + sample_weight[indexes[yielded_size : yielded_size + batch_size].tolist()], + ) else: yield self.get_choice_batch( indexes[yielded_size : yielded_size + batch_size].tolist() @@ -851,4 +893,5 @@ def batch(self, batch_size=None, shuffle=None, sample_weight=None): @property def iloc(self): + """Indexer.""" return self.indexer diff --git a/pyproject.toml b/pyproject.toml index 651d0ce9..1c5fd9e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ select = [ "PTH", "PD", ] # See: https://beta.ruff.rs/docs/rules/ -ignore = ["D203", "D213", "ANN101", "ANN102"] +ignore = ["D203", "D213", "ANN101", "ANN102", "ANN204", "ANN001", "ANN202", "ANN201", "ANN206"] line-length = 100 target-version = "py310" exclude = [ From 22da1a14847d6b19e8a106060a4c1f3938c85996 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Thu, 21 Dec 2023 17:22:04 +0100 Subject: [PATCH 3/8] format indexer --- lib/data/indexer.py | 164 ++++++++++++++++++++++++++++---------------- 1 file changed, 105 insertions(+), 59 deletions(-) diff --git a/lib/data/indexer.py b/lib/data/indexer.py index 87d1e24c..0bc2de80 100644 --- a/lib/data/indexer.py +++ b/lib/data/indexer.py @@ -1,46 +1,63 @@ +"""Indexer classes for data classes.""" from abc import 
abstractmethod import numpy as np -import pandas as pd class Indexer(object): + """Base class for Indexer.""" + def __init__(self, indexed_object): + """Instanciate an Indexer object. + + Parameters + ---------- + indexed_object : object + object to be indexed. + """ self.indexed_object = indexed_object @abstractmethod def __getitem__(self, index): + """Main method to be coded for children classes. + + Parameters + ---------- + index : int, slice, list + index(es) of elements of self.indexed_object to be returned. + """ pass class StoreIndexer(Indexer): - """Class for Ilocing FeaturesStore - - Parameters - ---------- - TBD - """ + """Class for Ilocing/Batching FeaturesStore.""" def __init__(self, store): + """StoreIndexer constructor. + + Parameters + ---------- + store : choice_modeling.data.store.FeaturesStore + Store object to be indexed. + """ self.store = store def __getitem__(self, sequence_index): - """ - Returns the features corresponding appearing at the sequence_index-th position of sequence + """Returns the features appearing at the sequence_index-th position of sequence. Parameters ---------- sequence_index : (int, list, slice) index position of the sequence - Returns - ------- + Returns: + -------- array_like features corresponding to the sequence_index-th position of sequence """ if isinstance(sequence_index, list): return [self.store.store[self.store.sequence[i]] for i in sequence_index] - elif isinstance(sequence_index, slice): + if isinstance(sequence_index, slice): return [ self.store.store[self.store.sequence[i]] for i in range(*sequence_index.indices(len(self.sequence))) @@ -49,14 +66,16 @@ def __getitem__(self, sequence_index): class OneHotStoreIndexer(Indexer): - """Class for Ilocing OneHotStore - - Parameters - ---------- - TBD - """ + """Class for Ilocing OneHotStore.""" def __init__(self, store): + """OneHotStoreIndexer constructor. + + Parameters + ---------- + store : choice_modeling.data.store.OneHotStore + OneHotStore object to be indexed. + """ self.store = store self.shape = (len(self.store.sequence), np.max(list(self.store.store.values())) + 1) @@ -69,8 +88,8 @@ def __getitem__(self, sequence_index): sequence_index : (int, list, slice) index from sequence of element to get - Returns - ------- + Returns: + -------- np.ndarray OneHot features corresponding to the sequence_index-th position of sequence """ @@ -80,25 +99,33 @@ def __getitem__(self, sequence_index): for i, j in enumerate(sequence_index): one_hot[i, self.store.store[self.store.sequence[j]]] = 1 return one_hot.astype(self.store.dtype) - else: - one_hot = np.zeros(self.shape[1]) - one_hot[self.store.store[self.store.sequence[sequence_index]]] = 1 - return one_hot.astype(self.store.dtype) + # else: + one_hot = np.zeros(self.shape[1]) + one_hot[self.store.store[self.store.sequence[sequence_index]]] = 1 + return one_hot.astype(self.store.dtype) class ChoiceDatasetIndexer(Indexer): - """Indexing class for ChoiceDataset - - Parameters - ---------- - object : _type_ - _description_ - """ + """Indexing class for ChoiceDataset.""" def __init__(self, choice_dataset): + """Instanciate a ChoiceDatasetIndexer object. + + Parameters + ---------- + choice_dataset : choce_modeling.data.dataset.ChoiceDataset + Dataset to be indexed. + """ self.choice_dataset = choice_dataset def _get_items_features(self): + """Method to access items features of the ChoiceDataset. 
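The one-hot lookup in OneHotStoreIndexer above amounts to scattering ones into a zero matrix; an equivalent standalone NumPy sketch, illustrative and outside the Store machinery:

    import numpy as np

    sequence = ["mon", "tue", "mon", "wed"]   # apparition sequence
    store = {"mon": 0, "tue": 1, "wed": 2}    # value -> one-hot position

    def one_hot_at(indexes):
        # Mirrors OneHotStoreIndexer.__getitem__ for a list of sequence indexes
        out = np.zeros((len(indexes), max(store.values()) + 1), dtype=np.float32)
        for row, seq_idx in enumerate(indexes):
            out[row, store[sequence[seq_idx]]] = 1.0
        return out

    print(one_hot_at([0, 3]))  # rows for "mon" and "wed"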
+ + Returns: + -------- + tuple of np.ndarray or np.ndarray + items_features of the ChoiceDataset + """ if self.choice_dataset.items_features is None: items_features = None else: @@ -113,6 +140,18 @@ def _get_items_features(self): return items_features def _get_sessions_features(self, sessions_indexes): + """Method to access sessions features of the ChoiceDataset. + + Parameters + ---------- + sessions_indexes : list of ints or int + indexes of the sessions to return + + Returns: + -------- + tuple of np.ndarray or np.ndarray + items_features of the ChoiceDataset + """ if self.choice_dataset.sessions_features is None: sessions_features = None else: @@ -138,35 +177,43 @@ def _get_sessions_features(self, sessions_indexes): return sessions_features def _get_sessions_items_features(self, sessions_indexes): + """Method to access sessions items features of the ChoiceDataset. + + Parameters + ---------- + sessions_indexes : list of ints or int + indexes of the sessions to return + + Returns: + -------- + tuple of np.ndarray or np.ndarray + items_features of the ChoiceDataset + """ if self.choice_dataset.sessions_items_features is None: - sessions_items_features = None - else: - sessions_items_features = [] - for i, sessions_items_feature in enumerate(self.choice_dataset.sessions_items_features): - if hasattr(sessions_items_feature, "iloc"): - sessions_items_features.append( - sessions_items_feature.iloc[sessions_indexes].astype( - self._return_types[2][i] - ) - ) - else: - sessions_items_features.append( - np.stack(sessions_items_feature[sessions_indexes], axis=0).astype( - self.choice_dataset._return_types[2][i] - ) - ) - # sessions_items_features were not given as a tuple, so we return do not return it as a tuple - if self.choice_dataset._return_sessions_items_features_tuple: - sessions_items_features = tuple(sessions_items_features) + return None + sessions_items_features = [] + for i, sessions_items_feature in enumerate(self.choice_dataset.sessions_items_features): + if hasattr(sessions_items_feature, "iloc"): + sessions_items_features.append( + sessions_items_feature.iloc[sessions_indexes].astype(self._return_types[2][i]) + ) else: - sessions_items_features = sessions_items_features[0] - return sessions_items_features + sessions_items_features.append( + np.stack(sessions_items_feature[sessions_indexes], axis=0).astype( + self.choice_dataset._return_types[2][i] + ) + ) + # sessions_items_features were not given as a tuple, thus we do not return it as a tuple + if self.choice_dataset._return_sessions_items_features_tuple: + sessions_items_features = tuple(sessions_items_features) + else: + sessions_items_features = sessions_items_features[0] + return sessions_items_features def __getitem__(self, choice_index): - """ - Method to access data within the ChoiceDataset from its index. - One index corresponds to a choice within a session. + """Method to access data within the ChoiceDataset from its index. + One index corresponds to a choice within a session. 
Return order: - Fixed item features - Session features @@ -216,10 +263,10 @@ def __getitem__(self, choice_index): choice, ) - elif isinstance(choice_index, slice): + if isinstance(choice_index, slice): return self.__getitem__(list(range(*choice_index.indices(self.choices.shape[0])))) - elif isinstance(choice_index, int): + if isinstance(choice_index, int): items_features = self._get_items_features() # Get the session indexes sessions_indexes = self.choice_dataset.indexes[choice_index] @@ -254,5 +301,4 @@ def __getitem__(self, choice_index): sessions_items_availabilities, choice, ) - else: - raise NotImplementedError + raise NotImplementedError From d39b4d99c03109ac337d635fe607157c78ab8fef Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Thu, 21 Dec 2023 17:29:16 +0100 Subject: [PATCH 4/8] format store --- lib/data/store.py | 92 +++++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 43 deletions(-) diff --git a/lib/data/store.py b/lib/data/store.py index b84ab286..5680943f 100644 --- a/lib/data/store.py +++ b/lib/data/store.py @@ -1,18 +1,19 @@ +"""Different classes to optimize RAM usage with repeated features over time.""" import numpy as np from choice_modeling.data.indexer import OneHotStoreIndexer, StoreIndexer class Store(object): - """Class to keep OneHotStore and FeaturesStore with same parent""" + """Class to keep OneHotStore and FeaturesStore with same parent.""" def __init__(self, indexes=None, values=None, sequence=None, name=None, indexer=StoreIndexer): - """ - Builds the store + """Builds the store. Parameters ---------- indexes : array_like or None - list of indexes of features to store. If None is given, indexes are created from apparition order of values + list of indexes of features to store. If None is given, indexes are created from + apparition order of values values : array_like list of values of features to store sequence : array_like @@ -29,49 +30,52 @@ def __init__(self, indexes=None, values=None, sequence=None, name=None, indexer= if sequence is not None and values is not None: try: width = len(values[0]) - except: + except TypeError: width = 1 self.shape = (len(sequence), width) self.indexer = indexer(self) def _get_store_element(self, index): - """ - Returns the features stored at index index. Compared to __getitem__, it does take the index-th - element of sequence but the index-th element of the store. + """Getter method over self.sequence. + + Returns the features stored at index index. Compared to __getitem__, it does take + the index-th element of sequence but the index-th element of the store. Parameters ---------- index : (int, list, slice) index argument of the feature - Returns - ------- + Returns: + -------- array_like features corresponding to the index index in self.store """ if isinstance(index, list): return [self.store[i] for i in index] - else: - return self.store[index] + # else: + return self.store[index] def __len__(self): + """Returns the length of the sequence of apparition of the features.""" return len(self.sequence) @property def iloc(self): + """Indexing attribute.""" return self.indexer class FeaturesStore(Store): - """ - Base class to store features and a sequence of apparitions. + """Base class to store features and a sequence of apparitions. + Mainly useful when features are repeated frequently over the sequence. An example would be to store the features of a customers (supposing that the same customers come several times over the work sequence) and to save which customer is concerned for each choice. 
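To make the RAM argument below concrete, a minimal sketch of the store/sequence split: each distinct feature vector is kept once, and the sequence holds only keys.

    import numpy as np

    store = {"cust_1": np.array([25.0, 1.0]), "cust_2": np.array([40.0, 0.0])}
    sequence = ["cust_1", "cust_2", "cust_1", "cust_1", "cust_2", "cust_1"]

    # Indexing resolves through the sequence instead of duplicating feature rows
    features_at_3 = store[sequence[3]]  # -> array([25., 1.])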
- Attributes - ---------- + Attributes: + ----------- store : dict Dictionary stocking features that can be called from indexes: {index: features} shape : tuple @@ -86,8 +90,7 @@ class FeaturesStore(Store): @classmethod def from_dict(cls, values_dict, sequence): - """ - Instantiates the FeaturesStore from a dictionary of values + """Instantiates the FeaturesStore from a dictionary of values. Parameters ---------- @@ -96,8 +99,8 @@ def from_dict(cls, values_dict, sequence): sequence : array_like sequence of apparitions of the features - Returns - ------- + Returns: + -------- FeaturesStore created from the values in the dictionnary """ # Check uniform shape of values @@ -107,8 +110,8 @@ def from_dict(cls, values_dict, sequence): @classmethod def from_list(cls, values_list, sequence): - """ - Instantiates the FeaturesStore from a list of values + """Instantiates the FeaturesStore from a list of values. + Creates indexes for each value Parameters @@ -118,8 +121,8 @@ def from_list(cls, values_list, sequence): sequence : array_like sequence of apparitions of the features - Returns - ------- + Returns: + -------- FeaturesStore """ # Check uniform shape of list @@ -127,16 +130,15 @@ def from_list(cls, values_list, sequence): return cls(indexes=list(range(len(values_list))), values=values_list, sequence=sequence) def __getitem__(self, sequence_index): - """ - Subsets self with sequence_index + """Subsets self with sequence_index. Parameters ---------- sequence_index : (int, list, slice) index position of the sequence - Returns - ------- + Returns: + -------- array_like features corresponding to the sequence_index-th position of sequence """ @@ -154,8 +156,9 @@ def __getitem__(self, sequence_index): return FeaturesStore.from_dict(store, new_sequence) def astype(self, dtype): - """ - Changes the dtype of the features. The type of the features should implement the astype method. + """Changes the dtype of the features. + + The type of the features should implement the astype method. Typically, should work like np.ndarrays. Parameters @@ -168,8 +171,9 @@ def astype(self, dtype): class OneHotStore(Store): - """ - Specific FeaturesStore for one hot features storage. Inherits from FeaturesStore. + """Specific FeaturesStore for one hot features storage. + + Inherits from FeaturesStore. For example can be used to store a OneHot representation of the days of week. Has the same attributes as FeaturesStore, only differs whit some One-Hot optimized methods. @@ -183,15 +187,16 @@ def __init__( name=None, dtype=np.float32, ): - """ - Builds the OneHot features store + """Builds the OneHot features store. Parameters ---------- indexes : array_like or None - list of indexes of features to store. If None is given, indexes are created from apparition order of values + list of indexes of features to store. If None is given, indexes are created from + apparition order of values values : array_like or None - list of values of features to store that must be One-Hot. If None given they are created from order of apparition in sequence + list of values of features to store that must be One-Hot. If None given they are created + from order of apparition in sequence sequence : array_like sequence of apparitions of the features name: string, optional @@ -212,17 +217,18 @@ def __init__( @classmethod def from_sequence(cls, sequence): """Creates a OneHotFeatureStore from a sequence of apparition. 
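Assuming values are numbered by first apparition, as the surrounding docstring describes, the mapping that from_sequence builds looks like this sketch:

    sequence = [7, 1, 7, 3, 1]  # e.g. raw day-of-week codes

    store = {}
    for value in sequence:
        if value not in store:
            store[value] = len(store)
    # store == {7: 0, 1: 1, 3: 2}; the one-hot vector for value 7 is [1, 0, 0]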
- One Hot vector are created from the order of apparition in the sequence: feature vectors created - have a length of the number of different values in the sequence and the 1 is positioned in order of - first appartitions in the sequence. + + One Hot vector are created from the order of apparition in the sequence: feature vectors + created have a length of the number of different values in the sequence and the 1 is + positioned in order of first appartitions in the sequence. Parameters ---------- sequence : array-like Sequence of apparitions of values, or indexes. Will be used to index self.store - Returns - ------- + Returns: + -------- FeatureStore Created from the sequence. """ @@ -238,8 +244,8 @@ def __getitem__(self, sequence_index): sequence_index : (int, list, slice) index from sequence of element to get - Returns - ------- + Returns: + -------- np.ndarray OneHot features corresponding to the sequence_index-th position of sequence """ From 355f2212200533f41f0edc631c4031097c0271da Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Thu, 21 Dec 2023 17:49:29 +0100 Subject: [PATCH 5/8] add tf_ops --- lib/tf_ops.py | 147 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 lib/tf_ops.py diff --git a/lib/tf_ops.py b/lib/tf_ops.py new file mode 100644 index 00000000..cf8d617e --- /dev/null +++ b/lib/tf_ops.py @@ -0,0 +1,147 @@ +"""Diverse Tensorflow operations used in the ChoiceLearn library.""" + +import tensorflow as tf + + +def custom_softmax( + sessions_items_logits, sessions_items_availabilities, axis=-1, normalize_exit=False, eps=1e-5 +): + """Function to compute softmax probabilities from utilities. + + Takes into account availabilties (1 if the product is available, 0 otherwise) to set + probabilities to 0 for unavailable products and to renormalize the probabilities of + available products. + + Parameters + ---------- + sessions_items_logits : np.ndarray (n_sessions, n_products) + Utilities / Logits on which to compute the softmax + sessions_items_availabilities : np.ndarray (n_sessions, n_products) + Matrix indicating the availabitily (1) or not (0) of the products + axis : int, optional + Axis of sessions_logits on which to apply the softmax, by default -1 + normalize_exit : bool, optional + Whether to normalize the probabilities of available products with an exit choice of + utility 1, by default False + eps : float, optional + Value to avoid division by 0 when a product with probability almost 1 is unavailable, + by default 1e-5 + + Returns: + -------- + tf.Tensor (n_sessions, n_products) + Probabilities of each product for each session computed from Logits + """ + # Substract max utility to avoid overflow + numerator = tf.exp( + sessions_items_logits - tf.reduce_max(sessions_items_logits, axis=axis, keepdims=True) + ) + # Set unavailable products utility to 0 + numerator = tf.multiply(numerator, sessions_items_availabilities) + # Sum of total available utilities + denominator = tf.reduce_sum(numerator, axis=axis, keepdims=True) + # Add 1 to the denominator to take into account the exit choice + if normalize_exit: + denominator += 1 + # Avoir division by 0 when only unavailable items have highest utilities + elif eps: + denominator += eps + + # Compute softmax + return numerator / denominator + + +class CustomCategoricalCrossEntropy(tf.keras.losses.Loss): + """Custom Categorical Cross Entropy Loss. Handles all options in one place. 
+ + Follows structure of tf.keras.losses.CategoricalCrossentropy and its different possibilities. + + Parameters + ---------- + from_logits : bool, optional + Whether to compute the softmax from logits or probabilities, by default False + sparse : bool, optional + Whether the choice labels are integers(True) or one-hot encoded(False), by default False + label_smoothing : float, optional + Value of smoothing to apply to labels, by default 0.0 + Smoothing applied is 1.0 - label_smoothing for chosen item and + label_smoothing / (num_items - 1)for all other items + axis : int, optional + Axis on which to compute the softmax. Used only if from_logits is True, by default -1 + epsilon : float, optional + Value to apply to avoid computation issues in log, by default 1e-10 + """ + + def __init__( + self, + from_logits=False, + sparse=False, + label_smoothing=0.0, + axis=-1, + epsilon=1e-10, + name="eps_categorical_crossentropy", + reduction=tf.keras.losses.Reduction.AUTO, + ): + """Initialization function. + + Follows structure of tf.keras.losses.CategoricalCrossentropy. + + Parameters + ---------- + from_logits : bool, optional + Whether to compute the softmax from logits or probabilities, by default False + sparse : bool, optional + Whether the choice labels are integers(True) or one-hot encoded(False), by default False + label_smoothing : float, optional + Value of smoothing to apply to labels, by default 0.0 + Smoothing applied is 1.0 - label_smoothing for chosen item and + label_smoothing / (num_items - 1) for all other items + axis : int, optional + Axis on which to compute the softmax. Used only if from_logits is True, by default -1 + epsilon : float, optional + Value to apply to avoid computation issues in log, by default 1e-10 + name: str + Name of the loss function - here to follow tf.keras.losses.Loss signature + reduction: + Reduction function - here to follow tf.keras.losses.Loss signature + """ + super().__init__(reduction=reduction, name=name) + self.label_smoothing = label_smoothing + self.from_logits = from_logits + self.sparse = sparse + self.axis = axis + self.epsilon = epsilon + + def call(self, y_true, y_pred): + """Computes the cross-entropy loss. 
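As a sanity reference for custom_softmax defined above, its documented behaviour can be reproduced in plain NumPy; this is a sketch mirroring the docstring, not the TF implementation itself:

    import numpy as np

    def masked_softmax(logits, availabilities, normalize_exit=False, eps=1e-5):
        num = np.exp(logits - logits.max(axis=-1, keepdims=True))
        num = num * availabilities                    # zero out unavailable items
        den = num.sum(axis=-1, keepdims=True)
        den = den + (1.0 if normalize_exit else eps)  # exit option or division guard
        return num / den

    logits = np.array([[2.0, 1.0, 0.0]])
    avail = np.array([[1.0, 0.0, 1.0]])               # item 1 unavailable
    print(masked_softmax(logits, avail))              # mass only on items 0 and 2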
+ + Parameters + ---------- + y_true : np.ndarray | tf.Tenosr + Ground truth labels + y_pred : np.ndarray | tf.Tenosr + Predicted labels + + Returns: + -------- + tf.Tensor + Average Cross-Entropy loss + """ + if self.from_logits: # Apply softmax if utilities are given + y_pred = tf.nn.softmax(y_pred, axis=self.axis) + else: + y_pred = tf.convert_to_tensor(y_pred) + if self.sparse: # Create OneHot labels if sparse labels are given + y_true = tf.one_hot(y_true, depth=tf.shape(y_pred)[self.axis]) + else: + y_true = tf.cast(y_true, y_pred.dtype) + + # Smooth labels + if self.label_smoothing > 0: + label_smoothing = tf.convert_to_tensor(self.label_smoothing, dtype=y_pred.dtype) + num_classes = tf.cast(tf.shape(y_true)[self.axis], y_pred.dtype) + y_true = y_true * (1.0 - label_smoothing) + (label_smoothing / num_classes) + + # Apply label clipping to avoid log(0) and such issues + y_pred = tf.clip_by_value(y_pred, self.epsilon, 1.0 - self.epsilon) + return -tf.reduce_sum(y_true * tf.math.log(y_pred), axis=self.axis) From 11531cd588282974579f342f566bd04ad052f2da Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Thu, 21 Dec 2023 18:12:57 +0100 Subject: [PATCH 6/8] add base_model --- lib/models/base_model.py | 785 +++++++++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 786 insertions(+), 1 deletion(-) create mode 100644 lib/models/base_model.py diff --git a/lib/models/base_model.py b/lib/models/base_model.py new file mode 100644 index 00000000..5b19399b --- /dev/null +++ b/lib/models/base_model.py @@ -0,0 +1,785 @@ +"""Base class for choice models.""" +import json +import os +import time +from abc import abstractmethod +from pathlib import Path + +import numpy as np +import tensorflow as tf +import tqdm +from choice_modeling.tf_ops import ( + CustomCategoricalCrossEntropy, + availability_softmax, + custom_softmax, +) + + +class ChoiceModel(object): + """Base class for choice models.""" + + def __init__( + self, + label_smoothing=0.0, + normalize_non_buy=False, + optimizer="Adam", + callbacks=None, + lr=0.001, + ): + """Instantiates the ChoiceModel. + + Parameters + ---------- + label_smoothing : float, optional + Whether (then is ]O, 1[ value) or not (then can be None or 0) to use label smoothing, + during training, by default 0.0 + by default None. Label smoothing is applied to LogLikelihood loss. + normalize_non_buy : bool, optional + Whether or not to add a normalization (then U=1) with the exit option in probabilites + normalization,by default True + callbacks : list of tf.kera callbacks, optional + List of callbacks to add to model.fit, by default None and only add History + """ + self.is_fitted = False + self.normalize_non_buy = normalize_non_buy + self.label_smoothing = label_smoothing + self.stop_training = False + + # self.loss = tf.keras.losses.CategoricalCrossentropy( + # from_logits=False, label_smoothing=self.label_smoothing + # ) + self.loss = CustomCategoricalCrossEntropy( + from_logits=False, label_smoothing=self.label_smoothing + ) + self.callbacks = tf.keras.callbacks.CallbackList(callbacks, add_history=True, model=None) + self.callbacks.set_model(self) + + # Was originally in BaseMNL, moved here. 
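The smoothing step in call() above rescales the one-hot target as y * (1 - s) + s / K; note this puts s / K on the non-chosen items, which differs slightly from the s / (num_items - 1) phrasing in the class docstring. Numerically, with three items and s = 0.1:

    import numpy as np

    s, num_classes = 0.1, 3
    y_true = np.array([0.0, 1.0, 0.0])
    smoothed = y_true * (1.0 - s) + s / num_classes
    print(smoothed)  # [0.0333... 0.9333... 0.0333...], still summing to 1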
+ if optimizer.lower() == "adam": + self.optimizer = tf.keras.optimizers.Adam(lr) + elif optimizer.lower() == "sgd": + self.optimizer = tf.keras.optimizers.SGD(lr) + elif optimizer.lower() == "adamax": + self.optimizer = tf.keras.optimizers.Adamax(lr) + elif optimizer.lower() == "lbfgs" or optimizer.lower() == "l-bfgs": + print("Using L-BFGS optimizer, setting up .fit() function") + self.fit = self._fit_with_lbfgs + else: + print(f"Optimizer {optimizer} not implemnted, switching for default Adam") + self.optimizer = tf.keras.optimizers.Adam(lr) + + @abstractmethod + def compute_utility( + self, items_batch, sessions_batch, sessions_items_batch, availabilities_batch, choices_batch + ): + """Method that defines how the model computes the utility of a product. + + MUST be implemented in children classe + For simpler use-cases this is the only method to be user-defined. + + Parameters + ---------- + items_batch : tuple of np.ndarray (items_features) + Fixed-Item-Features: formatting from ChoiceDataset: a matrix representing the products + constant/fixed features. + Shape must be (n_items, n_items_features) + sessions_batch : tuple of np.ndarray (sessions_features) + Time-Features + Shape must be (n_sessions, n_sessions_features) + sessions_items_batch : tuple of np.ndarray (sessions_items_features) + Time-Item-Features + Shape must be (n_sessions, n_sessions_items_features) + availabilities_batch : np.ndarray + Availabilities (sessions_items_availabilities) + Shape must be (n_sessions, n_items) + choices_batch : np.ndarray + Choices + Shape must be (n_sessions, ) + + Returns: + -------- + np.ndarray + Utility of each product for each session. + Shape must be (n_sessions, n_items) + """ + # To be implemented in children classes + # Can be numpy or tensorflow based + return + + @tf.function + def train_step( + self, + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + sample_weight=None, + ): + """Function that represents one training step (= one gradient descent step) of the model. + + Parameters + ---------- + items_batch : tuple of np.ndarray (items_features) + Fixed-Item-Features: formatting from ChoiceDataset: a matrix representing the products + constant/fixed features. 
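For a concrete feel of the compute_utility contract, here is a hypothetical minimal subclass with a linear-in-features utility. It is a sketch, not part of the patch, and assumes sessions_items_batch arrives as a single float32 array rather than a tuple:

    import tensorflow as tf

    class LinearUtilityModel(ChoiceModel):
        """Toy model: utility is a dot product of sessions_items features with beta."""

        def __init__(self, n_sessions_items_features, **kwargs):
            super().__init__(**kwargs)
            self.beta = tf.Variable(tf.zeros((n_sessions_items_features,)))
            self.weights = [self.beta]  # exposed for train_step's GradientTape

        def compute_utility(
            self, items_batch, sessions_batch, sessions_items_batch,
            availabilities_batch, choices_batch,
        ):
            del items_batch, sessions_batch, availabilities_batch, choices_batch
            # (n_sessions, n_items, n_features) * (n_features,), summed -> (n_sessions, n_items)
            return tf.reduce_sum(sessions_items_batch * self.beta, axis=-1)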
+ sessions_batch : tuple of np.ndarray (sessions_features) + Time-Features + sessions_items_batch : tuple of np.ndarray (sessions_items_features) + Time-Item-Features + availabilities_batch : np.ndarray + Availabilities (sessions_items_availabilities) + choices_batch : np.ndarray + Choices + sample_weight : np.ndarray, optional + List samples weights to apply during the gradient descent to the batch elements, + by default None + + Returns: + -------- + tf.Tensor + Value of NegativeLogLikelihood loss for the batch + """ + with tf.GradientTape() as tape: + all_u = self.compute_utility( + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + ) + """ + all_u = tf.math.exp(all_u) + + # Assortment(t) Utility + norms = tf.reduce_sum(tf.multiply(all_u, ia_batch), axis=1) + if self.normalize_non_buy: + norms += 1 + # Probabilities + final_utilities = tf.divide( + all_u, + tf.repeat(tf.expand_dims(norms, 1), fif_batch[0].shape[0], axis=1), + ) + # Probabilities of selected product + available_utilities = tf.gather_nd(indices=choices_nd, params=final_utilities) + """ + probabilities = availability_softmax(all_u, availabilities_batch, axis=-1) + probabilities = custom_softmax( + all_u, availabilities_batch, normalize_exit=self.normalize_non_buy, axis=-1 + ) + # Negative Log-Likelihood + neg_loglikelihood = self.loss( + y_pred=probabilities, + y_true=tf.one_hot(choices_batch, depth=probabilities.shape[1]), + sample_weight=sample_weight, + ) + """ + if sample_weight is not None: + neg_loglikelihood = -tf.reduce_sum( + tf.math.log(available_utilities + 1e-10) * sample_weight + ) + else: + neg_loglikelihood = -tf.reduce_sum(tf.math.log(available_utilities + 1e-10)) + """ + grads = tape.gradient(neg_loglikelihood, self.weights) + self.optimizer.apply_gradients(zip(grads, self.weights)) + return neg_loglikelihood + + def fit( + self, choice_dataset, n_epochs, batch_size, sample_weight=None, val_dataset=None, verbose=0 + ): + """Method to train the model with a ChoiceDataset. + + Parameters + ---------- + choice_dataset : ChoiceDataset + _description_ + n_epochs : int + Number of epochs + batch_size : int + Batch size + sample_weight : np.ndarray, optional + Sample weights to apply, by default None + val_dataset : ChoiceDataset, optional + Test ChoiceDataset to evaluate performances on test at each epoch, by default None + verbose : int, optional + print level, for debugging, by default 0 + + Returns: + -------- + dict: + Different metrics values over epochs. 
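A short usage sketch of this training entry point, reusing the hypothetical LinearUtilityModel from the earlier aside and assuming choice_dataset was built beforehand:

    model = LinearUtilityModel(n_sessions_items_features=1, optimizer="Adam", lr=0.01)
    history = model.fit(choice_dataset, n_epochs=20, batch_size=32)
    print(history["train_loss"][-1].numpy())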
+ """ + losses_history = {"train_loss": []} + t_range = tqdm.trange(n_epochs, position=0) + + self.callbacks.on_train_begin() + + # Iterate of epochs + for epoch_nb in t_range: + self.callbacks.on_epoch_begin(epoch_nb) + t_start = time.time() + train_logs = {"train_loss": []} + val_logs = {"val_loss": []} + epoch_losses = [] + + if sample_weight is not None: + if verbose > 0: + inner_range = tqdm.tqdm( + choice_dataset.batch( + shuffle=True, sample_weight=sample_weight, batch_size=batch_size + ), + total=int(len(choice_dataset) / np.max([1, batch_size])), + position=1, + leave=False, + ) + else: + inner_range = choice_dataset.batch( + shuffle=True, sample_weight=sample_weight, batch_size=batch_size + ) + + for batch_nb, ( + ( + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + ), + weight_batch, + ) in enumerate(inner_range): + self.callbacks.on_train_batch_begin(batch_nb) + + neg_loglikelihood = self.train_step( + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + sample_weight=weight_batch, + ) + + train_logs["train_loss"].append(neg_loglikelihood) + temps_logs = {k: tf.reduce_mean(v) for k, v in train_logs.items()} + self.callbacks.on_train_batch_end(batch_nb, logs=temps_logs) + + # Optimization Steps + epoch_losses.append(neg_loglikelihood) + + # In this case we do not need to batch the sample_weights + else: + if verbose > 0: + inner_range = tqdm.tqdm( + choice_dataset.batch(shuffle=True, batch_size=batch_size), + total=int(len(choice_dataset) / np.max([batch_size, 1])), + position=1, + leave=False, + ) + else: + inner_range = choice_dataset.batch(shuffle=True, batch_size=batch_size) + for batch_nb, ( + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + ) in enumerate(inner_range): + self.callbacks.on_train_batch_begin(batch_nb) + neg_loglikelihood = self.train_step( + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + ) + train_logs["train_loss"].append(neg_loglikelihood) + temps_logs = {k: tf.reduce_mean(v) for k, v in train_logs.items()} + self.callbacks.on_train_batch_end(batch_nb, logs=temps_logs) + + # Optimization Steps + epoch_losses.append(neg_loglikelihood) + + # Take into account last batch that may have a differnt length into account for + # the computation of the epoch loss. 
+ if batch_size != -1: + last_batch_size = availabilities_batch.shape[0] + coefficients = tf.concat( + [tf.ones(len(epoch_losses) - 1) * batch_size, [last_batch_size]], axis=0 + ) + epoch_lossses = tf.multiply(epoch_losses, coefficients) + epoch_loss = tf.reduce_sum(epoch_lossses) / len(choice_dataset) + else: + epoch_loss = tf.reduce_mean(epoch_losses) + losses_history["train_loss"].append(epoch_loss) + desc = f"Epoch {epoch_nb} Train Loss {losses_history['train_loss'][-1].numpy()}" + if verbose > 1: + print( + f"Loop {epoch_nb} Time", + time.time() - t_start, + "Loss:", + tf.reduce_sum(epoch_losses).numpy(), + ) + + # Test on val_dataset if provided + if val_dataset is not None: + test_losses = [] + for batch_nb, ( + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + ) in enumerate(val_dataset.batch(shuffle=False, batch_size=batch_size)): + self.callbacks.on_batch_begin(batch_nb) + self.callbacks.on_test_batch_begin(batch_nb) + test_losses.append( + self.batch_predict( + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + )[0] + ) + val_logs["val_loss"].append(test_losses[-1]) + temps_logs = {k: tf.reduce_mean(v) for k, v in val_logs.items()} + self.callbacks.on_test_batch_end(batch_nb, logs=temps_logs) + test_loss = tf.reduce_mean(test_losses) + if verbose > 1: + print("Test Negative-LogLikelihood:", test_loss.numpy()) + desc += f", Test Loss {test_loss.numpy()}" + losses_history["test_loss"] = losses_history.get("test_loss", []) + [ + test_loss.numpy() + ] + train_logs = {**train_logs, **val_logs} + + temps_logs = {k: tf.reduce_mean(v) for k, v in train_logs.items()} + self.callbacks.on_epoch_end(epoch_nb, logs=temps_logs) + if self.stop_training: + print("Early Stopping taking effect") + break + if verbose > 0: + t_range.set_description(desc) + t_range.refresh() + + temps_logs = {k: tf.reduce_mean(v) for k, v in train_logs.items()} + self.callbacks.on_train_end(logs=temps_logs) + return losses_history + + @tf.function + def batch_predict( + self, + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + sample_weight=None, + ): + """Function that represents one prediction (Probas + Loss) for one batch of a ChoiceDataset. + + Parameters + ---------- + items_batch : tuple of np.ndarray (items_features) + Fixed-Item-Features: formatting from ChoiceDataset: a matrix representing the products + constant features. 
+ sessions_batch : tuple of np.ndarray (sessions_features) + Time-Features + sessions_items_batch : tuple of np.ndarray (sessions_items_features) + Time-Item-Features + availabilities_batch : np.ndarray + Availabilities (sessions_items_availabilities) + choices_batch : np.ndarray + Choices + sample_weight : np.ndarray, optional + List samples weights to apply during the gradient descent to the batch elements, + by default None + + Returns: + -------- + tf.Tensor (1, ) + Value of NegativeLogLikelihood loss for the batch + tf.Tensor (batch_size, n_items) + Probabilities for each product to be chosen for each session + """ + # Compute utilities from features + utilities = self.compute_utility( + items_batch, sessions_batch, sessions_items_batch, availabilities_batch, choices_batch + ) + # Compute probabilities from utilities & availabilties + probabilities = availability_softmax(utilities, availabilities_batch, axis=-1) + probabilities = custom_softmax( + utilities, availabilities_batch, normalize_exit=self.normalize_non_buy, axis=-1 + ) + + # Compute loss from probabilities & actual choices + # batch_loss = self.loss(probabilities, c_batch, sample_weight=sample_weight) + batch_loss = self.loss( + y_pred=probabilities, + y_true=tf.one_hot(choices_batch, depth=probabilities.shape[1]), + sample_weight=sample_weight, + ) + return batch_loss, probabilities + + def save_model(self, path): + """Method to save the different models on disk. + + Parameters + ---------- + path : str + path to the folder where to save the model + """ + if not os.exists(path): + Path(path).mkdir(parents=True) + + for i, weight in enumerate(self.weights): + tf.keras.savedmodel.save(Path(path) / f"weight_{i}") + + # To improve for non-string attributes + params = self.__dict__ + json.dump(Path(path) / "params.json", params) + + # Save optimizer state + + @classmethod + def load_model(cls, path): + """Method to load a ChoiceModel previously saved with save_model(). + + Parameters + ---------- + path : str + path to the folder where the saved model files are + + Returns: + -------- + ChoiceModel + Loaded ChoiceModel + """ + obj = cls() + obj.weights = [] + i = 0 + weight_path = f"weight_{i}" + while weight_path in os.listdir(path): + obj.weights.append(tf.keras.load_model.load(Path(path) / weight_path)) + i += 1 + weight_path = f"weight_{i}" + + # To improve for non string attributes + params = json.load(Path(path) / "params.json") + for k, v in params.items(): + setattr(obj, k, v) + + # Load optimizer step + return cls + + def predict_probas(self, choice_dataset): + """Predicts the choice probabilities for each session and each product of a ChoiceDataset. + + Parameters + ---------- + choice_dataset : ChoiceDataset + Dataset on which to apply to prediction + + Returns: + -------- + np.ndarray (n_sessions, n_items) + Choice probabilties for each session and each product + """ + stacked_probabilities = [] + for ( + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + ) in choice_dataset.batch(): + _, probabilities = self.batch_predict( + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + ) + stacked_probabilities.append(probabilities) + + return tf.concat(stacked_probabilities, axis=0) + + def evaluate(self, choice_dataset, batch_size=None): + """Evaluates the model for each session and each product of a ChoiceDataset. 
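Note that save_model/load_model above will not run as written: os.exists should be a pathlib check, tf.keras.savedmodel.save and tf.keras.load_model.load are not TF APIs, json.dump's arguments are reversed, and load_model returns cls instead of obj. A corrected sketch of the same intent, deliberately swapping plain .npy persistence for the keras calls (the load side is the symmetric mirror):

    import json
    import numpy as np
    from pathlib import Path

    def save_model(self, path):
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)
        for i, weight in enumerate(self.weights):
            np.save(path / f"weight_{i}.npy", weight.numpy())
        # keep only trivially JSON-serializable attributes
        params = {k: v for k, v in self.__dict__.items() if isinstance(v, (bool, int, float, str))}
        with (path / "params.json").open("w") as f:
            json.dump(params, f)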
+ + Predicts the probabilities according to the model and computes the Negative-Log-Likelihood + loss from the actual choices. + + Parameters + ---------- + choice_dataset : ChoiceDataset + Dataset on which to apply to prediction + + Returns: + -------- + np.ndarray (n_sessions, n_items) + Choice probabilties for each session and each product + """ + if batch_size is None: + batch_size = choice_dataset.batch_size + batch_losses = [] + for ( + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + ) in choice_dataset.batch(batch_size=batch_size): + loss, _ = self.batch_predict( + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + ) + batch_losses.append(loss) + if batch_size != -1: + last_batch_size = availabilities_batch.shape[0] + coefficients = tf.concat( + [tf.ones(len(batch_losses) - 1) * batch_size, [last_batch_size]], axis=0 + ) + batch_losses = tf.multiply(batch_losses, coefficients) + batch_loss = tf.reduce_sum(batch_losses) / len(choice_dataset) + else: + batch_loss = tf.reduce_mean(batch_losses) + return batch_loss + + def _lbfgs_train_step(self, dataset): + """A factory to create a function required by tfp.optimizer.lbfgs_minimize. + + Parameters + ---------- + dataset: ChoiceDataset + Dataset on which to estimate the paramters. + + Returns: + -------- + function + with the signature: + loss_value, gradients = f(model_parameters). + """ + # obtain the shapes of all trainable parameters in the model + shapes = tf.shape_n(self.weights) + n_tensors = len(shapes) + + # we'll use tf.dynamic_stitch and tf.dynamic_partition later, so we need to + # prepare required information first + count = 0 + idx = [] # stitch indices + part = [] # partition indices + + for i, shape in enumerate(shapes): + n = np.product(shape) + idx.append(tf.reshape(tf.range(count, count + n, dtype=tf.int32), shape)) + part.extend([i] * n) + count += n + + part = tf.constant(part) + + @tf.function + def assign_new_model_parameters(params_1d): + """A function updating the model's parameters with a 1D tf.Tensor. + + Pararmeters + ----------- + params_1d: tf.Tensor + a 1D tf.Tensor representing the model's trainable parameters. + """ + params = tf.dynamic_partition(params_1d, part, n_tensors) + for i, (shape, param) in enumerate(zip(shapes, params)): + self.weights[i].assign(tf.reshape(param, shape)) + + # now create a function that will be returned by this factory + @tf.function + def f(params_1d): + """A function that can be used by tfp.optimizer.lbfgs_minimize. + + This function is created by function_factory. + + Parameters + ---------- + params_1d: tf.Tensor + a 1D tf.Tensor. + + Returns: + -------- + tf.Tensor + A scalar loss and the gradients w.r.t. the `params_1d`. + tf.Tensor + A 1D tf.Tensor representing the gradients w.r.t. the `params_1d`. + """ + # use GradientTape so that we can calculate the gradient of loss w.r.t. 
parameters + with tf.GradientTape() as tape: + # update the parameters in the model + assign_new_model_parameters(params_1d) + # calculate the loss + loss_value = self.evaluate(dataset, batch_size=-1) + + # calculate gradients and convert to 1D tf.Tensor + grads = tape.gradient(loss_value, self.weights) + grads = tf.dynamic_stitch(idx, grads) + + # print out iteration & loss + f.iter.assign_add(1) + + # store loss value so we can retrieve later + tf.py_function(f.history.append, inp=[loss_value], Tout=[]) + + return loss_value, grads + + # store these information as members so we can use them outside the scope + f.iter = tf.Variable(0) + f.idx = idx + f.part = part + f.shapes = shapes + f.assign_new_model_parameters = assign_new_model_parameters + f.history = [] + return f + + def _fit_with_lbfgs(self, dataset, n_epochs, tolerance=1e-8): + """Fit function for L-BFGS optimizer. + + Replaces the .fit method when the optimizer is set to L-BFGS. + + Parameters + ---------- + dataset : _type_ + _description_ + n_epochs : _type_ + _description_ + tolerance : _type_, optional + _description_, by default 1e-8 + + Returns: + -------- + dict + Fit history + """ + # Only import tensorflow_probability if LBFGS optimizer is used, avoid unnecessary + # dependency + import tensorflow_probability as tfp + + func = self._lbfgs_train_step(dataset) + + # convert initial model parameters to a 1D tf.Tensor + init_params = tf.dynamic_stitch(func.idx, self.weights) + + # train the model with L-BFGS solver + results = tfp.optimizer.lbfgs_minimize( + value_and_gradients_function=func, + initial_position=init_params, + max_iterations=n_epochs, + tolerance=tolerance, + f_absolute_tolerance=-1, + f_relative_tolerance=-1, + ) + + # after training, the final optimized parameters are still in results.position + # so we have to manually put them back to the model + func.assign_new_model_parameters(results.position) + print("L-BFGS Opimization finished:") + print("---------------------------------------------------------------") + print("Number of iterations:", results[2].numpy()) + print("Algorithm converged before reaching max iterations:", results[0].numpy()) + return func.history + + +class RandomChoiceModel(ChoiceModel): + """Dumb model that randomly attributes utilities to products.""" + + def __init__(self, **kwargs): + """Initialization of the model.""" + super().__init__(**kwargs) + + def compute_utility( + self, items_batch, sessions_batch, sessions_items_batch, availabilities_batch, choices_batch + ): + """Computes the random utility for each product of each session. + + Parameters + ---------- + items_batch : tuple of np.ndarray (items_features) + Fixed-Item-Features: formatting from ChoiceDataset: a matrix representing the products + constant/fixed features. 
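The stitch/partition bookkeeping in _lbfgs_train_step flattens every trainable tensor into one 1D vector and back; a standalone sketch of the round trip:

    import numpy as np
    import tensorflow as tf

    weights = [tf.Variable(tf.ones((2, 3))), tf.Variable(tf.zeros((4,)))]
    shapes = [w.shape for w in weights]

    count, idx, part = 0, [], []
    for i, shape in enumerate(shapes):
        n = int(np.prod(shape))
        idx.append(tf.reshape(tf.range(count, count + n, dtype=tf.int32), shape))
        part.extend([i] * n)
        count += n

    flat = tf.dynamic_stitch(idx, weights)                        # 1D vector of length 10
    chunks = tf.dynamic_partition(flat, tf.constant(part), len(shapes))
    restored = [tf.reshape(c, s) for c, s in zip(chunks, shapes)]  # original shapes recovered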
+ sessions_batch : tuple of np.ndarray (sessions_features) + Time-Features + sessions_items_batch : tuple of np.ndarray (sessions_items_features) + Time-Item-Features + availabilities_batch : np.ndarray + Availabilities (sessions_items_availabilities) + choices_batch : np.ndarray + Choices + + Returns: + -------- + tf.Tensor + (n_sessions, n_items) matrix of random utilities + """ + # In order to avoid unused arguments warnings + del items_batch, sessions_batch, availabilities_batch, choices_batch + return np.squeeze(np.random.uniform(shape=(sessions_items_batch.shape), minval=0, maxval=1)) + + def fit(**kwargs): + """Make sure that nothing happens during .fit.""" + del kwargs + return {} + + +class DistribMimickingModel(ChoiceModel): + """Dumb class model that mimicks the probabilities. + + It stores the encountered in the train datasets and always returns them + """ + + def __init__(self, **kwargs): + """Initialization of the model.""" + super().__init__(**kwargs) + self.weights = [] + + def fit(self, choice_dataset, **kwargs): + """Computes the choice frequency of each product and defines it as choice probabilities.""" + del kwargs + choices = choice_dataset.choices + for i in range(choice_dataset.get_num_items()): + self.weights.append(tf.reduce_sum(tf.cast(choices == i, tf.float32))) + self.weights = tf.stack(self.weights) / len(choices) + + def compute_utility( + self, items_batch, sessions_batch, sessions_items_batch, availabilities_batch, choices_batch + ): + """Returns utility that is fixed. U = log(P). + + Parameters + ---------- + items_batch : tuple of np.ndarray (items_features) + Fixed-Item-Features: formatting from ChoiceDataset: a matrix representing the products + constant/fixed features. + sessions_batch : tuple of np.ndarray (sessions_features) + Time-Features + sessions_items_batch : tuple of np.ndarray (sessions_items_features) + Time-Item-Features + availabilities_batch : np.ndarray + Availabilities (sessions_items_availabilities) + choices_batch : np.ndarray + Choices + + Returns: + -------- + np.ndarray (n_sessions, n_items) + Utilities + + Raises: + ------- + ValueError + If the model has not been fitted cannot evaluate the utility + """ + # In order to avoid unused arguments warnings + del items_batch, sessions_batch, sessions_items_batch, availabilities_batch + if self.weights is None: + raise ValueError("Model not fitted") + return np.stack([np.log(self.weights.numpy())] * len(choices_batch), axis=0) diff --git a/pyproject.toml b/pyproject.toml index 1c5fd9e0..436216d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ select = [ "PTH", "PD", ] # See: https://beta.ruff.rs/docs/rules/ -ignore = ["D203", "D213", "ANN101", "ANN102", "ANN204", "ANN001", "ANN202", "ANN201", "ANN206"] +ignore = ["D203", "D213", "ANN101", "ANN102", "ANN204", "ANN001", "ANN202", "ANN201", "ANN206", "ANN003"] line-length = 100 target-version = "py310" exclude = [ From fcc68ff8b46e8c2b754975dd67187194fe8571c0 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Thu, 21 Dec 2023 21:01:30 +0100 Subject: [PATCH 7/8] add: rumnet --- lib/models/rumnet.py | 816 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 816 insertions(+) create mode 100644 lib/models/rumnet.py diff --git a/lib/models/rumnet.py b/lib/models/rumnet.py new file mode 100644 index 00000000..2b51837b --- /dev/null +++ b/lib/models/rumnet.py @@ -0,0 +1,816 @@ +"""Implementation of RUMnet for easy use.""" +import tensorflow as tf +from choice_modeling.models.base_model import ChoiceModel + + +class 
PaperRUMnet(ChoiceModel): + """Re-Implementation of the RUMnet model. + + Re-implemented from the paper: + Representing Random Utility Choice Models with Neural Networks from Ali Aouad and Antoine Désir + https://arxiv.org/abs/2207.12877 + + Inherits from base_model.ChoiceModel + """ + + def __init__( + self, + num_products_features, + num_customer_features, + width_eps_x, + depth_eps_x, + heterogeneity_x, + width_eps_z, + depth_eps_z, + heterogeneity_z, + width_u, + depth_u, + tol, + optimizer, + lr, + normalize_non_buy=True, + logmin=1e-5, + l2_regularization_coef=0.0, + label_smoothing=0.0, + **kwargs, + ): + """Initiation of the RUMnet Model. + + Parameters + ---------- + num_products_features : int + Number of features each product will be described with. + In terms of ChoiceDataset it is the number of + { items_features + sessions_items_features } for one product. + num_customer_features : int + Number of features each customer will be described with. + In terms of ChoiceDataset it is the number of sessions_features. + width_eps_x : int + Number of neurons for each dense layer for the products encoding net. + depth_eps_x : int + Number of dense layers for the products encoding net. + heterogeneity_x : int + Number of nets of products features encoding. + width_eps_z : int + Number of neurons for each dense layer for the customers encoding net. + depth_eps_z : int + Number of dense layers for the customers encoding net. + heterogeneity_z : int + Number of nets of customers features encoding. + width_u : int + Number of neurons for each dense layer for the utility net. + depth_u : int + Number of dense layers for the utility net. + tol : float + # To be Implemented + optimizer : str + String representation of the optimizer to use. By default is Adam if not specified. + Should be within tf.keras.optimizers. + lr : float + Starting learning rate to associate with optimizer. + normalize_non_buy : bool, optional + Whether or not to add exit option with utility 1, by default True + logmin : float, optional + Value to be added within log computation to avoid infinity, by default 1e-5 + l2_regularization_coef : float, optional + Value of dense layers weights regulariation to apply during training, by default 0.0 + label_smoothing : float, optional + Value of smoothing to apply in CrossEntropy loss computation, by default 0.0 + """ + super().__init__(normalize_non_buy=normalize_non_buy, **kwargs) + # Number of features + self.num_products_features = num_products_features + self.num_customer_features = num_customer_features + + # Dimension of encoding nets + self.width_eps_x = width_eps_x + self.depth_eps_x = depth_eps_x + self.heterogeneity_x = heterogeneity_x + + self.width_eps_z = width_eps_z + self.depth_eps_z = depth_eps_z + self.heterogeneity_z = heterogeneity_z + + # Dimension of utility net + self.width_u = width_u + self.depth_u = depth_u + + # Optimization parameters + self.logmin = logmin + self.tol = tol + self.lr = lr + self.normalize_non_buy = normalize_non_buy + self.l2_regularization_coef = l2_regularization_coef + self.label_smoothing = label_smoothing + + if optimizer == "Adam": + self.optimizer = tf.keras.optimizers.Adam(lr) + elif optimizer == "SGD": + self.optimizer = tf.keras.optimizers.SGD(lr) + elif optimizer == "Adamax": + self.optimizer = tf.keras.optimizers.Adamax(lr) + else: + print(f"Optimizer {optimizer} not implemnted, switching for default Adam") + self.optimizer = tf.keras.optimizers.Adam(lr) + + def instantiate(self): + """Instatiation of the RUMnet model. 
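recreate_official_nets is referenced below but not included in this patch; for intuition only, a hypothetical sketch of one heterogeneity-replicated encoder it could build. Layer sizes follow the constructor arguments above; the activation choice is an assumption:

    import tensorflow as tf

    def make_encoder(input_dim, width, depth, heterogeneity, l2_coeff=0.0):
        """One small dense tower per heterogeneity sample; returns a list of embeddings."""
        inputs = tf.keras.Input(shape=(input_dim,))
        outputs = []
        for _ in range(heterogeneity):
            x = inputs
            for _ in range(depth):
                x = tf.keras.layers.Dense(
                    width,
                    activation="elu",  # assumption, not taken from the patch
                    kernel_regularizer=tf.keras.regularizers.l2(l2_coeff),
                )(x)
            outputs.append(x)
        return tf.keras.Model(inputs=inputs, outputs=outputs)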
+
+    def instantiate(self):
+        """Instantiate the RUMnet model.
+
+        Creation of:
+        - x_model encoding products features,
+        - z_model encoding customers features,
+        - u_model computing utilities from product, customer features and their embeddings
+        """
+        # Instantiation of the different nets
+        self.x_model, self.z_model, self.u_model = recreate_official_nets(
+            num_products_features=self.num_products_features,
+            num_customer_features=self.num_customer_features,
+            x_width=self.width_eps_x,
+            x_depth=self.depth_eps_x,
+            x_eps=self.heterogeneity_x,
+            z_width=self.width_eps_z,
+            z_depth=self.depth_eps_z,
+            z_eps=self.heterogeneity_z,
+            width_u=self.width_u,
+            depth_u=self.depth_u,
+            l2_regularization_coeff=self.l2_regularization_coef,
+        )
+
+        # Storing weights for back-propagation
+        self.weights = self.x_model.weights + self.z_model.weights + self.u_model.weights
+        self.loss = tf.keras.losses.CategoricalCrossentropy(
+            from_logits=False, label_smoothing=self.label_smoothing
+        )
+
+    def compute_utility(
+        self,
+        items_features_batch,
+        session_features_batch,
+        session_items_features_batch,
+        availabilities_batch,
+        choices_batch,
+    ):
+        """Compute utility from a batch of ChoiceDataset.
+
+        Here we assume that: item features = {fixed item features + session item features}
+                             user features = {session features}
+
+        Parameters
+        ----------
+        items_features_batch : tuple of np.ndarray (items_features)
+            Items-Features: formatting from ChoiceDataset: a matrix representing the
+            products constant features.
+        session_features_batch : tuple of np.ndarray (sessions_features)
+            Time-Features
+        session_items_features_batch : tuple of np.ndarray (sessions_items_features)
+            Time-Item-Features
+        availabilities_batch : np.ndarray
+            Availabilities (sessions_items_availabilities)
+        choices_batch : np.ndarray
+            Choices
+
+        Returns:
+        --------
+        np.ndarray
+            Utility of each product for each session and each heterogeneity sample.
+            Shape is (n_sessions, n_heterogeneities, n_items)
+        """
+        del availabilities_batch, choices_batch
+        ### Restacking of the item features
+        items_features_batch = tf.concat([*items_features_batch], axis=-1)
+        session_features_batch = tf.concat([*session_features_batch], axis=-1)
+        session_items_features_batch = tf.concat([*session_items_features_batch], axis=-1)
+
+        full_item_features = tf.stack(
+            [items_features_batch] * session_items_features_batch.shape[0], axis=0
+        )
+        full_item_features = tf.concat([session_items_features_batch, full_item_features], axis=-1)
+
+        ### Computation of utilities
+        utilities = []
+
+        # Computation of the customer features embeddings
+        z_embeddings = self.z_model(session_features_batch)
+
+        # Iterate over items in assortment
+        for item_i in range(full_item_features.shape[1]):
+            # Computation of item features embeddings
+            x_embeddings = self.x_model(full_item_features[:, item_i, :])
+
+            utilities.append([])
+
+            # Computation of utilities from embeddings, iteration over heterogeneities
+            # (eps_x * eps_z)
+            for _x in x_embeddings:
+                for _z in z_embeddings:
+                    _u = tf.keras.layers.Concatenate()(
+                        [full_item_features[:, item_i, :], _x, session_features_batch, _z]
+                    )
+                    utilities[-1].append(self.u_model(_u))
+
+        ### Reshape utilities: (batch_size, heterogeneity, num_items)
+        return tf.transpose(tf.squeeze(tf.stack(utilities, axis=0), -1))
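+
+    # Shape walk-through of compute_utility, for a batch of size B, I items and
+    # Kx * Kz heterogeneity samples: each item yields Kx * Kz utilities of shape
+    # (B, 1); stacking over items gives (I, Kx * Kz, B, 1), and the final
+    # squeeze + transpose returns (B, Kx * Kz, I). train_step below then
+    # soft-maxes over items (axis=2) and averages over the heterogeneity axis.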
+
+    @tf.function
+    def train_step(
+        self,
+        items_batch,
+        sessions_batch,
+        sessions_items_batch,
+        availabilities_batch,
+        choices_batch,
+        sample_weight=None,
+    ):
+        """Modified version of train step, as we have to average probabilities over heterogeneities.
+
+        TODO: maybe split into two functions (one computing the probabilities,
+        one performing the gradient descent step) and rename the parameters.
+        Function that represents one training step (= one gradient descent step) of the model.
+
+        Parameters
+        ----------
+        items_batch : tuple of np.ndarray (items_features)
+            Fixed-Item-Features: formatting from ChoiceDataset: a matrix representing
+            the products constant features.
+        sessions_batch : tuple of np.ndarray (sessions_features)
+            Time-Features
+        sessions_items_batch : tuple of np.ndarray (sessions_items_features)
+            Time-Item-Features
+        availabilities_batch : np.ndarray
+            Availabilities (sessions_items_availabilities)
+        choices_batch : np.ndarray
+            Choices
+        sample_weight : np.ndarray, optional
+            List of sample weights to apply during the gradient descent to the batch elements,
+            by default None
+
+        Returns:
+        --------
+        tf.Tensor
+            Value of NegativeLogLikelihood loss for the batch
+        """
+        with tf.GradientTape() as tape:
+            ### Computation of utilities
+            all_u = self.compute_utility(
+                items_batch,
+                sessions_batch,
+                sessions_items_batch,
+                availabilities_batch,
+                choices_batch,
+            )
+            probabilities = []
+
+            # Iterate over heterogeneities
+            # for i in range(all_u.shape[2]):
+            # Assortment(t) Utility
+            # eps_probabilities = availability_softmax(all_u[:, :, i], ia_batch, axis=2)
+            eps_probabilities = tf.nn.softmax(all_u, axis=2)
+            # probabilities.append(eps_probabilities)
+
+            # Average probabilities over heterogeneities
+            probabilities = tf.reduce_mean(eps_probabilities, axis=1)
+            """
+            # Test with availability normalization
+            probabilities = tf.multiply(probabilities, ia_batch)
+            probabilities = tf.divide(
+                probabilities, tf.reduce_sum(probabilities, axis=1, keepdims=True) + 1e-5
+            )
+            """
+            # Probabilities of selected products
+            # chosen_probabilities = tf.gather_nd(indices=choices_nd, params=probabilities)
+
+            # Negative Log-Likelihood
+            nll = self.loss(
+                y_pred=probabilities,
+                y_true=tf.one_hot(choices_batch, depth=probabilities.shape[1]),
+                sample_weight=sample_weight,
+            )
+            # nll = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)(
+            #     y_pred=probabilities, y_true=c_batch
+            # )
+            # nll = -tf.reduce_sum(tf.math.log(chosen_probabilities + self.logmin))
+
+        grads = tape.gradient(nll, self.weights)
+        self.optimizer.apply_gradients(zip(grads, self.weights))
+        return nll
+
+    @tf.function
+    def batch_predict(
+        self,
+        items_batch,
+        sessions_batch,
+        sessions_items_batch,
+        availabilities_batch,
+        choices_batch,
+        sample_weight=None,
+    ):
+        """Function that represents one prediction (Probas + Loss) for one batch of a ChoiceDataset.
+
+        Specific version for RUMnet because it is needed to average probabilities over
+        heterogeneities.
+
+        Parameters
+        ----------
+        items_batch : tuple of np.ndarray (items_features)
+            Fixed-Item-Features: formatting from ChoiceDataset: a matrix representing the products
+            constant features.
+ sessions_batch : tuple of np.ndarray (sessions_features) + Time-Features + sessions_items_batch : tuple of np.ndarray (sessions_items_features) + Time-Item-Features + availabilities_batch : np.ndarray + Availabilities (sessions_items_availabilities) + choices_batch : np.ndarray + Choices + sample_weight : np.ndarray, optional + List samples weights to apply during the gradient descent to the batch elements, + by default None + + Returns: + -------- + tf.Tensor (1, ) + Value of NegativeLogLikelihood loss for the batch + tf.Tensor (batch_size, n_items) + Probabilities for each product to be chosen for each session + """ + utilities = self.compute_utility( + items_batch, sessions_batch, sessions_items_batch, availabilities_batch, choices_batch + ) + probabilities = tf.nn.softmax(utilities, axis=2) + probabilities = tf.reduce_mean(probabilities, axis=1) + + # Test with availability normalization + """ + probabilities = tf.multiply(probabilities, ia_batch) + probabilities = tf.divide( + probabilities, tf.reduce_sum(probabilities, axis=1, keepdims=True) + 1e-5 + ) + """ + batch_loss = self.loss( + y_pred=probabilities, + y_true=tf.one_hot(choices_batch, depth=probabilities.shape[1]), + sample_weight=sample_weight, + ) + return batch_loss, probabilities + + +class PaperRUMnet2(PaperRUMnet): + """Other implementation.""" + + def compute_utility( + self, + items_features_batch, + session_features_batch, + session_items_features_batch, + availabilities_batch, + choices_batch, + ): + """Compute utility from a batch of ChoiceDataset. + + Here we asssume that: item features = {fixed item features + session item features} + user features = {session features} + + Parameters + ---------- + items_features_batch : tuple of np.ndarray (items_features) + Items-Features: formatting from ChoiceDataset: a matrix representing + the products constant features. + session_features_batch : tuple of np.ndarray (sessions_features) + Time-Features + session_items_features_batch :tuple of np.ndarray (sessions_items_features) + Time-Item-Features + availabilities_batch : np.ndarray + Availabilities (sessions_items_availabilities) + choices_batch : np.ndarray + Choices + + Returns: + -------- + np.ndarray + Utility of each product for each session. 
+ Shape must be (n_sessions, n_items) + """ + del availabilities_batch, choices_batch + ### Restacking of the item features + items_features_batch = tf.concat([*items_features_batch], axis=-1) + session_features_batch = tf.concat([*session_features_batch], axis=-1) + session_items_features_batch = tf.concat([*session_items_features_batch], axis=-1) + + full_item_features = tf.stack( + [items_features_batch] * session_items_features_batch.shape[0], axis=0 + ) + full_item_features = tf.concat([session_items_features_batch, full_item_features], axis=-1) + + ### Computation of utilities + utilities = [] + + # Computation of the customer features embeddings + z_embeddings = self.z_model(session_features_batch) + + # Iterate over items in + def apply_u(x): + return self.u_model(x) + + for item_i in range(full_item_features.shape[1]): + # Computation of item features embeddings + x_embeddings = self.x_model(full_item_features[:, item_i, :]) + + # utilities.append([]) + + # Computation of utilites from embeddings, iteration over heterogeneities + # (eps_x * eps_z) + _utilities = [] + for _x in x_embeddings: + for _z in z_embeddings: + _u = tf.keras.layers.Concatenate()( + [full_item_features[:, item_i, :], _x, session_features_batch, _z] + ) + _utilities.append(_u) + utilities.append( + tf.map_fn( + fn=apply_u, elems=tf.stack(_utilities, axis=0), fn_output_signature=tf.float32 + ) + ) + ### Reshape utilities: (batch_size, num_items, heterogeneity) + return tf.transpose(tf.squeeze(tf.stack(utilities, axis=0), -1)) + + +class PaperRUMnet3(PaperRUMnet): + """Other Implementation.""" + + def compute_utility( + self, + items_features_batch, + session_features_batch, + session_items_features_batch, + availabilities_batch, + choices_batch, + ): + """Compute utility from a batch of ChoiceDataset. + + Here we asssume that: item features = {fixed item features + session item features} + user features = {session features} + + Parameters + ---------- + items_features_batch : tuple of np.ndarray (items_features) + Items-Features: formatting from ChoiceDataset: a matrix representing the products + constant features. + session_features_batch : tuple of np.ndarray (sessions_features) + Time-Features + session_items_features_batch :tuple of np.ndarray (sessions_items_features) + Time-Item-Features + availabilities_batch : np.ndarray + Availabilities (sessions_items_availabilities) + choices_batch : np.ndarray + Choices + + Returns: + -------- + np.ndarray + Utility of each product for each session. 
+ Shape must be (n_sessions, n_items) + """ + del availabilities_batch, choices_batch + ### Restacking of the item features + items_features_batch = tf.concat([*items_features_batch], axis=-1) + session_features_batch = tf.concat([*session_features_batch], axis=-1) + session_items_features_batch = tf.concat([*session_items_features_batch], axis=-1) + + full_item_features = tf.stack( + [items_features_batch] * session_items_features_batch.shape[0], axis=0 + ) + full_item_features = tf.concat([session_items_features_batch, full_item_features], axis=-1) + + ### Computation of utilities + utilities = [] + + # Computation of the customer features embeddings + z_embeddings = self.z_model(session_features_batch) + + # Iterate over items in assortment + # for item_i in range(full_item_features.shape[1]): + def apply_u(x): + # Computation of item features embeddings + x_embeddings = self.x_model(x) + + utilities = [] + + # Computation of utilites from embeddings, iteration over heterogeneities + # (eps_x * eps_z) + for _x in x_embeddings: + for _z in z_embeddings: + _u = tf.keras.layers.Concatenate()([x, _x, session_features_batch, _z]) + utilities.append(self.u_model(_u)) + return tf.stack(utilities, axis=0) + + utilities = tf.map_fn(fn=apply_u, elems=tf.transpose(full_item_features, perm=[1, 0, 2])) + ### Reshape utilities: (batch_size, num_items, heterogeneity) + return tf.transpose(tf.squeeze(tf.stack(utilities, axis=0), -1)) + + +class PaperRUMnet4(PaperRUMnet): + """Other Implementation.""" + + def compute_utility( + self, + items_features_batch, + session_features_batch, + session_items_features_batch, + availabilities_batch, + choices_batch, + ): + """Compute utility from a batch of ChoiceDataset. + + Here we asssume that: item features = {fixed item features + session item features} + user features = {session features} + + Parameters + ---------- + items_features_batch : tuple of np.ndarray (items_features) + Items-Features: formatting from ChoiceDataset: a matrix representing + the products constant features. + session_features_batch : tuple of np.ndarray (sessions_features) + Time-Features + session_items_features_batch :tuple of np.ndarray (sessions_items_features) + Time-Item-Features + availabilities_batch : np.ndarray + Availabilities (sessions_items_availabilities) + choices_batch : np.ndarray + Choices + + Returns: + -------- + np.ndarray + Utility of each product for each session. 
+ Shape must be (n_sessions, n_items) + """ + del availabilities_batch, choices_batch + ### Restacking of the item features + items_features_batch = tf.concat([*items_features_batch], axis=-1) + session_features_batch = tf.concat([*session_features_batch], axis=-1) + session_items_features_batch = tf.concat([*session_items_features_batch], axis=-1) + + full_item_features = tf.stack( + [items_features_batch] * session_items_features_batch.shape[0], axis=0 + ) + full_item_features = tf.concat([session_items_features_batch, full_item_features], axis=-1) + + ### Computation of utilities + utilities = [] + batch_size = session_features_batch.shape[0] + + # Computation of the customer features embeddings + z_embeddings = self.z_model(session_features_batch) + + # Iterate over items in assortment + for item_i in range(full_item_features.shape[1]): + # Computation of item features embeddings + x_embeddings = self.x_model(full_item_features[:, item_i, :]) + + # utilities.append([]) + _utilities = [] + # Computation of utilites from embeddings, iteration over heterogeneities + # (eps_x * eps_z) + for _x in x_embeddings: + for _z in z_embeddings: + _u = tf.keras.layers.Concatenate()( + [full_item_features[:, item_i, :], _x, session_features_batch, _z] + ) + _utilities.append(_u) + item_utilities = self.u_model(tf.concat(_utilities, axis=0)) + item_utilities = tf.stack( + [ + item_utilities[batch_size * i : batch_size * (i + 1)] + for i in range(len(x_embeddings) * len(z_embeddings)) + ], + axis=1, + ) + utilities.append(item_utilities) + ### Reshape utilities: (batch_size, num_items, heterogeneity) + return tf.squeeze(tf.stack(utilities, axis=1), -1) + + +class PaperRUMnet5(PaperRUMnet): + """Other Implementation.""" + + def compute_utility( + self, + items_features_batch, + session_features_batch, + session_items_features_batch, + availabilities_batch, + choices_batch, + ): + """Compute utility from a batch of ChoiceDataset. + + Here we asssume that: item features = {fixed item features + session item features} + user features = {session features} + + Parameters + ---------- + items_features_batch : tuple of np.ndarray (items_features) + Items-Features: formatting from ChoiceDataset: a matrix representing + the products constant features. + session_features_batch : tuple of np.ndarray (sessions_features) + Time-Features + session_items_features_batch :tuple of np.ndarray (sessions_items_features) + Time-Item-Features + availabilities_batch : np.ndarray + Availabilities (sessions_items_availabilities) + choices_batch : np.ndarray + Choices + + Returns: + -------- + np.ndarray + Utility of each product for each session. 
+ Shape must be (n_sessions, n_items) + """ + del availabilities_batch, choices_batch + ### Restacking of the item features + items_features_batch = tf.concat([*items_features_batch], axis=-1) + session_features_batch = tf.concat([*session_features_batch], axis=-1) + session_items_features_batch = tf.concat([*session_items_features_batch], axis=-1) + + full_item_features = tf.stack( + [items_features_batch] * session_items_features_batch.shape[0], axis=0 + ) + full_item_features = tf.concat([session_items_features_batch, full_item_features], axis=-1) + + ### Computation of utilities + utilities = [] + batch_size = session_features_batch.shape[0] + num_items = full_item_features.shape[1] + + # Computation of the customer features embeddings + z_embeddings = self.z_model(session_features_batch) + + _utilities = [] + # Iterate over items in assortment + for item_i in range(num_items): + # Computation of item features embeddings + x_embeddings = self.x_model(full_item_features[:, item_i, :]) + + # utilities.append([]) + # Computation of utilites from embeddings, iteration over heterogeneities + # (eps_x * eps_z) + for _x in x_embeddings: + for _z in z_embeddings: + _u = tf.keras.layers.Concatenate()( + [full_item_features[:, item_i, :], _x, session_features_batch, _z] + ) + _utilities.append(_u) + utilities = self.u_model(tf.concat(_utilities, axis=0)) + length_one_item = len(x_embeddings) * len(z_embeddings) * batch_size + reshaped_utilities = [] + for item_i in range(num_items): + item_utilities = tf.stack( + [ + utilities[ + item_i * length_one_item + batch_size * i : item_i * length_one_item + + batch_size * (i + 1) + ] + for i in range(len(x_embeddings) * len(z_embeddings)) + ], + axis=1, + ) + print(item_i, "item_u", item_utilities.shape) + reshaped_utilities.append(item_utilities) + ### Reshape utilities: (batch_size, num_items, heterogeneity) + utilities = tf.squeeze(tf.stack(reshaped_utilities, axis=1), -1) + print("u", utilities.shape) + # utilities = tf.stack(utilities, axis=0) + return utilities + + +def create_ff_network(input_shape, depth, width, add_last=False, l2_regularization_coeff=0.0): + """Base function to create a simple fully connected (Dense) network. + + Parameters + ---------- + input_shape : tuple of int + shape of the input of the network. Typically (num_features, ) + depth : int + Number of dense/fully-connected of the network to create. + width : int + Neurons number for all dense layers. + add_last : bool, optional + Whether to add a Dense layer with a single output at the end, by default False + Typically to be used when creating the utility network, that outputs a single number: + the utility. + l2_regularization_coeff : float, optional + Regularization coefficient for Dense layers weights during training, by default 0.0 + + Returns: + -------- + tf.keras.Model + Dense Neural Network with tensorflow backend. 
+    """
+    net_input = tf.keras.layers.Input(shape=input_shape)
+    regularizer = tf.keras.regularizers.L2(l2_regularization_coeff)
+    out = net_input
+    for _ in range(depth):
+        out = tf.keras.layers.Dense(
+            width, activation="elu", kernel_regularizer=regularizer, use_bias=True
+        )(out)
+    if add_last:
+        out = tf.keras.layers.Dense(1, activation="linear", use_bias=False)(out)
+    return tf.keras.Model(inputs=net_input, outputs=out)
+
+
+def recreate_official_nets(
+    num_products_features,
+    x_width,
+    x_depth,
+    x_eps,
+    num_customer_features,
+    z_width,
+    z_depth,
+    z_eps,
+    width_u,
+    depth_u,
+    l2_regularization_coeff=0.0,
+):
+    """Function to create the three nets used in RUMnet: X_net, Z_net and U_net.
+
+    Parameters
+    ----------
+    num_products_features : int
+        Number of features each product will be described with.
+        In terms of ChoiceDataset it is the number of { items_features + sessions_items_features }
+        for one product.
+    num_customer_features : int
+        Number of features each customer will be described with.
+        In terms of ChoiceDataset it is the number of sessions_features.
+    x_width : int
+        Number of neurons for each dense layer for the products encoding net.
+    x_depth : int
+        Number of dense layers for the products encoding net.
+    x_eps : int
+        Number of nets of products features encoding.
+    z_width : int
+        Number of neurons for each dense layer for the customers encoding net.
+    z_depth : int
+        Number of dense layers for the customers encoding net.
+    z_eps : int
+        Number of nets of customers features encoding.
+    width_u : int
+        Number of neurons for each dense layer for the utility net.
+    depth_u : int
+        Number of dense layers for the utility net.
+    l2_regularization_coeff : float, optional
+        Value of dense layers weights regularization to apply during training, by default 0.0
+
+    Returns:
+    --------
+    tf.keras.Model
+        Product features encoding network
+    tf.keras.Model
+        Customer features encoding network
+    tf.keras.Model
+        Features and encoding to utility computation network
+    """
+    # Products and customers embedding nets, quite symmetrical
+    products_input = tf.keras.layers.Input(shape=(num_products_features,))
+    customer_input = tf.keras.layers.Input(shape=(num_customer_features,))
+    x_embeddings = []
+    z_embeddings = []
+
+    # Creating independent nets for each heterogeneity
+    for _ in range(x_eps):
+        x_embedding = create_ff_network(
+            input_shape=num_products_features,
+            depth=x_depth,
+            width=x_width,
+            l2_regularization_coeff=l2_regularization_coeff,
+        )(products_input)
+        x_embeddings.append(x_embedding)
+
+    # Creating independent nets for each heterogeneity
+    for _ in range(z_eps):
+        z_embedding = create_ff_network(
+            input_shape=num_customer_features,
+            depth=z_depth,
+            width=z_width,
+            l2_regularization_coeff=l2_regularization_coeff,
+        )(customer_input)
+
+        z_embeddings.append(z_embedding)
+
+    x_net = tf.keras.Model(inputs=products_input, outputs=x_embeddings, name="X_embedding")
+    z_net = tf.keras.Model(inputs=customer_input, outputs=z_embeddings, name="Z_embedding")
+
+    # Utility network
+    u_net = create_ff_network(
+        input_shape=(
+            x_width + z_width + num_products_features + num_customer_features
+        ),  # Input shape from previous nets
+        width=width_u,
+        depth=depth_u,
+        add_last=True,  # Add last for utility
+        l2_regularization_coeff=l2_regularization_coeff,
+    )
+
+    return x_net, z_net, u_net
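+
+
+if __name__ == "__main__":
+    # Minimal smoke test, for illustration only: the feature counts and net
+    # dimensions below are arbitrary placeholders, not values from the paper.
+    x_net, z_net, u_net = recreate_official_nets(
+        num_products_features=4,
+        x_width=8,
+        x_depth=2,
+        x_eps=2,
+        num_customer_features=3,
+        z_width=8,
+        z_depth=2,
+        z_eps=2,
+        width_u=16,
+        depth_u=3,
+    )
+    products = tf.random.uniform((5, 4))  # (batch_size, num_products_features)
+    customers = tf.random.uniform((5, 3))  # (batch_size, num_customer_features)
+    x_embs = x_net(products)  # list of x_eps tensors, each of shape (5, 8)
+    z_embs = z_net(customers)  # list of z_eps tensors, each of shape (5, 8)
+    # u_net consumes the raw features concatenated with one (x, z) embedding pair
+    u = u_net(tf.concat([products, x_embs[0], customers, z_embs[0]], axis=-1))
+    print(u.shape)  # expected: (5, 1), one utility per sample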
From 3e8b2314a433047aa8121c7b7c502a4374ad0d79 Mon Sep 17 00:00:00 2001
From: VincentAuriau
Date: Thu, 21 Dec 2023 21:30:05 +0100
Subject: [PATCH 8/8] add: condMNL

---
 lib/models/conditional_mnl.py | 884 ++++++++++++++++++++++++++++++++++
 1 file changed, 884 insertions(+)
 create mode 100644 lib/models/conditional_mnl.py

diff --git a/lib/models/conditional_mnl.py b/lib/models/conditional_mnl.py
new file mode 100644
index 00000000..9284797e
--- /dev/null
+++ b/lib/models/conditional_mnl.py
@@ -0,0 +1,884 @@
+"""Conditional MNL model."""
+
+import tensorflow as tf
+
+from .base_model import ChoiceModel
+
+
+class ModelSpecification(object):
+    """Base class to specify the structure of a cMNL."""
+
+    def __init__(self):
+        """Instantiate a ModelSpecification object."""
+        # User interface
+        self.coefficients = {}
+        # Handled by the model
+        self.feature_to_weight = {}
+
+    def add_coefficients(
+        self, coefficient_name, feature_name, items_indexes=None, items_names=None
+    ):
+        """Adds a coefficient to the model through the specification of the utility.
+
+        Parameters
+        ----------
+        coefficient_name : str
+            Name given to the coefficient.
+        feature_name : str
+            Feature name to which the coefficient is associated. It should match the names
+            given in the ChoiceDataset that will be used for parameters estimation.
+        items_indexes : list of int, optional
+            list of items indexes (in the ChoiceDataset) for which we need to add a coefficient,
+            by default None
+        items_names : list of str, optional
+            list of items names (in the ChoiceDataset) for which we need to add a coefficient,
+            by default None
+
+        Raises:
+        -------
+        ValueError
+            When neither items_indexes nor items_names is specified.
+        """
+        if items_indexes is None and items_names is None:
+            raise ValueError("Either items_indexes or items_names must be specified")
+
+        if isinstance(items_indexes, int):
+            items_indexes = [items_indexes]
+        if isinstance(items_names, str):
+            items_names = [items_names]
+        self.coefficients[coefficient_name] = {
+            "feature_name": feature_name,
+            "items_indexes": items_indexes,
+            "items_names": items_names,
+        }
+
+    def add_shared_coefficient(
+        self, coefficient_name, feature_name, items_indexes=None, items_names=None
+    ):
+        """Adds a single, shared coefficient to the model through the specification of the utility.
+
+        Parameters
+        ----------
+        coefficient_name : str
+            Name given to the coefficient.
+        feature_name : str
+            Feature name to which the coefficient is associated. It should match the names
+            given in the ChoiceDataset that will be used for parameters estimation.
+        items_indexes : list of int, optional
+            list of items indexes (in the ChoiceDataset) for which the coefficient will be used,
+            by default None
+        items_names : list of str, optional
+            list of items names (in the ChoiceDataset) for which the coefficient will be used,
+            by default None
+
+        Raises:
+        -------
+        ValueError
+            When neither items_indexes nor items_names is specified.
+        """
+        if items_indexes is None and items_names is None:
+            raise ValueError("Either items_indexes or items_names must be specified")
+
+        if isinstance(items_indexes, int):
+            print(
+                "You have added a single index to a shared coefficient. This is not recommended.",
+                "Returning to standard add_coefficients method.",
+            )
+            # Delegate and return, otherwise the entry would be overwritten below
+            self.add_coefficients(coefficient_name, feature_name, items_indexes, items_names)
+            return
+        if isinstance(items_names, str):
+            print(
+                "You have added a single name to a shared coefficient. This is not recommended.",
+                "Returning to standard add_coefficients method.",
+            )
+            self.add_coefficients(coefficient_name, feature_name, items_indexes, items_names)
+            return
+        self.coefficients[coefficient_name] = {
+            "feature_name": feature_name,
+            "items_indexes": [items_indexes] if items_indexes is not None else None,
+            "items_names": items_names if items_names is not None else None,
+        }
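+
+    # Example (hypothetical feature and coefficient names):
+    #   spec.add_coefficients("beta_price", "price", items_indexes=[0, 1, 2])
+    #       -> three coefficients are estimated, one per item;
+    #   spec.add_shared_coefficient("beta_price", "price", items_indexes=[0, 1, 2])
+    #       -> a single coefficient, shared by the three items, is estimated.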
+
+    def get_coefficient(self, coefficient_name):
+        """Getter of a coefficient specification, from its name.
+
+        Parameters
+        ----------
+        coefficient_name : str
+            Name of the coefficient to get.
+
+        Returns:
+        --------
+        dict
+            Specification of the coefficient.
+        """
+        return self.coefficients[coefficient_name]
+
+    def add_weight(self, weight_name, weight_index):
+        """Method used by the cMNL class to register the corresponding TensorFlow weight.
+
+        Parameters
+        ----------
+        weight_name : str
+            Name of the weight to add.
+        weight_index : int
+            Index of the weight (in the conditionalMNL) to add.
+        """
+        if weight_name not in self.coefficients.keys():
+            raise ValueError(f"Weight {weight_name} not in coefficients")
+
+        self.feature_to_weight[self.coefficients[weight_name]["feature_name"]] = (
+            weight_name,
+            weight_index,
+        )
+
+    def list_features_with_weights(self):
+        """Get a list of the features that have a weight to be estimated.
+
+        Returns:
+        --------
+        dict.keys
+            List of the features that have a weight to be estimated.
+        """
+        return self.feature_to_weight.keys()
+
+    def get_weight_item_indexes(self, feature_name):
+        """Get the indexes of the concerned items for a given weight.
+
+        Parameters
+        ----------
+        feature_name : str
+            Feature that is concerned by the weight.
+
+        Returns:
+        --------
+        list
+            List of indexes of the items concerned by the weight.
+        int
+            The index of the weight in the conditionalMNL weights.
+        """
+        weight_name, weight_index = self.feature_to_weight[feature_name]
+        return self.coefficients[weight_name]["items_indexes"], weight_index
+
+    @property
+    def coefficients_list(self):
+        """Returns the list of coefficients.
+
+        Returns:
+        --------
+        list
+            List of coefficient names in the specification.
+        """
+        return list(self.coefficients.keys())
+
+
+class ConditionalMNL(ChoiceModel):
+    """Conditional MNL that has a generic structure. It can be parametrized with a dictionary.
+
+    Arguments:
+    ----------
+    params: dict or ModelSpecification
+        Specification of the model to be estimated.
+    """
+
+    def __init__(
+        self,
+        parameters,
+        add_exit_choice=False,
+        optimizer="Adam",
+        lr=0.001,
+        **kwargs,
+    ):
+        """Initialization of Conditional-MNL.
+
+        Parameters:
+        -----------
+        parameters : dict or ModelSpecification
+            Dictionary containing the parametrization of the model.
+            The dictionary must have the following structure:
+            {feature_name_1: mode_1, feature_name_2: mode_2, ...}
+            mode must be among "constant", "item", "item-full" for now
+            (same specifications as torch-choice).
+        add_exit_choice : bool, optional
+            Whether or not to normalize the probabilities computation with an exit choice
+            whose utility would be 1, by default False
+        """
+        super().__init__(normalize_non_buy=add_exit_choice, optimizer=optimizer, lr=lr, **kwargs)
+        self.params = parameters
+        self.instantiated = False
+
+    def instantiate_from_specifications(self):
+        """Instantiate the model from a ModelSpecification object.
+
+        Returns:
+        --------
+        list of tf.Tensor
+            List of the weights created corresponding to the specification.
+ """ + weights = [] + for weight_nb, weight_name in enumerate(self.params.coefficients_list): + num_weights = ( + len(self.params.get_coefficient(weight_name)["items_indexes"]) + if self.params.get_coefficient(weight_name)["items_indexes"] is not None + else len(self.params.get_coefficient(weight_name)["items_names"]) + ) + weight = tf.Variable( + tf.random_normal_initializer(0.0, 0.02, seed=42)(shape=(1, num_weights)), + name=weight_name, + ) + weights.append(weight) + """ + feat_to_weight[self.params[weight_name]["feature_name"]] = ( + weight, + self.params[weight_name], + ) + """ + self.params.add_weight(weight_name, weight_nb) + + ## Fill items_indexes here + # Better organize feat_to_weight and specifications + return weights + + def _store_dataset_features_names(self, dataset): + """Registers the name of the features in the dataset. For later use in utility computation. + + Parameters + ---------- + dataset : ChoiceDataset + ChoiceDataset used to fit the model. + """ + self._items_features_names = dataset.items_features_names + self._sessions_features_names = dataset.sessions_features_names + self._sessions_items_features_names = dataset.sessions_items_features_names + + def compute_utility_from_specification( + self, + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + verbose=0, + ): + """Computes the utility when the model is constructed from a ModelSpecification object. + + Parameters + ---------- + tems_batch : tuple of np.ndarray (items_features) + Fixed-Item-Features: formatting from ChoiceDataset: a matrix representing the products + constant/fixed features. + Shape must be (n_items, n_items_features) + sessions_batch : tuple of np.ndarray (sessions_features) + Time-Features + Shape must be (n_sessions, n_sessions_features) + sessions_items_batch : tuple of np.ndarray (sessions_items_features) + Time-Item-Features + Shape must be (n_sessions, n_sessions_items_features) + availabilities_batch : np.ndarray + Availabilities (sessions_items_availabilities) + Shape must be (n_sessions, n_items) + choices_batch : np.ndarray + Choices + Shape must be (n_sessions, ) + verbose : int, optional + Parametrization of the logging outputs, by default 0 + + Returns: + -------- + tf.Tensor + Utilities corresponding of shape (n_sessions, n_items) + """ + del choices_batch, verbose + + num_items = availabilities_batch.shape[1] + num_sessions = availabilities_batch.shape[0] + sessions_items_utilities = [] + # Items features + for i, feat_tuple in enumerate(self._items_features_names): + for j, feat in enumerate(feat_tuple): + if feat in self.params.list_features_with_weights(): + item_index, weight_index = self.params.get_weight_item_indexes(feat) + + s_i_u = tf.zeros((num_items,)) + for q, idx in enumerate(item_index): + if isinstance(idx, list): + for k in idx: + s_i_u = tf.concat( + [ + s_i_u[:k], + tf.multiply( + items_batch[i][k, j], self.weights[weight_index][:, q] + ), + s_i_u[k + 1 :], + ], + axis=0, + ) + else: + s_i_u = tf.concat( + [ + s_i_u[:idx], + tf.multiply( + items_batch[i][idx, j], self.weights[weight_index][:, q] + ), + s_i_u[idx + 1 :], + ], + axis=0, + ) + s_i_u = tf.stack([s_i_u] * num_sessions, axis=0) + + ### Need reshaping here + sessions_items_utilities.append(s_i_u) + else: + print( + f"Feature {feat} is in dataset but has no weight assigned in utility\ + computations" + ) + + # Session features + for i, feat_tuple in enumerate(self._sessions_features_names): + for j, feat in enumerate(feat_tuple): + if feat in 
self.params.list_features_with_weights(): + item_index, weight_index = self.params.get_weight_item_indexes(feat) + + s_i_u = tf.zeros((num_sessions, num_items)) + + for q, idx in enumerate(item_index): + if isinstance(idx, list): + for k in idx: + s_i_u = tf.concat( + [ + s_i_u[:, :k], + tf.expand_dims( + tf.multiply( + sessions_batch[i][:, j], + self.weights[weight_index][:, q], + ), + axis=-1, + ), + s_i_u[:, k + 1 :], + ], + axis=1, + ) + else: + s_i_u = tf.concat( + [ + s_i_u[:, :idx], + tf.expand_dims( + tf.multiply( + sessions_batch[i][:, j], + self.weights[weight_index][:, q], + ), + axis=-1, + ), + s_i_u[:, idx + 1 :], + ], + axis=1, + ) + + sessions_items_utilities.append(s_i_u) + else: + print( + f"Feature {feat} is in dataset but has no weight assigned in utility\ + computations" + ) + + # Session Items features + for i, feat_tuple in enumerate(self._sessions_items_features_names): + for j, feat in enumerate(feat_tuple): + if feat in self.params.list_features_with_weights(): + item_index, weight_index = self.params.get_weight_item_indexes(feat) + s_i_u = tf.zeros((num_sessions, num_items)) + + for q, idx in enumerate(item_index): + if isinstance(idx, list): + for k in idx: + s_i_u = tf.concat( + [ + s_i_u[:, :k], + tf.expand_dims( + tf.multiply( + sessions_items_batch[i][:, k, j], + self.weights[weight_index][:, q], + ), + axis=-1, + ), + s_i_u[:, k + 1 :], + ], + axis=1, + ) + else: + s_i_u = tf.concat( + [ + s_i_u[:, :idx], + tf.expand_dims( + tf.multiply( + sessions_items_batch[i][:, idx, j], + self.weights[weight_index][:, q], + ), + axis=-1, + ), + s_i_u[:, idx + 1 :], + ], + axis=1, + ) + + sessions_items_utilities.append(s_i_u) + else: + print( + f"Feature {feat} is in dataset but has no weight assigned in utility\ + computations" + ) + + if "intercept" in self.params.list_features_with_weights(): + item_index, weight_index = self.params.get_weight_item_indexes("intercept") + + s_i_u = tf.zeros((num_items,)) + for q, idx in enumerate(item_index): + s_i_u = tf.concat( + [ + s_i_u[:idx], + self.weights[weight_index][:, q], + s_i_u[idx + 1 :], + ], + axis=0, + ) + + s_i_u = tf.stack([s_i_u] * num_sessions, axis=0) + + ### Need reshaping here + + sessions_items_utilities.append(s_i_u) + + return tf.reduce_sum(sessions_items_utilities, axis=0) + + def instantiate_from_dict(self, num_items): + """Instantiation of the model from a dictionnary specification. + + Parameters + ---------- + num_items : int + Number of different items in the assortment. Used to create the right number of weights. + """ + spec = ModelSpecification() + weight_counter = 0 + for feature, mode in self.params.items(): + if mode == "constant": + spec.add_shared_coefficient( + feature + f"_w_{weight_counter}", feature, list(range(num_items)) + ) + elif mode == "item": + spec.add_coefficients( + feature + f"_w_{weight_counter}", feature, list(range(1, num_items)) + ) + elif mode == "item-full": + spec.add_coefficients( + feature + f"_w_{weight_counter}", feature, list(range(num_items)) + ) + + weight_counter += 1 + self.params = spec + self.instantiate_from_specifications() + + def instantiate( + self, + num_items, + items_features_names, + sessions_features_names, + sessions_items_features_names, + ): + """Instantiate the model from self.params and a dataset. + + Model is thus instantiated at .fit() time. + + Parameters + ---------- + num_items : int + Number of different items in the assortment. Used to create the right number of weights. 
+ items_features_names : list of str + Names of the items features in the dataset. + sessions_features_names : list of str + Names of the sessions features in the dataset. + sessions_items_features_names : list of str + Names of the sessions items features in the dataset. + + Raises: + ------- + NotImplementedError + When a mode is wrongly precised. + """ + # Possibility to stack weights to be faster ???? + weights = [] + weights_count = 0 + self._items_features_names = [] + for feat_tuple in items_features_names: + tuple_names = [] + for feat in feat_tuple: + if feat in self.params.keys(): + if self.params[feat] == "constant": + weight = tf.Variable( + tf.random_normal_initializer(0.0, 0.02, seed=42)(shape=(1, 1)) + ) + elif self.params[feat] == "item": + weight = tf.Variable( + tf.random_normal_initializer(0.0, 0.02, seed=42)( + shape=(1, num_items - 1) + ) + ) + elif self.params[feat] == "item-full": + weight = tf.Variable( + tf.random_normal_initializer(0.0, 0.02, seed=42)(shape=(1, num_items)) + ) + else: + raise NotImplementedError(f"Param {self.params[feat]} not implemented") + weights.append(weight) + tuple_names.append((feat, weights_count)) + weights_count += 1 + + else: + print( + f"Feature {feat} is in dataset but has no weight assigned in utility\ + computations" + ) + if len(tuple_names) > 0: + self._items_features_names.append(tuple_names) + + self._sessions_features_names = [] + for feat_tuple in sessions_features_names: + tuple_names = [] + for feat in feat_tuple: + if feat in self.params.keys(): + if self.params[feat] == "constant": + weight = tf.Variable( + tf.random_normal_initializer(0.0, 0.02, seed=42)(shape=(1, 1)), + name=feat, + ) + elif self.params[feat] == "item": + weight = tf.Variable( + tf.random_normal_initializer(0.0, 0.02, seed=42)( + shape=(1, num_items - 1) + ), + name=feat, + ) + elif self.params[feat] == "item-full": + weight = tf.Variable( + tf.random_normal_initializer(0.0, 0.02, seed=42)(shape=(1, num_items)), + name=feat, + ) + else: + raise NotImplementedError(f"Param {self.params[feat]} not implemented") + weights.append(weight) + tuple_names.append((feat, weights_count)) + weights_count += 1 + else: + print( + f"Feature {feat} is in dataset but has no weight assigned in utility\ + computations" + ) + if len(tuple_names) > 0: + self._sessions_features_names.append(tuple_names) + + self._sessions_items_features_names = [] + for feat_tuple in sessions_items_features_names: + tuple_names = [] + for feat in feat_tuple: + if feat in self.params.keys(): + if self.params[feat] == "constant": + weight = tf.Variable( + tf.random_normal_initializer(0.0, 0.02, seed=42)(shape=(1, 1)), + name=feat, + ) + elif self.params[feat] == "item": + weight = tf.Variable( + tf.random_normal_initializer(0.0, 0.02, seed=42)( + shape=(1, num_items - 1) + ), + name=feat, + ) + elif self.params[feat] == "item-full": + weight = tf.Variable( + tf.random_normal_initializer(0.0, 0.02, seed=42)(shape=(1, num_items)), + name=feat, + ) + else: + for i, s_tuple in enumerate(sessions_features_names): + for j, s_feat in enumerate(s_tuple): + if s_feat == self.params[feat]: + # Get num weights with unique values of this feature + # Create a dictionary {value: weight} + # mydict = {} + # for i, j in enumerate( + # np.unique(dataset.sessions_features[i][:, j]) + # ): + # mydict[i] = j + # weight = tf.Variable( + # tf.random_normal_initializer(0.0, 0.02, seed=42)( + # shape=(1, j + 1) + # ), + # name=feat, + # ) + pass + raise NotImplementedError(f"Param {self.params[feat]} not 
implemented") + weights.append(weight) + tuple_names.append((feat, weights_count)) + weights_count += 1 + else: + print( + f"Feature {feat} is in dataset but has no weight assigned in utility\ + computations" + ) + + if len(tuple_names) > 0: + self._sessions_items_features_names.append(tuple_names) + + if "intercept" in self.params.keys(): + if self.params["intercept"] == "constant": + weight = tf.Variable( + tf.random_normal_initializer(0.0, 0.02, seed=42)(shape=(1, 1)), name="intercept" + ) + elif self.params["intercept"] == "item": + weight = tf.Variable( + tf.random_normal_initializer(0.0, 0.02, seed=42)(shape=(1, num_items - 1)), + name="intercept", + ) + elif self.params["intercept"] == "item-full": + weight = tf.Variable( + tf.random_normal_initializer(0.0, 0.02, seed=42)(shape=(1, num_items)), + name="intercept", + ) + else: + # Is supposed to be in sessions_features_names + raise NotImplementedError(f"Param {self.params['intercept']} not implemented") + weights.append(weight) + else: + print("No Intercept specified... was it forgotten ?") + + if len(weights) > 0: + self.instantiated = True + else: + raise ValueError("No weights instantiated") + return weights + + def compute_utility( + self, items_batch, sessions_batch, sessions_items_batch, availabilities_batch, choices_batch + ): + """Main method to compute the utility of the model. Selects the right method to compute. + + Parameters + ---------- + items_batch : tuple of np.ndarray (items_features) + Fixed-Item-Features: formatting from ChoiceDataset: a matrix representing the products + constant/fixed features. + Shape must be (n_items, n_items_features) + sessions_batch : tuple of np.ndarray (sessions_features) + Time-Features + Shape must be (n_sessions, n_sessions_features) + sessions_items_batch : tuple of np.ndarray (sessions_items_features) + Time-Item-Features + Shape must be (n_sessions, n_sessions_items_features) + availabilities_batch : np.ndarray + Availabilities (sessions_items_availabilities) + Shape must be (n_sessions, n_items) + choices_batch : np.ndarray + Choices Shape must be (n_sessions, ) + + Returns: + -------- + tf.Tensor + Computed utilities of shape (n_sessions, n_items). + """ + if isinstance(self.params, ModelSpecification): + return self.compute_utility_from_specification( + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + ) + return self.compute_utility_from_dict( + items_batch, + sessions_batch, + sessions_items_batch, + availabilities_batch, + choices_batch, + ) + + def compute_utility_from_dict( + self, items_batch, sessions_batch, sessions_items_batch, availabilities_batch, choices_batch + ): + """Computes the utility when the model is constructed from a dictionnary object. + + Parameters + ---------- + tems_batch : tuple of np.ndarray (items_features) + Fixed-Item-Features: formatting from ChoiceDataset: a matrix representing the products + constant/fixed features. 
+ Shape must be (n_items, n_items_features) + sessions_batch : tuple of np.ndarray (sessions_features) + Time-Features + Shape must be (n_sessions, n_sessions_features) + sessions_items_batch : tuple of np.ndarray (sessions_items_features) + Time-Item-Features + Shape must be (n_sessions, n_sessions_items_features) + availabilities_batch : np.ndarray + Availabilities (sessions_items_availabilities) + Shape must be (n_sessions, n_items) + choices_batch : np.ndarray + Choices + Shape must be (n_sessions, ) + verbose : int, optional + Parametrization of the logging outputs, by default 0 + + Returns: + -------- + tf.Tensor + Utilities corresponding of shape (n_sessions, n_items) + """ + del availabilities_batch, choices_batch + + sessions_items_utilities = [] + num_items = items_batch[0].shape[0] + num_sessions = sessions_batch[0].shape[0] + + # Items features + for i, feat_tuple in enumerate(self._items_features_names): + for j, (feat, k) in enumerate(feat_tuple): + if feat in self.params.keys(): + weight = self.weights[k] + if self.params[feat] == "constant": + s_i_u = tf.concat( + [tf.multiply(items_batch[i][:, j], weight)] * num_sessions, axis=0 + ) + elif self.params[feat] == "item": + weight = tf.concat([tf.constant([[0.0]]), weight], axis=-1) + s_i_u = tf.concat( + [tf.multiply(items_batch[i][:, j], weight)] * num_sessions, axis=0 + ) + elif self.params[feat] == "item-full": + s_i_u = tf.concat( + [tf.multiply(items_batch[i][:, j], weight)] * num_sessions, axis=0 + ) + else: + raise NotImplementedError(f"Param {self.params[feat]} not implemented") + sessions_items_utilities.append(s_i_u) + else: + print( + f"Feature {feat} is in dataset but has no weight assigned in utility \ + computations" + ) + + # Session features + for i, feat_tuple in enumerate(self._sessions_features_names): + for j, (feat, k) in enumerate(feat_tuple): + if feat in self.params.keys(): + weight = self.weights[k] + if self.params[feat] == "constant": + s_i_u = tf.concat( + [tf.multiply(sessions_batch[i][j], weight)] * num_items, axis=-1 + ) + elif self.params[feat] == "item": + weight = tf.concat([tf.constant([[0.0]]), weight], axis=-1) + s_i_u = tf.tensordot(sessions_batch[i][:, j : j + 1], weight, axes=1) + elif self.params[feat] == "item-full": + s_i_u = tf.tensordot(sessions_batch[i][:, j : j + 1], weight, axes=1) + else: + raise NotImplementedError(f"Param {self.params[feat]} not implemented") + sessions_items_utilities.append(s_i_u) + else: + print( + f"Feature {feat} is in dataset but has no weight assigned in utility \ + computations" + ) + + # Session Items features + for i, feat_tuple in enumerate(self._sessions_items_features_names): + for j, (feat, k) in enumerate(feat_tuple): + if feat in self.params.keys(): + weight = self.weights[k] + if self.params[feat] == "constant": + s_i_u = tf.multiply(sessions_items_batch[i][:, :, j], weight) + elif self.params[feat] == "item": + weight = tf.concat([tf.constant([[0.0]]), weight], axis=-1) + s_i_u = tf.multiply(sessions_items_batch[i][:, :, j], weight) + elif self.params[feat] == "item-full": + s_i_u = tf.multiply(sessions_items_batch[i][:, :, j], weight) + else: + raise NotImplementedError(f"Param {self.params[feat]} not implemented") + sessions_items_utilities.append(s_i_u) + else: + print( + f"Feature {feat} is in dataset but has no weight assigned in utility \ + computations" + ) + + if "intercept" in self.params.keys(): + weight = self.weights[-1] + if self.params["intercept"] == "constant": + s_i_u = tf.concat([tf.concat([weight] * num_items, axis=0)] * 
num_sessions, axis=0)
+            elif self.params["intercept"] == "item":
+                weight = tf.concat([tf.constant([[0.0]]), weight], axis=-1)
+                s_i_u = tf.concat([weight] * num_sessions, axis=0)
+            elif self.params["intercept"] == "item-full":
+                s_i_u = tf.concat([weight] * num_sessions, axis=0)
+            else:
+                raise NotImplementedError(f"Param {self.params['intercept']} not implemented")
+            sessions_items_utilities.append(s_i_u)
+
+        return tf.reduce_sum(sessions_items_utilities, axis=0)
+
+    def fit(self, choice_dataset, **kwargs):
+        """Main fit function to estimate the parameters.
+
+        Parameters
+        ----------
+        choice_dataset : ChoiceDataset
+            Choice dataset to use for the estimation.
+
+        Returns:
+        --------
+        ConditionalMNL
+            With estimated weights.
+        """
+        if not self.instantiated:
+            if isinstance(self.params, ModelSpecification):
+                self.weights = self.instantiate_from_specifications()
+                self._store_dataset_features_names(choice_dataset)
+            else:
+                self.weights = self.instantiate(
+                    num_items=choice_dataset.get_num_items(),
+                    items_features_names=choice_dataset.items_features_names,
+                    sessions_features_names=choice_dataset.sessions_features_names,
+                    sessions_items_features_names=choice_dataset.sessions_items_features_names,
+                )
+            self.instantiated = True
+        return super().fit(choice_dataset=choice_dataset, **kwargs)
+
+    def _fit_with_lbfgs(self, choice_dataset, n_epochs, tolerance=1e-8):
+        """Specific fit function to estimate the parameters with LBFGS.
+
+        Parameters
+        ----------
+        choice_dataset : ChoiceDataset
+            Choice dataset to use for the estimation.
+        n_epochs : int
+            Number of epochs to run.
+        tolerance : float, optional
+            Tolerance in the search for the minimum, by default 1e-8
+
+        Returns:
+        --------
+        ConditionalMNL
+            self, with estimated weights.
+        """
+        if not self.instantiated:
+            if isinstance(self.params, ModelSpecification):
+                self.weights = self.instantiate_from_specifications()
+                self._store_dataset_features_names(choice_dataset)
+            else:
+                self.weights = self.instantiate(
+                    num_items=choice_dataset.get_num_items(),
+                    items_features_names=choice_dataset.items_features_names,
+                    sessions_features_names=choice_dataset.sessions_features_names,
+                    sessions_items_features_names=choice_dataset.sessions_items_features_names,
+                )
+            self.instantiated = True
+        return super()._fit_with_lbfgs(choice_dataset, n_epochs, tolerance)
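+
+
+if __name__ == "__main__":
+    # Illustrative sketch only: the feature names ("price", "promotion") and
+    # item indexes below are hypothetical and must match the ChoiceDataset
+    # that would be passed to .fit().
+    spec = ModelSpecification()
+    # One coefficient per item of a 3-item assortment
+    spec.add_coefficients("beta_price", "price", items_indexes=[0, 1, 2])
+    # A single coefficient shared by items 1 and 2
+    spec.add_shared_coefficient("beta_promo", "promotion", items_indexes=[1, 2])
+    model = ConditionalMNL(parameters=spec)
+    # Equivalent dict-based parametrization: one mode per feature name
+    dict_model = ConditionalMNL(parameters={"price": "item-full", "intercept": "item"})
+    # model.fit(choice_dataset) would then instantiate the weights and estimate them.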