From 80d75436df5939293089c874e4d2aeb0cbeee69c Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Mon, 29 Jan 2024 17:53:50 +0100 Subject: [PATCH 1/3] ADD little doc --- docs/how-to-guides.md | 11 +++++++++++ docs/index.md | 8 ++++++++ docs/tutorials.md | 3 +++ mkdocs.yaml | 4 ++-- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/docs/how-to-guides.md b/docs/how-to-guides.md index e69de29b..ab492e41 100644 --- a/docs/how-to-guides.md +++ b/docs/how-to-guides.md @@ -0,0 +1,11 @@ +Here are some in-depth examples to help you with the package. + +In particular you will find notebooks to handle: + +**DATA** +- ChoiceDataset +- FeaturesStorage + +**MODELS** +- Custom modelling - important if you want to build your own model +- RUMnet diff --git a/docs/index.md b/docs/index.md index cdef8bfb..d49209b0 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,3 +1,11 @@ # Welcome to the choice-learn documentation! A toolbox for choice-modeling + + +Choice-Learn is a Python package designed to help you build discrete choice models. +The package provides ready-to-use datasets and different models from the literature. It also offers a lower-level API if you want to customize any model or create your own from scratch. In particular you will find smart dataset handling to limit RAM usage and different structures common to any choice model. + +Choice-Learn uses NumPy and pandas as data backend engines and TensorFlow for models. + +In this documentation you will find examples to get started quickly as well as some more in-depth examples. diff --git a/docs/tutorials.md b/docs/tutorials.md index e69de29b..a85916b5 100644 --- a/docs/tutorials.md +++ b/docs/tutorials.md @@ -0,0 +1,3 @@ +Here are two tutorials to get started with the choice-learn package. +In particular, the first one shows how to create a ChoiceDataset. It is important as it is the main object that will handle your data for the model. 
+The other example shows a few examples of how to handle ConditionalMNL with the package. diff --git a/mkdocs.yaml b/mkdocs.yaml index 3164a009..06a75f8f 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -55,13 +55,13 @@ plugins: nav: - HomePage: index.md - Tutorials: - - Introduction: tutorials/introduction.md + - Introduction: tutorials.md - Getting Started with Data: notebooks/choice_learn_introduction_data.md - - Optimize RAM usage with Features Storage: notebooks/features_byID_example.md - Getting Started with Conditional Logit: notebooks/choice_learn_introduction_clogit.md - How-To Guides: - Introduction: how-to-guides.md - Exhaustive example of ChoiceDataset creation: notebooks/dataset_creation.md + - Optimize RAM usage with Features Storage: notebooks/features_byID_example.md - RUMnet Usage: notebooks/rumnet_example.md - Custom Choice Model Creation: notebooks/custom_model.md - References: From 0b32e7c508a7d155e638aa30e943f00aca48165b Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Tue, 30 Jan 2024 18:00:24 +0100 Subject: [PATCH 2/3] ADD: from single wide df methdo --- choice_learn/data/choice_dataset.py | 126 ++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index 8ba0f5b1..780d1b93 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -712,6 +712,132 @@ def _contexts_items_features_df_to_np( sessions_items_features = None return sessions_items_features, np.array(contexts_items_availabilities)
+    @classmethod
+    def from_single_wide_df(
+        cls,
+        df,
+        items_id,
+        fixed_items_suffixes=None,
+        contexts_features_columns=None,
+        contexts_items_features_suffixes=None,
+        contexts_items_availabilities_suffix=None,
+        choices_column="choice",
+        choice_mode="items_id",
+    ):
+        """Build a ChoiceDataset from a single dataframe in wide format.
+
+        Each row of df is one choice and item-specific columns are named "{item}_{suffix}".
+        The given dataframe is left untouched.
+
+        Parameters
+        ----------
+        df : pandas.DataFrame
+            dataframe in Wide format
+        items_id : list
+            List of items ids
+        fixed_items_suffixes : list, optional
+            Suffixes of the columns of the dataframe that are item features, each one
+            must hold a single value over the whole dataframe, default is None
+        contexts_features_columns : list, optional
+            Names of the columns of the dataframe that are contexts features, default is None
+        contexts_items_features_suffixes : list, optional
+            Suffixes of the columns of the dataframe that are context-item features,
+            a missing column is replaced with zeros, default is None
+        contexts_items_availabilities_suffix : str or list, optional
+            Suffix of the columns of the dataframe that are context-item availabilities,
+            or list with the name of the availability column of each item, default is None
+        choices_column : str, optional
+            Name of the column containing the choices, default is "choice"
+        choice_mode : str, optional
+            How choice is indicated in df: "items_id" if the column holds items ids,
+            any other value (e.g. "items_index") if it already holds items indexes,
+            default is "items_id"
+
+        Returns
+        -------
+        ChoiceDataset
+            corresponding ChoiceDataset
+
+        Raises
+        ------
+        ValueError
+            If a fixed item feature has more than one value, if the list of availabilities
+            columns has not one column for each item or if a choice is not in items_id
+        """
+        if fixed_items_suffixes is not None:
+            fixed_items_features = {"item_id": list(items_id)}
+            for feature in fixed_items_suffixes:
+                fixed_items_features[feature] = []
+                for item in items_id:
+                    # Same "{item}_{suffix}" column naming as context-item features
+                    feature_value = df[f"{item}_{feature}"].unique()
+                    if len(feature_value) > 1:
+                        raise ValueError(
+                            f"More than one value for feature {feature} for item {item}"
+                        )
+                    fixed_items_features[feature].append(feature_value[0])
+            fixed_items_features = pd.DataFrame(fixed_items_features)
+        else:
+            fixed_items_features = None
+
+        if contexts_features_columns is not None:
+            contexts_features = df[contexts_features_columns]
+        else:
+            contexts_features = None
+
+        if contexts_items_features_suffixes is not None:
+            contexts_items_features = []
+            for item in items_id:
+                columns = [f"{item}_{feature}" for feature in contexts_items_features_suffixes]
+                for col in columns:
+                    if col not in df.columns:
+                        print(
+                            f"Column {col} was not in DataFrame, "
+                            "dummy creation of the feature with zeros."
+                        )
+                # reindex returns a new frame with missing columns set to 0: df is not modified
+                item_features = df.reindex(columns=columns, fill_value=0)
+                contexts_items_features.append(item_features.to_numpy())
+            # Items are stacked on axis 1: (n_rows, n_items, n_features)
+            contexts_items_features = np.stack(contexts_items_features, axis=1)
+        else:
+            contexts_items_features = None
+
+        if contexts_items_availabilities_suffix is not None:
+            if isinstance(contexts_items_availabilities_suffix, list):
+                if len(contexts_items_availabilities_suffix) != len(items_id):
+                    raise ValueError(
+                        "You have given a list of columns for availabilities. "
+                        "We consider that it is one for each item but lengths do not match"
+                    )
+                print("You have given a list of columns for availabilities.")
+                print("We consider that it is one for each item")
+                columns = contexts_items_availabilities_suffix
+            else:
+                columns = [f"{item}_{contexts_items_availabilities_suffix}" for item in items_id]
+            contexts_items_availabilities = df[columns].to_numpy()
+        else:
+            contexts_items_availabilities = None
+
+        choices = df[choices_column].to_numpy()
+        if choice_mode == "items_id":
+            # A list compared to a scalar with == is a single False, so ids are mapped
+            # to their position with a dict instead of np.where(items_id == c)
+            items_index = {item: index for index, item in enumerate(items_id)}
+            try:
+                choices = np.array([items_index[choice] for choice in choices])
+            except KeyError as error:
+                raise ValueError(f"Choice {error} is not in items_id") from error
+
+        return ChoiceDataset(
+            fixed_items_features=fixed_items_features,
+            contexts_features=contexts_features,
+            contexts_items_features=contexts_items_features,
+            contexts_items_features_names=contexts_items_features_suffixes,
+            contexts_items_availabilities=contexts_items_availabilities,
+            choices=choices,
+        )
+
@classmethod def from_single_df( cls, From fb88d24ee11b2b6b58dcc4228132ffc793acad8f Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Tue, 30 Jan 2024 18:00:56 +0100 Subject: [PATCH 3/3] ADD: from single wide df example --- notebooks/dataset_creation.ipynb | 122 ++++++++++++++++++++++--------- 1 file changed, 86 insertions(+), 36 deletions(-) diff --git a/notebooks/dataset_creation.ipynb b/notebooks/dataset_creation.ipynb index 2d9290d2..75209981 100644 --- a/notebooks/dataset_creation.ipynb +++ b/notebooks/dataset_creation.ipynb @@ -217,19 +217,21 @@ 
"%=====================================================================%\n", "Number of items: 4\n", "Number of choices: 4324\n", - "Fixed Items Features:\n", - "1 items features\n", - "with names: (['is_public'],)\n", + "%=====================================================================%\n", + " Fixed Items Features:\n", + " 1 items features\n", + " with names: (['is_public'],)\n", "\n", "\n", - "Sessions features:\n", - "3 session features\n", - "with names: (['dist', 'income', 'urban'],)\n", + " Contexts features:\n", + " 3 context features\n", + " with names: (['dist', 'income', 'urban'],)\n", "\n", "\n", - "Session Items features:\n", - "4 sessions items features\n", - "with names: (['freq', 'cost', 'ivt', 'ovt'],)\n", + " Contexts Items features:\n", + " 4 context\n", + " items features\n", + " with names: (['freq', 'cost', 'ivt', 'ovt'],)\n", "%=====================================================================%\n", "\n" ] @@ -429,19 +431,21 @@ "%=====================================================================%\n", "Number of items: 4\n", "Number of choices: 4324\n", - "Fixed Items Features:\n", - "1 items features\n", - "with names: (['is_public'],)\n", + "%=====================================================================%\n", + " Fixed Items Features:\n", + " 1 items features\n", + " with names: (['is_public'],)\n", "\n", "\n", - "Sessions features:\n", - "3 session features\n", - "with names: (['dist', 'income', 'urban'],)\n", + " Contexts features:\n", + " 3 context features\n", + " with names: (['dist', 'income', 'urban'],)\n", "\n", "\n", - "Session Items features:\n", - "4 sessions items features\n", - "with names: (['freq', 'cost', 'ivt', 'ovt'],)\n", + " Contexts Items features:\n", + " 4 context\n", + " items features\n", + " with names: (['freq', 'cost', 'ivt', 'ovt'],)\n", "%=====================================================================%\n", "\n" ] @@ -461,6 +465,48 @@ "print(dataset.summary())" ] }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "### From a wide format DataFrame\n", + "\n", + "If your DataFrame is in the wide format you can use the 'from_single_wide_df' method. Here is an example with the SwissMetro dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from choice_learn.datasets import load_swissmetro\n", + "\n", + "swiss_df = load_swissmetro(as_frame=True,)\n", + "swiss_df.loc[swiss_df.CHOICE != 0]\n", + "swiss_df[\"CHOICE\"] = swiss_df[\"CHOICE\"] - 1\n", + "swiss_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = ChoiceDataset.from_single_wide_df(\n", + " df=swiss_df,\n", + " items_id=[\"TRAIN\", \"SM\", \"CAR\"],\n", + " fixed_items_suffixes=None,\n", + " contexts_features_columns=[\"GROUP\", \"SURVEY\", \"SP\", \"PURPOSE\", \"FIRST\", \"TICKET\", \"WHO\", \"LUGGAGE\", \"AGE\",\n", + " \"MALE\", \"INCOME\", \"GA\", \"ORIGIN\", \"DEST\"],\n", + " contexts_items_features_suffixes=[\"CO\", \"TT\", \"HE\", \"SEATS\"],\n", + " contexts_items_availabilities_suffix=\"AV\", # [\"TRAIN_AV\", \"SM_AV\", \"CAR_AV\"] also works\n", + " choices_column=\"CHOICE\",\n", + " choice_mode=\"item_index\",\n", + ")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -884,19 +930,21 @@ "%=====================================================================%\n", "Number of items: 4\n", "Number of choices: 4324\n", - "Fixed Items Features:\n", - "1 items features\n", - "with names: (['is_public'],)\n", + "%=====================================================================%\n", + " Fixed Items Features:\n", + " 1 items features\n", + " with names: (['is_public'],)\n", "\n", "\n", - "Sessions features:\n", - "3 session features\n", - "with names: (Index(['income', 'dist', 'urban'], dtype='object'),)\n", + " Contexts features:\n", + " 3 context features\n", + " with names: (Index(['income', 'dist', 
'urban'], dtype='object'),)\n", "\n", "\n", - "Session Items features:\n", - "4 sessions items features\n", - "with names: (Index(['cost', 'freq', 'ivt', 'ovt'], dtype='object'),)\n", + " Contexts Items features:\n", + " 4 context\n", + " items features\n", + " with names: (Index(['cost', 'freq', 'ivt', 'ovt'], dtype='object'),)\n", "%=====================================================================%\n", "\n" ] @@ -1003,19 +1051,21 @@ "%=====================================================================%\n", "Number of items: 4\n", "Number of choices: 4324\n", - "Fixed Items Features:\n", - "1 items features\n", - "with names: (['is_public'],)\n", + "%=====================================================================%\n", + " Fixed Items Features:\n", + " 1 items features\n", + " with names: (['is_public'],)\n", "\n", "\n", - "Sessions features:\n", - "3 session features\n", - "with names: (['income', 'dist', 'urban'],)\n", + " Contexts features:\n", + " 3 context features\n", + " with names: (['income', 'dist', 'urban'],)\n", "\n", "\n", - "Session Items features:\n", - "4 sessions items features\n", - "with names: (['freq', 'cost', 'ivt', 'ovt'],)\n", + " Contexts Items features:\n", + " 4 context\n", + " items features\n", + " with names: (['freq', 'cost', 'ivt', 'ovt'],)\n", "%=====================================================================%\n", "\n" ]