Skip to content

Commit

Permalink
Merge pull request #20 from artefactory/wide-df
Browse files Browse the repository at this point in the history
ADD: from ChoiceDataset.from_single_wide_df
  • Loading branch information
VincentAuriau authored Jan 30, 2024
2 parents bf9653b + fb88d24 commit 8d47726
Show file tree
Hide file tree
Showing 6 changed files with 217 additions and 38 deletions.
107 changes: 107 additions & 0 deletions choice_learn/data/choice_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,113 @@ def _contexts_items_features_df_to_np(
sessions_items_features = None
return sessions_items_features, np.array(contexts_items_availabilities)

@classmethod
def from_single_wide_df(
    cls,
    df,
    items_id,
    fixed_items_suffixes=None,
    contexts_features_columns=None,
    contexts_items_features_suffixes=None,
    contexts_items_availabilities_suffix=None,
    choices_column="choice",
    choice_mode="items_id",
):
    """Build a ChoiceDataset from a single dataframe in wide format.

    Item-specific columns are looked up under the name ``f"{item}_{suffix}"``.
    The dataframe given by the caller is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe in wide format.
    items_id : list
        List of items ids.
    fixed_items_suffixes : list, optional
        Suffixes of the columns of the dataframe that are fixed item features.
        Each column ``f"{item}_{suffix}"`` must hold a single unique value,
        default is None.
    contexts_features_columns : list, optional
        Names of the columns of the dataframe that are contexts features,
        default is None.
    contexts_items_features_suffixes : list, optional
        Suffixes of the columns of the dataframe that are context-item features.
        A column ``f"{item}_{suffix}"`` that is missing from the dataframe is
        replaced by zeros (a message is printed), default is None.
    contexts_items_availabilities_suffix : str or list, optional
        Either one suffix shared by all items (columns ``f"{item}_{suffix}"``)
        or a list of full column names, one per item and in the same order as
        `items_id`, default is None.
    choices_column : str, optional
        Name of the column containing the choices, default is "choice".
    choice_mode : str, optional
        How the choice is indicated in df. With "items_id" (default) the column
        holds values of `items_id`, which are converted to their position in
        `items_id`. With any other value the column is passed on unchanged.

    Returns
    -------
    ChoiceDataset
        corresponding ChoiceDataset

    Raises
    ------
    ValueError
        If a fixed item feature column holds zero or several distinct values,
        if a list of availability columns does not have one entry per item,
        or if `choice_mode` is "items_id" and a choice is not in `items_id`.
    """
    if fixed_items_suffixes is not None:
        fixed_items_features = {"item_id": []}
        for item in items_id:
            fixed_items_features["item_id"].append(item)
            for feature in fixed_items_suffixes:
                # Same "<item>_<suffix>" naming as the other item-specific columns.
                feature_value = df[f"{item}_{feature}"].unique()
                if len(feature_value) > 1:
                    raise ValueError(
                        f"More than one value for feature {feature} for item {item}"
                    )
                if len(feature_value) == 0:
                    raise ValueError(f"No value for feature {feature} for item {item}")
                # One scalar per item, so that every column has len(items_id) rows.
                fixed_items_features.setdefault(feature, []).append(feature_value[0])
        fixed_items_features = pd.DataFrame(fixed_items_features)
    else:
        fixed_items_features = None

    if contexts_features_columns is not None:
        contexts_features = df[contexts_features_columns]
    else:
        contexts_features = None

    if contexts_items_features_suffixes is not None:
        columns_per_item = [
            [f"{item}_{feature}" for feature in contexts_items_features_suffixes]
            for item in items_id
        ]
        missing_columns = {}
        for columns in columns_per_item:
            for col in columns:
                if col not in df.columns and col not in missing_columns:
                    print(
                        f"Column {col} was not in DataFrame, "
                        "dummy creation of the feature with zeros."
                    )
                    missing_columns[col] = 0
        if missing_columns:
            # assign() returns a new DataFrame: the caller's df is left untouched.
            df = df.assign(**missing_columns)
        # Stack the per-item (n_rows, n_features) blocks along a new axis 1.
        contexts_items_features = np.stack(
            [df[columns].to_numpy() for columns in columns_per_item], axis=1
        )
    else:
        contexts_items_features = None

    if contexts_items_availabilities_suffix is not None:
        if isinstance(contexts_items_availabilities_suffix, list):
            if len(contexts_items_availabilities_suffix) != len(items_id):
                raise ValueError(
                    "You have given a list of columns for availabilities. "
                    "We consider that it is one for each item but lengths do not match"
                )
            print("You have given a list of columns for availabilities.")
            print("We consider that it is one for each item")
            availabilities_columns = contexts_items_availabilities_suffix
        else:
            availabilities_columns = [
                f"{item}_{contexts_items_availabilities_suffix}" for item in items_id
            ]
        contexts_items_availabilities = df[availabilities_columns].to_numpy()
    else:
        contexts_items_availabilities = None

    choices = df[choices_column]
    if choice_mode == "items_id":
        # Explicit id -> position lookup: comparing a Python list with `==`
        # yields a single bool, so np.where(items_id == c) never matched.
        item_to_index = {}
        for index, item in enumerate(items_id):
            item_to_index.setdefault(item, index)
        try:
            choices = np.array([item_to_index[choice] for choice in choices])
        except KeyError as err:
            raise ValueError(f"Choice {err.args[0]!r} is not in items_id") from err

    return ChoiceDataset(
        fixed_items_features=fixed_items_features,
        contexts_features=contexts_features,
        contexts_items_features=contexts_items_features,
        contexts_items_features_names=contexts_items_features_suffixes,
        contexts_items_availabilities=contexts_items_availabilities,
        choices=choices,
    )

@classmethod
def from_single_df(
cls,
Expand Down
11 changes: 11 additions & 0 deletions docs/how-to-guides.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Here are some in-depth examples to help you with the package.

In particular you will find notebooks to handle:

**DATA**
- ChoiceDataset
- FeaturesStorage

**MODELS**
- Custom modelling - important if you want to build your own model
- RUMnet
8 changes: 8 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# Welcome to the choice-learn documentation!

A toolbox for choice-modeling


Choice-Learn is a Python package designed to help you build discrete choice models.
The package provides ready-to-use datasets and different models from the literature. It also provides lower-level access if you want to customize any model or create your own from scratch. In particular, you will find smart dataset handling to limit RAM usage and different structures common to any choice model.

Choice-Learn uses NumPy and pandas as data backend engines and TensorFlow for models.

In this documentation you will find examples to get started quickly as well as some more in-depth examples.
3 changes: 3 additions & 0 deletions docs/tutorials.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Here are two tutorials to get started with the choice-learn package.
The first one shows how to create a ChoiceDataset. This is important, as it is the main object that will handle your data for the model.
The other one shows a few examples of how to use ConditionalMNL with the package.
4 changes: 2 additions & 2 deletions mkdocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ plugins:
nav:
- HomePage: index.md
- Tutorials:
- Introduction: tutorials/introduction.md
- Introduction: tutorials.md
- Getting Started with Data: notebooks/choice_learn_introduction_data.md
- Optimize RAM usage with Features Storage: notebooks/features_byID_example.md
- Getting Started with Conditional Logit: notebooks/choice_learn_introduction_clogit.md
- How-To Guides:
- Introduction: how-to-guides.md
- Exhaustive example of ChoiceDataset creation: notebooks/dataset_creation.md
- Optimize RAM usage with Features Storage: notebooks/features_byID_example.md
- RUMnet Usage: notebooks/rumnet_example.md
- Custom Choice Model Creation: notebooks/custom_model.md
- References:
Expand Down
122 changes: 86 additions & 36 deletions notebooks/dataset_creation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -217,19 +217,21 @@
"%=====================================================================%\n",
"Number of items: 4\n",
"Number of choices: 4324\n",
"Fixed Items Features:\n",
"1 items features\n",
"with names: (['is_public'],)\n",
"%=====================================================================%\n",
" Fixed Items Features:\n",
" 1 items features\n",
" with names: (['is_public'],)\n",
"\n",
"\n",
"Sessions features:\n",
"3 session features\n",
"with names: (['dist', 'income', 'urban'],)\n",
" Contexts features:\n",
" 3 context features\n",
" with names: (['dist', 'income', 'urban'],)\n",
"\n",
"\n",
"Session Items features:\n",
"4 sessions items features\n",
"with names: (['freq', 'cost', 'ivt', 'ovt'],)\n",
" Contexts Items features:\n",
" 4 context\n",
" items features\n",
" with names: (['freq', 'cost', 'ivt', 'ovt'],)\n",
"%=====================================================================%\n",
"\n"
]
Expand Down Expand Up @@ -429,19 +431,21 @@
"%=====================================================================%\n",
"Number of items: 4\n",
"Number of choices: 4324\n",
"Fixed Items Features:\n",
"1 items features\n",
"with names: (['is_public'],)\n",
"%=====================================================================%\n",
" Fixed Items Features:\n",
" 1 items features\n",
" with names: (['is_public'],)\n",
"\n",
"\n",
"Sessions features:\n",
"3 session features\n",
"with names: (['dist', 'income', 'urban'],)\n",
" Contexts features:\n",
" 3 context features\n",
" with names: (['dist', 'income', 'urban'],)\n",
"\n",
"\n",
"Session Items features:\n",
"4 sessions items features\n",
"with names: (['freq', 'cost', 'ivt', 'ovt'],)\n",
" Contexts Items features:\n",
" 4 context\n",
" items features\n",
" with names: (['freq', 'cost', 'ivt', 'ovt'],)\n",
"%=====================================================================%\n",
"\n"
]
Expand All @@ -461,6 +465,48 @@
"print(dataset.summary())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### From a wide format DataFrame\n",
"\n",
"If your DataFrame is in the wide format you can use the 'from_single_wide_df' method. Here is an example with the SwissMetro dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from choice_learn.datasets import load_swissmetro\n",
"\n",
"swiss_df = load_swissmetro(as_frame=True,)\n",
"swiss_df.loc[swiss_df.CHOICE != 0]\n",
"swiss_df[\"CHOICE\"] = swiss_df[\"CHOICE\"] - 1\n",
"swiss_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset = ChoiceDataset.from_single_wide_df(\n",
" df=swiss_df,\n",
" items_id=[\"TRAIN\", \"SM\", \"CAR\"],\n",
" fixed_items_suffixes=None,\n",
" contexts_features_columns=[\"GROUP\", \"SURVEY\", \"SP\", \"PURPOSE\", \"FIRST\", \"TICKET\", \"WHO\", \"LUGGAGE\", \"AGE\",\n",
" \"MALE\", \"INCOME\", \"GA\", \"ORIGIN\", \"DEST\"],\n",
" contexts_items_features_suffixes=[\"CO\", \"TT\", \"HE\", \"SEATS\"],\n",
" contexts_items_availabilities_suffix=\"AV\", # [\"TRAIN_AV\", \"SM_AV\", \"CAR_AV\"] also works\n",
" choices_column=\"CHOICE\",\n",
" choice_mode=\"item_index\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -884,19 +930,21 @@
"%=====================================================================%\n",
"Number of items: 4\n",
"Number of choices: 4324\n",
"Fixed Items Features:\n",
"1 items features\n",
"with names: (['is_public'],)\n",
"%=====================================================================%\n",
" Fixed Items Features:\n",
" 1 items features\n",
" with names: (['is_public'],)\n",
"\n",
"\n",
"Sessions features:\n",
"3 session features\n",
"with names: (Index(['income', 'dist', 'urban'], dtype='object'),)\n",
" Contexts features:\n",
" 3 context features\n",
" with names: (Index(['income', 'dist', 'urban'], dtype='object'),)\n",
"\n",
"\n",
"Session Items features:\n",
"4 sessions items features\n",
"with names: (Index(['cost', 'freq', 'ivt', 'ovt'], dtype='object'),)\n",
" Contexts Items features:\n",
" 4 context\n",
" items features\n",
" with names: (Index(['cost', 'freq', 'ivt', 'ovt'], dtype='object'),)\n",
"%=====================================================================%\n",
"\n"
]
Expand Down Expand Up @@ -1003,19 +1051,21 @@
"%=====================================================================%\n",
"Number of items: 4\n",
"Number of choices: 4324\n",
"Fixed Items Features:\n",
"1 items features\n",
"with names: (['is_public'],)\n",
"%=====================================================================%\n",
" Fixed Items Features:\n",
" 1 items features\n",
" with names: (['is_public'],)\n",
"\n",
"\n",
"Sessions features:\n",
"3 session features\n",
"with names: (['income', 'dist', 'urban'],)\n",
" Contexts features:\n",
" 3 context features\n",
" with names: (['income', 'dist', 'urban'],)\n",
"\n",
"\n",
"Session Items features:\n",
"4 sessions items features\n",
"with names: (['freq', 'cost', 'ivt', 'ovt'],)\n",
" Contexts Items features:\n",
" 4 context\n",
" items features\n",
" with names: (['freq', 'cost', 'ivt', 'ovt'],)\n",
"%=====================================================================%\n",
"\n"
]
Expand Down

0 comments on commit 8d47726

Please sign in to comment.