Merge pull request #20 from artefactory/wide-df

ADD: from ChoiceDataset.from_single_wide_df
artefactory · Jan 30, 2024 · 8d47726 · 8d47726
2 parents bf9653b + fb88d24
commit 8d47726
Show file tree

Hide file tree

Showing 6 changed files with 217 additions and 38 deletions.
diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py
@@ -712,6 +712,113 @@ def _contexts_items_features_df_to_np(
             sessions_items_features = None
         return sessions_items_features, np.array(contexts_items_availabilities)
 
+    @classmethod
+    def from_single_wide_df(
+        cls,
+        df,
+        items_id,
+        fixed_items_suffixes=None,
+        contexts_features_columns=None,
+        contexts_items_features_suffixes=None,
+        contexts_items_availabilities_suffix=None,
+        choices_column="choice",
+        choice_mode="items_id",
+    ):
+        """Builds numpy arrays for ChoiceDataset from a single dataframe.
+
+        Parameters
+        ----------
+        df : pandas.DataFrame
+            dataframe in Wide format
+        items_id : list
+            List of items ids
+        fixed_items_suffixes : list
+            Suffixes of the columns of the dataframe that are item features, default is None
+        contexts_features_suffixes : list
+            Suffixes of the columns of the dataframe that are contexts features, default is None
+        contexts_items_suffixes : list
+            Suffixes of the columns of the dataframe that are context-item features, default is None
+        contexts_items_availabilities_suffix: list
+            Suffixes of the columns of the dataframe that are context-item availabilities,
+        choice_column: str, optional
+            Name of the column containing the choices, default is "choice"
+        choice_mode: str, optional
+            How choice is indicated in df, either "items_name" or "items_index",
+            default is "items_id"
+
+        Returns:
+        -------
+        ChoiceDataset
+            corresponding ChoiceDataset
+        """
+        if fixed_items_suffixes is not None:
+            fixed_items_features = {"item_id": []}
+            for item in items_id:
+                fixed_items_features["item_id"].append(item)
+                for feature in fixed_items_suffixes:
+                    feature_value = df[f"{feature}_{item}"].unique()
+                    if len(feature_value) > 1:
+                        raise ValueError(
+                            f"More than one value for feature {feature} for item {item}"
+                        )
+                    fixed_items_features[feature] = (
+                        fixed_items_features.get(feature, []),
+                        +[feature_value],
+                    )
+            fixed_items_features = pd.DataFrame(fixed_items_features)
+        else:
+            fixed_items_features = None
+
+        if contexts_features_columns is not None:
+            contexts_features = df[contexts_features_columns]
+        else:
+            contexts_features = None
+
+        if contexts_items_features_suffixes is not None:
+            contexts_items_features = []
+            for item in items_id:
+                columns = [f"{item}_{feature}" for feature in contexts_items_features_suffixes]
+                for col in columns:
+                    if col not in df.columns:
+                        print(
+                            f"Column {col} was not in DataFrame,\
+                            dummy creation of the feature with zeros."
+                        )
+                        df[col] = 0
+                contexts_items_features.append(df[columns].to_numpy())
+            contexts_items_features = np.stack(contexts_items_features, axis=1)
+        else:
+            contexts_items_features = None
+
+        if contexts_items_availabilities_suffix is not None:
+            if isinstance(contexts_items_availabilities_suffix, list):
+                if not len(contexts_items_availabilities_suffix) == len(items_id):
+                    raise ValueError(
+                        "You have given a list of columns for availabilities."
+                        "We consider that it is one for each item but lenght do not match"
+                    )
+                print("You have given a list of columns for availabilities.")
+                print("We consider that it is one for each item")
+                contexts_items_availabilities = df[contexts_items_availabilities_suffix].to_numpy()
+            else:
+                columns = [f"{item}_{contexts_items_availabilities_suffix}" for item in items_id]
+                contexts_items_availabilities = df[columns].to_numpy()
+        else:
+            contexts_items_availabilities = None
+
+        choices = df[choices_column]
+        if choice_mode == "items_id":
+            choices = np.squeeze([np.where(items_id == c)[0] for c in choices])
+
+        return ChoiceDataset(
+            fixed_items_features=fixed_items_features,
+            contexts_features=contexts_features,
+            contexts_items_features=contexts_items_features,
+            contexts_items_features_names=contexts_items_features_suffixes,
+            contexts_items_availabilities=contexts_items_availabilities,
+            choices=choices,
+        )
+
     @classmethod
     def from_single_df(
         cls,

diff --git a/docs/how-to-guides.md b/docs/how-to-guides.md
@@ -0,0 +1,11 @@
+Here are some in-depth examples to help you with the package.
+
+In particular you will find notebooks to handle:
+
+**DATA**
+- ChoiceDataset
+- FeaturesStorage
+
+**MODELS**
+- Custom modelling - important if you want to build your own model
+- RUMnet
diff --git a/docs/index.md b/docs/index.md
@@ -1,3 +1,11 @@
 # Welcome to the choice-learn documentation!
 
 A toolbox for choice-modeling
+
+
+Choice-Learn is a Python package designed to help you build discrete choice models.
+The package provides ready-to-use datasets and different models from the literature. It also provides a lower-level API if you want to customize any model or create your own from scratch. In particular, you will find smart dataset handling to limit RAM usage and different structures common to any choice model.
+
+Choice-Learn uses NumPy and pandas as data backend engines and TensorFlow for models.
+
+In this documentation you will find examples to get started quickly as well as some more in-depth examples.
diff --git a/docs/tutorials.md b/docs/tutorials.md
@@ -0,0 +1,3 @@
+Here are two tutorials to get started with the choice-learn package.
+In particular, the first one shows how to create a ChoiceDataset. This is important as it is the main object that will handle your data for the model.
+The other tutorial shows a few examples of how to use ConditionalMNL with the package.
diff --git a/mkdocs.yaml b/mkdocs.yaml
@@ -55,13 +55,13 @@ plugins:
 nav:
   - HomePage: index.md
   - Tutorials:
-    - Introduction: tutorials/introduction.md
+    - Introduction: tutorials.md
     - Getting Started with Data: notebooks/choice_learn_introduction_data.md
-    - Optimize RAM usage with Features Storage: notebooks/features_byID_example.md
     - Getting Started with Conditional Logit: notebooks/choice_learn_introduction_clogit.md
   - How-To Guides:
     - Introduction: how-to-guides.md
     - Exhaustive example of ChoiceDataset creation: notebooks/dataset_creation.md
+    - Optimize RAM usage with Features Storage: notebooks/features_byID_example.md
     - RUMnet Usage: notebooks/rumnet_example.md
     - Custom Choice Model Creation: notebooks/custom_model.md
   - References:

diff --git a/notebooks/dataset_creation.ipynb b/notebooks/dataset_creation.ipynb
@@ -217,19 +217,21 @@
       "%=====================================================================%\n",
       "Number of items: 4\n",
       "Number of choices: 4324\n",
-      "Fixed Items Features:\n",
-      "1 items features\n",
-      "with names: (['is_public'],)\n",
+      "%=====================================================================%\n",
+      " Fixed Items Features:\n",
+      " 1 items features\n",
+      " with names: (['is_public'],)\n",
       "\n",
       "\n",
-      "Sessions features:\n",
-      "3 session features\n",
-      "with names: (['dist', 'income', 'urban'],)\n",
+      " Contexts features:\n",
+      " 3 context features\n",
+      " with names: (['dist', 'income', 'urban'],)\n",
       "\n",
       "\n",
-      "Session Items features:\n",
-      "4 sessions                   items features\n",
-      "with names: (['freq', 'cost', 'ivt', 'ovt'],)\n",
+      " Contexts Items features:\n",
+      " 4 context\n",
+      "                 items features\n",
+      " with names: (['freq', 'cost', 'ivt', 'ovt'],)\n",
       "%=====================================================================%\n",
       "\n"
      ]
@@ -429,19 +431,21 @@
       "%=====================================================================%\n",
       "Number of items: 4\n",
       "Number of choices: 4324\n",
-      "Fixed Items Features:\n",
-      "1 items features\n",
-      "with names: (['is_public'],)\n",
+      "%=====================================================================%\n",
+      " Fixed Items Features:\n",
+      " 1 items features\n",
+      " with names: (['is_public'],)\n",
       "\n",
       "\n",
-      "Sessions features:\n",
-      "3 session features\n",
-      "with names: (['dist', 'income', 'urban'],)\n",
+      " Contexts features:\n",
+      " 3 context features\n",
+      " with names: (['dist', 'income', 'urban'],)\n",
       "\n",
       "\n",
-      "Session Items features:\n",
-      "4 sessions                   items features\n",
-      "with names: (['freq', 'cost', 'ivt', 'ovt'],)\n",
+      " Contexts Items features:\n",
+      " 4 context\n",
+      "                 items features\n",
+      " with names: (['freq', 'cost', 'ivt', 'ovt'],)\n",
       "%=====================================================================%\n",
       "\n"
      ]
@@ -461,6 +465,48 @@
     "print(dataset.summary())"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### From a wide format DataFrame\n",
+    "\n",
+    "If your DataFrame is in the wide format you can use the 'from_single_wide_df' method. Here is an example with the SwissMetro dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from choice_learn.datasets import load_swissmetro\n",
+    "\n",
+    "swiss_df = load_swissmetro(as_frame=True,)\n",
+    "# Drop the rows with CHOICE == 0: after the shift below they would get index -1\n",
+    "swiss_df = swiss_df.loc[swiss_df.CHOICE != 0].copy()\n",
+    "swiss_df[\"CHOICE\"] = swiss_df[\"CHOICE\"] - 1\n",
+    "swiss_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# CHOICE already holds 0-based item indexes, hence choice_mode=\"items_index\"\n",
+    "dataset = ChoiceDataset.from_single_wide_df(\n",
+    "    df=swiss_df,\n",
+    "    items_id=[\"TRAIN\", \"SM\", \"CAR\"],\n",
+    "    fixed_items_suffixes=None,\n",
+    "    contexts_features_columns=[\"GROUP\", \"SURVEY\", \"SP\", \"PURPOSE\", \"FIRST\", \"TICKET\", \"WHO\", \"LUGGAGE\", \"AGE\",\n",
+    "                               \"MALE\", \"INCOME\", \"GA\", \"ORIGIN\", \"DEST\"],\n",
+    "    contexts_items_features_suffixes=[\"CO\", \"TT\", \"HE\", \"SEATS\"],\n",
+    "    contexts_items_availabilities_suffix=\"AV\", # [\"TRAIN_AV\", \"SM_AV\", \"CAR_AV\"] also works\n",
+    "    choices_column=\"CHOICE\",\n",
+    "    choice_mode=\"items_index\",\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -884,19 +930,21 @@
       "%=====================================================================%\n",
       "Number of items: 4\n",
       "Number of choices: 4324\n",
-      "Fixed Items Features:\n",
-      "1 items features\n",
-      "with names: (['is_public'],)\n",
+      "%=====================================================================%\n",
+      " Fixed Items Features:\n",
+      " 1 items features\n",
+      " with names: (['is_public'],)\n",
       "\n",
       "\n",
-      "Sessions features:\n",
-      "3 session features\n",
-      "with names: (Index(['income', 'dist', 'urban'], dtype='object'),)\n",
+      " Contexts features:\n",
+      " 3 context features\n",
+      " with names: (Index(['income', 'dist', 'urban'], dtype='object'),)\n",
       "\n",
       "\n",
-      "Session Items features:\n",
-      "4 sessions                   items features\n",
-      "with names: (Index(['cost', 'freq', 'ivt', 'ovt'], dtype='object'),)\n",
+      " Contexts Items features:\n",
+      " 4 context\n",
+      "                 items features\n",
+      " with names: (Index(['cost', 'freq', 'ivt', 'ovt'], dtype='object'),)\n",
       "%=====================================================================%\n",
       "\n"
      ]
@@ -1003,19 +1051,21 @@
       "%=====================================================================%\n",
       "Number of items: 4\n",
       "Number of choices: 4324\n",
-      "Fixed Items Features:\n",
-      "1 items features\n",
-      "with names: (['is_public'],)\n",
+      "%=====================================================================%\n",
+      " Fixed Items Features:\n",
+      " 1 items features\n",
+      " with names: (['is_public'],)\n",
       "\n",
       "\n",
-      "Sessions features:\n",
-      "3 session features\n",
-      "with names: (['income', 'dist', 'urban'],)\n",
+      " Contexts features:\n",
+      " 3 context features\n",
+      " with names: (['income', 'dist', 'urban'],)\n",
       "\n",
       "\n",
-      "Session Items features:\n",
-      "4 sessions                   items features\n",
-      "with names: (['freq', 'cost', 'ivt', 'ovt'],)\n",
+      " Contexts Items features:\n",
+      " 4 context\n",
+      "                 items features\n",
+      " with names: (['freq', 'cost', 'ivt', 'ovt'],)\n",
       "%=====================================================================%\n",
       "\n"
      ]