diff --git a/choice_learn/data/__init__.py b/choice_learn/data/__init__.py
index 83ca8df0..5f54051a 100644
--- a/choice_learn/data/__init__.py
+++ b/choice_learn/data/__init__.py
@@ -1,5 +1,5 @@
"""Data handling classes and functions."""
from .choice_dataset import ChoiceDataset
-from .store import FeaturesStore, OneHotStore
+from .storage import FeaturesStorage, OneHotStorage
-__all__ = ["ChoiceDataset", "FeaturesStore", "OneHotStore"]
+__all__ = ["ChoiceDataset", "FeaturesStorage", "OneHotStorage"]
diff --git a/choice_learn/data/indexer.py b/choice_learn/data/indexer.py
index 46440b2c..ddba5383 100644
--- a/choice_learn/data/indexer.py
+++ b/choice_learn/data/indexer.py
@@ -329,6 +329,7 @@ def __getitem__(self, choices_indexes):
].batch[fixed_items_features[tuple_index][:, feature_index]]
)
feat_ind_min = feature_index + 1
+ unstacked_feat.append(fixed_items_features[tuple_index][:, feat_ind_min:])
mapped_features.append(np.concatenate(unstacked_feat, axis=1))
fixed_items_features = mapped_features
@@ -350,6 +351,7 @@ def __getitem__(self, choices_indexes):
].batch[contexts_features[tuple_index][:, feature_index]]
)
feat_ind_min = feature_index + 1
+ unstacked_feat.append(contexts_features[tuple_index][:, feat_ind_min:])
mapped_features.append(np.concatenate(unstacked_feat, axis=1))
contexts_features = mapped_features
@@ -373,6 +375,7 @@ def __getitem__(self, choices_indexes):
].batch[contexts_items_features[tuple_index][:, :, feature_index]]
)
feat_ind_min = feature_index + 1
+ unstacked_feat.append(contexts_items_features[tuple_index][:, :, feat_ind_min:])
mapped_features.append(np.concatenate(unstacked_feat, axis=2))
contexts_items_features = mapped_features
@@ -462,6 +465,7 @@ def __getitem__(self, choices_indexes):
].batch[fixed_items_features[tuple_index][:, feature_index]]
)
feat_ind_min = feature_index + 1
+ unstacked_feat.append(fixed_items_features[tuple_index][:, feat_ind_min:])
mapped_features.append(np.concatenate(unstacked_feat, axis=1))
fixed_items_features = mapped_features
@@ -483,6 +487,7 @@ def __getitem__(self, choices_indexes):
].batch[contexts_features[tuple_index][feature_index]]
)
feat_ind_min = feature_index + 1
+ unstacked_feat.append(contexts_features[tuple_index][feat_ind_min:])
mapped_features.append(np.concatenate(unstacked_feat, axis=0))
contexts_features = mapped_features
@@ -506,6 +511,7 @@ def __getitem__(self, choices_indexes):
].batch[contexts_items_features[tuple_index][:, feature_index]]
)
feat_ind_min = feature_index + 1
+ unstacked_feat.append(contexts_items_features[tuple_index][:, feat_ind_min:])
mapped_features.append(np.concatenate(unstacked_feat, axis=1))
contexts_items_features = mapped_features
diff --git a/notebooks/choice_learn_introduction_data.ipynb b/notebooks/choice_learn_introduction_data.ipynb
index ba559f1d..11d92b1d 100644
--- a/notebooks/choice_learn_introduction_data.ipynb
+++ b/notebooks/choice_learn_introduction_data.ipynb
@@ -652,17 +652,17 @@
{
"data": {
"text/plain": [
- "([array([[1. , 2. ],\n",
+ "((array([[1. , 2. ],\n",
" [2. , 4. ],\n",
" [1.5, 1.5]], dtype=float32),\n",
" array([[11. , 12. ],\n",
" [12. , 14. ],\n",
- " [11.5, 11.5]], dtype=float32)],\n",
- " array([100, 20]),\n",
+ " [11.5, 11.5]], dtype=float32)),\n",
+ " array([100, 20], dtype=int32),\n",
" array([[100, 0],\n",
" [140, 0],\n",
" [200, 0]], dtype=int32),\n",
- " array([1., 1., 1.], dtype=float32),\n",
+ " array([1, 1, 1], dtype=object),\n",
" 0)"
]
},
@@ -707,15 +707,81 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## More Advanced use: the FeatureStore & OneHotStore"
+ "## More Advanced use: the FeaturesStorage & RAM optimization"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "The FeaturseStore class is here to stock values that regularly repeat themselves over a sequence.\n",
- "Let's take an example where several stores are considered. If we want to model the utility from store features (such as surface, average number of customers, etc...), these features are shared by several choices in our dataset."
+ "## FeaturesStorage, why should I use it ?\n",
+ "Regularly, you have features that repeat themselves over several choices. It can happen if you have several times the same customer, if you have store features or if you use OneHot representations... And those are only examples.\n",
+ "\n",
+ "The FeaturesStorage object is designed to help you better handle these cases. It is mainly built to work well with ChoiceDataset, but here is a small introduction on how it works:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's consider a case where we consider three supermarkets: \n",
+ "- supermarket_1 with surface of 100 and 250 average nb of customers\n",
+ "- supermarket_2 with surface of 150 and 500 average nb of customers\n",
+ "- supermarket_3 with surface of 80 and 100 average nb of customers \n",
+ "\n",
+ "In each store, we have 3 available products for which we have little information. For the example's sake, let's consider the following utility:\n",
+ "$$U(i) = u_i + \\beta_1 \\cdot S_s + \\beta_2 \\cdot C_s$$\n",
+ "With $S_s$ the surface of the store and $C_s$ its average number of customers.\n",
+ "\n",
+ "We want to estimate the base utilities $u_i$ and the two coefficients: $\\beta_1$ and $\\beta_2$.\n",
+ "\n",
+ "Let's start with creating a ChoiceDataset without the FeaturesStorage:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Usual Supermakerket Features Shape: (18, 2)\n",
+ "No features_by_ids given.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Here are our choices:\n",
+ "choices = [0, 1, 2, 0, 2, 1, 1, 0, 2, 1, 2, 0, 2, 0, 1, 2, 1, 0]\n",
+ "supermarket_features = [[100, 250], [150, 500], [80, 100]]\n",
+ "# Now our store sequence of supermarkets is:\n",
+ "supermarkets_sequence = [1, 1, 2, 3, 2, 1, 2, 1, 1, 2, 3, 2, 1, 2, 2, 3, 1, 2]\n",
+ "\n",
+ "# The usual way to store the features would be to create the contexts_features array that contains\n",
+ "# the right features:\n",
+ "usual_supermarket_features = np.array([supermarket_features[supermarket_id - 1] for supermarket_id in supermarkets_sequence])\n",
+ "print(\"Usual Supermakerket Features Shape:\", usual_supermarket_features.shape)\n",
+ "\n",
+ "# And now we can create our ChoiceDataset:\n",
+ "\n",
+ "usual_dataset = ChoiceDataset(choices=choices,\n",
+ " fixed_items_features=np.eye(3),\n",
+ " contexts_features=usual_supermarket_features)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now that we have our dataset, we only need to create our ChoiceModel and we are good to go. However, it would also be natural to feel unsatisfied because the dataset is not well optimized. Indeed, we have repeated the same information several times, which creates a lot of redundancy.\n",
+ "\n",
+ "If in our small use-case it does not really matter, if we consider hundreds of stores on several millions - or billions - of choices, it would become... unreasonable!\n",
+ "\n",
+ "Let's now welcome the FeaturesStorage to help us:"
]
},
{
@@ -724,20 +790,20 @@
"metadata": {},
"outputs": [],
"source": [
- "# We have three stores, represented by their (surface, average_number_of_customers):\n",
- "store_features = [[100, 250], [150, 500], [80, 100]]\n",
- "# Now we consider a sequence of choices that happen in [store_1, store_1, store_2, store_3, store_2, store_1, store_3]\n",
- "# The usual way to store the features would be:\n",
- "usual_store_features = [[100, 250], [100, 250], [150, 500], [80, 100], [150, 500], [100, 250], [80, 100]]\n",
- "# There are a lot of repetetitions, which is not very efficient...\n",
- "# Let's multiply this with 600 stores represented as one-hot over thousands of sessions, we will have a memory problem !\n",
- "# Now the StoreFeatures tries to be more efficient:\n",
+ "from choice_learn.data import FeaturesStorage\n",
"\n",
- "store = {1: [100, 250], 2: [150, 500], 3: [80, 100]} # We can use a dictionary to store the features of each store\n",
- "sequence = [1, 1, 2, 3, 2, 1, 3] # We can use a sequence of keys to represent the sequence of stores\n",
+ "features_dict = {f\"supermarket_{i+1}\": supermarket_features[i] for i in range(3)}\n",
+ "storage = FeaturesStorage(values=features_dict, name=\"supermarket_features\")\n",
"\n",
- "from choice_learn.data import FeaturesStore\n",
- "feat_store = FeaturesStore.from_dict(store, sequence)"
+ "# Let's see how we can use this bad boy:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The FeaturesStorage is basically a Python dictionary with a wrap-up to easily get batches of data.\\\n",
+ "You can ask for a sequence of features with .batch. It works with the keys of our dictionary that can be int, float, str, etc..."
]
},
{
@@ -751,26 +817,47 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "[100, 250]\n",
- "[[100, 250], [100, 250], [100, 250]]\n"
+ "Retrieving features of first supermarket:\n",
+ "[100 250]\n",
+ "Retrieving a batch of features:\n",
+ "[[100 250]\n",
+ " [150 500]\n",
+ " [100 250]]\n"
]
}
],
"source": [
- "# Now we can access the features of the store appearing at index i in the sequence with the iloc method:\n",
+ "print(\"Retrieving features of first supermarket:\")\n",
+ "print(storage.batch[\"supermarket_1\"])\n",
+ "print(\"Retrieving a batch of features:\")\n",
+ "print(storage.batch[[\"supermarket_1\", \"supermarket_2\", \"supermarket_1\"]])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The FeaturesStorage is handy for its transparent use with ChoiceDataset. For it to work well you need:\n",
+ "- to specify a FeaturesStorage name\n",
+ "- to match FeaturesStorage ids with the sequence\n",
"\n",
- "# Let's see the features of the store at index 0\n",
- "print(feat_store.batch[0])\n",
- "# Now we can also take a whole batch:\n",
- "print(feat_store.batch[[0, 1, 5]])\n",
- "# Ah ! We selected all the indexes where the store is 1, which is why we always have the same features !"
+ "In our case we call our FeaturesStorage \"supermarket_features\", the ids are now strings, let's make the sequence match:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "str_supermarkets_sequence = [[f\"supermarket_{i}\"] for i in supermarkets_sequence]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "In order to further optimize RAM usage, you can use the OneHotStore, built specifically for one-hot encoded features. The store will only keep the index of the one of each element and will consitute the one-hot vector only when needed."
+ "And now we can create our ChoiceDataset:"
]
},
{
@@ -779,7 +866,20 @@
"metadata": {},
"outputs": [],
"source": [
- "from choice_learn.data import OneHotStore"
+ "storage_dataset = ChoiceDataset(choices=choices,\n",
+ " contexts_features=str_supermarkets_sequence,\n",
+ " contexts_features_names=[\"supermarket_features\"],\n",
+ " fixed_items_features=np.eye(3),\n",
+ " features_by_ids=[storage],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you have paid attention, we have specified the FeaturesStorage in the features_by_ids argument and we HAVE TO match the contexts_features_names column with the name of the Features Storage.\\\n",
+ "When calling for a batch of data, the ChoiceDataset will look into the FeaturesStorage called \"supermarket_features\" to match the values in contexts_features with the ones stored in it."
]
},
{
@@ -793,31 +893,238 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "RAM storage of the OneHotStore: {'a': 0, 'b': 1, 'c': 2} with sequence: ['a' 'a' 'a' 'c' 'b' 'b' 'a']\n",
- "One-hot vector at index 0: [1. 0. 0.]\n",
- "One-hot vector at indexes [0, 1, 3]:\n",
- "[[1. 0. 0.]\n",
- " [1. 0. 0.]\n",
- " [0. 0. 1.]]\n"
+ "Batch Fixed Items Features: [[1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]]\n",
+ "Batch Contexts Features: [100 250]\n",
+ "Batch Choice: 0\n",
+ "%-------------------------%\n",
+ "Batch Fixed Items Features: [[1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]]\n",
+ "Batch Contexts Features: [[100 250]\n",
+ " [150 500]\n",
+ " [ 80 100]]\n",
+ "Batch Choice: [1 2 0]\n",
+ "%-------------------------%\n",
+ "Batch Fixed Items Features: [[1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]]\n",
+ "Batch Contexts Features: [[100 250]\n",
+ " [100 250]\n",
+ " [100 250]]\n",
+ "Batch Choice: [0 1 1]\n"
]
}
],
"source": [
- "store = OneHotStore.from_sequence([\"a\", \"a\", \"a\", \"c\", \"b\", \"b\", \"a\"])\n",
+ "batch = storage_dataset.batch[0]\n",
+ "print(\"Batch Fixed Items Features:\", batch[0])\n",
+ "print(\"Batch Contexts Features:\", batch[1])\n",
+ "print(\"Batch Choice:\", batch[4])\n",
+ "print(\"%-------------------------%\")\n",
+ "batch = storage_dataset.batch[[1, 2, 3]]\n",
+ "print(\"Batch Fixed Items Features:\", batch[0])\n",
+ "print(\"Batch Contexts Features:\", batch[1])\n",
+ "print(\"Batch Choice:\", batch[4])\n",
+ "print(\"%-------------------------%\")\n",
+ "batch = storage_dataset.batch[[0, 1, 5]]\n",
+ "print(\"Batch Fixed Items Features:\", batch[0])\n",
+ "print(\"Batch Contexts Features:\", batch[1])\n",
+ "print(\"Batch Choice:\", batch[4])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Everything is mapped as needed. And the great thing is that you can easily mix ''classical'' features with FeaturesStorages.\\\n",
+ "Let's add a 'is_week_end' feature to our problem that will also be stored as a contexts_features."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " supermarket_features | \n",
+ " is_week_end | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " supermarket_1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " supermarket_1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " supermarket_2 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " supermarket_3 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " supermarket_2 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " supermarket_features is_week_end\n",
+ "0 supermarket_1 0\n",
+ "1 supermarket_1 0\n",
+ "2 supermarket_2 0\n",
+ "3 supermarket_3 1\n",
+ "4 supermarket_2 1"
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "contexts_features = pd.DataFrame({\"supermarket_features\": np.array(str_supermarkets_sequence).squeeze(),\n",
+ "\"is_week_end\": [0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0]})\n",
+ "contexts_features.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creation of the ChoiceDataset\n",
+ "storage_dataset = ChoiceDataset(choices=choices,\n",
+ " contexts_features=contexts_features,\n",
+ " fixed_items_features=np.eye(3),\n",
+ " features_by_ids=[storage],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Batch Fixed Items Features: [[1. 0. 0.]\n",
+ " [0. 1. 0.]\n",
+ " [0. 0. 1.]]\n",
+ "Batch Contexts Features: [[100 250 0]\n",
+ " [150 500 0]\n",
+ " [ 80 100 1]]\n",
+ "Batch Choice: [1 2 0]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# And now it's ready\n",
+ "batch = storage_dataset.batch[[1, 2, 3]]\n",
+ "print(\"Batch Fixed Items Features:\", batch[0])\n",
+ "print(\"Batch Contexts Features:\", batch[1])\n",
+ "print(\"Batch Choice:\", batch[4])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Specific sub-example: the OneHot Storage\n",
+ "A recurring use case is the use of **OneHot** representation of features. The OneHotStorage is built specifically for one-hot encoded features and further improves memory consumption. The storage is to be used the same way as FeaturesStorage, but behind the scenes it only keeps the index of the one for each element and constitutes the one-hot vector only when needed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from choice_learn.data import OneHotStorage"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RAM storage of the OneHotStore: {'a': 0, 'b': 1, 'c': 2}\n",
+ "One-hot vector batch: storage.batch['a'] [1 0 0]\n",
+ "One-hot vector batch: storage.batch[['a', 'b', 'c', 'c', 'b', 'a']]\n",
+ "[[1 0 0]\n",
+ " [0 1 0]\n",
+ " [0 0 1]\n",
+ " [0 0 1]\n",
+ " [0 1 0]\n",
+ " [1 0 0]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "storage = OneHotStorage(ids=[\"a\", \"b\", \"c\"])\n",
"\n",
- "# When using from_sequence, the store collects ranked order (lower to higher) of each element as index\n",
- "print(\"RAM storage of the OneHotStore:\", store.store, \"with sequence:\", store.sequence)\n",
- "# When indexing with iloc, we can access the one-hot encoding of the element at index i in the sequence\n",
- "print(\"One-hot vector at index 0:\", store.batch[0])\n",
- "print(\"One-hot vector at indexes [0, 1, 3]:\")\n",
- "print(store.batch[[0, 1, 3]])"
+ "print(\"RAM storage of the OneHotStore:\", storage.storage)\n",
+ "# When indexing with .batch, we can access the one-hot encoding of the element using its id\n",
+ "print(\"One-hot vector batch: storage.batch['a']\", storage.batch[\"a\"])\n",
+ "print(\"One-hot vector batch: storage.batch[['a', 'b', 'c', 'c', 'b', 'a']]\")\n",
+ "print(storage.batch[[\"a\", \"b\", \"c\", \"c\", \"b\", \"a\"]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "- Add: Example from pandas.DataFrame"
+ "**Note that:**\n",
+ "- we use strings as ids for the example, however we recommend to use integers.\n",
+ "- FeaturesStorage can be instantiated from dict, np.ndarray, list, pandas.DataFrame, etc...\n",
+ "- More in-depth examples and explanations can be found [here](./features_byID_example.ipynb)"
]
},
{
@@ -1076,6 +1383,13 @@
"cell_type": "markdown",
"metadata": {},
"source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
diff --git a/notebooks/features_byID_example.ipynb b/notebooks/features_byID_example.ipynb
index 6237dcd6..d285beb7 100644
--- a/notebooks/features_byID_example.ipynb
+++ b/notebooks/features_byID_example.ipynb
@@ -1,5 +1,12 @@
{
"cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Deep dive on FeaturesStorage"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
@@ -34,250 +41,407 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Different Instantiation Possibilities for Storage:"
+ "## Different Instantiation Possibilities for Storage:\n",
+ "### 1 - from dict"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "keep_output": true
+ },
"outputs": [],
"source": [
"features = {\"customerA\": [1, 2, 3], \"customerB\": [4, 5, 6], \"customerC\": [7, 8, 9]}\n",
- "storage = FeaturesStorage(values=features, values_names=[\"age\", \"income\", \"children_nb\"], name=\"customers_features\")"
+ "# dict must be {id: features}\n",
+ "storage = FeaturesStorage(values=features,\n",
+ " values_names=[\"age\", \"income\", \"children_nb\"],\n",
+ " name=\"customers_features\")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Subset in order to only keep some ids\n",
"storage[[\"customerA\", \"customerC\"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[1, 2, 3],\n",
+ " [7, 8, 9],\n",
+ " [1, 2, 3],\n",
+ " [7, 8, 9]])"
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Batch to access the features values\n",
"storage.batch[[\"customerA\", \"customerC\", \"customerA\", \"customerC\"]]"
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
- "source": []
+ "source": [
+ "### 2 - from list"
+ ]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
- "outputs": [],
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[1, 2, 3],\n",
+ " [7, 8, 9],\n",
+ " [1, 2, 3],\n",
+ " [7, 8, 9]])"
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"features = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]\n",
"ids = [\"customerA\", \"customerB\", \"customerC\"]\n",
"\n",
- "storage = FeaturesStorage(ids=ids, values=features, values_names=[\"age\", \"income\", \"children_nb\"], name=\"customers\")\n",
+ "storage = FeaturesStorage(ids=ids,\n",
+ " values=features,\n",
+ " values_names=[\"age\", \"income\", \"children_nb\"],\n",
+ " name=\"customers\")\n",
+ "# We get the same result as before\n",
"storage.batch[[\"customerA\", \"customerC\", \"customerA\", \"customerC\"]]"
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "features = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]\n",
+ "### 3 - from list, without ids\n",
"\n",
- "storage = FeaturesStorage(values=features, values_names=[\"age\", \"income\", \"children_nb\"], name=\"customers\")\n",
- "storage.batch[[0, 2, 0, 2]]"
+ "The ids are generated automatically as increasing integers:"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
- "outputs": [],
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[1, 2, 3],\n",
+ " [7, 8, 9],\n",
+ " [1, 2, 3],\n",
+ " [7, 8, 9]])"
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "features = {\"age\": [1, 2, 3], \"income\": [4, 5, 6], \"children_nb\": [7, 8, 9], \"id\": [\"customerA\", \"customerB\", \"customerC\"]}\n",
- "features = pd.DataFrame(features)\n",
- "storage = FeaturesStorage(values=features, name=\"customers\")\n",
- "storage.batch[[\"customerA\", \"customerC\", \"customerA\", \"customerC\"]]"
+ "features = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]\n",
+ "\n",
+ "storage = FeaturesStorage(values=features, values_names=[\"age\", \"income\", \"children_nb\"], name=\"customers\")\n",
+ "storage.batch[[0, 2, 0, 2]]"
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "features = {\"age\": [1, 2, 3], \"income\": [4, 5, 6], \"children_nb\": [7, 8, 9]}\n",
- "features = pd.DataFrame(features, index=[\"customerA\", \"customerB\", \"customerC\"])\n",
- "storage = FeaturesStorage(values=features, name=\"customers\")\n",
- "storage.batch[[\"customerA\", \"customerC\", \"customerA\", \"customerC\"]]"
+ "### 4 - from pandas.DataFrame"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "ids = [0, 1, 2, 3, 4]\n",
- "values = [4, 3, 2, 1, 0]\n",
- "\n",
- "oh_storage = OneHotStorage(ids=ids, values=values, name=\"OneHotTest\")"
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[1, 2, 3],\n",
+ " [7, 8, 9],\n",
+ " [1, 2, 3],\n",
+ " [7, 8, 9]])"
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Here the DataFrame has a column \"id\" that identifies the keys from the features values\n",
+ "features = {\"age\": [1, 4, 7], \"income\": [2, 5, 8], \"children_nb\": [3, 6, 9], \"id\": [\"customerA\", \"customerB\", \"customerC\"]}\n",
+ "features = pd.DataFrame(features)\n",
+ "storage = FeaturesStorage(values=features, name=\"customers\")\n",
+ "storage.batch[[\"customerA\", \"customerC\", \"customerA\", \"customerC\"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "oh_storage.batch[[0, 2, 4]], oh_storage.get_element_from_index(0)"
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[1, 2, 3],\n",
+ " [7, 8, 9],\n",
+ " [1, 2, 3],\n",
+ " [7, 8, 9]])"
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Here the DataFrame does not have a column \"id\" that identifies the keys from the features values\n",
+ "# We thus specify the 'index'\n",
+ "features = {\"age\": [1, 4, 7], \"income\": [2, 5, 8], \"children_nb\": [3, 6, 9]}\n",
+ "features = pd.DataFrame(features, index=[\"customerA\", \"customerB\", \"customerC\"])\n",
+ "storage = FeaturesStorage(values=features, name=\"customers\")\n",
+ "storage.batch[[\"customerA\", \"customerC\", \"customerA\", \"customerC\"]]"
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "oh_storage = OneHotStorage(values=values, name=\"OneHotTest\")\n",
- "oh_storage.batch[[0, 2, 4]]"
+ "### 5 - OneHotStorage from list"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "keep_output": true
+ },
"outputs": [],
"source": [
- "oh_storage = OneHotStorage(ids=ids, name=\"OneHotTest\")\n",
- "oh_storage.batch[[0, 2, 4]]"
+ "ids = [0, 1, 2, 3, 4]\n",
+ "values = [4, 3, 2, 1, 0]\n",
+ "\n",
+ "# Here the Storage will map the ids to the values\n",
+ "# value = 4 means that the fifth value is a one, the rest are zeros\n",
+ "oh_storage = OneHotStorage(ids=ids, values=values, name=\"OneHotTest\")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "values_dict = {k:v for k, v in zip(ids, values)}\n",
- "oh_storage = OneHotStorage(values=values_dict, name=\"OneHotTest\")\n",
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[0, 0, 0, 0, 1],\n",
+ " [0, 0, 1, 0, 0],\n",
+ " [1, 0, 0, 0, 0]], dtype=uint8)"
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Get OneHot vectors:\n",
"oh_storage.batch[[0, 2, 4]]"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
- "outputs": [],
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(4, {0: 4, 1: 3, 2: 2, 3: 1, 4: 0})"
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "oh_storage = OneHotStorage(ids=ids, values=values_dict, name=\"OneHotTest\")\n",
- "try:\n",
- " oh_storage = OneHotStorage(name=\"OneHotTest\")\n",
- " assert False\n",
- "except:\n",
- " assert True"
+ "# Get the Storage value\n",
+ "oh_storage.get_element_from_index(0), oh_storage.storage"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Use of FeaturesByID and Storage in the ChoiceDataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "features = {\"customerA\": [1, 2, 3], \"customerB\": [4, 5, 6], \"customerC\": [7, 8, 9]}\n",
- "storage = FeaturesStorage(values=features, values_names=[\"age\", \"income\", \"children_nb\"], name=\"customers_features\")"
+ "### 6 - OneHotStorage from single list\n",
+ "\n",
+ "If only the values are given, the ids are created as increasing integers."
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
- "outputs": [],
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[0, 0, 0, 0, 1],\n",
+ " [0, 0, 1, 0, 0],\n",
+ " [1, 0, 0, 0, 0]], dtype=uint8)"
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "fixed_items_features = np.eye(3)\n",
- "prices = [[[4, 1], [4, 1], [5, 1]], [[5, 2], [4, 2], [6, 2]],\n",
- " [[6, 3], [7, 3], [8, 3]], [[4, 4], [5, 4], [4, 4]]]\n",
- "choices = [0, 1 , 2, 2]\n",
- "contexts_features = [[\"customerA\"], [\"customerB\"], [\"customerC\"], [\"customerA\"]]\n",
- "\n",
- "dataset = ChoiceDataset(\n",
- " fixed_items_features=fixed_items_features,\n",
- " contexts_features=contexts_features,\n",
- " choices=choices,\n",
- " contexts_items_features=prices,\n",
- " features_by_ids=[storage],\n",
- " contexts_features_names=[\"customers_features\"],\n",
- " )"
+ "oh_storage = OneHotStorage(values=values, name=\"OneHotTest\")\n",
+ "oh_storage.batch[[0, 2, 4]]"
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "dataset.get_choices_batch(0)"
+ "If the values are not given, they are also created from the ids as increasing integers."
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
- "outputs": [],
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[1, 0, 0, 0, 0],\n",
+ " [0, 0, 1, 0, 0],\n",
+ " [0, 0, 0, 0, 1]], dtype=uint8)"
+ ]
+ },
+ "execution_count": null,
+ "metadata": {
+ "keep_output": true
+ },
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "dataset.batch[0]"
+ "oh_storage = OneHotStorage(ids=ids, name=\"OneHotTest\")\n",
+ "oh_storage.batch[[0, 2, 4]]\n",
+ "# Note that here it changes the order !"
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "dataset.get_choices_batch([1, 2])"
+ "### 7 - OneHotStorage from dict"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
- "outputs": [],
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[0, 0, 0, 0, 1],\n",
+ " [0, 0, 1, 0, 0],\n",
+ " [1, 0, 0, 0, 0]], dtype=uint8)"
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "myd = {1: 2, 2:3, 3:4, 4:5}"
+ "values_dict = {k:v for k, v in zip(ids, values)}\n",
+ "oh_storage = OneHotStorage(values=values_dict, name=\"OneHotTest\")\n",
+ "oh_storage.batch[[0, 2, 4]]"
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "np.sort(list(myd.keys()))"
+ "## Use of FeaturesByID and Storage in the ChoiceDataset"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "keep_output": true
+ },
"outputs": [],
"source": [
- "dataset.batch[1:3]"
+ "features = {\"customerA\": [1, 2, 3], \"customerB\": [4, 5, 6], \"customerC\": [7, 8, 9]}\n",
+ "storage = FeaturesStorage(values=features, values_names=[\"age\", \"income\", \"children_nb\"], name=\"customers_features\")"
]
},
{
@@ -313,30 +477,232 @@
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "dataset.contexts_items_features_map[0][0].storage"
+ "## Example with the SwissMetro dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "dataset.get_choices_batch([0, 1])"
+ "metadata": {
+ "keep_output": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " GROUP | \n",
+ " SURVEY | \n",
+ " SP | \n",
+ " ID | \n",
+ " PURPOSE | \n",
+ " FIRST | \n",
+ " TICKET | \n",
+ " WHO | \n",
+ " LUGGAGE | \n",
+ " AGE | \n",
+ " ... | \n",
+ " TRAIN_CO | \n",
+ " TRAIN_HE | \n",
+ " SM_TT | \n",
+ " SM_CO | \n",
+ " SM_HE | \n",
+ " SM_SEATS | \n",
+ " CAR_TT | \n",
+ " CAR_CO | \n",
+ " CHOICE | \n",
+ " CAR_HE | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " ... | \n",
+ " 48.0 | \n",
+ " 120.0 | \n",
+ " 63.0 | \n",
+ " 52.0 | \n",
+ " 20.0 | \n",
+ " 0.0 | \n",
+ " 117.0 | \n",
+ " 65.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " ... | \n",
+ " 48.0 | \n",
+ " 30.0 | \n",
+ " 60.0 | \n",
+ " 49.0 | \n",
+ " 10.0 | \n",
+ " 0.0 | \n",
+ " 117.0 | \n",
+ " 84.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " ... | \n",
+ " 48.0 | \n",
+ " 60.0 | \n",
+ " 67.0 | \n",
+ " 58.0 | \n",
+ " 30.0 | \n",
+ " 0.0 | \n",
+ " 117.0 | \n",
+ " 52.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " ... | \n",
+ " 40.0 | \n",
+ " 30.0 | \n",
+ " 63.0 | \n",
+ " 52.0 | \n",
+ " 20.0 | \n",
+ " 0.0 | \n",
+ " 72.0 | \n",
+ " 52.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " ... | \n",
+ " 36.0 | \n",
+ " 60.0 | \n",
+ " 63.0 | \n",
+ " 42.0 | \n",
+ " 20.0 | \n",
+ " 0.0 | \n",
+ " 90.0 | \n",
+ " 84.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 29 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " GROUP SURVEY SP ID PURPOSE FIRST TICKET WHO LUGGAGE AGE ... \\\n",
+ "0 2.0 0.0 1.0 1.0 1.0 0.0 1.0 1.0 0.0 3.0 ... \n",
+ "1 2.0 0.0 1.0 1.0 1.0 0.0 1.0 1.0 0.0 3.0 ... \n",
+ "2 2.0 0.0 1.0 1.0 1.0 0.0 1.0 1.0 0.0 3.0 ... \n",
+ "3 2.0 0.0 1.0 1.0 1.0 0.0 1.0 1.0 0.0 3.0 ... \n",
+ "4 2.0 0.0 1.0 1.0 1.0 0.0 1.0 1.0 0.0 3.0 ... \n",
+ "\n",
+ " TRAIN_CO TRAIN_HE SM_TT SM_CO SM_HE SM_SEATS CAR_TT CAR_CO CHOICE \\\n",
+ "0 48.0 120.0 63.0 52.0 20.0 0.0 117.0 65.0 2.0 \n",
+ "1 48.0 30.0 60.0 49.0 10.0 0.0 117.0 84.0 2.0 \n",
+ "2 48.0 60.0 67.0 58.0 30.0 0.0 117.0 52.0 2.0 \n",
+ "3 40.0 30.0 63.0 52.0 20.0 0.0 72.0 52.0 2.0 \n",
+ "4 36.0 60.0 63.0 42.0 20.0 0.0 90.0 84.0 2.0 \n",
+ "\n",
+ " CAR_HE \n",
+ "0 0.0 \n",
+ "1 0.0 \n",
+ "2 0.0 \n",
+ "3 0.0 \n",
+ "4 0.0 \n",
+ "\n",
+ "[5 rows x 29 columns]"
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from choice_learn.datasets import load_swissmetro\n",
+ "\n",
+ "df = load_swissmetro(as_frame=True)\n",
+ "df = df.loc[df.CHOICE!=0]\n",
+ "df.head()"
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "dataset.batch[[0, 1]]"
+ "The ID column identifies a unique survey participant. Each participant answered several cases, so the features describing that participant are repeated across several rows. This makes it a perfect use case for FeaturesStorage."
]
},
{
@@ -345,7 +711,14 @@
"metadata": {},
"outputs": [],
"source": [
- "a = np.array([1, 2, 3])"
+ "customer_columns = ['ID', 'GROUP', 'SURVEY', 'SP', 'PURPOSE', 'FIRST', 'TICKET', 'WHO',\n",
+ " 'LUGGAGE', 'AGE', 'MALE', 'INCOME', 'GA', 'ORIGIN', 'DEST']\n",
+ "customer_features = df[customer_columns].drop_duplicates()\n",
+ "customer_features = customer_features.rename(columns={\"ID\": \"id\"})\n",
+ "customer_storage = FeaturesStorage(values=customer_features, name=\"customer_features\")\n",
+ "\n",
+ "contexts_features = df[[\"ID\"]]\n",
+ "contexts_features = contexts_features.rename(columns={\"ID\": \"customer_features\"})"
]
},
{
@@ -354,7 +727,11 @@
"metadata": {},
"outputs": [],
"source": [
- "a[2:2]"
+ "choices = df.CHOICE.to_numpy() - 1\n",
+ "contexts_items_availabilities = df[[\"TRAIN_AV\", \"SM_AV\", \"CAR_AV\"]].to_numpy()\n",
+ "contexts_items_features = np.stack([df[[\"TRAIN_TT\", \"TRAIN_CO\", \"TRAIN_HE\"]].to_numpy(),\n",
+ " df[[\"SM_TT\", \"SM_CO\", \"SM_HE\"]].to_numpy(),\n",
+ " df[[\"CAR_TT\", \"CAR_CO\", \"CAR_HE\"]].to_numpy()], axis=1)"
]
},
{
@@ -363,16 +740,18 @@
"metadata": {},
"outputs": [],
"source": [
- "np.concatenate([a[:2], a[2:2], a[2:3]])"
+ "choice_dataset = ChoiceDataset(contexts_features=contexts_features,\n",
+ " contexts_items_features=contexts_items_features,\n",
+ " contexts_items_availabilities=contexts_items_availabilities,\n",
+ " choices=choices,\n",
+ " features_by_ids=[customer_storage],)"
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "myd = {1:2, 2:3}"
+ "Et voilà !"
]
},
{
@@ -381,7 +760,12 @@
"metadata": {},
"outputs": [],
"source": [
- "next(iter(myd.values()))"
+ "batch = choice_dataset.batch[[0, 10, 200]]\n",
+ "print(\"Batch Fixed Items Features:\", batch[0])\n",
+ "print(\"Batch Contexts Features:\", batch[1])\n",
+ "print(\"Batch Contexts Items Features:\", batch[2])\n",
+ "print(\"Batch Contexts Items Availabilities:\", batch[3])\n",
+ "print(\"Batch Choices:\", batch[4])"
]
},
{