From ca1a1f6dd596c6b7fdb2a83b6b716431f514333c Mon Sep 17 00:00:00 2001 From: Emmanuel MALHERBE Date: Fri, 16 Feb 2024 12:04:16 +0100 Subject: [PATCH 01/14] ADD: typo and suggestions for README --- README.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9481432a..b91aaa26 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Choice-Learn is a Python package designed to help you build discrete choice models. -The package provides ready to use datasets and different models from the litterature. It also provides a lower level use if you want to customize any model or create your own from scratch. In particular you will find smart datasets handling to limit RAM usage and different structure commons to any choice model. +The package provides ready-to-use datasets and models from the litterature. It also provides a lower level use if you want to customize any model or create your own from scratch. In particular you will find efficient data handling to limit RAM usage and structure commons to any choice model. Choice-Learn uses NumPy and pandas as data backend engines and TensorFlow for models. @@ -47,10 +47,10 @@ If you are new to choice modelling, you can check this [resource](https://www.pu - ModeCanada from Koppelman et al. (1993) ### Models -- Ready to use models: +- Ready-to-use models: - Conditional MultiNomialLogit, Train, K.; McFadden, D.; Ben-Akiva, M. (1987) - RUMnet, Aouad A.; Désir A. (2022) [1] -- Ready to use models to be implemented: +- (WIP) Ready-to-use models to be implemented: - Nested MultiNomialLogit - MultiNomialLogit with latent variables (MixedLogit) - TasteNet @@ -108,7 +108,13 @@ A detailed documentation of this project is available [here](https://artefactory ## Citation -### Contributors +If you consider this package and any of its feature useful for your research, please cite our paper: + +(WIP - Paper to come) + +### License + +The use of this software is under the MIT (tbc) license, with no limitation of usage, including for commercial applications. ## References From 0d59645840d75e465c98bef1e3cf2b4fb5bb1daa Mon Sep 17 00:00:00 2001 From: Emmanuel MALHERBE Date: Fri, 16 Feb 2024 14:01:39 +0100 Subject: [PATCH 02/14] ADD: requirements with python version explicit (took me time to install the environment... 
hehe) --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index b91aaa26..3ae3bf86 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,10 @@ For modelling you need: Finally, an optional requirement used for report and LBFG-s use is: - tensorflow_probability (>=0.20.1) +Once you have created your conda/pip python==3.9 environment, you can install requirements by: +```bash +pip install choice-learn +``` ## Usage ```python from choice_learn.data import ChoiceDataset From 1eb47ece22ad747dc99a7b8d15daa0eb1e8bc91d Mon Sep 17 00:00:00 2001 From: Emmanuel MALHERBE Date: Fri, 16 Feb 2024 14:25:41 +0100 Subject: [PATCH 03/14] FIX: changed way to locate data module (with os) --- choice_learn/datasets/base.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/choice_learn/datasets/base.py b/choice_learn/datasets/base.py index 92b65eab..99f49571 100644 --- a/choice_learn/datasets/base.py +++ b/choice_learn/datasets/base.py @@ -2,13 +2,15 @@ import csv import gzip from importlib import resources +import os import numpy as np import pandas as pd from choice_learn.data.choice_dataset import ChoiceDataset -DATA_MODULE = "choice_learn.datasets.data" + +DATA_MODULE = os.path.join(os.path.abspath('..'), 'choice_learn', 'datasets', 'data') def load_csv(data_file_name, data_module=DATA_MODULE, encoding="utf-8"): @@ -30,8 +32,7 @@ def load_csv(data_file_name, data_module=DATA_MODULE, encoding="utf-8"): np.ndarray data contained in the csv file """ - data_path = resources.files(data_module) - with (data_path / data_file_name).open("r", encoding=encoding) as csv_file: + with open(os.path.join(data_module, data_file_name), "r", encoding=encoding) as csv_file: data_file = csv.reader(csv_file) names = next(data_file) data = [] @@ -60,8 +61,7 @@ def load_gzip(data_file_name, data_module=DATA_MODULE, encoding="utf-8"): np.ndarray data contained in the csv file """ - data_path = resources.files(data_module) - with (data_path / data_file_name).open("rb") as compressed_file: + with open(os.path.join(data_module, data_file_name), "rb") as compressed_file: compressed_file = gzip.open(compressed_file, mode="rt", encoding=encoding) names = next(compressed_file) names = names.replace("\n", "") From 6789e60cae392c55906fa9f620ca648487d49333 Mon Sep 17 00:00:00 2001 From: Emmanuel MALHERBE Date: Fri, 16 Feb 2024 14:26:22 +0100 Subject: [PATCH 04/14] ADD: remove useless import --- choice_learn/datasets/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/choice_learn/datasets/base.py b/choice_learn/datasets/base.py index 99f49571..4ac6930d 100644 --- a/choice_learn/datasets/base.py +++ b/choice_learn/datasets/base.py @@ -1,7 +1,6 @@ """Datasets loader.""" import csv import gzip -from importlib import resources import os import numpy as np From ff445e4e8743a40a63f15d9a3b786e684cb8286f Mon Sep 17 00:00:00 2001 From: Emmanuel MALHERBE Date: Fri, 16 Feb 2024 18:01:27 +0100 Subject: [PATCH 05/14] ADD : some cosmetics for notebooks (and choicedataset) --- choice_learn/data/choice_dataset.py | 33 +- .../choice_learn_introduction_data.ipynb | 418 ++++++++++++------ 2 files changed, 294 insertions(+), 157 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index 56448da1..72a4b239 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -363,7 +363,6 @@ def _build_features_by_ids(self): indexes and features_by_id of contexts_items_features """ if len(self.features_by_ids) == 0: - print("No 
features_by_ids given.") return {}, {}, {} if ( @@ -614,6 +613,10 @@ def __len__(self): """ return len(self.choices) + def __str__(self): + template = '''First choice is:\nItems features: {}\nContexts features: {}\nContexts Items features: {}\nContexts Items Availabilities: {}\nContexts Choice: {}''' + return template.format(self.batch[0][0], self.batch[0][1], self.batch[0][2], self.batch[0][3], self.batch[0][4]) + def get_num_items(self): """Method to access the total number of different items. @@ -823,13 +826,13 @@ def from_single_wide_df( def from_single_long_df( cls, df, + choices_column="choice", + items_id_column="item_id", + contexts_id_column="context_id", fixed_items_features_columns=None, contexts_features_columns=None, contexts_items_features_columns=None, - items_id_column="item_id", - contexts_id_column="context_id", - choices_column="choice", - choice_mode="items_id", + choice_format="items_id", ): """Builds numpy arrays for ChoiceDataset from a single dataframe in long format. @@ -837,19 +840,19 @@ def from_single_long_df( ---------- df : pandas.DataFrame dataframe in Long format + choices_column: str, optional + Name of the column containing the choices, default is "choice" + items_id_column: str, optional + Name of the column containing the item ids, default is "items_id" + contexts_id_column: str, optional + Name of the column containing the sessions ids, default is "contexts_id" fixed_items_features_columns : list Columns of the dataframe that are item features, default is None contexts_features_columns : list Columns of the dataframe that are contexts features, default is None contexts_items_features_columns : list Columns of the dataframe that are context-item features, default is None - items_id_column: str, optional - Name of the column containing the item ids, default is "items_id" - contexts_id_column: str, optional - Name of the column containing the sessions ids, default is "contexts_id" - choices_column: str, optional - Name of the column containing the choices, default is "choice" - choice_mode: str, optional + choice_format: str, optional How choice is indicated in df, either "items_name" or "one_zero", default is "items_id" @@ -901,13 +904,13 @@ def from_single_long_df( else None ) - if choice_mode == "items_id": + if choice_format == "items_id": choices = df[[choices_column, contexts_id_column]].drop_duplicates(contexts_id_column) choices = choices.set_index(contexts_id_column) choices = choices.loc[sessions].to_numpy() # items is the value (str) of the item choices = np.squeeze([np.where(items == c)[0] for c in choices]) - elif choice_mode == "one_zero": + elif choice_format == "one_zero": choices = df[[items_id_column, choices_column, contexts_id_column]] choices = choices.loc[choices[choices_column] == 1] choices = choices.set_index(contexts_id_column) @@ -918,7 +921,7 @@ def from_single_long_df( ) else: raise ValueError( - f"choice_mode {choice_mode} not recognized. Must be in ['items_id', 'one_zero']" + f"choice_format {choice_format} not recognized. 
Must be in ['items_id', 'one_zero']" ) return ChoiceDataset( fixed_items_features=items_features, diff --git a/notebooks/choice_learn_introduction_data.ipynb b/notebooks/choice_learn_introduction_data.ipynb index 05c0f692..32096086 100644 --- a/notebooks/choice_learn_introduction_data.ipynb +++ b/notebooks/choice_learn_introduction_data.ipynb @@ -4,62 +4,69 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Introduction to choice-learn's data management" + "# Introduction to choice-learn's data management" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", - "from pathlib import Path\n", - "\n", - "sys.path.append(\"../\")\n", "\n", - "import numpy as np\n", - "import pandas as pd" + "sys.path.append(\"../\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### ChoiceDataset - Getting Started !\n", + "## ChoiceDataset - Getting Started !\n", + "\n", + "In order to estimate a model using the choice-learn API, you will first need to wrap your dataset within a ChoiceDataset.\n", "\n", - "choice-learn package aims at being able to handle large datasets. One of the main idea is to limit as much as possible the usage of memory to save several time the same feature.\n", + "choice-learn ChoiceDataset aims at being able to handle large datasets, typically by limiting the usage of memory to store several times the same feature.\n", "We define two sources of features, the items and the contexts.\n", "\n", "**Items** represent a product, an alternative that can be chosen by the customer at some point.\n", "\n", - "**Contexts** represent the different cases of the dataset. One context corresponds to one choice and regroups every factor that might be different from one choice to another.\n", + "**Contexts** represent the contexts surrounding each choice. One context corresponds to one choice and regroups every factor that might be different from one choice to another.\n", "\n", "\n", - "From these two concepts, we defines 5 different types of data, storing them separatly in order to avoid redundancy in memory:\n", + "From these two concepts, we defines 5 types of data:\n", "\n", "- **choices:** The main information, indicating which item/alternative has been chosen among all availables\n", "- **fixed_items_features:** The items features that never change (e.g. size, color, etc...) over the choices/contexts.\n", - "- **contexts_features:** It represents all the features that might change from one choice to another and that are **common** to all items (e.g. day of week, customer features, etc...)\n", - "- **contexts_items_features:** The features that are function of the item and of the context (e.g. prices change over contexts and are specific to each sold item, etc...)\n", - "\n", + " \n", + " Size=number of items.\n", + "- **contexts_features:** It represents all the features that might change from one choice to another and that are **common** to all items (e.g. day of week, customer features, etc...).\n", + " \n", + " Size=number of choices.\n", + "- **contexts_items_features:** The features that are function of the item and of the context (e.g. prices change over contexts and are specific to each sold item, etc...).\n", + " \n", + " Size=number of choices x number of items\n", "- **contexts_items_availabilities:** For each context it represents whether each item/alternative is proposed to the customer (1.) 
or not (0.).\n", + " \n", + " Size=number of choices.\n", "\n", "\n", - "In order to estimate a model using the choice-learn API, you will first need to wrap your dataset within a ChoiceDataset. The easiest way to do it is to use a pandas DataFrame, let's see how to do it !" + "The easiest way to do it is to use a pandas DataFrame, let's see how to do it !" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "## Hands-on: example from a DataFrame with ModeCanada\n", + "\n", "We will use the ModeCanada [1] dataset for this example. It is provided with the choice-learn package and can loaded as follows:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "keep_output": true }, @@ -182,12 +189,15 @@ "5 3 train 0 83 28.25 50 66 4 70.0 0 2" ] }, - "execution_count": null, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", "from choice_learn.data import ChoiceDataset\n", "from choice_learn.datasets import load_modecanada\n", "\n", @@ -225,7 +235,7 @@ "Easy ! It is the alternative whenever the value is one.\n", "\n", "**contexts_features**\n", - "The income, urban and distance (also noalt which is not really a feature) features are the same for all the alternative within a context: they are contexts_features.\n", + "The income, urban and distance (also noalt which is not really a feature) features are the same for all the alternative within a context: they are contexts_features. They are all constant with respect to a case=traveler ID.\n", "\n", "**contexts_items_features**\n", "Ivt, Ovt, cost and freq depends on the alternative and change over the contexts. They are contexts_items_features.\n", @@ -239,13 +249,150 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + " case alt choice dist cost ivt ovt freq income urban noalt \\\n", + "1 1 train 0 83 28.25 50 66 4 45.0 0 2 \n", + "2 1 car 1 83 15.77 61 0 0 45.0 0 2 \n", + "3 2 train 0 83 28.25 50 66 4 25.0 0 2 \n", + "4 2 car 1 83 15.77 61 0 0 25.0 0 2 \n", + "5 3 train 0 83 28.25 50 66 4 70.0 0 2 \n", + "\n", + " is_public \n", + "1 1.0 \n", + "2 0.0 \n", + "3 1.0 \n", + "4 0.0 \n", + "5 1.0 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "transport_df = canada_transport_df.copy()\n", "items = [\"air\", \"bus\", \"car\", \"train\"]\n", "\n", + "# Add \"is_public\" feature for transport modes\n", "transport_df[\"is_public\"] = transport_df.apply(lambda row: 0. if row.alt == \"car\" else 1., axis=1)\n", "\n", "# Just some typing\n", @@ -260,46 +407,54 @@ "metadata": {}, "source": [ "Our feature, is_public is 0 for the car and 1 for all other alternatives, seems fine! We can now create our ChoiceDataset !\\\n", - "*Note that you do NOT need each type of feature, here the purpose was to give a complete example.*\n", + "*Note that you do NOT need each type of feature, here the purpose was to give a complete example.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating a ChoiceDataset from this *single* dataframe\n", "\n", "In order to create the ChoiceDataset from the DataFrame, we need to specify:\n", + "- the column in which the choice is given\n", + "- the column where the item is identified \n", + "- the column where the context is identified\n", "- the columns representing the fixed_items_features\n", "- the columns representing the contexts_features\n", "- the columns representing the contexts_items_features\n", - "- the column where the item is identified \n", - "- the column where the context is identified\n", - "- the column in which the choice is given\n", + "\n", "\n", "For our Canada Transport example, here is how it should be done:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "dataset = ChoiceDataset.from_single_long_df(df=transport_df,\n", - " fixed_items_features_columns=[\"is_public\"],\n", - " contexts_features_columns=[\"income\", \"urban\", \"dist\"],\n", - " contexts_items_features_columns=[\"cost\", \"freq\", \"ovt\", \"ivt\"],\n", - " items_id_column=\"alt\",\n", - " contexts_id_column=\"case\",\n", - " choices_column=\"choice\",\n", - " choice_mode=\"one_zero\")" + "dataset = ChoiceDataset.from_single_long_df(\n", + " df=transport_df,\n", + " choices_column=\"choice\",\n", + " items_id_column=\"alt\",\n", + " contexts_id_column=\"case\",\n", + " fixed_items_features_columns=[\"is_public\"],\n", + " contexts_features_columns=[\"income\", \"urban\", \"dist\"],\n", + " contexts_items_features_columns=[\"cost\", \"freq\", \"ovt\", \"ivt\"],\n", + " choice_format=\"one_zero\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "I have added an argument without any warning, the \"choice_mode\". It only precises how the choice is encoded in the dataframe. Currently two modes are availble:\n", + "Last argument, \"choice_format\", precises how the choice is encoded in the dataframe. 
Currently two modes are availble:\n", "\n", - "**one_zero:**\n", + " - *one_zero*:\n", "The choice column contains a 0 when the alternative/item is not chosen in the session and a 1 if it is chosen.\n", "This is the case here with Canada Transport.\n", - "\n", - "**item_id:**\n", + " - *item_id*:\n", "The choice column contains the id of the choice during the session. The id corresponds to the values used in the column 'items_id_column'.\n", "In this case of Canada Transport, the dataframe would need to be:\n", "\n", @@ -321,7 +476,7 @@ "The ChoiceDataset is ready !\n", "\n", "You now have three possibilities to continue discovering the choice-learn package:\n", - "- You can directly go [here]() to the modelling tutorial if you want to understand how a first simple ConditionMNl would be implementd.\n", + "- You can directly go [here]() to the modelling tutorial if you want to understand how a first simple ConditionMNL would be implementd.\n", "- You can go [here]() if your dataset is organized differently to see all the different ways to instantiate a ChoiceDataset. In particular it helps if you DataFrame is in the wide format or if it is splitted into several DataFrames.\n", "- Or you can continue this current tutorial to better understand the ChoiceDataset machinery and everything there is to know about it.\n", "\n", @@ -332,26 +487,29 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### ChoiceDataset - Inside the machine\n", + "## Hands-on: example from a NumPy arrays\n", + "\n", + "Let's see an example of ChoiceDataset instantiation from numpy arrays.\n", + "\n", + "Let's consider three *items* whose *features* are: Size, Weight, price, promotion (simply a boolean to indicate whether it is under promotion).\n", + "\n", + "For size and weights, we will store as *fixed items features* as they don't change. For the price and promotion, we will store in the *contexts items features*, since they may change for each context.\n", "\n", - "Let's see an example of ChoiceDataset instantiation from numpy arrays:" + "For the *contexts*, we will consider the customers attributes: Budget and age." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "# Let's see an example of ChoiceDataset instantiation\n", - "\n", - "# from choice-learn.data import ChoiceDataset\n", + "# Choices:\n", + "# Customer 1 bought item 1\n", + "# Customer 2 bought item 3\n", + "# Customer 1 bought item 2\n", "\n", - "# Let's consider three items whose features are:\n", - "# - Size\n", - "# - Weight\n", - "# - price\n", - "# - whether is is on promotion or not\n", + "choices = [0, 2, 1]\n", "\n", "fixed_items_features = [\n", " [1, 2], # item 1 [size, weight]\n", @@ -359,42 +517,35 @@ " [1.5, 1.5], # item 3 [size, weight]\n", "]\n", "\n", - "# We have two customers whose features are\n", - "# - Budget\n", - "# - Age\n", - "# Customer 1 bought item 1 at session 1 and item 2 at session 3\n", - "# Customer 2 bought item 3 at session 2\n", - "\n", - "choices = [0, 2, 1]\n", - "contexts_items_availabilities = [\n", - " [1, 1, 1], # All items available at session 1\n", - " [1, 1, 1], # All items available at session 2\n", - " [0, 1, 1], # Item 1 not available at session 3\n", - "]\n", - "\n", "contexts_features = [\n", - " [100, 20], # session 1, customer 1 [budget, age]\n", - " [200, 40], # session 2, customer 2 [budget, age]\n", - " [80, 20], # session 3, customer 1 [budget, age]\n", + " [100, 20], # choice 1, customer 1 [budget, age]\n", + " [200, 40], # choice 2, customer 2 [budget, age]\n", + " [80, 20], # choice 3, customer 1 [budget, age]\n", "]\n", "\n", "contexts_items_features = [\n", " [\n", - " [100, 0], # Session 1, Item 1 [price, promotion]\n", - " [140, 0], # Session 1, Item 2 [price, promotion]\n", - " [200, 0], # Session 1, Item 2 [price, promotion]\n", + " [100, 0], # choice 1, Item 1 [price, promotion]\n", + " [140, 0], # choice 1, Item 2 [price, promotion]\n", + " [200, 0], # choice 1, Item 2 [price, promotion]\n", " ],\n", " [\n", - " [100, 0], # Session 2 Item 1 [price, promotion]\n", - " [120, 1], # Session 2, Item 2 [price, promotion]\n", - " [200, 0], # Session 2, Item 2 [price, promotion]\n", + " [100, 0], # choice 2, Item 1 [price, promotion]\n", + " [120, 1], # choice 2, Item 2 [price, promotion]\n", + " [200, 0], # choice 2, Item 2 [price, promotion]\n", " ],\n", " [\n", - " [100, 0], # Session 3, Item 1 [price, promotion], values do not really matter, but needs to exist for shapes sake\n", - " [120, 1], # Session 3, Item 2 [price, promotion]\n", - " [180, 1], # Session 3, Item 2 [price, promotion]\n", + " [100, 0], # choice 3, Item 1 [price, promotion]\n", + " [120, 1], # choice 3, Item 2 [price, promotion]\n", + " [180, 1], # choice 3, Item 2 [price, promotion]\n", " ],\n", - "]\n" + "]\n", + "\n", + "contexts_items_availabilities = [\n", + " [1, 1, 1], # All items available at choice 1\n", + " [1, 1, 1], # All items available at choice 2\n", + " [0, 1, 1], # Item 1 not available at choice 3\n", + "]" ] }, { @@ -403,23 +554,24 @@ "source": [ "Note that in items_features and contexts_items_features, the features need to be well ordered:\n", "- The features are ordered the same for all items\n", - "- The items are ordered the same for items_features and contexts_items_features, and their index is used in choices:\n", + "- The items are ordered in their index given in choices. 
This applies in items_features and contexts_items_features\n", "\n", "\n", - "**items_features** = [[feature_1_item_A, feature_2_item_A, ...], [features_1_item_B, feature_2_item_B, ...], ...]\n", + "**items_features** = [[item1_featureA, item1_featureB, ...], [item2_featureA, item2_featureB, ...], ...]\n", "\n", - "**contexts_items_features** = [[[context_1_feature_1_item_A, ...], [context_1_feature_1_item_B, ...]], [[context_2_feature_1_item_A, ...], [context_2_feature_1_item_B, ...]], ...]\n", + "**contexts_items_features** = [[[context1_item1_featureA, ...], [context1_item2_featureA, ...]], [[context2_item1_featureA, ...], [context2_item2_featureA, ...]], ...]\n", "\n", - "**choices** then represent the index of the item: 0 when item_1 is chose, 1 when item_2, etc..., e.g. [0, 0, 2, 1, ...]" + "**choices** then represent the index of the item: 0 when item1 is chose, 1 when item2, etc..., e.g. [0, 0, 2, 1, ...]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "dataset = ChoiceDataset(\n", + " choices=choices,\n", " fixed_items_features=fixed_items_features,\n", " fixed_items_features_names=[\"size\", \"weight\"], # You can precise the names of the features if you want\n", " contexts_features=contexts_features,\n", @@ -427,13 +579,12 @@ " contexts_items_features=contexts_items_features,\n", " contexts_items_features_names=[\"price\", \"promotion\"], # same, not mandatory\n", " contexts_items_availabilities=contexts_items_availabilities,\n", - " choices=choices,\n", ")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "keep_output": true }, @@ -444,7 +595,7 @@ "(3, 3, 2)" ] }, - "execution_count": null, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -457,13 +608,40 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "ChoiceDataset is indexed by session. You can use [] to subset it.\n", + "ChoiceDataset is indexed by choice. You can use [] to subset it.\n", "It is particularly useful for train/test split:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Some choices never happen in the dataset: {1, 2}\n", + "First choice is:\n", + "Items features: (array([[1. , 2. ],\n", + " [2. , 4. 
],\n", + " [1.5, 1.5]], dtype=float32),)\n", + "Contexts features: (array([100, 20], dtype=int32),)\n", + "Contexts Items features: (array([[100, 0],\n", + " [140, 0],\n", + " [200, 0]], dtype=int32),)\n", + "Contexts Items Availabilities features: [1 1 1]\n", + "Contexts Choices: 0\n" + ] + } + ], + "source": [ + "print(dataset[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": { "keep_output": true }, @@ -492,7 +670,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "If you want to access the features you can use the .iloc function with sessions indexes \n", + "If you want to access the features you can use the .iloc function with choices indexes \n", "It returns the features in this order:\n", "\n", "- items_features (n_items, n_items_features)\n", @@ -519,7 +697,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": { "keep_output": true }, @@ -566,7 +744,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": { "keep_output": true }, @@ -595,23 +773,19 @@ ], "source": [ "# All the features are given for each session, in order to compute utility and NegativeLogLikelihood\n", - "for n_batch, batch in enumerate(dataset.iter_batch(batch_size=1)):\n", - " print(n_batch, batch)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that you will need to use a ChoiceDataset to use the models." + "for i, batch in enumerate(dataset.iter_batch(batch_size=1)):\n", + " print(i, batch)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Stacking features**\n", - "If you need to keep a clear distinction between different features, you can use stacking in the ChoiceDataset. For example if we have two kind of items_features and we do not want them to be within the same np.ndarray we can as follow:" + "**Stacking features when building the ChoiceDataset**\n", + "\n", + "If you need to keep a clear distinction between different features, you can use stacking in the ChoiceDataset. In this case, you need to provide the additional features arrays indexed the same. It is possible to stack: *items_features*, *contexts_features*, *contexts_items_features*.\n", + "\n", + "For example if we have two kind of items_features and we do not want them to be within the same np.ndarray we can as follow:" ] }, { @@ -675,34 +849,6 @@ "dataset.batch[0]" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is possible with:\n", - "- items_features\n", - "- contexts_features\n", - "- contexts_items_features\n", - "As the other should not need any superposition of values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset = ChoiceDataset(\n", - " fixed_items_features=(fixed_items_features, items_features_2), # Here items_features specified as a tuple of the two features lists\n", - " contexts_features=(contexts_features, contexts_features),\n", - " contexts_items_features=(contexts_items_features, contexts_items_features),\n", - " contexts_items_availabilities=contexts_items_availabilities,\n", - " choices=choices,\n", - ")\n", - "\n", - "dataset.batch[0]" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1378,23 +1524,11 @@ "[1] Koppelman et al. (1993), *Application and Interpretation of Nested Logit Models of Intercity Mode Choice*\\\n", "[2] Bierlaire, M., Axhausen, K. and Abay, G. 
(2001), *The Acceptance of Modal Innovation: The Case of SwissMetro*" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "tf_env", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1408,9 +1542,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.9.18" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } From 6ede510164add1758372ae43c478c9fa945fdcd4 Mon Sep 17 00:00:00 2001 From: Emmanuel MALHERBE Date: Mon, 19 Feb 2024 10:40:31 +0100 Subject: [PATCH 06/14] ADD: precommit fixes --- README.md | 2 +- .../choice_learn_introduction_data.ipynb | 188 ++---------------- tests/unit_tests/data/test_choice_dataset.py | 4 +- 3 files changed, 21 insertions(+), 173 deletions(-) diff --git a/README.md b/README.md index 3ae3bf86..ce54f68f 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,7 @@ If you consider this package and any of its feature useful for your research, pl ### License -The use of this software is under the MIT (tbc) license, with no limitation of usage, including for commercial applications. +The use of this software is under the MIT (tbc) license, with no limitation of usage, including for commercial applications. ## References diff --git a/notebooks/choice_learn_introduction_data.ipynb b/notebooks/choice_learn_introduction_data.ipynb index 32096086..9ffd374e 100644 --- a/notebooks/choice_learn_introduction_data.ipynb +++ b/notebooks/choice_learn_introduction_data.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -21,7 +21,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "## ChoiceDataset - Getting Started !\n", "\n", @@ -66,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "keep_output": true }, @@ -189,7 +191,7 @@ "5 3 train 0 83 28.25 50 66 4 70.0 0 2" ] }, - "execution_count": 2, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -249,145 +251,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - " case alt choice dist cost ivt ovt freq income urban noalt \\\n", - "1 1 train 0 83 28.25 50 66 4 45.0 0 2 \n", - "2 1 car 1 83 15.77 61 0 0 45.0 0 2 \n", - "3 2 train 0 83 28.25 50 66 4 25.0 0 2 \n", - "4 2 car 1 83 15.77 61 0 0 25.0 0 2 \n", - "5 3 train 0 83 28.25 50 66 4 70.0 0 2 \n", - "\n", - " is_public \n", - "1 1.0 \n", - "2 0.0 \n", - "3 1.0 \n", - "4 0.0 \n", - "5 1.0 " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "transport_df = canada_transport_df.copy()\n", "items = [\"air\", \"bus\", \"car\", \"train\"]\n", @@ -430,7 +296,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -500,7 +366,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -566,7 +432,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -584,7 +450,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "keep_output": true }, @@ -595,7 +461,7 @@ "(3, 3, 2)" ] }, - "execution_count": 7, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -614,34 +480,16 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Some choices never happen in the dataset: {1, 2}\n", - "First choice is:\n", - "Items features: (array([[1. , 2. ],\n", - " [2. , 4. ],\n", - " [1.5, 1.5]], dtype=float32),)\n", - "Contexts features: (array([100, 20], dtype=int32),)\n", - "Contexts Items features: (array([[100, 0],\n", - " [140, 0],\n", - " [200, 0]], dtype=int32),)\n", - "Contexts Items Availabilities features: [1 1 1]\n", - "Contexts Choices: 0\n" - ] - } - ], + "outputs": [], "source": [ "print(dataset[0])" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "keep_output": true }, @@ -697,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": { "keep_output": true }, @@ -744,7 +592,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": { "keep_output": true }, diff --git a/tests/unit_tests/data/test_choice_dataset.py b/tests/unit_tests/data/test_choice_dataset.py index ed2b495f..3b52297a 100644 --- a/tests/unit_tests/data/test_choice_dataset.py +++ b/tests/unit_tests/data/test_choice_dataset.py @@ -272,7 +272,7 @@ def test_from_df(): fixed_items_features_columns=["items_feat_1", "items_feat_2"], contexts_features_columns=["session_feat_1", "session_feat_2"], contexts_items_features_columns=["session_item_feat_1", "session_item_feat_2"], - choice_mode="items_id", + choice_format="items_id", ) ground_truth_cd = ChoiceDataset( fixed_items_features=fixed_items_features, @@ -312,7 +312,7 @@ def test_from_df(): fixed_items_features_columns=["items_feat_1", "items_feat_2"], contexts_features_columns=["session_feat_1", "session_feat_2"], contexts_items_features_columns=["session_item_feat_1", "session_item_feat_2"], - choice_mode="one_zero", + choice_format="one_zero", ) ground_truth_cd = ChoiceDataset( fixed_items_features=fixed_items_features, From 472d3c49deb6bb5b9c5ea3dc56c81002c845f722 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Tue, 20 Feb 2024 17:37:23 +0100 Subject: [PATCH 07/14] CHANGE: allow 
os.path use --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f4693f3b..258ba107 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ select = [ "PTH", "PD", ] # See: https://beta.ruff.rs/docs/rules/ -ignore = ["D203", "D213", "ANN101", "ANN102", "ANN204", "ANN001", "ANN202", "ANN201", "ANN206", "ANN003"] +ignore = ["D203", "D213", "ANN101", "ANN102", "ANN204", "ANN001", "ANN202", "ANN201", "ANN206", "ANN003", "PTH100", "PTH118", "PTH123"] line-length = 100 target-version = "py310" exclude = [ From f2392b2329baaefb2f31411a7215fbcf9b2b906d Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Tue, 20 Feb 2024 17:37:31 +0100 Subject: [PATCH 08/14] ENH: completed renaming choice-mode -> choice_format --- choice_learn/data/choice_dataset.py | 20 ++++++++++++++----- choice_learn/datasets/base.py | 13 ++++++------ .../choice_learn_introduction_clogit.ipynb | 2 +- notebooks/custom_model.ipynb | 2 +- notebooks/dataset_creation.ipynb | 10 +++++----- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index 72a4b239..232a7c8b 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -614,8 +614,18 @@ def __len__(self): return len(self.choices) def __str__(self): - template = '''First choice is:\nItems features: {}\nContexts features: {}\nContexts Items features: {}\nContexts Items Availabilities: {}\nContexts Choice: {}''' - return template.format(self.batch[0][0], self.batch[0][1], self.batch[0][2], self.batch[0][3], self.batch[0][4]) + """Retursn short representation of ChoiceDataset. + + Returns: + -------- + str + short representation of ChoiceDataset + """ + template = """First choice is:\nItems features: {}\nContexts features: {}\n\ + Contexts Items features: {}\nContexts Items Availabilities: {}\nContexts Choice: {}""" + return template.format( + self.batch[0][0], self.batch[0][1], self.batch[0][2], self.batch[0][3], self.batch[0][4] + ) def get_num_items(self): """Method to access the total number of different items. @@ -725,7 +735,7 @@ def from_single_wide_df( contexts_items_features_suffixes=None, contexts_items_availabilities_suffix=None, choices_column="choice", - choice_mode="items_id", + choice_format="items_id", ): """Builds numpy arrays for ChoiceDataset from a single dataframe in wide format. 
@@ -745,7 +755,7 @@ def from_single_wide_df( Suffixes of the columns of the dataframe that are context-item availabilities, choice_column: str, optional Name of the column containing the choices, default is "choice" - choice_mode: str, optional + choice_format: str, optional How choice is indicated in df, either "items_name" or "items_index", default is "items_id" @@ -810,7 +820,7 @@ def from_single_wide_df( contexts_items_availabilities = None choices = df[choices_column] - if choice_mode == "items_id": + if choice_format == "items_id": choices = np.squeeze([np.where(items_id == c)[0] for c in choices]) return ChoiceDataset( diff --git a/choice_learn/datasets/base.py b/choice_learn/datasets/base.py index 4ac6930d..3c0f189a 100644 --- a/choice_learn/datasets/base.py +++ b/choice_learn/datasets/base.py @@ -8,8 +8,7 @@ from choice_learn.data.choice_dataset import ChoiceDataset - -DATA_MODULE = os.path.join(os.path.abspath('..'), 'choice_learn', 'datasets', 'data') +DATA_MODULE = os.path.join(os.path.abspath(".."), "choice_learn", "datasets", "data") def load_csv(data_file_name, data_module=DATA_MODULE, encoding="utf-8"): @@ -195,7 +194,7 @@ def load_modecanada( add_is_public=False, as_frame=False, return_desc=False, - choice_mode="one_zero", + choice_format="one_zero", split_features=False, to_wide=False, ): @@ -214,8 +213,8 @@ def load_modecanada( by default False. return_desc : bool, optional Whether to return the description, by default False. - choice_mode : str, optional, among ["one_zero", "items_id"] - mode indicating how the choice is encoded, by default "one_zero". + choice_format : str, optional, among ["one_zero", "items_id"] + format indicating how the choice is encoded, by default "one_zero". split_features : bool, optional Whether to split features by type in different dataframes, by default False. 
to_wide : bool, optional @@ -270,7 +269,7 @@ def load_modecanada( for col in canada_df.columns: canada_df[col] = pd.to_numeric(canada_df[col], errors="ignore") - if choice_mode == "items_id": + if choice_format == "items_id": # We need to transform how the choice is encoded to add the chosen item id named_choice = [0] * len(canada_df) for n_row, row in canada_df.iterrows(): @@ -364,5 +363,5 @@ def load_modecanada( items_id_column="alt", contexts_id_column="case", choices_column=choice_column, - choice_mode="one_zero", + choice_format="one_zero", ) diff --git a/notebooks/choice_learn_introduction_clogit.ipynb b/notebooks/choice_learn_introduction_clogit.ipynb index bd20a3e0..5773a4ab 100644 --- a/notebooks/choice_learn_introduction_clogit.ipynb +++ b/notebooks/choice_learn_introduction_clogit.ipynb @@ -71,7 +71,7 @@ " items_id_column=\"alt\",\n", " contexts_id_column=\"case\",\n", " choices_column=\"choice\",\n", - " choice_mode=\"one_zero\")" + " choice_format=\"one_zero\")" ] }, { diff --git a/notebooks/custom_model.ipynb b/notebooks/custom_model.ipynb index 3cfda8ec..606c1caf 100644 --- a/notebooks/custom_model.ipynb +++ b/notebooks/custom_model.ipynb @@ -82,7 +82,7 @@ " items_id_column=\"alt\",\n", " contexts_id_column=\"case\",\n", " choices_column=\"choice\",\n", - " choice_mode=\"one_zero\")" + " choice_format=\"one_zero\")" ] }, { diff --git a/notebooks/dataset_creation.ipynb b/notebooks/dataset_creation.ipynb index ea94a78f..5423c436 100644 --- a/notebooks/dataset_creation.ipynb +++ b/notebooks/dataset_creation.ipynb @@ -246,7 +246,7 @@ " contexts_id_column=\"case\",\n", " choices_column=\"choice\",\n", " # the choice columns indicates if the item is chosen (1) or not (0)\n", - " choice_mode=\"one_zero\",\n", + " choice_format=\"one_zero\",\n", " )\n", "print(dataset.summary())" ] @@ -402,7 +402,7 @@ } ], "source": [ - "canada_df = load_modecanada(as_frame=True, add_is_public=True, choice_mode=\"items_id\")\n", + "canada_df = load_modecanada(as_frame=True, add_is_public=True, choice_format=\"items_id\")\n", "canada_df.head()" ] }, @@ -411,7 +411,7 @@ "metadata": {}, "source": [ "This time, the choice is not given by ones and zeros but actually names for each context which alternative (item) has been chosen.\n", - "The ChoiceDataset handles this case easily, by specifying 'choice_mode=\"items_id\"'." + "The ChoiceDataset handles this case easily, by specifying 'choice_format=\"items_id\"'." 
] }, { @@ -460,7 +460,7 @@ " contexts_id_column=\"case\",\n", " choices_column=\"choice\",\n", " # the choice columns indicates the id of the chosen item\n", - " choice_mode=\"items_id\",\n", + " choice_format=\"items_id\",\n", " )\n", "print(dataset.summary())" ] @@ -503,7 +503,7 @@ " contexts_items_features_suffixes=[\"CO\", \"TT\", \"HE\", \"SEATS\"],\n", " contexts_items_availabilities_suffix=\"AV\", # [\"TRAIN_AV\", \"SM_AV\", \"CAR_AV\"] also works\n", " choices_column=\"CHOICE\",\n", - " choice_mode=\"item_index\",\n", + " choice_format=\"item_index\",\n", ")" ] }, From 0a568df66ab54023bc610483284ec123b587ef19 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Tue, 20 Feb 2024 18:25:17 +0100 Subject: [PATCH 09/14] minor change --- choice_learn/data/choice_dataset.py | 5 +++-- notebooks/choice_learn_introduction_data.ipynb | 7 ++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index 232a7c8b..2d3a519f 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -621,8 +621,9 @@ def __str__(self): str short representation of ChoiceDataset """ - template = """First choice is:\nItems features: {}\nContexts features: {}\n\ - Contexts Items features: {}\nContexts Items Availabilities: {}\nContexts Choice: {}""" + template = """First choice is:\nItems features: {}\nContexts features: {}\n + Contexts Items features: {}\nContexts Items Availabilities: {}\n + Contexts Choice: {}""" return template.format( self.batch[0][0], self.batch[0][1], self.batch[0][2], self.batch[0][3], self.batch[0][4] ) diff --git a/notebooks/choice_learn_introduction_data.ipynb b/notebooks/choice_learn_introduction_data.ipynb index 9ffd374e..aa0b3deb 100644 --- a/notebooks/choice_learn_introduction_data.ipynb +++ b/notebooks/choice_learn_introduction_data.ipynb @@ -498,9 +498,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "No features_by_ids given.\n", "Some choices never happen in the dataset: {1}\n", - "No features_by_ids given.\n", "Some choices never happen in the dataset: {0, 2}\n", "Train Dataset length: 2 Test Dataset lenght: 1\n" ] @@ -743,8 +741,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Usual Supermakerket Features Shape: (18, 2)\n", - "No features_by_ids given.\n" + "Usual Supermakerket Features Shape: (18, 2)\n" ] } ], @@ -1390,7 +1387,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.11.4" } }, "nbformat": 4, From 76b8a976149b16120d77ecc6def66e36d1c6be64 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Thu, 14 Mar 2024 15:51:09 +0100 Subject: [PATCH 10/14] ADD: checked modif of choice_mode > choice_format --- choice_learn/data/choice_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index 67db8272..ec0df1e1 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -629,7 +629,7 @@ def __len__(self): return len(self.choices) def __str__(self): - """Retursn short representation of ChoiceDataset. + """Returns short representation of ChoiceDataset. 
Returns: -------- @@ -917,7 +917,7 @@ def from_single_wide_df( choices = df[choices_column].to_numpy() if choice_format == "items_id": if items_id is None: - raise ValueError("items_id must be given to use choice_mode 'items_id'") + raise ValueError("items_id must be given to use choice_format='items_id'") items_id = np.array(items_id) choices = np.squeeze([np.where(items_id == c)[0] for c in choices]) From 426a5cf53d231ec0ca162d1534d7d05ae81a2759 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Thu, 14 Mar 2024 16:00:26 +0100 Subject: [PATCH 11/14] ENH: citations names consistency --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 40c181f7..dd47d7ee 100644 --- a/README.md +++ b/README.md @@ -182,7 +182,7 @@ The use of this software is under the MIT license, with no limitation of usage, [2][The Acceptance of Model Innovation: The Case of Swissmetro](https://www.researchgate.net/publication/37456549_The_acceptance_of_modal_innovation_The_case_of_Swissmetro), Bierlaire, M.; Axhausen, K., W.; Abay, G. (2001)\ [3][Applications and Interpretation of Nested Logit Models of Intercity Mode Choice](https://trid.trb.org/view/385097), Forinash, C., V.; Koppelman, F., S. (1993)\ [4][The Demand for Local Telephone Service: A Fully Discrete Model of Residential Calling Patterns and Service Choices](https://www.jstor.org/stable/2555538), Train K., E.; McFadden, D., L.; Moshe, B. (1987)\ -[5] [Estimation of Travel Choice Models with Randomly Distributed Values of Time](https://ideas.repec.org/p/fth/lavaen/9303.html), Ben-Akiva M; Bolduc D; Bradley M(1993)\ +[5] [Estimation of Travel Choice Models with Randomly Distributed Values of Time](https://ideas.repec.org/p/fth/lavaen/9303.html), Ben-Akiva, M.; Bolduc, D.; Bradley, M. (1993)\ [6] [Personalize Expedia Hotel Searches - ICDM 2013](https://www.kaggle.com/c/expedia-personalized-sort), Ben Hamner, A.; Friedman, D.; SSA_Expedia. (2013) ### Code and Repositories From bd516ccc325b1861b2323ed8de5df9ed36e6362a Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Thu, 14 Mar 2024 17:44:13 +0100 Subject: [PATCH 12/14] ADD: MIT License --- LICENSE.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 LICENSE.md diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 00000000..14572cbb --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2023 The choice-learn developers, artefactory +All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
From 7bcb93801f3e488b89a46317dc2f5822619394be Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Thu, 14 Mar 2024 18:56:33 +0100 Subject: [PATCH 13/14] ENH: smal enhancement use of added prefice ;) --- choice_learn/datasets/base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/choice_learn/datasets/base.py b/choice_learn/datasets/base.py index 7669e138..91473783 100644 --- a/choice_learn/datasets/base.py +++ b/choice_learn/datasets/base.py @@ -38,7 +38,7 @@ def get_path(data_file_name, module=DATA_MODULE): return path -def load_csv(data_file_name, data_module=DATA_MODULE, encoding="utf-8"): +def load_csv(data_file_name, data_module=OS_DATA_MODULE, encoding="utf-8"): """Base function to load csv files. Parameters @@ -67,7 +67,7 @@ def load_csv(data_file_name, data_module=DATA_MODULE, encoding="utf-8"): return names, np.stack(data) -def load_gzip(data_file_name, data_module=DATA_MODULE, encoding="utf-8"): +def load_gzip(data_file_name, data_module=OS_DATA_MODULE, encoding="utf-8"): """Base function to load zipped .csv.gz files. Parameters @@ -750,6 +750,7 @@ def load_train( if as_frame: return train_df train_df["choice"] = train_df.apply(lambda row: row.choice[-1], axis=1) + """ train_df = train_df.rename( columns={ "price1": "1_price", @@ -766,13 +767,14 @@ def load_train( "comfort2": "2_comfort", } ) - + """ return ChoiceDataset.from_single_wide_df( df=train_df, items_id=["1", "2"], fixed_items_suffixes=None, contexts_features_columns=["id"], - contexts_items_features_suffixes=["price", "time", "change", "comfort"], + contexts_items_features_prefixes=["price", "time", "change", "comfort"], + delimiter="", contexts_items_availabilities_suffix=None, choices_column="choice", choice_format="items_id", From 2951eb2a17d3c31246209dab71acd8a3a8cc5d48 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Thu, 14 Mar 2024 18:58:25 +0100 Subject: [PATCH 14/14] FIX: iteration over prefixes was done on suffixes.... --- choice_learn/data/choice_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index ec0df1e1..63234a3e 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -867,7 +867,7 @@ def from_single_wide_df( contexts_items_features = [] for item in items_id: columns = [ - f"{feature}{delimiter}{item}" for feature in contexts_items_features_suffixes + f"{feature}{delimiter}{item}" for feature in contexts_items_features_prefixes ] for col in columns: if col not in df.columns: