From 98018919d2030dc7c1e5a8f10a9a7f278c973557 Mon Sep 17 00:00:00 2001
From: VincentAuriau <auriau.vincent@gmail.com>
Date: Thu, 23 May 2024 21:01:46 +0200
Subject: [PATCH] refacto from OverLeaf

---
 docs/paper/memory_usage.ipynb | 321 +++++++++++++++++++++++++++++++++-
 docs/paper/paper.bib          | 143 +++++++++++----
 docs/paper/paper.md           | 177 +++++++++----------
 3 files changed, 508 insertions(+), 133 deletions(-)

diff --git a/docs/paper/memory_usage.ipynb b/docs/paper/memory_usage.ipynb
index 3aeb2708..b9df005a 100644
--- a/docs/paper/memory_usage.ipynb
+++ b/docs/paper/memory_usage.ipynb
@@ -29,7 +29,7 @@
     "\n",
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
-    "import pandas as pf\n",
+    "import pandas as pd\n",
     "\n",
     "from choice_learn.datasets import load_expedia\n",
     "from choice_learn.data import ChoiceDataset\n"
@@ -134,6 +134,8 @@
     "\n",
     "plt.plot(ds_lengths, dense_sizes, label='w/o FeaturesById - (10, 10)', c=\"darkblue\")\n",
     "plt.plot(ds_lengths, fbid_sizes, label='w/ FeaturesById - (10, 10)', c=\"turquoise\")\n",
+    "plt.scatter(ds_lengths, dense_sizes, c=\"darkblue\")\n",
+    "plt.scatter(ds_lengths, fbid_sizes, c=\"turquoise\")\n",
     "plt.yscale(\"log\")\n",
     "plt.xscale(\"log\")\n",
     "plt.xlabel(\"Dataset Size\")\n",
@@ -161,6 +163,8 @@
     "\n",
     "plt.plot(ds_lengths, dense_sizes, label='w/o FeaturesById - (100, 100)', c=\"cornflowerblue\")\n",
     "plt.plot(ds_lengths, fbid_sizes, label='w/ FeaturesById - (100, 100)', c=\"teal\")\n",
+    "plt.scatter(ds_lengths, dense_sizes, c=\"cornflowerblue\")\n",
+    "plt.scatter(ds_lengths, fbid_sizes, c=\"teal\")\n",
     "plt.yscale(\"log\")\n",
     "plt.xscale(\"log\")\n",
     "plt.xlabel(\"Dataset Size\")\n",
@@ -168,6 +172,13 @@
     "plt.legend()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Expedia Dataset"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -431,7 +442,7 @@
    "source": [
     "import pandas as pd\n",
     "\n",
-    "df = pd.read_csv(\"../../choice_learn/datasets/data/expedia_rumnet_preprocessing.csv\", engine=\"pyarrow\")"
+    "df = load_expedia(as_frame=True)"
    ]
   },
   {
@@ -442,7 +453,7 @@
    "source": [
     "site_id_one_hot = pd.get_dummies(df.site_id, prefix=\"site_id\")\n",
     "visitor_location_country_id_one_hot = pd.get_dummies(df.visitor_location_country_id, prefix=\"visitor_location_country_id\")\n",
-    "srch_destination_id_one_hot =pd.get_dummies(df.srch_destination_id, prefix=\"srch_destination_id\")\n",
+    "srch_destination_id_one_hot = pd.get_dummies(df.srch_destination_id, prefix=\"srch_destination_id\")\n",
     "prop_country_id_one_hpt = pd.get_dummies(df.prop_country_id, prefix=\"prop_country_id\")\n",
     "df = pd.concat([df, site_id_one_hot, visitor_location_country_id_one_hot, srch_destination_id_one_hot, prop_country_id_one_hpt], axis=1)\n"
    ]
@@ -535,6 +546,295 @@
     "    wide_df_memory_size.append(get_obj_size(sub_wide_df))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Torch-Choice\n",
+    "\n",
+    "For this part you will need the torch-choice package: ```pip install torch-choice```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from torch_choice.utils.easy_data_wrapper import EasyDatasetWrapper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "long_df = long_df.reset_index(drop=True)\n",
+    "long_df.sort_values(\"srch_id\", inplace=True, ignore_index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "long_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "items_ids = []\n",
+    "for nit in long_df.srch_id.value_counts().sort_index():\n",
+    "    items_ids.append(np.arange(nit))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "long_df[\"items_id\"] = np.concatenate(items_ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "long_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mem_sizes = []\n",
+    "data_lengths = [100, 1000, 10000, 100000, 397618]\n",
+    "for length in data_lengths:\n",
+    "    ids = long_df.srch_id.unique()[:length]\n",
+    "    sub_long_df = long_df[long_df.srch_id.isin(ids)].copy(deep=True)\n",
+    "    print(get_obj_size(sub_long_df))\n",
+    "\n",
+    "    data_1 = EasyDatasetWrapper(main_data=sub_long_df,\n",
+    "                            purchase_record_column='srch_id',\n",
+    "                            choice_column='booking_bool',\n",
+    "                            item_name_column='items_id',\n",
+    "                            session_index_column='srch_id',\n",
+    "                            user_index_column='srch_id',\n",
+    "                            # it can be derived from columns of the dataframe or supplied as\n",
+    "                            user_observable_columns=['srch_length_of_stay',\n",
+    "                                                    'srch_adults_count',\n",
+    "                                                    'srch_children_count',\n",
+    "                                                    'srch_room_count',\n",
+    "                                                    'srch_saturday_night_bool'],\n",
+    "                            price_observable_columns=['log_price'],\n",
+    "                            device=\"cpu\")\n",
+    "    mem_sizes.append(get_obj_size(data_1))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mem_sizes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_2 = EasyDatasetWrapper(main_data=long_df,\n",
+    "                            purchase_record_column='srch_id',\n",
+    "                            choice_column='booking_bool',\n",
+    "                            item_name_column='items_id',\n",
+    "                            session_index_column='srch_id',\n",
+    "                            user_index_column='srch_id',\n",
+    "                            # it can be derived from columns of the dataframe or supplied as\n",
+    "                            user_observable_columns=['srch_length_of_stay',\n",
+    "                                                    'srch_adults_count',\n",
+    "                                                    'srch_children_count',\n",
+    "                                                    'srch_room_count',\n",
+    "                                                    'srch_saturday_night_bool'],\n",
+    "                            price_observable_columns=['log_price'],\n",
+    "                            device=\"cpu\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "a = sub_long_df.to_numpy()\n",
+    "b = long_df.to_numpy()\n",
+    "get_obj_size(a), get_obj_size(b)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sys.getsizeof(a)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "get_obj_size(np.ones((1000, 1000)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "get_obj_size(long_df.to_numpy()), get_obj_size(sub_long_df.to_numpy())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sub_long_df == long_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mem_sizes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from torch_choice.data import ChoiceDataset as torch_dataset\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "wo_fbid_dataset.choices"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mem_prints = []\n",
+    "for length in data_lengths:\n",
+    "    tdata = torch_dataset(\n",
+    "        item_index=torch.LongTensor(wo_fbid_dataset.choices[:length]),\n",
+    "        # user_index=np.arange(len(wo_fbid_dataset)),\n",
+    "        session_index=torch.LongTensor(np.arange(len(wo_fbid_dataset))[:length]),\n",
+    "        item_availability=torch.LongTensor(wo_fbid_dataset.available_items_by_choice[:length]),\n",
+    "\n",
+    "        session_obs=torch.LongTensor(wo_fbid_dataset.shared_features_by_choice[0][:length]),\n",
+    "        price_obs=torch.LongTensor(wo_fbid_dataset.items_features_by_choice[0][:length]),\n",
+    "    )\n",
+    "    print(len(tdata))\n",
+    "    references = gc.get_referents(tdata)\n",
+    "    print(references)\n",
+    "    memsize = 0\n",
+    "    if isinstance(references, list):\n",
+    "        for el in references:\n",
+    "            try:\n",
+    "                memsize += get_obj_size(el.storage())\n",
+    "            except:\n",
+    "                pass\n",
+    "    else:\n",
+    "        for el in references[0].values():\n",
+    "            try:\n",
+    "                memsize += get_obj_size(el.storage())\n",
+    "            except:\n",
+    "                pass\n",
+    "    mem_prints.append(memsize)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mem_prints"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "memsize = 0\n",
+    "for el in gc.get_referents(tdata)[0].values():\n",
+    "    try:\n",
+    "        memsize += get_obj_size(el.storage())\n",
+    "    except:\n",
+    "        pass"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "memsize"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "get_obj_size(tdata.item_availability)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "get_obj_size(tdata)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -545,6 +845,7 @@
     "|---|---|---|---|---|---|\n",
     "| CD w. FeaturesByIDs | 85.640 | 771.440 | 7.629.440 | 76.209.440 | 302.994.356 |\n",
     "| CD wo FeaturesByIDs | 220.400 | 2.198.600 | 21.980.600 | 219.800.600 | 873.964.964 |\n",
+    "| Torch-Choice | 1.249.040 | 12.481.040 | 124.801.040 | 1.248.001.040 | 4.962.273.680 |\n",
     "| Long format DF | 5.521.360 | 52.463.080 | 524.667.470 | 5.234.198.450 | 20.815.361.140 |\n",
     "| Wide format DF | 3.503.109 | 31.784.709 | 314.600.709 | 3.142.760.709 | 12.495.108.741 |\n",
     "\n",
@@ -552,6 +853,7 @@
     "data_lengths: [100, 1000, 10000, 100000, 397618]\\\n",
     "ChoiceDataset with FeaturesByIDs: [85640, 771440, 7629440, 76209440, 302994356]\\\n",
     "ChoiceDataset without FeaturesByIDs: [220400, 2198600, 21980600, 219800600, 873964964]\\\n",
+    "Torch-Choice: [5825168, 55093676, 550480667, 5515236600, 10448857871]\\\n",
     "DF Long Format: [5521360, 52463080, 524667470, 5234198450, 20815361140]\\\n",
     "DF Wide Format: [3503109, 31784709, 314600709, 3142760709, 12495108741]"
    ]
@@ -575,7 +877,8 @@
     "cd_wo =  [220400, 2198600, 21980600, 219800600, 873964964]\n",
     "df_long =  [5521360, 52463080, 524667470, 5234198450, 20815361140]\n",
     "df_wide =  [6252516976, 6266664976, 6408144976, 7822944976, 12501499936]\n",
-    "df_wide = [3503109, 31784709, 314600709, 3142760709, 12495108741]"
+    "df_wide = [3503109, 31784709, 314600709, 3142760709, 12495108741]\n",
+    "torch_choice = [1249040, 12481040, 124801040, 1248001040, 4962273680]"
    ]
   },
   {
@@ -603,10 +906,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "plt.plot(data_lengths, cd_with, label=\"ChoiceLearn\", c=\"teal\")\n",
-    "# plt.plot(data_lengths, cd_wo, label=\"ChoiceDataset withOUT FeaturesByIDs\", c=\"turquoise\")\n",
+    "plt.plot(data_lengths, cd_with, label=\"Choice-Learn\", c=\"teal\")\n",
+    "plt.plot(data_lengths, torch_choice, label=\"Torch-Choice\", c=\"turquoise\")\n",
     "plt.plot(data_lengths, df_long, label=\"PyLogit (long format)\", c=\"darkblue\")\n",
     "plt.plot(data_lengths, df_wide, label=\"Biogeme (wide format)\", c=\"cornflowerblue\")\n",
+    "plt.scatter(data_lengths, cd_with, c=\"teal\")\n",
+    "plt.scatter(data_lengths, torch_choice, c=\"turquoise\")\n",
+    "plt.scatter(data_lengths, df_long, c=\"darkblue\")\n",
+    "plt.scatter(data_lengths, df_wide, c=\"cornflowerblue\")\n",
     "plt.legend()\n",
     "plt.yscale(\"log\")\n",
     "plt.xscale(\"log\")\n",
@@ -716,7 +1023,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.11.5"
   }
  },
  "nbformat": 4,
diff --git a/docs/paper/paper.bib b/docs/paper/paper.bib
index 97baba2e..49a26a2b 100644
--- a/docs/paper/paper.bib
+++ b/docs/paper/paper.bib
@@ -22,22 +22,18 @@ @article{Brathwaite:2018
    pages={78–112}
 }
 
-@misc{Du:2023,
+@article{Du:2023,
       title={Torch-Choice: A PyTorch Package for Large-Scale Choice Modelling with Python},
       author={Tianyu Du and Ayush Kanodia and Susan Athey},
       year={2023},
-      eprint={2304.01906},
-      archivePrefix={arXiv},
-      primaryClass={cs.LG}
+  journal={arXiv preprint arXiv:{2304.01906}},
 }
 
-@misc{Aouad:2023,
-      title={Representing Random Utility Choice Models with Neural Networks},
-      author={Ali Aouad and Antoine Désir},
-      year={2023},
-      eprint={2207.12877},
-      archivePrefix={arXiv},
-      primaryClass={cs.LG}
+@article{Aouad:2023,
+  title={Representing random utility choice models with neural networks},
+  author={Aouad, Ali and D{\'e}sir, Antoine},
+  journal={arXiv preprint arXiv:2207.12877},
+  year={2022}
 }
 
 @article{Han:2022,
@@ -53,13 +49,12 @@ @article{Han:2022
 keywords = {Discrete choice models, Neural networks, Taste heterogeneity, Interpretability, Utility specification, Machine learning, Deep learning},
 abstract = {Discrete choice models (DCMs) require a priori knowledge of the utility functions, especially how tastes vary across individuals. Utility misspecification may lead to biased estimates, inaccurate interpretations and limited predictability. In this paper, we utilize a neural network to learn taste representation. Our formulation consists of two modules: a neural network (TasteNet) that learns taste parameters (e.g., time coefficient) as flexible functions of individual characteristics; and a multinomial logit (MNL) model with utility functions defined with expert knowledge. Taste parameters learned by the neural network are fed into the choice model and link the two modules. Our approach extends the L-MNL model (Sifringer et al., 2020) by allowing the neural network to learn the interactions between individual characteristics and alternative attributes. Moreover, we formalize and strengthen the interpretability condition — requiring realistic estimates of behavior indicators (e.g., value-of-time, elasticity) at the disaggregated level, which is crucial for a model to be suitable for scenario analysis and policy decisions. Through a unique network architecture and parameter transformation, we incorporate prior knowledge and guide the neural network to output realistic behavior indicators at the disaggregated level. We show that TasteNet-MNL reaches the ground-truth model’s predictability and recovers the nonlinear taste functions on synthetic data. Its estimated value-of-time and choice elasticities at the individual level are close to the ground truth. In contrast, exemplary logit models with misspecified systematic utility lead to biased parameter estimates and lower prediction accuracy. On a publicly available Swissmetro dataset, TasteNet-MNL outperforms benchmarking MNLs and Mixed Logit model’s predictability. It learns a broader spectrum of taste variations within the population and suggests a higher average value-of-time. 
Our source code is available for research and application.}
 }
-@misc{Salvadé:2024,
-      title={RUMBoost: Gradient Boosted Random Utility Models},
-      author={Nicolas Salvadé and Tim Hillel},
-      year={2024},
-      eprint={2401.11954},
-      archivePrefix={arXiv},
-      primaryClass={cs.LG}
+
+@article{Salvadé:2024,
+  title={RUMBoost: Gradient Boosted Random Utility Models},
+  author={Salvad{\'e}, Nicolas and Hillel, Tim},
+  journal={arXiv preprint arXiv:2401.11954},
+  year={2024}
 }
 
 @article{Train:1987,
@@ -121,13 +116,13 @@ @Inbook{Nocedal:2006
 url="https://doi.org/10.1007/978-0-387-40065-5_7"
 }
 
-@misc{Kingma:2017,
+@article{Kingma:2017,
       title={Adam: A Method for Stochastic Optimization},
       author={Diederik P. Kingma and Jimmy Ba},
       year={2017},
-      eprint={1412.6980},
       archivePrefix={arXiv},
-      primaryClass={cs.LG}
+      primaryClass={cs.LG},
+  journal={arXiv preprint arXiv:{1412.6980}},
 }
 
 @article{Tieleman:2012,
@@ -138,18 +133,106 @@ @article{Tieleman:2012
   year={2012}
 }
 
-@misc{Expedia:2013,
-      title={Personalize Expedia Hotel Searches - ICDM 2013},
+@article{Expedia:2013,
+      title={Personalize Expedia Hotel Searches},
       author={Ben Hamner, Adam and Friedman, Dan},
+  journal={ICDM},
       year={2013},
       eprint={https://www.kaggle.com/c/expedia-personalized-sort},
       URL={https://www.kaggle.com/c/expedia-personalized-sort},
 }
-@misc{AouadMarket:2023,
-      title={Market Segmentation Trees},
-      author={Ali Aouad and Adam N. Elmachtoub and Kris J. Ferreira and Ryan McNellis},
-      year={2023},
-      eprint={1906.01174},
-      archivePrefix={arXiv},
-      primaryClass={stat.AP}
+
+@article{AouadMarket:2023,
+  title={Market segmentation trees},
+  author={Aouad, Ali and Elmachtoub, Adam N and Ferreira, Kris J and McNellis, Ryan},
+  journal={Manufacturing \& Service Operations Management},
+  volume={25},
+  number={2},
+  pages={648--667},
+  year={2023},
+  publisher={INFORMS}
+}
+
+@article{MendezDiaz:2014,
+title = {A branch-and-cut algorithm for the latent-class logit assortment problem},
+journal = {Discrete Applied Mathematics},
+volume = {164},
+pages = {246--263},
+year = {2014},
+note = {Combinatorial Optimization},
+issn = {0166-218X},
+doi = {10.1016/j.dam.2012.03.003},
+url = {https://www.sciencedirect.com/science/article/pii/S0166218X12001072},
+author = {Isabel Méndez-Díaz and Juan José Miranda-Bront and Gustavo Vulcano and Paula Zabala},
+keywords = {Retail operations, Revenue management, Choice behavior, Multinomial logit, Integer programming, Fractional programming},
+abstract = {We study the product assortment problem of a retail operation that faces a stream of customers who are heterogeneous with respect to preferences. Each customer belongs to a market segment characterized by a consideration set that includes the alternatives viewed as options, and by the preference weights that the segment assigns to each of those alternatives. Upon arrival, he checks the offer set displayed by the firm, and either chooses one of those products or quits without purchasing according to a multinomial-logit (MNL) criterion. The firm’s goal is to maximize the expected revenue extracted during a fixed time horizon. This problem also arises in the growing area of choice-based, network revenue management, where computational speed is a critical factor for the practical viability of a solution approach. This so-called latent-class, logit assortment problem is known to be NP-Hard. In this paper, we analyze unconstrained and constrained (i.e., with a limited number of products to display) versions of it, and propose a branch-and-cut algorithm that is computationally fast and leads to (nearly) optimal solutions.}
+}
+
+@software{pandas:2020,
+    author       = {The pandas development team},
+    title        = {pandas-dev/pandas: Pandas},
+    month        = feb,
+    year         = 2020,
+    publisher    = {Zenodo},
+    version      = {latest},
+    doi          = {10.5281/zenodo.3509134},
+    url          = {https://doi.org/10.5281/zenodo.3509134}
+}
+
+@inproceedings{Bierlaire:2001,
+  title={The acceptance of modal innovation: The case of Swissmetro},
+  author={Bierlaire, Michel and Axhausen, Kay and Abay, Georg},
+  booktitle={Swiss Transport Research Conference},
+  year={2001}
+}
+
+@article{Pedregosa:2011,
+  title={Scikit-learn: Machine Learning in {P}ython},
+  author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
+          and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
+          and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
+          Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
+  journal={Journal of Machine Learning Research},
+  volume={12},
+  pages={2825--2830},
+  year={2011}
+}
+
+@misc{Helveston:2023,
+title={Convert data from wide to long format},
+howpublished={\url{https://xlogit.readthedocs.io/en/latest/notebooks/convert_data_wide_to_long.html}},
+author={Forsythe, C. and Helveston, J.}
+}
+@misc{Chollet:2015,
+  title={Keras},
+  author={Chollet, Fran\c{c}ois and others},
+  year={2015},
+  howpublished={\url{https://keras.io}},
+}
+
+@article{McFadden:2000,
+  title={Mixed MNL models for discrete response},
+  author={McFadden, Daniel and Train, Kenneth},
+  journal={Journal of Applied Econometrics},
+  volume={15},
+  number={5},
+  pages={447--470},
+  year={2000},
+  publisher={Wiley Online Library}
+}
+
+@misc{Gurobi:2023,
+  author = {{Gurobi Optimization, LLC}},
+  title = {{Gurobi Optimizer Reference Manual}},
+  year = 2023,
+  url = "https://www.gurobi.com"
+}
+
+@software{ORTools:2024,
+  title = {OR-Tools},
+  version = {v9.9},
+  author = {Laurent Perron and Vincent Furnon},
+  organization = {Google},
+  url = {https://developers.google.com/optimization/},
+  date = {2024-03-07}
 }
diff --git a/docs/paper/paper.md b/docs/paper/paper.md
index ff826e0d..20fa711c 100644
--- a/docs/paper/paper.md
+++ b/docs/paper/paper.md
@@ -1,5 +1,6 @@
 ---
-title: '`Choice-Learn`: A Python package for generic choice modelling with large datasets.'
+title: '`Choice-Learn`: Large-scale choice modeling for operational contexts through the lens of machine learning'
+
 # Idea to introduce: ML&Classical, toolbox
 # A Python Toolbox for generic and custom choice modelling ?
 tags:
@@ -8,33 +9,25 @@ tags:
   - decision
 authors:
   - name: Vincent Auriau
-    corresponding: true # (This is how to denote the corresponding author)
+    corresponding: true
     orcid: 0000-0000-0000-0000
     affiliation: "1, 2"
   - name: Emmanuel Malherbe
     affiliation: 2
-  - name: Maxime Lutel
-    affiliation: 2
-  - name: Martin Mozina
-    affiliation: 3
   - name: Ali Aouad
-    affiliation: 4
+    affiliation: 3
   - name: Antoine Désir
-    affiliation: 5
-  - name: Vincent Mousseau
-    affiliation: 1
+    affiliation: 4
 affiliations:
  - name: CentraleSupélec, Université Paris-Saclay, France
    index: 1
  - name: Artefact Research Center, France
    index: 2
- - name: Fortenova Group, Croatia
+ - name: London Business School, United Kingdom
    index: 3
- - name: London Business School, Great-Britain
-   index: 4
  - name: INSEAD, France
-   index: 5
-date: 29 March 2024
+   index: 4
+date: 23 May 2024
 bibliography: paper.bib
 output: paper_pdf
 
@@ -42,131 +35,123 @@ output: paper_pdf
 
 # Introduction
 
-Discrete choice models aim to explain or predict choices made by individuals from a set of alternatives, i.e. an assortment. Well known use-cases include analyzing a commuter's choice of transportation mode or modelling in-stores or online products purchases. A key feature of choice models is their ability to handle varying assortments, where some alternatives may be unavailable for the choice maker. Choice models are often used to estimate interpretable coefficients of consumer's utility function such as a own - or cross - price elasticities. Another practical usage is to plug a fitted choice model into an optimization process, in order to inform operational decisions. For example, assortment optimization or pricing can be formulated as linear programming optimization problems for certain classical parametric choice models. While traditional specifications of choice models are restricted to a linear form, recent advances based on Machine-Learning algorithms call for the use of more complex models that can be fitted to larger datasets.
+Discrete choice models aim at predicting choice decisions made by individuals from a menu of alternatives, which is known as an assortment. Well-known use cases include predicting a commuter's choice of transportation mode or a customer's in-store or online purchases. A key capability of choice models is their ability to handle assortment variations, such as predicting choices when some alternatives become unavailable or when their features change in different operational contexts. This adaptability to different scenarios allows these models to be used as inputs for optimization problems, such as assortment planning or pricing.
 
-`Choice-Learn` provides a scalable and modular suite of choice modelling tools for practitioners and academic researchers. In order to offer a high flexibility while keeping a simple signature, the package is organized around two levels of interaction. The higher-level API allows a fast integration of choice datasets, and the specification and estimation of standard logit-based choice models. The lower level API offers capabilities to optimize memory usage or customize model formulations. `Choice-Learn` focuses on three main features that complement and extend existing:
+Choice-Learn provides a modular suite of choice modeling tools for practitioners and academic researchers to process choice data, and then formulate, estimate and operationalize choice models. The library is structured into two levels of usage, as illustrated in Figure \ref{fig:gen_org}. The higher level is designed for fast and easy implementation and the lower level enables more advanced customization. This structure is inspired by Keras [@Chollet:2015], which is an overlay of TensorFlow [@Abadi:2015] endpoints, enabling a user-friendly modeling interface. Choice-Learn was designed with the following objectives:
 
-- *Large-scale datasets*: optimized RAM usage and batching processes for very large-scale datasets
-- *Model family*: Handling both parametric families of choice models and Machine Learning-based formulations within the same codebase
-- *Tools*: to estimate, deploy, evaluate and use choice models
+- **Streamlined:** The code signature is kept simple for fast integration of datasets and estimation of standard models. The higher-level API can be used with minimal code.
+- **Scalable:** Optimized processes are implemented, allowing the use of large datasets and large models.
+- **Flexible:** The codebase is designed to be customized in order to fit different use cases. The lower-level API offers more control over the possible parameterizations.
+- **Models Library:** The same package provides implementations of both standard choice models and machine learning-based methods, including neural networks.
+- **Downstream operations:** Post-processing tools  that leverage choice models for assortment optimization and pricing are also integrated into the library.
 
-![General Organization of Choice-Learn package. \label{fig:generalorg}](../illustrations/choice_learn_high_level.png)
+![General Organization of Choice-Learn package. \label{fig:gen_org}](../illustrations/choice_learn_high_level.png)
 
-This tryptich, data, model and usage, is illustrated on \autoref{fig:generalorg} with examples of the two levels of API interactions.
+A summary of the main contributions of Choice-Learn is provided in Table \ref{tab:comparison}.
 
 # Statement of need
 
-## Handling Large Datasets
-Choice modelling is a natural tool for retailers or marketplaces to understand their customer base and to improve or optimize their commercial offering or operational footprint. With the fast-paced improvement of companies data architectures, larger and more reliable customer-level datasets have emerged. While several efficient Python packages have been made available to estimate choice models [@Bierlaire:2023; @Brathwaite:2018] they are usually not built to work with large-scale datasets.
+## Streamlined signatures
+`Choice-Learn` proposes short signatures for a fast implementation. For example, the *ChoiceDataset* object, which handles the dataset, takes only 4 inputs: 'items_features' describing each available alternative, 'shared_features' describing the context of the choice, 'available_items' indicating the subset of alternatives offered in the assortment, and finally 'choices', the index of the chosen option. Choice-Learn also provides methods to seamlessly integrate popular data formats, such as long and wide format dataframes [@Helveston:2023].
 
-![Organisation of the FeaturesbyID. \label{fig:fbi}](../illustrations/choice_learn_features_storage.png)
+```python
+dataset = ChoiceDataset(choices, shared_features, items_features, available_items)
+```
 
-`Choice-Learn`'s ChoiceDataset is built specifically to handle large choice datasets. It mainly relies on NumPy [@Harris:2020] with the objective to limit the memory footprint of the dataset. The key idea is to minimize features repetition and to rebuild the full data structure only for batches of the dataset.
+The signatures for data usage in model estimation and evaluation are designed to be consistent with mainstream machine learning packages such as scikit-learn [@Pedregosa:2011].
 
-- *Features splitting:* We define 'items_features' that describe each available alternative and 'shared_features' that are common to all alternatives for one given choice. These shared features usually change from one choice to another and can represent customer attributes for example. This split let us avoid repeating these 'shared_features' for each alternative as it would be implied by the standard "long format" of the input dataset.
+## Data and model scalability
 
-- *Features by IDs:* Features can be stored in specific objects which are only referenced in the dataset by their ID. These features are stacked with the main dataset only in batches, when the data is processed. It is particularly efficient for features that are repeated in the dataset. Consider the case where we have a few different stores represented by their surface, position, etc... Such features are static - they never change. Therefore, they can be stored in an object and it suffices to reference the store where the choice observation occurs in the input dataset. Then, when a batch of data is retrieved, the full features matrix is generated from the ID. \autoref{fig:fbi} illustrates this approach.
+Choice modeling is a standard tool for brick-and-mortar retailers and online marketplaces to better understand customer behavior and optimize product offerings. With the continuous development of firms' data architectures, larger-scale and more reliable choice datasets are leveraged to manage customer-facing operations.
 
-## Parametrized and Machine-Learning based models
-> Naming still unsatisfactory imo (Parametrized/Interpretable vs ?)
+`Choice-Learn`'s data structure relies on NumPy [@Harris:2020] with the objective of limiting the memory footprint. It minimizes the repetition of the same item or customer features and defers the instantiation of the full data structure until processing batches of data. Moreover, the *FeaturesStorage* object allows feature values to be referenced in the dataset only by their ID. These feature values are substituted for the ID placeholder on the fly in the batching process. For instance, suppose that we have access to store features such as surface, position, or number of employees. These features are often stationary: they do not change over time when predicting customer choices. Thus, they can be stored in an auxiliary data structure and it suffices to reference, in the main dataset, the specific store in which the choice observation is recorded. Figure \ref{fig:fbi} illustrates this approach.
 
-The availability of detailed customer choice data enables estimating more complex choice models. Recent research outlines this possibility with neural networks approaches [@Han:2022; @Aouad:2023] or tree-based boosting models [@Salvadé:2024; &@AouadMarket:2023].
-The existing libraries [@Bierlaire:2023; @Brathwaite:2018; @Du:2023] are often not designed to integrate such machine learning-based approaches.
+The package builds on TensorFlow [@Abadi:2015] for model estimation, offering the possibility to use fast second-order optimization algorithms such as L-BFGS [@Nocedal:2006] as well as various gradient-descent optimizers [@Tieleman:2012; @Kingma:2017] specialized in handling batches of data. GPU usage is also possible, which can prove to be time-saving.
+Finally, the TensorFlow backbone ensures an efficient usage in a production environment, such as within an assortment recommendation software. Many state-of-the-art tools are provided for deployment and serving, such as TFLite and TFServing.
 
-`Choice-Learn` proposes a model object structure and a unified estimation tools using automatic differentiation library which can flexibly accommodate parametric models such as the Conditional Logit [@Train:1987] as well as machine learning-based such as RUMnet [@Aouad:2023] or TasteNet [@Han:2022]. Specifically, we rely on the Tensorflow library [@Abadi:2015] implementing efficiently optimization algorithms such as LBFGS[@Nocedal:2006] and various gradient-descent optimizers [@Tieleman:2012; @Kingma:2017]. It also enables GPUs usage for parameters estimation that can prove to be time saving on high-dimensional or large-sample datasets.
-Moreover, `Choice-Learn` also enables building new and custom choice models with a common inheritance scheme that minimizes user's integration effort. Compared to standard implementations there is virtually no restriction in specifying the utility function, as long as its gradient can be computed.
-Finally, the TensorFlow backbone ensures an efficient use of the models in a production environment. Many state-of-the-art tools are provided for TensorFlow based models deployment and serving such as TFLite and TFServing.
+![Functioning of the *FeaturesStorage*. \label{fig:fbi}](../illustrations/choice_learn_features_storage.png)
 
-## Beyond choice modeling: Assortment and pricing optimization
+## Flexible usage: from linear utility to customized specification
 
-`Choice-Learn` also ambitions to offer a set of tools revolving around choice modelling. Assortment optimization is a common usecase that leverages a choice model in order to determine or design the optimal subset of alternative to offer customers in order to maximize a certain objective function. Examples includes assortment planning, display location optimization, and pricing. A generic implemenation is proposed in the library so that estimated choice models are easily plugged into such optimization processes.
+Choice models following the *Random Utility Maximization* principle [@McFadden:2000] define the utility of an option $i \in \mathcal{A}$ as the sum of a deterministic part $U(i)$ and a random error term $\epsilon_i$. If the noise terms $(\epsilon_i)_{i \in \mathcal{A}}$ are assumed to be independent and Gumbel-distributed, the probability of choosing option $i$ can be written as the softmax normalization over the available alternatives $j\in \mathcal{A}$:
 
-# Examples
+$$\mathbb{P}(i|\mathcal{A}) = \frac{e^{U(i)}}{\sum_{j \in \mathcal{A}} e^{U(j)}}$$
 
-## RAM usage comparison
+The choice modeler's job is to formulate an appropriate utility function depending on the context. In Choice-Learn, the user can parametrize predefined models such as the Conditional Logit or freely specify a custom utility function by overriding the *compute_batch_utility* method from the *ChoiceModel* class. This allows for tailoring choice models to different use cases.
 
-![RAM usage with and without FeaturesByIDs for datasets including one hot representation of shapes (10, 10) and (100, 100). \label{fig:ram_usage}](../illustrations/fbid_RAM.png){ width=50% }
+## Unifying  traditional random utility models and machine learning-based models
 
-![RAM usage comparison on the Expedia Dataset. \label{fig:exp_ram_usage}](../illustrations/expedia_RAM.png){ width=50% }
+Traditional parametric choice models often specify the utility function as a linear form. While this provides interpretable coefficients, such as price  elasticities, it also limits the predictive power of the model.
+The availability of detailed customer choice data, paired with advances in machine learning, enables the estimation of more complex models. Recent research outlines this potential with neural network approaches [@Han:2022; @Aouad:2023] and tree-based models [@Salvadé:2024; @AouadMarket:2023]. However, existing choice libraries [@Bierlaire:2023; @Brathwaite:2018; @Du:2023] are often not designed to integrate such machine learning-based approaches.
 
-We provide numerical examples of memory usage to showcase the efficiency of Features by IDs provided by `Choice-Learn`. We consider a case where we have a feature repeated in a dataset. For instance, this may represent a one-hot encoding for locations, identified by a matrix of shape (n_locations, n_locations). Each row of the dataset of size dataset_size refers to one of the locations. In \autoref{fig:ram_usage}, we compare the memory usage for different dataset sizes and n_locations=10 and 100. We find that `Choice-Learn` can save several orders of magnitude of bytes in memory usage.
+Choice-Learn proposes a unified estimation tool based on  TensorFlow's automatic differentiation [@Abadi:2015], which can flexibly accommodate traditional parametric models, such as the Conditional Logit [@Train:1987], as well as neural network models, such as RUMnet [@Aouad:2023] or TasteNet [@Han:2022].
 
-We conduct another experiment on the ICDM 2013 Expedia dataset [@Expedia:2013]. We compare four data handling methods: pandas.DataFrames in long and wide format that are commonly used in choice modelling packages, and `Choice-Learn`'s ChoiceDataset with and without Features by IDs. Following [@Aouad:2023] preprocessing of the dataset, four features are represented as one-hot values and are optimized with `Choice-Learn` data management.The results obtained by varying the sample size are reported in \autoref{fig:exp_ram_usage}.
 
-## Choice model customization
+## Downstream operations: Assortment and pricing optimization
+`Choice-Learn` also offers additional tools for downstream operations, which are not usually integrated in choice modeling libraries. In particular, assortment optimization is a common use case that leverages a choice model to determine the optimal subset of alternatives to offer customers in order to maximize a certain objective, such as the expected revenue, conversion rate, or social welfare. This framework captures a variety of applications such as assortment planning, display location optimization, and pricing. We provide a generic implementation based on the mixed-integer programming formulation described in [@MendezDiaz:2014]. It currently supports the optimization of the assortment and prices of offered items. Users can choose between solvers like Gurobi [@Gurobi:2023], which is popular in the research community with free licensing, or OR-Tools [@ORTools:2024], which is open source.
 
-Choice models following the Random Utility principle define the utility of an alternative $i \in \mathcal{A}$ as the sum of a deterministic part $U_i$ and an error random term $\epsilon_i$. If $\epsilon_i$ is supposed to be i.i.d. over all the available alternatives and following a Gumbel distribution, the probability to choose $i$ can be written as the softmax normalization over the available alternatives $j\in \mathcal{A}$:
 
-$$\mathbb{P}(i) = \frac{e^{U_i}}{\sum_j e^{U_j}}$$\
+# Experiments and examples
 
+## Memory usage: a case study
 
-### An example: Definition of non linear utility function
-> What would be a better example ?
+We provide numerical examples of memory usage to showcase the efficiency of the *FeaturesStorage*. Consider a feature repeated in a dataset, such as a one-hot encoding for locations, represented by a matrix of shape (*n_locations, n_locations*). Each row of the dataset refers to one of the locations. In Figure \ref{fig:xps} (a), we compare the memory usage for different dataset sizes and *n_locations* set to 10 and 100. We find that `Choice-Learn` can save several orders of magnitude in memory usage.
 
-Most choice modelling packages only handle linear formulation of the utility. `Choice-Learn` allows flexibility and an easy creation of a custom choice model. Inheriting the ChoiceModel class lets the user define its own utility function. One only needs to sepcify how to compute the utility of a batch of data using elementary TensorFlow operations. Here is an example where we use the following formulation of utility for an alternative $i$ with features $x_i$ considered by a customer with features $z$:
+We conduct a similar experiment on the ICDM 2013 Expedia dataset [@Expedia:2013]. We compare four data handling methods: pandas.DataFrames [@pandas:2020] in long and wide format, often used in choice modeling packages, as well as Torch-Choice and `Choice-Learn`. Following the preprocessing of the dataset as described by [@Aouad:2023], four features are represented as one-hot values. The results, obtained by varying the sample size, are reported in Figure \ref{fig:xps} (b).
 
-$$U_i = \beta_l \cdot (elu(\Gamma_x \cdot x_i) + elu(\Gamma_z \cdot z)) $$
-with $\Gamma_x$, $\Gamma_z$ matrixes and $\beta_l$ a vector to be estimated,
-$elu$ is the activation function so that $elu(x) = x$ if $x > 0$ and $elu(x) = e^x-1$ if $x < 0$.
+Finally, we observe similar performance gains in terms of memory management on a proprietary dataset in brick-and-mortar retailing. It consists of the aggregation of more than 4 million purchases over 5 years in over 600 retail Konzum supermarkets in Croatia. Focusing on the *coffee* subcategory, the dataset specifies, for each purchase, which of the 63 products were available, their prices, as well as a one-hot representation of the store. The numerical results are presented in Figure \ref{fig:xps} (c) and (d).
 
-Below is an example of implementation using TensorFlow's Dense layers and `Choice-Learn`:
 
-```python
-from tensorflow.keras.layers import Dense
-from choice_learn.models import ChoiceModel
+\begin{figure}
+\centering
+\begin{tabular}{cc}
+  \includegraphics[width=65mm]{illustrations/ram_images/ram_usage_storage.png} &   \includegraphics[width=65mm]{illustrations/ram_images/ram_usage_expedia_2.png} \\
+(a) Choice-Learn memory usage for different & (b) Memory usage for the Expedia dataset \\[6pt]
+values of (n\_locations, n\_locations) & with different dataset sizes \\
+    \includegraphics[width=65mm]{illustrations/ram_images/ram_usage_fng_1.png} &   \includegraphics[width=65mm]{illustrations/ram_images/ram_usage_fng_2.png} \\
+(c) Memory usage of our own retail dataset   & (d) Memory usage of our own retail dataset \\[6pt]
+for different dataset sizes & for different number of stores \\
+\end{tabular}
+\caption{Memory usage experiments \VA{Illustrations to be completed and colors to be aligned!} \Alicomment{Nice. Why not include Torch choice?} \VA{will doo}}
+\label{fig:xps}
+\end{figure}
 
-class ExampleCustomizedModel(ChoiceModel):
-    def __init__(self, n_neurons, **kwargs):
-        super().__init__(**kwargs)
-        self.n_neurons = n_neurons
 
-        # Items Features Layer
-        self.dense_items = Dense(units=n_neurons, activation="elu")
 
-        # Shared Features Layer
-        self.dense_shared = Dense(units=n_neurons, activation="elu")
+## Customized choice models
+We provide an example of the custom model definition with the following formulation of utility for an alternative $i$ with features $x_i$ considered by a customer with features $z$:
+$$U(i) = \beta_l \cdot \sigma(\sigma(\Gamma_x \cdot x_i) + \sigma(\Gamma_z \cdot z)),$$
+where $\Gamma_x$, $\Gamma_z$ are matrices and $\beta_l$ is a vector, all of which are parameters to be estimated. Additionally, $\sigma$ is the sigmoid activation function.
+When introducing a custom model, one needs to inherit from the *ChoiceModel* class, specify the weights to be estimated in the *\_\_init\_\_* method, and determine how to compute the utility in the *compute_batch_utility* method.
 
-        # Third layer: embeddings to utility (dense representation of features > U)
-        self.final_layer = Dense(units=1, activation="linear")
 
-    @property
-    def trainable_weights(self):
-        """Access model's trainable_weights.
+```python
+    def __init__(self, n_neurons, **kwargs):
+        super().__init__(**kwargs)
 
-        Returns
-        -------
-        list
-            list of trainable_weights
-        """
-        return model.dense_items.trainable_variables\
-              + model.dense_shared.trainable_variables\
-                  + model.final_layer.trainable_variables
+        self.gamma_x = Dense(units=n_neurons, activation="sigmoid")
+        self.gamma_z = Dense(units=n_neurons, activation="sigmoid")
+        self.beta_l = Dense(units=1, activation="linear")
 
     def compute_batch_utility(self,
                               shared_features_by_choice,
                               items_features_by_choice,
-                              available_items_by_choice,
-                              choices):
-        """Compute batch utility from features."""
-        _, _ = available_items_by_choice, choices
-        # We apply the neural network to all items_features_by_choice for all the items
-        # We then concatenate the utilities of each item of shape (n_choices, 1) into
-        # a single one of shape (n_choices, n_items)
-        shared_embeddings = self.dense_shared(shared_features_by_choice[0])
-
-        # Iterate over items
-        items_features_embeddings = []
-        for i in range(items_features_by_choice[0].shape[1]):
-            # Utility is Dense(embeddings sum)
-            item_embedding = shared_embeddings + self.dense_items(items_features_by_choice[0][:, i])
-            items_features_embeddings.append(self.final_layer(item_embedding))
-
-        # Concatenation to get right shape (n_choices, n_items, )
-        item_utility_by_choice = tf.concat(items_features_embeddings, axis=1)
+                              **kwargs):
+
+        z_embedding = self.gamma_z(shared_features_by_choice)
+
+        item_utility_by_choice = []
+        for i in range(n_items):
+            item_embedding = sigmoid(z_embedding +\
+            self.gamma_x(items_features_by_choice[:, i]))
+            item_utility_by_choice.append(self.beta_l(item_embedding))
+
+        item_utility_by_choice = tf.concat(item_utility_by_choice, axis=1)
 
         return item_utility_by_choice
 ```
 
-# Acknowledgements
+# Acknowledgments
+We thank Fortenova and particularly Martin Možina for their helpful collaboration and for providing the proprietary dataset.
 
 # References