small fix

artefactory · Jun 17, 2024 · a7b06a3 · a7b06a3
1 parent 4aaaf09
commit a7b06a3
Show file tree

Hide file tree

Showing 3 changed files with 120 additions and 2 deletions.
diff --git a/docs/illustrations/ram_usage_comparison.png b/docs/illustrations/ram_usage_comparison.png
diff --git a/docs/paper/memory_usage.ipynb b/docs/paper/memory_usage.ipynb
@@ -612,6 +612,13 @@
     "torch_choice = [1249040, 12481040, 124801040, 1248001040, 4962273680]"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -673,6 +680,117 @@
     "plt.show()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Palette and dash patterns shared by every panel of the figure; each plotted\n",
+    "# series picks one index and uses it for both lists.\n",
+    "colors = [\n",
+    "    \"#e69f00\",\n",
+    "    \"#56b4e9\",\n",
+    "    \"#009e73\",\n",
+    "    \"#0072b2\",\n",
+    "    \"#d55e00\",\n",
+    "    \"#cc79a7\",\n",
+    "    \"#f0e442\",\n",
+    "]\n",
+    "linestyle = [\n",
+    "    \"-\",\n",
+    "    \"--\",\n",
+    "    \"-.\",\n",
+    "    \":\",\n",
+    "    \"-\",\n",
+    "    \"--\",\n",
+    "    \"-.\",\n",
+    "]\n",
+    "\n",
+    "# 12 x 10 inch canvas that hosts the 2 x 2 grid of panels.\n",
+    "fig = plt.figure(figsize=(12, 10))\n",
+    "# NOTE(review): no axes exist yet at this point, so this call probably changes nothing - confirm.\n",
+    "fig.tight_layout()\n",
+    "# Extra vertical spacing between the rows of panels.\n",
+    "fig.subplots_adjust(hspace=0.35)\n",
+    "\n",
+    "# Panel (a): measured footprint (bytes) with and without FeaturesStorage,\n",
+    "# for 10 and for 100 locations, at each dataset length.\n",
+    "ds_lengths = [10, 100, 1000, 10000, 100000, 10000000]\n",
+    "dense_1 = [928, 8128, 80128, 800128, 8000128, 800000128]\n",
+    "fbid_1 = [1036, 1756, 8956, 80956, 800956, 80000956]\n",
+    "dense_2 = [8128, 80128, 800128, 8000128, 80000128, 8000000128]\n",
+    "fbid_2 = [7892, 8612, 15812, 87812, 807812, 80007812]\n",
+    "\n",
+    "# (values, legend label, index into colors / linestyle), listed in legend order.\n",
+    "panel_a_series = [\n",
+    "    (dense_1, 'w/o FeaturesStorage - 10 locations', 0),\n",
+    "    (fbid_1, 'w/ FeaturesStorage - 10 locations', 1),\n",
+    "    (dense_2, 'w/o FeaturesStorage - 100 locations', 2),\n",
+    "    (fbid_2, 'w/ FeaturesStorage - 100 locations', 3),\n",
+    "]\n",
+    "\n",
+    "plt.subplot(2, 2, 1)\n",
+    "# Lines carry the legend entries; the scatter calls only add markers on the same points.\n",
+    "for series_values, series_label, style_idx in panel_a_series:\n",
+    "    plt.plot(ds_lengths, series_values, label=series_label, c=colors[style_idx], ls=linestyle[style_idx])\n",
+    "for series_values, series_label, style_idx in panel_a_series:\n",
+    "    plt.scatter(ds_lengths, series_values, c=colors[style_idx])\n",
+    "\n",
+    "# Both axes are logarithmic.\n",
+    "plt.xscale(\"log\")\n",
+    "plt.yscale(\"log\")\n",
+    "plt.xlabel(\"Dataset Size\")\n",
+    "plt.ylabel(\"Memory usage (bytes)\")\n",
+    "plt.legend(prop=dict(size=8))\n",
+    "\n",
+    "# A negative y places the title underneath the axes.\n",
+    "plt.title(\"(a) Memory usage of our retail dataset \n for different dataset sizes\", y=-0.3)\n",
+    "\n",
+    "# Panel (b): memory footprint (bytes) of each data-handling approach at several\n",
+    "# sample sizes. The paper text attributes this panel to the Expedia dataset.\n",
+    "plt.subplot(2, 2, 2)\n",
+    "data_lengths = [100, 1000, 10000, 100000, 397618]\n",
+    "cd_with = [85640, 771440, 7629440, 76209440, 302994356]\n",
+    "# cd_wo is kept for reference only; it is not drawn in this panel.\n",
+    "cd_wo = [220400, 2198600, 21980600, 219800600, 873964964]\n",
+    "df_long = [5521360, 52463080, 524667470, 5234198450, 20815361140]\n",
+    "# A first df_wide list used to be assigned here and was overwritten on the very\n",
+    "# next line; only the values that are actually plotted are kept.\n",
+    "df_wide = [3503109, 31784709, 314600709, 3142760709, 12495108741]\n",
+    "torch_choice = [1249040, 12481040, 124801040, 1248001040, 4962273680]\n",
+    "\n",
+    "# (values, legend label, index into colors / linestyle), listed in legend order.\n",
+    "panel_b_series = [\n",
+    "    (cd_with, \"Choice-Learn\", 3),\n",
+    "    (torch_choice, \"Torch-Choice\", 1),\n",
+    "    (df_long, \"PyLogit (long format)\", 0),\n",
+    "    (df_wide, \"Biogeme (wide format)\", 2),\n",
+    "]\n",
+    "# Lines carry the legend entries; the scatter calls only add markers on the same points.\n",
+    "for series_values, series_label, style_idx in panel_b_series:\n",
+    "    plt.plot(data_lengths, series_values, label=series_label, c=colors[style_idx], ls=linestyle[style_idx])\n",
+    "for series_values, series_label, style_idx in panel_b_series:\n",
+    "    plt.scatter(data_lengths, series_values, c=colors[style_idx])\n",
+    "plt.legend(prop={'size': 8})\n",
+    "plt.yscale(\"log\")\n",
+    "plt.xscale(\"log\")\n",
+    "plt.xlabel(\"Dataset Size\")\n",
+    "plt.ylabel(\"Memory usage (bytes)\")\n",
+    "# Caption names the Expedia dataset (was a copy of the retail-dataset caption).\n",
+    "plt.title(\"(b) Memory usage of the Expedia dataset \n for different dataset sizes\", y=-.3)\n",
+    "\n",
+    "# Panel (c): memory footprint (bytes) of each data-handling approach versus\n",
+    "# dataset size.\n",
+    "plt.subplot(2, 2, 3)\n",
+    "\n",
+    "sizes = [100, 1000, 10000.0, 100000.0, 1000000.0, 4789225]\n",
+    "# Torch-Choice\n",
+    "tc = [3933312, 4854912, 14070912, 106230912, 1027830912, 4907997312]\n",
+    "# Wide-format DataFrame\n",
+    "wide = [629748, 5954312, 59178320, 591244240, 5914324260, 28317686484]\n",
+    "# Long-format DataFrame\n",
+    "long = [729260, 7216256, 79819560, 1000640616, 10190708220, 47241911756]\n",
+    "# Choice-Learn\n",
+    "cl = [163734, 526146, 3921306, 34453314, 319713662, 1546499942]\n",
+    "\n",
+    "# (values, legend label, index into colors / linestyle), listed in legend order.\n",
+    "panel_c_series = [\n",
+    "    (cl, \"Choice-Learn\", 3),\n",
+    "    (tc, \"Torch-Choice\", 1),\n",
+    "    (long, \"PyLogit (long format)\", 0),\n",
+    "    (wide, \"Biogeme (wide format)\", 2),\n",
+    "]\n",
+    "# Lines carry the legend entries; the scatter calls only add markers on the same points.\n",
+    "for series_values, series_label, style_idx in panel_c_series:\n",
+    "    plt.plot(sizes, series_values, label=series_label, c=colors[style_idx], ls=linestyle[style_idx])\n",
+    "for series_values, series_label, style_idx in panel_c_series:\n",
+    "    plt.scatter(sizes, series_values, c=colors[style_idx])\n",
+    "plt.legend(prop=dict(size=8))\n",
+    "# Both axes are logarithmic.\n",
+    "plt.xscale(\"log\")\n",
+    "plt.yscale(\"log\")\n",
+    "plt.xlabel(\"Dataset Size\")\n",
+    "plt.ylabel(\"Memory usage (bytes)\")\n",
+    "# A negative y places the title underneath the axes.\n",
+    "plt.title(\"(c) Memory usage of our retail dataset \n for different dataset sizes\", y=-0.3)\n",
+    "\n",
+    "# Panel (d): memory footprint (bytes) of each data-handling approach versus\n",
+    "# the number of stores.\n",
+    "plt.subplot(2, 2, 4)\n",
+    "n_stores = [0, 10, 100, 250, 692]\n",
+    "cl_mem_usage = [352083911, 352084711, 352163911, 352583911, 355914823]\n",
+    "other_mem = [1027830912, 1027831712, 1027910912, 1028330912, 1031661824]\n",
+    "long_mem_usage = [4654663932, 4734664572, 5454670332, 6654679932, 10190708220]\n",
+    "wide_mem_usage = [378279972, 458280612, 1178286372, 2378295972, 5914324260]\n",
+    "\n",
+    "# (values, legend label, index into colors / linestyle), listed in legend order.\n",
+    "panel_d_series = [\n",
+    "    (cl_mem_usage, \"Choice-Learn\", 3),\n",
+    "    (other_mem, \"Torch-Choice\", 1),\n",
+    "    (long_mem_usage, \"PyLogit (long format)\", 0),\n",
+    "    (wide_mem_usage, \"Biogeme (wide format)\", 2),\n",
+    "]\n",
+    "# Lines carry the legend entries; the scatter calls only add markers on the same points.\n",
+    "for series_values, series_label, style_idx in panel_d_series:\n",
+    "    plt.plot(n_stores, series_values, label=series_label, c=colors[style_idx], ls=linestyle[style_idx])\n",
+    "for series_values, series_label, style_idx in panel_d_series:\n",
+    "    plt.scatter(n_stores, series_values, c=colors[style_idx])\n",
+    "plt.legend(prop=dict(size=8))\n",
+    "# Only the y axis is logarithmic; the x axis stays linear (n_stores starts at 0).\n",
+    "plt.yscale(\"log\")\n",
+    "plt.xlabel(\"Stores Number\")\n",
+    "plt.ylabel(\"Memory usage (bytes)\")\n",
+    "# A negative y places the title underneath the axes.\n",
+    "plt.title(\"(d) Memory usage of our retail dataset \n for different number of stores\", y=-0.3)\n",
+    "plt.show()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

diff --git a/docs/paper/paper.md b/docs/paper/paper.md
@@ -139,9 +139,9 @@ We provide numerical examples of memory usage to showcase the efficiency of the
 
 We conduct a similar comparison on the ICDM 2013 Expedia dataset [@Expedia:2013] with four data handling methods: pandas.DataFrames [@pandas:2020] in long and wide format, both used in choice modeling packages, as well as Torch-Choice and `Choice-Learn`. Figure \ref{fig:xps} (b) shows the results for various sample sizes.
 
-Finally, in Figure \ref{fig:xps} (c) and (d), we observe performance gains in terms of memory management on a proprietary dataset in brick-and-mortar retailing consisting of the aggregation of more than 4 million purchases over 5 years in Konzum supermarkets in Croatia. Focusing  on the *coffee* subcategory, the dataset specifies, for each purchase, which products were available, their prices, as well as a one-hot representation of the supermarket.
+Finally, in Figure \ref{fig:xps} (c) and (d), we observe performance gains in terms of memory management on a proprietary dataset in brick-and-mortar retailing consisting of the aggregation of more than 4 billion purchases in Konzum supermarkets in Croatia. Focusing on the *coffee* subcategory, the dataset specifies, for each purchase, which products were available, their prices, as well as a one-hot representation of the supermarket.
 
-![Memory usage experiments. \label{fig:xps}](../illustrations/full_ram.png)
+![Memory usage experiments. \label{fig:xps}](../illustrations/ram_usage_comparison.png)
 
 ## Customized choice models
 We provide an example of the custom model definition with the following formulation of utility $U(i)$ with alternative features $x_i$ and customer features $z$: