Skip to content

Commit

Permalink
small fix
Browse files Browse the repository at this point in the history
  • Loading branch information
VincentAuriau committed Jun 17, 2024
1 parent 4aaaf09 commit a7b06a3
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 2 deletions.
Binary file added docs/illustrations/ram_usage_comparison.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
118 changes: 118 additions & 0 deletions docs/paper/memory_usage.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -612,6 +612,13 @@
"torch_choice = [1249040, 12481040, 124801040, 1248001040, 4962273680]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -673,6 +680,117 @@
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"colors = [\"#e69f00\", \"#56b4e9\", \"#009e73\", \"#0072b2\", \"#d55e00\", \"#cc79a7\", \"#f0e442\"]\n",
"linestyle = [\"-\", \"--\", \"-.\", \":\", \"-\", \"--\", \"-.\"]\n",
"\n",
"\n",
"def plot_memory_panel(ax, x, series, title, xlabel=\"Dataset Size\", log_x=True):\n",
"    \"\"\"Draw one memory-usage panel: a line plus point markers for each series.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    ax : matplotlib Axes to draw on.\n",
"    x : x values shared by every series of the panel.\n",
"    series : iterable of (y_values, legend_label, style_index) tuples;\n",
"        style_index selects the entry of `colors` and `linestyle` to use.\n",
"    title : caption text, drawn below the axes (y=-.3).\n",
"    xlabel : label of the x axis.\n",
"    log_x : if True the x axis is logarithmic; the y axis always is.\n",
"    \"\"\"\n",
"    for y, label, idx in series:\n",
"        ax.plot(x, y, label=label, c=colors[idx], ls=linestyle[idx])\n",
"        ax.scatter(x, y, c=colors[idx])\n",
"    ax.set_yscale(\"log\")\n",
"    if log_x:\n",
"        ax.set_xscale(\"log\")\n",
"    ax.set_xlabel(xlabel)\n",
"    ax.set_ylabel(\"Memory usage (bytes)\")\n",
"    ax.legend(prop={\"size\": 8})\n",
"    ax.set_title(title, y=-.3)\n",
"\n",
"\n",
"fig, axes = plt.subplots(2, 2, figsize=(12, 10))\n",
"# Extra vertical space: each panel's title is drawn below its axes (y=-.3).\n",
"fig.subplots_adjust(hspace=0.35)\n",
"ax_a, ax_b, ax_c, ax_d = axes.flat\n",
"\n",
"# Panel (a): memory (bytes) with and without FeaturesStorage, for 10 and 100 locations.\n",
"ds_lengths = [10, 100, 1000, 10000, 100000, 10000000]\n",
"dense_1 = [928, 8128, 80128, 800128, 8000128, 800000128]\n",
"fbid_1 = [1036, 1756, 8956, 80956, 800956, 80000956]\n",
"dense_2 = [8128, 80128, 800128, 8000128, 80000128, 8000000128]\n",
"fbid_2 = [7892, 8612, 15812, 87812, 807812, 80007812]\n",
"plot_memory_panel(\n",
"    ax_a,\n",
"    ds_lengths,\n",
"    [\n",
"        (dense_1, \"w/o FeaturesStorage - 10 locations\", 0),\n",
"        (fbid_1, \"w/ FeaturesStorage - 10 locations\", 1),\n",
"        (dense_2, \"w/o FeaturesStorage - 100 locations\", 2),\n",
"        (fbid_2, \"w/ FeaturesStorage - 100 locations\", 3),\n",
"    ],\n",
"    title=\"(a) Memory usage of our retail dataset \\n for different dataset sizes\",\n",
")\n",
"\n",
"# Panel (b): comparison of data handling methods on the Expedia dataset,\n",
"# as described in the paper text for Figure (b).\n",
"data_lengths = [100, 1000, 10000, 100000, 397618]\n",
"cd_with = [85640, 771440, 7629440, 76209440, 302994356]\n",
"cd_wo = [220400, 2198600, 21980600, 219800600, 873964964]  # recorded but not plotted\n",
"df_long = [5521360, 52463080, 524667470, 5234198450, 20815361140]\n",
"df_wide = [3503109, 31784709, 314600709, 3142760709, 12495108741]\n",
"torch_choice = [1249040, 12481040, 124801040, 1248001040, 4962273680]\n",
"plot_memory_panel(\n",
"    ax_b,\n",
"    data_lengths,\n",
"    [\n",
"        (cd_with, \"Choice-Learn\", 3),\n",
"        (torch_choice, \"Torch-Choice\", 1),\n",
"        (df_long, \"PyLogit (long format)\", 0),\n",
"        (df_wide, \"Biogeme (wide format)\", 2),\n",
"    ],\n",
"    title=\"(b) Memory usage of the Expedia dataset \\n for different dataset sizes\",\n",
")\n",
"\n",
"# Panel (c): same four methods on the retail dataset, for growing dataset sizes.\n",
"sizes = [100, 1000, 10000.0, 100000.0, 1000000.0, 4789225]\n",
"tc = [3933312, 4854912, 14070912, 106230912, 1027830912, 4907997312]\n",
"wide = [629748, 5954312, 59178320, 591244240, 5914324260, 28317686484]\n",
"long = [729260, 7216256, 79819560, 1000640616, 10190708220, 47241911756]\n",
"cl = [163734, 526146, 3921306, 34453314, 319713662, 1546499942]\n",
"plot_memory_panel(\n",
"    ax_c,\n",
"    sizes,\n",
"    [\n",
"        (cl, \"Choice-Learn\", 3),\n",
"        (tc, \"Torch-Choice\", 1),\n",
"        (long, \"PyLogit (long format)\", 0),\n",
"        (wide, \"Biogeme (wide format)\", 2),\n",
"    ],\n",
"    title=\"(c) Memory usage of our retail dataset \\n for different dataset sizes\",\n",
")\n",
"\n",
"# Panel (d): memory as a function of the number of stores.\n",
"# The x axis stays linear because n_stores starts at 0.\n",
"n_stores = [0, 10, 100, 250, 692]\n",
"cl_mem_usage = [352083911, 352084711, 352163911, 352583911, 355914823]\n",
"other_mem = [1027830912, 1027831712, 1027910912, 1028330912, 1031661824]\n",
"long_mem_usage = [4654663932, 4734664572, 5454670332, 6654679932, 10190708220]\n",
"wide_mem_usage = [378279972, 458280612, 1178286372, 2378295972, 5914324260]\n",
"plot_memory_panel(\n",
"    ax_d,\n",
"    n_stores,\n",
"    [\n",
"        (cl_mem_usage, \"Choice-Learn\", 3),\n",
"        (other_mem, \"Torch-Choice\", 1),\n",
"        (long_mem_usage, \"PyLogit (long format)\", 0),\n",
"        (wide_mem_usage, \"Biogeme (wide format)\", 2),\n",
"    ],\n",
"    title=\"(d) Memory usage of our retail dataset \\n for different number of stores\",\n",
"    xlabel=\"Stores Number\",\n",
"    log_x=False,\n",
")\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
4 changes: 2 additions & 2 deletions docs/paper/paper.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,9 @@ We provide numerical examples of memory usage to showcase the efficiency of the

We conduct a similar comparison on the ICDM 2013 Expedia dataset [@Expedia:2013] with four data-handling methods: pandas.DataFrames [@pandas:2020] in long and wide formats, both used in choice-modeling packages, as well as Torch-Choice and `Choice-Learn`. Figure \ref{fig:xps} (b) shows the results for various sample sizes.

Finally, in Figure \ref{fig:xps} (c) and (d), we observe performance gains in terms of memory management on a proprietary dataset in brick-and-mortar retailing consisting of the aggregation of more than 4 million purchases over 5 years in Konzum supermarkets in Croatia. Focusing on the *coffee* subcategory, the dataset specifies, for each purchase, which products were available, their prices, as well as a one-hot representation of the supermarket.
Finally, in Figure \ref{fig:xps} (c) and (d), we observe performance gains in terms of memory management on a proprietary dataset in brick-and-mortar retailing consisting of the aggregation of more than 4 billion purchases in Konzum supermarkets in Croatia. Focusing on the *coffee* subcategory, the dataset specifies, for each purchase, which products were available, their prices, as well as a one-hot representation of the supermarket.

![Memory usage experiments. \label{fig:xps}](../illustrations/full_ram.png)
![Memory usage experiments. \label{fig:xps}](../illustrations/ram_usage_comparison.png)

## Customized choice models
We provide an example of the custom model definition with the following formulation of utility $U(i)$ with alternative features $x_i$ and customer features $z$:
Expand Down

0 comments on commit a7b06a3

Please sign in to comment.