From a3a507947f76f91b5443bf2b5ec8bc61c06e8cd0 Mon Sep 17 00:00:00 2001
From: VincentAURIAU
Date: Mon, 8 Apr 2024 16:26:19 +0200
Subject: [PATCH] Update mem usage

---
 docs/paper/memory_usage.ipynb | 413 +++++++++++++++++++++++++++-------
 1 file changed, 334 insertions(+), 79 deletions(-)

diff --git a/docs/paper/memory_usage.ipynb b/docs/paper/memory_usage.ipynb
index cdbe8e61..788e38ab 100644
--- a/docs/paper/memory_usage.ipynb
+++ b/docs/paper/memory_usage.ipynb
@@ -6,7 +6,14 @@
   "source": [
    "# RAM usage with Choice-Learn\n",
    "\n",
-   "## On the ICDM 2013 Expedia Dataset"
+   "## On the ICDM 2013 Expedia Dataset\n",
+   "\n",
+   "- [Choice-Learn's ChoiceDataset with FeaturesByIDs](#with-choice-learn-and-featuresbyids)\n",
+   "- [Choice-Learn's ChoiceDataset without FeaturesByIDs](#choice-learn-without-featuresbyids)\n",
+   "- [pandas.DataFrame in Long format](#pandasdataframe-long-format)\n",
+   "- [pandas.DataFrame in Wide format](#pandasdataframe-wide-format)\n",
+   "- [Plots and Illustrations](#plots-and-illustrations)\n",
+   "- [FeaturesByIDs Study](#featuresbyids-study)"
   ]
  },
  {
@@ -62,17 +69,40 @@
    "        obj_q = new_refr.values()\n",
    "        marked.update(new_refr.keys())\n",
    "\n",
-   "    return sz\n",
-   "\n",
-   "# Defining tested data lengths\n",
-   "data_lengths = [100, 1000, 10000, 100000, 397618]"
+   "    return sz"
   ]
 },
 {
-  "cell_type": "markdown",
+  "cell_type": "code",
+  "execution_count": null,
   "metadata": {},
+  "outputs": [],
   "source": [
-   "### With Choice-Learn and FeaturesByIDs"
+   "from choice_learn.data import OneHotStorage\n",
+   "\n",
+   "### Small Example\n",
+   "\n",
+   "n_fixed_features = 10\n",
+   "n_different_values = 10\n",
+   "n_data = 100\n",
+   "\n",
+   "indexes = np.random.randint(n_different_values, size=(n_data, ))\n",
+   "\n",
+   "dense_features = np.zeros((n_data, n_fixed_features))\n",
+   "dense_features[np.arange(n_data), indexes] = 1\n",
+   "\n",
+   "storage = OneHotStorage(ids=list(range(n_different_values)))\n",
+   "\n",
+   "assert (storage.batch[indexes] == dense_features).all()\n",
+   "\n",
+   "### Dense features memory usage:\n",
+   "print(\"Dense memory usage:\", get_obj_size(dense_features))\n",
+   "\n",
+   "### FeaturesByIDs memory usage:\n",
+   "# Storage memory usage + indexes memory usage\n",
+   "print(\"FeaturesByIDs memory usage:\", get_obj_size(storage) + get_obj_size(indexes))\n",
+   "\n",
+   "\n"
   ]
 },
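+{
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "A quick sanity check on `get_obj_size` (added sketch, not part of the original benchmark): for a plain NumPy array it should roughly match the array's own `nbytes` plus a small object-header overhead."
+  ]
+},
+{
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "# Added sketch: cross-check get_obj_size against NumPy's own accounting.\n",
+   "# dense_features holds n_data * n_fixed_features float64 values (8 bytes each).\n",
+   "print(\"nbytes:\", dense_features.nbytes)\n",
+   "print(\"get_obj_size:\", get_obj_size(dense_features))"
+  ]
+},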
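+{
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Added sketch (not part of the original benchmark): comparing the two measurements, assuming `w_fbid_memory_size` and `wofbid_mem` from the previous cells."
+  ]
+},
+{
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "# Added sketch: memory footprint ratio at each tested dataset length.\n",
+   "for length, w, wo in zip(data_lengths, w_fbid_memory_size, wofbid_mem):\n",
+   "    print(f\"{length} choices: dense / FeaturesByIDs = {wo / w:.2f}\")"
+  ]
+},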
"plt.legend()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wo_fbid_dataset.available_items_by_choice[0].dtype" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "get_obj_size(np.copy(dataset.shared_features_by_choice[0])) / get_obj_size(np.copy(wo_fbid_dataset.shared_features_by_choice[0]))" + " wofbid_mem.append(get_obj_size(sub_dataset))" ] }, { @@ -382,6 +416,125 @@ " long_df_memory_size.append(get_obj_size(sub_long_df))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### pandas.DataFrame wide format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"../../choice_learn/datasets/data/expedia_rumnet_preprocessing.csv\", engine=\"pyarrow\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "site_id_one_hot = pd.get_dummies(df.site_id, prefix=\"site_id\")\n", + "visitor_location_country_id_one_hot = pd.get_dummies(df.visitor_location_country_id, prefix=\"visitor_location_country_id\")\n", + "srch_destination_id_one_hot =pd.get_dummies(df.srch_destination_id, prefix=\"srch_destination_id\")\n", + "prop_country_id_one_hpt = pd.get_dummies(df.prop_country_id, prefix=\"prop_country_id\")\n", + "df = pd.concat([df, site_id_one_hot, visitor_location_country_id_one_hot, srch_destination_id_one_hot, prop_country_id_one_hpt], axis=1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "contexts_items_features_names = [\n", + " \"prop_starrating\",\n", + " \"prop_review_score\",\n", + " \"prop_brand_bool\",\n", + " \"prop_location_score1\",\n", + " \"prop_location_score2\",\n", + " \"prop_log_historical_price\",\n", + " \"position\",\n", + " \"promotion_flag\",\n", + " \"orig_destination_distance\",\n", + " \"log_price\",\n", + " \"prop_country_id\",\n", + "]\n", + "contexts_features_names = [\n", + " \"srch_id\",\n", + " \"srch_length_of_stay\",\n", + " \"srch_adults_count\",\n", + " \"srch_children_count\",\n", + " \"srch_room_count\",\n", + " \"srch_saturday_night_bool\",\n", + " \"booking_window\",\n", + " \"random_bool\",\n", + " \"day_of_week\",\n", + " \"month\",\n", + " \"hour\",\n", + " \"site_id\",\n", + " \"visitor_location_country_id\",\n", + " \"srch_destination_id\",\n", + "]\n", + "for col in df.columns:\n", + " if col.startswith(\"prop_country_id\"):\n", + " contexts_items_features_names += [col]\n", + " if col.startswith(\"site_id\"):\n", + " contexts_features_names += [col]\n", + " if col.startswith(\"visitor_location_country_id\"):\n", + " contexts_features_names += [col]\n", + " if col.startswith(\"srch_destination_id\"):\n", + " contexts_features_names += [col]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wide_items = []\n", + "for i in range(39):\n", + " sub_df = df.groupby(\"srch_id\").apply(lambda x: x[contexts_items_features_names].iloc[i])\n", + " wide_items.append(sub_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "contexts_features = df.groupby(\"srch_id\").apply(lambda x: x[contexts_features_names].iloc[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wide_df = 
pd.concat(wide_items+[contexts_features], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wide_df_memory_size = []\n", + "for length in data_lengths:\n", + " sub_wide_df = wide_df.iloc[:length].copy()\n", + " wide_df_memory_size.append(get_obj_size(sub_wide_df))" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -392,16 +545,118 @@ "|---|---|---|---|---|---|\n", "| CD w. FeaturesByIDs | 398.887 | 3.869.287 | 38.573.287 | 385.613.287 | 1.533.228.295 |\n", "| CD wo FeaturesByIDs | 190.028 | 1.887.428 | 18.861.428 | 188.601.428 | 749.908.976 |\n", + "| Long format DF |\n", + "\n", "\n", "data_lengths: [100, 1000, 10000, 100000, 397618]\\\n", "ChoiceDataset with FeaturesByIDs: [85640, 771440, 7629440, 76209440, 302994356]\\\n", "ChoiceDataset without FeaturesByIDs: [220400, 2198600, 21980600, 219800600, 873964964]\\\n", - "DF Long Format: [5521360, 52463080, 524667470, 5234198450, 20815361140]" + "DF Long Format: [5521360, 52463080, 524667470, 5234198450, 20815361140]\\\n", + "DF Wide Format: [3503109, 31784709, 314600709, 3142760709, 12495108741]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plots and Illustrations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "data_lengths = [100, 1000, 10000, 100000, 397618]\n", + "cd_with = [85640, 771440, 7629440, 76209440, 302994356]\n", + "cd_wo = [220400, 2198600, 21980600, 219800600, 873964964]\n", + "df_long = [5521360, 52463080, 524667470, 5234198450, 20815361140]\n", + "df_wide = [6252516976, 6266664976, 6408144976, 7822944976, 12501499936]\n", + "df_wide = [3503109, 31784709, 314600709, 3142760709, 12495108741]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.plot(data_lengths, cd_with, label=\"ChoiceDataset with FeaturesByIDs\", c=\"teal\")\n", + "plt.plot(data_lengths, cd_wo, label=\"ChoiceDataset withOUT FeaturesByIDs\", c=\"turquoise\")\n", + "plt.plot(data_lengths, df_long, label=\"pd.DataFrame Long format\", c=\"darkblue\")\n", + "plt.plot(data_lengths, df_wide, label=\"pd.DataFrame Wide format\", c=\"cornflowerblue\")\n", + "plt.legend()\n", + "plt.yscale(\"log\")\n", + "plt.xlabel(\"Dataset Size (x1000)\")\n", + "plt.ylabel(\"Memory Size (bytes)\")\n", + "plt.xticks([0, 50000, 100000, 150000, 200000, 250000, 300000, 350000, 400000], [0, 50, 100, 150, 200, 250, 300, 350, 400])\n", + "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, + "source": [ + "## FeaturesByIDs Study" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_fixed_features = 10\n", + "n_different_values = 10\n", + "\n", + "normal_sizes = []\n", + "cd_sizes = []\n", + "ds_lengths = [10, 100, 1000, 10000, 100000, 10000000]\n", + "for dataset_len in ds_lengths:\n", + "\n", + " normal_dataset = np.ones((dataset_len, n_fixed_features))\n", + " cd_dataset = (np.ones((dataset_len, 1)), np.ones((n_different_values, n_fixed_features)))\n", + " \n", + " cd_sizes.append(sys.getsizeof(cd_dataset[0]) + sys.getsizeof(cd_dataset[1]))\n", + " normal_sizes.append(sys.getsizeof(normal_dataset))\n", + "\n", + "plt.plot(ds_lengths, normal_sizes, label='w/o FeaturesById - (10, 10)', c=\"darkblue\")\n", + "plt.plot(ds_lengths, cd_sizes, label='w/ FeaturesById - (10, 10)', c=\"turquoise\")\n", + "plt.yscale(\"log\")\n", + "plt.xscale(\"log\")\n", + 
"plt.xlabel(\"Dataset Size\")\n", + "plt.ylabel(\"Memory usage (bytes)\")\n", + "\n", + "n_fixed_features = 100\n", + "n_different_values = 100\n", + "\n", + "normal_sizes = []\n", + "cd_sizes = []\n", + "ds_lengths = [10, 100, 1000, 10000, 100000, 10000000]\n", + "for dataset_len in ds_lengths:\n", + "\n", + " normal_dataset = np.ones((dataset_len, n_fixed_features))\n", + " cd_dataset = (np.ones((dataset_len, 1)), np.ones((n_different_values, n_fixed_features)))\n", + " \n", + " cd_sizes.append(sys.getsizeof(cd_dataset[0]) + sys.getsizeof(cd_dataset[1]))\n", + " normal_sizes.append(sys.getsizeof(normal_dataset))\n", + "\n", + "plt.plot(ds_lengths, normal_sizes, label='w/o FeaturesById - (100, 100)', c=\"cornflowerblue\")\n", + "plt.plot(ds_lengths, cd_sizes, label='w/ FeaturesById - (100, 100)', c=\"teal\")\n", + "plt.yscale(\"log\")\n", + "plt.xscale(\"log\")\n", + "plt.xlabel(\"Dataset Size\")\n", + "plt.ylabel(\"Memory usage (bytes)\")\n", + "plt.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [] } ], @@ -421,7 +676,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.10" } }, "nbformat": 4,