From a3a507947f76f91b5443bf2b5ec8bc61c06e8cd0 Mon Sep 17 00:00:00 2001
From: VincentAURIAU
Date: Mon, 8 Apr 2024 16:26:19 +0200
Subject: [PATCH] Update mem usage

---
 docs/paper/memory_usage.ipynb | 413 +++++++++++++++++++++++++++-------
 1 file changed, 334 insertions(+), 79 deletions(-)

diff --git a/docs/paper/memory_usage.ipynb b/docs/paper/memory_usage.ipynb
index cdbe8e61..788e38ab 100644
--- a/docs/paper/memory_usage.ipynb
+++ b/docs/paper/memory_usage.ipynb
@@ -6,7 +6,14 @@
   "source": [
    "# RAM usage with Choice-Learn\n",
    "\n",
-   "## On the ICDM 2013 Expedia Dataset"
+   "## On the ICDM 2013 Expedia Dataset\n",
+   "\n",
+   "- [Choice-Learn's ChoiceDataset with FeaturesByIDs](#with-choice-learn-and-featuresbyids)\n",
+   "- [Choice-Learn's ChoiceDataset without FeaturesByIDs](#choice-learn-without-featuresbyids)\n",
+   "- [pandas.DataFrame in Long format](#pandasdataframe-long-format)\n",
+   "- [pandas.DataFrame in Wide format](#pandasdataframe-wide-format)\n",
+   "- [Plots and Illustrations](#plots-and-illustrations)\n",
+   "- [FeaturesByIDs Study](#featuresbyids-study)"
   ]
  },
  {
@@ -62,17 +69,40 @@
    "        obj_q = new_refr.values()\n",
    "        marked.update(new_refr.keys())\n",
    "\n",
-   "    return sz\n",
-   "\n",
-   "# Defining tested data lengths\n",
-   "data_lengths = [100, 1000, 10000, 100000, 397618]"
+   "    return sz"
   ]
 },
 {
-  "cell_type": "markdown",
+  "cell_type": "code",
+  "execution_count": null,
   "metadata": {},
+  "outputs": [],
   "source": [
-   "### With Choice-Learn and FeaturesByIDs"
+   "from choice_learn.data import OneHotStorage\n",
+   "\n",
+   "### Small Example\n",
+   "\n",
+   "n_fixed_features = 10\n",
+   "n_different_values = 10\n",
+   "n_data = 100\n",
+   "\n",
+   "indexes = np.random.randint(n_different_values, size=(n_data, ))\n",
+   "\n",
+   "dense_features = np.zeros((n_data, n_fixed_features))\n",
+   "dense_features[np.arange(n_data), indexes] = 1\n",
+   "\n",
+   "storage = OneHotStorage(ids=list(range(n_different_values)))\n",
+   "\n",
+   "assert (storage.batch[indexes] == dense_features).all()\n",
+   "\n",
+   "### Dense features memory usage:\n",
+   "print(\"Dense memory usage:\", get_obj_size(dense_features))\n",
+   "\n",
+   "### FeaturesByIDs memory usage:\n",
+   "# Storage memory usage + indexes memory usage\n",
+   "print(\"FeaturesByIDs memory usage:\", get_obj_size(storage) + get_obj_size(indexes))\n",
+   "\n",
+   "\n"
   ]
 },
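+{
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "A quick sanity check on `get_obj_size` (added sketch, not part of the original benchmark): for a plain NumPy array it should roughly match the array's own `nbytes` plus a small object-header overhead."
+  ]
+},
+{
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "# Added sketch: cross-check get_obj_size against NumPy's own accounting.\n",
+   "# dense_features holds n_data * n_fixed_features float64 values (8 bytes each).\n",
+   "print(\"nbytes:\", dense_features.nbytes)\n",
+   "print(\"get_obj_size:\", get_obj_size(dense_features))"
+  ]
+},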
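+{
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Added sketch (not part of the original benchmark): comparing the two measurements, assuming `w_fbid_memory_size` and `wofbid_mem` from the previous cells."
+  ]
+},
+{
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "# Added sketch: memory footprint ratio at each tested dataset length.\n",
+   "for length, w, wo in zip(data_lengths, w_fbid_memory_size, wofbid_mem):\n",
+   "    print(f\"{length} choices: dense / FeaturesByIDs = {wo / w:.2f}\")"
+  ]
+},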
"plt.legend()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wo_fbid_dataset.available_items_by_choice[0].dtype" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "get_obj_size(np.copy(dataset.shared_features_by_choice[0])) / get_obj_size(np.copy(wo_fbid_dataset.shared_features_by_choice[0]))" + " wofbid_mem.append(get_obj_size(sub_dataset))" ] }, { @@ -382,6 +416,125 @@ " long_df_memory_size.append(get_obj_size(sub_long_df))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### pandas.DataFrame wide format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"../../choice_learn/datasets/data/expedia_rumnet_preprocessing.csv\", engine=\"pyarrow\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "site_id_one_hot = pd.get_dummies(df.site_id, prefix=\"site_id\")\n", + "visitor_location_country_id_one_hot = pd.get_dummies(df.visitor_location_country_id, prefix=\"visitor_location_country_id\")\n", + "srch_destination_id_one_hot =pd.get_dummies(df.srch_destination_id, prefix=\"srch_destination_id\")\n", + "prop_country_id_one_hpt = pd.get_dummies(df.prop_country_id, prefix=\"prop_country_id\")\n", + "df = pd.concat([df, site_id_one_hot, visitor_location_country_id_one_hot, srch_destination_id_one_hot, prop_country_id_one_hpt], axis=1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "contexts_items_features_names = [\n", + " \"prop_starrating\",\n", + " \"prop_review_score\",\n", + " \"prop_brand_bool\",\n", + " \"prop_location_score1\",\n", + " \"prop_location_score2\",\n", + " \"prop_log_historical_price\",\n", + " \"position\",\n", + " \"promotion_flag\",\n", + " \"orig_destination_distance\",\n", + " \"log_price\",\n", + " \"prop_country_id\",\n", + "]\n", + "contexts_features_names = [\n", + " \"srch_id\",\n", + " \"srch_length_of_stay\",\n", + " \"srch_adults_count\",\n", + " \"srch_children_count\",\n", + " \"srch_room_count\",\n", + " \"srch_saturday_night_bool\",\n", + " \"booking_window\",\n", + " \"random_bool\",\n", + " \"day_of_week\",\n", + " \"month\",\n", + " \"hour\",\n", + " \"site_id\",\n", + " \"visitor_location_country_id\",\n", + " \"srch_destination_id\",\n", + "]\n", + "for col in df.columns:\n", + " if col.startswith(\"prop_country_id\"):\n", + " contexts_items_features_names += [col]\n", + " if col.startswith(\"site_id\"):\n", + " contexts_features_names += [col]\n", + " if col.startswith(\"visitor_location_country_id\"):\n", + " contexts_features_names += [col]\n", + " if col.startswith(\"srch_destination_id\"):\n", + " contexts_features_names += [col]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wide_items = []\n", + "for i in range(39):\n", + " sub_df = df.groupby(\"srch_id\").apply(lambda x: x[contexts_items_features_names].iloc[i])\n", + " wide_items.append(sub_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "contexts_features = df.groupby(\"srch_id\").apply(lambda x: x[contexts_features_names].iloc[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wide_df = 
pd.concat(wide_items+[contexts_features], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wide_df_memory_size = []\n", + "for length in data_lengths:\n", + " sub_wide_df = wide_df.iloc[:length].copy()\n", + " wide_df_memory_size.append(get_obj_size(sub_wide_df))" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -392,16 +545,118 @@ "|---|---|---|---|---|---|\n", "| CD w. FeaturesByIDs | 398.887 | 3.869.287 | 38.573.287 | 385.613.287 | 1.533.228.295 |\n", "| CD wo FeaturesByIDs | 190.028 | 1.887.428 | 18.861.428 | 188.601.428 | 749.908.976 |\n", + "| Long format DF |\n", + "\n", "\n", "data_lengths: [100, 1000, 10000, 100000, 397618]\\\n", "ChoiceDataset with FeaturesByIDs: [85640, 771440, 7629440, 76209440, 302994356]\\\n", "ChoiceDataset without FeaturesByIDs: [220400, 2198600, 21980600, 219800600, 873964964]\\\n", - "DF Long Format: [5521360, 52463080, 524667470, 5234198450, 20815361140]" + "DF Long Format: [5521360, 52463080, 524667470, 5234198450, 20815361140]\\\n", + "DF Wide Format: [3503109, 31784709, 314600709, 3142760709, 12495108741]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plots and Illustrations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "data_lengths = [100, 1000, 10000, 100000, 397618]\n", + "cd_with = [85640, 771440, 7629440, 76209440, 302994356]\n", + "cd_wo = [220400, 2198600, 21980600, 219800600, 873964964]\n", + "df_long = [5521360, 52463080, 524667470, 5234198450, 20815361140]\n", + "df_wide = [6252516976, 6266664976, 6408144976, 7822944976, 12501499936]\n", + "df_wide = [3503109, 31784709, 314600709, 3142760709, 12495108741]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.plot(data_lengths, cd_with, label=\"ChoiceDataset with FeaturesByIDs\", c=\"teal\")\n", + "plt.plot(data_lengths, cd_wo, label=\"ChoiceDataset withOUT FeaturesByIDs\", c=\"turquoise\")\n", + "plt.plot(data_lengths, df_long, label=\"pd.DataFrame Long format\", c=\"darkblue\")\n", + "plt.plot(data_lengths, df_wide, label=\"pd.DataFrame Wide format\", c=\"cornflowerblue\")\n", + "plt.legend()\n", + "plt.yscale(\"log\")\n", + "plt.xlabel(\"Dataset Size (x1000)\")\n", + "plt.ylabel(\"Memory Size (bytes)\")\n", + "plt.xticks([0, 50000, 100000, 150000, 200000, 250000, 300000, 350000, 400000], [0, 50, 100, 150, 200, 250, 300, 350, 400])\n", + "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, + "source": [ + "## FeaturesByIDs Study" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_fixed_features = 10\n", + "n_different_values = 10\n", + "\n", + "normal_sizes = []\n", + "cd_sizes = []\n", + "ds_lengths = [10, 100, 1000, 10000, 100000, 10000000]\n", + "for dataset_len in ds_lengths:\n", + "\n", + " normal_dataset = np.ones((dataset_len, n_fixed_features))\n", + " cd_dataset = (np.ones((dataset_len, 1)), np.ones((n_different_values, n_fixed_features)))\n", + " \n", + " cd_sizes.append(sys.getsizeof(cd_dataset[0]) + sys.getsizeof(cd_dataset[1]))\n", + " normal_sizes.append(sys.getsizeof(normal_dataset))\n", + "\n", + "plt.plot(ds_lengths, normal_sizes, label='w/o FeaturesById - (10, 10)', c=\"darkblue\")\n", + "plt.plot(ds_lengths, cd_sizes, label='w/ FeaturesById - (10, 10)', c=\"turquoise\")\n", + "plt.yscale(\"log\")\n", + "plt.xscale(\"log\")\n", + 
"plt.xlabel(\"Dataset Size\")\n", + "plt.ylabel(\"Memory usage (bytes)\")\n", + "\n", + "n_fixed_features = 100\n", + "n_different_values = 100\n", + "\n", + "normal_sizes = []\n", + "cd_sizes = []\n", + "ds_lengths = [10, 100, 1000, 10000, 100000, 10000000]\n", + "for dataset_len in ds_lengths:\n", + "\n", + " normal_dataset = np.ones((dataset_len, n_fixed_features))\n", + " cd_dataset = (np.ones((dataset_len, 1)), np.ones((n_different_values, n_fixed_features)))\n", + " \n", + " cd_sizes.append(sys.getsizeof(cd_dataset[0]) + sys.getsizeof(cd_dataset[1]))\n", + " normal_sizes.append(sys.getsizeof(normal_dataset))\n", + "\n", + "plt.plot(ds_lengths, normal_sizes, label='w/o FeaturesById - (100, 100)', c=\"cornflowerblue\")\n", + "plt.plot(ds_lengths, cd_sizes, label='w/ FeaturesById - (100, 100)', c=\"teal\")\n", + "plt.yscale(\"log\")\n", + "plt.xscale(\"log\")\n", + "plt.xlabel(\"Dataset Size\")\n", + "plt.ylabel(\"Memory usage (bytes)\")\n", + "plt.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [] } ], @@ -421,7 +676,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.10" } }, "nbformat": 4,