
Commit

add raw data and notebooks for validation plot
Y0dler committed Oct 11, 2024
1 parent 6a4bd65 commit 48ec3c6
Showing 11 changed files with 634 additions and 0 deletions.
274 changes: 274 additions & 0 deletions docs/source/notebooks/Create validation plot from raw data.ipynb

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Binary file not shown.
359 changes: 359 additions & 0 deletions docs/source/notebooks/Processing test 1 raw data.ipynb
@@ -0,0 +1,359 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import arviz as az\n",
"import json\n",
"import numpy as np\n",
"import pandas\n",
"import pymc as pm\n",
"from matplotlib import pyplot as plt\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"raw_data_files = [\n",
" \"Normal model_normal data_noise level 0.6.xlsx\",\n",
" \"Normal model_normal data_noise level 1.2.xlsx\",\n",
" \"Normal model_skew normal data_noise level 0.6.xlsx\",\n",
" \"Skew normal model_skew normal data_noise level 0.6.xlsx\",\n",
" \"Skew normal model_skew normal data_noise level 1.2.xlsx\",\n",
" \"Skew normal model_normal data_noise level 0.6.xlsx\",\n",
"]\n",
"\n",
"parameters = [\"mean\", \"std\", \"area\", \"height\", \"alpha\", \"baseline_intercept\", \"baseline_slope\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Prepare data in df_results"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'mean'}\n",
"{'std'}\n",
"{'area'}\n",
"{'height'}\n",
"alpha skipalpha\n",
"{'baseline_intercept'}\n",
"{'baseline_slope'}\n",
"{'mean'}\n",
"{'std'}\n",
"{'area'}\n",
"{'height'}\n",
"alpha skipalpha\n",
"{'baseline_intercept'}\n",
"{'baseline_slope'}\n",
"{'mean'}\n",
"{'std'}\n",
"{'area'}\n",
"{'height'}\n",
"alpha skipalpha\n",
"{'baseline_intercept'}\n",
"{'baseline_slope'}\n",
"{'mean'}\n",
"{'std'}\n",
"{'area'}\n",
"{'height'}\n",
"{'alpha'}\n",
"{'baseline_intercept'}\n",
"{'baseline_slope'}\n",
"{'mean'}\n",
"{'std'}\n",
"{'area'}\n",
"{'height'}\n",
"{'alpha'}\n",
"{'baseline_intercept'}\n",
"{'baseline_slope'}\n",
"{'mean'}\n",
"{'std'}\n",
"{'area'}\n",
"{'height'}\n",
"{'alpha'}\n",
"{'baseline_intercept'}\n",
"{'baseline_slope'}\n"
]
}
],
"source": [
"df_results = pandas.DataFrame()\n",
"\n",
"for path in raw_data_files:\n",
" for param in parameters:\n",
" # print(path, param)\n",
" # normal distribution does not have the alpha parameter so skip that when necessary\n",
" if path in [raw_data_files[0], raw_data_files[1], raw_data_files[2]] and param == \"alpha\":\n",
" print(\"alpha skip\" + param)\n",
" continue\n",
" # summary laden\n",
" summary = pandas.read_excel(path, index_col=0)\n",
" # sort summary and calculate differences between true and simulated values\n",
" df = summary.loc[param, [\"mean\", \"sd\", \"true_values\"]]\n",
" print(set(df.index))\n",
" df[\"ratio_mean_to_truth\"] = np.abs(df.loc[:, \"mean\"] / df.loc[:, \"true_values\"])\n",
" df[\"absolute_difference\"] = df.loc[:, \"mean\"] - df.loc[:, \"true_values\"]\n",
" df[\"ratio_std_to_mean\"] = df.loc[:, \"sd\"] / df.loc[:, \"mean\"]\n",
" df[\"within_range_of_1_std\"] = [True if df.iloc[x, 0] - df.iloc[x, 1] <= df.iloc[x, 2] <= df.iloc[x, 0] + df.iloc[x, 1] else False for x in range(len(df))]\n",
" df[\"within_range_of_3_stds\"] = [True if df.iloc[x, 0] - 3 * df.iloc[x, 1] <= df.iloc[x, 2] <= df.iloc[x, 0] + 3 * df.iloc[x, 1] else False for x in range(len(df))]\n",
" df[\"noise_level\"] = len(df) * [list(set(summary.loc[:,\"noise_scale\"]))[0]]\n",
" df[\"draws\"] = len(df) * [list(set(summary.loc[:,\"draws\"]))[0]]\n",
" df[\"tuning\"] = len(df) * [list(set(summary.loc[:,\"tuning_samples\"]))[0]]\n",
" # calculate mean and std of differences\n",
" df2 = pandas.DataFrame()\n",
" df2[\"path\"] = [path]\n",
" df2[\"parameter\"] = [\"\".join(set(df.index))]\n",
" df2[\"ratio_mean_to_truth\"] = [(np.mean(df.loc[:, \"ratio_mean_to_truth\"]), np.std(df.loc[:, \"ratio_mean_to_truth\"]))]\n",
" df2[\"absolute_difference\"] = [(np.mean(df.loc[:, \"absolute_difference\"]), np.std(df.loc[:, \"absolute_difference\"]))]\n",
" df2[\"within_range_of_3_stds\"] = np.count_nonzero(df.loc[:, \"within_range_of_3_stds\"]) / len(df)\n",
" df2[\"within_range_of_1_std\"] = np.count_nonzero(df.loc[:, \"within_range_of_1_std\"]) / len(df)\n",
" df2[\"noise_level\"] = list(set(df[\"noise_level\"]))[0]\n",
" df2[\"tuning samples\"] = list(set(df[\"tuning\"]))[0]\n",
" df2[\"draws\"] = list(set(df[\"draws\"]))[0] \n",
" if path in [raw_data_files[0], raw_data_files[1]]:\n",
" df2[\"data_distribution\"] = [\"normal\"]\n",
" df2[\"model_distribution\"] = [\"normal\"]\n",
" elif path == raw_data_files[2]:\n",
" df2[\"data_distribution\"] = [\"skew normal\"]\n",
" df2[\"model_distribution\"] = [\"normal\"]\n",
" elif path in [raw_data_files[3], raw_data_files[4]]:\n",
" df2[\"data_distribution\"] = [\"skew normal\"]\n",
" df2[\"model_distribution\"] = [\"skew normal\"]\n",
" elif path == raw_data_files[5]:\n",
" df2[\"data_distribution\"] = [\"normal\"]\n",
" df2[\"model_distribution\"] = [\"skew normal\"] \n",
" # save results in one DataFrame for subsequent plotting\n",
" df_results = pandas.concat([df_results, df2])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"model: normal, data: normal, noise level: 0.6\n",
"model: normal, data: normal, noise level: 1.2\n",
"model: normal, data: skew normal, noise level: 0.6\n",
"model: skew normal, data: normal, noise level: 0.6\n",
"model: skew normal, data: skew normal, noise level: 0.6\n",
"model: skew normal, data: skew normal, noise level: 1.2\n"
]
}
],
"source": [
"for model in set(df_results.loc[:, \"model_distribution\"]):\n",
" dfdf = df_results[df_results.loc[:, \"model_distribution\"] == model]\n",
" for data in set(dfdf.loc[:, \"data_distribution\"]):\n",
" dfdf2 = dfdf[dfdf.loc[:, \"data_distribution\"] == data]\n",
" for noise_level in set(dfdf2.loc[:, \"noise_level\"]):\n",
" dfdf3 = dfdf2[dfdf2.loc[:, \"noise_level\"] == noise_level]\n",
" model = list(dfdf3.loc[:,\"model_distribution\"])[0]\n",
" data = list(dfdf3.loc[:,\"data_distribution\"])[0]\n",
" noise = list(dfdf3.loc[:,\"noise_level\"])[0]\n",
" print(f\"model: {model}, data: {data}, noise level: {noise}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"dfdf = df_results[df_results.loc[:, \"model_distribution\"] == \"skew normal\"]\n",
"dfdf2 = dfdf[dfdf.loc[:, \"data_distribution\"] == \"skew normal\"]\n",
"dfdf3 = dfdf2[dfdf2.loc[:, \"noise_level\"] == 0.6]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'normal data, normal model': [[0.9999001336005343,\n",
" 1.0025702609677298,\n",
" 1.0017147282600856,\n",
" 1.000123572878139],\n",
" [0.002286555631402294,\n",
" 0.028900726978078068,\n",
" 0.02958680525019264,\n",
" 0.022445046960539197]],\n",
" 'normal data (higher noise), normal model': [[0.9997316666666668,\n",
" 1.0059567381829964,\n",
" 1.001356598861276,\n",
" 0.9977187067316658],\n",
" [0.004410296979166418,\n",
" 0.05488690135089093,\n",
" 0.055093378982298734,\n",
" 0.04168657187789078]],\n",
" 'skew normal data, normal model': [[0.9990176666666667,\n",
" 0.7598253910963016,\n",
" 0.9869124703934096,\n",
" 0.9889579711666672],\n",
" [0.04540922653553522,\n",
" 0.1425229338854569,\n",
" 0.029251994462966387,\n",
" 0.02178598822049324]],\n",
" 'normal data, skew normal model': [[0.9993873333333333,\n",
" 1.145324094260921,\n",
" 1.0038603930164334,\n",
" 1.0021702322498285],\n",
" [0.025492314214288193,\n",
" 0.06460165579288266,\n",
" 0.0295645094605588,\n",
" 0.022277250178015084]],\n",
" 'skew normal data, skew normal model': [[1.0003276666666665,\n",
" 1.0178059537564914,\n",
" 0.9995769654521169,\n",
" 0.9994046368514812],\n",
" [0.022164664598810824,\n",
" 0.08144664654979102,\n",
" 0.02553221429137138,\n",
" 0.019596288333603468]],\n",
" 'skew normal data (higher noise), skew normal model': [[0.9975454545454545,\n",
" 1.062975971807339,\n",
" 1.0078594345558298,\n",
" 1.0013061414928683],\n",
" [0.029588612507556917,\n",
" 0.13828870506270582,\n",
" 0.050852728197426554,\n",
" 0.03782158437972263]]}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_data = {}\n",
"for model in set(df_results.loc[:, \"model_distribution\"]):\n",
" dfdf = df_results[df_results.loc[:, \"model_distribution\"] == model]\n",
" for data in set(dfdf.loc[:, \"data_distribution\"]):\n",
" dfdf2 = dfdf[dfdf.loc[:, \"data_distribution\"] == data]\n",
" for noise_level in set(dfdf2.loc[:, \"noise_level\"]):\n",
" dfdf3 = dfdf2[dfdf2.loc[:, \"noise_level\"] == noise_level]\n",
" model = list(dfdf3.loc[:,\"model_distribution\"])[0]\n",
" data = list(dfdf3.loc[:,\"data_distribution\"])[0]\n",
" noise = list(dfdf3.loc[:,\"noise_level\"])[0]\n",
" # print(f\"model: {model}, data: {data}, noise level: {noise}\")\n",
" # print(noise)\n",
" dfdf4 = dfdf3[~dfdf3.loc[:, \"parameter\"].isin([\"alpha\", \"baseline_intercept\", \"baseline_slope\"])]\n",
" if noise == 1.2:\n",
" all_data[f\"{data} data (higher noise), {model} model\"] = [[x[0] for x in list(dfdf4.loc[:,\"ratio_mean_to_truth\"])], [x[1] for x in list(dfdf4.loc[:,\"ratio_mean_to_truth\"])]]\n",
" else:\n",
" all_data[f\"{data} data, {model} model\"] = [[x[0] for x in list(dfdf4.loc[:,\"ratio_mean_to_truth\"])], [x[1] for x in list(dfdf4.loc[:,\"ratio_mean_to_truth\"])]]\n",
"all_data"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['normal data, normal model', 'normal data (higher noise), normal model', 'skew normal data, normal model', 'skew normal data, skew normal model', 'skew normal data (higher noise), skew normal model', 'normal data, skew normal model'])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rearrange = ['normal data, normal model', 'normal data (higher noise), normal model', 'skew normal data, normal model', 'skew normal data, skew normal model', 'skew normal data (higher noise), skew normal model','normal data, skew normal model']\n",
"reordered_dict = {k: all_data[k] for k in rearrange}\n",
"reordered_dict.keys()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# save processed data in file\n",
"\n",
"# with open('all_data.txt', 'w') as file:\n",
"# file.write(json.dumps(reordered_dict)) # use `json.loads` to do the reverse"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Last updated: 2024-10-11T18:34:57.629742+02:00\n",
"\n"
]
}
],
"source": [
"%load_ext watermark\n",
"%watermark -idu"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "nutpie_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions docs/source/notebooks/test1_all_data.txt
@@ -0,0 +1 @@
{"normal data, normal model": [[0.9999001336005343, 1.0025702609677298, 1.0017147282600856, 1.000123572878139], [0.002286555631402294, 0.028900726978078068, 0.02958680525019264, 0.022445046960539197]], "normal data (higher noise), normal model": [[0.9997316666666668, 1.0059567381829964, 1.001356598861276, 0.9977187067316658], [0.004410296979166418, 0.05488690135089093, 0.055093378982298734, 0.04168657187789078]], "skew normal data, normal model": [[0.9990176666666667, 0.7598253910963016, 0.9869124703934096, 0.9889579711666672], [0.04540922653553522, 0.1425229338854569, 0.029251994462966387, 0.02178598822049324]], "skew normal data, skew normal model": [[1.0003276666666665, 1.0178059537564914, 0.9995769654521169, 0.9994046368514812], [0.022164664598810824, 0.08144664654979102, 0.02553221429137138, 0.019596288333603468]], "skew normal data (higher noise), skew normal model": [[0.9975454545454545, 1.062975971807339, 1.0078594345558298, 1.0013061414928683], [0.029588612507556917, 0.13828870506270582, 0.050852728197426554, 0.03782158437972263]], "normal data, skew normal model": [[0.9993873333333333, 1.145324094260921, 1.0038603930164334, 1.0021702322498285], [0.025492314214288193, 0.06460165579288266, 0.0295645094605588, 0.022277250178015084]]}
Binary file added docs/source/notebooks/test2_summary.xlsx
Binary file not shown.
Binary file added docs/source/notebooks/test3_df_comparison.xlsx
Binary file not shown.
