From 0873e4696ed804eada227971e5b70e0d6c472d98 Mon Sep 17 00:00:00 2001 From: silask Date: Wed, 25 Oct 2023 13:46:29 +0200 Subject: [PATCH] start translating genen catalog --- Python/Genecatalog.ipynb | 374 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 366 insertions(+), 8 deletions(-) diff --git a/Python/Genecatalog.ipynb b/Python/Genecatalog.ipynb index 350fc5e..4fa5702 100644 --- a/Python/Genecatalog.ipynb +++ b/Python/Genecatalog.ipynb @@ -23,12 +23,15 @@ "import matplotlib.pylab as plt\n", "import seaborn as sns\n", "from pathlib import Path\n", - "import h5py" + "import h5py\n", + "\n", + "import yaml\n", + "import os" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -44,14 +47,369 @@ } ], "source": [ - "atlas_dir= Path(\"../DiarrheaExample/\")\n", "\n", - "gene_abundance_file= atlas_dir/\"Genecatalog/counts/median_coverage.h5\"\n", "\n", - "with h5py.File(gene_abundance_file, 'r') as hdf_file:\n", + "data_dir = \"../DiarrheaExample/\"\n", + "atlas_version = \"v2.17\"\n", + "\n", + "# Load the YAML configuration file\n", + "with open(\"../atlas_output_files.yaml\", 'r') as yaml_file:\n", + " config = yaml.load(yaml_file, Loader=yaml.FullLoader)\n", + "\n", + "# Access the specified version in the configuration\n", + "files = config[atlas_version]\n", + "\n", + "# Function to update file paths in the nested dictionary\n", + "def update_paths(value, data_dir):\n", + " if isinstance(value, str):\n", + " return os.path.join(data_dir, value)\n", + " elif isinstance(value, dict):\n", + " return {key: update_paths(subvalue, data_dir) for key, subvalue in value.items()}\n", + "\n", + "# Update file paths in the configuration\n", + "files = update_paths(files, data_dir)\n", + "\n", + "# Access the \"abundance_file\" from the updated configuration\n", + "genecatalog_files = files[\"genecatalog\"]\n", + "abundance_file = genecatalog_files[\"coverage\"]\n" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "# Look at the gene stats\n", + "\n", + "Let's have a look at the dimension of the data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "\"Unable to synchronously open object (object 'dim' doesn't exist)\"", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/Users/silas/Documents/GitHub/Tutorial/Python/Genecatalog.ipynb Cell 3\u001b[0m line \u001b[0;36m4\n\u001b[1;32m 1\u001b[0m \u001b[39m# Open the HDF5 file\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[39mwith\u001b[39;00m h5py\u001b[39m.\u001b[39mFile(abundance_file, \u001b[39m'\u001b[39m\u001b[39mr\u001b[39m\u001b[39m'\u001b[39m) \u001b[39mas\u001b[39;00m h5file:\n\u001b[1;32m 3\u001b[0m \u001b[39m# Get the dimensions\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m dim \u001b[39m=\u001b[39m h5file[\u001b[39m'\u001b[39;49m\u001b[39mdim\u001b[39;49m\u001b[39m'\u001b[39;49m][:]\n\u001b[1;32m 5\u001b[0m Ngenes, Nsamples \u001b[39m=\u001b[39m dim\n\u001b[1;32m 7\u001b[0m \u001b[39m# Print the dimensions\u001b[39;00m\n", + "File \u001b[0;32mh5py/_objects.pyx:54\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mh5py/_objects.pyx:55\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/h5py/_hl/group.py:357\u001b[0m, in \u001b[0;36mGroup.__getitem__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 355\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mInvalid HDF5 object reference\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 356\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(name, (\u001b[39mbytes\u001b[39m, 
\u001b[39mstr\u001b[39m)):\n\u001b[0;32m--> 357\u001b[0m oid \u001b[39m=\u001b[39m h5o\u001b[39m.\u001b[39;49mopen(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mid, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_e(name), lapl\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_lapl)\n\u001b[1;32m 358\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 359\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mTypeError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mAccessing a group is done with bytes or str, \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 360\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mnot \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(\u001b[39mtype\u001b[39m(name)))\n", + "File \u001b[0;32mh5py/_objects.pyx:54\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mh5py/_objects.pyx:55\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mh5py/h5o.pyx:189\u001b[0m, in \u001b[0;36mh5py.h5o.open\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: \"Unable to synchronously open object (object 'dim' doesn't exist)\"" + ] + } + ], + "source": [ + "\n", + "\n", + "# Open the HDF5 file\n", + "with h5py.File(abundance_file, 'r') as h5file:\n", + " # Get the dimensions\n", + " dim = h5file['dim'][:]\n", + " Ngenes, Nsamples = dim\n", + "\n", + "# Print the dimensions\n", + "print(f\"The genecatalog contains {Ngenes} genes and {Nsamples} samples.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because the dimensions of the genecatalog are huge (even more so with more samples) but\n", + "many genes are detected only in a subset of samples,\n", + "I optimized the file format to allow for fast loading of a subset of the data.\n", + "\n", + "However we still want information from all the genes.\n", + "The file `r genecatalog_files$sample_stats` contains stats per 
sample of the genecatalog.\n", + "In particular, it holds the number of genes detected in each sample and the total coverage, which we will use for normalization.\n", + "\n", + "Similarly, the file `genecatalog_files[\"coverage_stats\"]` contains stats per gene, e.g. the number of samples in which the gene is detected.\n", + "\n", + "Let's first look at the stats per sample.\n", + "\n", + "### Gene stats per sample" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.\n", + " with pd.option_context('mode.use_inf_as_na', True):\n", + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.\n", + " with pd.option_context('mode.use_inf_as_na', True):\n", + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.\n", + " with pd.option_context('mode.use_inf_as_na', True):\n", + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.\n", + " with pd.option_context('mode.use_inf_as_na', True):\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "image/png": { + "height": 589, + "width": 1183 + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# Create a subplot with two axes\n", + "fig, axes = plt.subplots(1, 2, figsize=(12, 6))\n", + "\n", + "# Plot \"Total coverage\" on the first axis\n", + "sns.swarmplot(y=\"Sum_coverage\", data=sample_stats, ax=axes[0])\n", + "axes[0].set_ylabel(\"\",visible=False)\n", + "axes[0].set_title(\"Total coverage\")\n", + "\n", + "# Plot \"N detected genes\" on the second axis\n", + "sns.swarmplot(y=\"Genes_nz_coverage\", data=sample_stats, ax=axes[1])\n", + "axes[1].set_ylabel(\"\",visible=False)\n", + "axes[1].set_title(\"N detected genes\")\n", + "\n", + "# Set a common title for both axes\n", + "fig.suptitle(\"Gene stats per sample\", fontsize=16)\n", + "\n", + "# Adjust spacing between subplots\n", + "plt.tight_layout(rect=[0, 0, 1, 0.95])\n", + "\n", + "# Show the plot\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stats per gene\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GeneNameLengthGCSamples_nz_coverageSamples_nz_countsSum_coverageMax_coverage
0Gene0000015310.386117302093900
1Gene0000027230.4716101939994
2Gene0000039510.5047818771273
3Gene0000045370.4646585022
4Gene0000054050.5210610168
\n", + "
" ], + "text/plain": [ + " GeneName Length GC Samples_nz_coverage Samples_nz_counts \\\n", + "0 Gene000001 531 0.3861 17 30 \n", + "1 Gene000002 723 0.4716 10 19 \n", + "2 Gene000003 951 0.5047 8 18 \n", + "3 Gene000004 537 0.4646 5 8 \n", + "4 Gene000005 405 0.5210 6 10 \n", + "\n", + " Sum_coverage Max_coverage \n", + "0 2093 900 \n", + "1 399 94 \n", + "2 771 273 \n", + "3 50 22 \n", + "4 16 8 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "gene_stats_file = genecatalog_files[\"coverage_stats\"]\n", + "gene_stats = pd.read_parquet(gene_stats_file)\n", + "\n", + "\n", + "gene_stats.head()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- `Samples_nz_coverage`: number of samples in which the gene has non-zero coverage\n", + "- `Samples_nz_counts`: number of samples in which the gene has non-zero counts\n", + "- `Sum_coverage`: sum of the coverage of the gene over all samples\n", + "\n", + "The values of `Samples_nz_coverage` and `Samples_nz_counts` are not the same\n", + "because when only a few reads map to a gene and less than half of the gene is covered, the median coverage is zero.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\n", + " if pd.api.types.is_categorical_dtype(vector):\n", + "/Users/silas/miniforge3/lib/python3.10/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. 
Convert inf values to NaN before operating instead.\n", + " with pd.option_context('mode.use_inf_as_na', True):\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/Users/silas/Documents/GitHub/Tutorial/Python/Genecatalog.ipynb Cell 10\u001b[0m line \u001b[0;36m3\n\u001b[1;32m 1\u001b[0m \u001b[39m# Create a histogram for log10(Sum_coverage)\u001b[39;00m\n\u001b[1;32m 2\u001b[0m plt\u001b[39m.\u001b[39mfigure(figsize\u001b[39m=\u001b[39m(\u001b[39m8\u001b[39m, \u001b[39m4\u001b[39m))\n\u001b[0;32m----> 3\u001b[0m sns\u001b[39m.\u001b[39;49mhistplot(gene_stats[\u001b[39m\"\u001b[39;49m\u001b[39mSum_coverage\u001b[39;49m\u001b[39m\"\u001b[39;49m], binwidth\u001b[39m=\u001b[39;49m\u001b[39m0.2\u001b[39;49m, color\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mblue\u001b[39;49m\u001b[39m\"\u001b[39;49m, kde\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[1;32m 4\u001b[0m plt\u001b[39m.\u001b[39mtitle(\u001b[39m\"\u001b[39m\u001b[39mHistogram of log10(Sum_coverage)\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 5\u001b[0m plt\u001b[39m.\u001b[39mxlabel(\u001b[39m\"\u001b[39m\u001b[39mlog10(Sum_coverage)\u001b[39m\u001b[39m\"\u001b[39m)\n", + "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/seaborn/distributions.py:1432\u001b[0m, in \u001b[0;36mhistplot\u001b[0;34m(data, x, y, hue, weights, stat, bins, binwidth, binrange, discrete, cumulative, common_bins, common_norm, multiple, element, fill, shrink, kde, kde_kws, line_kws, thresh, pthresh, pmax, cbar, cbar_ax, cbar_kws, palette, hue_order, hue_norm, color, log_scale, legend, ax, **kwargs)\u001b[0m\n\u001b[1;32m 1421\u001b[0m estimate_kws \u001b[39m=\u001b[39m \u001b[39mdict\u001b[39m(\n\u001b[1;32m 1422\u001b[0m 
stat\u001b[39m=\u001b[39mstat,\n\u001b[1;32m 1423\u001b[0m bins\u001b[39m=\u001b[39mbins,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1427\u001b[0m cumulative\u001b[39m=\u001b[39mcumulative,\n\u001b[1;32m 1428\u001b[0m )\n\u001b[1;32m 1430\u001b[0m \u001b[39mif\u001b[39;00m p\u001b[39m.\u001b[39munivariate:\n\u001b[0;32m-> 1432\u001b[0m p\u001b[39m.\u001b[39;49mplot_univariate_histogram(\n\u001b[1;32m 1433\u001b[0m multiple\u001b[39m=\u001b[39;49mmultiple,\n\u001b[1;32m 1434\u001b[0m element\u001b[39m=\u001b[39;49melement,\n\u001b[1;32m 1435\u001b[0m fill\u001b[39m=\u001b[39;49mfill,\n\u001b[1;32m 1436\u001b[0m shrink\u001b[39m=\u001b[39;49mshrink,\n\u001b[1;32m 1437\u001b[0m common_norm\u001b[39m=\u001b[39;49mcommon_norm,\n\u001b[1;32m 1438\u001b[0m common_bins\u001b[39m=\u001b[39;49mcommon_bins,\n\u001b[1;32m 1439\u001b[0m kde\u001b[39m=\u001b[39;49mkde,\n\u001b[1;32m 1440\u001b[0m kde_kws\u001b[39m=\u001b[39;49mkde_kws,\n\u001b[1;32m 1441\u001b[0m color\u001b[39m=\u001b[39;49mcolor,\n\u001b[1;32m 1442\u001b[0m legend\u001b[39m=\u001b[39;49mlegend,\n\u001b[1;32m 1443\u001b[0m estimate_kws\u001b[39m=\u001b[39;49mestimate_kws,\n\u001b[1;32m 1444\u001b[0m line_kws\u001b[39m=\u001b[39;49mline_kws,\n\u001b[1;32m 1445\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs,\n\u001b[1;32m 1446\u001b[0m )\n\u001b[1;32m 1448\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 1450\u001b[0m p\u001b[39m.\u001b[39mplot_bivariate_histogram(\n\u001b[1;32m 1451\u001b[0m common_bins\u001b[39m=\u001b[39mcommon_bins,\n\u001b[1;32m 1452\u001b[0m common_norm\u001b[39m=\u001b[39mcommon_norm,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1462\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs,\n\u001b[1;32m 1463\u001b[0m )\n", + "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/seaborn/distributions.py:575\u001b[0m, in \u001b[0;36m_DistributionPlotter.plot_univariate_histogram\u001b[0;34m(self, multiple, element, fill, common_norm, common_bins, shrink, kde, 
kde_kws, color, legend, line_kws, estimate_kws, **plot_kws)\u001b[0m\n\u001b[1;32m 570\u001b[0m \u001b[39mif\u001b[39;00m element \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mbars\u001b[39m\u001b[39m\"\u001b[39m:\n\u001b[1;32m 571\u001b[0m \n\u001b[1;32m 572\u001b[0m \u001b[39m# Use matplotlib bar plotting\u001b[39;00m\n\u001b[1;32m 574\u001b[0m plot_func \u001b[39m=\u001b[39m ax\u001b[39m.\u001b[39mbar \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdata_variable \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mx\u001b[39m\u001b[39m\"\u001b[39m \u001b[39melse\u001b[39;00m ax\u001b[39m.\u001b[39mbarh\n\u001b[0;32m--> 575\u001b[0m artists \u001b[39m=\u001b[39m plot_func(\n\u001b[1;32m 576\u001b[0m hist[\u001b[39m\"\u001b[39;49m\u001b[39medges\u001b[39;49m\u001b[39m\"\u001b[39;49m],\n\u001b[1;32m 577\u001b[0m hist[\u001b[39m\"\u001b[39;49m\u001b[39mheights\u001b[39;49m\u001b[39m\"\u001b[39;49m] \u001b[39m-\u001b[39;49m bottom,\n\u001b[1;32m 578\u001b[0m hist[\u001b[39m\"\u001b[39;49m\u001b[39mwidths\u001b[39;49m\u001b[39m\"\u001b[39;49m],\n\u001b[1;32m 579\u001b[0m bottom,\n\u001b[1;32m 580\u001b[0m align\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39medge\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 581\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49martist_kws,\n\u001b[1;32m 582\u001b[0m )\n\u001b[1;32m 584\u001b[0m \u001b[39mfor\u001b[39;00m bar \u001b[39min\u001b[39;00m artists:\n\u001b[1;32m 585\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdata_variable \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mx\u001b[39m\u001b[39m\"\u001b[39m:\n", + "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/matplotlib/__init__.py:1465\u001b[0m, in \u001b[0;36m_preprocess_data..inner\u001b[0;34m(ax, data, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1462\u001b[0m \u001b[39m@functools\u001b[39m\u001b[39m.\u001b[39mwraps(func)\n\u001b[1;32m 1463\u001b[0m 
\u001b[39mdef\u001b[39;00m \u001b[39minner\u001b[39m(ax, \u001b[39m*\u001b[39margs, data\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m 1464\u001b[0m \u001b[39mif\u001b[39;00m data \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m-> 1465\u001b[0m \u001b[39mreturn\u001b[39;00m func(ax, \u001b[39m*\u001b[39;49m\u001b[39mmap\u001b[39;49m(sanitize_sequence, args), \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1467\u001b[0m bound \u001b[39m=\u001b[39m new_sig\u001b[39m.\u001b[39mbind(ax, \u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 1468\u001b[0m auto_label \u001b[39m=\u001b[39m (bound\u001b[39m.\u001b[39marguments\u001b[39m.\u001b[39mget(label_namer)\n\u001b[1;32m 1469\u001b[0m \u001b[39mor\u001b[39;00m bound\u001b[39m.\u001b[39mkwargs\u001b[39m.\u001b[39mget(label_namer))\n", + "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/matplotlib/axes/_axes.py:2534\u001b[0m, in \u001b[0;36mAxes.bar\u001b[0;34m(self, x, height, width, bottom, align, **kwargs)\u001b[0m\n\u001b[1;32m 2532\u001b[0m \u001b[39melse\u001b[39;00m: \u001b[39m# horizontal\u001b[39;00m\n\u001b[1;32m 2533\u001b[0m r\u001b[39m.\u001b[39msticky_edges\u001b[39m.\u001b[39mx\u001b[39m.\u001b[39mappend(l)\n\u001b[0;32m-> 2534\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49madd_patch(r)\n\u001b[1;32m 2535\u001b[0m patches\u001b[39m.\u001b[39mappend(r)\n\u001b[1;32m 2537\u001b[0m \u001b[39mif\u001b[39;00m xerr \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mor\u001b[39;00m yerr \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/matplotlib/axes/_base.py:2384\u001b[0m, in \u001b[0;36m_AxesBase.add_patch\u001b[0;34m(self, p)\u001b[0m\n\u001b[1;32m 2382\u001b[0m \u001b[39mif\u001b[39;00m 
p\u001b[39m.\u001b[39mget_clip_path() \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 2383\u001b[0m p\u001b[39m.\u001b[39mset_clip_path(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpatch)\n\u001b[0;32m-> 2384\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_update_patch_limits(p)\n\u001b[1;32m 2385\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_children\u001b[39m.\u001b[39mappend(p)\n\u001b[1;32m 2386\u001b[0m p\u001b[39m.\u001b[39m_remove_method \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_children\u001b[39m.\u001b[39mremove\n", + "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/matplotlib/axes/_base.py:2406\u001b[0m, in \u001b[0;36m_AxesBase._update_patch_limits\u001b[0;34m(self, patch)\u001b[0m\n\u001b[1;32m 2403\u001b[0m \u001b[39m# Get all vertices on the path\u001b[39;00m\n\u001b[1;32m 2404\u001b[0m \u001b[39m# Loop through each segment to get extrema for Bezier curve sections\u001b[39;00m\n\u001b[1;32m 2405\u001b[0m vertices \u001b[39m=\u001b[39m []\n\u001b[0;32m-> 2406\u001b[0m \u001b[39mfor\u001b[39;00m curve, code \u001b[39min\u001b[39;00m p\u001b[39m.\u001b[39miter_bezier(simplify\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m):\n\u001b[1;32m 2407\u001b[0m \u001b[39m# Get distance along the curve of any extrema\u001b[39;00m\n\u001b[1;32m 2408\u001b[0m _, dzeros \u001b[39m=\u001b[39m curve\u001b[39m.\u001b[39maxis_aligned_extrema()\n\u001b[1;32m 2409\u001b[0m \u001b[39m# Calculate vertices of start, end and any extrema in between\u001b[39;00m\n", + "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/matplotlib/path.py:445\u001b[0m, in \u001b[0;36mPath.iter_bezier\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 443\u001b[0m \u001b[39myield\u001b[39;00m BezierSegment(np\u001b[39m.\u001b[39marray([first_vert])), code\n\u001b[1;32m 444\u001b[0m \u001b[39melif\u001b[39;00m code \u001b[39m==\u001b[39m Path\u001b[39m.\u001b[39mLINETO: \u001b[39m# 
\"CURVE2\"\u001b[39;00m\n\u001b[0;32m--> 445\u001b[0m \u001b[39myield\u001b[39;00m BezierSegment(np\u001b[39m.\u001b[39;49marray([prev_vert, verts])), code\n\u001b[1;32m 446\u001b[0m \u001b[39melif\u001b[39;00m code \u001b[39m==\u001b[39m Path\u001b[39m.\u001b[39mCURVE3:\n\u001b[1;32m 447\u001b[0m \u001b[39myield\u001b[39;00m BezierSegment(np\u001b[39m.\u001b[39marray([prev_vert, verts[:\u001b[39m2\u001b[39m],\n\u001b[1;32m 448\u001b[0m verts[\u001b[39m2\u001b[39m:]])), code\n", + "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/matplotlib/bezier.py:195\u001b[0m, in \u001b[0;36mBezierSegment.__init__\u001b[0;34m(self, control_points)\u001b[0m\n\u001b[1;32m 193\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_N, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_d \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_cpoints\u001b[39m.\u001b[39mshape\n\u001b[1;32m 194\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_orders \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39marange(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_N)\n\u001b[0;32m--> 195\u001b[0m coeff \u001b[39m=\u001b[39m [math\u001b[39m.\u001b[39mfactorial(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_N \u001b[39m-\u001b[39m \u001b[39m1\u001b[39m)\n\u001b[1;32m 196\u001b[0m \u001b[39m/\u001b[39m\u001b[39m/\u001b[39m (math\u001b[39m.\u001b[39mfactorial(i) \u001b[39m*\u001b[39m math\u001b[39m.\u001b[39mfactorial(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_N \u001b[39m-\u001b[39m \u001b[39m1\u001b[39m \u001b[39m-\u001b[39m i))\n\u001b[1;32m 197\u001b[0m \u001b[39mfor\u001b[39;00m i \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_N)]\n\u001b[1;32m 198\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_px \u001b[39m=\u001b[39m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_cpoints\u001b[39m.\u001b[39mT \u001b[39m*\u001b[39m coeff)\u001b[39m.\u001b[39mT\n", + "File 
\u001b[0;32m~/miniforge3/lib/python3.10/site-packages/matplotlib/bezier.py:195\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 193\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_N, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_d \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_cpoints\u001b[39m.\u001b[39mshape\n\u001b[1;32m 194\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_orders \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39marange(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_N)\n\u001b[0;32m--> 195\u001b[0m coeff \u001b[39m=\u001b[39m [math\u001b[39m.\u001b[39mfactorial(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_N \u001b[39m-\u001b[39m \u001b[39m1\u001b[39m)\n\u001b[1;32m 196\u001b[0m \u001b[39m/\u001b[39m\u001b[39m/\u001b[39m (math\u001b[39m.\u001b[39mfactorial(i) \u001b[39m*\u001b[39m math\u001b[39m.\u001b[39mfactorial(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_N \u001b[39m-\u001b[39m \u001b[39m1\u001b[39m \u001b[39m-\u001b[39m i))\n\u001b[1;32m 197\u001b[0m \u001b[39mfor\u001b[39;00m i \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_N)]\n\u001b[1;32m 198\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_px \u001b[39m=\u001b[39m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_cpoints\u001b[39m.\u001b[39mT \u001b[39m*\u001b[39m coeff)\u001b[39m.\u001b[39mT\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "\n", + "\n", + "# Create a histogram for log10(Sum_coverage)\n", + "plt.figure(figsize=(8, 4))\n", + "sns.histplot(gene_stats[\"Sum_coverage\"], binwidth=0.2, color=\"blue\", kde=True)\n", + "plt.title(\"Histogram of log10(Sum_coverage)\")\n", + "plt.xlabel(\"log10(Sum_coverage)\")\n", + "plt.ylabel(\"Frequency\")\n", + "plt.show()\n", "\n", - " # data_matrix = hdf_file['data'][:]\n", - " sample_names = hdf_file['data'].attrs['sample_names'].astype(str)" + "# Create a histogram for Samples_nz_coverage\n", + 
"plt.figure(figsize=(8, 4))\n", + "sns.histplot(gene_stats[\"Samples_nz_coverage\"], binwidth=10, color=\"green\", kde=True)\n", + "plt.title(\"Histogram of Samples_nz_coverage\")\n", + "plt.xlabel(\"Samples_nz_coverage\")\n", + "plt.ylabel(\"Frequency\")\n", + "plt.show()\n" ] }, { @@ -78,7 +436,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.10.12" }, "orig_nbformat": 4 },