From 47324490c817fb1acad7d990e5e6ce36734b9143 Mon Sep 17 00:00:00 2001 From: Lucas Camillo Date: Fri, 29 Dec 2023 09:04:19 -0300 Subject: [PATCH] fixed zhangen and zhangblup --- clocks/notebooks/dnamtl.ipynb | 4 ++-- clocks/notebooks/join_metadata.ipynb | 2 +- clocks/notebooks/zhangblup.ipynb | 14 +++++++------- clocks/notebooks/zhangen.ipynb | 28 ++++++++++------------------ pyaging/data/_data.py | 2 +- pyaging/predict/_pred_utils.py | 23 ++++++++++++++++++----- pyaging/predict/_preprocessing.py | 14 ++++++++++++++ pyproject.toml | 2 +- 8 files changed, 54 insertions(+), 35 deletions(-) diff --git a/clocks/notebooks/dnamtl.ipynb b/clocks/notebooks/dnamtl.ipynb index 9c2f237..2197ab6 100644 --- a/clocks/notebooks/dnamtl.ipynb +++ b/clocks/notebooks/dnamtl.ipynb @@ -130,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "id": "303e9b76-993f-4691-af9d-1151b3c7638f", "metadata": {}, "outputs": [ @@ -140,7 +140,7 @@ "0" ] }, - "execution_count": 8, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } diff --git a/clocks/notebooks/join_metadata.ipynb b/clocks/notebooks/join_metadata.ipynb index 84f98cf..056a2c8 100644 --- a/clocks/notebooks/join_metadata.ipynb +++ b/clocks/notebooks/join_metadata.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "59eb29df-0597-4d45-b2e6-8825670effe2", "metadata": {}, "outputs": [], diff --git a/clocks/notebooks/zhangblup.ipynb b/clocks/notebooks/zhangblup.ipynb index a1e22c9..d304889 100644 --- a/clocks/notebooks/zhangblup.ipynb +++ b/clocks/notebooks/zhangblup.ipynb @@ -22,7 +22,7 @@ { "data": { "text/plain": [ - "32768" + "0" ] }, "execution_count": 2, @@ -49,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "a284fe99-dc47-4f0c-b2ff-274e136e7020", "metadata": {}, "outputs": [], @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "id": "7b4c3f6b-72af-4e99-84c4-65b8ef58c91d", "metadata": {}, "outputs": [ @@ -74,7 +74,7 @@ ")" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -90,13 +90,13 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "id": "b38f8af2-3d94-4a45-90b9-30b227828da1", "metadata": {}, "outputs": [], "source": [ "weights_dict = {\n", - " 'preprocessing': None, \n", + " 'preprocessing': 'scale_row', \n", " 'preprocessing_helper': None,\n", " 'postprocessing': None,\n", " 'postprocessing_helper': None,\n", @@ -119,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 7, "id": "34136f3c-92b8-4641-a103-381d3a7dd857", "metadata": {}, "outputs": [], diff --git a/clocks/notebooks/zhangen.ipynb b/clocks/notebooks/zhangen.ipynb index 8a16bde..086a66a 100644 --- a/clocks/notebooks/zhangen.ipynb +++ b/clocks/notebooks/zhangen.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "fb157849-5454-4a60-8548-fff633fff764", "metadata": {}, "outputs": [], @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "id": "b9f484b1-f501-41b7-9565-82e03bfe97dc", "metadata": {}, "outputs": [], @@ -49,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "id": "a284fe99-dc47-4f0c-b2ff-274e136e7020", "metadata": {}, "outputs": [], @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 4, "id": "7b4c3f6b-72af-4e99-84c4-65b8ef58c91d", "metadata": {}, "outputs": [ @@ -74,7 +74,7 @@ ")" ] }, - "execution_count": 11, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -90,13 +90,13 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 5, "id": "b38f8af2-3d94-4a45-90b9-30b227828da1", "metadata": {}, "outputs": [], "source": [ "weights_dict = {\n", - " 'preprocessing': None, \n", + " 'preprocessing': 'scale_row', \n", " 'preprocessing_helper': None,\n", " 'postprocessing': None,\n", " 'postprocessing_helper': None,\n", @@ -119,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "id": "34136f3c-92b8-4641-a103-381d3a7dd857", "metadata": {}, "outputs": [], @@ -130,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "id": "01b905f5-298f-4edd-b69b-fcedeea9d0d4", "metadata": {}, "outputs": [ @@ -140,7 +140,7 @@ "0" ] }, - "execution_count": 9, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -148,14 +148,6 @@ "source": [ "os.system(\"rm -r DNAm-based-age-predictor\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "34a90554-4e17-42de-9671-f52d656caf0a", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/pyaging/data/_data.py b/pyaging/data/_data.py index 01bd1c4..64e17ee 100644 --- a/pyaging/data/_data.py +++ b/pyaging/data/_data.py @@ -25,7 +25,7 @@ def download_example_data( The type of data to download. Valid options are 'GSE139307' (human methylation), 'GSE130735' (mouse methylation), 'GSE223748' (mammalian methylation), 'ENCFF386QWG' (histone mark), 'GSE65765' (C. elegans RNA-seq), 'GSE193140' (ATAC-Seq), 'blood_chemistry_example' (blood chemistry). - + dir : str The directory to deposit the downloaded file. Defaults to "pyaging_data". diff --git a/pyaging/predict/_pred_utils.py b/pyaging/predict/_pred_utils.py index dd88773..a3406ce 100644 --- a/pyaging/predict/_pred_utils.py +++ b/pyaging/predict/_pred_utils.py @@ -476,6 +476,8 @@ def preprocess_data( adata.X = tpm_norm_log1p(adata.X, preprocessing_helper) elif preprocessing == "binarize": adata.X = binarize(adata.X) + elif preprocessing == "scale_row": + adata.X = scale_row(adata.X, adata[:, features].X) elif preprocessing == "scale": X = adata[:, features].X X = scale(X, preprocessing_helper) @@ -599,7 +601,12 @@ def postprocess_data( @progress("Predict ages with model") def predict_ages_with_model( - model: torch.nn.Module, adata: torch.Tensor, features: List[str], device: str, logger, indent_level: int = 2 + model: torch.nn.Module, + adata: torch.Tensor, + features: List[str], + device: str, + logger, + indent_level: int = 2, ) -> torch.Tensor: """ Predict biological ages using a trained model and input data. @@ -653,7 +660,7 @@ def predict_ages_with_model( """ # Create an AnnLoader - use_cuda = device == 'cuda' + use_cuda = device == "cuda" dataloader = AnnLoader(adata, batch_size=1024, use_cuda=use_cuda) # Use the AnnLoader for batched prediction @@ -824,11 +831,17 @@ def filter_missing_features( """ n_missing_features = sum(adata.var["percent_na"] == 1) if n_missing_features > 0: - logger.info(f"Removing {n_missing_features} added features", indent_level=indent_level+1) + logger.info( + f"Removing {n_missing_features} added features", + indent_level=indent_level + 1, + ) adata = adata[:, adata.var["percent_na"] < 1].copy() else: - logger.info("No missing features, so adata size did not change", indent_level=indent_level+1) - + logger.info( + "No missing features, so adata size did not change", + indent_level=indent_level + 1, + ) + return adata diff --git a/pyaging/predict/_preprocessing.py b/pyaging/predict/_preprocessing.py index 91710f7..47f7248 100644 --- a/pyaging/predict/_preprocessing.py +++ b/pyaging/predict/_preprocessing.py @@ -11,6 +11,20 @@ def scale(x, scaler): return x_scaled +def scale_row(x, x_overlap): + """ + Scales the input data per row with mean 0 and std 1. + """ + row_means = np.mean(x_overlap, axis=1, keepdims=True) + row_stds = np.std(x_overlap, axis=1, keepdims=True) + + # Avoid division by zero in case of a row with constant value + row_stds[row_stds == 0] = 1 + + x_scaled = (x - row_means) / row_stds + return x_scaled + + def binarize(x): """ Binarizes an array based on the median of each row, excluding zeros. diff --git a/pyproject.toml b/pyproject.toml index 48b8d00..c110434 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyaging" -version = "0.0.10" +version = "0.0.11" description = "A Python-based compendium of GPU-optimized aging clocks." authors = ["Lucas Paulo de Lima Camillo "] license = "BSD"