From 47324490c817fb1acad7d990e5e6ce36734b9143 Mon Sep 17 00:00:00 2001
From: Lucas Camillo <lucascamillo@Lucass-MacBook-Pro-4.local>
Date: Fri, 29 Dec 2023 09:04:19 -0300
Subject: [PATCH] fixed zhangen and zhangblup

---
 clocks/notebooks/dnamtl.ipynb        |  4 ++--
 clocks/notebooks/join_metadata.ipynb |  2 +-
 clocks/notebooks/zhangblup.ipynb     | 14 +++++++-------
 clocks/notebooks/zhangen.ipynb       | 28 ++++++++++------------------
 pyaging/data/_data.py                |  2 +-
 pyaging/predict/_pred_utils.py       | 23 ++++++++++++++++++-----
 pyaging/predict/_preprocessing.py    | 14 ++++++++++++++
 pyproject.toml                       |  2 +-
 8 files changed, 54 insertions(+), 35 deletions(-)

diff --git a/clocks/notebooks/dnamtl.ipynb b/clocks/notebooks/dnamtl.ipynb
index 9c2f237..2197ab6 100644
--- a/clocks/notebooks/dnamtl.ipynb
+++ b/clocks/notebooks/dnamtl.ipynb
@@ -130,7 +130,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 5,
    "id": "303e9b76-993f-4691-af9d-1151b3c7638f",
    "metadata": {},
    "outputs": [
@@ -140,7 +140,7 @@
        "0"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
diff --git a/clocks/notebooks/join_metadata.ipynb b/clocks/notebooks/join_metadata.ipynb
index 84f98cf..056a2c8 100644
--- a/clocks/notebooks/join_metadata.ipynb
+++ b/clocks/notebooks/join_metadata.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "id": "59eb29df-0597-4d45-b2e6-8825670effe2",
    "metadata": {},
    "outputs": [],
diff --git a/clocks/notebooks/zhangblup.ipynb b/clocks/notebooks/zhangblup.ipynb
index a1e22c9..d304889 100644
--- a/clocks/notebooks/zhangblup.ipynb
+++ b/clocks/notebooks/zhangblup.ipynb
@@ -22,7 +22,7 @@
     {
      "data": {
       "text/plain": [
-       "32768"
+       "0"
       ]
      },
      "execution_count": 2,
@@ -49,7 +49,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "id": "a284fe99-dc47-4f0c-b2ff-274e136e7020",
    "metadata": {},
    "outputs": [],
@@ -62,7 +62,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
    "id": "7b4c3f6b-72af-4e99-84c4-65b8ef58c91d",
    "metadata": {},
    "outputs": [
@@ -74,7 +74,7 @@
        ")"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -90,13 +90,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 6,
    "id": "b38f8af2-3d94-4a45-90b9-30b227828da1",
    "metadata": {},
    "outputs": [],
    "source": [
     "weights_dict = {\n",
-    "    'preprocessing': None, \n",
+    "    'preprocessing': 'scale_row', \n",
     "    'preprocessing_helper': None,\n",
     "    'postprocessing': None,\n",
     "    'postprocessing_helper': None,\n",
@@ -119,7 +119,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 7,
    "id": "34136f3c-92b8-4641-a103-381d3a7dd857",
    "metadata": {},
    "outputs": [],
diff --git a/clocks/notebooks/zhangen.ipynb b/clocks/notebooks/zhangen.ipynb
index 8a16bde..086a66a 100644
--- a/clocks/notebooks/zhangen.ipynb
+++ b/clocks/notebooks/zhangen.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "fb157849-5454-4a60-8548-fff633fff764",
    "metadata": {},
    "outputs": [],
@@ -36,7 +36,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 2,
    "id": "b9f484b1-f501-41b7-9565-82e03bfe97dc",
    "metadata": {},
    "outputs": [],
@@ -49,7 +49,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
    "id": "a284fe99-dc47-4f0c-b2ff-274e136e7020",
    "metadata": {},
    "outputs": [],
@@ -62,7 +62,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 4,
    "id": "7b4c3f6b-72af-4e99-84c4-65b8ef58c91d",
    "metadata": {},
    "outputs": [
@@ -74,7 +74,7 @@
        ")"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -90,13 +90,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 5,
    "id": "b38f8af2-3d94-4a45-90b9-30b227828da1",
    "metadata": {},
    "outputs": [],
    "source": [
     "weights_dict = {\n",
-    "    'preprocessing': None, \n",
+    "    'preprocessing': 'scale_row', \n",
     "    'preprocessing_helper': None,\n",
     "    'postprocessing': None,\n",
     "    'postprocessing_helper': None,\n",
@@ -119,7 +119,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 6,
    "id": "34136f3c-92b8-4641-a103-381d3a7dd857",
    "metadata": {},
    "outputs": [],
@@ -130,7 +130,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 7,
    "id": "01b905f5-298f-4edd-b69b-fcedeea9d0d4",
    "metadata": {},
    "outputs": [
@@ -140,7 +140,7 @@
        "0"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -148,14 +148,6 @@
    "source": [
     "os.system(\"rm -r DNAm-based-age-predictor\")"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "34a90554-4e17-42de-9671-f52d656caf0a",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/pyaging/data/_data.py b/pyaging/data/_data.py
index 01bd1c4..64e17ee 100644
--- a/pyaging/data/_data.py
+++ b/pyaging/data/_data.py
@@ -25,7 +25,7 @@ def download_example_data(
         The type of data to download. Valid options are 'GSE139307' (human methylation), 'GSE130735' (mouse
         methylation), 'GSE223748' (mammalian methylation), 'ENCFF386QWG' (histone mark), 'GSE65765' (C. elegans
         RNA-seq), 'GSE193140' (ATAC-Seq), 'blood_chemistry_example' (blood chemistry).
-    
+
     dir : str
         The directory to deposit the downloaded file. Defaults to "pyaging_data".
 
diff --git a/pyaging/predict/_pred_utils.py b/pyaging/predict/_pred_utils.py
index dd88773..a3406ce 100644
--- a/pyaging/predict/_pred_utils.py
+++ b/pyaging/predict/_pred_utils.py
@@ -476,6 +476,8 @@ def preprocess_data(
         adata.X = tpm_norm_log1p(adata.X, preprocessing_helper)
     elif preprocessing == "binarize":
         adata.X = binarize(adata.X)
+    elif preprocessing == "scale_row":
+        adata.X = scale_row(adata.X, adata[:, features].X)
     elif preprocessing == "scale":
         X = adata[:, features].X
         X = scale(X, preprocessing_helper)
@@ -599,7 +601,12 @@ def postprocess_data(
 
 @progress("Predict ages with model")
 def predict_ages_with_model(
-    model: torch.nn.Module, adata: torch.Tensor, features: List[str], device: str, logger, indent_level: int = 2
+    model: torch.nn.Module,
+    adata: torch.Tensor,
+    features: List[str],
+    device: str,
+    logger,
+    indent_level: int = 2,
 ) -> torch.Tensor:
     """
     Predict biological ages using a trained model and input data.
@@ -653,7 +660,7 @@ def predict_ages_with_model(
 
     """
     # Create an AnnLoader
-    use_cuda = device == 'cuda'
+    use_cuda = device == "cuda"
     dataloader = AnnLoader(adata, batch_size=1024, use_cuda=use_cuda)
 
     # Use the AnnLoader for batched prediction
@@ -824,11 +831,17 @@ def filter_missing_features(
     """
     n_missing_features = sum(adata.var["percent_na"] == 1)
     if n_missing_features > 0:
-        logger.info(f"Removing {n_missing_features} added features", indent_level=indent_level+1)
+        logger.info(
+            f"Removing {n_missing_features} added features",
+            indent_level=indent_level + 1,
+        )
         adata = adata[:, adata.var["percent_na"] < 1].copy()
     else:
-        logger.info("No missing features, so adata size did not change", indent_level=indent_level+1)
-        
+        logger.info(
+            "No missing features, so adata size did not change",
+            indent_level=indent_level + 1,
+        )
+
     return adata
 
 
diff --git a/pyaging/predict/_preprocessing.py b/pyaging/predict/_preprocessing.py
index 91710f7..47f7248 100644
--- a/pyaging/predict/_preprocessing.py
+++ b/pyaging/predict/_preprocessing.py
@@ -11,6 +11,20 @@ def scale(x, scaler):
     return x_scaled
 
 
+def scale_row(x, x_overlap):
+    """
+    Scales each row of ``x`` using the mean and std of the same row of ``x_overlap``.
+
+    Row statistics come from ``np.mean``/``np.std`` over axis 1 of ``x_overlap`` and are
+    broadcast against ``x``; assumes both share the same row order -- TODO confirm.
+    """
+    row_means = np.mean(x_overlap, axis=1, keepdims=True)
+    row_stds = np.std(x_overlap, axis=1, keepdims=True)
+    row_stds[row_stds == 0] = 1  # constant row: avoid division by zero (row is only centered)
+    x_scaled = (x - row_means) / row_stds
+    return x_scaled
+
+
 def binarize(x):
     """
     Binarizes an array based on the median of each row, excluding zeros.
diff --git a/pyproject.toml b/pyproject.toml
index 48b8d00..c110434 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pyaging"
-version = "0.0.10"
+version = "0.0.11"
 description = "A Python-based compendium of GPU-optimized aging clocks."
 authors = ["Lucas Paulo de Lima Camillo <lucas_camillo@alumni.brown.edu>"]
 license = "BSD"