Merge branch 'main' into ebezzi/scib-metrics-scripts

chanzuckerberg · Jul 9, 2024 · 3f55e66 · 3f55e66
2 parents 8c9aac1 + 55d0856
commit 3f55e66
Show file tree

Hide file tree

Showing 72 changed files with 1,815 additions and 691 deletions.
diff --git a/.github/workflows/docsite-build-deploy.yml b/.github/workflows/docsite-build-deploy.yml
@@ -6,6 +6,10 @@ on:
   workflow_dispatch: # Used to make post-release docfixes
 permissions:
   contents: write
+
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   build-and-deploy:
     concurrency: ci-${{ github.ref }}

diff --git a/.github/workflows/full-unittests.yml b/.github/workflows/full-unittests.yml
@@ -37,6 +37,9 @@ on:
         default: ""
         type: string
 
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   py_unit_tests:
     runs-on: single-cell-1tb-runner

diff --git a/.github/workflows/lts-compat-check.yml b/.github/workflows/lts-compat-check.yml
@@ -4,7 +4,10 @@ on:
   schedule:
     - cron: "30 1 * * *"
   workflow_dispatch: # used for debugging or manual validation
-
+
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   python-compat-check:
     name: Python LTS compatibility check

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
@@ -1,5 +1,8 @@
 name: Profiler
 
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 on:
   pull_request:
     paths:

diff --git a/.github/workflows/py-dependency-check.yml b/.github/workflows/py-dependency-check.yml
@@ -14,6 +14,9 @@ on:
     - cron: "30 1 * * *"
   workflow_dispatch: # used for debugging or manual validation
 
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   python-dependency-check:
     name: python-dependency-check

diff --git a/.github/workflows/py-formatting.yml b/.github/workflows/py-formatting.yml
@@ -7,6 +7,9 @@ on:
   push:
     branches: [main]
 
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   pre_commit_checks:
     name: pre-commit checks

diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml
@@ -12,6 +12,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   unit_tests_python_api:
     strategy:
@@ -41,6 +44,9 @@ jobs:
           pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython
           GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
           pip install -e './api/python/cellxgene_census/[experimental]'
+      - name: Install Geneformer (python >=3.10 only)
+        run: pip install git+https://huggingface.co/ctheodoris/Geneformer@471eefc
+        if: matrix.python-version != '3.8' && matrix.python-version != '3.9'
       - name: Report Dependency Versions
         run: pip list
       - name: Test with pytest (API, main tests)

diff --git a/.github/workflows/r-check.yml b/.github/workflows/r-check.yml
@@ -8,6 +8,9 @@ on:
   push:
     branches: [main]
 
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   build:
     strategy:

diff --git a/.github/workflows/r-dependency-check.yml b/.github/workflows/r-dependency-check.yml
@@ -8,6 +8,9 @@ on:
     - cron: "30 1 * * *"
   workflow_dispatch: # used for debugging or manual validation
 
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   r-dependency-check:
     name: r-dependency-check

diff --git a/api/python/cellxgene_census/README.md b/api/python/cellxgene_census/README.md
@@ -23,19 +23,13 @@ import cellxgene_census
 
 with cellxgene_census.open_soma() as census:
 
-    # Reads SOMADataFrame as a slice
-    cell_metadata = census["census_data"]["homo_sapiens"].obs.read(
+    cell_metadata = cellxgene_census.get_obs(
+        census,
+        "homo_sapiens",
         value_filter = "sex == 'female' and cell_type in ['microglial cell', 'neuron']",
         column_names = ["assay", "cell_type", "tissue", "tissue_general", "suspension_type", "disease"]
     )
-
-    # Concatenates results to pyarrow.Table
-    cell_metadata = cell_metadata.concat()
-
-    # Converts to pandas.DataFrame
-    cell_metadata = cell_metadata.to_pandas()
-
-    print(cell_metadata)
+    cell_metadata
 ```
 
 The output is a `pandas.DataFrame` with over 600K cells meeting our query criteria and the selected columns:

diff --git a/api/python/cellxgene_census/scripts/requirements-dev.txt b/api/python/cellxgene_census/scripts/requirements-dev.txt
@@ -5,5 +5,5 @@ twine
 coverage
 nbqa
 transformers[torch]
-git+https://huggingface.co/ctheodoris/Geneformer@8df5dc1
 owlready2
+proxy.py
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
@@ -8,6 +8,7 @@
 """
 
 from typing import Literal, Optional, Sequence
+from warnings import warn
 
 import anndata
 import pandas as pd
@@ -38,6 +39,8 @@ def get_anndata(
     column_names: Optional[soma.AxisColumnNames] = None,
     obs_embeddings: Optional[Sequence[str]] = (),
     var_embeddings: Optional[Sequence[str]] = (),
+    obs_column_names: Optional[Sequence[str]] = None,
+    var_column_names: Optional[Sequence[str]] = None,
 ) -> anndata.AnnData:
     """Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query,
     and return it as an :class:`anndata.AnnData` object.
@@ -65,8 +68,6 @@ def get_anndata(
         var_coords:
             Coordinates for the ``var`` axis, which is indexed by the ``soma_joinid`` value.
             May be an ``int``, a list of ``int``, or a slice. The default, ``None``, selects all.
-        column_names:
-            Columns to fetch for ``obs`` and ``var`` dataframes.
         obsm_layers:
             Additional obsm layers to read and return in the ``obsm`` slot.
         obsp_layers:
@@ -83,6 +84,10 @@ def get_anndata(
             Additional embeddings to be returned as part of the ``varm`` slot.
             Use :func:`get_all_available_embeddings` to retrieve available embeddings
             for this Census version and organism.
+        obs_column_names:
+            Columns to fetch for ``obs`` dataframe.
+        var_column_names:
+            Columns to fetch for ``var`` dataframe.
 
     Returns:
         An :class:`anndata.AnnData` object containing the census slice.
@@ -93,7 +98,7 @@ def get_anndata(
     Examples:
         >>> get_anndata(census, "Mus musculus", obs_value_filter="tissue_general in ['brain', 'lung']")
 
-        >>> get_anndata(census, "Homo sapiens", column_names={"obs": ["tissue"]})
+        >>> get_anndata(census, "Homo sapiens", obs_column_names=["tissue"])
 
         >>> get_anndata(census, "Homo sapiens", obs_coords=slice(0, 1000))
     """
@@ -107,14 +112,31 @@ def get_anndata(
     if varm_layers and var_embeddings and set(varm_layers) & set(var_embeddings):
         raise ValueError("Cannot request both `varm_layers` and `var_embeddings` for the same embedding name")
 
+    # Backwards compat for old column_names argument
+    if column_names is not None:
+        if obs_column_names is not None or var_column_names is not None:
+            raise ValueError(
+                "Both the deprecated 'column_names' argument and its replacements were used. Please use 'obs_column_names' and 'var_column_names' only."
+            )
+        else:
+            warn(
+                "The argument `column_names` is deprecated and will be removed in a future release. Please use `obs_column_names` and `var_column_names` instead.",
+                FutureWarning,
+                stacklevel=2,
+            )
+        if "obs" in column_names:
+            obs_column_names = column_names["obs"]
+        if "var" in column_names:
+            var_column_names = column_names["var"]
+
     with exp.axis_query(
         measurement_name,
         obs_query=soma.AxisQuery(value_filter=obs_value_filter, coords=obs_coords),
         var_query=soma.AxisQuery(value_filter=var_value_filter, coords=var_coords),
     ) as query:
         adata = query.to_anndata(
             X_name=X_name,
-            column_names=column_names,
+            column_names={"obs": obs_column_names, "var": var_column_names},
             X_layers=X_layers,
             obsm_layers=obsm_layers,
             varm_layers=varm_layers,

diff --git a/api/python/cellxgene_census/src/cellxgene_census/_open.py b/api/python/cellxgene_census/src/cellxgene_census/_open.py
@@ -24,10 +24,14 @@
     _get_census_mirrors,
     get_census_version_description,
 )
-from ._util import _uri_join
+from ._util import _uri_join, _user_agent
 
 DEFAULT_CENSUS_VERSION = "stable"
 
+DEFAULT_S3FS_KWARGS = {
+    "anon": True,
+    "cache_regions": True,
+}
 DEFAULT_TILEDB_CONFIGURATION: Dict[str, Any] = {
     # https://docs.tiledb.com/main/how-to/configuration#configuration-parameters
     "py.init_buffer_bytes": 1 * 1024**3,
@@ -120,7 +124,9 @@ def get_default_soma_context(tiledb_config: Optional[Dict[str, Any]] = None) ->
     Lifecycle:
         experimental
     """
-    tiledb_config = dict(DEFAULT_TILEDB_CONFIGURATION, **(tiledb_config or {}))
+    tiledb_config = dict(
+        DEFAULT_TILEDB_CONFIGURATION, **{"vfs.s3.custom_headers.User-Agent": _user_agent()}, **(tiledb_config or {})
+    )
     return soma.options.SOMATileDBContext().replace(tiledb_config=tiledb_config)
 
 
@@ -343,8 +349,8 @@ def download_source_h5ad(
     assert protocol == "s3"
 
     fs = s3fs.S3FileSystem(
-        anon=True,
-        cache_regions=True,
+        config_kwargs={"user_agent": _user_agent()},
+        **DEFAULT_S3FS_KWARGS,
     )
     fs.get_file(
         locator["uri"],

diff --git a/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py b/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py
@@ -9,11 +9,13 @@
 
 import typing
 from collections import OrderedDict
-from typing import Dict, Literal, Optional, Union, cast
+from typing import Any, Dict, Literal, Optional, Union, cast
 
 import requests
 from typing_extensions import NotRequired, TypedDict
 
+from cellxgene_census._util import _user_agent
+
 """
 The following types describe the expected directory of Census builds, used
 to bootstrap all data location requests.
@@ -350,10 +352,10 @@ def get_census_version_directory(
                 }
             }
     """
-    response = requests.get(CELL_CENSUS_RELEASE_DIRECTORY_URL)
+    response = requests.get(CELL_CENSUS_RELEASE_DIRECTORY_URL, headers={"User-Agent": _user_agent()})
     response.raise_for_status()
 
-    directory: CensusDirectory = cast(CensusDirectory, response.json())
+    directory: dict[str, str | dict[str, Any]] = response.json()
     directory_out: CensusDirectory = {}
     aliases: typing.Set[CensusVersionName] = set()
 
@@ -379,6 +381,11 @@ def get_census_version_directory(
         if not isinstance(directory_value, dict):
             continue
 
+        # Filter fields
+        directory_value = {
+            k: directory_value[k] for k in CensusVersionDescription.__annotations__ if k in directory_value
+        }
+
         # filter by release flags
         census_version_description = cast(CensusVersionDescription, directory_value)
         release_flags = cast(ReleaseFlags, {"lts": lts, "retracted": retracted})
@@ -425,6 +432,6 @@ def get_census_mirror_directory() -> Dict[CensusMirrorName, CensusMirror]:
 
 
 def _get_census_mirrors() -> CensusMirrors:
-    response = requests.get(CELL_CENSUS_MIRRORS_DIRECTORY_URL)
+    response = requests.get(CELL_CENSUS_MIRRORS_DIRECTORY_URL, headers={"User-Agent": _user_agent()})
     response.raise_for_status()
     return cast(CensusMirrors, response.json())
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_testing/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/_testing/__init__.py
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_testing/logger_proxy.py b/api/python/cellxgene_census/src/cellxgene_census/_testing/logger_proxy.py
@@ -0,0 +1,40 @@
+"""This module defines a plugin class that logs each request to a logfile.
+
+This class needs to be importable by the proxy server which runs in a separate process.
+See the user agent tests for usage.
+"""
+
+import json
+import traceback
+from pathlib import Path
+
+import proxy
+from proxy.common.flag import flags
+
+flags.add_argument(
+    "--request-log-file",
+    type=str,
+    default="",
+    help="Where to log the requests to.",
+)
+
+
+class RequestLoggerPlugin(proxy.http.proxy.HttpProxyBasePlugin):  # type: ignore
+    def handle_client_request(self, request: proxy.http.parser.HttpParser) -> proxy.http.parser.HttpParser:
+        # If anything fails in here, it just fails to respond
+        try:
+            with Path(self.flags.request_log_file).open("a") as f:
+                record = {
+                    "method": request.method.decode(),
+                    "url": str(request._url),
+                }
+
+                if request.headers:
+                    record["headers"] = {k2.decode().lower(): v.decode() for _, (k2, v) in request.headers.items()}
+                f.write(f"{json.dumps(record)}\n")
+        except Exception as e:
+            # Making sure there is some visible output
+            print(repr(e))
+            traceback.print_exception(e)
+            raise e
+        return request
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_util.py b/api/python/cellxgene_census/src/cellxgene_census/_util.py
@@ -2,6 +2,9 @@
 
 import tiledbsoma as soma
 
+USER_AGENT_ENVVAR = "CELLXGENE_CENSUS_USERAGENT"
+"""Environment variable used to add more information into the user-agent."""
+
 
 def _uri_join(base: str, url: str) -> str:
     """Like urllib.parse.urljoin, but doesn't get confused by s3://."""
@@ -30,3 +33,14 @@ def _extract_census_version(census: soma.Collection) -> str:
         raise ValueError("Unable to extract Census version.") from None
 
     return version
+
+
+def _user_agent() -> str:
+    import os
+
+    import cellxgene_census
+
+    if env_specifier := os.environ.get(USER_AGENT_ENVVAR, None):
+        return f"cellxgene-census-python/{cellxgene_census.__version__} {env_specifier}"
+    else:
+        return f"cellxgene-census-python/{cellxgene_census.__version__}"
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,6 +8,9 @@ on: @@
       push:
         branches: [main]
+    env:
+      CELLXGENE_CENSUS_USERAGENT: "CZI-test"
     jobs:
       build:
         strategy:
@@ Expand Down @@