diff --git a/.github/workflows/docsite-build-deploy.yml b/.github/workflows/docsite-build-deploy.yml
index 7628e6760..f4a7e8166 100644
--- a/.github/workflows/docsite-build-deploy.yml
+++ b/.github/workflows/docsite-build-deploy.yml
@@ -6,6 +6,10 @@ on:
   workflow_dispatch: # Used to make post-release docfixes
 permissions:
   contents: write
+
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   build-and-deploy:
     concurrency: ci-${{ github.ref }}
diff --git a/.github/workflows/full-unittests.yml b/.github/workflows/full-unittests.yml
index 87dc388a3..b41352d67 100644
--- a/.github/workflows/full-unittests.yml
+++ b/.github/workflows/full-unittests.yml
@@ -37,6 +37,9 @@ on:
         default: ""
         type: string
 
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   py_unit_tests:
     runs-on: single-cell-1tb-runner
diff --git a/.github/workflows/lts-compat-check.yml b/.github/workflows/lts-compat-check.yml
index 7e4a7b640..5fc787f0b 100644
--- a/.github/workflows/lts-compat-check.yml
+++ b/.github/workflows/lts-compat-check.yml
@@ -4,7 +4,10 @@ on:
   schedule:
     - cron: "30 1 * * *"
   workflow_dispatch: # used for debugging or manual validation
-  
+
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   python-compat-check:
     name: Python LTS compatibility check
diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
index 2820d9bba..4a1c13364 100644
--- a/.github/workflows/profiler.yml
+++ b/.github/workflows/profiler.yml
@@ -1,5 +1,8 @@
 name: Profiler
 
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 on:
   pull_request:
     paths:
diff --git a/.github/workflows/py-dependency-check.yml b/.github/workflows/py-dependency-check.yml
index daa85c09a..40aa4a16c 100644
--- a/.github/workflows/py-dependency-check.yml
+++ b/.github/workflows/py-dependency-check.yml
@@ -14,6 +14,9 @@ on:
     - cron: "30 1 * * *"
   workflow_dispatch: # used for debugging or manual validation
 
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   python-dependency-check:
     name: python-dependency-check
diff --git a/.github/workflows/py-formatting.yml b/.github/workflows/py-formatting.yml
index 5cf529795..25e877c26 100644
--- a/.github/workflows/py-formatting.yml
+++ b/.github/workflows/py-formatting.yml
@@ -7,6 +7,9 @@ on:
   push:
     branches: [main]
 
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   pre_commit_checks:
     name: pre-commit checks
diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml
index 115083909..ad68fd0d8 100644
--- a/.github/workflows/py-unittests.yml
+++ b/.github/workflows/py-unittests.yml
@@ -12,6 +12,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   unit_tests_python_api:
     strategy:
@@ -41,6 +44,9 @@ jobs:
           pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython
           GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
           pip install -e './api/python/cellxgene_census/[experimental]'
+      - name: Install Geneformer (python >=3.10 only)
+        run: pip install git+https://huggingface.co/ctheodoris/Geneformer@471eefc
+        if: matrix.python-version != '3.8' && matrix.python-version != '3.9'
       - name: Report Dependency Versions
         run: pip list
       - name: Test with pytest (API, main tests)
diff --git a/.github/workflows/r-check.yml b/.github/workflows/r-check.yml
index 9a1397bfb..fad2ca8e5 100644
--- a/.github/workflows/r-check.yml
+++ b/.github/workflows/r-check.yml
@@ -8,6 +8,9 @@ on:
   push:
     branches: [main]
 
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   build:
     strategy:
diff --git a/.github/workflows/r-dependency-check.yml b/.github/workflows/r-dependency-check.yml
index 2017e4013..ccc5e2a4d 100644
--- a/.github/workflows/r-dependency-check.yml
+++ b/.github/workflows/r-dependency-check.yml
@@ -8,6 +8,9 @@ on:
     - cron: "30 1 * * *"
   workflow_dispatch: # used for debugging or manual validation
 
+env:
+  CELLXGENE_CENSUS_USERAGENT: "CZI-test"
+
 jobs:
   r-dependency-check:
     name: r-dependency-check
diff --git a/api/python/cellxgene_census/README.md b/api/python/cellxgene_census/README.md
index 4af92636c..a694717fe 100644
--- a/api/python/cellxgene_census/README.md
+++ b/api/python/cellxgene_census/README.md
@@ -23,19 +23,13 @@ import cellxgene_census
 
 with cellxgene_census.open_soma() as census:
 
-    # Reads SOMADataFrame as a slice
-    cell_metadata = census["census_data"]["homo_sapiens"].obs.read(
+    cell_metadata = cellxgene_census.get_obs(
+        census,
+        "homo_sapiens",
         value_filter = "sex == 'female' and cell_type in ['microglial cell', 'neuron']",
         column_names = ["assay", "cell_type", "tissue", "tissue_general", "suspension_type", "disease"]
     )
-
-    # Concatenates results to pyarrow.Table
-    cell_metadata = cell_metadata.concat()
-
-    # Converts to pandas.DataFrame
-    cell_metadata = cell_metadata.to_pandas()
-
-    print(cell_metadata)
+    cell_metadata
 ```
 
 The output is a `pandas.DataFrame` with over 600K cells meeting our query criteria and the selected columns:
diff --git a/api/python/cellxgene_census/scripts/requirements-dev.txt b/api/python/cellxgene_census/scripts/requirements-dev.txt
index c04490c25..7fab730ad 100644
--- a/api/python/cellxgene_census/scripts/requirements-dev.txt
+++ b/api/python/cellxgene_census/scripts/requirements-dev.txt
@@ -5,5 +5,5 @@ twine
 coverage
 nbqa
 transformers[torch]
-git+https://huggingface.co/ctheodoris/Geneformer@8df5dc1
 owlready2
+proxy.py
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
index 69ca0da6b..e37337184 100644
--- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
+++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
@@ -8,6 +8,7 @@
 """
 
 from typing import Literal, Optional, Sequence
+from warnings import warn
 
 import anndata
 import pandas as pd
@@ -38,6 +39,8 @@ def get_anndata(
     column_names: Optional[soma.AxisColumnNames] = None,
     obs_embeddings: Optional[Sequence[str]] = (),
     var_embeddings: Optional[Sequence[str]] = (),
+    obs_column_names: Optional[Sequence[str]] = None,
+    var_column_names: Optional[Sequence[str]] = None,
 ) -> anndata.AnnData:
     """Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query,
     and return it as an :class:`anndata.AnnData` object.
@@ -65,8 +68,6 @@ def get_anndata(
         var_coords:
             Coordinates for the ``var`` axis, which is indexed by the ``soma_joinid`` value.
             May be an ``int``, a list of ``int``, or a slice. The default, ``None``, selects all.
-        column_names:
-            Columns to fetch for ``obs`` and ``var`` dataframes.
         obsm_layers:
             Additional obsm layers to read and return in the ``obsm`` slot.
         obsp_layers:
@@ -83,6 +84,10 @@ def get_anndata(
             Additional embeddings to be returned as part of the ``varm`` slot.
             Use :func:`get_all_available_embeddings` to retrieve available embeddings
             for this Census version and organism.
+        obs_column_names:
+            Columns to fetch for ``obs`` dataframe.
+        var_column_names:
+            Columns to fetch for ``var`` dataframe.
 
     Returns:
         An :class:`anndata.AnnData` object containing the census slice.
@@ -93,7 +98,7 @@ def get_anndata(
     Examples:
         >>> get_anndata(census, "Mus musculus", obs_value_filter="tissue_general in ['brain', 'lung']")
 
-        >>> get_anndata(census, "Homo sapiens", column_names={"obs": ["tissue"]})
+        >>> get_anndata(census, "Homo sapiens", obs_column_names=["tissue"])
 
         >>> get_anndata(census, "Homo sapiens", obs_coords=slice(0, 1000))
     """
@@ -107,6 +112,23 @@ def get_anndata(
     if varm_layers and var_embeddings and set(varm_layers) & set(var_embeddings):
         raise ValueError("Cannot request both `varm_layers` and `var_embeddings` for the same embedding name")
 
+    # Backwards compat for old column_names argument
+    if column_names is not None:
+        if obs_column_names is not None or var_column_names is not None:
+            raise ValueError(
+                "Both the deprecated 'column_names' argument and its replacements were used. Please use 'obs_column_names' and 'var_column_names' only."
+            )
+        else:
+            warn(
+                "The argument `column_names` is deprecated and will be removed in a future release. Please use `obs_column_names` and `var_column_names` instead.",
+                FutureWarning,
+                stacklevel=2,
+            )
+        if "obs" in column_names:
+            obs_column_names = column_names["obs"]
+        if "var" in column_names:
+            var_column_names = column_names["var"]
+
     with exp.axis_query(
         measurement_name,
         obs_query=soma.AxisQuery(value_filter=obs_value_filter, coords=obs_coords),
@@ -114,7 +136,7 @@ def get_anndata(
     ) as query:
         adata = query.to_anndata(
             X_name=X_name,
-            column_names=column_names,
+            column_names={"obs": obs_column_names, "var": var_column_names},
             X_layers=X_layers,
             obsm_layers=obsm_layers,
             varm_layers=varm_layers,
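
The back-compat shim above accepts the old `column_names` dict, emits a `FutureWarning`, and forwards the values to the new per-axis arguments; passing both forms raises. A minimal sketch of the migration path, assuming an open Census handle and network access:

```python
import warnings

import cellxgene_census

with cellxgene_census.open_soma() as census:
    # New style: per-axis column lists.
    adata = cellxgene_census.get_anndata(
        census,
        "Homo sapiens",
        obs_coords=slice(0, 100),
        obs_column_names=["tissue"],
        var_column_names=["feature_name"],
    )

    # Old style still works for now, but raises a FutureWarning.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        cellxgene_census.get_anndata(
            census,
            "Homo sapiens",
            obs_coords=slice(0, 100),
            column_names={"obs": ["tissue"], "var": ["feature_name"]},
        )
    assert any(issubclass(w.category, FutureWarning) for w in caught)
```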
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_open.py b/api/python/cellxgene_census/src/cellxgene_census/_open.py
index 640d2d9a6..642e6fbb6 100644
--- a/api/python/cellxgene_census/src/cellxgene_census/_open.py
+++ b/api/python/cellxgene_census/src/cellxgene_census/_open.py
@@ -24,10 +24,14 @@
     _get_census_mirrors,
     get_census_version_description,
 )
-from ._util import _uri_join
+from ._util import _uri_join, _user_agent
 
 DEFAULT_CENSUS_VERSION = "stable"
 
+DEFAULT_S3FS_KWARGS: Dict[str, Any] = {
+    "anon": True,
+    "cache_regions": True,
+}
 DEFAULT_TILEDB_CONFIGURATION: Dict[str, Any] = {
     # https://docs.tiledb.com/main/how-to/configuration#configuration-parameters
     "py.init_buffer_bytes": 1 * 1024**3,
@@ -120,7 +124,9 @@ def get_default_soma_context(tiledb_config: Optional[Dict[str, Any]] = None) ->
     Lifecycle:
         experimental
     """
-    tiledb_config = dict(DEFAULT_TILEDB_CONFIGURATION, **(tiledb_config or {}))
+    tiledb_config = dict(
+        DEFAULT_TILEDB_CONFIGURATION, **{"vfs.s3.custom_headers.User-Agent": _user_agent()}, **(tiledb_config or {})
+    )
     return soma.options.SOMATileDBContext().replace(tiledb_config=tiledb_config)
 
 
@@ -343,8 +349,8 @@ def download_source_h5ad(
     assert protocol == "s3"
 
     fs = s3fs.S3FileSystem(
-        anon=True,
-        cache_regions=True,
+        config_kwargs={"user_agent": _user_agent()},
+        **DEFAULT_S3FS_KWARGS,
     )
     fs.get_file(
         locator["uri"],
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py b/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py
index e0839aab8..5ba8b77fb 100644
--- a/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py
+++ b/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py
@@ -9,11 +9,13 @@
 
 import typing
 from collections import OrderedDict
-from typing import Dict, Literal, Optional, Union, cast
+from typing import Any, Dict, Literal, Optional, Union, cast
 
 import requests
 from typing_extensions import NotRequired, TypedDict
 
+from cellxgene_census._util import _user_agent
+
 """
 The following types describe the expected directory of Census builds, used
 to bootstrap all data location requests.
@@ -350,10 +352,10 @@ def get_census_version_directory(
                 }
             }
     """
-    response = requests.get(CELL_CENSUS_RELEASE_DIRECTORY_URL)
+    response = requests.get(CELL_CENSUS_RELEASE_DIRECTORY_URL, headers={"User-Agent": _user_agent()})
     response.raise_for_status()
 
-    directory: CensusDirectory = cast(CensusDirectory, response.json())
+    directory: Dict[str, Union[str, Dict[str, Any]]] = response.json()
     directory_out: CensusDirectory = {}
     aliases: typing.Set[CensusVersionName] = set()
 
@@ -379,6 +381,11 @@ def get_census_version_directory(
         if not isinstance(directory_value, dict):
             continue
 
+        # Drop any fields not declared in CensusVersionDescription, for forward compatibility
+        directory_value = {
+            k: directory_value[k] for k in CensusVersionDescription.__annotations__ if k in directory_value
+        }
+
         # filter by release flags
         census_version_description = cast(CensusVersionDescription, directory_value)
         release_flags = cast(ReleaseFlags, {"lts": lts, "retracted": retracted})
@@ -425,6 +432,6 @@ def get_census_mirror_directory() -> Dict[CensusMirrorName, CensusMirror]:
 
 
 def _get_census_mirrors() -> CensusMirrors:
-    response = requests.get(CELL_CENSUS_MIRRORS_DIRECTORY_URL)
+    response = requests.get(CELL_CENSUS_MIRRORS_DIRECTORY_URL, headers={"User-Agent": _user_agent()})
     response.raise_for_status()
     return cast(CensusMirrors, response.json())
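
The new field filter keeps `get_census_version_directory` forward compatible: directory entries may grow new keys server-side without leaking into the typed `CensusVersionDescription`. A standalone illustration of the idiom, using a trimmed stand-in for the real TypedDict:

```python
from typing_extensions import TypedDict


class CensusVersionDescription(TypedDict):  # trimmed stand-in for the real type
    release_build: str
    release_date: str


entry = {"release_build": "2023-12-15", "release_date": "2023-12-15", "new_field": 1}

# Keep only the keys declared on the TypedDict, as the code above does.
filtered = {k: entry[k] for k in CensusVersionDescription.__annotations__ if k in entry}
assert filtered == {"release_build": "2023-12-15", "release_date": "2023-12-15"}
```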
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_testing/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/_testing/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_testing/logger_proxy.py b/api/python/cellxgene_census/src/cellxgene_census/_testing/logger_proxy.py
new file mode 100644
index 000000000..c5e6ef54f
--- /dev/null
+++ b/api/python/cellxgene_census/src/cellxgene_census/_testing/logger_proxy.py
@@ -0,0 +1,40 @@
+"""This module defines a plugin class that logs each request to a logfile.
+
+This class needs to be importable by the proxy server which runs in a separate process.
+See the user agent tests for usage.
+"""
+
+import json
+import traceback
+from pathlib import Path
+
+import proxy
+from proxy.common.flag import flags
+
+flags.add_argument(
+    "--request-log-file",
+    type=str,
+    default="",
+    help="Path of the file to which requests are logged.",
+)
+
+
+class RequestLoggerPlugin(proxy.http.proxy.HttpProxyBasePlugin):  # type: ignore
+    def handle_client_request(self, request: proxy.http.parser.HttpParser) -> proxy.http.parser.HttpParser:
+        # If anything raises in here, the proxy just fails to respond, so surface the error below
+        try:
+            with Path(self.flags.request_log_file).open("a") as f:
+                record = {
+                    "method": request.method.decode(),
+                    "url": str(request._url),
+                }
+
+                if request.headers:
+                    record["headers"] = {k2.decode().lower(): v.decode() for _, (k2, v) in request.headers.items()}
+                f.write(f"{json.dumps(record)}\n")
+        except Exception as e:
+            # Making sure there is some visible output
+            print(repr(e))
+            traceback.print_exception(e)
+            raise e
+        return request
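
The plugin is meant to be loaded into a `proxy.py` server spawned by the user-agent tests; each request is appended as a JSON line to the file named by the custom `--request-log-file` flag. A sketch of driving it in embedded mode, assuming proxy.py's `proxy.Proxy` context manager and a hypothetical log path:

```python
import json
from pathlib import Path

import proxy

log_file = "/tmp/census-requests.jsonl"  # hypothetical path
with proxy.Proxy(
    [
        "--port", "8899",
        "--plugins", "cellxgene_census._testing.logger_proxy.RequestLoggerPlugin",
        "--request-log-file", log_file,
    ]
):
    # Run code under test here with HTTP_PROXY/HTTPS_PROXY=http://localhost:8899
    # so its requests flow through the logging proxy.
    pass

for line in Path(log_file).read_text().splitlines():
    record = json.loads(line)
    assert "cellxgene-census-python/" in record["headers"].get("user-agent", "")
```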
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_util.py b/api/python/cellxgene_census/src/cellxgene_census/_util.py
index b7f70ee2c..70e979294 100644
--- a/api/python/cellxgene_census/src/cellxgene_census/_util.py
+++ b/api/python/cellxgene_census/src/cellxgene_census/_util.py
@@ -2,6 +2,9 @@
 
 import tiledbsoma as soma
 
+USER_AGENT_ENVVAR = "CELLXGENE_CENSUS_USERAGENT"
+"""Environment variable used to add more information into the user-agent."""
+
 
 def _uri_join(base: str, url: str) -> str:
     """Like urllib.parse.urljoin, but doesn't get confused by s3://."""
@@ -30,3 +33,14 @@ def _extract_census_version(census: soma.Collection) -> str:
         raise ValueError("Unable to extract Census version.") from None
 
     return version
+
+
+def _user_agent() -> str:
+    import os
+
+    import cellxgene_census
+
+    if env_specifier := os.environ.get(USER_AGENT_ENVVAR, None):
+        return f"cellxgene-census-python/{cellxgene_census.__version__} {env_specifier}"
+    else:
+        return f"cellxgene-census-python/{cellxgene_census.__version__}"
diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py
index 767b74e0b..4baba8e06 100644
--- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py
+++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py
@@ -17,8 +17,15 @@
 import requests
 import tiledbsoma as soma
 
+from cellxgene_census._util import _user_agent
+
 from .._open import get_default_soma_context, open_soma
-from .._release_directory import CensusVersionDescription, CensusVersionName, get_census_version_directory
+from .._release_directory import (
+    CensusVersionDescription,
+    CensusVersionName,
+    get_census_version_description,
+    get_census_version_directory,
+)
 
 CELL_CENSUS_EMBEDDINGS_MANIFEST_URL = "https://contrib.cellxgene.cziscience.com/contrib/cell-census/contributions.json"
 
@@ -181,7 +188,10 @@ def get_embedding_metadata_by_name(
         ValueError: if no embeddings are found for the specified query parameters.
 
     """
-    response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL)
+    census_version_description = get_census_version_description(census_version)
+    resolved_census_version = census_version_description["release_build"]
+
+    response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, headers={"User-Agent": _user_agent()})
     response.raise_for_status()
 
     manifest = cast(Dict[str, Dict[str, Any]], response.json())
@@ -191,12 +201,14 @@ def get_embedding_metadata_by_name(
             obj["embedding_name"] == embedding_name
             and obj["experiment_name"] == organism
             and obj["data_type"] == embedding_type
-            and obj["census_version"] == census_version
+            and obj["census_version"] == resolved_census_version
         ):
             embeddings.append(obj)
 
     if len(embeddings) == 0:
-        raise ValueError(f"No embeddings found for {embedding_name}, {organism}, {census_version}, {embedding_type}")
+        raise ValueError(
+            f"No embeddings found for {embedding_name}, {organism}, {resolved_census_version}, {embedding_type}"
+        )
 
     return sorted(embeddings, key=lambda x: x["submission_date"])[-1]
 
@@ -224,13 +236,16 @@ def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]:
         }]
 
     """
-    response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL)
+    # Resolve census_version to a concrete release build (this also validates it)
+    census_version_description = get_census_version_description(census_version)
+
+    response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, headers={"User-Agent": _user_agent()})
     response.raise_for_status()
 
     embeddings = []
     manifest = response.json()
     for _, obj in manifest.items():
-        if obj["census_version"] == census_version:
+        if obj["census_version"] == census_version_description["release_build"]:
             embeddings.append(obj)
 
     return embeddings
@@ -252,7 +267,7 @@ def get_all_census_versions_with_embedding(
     Returns:
         A list of census versions that contain the specified embedding.
     """
-    response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL)
+    response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, headers={"User-Agent": _user_agent()})
     response.raise_for_status()
 
     manifest = response.json()
diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/__init__.py
index 99a155bc4..8450cdcd2 100644
--- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/__init__.py
+++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/__init__.py
@@ -1,9 +1,13 @@
 """An API to facilitate use of PyTorch ML training with data from the CZI Science CELLxGENE Census."""
 
+from .encoders import BatchEncoder, Encoder, LabelEncoder
 from .pytorch import ExperimentDataPipe, Stats, experiment_dataloader
 
 __all__ = [
     "Stats",
     "ExperimentDataPipe",
     "experiment_dataloader",
+    "Encoder",
+    "LabelEncoder",
+    "BatchEncoder",
 ]
diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py
new file mode 100644
index 000000000..3d4fc4dc5
--- /dev/null
+++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py
@@ -0,0 +1,130 @@
+import abc
+import functools
+from typing import List
+
+import numpy.typing as npt
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder as SklearnLabelEncoder
+
+
+class Encoder(abc.ABC):
+    """Base class for ``obs`` encoders.
+
+    To define a custom encoder, the following three methods and two properties must be implemented:
+
+    - ``fit``: defines how the encoder will be fitted to the data.
+    - ``transform``: defines how the encoder will be applied to the data
+      in order to create an ``obs`` tensor.
+    - ``inverse_transform``: defines how to decode the encoded values back
+      to the original values.
+    - ``name``: The name of the encoder. This will be used as the key in the
+      dictionary of encoders. Each encoder passed to a :class:`.pytorch.ExperimentDataPipe` must have a unique name.
+    - ``columns``: List of columns in ``obs`` that the encoder will be applied to.
+
+    See the implementation of :class:`LabelEncoder` for an example.
+    """
+
+    @abc.abstractmethod
+    def fit(self, obs: pd.DataFrame) -> None:
+        """Fit the encoder with obs."""
+        pass
+
+    @abc.abstractmethod
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Transform the obs :class:`pandas.DataFrame` into a :class:`pandas.DataFrame` of encoded values."""
+        pass
+
+    @abc.abstractmethod
+    def inverse_transform(self, encoded_values: npt.ArrayLike) -> npt.ArrayLike:
+        """Inverse transform the encoded values back to the original values."""
+        pass
+
+    @property
+    @abc.abstractmethod
+    def name(self) -> str:
+        """Name of the encoder."""
+        pass
+
+    @property
+    @abc.abstractmethod
+    def columns(self) -> List[str]:
+        """Columns in ``obs`` that the encoder will be applied to."""
+        pass
+
+
+class LabelEncoder(Encoder):
+    """Default encoder based on :class:`sklearn.preprocessing.LabelEncoder`."""
+
+    def __init__(self, col: str) -> None:
+        self._encoder = SklearnLabelEncoder()
+        self.col = col
+
+    def fit(self, obs: pd.DataFrame) -> None:
+        """Fit the encoder with ``obs``."""
+        self._encoder.fit(obs[self.col].unique())
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Transform the obs :class:`pandas.DataFrame` into a :class:`pandas.DataFrame` of encoded values."""
+        return self._encoder.transform(df[self.col])  # type: ignore
+
+    def inverse_transform(self, encoded_values: npt.ArrayLike) -> npt.ArrayLike:
+        """Inverse transform the encoded values back to the original values."""
+        return self._encoder.inverse_transform(encoded_values)  # type: ignore
+
+    @property
+    def name(self) -> str:
+        """Name of the encoder."""
+        return self.col
+
+    @property
+    def columns(self) -> List[str]:
+        """Columns in ``obs`` that the encoder will be applied to."""
+        return [self.col]
+
+    @property
+    def classes_(self):  # type: ignore
+        """Classes of the encoder."""
+        return self._encoder.classes_
+
+
+class BatchEncoder(Encoder):
+    """An encoder that concatenates and encodes several ``obs`` columns."""
+
+    def __init__(self, cols: List[str], name: str = "batch"):
+        self.cols = cols
+        self._name = name
+        self._encoder = SklearnLabelEncoder()
+
+    def _join_cols(self, df: pd.DataFrame):  # type: ignore
+        return functools.reduce(lambda a, b: a + b, [df[c].astype(str) for c in self.cols])
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Transform the obs :class:`pandas.DataFrame` into a :class:`pandas.DataFrame` of encoded values."""
+        arr = self._join_cols(df)
+        return self._encoder.transform(arr)  # type: ignore
+
+    def inverse_transform(self, encoded_values: npt.ArrayLike) -> npt.ArrayLike:
+        """Inverse transform the encoded values back to the original values."""
+        return self._encoder.inverse_transform(encoded_values)  # type: ignore
+
+    def fit(self, obs: pd.DataFrame) -> None:
+        """Fit the encoder with ``obs``."""
+        arr = self._join_cols(obs)
+        self._encoder.fit(arr.unique())
+
+    @property
+    def columns(self) -> List[str]:
+        """Columns in ``obs`` that the encoder will be applied to."""
+        return self.cols
+
+    @property
+    def name(self) -> str:
+        """Name of the encoder."""
+        return self._name
+
+    @property
+    def classes_(self):  # type: ignore
+        """Classes of the encoder."""
+        return self._encoder.classes_
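
`BatchEncoder` joins several `obs` columns into a single string key before label-encoding, so each distinct column combination gets one integer code. A sketch on a fabricated `obs` frame:

```python
import pandas as pd

from cellxgene_census.experimental.ml.encoders import BatchEncoder

obs = pd.DataFrame(
    {
        "dataset_id": ["d1", "d1", "d2"],
        "donor_id": ["a", "b", "b"],
    }
)

enc = BatchEncoder(["dataset_id", "donor_id"])
enc.fit(obs)

codes = enc.transform(obs)  # one integer per joined (dataset_id + donor_id) value
assert list(enc.inverse_transform(codes)) == ["d1a", "d1b", "d2b"]
assert enc.name == "batch"
assert enc.columns == ["dataset_id", "donor_id"]
```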
diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py
index 6b274e8fd..07d2212c8 100644
--- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py
+++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py
@@ -66,6 +66,7 @@ def gen() -> Generator[Dict[str, Any], None, None]:
                 self.X(self.layer_name).blockwise(axis=0, reindex_disable_on_axis=[1], size=self.block_size).scipy()
             ):
                 assert isinstance(Xblock, scipy.sparse.csr_matrix)
+                assert Xblock.shape[0] == len(block_cell_joinids)
                 for i, cell_joinid in enumerate(block_cell_joinids):
                     yield self.cell_item(cell_joinid, Xblock.getrow(i))
 
diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py
index 3c8310fe1..1da99bf02 100644
--- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py
+++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py
@@ -1,5 +1,5 @@
 import pickle
-from typing import Any, Dict, Optional, Sequence, Set
+from typing import Any, Dict, List, Optional, Sequence, Set
 
 import numpy as np
 import numpy.typing as npt
@@ -14,7 +14,7 @@ class GeneformerTokenizer(CellDatasetBuilder):
     cell in CELLxGENE Census ExperimentAxisQuery results (human).
 
     This class requires the Geneformer package to be installed separately with:
-    `pip install git+https://huggingface.co/ctheodoris/Geneformer@8df5dc1`
+    `pip install git+https://huggingface.co/ctheodoris/Geneformer@471eefc`
 
     Example usage:
 
@@ -44,11 +44,18 @@ class GeneformerTokenizer(CellDatasetBuilder):
 
     obs_column_names: Set[str]
     max_input_tokens: int
-
-    # set of gene soma_joinids corresponding to genes modeled by Geneformer:
-    model_gene_ids: npt.NDArray[np.int64]
-    model_gene_tokens: npt.NDArray[np.int64]  # token for each model_gene_id
-    model_gene_medians: npt.NDArray[np.float64]  # float for each model_gene_id
+    special_token: bool
+
+    # Newer versions of Geneformer have a consolidated gene list (gene_mapping_file), meaning the
+    # counts for one or more Census genes are to be summed to get the count for one Geneformer
+    # gene. model_gene_map is a sparse binary matrix to map a cell vector (or multi-cell matrix) of
+    # Census gene counts onto Geneformer gene counts. model_gene_map[i,j] is 1 iff the i'th Census
+    # gene count contributes to the j'th Geneformer gene count.
+    model_gene_map: scipy.sparse.coo_matrix
+    model_gene_tokens: npt.NDArray[np.int64]  # Geneformer token for each column of model_gene_map
+    model_gene_medians: npt.NDArray[np.float64]  # float for each column of model_gene_map
+    model_cls_token: Optional[np.int64] = None
+    model_sep_token: Optional[np.int64] = None
 
     def __init__(
         self,
@@ -57,25 +64,33 @@ def __init__(
         obs_column_names: Optional[Sequence[str]] = None,
         obs_attributes: Optional[Sequence[str]] = None,
         max_input_tokens: int = 2048,
+        special_token: bool = False,
         token_dictionary_file: str = "",
         gene_median_file: str = "",
+        gene_mapping_file: str = "",
         **kwargs: Any,
     ) -> None:
-        """- `experiment`: Census Experiment to query
+        """Initialize GeneformerTokenizer.
+
+        Args:
+        - `experiment`: Census Experiment to query
         - `obs_query`: obs AxisQuery defining the set of Census cells to process (default all)
         - `obs_column_names`: obs dataframe columns (cell metadata) to propagate into attributes
            of each Dataset item
         - `max_input_tokens`: maximum length of Geneformer input token sequence (default 2048)
+        - `special_token`: whether to affix the `<cls>` and `<sep>` tokens to the sequence (default False)
         - `token_dictionary_file`, `gene_median_file`: pickle files supplying the mapping of
           Ensembl human gene IDs onto Geneformer token numbers and median expression values.
           By default, these will be loaded from the Geneformer package.
+        - `gene_mapping_file`: optional pickle file supplying a mapping of Census gene IDs onto the model's consolidated gene IDs (see `model_gene_map`)
         """
         if obs_attributes:  # old name of obs_column_names
             obs_column_names = obs_attributes
 
         self.max_input_tokens = max_input_tokens
+        self.special_token = special_token
         self.obs_column_names = set(obs_column_names) if obs_column_names else set()
-        self._load_geneformer_data(experiment, token_dictionary_file, gene_median_file)
+        self._load_geneformer_data(experiment, token_dictionary_file, gene_median_file, gene_mapping_file)
         super().__init__(
             experiment,
             measurement_name="RNA",
@@ -88,6 +103,7 @@ def _load_geneformer_data(
         experiment: tiledbsoma.Experiment,
         token_dictionary_file: str,
         gene_median_file: str,
+        gene_mapping_file: str,
     ) -> None:
         """Load (1) the experiment's genes dataframe and (2) Geneformer's static data
         files for gene tokens and median expression; then, intersect them to compute
@@ -95,7 +111,13 @@ def _load_geneformer_data(
         """
         # TODO: this work could be reused for all queries on this experiment
 
-        genes_df = experiment.ms["RNA"].var.read(column_names=["soma_joinid", "feature_id"]).concat().to_pandas()
+        genes_df = (
+            experiment.ms["RNA"]
+            .var.read(column_names=["soma_joinid", "feature_id"])
+            .concat()
+            .to_pandas()
+            .set_index("soma_joinid")
+        )
 
         if not (token_dictionary_file and gene_median_file):
             try:
@@ -104,7 +126,7 @@ def _load_geneformer_data(
                 # pyproject.toml can't express Geneformer git+https dependency
                 raise ImportError(
                     "Please install Geneformer with: "
-                    "pip install git+https://huggingface.co/ctheodoris/Geneformer@8df5dc1"
+                    "pip install git+https://huggingface.co/ctheodoris/Geneformer@471eefc"
                 ) from None
             if not token_dictionary_file:
                 token_dictionary_file = geneformer.tokenizer.TOKEN_DICTIONARY_FILE
@@ -115,27 +137,47 @@ def _load_geneformer_data(
         with open(gene_median_file, "rb") as f:
             gene_median_dict = pickle.load(f)
 
+        gene_mapping = None
+        if gene_mapping_file:
+            with open(gene_mapping_file, "rb") as f:
+                gene_mapping = pickle.load(f)
+
         # compute model_gene_{ids,tokens,medians} by joining genes_df with Geneformer's
         # dicts
-        model_gene_ids = []
-        model_gene_tokens = []
-        model_gene_medians = []
+        map_data = []
+        map_i = []
+        map_j = []
+        model_gene_id_by_ensg: Dict[str, int] = {}
+        model_gene_count = 0
+        model_gene_tokens: List[np.int64] = []
+        model_gene_medians: List[np.float64] = []
         for gene_id, row in genes_df.iterrows():
             ensg = row["feature_id"]  # ENSG... gene id, which keys Geneformer's dicts
+            if gene_mapping is not None:
+                ensg = gene_mapping.get(ensg, ensg)
             if ensg in gene_token_dict:
-                model_gene_ids.append(gene_id)
-                model_gene_tokens.append(gene_token_dict[ensg])
-                model_gene_medians.append(gene_median_dict[ensg])
-        self.model_gene_ids = np.array(model_gene_ids, dtype=np.int64)
+                if ensg not in model_gene_id_by_ensg:
+                    model_gene_id_by_ensg[ensg] = model_gene_count
+                    model_gene_count += 1
+                    model_gene_tokens.append(gene_token_dict[ensg])
+                    model_gene_medians.append(gene_median_dict[ensg])
+                map_data.append(1)
+                map_i.append(gene_id)
+                map_j.append(model_gene_id_by_ensg[ensg])
+
+        self.model_gene_map = scipy.sparse.coo_matrix(
+            (map_data, (map_i, map_j)), shape=(genes_df.index.max() + 1, model_gene_count), dtype=bool
+        )
         self.model_gene_tokens = np.array(model_gene_tokens, dtype=np.int64)
         self.model_gene_medians = np.array(model_gene_medians, dtype=np.float64)
 
-        assert len(np.unique(self.model_gene_ids)) == len(self.model_gene_ids)
         assert len(np.unique(self.model_gene_tokens)) == len(self.model_gene_tokens)
         assert np.all(self.model_gene_medians > 0)
         # Geneformer models protein-coding and miRNA genes, so the intersection should
-        # be somewhere a little north of 20K.
-        assert len(self.model_gene_ids) > 20_000
+        # be north of 18K.
+        assert (
+            model_gene_count > 18_000
+        ), f"Mismatch between Census gene IDs and Geneformer token dicts (only {model_gene_count} common genes)"
 
         # Precompute a vector by which we'll multiply each cell's expression vector.
         # The denominator normalizes by Geneformer's median expression values.
@@ -143,6 +185,10 @@ def _load_geneformer_data(
         # affect the rank order, but is probably intended to help with numerical precision.
         self.model_gene_medians_factor = 10_000.0 / self.model_gene_medians
 
+        if self.special_token:
+            self.model_cls_token = gene_token_dict["<cls>"]
+            self.model_sep_token = gene_token_dict["<sep>"]
+
     def __enter__(self) -> "GeneformerTokenizer":
         super().__enter__()
         # On context entry, load the necessary cell metadata (obs_df)
@@ -156,21 +202,29 @@ def cell_item(self, cell_joinid: int, cell_Xrow: scipy.sparse.csr_matrix) -> Dic
         """Given the expression vector for one cell, compute the Dataset item providing
         the Geneformer inputs (token sequence and metadata).
         """
-        # project cell_Xrow onto model_gene_ids and normalize by row sum.
-        # notice we divide by the total count of the complete row (not only of the projected
+        # Apply model_gene_map to cell_Xrow and normalize with row sum & gene medians.
+        # Notice we divide by the total count of the complete row (not only of the projected
         # values); this follows Geneformer's internal tokenizer.
-        model_counts = cell_Xrow[:, self.model_gene_ids].multiply(1.0 / cell_Xrow.sum())
-        assert isinstance(model_counts, scipy.sparse.csr_matrix), type(model_counts)
-        # assert len(model_counts.data) == np.count_nonzero(model_counts.data)
-        model_expr = model_counts.multiply(self.model_gene_medians_factor)
+        model_expr = (cell_Xrow * self.model_gene_map).multiply(self.model_gene_medians_factor / cell_Xrow.sum())
         assert isinstance(model_expr, scipy.sparse.coo_matrix), type(model_expr)
-        # assert len(model_expr.data) == np.count_nonzero(model_expr.data)
+        assert model_expr.shape == (1, self.model_gene_map.shape[1])
 
         # figure the resulting tokens in descending order of model_expr
         # (use sparse model_expr.{col,data} to naturally exclude undetected genes)
         token_order = model_expr.col[np.argsort(-model_expr.data)[: self.max_input_tokens]]
         input_ids = self.model_gene_tokens[token_order]
 
+        if self.special_token:
+            # affix <cls>/<sep> special tokens, dropping up to two trailing gene tokens to stay within max_input_tokens
+            if len(input_ids) == self.max_input_tokens:
+                input_ids = input_ids[:-1]
+            assert self.model_cls_token is not None
+            input_ids = np.insert(input_ids, 0, self.model_cls_token)
+            if len(input_ids) == self.max_input_tokens:
+                input_ids = input_ids[:-1]
+            assert self.model_sep_token is not None
+            input_ids = np.append(input_ids, self.model_sep_token)
+
         ans = {"input_ids": input_ids, "length": len(input_ids)}
         # add the requested obs attributes
         for attr in self.obs_column_names:
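
The switch from a plain gene-id list to `model_gene_map` means projection is now a sparse matrix product: Census genes that map to the same consolidated Geneformer gene have their counts summed, and unmapped genes drop out. A toy illustration with fabricated shapes and values:

```python
import numpy as np
import scipy.sparse

# 4 Census genes -> 2 Geneformer genes; Census genes 0 and 2 both map to
# Geneformer gene 0, and Census gene 1 is not modeled at all.
model_gene_map = scipy.sparse.coo_matrix(
    ([1, 1, 1], ([0, 2, 3], [0, 0, 1])), shape=(4, 2), dtype=bool
)

cell_Xrow = scipy.sparse.csr_matrix(np.array([[5.0, 7.0, 3.0, 2.0]]))

projected = cell_Xrow * model_gene_map  # 1x2 result: [5 + 3, 2]
assert np.allclose(projected.toarray(), [[8.0, 2.0]])
```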
diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py
index 6bf9aa30c..765c5cea1 100644
--- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py
+++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py
@@ -1,25 +1,25 @@
 import gc
+import itertools
 import logging
 import os
+import typing
 from contextlib import contextmanager
 from datetime import timedelta
 from math import ceil
 from time import time
-from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple
+from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union
 
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
 import psutil
-import pyarrow as pa
-import scipy
 import tiledbsoma as soma
 import torch
 import torchdata.datapipes.iter as pipes
 from attr import define
 from numpy.random import Generator
+from pyarrow import Table
 from scipy import sparse
-from sklearn.preprocessing import LabelEncoder
 from torch import Tensor
 from torch import distributed as dist
 from torch.utils.data import DataLoader
@@ -27,6 +27,7 @@
 
 from ... import get_default_soma_context
 from ..util._eager_iter import _EagerIterator
+from .encoders import Encoder, LabelEncoder
 
 pytorch_logger = logging.getLogger("cellxgene_census.experimental.pytorch")
 
@@ -36,6 +37,10 @@
 The Tensors are rank 1 if ``batch_size`` is 1, otherwise the Tensors are rank 2."""
 
 
+# "Chunk" of X data, returned by each `Method` above
+ChunkX = Union[npt.NDArray[Any], sparse.csr_matrix]
+
+
 @define
 class _SOMAChunk:
     """Return type of ``_ObsAndXSOMAIterator`` that pairs a chunk of ``obs`` rows with the respective rows from the ``X``
@@ -46,15 +51,15 @@ class _SOMAChunk:
     """
 
     obs: pd.DataFrame
-    X: scipy.sparse.spmatrix
+    X: ChunkX
     stats: "Stats"
 
     def __len__(self) -> int:
         return len(self.obs)
 
 
-Encoders = Dict[str, LabelEncoder]
-"""A dictionary of ``LabelEncoder``s keyed by the ``obs`` column name."""
+Encoders = Dict[str, Encoder]
+"""A dictionary of ``Encoder``s keyed by the ``obs`` column name."""
 
 
 @define
@@ -72,7 +77,7 @@ class Stats:
     nnz: int = 0
     """The total number of values retrieved"""
 
-    elapsed: int = 0
+    elapsed: float = 0
     """The total elapsed time in seconds for retrieving all batches"""
 
     n_soma_chunks: int = 0
@@ -101,6 +106,17 @@ def _open_experiment(
         yield exp
 
 
+def _tables_to_np(
+    tables: Iterator[Tuple[Table, Any]], shape: Tuple[int, int]
+) -> typing.Generator[Tuple[npt.NDArray[Any], Any, int], None, None]:
+    for tbl, indices in tables:
+        row_indices, col_indices, data = (x.to_numpy() for x in tbl.columns)
+        nnz = len(data)
+        dense_matrix = np.zeros(shape, dtype=data.dtype)
+        dense_matrix[row_indices, col_indices] = data
+        yield dense_matrix, indices, nnz
+
+
 class _ObsAndXSOMAIterator(Iterator[_SOMAChunk]):
     """Iterates the SOMA chunks of corresponding ``obs`` and ``X`` data. This is an internal class,
     not intended for public use.
@@ -123,11 +139,12 @@ def __init__(
         var_joinids: npt.NDArray[np.int64],
         shuffle_chunk_count: Optional[int] = None,
         shuffle_rng: Optional[Generator] = None,
+        return_sparse_X: bool = False,
     ):
         self.obs = obs
         self.X = X
         self.obs_column_names = obs_column_names
-        if shuffle_chunk_count:
+        if shuffle_chunk_count is not None:
             assert shuffle_rng is not None
 
             # At the start of this step, `obs_joinids_chunked` is a list of one dimensional
@@ -145,6 +162,7 @@ def __init__(
             self.obs_joinids_chunks_iter = iter(obs_joinids_chunked)
         self.var_joinids = var_joinids
         self.shuffle_chunk_count = shuffle_chunk_count
+        self.return_sparse_X = return_sparse_X
 
     def __next__(self) -> _SOMAChunk:
         pytorch_logger.debug("Retrieving next SOMA chunk...")
@@ -153,10 +171,15 @@ def __next__(self) -> _SOMAChunk:
         # If no more chunks to iterate through, raise StopIteration, as all iterators do when at end
         obs_joinids_chunk = next(self.obs_joinids_chunks_iter)
 
+        if "soma_joinid" not in self.obs_column_names:
+            cols = ["soma_joinid", *self.obs_column_names]
+        else:
+            cols = list(self.obs_column_names)
+
         obs_batch = (
             self.obs.read(
                 coords=(obs_joinids_chunk,),
-                column_names=self.obs_column_names,
+                column_names=cols,
             )
             .concat()
             .to_pandas()
@@ -173,18 +196,25 @@ def __next__(self) -> _SOMAChunk:
 
         # note: the `blockwise` call is employed for its ability to reindex the axes of the sparse matrix,
         # but the blockwise iteration feature is not used (block_size is set to retrieve the chunk as a single block)
-        scipy_iter = (
-            self.X.read(coords=(obs_joinids_chunk, self.var_joinids))
-            .blockwise(axis=0, size=len(obs_joinids_chunk), eager=False)
-            .scipy(compress=True)
+        blockwise_iter = self.X.read(coords=(obs_joinids_chunk, self.var_joinids)).blockwise(
+            axis=0, size=len(obs_joinids_chunk), eager=False
         )
-        X_batch, _ = next(scipy_iter)
+
+        X_batch: ChunkX
+        if not self.return_sparse_X:
+            res = next(_tables_to_np(blockwise_iter.tables(), shape=(obs_batch.shape[0], len(self.var_joinids))))
+            X_batch, nnz = res[0], res[2]
+        else:
+            X_batch = next(blockwise_iter.scipy(compress=True))[0]
+            nnz = X_batch.nnz
+
         assert obs_batch.shape[0] == X_batch.shape[0]
 
+        end_time = time()
         stats = Stats()
         stats.n_obs += X_batch.shape[0]
-        stats.nnz += X_batch.nnz
-        stats.elapsed += int(time() - start_time)
+        stats.nnz += nnz
+        stats.elapsed += end_time - start_time
         stats.n_soma_chunks += 1
 
         pytorch_logger.debug(f"Retrieved SOMA chunk: {stats}")
@@ -208,17 +238,19 @@ def list_split(arr_list: List[Any], sublist_len: int) -> List[List[Any]]:
     return result
 
 
-def run_gc() -> Tuple[Tuple[Any, Any, Any], Tuple[Any, Any, Any]]:  # noqa: D103
+def run_gc() -> Tuple[Tuple[Any, Any, Any], Tuple[Any, Any, Any], float]:  # noqa: D103
     proc = psutil.Process(os.getpid())
 
     pre_gc = proc.memory_full_info(), psutil.virtual_memory(), psutil.swap_memory()
+    start = time()
     gc.collect()
+    gc_elapsed = time() - start
     post_gc = proc.memory_full_info(), psutil.virtual_memory(), psutil.swap_memory()
 
     pytorch_logger.debug(f"gc:  pre={pre_gc}")
     pytorch_logger.debug(f"gc: post={post_gc}")
 
-    return pre_gc, post_gc
+    return pre_gc, post_gc, gc_elapsed
 
 
 class _ObsAndXIterator(Iterator[ObsAndXDatum]):
@@ -248,7 +280,7 @@ def __init__(
         obs_joinids_chunked: List[npt.NDArray[np.int64]],
         var_joinids: npt.NDArray[np.int64],
         batch_size: int,
-        encoders: Dict[str, LabelEncoder],
+        encoders: List[Encoder],
         stats: Stats,
         return_sparse_X: bool,
         use_eager_fetch: bool,
@@ -256,7 +288,14 @@ def __init__(
         shuffle_rng: Optional[Generator] = None,
     ) -> None:
         self.soma_chunk_iter = _ObsAndXSOMAIterator(
-            obs, X, obs_column_names, obs_joinids_chunked, var_joinids, shuffle_chunk_count, shuffle_rng
+            obs,
+            X,
+            obs_column_names,
+            obs_joinids_chunked,
+            var_joinids,
+            shuffle_chunk_count,
+            shuffle_rng,
+            return_sparse_X=return_sparse_X,
         )
         if use_eager_fetch:
             self.soma_chunk_iter = _EagerIterator(self.soma_chunk_iter)
@@ -266,39 +305,47 @@ def __init__(
         self.return_sparse_X = return_sparse_X
         self.encoders = encoders
         self.stats = stats
+        self.gc_elapsed = 0.0
         self.max_process_mem_usage_bytes = 0
         self.X_dtype = X.schema[2].type.to_pandas_dtype()
 
     def __next__(self) -> ObsAndXDatum:
         """Read the next torch batch, possibly across multiple soma chunks."""
-        obs: pd.DataFrame = pd.DataFrame()
-        X: sparse.csr_matrix = sparse.csr_matrix((0, len(self.var_joinids)), dtype=self.X_dtype)
+        obss: List[pd.DataFrame] = []
+        Xs: List[ChunkX] = []
+        n_obs = 0
 
-        while len(obs) < self.batch_size:
+        while n_obs < self.batch_size:
             try:
-                obs_partial, X_partial = self._read_partial_torch_batch(self.batch_size - len(obs))
-                obs = pd.concat([obs, obs_partial], axis=0)
-                X = sparse.vstack([X, X_partial])
+                obs_partial, X_partial = self._read_partial_torch_batch(self.batch_size - n_obs)
+                n_obs += len(obs_partial)
+                obss.append(obs_partial)
+                Xs.append(X_partial)
             except StopIteration:
                 break
 
-        if len(obs) == 0:
+        if len(Xs) == 0:  # If we ran out of data
             raise StopIteration
+        else:
+            if self.return_sparse_X:
+                X = sparse.vstack(Xs)
+            else:
+                X = np.concatenate(Xs, axis=0)
+            obs = pd.concat(obss, axis=0)
 
-        obs_encoded = pd.DataFrame(
-            data={"soma_joinid": obs.index},
-            columns=["soma_joinid"] + obs.columns.tolist(),
-            dtype=np.int64,
-        )
-        # TODO: Encode the entire SOMA chunk at once in _read_partial_torch_batch()
-        for col, enc in self.encoders.items():
-            obs_encoded[col] = enc.transform(obs[col])
+        obs_encoded = pd.DataFrame()
+
+        # Add the soma_joinid to the original obs, in case that is requested by the encoders.
+        obs["soma_joinid"] = obs.index
+
+        for enc in self.encoders:
+            obs_encoded[enc.name] = enc.transform(obs)
 
         # `to_numpy()` avoids copying the numpy array data
         obs_tensor = torch.from_numpy(obs_encoded.to_numpy())
 
         if not self.return_sparse_X:
-            X_tensor = torch.from_numpy(X.todense())
+            X_tensor = torch.from_numpy(X)
         else:
             coo = X.tocoo()
 
@@ -315,7 +362,7 @@ def __next__(self) -> ObsAndXDatum:
 
         return X_tensor, obs_tensor
 
-    def _read_partial_torch_batch(self, batch_size: int) -> ObsAndXDatum:
+    def _read_partial_torch_batch(self, batch_size: int) -> Tuple[pd.DataFrame, ChunkX]:
         """Reads a torch-size batch of data from the current SOMA chunk, returning a torch-size batch whose size may
         contain fewer rows than the requested ``batch_size``. This can happen when the remaining rows in the current
         SOMA chunk are fewer than the requested ``batch_size``.
@@ -323,17 +370,20 @@ def _read_partial_torch_batch(self, batch_size: int) -> ObsAndXDatum:
         if self.soma_chunk is None or not (0 <= self.i < len(self.soma_chunk)):
             # GC memory from previous soma_chunk
             self.soma_chunk = None
-            mem_info = run_gc()
-            self.max_process_mem_usage_bytes = max(self.max_process_mem_usage_bytes, mem_info[0][0].uss)
+            pre_gc, _, gc_elapsed = run_gc()
+            self.max_process_mem_usage_bytes = max(self.max_process_mem_usage_bytes, pre_gc[0].uss)
 
             self.soma_chunk: _SOMAChunk = next(self.soma_chunk_iter)
             self.stats += self.soma_chunk.stats
+            self.gc_elapsed += gc_elapsed
             self.i = 0
 
-            pytorch_logger.debug(f"Retrieved SOMA chunk totals: {self.stats}")
+            pytorch_logger.debug(
+                f"Retrieved SOMA chunk totals: {self.stats}, gc_elapsed={timedelta(seconds=self.gc_elapsed)}"
+            )
 
         obs_batch = self.soma_chunk.obs
-        X_batch = self.soma_chunk.X
+        X_chunk = self.soma_chunk.X
 
         safe_batch_size = min(batch_size, len(obs_batch) - self.i)
         slice_ = slice(self.i, self.i + safe_batch_size)
@@ -343,12 +393,13 @@ def _read_partial_torch_batch(self, batch_size: int) -> ObsAndXDatum:
         assert obs_rows.index.is_unique
         assert safe_batch_size == obs_rows.shape[0]
 
-        X_csr_scipy = X_batch[slice_]
-        assert obs_rows.shape[0] == X_csr_scipy.shape[0]
+        X_batch = X_chunk[slice_]
+
+        assert obs_rows.shape[0] == X_batch.shape[0]
 
         self.i += safe_batch_size
 
-        return obs_rows, X_csr_scipy
+        return obs_rows, X_batch
 
 
 class ExperimentDataPipe(pipes.IterDataPipe[Dataset[ObsAndXDatum]]):  # type: ignore
@@ -396,7 +447,7 @@ class ExperimentDataPipe(pipes.IterDataPipe[Dataset[ObsAndXDatum]]):  # type: ig
 
     _var_joinids: Optional[npt.NDArray[np.int64]]
 
-    _encoders: Optional[Encoders]
+    _encoders: List[Encoder]
 
     _stats: Stats
 
@@ -418,6 +469,7 @@ def __init__(
         return_sparse_X: bool = False,
         soma_chunk_size: Optional[int] = 64,
         use_eager_fetch: bool = True,
+        encoders: Optional[List[Encoder]] = None,
         shuffle_chunk_count: Optional[int] = 2000,
     ) -> None:
         r"""Construct a new ``ExperimentDataPipe``.
@@ -438,6 +490,8 @@ def __init__(
             obs_column_names:
                 The names of the ``obs`` columns to return. The ``soma_joinid`` index "column" does not need to be
                 specified and will always be returned. If not specified, only the ``soma_joinid`` will be returned.
+                If custom encoders are passed, this parameter must not be used, since the columns will be inferred
+                automatically from the encoders.
             batch_size:
                 The number of rows of ``obs`` and ``X`` data to return in each iteration. Defaults to ``1``. A value of
                 ``1`` will result in :class:`torch.Tensor` of rank 1 being returns (a single row); larger values will
@@ -476,6 +530,12 @@ def __init__(
                 The number of contiguous blocks (chunks) of rows sampled to then concatenate and shuffle.
                 Larger numbers correspond to more randomness per training batch.
                 If ``shuffle == False``, this parameter is ignored. Defaults to ``2000``.
+            encoders:
+                Specify custom encoders to be used. If not specified, a LabelEncoder will be created and
+                used for each column in ``obs_column_names``. If specified, only columns for which an encoder
+                has been registered will be returned in the ``obs`` tensor. Each encoder needs to have a unique name.
+                If this parameter is specified, the ``obs_column_names`` parameter must not be used,
+                since the columns will be inferred automatically from the encoders.
 
         Lifecycle:
             experimental
@@ -492,15 +552,25 @@ def __init__(
         self.soma_chunk_size = soma_chunk_size
         self.use_eager_fetch = use_eager_fetch
         self._stats = Stats()
-        self._encoders = None
+        self._encoders = encoders or []
         self._obs_joinids = None
         self._var_joinids = None
         self._shuffle_chunk_count = shuffle_chunk_count if shuffle else None
         self._shuffle_rng = np.random.default_rng(seed) if shuffle else None
         self._initialized = False
+        self.max_process_mem_usage_bytes = 0
 
-        if "soma_joinid" not in self.obs_column_names:
-            self.obs_column_names = ["soma_joinid", *self.obs_column_names]
+        if obs_column_names and encoders:
+            raise ValueError(
+                "Cannot specify both `obs_column_names` and `encoders`. If `encoders` are specified, columns will be inferred automatically."
+            )
+
+        if encoders:
+            # Check if names are unique
+            if len(encoders) != len({enc.name for enc in encoders}):
+                raise ValueError("Encoders must have unique names")
+
+            self.obs_column_names = list(dict.fromkeys(itertools.chain(*[enc.columns for enc in encoders])))
 
     def _init(self) -> None:
         if self._initialized:
@@ -607,7 +677,7 @@ def __iter__(self) -> Iterator[ObsAndXDatum]:
                 obs_joinids_chunked=obs_joinids_chunked_partition,
                 var_joinids=self._var_joinids,
                 batch_size=self.batch_size,
-                encoders=self.obs_encoders,
+                encoders=self._encoders,
                 stats=self._stats,
                 return_sparse_X=self.return_sparse_X,
                 use_eager_fetch=self.use_eager_fetch,
@@ -617,8 +687,9 @@ def __iter__(self) -> Iterator[ObsAndXDatum]:
 
             yield from obs_and_x_iter
 
+            self.max_process_mem_usage_bytes = obs_and_x_iter.max_process_mem_usage_bytes
             pytorch_logger.debug(
-                "max process memory usage=" f"{obs_and_x_iter.max_process_mem_usage_bytes / (1024 ** 3):.3f} GiB"
+                "max process memory usage=" f"{self.max_process_mem_usage_bytes / (1024 ** 3):.3f} GiB"
             )
 
     @staticmethod
@@ -631,21 +702,35 @@ def __len__(self) -> int:
         self._init()
         assert self._obs_joinids is not None
 
-        return len(self._obs_joinids)
+        div, rem = divmod(len(self._obs_joinids), self.batch_size)
+        return div + bool(rem)
 
     def __getitem__(self, index: int) -> ObsAndXDatum:
         raise NotImplementedError("IterDataPipe can only be iterated")
 
-    def _build_obs_encoders(self, query: soma.ExperimentAxisQuery) -> Encoders:
+    def _build_obs_encoders(self, query: soma.ExperimentAxisQuery) -> List[Encoder]:
         pytorch_logger.debug("Initializing encoders")
 
-        obs = query.obs(column_names=self.obs_column_names).concat()
-        encoders = {}
-        for col in self.obs_column_names:
-            if obs[col].type in (pa.string(), pa.large_string()):
-                enc = LabelEncoder()
-                enc.fit(obs[col].combine_chunks().unique())
-                encoders[col] = enc
+        encoders = []
+
+        if "soma_joinid" not in self.obs_column_names:
+            cols = ["soma_joinid", *self.obs_column_names]
+        else:
+            cols = list(self.obs_column_names)
+
+        obs = query.obs(column_names=cols).concat().to_pandas()
+
+        if self._encoders:
+            # Fit all the custom encoders with obs
+            for enc in self._encoders:
+                enc.fit(obs)
+                encoders.append(enc)
+        else:
+            # Create one LabelEncoder for each column, and fit it with obs
+            for col in self.obs_column_names:
+                enc = LabelEncoder(col)
+                enc.fit(obs)
+                encoders.append(enc)
 
         return encoders
 
@@ -696,7 +781,7 @@ def obs_encoders(self) -> Encoders:
         self._init()
         assert self._encoders is not None
 
-        return self._encoders
+        return {enc.name: enc for enc in self._encoders}
 
 
 # Note: must be a top-level function (and not a lambda), to play nice with multiprocessing pickling
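
For reviewers, a minimal sketch of how the reworked encoder API fits together, based on the changes above (the `census` handle, the column names, and the batch size are illustrative; `LabelEncoder`/`BatchEncoder` are the classes this change adds under cellxgene_census.experimental.ml.encoders):

    import cellxgene_census
    from cellxgene_census.experimental.ml.encoders import BatchEncoder, LabelEncoder
    from cellxgene_census.experimental.ml.pytorch import ExperimentDataPipe

    census = cellxgene_census.open_soma(census_version="2023-12-15")
    experiment = census["census_data"]["homo_sapiens"]

    # Either pass obs_column_names (one LabelEncoder is fitted per column), or pass
    # custom encoders, from which the obs columns are inferred; passing both raises.
    dp = ExperimentDataPipe(
        experiment,
        measurement_name="RNA",
        X_name="raw",
        encoders=[LabelEncoder("cell_type"), BatchEncoder(["dataset_id", "assay"])],
        batch_size=128,
        shuffle=False,
    )

    len(dp)  # now the number of batches (ceil(n_obs / batch_size)), not the number of rows
    dp.obs_encoders  # dict keyed by encoder name, e.g. {"cell_type": ..., "batch": ...}
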
diff --git a/api/python/cellxgene_census/tests/conftest.py b/api/python/cellxgene_census/tests/conftest.py
index 2041cd146..da500ceff 100644
--- a/api/python/cellxgene_census/tests/conftest.py
+++ b/api/python/cellxgene_census/tests/conftest.py
@@ -1,8 +1,13 @@
+import multiprocessing
+
 import pytest
 import tiledbsoma as soma
 
 TEST_MARKERS_SKIPPED_BY_DEFAULT = ["expensive", "experimental"]
 
+# tiledb will complain if the start method isn't forced to "spawn" before a process is created, and the proxy test may segfault without it.
+multiprocessing.set_start_method("spawn", force=True)
+
 
 def pytest_addoption(parser: pytest.Parser) -> None:
     for test_option in TEST_MARKERS_SKIPPED_BY_DEFAULT:
@@ -49,9 +54,6 @@ def small_mem_context() -> soma.SOMATileDBContext:
     return get_default_soma_context(tiledb_config={"soma.init_buffer_bytes": 32 * 1024**2})
 
 
-# Fixtures for census objects
-
-
 @pytest.fixture(scope="session")
 def census() -> soma.Collection:
     import cellxgene_census
@@ -63,4 +65,4 @@ def census() -> soma.Collection:
 def lts_census() -> soma.Collection:
     import cellxgene_census
 
-    return cellxgene_census.open_soma(census_version="stable")
+    return cellxgene_census.open_soma(census_version="2023-12-15")
diff --git a/api/python/cellxgene_census/tests/experimental/ml/huggingface/test_geneformer.py b/api/python/cellxgene_census/tests/experimental/ml/huggingface/test_geneformer.py
index c557a27a8..76dadd525 100644
--- a/api/python/cellxgene_census/tests/experimental/ml/huggingface/test_geneformer.py
+++ b/api/python/cellxgene_census/tests/experimental/ml/huggingface/test_geneformer.py
@@ -1,3 +1,5 @@
+import sys
+
 import datasets
 import pytest
 import tiledbsoma
@@ -66,7 +68,7 @@ def test_GeneformerTokenizer_correctness(tmpdir: Path) -> None:
         ad.write_h5ad(h5ad_dir.join("tokenizeme.h5ad"))
         # run geneformer.TranscriptomeTokenizer to get "true" tokenizations
         # see: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/tokenizer.py
-        TranscriptomeTokenizer({}).tokenize_data(h5ad_dir, tmpdir, "tk", file_format="h5ad")
+        TranscriptomeTokenizer({}).tokenize_data(h5ad_dir, str(tmpdir), "tk", file_format="h5ad")
         true_tokens = [it["input_ids"] for it in datasets.load_from_disk(tmpdir.join("tk.dataset"))]
 
         # check GeneformerTokenizer sequences against geneformer.TranscriptomeTokenizer's
@@ -87,6 +89,7 @@ def test_GeneformerTokenizer_correctness(tmpdir: Path) -> None:
         assert identical / len(cell_ids) >= EXACT_THRESHOLD
 
 
+@pytest.mark.skipif(sys.version_info < (3, 10), reason="requires python3.10 or higher")
 @pytest.mark.experimental
 @pytest.mark.live_corpus
 def test_GeneformerTokenizer_docstring_example() -> None:
diff --git a/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py b/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py
index 678119856..2cdc5b772 100644
--- a/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py
+++ b/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py
@@ -18,6 +18,7 @@
     from torch import Tensor, float32
     from torch.utils.data._utils.worker import WorkerInfo
 
+    from cellxgene_census.experimental.ml.encoders import BatchEncoder, LabelEncoder
     from cellxgene_census.experimental.ml.pytorch import (
         ExperimentDataPipe,
         experiment_dataloader,
@@ -71,6 +72,7 @@ def add_dataframe(coll: CollectionBase, key: str, value_range: range) -> None:
             [
                 ("soma_joinid", pa.int64()),
                 ("label", pa.large_string()),
+                ("label2", pa.large_string()),
             ]
         ),
         index_column_names=["soma_joinid"],
@@ -80,6 +82,7 @@ def add_dataframe(coll: CollectionBase, key: str, value_range: range) -> None:
             {
                 "soma_joinid": list(value_range),
                 "label": [str(i) for i in value_range],
+                "label2": ["c" for i in value_range],
             }
         )
     )
@@ -150,7 +153,40 @@ def test_non_batched(soma_experiment: Experiment, use_eager_fetch: bool) -> None
 
     row = next(row_iter)
     assert row[0].int().tolist() == [0, 1, 0]
-    assert row[1].tolist() == [0, 0]
+    assert row[1].tolist() == [0]
+
+
+@pytest.mark.experimental
+# noinspection PyTestParametrized
+@pytest.mark.parametrize(
+    "obs_range,var_range,X_value_gen,use_eager_fetch",
+    [(6, 3, pytorch_x_value_gen, use_eager_fetch) for use_eager_fetch in (True, False)],
+)
+@pytest.mark.parametrize("return_sparse_X", [True, False])
+def test_uneven_soma_and_result_batches(
+    soma_experiment: Experiment, use_eager_fetch: bool, return_sparse_X: bool
+) -> None:
+    """This is checking that batches are correctly created when they require fetching multiple chunks.
+
+    This was added due to failures in _ObsAndXIterator.__next__.
+    """
+    exp_data_pipe = ExperimentDataPipe(
+        soma_experiment,
+        measurement_name="RNA",
+        X_name="raw",
+        obs_column_names=["label"],
+        shuffle=False,
+        batch_size=3,
+        soma_chunk_size=2,
+        return_sparse_X=return_sparse_X,
+        use_eager_fetch=use_eager_fetch,
+    )
+    row_iter = iter(exp_data_pipe)
+
+    row = next(row_iter)
+    X_batch = row[0].to_dense() if return_sparse_X else row[0]
+    assert X_batch.int()[0].tolist() == [0, 1, 0]
+    assert row[1].tolist() == [[0], [1], [2]]
 
 
 @pytest.mark.experimental
@@ -173,11 +209,11 @@ def test_batching__all_batches_full_size(soma_experiment: Experiment, use_eager_
 
     batch = next(batch_iter)
     assert batch[0].int().tolist() == [[0, 1, 0], [1, 0, 1], [0, 1, 0]]
-    assert batch[1].tolist() == [[0, 0], [1, 1], [2, 2]]
+    assert batch[1].tolist() == [[0], [1], [2]]
 
     batch = next(batch_iter)
     assert batch[0].int().tolist() == [[1, 0, 1], [0, 1, 0], [1, 0, 1]]
-    assert batch[1].tolist() == [[3, 3], [4, 4], [5, 5]]
+    assert batch[1].tolist() == [[3], [4], [5]]
 
     with pytest.raises(StopIteration):
         next(batch_iter)
@@ -250,7 +286,7 @@ def test_batching__exactly_one_batch(soma_experiment: Experiment, use_eager_fetc
 
     batch = next(batch_iter)
     assert batch[0].int().tolist() == [[0, 1, 0], [1, 0, 1], [0, 1, 0]]
-    assert batch[1].tolist() == [[0, 0], [1, 1], [2, 2]]
+    assert batch[1].tolist() == [[0], [1], [2]]
 
     with pytest.raises(StopIteration):
         next(batch_iter)
@@ -336,7 +372,7 @@ def test_batching__partial_soma_batches_are_concatenated(soma_experiment: Experi
         soma_experiment,
         measurement_name="RNA",
         X_name="raw",
-        obs_column_names=[],
+        obs_column_names=["label"],
         batch_size=3,
         # set SOMA batch read size such that PyTorch batches will span the tail and head of two SOMA batches
         soma_chunk_size=4,
@@ -345,13 +381,13 @@ def test_batching__partial_soma_batches_are_concatenated(soma_experiment: Experi
 
     full_result = list(exp_data_pipe)
 
-    assert [len(batch[1]) for batch in full_result] == [3, 3, 3, 1]
+    assert [len(batch[0]) for batch in full_result] == [3, 3, 3, 1]
 
 
 @pytest.mark.experimental
 # noinspection PyTestParametrized
 @pytest.mark.parametrize("obs_range,var_range,X_value_gen", [(3, 3, pytorch_x_value_gen)])
-def test_encoders(soma_experiment: Experiment) -> None:
+def test_default_encoders_implicit(soma_experiment: Experiment) -> None:
     exp_data_pipe = ExperimentDataPipe(
         soma_experiment,
         measurement_name="RNA",
@@ -364,10 +400,89 @@ def test_encoders(soma_experiment: Experiment) -> None:
 
     batch = next(batch_iter)
     assert isinstance(batch[1], Tensor)
+    assert batch[0].to_dense().tolist() == [[0, 1, 0], [1, 0, 1], [0, 1, 0]]
+
+    labels_encoded = batch[1]
 
-    labels_encoded = batch[1][:, 1]
     labels_decoded = exp_data_pipe.obs_encoders["label"].inverse_transform(labels_encoded)
-    assert labels_decoded.tolist() == ["0", "1", "2"]
+    assert labels_decoded.tolist() == ["0", "1", "2"]  # type: ignore
+
+
+@pytest.mark.experimental
+# noinspection PyTestParametrized
+@pytest.mark.parametrize("obs_range,var_range,X_value_gen", [(3, 3, pytorch_x_value_gen)])
+def test_default_encoders_explicit(soma_experiment: Experiment) -> None:
+    exp_data_pipe = ExperimentDataPipe(
+        soma_experiment,
+        measurement_name="RNA",
+        X_name="raw",
+        encoders=[LabelEncoder("label")],
+        shuffle=False,
+        batch_size=3,
+    )
+    batch_iter = iter(exp_data_pipe)
+
+    batch = next(batch_iter)
+    assert isinstance(batch[1], Tensor)
+
+    labels_encoded = batch[1]
+
+    labels_decoded = exp_data_pipe.obs_encoders["label"].inverse_transform(labels_encoded)
+    assert labels_decoded.tolist() == ["0", "1", "2"]  # type: ignore
+
+
+@pytest.mark.experimental
+# noinspection PyTestParametrized
+@pytest.mark.parametrize("obs_range,var_range,X_value_gen", [(3, 3, pytorch_x_value_gen)])
+def test_batch_encoder(soma_experiment: Experiment) -> None:
+    exp_data_pipe = ExperimentDataPipe(
+        soma_experiment,
+        measurement_name="RNA",
+        X_name="raw",
+        encoders=[BatchEncoder(["label", "label2"])],
+        shuffle=False,
+        batch_size=3,
+    )
+    batch_iter = iter(exp_data_pipe)
+
+    batch = next(batch_iter)
+    assert isinstance(batch[1], Tensor)
+
+    labels_encoded = batch[1]
+
+    labels_decoded = exp_data_pipe.obs_encoders["batch"].inverse_transform(labels_encoded)
+    assert labels_decoded.tolist() == ["0c", "1c", "2c"]  # type: ignore
+
+
+@pytest.mark.experimental
+# noinspection PyTestParametrized
+@pytest.mark.parametrize("obs_range,var_range,X_value_gen", [(3, 3, pytorch_x_value_gen)])
+def test_custom_encoders_fail_if_duplicate(soma_experiment: Experiment) -> None:
+    with pytest.raises(ValueError):
+        ExperimentDataPipe(
+            soma_experiment,
+            measurement_name="RNA",
+            X_name="raw",
+            encoders=[LabelEncoder("label"), LabelEncoder("label")],
+            shuffle=False,
+            batch_size=3,
+        )
+
+
+@pytest.mark.experimental
+# noinspection PyTestParametrized
+@pytest.mark.parametrize("obs_range,var_range,X_value_gen", [(3, 3, pytorch_x_value_gen)])
+def test_custom_encoders_fail_if_columns_defined(soma_experiment: Experiment) -> None:
+    with pytest.raises(ValueError, match="Cannot specify both `obs_column_names` and `encoders`"):
+        ExperimentDataPipe(
+            soma_experiment,
+            measurement_name="RNA",
+            X_name="raw",
+            obs_column_names=["label"],
+            encoders=[LabelEncoder("label")],
+            shuffle=False,
+            batch_size=3,
+        )
 
 
 @pytest.mark.experimental
@@ -419,7 +534,7 @@ def test_distributed__returns_data_partition_for_rank(
             soma_experiment,
             measurement_name="RNA",
             X_name="raw",
-            obs_column_names=["label"],
+            encoders=[LabelEncoder("soma_joinid"), LabelEncoder("label")],
             soma_chunk_size=2,
             shuffle=False,
         )
@@ -458,7 +573,7 @@ def test_distributed_and_multiprocessing__returns_data_partition_for_rank(
             soma_experiment,
             measurement_name="RNA",
             X_name="raw",
-            obs_column_names=["label"],
+            encoders=[LabelEncoder("soma_joinid"), LabelEncoder("label")],
             soma_chunk_size=2,
             shuffle=False,
         )
@@ -484,7 +599,7 @@ def test_experiment_dataloader__non_batched(soma_experiment: Experiment, use_eag
         soma_experiment,
         measurement_name="RNA",
         X_name="raw",
-        obs_column_names=["label"],
+        encoders=[LabelEncoder("soma_joinid"), LabelEncoder("label")],
         shuffle=False,
         use_eager_fetch=use_eager_fetch,
     )
@@ -507,7 +622,7 @@ def test_experiment_dataloader__batched(soma_experiment: Experiment, use_eager_f
         soma_experiment,
         measurement_name="RNA",
         X_name="raw",
-        obs_column_names=["label"],
+        encoders=[LabelEncoder("soma_joinid"), LabelEncoder("label")],
         batch_size=3,
         shuffle=False,
         use_eager_fetch=use_eager_fetch,
@@ -520,6 +635,26 @@ def test_experiment_dataloader__batched(soma_experiment: Experiment, use_eager_f
     assert batch[1].tolist() == [[0, 0], [1, 1], [2, 2]]
 
 
+@pytest.mark.experimental
+# noinspection PyTestParametrized,DuplicatedCode
+@pytest.mark.parametrize(
+    "obs_range,var_range,X_value_gen,use_eager_fetch",
+    [(10, 3, pytorch_x_value_gen, use_eager_fetch) for use_eager_fetch in (True, False)],
+)
+def test_experiment_dataloader__batched_length(soma_experiment: Experiment, use_eager_fetch: bool) -> None:
+    dp = ExperimentDataPipe(
+        soma_experiment,
+        measurement_name="RNA",
+        X_name="raw",
+        obs_column_names=["label"],
+        batch_size=3,
+        shuffle=False,
+        use_eager_fetch=use_eager_fetch,
+    )
+    dl = experiment_dataloader(dp)
+    assert len(dl) == len(list(dl))
+
+
 @pytest.mark.experimental
 # noinspection PyTestParametrized,DuplicatedCode
 @pytest.mark.parametrize(
@@ -565,7 +700,7 @@ def test__shuffle(soma_experiment: Experiment) -> None:
         soma_experiment,
         measurement_name="RNA",
         X_name="raw",
-        obs_column_names=["label"],
+        encoders=[LabelEncoder("soma_joinid"), LabelEncoder("label")],
         shuffle=True,
     )
 
diff --git a/api/python/cellxgene_census/tests/experimental/test_embeddings.py b/api/python/cellxgene_census/tests/experimental/test_embeddings.py
index aeb0ff661..4ee1ad579 100644
--- a/api/python/cellxgene_census/tests/experimental/test_embeddings.py
+++ b/api/python/cellxgene_census/tests/experimental/test_embeddings.py
@@ -1,6 +1,9 @@
+from functools import partial
+
 import pytest
 import requests_mock as rm
 
+import cellxgene_census
 from cellxgene_census.experimental import (
     get_all_available_embeddings,
     get_all_census_versions_with_embedding,
@@ -74,6 +77,32 @@ def test_get_embedding_metadata_by_name(requests_mock: rm.Mocker) -> None:
         )
 
 
+@pytest.mark.xfail(
+    strict=True,
+    reason="Current stable release doesn't have embeddings available. This xfail should be removed once that's resolved.",
+)
+def test_get_embedding_by_name_w_version_aliases() -> None:
+    """https://github.com/chanzuckerberg/cellxgene-census/issues/1202"""
+    # Only testing "stable" as "latest" doesn't have embeddings
+    version = "stable"
+    resolved_version = cellxgene_census.get_census_version_description(version)["release_build"]
+
+    metadata = get_all_available_embeddings(version)[0]
+
+    _get_metadata = partial(
+        get_embedding_metadata_by_name,
+        embedding_name=metadata["embedding_name"],
+        organism=metadata["experiment_name"],
+        embedding_type=metadata["data_type"],
+    )
+
+    w_alias = _get_metadata(census_version=version)
+    w_resolved = _get_metadata(census_version=resolved_version)
+
+    assert w_resolved == w_alias
+    assert metadata == w_alias
+
+
 def test_get_all_available_embeddings(requests_mock: rm.Mocker) -> None:
     mock_embeddings = {
         "embedding-id-1": {
@@ -108,8 +137,8 @@ def test_get_all_available_embeddings(requests_mock: rm.Mocker) -> None:
     assert embeddings is not None
     assert len(embeddings) == 2
 
-    # Query for a non existing version of the Census
-    embeddings = get_all_available_embeddings("2024-12-15")
+    # Query for a version of the census that doesn't have embeddings
+    embeddings = get_all_available_embeddings("2023-05-15")
     assert len(embeddings) == 0
 
 
@@ -175,3 +204,18 @@ def test_get_all_census_versions_with_embedding(requests_mock: rm.Mocker) -> Non
 
     versions = get_all_census_versions_with_embedding("emb_2", organism="mus_musculus", embedding_type="var_embedding")
     assert versions == ["2023-12-15"]
+
+
+@pytest.mark.parametrize("version", ["stable", "latest"])
+def test_get_all_available_embeddings_w_version_aliases(version: str) -> None:
+    """https://github.com/chanzuckerberg/cellxgene-census/issues/1202"""
+    resolved_version = cellxgene_census.get_census_version_description(version)["release_build"]
+
+    assert get_all_available_embeddings(version) == get_all_available_embeddings(resolved_version)
+
+
+def test_get_all_available_embeddings_non_existing_version() -> None:
+    false_version = "not a real version"
+
+    with pytest.raises(ValueError, match=f"Unable to locate Census version: {false_version}"):
+        get_all_available_embeddings(false_version)
diff --git a/api/python/cellxgene_census/tests/test_directory.py b/api/python/cellxgene_census/tests/test_directory.py
index 9ac52f6ea..14a29bb9b 100644
--- a/api/python/cellxgene_census/tests/test_directory.py
+++ b/api/python/cellxgene_census/tests/test_directory.py
@@ -34,7 +34,6 @@
         "release_date": "2022-09-30",
         "release_build": "2022-09-01",
         "flags": {"lts": True, "retracted": True},
-        "do_not_delete": True,
         "retraction": {
             "date": "2022-11-15",
             "reason": "mistakes happen",
@@ -53,7 +52,6 @@
     "2022-11-01": {
         "release_date": "2022-11-30",
         "release_build": "2022-11-01",
-        "do_not_delete": True,
         "soma": {
             "uri": "s3://cellxgene-data-public/cell-census/2022-11-01/soma/",
             "s3_region": "us-west-2",
@@ -188,3 +186,17 @@ def test_live_directory_contents() -> None:
 
         assert fs.exists(version_description["soma"]["uri"])
         assert fs.exists(version_description["h5ads"]["uri"])
+
+
+def test_census_version_types() -> None:
+    """Do a little bit of runtime type checking on the results of census version functions.
+
+    Part of solving: https://github.com/chanzuckerberg/cellxgene-census/issues/1204
+    """
+    from cellxgene_census._release_directory import CensusVersionDescription
+
+    directory = cellxgene_census.get_census_version_directory()
+    for k, v in directory.items():
+        assert set(v).issubset(CensusVersionDescription.__annotations__)
+        desc = cellxgene_census.get_census_version_description(k)
+        assert set(desc).issubset(CensusVersionDescription.__annotations__)
diff --git a/api/python/cellxgene_census/tests/test_get_anndata.py b/api/python/cellxgene_census/tests/test_get_anndata.py
index a2f0be1ce..e03452f38 100644
--- a/api/python/cellxgene_census/tests/test_get_anndata.py
+++ b/api/python/cellxgene_census/tests/test_get_anndata.py
@@ -8,16 +8,6 @@
 import cellxgene_census
 
 
-@pytest.fixture(scope="session")
-def census() -> soma.Collection:
-    return cellxgene_census.open_soma(census_version="latest")
-
-
-@pytest.fixture(scope="session")
-def lts_census() -> soma.Collection:
-    return cellxgene_census.open_soma(census_version="stable")
-
-
 @pytest.mark.live_corpus
 def test_get_anndata_value_filter(census: soma.Collection) -> None:
     ad = cellxgene_census.get_anndata(
@@ -25,16 +15,14 @@ def test_get_anndata_value_filter(census: soma.Collection) -> None:
         organism="Mus musculus",
         obs_value_filter="tissue_general == 'vasculature'",
         var_value_filter="feature_name in ['Gm53058', '0610010K14Rik']",
-        column_names={
-            "obs": [
-                "soma_joinid",
-                "cell_type",
-                "tissue",
-                "tissue_general",
-                "assay",
-            ],
-            "var": ["soma_joinid", "feature_id", "feature_name", "feature_length"],
-        },
+        obs_column_names=[
+            "soma_joinid",
+            "cell_type",
+            "tissue",
+            "tissue_general",
+            "assay",
+        ],
+        var_column_names=["soma_joinid", "feature_id", "feature_name", "feature_length"],
     )
 
     assert ad is not None
@@ -253,6 +241,66 @@ def test_get_anndata_obsm_layers_and_add_obs_embedding_fails(lts_census: soma.Co
         )
 
 
+@pytest.mark.live_corpus
+def test_deprecated_column_api(census: soma.Collection) -> None:
+    """Testing for previous `column_names` argument.
+
+    See: https://github.com/chanzuckerberg/cellxgene-census/issues/1035
+    """
+    ad_curr = cellxgene_census.get_anndata(
+        census,
+        organism="Mus musculus",
+        obs_value_filter="tissue_general == 'vasculature'",
+        var_value_filter="feature_name in ['Gm53058', '0610010K14Rik']",
+        obs_column_names=[
+            "soma_joinid",
+            "cell_type",
+            "tissue",
+            "tissue_general",
+            "assay",
+        ],
+        var_column_names=["soma_joinid", "feature_id", "feature_name", "feature_length"],
+    )
+    with pytest.warns(FutureWarning):
+        ad_prev = cellxgene_census.get_anndata(
+            census,
+            organism="Mus musculus",
+            obs_value_filter="tissue_general == 'vasculature'",
+            var_value_filter="feature_name in ['Gm53058', '0610010K14Rik']",
+            column_names={
+                "obs": [
+                    "soma_joinid",
+                    "cell_type",
+                    "tissue",
+                    "tissue_general",
+                    "assay",
+                ],
+                "var": ["soma_joinid", "feature_id", "feature_name", "feature_length"],
+            },
+        )
+    with pytest.raises(
+        ValueError, match=r"Both the deprecated 'column_names' argument and its replacements were used."
+    ):
+        cellxgene_census.get_anndata(
+            census,
+            organism="Mus musculus",
+            obs_value_filter="tissue_general == 'vasculature'",
+            var_value_filter="feature_name in ['Gm53058', '0610010K14Rik']",
+            obs_column_names=[
+                "soma_joinid",
+                "cell_type",
+            ],
+            column_names={
+                "obs": [
+                    "soma_joinid",
+                    "cell_type",
+                ],
+            },
+        )
+    pd.testing.assert_frame_equal(ad_curr.obs, ad_prev.obs)
+    pd.testing.assert_frame_equal(ad_curr.var, ad_prev.var)
+
+
 def _map_to_get_anndata_args(query: Dict[str, Any], axis: Literal["obs", "var"]) -> Dict[str, Any]:
     """Helper to map arguments of get_obs/ get_var to get_anndata."""
     result = {}
diff --git a/api/python/cellxgene_census/tests/test_user_agent.py b/api/python/cellxgene_census/tests/test_user_agent.py
new file mode 100644
index 000000000..dc410df9a
--- /dev/null
+++ b/api/python/cellxgene_census/tests/test_user_agent.py
@@ -0,0 +1,284 @@
+# mypy: ignore-errors
+from __future__ import annotations
+
+import json
+import os
+from functools import partial
+from pathlib import Path
+from typing import TYPE_CHECKING, Callable
+
+import numpy as np
+import proxy
+import pytest
+import requests
+from urllib3.exceptions import InsecureRequestWarning
+
+if TYPE_CHECKING:
+    from _pytest.tmpdir import TempPathFactory
+
+import cellxgene_census
+
+# We are forcing the requests to be insecure so we can intercept them.
+pytestmark = pytest.mark.filterwarnings("ignore::urllib3.exceptions.InsecureRequestWarning")
+
+
+class ProxyInstance:
+    def __init__(self, proxy_obj: proxy.Proxy, logpth: Path):
+        self.proxy = proxy_obj
+        self.logpth = logpth
+
+    @property
+    def port(self) -> int:
+        return self.proxy.flags.port
+
+
+@pytest.fixture(scope="session")
+def ca_certificates(tmp_path_factory: TempPathFactory) -> tuple[Path, Path, Path]:
+    # Adapted from https://github.com/abhinavsingh/proxy.py/blob/a7077cf8db3bb66a6667a9d968a401e8f805e092/Makefile#L68C1-L82C49
+    # TODO: Figure out if we can remove this. Currently seems necessary for intercepting tiledb s3 requests.
+    cert_dir = tmp_path_factory.mktemp("ca-certificates")
+    KEY_FILE = cert_dir / "ca-key.pem"
+    CERT_FILE = cert_dir / "ca-cert.pem"
+    SIGNING_KEY_FILE = cert_dir / "ca-signing-key.pem"
+    assert proxy.common.pki.gen_private_key(key_path=KEY_FILE, password="proxy.py")
+    assert proxy.common.pki.remove_passphrase(key_in_path=KEY_FILE, password="proxy.py", key_out_path=KEY_FILE)
+    assert proxy.common.pki.gen_public_key(
+        public_key_path=CERT_FILE, private_key_path=KEY_FILE, private_key_password="proxy.py", subject="/CN=localhost"
+    )
+    assert proxy.common.pki.gen_private_key(key_path=SIGNING_KEY_FILE, password="proxy.py")
+    assert proxy.common.pki.remove_passphrase(
+        key_in_path=SIGNING_KEY_FILE, password="proxy.py", key_out_path=SIGNING_KEY_FILE
+    )
+    return (KEY_FILE, CERT_FILE, SIGNING_KEY_FILE)
+
+
+@pytest.fixture(scope="session")
+def proxy_server(
+    tmp_path_factory: TempPathFactory,
+    ca_certificates: tuple[Path, Path, Path],
+):
+    import cellxgene_census
+
+    tmp_path = tmp_path_factory.mktemp("proxy_logs")
+    # proxy.py can override the passed ca-key-file and ca-cert-file with cached ones, so we create a fresh certificate cache for each proxy server
+    cert_cache_dir = tmp_path_factory.mktemp("certificates_cache")
+    proxy_log_file = tmp_path / "proxy.log"
+    request_log_file = tmp_path / "proxy_requests.log"
+    key_file, cert_file, signing_keyfile = ca_certificates
+    assert all(p.is_file() for p in (key_file, cert_file, signing_keyfile))
+
+    # Adapted from the TestCase setup in proxy.py: https://github.com/abhinavsingh/proxy.py/blob/develop/proxy/testing/test_case.py#L23
+    PROXY_PY_STARTUP_FLAGS = [
+        "--num-workers",
+        "1",
+        "--num-acceptors",
+        "1",
+        "--hostname",
+        "127.0.0.1",
+        "--port",
+        "0",
+        "--plugin",
+        "cellxgene_census._testing.logger_proxy.RequestLoggerPlugin",
+        "--ca-key-file",
+        str(key_file),
+        "--ca-cert-file",
+        str(cert_file),
+        "--ca-signing-key-file",
+        str(signing_keyfile),
+        "--ca-cert-dir",
+        str(cert_cache_dir),
+        "--log-file",
+        str(proxy_log_file),
+        "--request-log-file",
+        str(request_log_file),
+    ]
+    proxy_obj = proxy.Proxy(PROXY_PY_STARTUP_FLAGS)
+    with proxy_obj:
+        assert proxy_obj.acceptors
+        proxy.TestCase.wait_for_server(proxy_obj.flags.port)
+        proxy_instance = ProxyInstance(proxy_obj, request_log_file)
+
+        # Now that the proxy is up, set the relevant environment variables/constants so that all request-making libraries use it
+        with pytest.MonkeyPatch.context() as mp:
+            # Both requests and s3fs use these environment variables:
+            mp.setenv("HTTP_PROXY", f"http://localhost:{proxy_obj.flags.port}")
+            mp.setenv("HTTPS_PROXY", f"http://localhost:{proxy_obj.flags.port}")
+
+            # s3fs
+            mp.setattr(
+                cellxgene_census._open,
+                "DEFAULT_S3FS_KWARGS",
+                {
+                    "anon": True,
+                    "cache_regions": True,
+                    "use_ssl": False,  # So we can inspect the requests on the proxy
+                },
+            )
+
+            # requests
+            mp.setattr(requests, "get", partial(requests.request, "get", verify=False))
+
+            # tiledb
+            tiledb_config = cellxgene_census._open.DEFAULT_TILEDB_CONFIGURATION.copy()
+            tiledb_config["vfs.s3.proxy_host"] = "localhost"
+            tiledb_config["vfs.s3.proxy_port"] = str(proxy_instance.port)
+            tiledb_config["vfs.s3.verify_ssl"] = "false"
+            mp.setattr(
+                cellxgene_census._open,
+                "DEFAULT_TILEDB_CONFIGURATION",
+                tiledb_config,
+            )
+
+            yield proxy_instance
+
+
+@pytest.fixture
+def test_specific_useragent() -> str:
+    """Sets custom user agent addendum for every test so they can be uniqueley identified."""
+    current_test_name = os.environ["PYTEST_CURRENT_TEST"]
+    with pytest.MonkeyPatch.context() as mp:
+        mp.setenv("CELLXGENE_CENSUS_USERAGENT", current_test_name)
+        yield current_test_name
+
+
+@pytest.fixture()
+def collect_proxy_requests(proxy_server: ProxyInstance):
+    """Test specific fixture exposing the proxy server.
+
+    While a proxy server is started for every test session, this fixture
+    captures only the output that is written for a specific tests. These
+    logged requests can be checked to make sure have the correct headers.
+    """
+    # If logs have already been written, count how many
+    if proxy_server.logpth.is_file():
+        with proxy_server.logpth.open("r") as f:
+            prev_lines = len(f.readlines())
+    else:
+        prev_lines = 0
+
+    def _proxy_requests():
+        # Return only the records written to the log since this test started
+        with proxy_server.logpth.open("r") as f:
+            records = [json.loads(line) for line in f.readlines()]
+        records = records[prev_lines:]
+        return records
+
+    # Run test
+    yield _proxy_requests
+
+
+@pytest.fixture(scope="session")
+def small_dataset_id() -> str:
+    with cellxgene_census.open_soma(census_version="latest") as census:
+        census_datasets = census["census_info"]["datasets"].read().concat().to_pandas()
+
+    small_dataset = census_datasets.nsmallest(1, "dataset_total_cell_count").iloc[0]
+    assert isinstance(small_dataset.dataset_id, str)
+    return small_dataset.dataset_id
+
+
+def check_proxy_records(records: list[dict], *, custom_user_agent: None | str = None, min_records: int = 1) -> None:
+    # Check that there aren't two CONNECT requests in a row
+    prev_was_connect = False
+    for record in records:
+        was_connect = record["method"] == "CONNECT"
+        if prev_was_connect and was_connect:
+            raise AssertionError(
+                "Received multiple CONNECT requests in a row. Some calls aren't being intercepted by the proxy."
+            )
+        prev_was_connect = was_connect
+
+    # Check that headers were set correctly on intercepted requests
+    n_records = 0
+    for record in records:
+        if record["method"] == "CONNECT":
+            continue
+        n_records += 1
+        headers = record["headers"]
+        user_agent = headers["user-agent"]
+        assert "cellxgene-census-python" in user_agent
+        assert cellxgene_census.__version__ in user_agent
+        if custom_user_agent:
+            assert custom_user_agent in user_agent
+    assert n_records >= min_records, f"Fewer than min_records ({min_records}) were found."
+
+
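+# For illustration only: each log record is expected to look roughly like the
+# following, inferred from the assertions in check_proxy_records (the exact
+# schema is defined by RequestLoggerPlugin):
+#
+#     {
+#         "method": "GET",
+#         "headers": {
+#             "host": "census.cellxgene.cziscience.com",
+#             "user-agent": "cellxgene-census-python/<__version__> <custom addendum>",
+#         },
+#     }
+
+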
+def test_proxy_fixture(collect_proxy_requests: Callable[[], list[dict]]):
+    """Test that our proxy testing setup is working as expected."""
+    # Should just be downloading a json
+    with pytest.warns(InsecureRequestWarning):
+        _ = cellxgene_census.get_census_version_directory()
+
+    records = collect_proxy_requests()
+
+    # Expecting a CONNECT request followed by a GET request
+    assert len(records) == 2
+    assert records[0]["method"] == "CONNECT"
+    assert records[1]["method"] == "GET"
+    assert records[1]["headers"]["host"] == "census.cellxgene.cziscience.com"
+    assert "cellxgene-census-python" in records[1]["headers"]["user-agent"]
+
+
+def test_download_w_proxy_fixture(
+    small_dataset_id: str,
+    collect_proxy_requests: Callable[[], list[dict]],
+    tmp_path: Path,
+    test_specific_useragent: str,
+):
+    # Using collect_proxy_requests forces the test through the proxy; the headers of the intercepted requests are checked below
+    adata_path = tmp_path / "adata.h5ad"
+    cellxgene_census.download_source_h5ad(small_dataset_id, adata_path.as_posix(), census_version="latest")
+
+    records = collect_proxy_requests()
+    check_proxy_records(
+        records,
+        custom_user_agent=test_specific_useragent,
+        min_records=3,  # Should make at least the metadata JSON requests plus the download itself
+    )
+
+
+def test_query_w_proxy_fixture(collect_proxy_requests: Callable[[], list[dict]]):
+    with cellxgene_census.open_soma(census_version="stable") as census:
+        _ = cellxgene_census.get_obs(census, "Mus musculus", coords=slice(100, 300))
+
+    records = collect_proxy_requests()
+    check_proxy_records(
+        records,
+        min_records=5,  # some metadata requests, then many requests from tiledb
+    )
+
+
+def test_embedding_headers(collect_proxy_requests: Callable[[], list[dict]]):
+    import cellxgene_census.experimental
+
+    CENSUS_VERSION = "2023-12-15"
+
+    embeddings_metadata = cellxgene_census.experimental.get_all_available_embeddings(CENSUS_VERSION)
+    metadata = embeddings_metadata[0]
+    embedding_uri = (
+        f"s3://cellxgene-contrib-public/contrib/cell-census/soma/{metadata['census_version']}/{metadata['id']}"
+    )
+    _ = cellxgene_census.experimental.get_embedding(
+        CENSUS_VERSION,
+        embedding_uri=embedding_uri,
+        obs_soma_joinids=np.arange(100),
+    )
+
+    check_proxy_records(collect_proxy_requests())
+
+
+def test_dataloader_headers(collect_proxy_requests) -> None:
+    import cellxgene_census
+    from cellxgene_census.experimental.ml.pytorch import ExperimentDataPipe
+
+    soma_experiment = cellxgene_census.open_soma(census_version="latest")["census_data"]["homo_sapiens"]
+    dp = ExperimentDataPipe(
+        soma_experiment,
+        measurement_name="RNA",
+        X_name="raw",
+        obs_column_names=["cell_type"],
+        shuffle=False,
+    )
+    _ = next(iter(dp))
+
+    records = collect_proxy_requests()
+    check_proxy_records(records, min_records=5)
diff --git a/api/python/notebooks/analysis_demo/comp_bio_embedding_exploration.ipynb b/api/python/notebooks/analysis_demo/comp_bio_embedding_exploration.ipynb
index 8e872a8ce..e858c8df7 100644
--- a/api/python/notebooks/analysis_demo/comp_bio_embedding_exploration.ipynb
+++ b/api/python/notebooks/analysis_demo/comp_bio_embedding_exploration.ipynb
@@ -168,8 +168,7 @@
     "# Let's find our cells of interest\n",
     "obs_value_filter = \"tissue_general=='eye' and is_primary_data == True\"\n",
     "\n",
-    "obs_df = census[\"census_data\"][EXPERIMENT_NAME].obs.read(value_filter=obs_value_filter, column_names=[\"soma_joinid\"])\n",
-    "obs_df = obs_df.concat().to_pandas()\n",
+    "obs_df = cellxgene_census.get_obs(census, EXPERIMENT_NAME, value_filter=obs_value_filter, column_names=[\"soma_joinid\"])\n",
     "\n",
     "print(obs_df.shape[0], \"cells in\", obs_value_filter)\n",
     "\n",
@@ -578,9 +577,8 @@
     "# Let's find our cells of interest\n",
     "obs_value_filter = \"tissue_general=='brain' and is_primary_data == True\"\n",
     "\n",
-    "obs_df = census[\"census_data\"][EXPERIMENT_NAME].obs.read(value_filter=obs_value_filter, column_names=[\"soma_joinid\"])\n",
+    "obs_df = cellxgene_census.get_obs(census, EXPERIMENT_NAME, value_filter=obs_value_filter, column_names=[\"soma_joinid\"])\n",
     "\n",
-    "obs_df = obs_df.concat().to_pandas()\n",
     "print(obs_df.shape[0], \"cells in\", obs_value_filter)\n",
     "\n",
     "# Let's subset to 150K\n",
diff --git a/api/python/notebooks/analysis_demo/comp_bio_explore_and_load_lung_data.ipynb b/api/python/notebooks/analysis_demo/comp_bio_explore_and_load_lung_data.ipynb
index b2d8cfdb2..952c643a1 100644
--- a/api/python/notebooks/analysis_demo/comp_bio_explore_and_load_lung_data.ipynb
+++ b/api/python/notebooks/analysis_demo/comp_bio_explore_and_load_lung_data.ipynb
@@ -603,11 +603,8 @@
     }
    ],
    "source": [
-    "lung_obs = (\n",
-    "    census[\"census_data\"][\"homo_sapiens\"]\n",
-    "    .obs.read(value_filter=\"tissue_general == 'lung' and is_primary_data == True\")\n",
-    "    .concat()\n",
-    "    .to_pandas()\n",
+    "lung_obs = cellxgene_census.get_obs(\n",
+    "    census, \"homo_sapiens\", value_filter=\"tissue_general == 'lung' and is_primary_data == True\"\n",
     ")\n",
     "lung_obs"
    ]
@@ -1612,7 +1609,7 @@
     }
    ],
    "source": [
-    "lung_var = census[\"census_data\"][\"homo_sapiens\"].ms[\"RNA\"].var.read().concat().to_pandas()\n",
+    "lung_var = cellxgene_census.get_var(census, \"homo_sapiens\")\n",
     "lung_var"
    ]
   },
diff --git a/api/python/notebooks/analysis_demo/comp_bio_summarize_axis_query.ipynb b/api/python/notebooks/analysis_demo/comp_bio_summarize_axis_query.ipynb
index 04a6473eb..0d2e236e2 100644
--- a/api/python/notebooks/analysis_demo/comp_bio_summarize_axis_query.ipynb
+++ b/api/python/notebooks/analysis_demo/comp_bio_summarize_axis_query.ipynb
@@ -113,10 +113,8 @@
     }
    ],
    "source": [
-    "human = census[\"census_data\"][\"homo_sapiens\"]\n",
-    "\n",
     "# Read entire _obs_ into a pandas dataframe.\n",
-    "obs_df = human.obs.read(column_names=[\"cell_type_ontology_term_id\"]).concat().to_pandas()\n",
+    "obs_df = cellxgene_census.get_obs(census, \"homo_sapiens\", column_names=[\"cell_type_ontology_term_id\"])\n",
     "\n",
     "# Use Pandas API to find all unique values in the `cell_type_ontology_term_id` column.\n",
     "unique_cell_type_ontology_term_id = obs_df.cell_type_ontology_term_id.unique()\n",
@@ -178,18 +176,15 @@
    ],
    "source": [
     "# Count cell_type occurrences for cells with tissue == 'lung'\n",
-    "human = census[\"census_data\"][\"homo_sapiens\"]\n",
     "\n",
     "# Read cell_type terms for cells which have a specific tissue term\n",
     "LUNG_TISSUE = \"UBERON:0002048\"\n",
     "\n",
-    "obs_df = (\n",
-    "    human.obs.read(\n",
-    "        column_names=[\"cell_type_ontology_term_id\"],\n",
-    "        value_filter=f\"tissue_ontology_term_id == '{LUNG_TISSUE}'\",\n",
-    "    )\n",
-    "    .concat()\n",
-    "    .to_pandas()\n",
+    "obs_df = cellxgene_census.get_obs(\n",
+    "    census,\n",
+    "    \"homo_sapiens\",\n",
+    "    column_names=[\"cell_type_ontology_term_id\"],\n",
+    "    value_filter=f\"tissue_ontology_term_id == '{LUNG_TISSUE}'\",\n",
     ")\n",
     "\n",
     "# Use Pandas API to find all unique values in the `cell_type_ontology_term_id` column.\n",
@@ -251,17 +246,13 @@
    ],
    "source": [
     "# You can also do more complex queries, such as testing for inclusion in a list of values and \"and\" operations\n",
-    "human = census[\"census_data\"][\"homo_sapiens\"]\n",
-    "\n",
     "VENTRICLES = [\"UBERON:0002082\", \"UBERON:OOO2084\", \"UBERON:0002080\"]\n",
     "\n",
-    "obs_df = (\n",
-    "    human.obs.read(\n",
-    "        column_names=[\"cell_type_ontology_term_id\"],\n",
-    "        value_filter=f\"tissue_ontology_term_id in {VENTRICLES} and is_primary_data == True\",\n",
-    "    )\n",
-    "    .concat()\n",
-    "    .to_pandas()\n",
+    "obs_df = cellxgene_census.get_obs(\n",
+    "    census,\n",
+    "    \"homo_sapiens\",\n",
+    "    column_names=[\"cell_type_ontology_term_id\"],\n",
+    "    value_filter=f\"tissue_ontology_term_id in {VENTRICLES} and is_primary_data == True\",\n",
     ")\n",
     "\n",
     "# Use Pandas API to summarize\n",
@@ -314,8 +305,7 @@
     "]\n",
     "\n",
     "obs_df = {\n",
-    "    name: experiment.obs.read(column_names=COLS_TO_QUERY).concat().to_pandas()\n",
-    "    for name, experiment in census[\"census_data\"].items()\n",
+    "    name: cellxgene_census.get_obs(census, name, column_names=COLS_TO_QUERY) for name in census[\"census_data\"].keys()\n",
     "}\n",
     "\n",
     "# Use Pandas API to summarize each organism\n",
diff --git a/api/python/notebooks/api_demo/census_access_maintained_embeddings.ipynb b/api/python/notebooks/api_demo/census_access_maintained_embeddings.ipynb
index 0d4382b3b..6d2d6c92f 100644
--- a/api/python/notebooks/api_demo/census_access_maintained_embeddings.ipynb
+++ b/api/python/notebooks/api_demo/census_access_maintained_embeddings.ipynb
@@ -181,7 +181,7 @@
     "    organism=\"homo_sapiens\",\n",
     "    measurement_name=\"RNA\",\n",
     "    obs_value_filter=\"tissue_general == 'central nervous system'\",\n",
-    "    column_names={\"obs\": [\"cell_type\"]},\n",
+    "    obs_column_names=[\"cell_type\"],\n",
     "    obs_embeddings=emb_names,\n",
     ")\n",
     "\n",
@@ -422,12 +422,12 @@
     "\n",
     "census = cellxgene_census.open_soma(census_version=census_version)\n",
     "\n",
-    "obs_df = census[\"census_data\"][experiment_name].obs.read(\n",
+    "obs_df = cellxgene_census.get_obs(\n",
+    "    census,\n",
+    "    experiment_name,\n",
     "    value_filter=\"tissue_general == 'central nervous system'\",\n",
     "    column_names=[\"soma_joinid\", \"cell_type\"],\n",
-    ")\n",
-    "\n",
-    "obs_df = obs_df.concat().to_pandas()"
+    ")"
    ]
   },
   {
@@ -445,7 +445,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "metadata = get_embedding_metadata_by_name(\"scvi\", \"homo_sapiens\", census_version=census_version)\n",
+    "metadata = get_embedding_metadata_by_name(\"scvi\", experiment_name, census_version=census_version)\n",
     "embedding_uri = f\"s3://cellxgene-contrib-public/contrib/cell-census/soma/{metadata['census_version']}/{metadata['id']}\"\n",
     "embedding = get_embedding(metadata[\"census_version\"], embedding_uri, obs_df.soma_joinid.to_numpy())"
    ]
diff --git a/api/python/notebooks/api_demo/census_citation_generation.ipynb b/api/python/notebooks/api_demo/census_citation_generation.ipynb
index eb849e20c..9845c7855 100644
--- a/api/python/notebooks/api_demo/census_citation_generation.ipynb
+++ b/api/python/notebooks/api_demo/census_citation_generation.ipynb
@@ -257,10 +257,9 @@
    ],
    "source": [
     "# Query cell metadata\n",
-    "cell_metadata = census[\"census_data\"][\"homo_sapiens\"].obs.read(\n",
-    "    value_filter=\"tissue == 'cardiac atrium'\", column_names=[\"dataset_id\", \"cell_type\"]\n",
+    "cell_metadata = cellxgene_census.get_obs(\n",
+    "    census, \"homo_sapiens\", value_filter=\"tissue == 'cardiac atrium'\", column_names=[\"dataset_id\", \"cell_type\"]\n",
     ")\n",
-    "cell_metadata = cell_metadata.concat().to_pandas()\n",
     "\n",
     "# Get a citation string for the slice\n",
     "slice_datasets = datasets[datasets[\"dataset_id\"].isin(cell_metadata[\"dataset_id\"])]\n",
@@ -325,7 +324,7 @@
     "    measurement_name=\"RNA\",\n",
     "    obs_value_filter=\"tissue == 'cardiac atrium'\",\n",
     "    var_value_filter=\"feature_name == 'MYBPC3'\",\n",
-    "    column_names={\"obs\": [\"dataset_id\", \"cell_type\"]},\n",
+    "    obs_column_names=[\"dataset_id\", \"cell_type\"],\n",
     ")\n",
     "\n",
     "# Get a citation string for the slice\n",
diff --git a/api/python/notebooks/api_demo/census_datasets.ipynb b/api/python/notebooks/api_demo/census_datasets.ipynb
index da3202549..ec42b8b6f 100644
--- a/api/python/notebooks/api_demo/census_datasets.ipynb
+++ b/api/python/notebooks/api_demo/census_datasets.ipynb
@@ -350,13 +350,10 @@
    ],
    "source": [
     "# Count cells across all experiments\n",
-    "all_experiments = (\n",
-    "    (organism_name, organism_experiment) for organism_name, organism_experiment in census[\"census_data\"].items()\n",
-    ")\n",
     "experiments_total_cells = 0\n",
     "print(\"Count by experiment:\")\n",
-    "for organism_name, organism_experiment in all_experiments:\n",
-    "    num_cells = len(organism_experiment.obs.read(column_names=[\"soma_joinid\"]).concat().to_pandas())\n",
+    "for organism_name in census[\"census_data\"].keys():\n",
+    "    num_cells = len(cellxgene_census.get_obs(census, organism_name, column_names=[\"soma_joinid\"]))\n",
     "    print(f\"\\t{num_cells} cells in {organism_name}\")\n",
     "    experiments_total_cells += num_cells\n",
     "\n",
diff --git a/api/python/notebooks/api_demo/census_duplicated_cells.ipynb b/api/python/notebooks/api_demo/census_duplicated_cells.ipynb
index 9b902430a..575a9119e 100644
--- a/api/python/notebooks/api_demo/census_duplicated_cells.ipynb
+++ b/api/python/notebooks/api_demo/census_duplicated_cells.ipynb
@@ -71,13 +71,14 @@
     "import cellxgene_census\n",
     "\n",
     "tabula_muris_dataset_id = \"48b37086-25f7-4ecd-be66-f5bb378e3aea\"\n",
-    "\n",
-    "with cellxgene_census.open_soma() as census:\n",
-    "    tabula_muris_obs = census[\"census_data\"][\"mus_musculus\"].obs.read(\n",
-    "        value_filter=f\"dataset_id == '{tabula_muris_dataset_id}'\", column_names=[\"tissue\", \"is_primary_data\"]\n",
-    "    )\n",
-    "\n",
-    "    tabula_muris_obs = tabula_muris_obs.concat().to_pandas()"
+    "census = cellxgene_census.open_soma()\n",
+    "\n",
+    "tabula_muris_obs = cellxgene_census.get_obs(\n",
+    "    census,\n",
+    "    \"mus_musculus\",\n",
+    "    value_filter=f\"dataset_id == '{tabula_muris_dataset_id}'\",\n",
+    "    column_names=[\"tissue\", \"is_primary_data\"],\n",
+    ")"
    ]
   },
   {
@@ -165,12 +166,12 @@
    "source": [
     "tabula_muris_liver_dataset_id = \"6202a243-b713-4e12-9ced-c387f8483dea\"\n",
     "\n",
-    "with cellxgene_census.open_soma() as census:\n",
-    "    tabula_muris_liver_obs = census[\"census_data\"][\"mus_musculus\"].obs.read(\n",
-    "        value_filter=f\"dataset_id == '{tabula_muris_liver_dataset_id}'\", column_names=[\"tissue\", \"is_primary_data\"]\n",
-    "    )\n",
-    "\n",
-    "    tabula_muris_liver_obs = tabula_muris_liver_obs.concat().to_pandas()"
+    "tabula_muris_liver_obs = cellxgene_census.get_obs(\n",
+    "    census,\n",
+    "    \"mus_musculus\",\n",
+    "    value_filter=f\"dataset_id == '{tabula_muris_liver_dataset_id}'\",\n",
+    "    column_names=[\"tissue\", \"is_primary_data\"],\n",
+    ")"
    ]
   },
   {
@@ -256,15 +257,14 @@
     }
    ],
    "source": [
-    "with cellxgene_census.open_soma() as census:\n",
-    "    nk_cells = census[\"census_data\"][\"homo_sapiens\"].obs.read(\n",
-    "        value_filter=\"cell_type == 'natural killer cell' \"\n",
-    "        \"and disease == 'COVID-19' \"\n",
-    "        \"and sex == 'female'\"\n",
-    "        \"and tissue_general == 'blood'\"\n",
-    "    )\n",
-    "\n",
-    "    nk_cells = nk_cells.concat().to_pandas()"
+    "nk_cells = cellxgene_census.get_obs(\n",
+    "    census,\n",
+    "    \"mus_musculus\",\n",
+    "    value_filter=\"cell_type == 'natural killer cell' \"\n",
+    "    \"and disease == 'COVID-19' \"\n",
+    "    \"and sex == 'female'\"\n",
+    "    \"and tissue_general == 'blood'\",\n",
+    ")"
    ]
   },
   {
@@ -323,16 +323,15 @@
     }
    ],
    "source": [
-    "with cellxgene_census.open_soma() as census:\n",
-    "    nk_cells_primary = census[\"census_data\"][\"homo_sapiens\"].obs.read(\n",
-    "        value_filter=\"cell_type == 'natural killer cell' \"\n",
-    "        \"and disease == 'COVID-19' \"\n",
-    "        \"and tissue_general == 'blood'\"\n",
-    "        \"and sex == 'female'\"\n",
-    "        \"and is_primary_data == True\"\n",
-    "    )\n",
-    "\n",
-    "    nk_cells_primary = nk_cells_primary.concat().to_pandas()"
+    "nk_cells_primary = cellxgene_census.get_obs(\n",
+    "    census,\n",
+    "    \"mus_musculus\",\n",
+    "    value_filter=\"cell_type == 'natural killer cell' \"\n",
+    "    \"and disease == 'COVID-19' \"\n",
+    "    \"and tissue_general == 'blood'\"\n",
+    "    \"and sex == 'female'\"\n",
+    "    \"and is_primary_data == True\",\n",
+    ")"
    ]
   },
   {
@@ -397,16 +396,15 @@
     }
    ],
    "source": [
-    "with cellxgene_census.open_soma() as census:\n",
-    "    adata = cellxgene_census.get_anndata(\n",
-    "        census,\n",
-    "        organism=\"Homo sapiens\",\n",
-    "        var_value_filter=\"feature_name == 'AQP5'\",\n",
-    "        obs_value_filter=\"cell_type == 'natural killer cell' \"\n",
-    "        \"and disease == 'COVID-19' \"\n",
-    "        \"and sex == 'female'\"\n",
-    "        \"and tissue_general == 'blood'\",\n",
-    "    )"
+    "adata = cellxgene_census.get_anndata(\n",
+    "    census,\n",
+    "    organism=\"Homo sapiens\",\n",
+    "    var_value_filter=\"feature_name == 'AQP5'\",\n",
+    "    obs_value_filter=\"cell_type == 'natural killer cell' \"\n",
+    "    \"and disease == 'COVID-19' \"\n",
+    "    \"and sex == 'female'\"\n",
+    "    \"and tissue_general == 'blood'\",\n",
+    ")"
    ]
   },
   {
@@ -465,17 +463,16 @@
     }
    ],
    "source": [
-    "with cellxgene_census.open_soma() as census:\n",
-    "    adata_primary = cellxgene_census.get_anndata(\n",
-    "        census,\n",
-    "        organism=\"Homo sapiens\",\n",
-    "        var_value_filter=\"feature_name == 'AQP5'\",\n",
-    "        obs_value_filter=\"cell_type == 'natural killer cell' \"\n",
-    "        \"and disease == 'COVID-19' \"\n",
-    "        \"and sex == 'female' \"\n",
-    "        \"and tissue_general == 'blood'\"\n",
-    "        \"and is_primary_data == True\",\n",
-    "    )"
+    "adata_primary = cellxgene_census.get_anndata(\n",
+    "    census,\n",
+    "    organism=\"Homo sapiens\",\n",
+    "    var_value_filter=\"feature_name == 'AQP5'\",\n",
+    "    obs_value_filter=\"cell_type == 'natural killer cell' \"\n",
+    "    \"and disease == 'COVID-19' \"\n",
+    "    \"and sex == 'female' \"\n",
+    "    \"and tissue_general == 'blood'\"\n",
+    "    \"and is_primary_data == True\",\n",
+    ")"
    ]
   },
   {
@@ -556,30 +553,29 @@
    "source": [
     "import tiledbsoma\n",
     "\n",
-    "with cellxgene_census.open_soma() as census:\n",
-    "    human = census[\"census_data\"][\"homo_sapiens\"]\n",
-    "\n",
-    "    # initialize lazy query\n",
-    "    query = human.axis_query(\n",
-    "        measurement_name=\"RNA\",\n",
-    "        obs_query=tiledbsoma.AxisQuery(\n",
-    "            value_filter=\"cell_type == 'natural killer cell' \"\n",
-    "            \"and disease == 'COVID-19' \"\n",
-    "            \"and tissue_general == 'blood' \"\n",
-    "            \"and sex == 'female' \"\n",
-    "            \"and is_primary_data == True\"\n",
-    "        ),\n",
-    "    )\n",
-    "\n",
-    "    # get iterator for X\n",
-    "    iterator = query.X(\"raw\").tables()\n",
-    "\n",
-    "    # iterate in chunks\n",
-    "    for chunk in iterator:\n",
-    "        print(chunk)\n",
-    "\n",
-    "        # since this is a demo we stop right away\n",
-    "        break"
+    "human = census[\"census_data\"][\"homo_sapiens\"]\n",
+    "\n",
+    "# initialize lazy query\n",
+    "query = human.axis_query(\n",
+    "    measurement_name=\"RNA\",\n",
+    "    obs_query=tiledbsoma.AxisQuery(\n",
+    "        value_filter=\"cell_type == 'natural killer cell' \"\n",
+    "        \"and disease == 'COVID-19' \"\n",
+    "        \"and tissue_general == 'blood' \"\n",
+    "        \"and sex == 'female' \"\n",
+    "        \"and is_primary_data == True\"\n",
+    "    ),\n",
+    ")\n",
+    "\n",
+    "# get iterator for X\n",
+    "iterator = query.X(\"raw\").tables()\n",
+    "\n",
+    "# iterate in chunks\n",
+    "for chunk in iterator:\n",
+    "    print(chunk)\n",
+    "\n",
+    "    # since this is a demo we stop right away\n",
+    "    break"
    ]
   }
  ],
diff --git a/api/python/notebooks/api_demo/census_embedding.ipynb b/api/python/notebooks/api_demo/census_embedding.ipynb
index e8a1df604..5f5f85a5c 100644
--- a/api/python/notebooks/api_demo/census_embedding.ipynb
+++ b/api/python/notebooks/api_demo/census_embedding.ipynb
@@ -59,9 +59,9 @@
    "source": [
     "from cellxgene_census.experimental import get_all_available_embeddings\n",
     "\n",
-    "census_version = \"2023-12-15\"\n",
+    "CENSUS_VERSION = \"2023-12-15\"\n",
     "\n",
-    "for e in get_all_available_embeddings(census_version):\n",
+    "for e in get_all_available_embeddings(CENSUS_VERSION):\n",
     "    print(f\"{e['embedding_name']:15} {e['experiment_name']:15} {e['data_type']:15}\")"
    ]
   },
@@ -243,7 +243,7 @@
     "        organism=EXPERIMENT_NAME,\n",
     "        measurement_name=MEASUREMENT_NAME,\n",
     "        obs_value_filter=\"tissue_general == 'central nervous system'\",\n",
-    "        column_names={\"obs\": [\"cell_type\", \"soma_joinid\"]},\n",
+    "        obs_column_names=[\"cell_type\", \"soma_joinid\"],\n",
     "        obs_embeddings=[\"scgpt\"],\n",
     "    )"
    ]
@@ -450,12 +450,12 @@
    "source": [
     "census = cellxgene_census.open_soma(census_version=CENSUS_VERSION)\n",
     "\n",
-    "obs_df = census[\"census_data\"][EXPERIMENT_NAME].obs.read(\n",
+    "obs_df = cellxgene_census.get_obs(\n",
+    "    census,\n",
+    "    EXPERIMENT_NAME,\n",
     "    value_filter=\"tissue_general == 'exocrine gland'\",\n",
     "    column_names=[\"soma_joinid\", \"cell_type\"],\n",
-    ")\n",
-    "\n",
-    "obs_df = obs_df.concat().to_pandas()"
+    ")"
    ]
   },
   {
diff --git a/api/python/notebooks/api_demo/census_query_extract.ipynb b/api/python/notebooks/api_demo/census_query_extract.ipynb
index 828dd687d..2f6f9791a 100644
--- a/api/python/notebooks/api_demo/census_query_extract.ipynb
+++ b/api/python/notebooks/api_demo/census_query_extract.ipynb
@@ -65,7 +65,7 @@
     "\n",
     "The method will return an `anndata.AnnData` object, it takes as an input a census object, the string for an organism, and for both cell and gene metadata we can specify filters and column selection as described above but with the following arguments:\n",
     "\n",
-    "- `column_names` — a dictionary with two keys `obs` and `var` whose values are lists of strings indicating the columns to select for cell and gene metadata respectively.\n",
+    "- `obs_column_names` and `var_column_names` — a pair of arguments whose values are lists of strings indicating the columns to select for cell (`obs`) and gene (`var`) metadata respectively.\n",
     "- `obs_value_filter` —  python expression with selection conditions to fetch **cells** meeting a criteria. For full details see [tiledb.QueryCondition](https://tiledb-inc-tiledb.readthedocs-hosted.com/projects/tiledb-py/en/stable/python-api.html#query-condition).\n",
     "- `var_value_filter` —  python expression with selection conditions to fetch **genes** meeting a criteria. Details as above.  For full details see [tiledb.QueryCondition](https://tiledb-inc-tiledb.readthedocs-hosted.com/projects/tiledb-py/en/stable/python-api.html#query-condition).\n",
     "\n",
@@ -95,7 +95,7 @@
     "    organism=\"Homo sapiens\",\n",
     "    var_value_filter=\"feature_id in ['ENSG00000161798', 'ENSG00000188229']\",\n",
     "    obs_value_filter=\"cell_type == 'B cell' and tissue_general == 'lung' and disease == 'COVID-19' and is_primary_data == True\",\n",
-    "    column_names={\"obs\": [\"sex\"]},\n",
+    "    obs_column_names=[\"sex\"],\n",
     ")"
    ]
   },
@@ -383,7 +383,7 @@
    "source": [
     "## Querying cell metadata (obs)\n",
     "\n",
-    "The human gene metadata of the Census, for RNA assays, is located at `census[\"census_data\"][\"homo_sapiens\"].obs`. This is a `SOMADataFrame` and as such it can be materialized as a `pandas.DataFrame` via the methods `read().concat().to_pandas()`. \n",
+    "The human gene metadata of the Census, for RNA assays, is located at `census[\"census_data\"][\"homo_sapiens\"].obs`. This is a `SOMADataFrame` and as such it can be materialized as a `pandas.DataFrame` via the methods `read().concat().to_pandas()`. See also, the helper function `cellxgene_census.get_obs` which removes some boiler plate.\n",
     "\n",
     "The mouse cell metadata is at `census[\"census_data\"][\"mus_musculus\"].obs`.\n",
     "\n",
@@ -526,7 +526,7 @@
     }
    ],
    "source": [
-    "sex_cell_metadata = census[\"census_data\"][\"homo_sapiens\"].obs.read(column_names=[\"sex\"]).concat().to_pandas()\n",
+    "sex_cell_metadata = cellxgene_census.get_obs(census, \"homo_sapiens\", column_names=[\"sex\"])\n",
     "\n",
     "sex_cell_metadata.drop_duplicates()"
    ]
@@ -980,9 +980,7 @@
     }
    ],
    "source": [
-    "cell_metadata_all_unknown_sex = (\n",
-    "    census[\"census_data\"][\"homo_sapiens\"].obs.read(value_filter=\"sex == 'unknown'\").concat().to_pandas()\n",
-    ")\n",
+    "cell_metadata_all_unknown_sex = cellxgene_census.get_obs(census, \"homo_sapiens\", value_filter=\"sex == 'unknown'\")\n",
     "\n",
     "cell_metadata_all_unknown_sex"
    ]
@@ -1033,14 +1031,11 @@
     }
    ],
    "source": [
-    "cell_metadata_b_cell = (\n",
-    "    census[\"census_data\"][\"homo_sapiens\"]\n",
-    "    .obs.read(\n",
-    "        value_filter=\"cell_type == 'B cell' and tissue_general == 'lung' and is_primary_data==True\",\n",
-    "        column_names=[\"disease\"],\n",
-    "    )\n",
-    "    .concat()\n",
-    "    .to_pandas()\n",
+    "cell_metadata_b_cell = cellxgene_census.get_obs(\n",
+    "    census,\n",
+    "    \"homo_sapiens\",\n",
+    "    value_filter=\"cell_type == 'B cell' and tissue_general == 'lung' and is_primary_data==True\",\n",
+    "    column_names=[\"disease\"],\n",
     ")\n",
     "\n",
     "cell_metadata_b_cell.value_counts()"
@@ -1164,15 +1159,11 @@
     }
    ],
    "source": [
-    "gene_metadata = (\n",
-    "    census[\"census_data\"][\"homo_sapiens\"]\n",
-    "    .ms[\"RNA\"]\n",
-    "    .var.read(\n",
-    "        value_filter=\"feature_id in ['ENSG00000161798', 'ENSG00000188229']\",\n",
-    "        column_names=[\"feature_name\", \"feature_length\"],\n",
-    "    )\n",
-    "    .concat()\n",
-    "    .to_pandas()\n",
+    "gene_metadata = cellxgene_census.get_var(\n",
+    "    census,\n",
+    "    \"homo_sapiens\",\n",
+    "    value_filter=\"feature_id in ['ENSG00000161798', 'ENSG00000188229']\",\n",
+    "    column_names=[\"feature_name\", \"feature_length\"],\n",
     ")\n",
     "\n",
     "gene_metadata"
diff --git a/api/python/notebooks/experimental/highly_variable_genes.ipynb b/api/python/notebooks/experimental/highly_variable_genes.ipynb
index f5a69c031..bd0a218d7 100644
--- a/api/python/notebooks/experimental/highly_variable_genes.ipynb
+++ b/api/python/notebooks/experimental/highly_variable_genes.ipynb
@@ -258,7 +258,7 @@
     "    )\n",
     "\n",
     "    # while the Census is open, also grab the var dataframe for the mouse\n",
-    "    var_df = census[\"census_data\"][\"mus_musculus\"].ms[\"RNA\"].var.read().concat().to_pandas()\n",
+    "    var_df = cellxgene_census.get_var(census, \"mus_musculus\")\n",
     "\n",
     "hvgs_df"
    ]
diff --git a/api/python/notebooks/experimental/pytorch.ipynb b/api/python/notebooks/experimental/pytorch.ipynb
index 2ed46d472..73640cedd 100644
--- a/api/python/notebooks/experimental/pytorch.ipynb
+++ b/api/python/notebooks/experimental/pytorch.ipynb
@@ -34,18 +34,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 37,
    "id": "c3dd549f",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2023-10-09T18:20:21.600206Z",
      "start_time": "2023-10-09T18:20:19.390343Z"
-    },
-    "execution": {
-     "iopub.execute_input": "2023-07-28T16:33:00.392773Z",
-     "iopub.status.busy": "2023-07-28T16:33:00.392516Z",
-     "iopub.status.idle": "2023-07-28T16:33:02.881471Z",
-     "shell.execute_reply": "2023-07-28T16:33:02.880857Z"
     }
    },
    "outputs": [
@@ -53,7 +47,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "The \"stable\" release is currently 2023-07-25. Specify 'census_version=\"2023-07-25\"' in future calls to open_soma() to ensure data consistency.\n"
+      "The \"stable\" release is currently 2023-12-15. Specify 'census_version=\"2023-12-15\"' in future calls to open_soma() to ensure data consistency.\n"
      ]
     }
    ],
@@ -76,18 +70,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 38,
    "id": "54896e6f",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2023-10-09T18:20:22.278894Z",
      "start_time": "2023-10-09T18:20:21.830683Z"
-    },
-    "execution": {
-     "iopub.execute_input": "2023-07-28T16:33:02.884588Z",
-     "iopub.status.busy": "2023-07-28T16:33:02.884133Z",
-     "iopub.status.idle": "2023-07-28T16:33:03.356736Z",
-     "shell.execute_reply": "2023-07-28T16:33:03.356115Z"
     }
    },
    "outputs": [],
@@ -109,17 +97,14 @@
     ")"
    ]
   },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "id": "6c7c17c3",
-   "metadata": {},
-   "source": []
-  },
   {
    "cell_type": "markdown",
+   "id": "04eb7742",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "source": [
     "### `ExperimentDataPipe` class explained\n",
@@ -129,8 +114,12 @@
   },
   {
    "cell_type": "markdown",
+   "id": "44188ba8",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "source": [
     "### `ExperimentDataPipe` parameters explained\n",
@@ -162,18 +151,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 39,
    "id": "70a2ddbe",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2023-10-09T18:20:32.049618Z",
      "start_time": "2023-10-09T18:20:22.821101Z"
-    },
-    "execution": {
-     "iopub.execute_input": "2023-07-28T16:33:03.359927Z",
-     "iopub.status.busy": "2023-07-28T16:33:03.359450Z",
-     "iopub.status.idle": "2023-07-28T16:33:05.524015Z",
-     "shell.execute_reply": "2023-07-28T16:33:05.523337Z"
     }
    },
    "outputs": [
@@ -183,7 +166,7 @@
        "(15020, 60664)"
       ]
      },
-     "execution_count": 27,
+     "execution_count": 39,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -205,18 +188,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 40,
    "id": "133f594f",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2023-10-09T18:20:32.052795Z",
      "start_time": "2023-10-09T18:20:32.051289Z"
-    },
-    "execution": {
-     "iopub.execute_input": "2023-07-28T16:33:05.527106Z",
-     "iopub.status.busy": "2023-07-28T16:33:05.526540Z",
-     "iopub.status.idle": "2023-07-28T16:33:05.530907Z",
-     "shell.execute_reply": "2023-07-28T16:33:05.530386Z"
     }
    },
    "outputs": [],
@@ -237,18 +214,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 41,
    "id": "39d30df2",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2023-10-09T18:20:32.056886Z",
      "start_time": "2023-10-09T18:20:32.052898Z"
-    },
-    "execution": {
-     "iopub.execute_input": "2023-07-28T16:33:05.538514Z",
-     "iopub.status.busy": "2023-07-28T16:33:05.538206Z",
-     "iopub.status.idle": "2023-07-28T16:33:05.541008Z",
-     "shell.execute_reply": "2023-07-28T16:33:05.540445Z"
     }
    },
    "outputs": [],
@@ -278,18 +249,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 42,
    "id": "6b792b4b",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2023-10-09T18:20:32.060262Z",
      "start_time": "2023-10-09T18:20:32.058875Z"
-    },
-    "execution": {
-     "iopub.execute_input": "2023-07-28T16:33:05.543534Z",
-     "iopub.status.busy": "2023-07-28T16:33:05.543229Z",
-     "iopub.status.idle": "2023-07-28T16:33:05.546861Z",
-     "shell.execute_reply": "2023-07-28T16:33:05.546267Z"
     }
    },
    "outputs": [],
@@ -318,18 +283,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 43,
    "id": "b744cd21",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2023-10-09T18:20:32.310829Z",
      "start_time": "2023-10-09T18:20:32.307661Z"
-    },
-    "execution": {
-     "iopub.execute_input": "2023-07-28T16:33:05.549312Z",
-     "iopub.status.busy": "2023-07-28T16:33:05.549001Z",
-     "iopub.status.idle": "2023-07-28T16:33:05.562074Z",
-     "shell.execute_reply": "2023-07-28T16:33:05.561501Z"
     }
    },
    "outputs": [],
@@ -355,8 +314,7 @@
     "\n",
     "        # Compute the loss and perform back propagation\n",
     "\n",
-    "        # Exclude the cell_type labels, which are in the second column\n",
-    "        y_batch = y_batch[:, 1]\n",
+    "        y_batch = y_batch.flatten()\n",
     "        y_batch = y_batch.to(device)\n",
     "\n",
     "        train_correct += (predictions == y_batch).sum().item()\n",
@@ -397,18 +355,8 @@
     "tensor([0., 0., 0.,  ..., 1., 0., 0.])\n",
     "```\n",
     "    \n",
-    "Secondly, note the line, `y_batch = y_batch[:, 1]`. This line is extracting the user-specified `obs` `cell_type` training labels from the second column of the `y_batch` rank 2 Tensor.  For example, this would take a `y_batch` tensor that looks like:\n",
-    "```\n",
-    "tensor([[42496620,        1],\n",
-    "        [42496621,        1],\n",
-    "        [42496622,        3],\n",
-    "        ...,\n",
-    "        [42496633,        2],\n",
-    "        [42496634,        1],\n",
-    "        [42496635,        4]], dtype=torch.int32)\n",
-    "      \n",
-    "```\n",
-    "and return:\n",
+    "For `y_batch`, this will contain the user-specified `obs` `cell_type` training labels. By default, these are encoded using a LabelEncoder and it will be a matrix where each column represents the encoded values of each column specified in `obs_column_names` when creating the datapipe (in this case, only the cell type). It will look like this:\n",
+    "\n",
     "```\n",
     "tensor([1, 1, 3, ..., 2, 1, 4])\n",
     "\n",
@@ -429,18 +377,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 44,
    "id": "733ec2fb",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2023-10-09T18:29:31.028253Z",
      "start_time": "2023-10-09T18:20:32.311816Z"
-    },
-    "execution": {
-     "iopub.execute_input": "2023-07-28T16:33:05.564772Z",
-     "iopub.status.busy": "2023-07-28T16:33:05.564454Z",
-     "iopub.status.idle": "2023-07-28T16:34:04.801559Z",
-     "shell.execute_reply": "2023-07-28T16:34:04.800846Z"
     }
    },
    "outputs": [
@@ -448,16 +390,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Epoch 1: Train Loss: 0.0167253 Accuracy 0.4856\n",
-      "Epoch 2: Train Loss: 0.0156710 Accuracy 0.4943\n",
-      "Epoch 3: Train Loss: 0.0149408 Accuracy 0.4813\n",
-      "Epoch 4: Train Loss: 0.0144469 Accuracy 0.5040\n",
-      "Epoch 5: Train Loss: 0.0141749 Accuracy 0.5669\n",
-      "Epoch 6: Train Loss: 0.0139776 Accuracy 0.6672\n",
-      "Epoch 7: Train Loss: 0.0138565 Accuracy 0.7920\n",
-      "Epoch 8: Train Loss: 0.0138094 Accuracy 0.8088\n",
-      "Epoch 9: Train Loss: 0.0136689 Accuracy 0.8757\n",
-      "Epoch 10: Train Loss: 0.0136101 Accuracy 0.8923\n"
+      "Epoch 1: Train Loss: 0.0160780 Accuracy 0.4212\n",
+      "Epoch 2: Train Loss: 0.0147909 Accuracy 0.4836\n",
+      "Epoch 3: Train Loss: 0.0144370 Accuracy 0.5914\n",
+      "Epoch 4: Train Loss: 0.0141504 Accuracy 0.7369\n",
+      "Epoch 5: Train Loss: 0.0139617 Accuracy 0.8451\n",
+      "Epoch 6: Train Loss: 0.0137966 Accuracy 0.8723\n",
+      "Epoch 7: Train Loss: 0.0136620 Accuracy 0.8924\n",
+      "Epoch 8: Train Loss: 0.0135778 Accuracy 0.9018\n",
+      "Epoch 9: Train Loss: 0.0135177 Accuracy 0.9073\n",
+      "Epoch 10: Train Loss: 0.0134660 Accuracy 0.9155\n"
      ]
     }
    ],
@@ -493,18 +435,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 45,
    "id": "d3e33edc",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2023-10-09T18:29:59.425527Z",
      "start_time": "2023-10-09T18:29:31.705548Z"
-    },
-    "execution": {
-     "iopub.execute_input": "2023-07-28T16:34:04.804402Z",
-     "iopub.status.busy": "2023-07-28T16:34:04.803987Z",
-     "iopub.status.idle": "2023-07-28T16:34:09.331800Z",
-     "shell.execute_reply": "2023-07-28T16:34:09.331168Z"
     }
    },
    "outputs": [],
@@ -524,32 +460,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 46,
    "id": "00e12182",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2023-10-09T18:29:59.438079Z",
      "start_time": "2023-10-09T18:29:59.429107Z"
-    },
-    "execution": {
-     "iopub.execute_input": "2023-07-28T16:34:09.334916Z",
-     "iopub.status.busy": "2023-07-28T16:34:09.334470Z",
-     "iopub.status.idle": "2023-07-28T16:34:09.340880Z",
-     "shell.execute_reply": "2023-07-28T16:34:09.340293Z"
     }
    },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "tensor([ 1, 11,  1,  1,  5,  1,  1,  1,  1,  5,  1,  5,  1,  5,  5,  8,  1,  1,\n",
-       "         7,  1,  5,  5,  8,  5,  5,  1,  1,  1,  1,  8,  9,  1,  1,  8,  1,  1,\n",
-       "         1, 11,  5,  1,  8,  5,  5,  1,  5,  1,  5,  5,  1,  5,  9,  8,  1,  1,\n",
-       "         1,  5,  5,  5,  1,  5,  1,  5,  1,  1,  5,  8,  1,  1,  1,  1,  7,  1,\n",
-       "         5,  1,  1,  5,  5,  1,  1,  8,  5,  5,  8,  1,  1,  1,  5,  5,  5,  1,\n",
-       "         5,  1,  5,  5,  1,  1,  5,  1,  5,  1,  1,  1,  5,  1,  1,  1,  9,  5,\n",
-       "         1,  1,  7,  1,  1,  1,  1,  8,  1,  1,  5,  5,  1,  5,  1,  1,  1,  5,\n",
-       "         8,  1])"
+       "tensor([ 1,  8,  1,  1,  1,  1,  1,  6,  1,  1,  5,  5,  7,  1,  7,  7,  1,  1,\n",
+       "         5,  1,  8,  1,  1,  8,  7,  1,  5,  1,  5,  1,  1,  1,  1,  7,  1,  8,\n",
+       "         1,  1,  1,  5,  7,  7,  5,  8,  5,  1,  1,  1,  1, 11,  7,  1,  1,  7,\n",
+       "         1,  1,  1,  5,  7,  8,  1,  7,  5,  1,  1,  1,  8,  1,  7,  1,  1,  1,\n",
+       "         8,  1,  8,  1,  1,  1,  5,  7,  7,  1,  5,  8,  5,  1,  1,  1,  1,  1,\n",
+       "         1,  1,  1,  8,  7,  1,  5,  5,  5,  7,  7,  5,  1,  1,  1,  1,  5,  1,\n",
+       "         1,  8,  5,  1,  8, 11,  1,  6,  7,  7,  7,  1,  1,  7,  1,  8,  1,  7,\n",
+       "         1,  1])"
       ]
      },
      "metadata": {},
@@ -581,57 +511,51 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 47,
    "id": "1cfff865",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2023-10-09T18:29:59.441907Z",
      "start_time": "2023-10-09T18:29:59.439561Z"
-    },
-    "execution": {
-     "iopub.execute_input": "2023-07-28T16:34:09.343375Z",
-     "iopub.status.busy": "2023-07-28T16:34:09.343131Z",
-     "iopub.status.idle": "2023-07-28T16:34:09.347842Z",
-     "shell.execute_reply": "2023-07-28T16:34:09.347311Z"
     }
    },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "array(['basal cell', 'vein endothelial cell', 'basal cell', 'basal cell',\n",
+       "array(['basal cell', 'leukocyte', 'basal cell', 'basal cell',\n",
+       "       'basal cell', 'basal cell', 'basal cell', 'fibroblast',\n",
+       "       'basal cell', 'basal cell', 'epithelial cell', 'epithelial cell',\n",
+       "       'keratinocyte', 'basal cell', 'keratinocyte', 'keratinocyte',\n",
+       "       'basal cell', 'basal cell', 'epithelial cell', 'basal cell',\n",
+       "       'leukocyte', 'basal cell', 'basal cell', 'leukocyte',\n",
+       "       'keratinocyte', 'basal cell', 'epithelial cell', 'basal cell',\n",
        "       'epithelial cell', 'basal cell', 'basal cell', 'basal cell',\n",
-       "       'basal cell', 'epithelial cell', 'basal cell', 'epithelial cell',\n",
-       "       'basal cell', 'epithelial cell', 'epithelial cell', 'leukocyte',\n",
+       "       'basal cell', 'keratinocyte', 'basal cell', 'leukocyte',\n",
+       "       'basal cell', 'basal cell', 'basal cell', 'epithelial cell',\n",
+       "       'keratinocyte', 'keratinocyte', 'epithelial cell', 'leukocyte',\n",
+       "       'epithelial cell', 'basal cell', 'basal cell', 'basal cell',\n",
+       "       'basal cell', 'vein endothelial cell', 'keratinocyte',\n",
        "       'basal cell', 'basal cell', 'keratinocyte', 'basal cell',\n",
-       "       'epithelial cell', 'epithelial cell', 'leukocyte',\n",
-       "       'epithelial cell', 'epithelial cell', 'basal cell', 'basal cell',\n",
-       "       'basal cell', 'basal cell', 'leukocyte', 'pericyte', 'basal cell',\n",
-       "       'basal cell', 'leukocyte', 'basal cell', 'basal cell',\n",
-       "       'basal cell', 'vein endothelial cell', 'epithelial cell',\n",
-       "       'basal cell', 'leukocyte', 'epithelial cell', 'epithelial cell',\n",
-       "       'basal cell', 'epithelial cell', 'basal cell', 'epithelial cell',\n",
-       "       'epithelial cell', 'basal cell', 'epithelial cell', 'pericyte',\n",
-       "       'leukocyte', 'basal cell', 'basal cell', 'basal cell',\n",
-       "       'epithelial cell', 'epithelial cell', 'epithelial cell',\n",
-       "       'basal cell', 'epithelial cell', 'basal cell', 'epithelial cell',\n",
-       "       'basal cell', 'basal cell', 'epithelial cell', 'leukocyte',\n",
-       "       'basal cell', 'basal cell', 'basal cell', 'basal cell',\n",
-       "       'keratinocyte', 'basal cell', 'epithelial cell', 'basal cell',\n",
-       "       'basal cell', 'epithelial cell', 'epithelial cell', 'basal cell',\n",
-       "       'basal cell', 'leukocyte', 'epithelial cell', 'epithelial cell',\n",
-       "       'leukocyte', 'basal cell', 'basal cell', 'basal cell',\n",
-       "       'epithelial cell', 'epithelial cell', 'epithelial cell',\n",
-       "       'basal cell', 'epithelial cell', 'basal cell', 'epithelial cell',\n",
-       "       'epithelial cell', 'basal cell', 'basal cell', 'epithelial cell',\n",
-       "       'basal cell', 'epithelial cell', 'basal cell', 'basal cell',\n",
-       "       'basal cell', 'epithelial cell', 'basal cell', 'basal cell',\n",
-       "       'basal cell', 'pericyte', 'epithelial cell', 'basal cell',\n",
+       "       'basal cell', 'basal cell', 'epithelial cell', 'keratinocyte',\n",
+       "       'leukocyte', 'basal cell', 'keratinocyte', 'epithelial cell',\n",
+       "       'basal cell', 'basal cell', 'basal cell', 'leukocyte',\n",
        "       'basal cell', 'keratinocyte', 'basal cell', 'basal cell',\n",
-       "       'basal cell', 'basal cell', 'leukocyte', 'basal cell',\n",
-       "       'basal cell', 'epithelial cell', 'epithelial cell', 'basal cell',\n",
+       "       'basal cell', 'leukocyte', 'basal cell', 'leukocyte', 'basal cell',\n",
+       "       'basal cell', 'basal cell', 'epithelial cell', 'keratinocyte',\n",
+       "       'keratinocyte', 'basal cell', 'epithelial cell', 'leukocyte',\n",
        "       'epithelial cell', 'basal cell', 'basal cell', 'basal cell',\n",
-       "       'epithelial cell', 'leukocyte', 'basal cell'], dtype=object)"
+       "       'basal cell', 'basal cell', 'basal cell', 'basal cell',\n",
+       "       'basal cell', 'leukocyte', 'keratinocyte', 'basal cell',\n",
+       "       'epithelial cell', 'epithelial cell', 'epithelial cell',\n",
+       "       'keratinocyte', 'keratinocyte', 'epithelial cell', 'basal cell',\n",
+       "       'basal cell', 'basal cell', 'basal cell', 'epithelial cell',\n",
+       "       'basal cell', 'basal cell', 'leukocyte', 'epithelial cell',\n",
+       "       'basal cell', 'leukocyte', 'vein endothelial cell', 'basal cell',\n",
+       "       'fibroblast', 'keratinocyte', 'keratinocyte', 'keratinocyte',\n",
+       "       'basal cell', 'basal cell', 'keratinocyte', 'basal cell',\n",
+       "       'leukocyte', 'basal cell', 'keratinocyte', 'basal cell',\n",
+       "       'basal cell'], dtype=object)"
       ]
      },
      "metadata": {},
@@ -657,18 +581,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 49,
    "id": "f4ac8087",
    "metadata": {
     "ExecuteTime": {
      "end_time": "2023-10-09T18:29:59.471320Z",
      "start_time": "2023-10-09T18:29:59.443175Z"
-    },
-    "execution": {
-     "iopub.execute_input": "2023-07-28T16:34:09.350404Z",
-     "iopub.status.busy": "2023-07-28T16:34:09.350006Z",
-     "iopub.status.idle": "2023-07-28T16:34:09.701102Z",
-     "shell.execute_reply": "2023-07-28T16:34:09.700533Z"
     }
    },
    "outputs": [
@@ -705,8 +623,8 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>vein endothelial cell</td>\n",
-       "      <td>vein endothelial cell</td>\n",
+       "      <td>leukocyte</td>\n",
+       "      <td>leukocyte</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
@@ -715,13 +633,13 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>basal cell</td>\n",
+       "      <td>keratinocyte</td>\n",
        "      <td>basal cell</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>epithelial cell</td>\n",
-       "      <td>epithelial cell</td>\n",
+       "      <td>basal cell</td>\n",
+       "      <td>basal cell</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>...</th>\n",
@@ -730,8 +648,8 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>123</th>\n",
-       "      <td>basal cell</td>\n",
-       "      <td>basal cell</td>\n",
+       "      <td>leukocyte</td>\n",
+       "      <td>leukocyte</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>124</th>\n",
@@ -740,13 +658,13 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>125</th>\n",
-       "      <td>epithelial cell</td>\n",
-       "      <td>epithelial cell</td>\n",
+       "      <td>keratinocyte</td>\n",
+       "      <td>keratinocyte</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>126</th>\n",
-       "      <td>leukocyte</td>\n",
-       "      <td>leukocyte</td>\n",
+       "      <td>basal cell</td>\n",
+       "      <td>basal cell</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>127</th>\n",
@@ -759,18 +677,18 @@
        "</div>"
       ],
       "text/plain": [
-       "          actual cell type    predicted cell type\n",
-       "0               basal cell             basal cell\n",
-       "1    vein endothelial cell  vein endothelial cell\n",
-       "2               basal cell             basal cell\n",
-       "3               basal cell             basal cell\n",
-       "4          epithelial cell        epithelial cell\n",
-       "..                     ...                    ...\n",
-       "123             basal cell             basal cell\n",
-       "124             basal cell             basal cell\n",
-       "125        epithelial cell        epithelial cell\n",
-       "126              leukocyte              leukocyte\n",
-       "127             basal cell             basal cell\n",
+       "    actual cell type predicted cell type\n",
+       "0         basal cell          basal cell\n",
+       "1          leukocyte           leukocyte\n",
+       "2         basal cell          basal cell\n",
+       "3       keratinocyte          basal cell\n",
+       "4         basal cell          basal cell\n",
+       "..               ...                 ...\n",
+       "123        leukocyte           leukocyte\n",
+       "124       basal cell          basal cell\n",
+       "125     keratinocyte        keratinocyte\n",
+       "126       basal cell          basal cell\n",
+       "127       basal cell          basal cell\n",
        "\n",
        "[128 rows x 2 columns]"
       ]
@@ -785,12 +703,20 @@
     "display(\n",
     "    pd.DataFrame(\n",
     "        {\n",
-    "            \"actual cell type\": cell_type_encoder.inverse_transform(y_batch[:, 1].numpy()),\n",
+    "            \"actual cell type\": cell_type_encoder.inverse_transform(y_batch.ravel().numpy()),\n",
     "            \"predicted cell type\": predicted_cell_types,\n",
     "        }\n",
     "    )\n",
     ")"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce2c02b3-2032-45af-a313-70bb082ab12c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -809,7 +735,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.10"
+   "version": "3.10.13"
   }
  },
  "nbformat": 4,
diff --git a/docs/articles/2024/20240709-pytorch-fig-benchmark.png b/docs/articles/2024/20240709-pytorch-fig-benchmark.png
new file mode 100644
index 000000000..76ee30103
Binary files /dev/null and b/docs/articles/2024/20240709-pytorch-fig-benchmark.png differ
diff --git a/docs/articles/2024/20240709-pytorch-fig-loss-after.png b/docs/articles/2024/20240709-pytorch-fig-loss-after.png
new file mode 100644
index 000000000..faf92d7d8
Binary files /dev/null and b/docs/articles/2024/20240709-pytorch-fig-loss-after.png differ
diff --git a/docs/articles/2024/20240709-pytorch-fig-loss-before.png b/docs/articles/2024/20240709-pytorch-fig-loss-before.png
new file mode 100644
index 000000000..75622ee1d
Binary files /dev/null and b/docs/articles/2024/20240709-pytorch-fig-loss-before.png differ
diff --git a/docs/articles/2024/20240709-pytorch-fig-scvi.png b/docs/articles/2024/20240709-pytorch-fig-scvi.png
new file mode 100644
index 000000000..6aee97eb8
Binary files /dev/null and b/docs/articles/2024/20240709-pytorch-fig-scvi.png differ
diff --git a/docs/articles/2024/20240709-pytorch.md b/docs/articles/2024/20240709-pytorch.md
new file mode 100644
index 000000000..759a6d54e
--- /dev/null
+++ b/docs/articles/2024/20240709-pytorch.md
@@ -0,0 +1,135 @@
+# First stable iteration of Census (SOMA) PyTorch loaders
+
+*Published:* *July 9th, 2024*
+
+*By:* *[Emanuele Bezzi](mailto:ebezzi@chanzuckerberg.com), [Pablo Garcia-Nieto](mailto:pgarcia-nieto@chanzuckerberg.com), [Prathap Sridharan](mailto:psridharan@chanzuckerberg.com), [Ryan Williams](mailto:ryan.williams@tiledb.com)*
+
+The Census team is excited to share the release of Census PyTorch loaders that work out-of-the-box for memory-efficient training across any slice of the >70M cells in Census.
+
+In 2023 we released a beta version of the loaders, and since then we have seen strong interest from users in applying them to Census and to their own data. For example, [Wolf et al.](https://lamin.ai/blog/arrayloader-benchmarks) compared different training approaches and found our loaders to be ideal for *uncached* training on Census data, albeit with some caveats.
+
+We have continued the development of the loaders in collaboration with our partners at TileDB, and we are happy to announce this release as the first stable iteration. We hope the loaders can accelerate the development of large-scale models of single-cell data by leveraging the following main features:
+
+- **Out-of-the-box training on all or any slice of Census data.**
+- **Efficient memory usage with out-of-core training.**
+- **Calibrated shuffling of observations (cells).**
+- **Cloud-based or local data access.**
+- **Increased training speed.**
+- **Custom data encoders.**
+
+Read on for usage and more details on the main loader features.
+
+## Census PyTorch loaders usage
+
+The loaders are ready to use for PyTorch modeling via the specialized DataPipe [`ExperimentDataPipe`](https://chanzuckerberg.github.io/cellxgene-census/_autosummary/cellxgene_census.experimental.ml.pytorch.ExperimentDataPipe.html#cellxgene_census.experimental.ml.pytorch.ExperimentDataPipe), which takes advantage of the out-of-core data access that TileDB-SOMA offers.
+
+Please follow the [Training a PyTorch Model](https://chanzuckerberg.github.io/cellxgene-census/notebooks/experimental/pytorch.html) tutorial for a full reproducible example to train a logistic regression on cell type labels.
+
+In short, the following shows how to use the loaders on a small subset of cells. First, initialize an `ExperimentDataPipe` over tongue cells as follows:
+
+```python
+import cellxgene_census.experimental.ml as census_ml
+import cellxgene_census
+import tiledbsoma as soma
+
+census = cellxgene_census.open_soma()
+experiment = census["census_data"]["homo_sapiens"]
+
+experiment_datapipe = census_ml.ExperimentDataPipe(
+    experiment,
+    measurement_name="RNA",
+    X_name="raw",
+    obs_query=soma.AxisQuery(value_filter="tissue_general == 'tongue' and is_primary_data == True"),
+    obs_column_names=["cell_type"],
+    batch_size=128,
+    shuffle=True,
+)
+```
+
+Then you can perform standard PyTorch operations and training:
+
+```python
+import torch
+
+# Split into training and test sets
+train_datapipe, test_datapipe = experiment_datapipe.random_split(weights={"train": 0.8, "test": 0.2}, seed=1)
+
+# Creating data loader
+experiment_dataloader = census_ml.experiment_dataloader(train_datapipe)
+
+# Training a PyTorch model; MODEL is a placeholder for your own torch.nn.Module subclass
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+model = MODEL().to(device)
+model.train()
+```
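+
+A minimal sketch of the training loop itself; the loss and optimizer choices below are placeholders, and the full logistic-regression example lives in the linked tutorial:
+
+```python
+loss_fn = torch.nn.CrossEntropyLoss()
+optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+
+for epoch in range(10):
+    for X_batch, y_batch in experiment_dataloader:
+        # X is the expression matrix; y holds the encoded obs_column_names labels
+        X_batch = X_batch.float().to(device)
+        y_batch = y_batch.flatten().long().to(device)
+        optimizer.zero_grad()
+        loss = loss_fn(model(X_batch), y_batch)
+        loss.backward()
+        optimizer.step()
+```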
+
+## Census PyTorch loaders main features
+
+### Out-of-the-box training on all or any slice of Census data
+
+Since the `ExperimentDataPipe` inherits from the [PyTorch Iterable-style DataPipe](https://pytorch.org/data/main/torchdata.datapipes.iter.html), it can be readily used with PyTorch models.
+
+The single-cell expression data is encoded in numerical tensors, and for supervised training the cell metadata can be automatically transformed with a default encoder, or with custom user-defined encoders (see below).
+
+### Efficient memory usage with out-of-core training
+
+Thanks to TileDB-SOMA, the backend underlying Census, the PyTorch loaders materialize data incrementally in small, fixed-size chunks, keeping memory usage constant throughout training.
+
+In addition, data is eagerly prefetched while batches go through training, so compute is never left idle waiting for data to load. This is particularly useful when fetching Census data directly from the cloud.
+
+Memory usage is governed by the parameters `soma_chunk_size` and `shuffle_chunk_count`; see below for a full description of how these should be tuned.
+
+### Calibrated shuffling of observations (cells)
+
+Shuffling alongside efficient out-of-core data fetching is a challenge: in general, increasing the randomness of shuffling slows data fetching.
+
+In the first iteration of the loaders, shuffling was done over large blocks of data of user-defined size. This strategy led to a non-random distribution of observations per training batch, because Census has a non-random data layout (observations from the same dataset are adjacent to one another), and thus training loss was unstable (Figure 1).
+
+**Now we have implemented a scatter-gather approach**, whereby multiple chunks of data are fetched randomly from Census, a number of chunks are concatenated into a block, and all observations within the block are randomly shuffled. Adjusting the size and number of chunks per block yields well-calibrated shuffling with stable training loss (Figure 2) while maintaining efficient data fetching (Figure 3).
+
+The balance between memory usage, efficiency, and level of randomness can be adjusted with the parameters `soma_chunk_size` and `shuffle_chunk_count`. Increasing `shuffle_chunk_count` improves randomness, as more scattered chunks are collected before the pool is shuffled. Increasing `soma_chunk_size` improves I/O efficiency, while decreasing it reduces memory usage. We recommend the defaults `soma_chunk_size=64` and `shuffle_chunk_count=2000`, as we determined this configuration yields a good balance; a sketch of setting these parameters follows.
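+
+A minimal sketch, reusing the `experiment` handle from the usage example above; the chunking parameters are passed directly to the `ExperimentDataPipe` constructor:
+
+```python
+# The shuffle pool holds roughly soma_chunk_size * shuffle_chunk_count observations:
+# larger soma_chunk_size improves I/O efficiency, larger shuffle_chunk_count improves randomness.
+experiment_datapipe = census_ml.ExperimentDataPipe(
+    experiment,
+    measurement_name="RNA",
+    X_name="raw",
+    obs_column_names=["cell_type"],
+    batch_size=128,
+    shuffle=True,
+    soma_chunk_size=64,
+    shuffle_chunk_count=2000,
+)
+```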
+
+```{figure} ./20240709-pytorch-fig-loss-before.png
+:alt: Census PyTorch loaders shuffling
+:align: center
+:figwidth: 80%
+
+**Figure 1. Training loss was unstable with the previous shuffling strategy.** Based on a trial scVI run on 64K Census cells.
+```
+
+```{figure} ./20240709-pytorch-fig-loss-after.png
+:alt: Census PyTorch loaders calibrated shuffling
+:align: center
+:figwidth: 80%
+
+**Figure 2. Training loss is well-calibrated with the current scatter-gather shuffling strategy.** Based on a trial scVI run on 250K Census cells.
+```
+
+### Increased training speed
+
+We have made improvements to the loaders that reduce the number of data transformations required between data fetching and model training. One important change is encoding the expression data as a dense matrix immediately after it is retrieved from disk or the cloud.
+
+In our benchmarks, we found that densifying the data increases training speed ~3X while keeping memory usage relatively constant (Figure 3). For this reason, we have disabled intermediate data processing in sparse format unless Torch sparse tensors are requested via the `ExperimentDataPipe` parameter `return_sparse_X`, as sketched below.
+
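+If your model consumes sparse input, a sketch of requesting Torch sparse tensors instead (same `experiment` handle as above) looks like:
+
+```python
+sparse_datapipe = census_ml.ExperimentDataPipe(
+    experiment,
+    measurement_name="RNA",
+    X_name="raw",
+    obs_column_names=["cell_type"],
+    batch_size=128,
+    return_sparse_X=True,  # keep X batches sparse rather than densifying them
+)
+```
+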
+```{figure} ./20240709-pytorch-fig-benchmark.png
+:alt: Census PyTorch loaders benchmark
+:align: center
+:figwidth: 80%
+
+**Figure 3. Benchmark of memory usage and speed of data processing during modeling; default parameters lead to 3K+ samples/sec with 27GB of memory.** The benchmark processed 4M cells out of a 10M-cell Census, with data fetched from the cloud (S3). "Method" indicates the expression matrix encoding: circles are dense (np.array) and squares are sparse (scipy.csr). Size indicates the total number of cells per processing block (the maximum number of cells materialized at any given time), and color is the number of individual randomly grabbed chunks composing a processing block; more chunks per block lead to better shuffling. Data was fetched up to the modeling step, but no model was trained.
+```
+
+We repeated the benchmark in Figure 3 under different conditions, encompassing varying numbers of total cells and multiple epochs; see the [full benchmark report and code](https://github.com/ryan-williams/arrayloader-benchmarks) for details.
+
+When comparing dense vs sparse processing in an end-to-end training exercise with scVI, we also observed slightly increased speed with the dense approach and memory usage comparable to sparse processing (Figure 4). However, in this full training example the differences were less substantial, highlighting that other model-specific factors during the training phase also contribute to memory and speed performance.
+
+```{figure} ./20240709-pytorch-fig-scvi.png
+:alt: Census scVI PyTorch run
+:align: center
+:figwidth: 80%
+
+**Figure 4. Trial scVI training run with default parameters of the Census PyTorch loaders, highlighting the increased speed of dense vs sparse data processing.** Training was done on 5,684,805 mouse cells for 1 epoch on a g4dn.16xlarge EC2 machine.
+```
+
+### Custom data encoders
+
+For maximum flexibility, users can provide custom encoders for the cell metadata, enabling custom transformations of, or interactions between, different metadata variables.
+
+To use custom encoders, instantiate the desired encoder built on the [Encoder](https://chanzuckerberg.github.io/cellxgene-census/_autosummary/cellxgene_census.experimental.ml.encoders.Encoder.html#cellxgene_census.experimental.ml.encoders.Encoder) base class and pass it to the `encoders` parameter of the `ExperimentDataPipe`, as sketched below.
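+
+As an illustrative sketch only: the constructor call below assumes the provided `LabelEncoder` takes the name of the `obs` column to encode, which is an assumption; consult the linked API reference for the authoritative signatures.
+
+```python
+from cellxgene_census.experimental.ml.encoders import LabelEncoder
+
+# Hypothetical usage: one encoder per obs column, passed via the `encoders` parameter.
+experiment_datapipe = census_ml.ExperimentDataPipe(
+    experiment,
+    measurement_name="RNA",
+    X_name="raw",
+    batch_size=128,
+    encoders=[LabelEncoder("cell_type")],
+)
+```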
diff --git a/docs/cellxgene_census_docsite_data_release_info.md b/docs/cellxgene_census_docsite_data_release_info.md
index 621b4de44..8cdb83fe5 100644
--- a/docs/cellxgene_census_docsite_data_release_info.md
+++ b/docs/cellxgene_census_docsite_data_release_info.md
@@ -62,6 +62,54 @@ census <- open_soma(census_version = "latest")
 
 ## List of LTS Census data releases
 
+### LTS 2024-07-01
+
+Open this data release by specifying `census_version = "2024-07-01"` in future calls to `open_soma()`.
+
+#### Version information
+
+| Information                       | Value      |
+|-----------------------------------|------------|
+| Census schema version             | [2.0.1](https://github.com/chanzuckerberg/cellxgene-census/blob/fad674674e5070b735a29bc069d1d3dc21d2e5e8/docs/cellxgene_census_schema.md) |
+| Census build date                 | 2024-05-20 |
+| Dataset schema version            | [5.0.0](https://github.com/chanzuckerberg/cellxgene-census/blob/fad674674e5070b735a29bc069d1d3dc21d2e5e8/docs/cellxgene_census_schema.md)      |
+| Number of datasets                | 812        |
+
+#### Cell and donor counts
+
+| Type              | _Homo sapiens_ | _Mus musculus_ |
+|-------------------|----------------|----------------|
+| Total cells       | 74,322,510     | 41,233,630     |
+| Unique cells      | 44,265,932     | 16,332,034     |
+| Number of donors  | 17,651         | 4,216          |
+
+#### Cell metadata
+
+| Category                | _Homo sapiens_ | _Mus musculus_ |
+|-------------------------|----------------|----------------|
+| Assay                   | 24             | 11             |
+| Cell type               | 698            | 364            |
+| Development stage       | 176            | 48             |
+| Disease                 | 109            | 7              |
+| Self-reported ethnicity | 31             | _NA_           |
+| Sex                     | 3              | 3              |
+| Suspension type         | 2              | 2              |
+| Tissue                  | 267            | 84             |
+| Tissue general          | 55             | 29             |
+
+#### Embeddings
+
+Find out more in the [Census model page](https://cellxgene.cziscience.com/census-models).
+
+Available embeddings can be accessed via [`cellxgene_census.experimental.get_embedding()`](https://chanzuckerberg.github.io/cellxgene-census/_autosummary/cellxgene_census.experimental.get_embedding.html#cellxgene_census.experimental.get_embedding), or by specifying the `obs_embeddings`/`var_embeddings` field in [`cellxgene_census.get_anndata()`](https://chanzuckerberg.github.io/cellxgene-census/_autosummary/cellxgene_census.get_anndata.html#cellxgene_census.get_anndata).
+
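+For example, a minimal sketch of fetching cells together with their scVI embedding (the filter below is illustrative):
+
+```python
+import cellxgene_census
+
+census = cellxgene_census.open_soma(census_version="2024-07-01")
+adata = cellxgene_census.get_anndata(
+    census,
+    organism="Homo sapiens",
+    obs_value_filter="tissue_general == 'tongue' and is_primary_data == True",
+    obs_embeddings=["scvi"],  # the embedding is attached to adata.obsm
+)
+census.close()
+```
+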
+##### Cells
+
+| Method                    | _Homo sapiens_ | _Mus musculus_ |
+|---------------------------|----------------|----------------|
+| scVI                      | `scvi`         | `scvi`         |
+| Geneformer                | `geneformer`   | _NA_           |
+
 ### LTS 2023-12-15
 
 Open this data release by specifying `census_version = "2023-12-15"` in future calls to `open_soma()`.
@@ -97,16 +145,27 @@ Open this data release by specifying `census_version = "2023-12-15"` in future c
 | Tissue                  | 230            | 74             |
 | Tissue general          | 53             | 27             |
 
-#### Cell embbedings
+#### Embeddings
 
 Find out more in the [Census model page](https://cellxgene.cziscience.com/census-models).
 
-Available `obsm` slots:
+Available embeddings can be accessed via [`cellxgene_census.experimental.get_embedding()`](https://chanzuckerberg.github.io/cellxgene-census/_autosummary/cellxgene_census.experimental.get_embedding.html#cellxgene_census.experimental.get_embedding), or by specifying the `obs_embeddings`/`var_embeddings` field in [`cellxgene_census.get_anndata()`](https://chanzuckerberg.github.io/cellxgene-census/_autosummary/cellxgene_census.get_anndata.html#cellxgene_census.get_anndata).
 
-| Method                  | _Homo sapiens_ | _Mus musculus_ |
-|-------------------------|----------------|----------------|
-| scVI                    | `scvi`         | `scvi`         |
-| Fine-tuned Geneformer   | `geneformer`   | _NA_           |
+##### Cells
+
+| Method                    | _Homo sapiens_ | _Mus musculus_ |
+|---------------------------|----------------|----------------|
+| scVI                      | `scvi`         | `scvi`         |
+| Fine-tuned Geneformer     | `geneformer`   | _NA_           |
+| scGPT                     | `scgpt`        | _NA_           |
+| Universal Cell Embeddings | `uce`          | _NA_           |
+| NMF                       | `nmf`          | _NA_           |
+
+##### Features
+
+| Method                    | _Homo sapiens_ | _Mus musculus_ |
+|---------------------------|----------------|----------------|
+| NMF                       | `nmf`          | _NA_           |
 
 ### LTS 2023-07-25
 
diff --git a/docs/cellxgene_census_schema.md b/docs/cellxgene_census_schema.md
index c5c7b098d..511899ec0 100644
--- a/docs/cellxgene_census_schema.md
+++ b/docs/cellxgene_census_schema.md
@@ -1,8 +1,8 @@
 # CZ CELLxGENE Discover Census Schema
 
-**Version**: 2.0.1
+**Version**: 2.1.0
 
-**Last edited**: March, 2024.
+**Last edited**: June, 2024.
 
 The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED" "MAY", and "OPTIONAL" in this document are to be interpreted as described in [BCP 14](https://tools.ietf.org/html/bcp14), [RFC2119](https://www.rfc-editor.org/rfc/rfc2119.txt), and [RFC8174](https://www.rfc-editor.org/rfc/rfc8174.txt) when, and only when, they appear in all capitals, as shown here.
 
@@ -10,14 +10,14 @@ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "S
 
 The CZ CELLxGENE Discover Census, hereafter referred as Census, is a versioned data object and API for most of the single-cell data hosted at [CZ CELLxGENE Discover](https://cellxgene.cziscience.com/). To learn more about the Census visit the `chanzuckerberg/cellxgene-census` [github repository](https://github.com/chanzuckerberg/cellxgene-census)
 
-To better understand this document the reader should be familiar with the [CELLxGENE dataset schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md) and [SOMA](https://github.com/single-cell-data/SOMA/blob/main/abstract_specification.md).
+To better understand this document the reader should be familiar with the [CELLxGENE dataset schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md) and [SOMA](https://github.com/single-cell-data/SOMA/blob/main/abstract_specification.md).
 
 ## Definitions
 
 The following terms are used throughout this document:
 
 * adata – generic variable name that refers to an [`AnnData`](https://anndata.readthedocs.io/) object.
-* CELLxGENE dataset schema – the data schema for h5ad files served by CELLxGENE Discover, for this Census schema: [CELLxGENE dataset schema version is 5.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md)
+* CELLxGENE dataset schema – the data schema for h5ad files served by CELLxGENE Discover, for this Census schema: [CELLxGENE dataset schema version is 5.1.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md)
 * census\_obj – the Census root object, a SOMACollection.
 * Census data release – a versioned Census object deposited in a public bucket and accessible by APIs.
 * tissue – original tissue annotation.
@@ -44,23 +44,23 @@ Census data releases are versioned separately from the schema.
 
 ### Data included
 
-All datasets included in the Census MUST be of [CELLxGENE dataset schema version 5.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md). The following data constraints are imposed on top of the CELLxGENE dataset schema.
+All datasets included in the Census MUST be of [CELLxGENE dataset schema version 5.1.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md). The following data constraints are imposed on top of the CELLxGENE dataset schema.
 
 #### Species
 
-The Census MUST only contain observations (cells) with an  [`organism_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#organism_ontology_term_id) value of either "NCBITaxon:10090" for *Mus musculus* or "NCBITaxon:9606" for *Homo sapiens* MUST be included.
+The Census MUST only contain observations (cells) with an [`organism_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#organism_ontology_term_id) value of either "NCBITaxon:10090" for *Mus musculus* or "NCBITaxon:9606" for *Homo sapiens*.
 
-The Census MUST only contain features (genes) with a [`feature_reference`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#feature_reference) value of either "NCBITaxon:10090" for *Mus musculus* or "NCBITaxon:9606" for *Homo sapiens* MUST be included
+The Census MUST only contain features (genes) with a [`feature_reference`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#feature_reference) value of either "NCBITaxon:10090" for *Mus musculus* or "NCBITaxon:9606" for *Homo sapiens*.
 
 #### Multi-species data constraints
 
-Per the CELLxGENE dataset schema, [multi-species datasets MAY contain observations (cells) of a given organism and features (genes) of a different one](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#general-requirements), as defined in [`organism_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#organism_ontology_term_id) and [`feature_reference`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#feature_reference) respectively.
+Per the CELLxGENE dataset schema, [multi-species datasets MAY contain observations (cells) of a given organism and features (genes) of a different one](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#general-requirements), as defined in [`organism_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#organism_ontology_term_id) and [`feature_reference`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#feature_reference) respectively.
 
 For any given multi-species dataset, observation and features from the dataset are included in the Census as defined by the following:
 
 * Where a dataset includes observations and features from a single species, all observations and features from the dataset are included in the Census.
 * Where a dataset includes observations from a single species `S`, and includes features from multiple species *including* the species `S`, all dataset observations and all features from `S` will be included in the Census.
-* Where a dataset includes features from a single species `S`, and observations from multiple species *including* the species `S`, all dataset features and all observations from speices `S` are included in the Census.
+* Where a dataset includes features from a single species `S`, and observations from multiple species *including* the species `S`, all dataset features and all observations from species `S` are included in the Census.
 * Where a dataset has observations *AND* features from multiple species, the dataset will be excluded from the Census.
 
 The table below shows all possible combinations of organisms for both observations and features, assuming a Census comprised of Homo sapiens and Mus musculus. For each combination, inclusion criteria for the Census is provided.
@@ -114,7 +114,7 @@ The table below shows all possible combinations of organisms for both observatio
 
 #### Assays
 
-Assays are defined in the CELLxGENE dataset schema in [`assay_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#assay_ontology_term_id).
+Assays are defined in the CELLxGENE dataset schema in [`assay_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#assay_ontology_term_id).
 
 The Census MUST include all cells from the list of [accepted assays](./census_accepted_assays.csv).
 
@@ -143,15 +143,15 @@ These data need to be normalized by gene length for downstream analysis.
 
 #### Data matrix types
 
-Per the CELLxGENE dataset schema, [all RNA assays MUST include UMI or read counts](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#x-matrix-layers). Author-normalized data layers [as defined in the CELLxGENE dataset schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#x-matrix-layers) MUST NOT be included in the Census.
+Per the CELLxGENE dataset schema, [all RNA assays MUST include UMI or read counts](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#x-matrix-layers). Author-normalized data layers [as defined in the CELLxGENE dataset schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#x-matrix-layers) MUST NOT be included in the Census.
 
 #### Sample types
 
-Only observations (cells) from primary tissue MUST be included in the Census. Thus, ONLY those observations with a [`tissue_type`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#tissue_type) value equal to "tissue" MUST be included; other values of `tissue_type` MUST NOT be included.
+Only observations (cells) from primary tissue MUST be included in the Census. Thus, ONLY those observations with a [`tissue_type`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#tissue_type) value equal to "tissue" MUST be included; other values of `tissue_type` MUST NOT be included.
 
 #### Repeated data
 
-When a cell is represented multiple times in CELLxGENE Discover, only one is marked as the primary cell. This is defined in the CELLxGENE dataset schema under [`is_primary_data`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#is_primary_data). This information MUST be included in the Census cell metadata to enable queries that retrieve datasets (see cell metadata below), and all cells MUST be included in the Census.
+When a cell is represented multiple times in CELLxGENE Discover, only one is marked as the primary cell. This is defined in the CELLxGENE dataset schema under [`is_primary_data`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#is_primary_data). This information MUST be included in the Census cell metadata to enable queries that retrieve datasets (see cell metadata below), and all cells MUST be included in the Census.
 
 ### Data encoding and organization
 
@@ -231,7 +231,7 @@ An example of this `SOMADataFrame` is shown below:
   </tr>
   <tr>
     <td>dataset_schema_version </td>
-    <td>5.0.0</td>
+    <td>5.1.0</td>
   </tr>
   <tr>
     <td>total_cell_count</td>
@@ -283,6 +283,10 @@ All datasets used to build the Census MUST be included in a table modeled as a `
     <td>collection_doi</td>
     <td>string</td>
   </tr>
+  <tr>
+    <td>collection_doi_label</td>
+    <td>string</td>
+  </tr>
   <tr>
     <td>dataset_id</td>
     <td>string</td>
@@ -361,7 +365,7 @@ Summary cell counts grouped by organism and relevant cell metadata MUST be model
   <tr>
     <td>unique_cell_count</td>
     <td>int</td>
-    <td>Unique number of cells for the combination of values of all other fields above. Unique number of cells refers to the cell count, for this group, when <code><a href="https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#is_primary_data">is_primary_data == True</a></code> </td>
+    <td>Unique number of cells for the combination of values of all other fields above. Unique number of cells refers to the cell count, for this group, when <code><a href="https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#is_primary_data">is_primary_data == True</a></code> </td>
   </tr>
 </tbody>
 </table>
@@ -656,7 +660,7 @@ For each organism the `SOMAExperiment` MUST contain the following:
 
 #### Matrix Data, count (raw) matrix – `census_obj["census_data"][organism].ms["RNA"].X["raw"]` – `SOMASparseNDArray`
 
-Per the CELLxGENE dataset schema, [all RNA assays MUST include UMI or read counts](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#x-matrix-layers). These counts MUST be encoded as `float32` in this `SOMASparseNDArray` with a fill value of zero (0), and no explicitly stored zero values.
+Per the CELLxGENE dataset schema, [all RNA assays MUST include UMI or read counts](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#x-matrix-layers). These counts MUST be encoded as `float32` in this `SOMASparseNDArray` with a fill value of zero (0), and no explicitly stored zero values.
 
 #### Matrix Data, normalized count matrix – `census_obj["census_data"][organism].ms["RNA"].X["normalized"]` – `SOMASparseNDArray`
 
@@ -670,9 +674,9 @@ as `normalized[i,j] = X[i,j] / sum(X[i, ])`.
 
 #### Feature metadata – `census_obj["census_data"][organism].ms["RNA"].var` – `SOMADataFrame`
 
-The Census MUST only contain features with a [`feature_biotype`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#feature_biotype) value of "gene".
+The Census MUST only contain features with a [`feature_biotype`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#feature_biotype) value of "gene".
 
-The [gene references are pinned](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.0.0/schema.md#required-gene-annotations) as defined in the CELLxGENE dataset schema.
+The [gene references are pinned](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#required-gene-annotations) as defined in the CELLxGENE dataset schema.
 
 The following columns MUST be included:
 
@@ -870,6 +874,11 @@ Cell metadata MUST be encoded as a `SOMADataFrame` with the following columns:
 
 ## Changelog
 
+### Version 2.1.0
+
+* Update to require [CELLxGENE schema version 5.1.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md)
+* Add `collection_doi_label` to "Census table of CELLxGENE Discover datasets – `census_obj["census_info"]["datasets"]`"
+
 ### Version 2.0.1
 
 * Update accepted assays for Census based on guidance from curators.
diff --git a/docs/conf.py b/docs/conf.py
index 57711cb62..f4d63e0ed 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -50,6 +50,7 @@
     'scanpy': ('https://scanpy.readthedocs.io/en/stable/', None),
     'torch': ('https://pytorch.org/docs/stable/', None),
     'torchdata': ('https://pytorch.org/data/beta/', None),
+    'sklearn': ('http://scikit-learn.org/stable', None),
 }
 
 templates_path = ['_templates']
diff --git a/docs/python-api.rst b/docs/python-api.rst
index 8df3b3321..9309ba4d9 100644
--- a/docs/python-api.rst
+++ b/docs/python-api.rst
@@ -54,6 +54,9 @@ Experimental: Machine Learning
     cellxgene_census.experimental.ml.pytorch.experiment_dataloader
     cellxgene_census.experimental.ml.pytorch.ExperimentDataPipe
     cellxgene_census.experimental.ml.pytorch.Stats
+    cellxgene_census.experimental.ml.encoders.Encoder
+    cellxgene_census.experimental.ml.encoders.LabelEncoder
+    cellxgene_census.experimental.ml.encoders.BatchEncoder
     cellxgene_census.experimental.ml.huggingface.CellDatasetBuilder
     cellxgene_census.experimental.ml.huggingface.GeneformerTokenizer
 
diff --git a/tools/cellxgene_census_builder/pyproject.toml b/tools/cellxgene_census_builder/pyproject.toml
index 5f684c238..4ac3e4d56 100644
--- a/tools/cellxgene_census_builder/pyproject.toml
+++ b/tools/cellxgene_census_builder/pyproject.toml
@@ -36,7 +36,7 @@ dependencies= [
     #    https://github.com/TileDB-Inc/TileDB/blob/dev/format_spec/FORMAT_SPEC.md
     "tiledbsoma==1.9.3",
     "cellxgene-census==1.12.0",
-    "cellxgene-ontology-guide==0.6.1",
+    "cellxgene-ontology-guide==1.0.0",
     "scipy==1.12.0",
     "fsspec[http]==2024.3.1",
     "s3fs==2024.3.1",
diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py
index 7ee72c7a8..ec044a267 100644
--- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py
+++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py
@@ -11,9 +11,9 @@
 # DataFrame columns. True is enabled, False is disabled.
 USE_ARROW_DICTIONARY = True
 
-CENSUS_SCHEMA_VERSION = "2.0.1"
+CENSUS_SCHEMA_VERSION = "2.1.0"
 
-CXG_SCHEMA_VERSION = "5.0.0"  # the CELLxGENE schema version supported
+CXG_SCHEMA_VERSION = "5.1.0"  # the CELLxGENE schema version supported
 
 # Columns expected in the census_datasets dataframe
 CENSUS_DATASETS_TABLE_SPEC = TableSpec.create(
diff --git a/tools/cellxgene_census_builder/tests/anndata/test_anndata.py b/tools/cellxgene_census_builder/tests/anndata/test_anndata.py
index 921393c17..dc2f1e999 100644
--- a/tools/cellxgene_census_builder/tests/anndata/test_anndata.py
+++ b/tools/cellxgene_census_builder/tests/anndata/test_anndata.py
@@ -265,7 +265,7 @@ def test_empty_estimated_density(tmp_path: pathlib.Path) -> None:
     adata = anndata.AnnData(
         obs=pd.DataFrame(), var=pd.DataFrame({"feature_id": [0, 1, 2]}), X=sparse.csr_matrix((0, 3), dtype=np.float32)
     )
-    adata.uns["schema_version"] = "5.0.0"
+    adata.uns["schema_version"] = "5.1.0"
     adata.write_h5ad(path)
 
     with open_anndata(path) as ad:
@@ -297,7 +297,7 @@ def test_open_anndata_raw_X(tmp_path: pathlib.Path) -> None:
         var=pd.DataFrame({"feature_id": [0, 1, 2]}),
         X=sparse.csr_matrix((2, 3), dtype=np.float32),
         raw={"X": sparse.csr_matrix((2, 4), dtype=np.float32)},
-        uns={"schema_version": "5.0.0"},
+        uns={"schema_version": "5.1.0"},
     )
     adata.write_h5ad(path)
 
@@ -410,7 +410,7 @@ def test_multi_species_filter(
             index=[f"feature_{i}" for i in range(n_vars)],
         ),
         X=sparse.random(n_obs, n_vars, format="csr", dtype=np.float32),
-        uns={"schema_version": "5.0.0"},
+        uns={"schema_version": "5.1.0"},
     )
     path = (tmp_path / "species.h5ad").as_posix()
     adata.write_h5ad(path)
diff --git a/tools/cellxgene_census_builder/tests/conftest.py b/tools/cellxgene_census_builder/tests/conftest.py
index 269860bb2..adccea725 100644
--- a/tools/cellxgene_census_builder/tests/conftest.py
+++ b/tools/cellxgene_census_builder/tests/conftest.py
@@ -116,7 +116,7 @@ def get_anndata(
     uns["batch_condition"] = np.array(["a", "b"], dtype="object")
 
     # Need to carefully set the corpora schema versions in order for tests to pass.
-    uns["schema_version"] = "5.0.0"  # type: ignore
+    uns["schema_version"] = "5.1.0"  # type: ignore
 
     return anndata.AnnData(X=X, obs=obs, var=var, obsm=obsm, uns=uns)
 
diff --git a/tools/cellxgene_census_builder/tests/test_manifest.py b/tools/cellxgene_census_builder/tests/test_manifest.py
index ab7a4384a..fb9098cea 100644
--- a/tools/cellxgene_census_builder/tests/test_manifest.py
+++ b/tools/cellxgene_census_builder/tests/test_manifest.py
@@ -65,7 +65,7 @@ def test_load_manifest_from_cxg(empty_blocklist: str) -> None:
                 "collection_doi_label": "Publication 1",
                 "citation": "citation",
                 "title": "dataset #1",
-                "schema_version": "5.0.0",
+                "schema_version": "5.1.0",
                 "assets": [
                     {
                         "filesize": 123,
@@ -90,7 +90,7 @@ def test_load_manifest_from_cxg(empty_blocklist: str) -> None:
                 "collection_doi_label": "Publication 2",
                 "citation": "citation",
                 "title": "dataset #2",
-                "schema_version": "5.0.0",
+                "schema_version": "5.1.0",
                 "assets": [{"filesize": 456, "filetype": "H5AD", "url": "https://fake.url/dataset_id_2.h5ad"}],
                 "dataset_version_id": "dataset_id_2",
                 "cell_count": 11,
@@ -122,7 +122,7 @@ def test_load_manifest_from_cxg_errors_on_datasets_with_old_schema(
                 "collection_doi_label": "Publication 1",
                 "citation": "citation",
                 "title": "dataset #1",
-                "schema_version": "5.0.0",
+                "schema_version": "5.1.0",
                 "assets": [{"filesize": 123, "filetype": "H5AD", "url": "https://fake.url/dataset_id_1.h5ad"}],
                 "dataset_version_id": "dataset_id_1",
                 "cell_count": 10,
@@ -166,7 +166,7 @@ def test_load_manifest_from_cxg_excludes_datasets_with_no_assets(
                 "collection_doi": None,
                 "citation": "citation",
                 "title": "dataset #1",
-                "schema_version": "5.0.0",
+                "schema_version": "5.1.0",
                 "assets": [{"filesize": 123, "filetype": "H5AD", "url": "https://fake.url/dataset_id_1.h5ad"}],
                 "dataset_version_id": "dataset_id_1",
                 "cell_count": 10,
@@ -179,7 +179,7 @@ def test_load_manifest_from_cxg_excludes_datasets_with_no_assets(
                 "collection_doi": None,
                 "citation": "citation",
                 "title": "dataset #2",
-                "schema_version": "5.0.0",
+                "schema_version": "5.1.0",
                 "assets": [],
                 "dataset_version_id": "dataset_id_2",
                 "cell_count": 10,
diff --git a/tools/census_contrib_qc/embeddings_qc_2023-12-15.ipynb b/tools/census_contrib_qc/embeddings_qc_2023-12-15.ipynb
index 7a32a46d5..0bc917ece 100644
--- a/tools/census_contrib_qc/embeddings_qc_2023-12-15.ipynb
+++ b/tools/census_contrib_qc/embeddings_qc_2023-12-15.ipynb
@@ -488,7 +488,7 @@
     "    organism=\"homo_sapiens\",\n",
     "    measurement_name=\"RNA\",\n",
     "    obs_value_filter=\"tissue_general == 'central nervous system' and is_primary_data == True\",\n",
-    "    column_names={\"obs\": [\"cell_type\", \"assay\", \"soma_joinid\"]},\n",
+    "    obs_column_names=[\"cell_type\", \"assay\", \"soma_joinid\"],\n",
     "    obsm_layers=maintained_embs_human,\n",
     ")"
    ]
@@ -663,7 +663,7 @@
     "    organism=\"homo_sapiens\",\n",
     "    measurement_name=\"RNA\",\n",
     "    obs_value_filter=\"tissue_general == 'pancreas' and is_primary_data == True\",\n",
-    "    column_names={\"obs\": [\"cell_type\", \"assay\", \"soma_joinid\", \"dataset_id\", \"is_primary_data\"]},\n",
+    "    obs_column_names=[\"cell_type\", \"assay\", \"soma_joinid\", \"dataset_id\", \"is_primary_data\"],\n",
     "    obsm_layers=maintained_embs_human,\n",
     ")"
    ]
@@ -862,7 +862,7 @@
     "    organism=\"mus_musculus\",\n",
     "    measurement_name=\"RNA\",\n",
     "    obs_value_filter=\"tissue_general == 'heart' and is_primary_data == True\",\n",
-    "    column_names={\"obs\": [\"cell_type\", \"assay\", \"soma_joinid\"]},\n",
+    "    obs_column_names=[\"cell_type\", \"assay\", \"soma_joinid\"],\n",
     "    obsm_layers=maintained_embs_mouse,\n",
     ")"
    ]
@@ -1014,7 +1014,7 @@
     "    organism=\"mus_musculus\",\n",
     "    measurement_name=\"RNA\",\n",
     "    obs_value_filter=\"tissue_general == 'pancreas'\",\n",
-    "    column_names={\"obs\": [\"cell_type\", \"assay\", \"soma_joinid\"]},\n",
+    "    obs_column_names=[\"cell_type\", \"assay\", \"soma_joinid\"],\n",
     "    obsm_layers=maintained_embs_mouse,\n",
     ")"
    ]
diff --git a/tools/models/geneformer/Dockerfile b/tools/models/geneformer/Dockerfile
index b1f485bc4..cd9a67114 100644
--- a/tools/models/geneformer/Dockerfile
+++ b/tools/models/geneformer/Dockerfile
@@ -1,41 +1,49 @@
 # Builds a docker image with:
-# - PyTorch+CUDA
+# - CUDA+PyTorch
 # - Geneformer
 # - cellxgene_census
 # - our Census-Geneformer training scripts
-FROM nvcr.io/nvidia/pytorch:23.10-py3
+FROM nvcr.io/nvidia/cuda:11.8.0-runtime-ubuntu22.04
 
-# Set the tiledbsoma version used to write the embeddings SparseNDArray, to ensure
-# compatibility with the Census embeddings curator
-ARG EMBEDDINGS_TILEDBSOMA_VERSION=1.4.4
-ARG GENEFORMER_VERSION=8df5dc1
-
-RUN apt update && apt install -y python3-venv git-lfs pigz
+RUN apt update && apt install -y build-essential python3-pip python3-venv git-lfs pigz libcurl4-openssl-dev
 RUN git lfs install
-ENV GIT_SSL_NO_VERIFY=true
-RUN pip install \
-        transformers[torch] \
-        "cellxgene_census[experimental] @ git+https://github.com/chanzuckerberg/cellxgene-census.git#subdirectory=api/python/cellxgene_census" \
-        git+https://huggingface.co/ctheodoris/Geneformer@${GENEFORMER_VERSION}
-RUN pip install owlready2 boto3
 
+ENV GIT_SSL_NO_VERIFY=true
+RUN pip install --upgrade pip setuptools setuptools_scm
+RUN pip install torch torchdata --index-url https://download.pytorch.org/whl/cu118
+                                                                             # ^^^ match the base image CUDA version!
+RUN pip install owlready2 boto3 transformers[torch]
 # workaround for unknown problem blocking `import geneformer`:
 #   https://github.com/microsoft/TaskMatrix/issues/116#issuecomment-1565431850
 RUN pip uninstall -y transformer-engine
-# smoke test
-RUN python3 -c 'import geneformer; import cellxgene_census; cellxgene_census.open_soma()'
+
+# Set the tiledbsoma version used to write the embeddings SparseNDArray, to ensure
+# compatibility with the Census embeddings curator
+ARG EMBEDDINGS_TILEDBSOMA_VERSION=1.9.5
+ARG CELLXGENE_CENSUS_VERSION=main
+ARG GENEFORMER_VERSION=471eefc
 
 RUN mkdir /census-geneformer
 WORKDIR /census-geneformer
-# clone Geneformer separately to get LFS files
+RUN git clone https://github.com/chanzuckerberg/cellxgene-census.git \
+        && git -C cellxgene-census checkout ${CELLXGENE_CENSUS_VERSION}
+RUN pip install cellxgene-census/api/python/cellxgene_census
 RUN git clone --recursive https://huggingface.co/ctheodoris/Geneformer \
         && git -C Geneformer checkout ${GENEFORMER_VERSION}
+RUN pip install -e Geneformer
 
-# prepare a venv with tiledbsoma ${EMBEDDINGS_TILEDBSOMA_VERSION}
+# smoke test: verify geneformer/cellxgene_census imports and Census connectivity at build time
+RUN python3 -c 'import geneformer; import cellxgene_census; from cellxgene_census.experimental.ml.huggingface import GeneformerTokenizer; cellxgene_census.open_soma()'
+
+# prepare a venv with pinned tiledbsoma ${EMBEDDINGS_TILEDBSOMA_VERSION}, which our embeddings
+# generation step will use to output a TileDB array compatible with the Census embeddings curator.
 RUN python3 -m venv --system-site-packages embeddings_tiledbsoma_venv && \
     . embeddings_tiledbsoma_venv/bin/activate && \
     pip install tiledbsoma==${EMBEDDINGS_TILEDBSOMA_VERSION}
 
-COPY *.py .
 COPY helpers ./helpers
+COPY *.py ./
 COPY finetune-geneformer.config.yml .
+
+# FIXME: eliminate once model is published in Geneformer repo
+COPY gf-95m/ ./gf-95m/
diff --git a/tools/models/geneformer/README.md b/tools/models/geneformer/README.md
index c6f45065a..c360fb382 100644
--- a/tools/models/geneformer/README.md
+++ b/tools/models/geneformer/README.md
@@ -12,48 +12,53 @@ The `Dockerfile` provides the recipe for the docker image used by the WDLs, whic
 
 ## Example invocations
 
-Using a [miniwdl-aws](https://github.com/miniwdl-ext/miniwdl-aws) deployment with suitable GPU instance types enabled on the underlying AWS Batch compute environment, and assuming the docker image has been built and pushed to a suitable repository like ECR (tagged `$DOCKER_TAG`).
+These examples use [miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run) with the Amazon HealthOmics workflow service, and assume the docker image has been built and pushed to a suitable repository like ECR (tagged `$DOCKER_TAG`).
 
 Preparing a tokenized training dataset with 2,500 primary cells per human cell type:
 
 ```bash
-miniwdl-aws-submit --verbose --follow --workflow-queue miniwdl-workflow \
-    wdl/prepare_datasets.wdl docker=$DOCKER_TAG \
-    census_version=2023-10-23 N=2500 sampling_column=cell_type output_name=2500_per_cell_type \
-    --s3upload s3://MYBUCKET/geneformer/datasets/2500_per_cell_type/
+miniwdl-omics-run wdl/prepare_datasets.wdl \
+    docker=$DOCKER_TAG \
+    census_version=s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/ \
+    N=2500 sampling_column=cell_type output_name=2500_per_cell_type \
+    --role poweromics --output-uri s3://MYBUCKET/geneformer/datasets/
 ```
 
-And a tokenized dataset for all of Census (371GiB!):
+And a tokenized dataset for all of Census (>300GiB, sharded):
 
 ```bash
-miniwdl-aws-submit --verbose --follow --workflow-queue miniwdl-workflow \
-    wdl/prepare_datasets.wdl docker=$DOCKER_TAG \
-    census_version=2023-10-23 output_name=census-2023-10-23 value_filter='is_primary_data==True or is_primary_data==False' \
-    --s3upload s3://MYBUCKET/geneformer/datasets/census-2023-10-23/
+miniwdl-omics-run wdl/prepare_datasets.wdl \
+    docker=$DOCKER_TAG \
+    census_version=s3://cellxgene-census-public-us-west-2/cell-census/2024-05-20/soma/ \
+    value_filter='is_primary_data==True or is_primary_data==False' \
+    output_name=2024-05-20 shards=256 \
+    --role poweromics --output-uri s3://MYBUCKET/geneformer/datasets/
 ```
 
+(We set `census_version` to the SOMACollection URI because the HealthOmics workers don't have internet access to the Census release directory endpoint.)
+
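+For reference, a named Census release can be resolved to its SOMACollection URI with `cellxgene_census` (the same lookup `prepare-census-geneformer-dataset.py` performs when given a release name rather than a URI; this sketch assumes a machine with internet access):
+
+```python
+import cellxgene_census
+
+# look up the release descriptor and print the S3 URI of its SOMACollection
+desc = cellxgene_census.get_census_version_description("2024-05-20")
+print(desc["soma"]["uri"])
+# -> s3://cellxgene-census-public-us-west-2/cell-census/2024-05-20/soma/
+```
+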
 Fine-tuning for 8 epochs (takes ~36h on g5.8xlarge):
 
 ```bash
-MINIWDL__AWS__GPU_VALUE=8 \
-MINIWDL__AWS__CONTAINER_PROPERTIES='{"linuxParameters":{"sharedMemorySize":4096}}' \
-miniwdl-aws-submit --verbose --follow --workflow-queue miniwdl-workflow \
-    wdl/finetune_geneformer.wdl docker=$DOCKER_TAG \
+miniwdl-omics-run wdl/finetune_geneformer.wdl \
+    docker=$DOCKER_TAG \
     dataset=s3://MYBUCKET/geneformer/datasets/2500_per_cell_type/dataset/2500_per_cell_type \
     epochs=8 output_name=2500_per_cell_type_8epochs \
-    --s3upload s3://MYBUCKET/geneformer/models/2500_per_cell_type_8epochs/
+    --role poweromics --output-uri s3://MYBUCKET/geneformer/models/
 ```
 
 Generating cell embeddings (takes 8-12h on up to 256 g5.2xlarge, generates 130GiB `tiledbsoma.SparseNDArray` on S3):
 
 ```bash
-MINIWDL__SCHEDULER__CALL_CONCURRENCY=256 \
-MINIWDL__AWS__SUBMIT_PERIOD=60 \
-miniwdl-aws-submit --verbose --follow --workflow-queue miniwdl-workflow \
-    wdl/generate_embeddings.wdl docker=$DOCKER_TAG \
-    emb_layer=0 \
-    dataset=s3://MYBUCKET/geneformer/datasets/census-2023-10-23/dataset/census-2023-10-23 \
-    model=s3://MYBUCKET/geneformer/models/2500_per_cell_type_8epochs/model/2500_per_cell_type_8epochs \
-    output_uri=s3://MYBUCKET/geneformer/embs/census-2023-10-23 parts=256 \
-    --s3upload s3://MYBUCKET/geneformer/embs
+seq 0 255 \
+    | xargs -n 1 printf 'dataset_shards=s3://MYBUCKET/geneformer/datasets/census-2024-05-20/shard-%03d/\n' \
+    | xargs -n 9999 miniwdl-omics-run \
+    --role poweromics --output-uri s3://MYBUCKET/geneformer/embs \
+    wdl/generate_embeddings.wdl \
+    docker=$DOCKER_TAG \
+    emb_layer=0 model_type=Pretrained \
+    model=s3://MYBUCKET/geneformer/gf-95m/fine_tuned_model/ \
+    output_uri=s3_//MYBUCKET/geneformer/embs/$(date '+%s')/census-2024-05-20/
 ```
+
+(The `seq`/`xargs` pipeline expands to 256 `dataset_shards=` arguments, one per shard, on a single `miniwdl-omics-run` command line. The `s3_//MYBUCKET` is a workaround for the workflow service rejecting our submission if the specified S3 output folder doesn't yet exist; this workflow has TileDB create it.)
diff --git a/tools/models/geneformer/buildspec.yml b/tools/models/geneformer/buildspec.yml
new file mode 100644
index 000000000..0c439d650
--- /dev/null
+++ b/tools/models/geneformer/buildspec.yml
@@ -0,0 +1,15 @@
+# This CodeBuild spec is used to build the docker image and push it to ECR.
+# (The image is >10GB so can be painful to push from outside AWS.)
+version: 0.2
+
+phases:
+  pre_build:
+    commands:
+    - aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 699936264352.dkr.ecr.us-west-2.amazonaws.com
+  build:
+    commands:
+    - aws s3 cp s3://mlin-census-scratch/geneformer/gf-95m/ tools/models/geneformer/gf-95m/ --recursive
+    - docker build -t 699936264352.dkr.ecr.us-west-2.amazonaws.com/omics:census-geneformer --build-arg CELLXGENE_CENSUS_VERSION=$CODEBUILD_RESOLVED_SOURCE_VERSION tools/models/geneformer
+  post_build:
+    commands:
+    - docker push 699936264352.dkr.ecr.us-west-2.amazonaws.com/omics:census-geneformer
diff --git a/tools/models/geneformer/finetune-geneformer.config.yml b/tools/models/geneformer/finetune-geneformer.config.yml
index cf72418d2..734fa3b1a 100644
--- a/tools/models/geneformer/finetune-geneformer.config.yml
+++ b/tools/models/geneformer/finetune-geneformer.config.yml
@@ -3,6 +3,9 @@ label_feature: cell_subclass
 # Specific labels to exclude from training and evaluation
 label_blocklist:
   - unknown
+  - abnormal cell
+  - animal cell
+  - eukaryotic cell
 # Also exclude labels with too few examples
 label_min_examples: 10
 # Fraction of the input Dataset to hold out for evaluation
diff --git a/tools/models/geneformer/finetune-geneformer.py b/tools/models/geneformer/finetune-geneformer.py
index 13427706e..cf3f3af5a 100644
--- a/tools/models/geneformer/finetune-geneformer.py
+++ b/tools/models/geneformer/finetune-geneformer.py
@@ -10,6 +10,7 @@
 from collections import Counter
 
 import pandas as pd
+import torch
 import yaml
 from datasets import Dataset
 from geneformer import DataCollatorForCellClassification
@@ -22,6 +23,7 @@
 
 
 def main(argv):
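+    # fail fast if no GPU is visible (e.g. a mis-provisioned worker instance)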
+    assert torch.cuda.is_available(), "CUDA is not available"
     args = parse_arguments(argv)
     if os.path.exists(args.model_out):
         logger.error("output directory already exists: " + args.model_out)
diff --git a/tools/models/geneformer/generate-geneformer-embeddings.py b/tools/models/geneformer/generate-geneformer-embeddings.py
index e3adbc589..225f07b16 100755
--- a/tools/models/geneformer/generate-geneformer-embeddings.py
+++ b/tools/models/geneformer/generate-geneformer-embeddings.py
@@ -8,7 +8,8 @@
 import tempfile
 
 import geneformer
-from datasets import Dataset, disable_progress_bar
+import torch
+from datasets import disable_progress_bar
 from transformers import BertConfig
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(module)s [%(levelname)s] %(message)s")
@@ -17,6 +18,7 @@
 
 
 def main(argv):
+    assert torch.cuda.is_available(), "CUDA is not available"
     args = parse_arguments(argv)
 
     tiledbsoma_context = None
@@ -56,7 +58,6 @@ def main(argv):
 
     with tempfile.TemporaryDirectory() as scratch_dir:
-        # prepare the dataset, taking only one shard of it if so instructed
-        dataset_path = prepare_dataset(args.dataset, args.part, args.parts, scratch_dir)
         logger.info("Extracting embeddings...")
         extractor = geneformer.EmbExtractor(
             model_type=args.model_type,
@@ -70,7 +71,7 @@ def main(argv):
         #       see https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/emb_extractor.py
         embs_df = extractor.extract_embs(
             model_directory=args.model,
-            input_data_file=dataset_path,
+            input_data_file=args.dataset,
             # the method always writes out a .csv file which we discard (since it also returns the
             # embeddings data frame)
             output_directory=scratch_dir,
@@ -124,8 +125,6 @@ def parse_arguments(argv):
         help="dataset features to copy into output dataframe (comma-separated)",
     )
     parser.add_argument("--batch-size", type=int, default=16, help="batch size")
-    parser.add_argument("--part", type=int, help="process only one shard of the data (zero-based index)")
-    parser.add_argument("--parts", type=int, help="required with --part")
     parser.add_argument(
         "--tiledbsoma",
         action="store_true",
@@ -141,26 +140,9 @@ def parse_arguments(argv):
     if "soma_joinid" not in args.features:
         args.features.append("soma_joinid")
 
-    if args.part is not None:
-        if not (args.part >= 0 and args.parts is not None and args.parts > args.part):
-            parser.error("--part must be nonnegative and less than --parts")
-
     logger.info("arguments: " + str(vars(args)))
     return args
 
 
-def prepare_dataset(dataset_dir, part, parts, spool_dir):
-    dataset = Dataset.load_from_disk(dataset_dir)
-    logger.info(f"dataset (full): {dataset}")
-    if part is None:
-        return dataset_dir
-    dataset = dataset.shard(num_shards=parts, index=part, contiguous=True)
-    # spool the desired part of the dataset (since EmbExtractor takes a directory)
-    logger.info(f"dataset part: {dataset}")
-    part_dir = os.path.join(spool_dir, "dataset_part")
-    dataset.save_to_disk(part_dir)
-    return part_dir
-
-
 if __name__ == "__main__":
     sys.exit(main(sys.argv))
diff --git a/tools/models/geneformer/helpers/cl.v2024-04-05.owl.gz b/tools/models/geneformer/helpers/cl.v2024-04-05.owl.gz
new file mode 100644
index 000000000..f4c633e6b
Binary files /dev/null and b/tools/models/geneformer/helpers/cl.v2024-04-05.owl.gz differ
diff --git a/tools/models/geneformer/helpers/ontology_mapper.py b/tools/models/geneformer/helpers/ontology_mapper.py
index d5d01eb86..994b5cfad 100644
--- a/tools/models/geneformer/helpers/ontology_mapper.py
+++ b/tools/models/geneformer/helpers/ontology_mapper.py
@@ -10,7 +10,9 @@
 
 """
 
+import gzip
 import os
+import tempfile
 from abc import ABC, abstractmethod
 
 import owlready2
@@ -191,16 +193,21 @@ def _is_and_object(entity: owlready2.entity.ThingClass) -> bool:
 
 class CellMapper(OntologyMapper):
     # From schema 3.1.0 https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md
-    CXG_CL_ONTOLOGY_URL = "https://github.com/obophenotype/cell-ontology/releases/download/v2023-07-20/cl.owl"
+    CXG_CL_ONTOLOGY_PATH = os.path.join(os.path.dirname(__file__), "cl.v2024-04-05.owl.gz")
     # Only look up ancestors under Cell
     ROOT_NODE = "CL_0000000"
 
     def __init__(self, cell_type_high_level_ontology_term_ids: list[str]):
-        super(CellMapper, self).__init__(  # noqa: UP008
-            high_level_ontology_term_ids=cell_type_high_level_ontology_term_ids,
-            ontology_owl_path=self.CXG_CL_ONTOLOGY_URL,
-            root_ontology_term_id=self.ROOT_NODE,
-        )
+        # gunzip CXG_CL_ONTOLOGY_PATH into a temporary file for owlready2 to load:
+        with tempfile.NamedTemporaryFile() as owl:
+            with gzip.open(self.CXG_CL_ONTOLOGY_PATH, "rb") as f:
+                owl.write(f.read())
+            owl.flush()
+            super(CellMapper, self).__init__(  # noqa: UP008
+                high_level_ontology_term_ids=cell_type_high_level_ontology_term_ids,
+                ontology_owl_path=owl.name,
+                root_ontology_term_id=self.ROOT_NODE,
+            )
 
     def _get_branch_ancestors(self, owl_entity):
         branch_ancestors = []
@@ -226,18 +233,22 @@ def _get_is_a_for_cl(owl_entity):
 
 class TissueMapper(OntologyMapper):
     # From schema 3.1.0 https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.1.0/schema.md
-    CXG_UBERON_ONTOLOGY_URL = "https://github.com/obophenotype/uberon/releases/download/v2023-06-28/uberon.owl"
+    CXG_UBERON_ONTOLOGY_PATH = os.path.join(os.path.dirname(__file__), "uberon.v2024-03-22.owl.gz")
 
     # Only look up ancestors under anatomical entity
     ROOT_NODE = "UBERON_0001062"
 
     def __init__(self, tissue_high_level_ontology_term_ids: list[str]):
         self.cell_type_high_level_ontology_term_ids = tissue_high_level_ontology_term_ids
-        super(TissueMapper, self).__init__(  # noqa: UP008
-            high_level_ontology_term_ids=tissue_high_level_ontology_term_ids,
-            ontology_owl_path=self.CXG_UBERON_ONTOLOGY_URL,
-            root_ontology_term_id=self.ROOT_NODE,
-        )
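+        # gunzip CXG_UBERON_ONTOLOGY_PATH into a temporary file for owlready2 to load: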
+        with tempfile.NamedTemporaryFile() as owl:
+            with gzip.open(self.CXG_UBERON_ONTOLOGY_PATH, "rb") as f:
+                owl.write(f.read())
+            owl.flush()
+            super(TissueMapper, self).__init__(  # noqa: UP008
+                high_level_ontology_term_ids=tissue_high_level_ontology_term_ids,
+                ontology_owl_path=owl.name,
+                root_ontology_term_id=self.ROOT_NODE,
+            )
 
     def _get_branch_ancestors(self, owl_entity):
         branch_ancestors = []
diff --git a/tools/models/geneformer/helpers/uberon.v2024-03-22.owl.gz b/tools/models/geneformer/helpers/uberon.v2024-03-22.owl.gz
new file mode 100644
index 000000000..b7801c0f6
Binary files /dev/null and b/tools/models/geneformer/helpers/uberon.v2024-03-22.owl.gz differ
diff --git a/tools/models/geneformer/prepare-census-geneformer-dataset.py b/tools/models/geneformer/prepare-census-geneformer-dataset.py
index 414777fb9..fecb27542 100755
--- a/tools/models/geneformer/prepare-census-geneformer-dataset.py
+++ b/tools/models/geneformer/prepare-census-geneformer-dataset.py
@@ -2,8 +2,10 @@
 # mypy: ignore-errors
 
 import argparse
+import functools
 import json
 import logging
+import math
 import multiprocessing
 import os
 import subprocess
@@ -16,11 +18,12 @@
 import numpy as np
 import tiledbsoma
 from cellxgene_census.experimental.ml.huggingface import GeneformerTokenizer
+
+# TODO: switch to https://github.com/chanzuckerberg/cellxgene-ontology-guide
 from helpers.ontology_mapper import CellSubclassMapper
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(module)s [%(levelname)s] %(message)s")
 logger = logging.getLogger(os.path.basename(__file__))
-NPROC = multiprocessing.cpu_count()
 
 
 def main(argv):
@@ -30,49 +33,35 @@ def main(argv):
         logger.error("output directory already exists: " + args.output_dir)
         return 1
 
-    # open human census
-    with cellxgene_census.open_soma(census_version=args.census_version) as census:
-        census_human = census["census_data"]["homo_sapiens"]
-
-        # select the cell id's to include
-        obs_df = select_cells(census_human, args.value_filter, args.percentage_data, args.sampling_column, args.N)
-        coords = np.array(obs_df.index)
-
-        # use GeneformerTokenizer to build dataset of those cells
-        with GeneformerTokenizer(
-            census_human,
-            obs_query=tiledbsoma.AxisQuery(coords=(coords,)),
-            obs_attributes=[
-                # cell_subclass isn't yet in Census (select_cells() added it to obs_df for us), so
-                # exclude from the experiment axis query
-                it
-                for it in args.obs_columns
-                if it not in ("cell_subclass", "cell_subclass_ontology_term_id")
-            ],
-        ) as tokenizer:
-            logger.info(f"tokenizing {len(coords)} cells...")
-            dataset = tokenizer.build()
-
-    # add back cell_subclass
-    if "cell_subclass_ontology_term_id" in args.obs_columns:
-        dataset = dataset.map(
-            lambda it: {
-                "cell_subclass_ontology_term_id": obs_df.loc[it["soma_joinid"]]["cell_subclass_ontology_term_id"]
-            },
-            num_proc=NPROC,
-        )
-    if "cell_subclass" in args.obs_columns:
-        dataset = dataset.map(
-            lambda it: {"cell_subclass": obs_df.loc[it["soma_joinid"]]["cell_subclass"]}, num_proc=NPROC
+    # select cells
+    if "://" in args.census_version:
+        census_uri = args.census_version
+    else:
+        census_uri = cellxgene_census.get_census_version_description(args.census_version)["soma"]["uri"]
+        logger.info(f"resolved census version {args.census_version} to {census_uri}")
+    with cellxgene_census.open_soma(uri=census_uri) as census:
+        obs_df = select_cells(
+            census["census_data"]["homo_sapiens"], args.value_filter, args.percentage_data, args.sampling_column, args.N
         )
-    logger.info(str(dataset))
-    if len(dataset):
-        logger.info(dataset[0])
 
-    # write them to output_dir (note: the Dataset tools will have spooled to disk already, so
-    # this should just be copying it to the desired location)
-    logger.info("writing Dataset to " + args.output_dir)
-    dataset.save_to_disk(args.output_dir)
+    logger.info(f"tokenizing {len(obs_df)} cells...")
+    # build dataset (parallelizing across shards, if so configured)
+    # NOTE: originally we made one big Dataset and later used its built-in shard() method, but
+    # that didn't use disk I/O efficiently (reading one shard scanned the whole dataset), so we
+    # switched to writing each shard as a separate dataset.
+    tasks = [(obs_df, args.output_dir)]
+    if args.shards > 1:
+        obs_dfs = np.array_split(obs_df, args.shards)
+        digits = math.ceil(math.log10(len(obs_dfs)))
+        tasks = [
+            (obs_dfs[i], os.path.join(args.output_dir, "shard-" + str(i).zfill(digits))) for i in range(len(obs_dfs))
+        ]
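+    # use "spawn" so each worker starts with a fresh interpreter, rather than forking
+    # native state (e.g. TileDB handles) that isn't fork-safe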
+    multiprocessing.freeze_support()
+    multiprocessing.set_start_method("spawn", force=True)
+    with multiprocessing.Pool(
+        processes=4, initializer=init_worker
+    ) as pool:  # NOTE: keep processes= small due to memory usage
+        pool.map(functools.partial(build_dataset, census_uri, args.obs_columns, args.tokenizer_kwargs), tasks)
 
     logger.info(subprocess.run(["du", "-sh", args.output_dir], stdout=subprocess.PIPE).stdout.decode().strip())
 
@@ -110,7 +99,11 @@ def parse_arguments(argv):
         "-N", type=int, help="further downsample to no more than N examples per distinct value of sampling_column"
     )
     parser.add_argument(
-        "-v", "--census-version", type=str, default="latest", help='Census release to query (default: "latest")'
+        "--tokenizer-kwargs", type=json.loads, default={}, help="additional kwargs to pass to GeneformerTokenizer"
+    )
+    parser.add_argument("--shards", type=int, default=1, help="output dataset shards (default: 1)")
+    parser.add_argument(
+        "-v", "--census-version", type=str, default="stable", help='Census release or URI to query (default: "stable")'
     )
     parser.add_argument("output_dir", type=str, help="output directory (must not already exist)")
 
@@ -148,17 +141,23 @@ def select_cells(census_human, value_filter, percentage_data, sampling_column, N
 
     # annotate cell subclasses
     logger.info("annotating cell subclasses...")
-    mapper = CellSubclassMapper(map_orphans_to_class=True)
-    obs_df["cell_subclass_ontology_term_id"] = obs_df["cell_type_ontology_term_id"].map(
-        # if CellSubclassMapper doesn't find a subclass, just use the cell type itself
-        lambda it: mapper.get_top_high_level_term(it) or it
-    )
-    obs_df["cell_subclass"] = obs_df["cell_subclass_ontology_term_id"].map(lambda it: mapper.get_label_from_id(it))
-    subclass_counts = Counter(obs_df["cell_subclass"])
-    logger.info(
-        f"cell subclasses ({len(subclass_counts)}): {json.dumps(subclass_counts)}"
-        + f" (compare to {len(obs_df['cell_type_ontology_term_id'].unique())} cell_types)"
-    )
+    subclass_counts = None
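+    # best-effort annotation: if the ontology mapping fails, log it and proceed without subclasses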
+    try:
+        mapper = CellSubclassMapper(map_orphans_to_class=True)
+        obs_df["cell_subclass_ontology_term_id"] = obs_df["cell_type_ontology_term_id"].map(
+            # if CellSubclassMapper doesn't find a subclass, just use the cell type itself
+            lambda it: (mapper.get_top_high_level_term(it) or it) if it != "unknown" else it
+        )
+        obs_df["cell_subclass"] = obs_df["cell_subclass_ontology_term_id"].map(
+            lambda it: mapper.get_label_from_id(it) if it != "unknown" else it
+        )
+        subclass_counts = Counter(obs_df["cell_subclass"])
+        logger.info(
+            f"cell subclasses ({len(subclass_counts)}): {json.dumps(subclass_counts)}"
+            + f" (compare to {len(obs_df['cell_type_ontology_term_id'].unique())} cell_types)"
+        )
+    except Exception:
+        logger.exception("failed to annotate cell subclasses")
 
     # further downsample by sampling_column, if requested
     if N:
@@ -168,12 +167,60 @@ def select_cells(census_human, value_filter, percentage_data, sampling_column, N
         obs_df = obs_df.groupby(sampling_column).apply(lambda x: x.sample(min(len(x), N)))
         sampling_counts = Counter(obs_df[sampling_column])
         logger.info(f"after downsampling to at most {N} examples per {sampling_column}: {json.dumps(sampling_counts)}")
-        subclass_counts = Counter(obs_df["cell_subclass"])
-        logger.info(f"downsampled cell subclasses ({len(subclass_counts)}): {json.dumps(subclass_counts)}")
+        if subclass_counts is not None:
+            subclass_counts = Counter(obs_df["cell_subclass"])
+            logger.info(f"downsampled cell subclasses ({len(subclass_counts)}): {json.dumps(subclass_counts)}")
 
     obs_df.set_index("soma_joinid", inplace=True)
     return obs_df
 
 
+worker_soma_context = None
+
+
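+# Pool initializer: give each worker process its own SOMATileDBContext, created once
+# per worker and reused across its tasks.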
+def init_worker():
+    global worker_soma_context
+    worker_soma_context = tiledbsoma.SOMATileDBContext()
+
+
+def build_dataset(census_uri, obs_columns, tokenizer_kwargs, task):
+    """Given obs_df from select_cells (or subset thereof), build the Geneformer dataset and save to output_dir."""
+    obs_df = task[0]
+    output_dir = task[1]
+
+    # open human census
+    with cellxgene_census.open_soma(uri=census_uri, context=worker_soma_context) as census:
+        # use GeneformerTokenizer to build dataset of those cells
+        with GeneformerTokenizer(
+            census["census_data"]["homo_sapiens"],
+            obs_query=tiledbsoma.AxisQuery(coords=(np.array(obs_df.index),)),
+            obs_attributes=[
+                # cell_subclass isn't yet in Census (select_cells() added it to obs_df for us), so
+                # exclude from the experiment axis query
+                it
+                for it in obs_columns
+                if it not in ("cell_subclass", "cell_subclass_ontology_term_id")
+            ],
+            **tokenizer_kwargs,
+        ) as tokenizer:
+            dataset = tokenizer.build()
+
+    # add back cell_subclass from obs_df
+    def add_cell_subclass(it):
+        ans = {}
+        if "cell_subclass_ontology_term_id" in obs_columns:
+            ans["cell_subclass_ontology_term_id"] = obs_df.loc[it["soma_joinid"]]["cell_subclass_ontology_term_id"]
+        if "cell_subclass" in obs_columns:
+            ans["cell_subclass"] = obs_df.loc[it["soma_joinid"]]["cell_subclass"]
+        return ans
+
+    if "cell_subclass" in obs_df and "cell_subclass_ontology_term_id" in obs_df:
+        dataset = dataset.map(add_cell_subclass)
+
+    # save to output_dir
+    dataset.save_to_disk(output_dir)
+    logger.info("saved " + output_dir)
+
+
 if __name__ == "__main__":
     sys.exit(main(sys.argv))
diff --git a/tools/models/geneformer/wdl/finetune_geneformer.wdl b/tools/models/geneformer/wdl/finetune_geneformer.wdl
index 47f9a9dd5..6471fdd20 100644
--- a/tools/models/geneformer/wdl/finetune_geneformer.wdl
+++ b/tools/models/geneformer/wdl/finetune_geneformer.wdl
@@ -32,6 +32,8 @@ task finetune_geneformer {
         cpu: 48
         memory: "160G"
         gpu: true
+        acceleratorType: "nvidia-tesla-a10g"
+        acceleratorCount: 8
         docker: docker
     }
 }
diff --git a/tools/models/geneformer/wdl/generate_embeddings.wdl b/tools/models/geneformer/wdl/generate_embeddings.wdl
index 9f10af87a..c2a6b1914 100644
--- a/tools/models/geneformer/wdl/generate_embeddings.wdl
+++ b/tools/models/geneformer/wdl/generate_embeddings.wdl
@@ -2,25 +2,32 @@ version development
 
 workflow scatter_generate_embeddings {
     input {
-        Directory dataset
+        Array[Directory] dataset_shards
         Directory model
         String output_uri
+        String? model_type
         Int? emb_layer
-        Int parts = 10
+        String? features
 
         String s3_region = "us-west-2"
         String docker
     }
 
+    # work around any tooling that might try to verify pre-existence of the output URI when
+    # launching the workflow:
+    String output_uri2 = sub(output_uri, "s3_//", "s3://")
+
+    # create the output TileDB array
     call init_embeddings_array {
         input:
-        uri = output_uri, s3_region, docker
+        uri = output_uri2, s3_region, docker
     }
 
-    scatter (part in range(parts)) {
+    # generate each shard's embeddings and write them into the above-created array
+    scatter (shard in dataset_shards) {
         call generate_embeddings after init_embeddings_array {
             input:
-            dataset, model, emb_layer, output_uri, s3_region, part, parts, docker
+            dataset = shard, output_uri = output_uri2, model, model_type, emb_layer, features, s3_region, docker
         }
     }
 
@@ -70,11 +77,9 @@ task generate_embeddings {
         String output_uri
         String s3_region
 
+        String model_type = "CellClassifier"
         Int emb_layer = -1  # -1 or 0
-
-        # for scattering over partitions: process only part# of parts
-        Int? part
-        Int parts = 1
+        String features = "soma_joinid,cell_type,cell_type_ontology_term_id,cell_subclass,cell_subclass_ontology_term_id"
 
         String docker
     }
@@ -89,7 +94,7 @@ task generate_embeddings {
         export AWS_DEFAULT_REGION='~{s3_region}'
         export TQDM_MININTERVAL=10
         python3 /census-geneformer/generate-geneformer-embeddings.py \
-            --emb-layer ~{emb_layer} ~{"--part " + part} --parts ~{parts} --batch-size 10 --tiledbsoma \
+            --model-type ~{model_type} --emb-layer ~{emb_layer} --features '~{features}' --batch-size 10 --tiledbsoma \
             '~{model}' '~{dataset}' '~{output_uri}'
     >>>
 
@@ -98,6 +103,8 @@ task generate_embeddings {
         cpu: 8
         memory: "30G"
         gpu: true
+        acceleratorCount: 1
+        acceleratorType: "nvidia-tesla-a10g"
         docker: docker
         # for robustness to sporadic errors e.g.
         # https://github.com/pytorch/pytorch/issues/21819
diff --git a/tools/models/geneformer/wdl/prepare_datasets.wdl b/tools/models/geneformer/wdl/prepare_datasets.wdl
index 2c3b0700e..eeea95c7c 100644
--- a/tools/models/geneformer/wdl/prepare_datasets.wdl
+++ b/tools/models/geneformer/wdl/prepare_datasets.wdl
@@ -6,9 +6,12 @@ task prepare_census_geneformer_dataset {
 
         String value_filter = "is_primary_data==True"
         Array[String] obs_columns = ["soma_joinid", "cell_type", "cell_type_ontology_term_id", "cell_subclass", "cell_subclass_ontology_term_id"]
+        Int percentage_data = 100
         Int N = 0
         String sampling_column = "cell_subclass"
         String census_version = "stable"
+        String tokenizer_kwargs = "{}"
+        Int shards = 1
 
         String docker
     }
@@ -21,8 +24,11 @@ task prepare_census_geneformer_dataset {
         export TQDM_MININTERVAL=10
         python3 /census-geneformer/prepare-census-geneformer-dataset.py \
             -c '~{sep(",",obs_columns)}' \
-            --value-filter '~{value_filter}' -N ~{N} --sampling-column '~{sampling_column}' \
+            --value-filter '~{value_filter}' \
+            -p ~{percentage_data} -N ~{N} --sampling-column '~{sampling_column}' \
             -v ~{census_version} \
+            --tokenizer-kwargs '~{tokenizer_kwargs}' \
+            --shards ~{shards} \
             ~{output_name}
     >>>
 
@@ -33,7 +39,7 @@ task prepare_census_geneformer_dataset {
 
     runtime {
         cpu: 8
-        memory: "90G"
+        memory: "120G"
         docker: docker
     }
 }
diff --git a/tools/models/scvi/scvi-config.yaml b/tools/models/scvi/scvi-config.yaml
index 656d45cd1..b8f017fc3 100644
--- a/tools/models/scvi/scvi-config.yaml
+++ b/tools/models/scvi/scvi-config.yaml
@@ -5,6 +5,8 @@ census:
     null
   obs_query_model: # Required when loading data for model training. Do not change.
     'is_primary_data == True and nnz >= 300'
+  version:
+    "2024-05-20"
 hvg:
   top_n_hvg: 
     8000
@@ -19,7 +21,7 @@ model:
   filename: "scvi.model"
   n_hidden: 512
   n_latent: 50
-  n_layers: 1
+  n_layers: 2
 train:
   max_epochs: 100
   batch_size: 1024
diff --git a/tools/models/scvi/scvi-create-latent-update.py b/tools/models/scvi/scvi-create-latent-update.py
index 5aeaa34b6..0a70e2596 100644
--- a/tools/models/scvi/scvi-create-latent-update.py
+++ b/tools/models/scvi/scvi-create-latent-update.py
@@ -14,12 +14,13 @@
     with open(file) as f:
         config = yaml.safe_load(f)
 
-    census = cellxgene_census.open_soma(census_version="2023-12-15")
-
     census_config = config.get("census")
     experiment_name = census_config.get("organism")
     obs_value_filter = census_config.get("obs_query")
 
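+    # Census release now comes from scvi-config.yaml ("census: version:") rather than being hard-coded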
+    version = census_config.get("version")
+    census = cellxgene_census.open_soma(census_version=version)
+
     hv = pd.read_pickle("hv_genes.pkl")
     hv_idx = hv[hv].index
 
@@ -44,7 +45,7 @@
 
     adata.var.set_index("feature_id", inplace=True)
 
-    idx = query.obs(column_names=["soma_joinid"]).concat().to_pandas().index.to_numpy()
+    idx = query.obs(column_names=["soma_joinid"]).concat().to_pandas().to_numpy()
 
     del census, query, hv, hv_idx
     gc.collect()
diff --git a/tools/models/scvi/scvi-prepare.py b/tools/models/scvi/scvi-prepare.py
index 4c2214629..b66c05833 100644
--- a/tools/models/scvi/scvi-prepare.py
+++ b/tools/models/scvi/scvi-prepare.py
@@ -11,13 +11,14 @@
     with open(file) as f:
         config = yaml.safe_load(f)
 
-    census = cellxgene_census.open_soma(census_version="2023-12-15")
-
     census_config = config.get("census")
     experiment_name = census_config.get("organism")
     obs_query = census_config.get("obs_query")
     obs_query_model = census_config.get("obs_query_model")
 
+    version = census_config.get("version")
+    census = cellxgene_census.open_soma(census_version=version)
+
     if obs_query is None:
         obs_value_filter = obs_query_model
     else: