Skip to content

Commit

Permalink
Merge branch 'main' into ebezzi/scib-metrics-scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
ebezzi committed Jul 9, 2024
2 parents 8c9aac1 + 55d0856 commit 3f55e66
Show file tree
Hide file tree
Showing 72 changed files with 1,815 additions and 691 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/docsite-build-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ on:
workflow_dispatch: # Used to make post-release docfixes
permissions:
contents: write

env:
CELLXGENE_CENSUS_USERAGENT: "CZI-test"

jobs:
build-and-deploy:
concurrency: ci-${{ github.ref }}
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/full-unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ on:
default: ""
type: string

env:
CELLXGENE_CENSUS_USERAGENT: "CZI-test"

jobs:
py_unit_tests:
runs-on: single-cell-1tb-runner
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/lts-compat-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ on:
schedule:
- cron: "30 1 * * *"
workflow_dispatch: # used for debugging or manual validation


env:
CELLXGENE_CENSUS_USERAGENT: "CZI-test"

jobs:
python-compat-check:
name: Python LTS compatibility check
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/profiler.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
name: Profiler

env:
CELLXGENE_CENSUS_USERAGENT: "CZI-test"

on:
pull_request:
paths:
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/py-dependency-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ on:
- cron: "30 1 * * *"
workflow_dispatch: # used for debugging or manual validation

env:
CELLXGENE_CENSUS_USERAGENT: "CZI-test"

jobs:
python-dependency-check:
name: python-dependency-check
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/py-formatting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ on:
push:
branches: [main]

env:
CELLXGENE_CENSUS_USERAGENT: "CZI-test"

jobs:
pre_commit_checks:
name: pre-commit checks
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/py-unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

env:
CELLXGENE_CENSUS_USERAGENT: "CZI-test"

jobs:
unit_tests_python_api:
strategy:
Expand Down Expand Up @@ -41,6 +44,9 @@ jobs:
pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython
GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
pip install -e './api/python/cellxgene_census/[experimental]'
- name: Install Geneformer (python >=3.10 only)
run: pip install git+https://huggingface.co/ctheodoris/Geneformer@471eefc
if: matrix.python-version != '3.8' && matrix.python-version != '3.9'
- name: Report Dependency Versions
run: pip list
- name: Test with pytest (API, main tests)
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/r-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ on:
push:
branches: [main]

env:
CELLXGENE_CENSUS_USERAGENT: "CZI-test"

jobs:
build:
strategy:
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/r-dependency-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ on:
- cron: "30 1 * * *"
workflow_dispatch: # used for debugging or manual validation

env:
CELLXGENE_CENSUS_USERAGENT: "CZI-test"

jobs:
r-dependency-check:
name: r-dependency-check
Expand Down
14 changes: 4 additions & 10 deletions api/python/cellxgene_census/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,13 @@ import cellxgene_census

with cellxgene_census.open_soma() as census:

# Reads SOMADataFrame as a slice
cell_metadata = census["census_data"]["homo_sapiens"].obs.read(
cell_metadata = cellxgene_census.get_obs(
census,
"homo_sapiens",
value_filter = "sex == 'female' and cell_type in ['microglial cell', 'neuron']",
column_names = ["assay", "cell_type", "tissue", "tissue_general", "suspension_type", "disease"]
)

# Concatenates results to pyarrow.Table
cell_metadata = cell_metadata.concat()

# Converts to pandas.DataFrame
cell_metadata = cell_metadata.to_pandas()

print(cell_metadata)
cell_metadata
```

The output is a `pandas.DataFrame` with over 600K cells meeting our query criteria and the selected columns:
Expand Down
2 changes: 1 addition & 1 deletion api/python/cellxgene_census/scripts/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ twine
coverage
nbqa
transformers[torch]
git+https://huggingface.co/ctheodoris/Geneformer@8df5dc1
owlready2
proxy.py
30 changes: 26 additions & 4 deletions api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"""

from typing import Literal, Optional, Sequence
from warnings import warn

import anndata
import pandas as pd
Expand Down Expand Up @@ -38,6 +39,8 @@ def get_anndata(
column_names: Optional[soma.AxisColumnNames] = None,
obs_embeddings: Optional[Sequence[str]] = (),
var_embeddings: Optional[Sequence[str]] = (),
obs_column_names: Optional[Sequence[str]] = None,
var_column_names: Optional[Sequence[str]] = None,
) -> anndata.AnnData:
"""Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query,
and return it as an :class:`anndata.AnnData` object.
Expand Down Expand Up @@ -65,8 +68,6 @@ def get_anndata(
var_coords:
Coordinates for the ``var`` axis, which is indexed by the ``soma_joinid`` value.
May be an ``int``, a list of ``int``, or a slice. The default, ``None``, selects all.
column_names:
Columns to fetch for ``obs`` and ``var`` dataframes.
obsm_layers:
Additional obsm layers to read and return in the ``obsm`` slot.
obsp_layers:
Expand All @@ -83,6 +84,10 @@ def get_anndata(
Additional embeddings to be returned as part of the ``varm`` slot.
Use :func:`get_all_available_embeddings` to retrieve available embeddings
for this Census version and organism.
obs_column_names:
Columns to fetch for ``obs`` dataframe.
var_column_names:
Columns to fetch for ``var`` dataframe.
Returns:
An :class:`anndata.AnnData` object containing the census slice.
Expand All @@ -93,7 +98,7 @@ def get_anndata(
Examples:
>>> get_anndata(census, "Mus musculus", obs_value_filter="tissue_general in ['brain', 'lung']")
>>> get_anndata(census, "Homo sapiens", column_names={"obs": ["tissue"]})
>>> get_anndata(census, "Homo sapiens", obs_column_names=["tissue"])
>>> get_anndata(census, "Homo sapiens", obs_coords=slice(0, 1000))
"""
Expand All @@ -107,14 +112,31 @@ def get_anndata(
if varm_layers and var_embeddings and set(varm_layers) & set(var_embeddings):
raise ValueError("Cannot request both `varm_layers` and `var_embeddings` for the same embedding name")

# Backwards compat for old column_names argument
if column_names is not None:
if obs_column_names is not None or var_column_names is not None:
raise ValueError(
"Both the deprecated 'column_names' argument and its replacements were used. Please use 'obs_column_names' and 'var_column_names' only."
)
else:
warn(
"The argument `column_names` is deprecated and will be removed in a future release. Please use `obs_column_names` and `var_column_names` instead.",
FutureWarning,
stacklevel=2,
)
if "obs" in column_names:
obs_column_names = column_names["obs"]
if "var" in column_names:
var_column_names = column_names["var"]

with exp.axis_query(
measurement_name,
obs_query=soma.AxisQuery(value_filter=obs_value_filter, coords=obs_coords),
var_query=soma.AxisQuery(value_filter=var_value_filter, coords=var_coords),
) as query:
adata = query.to_anndata(
X_name=X_name,
column_names=column_names,
column_names={"obs": obs_column_names, "var": var_column_names},
X_layers=X_layers,
obsm_layers=obsm_layers,
varm_layers=varm_layers,
Expand Down
14 changes: 10 additions & 4 deletions api/python/cellxgene_census/src/cellxgene_census/_open.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,14 @@
_get_census_mirrors,
get_census_version_description,
)
from ._util import _uri_join
from ._util import _uri_join, _user_agent

DEFAULT_CENSUS_VERSION = "stable"

# Keyword arguments unpacked into `s3fs.S3FileSystem(...)` when a source H5AD is downloaded.
# NOTE(review): `anon=True` presumably selects unauthenticated access to the public bucket,
# and `cache_regions=True` presumably caches bucket-region lookups — confirm against the s3fs docs.
DEFAULT_S3FS_KWARGS = {
"anon": True,
"cache_regions": True,
}
DEFAULT_TILEDB_CONFIGURATION: Dict[str, Any] = {
# https://docs.tiledb.com/main/how-to/configuration#configuration-parameters
"py.init_buffer_bytes": 1 * 1024**3,
Expand Down Expand Up @@ -120,7 +124,9 @@ def get_default_soma_context(tiledb_config: Optional[Dict[str, Any]] = None) ->
Lifecycle:
experimental
"""
tiledb_config = dict(DEFAULT_TILEDB_CONFIGURATION, **(tiledb_config or {}))
tiledb_config = dict(
DEFAULT_TILEDB_CONFIGURATION, **{"vfs.s3.custom_headers.User-Agent": _user_agent()}, **(tiledb_config or {})
)
return soma.options.SOMATileDBContext().replace(tiledb_config=tiledb_config)


Expand Down Expand Up @@ -343,8 +349,8 @@ def download_source_h5ad(
assert protocol == "s3"

fs = s3fs.S3FileSystem(
anon=True,
cache_regions=True,
config_kwargs={"user_agent": _user_agent()},
**DEFAULT_S3FS_KWARGS,
)
fs.get_file(
locator["uri"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@

import typing
from collections import OrderedDict
from typing import Dict, Literal, Optional, Union, cast
from typing import Any, Dict, Literal, Optional, Union, cast

import requests
from typing_extensions import NotRequired, TypedDict

from cellxgene_census._util import _user_agent

"""
The following types describe the expected directory of Census builds, used
to bootstrap all data location requests.
Expand Down Expand Up @@ -350,10 +352,10 @@ def get_census_version_directory(
}
}
"""
response = requests.get(CELL_CENSUS_RELEASE_DIRECTORY_URL)
response = requests.get(CELL_CENSUS_RELEASE_DIRECTORY_URL, headers={"User-Agent": _user_agent()})
response.raise_for_status()

directory: CensusDirectory = cast(CensusDirectory, response.json())
directory: dict[str, str | dict[str, Any]] = response.json()
directory_out: CensusDirectory = {}
aliases: typing.Set[CensusVersionName] = set()

Expand All @@ -379,6 +381,11 @@ def get_census_version_directory(
if not isinstance(directory_value, dict):
continue

# Filter fields
directory_value = {
k: directory_value[k] for k in CensusVersionDescription.__annotations__ if k in directory_value
}

# filter by release flags
census_version_description = cast(CensusVersionDescription, directory_value)
release_flags = cast(ReleaseFlags, {"lts": lts, "retracted": retracted})
Expand Down Expand Up @@ -425,6 +432,6 @@ def get_census_mirror_directory() -> Dict[CensusMirrorName, CensusMirror]:


def _get_census_mirrors() -> CensusMirrors:
response = requests.get(CELL_CENSUS_MIRRORS_DIRECTORY_URL)
response = requests.get(CELL_CENSUS_MIRRORS_DIRECTORY_URL, headers={"User-Agent": _user_agent()})
response.raise_for_status()
return cast(CensusMirrors, response.json())
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""This module defines a plugin class that logs each request to a logfile.
This class needs to be importable by the proxy server which runs in a separate process.
See the user agent tests for usage.
"""

import json
import traceback
from pathlib import Path

import proxy
from proxy.common.flag import flags

# Register an extra command-line flag on proxy.py's shared flag parser. The plugin
# class in this module reads the parsed value back as `self.flags.request_log_file`.
flags.add_argument(
"--request-log-file",
type=str,
default="",
help="Where to log the requests to.",
)


class RequestLoggerPlugin(proxy.http.proxy.HttpProxyBasePlugin):  # type: ignore
    """Proxy plugin that appends one JSON record per client request to a log file.

    The log file path comes from the ``--request-log-file`` flag, read here as
    ``self.flags.request_log_file``. Each record is written as a single line of
    JSON with the keys ``method`` and ``url`` and, when the request has headers,
    ``headers`` (a mapping of lower-cased header name to header value).
    """

    def handle_client_request(self, request: proxy.http.parser.HttpParser) -> proxy.http.parser.HttpParser:
        """Log ``request`` to the request log file and return it unchanged.

        Args:
            request: The parsed client request handed to the plugin by the proxy.

        Returns:
            The same ``request`` object, unmodified.

        Raises:
            Exception: Any error raised while building or writing the record is
                printed and then re-raised.
        """
        # If anything fails in here, it just fails to respond
        try:
            with Path(self.flags.request_log_file).open("a") as f:
                record = {
                    "method": request.method.decode(),
                    "url": str(request._url),
                }

                if request.headers:
                    # Each value is unpacked as a (name, value) pair of bytes; the mapping's
                    # own key is not needed, so only the values are iterated.
                    record["headers"] = {
                        name.decode().lower(): value.decode() for name, value in request.headers.values()
                    }
                f.write(f"{json.dumps(record)}\n")
        except Exception as e:
            # Making sure there is some visible output
            print(repr(e))
            # `traceback.print_exception(e)` with a single argument only works on
            # Python >= 3.10; `print_exc()` prints the active exception on every version.
            traceback.print_exc()
            raise
        return request
14 changes: 14 additions & 0 deletions api/python/cellxgene_census/src/cellxgene_census/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

import tiledbsoma as soma

USER_AGENT_ENVVAR = "CELLXGENE_CENSUS_USERAGENT"
"""Environment variable used to add more information into the user-agent."""


def _uri_join(base: str, url: str) -> str:
"""Like urllib.parse.urljoin, but doesn't get confused by s3://."""
Expand Down Expand Up @@ -30,3 +33,14 @@ def _extract_census_version(census: soma.Collection) -> str:
raise ValueError("Unable to extract Census version.") from None

return version


def _user_agent() -> str:
    """Return the User-Agent string identifying this package.

    The string is ``cellxgene-census-python/<version>``. When the environment
    variable named by ``USER_AGENT_ENVVAR`` is set to a non-empty value, that
    value is appended after a single space.
    """
    # NOTE(review): imports are function-local, presumably to avoid a circular
    # import of the package at module load time — confirm before hoisting.
    import os

    import cellxgene_census

    extra = os.environ.get(USER_AGENT_ENVVAR, None)
    if not extra:
        return f"cellxgene-census-python/{cellxgene_census.__version__}"
    return f"cellxgene-census-python/{cellxgene_census.__version__} {extra}"
Loading

0 comments on commit 3f55e66

Please sign in to comment.