Skip to content

Commit

Permalink
Change Dataset.dims to Dataset.sizes to address Xarray FutureWarning
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite authored and mergify[bot] committed Dec 18, 2023
1 parent 2ab47b5 commit a755269
Show file tree
Hide file tree
Showing 31 changed files with 97 additions and 97 deletions.
4 changes: 2 additions & 2 deletions sgkit/display.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def truncate(ds: xr.Dataset, max_sizes: Mapping[Hashable, int]) -> xr.Dataset:
"""
sel = dict()
for dim, size in max_sizes.items():
if ds.dims[dim] <= size:
if ds.sizes[dim] <= size:
# No truncation required
pass
else:
Expand Down Expand Up @@ -194,7 +194,7 @@ def display_genotypes(
ds_calls = set_index_if_unique(ds_calls, "variants", variant_index)
# convert call genotypes to strings
calls = ds_calls["call_genotype"].values
max_chars = max(2, len(str(ds.dims["alleles"] - 1)))
max_chars = max(2, len(str(ds.sizes["alleles"] - 1)))
if "call_genotype_phased" in ds_calls:
phased = ds_calls["call_genotype_phased"].values
else:
Expand Down
4 changes: 2 additions & 2 deletions sgkit/io/bgen/bgen_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,8 +486,8 @@ def rechunk_bgen(
if isinstance(output, Path):
output = str(output)

chunk_length = min(chunk_length, ds.dims["variants"])
chunk_width = min(chunk_width, ds.dims["samples"])
chunk_length = min(chunk_length, ds.sizes["variants"])
chunk_width = min(chunk_width, ds.sizes["samples"])

if pack:
ds = pack_variables(ds)
Expand Down
4 changes: 2 additions & 2 deletions sgkit/io/plink/plink_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,9 @@ def write_plink(
raise ValueError(
"Either `path` or all 3 of `{bed,bim,fam}_path` must be specified but not both"
)
if "ploidy" in ds.dims and ds.dims["ploidy"] != 2:
if "ploidy" in ds.sizes and ds.sizes["ploidy"] != 2:
raise ValueError("write_plink only works for diploid genotypes")
if "alleles" in ds.dims and ds.dims["alleles"] != 2:
if "alleles" in ds.sizes and ds.sizes["alleles"] != 2:
raise ValueError("write_plink only works for biallelic genotypes")

if path:
Expand Down
4 changes: 2 additions & 2 deletions sgkit/io/vcf/vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,8 +571,8 @@ def vcf_to_zarr_sequential(

if first_variants_chunk:
# limit chunk width to actual number of samples seen in first chunk
if ds.dims["samples"] > 0:
chunk_width = min(chunk_width, ds.dims["samples"])
if ds.sizes["samples"] > 0:
chunk_width = min(chunk_width, ds.sizes["samples"])

# ensure that booleans are not stored as int8 by xarray https://github.com/pydata/xarray/issues/4386
for var in ds.data_vars:
Expand Down
6 changes: 3 additions & 3 deletions sgkit/io/vcf/vcf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def write_vcf(

print(vcf_header, end="", file=output)

if input.dims["variants"] == 0:
if input.sizes["variants"] == 0:
return

header_info_fields = _info_fields(vcf_header)
Expand All @@ -174,8 +174,8 @@ def dataset_chunk_to_vcf(

ds = ds.load() # load dataset chunk into memory

n_variants = ds.dims["variants"] # number of variants in this chunk
n_samples = ds.dims["samples"] # number of samples in whole dataset
n_variants = ds.sizes["variants"] # number of variants in this chunk
n_samples = ds.sizes["samples"] # number of samples in whole dataset

# fixed fields

Expand Down
4 changes: 2 additions & 2 deletions sgkit/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,8 @@ def create_genotype_dosage_dataset(

def num_contigs(ds: xr.Dataset) -> ArrayLike:
"""Return the number of contigs in a dataset."""
if DIM_CONTIG in ds.dims:
return ds.dims[DIM_CONTIG]
if DIM_CONTIG in ds.sizes:
return ds.sizes[DIM_CONTIG]
else:
return len(ds.attrs["contigs"])

Expand Down
18 changes: 9 additions & 9 deletions sgkit/stats/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def count_call_alleles(
from .aggregation_numba_fns import count_alleles

variables.validate(ds, {call_genotype: variables.call_genotype_spec})
n_alleles = ds.dims["alleles"]
n_alleles = ds.sizes["alleles"]
G = da.asarray(ds[call_genotype])
shape = (G.chunks[0], G.chunks[1], n_alleles)
# use numpy array to avoid dask task dependencies between chunks
Expand Down Expand Up @@ -170,8 +170,8 @@ def count_variant_alleles(
from .aggregation_numba_fns import count_alleles

variables.validate(ds, {call_genotype: variables.call_genotype_spec})
n_alleles = ds.dims["alleles"]
n_variant = ds.dims["variants"]
n_alleles = ds.sizes["alleles"]
n_variant = ds.sizes["variants"]
G = da.asarray(ds[call_genotype]).reshape((n_variant, -1))
shape = (G.chunks[0], n_alleles)
# use uint64 dummy array to return uint64 counts array
Expand Down Expand Up @@ -227,7 +227,7 @@ def count_cohort_alleles(
>>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)
>>> # Divide samples into two cohorts
>>> ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.dims["samples"] // 2), dims="samples")
>>> ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.sizes["samples"] // 2), dims="samples")
>>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
samples S0 S1 S2 S3
variants
Expand Down Expand Up @@ -364,8 +364,8 @@ def count_variant_genotypes(
mixed_ploidy = ds[call_genotype].attrs.get("mixed_ploidy", False)
if mixed_ploidy:
raise ValueError("Mixed-ploidy dataset")
ploidy = ds.dims["ploidy"]
n_alleles = ds.dims["alleles"]
ploidy = ds.sizes["ploidy"]
n_alleles = ds.sizes["alleles"]
n_genotypes = _comb_with_replacement(n_alleles, ploidy)
G = da.asarray(ds[call_genotype].data)
N = np.empty(n_genotypes, np.uint64)
Expand Down Expand Up @@ -432,8 +432,8 @@ def genotype_coords(
"""
from .conversion_numba_fns import _comb_with_replacement, _index_as_genotype

n_alleles = ds.dims["alleles"]
ploidy = ds.dims["ploidy"]
n_alleles = ds.sizes["alleles"]
ploidy = ds.sizes["ploidy"]
n_genotypes = _comb_with_replacement(n_alleles, ploidy)
max_chars = len(str(n_alleles - 1))
# dummy variable for ploidy dim also specifies output dtype
Expand Down Expand Up @@ -553,7 +553,7 @@ def cohort_allele_frequencies(
>>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)
>>> # Divide samples into two cohorts
>>> ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.dims["samples"] // 2), dims="samples")
>>> ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.sizes["samples"] // 2), dims="samples")
>>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
samples S0 S1 S2 S3
variants
Expand Down
2 changes: 1 addition & 1 deletion sgkit/stats/association.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def gwas_linear_regression(

if len(covariates) == 0:
if add_intercept:
X = da.ones((ds.dims["samples"], 1), dtype=np.float32)
X = da.ones((ds.sizes["samples"], 1), dtype=np.float32)
else:
raise ValueError("add_intercept must be True if no covariates specified")
else:
Expand Down
6 changes: 3 additions & 3 deletions sgkit/stats/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def convert_call_to_index(
raise ValueError("Mixed-ploidy dataset")
G = da.asarray(ds[call_genotype].data)
shape = G.chunks[0:2]
if ds.dims.get("alleles") == 2: # default to general case
if ds.sizes.get("alleles") == 2: # default to general case
X = da.map_blocks(
biallelic_genotype_call_index,
G,
Expand Down Expand Up @@ -169,10 +169,10 @@ def convert_probability_to_call(
variables.validate(
ds, {call_genotype_probability: variables.call_genotype_probability_spec}
)
if ds.dims["genotypes"] != 3:
if ds.sizes["genotypes"] != 3:
raise NotImplementedError(
f"Hard call conversion only supported for diploid, biallelic genotypes; "
f"num genotypes in provided probabilities array = {ds.dims['genotypes']}."
f"num genotypes in provided probabilities array = {ds.sizes['genotypes']}."
)
GP = da.asarray(ds[call_genotype_probability])
# Remove chunking in genotypes dimension, if present
Expand Down
8 changes: 4 additions & 4 deletions sgkit/stats/grm.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def genomic_relationship(
[1, 0, 0],
[1, 1, 2]], dtype=uint8)
>>> # use sample population frequency as ancestral frequency
>>> ds["sample_frequency"] = ds.call_dosage.mean(dim="samples") / ds.dims["ploidy"]
>>> ds["sample_frequency"] = ds.call_dosage.mean(dim="samples") / ds.sizes["ploidy"]
>>> ds = sg.genomic_relationship(ds, ancestral_frequency="sample_frequency")
>>> ds.stat_genomic_relationship.values # doctest: +NORMALIZE_WHITESPACE
array([[ 0.93617021, -0.21276596, -0.72340426],
Expand Down Expand Up @@ -208,7 +208,7 @@ def genomic_relationship(
[ 2., 2., 0., 0.]])
>>> ds["sample_frequency"] = ds.call_dosage.mean(
... dim="samples", skipna=True
... ) / ds.dims["ploidy"]
... ) / ds.sizes["ploidy"]
>>> ds = sg.genomic_relationship(
... ds, ancestral_frequency="sample_frequency", skipna=True
... )
Expand Down Expand Up @@ -249,7 +249,7 @@ def genomic_relationship(
[2. , 2. , 0. , 0. ]])
>>> ds["sample_frequency"] = ds.call_dosage.mean(
... dim="samples", skipna=True
... ) / ds.dims["ploidy"]
... ) / ds.sizes["ploidy"]
>>> ds = sg.genomic_relationship(
... ds,
... call_dosage="call_dosage_imputed",
Expand Down Expand Up @@ -293,7 +293,7 @@ def genomic_relationship(

estimator = estimator or EST_VAN_RADEN
# TODO: raise on mixed ploidy
ploidy = ploidy or ds.dims.get("ploidy")
ploidy = ploidy or ds.sizes.get("ploidy")
if ploidy is None:
raise ValueError("Ploidy must be specified when the ploidy dimension is absent")
dosage = da.array(ds[call_dosage].data)
Expand Down
4 changes: 2 additions & 2 deletions sgkit/stats/hwe.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,15 +201,15 @@ def hardy_weinberg_test(
“A Note on Exact Tests of Hardy-Weinberg Equilibrium.” American Journal of
Human Genetics 76 (5): 887–93.
"""
ploidy = ploidy or ds.dims.get("ploidy")
ploidy = ploidy or ds.sizes.get("ploidy")
if not ploidy:
raise ValueError(
"`ploidy` parameter must be set when not present as dataset dimension."
)
if ploidy != 2:
raise NotImplementedError("HWE test only implemented for diploid genotypes")

alleles = alleles or ds.dims.get("alleles")
alleles = alleles or ds.sizes.get("alleles")
if not alleles:
raise ValueError(
"`alleles` parameter must be set when not present as dataset dimension."
Expand Down
4 changes: 2 additions & 2 deletions sgkit/stats/ld.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,7 @@ def ld_prune(
>>> import numpy as np
>>> import sgkit as sg
>>> ds = sg.simulate_genotype_call_dataset(n_variant=10, n_sample=4)
>>> ds.dims["variants"]
>>> ds.sizes["variants"]
10
>>> # Calculate dosage
Expand All @@ -466,7 +466,7 @@ def ld_prune(
>>> ds = sg.window_by_variant(ds, size=5)
>>> pruned_ds = sg.ld_prune(ds)
>>> pruned_ds.dims["variants"]
>>> pruned_ds.sizes["variants"]
6
"""
ldm = ld_matrix(ds, dosage=dosage, threshold=threshold, variant_score=variant_score)
Expand Down
4 changes: 2 additions & 2 deletions sgkit/stats/pc_relate.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,9 @@ def pc_relate(
"""
if maf <= 0.0 or maf >= 1.0:
raise ValueError("MAF must be between (0.0, 1.0)")
if "ploidy" in ds.dims and ds.dims["ploidy"] != 2:
if "ploidy" in ds.sizes and ds.sizes["ploidy"] != 2:
raise ValueError("PC Relate only works for diploid genotypes")
if "alleles" in ds.dims and ds.dims["alleles"] != 2:
if "alleles" in ds.sizes and ds.sizes["alleles"] != 2:
raise ValueError("PC Relate only works for biallelic genotypes")
variables.validate(
ds,
Expand Down
4 changes: 2 additions & 2 deletions sgkit/stats/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@ def pca_est(
) -> BaseEstimator:
"""Create PCA estimator"""
if ploidy is None:
if "ploidy" not in ds.dims:
if "ploidy" not in ds.sizes:
raise ValueError(
"`ploidy` must be specified explicitly when not present in dataset dimensions"
)
ploidy = ds.dims["ploidy"]
ploidy = ds.sizes["ploidy"]
scaler = scaler or "patterson"
if isinstance(scaler, str):
if scaler != "patterson":
Expand Down
16 changes: 8 additions & 8 deletions sgkit/stats/pedigree.py
Original file line number Diff line number Diff line change
Expand Up @@ -1049,7 +1049,7 @@ def pedigree_kinship(
)
if method == "diploid":
# check ploidy dimension and assume diploid if it's absent
if ds.dims.get("ploidy", 2) != 2:
if ds.sizes.get("ploidy", 2) != 2:
raise ValueError("Dataset is not diploid")
if founder_kinship is None:
func = da.gufunc(
Expand Down Expand Up @@ -1552,9 +1552,9 @@ def pedigree_inbreeding(
parent = da.asarray(ds[parent].data, chunks=ds[parent].shape)
if method == "diploid":
# check ploidy dimension and assume diploid if it's absent
if ds.dims.get("ploidy", 2) != 2:
if ds.sizes.get("ploidy", 2) != 2:
raise ValueError("Dataset is not diploid")
if ds.dims["parents"] != 2:
if ds.sizes["parents"] != 2:
raise ValueError("The parents dimension must be length 2")
tau = da.ones_like(parent, int)
lambda_ = da.zeros_like(parent, float)
Expand Down Expand Up @@ -1893,9 +1893,9 @@ def pedigree_inverse_kinship(
parent = ds[parent].data
if method == "diploid":
# check ploidy dimension and assume diploid if it's absent
if ds.dims.get("ploidy", 2) != 2:
if ds.sizes.get("ploidy", 2) != 2:
raise ValueError("Dataset is not diploid")
if ds.dims["parents"] != 2:
if ds.sizes["parents"] != 2:
raise ValueError("The parents dimension must be length 2")
tau = da.ones_like(parent, int)
lambda_ = da.zeros_like(parent, float)
Expand Down Expand Up @@ -2128,9 +2128,9 @@ def pedigree_sel(
idx |= (depth >= 0) & (depth <= descendant_depth)
keep = ds.samples.values[idx]
selection = {"samples": keep}
if sel_samples_0 and ("samples_0" in ds.dims):
if sel_samples_0 and ("samples_0" in ds.sizes):
selection["samples_0"] = keep
if sel_samples_1 and ("samples_1" in ds.dims):
if sel_samples_1 and ("samples_1" in ds.sizes):
selection["samples_1"] = keep
new_ds = ds.sel(selection)
if update_parent_id:
Expand Down Expand Up @@ -2241,7 +2241,7 @@ def pedigree_contribution(
parent = da.asarray(ds[parent].data, chunks=ds[parent].shape)
n_sample, n_parent = parent.shape
if method == "even":
if bool(ds.dims.get("ploidy", 2) % 2):
if bool(ds.sizes.get("ploidy", 2) % 2):
raise ValueError("The 'even' method requires an even-ploidy dataset")
if n_parent != 2:
raise ValueError(
Expand Down
Loading

0 comments on commit a755269

Please sign in to comment.