WIP Domain variables cell measures coordinates #1129

Draft: wants to merge 2 commits into base: develop
5 changes: 2 additions & 3 deletions compliance_checker/cf/cf_1_6.py
@@ -2252,7 +2252,6 @@ def check_multi_dimensional_coords(self, ds):
# IS THIS EVEN NEEDED ANYMORE?
# ***************
def check_grid_coordinates(self, ds):
# def _check_grid_coordinates(self, ds):
"""
5.6 When the coordinate variables for a horizontal grid are not
longitude and latitude, it is required that the true latitude and
@@ -2266,7 +2265,7 @@ def check_grid_coordinates(self, ds):
latitudes = cfutil.get_true_latitude_variables(ds)
longitudes = cfutil.get_true_longitude_variables(ds)

check_featues = [
check_features = [
"2d-regular-grid",
"2d-static-grid",
"3d-regular-grid",
@@ -2287,7 +2286,7 @@ def check_grid_coordinates(self, ds):
# dimensions
dimensions = set(ds.variables[variable].dimensions)
# If it's not a grid, skip it
if cfutil.guess_feature_type(ds, variable) not in check_featues:
if cfutil.guess_feature_type(ds, variable) not in check_features:
continue
has_coords = TestCtx(BaseCheck.HIGH, self.section_titles["5.6"])

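For orientation, a minimal sketch of the kind of file check_grid_coordinates targets: a data variable on projected y/x dimensions whose true latitude/longitude are supplied through the coordinates attribute. This is not taken from the diff; all names ("temp", "y", "x", "lat", "lon") are illustrative, and it assumes the netCDF4 Python API.

from netCDF4 import Dataset

with Dataset("grid_example.nc", "w") as nc:
    nc.createDimension("y", 10)
    nc.createDimension("x", 20)
    lat = nc.createVariable("lat", "f8", ("y", "x"))
    lat.standard_name = "latitude"
    lat.units = "degrees_north"
    lon = nc.createVariable("lon", "f8", ("y", "x"))
    lon.standard_name = "longitude"
    lon.units = "degrees_east"
    temp = nc.createVariable("temp", "f8", ("y", "x"))
    temp.units = "degC"
    # y/x are not themselves latitude/longitude, so section 5.6 requires the
    # true latitude/longitude variables to be named in "coordinates"
    temp.coordinates = "lat lon"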
6 changes: 3 additions & 3 deletions compliance_checker/cf/cf_1_8.py
@@ -169,17 +169,17 @@ def check_geometry(self, ds: Dataset):
results.append(geom_valid.to_result())
continue

node_count = reference_attr_variables(
node_count, node_count_errors = reference_attr_variables(
ds,
getattr(geometry_var, "node_count", None),
)
# multipart lines and polygons only
part_node_count = reference_attr_variables(
part_node_count, part_node_count_errors = reference_attr_variables(
ds,
getattr(geometry_var, "part_node_count", None),
)
# polygons with interior geometry only
interior_ring = reference_attr_variables(
interior_ring, interior_ring_errors = reference_attr_variables(
ds,
getattr(geometry_var, "interior_ring", None),
)
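As context for the attributes being resolved in this hunk, a hedged sketch of a CF geometry container whose node_count, part_node_count, and interior_ring attributes name other variables in the file. The variable names are illustrative, not from the diff.

from netCDF4 import Dataset

nc = Dataset("geometry_example.nc", "w")
# Hypothetical polygon geometry container; each attribute below names another
# variable that reference_attr_variables is expected to resolve.
geom = nc.createVariable("geometry_container", "i4", ())
geom.geometry_type = "polygon"
geom.node_coordinates = "node_x node_y"
geom.node_count = "node_count"            # nodes per geometry
geom.part_node_count = "part_node_count"  # multipart lines/polygons only
geom.interior_ring = "interior_ring"      # polygons with interior rings only
nc.close()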
137 changes: 99 additions & 38 deletions compliance_checker/cf/cf_1_9.py
@@ -112,61 +112,122 @@ def check_domain_variables(self, ds: Dataset):
var
for var in ds.get_variables_by_attributes(
coordinates=lambda c: c is not None,
)
# IMPLICIT CONFORMANCE REQUIRED 1/4
)):
# all variables have a dimensions attribute, but a domain variable should be scalar
if var.dimensions == ()
):
if not ds.dimensions == ():
continue

# IMPLICIT CONFORMANCE REQUIRED 1/4
# Has a dimensions *NetCDF* attribute
try:
dim_nc_attr = var.getncattr("dimensions")
# most variables are unlikely to be domain variables, so don't treat this
# as a failure
except AttributeError:
continue
# IMPLICIT CONFORMANCE REQUIRED 2/4
# The aforementioned dimensions attribute consists of space-separated
# dimension names which must exist in the file
domain_valid = TestCtx(BaseCheck.MEDIUM, self.section_titles["5.8"])
domain_valid.out_of += 1
domain_coord_vars = reference_attr_variables(
domain_valid.out_of += 2
domain_dims, dim_errors = reference_attr_variables(
ds,
domain_var.coordinates,
domain_var.getncattr("dimensions"),
" ",
"dimensions"
)
if dim_errors:
errors_str = ", ".join(dim_errors)
domain_valid.messages.append(
"Could not find the following "
"dimensions referenced in "
"dimensions attribute from "
"domain variable "
f"{domain_var.name}: {errors_str}",
)
else:
domain_valid.score += 1
domain_coord_vars, domain_coord_var_errors = reference_attr_variables(
ds, domain_var.coordinates, " "
)
errors = [
maybe_error.name
for maybe_error in domain_coord_vars
if isinstance(maybe_error, VariableReferenceError)
]
if errors:
errors_str = ", ".join(errors)
if domain_coord_var_errors:
errors_str = ", ".join(var_errors)
domain_valid.messages.append(
"Could not find the following "
"variables referenced in "
"coordinates attribute from "
"domain variable "
f"{domain_var.name}: {errors_str}",
)
else:
domain_valid.score += 1

coord_var_dim_failures = []
is_ragged_array_repr = is_dataset_valid_ragged_array_repr_featureType(ds, getattr(ds, "featureType", ""))
if is_ragged_array_repr:
for var in domain_coord_vars:
domain_valid.out_of += 1
ragged_array_dim_variable, ragged_attr_name = resolve_ragged_array_dimension(ds)
dim_name = getattr(ragged_array_dim_variable, ragged_attr_name)
referenced_dim = reference_attr_variables(ds, dim_name, reference_type="dimension")
if isinstance(referenced_dim, VariableReferenceError):
domain_valid.messages.append(
f"Found ragged array variable {ragged_array_dim_variable.name}, "
f"but dimension {dim_name} referenced from {attr_name} does not exist in file"
)

coord_var_reference_failures = []
for coord_var in reference_attr_variables(ds, dim_name, " "):
if isinstance(coord_var, VariableReferenceError):
coord_var_reference_failures.append(coord_var)
domain_valid.messages.append(
f"Referenced coordinate variable {coord_var} does not exist in file")
continue
# TODO: check for label variables
if not util.get_possible_label_variable_dimensions(coord_var).issubset({referenced_dim}):
domain_valid.messages.append(
f"Coordinate variable {coord_var.name} has dimensions which "
f"are not a subset of the ragged array dimension {dim_name}"
)
else:
domain_valid.score += 1
else:
long_name = getattr(domain_var, "long_name", None)
if long_name is None or not isinstance(long_name, str):
domain_valid.messages.append(
f"For domain variable {domain_var.name} "
f"it is recommended that attribute long_name be present and a string",
)
results.append(domain_valid.to_result())
continue
appendix_a_not_recommended_attrs = []
for attr_name in domain_var.ncattrs():
if (
attr_name in self.appendix_a
and "D" not in self.appendix_a[attr_name]["attr_loc"]
):
appendix_a_not_recommended_attrs.append(attr_name)

if appendix_a_not_recommended_attrs:
domain_valid.messages.append(
f"The following attributes appear in variable {domain_var.name} "
"and CF Appendix A, but are not for use in domain variables: "
f"{appendix_a_not_recommended_attrs}",
)
for coord_var in domain_coord_vars:
domain_valid.out_of += 1
if not util.get_possible_label_variable_dimensions(coord_var).issubset(domain_dims):
domain_valid.messages.append(
f"Dimensions of coordinate variable {coord_var.name} are not "
f"a subset of the domain variable's dimensions"
)
else:
domain_valid.score += 1

# not in conformance docs, but mentioned as recommended anyways
long_name = getattr(domain_var, "long_name", None)
if long_name is None or not isinstance(long_name, str):
domain_valid.messages.append(
f"For domain variable {domain_var.name} "
f"it is recommended that attribute long_name be present and a string",
)
results.append(domain_valid.to_result())
continue
appendix_a_not_recommended_attrs = []
for attr_name in domain_var.ncattrs():
if attr_name in self.appendix_a and "D" not in self.appendix_a[attr_name]["attr_loc"]:
appendix_a_not_recommended_attrs.append(attr_name)

if appendix_a_not_recommended_attrs:
domain_valid.messages.append(
f"The following attributes appear in variable {domain_var.name} "
"and CF Appendix A, but are not for use in domain variables: "
f"{appendix_a_not_recommended_attrs}",
)

# no errors occurred
domain_valid.score += 1

# no errors occurred
domain_valid.score += 1

# IMPLEMENTATION CONFORMANCE 5.8 REQUIRED 4/4
# the dimensions of variables named by the domain variable's cell_measures attribute
# must themselves be a subset of the dimensions named by the domain variable's
# dimensions NetCDF attribute
if hasattr(domain_var, "cell_measures"):
cell_measures_var_names = regex.findall(
r"\b(?:area|volume):\s+(\w+)",
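To make the target of this check concrete, a minimal sketch of a CF 1.9 domain variable carrying the dimensions, coordinates, and cell_measures attributes that check_domain_variables inspects. This is assumed from the CF 1.9 text rather than taken from the diff, and every name below is hypothetical.

from netCDF4 import Dataset

nc = Dataset("domain_example.nc", "w")
nc.createDimension("time", 4)
nc.createDimension("y", 10)
nc.createDimension("x", 20)
nc.createVariable("lat", "f8", ("y", "x"))
nc.createVariable("lon", "f8", ("y", "x"))
nc.createVariable("cell_area", "f8", ("y", "x"))

# A domain variable is scalar; its domain is described entirely by attributes.
domain = nc.createVariable("domain", "c", ())
domain.long_name = "domain describing the model grid"
# "dimensions" is a NetCDF attribute here (hence setncattr), and every
# space-separated name in it must be a dimension of the file
domain.setncattr("dimensions", "time y x")
# every variable named here must exist and span a subset of those dimensions
domain.coordinates = "lat lon"
# likewise for measure variables named in cell_measures
domain.cell_measures = "area: cell_area"
nc.close()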
30 changes: 17 additions & 13 deletions compliance_checker/cf/util.py
@@ -419,7 +419,6 @@ def get_possible_label_variable_dimensions(variable: Variable) -> Tuple[int, ...
return variable.dimensions[:-1]
return variable.dimensions


@lru_cache()
def maybe_lateral_reference_variable_or_dimension(group: Union[Group, Dataset],
name: str,
@@ -461,26 +460,31 @@ def can_lateral_search(name):
# perform lateral search if we aren't in the root group



def reference_attr_variables(
dataset: Dataset,
attributes_string: str,
split_by: str = None,
reference_type: str = "variable",
group: Union[Group, Dataset] = None
):
"""
Attempts to reference variables in the string, optionally splitting by
a string
"""
references, errors = [], []
if attributes_string is None:
return None, None
elif split_by is None:
return dataset.variables.get(
attributes_string,
VariableReferenceError(attributes_string),
)
else:
string_proc = attributes_string.split(split_by)
return [
dataset.variables.get(var_name, VariableReferenceError(var_name))
for var_name in string_proc
]
elif reference_type == "variable":
if split_by is None:
# keep the same (reference, error) shape as the split case so callers
# can always unpack two values
if attributes_string in dataset.variables:
return dataset.variables[attributes_string], None
return None, VariableReferenceError(attributes_string)
else:
string_proc = attributes_string.split(split_by)
for var_name in string_proc:
if var_name in dataset.variables:
references.append(dataset.variables[var_name])
else:
errors.append(VariableReferenceError(var_name))
return references, errors
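A short usage sketch of the two-value convention this change moves to. It assumes a dataset ds in which lat and lon exist but elevation does not; the names are illustrative.

# references holds the netCDF4 Variable objects that resolved; errors holds a
# VariableReferenceError for each name that did not.
references, errors = reference_attr_variables(ds, "lat lon elevation", " ")
for err in errors:
    print(f"could not resolve {err.name}")  # -> could not resolve elevation
for var in references:
    print(var.name, var.dimensions)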
53 changes: 27 additions & 26 deletions compliance_checker/cfutil.py
@@ -9,6 +9,7 @@
from functools import lru_cache, partial
from importlib.resources import files

from netCDF4 import Dataset
from cf_units import Unit

_UNITLESS_DB = None
@@ -241,6 +242,7 @@ def is_geophysical(nc, variable):
return True


@lru_cache()
def get_coordinate_variables(nc):
"""
Returns a list of variable names that identify as coordinate variables.
@@ -260,6 +262,7 @@ def get_coordinate_variables(nc):
coord_vars = []
for dimension in nc.dimensions:
if dimension in nc.variables:
# TODO: Handle string coordinate variables
if nc.variables[dimension].dimensions == (dimension,):
coord_vars.append(dimension)
return coord_vars
@@ -491,11 +494,6 @@ def get_latitude_variables(nc):
for variable in nc.get_variables_by_attributes(standard_name="latitude"):
latitude_variables.append(variable.name)

# Then axis
for variable in nc.get_variables_by_attributes(axis="Y"):
if variable.name not in latitude_variables:
latitude_variables.append(variable.name)

check_fn = partial(
attr_membership,
value_set=VALID_LAT_UNITS,
@@ -557,11 +555,6 @@ def get_longitude_variables(nc):
for variable in nc.get_variables_by_attributes(standard_name="longitude"):
longitude_variables.append(variable.name)

# Then axis
for variable in nc.get_variables_by_attributes(axis="X"):
if variable.name not in longitude_variables:
longitude_variables.append(variable.name)

check_fn = partial(
attr_membership,
value_set=VALID_LON_UNITS,
@@ -905,29 +898,26 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str):
array structure. See inline comments.
"""

# regardless of if compound type or not, must have a cf_role
# variable; if compound, this will be the first part of the
# feature_type as we'll have to search for one with profile_id
# regardless; if single feature type, cf_role must match that
# featureType
cf_role_vars = nc.get_variables_by_attributes(cf_role=lambda x: x is not None)
is_compound = False
if feature_type.lower() in {"timeseriesprofile", "trajectoryprofile"}:
is_compound = True
ftype = feature_type.lower().split("profile")[0]
if len(cf_role_vars) > 2:
return False
else:
ftype = feature_type.lower()
if len(cf_role_vars) > 1:
return False

# regardless of if compound type or not, must have a cf_role
# variable; if compound, this will be the first part of the
# feature_type as we'll have to search for one with profile_id
# regardless; if single feature type, cf_role must match that
# featureType
cf_role_vars = nc.get_variables_by_attributes(cf_role=lambda x: x is not None)
if (
not cf_role_vars
or (len(cf_role_vars) > 1 and not is_compound)
or (len(cf_role_vars) > 2 and is_compound)
):
return False
cf_role_var = nc.get_variables_by_attributes(cf_role=f"{ftype}_id")[0]
if (
cf_role_var.cf_role.split("_id")[0].lower() != ftype
): # if cf_role_var returns None, this should raise an error?
# if cf_role_var returns None, this should raise an error?
if cf_role_var.cf_role.split("_id")[0].lower() != ftype:
return False

# now we'll check dimensions for singular feature types and/or
Expand All @@ -936,7 +926,7 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str):
if len(instance_dim) != 1:
return False

# Now we check for the presence of an index variable or count variable;
# Now we check for the presence of an index variable or count variable;
# NOTE that if no index or count variables exist, we can't determine with
# certainty that this is invalid, because single-instance data sets
# are valid representations of the ragged array structures. Instead,
@@ -1022,6 +1012,17 @@

return True

def resolve_ragged_array_dimension(ds: Dataset):
# TODO: put in loop?
ragged_variable = ds.get_variables_by_attributes(sample_dimension=lambda s: isinstance(s, str))
if ragged_variable:
ragged_type = "sample_dimension"
else:
ragged_variable = ds.get_variables_by_attributes(instance_dimension=lambda s: isinstance(s, str))
ragged_type = "instance_dimension"
# get_variables_by_attributes returns a list, so test for emptiness rather than None
if not ragged_variable:
raise ValueError("Could not find a ragged array related variable")
# callers unpack the resolved variable and the attribute name that marked it
return ragged_variable[0], ragged_type


def is_variable_valid_ragged_array_repr_featureType(nc, variable: str) -> bool:
"""
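For context, a sketch of the contiguous ragged-array layout that resolve_ragged_array_dimension is intended to detect; the indexed representation would carry instance_dimension instead of sample_dimension. All names are illustrative.

from netCDF4 import Dataset

nc = Dataset("ragged_example.nc", "w")
nc.createDimension("profile", 3)
nc.createDimension("obs", 30)
row_size = nc.createVariable("rowSize", "i4", ("profile",))
# sample_dimension marks the contiguous ragged representation; its value must
# name the element dimension of the file
row_size.sample_dimension = "obs"
profile_id = nc.createVariable("profile_id", "i4", ("profile",))
profile_id.cf_role = "profile_id"
# resolve_ragged_array_dimension(nc) is then expected to return
# (row_size, "sample_dimension")
nc.close()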