WIP Domain variables cell measures coordinates #1129

Draft: wants to merge 2 commits into base: develop
5 changes: 2 additions & 3 deletions compliance_checker/cf/cf_1_6.py
@@ -2252,7 +2252,6 @@ def check_multi_dimensional_coords(self, ds):
# IS THIS EVEN NEEDED ANYMORE?
# ***************
def check_grid_coordinates(self, ds):
# def _check_grid_coordinates(self, ds):
"""
5.6 When the coordinate variables for a horizontal grid are not
longitude and latitude, it is required that the true latitude and
@@ -2266,7 +2265,7 @@ def check_grid_coordinates(self, ds):
latitudes = cfutil.get_true_latitude_variables(ds)
longitudes = cfutil.get_true_longitude_variables(ds)

check_featues = [
check_features = [
"2d-regular-grid",
"2d-static-grid",
"3d-regular-grid",
@@ -2287,7 +2286,7 @@ def check_grid_coordinates(self, ds):
# dimensions
dimensions = set(ds.variables[variable].dimensions)
# If it's not a grid, skip it
if cfutil.guess_feature_type(ds, variable) not in check_featues:
if cfutil.guess_feature_type(ds, variable) not in check_features:
continue
has_coords = TestCtx(BaseCheck.HIGH, self.section_titles["5.6"])

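For orientation, a minimal sketch of the kind of file check_grid_coordinates targets: a data variable on projected y/x dimensions whose true latitude/longitude are supplied through the coordinates attribute. This is not taken from the diff; all names ("temp", "y", "x", "lat", "lon") are illustrative, and it assumes the netCDF4 Python API.

from netCDF4 import Dataset

with Dataset("grid_example.nc", "w") as nc:
    nc.createDimension("y", 10)
    nc.createDimension("x", 20)
    lat = nc.createVariable("lat", "f8", ("y", "x"))
    lat.standard_name = "latitude"
    lat.units = "degrees_north"
    lon = nc.createVariable("lon", "f8", ("y", "x"))
    lon.standard_name = "longitude"
    lon.units = "degrees_east"
    temp = nc.createVariable("temp", "f8", ("y", "x"))
    temp.units = "degC"
    # y/x are not themselves latitude/longitude, so section 5.6 requires the
    # true latitude/longitude variables to be named in "coordinates"
    temp.coordinates = "lat lon"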
6 changes: 3 additions & 3 deletions compliance_checker/cf/cf_1_8.py
@@ -169,17 +169,17 @@ def check_geometry(self, ds: Dataset):
results.append(geom_valid.to_result())
continue

node_count = reference_attr_variables(
node_count, node_count_errors = reference_attr_variables(
ds,
getattr(geometry_var, "node_count", None),
)
# multipart lines and polygons only
part_node_count = reference_attr_variables(
part_node_count, part_node_count_errors = reference_attr_variables(
ds,
getattr(geometry_var, "part_node_count", None),
)
# polygons with interior geometry only
interior_ring = reference_attr_variables(
interior_ring, interior_ring_errors = reference_attr_variables(
ds,
getattr(geometry_var, "interior_ring", None),
)
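As context for the attributes being resolved in this hunk, a hedged sketch of a CF geometry container whose node_count, part_node_count, and interior_ring attributes name other variables in the file. The variable names are illustrative, not from the diff.

from netCDF4 import Dataset

nc = Dataset("geometry_example.nc", "w")
# Hypothetical polygon geometry container; each attribute below names another
# variable that reference_attr_variables is expected to resolve.
geom = nc.createVariable("geometry_container", "i4", ())
geom.geometry_type = "polygon"
geom.node_coordinates = "node_x node_y"
geom.node_count = "node_count"            # nodes per geometry
geom.part_node_count = "part_node_count"  # multipart lines/polygons only
geom.interior_ring = "interior_ring"      # polygons with interior rings only
nc.close()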
137 changes: 99 additions & 38 deletions compliance_checker/cf/cf_1_9.py
@@ -112,61 +112,122 @@ def check_domain_variables(self, ds: Dataset):
var
for var in ds.get_variables_by_attributes(
coordinates=lambda c: c is not None,
)
# IMPLICIT CONFORMANCE REQUIRED 1/4
)):
# all variables have a dimensions attribute, but a domain variable should be scalar
if var.dimensions == ()
):
if not ds.dimensions == ():
continue

# IMPLICIT CONFORMANCE REQUIRED 1/4
# Has a dimensions *NetCDF* attribute
try:
dim_nc_attr = var.getncattr("dimensions")
# most variables are unlikely to be domain variables, so don't treat this
# as a failure
except AttributeError:
continue
# IMPLICIT CONFORMANCE REQUIRED 2/4
# The aforementioned dimensions attribute consists of space-separated
# dimension names which must exist in the file
domain_valid = TestCtx(BaseCheck.MEDIUM, self.section_titles["5.8"])
domain_valid.out_of += 1
domain_coord_vars = reference_attr_variables(
domain_valid.out_of += 2
domain_dims, dim_errors = reference_attr_variables(
ds,
domain_var.coordinates,
domain_var.getncattr("dimensions"),
" ",
"dimensions"
)
if dim_errors:
errors_str = ", ".join(dim_errors)
domain_valid.messages.append(
"Could not find the following "
"dimensions referenced in "
"dimensions attribute from "
"domain variable "
f"{domain_var.name}: {errors_str}",
)
else:
domain_valid.score += 1
domain_coord_vars, domain_coord_var_errors = reference_attr_variables(
ds, domain_var.coordinates, " "
)
errors = [
maybe_error.name
for maybe_error in domain_coord_vars
if isinstance(maybe_error, VariableReferenceError)
]
if errors:
errors_str = ", ".join(errors)
if domain_coord_var_errors:
errors_str = ", ".join(var_errors)
domain_valid.messages.append(
"Could not find the following "
"variables referenced in "
"coordinates attribute from "
"domain variable "
f"{domain_var.name}: {errors_str}",
)
else:
domain_valid.score += 1

coord_var_dim_failures = []
is_ragged_array_repr = is_dataset_valid_ragged_array_repr_featureType(ds, getattr(ds, "featureType", ""))
if is_ragged_array_repr:
for var in domain_coord_vars:
domain_valid.out_of += 1
ragged_array_dim_variable, ragged_attr_name = resolve_ragged_array_dimension(ds)
dim_name = getattr(ragged_array_dim_variable, ragged_attr_name)
referenced_dim = reference_attr_variables(ds, dim_name, reference_type="dimension")
if isinstance(referenced_dim, VariableReferenceError):
domain_valid.messages.append(
f"Found ragged array variable {ragged_array_dim_variable.name}, "
f"but dimension {dim_name} referenced from {attr_name} does not exist in file"
)

coord_var_reference_failures = []
for coord_var in reference_attr_variables(ds, dim_name, " "):
if isinstance(coord_var, VariableReferenceError):
coord_var_reference_failures.append(coord_var)
domain_valid.messages.append(
f"Referenced coordinate variable {coord_var} does not exist in file")
continue
# TODO: check for label variables
if not util.get_possible_label_variable_dimensions(coord_var).issubset({referenced_dim}):
domain_valid.messages.append(
f"Coordinate variable {coord_var.name} has dimensions which "
f"are not a subset of the ragged array dimension {dim_name}"
)
else:
domain_valid.score += 1
else:
long_name = getattr(domain_var, "long_name", None)
if long_name is None or not isinstance(long_name, str):
domain_valid.messages.append(
f"For domain variable {domain_var.name} "
f"it is recommended that attribute long_name be present and a string",
)
results.append(domain_valid.to_result())
continue
appendix_a_not_recommended_attrs = []
for attr_name in domain_var.ncattrs():
if (
attr_name in self.appendix_a
and "D" not in self.appendix_a[attr_name]["attr_loc"]
):
appendix_a_not_recommended_attrs.append(attr_name)

if appendix_a_not_recommended_attrs:
domain_valid.messages.append(
f"The following attributes appear in variable {domain_var.name} "
"and CF Appendix A, but are not for use in domain variables: "
f"{appendix_a_not_recommended_attrs}",
)
for coord_var in domain_coord_vars:
domain_valid.out_of += 1
if not util.get_possible_label_variable_dimensions(coord_var).issubset(domain_dims):
domain_valid.messages.append(
f"Dimensions of coordinate variable {coord_var.name} are not "
f"a subset of the domain variable's dimensions"
)
else:
domain_valid.score += 1

# not in conformance docs, but mentioned as recommended anyways
long_name = getattr(domain_var, "long_name", None)
if long_name is None or not isinstance(long_name, str):
domain_valid.messages.append(
f"For domain variable {domain_var.name} "
f"it is recommended that attribute long_name be present and a string",
)
results.append(domain_valid.to_result())
continue
appendix_a_not_recommended_attrs = []
for attr_name in domain_var.ncattrs():
if attr_name in self.appendix_a and "D" not in self.appendix_a[attr_name]["attr_loc"]:
appendix_a_not_recommended_attrs.append(attr_name)

if appendix_a_not_recommended_attrs:
domain_valid.messages.append(
f"The following attributes appear in variable {domain_var.name} "
"and CF Appendix A, but are not for use in domain variables: "
f"{appendix_a_not_recommended_attrs}",
)

# no errors occurred
domain_valid.score += 1

# no errors occurred
domain_valid.score += 1

# IMPLEMENTATION CONFORMANCE 5.8 REQUIRED 4/4
# the dimensions of variables named by the domain variable's cell_measures attribute
# must themselves be a subset of the dimensions named by the domain variable's
# dimensions NetCDF attribute
if hasattr(domain_var, "cell_measures"):
cell_measures_var_names = regex.findall(
r"\b(?:area|volume):\s+(\w+)",
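To make the target of this check concrete, a minimal sketch of a CF 1.9 domain variable carrying the dimensions, coordinates, and cell_measures attributes that check_domain_variables inspects. This is assumed from the CF 1.9 text rather than taken from the diff, and every name below is hypothetical.

from netCDF4 import Dataset

nc = Dataset("domain_example.nc", "w")
nc.createDimension("time", 4)
nc.createDimension("y", 10)
nc.createDimension("x", 20)
nc.createVariable("lat", "f8", ("y", "x"))
nc.createVariable("lon", "f8", ("y", "x"))
nc.createVariable("cell_area", "f8", ("y", "x"))

# A domain variable is scalar; its domain is described entirely by attributes.
domain = nc.createVariable("domain", "c", ())
domain.long_name = "domain describing the model grid"
# "dimensions" is a NetCDF attribute here (hence setncattr), and every
# space-separated name in it must be a dimension of the file
domain.setncattr("dimensions", "time y x")
# every variable named here must exist and span a subset of those dimensions
domain.coordinates = "lat lon"
# likewise for measure variables named in cell_measures
domain.cell_measures = "area: cell_area"
nc.close()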
30 changes: 17 additions & 13 deletions compliance_checker/cf/util.py
@@ -419,7 +419,6 @@ def get_possible_label_variable_dimensions(variable: Variable) -> Tuple[int, ...
return variable.dimensions[:-1]
return variable.dimensions


@lru_cache()
def maybe_lateral_reference_variable_or_dimension(group: Union[Group, Dataset],
name: str,
@@ -461,26 +460,31 @@ def can_lateral_search(name):
# perform lateral search if we aren't in the root group



def reference_attr_variables(
dataset: Dataset,
attributes_string: str,
split_by: str = None,
reference_type: str = "variable",
group: Union[Group, Dataset] = None
):
"""
Attempts to reference variables in the string, optionally splitting by
a string
"""
references, errors = [], []
if attributes_string is None:
return None, None
elif split_by is None:
return dataset.variables.get(
attributes_string,
VariableReferenceError(attributes_string),
)
else:
string_proc = attributes_string.split(split_by)
return [
dataset.variables.get(var_name, VariableReferenceError(var_name))
for var_name in string_proc
]
elif reference_type == "variable":
if split_by is None:
# keep the same (reference, error) shape as the split case so callers
# can always unpack two values
if attributes_string in dataset.variables:
return dataset.variables[attributes_string], None
return None, VariableReferenceError(attributes_string)
else:
string_proc = attributes_string.split(split_by)
for var_name in string_proc:
if var_name in dataset.variables:
references.append(dataset.variables[var_name])
else:
errors.append(VariableReferenceError(var_name))
return references, errors
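A short usage sketch of the two-value convention this change moves to. It assumes a dataset ds in which lat and lon exist but elevation does not; the names are illustrative.

# references holds the netCDF4 Variable objects that resolved; errors holds a
# VariableReferenceError for each name that did not.
references, errors = reference_attr_variables(ds, "lat lon elevation", " ")
for err in errors:
    print(f"could not resolve {err.name}")  # -> could not resolve elevation
for var in references:
    print(var.name, var.dimensions)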
53 changes: 27 additions & 26 deletions compliance_checker/cfutil.py
@@ -9,6 +9,7 @@
from functools import lru_cache, partial
from importlib.resources import files

from netCDF4 import Dataset
from cf_units import Unit

_UNITLESS_DB = None
@@ -241,6 +242,7 @@ def is_geophysical(nc, variable):
return True


@lru_cache()
def get_coordinate_variables(nc):
"""
Returns a list of variable names that identify as coordinate variables.
@@ -260,6 +262,7 @@ def get_coordinate_variables(nc):
coord_vars = []
for dimension in nc.dimensions:
if dimension in nc.variables:
# TODO: Handle string coordinate variables
if nc.variables[dimension].dimensions == (dimension,):
coord_vars.append(dimension)
return coord_vars
@@ -491,11 +494,6 @@ def get_latitude_variables(nc):
for variable in nc.get_variables_by_attributes(standard_name="latitude"):
latitude_variables.append(variable.name)

# Then axis
for variable in nc.get_variables_by_attributes(axis="Y"):
if variable.name not in latitude_variables:
latitude_variables.append(variable.name)

check_fn = partial(
attr_membership,
value_set=VALID_LAT_UNITS,
@@ -557,11 +555,6 @@ def get_longitude_variables(nc):
for variable in nc.get_variables_by_attributes(standard_name="longitude"):
longitude_variables.append(variable.name)

# Then axis
for variable in nc.get_variables_by_attributes(axis="X"):
if variable.name not in longitude_variables:
longitude_variables.append(variable.name)

check_fn = partial(
attr_membership,
value_set=VALID_LON_UNITS,
@@ -905,29 +898,26 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str):
array structure. See inline comments.
"""

# regardless of if compound type or not, must have a cf_role
# variable; if compound, this will be the first part of the
# feature_type as we'll have to search for one with profile_id
# regardless; if single feature type, cf_role must match that
# featureType
cf_role_vars = nc.get_variables_by_attributes(cf_role=lambda x: x is not None)
is_compound = False
if feature_type.lower() in {"timeseriesprofile", "trajectoryprofile"}:
is_compound = True
ftype = feature_type.lower().split("profile")[0]
if len(cf_role_vars) > 2:
return False
else:
ftype = feature_type.lower()
if len(cf_role_vars) > 1:
return False

# regardless of if compound type or not, must have a cf_role
# variable; if compound, this will be the first part of the
# feature_type as we'll have to search for one with profile_id
# regardless; if single feature type, cf_role must match that
# featureType
cf_role_vars = nc.get_variables_by_attributes(cf_role=lambda x: x is not None)
if (
not cf_role_vars
or (len(cf_role_vars) > 1 and not is_compound)
or (len(cf_role_vars) > 2 and is_compound)
):
return False
cf_role_var = nc.get_variables_by_attributes(cf_role=f"{ftype}_id")[0]
if (
cf_role_var.cf_role.split("_id")[0].lower() != ftype
): # if cf_role_var returns None, this should raise an error?
# if cf_role_var returns None, this should raise an error?
if cf_role_var.cf_role.split("_id")[0].lower() != ftype:
return False

# now we'll check dimensions for singular feature types and/or
Expand All @@ -936,7 +926,7 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str):
if len(instance_dim) != 1:
return False

# Now we check for the presence of an index variable or count variable;
# Now we check for the presence of an index variable or count variable;
# NOTE that if no index or count variables exist, we can't determine with
# certainty that this is invalid, because single-instance data sets
# are valid representations of the ragged array structures. Instead,
@@ -1022,6 +1012,17 @@

return True

def resolve_ragged_array_dimension(ds: Dataset):
# TODO: put in loop?
ragged_variable = ds.get_variables_by_attributes(sample_dimension=lambda s: isinstance(s, str))
if ragged_variable:
ragged_type = "sample_dimension"
else:
ragged_variable = ds.get_variables_by_attributes(instance_dimension=lambda s: isinstance(s, str))
ragged_type = "instance_dimension"
# get_variables_by_attributes returns a list, so test for emptiness rather than None
if not ragged_variable:
raise ValueError("Could not find a ragged array related variable")
# callers unpack the resolved variable and the attribute name that marked it
return ragged_variable[0], ragged_type


def is_variable_valid_ragged_array_repr_featureType(nc, variable: str) -> bool:
"""
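For context, a sketch of the contiguous ragged-array layout that resolve_ragged_array_dimension is intended to detect; the indexed representation would carry instance_dimension instead of sample_dimension. All names are illustrative.

from netCDF4 import Dataset

nc = Dataset("ragged_example.nc", "w")
nc.createDimension("profile", 3)
nc.createDimension("obs", 30)
row_size = nc.createVariable("rowSize", "i4", ("profile",))
# sample_dimension marks the contiguous ragged representation; its value must
# name the element dimension of the file
row_size.sample_dimension = "obs"
profile_id = nc.createVariable("profile_id", "i4", ("profile",))
profile_id.cf_role = "profile_id"
# resolve_ragged_array_dimension(nc) is then expected to return
# (row_size, "sample_dimension")
nc.close()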