Make NetCDF file cache handling compatible with dask distributed #2822

Open: wants to merge 25 commits into base: main

Changes from 11 commits

Commits (25)
7f6a8d4
Add test to reproduce GH 2815
gerritholl Jun 14, 2024
6d31c20
make sure distributed client is local
gerritholl Jun 14, 2024
1e26d1a
Start utility function for distributed friendly
gerritholl Jun 14, 2024
be40c5b
Parameterise test and simplify implementation
gerritholl Jun 14, 2024
cbd00f0
Force shape and dtype. First working prototype.
gerritholl Jun 14, 2024
af4ee66
Add group support and speed up tests
gerritholl Jun 20, 2024
dad3b14
Add partial backward-compatibility for file handle
gerritholl Jun 20, 2024
fc58ca4
Respect auto_maskandscale with new caching
gerritholl Jun 20, 2024
09c821a
Remove needless except block
gerritholl Jun 20, 2024
4f9c5ed
Test refactoring
gerritholl Jun 20, 2024
ec76fa6
Broaden test match string for test_filenotfound
gerritholl Jun 20, 2024
06d8811
fix docstring example spelling
gerritholl Jul 24, 2024
aaf91b9
Prevent unexpected type promotion in unit test
gerritholl Jul 24, 2024
a2ad42f
Use block info getting a dd-friendly da
gerritholl Jul 24, 2024
9126bbe
Rename to serialisable and remove group argument
gerritholl Jul 25, 2024
5e576f9
Use wrapper class for auto_maskandscale
gerritholl Jul 25, 2024
63e7507
GB -> US spelling
gerritholl Jul 25, 2024
ea04595
Ensure meta dtype
gerritholl Jul 25, 2024
523671a
Merge branch 'main' into bugfix-2815
gerritholl Jul 25, 2024
fde3896
Fix spelling in test
gerritholl Jul 25, 2024
5b137e8
Clarify docstring
gerritholl Jul 26, 2024
c2b1533
Use cache already in scene creation
gerritholl Jul 26, 2024
9fce5a7
Use helper function rather than subclass
gerritholl Jul 26, 2024
4993b65
restore non-cached group retrieval
gerritholl Jul 26, 2024
7c173e7
Merge branch 'main' into bugfix-2815
gerritholl Aug 23, 2024
56 changes: 39 additions & 17 deletions satpy/readers/netcdf_utils.py
@@ -18,15 +18,15 @@
"""Helpers for reading netcdf-based files."""

import logging
import warnings

import dask.array as da
import netCDF4
import numpy as np
import xarray as xr

from satpy.readers import open_file_or_filename
from satpy.readers.file_handlers import BaseFileHandler
from satpy.readers.utils import np2str
from satpy.readers.utils import get_distributed_friendly_dask_array, np2str
from satpy.utils import get_legacy_chunk_size

LOG = logging.getLogger(__name__)
@@ -85,10 +85,12 @@ class NetCDF4FileHandler(BaseFileHandler):
xarray_kwargs (dict): Additional arguments to `xarray.open_dataset`
cache_var_size (int): Cache variables smaller than this size.
cache_handle (bool): Keep files open for lifetime of filehandler.
Uses xarray.backends.CachingFileManager, which uses a least
recently used cache.
djhoese marked this conversation as resolved.

"""

file_handle = None
manager = None

def __init__(self, filename, filename_info, filetype_info,
auto_maskandscale=False, xarray_kwargs=None,
@@ -99,6 +101,7 @@ def __init__(self, filename, filename_info, filetype_info,
self.file_content = {}
self.cached_file_content = {}
self._use_h5netcdf = False
self._auto_maskandscale = auto_maskandscale
try:
file_handle = self._get_file_handle()
except IOError:
@@ -118,13 +121,24 @@ def __init__(self, filename, filename_info, filetype_info,
self.collect_cache_vars(cache_var_size)

if cache_handle:
self.file_handle = file_handle
self.manager = xr.backends.CachingFileManager(
netCDF4.Dataset, self.filename, mode="r")
else:
file_handle.close()

def _get_file_handle(self):
return netCDF4.Dataset(self.filename, "r")

@property
def file_handle(self):
"""Backward-compatible way for file handle caching."""
warnings.warn(
"attribute .file_handle is deprecated, use .manager instead",
DeprecationWarning)
if self.manager is None:
return None
return self.manager.acquire()

@staticmethod
def _set_file_handle_auto_maskandscale(file_handle, auto_maskandscale):
if hasattr(file_handle, "set_auto_maskandscale"):
Reviewer comment (Member):

Not that this has to be handled in your PR, but if I remember correctly this Dataset-level set_auto_maskandscale was added to netcdf4-python quite a while ago. It seems error prone and confusing to silently call the method only if it exists and to not log/inform the user that it wasn't used when it was expected. Maybe we should remove this method on the file handler class and always call file_handle.set_auto_maskandscale no matter what. Your wrapper does it already.
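
A minimal sketch of this suggestion, assuming the CachingFileManager-based handle used elsewhere in the PR (the helper name and the file name in the usage comment are illustrative, not part of the diff):

import netCDF4
from xarray.backends import CachingFileManager

def acquire_with_maskandscale(manager, auto_maskandscale):
    """Acquire a dataset handle and always forward auto_maskandscale."""
    handle = manager.acquire()
    # netCDF4.Dataset has provided set_auto_maskandscale for a long time, so the
    # call is made unconditionally rather than guarded by hasattr().
    handle.set_auto_maskandscale(auto_maskandscale)
    return handle

# Hypothetical usage:
# cfm = CachingFileManager(netCDF4.Dataset, "example.nc", mode="r")
# nc = acquire_with_maskandscale(cfm, True)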

@@ -196,11 +210,8 @@ def _get_required_variable_names(listed_variables, variable_name_replacements):

def __del__(self):
"""Delete the file handler."""
if self.file_handle is not None:
try:
self.file_handle.close()
except RuntimeError: # presumably closed already
pass
if self.manager is not None:
self.manager.close()
djhoese marked this conversation as resolved.

def _collect_global_attrs(self, obj):
"""Collect all the global attributes for the provided file object."""
@@ -289,8 +300,8 @@ def _get_variable(self, key, val):
group, key = parts
else:
group = None
if self.file_handle is not None:
val = self._get_var_from_filehandle(group, key)
if self.manager is not None:
val = self._get_var_from_manager(group, key)
else:
val = self._get_var_from_xr(group, key)
return val
@@ -319,18 +330,29 @@ def _get_var_from_xr(self, group, key):
val.load()
return val

def _get_var_from_filehandle(self, group, key):
def _get_var_from_manager(self, group, key):
# Not getting coordinates as this is more work, therefore more
# overhead, and those are not used downstream.

with self.manager.acquire_context() as ds:
if group is not None:
v = ds[group][key]
else:
v = ds[key]
if group is None:
g = self.file_handle
dv = get_distributed_friendly_dask_array(
self.manager, key,
chunks=v.shape, dtype=v.dtype,
auto_maskandscale=self._auto_maskandscale)
else:
g = self.file_handle[group]
v = g[key]
dv = get_distributed_friendly_dask_array(
self.manager, key, group=group,
chunks=v.shape, dtype=v.dtype,
auto_maskandscale=self._auto_maskandscale)
attrs = self._get_object_attrs(v)
x = xr.DataArray(
da.from_array(v), dims=v.dimensions, attrs=attrs,
name=v.name)
dv,
dims=v.dimensions, attrs=attrs, name=v.name)
return x

def __contains__(self, item):
(remaining unchanged lines not shown)
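
Taken together, the netcdf_utils.py changes above replace the raw file_handle attribute with an xarray CachingFileManager. A minimal migration sketch for downstream code, assuming a hypothetical file "example.nc" with a variable "some_var" (neither is taken from the PR):

from satpy.readers.netcdf_utils import NetCDF4FileHandler

fh = NetCDF4FileHandler("example.nc", {}, {}, cache_handle=True)

# Old style: still works for now, but emits a DeprecationWarning.
# nc = fh.file_handle
# data = nc["some_var"][:]

# New style: acquire the handle through the CachingFileManager in a context.
with fh.manager.acquire_context() as nc:
    data = nc["some_var"][:]
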
51 changes: 51 additions & 0 deletions satpy/readers/utils.py
@@ -29,6 +29,7 @@
from shutil import which
from subprocess import PIPE, Popen # nosec

import dask.array as da
import numpy as np
import pyproj
import xarray as xr
@@ -474,3 +475,53 @@
with xr.set_options(keep_attrs=True):
reflectance = reflectance / reflectance.dtype.type(sun_earth_dist * sun_earth_dist)
return reflectance


def get_distributed_friendly_dask_array(manager, varname, chunks, dtype,
group="/", auto_maskandscale=None):
Reviewer comment (Member):

I'm not sure how I feel about this function name. Obviously it makes sense in this PR because it solves this specific problem, but it feels like there is a (shorter) more generic name that gets the point across. Another thing is that distributed_friendly is mentioned here, but that friendliness is a side effect of the "serializable" nature of the way you're accessing the data here, right? get_serializable_dask_array?

I don't feel super strongly about this, but the name was distracting to me so I thought I'd say something.

Reply from the author (Member Author):

Renamed to get_serializable_dask_array.

"""Construct a dask array from a variable for dask distributed.

When we construct a dask array using da.array and use that to create an
xarray dataarray, the result is not serializable and dask graphs using
this dataarray cannot be computed when the dask distributed scheduler
is in use. To circumvent this problem, xarray provides the
CachingFileManager. See GH#2815 for more information.

The variable should have at least one dimension.

Example::

>>> import netCDF4
>>> from xarray.backends import CachingFileManager
>>> cfm = CachingFileManager(netCDF4.Dataset, fn, mode="r")
>>> arr = get_distributed_friendly_dask_array(cfm, "my_var", chunks=(10,), dtype="float64")
gerritholl marked this conversation as resolved.

Args:
manager (xarray.backends.CachingFileManager):
Instance of xarray.backends.CachingFileManager encapsulating the
dataset to be read.
Reviewer comment (Member):

We should check how the docs render this. If the argument type isn't "clickable" to go directly to the xarray docs for the CFM then we could wrap the mention of it in the description with:

:class:`xarray.backends.CachingFileManager`

Reply from the author (Member Author):

The argument type was already clickable, but in the description it was not. I have now made it clickable in both cases (screenshot from local doc production):

[Screenshot omitted: Bildschirmfoto_2024-07-25_11-10-06]

varname (str):
Name of the variable.
chunks (tuple):
Chunks to use when creating the dask array.
dtype (dtype):
What dtype to use.
group (str):
What group to read the variable from.
auto_maskandscale (bool, optional):
Apply automatic masking and scaling. This will only
work if CachingFileManager.acquire returns a handle with a
set_auto_maskandscale method, as is the case for
netCDF4.Dataset.
"""
def get_chunk():
Reviewer comment (Member):

The chunks is never used here. The current calling from the file handler is accessing the full shape of the variable so this is fine, but only for now. I mean that map_blocks will only ever call this function once. However, if you added a block_info kwarg to the function signature or whatever the map_blocks special keyword argument is, then you could change [:] to access a specific subset of the NetCDF file variable and only do a partial load. This should improve performance a lot (I think 🤞) if it was actually used in the file handler.

Reply from the author (Member Author):

> The chunks is never used here.

Hm? I'm passing chunks=chunks when I call da.map_blocks. What do you mean, it is never used? Do you mean I could be using chunk-location and num-chunks from a block_info dictionary passed to get_chunk?

> The current calling from the file handler is accessing the full shape of the variable so this is fine, but only for now. I mean that map_blocks will only ever call this function once. However, if you added a block_info kwarg to the function signature or whatever the map_blocks special keyword argument is, then you could change [:] to access a specific subset of the NetCDF file variable and only do a partial load. This should improve performance a lot (I think 🤞) if it was actually used in the file handler.

I will try to wrap my head around this ☺

Reviewer comment (Member):

Yes I think that's what I'm saying. I think the result of get_chunk() right now is broken for any chunk size other than the full shape of the array because you never do any slicing of the NetCDF variable inside get_chunk(). So, if you had a full array of 100x100 and a chunk size of 50x50, then map_blocks would call this function 4 times ((0-50, 0-50), (0-50, 50-100), (50-100, 0-50), (50-100, 50-100)). BUT each call would return the full variable 100x100. So I think this would be a case where the dask array would say "yeah, I have shape 100x100", but then once you computed it you'd get a 200x200 array back.

Reply from the author (Member Author):

Fixed it now, I think.

with manager.acquire_context() as nc:
if auto_maskandscale is not None:
nc.set_auto_maskandscale(auto_maskandscale)
return nc["/".join([group, varname])][:]
djhoese marked this conversation as resolved.

return da.map_blocks(
get_chunk,
chunks=chunks,
dtype=dtype,
meta=np.array([]))

Check warning on line 527 in satpy/readers/utils.py
CodeScene Delta Analysis / CodeScene Cloud Delta Analysis (main):
❌ New issue: Excess Number of Function Arguments
get_distributed_friendly_dask_array has 6 arguments, threshold = 4. This function has too many arguments, indicating a lack of encapsulation. Avoid adding more arguments.
gerritholl marked this conversation as resolved.
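
Following up on the block_info review thread above, a minimal sketch of what a per-chunk reader could look like, assuming the renamed get_serializable_dask_array helper; the slicing logic and the usage values are illustrative, not taken verbatim from the PR:

import dask.array as da
import netCDF4
import numpy as np
from xarray.backends import CachingFileManager


def get_serializable_dask_array(manager, varname, chunks, dtype, group=None):
    """Sketch: read each dask block of a NetCDF variable separately."""
    def get_chunk(block_info=None):
        # map_blocks fills block_info; "array-location" holds the (start, stop)
        # range covered by this block in every dimension of the output array.
        location = block_info[None]["array-location"]
        slices = tuple(slice(start, stop) for start, stop in location)
        path = varname if not group else "/".join([group, varname])
        with manager.acquire_context() as nc:
            return nc[path][slices]

    return da.map_blocks(get_chunk, chunks=chunks, dtype=dtype,
                         meta=np.array([], dtype=dtype))


# Hypothetical usage: a file "example.nc" with a 100x100 variable "my_var",
# read as four 50x50 blocks.
# cfm = CachingFileManager(netCDF4.Dataset, "example.nc", mode="r")
# arr = get_serializable_dask_array(cfm, "my_var",
#                                   chunks=((50, 50), (50, 50)), dtype="f8")
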
80 changes: 56 additions & 24 deletions satpy/tests/reader_tests/test_netcdf_utils.py
@@ -18,7 +18,6 @@
"""Module for testing the satpy.readers.netcdf_utils module."""

import os
import unittest

import numpy as np
import pytest
@@ -71,13 +70,15 @@ def get_test_content(self, filename, filename_info, filetype_info):
raise NotImplementedError("Fake File Handler subclass must implement 'get_test_content'")


class TestNetCDF4FileHandler(unittest.TestCase):
class TestNetCDF4FileHandler:
"""Test NetCDF4 File Handler Utility class."""

def setUp(self):
@pytest.fixture()
def dummy_nc_file(self, tmp_path):
"""Create a test NetCDF4 file."""
from netCDF4 import Dataset
with Dataset("test.nc", "w") as nc:
fn = tmp_path / "test.nc"
with Dataset(fn, "w") as nc:
# Create dimensions
nc.createDimension("rows", 10)
nc.createDimension("cols", 100)
@@ -116,17 +117,14 @@ def setUp(self):
d.test_attr_str = "test_string"
d.test_attr_int = 0
d.test_attr_float = 1.2
return fn

def tearDown(self):
"""Remove the previously created test file."""
os.remove("test.nc")

def test_all_basic(self):
def test_all_basic(self, dummy_nc_file):
"""Test everything about the NetCDF4 class."""
import xarray as xr

from satpy.readers.netcdf_utils import NetCDF4FileHandler
file_handler = NetCDF4FileHandler("test.nc", {}, {})
file_handler = NetCDF4FileHandler(dummy_nc_file, {}, {})

assert file_handler["/dimension/rows"] == 10
assert file_handler["/dimension/cols"] == 100
@@ -165,7 +163,7 @@ def test_all_basic(self):
assert file_handler.file_handle is None
assert file_handler["ds2_sc"] == 42

def test_listed_variables(self):
def test_listed_variables(self, dummy_nc_file):
"""Test that only listed variables/attributes area collected."""
from satpy.readers.netcdf_utils import NetCDF4FileHandler

@@ -175,12 +173,12 @@ def test_listed_variables(self):
"attr/test_attr_str",
]
}
file_handler = NetCDF4FileHandler("test.nc", {}, filetype_info)
file_handler = NetCDF4FileHandler(dummy_nc_file, {}, filetype_info)
assert len(file_handler.file_content) == 2
assert "test_group/attr/test_attr_str" in file_handler.file_content
assert "attr/test_attr_str" in file_handler.file_content

def test_listed_variables_with_composing(self):
def test_listed_variables_with_composing(self, dummy_nc_file):
"""Test that composing for listed variables is performed."""
from satpy.readers.netcdf_utils import NetCDF4FileHandler

@@ -199,7 +197,7 @@ def test_listed_variables_with_composing(self):
],
}
}
file_handler = NetCDF4FileHandler("test.nc", {}, filetype_info)
file_handler = NetCDF4FileHandler(dummy_nc_file, {}, filetype_info)
assert len(file_handler.file_content) == 3
assert "test_group/ds1_f/attr/test_attr_str" in file_handler.file_content
assert "test_group/ds1_i/attr/test_attr_str" in file_handler.file_content
@@ -208,10 +206,10 @@ def test_listed_variables_with_composing(self):
assert not any("another_parameter" in var for var in file_handler.file_content)
assert "test_group/attr/test_attr_str" in file_handler.file_content

def test_caching(self):
def test_caching(self, dummy_nc_file):
"""Test that caching works as intended."""
from satpy.readers.netcdf_utils import NetCDF4FileHandler
h = NetCDF4FileHandler("test.nc", {}, {}, cache_var_size=1000,
h = NetCDF4FileHandler(dummy_nc_file, {}, {}, cache_var_size=1000,
cache_handle=True)
assert h.file_handle is not None
assert h.file_handle.isopen()
@@ -226,31 +224,29 @@ def test_caching(self):
np.testing.assert_array_equal(
h["ds2_f"],
np.arange(10. * 100).reshape((10, 100)))
h.__del__()
assert not h.file_handle.isopen()

def test_filenotfound(self):
"""Test that error is raised when file not found."""
from satpy.readers.netcdf_utils import NetCDF4FileHandler

with pytest.raises(IOError, match=".*No such file or directory.*"):
with pytest.raises(IOError, match=".* file .*"):
NetCDF4FileHandler("/thisfiledoesnotexist.nc", {}, {})

def test_get_and_cache_npxr_is_xr(self):
def test_get_and_cache_npxr_is_xr(self, dummy_nc_file):
"""Test that get_and_cache_npxr() returns xr.DataArray."""
import xarray as xr

from satpy.readers.netcdf_utils import NetCDF4FileHandler
file_handler = NetCDF4FileHandler("test.nc", {}, {}, cache_handle=True)
file_handler = NetCDF4FileHandler(dummy_nc_file, {}, {}, cache_handle=True)

data = file_handler.get_and_cache_npxr("test_group/ds1_f")
assert isinstance(data, xr.DataArray)

def test_get_and_cache_npxr_data_is_cached(self):
def test_get_and_cache_npxr_data_is_cached(self, dummy_nc_file):
"""Test that the data are cached when get_and_cache_npxr() is called."""
from satpy.readers.netcdf_utils import NetCDF4FileHandler

file_handler = NetCDF4FileHandler("test.nc", {}, {}, cache_handle=True)
file_handler = NetCDF4FileHandler(dummy_nc_file, {}, {}, cache_handle=True)
data = file_handler.get_and_cache_npxr("test_group/ds1_f")

# Delete the dataset from the file content dict, it should be available from the cache
@@ -264,7 +260,6 @@ class TestNetCDF4FsspecFileHandler:

def test_default_to_netcdf4_lib(self):
"""Test that the NetCDF4 backend is used by default."""
import os
import tempfile

import h5py
@@ -392,3 +387,40 @@ def test_get_data_as_xarray_scalar_h5netcdf(tmp_path):
res = get_data_as_xarray(fid["test_data"])
np.testing.assert_equal(res.data, np.array(data))
assert res.attrs == NC_ATTRS


@pytest.fixture()
def dummy_nc(tmp_path):
"""Fixture to create a dummy NetCDF file and return its path."""
import xarray as xr

fn = tmp_path / "sjaunja.nc"
ds = xr.Dataset(data_vars={"kaitum": (["x"], np.arange(10))})
ds.to_netcdf(fn)
return fn


def test_caching_distributed(dummy_nc):
"""Test that the distributed scheduler works with file handle caching.

This is a test for GitHub issue 2815.
"""
from dask.distributed import Client

from satpy.readers.netcdf_utils import NetCDF4FileHandler

fh = NetCDF4FileHandler(dummy_nc, {}, {}, cache_handle=True)

def doubler(x):
return x * 2

# As documented in GH issue 2815, using dask distributed with the file
# handle cacher might fail in non-trivial ways, such as giving incorrect
# results. Testing map_blocks is one way to reproduce the problem
# reliably, even though the problem also manifests itself (in different
# ways) without map_blocks.


with Client():
dask_doubler = fh["kaitum"].map_blocks(doubler)
dask_doubler.compute()