Merge pull request C2SM#31 from MeteoSwiss-APN/histograms

Add histograms: - Additional function for histograms in plot.py - Additional entry point for histograms in cli.py - Separated masking from aggregation as we want the data unaggregated for the binning into histograms - Tests for histogram plot and results
leuty · Mar 8, 2023 · 717ebbd · 717ebbd
2 parents 9b82c6b + 6b5198b
commit 717ebbd
Show file tree

Hide file tree

Showing 5 changed files with 609 additions and 43 deletions.
diff --git a/README.md b/README.md
@@ -39,11 +39,13 @@ The main functionalities of this package are:
 * plotting
   * time series visualisation for multiple runs
   * time series visualisation for ensemble data
+  * histograms for value distributions
 
 The functions of the package can be used in scripts or via pre-defined command-line tools. Following command-line tools are provided:
-* `icon-timeseries meanmax`: time series of domain average and domain maximum of a model variable on a given level (domains can be customised and added in `src/resources/domains.yaml`)
+* `icon-timeseries meanmax`: time series of domain average and domain maximum of a model variable on a given level (can display multiple experiments, domains can be customised and added in `src/resources/domains.yaml`)
 * `icon-timeseries quicklook-domain`: map plot of the considered domain
-* `icon-timeseries nearest-neighbour`: time series of the values of a model variable at the grid point closest to the given location
+* `icon-timeseries nearest-neighbour`: time series of the values of a model variable at the grid point closest to the given location (can display multiple experiments)
+* `icon-timeseries histograms`: histograms of the values of a model variable (can display multiple experiments)
 
 In order to use more than one dask worker for the parallelisation, the job needs to be send to the slurm queue (`postproc`). A script for `sbatch` containing an example call for `icon-timeseries meanmax` is provided: `sbatch.sh`
 

diff --git a/src/icon_timeseries/cli.py b/src/icon_timeseries/cli.py
@@ -16,7 +16,9 @@
 from .handle_grid import get_domain
 from .handle_grid import get_grid
 from .plot import plot_domain
+from .plot import plot_histograms
 from .plot import plot_ts_multiple
+from .prepare_data import prepare_masked_da
 from .prepare_data import prepare_meanmax
 from .prepare_data import prepare_nn
 
@@ -175,7 +177,7 @@ def meanmax(
     type=str,
     help="Coordinates (format 'lon,lat') for nearest neighbour lookup.",
 )
-@click.option("--level", default=None, type=int, help="model level index")
+@click.option("--level", default=None, type=float, help="model level values")
 @click.option(
     "--gridfile",
     default=None,
@@ -198,7 +200,7 @@ def meanmax(
 def nearest_neighbour(
     exp: Tuple[Tuple, ...],
     varname: str,
-    level: int | None,
+    level: float | None,
     gridfile: str | None,
     lonlat: str,
     deagg: bool,
@@ -241,6 +243,136 @@ def nearest_neighbour(
     plot_ts_multiple(da_dict, domain=lonlat)
 
 
+@main.command()
+@click.option(
+    "--exp",
+    required=True,
+    type=(str, str),
+    nargs=2,
+    multiple=True,
+    help=(
+        "experiment info. file pattern to files to read, will be expanded to a "
+        "list with glob.glob, and experiment identifier"
+    ),
+)
+@click.option(
+    "--varname", required=True, type=str, help="GRIB shortName of the variable"
+)
+@click.option("--level", default=None, type=float, help="model level value")
+@click.option(
+    "--color",
+    default=None,
+    type=str,
+    multiple=True,
+    help="color to use for experiments in plot, used as specified",
+)
+@click.option(
+    "--gridfile",
+    default=None,
+    type=str,
+    help="ICON grid file, needed for unstructured grid",
+)
+@click.option(
+    "--domain",
+    default="all",
+    type=str,
+    help="domain to consider, please define in domains.yaml",
+)
+@click.option(
+    "--deagg",
+    is_flag=True,
+    help="deagreggation of variable, method is detected from GRIB encoding "
+    "(de-averaging and de-accumulation are currently implemented)",
+)
+@click.option(
+    "--bins",
+    type=(float, float, int),
+    default=(0.1, 100.0, 50),
+    nargs=3,
+    help="bins for histogram, format: min (float) max (float) n_bins (int).",
+)
+@click.option(
+    "--xlog",
+    "xlog",
+    default=False,
+    is_flag=True,
+    type=bool,
+    help="plot on x-logscale with logarithmic bins",
+)
+@click.option(
+    "--ylog",
+    "ylog",
+    default=False,
+    is_flag=True,
+    type=bool,
+    help="plot on y-logscale",
+)
+@click.option(
+    "--dask-workers",
+    "dask_nworkers",
+    default=None,
+    type=int,
+    help="ignored if the script does not run on a post-proc node",
+)
+def histograms(
+    exp: Tuple[Tuple, ...],
+    varname: str,
+    level: float | None,
+    color: str | None,
+    gridfile: str | None,
+    domain: str,
+    deagg: bool,
+    bins: Tuple[float, float, int],
+    xlog: bool,
+    ylog: bool,
+    dask_nworkers: int | None,
+):  # pylint: disable=too-many-arguments, too-many-locals
+    """Read data for a variable from GRIB file(s) and plot the values distribution."""
+    # check dask setup
+    chunks = None
+    if "pp" in os.uname().nodename:
+        logging.info("job is running on %s, dask_nworkers active", os.uname().nodename)
+        logging.info("number of dask workers: %d", dask_nworkers)
+        chunks = {"generalVerticalLayer": 1}
+    elif dask_nworkers and "pp" not in os.uname().nodename:
+        logging.warning(
+            "job is running on %s, dask_nworkers not active", os.uname().nodename
+        )
+        logging.warning("send your job on a post-proc node to activate dask_nworkers")
+        dask_nworkers = None
+
+    # gather data for all experiments
+    da_dict = {}  # type: Dict[str, xr.DataArray]
+    for one_exp in exp:
+        logging.info("Preparing experiment %s", one_exp[1])
+        filelist = glob.glob(one_exp[0])
+        if len(filelist) == 0:
+            logging.warning("file list for %s is empty, skipping...", one_exp[0])
+            continue
+        da_masked = prepare_masked_da(
+            filelist,
+            varname,
+            level,
+            gridfile,
+            domain=domain,
+            deagg=deagg,
+            chunks=chunks,
+            dask_nworkers=dask_nworkers,
+        )
+        da_dict[one_exp[1]] = da_masked.copy()
+
+    # plot the histograms
+    _, _ = plot_histograms(
+        da_dict,
+        domain,
+        min_bin=bins[0],
+        max_bin=bins[1],
+        nbins=bins[2],
+        xlog=xlog,
+        ylog=ylog,
+    )
+
+
 @main.command()
 @click.option(
     "--gridfile",

diff --git a/src/icon_timeseries/plot.py b/src/icon_timeseries/plot.py
@@ -51,7 +51,7 @@ def plot_ts_multiple(
 
     # loop over parameters
     for i, (p_key, p_val) in enumerate(da_dict.items()):
-        e_val = xr.DataArray()
+        e_val: xr.DataArray = xr.DataArray()  # pylint needs to have the loop variable
         exp_count = 0
         if colors is None:
             # Take the color sequence from a colormap
@@ -288,3 +288,108 @@ def plot_domain(
         logging.info("saved figure %s", fname)
 
     return fig
+
+
+# pylint: disable=too-many-arguments, too-many-locals
+def plot_histograms(
+    da_dict: dict[str, xr.DataArray],
+    domain: str | None = None,
+    min_bin: float = 0.1,
+    max_bin: float = 100.0,
+    nbins: int = 50,
+    xlog: bool = False,
+    ylog: bool = False,
+    save: bool = True,
+) -> tuple[matplotlib.figure.Figure, matplotlib.axes.Axes]:
+    """Draw a histogram plot for a dataset over a given domain.
+
+    Parameters
+    ----------
+    da_dict : Dict[str, xarray.DataArray]
+        dictionary holding the data {"expid": xarray.DataArray}. the
+        dimension of the xarray.DataArray has to be 'time'
+    domain : str, optional
+        name of the domain for the plot title
+    min_bin : float, optional
+        lowest bin bound
+    max_bin : float, optional
+        highest bin bound
+    nbins : int, optional
+        number of bins
+    xlog : bool, optional
+        log. x-axis
+    ylog : bool, optional
+        log. y-axis
+    save : bool, optional
+        save the figure
+
+    Returns
+    -------
+    fig : matplotlib.figure.Figure
+        figure object
+    ax : matplotlib.axes.Axes
+        axes object
+
+    """
+    logging.info("Histogram plotting started...")
+    fig, ax = plt.subplots(1)
+
+    # prepare bins
+    # https://stackoverflow.com/questions/6855710/
+    # how-to-have-logarithmic-bins-in-a-python-histogram
+    if xlog:
+        bins = 10.0 ** np.linspace(np.log10(min_bin), np.log10(max_bin), nbins)
+    else:
+        bins = np.linspace(min_bin, max_bin, nbins)
+
+    # Take the color sequence from a colormap, analogous to timeseries
+    cmap = plt.cm.get_cmap("gist_rainbow", len(da_dict) + 1)
+
+    logging.info("Number of experiments to plot: %i", len(da_dict))
+    # loop over runs/experiments
+    e_val: xr.DataArray = xr.DataArray()  # pylint needs to have the loop variable
+    for i, (e_key, e_val) in enumerate(da_dict.items()):
+        # loop over runs/experiments
+        logging.info("histogram plotting for exp %s", e_key)
+        # set color
+        color = cmap(i)
+        vals = e_val.values.flatten()
+        counts, bin_edges = np.histogram(vals, bins)
+        width = np.diff(bin_edges)
+        ax.bar(
+            bin_edges[:-1],
+            counts,
+            width=width,
+            align="edge",
+            fill=False,
+            edgecolor=color,
+            alpha=0.8,
+            label=e_key,
+        )
+    ax.set_title(f"Experiments {', '.join(list(da_dict.keys()))}")  # remove brackets
+    ax.set_xlabel(f"{e_val.name} {e_val.GRIB_stepType} ({e_val.GRIB_units})")
+    ax.set_ylabel("Frequency count (-)")
+    ax.grid(True, which="both", axis="both", linestyle="--")
+    ax.legend()
+    if xlog:
+        ax.set_xscale("log")
+    if ylog:
+        ax.set_yscale("log")
+
+    try:
+        fig.suptitle(f"Histogram plots for domain {domain}, level {e_val.level}")
+    except AttributeError:
+        fig.suptitle(f"Histogram plots for domain {domain}")
+
+    if save:
+        try:
+            fname = (
+                f"histograms_{e_val.name}_{'-'.join(da_dict.keys())}_l{e_val.level}.png"
+            )
+        except AttributeError:
+            fname = f"histograms_{e_val.name}_{'-'.join(da_dict.keys())}.png"
+        # fig.set_size_inches(4.0, 8.0)
+        fig.savefig(fname, dpi=300)
+        logging.info("saved figure %s", fname)
+
+    return fig, ax