Skip to content

Commit

Permalink
Merge pull request C2SM#31 from MeteoSwiss-APN/histograms
Browse files Browse the repository at this point in the history
Add histograms:
- Additional function for histograms in plot.py
- Additional entry point for histograms in cli.py
- Separated masking from aggregation as we want the data unaggregated for the binning into histograms
- Tests for histogram plot and results
  • Loading branch information
clairemerker authored Mar 8, 2023
2 parents 9b82c6b + 6b5198b commit 717ebbd
Show file tree
Hide file tree
Showing 5 changed files with 609 additions and 43 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,13 @@ The main functionalities of this package are:
* plotting
* time series visualisation for multiple runs
* time series visualisation for ensemble data
* histograms for value distributions

The functions of the package can be used in scripts or via pre-defined command-line tools. Following command-line tools are provided:
* `icon-timeseries meanmax`: time series of domain average and domain maximum of a model variable on a given level (domains can be customised and added in `src/resources/domains.yaml`)
* `icon-timeseries meanmax`: time series of domain average and domain maximum of a model variable on a given level (can display multiple experiments, domains can be customised and added in `src/resources/domains.yaml`)
* `icon-timeseries quicklook-domain`: map plot of the considered domain
* `icon-timeseries nearest-neighbour`: time series of the values of a model variable at the grid point closest to the given location
* `icon-timeseries nearest-neighbour`: time series of the values of a model variable at the grid point closest to the given location (can display multiple experiments)
* `icon-timeseries histograms`: histograms of the values of a model variable (can display multiple experiments)

In order to use more than one dask worker for the parallelisation, the job needs to be send to the slurm queue (`postproc`). A script for `sbatch` containing an example call for `icon-timeseries meanmax` is provided: `sbatch.sh`

Expand Down
136 changes: 134 additions & 2 deletions src/icon_timeseries/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
from .handle_grid import get_domain
from .handle_grid import get_grid
from .plot import plot_domain
from .plot import plot_histograms
from .plot import plot_ts_multiple
from .prepare_data import prepare_masked_da
from .prepare_data import prepare_meanmax
from .prepare_data import prepare_nn

Expand Down Expand Up @@ -175,7 +177,7 @@ def meanmax(
type=str,
help="Coordinates (format 'lon,lat') for nearest neighbour lookup.",
)
@click.option("--level", default=None, type=int, help="model level index")
@click.option("--level", default=None, type=float, help="model level values")
@click.option(
"--gridfile",
default=None,
Expand All @@ -198,7 +200,7 @@ def meanmax(
def nearest_neighbour(
exp: Tuple[Tuple, ...],
varname: str,
level: int | None,
level: float | None,
gridfile: str | None,
lonlat: str,
deagg: bool,
Expand Down Expand Up @@ -241,6 +243,136 @@ def nearest_neighbour(
plot_ts_multiple(da_dict, domain=lonlat)


@main.command()
@click.option(
"--exp",
required=True,
type=(str, str),
nargs=2,
multiple=True,
help=(
"experiment info. file pattern to files to read, will be expanded to a "
"list with glob.glob, and experiment identifier"
),
)
@click.option(
"--varname", required=True, type=str, help="GRIB shortName of the variable"
)
@click.option("--level", default=None, type=float, help="model level value")
@click.option(
"--color",
default=None,
type=str,
multiple=True,
help="color to use for experiments in plot, used as specified",
)
@click.option(
"--gridfile",
default=None,
type=str,
help="ICON grid file, needed for unstructured grid",
)
@click.option(
"--domain",
default="all",
type=str,
help="domain to consider, please define in domains.yaml",
)
@click.option(
"--deagg",
is_flag=True,
help="deagreggation of variable, method is detected from GRIB encoding "
"(de-averaging and de-accumulation are currently implemented)",
)
@click.option(
"--bins",
type=(float, float, int),
default=(0.1, 100.0, 50),
nargs=3,
help="bins for histogram, format: min (float) max (float) n_bins (int).",
)
@click.option(
"--xlog",
"xlog",
default=False,
is_flag=True,
type=bool,
help="plot on x-logscale with logarithmic bins",
)
@click.option(
"--ylog",
"ylog",
default=False,
is_flag=True,
type=bool,
help="plot on y-logscale",
)
@click.option(
"--dask-workers",
"dask_nworkers",
default=None,
type=int,
help="ignored if the script does not run on a post-proc node",
)
def histograms(
exp: Tuple[Tuple, ...],
varname: str,
level: float | None,
color: str | None,
gridfile: str | None,
domain: str,
deagg: bool,
bins: Tuple[float, float, int],
xlog: bool,
ylog: bool,
dask_nworkers: int | None,
): # pylint: disable=too-many-arguments, too-many-locals
"""Read data for a variable from GRIB file(s) and plot the values distribution."""
# check dask setup
chunks = None
if "pp" in os.uname().nodename:
logging.info("job is running on %s, dask_nworkers active", os.uname().nodename)
logging.info("number of dask workers: %d", dask_nworkers)
chunks = {"generalVerticalLayer": 1}
elif dask_nworkers and "pp" not in os.uname().nodename:
logging.warning(
"job is running on %s, dask_nworkers not active", os.uname().nodename
)
logging.warning("send your job on a post-proc node to activate dask_nworkers")
dask_nworkers = None

# gather data for all experiments
da_dict = {} # type: Dict[str, xr.DataArray]
for one_exp in exp:
logging.info("Preparing experiment %s", one_exp[1])
filelist = glob.glob(one_exp[0])
if len(filelist) == 0:
logging.warning("file list for %s is empty, skipping...", one_exp[0])
continue
da_masked = prepare_masked_da(
filelist,
varname,
level,
gridfile,
domain=domain,
deagg=deagg,
chunks=chunks,
dask_nworkers=dask_nworkers,
)
da_dict[one_exp[1]] = da_masked.copy()

# plot the histograms
_, _ = plot_histograms(
da_dict,
domain,
min_bin=bins[0],
max_bin=bins[1],
nbins=bins[2],
xlog=xlog,
ylog=ylog,
)


@main.command()
@click.option(
"--gridfile",
Expand Down
107 changes: 106 additions & 1 deletion src/icon_timeseries/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def plot_ts_multiple(

# loop over parameters
for i, (p_key, p_val) in enumerate(da_dict.items()):
e_val = xr.DataArray()
e_val: xr.DataArray = xr.DataArray() # pylint needs to have the loop variable
exp_count = 0
if colors is None:
# Take the color sequence from a colormap
Expand Down Expand Up @@ -288,3 +288,108 @@ def plot_domain(
logging.info("saved figure %s", fname)

return fig


# pylint: disable=too-many-arguments, too-many-locals
def plot_histograms(
da_dict: dict[str, xr.DataArray],
domain: str | None = None,
min_bin: float = 0.1,
max_bin: float = 100.0,
nbins: int = 50,
xlog: bool = False,
ylog: bool = False,
save: bool = True,
) -> tuple[matplotlib.figure.Figure, matplotlib.axes.Axes]:
"""Draw a histogram plot for a dataset over a given domain.
Parameters
----------
da_dict : Dict[str, xarray.DataArray]
dictionary holding the data {"expid": xarray.DataArray}. the
dimension of the xarray.DataArray has to be 'time'
domain : str, optional
name of the domain for the plot title
min_bin : float, optional
lowest bin bound
max_bin : float, optional
highest bin bound
nbins : int, optional
number of bins
xlog : bool, optional
log. x-axis
ylog : bool, optional
log. y-axis
save : bool, optional
save the figure
Returns
-------
fig : matplotlib.figure.Figure
figure object
ax : matplotlib.axes.Axes
axes object
"""
logging.info("Histogram plotting started...")
fig, ax = plt.subplots(1)

# prepare bins
# https://stackoverflow.com/questions/6855710/
# how-to-have-logarithmic-bins-in-a-python-histogram
if xlog:
bins = 10.0 ** np.linspace(np.log10(min_bin), np.log10(max_bin), nbins)
else:
bins = np.linspace(min_bin, max_bin, nbins)

# Take the color sequence from a colormap, analogous to timeseries
cmap = plt.cm.get_cmap("gist_rainbow", len(da_dict) + 1)

logging.info("Number of experiments to plot: %i", len(da_dict))
# loop over runs/experiments
e_val: xr.DataArray = xr.DataArray() # pylint needs to have the loop variable
for i, (e_key, e_val) in enumerate(da_dict.items()):
# loop over runs/experiments
logging.info("histogram plotting for exp %s", e_key)
# set color
color = cmap(i)
vals = e_val.values.flatten()
counts, bin_edges = np.histogram(vals, bins)
width = np.diff(bin_edges)
ax.bar(
bin_edges[:-1],
counts,
width=width,
align="edge",
fill=False,
edgecolor=color,
alpha=0.8,
label=e_key,
)
ax.set_title(f"Experiments {', '.join(list(da_dict.keys()))}") # remove brackets
ax.set_xlabel(f"{e_val.name} {e_val.GRIB_stepType} ({e_val.GRIB_units})")
ax.set_ylabel("Frequency count (-)")
ax.grid(True, which="both", axis="both", linestyle="--")
ax.legend()
if xlog:
ax.set_xscale("log")
if ylog:
ax.set_yscale("log")

try:
fig.suptitle(f"Histogram plots for domain {domain}, level {e_val.level}")
except AttributeError:
fig.suptitle(f"Histogram plots for domain {domain}")

if save:
try:
fname = (
f"histograms_{e_val.name}_{'-'.join(da_dict.keys())}_l{e_val.level}.png"
)
except AttributeError:
fname = f"histograms_{e_val.name}_{'-'.join(da_dict.keys())}.png"
# fig.set_size_inches(4.0, 8.0)
fig.savefig(fname, dpi=300)
logging.info("saved figure %s", fname)

return fig, ax
Loading

0 comments on commit 717ebbd

Please sign in to comment.