diff --git a/pyproject.toml b/pyproject.toml index 4e79885..0468a15 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "pyelq-sdk" -version = "1.0.9" +version = "1.0.10" description = "Package for detection, localization and quantification code." authors = ["Bas van de Kerkhof", "Matthew Jones", "David Randell"] homepage = "https://sede-open.github.io/pyELQ/" @@ -23,7 +23,7 @@ packages = [{ include = "pyelq", from = "src" }] python = ">=3.9, <3.12" pandas = ">=2.1.4" numpy = ">=1.26.2" -plotly = ">=5.18.0" +plotly = ">=5.18.0, <5.24" scipy = ">=1.11.4" pymap3d = ">=3.0.1" geojson = ">=3.1.0" @@ -49,7 +49,7 @@ addopts = "--cov=pyelq --cov-fail-under=90 --ignore-glob=*plot*" testpaths = ["tests"] [tool.coverage.report] -omit = ["*plot*", "*/data_access/*", "*/plotting/*"] +omit = ["*plot*", "*/data_access/*", "*/plotting/*", "*/post_processing/*"] exclude_lines = [".*def.*plot.*", "from pyelq.plotting.plot import Plot"] [tool.coverage.run] diff --git a/src/pyelq/plotting/plot.py b/src/pyelq/plotting/plot.py index 7deaaa3..99ac722 100644 --- a/src/pyelq/plotting/plot.py +++ b/src/pyelq/plotting/plot.py @@ -12,7 +12,7 @@ import warnings from copy import deepcopy from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Callable, Type, Union +from typing import TYPE_CHECKING, Any, Callable, Type, Union import numpy as np import pandas as pd @@ -20,16 +20,20 @@ import plotly.graph_objects as go from geojson import Feature, FeatureCollection from openmcmc.mcmc import MCMC -from scipy.ndimage import label from shapely import geometry from pyelq.component.background import TemporalBackground from pyelq.component.error_model import ErrorModel from pyelq.component.offset import PerSensor from pyelq.component.source_model import SlabAndSpike, SourceModel -from pyelq.coordinate_system import ENU, LLA +from pyelq.coordinate_system import LLA from pyelq.dispersion_model.gaussian_plume import GaussianPlume from pyelq.sensor.sensor import Sensor, SensorGroup +from pyelq.support_functions.post_processing import ( + calculate_rectangular_statistics, + create_lla_polygons_from_xy_points, + is_regularly_spaced, +) if TYPE_CHECKING: from pyelq.model import ELQModel @@ -127,7 +131,7 @@ def plot_quantiles_from_array( return fig -def create_trace_specifics(object_to_plot: Union[Type[SlabAndSpike], SourceModel, MCMC], **kwargs) -> dict: +def create_trace_specifics(object_to_plot: Union[Type[SlabAndSpike], SourceModel, MCMC], **kwargs: Any) -> dict: """Specification of different traces of single variables. Provides all details for plots where we want to plot a single variable as a line plot. Based on the object_to_plot input argument it finds the correct plot. Args: object_to_plot (Union[Type[SlabAndSpike], SourceModel, MCMC]): Object which we want to plot a single variable from - **kwargs (dict): Additional key word arguments, e.g. burn_in or dict_key, used in some specific plots but not + **kwargs (Any): Additional keyword arguments, e.g. burn_in or dict_key, used in some specific plots but not applicable to all.
Returns: @@ -201,7 +205,7 @@ def create_trace_specifics(object_to_plot: Union[Type[SlabAndSpike], SourceModel def create_plot_specifics( - object_to_plot: Union[ErrorModel, PerSensor, MCMC], sensor_object: SensorGroup, plot_type: str = "", **kwargs + object_to_plot: Union[ErrorModel, PerSensor, MCMC], sensor_object: SensorGroup, plot_type: str = "", **kwargs: Any ) -> dict: """Specification of different traces where we want to plot a trace for each sensor. @@ -216,7 +220,7 @@ def create_plot_specifics( object_to_plot (Union[ErrorModel, PerSensor, MCMC]): Object which we want to plot a single variable from sensor_object (SensorGroup): SensorGroup object associated with the object_to_plot plot_type (str, optional): String specifying either a line or a box plot. - **kwargs (dict): Additional key word arguments, e.g. burn_in or dict_key, used in some specific plots but not + **kwargs (Any): Additional keyword arguments, e.g. burn_in or dict_key, used in some specific plots but not applicable to all. Returns: @@ -310,7 +314,7 @@ def plot_single_scatter( y_values: np.ndarray, color: str, name: str, - **kwargs, + **kwargs: Any, ) -> go.Figure: """Plots a single scatter trace on the supplied figure object. @@ -320,8 +324,8 @@ def plot_single_scatter( y_values (np.ndarray): Numpy array containing the y-values to use in plotting. color (str): RGB color string to use for this trace. name (str): String name to show in the legend. - **kwargs (dict): Additional key word arguments, e.g. burn_in, legend_group, show_legend, used in some specific plots - but not applicable to all. + **kwargs (Any): Additional keyword arguments, e.g. burn_in, legend_group, show_legend, used in some specific + plots but not applicable to all. Returns: fig (go.Figure): Plotly figure with the trace added to it. @@ -377,7 +381,7 @@ def plot_single_box(fig: go.Figure, y_values: np.ndarray, color: str, name: str) def plot_polygons_on_map( - polygons: Union[np.ndarray, list], values: np.ndarray, opacity: float, map_color_scale: str, **kwargs + polygons: Union[np.ndarray, list], values: np.ndarray, opacity: float, map_color_scale: str, **kwargs: Any ) -> go.Choroplethmapbox: """Plot a set of polygons on a map. @@ -387,8 +391,8 @@ def plot_polygons_on_map( used in coloring the polygons on the map. opacity (float): Float between 0 and 1 specifying the opacity of the polygon fill color. map_color_scale (str): The string which defines which plotly color scale. - **kwargs (dict): Additional key word arguments which can be passed on the go.Choroplethmapbox object (will override - the default values as specified in this function) + **kwargs (Any): Additional keyword arguments which can be passed on to the go.Choroplethmapbox object + (will override the default values as specified in this function) Returns: trace: go.Choroplethmapbox trace with the colored polygons which can be added to a go.Figure object. @@ -577,7 +581,7 @@ def show_all(self, renderer="browser"): for fig in self.figure_dict.values(): fig.show(renderer=renderer) - def plot_single_trace(self, object_to_plot: Union[Type[SlabAndSpike], SourceModel, MCMC], **kwargs): + def plot_single_trace(self, object_to_plot: Union[Type[SlabAndSpike], SourceModel, MCMC], **kwargs: Any): """Plotting a trace of a single variable. Depending on the object to plot it creates a figure which is stored in the figure_dict attribute.
@@ -585,8 +589,8 @@ def plot_single_trace(self, object_to_plot: Union[Type[SlabAndSpike], SourceMode Args: object_to_plot (Union[Type[SlabAndSpike], SourceModel, MCMC]): The object from which to plot a variable - **kwargs (dict): Additional key word arguments, e.g. burn_in, legend_group, show_legend, dict_key, used in some - specific plots but not applicable to all. + **kwargs (Any): Additional keyword arguments, e.g. burn_in, legend_group, show_legend, dict_key, used in + some specific plots but not applicable to all. """ plot_specifics = create_trace_specifics(object_to_plot=object_to_plot, **kwargs) @@ -632,7 +636,7 @@ def plot_trace_per_sensor( object_to_plot: Union[ErrorModel, PerSensor, MCMC], sensor_object: Union[SensorGroup, Sensor], plot_type: str, - **kwargs, + **kwargs: Any, ): """Plotting a trace of a single variable per sensor. @@ -643,8 +647,8 @@ def plot_trace_per_sensor( object_to_plot (Union[ErrorModel, PerSensor, MCMC]): The object which to plot a variable from sensor_object (Union[SensorGroup, Sensor]): Sensor object associated with the object_to_plot plot_type (str): String specifying a line or box plot. - **kwargs (dict): Additional key word arguments, e.g. burn_in, legend_group, show_legend, dict_key, used in some - specific plots but not applicable to all. + **kwargs (Any): Additional keyword arguments, e.g. burn_in, legend_group, show_legend, dict_key, used in + some specific plots but not applicable to all. """ if isinstance(sensor_object, Sensor): @@ -789,7 +793,7 @@ def plot_fitted_values_per_sensor( self.figure_dict[dict_key] = fig - def plot_emission_rate_estimates(self, source_model_object, y_axis_type="linear", **kwargs): + def plot_emission_rate_estimates(self, source_model_object, y_axis_type="linear", **kwargs: Any): """Plot the emission rate estimates source model object against MCMC iteration. Based on the inputs it plots the results of the mcmc analysis, being the estimated emission rate values for @@ -803,7 +807,7 @@ def plot_emission_rate_estimates(self, source_model_object, y_axis_type="linear" Args: source_model_object (SourceModel): Source model object which contains the estimated emission rate estimates. y_axis_type (str, optional): String to indicate whether the y-axis should be linear or log scale. - **kwargs (dict): Additional key word arguments, e.g. burn_in, dict_key, used in some specific plots but not + **kwargs (Any): Additional keyword arguments, e.g. burn_in, dict_key, used in some specific plots but not applicable to all. """ @@ -912,7 +916,7 @@ def create_empty_mapbox_figure(self, dict_key: str = "map_plot") -> None: ) def plot_values_on_map( - self, dict_key: str, coordinates: LLA, values: np.ndarray, aggregate_function: Callable = np.sum, **kwargs + self, dict_key: str, coordinates: LLA, values: np.ndarray, aggregate_function: Callable = np.sum, **kwargs: Any ): """Plot values on a map based on coordinates. @@ -922,7 +926,7 @@ def plot_values_on_map( values (np.ndarray): Numpy array of values consistent with coordinates to plot on the map aggregate_function (Callable, optional): Function which to apply on the data in each hexagonal bin to aggregate the data and visualise the result.
- **kwargs (dict): Additional keyword arguments for plotting behaviour (opacity, map_color_scale, num_hexagons, + **kwargs (Any): Additional keyword arguments for plotting behaviour (opacity, map_color_scale, num_hexagons, show_positions) """ @@ -975,107 +979,50 @@ def plot_quantification_results_on_map( burn_in: int = 0, show_summary_results: bool = True, ): - """Placeholder for the quantification plots.""" - nof_iterations = model_object.n_iter + """Function to create maps of the quantification results of the model object. + + This function takes the ELQModel object and calculates the statistics for the quantification results. It then + populates the figure dictionary with three different maps showing the normalized count, median emission rate + and the inter-quartile range of the emission rate estimates. + + Args: + model_object (ELQModel): ELQModel object containing the quantification results. + bin_size_x (float, optional): Size of the bins in the x-direction. Defaults to 1. + bin_size_y (float, optional): Size of the bins in the y-direction. Defaults to 1. + normalized_count_limit (float, optional): Limit for the normalized count to show on the map. + Defaults to 0.005. + burn_in (int, optional): Number of burn-in iterations to discard before calculating the statistics. + Defaults to 0. + show_summary_results (bool, optional): Flag to show the summary results on the map. Defaults to True. + + """ ref_latitude = model_object.components["source"].dispersion_model.source_map.location.ref_latitude ref_longitude = model_object.components["source"].dispersion_model.source_map.location.ref_longitude ref_altitude = model_object.components["source"].dispersion_model.source_map.location.ref_altitude + datetime_min_string = model_object.sensor_object.time.min().strftime("%d-%b-%Y, %H:%M:%S") datetime_max_string = model_object.sensor_object.time.max().strftime("%d-%b-%Y, %H:%M:%S") - all_source_locations = model_object.mcmc.store["z_src"] - min_x = np.nanmin(all_source_locations[0, :, :]) - max_x = np.nanmax(all_source_locations[0, :, :]) - min_y = np.nanmin(all_source_locations[1, :, :]) - max_y = np.nanmax(all_source_locations[1, :, :]) - - bin_min_x = np.floor(min_x - 0.1) - bin_max_x = np.ceil(max_x + 0.1) - bin_min_y = np.floor(min_y - 0.1) - bin_max_y = np.ceil(max_y + 0.1) - bin_min_iteration = burn_in + 0.5 - bin_max_iteration = nof_iterations + 0.5 - - max_nof_sources = all_source_locations.shape[1] - - x_edges = np.arange(start=bin_min_x, stop=bin_max_x + bin_size_x, step=bin_size_x) - y_edges = np.arange(start=bin_min_y, stop=bin_max_y + bin_size_y, step=bin_size_y) - iteration_edges = np.arange(start=bin_min_iteration, stop=bin_max_iteration + bin_size_y, step=1) - - result_x_vals = all_source_locations[0, :, :].flatten() - result_y_vals = all_source_locations[1, :, :].flatten() - result_z_vals = all_source_locations[2, :, :].flatten() - # 1-indexing for iterations effectively - result_iteration_vals = np.array(range(nof_iterations)).reshape(1, -1) + 1 - result_iteration_vals = np.tile(result_iteration_vals, (max_nof_sources, 1)).flatten() - results_estimates = model_object.mcmc.store["s"].flatten() - - result_weighted, _ = np.histogramdd( - sample=np.array([result_x_vals, result_y_vals, result_iteration_vals]).T, - bins=[x_edges, y_edges, iteration_edges], - weights=results_estimates, - density=False, + result_weighted, _, normalized_count, count_boolean, enu_points, summary_result = ( + calculate_rectangular_statistics( + model_object=model_object, + bin_size_x=bin_size_x, +
bin_size_y=bin_size_y, + burn_in=burn_in, + normalized_count_limit=normalized_count_limit, + ) ) - count_result, edges_result = np.histogramdd( - sample=np.array([result_x_vals, result_y_vals, result_iteration_vals]).T, - bins=[x_edges, y_edges, iteration_edges], - density=False, + polygons = create_lla_polygons_from_xy_points( + points_array=enu_points, + ref_latitude=ref_latitude, + ref_longitude=ref_longitude, + ref_altitude=ref_altitude, + boolean_mask=count_boolean, ) - enu_x = edges_result[0] - enu_x = enu_x[:-1] + np.diff(enu_x) / 2 - enu_y = edges_result[1] - enu_y = enu_y[:-1] + np.diff(enu_y) / 2 - - enu_x, enu_y = np.meshgrid(enu_x, enu_y, indexing="ij") - - enu_object_full_grid = ENU(ref_latitude=ref_latitude, ref_longitude=ref_longitude, ref_altitude=ref_altitude) - enu_object_full_grid.east = enu_x.flatten() - enu_object_full_grid.north = enu_y.flatten() - enu_object_full_grid.up = np.zeros_like(enu_object_full_grid.north) - lla_object_full_grid = enu_object_full_grid.to_lla() - - _, gridsize_lat = is_regularly_spaced(lla_object_full_grid.latitude, tolerance=1e-6) - _, gridsize_lon = is_regularly_spaced(lla_object_full_grid.longitude, tolerance=1e-6) - - overall_count = np.sum(count_result, axis=2) - normalized_count = overall_count / (nof_iterations - burn_in) - - count_boolean = normalized_count >= normalized_count_limit - - enu_object = ENU(ref_latitude=ref_latitude, ref_longitude=ref_longitude, ref_altitude=ref_altitude) - enu_object.east = enu_x[count_boolean].flatten() - enu_object.north = enu_y[count_boolean].flatten() - enu_object.up = np.zeros_like(enu_object.north) - lla_object = enu_object.to_lla() - - polygons = [ - geometry.box( - lla_object.longitude[idx] - gridsize_lon / 2, - lla_object.latitude[idx] - gridsize_lat / 2, - lla_object.longitude[idx] + gridsize_lon / 2, - lla_object.latitude[idx] + gridsize_lat / 2, - ) - for idx in range(lla_object.nof_observations) - ] - if show_summary_results: - summary_trace = self.create_summary_trace( - result_iteration_vals=result_iteration_vals, - burn_in=burn_in, - result_x_vals=result_x_vals, - result_y_vals=result_y_vals, - result_z_vals=result_z_vals, - results_estimates=results_estimates, - count_boolean=count_boolean, - x_edges=x_edges, - y_edges=y_edges, - nof_iterations=nof_iterations, - ref_latitude=ref_latitude, - ref_longitude=ref_longitude, - ref_altitude=ref_altitude, - ) + summary_trace = self.create_summary_trace(summary_result=summary_result) self.create_empty_mapbox_figure(dict_key="count_map") trace = plot_polygons_on_map( @@ -1199,118 +1146,20 @@ def plot_coverage( @staticmethod def create_summary_trace( - result_x_vals: np.ndarray, - result_y_vals: np.ndarray, - result_z_vals: np.ndarray, - results_estimates: np.ndarray, - result_iteration_vals: np.ndarray, - count_boolean: np.ndarray, - x_edges: np.ndarray, - y_edges: np.ndarray, - nof_iterations: int, - burn_in: int, - ref_latitude: float, - ref_longitude: float, - ref_altitude: float, + summary_result: pd.DataFrame, ) -> go.Scattermapbox: """Helper function to create the summary information to plot on top of map type plots. - We identify all blobs of estimates which appear close together on the map by looking at connected pixels in the - count_boolean array. Next we find the summary statistics for all estimates in that blob like overall median and - IQR estimate, mean location and the likelihood of that blob. 
- - When multiple sources are present in the same blob at the same iteration we first sum those emission rate - estimates before taking the median. - - The summary statistics are also printed out on screen. + We use the summary result calculated through the support functions module to create a trace which contains + the summary information for each source location. Args: - result_x_vals (np.ndarray): X-coordinate of estimates, flattened array of (n_sources_max * nof_iterations,). - result_y_vals (np.ndarray): Y-coordinate of estimates, flattened array of (n_sources_max * nof_iterations,). - result_z_vals (np.ndarray): Z-coordinate of estimates, flattened array of (n_sources_max * nof_iterations,). - results_estimates (np.ndarray): Emission rate estimates, flattened array of - (n_sources_max * nof_iterations,). - result_iteration_vals (np.ndarray): Iteration number corresponding each estimated value, flattened array - of (n_sources_max * nof_iterations,). - count_boolean (np.ndarray): Boolean array which indicates if likelihood of pixel is over threshold. - x_edges (np.ndarray): Pixel edges x-coordinates. - y_edges (np.ndarray): Pixel edges y-coordinates. - nof_iterations (int): Number of iterations used in MCMC. - burn_in (int): Burn-in used in MCMC. - ref_latitude (float): Reference latitude in degrees of ENU coordinate system. - ref_longitude (float): Reference longitude in degrees of ENU coordinate system. - ref_altitude (float): Reference altitude in meters of ENU coordinate system. + summary_result (pd.DataFrame): DataFrame containing the summary information for each source location. Returns: summary_trace (go.Scattermapbox): Trace with summary information to plot on top of map type plots. """ - labeled_array, num_features = label(input=count_boolean, structure=np.ones((3, 3))) - - burn_in_bool = result_iteration_vals > burn_in - nan_x_vals = np.isnan(result_x_vals) - nan_y_vals = np.isnan(result_y_vals) - nan_z_vals = np.isnan(result_z_vals) - no_nan_idx = np.logical_not(np.logical_or(np.logical_or(nan_x_vals, nan_y_vals), nan_z_vals)) - no_nan_and_burn_in_bool = np.logical_and(no_nan_idx, burn_in_bool) - result_x_vals_no_nan = result_x_vals[no_nan_and_burn_in_bool] - result_y_vals_no_nan = result_y_vals[no_nan_and_burn_in_bool] - result_z_vals_no_nan = result_z_vals[no_nan_and_burn_in_bool] - results_estimates_no_nan = results_estimates[no_nan_and_burn_in_bool] - result_iteration_vals_no_nan = result_iteration_vals[no_nan_and_burn_in_bool] - - x_idx = np.digitize(result_x_vals_no_nan, x_edges, right=False) - 1 - y_idx = np.digitize(result_y_vals_no_nan, y_edges, right=False) - 1 - bin_numbers = np.ravel_multi_index((x_idx, y_idx), labeled_array.shape) - - bin_numbers_per_label = [ - np.ravel_multi_index(np.nonzero(labeled_array == value), labeled_array.shape) - for value in np.array(range(num_features)) + 1 - ] - - summary_result = pd.DataFrame() - for label_idx, curr_bins in enumerate(bin_numbers_per_label): - boolean_for_result = np.isin(bin_numbers, curr_bins) - mean_x = np.mean(result_x_vals_no_nan[boolean_for_result]) - mean_y = np.mean(result_y_vals_no_nan[boolean_for_result]) - mean_z = np.mean(result_z_vals_no_nan[boolean_for_result]) - - unique_iteration_vals, indices, counts = np.unique( - result_iteration_vals_no_nan[boolean_for_result], return_inverse=True, return_counts=True - ) - nof_iterations_present = unique_iteration_vals.size - blob_likelihood = nof_iterations_present / (nof_iterations - burn_in) - single_idx = np.argwhere(counts == 1) - results_estimates_for_blob 
= results_estimates_no_nan[boolean_for_result] - temp_estimate_result = results_estimates_for_blob[indices[single_idx.flatten()]] - multiple_idx = np.argwhere(counts > 1) - for single_idx in multiple_idx: - temp_val = np.sum(results_estimates_for_blob[indices == single_idx]) - temp_estimate_result = np.append(temp_estimate_result, temp_val) - - median_estimate = np.median(temp_estimate_result) - iqr_estimate = np.nanquantile(a=temp_estimate_result, q=0.75) - np.nanquantile( - a=temp_estimate_result, q=0.25 - ) - lower_bound = np.nanquantile(a=temp_estimate_result, q=0.025) - upper_bound = np.nanquantile(a=temp_estimate_result, q=0.975) - enu_object = ENU(ref_latitude=ref_latitude, ref_longitude=ref_longitude, ref_altitude=ref_altitude) - enu_object.east = mean_x - enu_object.north = mean_y - enu_object.up = mean_z - lla_object = enu_object.to_lla() - - summary_result.loc[label_idx, "latitude"] = lla_object.latitude - summary_result.loc[label_idx, "longitude"] = lla_object.longitude - summary_result.loc[label_idx, "altitude"] = lla_object.altitude - summary_result.loc[label_idx, "height"] = mean_z - summary_result.loc[label_idx, "median_estimate"] = median_estimate - summary_result.loc[label_idx, "quantile_025"] = lower_bound - summary_result.loc[label_idx, "quantile_975"] = upper_bound - summary_result.loc[label_idx, "iqr_estimate"] = iqr_estimate - summary_result.loc[label_idx, "absolute_count_iterations"] = nof_iterations_present - summary_result.loc[label_idx, "blob_likelihood"] = blob_likelihood - summary_text_values = [ f"Source ID: {value}<br>" f"(Lon, Lat, Alt) ([deg], [deg], [m]):<br>
" @@ -1337,38 +1186,4 @@ def create_summary_trace( hoverinfo="text", ) - summary_result.index.name = "source_ID" - summary_result = summary_result.astype({"absolute_count_iterations": "int"}) - print("Summary results:") - print(summary_result.to_string(float_format=lambda x: "%.7f" % x)) - return summary_trace - - -def is_regularly_spaced(array: np.ndarray, tolerance: float = 0.01, return_delta: bool = True): - """Determines whether an input array is regularly spaced, within some (absolute) tolerance. - - Gets the large differences (defined by tolerance) in the array, and sees whether all of them are within 5% of one - another. - - Args: - array (np.ndarray): Input array to be analysed. - tolerance (float, optional): Absolute value above which the difference between values is considered significant. - Defaults to 0.01. - return_delta (bool, optional): Whether to return the value of the regular grid spacing. Defaults to True. - - Returns: - (bool): Whether or not the grid is regularly spaced. - (float): The value of the regular grid spacing. - - """ - unique_vals = np.unique(array) - diff_unique_vals = np.diff(unique_vals) - diff_big = diff_unique_vals[diff_unique_vals > tolerance] - - boolean = np.all([np.isclose(diff_big[i], diff_big[i + 1], rtol=0.05) for i in range(len(diff_big) - 1)]) - - if return_delta: - return boolean, np.mean(diff_big) - - return boolean, None diff --git a/src/pyelq/support_functions/__init__.py b/src/pyelq/support_functions/__init__.py index 043f324..60848ca 100644 --- a/src/pyelq/support_functions/__init__.py +++ b/src/pyelq/support_functions/__init__.py @@ -2,4 +2,4 @@ # # SPDX-License-Identifier: Apache-2.0 """Support Functions Module.""" -__all__ = ["spatio_temporal_interpolation"] +__all__ = ["post_processing", "spatio_temporal_interpolation"] diff --git a/src/pyelq/support_functions/post_processing.py b/src/pyelq/support_functions/post_processing.py new file mode 100644 index 0000000..4267148 --- /dev/null +++ b/src/pyelq/support_functions/post_processing.py @@ -0,0 +1,343 @@ +# SPDX-FileCopyrightText: 2024 Shell Global Solutions International B.V. All Rights Reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# -*- coding: utf-8 -*- +"""Post-processing module. + +Module containing some functions used in post-processing of the results. + +""" +from typing import TYPE_CHECKING, Tuple, Union + +import numpy as np +import pandas as pd +from scipy.ndimage import label +from shapely import geometry + +from pyelq.coordinate_system import ENU + +if TYPE_CHECKING: + from pyelq.model import ELQModel + + +def is_regularly_spaced(array: np.ndarray, tolerance: float = 0.01, return_delta: bool = True): + """Determines whether an input array is regularly spaced, within some (absolute) tolerance. + + Gets the large differences (defined by tolerance) in the array, and sees whether all of them are within 5% of one + another. + + Args: + array (np.ndarray): Input array to be analysed. + tolerance (float, optional): Absolute value above which the difference between values is considered significant. + Defaults to 0.01. + return_delta (bool, optional): Whether to return the value of the regular grid spacing. Defaults to True. + + Returns: + (bool): Whether the grid is regularly spaced. + (float): The value of the regular grid spacing. 
+ + """ + unique_vals = np.unique(array) + diff_unique_vals = np.diff(unique_vals) + diff_big = diff_unique_vals[diff_unique_vals > tolerance] + + boolean = np.all([np.isclose(diff_big[i], diff_big[i + 1], rtol=0.05) for i in range(len(diff_big) - 1)]) + + if return_delta: + return boolean, np.mean(diff_big) + + return boolean, None + + +def calculate_rectangular_statistics( + model_object: "ELQModel", + bin_size_x: float = 1, + bin_size_y: float = 1, + burn_in: int = 0, + normalized_count_limit: float = 0.005, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, list, pd.DataFrame]: + """Function which aggregates the pyELQ results into rectangular bins and outputs the related summary statistics. + + The function creates a pixel grid (binning) in East-North coordinates based on the bin_size_x and bin_size_y + parameters. For each bin both a count and a weighted sum of the emission rate estimates are calculated. The + count is normalized by the number of iterations used in the MCMC and a boolean array is created which indicates if + the count is above a certain threshold. Connected pixels where the count is above this threshold are considered to + be a single blob/source, and emission estimates per blob are summed over all pixels in the blob. The function + then calculates the summary statistics for each blob. The summary statistics include the median and IQR of the + emission rate estimates, the mean location of the blob and the likelihood of the blob. + + Args: + model_object (ELQModel): ELQModel object containing the results of the MCMC run. + bin_size_x (float, optional): Size of the bins in the x-direction. Defaults to 1. + bin_size_y (float, optional): Size of the bins in the y-direction. Defaults to 1. + burn_in (int, optional): Number of burn-in iterations used in the MCMC. Defaults to 0. + normalized_count_limit (float, optional): Threshold for the normalized count to be considered a blob. + Defaults to 0.005. + + Returns: + result_weighted (np.ndarray): Weighted sum of the emission rate estimates in each bin. + overall_count (np.ndarray): Count of the number of estimates in each bin. + normalized_count (np.ndarray): Normalized count of the number of estimates in each bin. + count_boolean (np.ndarray): Boolean array which indicates if likelihood of pixel is over threshold. + edges_result (list): Bin edges of the pixel grid in the x- and y-directions. + summary_result (pd.DataFrame): Summary statistics for each blob of estimates.
+ + """ + nof_iterations = model_object.n_iter + ref_latitude = model_object.components["source"].dispersion_model.source_map.location.ref_latitude + ref_longitude = model_object.components["source"].dispersion_model.source_map.location.ref_longitude + ref_altitude = model_object.components["source"].dispersion_model.source_map.location.ref_altitude + + all_source_locations = model_object.mcmc.store["z_src"] + min_x = np.nanmin(all_source_locations[0, :, :]) + max_x = np.nanmax(all_source_locations[0, :, :]) + min_y = np.nanmin(all_source_locations[1, :, :]) + max_y = np.nanmax(all_source_locations[1, :, :]) + + bin_min_x = np.floor(min_x - 0.1) + bin_max_x = np.ceil(max_x + 0.1) + bin_min_y = np.floor(min_y - 0.1) + bin_max_y = np.ceil(max_y + 0.1) + bin_min_iteration = burn_in + 0.5 + bin_max_iteration = nof_iterations + 0.5 + + max_nof_sources = all_source_locations.shape[1] + + x_edges = np.arange(start=bin_min_x, stop=bin_max_x + bin_size_x, step=bin_size_x) + y_edges = np.arange(start=bin_min_y, stop=bin_max_y + bin_size_y, step=bin_size_y) + iteration_edges = np.arange(start=bin_min_iteration, stop=bin_max_iteration + 1, step=1) + + result_x_vals = all_source_locations[0, :, :].flatten() + result_y_vals = all_source_locations[1, :, :].flatten() + result_z_vals = all_source_locations[2, :, :].flatten() + + # 1-indexing for iterations effectively + result_iteration_vals = np.array(range(nof_iterations)).reshape(1, -1) + 1 + result_iteration_vals = np.tile(result_iteration_vals, (max_nof_sources, 1)).flatten() + results_estimates = model_object.mcmc.store["s"].flatten() + + result_weighted, _ = np.histogramdd( + sample=np.array([result_x_vals, result_y_vals, result_iteration_vals]).T, + bins=[x_edges, y_edges, iteration_edges], + weights=results_estimates, + density=False, + ) + + count_result, edges_result = np.histogramdd( + sample=np.array([result_x_vals, result_y_vals, result_iteration_vals]).T, + bins=[x_edges, y_edges, iteration_edges], + density=False, + ) + + overall_count = np.array(np.sum(count_result, axis=2)) + normalized_count = overall_count / (nof_iterations - burn_in) + count_boolean = normalized_count >= normalized_count_limit + + summary_result = create_aggregation( + result_iteration_vals=result_iteration_vals, + burn_in=burn_in, + result_x_vals=result_x_vals, + result_y_vals=result_y_vals, + result_z_vals=result_z_vals, + results_estimates=results_estimates, + count_boolean=count_boolean, + x_edges=x_edges, + y_edges=y_edges, + nof_iterations=nof_iterations, + ref_latitude=ref_latitude, + ref_longitude=ref_longitude, + ref_altitude=ref_altitude, + ) + + return result_weighted, overall_count, normalized_count, count_boolean, edges_result[:2], summary_result + + +def create_lla_polygons_from_xy_points( + points_array: list[np.ndarray], + ref_latitude: float, + ref_longitude: float, + ref_altitude: float, + boolean_mask: Union[np.ndarray, None] = None, +) -> list[geometry.Polygon]: + """Function to create polygons in LLA coordinates from a grid of points in ENU coordinates. + + This function takes arrays of East-North bin edges; the midpoints of consecutive edges are used as the center + points of a pixel grid. The pixel grid is then converted to LLA coordinates and these center points are used to + create a polygon in LLA coordinates. A polygon is only created if the boolean mask for that pixel is True. + + Args: + points_array (list[np.ndarray]): List of arrays containing the bin edges of the grid in ENU coordinates. + ref_latitude (float): Reference latitude in degrees of ENU coordinate system.
+ ref_longitude (float): Reference longitude in degrees of ENU coordinate system. + ref_altitude (float): Reference altitude in meters of ENU coordinate system. + boolean_mask (np.ndarray, optional): Boolean mask to indicate which pixels to create polygons for. + Defaults to None which means all pixels are used. + + Returns: + list[geometry.Polygon]: List of polygons in LLA coordinates. + """ + if boolean_mask is None: + # Default mask: include every pixel of the (n_x - 1) by (n_y - 1) grid of bin centers. + boolean_mask = np.ones((points_array[0].size - 1, points_array[1].size - 1), dtype=bool) + + enu_x = points_array[0] + enu_x = enu_x[:-1] + np.diff(enu_x) / 2 + enu_y = points_array[1] + enu_y = enu_y[:-1] + np.diff(enu_y) / 2 + + enu_x, enu_y = np.meshgrid(enu_x, enu_y, indexing="ij") + + enu_object_full_grid = ENU(ref_latitude=ref_latitude, ref_longitude=ref_longitude, ref_altitude=ref_altitude) + enu_object_full_grid.east = enu_x.flatten() + enu_object_full_grid.north = enu_y.flatten() + enu_object_full_grid.up = np.zeros_like(enu_object_full_grid.north) + lla_object_full_grid = enu_object_full_grid.to_lla() + + _, gridsize_lat = is_regularly_spaced(lla_object_full_grid.latitude, tolerance=1e-6) + _, gridsize_lon = is_regularly_spaced(lla_object_full_grid.longitude, tolerance=1e-6) + + polygons = [ + geometry.box( + lla_object_full_grid.longitude[idx] - gridsize_lon / 2, + lla_object_full_grid.latitude[idx] - gridsize_lat / 2, + lla_object_full_grid.longitude[idx] + gridsize_lon / 2, + lla_object_full_grid.latitude[idx] + gridsize_lat / 2, + ) + for idx in np.argwhere(boolean_mask.flatten()).flatten() + ] + + return polygons + + +def create_aggregation( + result_x_vals: np.ndarray, + result_y_vals: np.ndarray, + result_z_vals: np.ndarray, + results_estimates: np.ndarray, + result_iteration_vals: np.ndarray, + count_boolean: np.ndarray, + x_edges: np.ndarray, + y_edges: np.ndarray, + nof_iterations: int, + burn_in: int, + ref_latitude: float, + ref_longitude: float, + ref_altitude: float, +) -> pd.DataFrame: + """Function to create the aggregated information for the blobs of estimates. + + We identify all blobs of estimates which appear close together on the map by looking at connected pixels in the + count_boolean array. Next we find the summary statistics for all estimates in that blob like overall median and + IQR estimate, mean location and the likelihood of that blob. + + When multiple sources are present in the same blob at the same iteration we first sum those emission rate + estimates before taking the median. + + If no blobs are found a dataframe with nan values is returned to avoid breaking plotting code which calls this + function. + + Args: + result_x_vals (np.ndarray): X-coordinate of estimates, flattened array of (n_sources_max * nof_iterations,). + result_y_vals (np.ndarray): Y-coordinate of estimates, flattened array of (n_sources_max * nof_iterations,). + result_z_vals (np.ndarray): Z-coordinate of estimates, flattened array of (n_sources_max * nof_iterations,). + results_estimates (np.ndarray): Emission rate estimates, flattened array of + (n_sources_max * nof_iterations,). + result_iteration_vals (np.ndarray): Iteration number corresponding to each estimated value, flattened array + of (n_sources_max * nof_iterations,). + count_boolean (np.ndarray): Boolean array which indicates if likelihood of pixel is over threshold. + x_edges (np.ndarray): Pixel edges x-coordinates. + y_edges (np.ndarray): Pixel edges y-coordinates. + nof_iterations (int): Number of iterations used in MCMC. + burn_in (int): Burn-in used in MCMC.
+ ref_latitude (float): Reference latitude in degrees of ENU coordinate system. + ref_longitude (float): Reference longitude in degrees of ENU coordinate system. + ref_altitude (float): Reference altitude in meters of ENU coordinate system. + + Returns: + summary_result (pd.DataFrame): Summary statistics for each blob of estimates. + + """ + labeled_array, num_features = label(input=count_boolean, structure=np.ones((3, 3))) + + if num_features == 0: + summary_result = pd.DataFrame() + summary_result.index.name = "source_ID" + summary_result.loc[0, "latitude"] = np.nan + summary_result.loc[0, "longitude"] = np.nan + summary_result.loc[0, "altitude"] = np.nan + summary_result.loc[0, "height"] = np.nan + summary_result.loc[0, "median_estimate"] = np.nan + summary_result.loc[0, "quantile_025"] = np.nan + summary_result.loc[0, "quantile_975"] = np.nan + summary_result.loc[0, "iqr_estimate"] = np.nan + summary_result.loc[0, "absolute_count_iterations"] = np.nan + summary_result.loc[0, "blob_likelihood"] = np.nan + + return summary_result + + burn_in_bool = result_iteration_vals > burn_in + nan_x_vals = np.isnan(result_x_vals) + nan_y_vals = np.isnan(result_y_vals) + nan_z_vals = np.isnan(result_z_vals) + no_nan_idx = np.logical_not(np.logical_or(np.logical_or(nan_x_vals, nan_y_vals), nan_z_vals)) + no_nan_and_burn_in_bool = np.logical_and(no_nan_idx, burn_in_bool) + result_x_vals_no_nan = result_x_vals[no_nan_and_burn_in_bool] + result_y_vals_no_nan = result_y_vals[no_nan_and_burn_in_bool] + result_z_vals_no_nan = result_z_vals[no_nan_and_burn_in_bool] + results_estimates_no_nan = results_estimates[no_nan_and_burn_in_bool] + result_iteration_vals_no_nan = result_iteration_vals[no_nan_and_burn_in_bool] + + x_idx = np.digitize(result_x_vals_no_nan, x_edges, right=False) - 1 + y_idx = np.digitize(result_y_vals_no_nan, y_edges, right=False) - 1 + bin_numbers = np.ravel_multi_index((x_idx, y_idx), labeled_array.shape) + + bin_numbers_per_label = [ + np.ravel_multi_index(np.nonzero(labeled_array == value), labeled_array.shape) + for value in np.array(range(num_features)) + 1 + ] + + summary_result = pd.DataFrame() + summary_result.index.name = "source_ID" + + for label_idx, curr_bins in enumerate(bin_numbers_per_label): + boolean_for_result = np.isin(bin_numbers, curr_bins) + mean_x = np.mean(result_x_vals_no_nan[boolean_for_result]) + mean_y = np.mean(result_y_vals_no_nan[boolean_for_result]) + mean_z = np.mean(result_z_vals_no_nan[boolean_for_result]) + + unique_iteration_vals, indices, counts = np.unique( + result_iteration_vals_no_nan[boolean_for_result], return_inverse=True, return_counts=True + ) + nof_iterations_present = unique_iteration_vals.size + blob_likelihood = nof_iterations_present / (nof_iterations - burn_in) + single_idx = np.argwhere(counts == 1) + results_estimates_for_blob = results_estimates_no_nan[boolean_for_result] + temp_estimate_result = results_estimates_for_blob[indices[single_idx.flatten()]] + multiple_idx = np.argwhere(counts > 1) + for single_idx in multiple_idx: + temp_val = np.sum(results_estimates_for_blob[indices == single_idx]) + temp_estimate_result = np.append(temp_estimate_result, temp_val) + + median_estimate = np.median(temp_estimate_result) + iqr_estimate = np.nanquantile(a=temp_estimate_result, q=0.75) - np.nanquantile(a=temp_estimate_result, q=0.25) + lower_bound = np.nanquantile(a=temp_estimate_result, q=0.025) + upper_bound = np.nanquantile(a=temp_estimate_result, q=0.975) + enu_object = ENU(ref_latitude=ref_latitude, ref_longitude=ref_longitude, 
ref_altitude=ref_altitude) + enu_object.east = mean_x + enu_object.north = mean_y + enu_object.up = mean_z + lla_object = enu_object.to_lla() + + summary_result.loc[label_idx, "latitude"] = lla_object.latitude + summary_result.loc[label_idx, "longitude"] = lla_object.longitude + summary_result.loc[label_idx, "altitude"] = lla_object.altitude + summary_result.loc[label_idx, "height"] = mean_z + summary_result.loc[label_idx, "median_estimate"] = median_estimate + summary_result.loc[label_idx, "quantile_025"] = lower_bound + summary_result.loc[label_idx, "quantile_975"] = upper_bound + summary_result.loc[label_idx, "iqr_estimate"] = iqr_estimate + summary_result.loc[label_idx, "absolute_count_iterations"] = nof_iterations_present + summary_result.loc[label_idx, "blob_likelihood"] = blob_likelihood + + summary_result = summary_result.astype({"absolute_count_iterations": "int"}) + + return summary_result
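For reviewers, a minimal usage sketch of how the two relocated helpers in post_processing fit together outside of Plot.plot_quantification_results_on_map; it mirrors the call pattern in plot.py above, and the `model` variable and the burn_in value are illustrative placeholders:

```python
# Minimal sketch (assumes `model` is a fitted pyelq.model.ELQModel instance).
from pyelq.support_functions.post_processing import (
    calculate_rectangular_statistics,
    create_lla_polygons_from_xy_points,
)

# Bin the sampled source locations into 1 m x 1 m pixels and summarise per blob.
result_weighted, overall_count, normalized_count, count_boolean, edges, summary = (
    calculate_rectangular_statistics(
        model_object=model,
        bin_size_x=1.0,
        bin_size_y=1.0,
        burn_in=5000,  # illustrative value
        normalized_count_limit=0.005,
    )
)

# Reference point of the local ENU frame, taken from the source map as in plot.py.
location = model.components["source"].dispersion_model.source_map.location

# Convert the pixels that pass the likelihood threshold into lat/lon polygons.
polygons = create_lla_polygons_from_xy_points(
    points_array=edges,
    ref_latitude=location.ref_latitude,
    ref_longitude=location.ref_longitude,
    ref_altitude=location.ref_altitude,
    boolean_mask=count_boolean,
)

print(summary)  # per-blob median, IQR, 2.5%/97.5% quantiles and blob likelihood
```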