diff --git a/Changelog.rst b/Changelog.rst index dc36568b30..9a7c3580d9 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -3,6 +3,8 @@ version NEXTVERSION **2024-??-??** +* New keyword parameter to `cf.histogram`: ``density`` + (https://github.com/NCAS-CMS/cf-python/issues/794) * New method: `cf.Field.is_discrete_axis` (https://github.com/NCAS-CMS/cf-python/issues/784) * Include the UM version as a field property when reading UM files diff --git a/cf/maths.py b/cf/maths.py index 52b4a4f1b0..94a322a5f6 100644 --- a/cf/maths.py +++ b/cf/maths.py @@ -92,7 +92,7 @@ def relative_vorticity( ) # pragman: no cover -def histogram(*digitized): +def histogram(*digitized, density=False): """Return the distribution of a set of variables in the form of an N-dimensional histogram. @@ -151,6 +151,17 @@ def histogram(*digitized): manipulating the dimensions of the digitized field construct's data to ensure that broadcasting can occur. + density: `bool`, optional + If False, the default, the result will contain the number + of samples in each bin. If True, the result is the value + of the probability density function at the bin, normalized + such that the integral over the range is 1. Note that the + sum of the histogram values will not be equal to 1 unless + bins of unity size are chosen; it is not a probability + mass function. + + .. versionadded:: NEXTVERSION + :Returns: `Field` @@ -197,7 +208,7 @@ def histogram(*digitized): [0.1174 0.1317] [0.1317 0.146 ]] >>> h = cf.histogram(indices) - >>> rint(h) + >>> print(h) Field: number_of_observations ----------------------------- Data : number_of_observations(specific_humidity(10)) 1 @@ -216,7 +227,21 @@ def histogram(*digitized): [0.1031 0.1174] [0.1174 0.1317] [0.1317 0.146 ]] + >>> p = cf.histogram(indices, density=True) + >>> print(p) + Field: long_name=probability density function + --------------------------------------------- + Data : long_name=probability density function(specific_humidity(10)) 1 + Cell methods : latitude: longitude: point + Dimension coords: specific_humidity(10) = [0.01015, ..., 0.13885] 1 + >>> print(p.data.round(2).array)) + [15.73 12.24 15.73 6.99 8.74 1.75 1.75 1.75 3.5 1.75] + + The sum of the density values multiplied by the bin sizes is + unity: + >>> print((p * p.dimension_coordinate().cellsize).sum().array) + 1.0 Create a two-dimensional histogram based on specific humidity and temperature bins. The temperature bins in this example are derived @@ -225,8 +250,8 @@ def histogram(*digitized): >>> g = f.copy() >>> g.standard_name = 'air_temperature' - >>> import numpy - >>> g[...] = numpy.random.normal(loc=290, scale=10, size=40).reshape(5, 8) + >>> import numpy as np + >>> g[...] = np.arange(40).reshape(5, 8) + 253.15 >>> g.override_units('K', inplace=True) >>> print(g) Field: air_temperature (ncvar%q) @@ -247,13 +272,28 @@ def histogram(*digitized): Dimension coords: air_temperature(5) = [281.1054839143287, ..., 313.9741786365939] K : specific_humidity(10) = [0.01015, ..., 0.13885] 1 >>> print(h.array) - [[2 1 5 3 2 -- -- -- -- --] - [1 1 2 -- 1 -- 1 1 -- --] - [4 4 2 1 1 1 -- -- 1 1] - [1 1 -- -- 1 -- -- -- 1 --] - [1 -- -- -- -- -- -- -- -- --]] + [[3 3 2 -- -- -- -- -- -- --] + [1 1 2 1 3 -- -- -- -- --] + [1 -- -- 1 -- 1 1 1 2 1] + [2 1 1 2 2 -- -- -- -- --] + [2 2 4 -- -- -- -- -- -- --]] >>> h.sum() + >>> p = cf.histogram(indices, indices_t, density=True) + >>> print(p) + Field: long_name=probability density function + --------------------------------------------- + Data : long_name=probability density function(air_temperature(5), specific_humidity(10)) K-1 + Cell methods : latitude: longitude: point + Dimension coords: air_temperature(5) = [257.05, ..., 288.25] K + : specific_humidity(10) = [0.01015, ..., 0.13885] 1 + >>> print(p.array) + >>> print(p.data.round(2).array)) + [[0.67 0.67 0.45 -- -- -- -- -- -- --] + [0.22 0.22 0.45 0.22 0.67 -- -- -- -- --] + [0.22 -- -- 0.22 -- 0.22 0.22 0.22 0.45 0.22] + [0.45 0.22 0.22 0.45 0.45 -- -- -- -- --] + [0.45 0.45 0.9 -- -- -- -- -- -- --]] """ if not digitized: @@ -264,7 +304,26 @@ def histogram(*digitized): f = digitized[0].copy() f.clear_properties() - return f.bin("sample_size", digitized=digitized) + out = f.bin("sample_size", digitized=digitized) + + if density: + # Get the measure of each bin. This is the outer product of + # the cell sizes of each dimension coordinate construct (the + # dimension coordinate constructs define the bins). + data_axes = out.get_data_axes() + dim = out.dimension_coordinate(filter_by_axis=(data_axes[0],)) + bin_measures = dim.cellsize + for axis in data_axes[1:]: + dim = out.dimension_coordinate(filter_by_axis=(axis,)) + bin_measures.outerproduct(dim.cellsize, inplace=True) + + # Convert counts to densities + out /= out.data.sum() * bin_measures + + out.del_property("standard_name", None) + out.set_property("long_name", "probability density function") + + return out def curl_xy(fx, fy, x_wrap=None, one_sided_at_boundary=False, radius=None): diff --git a/cf/test/test_Maths.py b/cf/test/test_Maths.py index 0900692aeb..09bc5e3d11 100644 --- a/cf/test/test_Maths.py +++ b/cf/test/test_Maths.py @@ -4,6 +4,8 @@ faulthandler.enable() # to debug seg faults and timeouts +import numpy as np + import cf @@ -207,6 +209,54 @@ def test_differential_operators(self): zeros[...] = 0 self.assertTrue(cg.data.equals(zeros.data, rtol=0, atol=1e-15)) + def test_histogram(self): + f = cf.example_field(0) + g = f.copy() + g.standard_name = "air_temperature" + g[...] = np.arange(40).reshape(5, 8) + 253.15 + g.override_units("K", inplace=True) + + atol = 1e-15 + + # 1-d histogram + indices = f.digitize(10) + h = cf.histogram(indices) + self.assertTrue((h.array == [9, 7, 9, 4, 5, 1, 1, 1, 2, 1]).all) + h = cf.histogram(indices, density=True) + self.assertEqual(h.Units, cf.Units("1")) + # Check that integral is 1 + bin_measures = h.dimension_coordinate().cellsize + integral = (h * bin_measures).sum() + self.assertTrue(np.allclose(integral.array, 1, rtol=0, atol=atol)) + + # 2-d histogram + indices_t = g.digitize(5) + h = cf.histogram(indices, indices_t) + self.assertEqual(h.Units, cf.Units()) + # The -1 values correspond to missing values in h + self.assertTrue( + ( + h.array + == [ + [3, 3, 2, -1, -1, -1, -1, -1, -1, -1], + [1, 1, 2, 1, 3, -1, -1, -1, -1, -1], + [1, -1, -1, 1, -1, 1, 1, 1, 2, 1], + [2, 1, 1, 2, 2, -1, -1, -1, -1, -1], + [2, 2, 4, -1, -1, -1, -1, -1, -1, -1], + ] + ).all() + ) + + h = cf.histogram(indices, indices_t, density=True) + self.assertEqual(h.Units, cf.Units("K-1")) + # Check that integral is 1 + bin_measures = h.dimension_coordinate("air_temperature").cellsize + bin_measures.outerproduct( + h.dimension_coordinate("specific_humidity").cellsize, inplace=True + ) + integral = (h * bin_measures).sum() + self.assertTrue(np.allclose(integral.array, 1, rtol=0, atol=atol)) + if __name__ == "__main__": print("Run date:", datetime.datetime.now())