From f8c757534df19d33b8c72f83331ba0124f787a12 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 15 Jul 2024 09:44:41 +0100 Subject: [PATCH 1/3] dev --- Changelog.rst | 2 + cf/maths.py | 79 +++++++++++++++++++++++---- cf/test/test_Maths.py | 121 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 192 insertions(+), 10 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index dc36568b30..9a7c3580d9 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -3,6 +3,8 @@ version NEXTVERSION **2024-??-??** +* New keyword parameter to `cf.histogram`: ``density`` + (https://github.com/NCAS-CMS/cf-python/issues/794) * New method: `cf.Field.is_discrete_axis` (https://github.com/NCAS-CMS/cf-python/issues/784) * Include the UM version as a field property when reading UM files diff --git a/cf/maths.py b/cf/maths.py index 52b4a4f1b0..77d75d58f9 100644 --- a/cf/maths.py +++ b/cf/maths.py @@ -92,7 +92,7 @@ def relative_vorticity( ) # pragman: no cover -def histogram(*digitized): +def histogram(*digitized, density=False): """Return the distribution of a set of variables in the form of an N-dimensional histogram. @@ -151,6 +151,17 @@ def histogram(*digitized): manipulating the dimensions of the digitized field construct's data to ensure that broadcasting can occur. + density: `bool`, optional + If False, the default, the result will contain the number + of samples in each bin. If True, the result is the value + of the probability density function at the bin, normalized + such that the integral over the range is 1. Note that the + sum of the histogram values will not be equal to 1 unless + bins of unity size are chosen; it is not a probability + mass function. + + .. versionadded:: NEXTVERSION + :Returns: `Field` @@ -197,7 +208,7 @@ def histogram(*digitized): [0.1174 0.1317] [0.1317 0.146 ]] >>> h = cf.histogram(indices) - >>> rint(h) + >>> print(h) Field: number_of_observations ----------------------------- Data : number_of_observations(specific_humidity(10)) 1 @@ -216,7 +227,21 @@ def histogram(*digitized): [0.1031 0.1174] [0.1174 0.1317] [0.1317 0.146 ]] + >>> p = cf.histogram(indices, density=True) + >>> print(p) + Field: long_name=probability density function + --------------------------------------------- + Data : long_name=probability density function(specific_humidity(10)) + Cell methods : latitude: longitude: point + Dimension coords: specific_humidity(10) = [0.01015, ..., 0.13885] 1 + >>> print(p.data.round(2).array)) + [15.73 12.24 15.73 6.99 8.74 1.75 1.75 1.75 3.5 1.75] + + The sum of the density values multiplied by the bin sizes is + unity: + >>> print((p * p.dimension_coordinate().cellsize).sum().array) + 1.0 Create a two-dimensional histogram based on specific humidity and temperature bins. The temperature bins in this example are derived @@ -225,8 +250,8 @@ def histogram(*digitized): >>> g = f.copy() >>> g.standard_name = 'air_temperature' - >>> import numpy - >>> g[...] = numpy.random.normal(loc=290, scale=10, size=40).reshape(5, 8) + >>> import numpy as np + >>> g[...] = np.arange(40).reshape(5, 8) + 253.15 >>> g.override_units('K', inplace=True) >>> print(g) Field: air_temperature (ncvar%q) @@ -247,13 +272,28 @@ def histogram(*digitized): Dimension coords: air_temperature(5) = [281.1054839143287, ..., 313.9741786365939] K : specific_humidity(10) = [0.01015, ..., 0.13885] 1 >>> print(h.array) - [[2 1 5 3 2 -- -- -- -- --] - [1 1 2 -- 1 -- 1 1 -- --] - [4 4 2 1 1 1 -- -- 1 1] - [1 1 -- -- 1 -- -- -- 1 --] - [1 -- -- -- -- -- -- -- -- --]] + [[3 3 2 -- -- -- -- -- -- --] + [1 1 2 1 3 -- -- -- -- --] + [1 -- -- 1 -- 1 1 1 2 1] + [2 1 1 2 2 -- -- -- -- --] + [2 2 4 -- -- -- -- -- -- --]] >>> h.sum() + >>> p = cf.histogram(indices, indices_t, density=True) + >>> print(p) + Field: long_name=probability density function + --------------------------------------------- + Data : long_name=probability density function(air_temperature(5), specific_humidity(10)) + Cell methods : latitude: longitude: point + Dimension coords: air_temperature(5) = [257.05, ..., 288.25] K + : specific_humidity(10) = [0.01015, ..., 0.13885] 1 + >>> print(p.array) + >>> print(p.data.round(2).array)) + [[0.67 0.67 0.45 -- -- -- -- -- -- --] + [0.22 0.22 0.45 0.22 0.67 -- -- -- -- --] + [0.22 -- -- 0.22 -- 0.22 0.22 0.22 0.45 0.22] + [0.45 0.22 0.22 0.45 0.45 -- -- -- -- --] + [0.45 0.45 0.9 -- -- -- -- -- -- --]] """ if not digitized: @@ -264,7 +304,26 @@ def histogram(*digitized): f = digitized[0].copy() f.clear_properties() - return f.bin("sample_size", digitized=digitized) + out = f.bin("sample_size", digitized=digitized) + + if density: + # Get the measure of each bin. This is the outer product of + # the cell sizes of each dimension coordinate construct. + data_axes = out.get_data_axes() + dim = out.dimension_coordinate(filter_by_axis=(data_axes[0],)) + bin_measures = dim.cellsize + for axis in data_axes[1:]: + dim = out.dimension_coordinate(filter_by_axis=(axis,)) + bin_measures.outerproduct(dim.cellsize, inplace=True) + + # Convert counts to densities + out /= out.data.sum() * bin_measures + + out.override_units(Units(), inplace=True) + out.del_property("standard_name", None) + out.set_property("long_name", "probability density function") + + return out def curl_xy(fx, fy, x_wrap=None, one_sided_at_boundary=False, radius=None): diff --git a/cf/test/test_Maths.py b/cf/test/test_Maths.py index 0900692aeb..a3de8e4c37 100644 --- a/cf/test/test_Maths.py +++ b/cf/test/test_Maths.py @@ -4,6 +4,8 @@ faulthandler.enable() # to debug seg faults and timeouts +import numpy as np + import cf @@ -207,6 +209,125 @@ def test_differential_operators(self): zeros[...] = 0 self.assertTrue(cg.data.equals(zeros.data, rtol=0, atol=1e-15)) + def test_histogram(self): + f = cf.example_field(0) + g = f.copy() + g.standard_name = "air_temperature" + g[...] = np.arange(40).reshape(5, 8) + 253.15 + g.override_units("K", inplace=True) + + # 1-d histogram + indices = f.digitize(10) + h = cf.histogram(indices) + self.assertTrue((h.array == [9, 7, 9, 4, 5, 1, 1, 1, 2, 1]).all) + h = cf.histogram(indices, density=True) + self.assertTrue( + ( + h.array + == [ + 15.734265734265733, + 12.237762237762242, + 15.734265734265733, + 6.9930069930069925, + 8.74125874125874, + 1.748251748251749, + 1.7482517482517475, + 1.748251748251749, + 3.496503496503498, + 1.748251748251744, + ] + ).all + ) + integral = (h * h.dimension_coordinate().cellsize).sum() + self.assertEqual(integral.array, 1) + + # 2-d histogram + indices_t = g.digitize(5) + h = cf.histogram(indices, indices_t) + self.assertEqual(h.Units, cf.Units()) + self.assertTrue( + ( + h.array + == [ + [3, 3, 2, -1, -1, -1, -1, -1, -1, -1], + [1, 1, 2, 1, 3, -1, -1, -1, -1, -1], + [1, -1, -1, 1, -1, 1, 1, 1, 2, 1], + [2, 1, 1, 2, 2, -1, -1, -1, -1, -1], + [2, 2, 4, -1, -1, -1, -1, -1, -1, -1], + ] + ).all() + ) + + h = cf.histogram(indices, indices_t, density=True) + self.assertEqual(h.Units, cf.Units()) + self.assertTrue( + ( + h.array + == [ + [ + 0.6724045185583661, + 0.6724045185583664, + 0.44826967903891074, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + ], + [ + 0.22413483951945457, + 0.2241348395194546, + 0.44826967903890913, + 0.22413483951945457, + 0.6724045185583637, + -1, + -1, + -1, + -1, + -1, + ], + [ + 0.22413483951945457, + -1, + -1, + 0.22413483951945457, + -1, + 0.2241348395194547, + 0.22413483951945448, + 0.2241348395194547, + 0.4482696790389094, + 0.224134839519454, + ], + [ + 0.4482696790389124, + 0.22413483951945626, + 0.2241348395194562, + 0.4482696790389124, + 0.4482696790389124, + -1, + -1, + -1, + -1, + -1, + ], + [ + 0.4482696790389059, + 0.44826967903890597, + 0.8965393580778118, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + ], + ] + ).all() + ) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) From f3ad202a994496cdf43da98cfb2b25a7d62dfdb8 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 16 Jul 2024 09:32:44 +0100 Subject: [PATCH 2/3] dev --- cf/maths.py | 3 +- cf/test/test_Maths.py | 98 ++++++------------------------------------- 2 files changed, 15 insertions(+), 86 deletions(-) diff --git a/cf/maths.py b/cf/maths.py index 77d75d58f9..2e59daa5cc 100644 --- a/cf/maths.py +++ b/cf/maths.py @@ -308,7 +308,8 @@ def histogram(*digitized, density=False): if density: # Get the measure of each bin. This is the outer product of - # the cell sizes of each dimension coordinate construct. + # the cell sizes of each dimension coordinate construct (the + # dimension coordinate constructs define the bins). data_axes = out.get_data_axes() dim = out.dimension_coordinate(filter_by_axis=(data_axes[0],)) bin_measures = dim.cellsize diff --git a/cf/test/test_Maths.py b/cf/test/test_Maths.py index a3de8e4c37..1221776d3e 100644 --- a/cf/test/test_Maths.py +++ b/cf/test/test_Maths.py @@ -216,35 +216,23 @@ def test_histogram(self): g[...] = np.arange(40).reshape(5, 8) + 253.15 g.override_units("K", inplace=True) + atol = 1e-15 + # 1-d histogram indices = f.digitize(10) h = cf.histogram(indices) self.assertTrue((h.array == [9, 7, 9, 4, 5, 1, 1, 1, 2, 1]).all) h = cf.histogram(indices, density=True) - self.assertTrue( - ( - h.array - == [ - 15.734265734265733, - 12.237762237762242, - 15.734265734265733, - 6.9930069930069925, - 8.74125874125874, - 1.748251748251749, - 1.7482517482517475, - 1.748251748251749, - 3.496503496503498, - 1.748251748251744, - ] - ).all - ) - integral = (h * h.dimension_coordinate().cellsize).sum() - self.assertEqual(integral.array, 1) + # Check that integral is 1 + bin_measures = h.dimension_coordinate().cellsize + integral = (h * bin_measures).sum() + self.assertTrue(np.allclose(integral.array, 1, rtol=0, atol=atol)) # 2-d histogram indices_t = g.digitize(5) h = cf.histogram(indices, indices_t) self.assertEqual(h.Units, cf.Units()) + # The -1 values correspond to missing values in h self.assertTrue( ( h.array @@ -260,73 +248,13 @@ def test_histogram(self): h = cf.histogram(indices, indices_t, density=True) self.assertEqual(h.Units, cf.Units()) - self.assertTrue( - ( - h.array - == [ - [ - 0.6724045185583661, - 0.6724045185583664, - 0.44826967903891074, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - ], - [ - 0.22413483951945457, - 0.2241348395194546, - 0.44826967903890913, - 0.22413483951945457, - 0.6724045185583637, - -1, - -1, - -1, - -1, - -1, - ], - [ - 0.22413483951945457, - -1, - -1, - 0.22413483951945457, - -1, - 0.2241348395194547, - 0.22413483951945448, - 0.2241348395194547, - 0.4482696790389094, - 0.224134839519454, - ], - [ - 0.4482696790389124, - 0.22413483951945626, - 0.2241348395194562, - 0.4482696790389124, - 0.4482696790389124, - -1, - -1, - -1, - -1, - -1, - ], - [ - 0.4482696790389059, - 0.44826967903890597, - 0.8965393580778118, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - ], - ] - ).all() + # Check that integral is 1 + bin_measures = h.dimension_coordinate("air_temperature").cellsize + bin_measures.outerproduct( + h.dimension_coordinate("specific_humidity").cellsize, inplace=True ) + integral = (h * bin_measures).sum() + self.assertTrue(np.allclose(integral.array, 1, rtol=0, atol=atol)) if __name__ == "__main__": From 8e5487ffcba6af534d1b38485a2da1ab515d9db1 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 16 Jul 2024 18:55:53 +0100 Subject: [PATCH 3/3] dev --- cf/maths.py | 5 ++--- cf/test/test_Maths.py | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cf/maths.py b/cf/maths.py index 2e59daa5cc..94a322a5f6 100644 --- a/cf/maths.py +++ b/cf/maths.py @@ -231,7 +231,7 @@ def histogram(*digitized, density=False): >>> print(p) Field: long_name=probability density function --------------------------------------------- - Data : long_name=probability density function(specific_humidity(10)) + Data : long_name=probability density function(specific_humidity(10)) 1 Cell methods : latitude: longitude: point Dimension coords: specific_humidity(10) = [0.01015, ..., 0.13885] 1 >>> print(p.data.round(2).array)) @@ -283,7 +283,7 @@ def histogram(*digitized, density=False): >>> print(p) Field: long_name=probability density function --------------------------------------------- - Data : long_name=probability density function(air_temperature(5), specific_humidity(10)) + Data : long_name=probability density function(air_temperature(5), specific_humidity(10)) K-1 Cell methods : latitude: longitude: point Dimension coords: air_temperature(5) = [257.05, ..., 288.25] K : specific_humidity(10) = [0.01015, ..., 0.13885] 1 @@ -320,7 +320,6 @@ def histogram(*digitized, density=False): # Convert counts to densities out /= out.data.sum() * bin_measures - out.override_units(Units(), inplace=True) out.del_property("standard_name", None) out.set_property("long_name", "probability density function") diff --git a/cf/test/test_Maths.py b/cf/test/test_Maths.py index 1221776d3e..09bc5e3d11 100644 --- a/cf/test/test_Maths.py +++ b/cf/test/test_Maths.py @@ -223,6 +223,7 @@ def test_histogram(self): h = cf.histogram(indices) self.assertTrue((h.array == [9, 7, 9, 4, 5, 1, 1, 1, 2, 1]).all) h = cf.histogram(indices, density=True) + self.assertEqual(h.Units, cf.Units("1")) # Check that integral is 1 bin_measures = h.dimension_coordinate().cellsize integral = (h * bin_measures).sum() @@ -247,7 +248,7 @@ def test_histogram(self): ) h = cf.histogram(indices, indices_t, density=True) - self.assertEqual(h.Units, cf.Units()) + self.assertEqual(h.Units, cf.Units("K-1")) # Check that integral is 1 bin_measures = h.dimension_coordinate("air_temperature").cellsize bin_measures.outerproduct(