From 50822a57b27fa1b80d1b1cade5b5fb493f4157f9 Mon Sep 17 00:00:00 2001 From: Jan Date: Sat, 27 Apr 2024 09:20:40 +0200 Subject: [PATCH 1/7] Update float16.go Added SmallestNonzero. --- float16.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/float16.go b/float16.go index 4ca060d..1691db0 100644 --- a/float16.go +++ b/float16.go @@ -38,8 +38,13 @@ const ( // PrecisionOverflow is for Overflows. Cannot round-trip float32->float16->float32. PrecisionOverflow + ) +// SmallestNonzero value that is possible to represent in float16: 0.00006109476 +// It's the float16 equivalent for [math.SmallestNonzeroFloat32] and [math.SmallestNonzeroFloat64]. +const SmallestNonzero = Float16(0b000010000000001) + // PrecisionFromfloat32 returns Precision without performing // the conversion. Conversions from both Infinity and NaN // values will always report PrecisionExact even if NaN payload From 22b64dacd45fdad0dd0e922aca1b563bf48ca283 Mon Sep 17 00:00:00 2001 From: Jan Date: Sat, 27 Apr 2024 09:22:30 +0200 Subject: [PATCH 2/7] Update float16.go Removed spurious empty line. --- float16.go | 1 - 1 file changed, 1 deletion(-) diff --git a/float16.go b/float16.go index 1691db0..e2e58c2 100644 --- a/float16.go +++ b/float16.go @@ -38,7 +38,6 @@ const ( // PrecisionOverflow is for Overflows. Cannot round-trip float32->float16->float32. PrecisionOverflow - ) // SmallestNonzero value that is possible to represent in float16: 0.00006109476 From f4458718ba23df975ac214495f4432d78d4e5637 Mon Sep 17 00:00:00 2001 From: Jan Pfeifer Date: Sat, 27 Apr 2024 09:25:15 +0200 Subject: [PATCH 3/7] Binary literals require Go >= go1.13. 
--- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 2074c3a..a70bf40 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,3 @@ module github.com/x448/float16 -go 1.11 +go 1.13 From 2480cf34288b37e93b38263b8b73ee358897ca06 Mon Sep 17 00:00:00 2001 From: Jan Date: Mon, 29 Apr 2024 07:44:47 +0200 Subject: [PATCH 4/7] Update float16.go Co-authored-by: Faye Amacker <33205765+fxamacker@users.noreply.github.com> --- float16.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/float16.go b/float16.go index e2e58c2..4174656 100644 --- a/float16.go +++ b/float16.go @@ -40,10 +40,12 @@ const ( PrecisionOverflow ) -// SmallestNonzero value that is possible to represent in float16: 0.00006109476 +// SmallestNonzero is the smallest nonzero denormal value for float16 (0.000000059604645). // It's the float16 equivalent for [math.SmallestNonzeroFloat32] and [math.SmallestNonzeroFloat64]. -const SmallestNonzero = Float16(0b000010000000001) - +// For context, [math.SmallestNonzeroFloat32] used the formula 1 / 2**(127 - 1 + 23) to produce +// the smallest denormal value for float32 (1.401298464324817070923729583289916131280e-45). +// The equivalent formula for float16 is 1 / 2**(15 - 1 + 10). We use Float16(0x0001) to compile as const. +const SmallestNonzero = Float16(0x0001) // 5.9604645e-08 (effectively 0x1p-14 * 0x1p-10) // PrecisionFromfloat32 returns Precision without performing // the conversion. Conversions from both Infinity and NaN // values will always report PrecisionExact even if NaN payload From db1ee2c9fc82ea7ddb1a857caa9ff1080817f661 Mon Sep 17 00:00:00 2001 From: Jan Pfeifer Date: Mon, 29 Apr 2024 07:58:03 +0200 Subject: [PATCH 5/7] Added test of SmallestNonzero comparing it to the same binary representation in float32. 
--- float16.go | 1 + float16_test.go | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/float16.go b/float16.go index 4174656..db65116 100644 --- a/float16.go +++ b/float16.go @@ -46,6 +46,7 @@ const ( // the smallest denormal value for float32 (1.401298464324817070923729583289916131280e-45). // The equivalent formula for float16 is 1 / 2**(15 - 1 + 10). We use Float16(0x0001) to compile as const. const SmallestNonzero = Float16(0x0001) // 5.9604645e-08 (effectively 0x1p-14 * 0x1p-10) + // PrecisionFromfloat32 returns Precision without performing // the conversion. Conversions from both Infinity and NaN // values will always report PrecisionExact even if NaN payload diff --git a/float16_test.go b/float16_test.go index 766cf35..b12ab91 100644 --- a/float16_test.go +++ b/float16_test.go @@ -794,5 +794,11 @@ func checkRoundTrippedPrecision(t *testing.T, u32 uint32, u16 uint16, u32bis uin t.Errorf("PrecisionFromfloat32 in f32bits=0x%08x (%032b) (%f), out f16bits=0x%04x (%v), back=0x%08x (%f), got %v, wanted PrecisionExact, exp=%d, coef=%d, drpd=%d", u32, u32, f32, u16, f16, u32bis, f32bis, pre, exp32, coef32, dropped32) } } +} +func TestSmallestNonzero(t *testing.T) { + want := float32(0x1p-24) + if float16.SmallestNonzero.Float32() != want { + t.Errorf("Invalid SmallestNonzero to float32 conversion: Float16=%s, wanted %g", float16.SmallestNonzero, want) + } } From b9408396ce40e8013116090acc19a4476527c530 Mon Sep 17 00:00:00 2001 From: Jan Pfeifer Date: Mon, 29 Apr 2024 08:31:43 +0200 Subject: [PATCH 6/7] Added a comment about the -24 binary exponent. 
--- float16_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/float16_test.go b/float16_test.go index b12ab91..533ae69 100644 --- a/float16_test.go +++ b/float16_test.go @@ -797,7 +797,7 @@ func checkRoundTrippedPrecision(t *testing.T, u32 uint32, u16 uint16, u32bis uin } func TestSmallestNonzero(t *testing.T) { - want := float32(0x1p-24) + want := float32(0x1p-24) // -15 + 1 - 10 if float16.SmallestNonzero.Float32() != want { t.Errorf("Invalid SmallestNonzero to float32 conversion: Float16=%s, wanted %g", float16.SmallestNonzero, want) } From 4b7f7e124ff1f438ff759e04cd338605decbca4c Mon Sep 17 00:00:00 2001 From: Jan Pfeifer Date: Wed, 1 May 2024 07:21:00 +0200 Subject: [PATCH 7/7] Fixed linter errors. --- float16.go | 2 +- float16_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/float16.go b/float16.go index db65116..c315091 100644 --- a/float16.go +++ b/float16.go @@ -45,7 +45,7 @@ const ( // For context, [math.SmallestNonzeroFloat32] used the formula 1 / 2**(127 - 1 + 23) to produce // the smallest denormal value for float32 (1.401298464324817070923729583289916131280e-45). // The equivalent formula for float16 is 1 / 2**(15 - 1 + 10). We use Float16(0x0001) to compile as const. -const SmallestNonzero = Float16(0x0001) // 5.9604645e-08 (effectively 0x1p-14 * 0x1p-10) +const SmallestNonzero = Float16(0x0001) // 5.9604645e-08 (effectively 0x1p-14 * 0x1p-10) // PrecisionFromfloat32 returns Precision without performing // the conversion. 
Conversions from both Infinity and NaN diff --git a/float16_test.go b/float16_test.go index 533ae69..104a8de 100644 --- a/float16_test.go +++ b/float16_test.go @@ -797,7 +797,7 @@ func checkRoundTrippedPrecision(t *testing.T, u32 uint32, u16 uint16, u32bis uin } func TestSmallestNonzero(t *testing.T) { - want := float32(0x1p-24) // -15 + 1 - 10 + want := float32(0x1p-24) // -15 + 1 - 10 if float16.SmallestNonzero.Float32() != want { t.Errorf("Invalid SmallestNonzero to float32 conversion: Float16=%s, wanted %g", float16.SmallestNonzero, want) }