From 8ae1d48672e0c37e4cca2abaa7d6c9e4cf9f32a1 Mon Sep 17 00:00:00 2001
From: ScottPJones
Date: Fri, 21 Sep 2018 16:51:02 -0400
Subject: [PATCH] Fix isvalid for 3-byte overlong encoded UTF-8 sequences

(cherry picked from commit 1bbea2232a4097e58606f9f9a14477a7338fb0b1)
---
 src/support/utf8.c    |  2 ++
 test/strings/basic.jl | 12 ++++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/support/utf8.c b/src/support/utf8.c
index 28c779b73b58b..ea7e970be6b51 100644
--- a/src/support/utf8.c
+++ b/src/support/utf8.c
@@ -570,6 +570,8 @@ int u8_isvalid(const char *str, size_t len)
                 return 0;
             // Check for surrogate chars
             if (byt == 0xed && *pnt > 0x9f) return 0;
+            // Check for overlong encoding
+            if (byt == 0xe0 && *pnt < 0xa0) return 0;
             pnt += 2;
         } else { // 4-byte sequence
             // Must have 3 valid continuation characters
diff --git a/test/strings/basic.jl b/test/strings/basic.jl
index 521dfa6d52b99..b2e14ac2f5451 100644
--- a/test/strings/basic.jl
+++ b/test/strings/basic.jl
@@ -467,9 +467,17 @@ end
             end
         end
     end
+    # Check for short three-byte sequences
+    @test isvalid(String, UInt8[0xe0]) == false
+    for (rng, flg) in ((0x00:0x9f, false), (0xa0:0xbf, true), (0xc0:0xff, false))
+        for cont in rng
+            @test isvalid(String, UInt8[0xe0, cont]) == false
+            @test isvalid(String, UInt8[0xe0, cont, 0x80]) == flg
+        end
+    end
     # Check three-byte sequences
-    for r1 in (0xe0:0xec, 0xee:0xef)
-        for byt = r1
+    for r1 in (0xe1:0xec, 0xee:0xef)
+        for byt in r1
             # Check for short sequence
             @test isvalid(String, UInt8[byt]) == false
             for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))