From b05ea7c7ce2e7e100d76aa16d9642843b1b6251f Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Wed, 20 Mar 2024 20:37:03 +0100 Subject: [PATCH] Fix a ReadConsoleOutputCharacter regression (#16898) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `nLength` parameter of `ReadConsoleOutputCharacterW` indicates the number of columns that should be read. For single-column (narrow) surrogate pairs this previously clipped a trailing character of the returned string. In the major Unicode support update in #13626 surrogate pairs truly got stored as atomic units for the first time. This now meant that a 120 column read with such codepoints resulted in 121 characters. Other parts of conhost still assume UCS2 however, and so this results in the entire read failing. This fixes the issue by turning surrogate pairs into U+FFFD which makes it UCS2 compatible. Closes #16892 * Write U+F15C0 and read it back with `ReadConsoleOutputCharacterW` * Read succeeds with a single U+FFFD ✅ (cherry picked from commit 373faf00c9ae8bafe1d9120a990ef409b037384b) Service-Card-Id: 92129542 Service-Version: 1.19 --- src/host/ft_host/CJK_DbcsTests.cpp | 17 +++++++++++++++++ src/host/output.cpp | 7 ++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/host/ft_host/CJK_DbcsTests.cpp b/src/host/ft_host/CJK_DbcsTests.cpp index 68798b700fd..9d96e470276 100644 --- a/src/host/ft_host/CJK_DbcsTests.cpp +++ b/src/host/ft_host/CJK_DbcsTests.cpp @@ -164,6 +164,8 @@ class DbcsTests BEGIN_TEST_METHOD(TestInvalidTrailer) TEST_METHOD_PROPERTY(L"IsolationLevel", L"Method") END_TEST_METHOD() + + TEST_METHOD(TestNarrowSurrogate); }; bool DbcsTests::DbcsTestSetup() @@ -2183,3 +2185,18 @@ void DbcsTests::TestInvalidTrailer() DbcsWriteRead::Verify(expected, output); } + +// The various console APIs that read back from the buffer are generally incompatible with UTF16 and surrogate pairs. 
+// ReadConsoleOutputCharacterW in particular has an nLength parameter which is a column count but also the buffer size. +// This makes it impossible to reliably return arbitrarily long graphemes per-cell in the output buffer. +// The test ensures that we replace them with U+FFFD which makes the behavior more consistent for the caller. +void DbcsTests::TestNarrowSurrogate() +{ + const auto out = GetStdHandle(STD_OUTPUT_HANDLE); + wchar_t buf[3]; + DWORD read; + + VERIFY_WIN32_BOOL_SUCCEEDED(WriteConsoleOutputCharacterW(out, L"a\U00010000b", 4, {}, &read)); + VERIFY_WIN32_BOOL_SUCCEEDED(ReadConsoleOutputCharacterW(out, &buf[0], ARRAYSIZE(buf), {}, &read)); + VERIFY_ARE_EQUAL(std::wstring_view(L"a\U0000FFFDb"), std::wstring_view(&buf[0], read)); +} diff --git a/src/host/output.cpp b/src/host/output.cpp index bd168fce9cd..bf1e0a91040 100644 --- a/src/host/output.cpp +++ b/src/host/output.cpp @@ -222,7 +222,12 @@ std::wstring ReadOutputStringW(const SCREEN_INFORMATION& screenInfo, // Otherwise, add anything that isn't a trailing cell. (Trailings are duplicate copies of the leading.) if (it->DbcsAttr() != DbcsAttribute::Trailing) { - retVal += it->Chars(); + auto chars = it->Chars(); + if (chars.size() > 1) + { + chars = { &UNICODE_REPLACEMENT, 1 }; + } + retVal += chars; } }