Skip to content

Commit c144da3

Browse files
committed
Implement RFC 3349, mixed utf8 literals.
Specifically:
- Allow unicode chars in b"" and br"" literals. This is done by changing `Mode::allow_unicode_chars` to succeed on `ByteStr` and `RawByteStr`.
- Allow unicode escapes in b"" literals. This is done by changing `Mode::allow_unicode_escapes` to succeed on `ByteStr`.

Byte string literals can already have high bytes (`\x80`..`\xff`). Because they now also support unicode chars, they can now be mixed utf8, so we use `unescape_mixed`/`cook_mixed` instead of `unescape_unicode`/`cook_unicode` to process them.

A new type, `Rfc3349`, is used to implement the feature gating. Values of that type are threaded through the unescaping code to track whether rules from rfc3349 are required for unescaping to succeed.

Test changes:
- tests/ui/mixed-utf8-literals/basic.rs: new `check-pass` UI test with various literals exercising the new forms.
- tests/ui/attributes/key-value-non-ascii.rs: changed from a byte string literal to a byte literal; we just need some kind of problem with a literal to preserve the test's intent.
- tests/ui/parser/raw/raw-byte-string-literals.rs: moved the raw byte string literal with a non-ASCII char to `basic.rs`.
- tests/ui/parser/byte-string-literals.rs: similar.
- tests/ui/parser/issues/issue-23620-invalid-escapes.rs: moved one case fully to `basic.rs`, and one partially.
- tests/ui/parser/unicode-control-codepoints.rs: left the code unchanged, but the errors are now about mixed-utf8-literals being feature gated.
- tests/ui/suggestions/multibyte-escapes.rs: moved one case to `basic.rs`.
- compiler/rustc_lexer/src/unescape/tests.rs: various adjustments
  - two cases that previously failed now succeed
  - added some more cases for the newly supported syntax

I wasn't sure how to handle rust-analyzer in general, so I just allowed mixed utf8 literals everywhere without complaint.
1 parent fb4bca0 commit c144da3

29 files changed

+364
-348
lines changed

compiler/rustc_ast/src/util/literal.rs

+10-6
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
44
use crate::token::{self, Token};
55
use rustc_lexer::unescape::{
6-
byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode,
6+
unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode,
77
};
88
use rustc_span::symbol::{kw, sym, Symbol};
99
use rustc_span::Span;
@@ -49,7 +49,8 @@ impl LitKind {
4949

5050
// For byte/char/string literals, chars and escapes have already been
5151
// checked in the lexer (in `cook_lexer_literal`). So we can assume all
52-
// chars and escapes are valid here.
52+
// chars and escapes are valid here, and ignore `Rfc3349` return
53+
// values.
5354
Ok(match kind {
5455
token::Bool => {
5556
assert!(symbol.is_bool_lit());
@@ -84,7 +85,7 @@ impl LitKind {
8485
// Force-inlining here is aggressive but the closure is
8586
// called on every char in the string, so it can be hot in
8687
// programs with many long strings containing escapes.
87-
unescape_unicode(
88+
_ = unescape_unicode(
8889
s,
8990
Mode::Str,
9091
&mut #[inline(always)]
@@ -108,8 +109,11 @@ impl LitKind {
108109
token::ByteStr => {
109110
let s = symbol.as_str();
110111
let mut buf = Vec::with_capacity(s.len());
111-
unescape_unicode(s, Mode::ByteStr, &mut |_, c| match c {
112-
Ok(c) => buf.push(byte_from_char(c)),
112+
_ = unescape_mixed(s, Mode::ByteStr, &mut |_, c| match c {
113+
Ok(MixedUnit::Char(c)) => {
114+
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
115+
}
116+
Ok(MixedUnit::HighByte(b)) => buf.push(b),
113117
Err(err) => {
114118
assert!(!err.is_fatal(), "failed to unescape string literal")
115119
}
@@ -125,7 +129,7 @@ impl LitKind {
125129
token::CStr => {
126130
let s = symbol.as_str();
127131
let mut buf = Vec::with_capacity(s.len());
128-
unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
132+
_ = unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
129133
Ok(MixedUnit::Char(c)) => {
130134
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
131135
}

compiler/rustc_ast_passes/src/feature_gate.rs

+1
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,7 @@ pub fn check_crate(krate: &ast::Crate, sess: &Session, features: &Features) {
508508
}
509509
};
510510
}
511+
gate_all!(mixed_utf8_literals, r#"mixed utf8 b"" and br"" literals are experimental"#);
511512
gate_all!(
512513
if_let_guard,
513514
"`if let` guards are experimental",

compiler/rustc_feature/src/unstable.rs

+2
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,8 @@ declare_features! (
523523
/// standard library until the soundness issues with specialization
524524
/// are fixed.
525525
(unstable, min_specialization, "1.7.0", Some(31844)),
526+
/// Allows mixed utf8 b"" and br"" literals.
527+
(unstable, mixed_utf8_literals, "CURRENT_RUSTC_VERSION", Some(116907)),
526528
/// Allows qualified paths in struct expressions, struct patterns and tuple struct patterns.
527529
(unstable, more_qualified_paths, "1.54.0", Some(86935)),
528530
/// Allows the `#[must_not_suspend]` attribute.

compiler/rustc_lexer/src/unescape.rs

+64-25
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ impl EscapeError {
8585
///
8686
/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
8787
/// the callback will be called exactly once.
88-
pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F)
88+
pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
8989
where
9090
F: FnMut(Range<usize>, Result<char, EscapeError>),
9191
{
@@ -94,16 +94,17 @@ where
9494
let mut chars = src.chars();
9595
let res = unescape_char_or_byte(&mut chars, mode);
9696
callback(0..(src.len() - chars.as_str().len()), res);
97+
Rfc3349::Unused // rfc3349 not relevant for `Mode::{Char,Byte}`
9798
}
98-
Str | ByteStr => unescape_non_raw_common(src, mode, callback),
99+
Str => unescape_non_raw_common(src, mode, callback),
99100
RawStr | RawByteStr => check_raw_common(src, mode, callback),
100101
RawCStr => check_raw_common(src, mode, &mut |r, mut result| {
101102
if let Ok('\0') = result {
102103
result = Err(EscapeError::NulInCStr);
103104
}
104105
callback(r, result)
105106
}),
106-
CStr => unreachable!(),
107+
ByteStr | CStr => unreachable!(),
107108
}
108109
}
109110

@@ -142,18 +143,19 @@ impl From<u8> for MixedUnit {
142143
/// a sequence of escaped characters or errors.
143144
///
144145
/// Values are returned by invoking `callback`.
145-
pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
146+
pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
146147
where
147148
F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
148149
{
149150
match mode {
151+
ByteStr => unescape_non_raw_common(src, mode, callback),
150152
CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| {
151153
if let Ok(MixedUnit::Char('\0')) = result {
152154
result = Err(EscapeError::NulInCStr);
153155
}
154156
callback(r, result)
155157
}),
156-
Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(),
158+
Char | Byte | Str | RawStr | RawByteStr | RawCStr => unreachable!(),
157159
}
158160
}
159161

@@ -169,6 +171,15 @@ pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
169171
unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char)
170172
}
171173

174+
/// Used to indicate if rfc3349 (mixed-utf8-literals) was required for the
175+
/// literal to be valid. Once rfc3349 is stabilized this type can be removed.
176+
#[derive(Debug, PartialEq)]
177+
#[must_use]
178+
pub enum Rfc3349 {
179+
Used,
180+
Unused,
181+
}
182+
172183
/// What kind of literal do we parse.
173184
#[derive(Debug, Clone, Copy, PartialEq)]
174185
pub enum Mode {
@@ -205,17 +216,25 @@ impl Mode {
205216

206217
/// Are unicode (non-ASCII) chars allowed?
207218
#[inline]
208-
fn allow_unicode_chars(self) -> bool {
219+
fn allow_unicode_chars(self, rfc3349: &mut Rfc3349) -> bool {
209220
match self {
210-
Byte | ByteStr | RawByteStr => false,
221+
Byte => false,
222+
ByteStr | RawByteStr => {
223+
*rfc3349 = Rfc3349::Used;
224+
true
225+
}
211226
Char | Str | RawStr | CStr | RawCStr => true,
212227
}
213228
}
214229

215230
/// Are unicode escapes (`\u`) allowed?
216-
fn allow_unicode_escapes(self) -> bool {
231+
fn allow_unicode_escapes(self, rfc3349: &mut Rfc3349) -> bool {
217232
match self {
218-
Byte | ByteStr => false,
233+
Byte => false,
234+
ByteStr => {
235+
*rfc3349 = Rfc3349::Used;
236+
true
237+
}
219238
Char | Str | CStr => true,
220239
RawByteStr | RawStr | RawCStr => unreachable!(),
221240
}
@@ -233,6 +252,7 @@ impl Mode {
233252
fn scan_escape<T: From<char> + From<u8>>(
234253
chars: &mut Chars<'_>,
235254
mode: Mode,
255+
rfc3349: &mut Rfc3349,
236256
) -> Result<T, EscapeError> {
237257
// Previous character was '\\', unescape what follows.
238258
let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? {
@@ -262,13 +282,17 @@ fn scan_escape<T: From<char> + From<u8>>(
262282
Ok(T::from(value as u8))
263283
};
264284
}
265-
'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from),
285+
'u' => return scan_unicode(chars, mode, rfc3349).map(T::from),
266286
_ => return Err(EscapeError::InvalidEscape),
267287
};
268288
Ok(T::from(res))
269289
}
270290

271-
fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
291+
fn scan_unicode(
292+
chars: &mut Chars<'_>,
293+
mode: Mode,
294+
rfc3349: &mut Rfc3349,
295+
) -> Result<char, EscapeError> {
272296
// We've parsed '\u', now we have to parse '{..}'.
273297

274298
if chars.next() != Some('{') {
@@ -296,7 +320,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
296320

297321
// Incorrect syntax has higher priority for error reporting
298322
// than unallowed value for a literal.
299-
if !allow_unicode_escapes {
323+
if !mode.allow_unicode_escapes(rfc3349) {
300324
return Err(EscapeError::UnicodeEscapeInByte);
301325
}
302326

@@ -322,18 +346,27 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
322346
}
323347

324348
#[inline]
325-
fn ascii_check(c: char, allow_unicode_chars: bool) -> Result<char, EscapeError> {
326-
if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) }
349+
fn ascii_check(c: char, mode: Mode, rfc3349: &mut Rfc3349) -> Result<char, EscapeError> {
350+
// We must check `is_ascii` first, to avoid setting `rfc3349` unnecessarily.
351+
if c.is_ascii() || mode.allow_unicode_chars(rfc3349) {
352+
Ok(c)
353+
} else {
354+
Err(EscapeError::NonAsciiCharInByte)
355+
}
327356
}
328357

329358
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
330359
let c = chars.next().ok_or(EscapeError::ZeroChars)?;
360+
let mut rfc3349 = Rfc3349::Unused;
331361
let res = match c {
332-
'\\' => scan_escape(chars, mode),
362+
'\\' => scan_escape(chars, mode, &mut rfc3349),
333363
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
334364
'\r' => Err(EscapeError::BareCarriageReturn),
335-
_ => ascii_check(c, mode.allow_unicode_chars()),
365+
_ => ascii_check(c, mode, &mut rfc3349),
336366
}?;
367+
368+
assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::{Char,Byte}`
369+
337370
if chars.next().is_some() {
338371
return Err(EscapeError::MoreThanOneChar);
339372
}
@@ -342,12 +375,16 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
342375

343376
/// Takes a contents of a string literal (without quotes) and produces a
344377
/// sequence of escaped characters or errors.
345-
fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
378+
fn unescape_non_raw_common<F, T: From<char> + From<u8>>(
379+
src: &str,
380+
mode: Mode,
381+
callback: &mut F,
382+
) -> Rfc3349
346383
where
347384
F: FnMut(Range<usize>, Result<T, EscapeError>),
348385
{
349386
let mut chars = src.chars();
350-
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
387+
let mut rfc3349 = Rfc3349::Unused;
351388

352389
// The `start` and `end` computation here is complicated because
353390
// `skip_ascii_whitespace` makes us to skip over chars without counting
@@ -367,16 +404,17 @@ where
367404
});
368405
continue;
369406
}
370-
_ => scan_escape::<T>(&mut chars, mode),
407+
_ => scan_escape::<T>(&mut chars, mode, &mut rfc3349),
371408
}
372409
}
373410
'"' => Err(EscapeError::EscapeOnlyChar),
374411
'\r' => Err(EscapeError::BareCarriageReturn),
375-
_ => ascii_check(c, allow_unicode_chars).map(T::from),
412+
_ => ascii_check(c, mode, &mut rfc3349).map(T::from),
376413
};
377414
let end = src.len() - chars.as_str().len();
378415
callback(start..end, res);
379416
}
417+
rfc3349
380418
}
381419

382420
fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
@@ -409,12 +447,12 @@ where
409447
/// sequence of characters or errors.
410448
/// NOTE: Raw strings do not perform any explicit character escaping, here we
411449
/// only produce errors on bare CR.
412-
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
450+
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
413451
where
414452
F: FnMut(Range<usize>, Result<char, EscapeError>),
415453
{
416454
let mut chars = src.chars();
417-
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
455+
let mut rfc3349 = Rfc3349::Unused;
418456

419457
// The `start` and `end` computation here matches the one in
420458
// `unescape_non_raw_common` for consistency, even though this function
@@ -423,16 +461,17 @@ where
423461
let start = src.len() - chars.as_str().len() - c.len_utf8();
424462
let res = match c {
425463
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
426-
_ => ascii_check(c, allow_unicode_chars),
464+
_ => ascii_check(c, mode, &mut rfc3349),
427465
};
428466
let end = src.len() - chars.as_str().len();
429467
callback(start..end, res);
430468
}
469+
rfc3349
431470
}
432471

433472
#[inline]
434-
pub fn byte_from_char(c: char) -> u8 {
473+
pub(crate) fn byte_from_char(c: char) -> u8 {
435474
let res = c as u32;
436-
debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
475+
debug_assert!(res <= u8::MAX as u32, "guaranteed because of Byte");
437476
res as u8
438477
}

0 commit comments

Comments
 (0)