Skip to content

Commit f2dee33

Browse files
committed
Implement RFC 3349, mixed utf8 literals.
Specifically:
- Allow unicode chars in b"" and br"" literals. This is done by changing `Mode::allow_unicode_chars` to succeed on `ByteStr` and `RawByteStr`.
- Allow unicode escapes in b"" literals. This is done by changing `Mode::allow_unicode_escapes` to succeed on `ByteStr`.

Byte string literals can already have high bytes (`\x80`..`\xff`). Because they now also support unicode chars, they can now be mixed utf8, so we use `unescape_mixed`/`cook_mixed` instead of `unescape_unicode`/`cook_unicode` to process them.

A new type, `Rfc3349`, is used to implement the feature gating. Values of that type are threaded through the unescaping code to track whether rules from rfc3349 are required for unescaping to succeed.

Test changes

XXX: not sure about the latter three; could just move them into accepting tests
- tests/ui/attributes/key-value-non-ascii.rs: changed from a byte string literal to a byte literal; we just need some kind of problem with a literal to preserve the test's intent.
- tests/ui/parser/raw/raw-byte-string-literals.rs: removed the raw byte string literal with a non-ASCII char. The other lexing errors meant that the feature gate warning wasn't occurring anyway, because compilation was aborting too early. No great loss, because we'll test far more complex cases in `tests/ui/mixed-utf8-literals/`.
- tests/ui/parser/byte-string-literals.rs: similar.
- tests/ui/parser/issues/issue-23620-invalid-escapes.rs: left the test unchanged; two old `unicode escape in byte string` errors are now `mixed utf8 b"" and br"" literals are experimental` errors.
- tests/ui/parser/unicode-control-codepoints.rs: similar.
- tests/ui/suggestions/multibyte-escapes.rs: similar.

XXX: not sure how to handle rust-analyzer, just allowed mixed utf8 literals everywhere without complaint
1 parent 6077f82 commit f2dee33

29 files changed

+306
-226
lines changed

compiler/rustc_ast/src/util/literal.rs

+10-6
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
44
use crate::token::{self, Token};
55
use rustc_lexer::unescape::{
6-
byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode,
6+
unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode,
77
};
88
use rustc_span::symbol::{kw, sym, Symbol};
99
use rustc_span::Span;
@@ -49,7 +49,8 @@ impl LitKind {
4949

5050
// For byte/char/string literals, chars and escapes have already been
5151
// checked in the lexer (in `cook_lexer_literal`). So we can assume all
52-
// chars and escapes are valid here.
52+
// chars and escapes are valid here, and ignore `Rfc3349` return
53+
// values.
5354
Ok(match kind {
5455
token::Bool => {
5556
assert!(symbol.is_bool_lit());
@@ -84,7 +85,7 @@ impl LitKind {
8485
// Force-inlining here is aggressive but the closure is
8586
// called on every char in the string, so it can be hot in
8687
// programs with many long strings containing escapes.
87-
unescape_unicode(
88+
_ = unescape_unicode(
8889
s,
8990
Mode::Str,
9091
&mut #[inline(always)]
@@ -108,8 +109,11 @@ impl LitKind {
108109
token::ByteStr => {
109110
let s = symbol.as_str();
110111
let mut buf = Vec::with_capacity(s.len());
111-
unescape_unicode(s, Mode::ByteStr, &mut |_, c| match c {
112-
Ok(c) => buf.push(byte_from_char(c)),
112+
_ = unescape_mixed(s, Mode::ByteStr, &mut |_, c| match c {
113+
Ok(MixedUnit::Char(c)) => {
114+
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
115+
}
116+
Ok(MixedUnit::HighByte(b)) => buf.push(b),
113117
Err(err) => {
114118
assert!(!err.is_fatal(), "failed to unescape string literal")
115119
}
@@ -125,7 +129,7 @@ impl LitKind {
125129
token::CStr => {
126130
let s = symbol.as_str();
127131
let mut buf = Vec::with_capacity(s.len());
128-
unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
132+
_ = unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
129133
Ok(MixedUnit::Char(c)) => {
130134
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
131135
}

compiler/rustc_ast_passes/src/feature_gate.rs

+1
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,7 @@ pub fn check_crate(krate: &ast::Crate, sess: &Session, features: &Features) {
508508
}
509509
};
510510
}
511+
gate_all!(mixed_utf8_literals, r#"mixed utf8 b"" and br"" literals are experimental"#);
511512
gate_all!(
512513
if_let_guard,
513514
"`if let` guards are experimental",

compiler/rustc_feature/src/unstable.rs

+2
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,8 @@ declare_features! (
520520
/// standard library until the soundness issues with specialization
521521
/// are fixed.
522522
(unstable, min_specialization, "1.7.0", Some(31844)),
523+
/// Allows mixed utf8 b"" and br"" literals.
524+
(unstable, mixed_utf8_literals, "CURRENT_RUSTC_VERSION", Some(116907)),
523525
/// Allows qualified paths in struct expressions, struct patterns and tuple struct patterns.
524526
(unstable, more_qualified_paths, "1.54.0", Some(86935)),
525527
/// Allows the `#[must_not_suspend]` attribute.

compiler/rustc_lexer/src/unescape.rs

+68-25
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ use Mode::*;
99
#[cfg(test)]
1010
mod tests;
1111

12+
// njn: need to add tests in tests/ui/mixed-utf8-literals/; see
13+
// tests/ui/try-block/ for an example to follow
14+
1215
/// Errors and warnings that can occur during string unescaping. They mostly
1316
/// relate to malformed escape sequences, but there are a few that are about
1417
/// other problems.
@@ -85,7 +88,7 @@ impl EscapeError {
8588
///
8689
/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
8790
/// the callback will be called exactly once.
88-
pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F)
91+
pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
8992
where
9093
F: FnMut(Range<usize>, Result<char, EscapeError>),
9194
{
@@ -94,16 +97,17 @@ where
9497
let mut chars = src.chars();
9598
let res = unescape_char_or_byte(&mut chars, mode);
9699
callback(0..(src.len() - chars.as_str().len()), res);
100+
Rfc3349::Unused // rfc3349 is not relevant for char or byte literals
97101
}
98-
Str | ByteStr => unescape_non_raw_common(src, mode, callback),
102+
Str => unescape_non_raw_common(src, mode, callback),
99103
RawStr | RawByteStr => check_raw_common(src, mode, callback),
100104
RawCStr => check_raw_common(src, mode, &mut |r, mut result| {
101105
if let Ok('\0') = result {
102106
result = Err(EscapeError::NulInCStr);
103107
}
104108
callback(r, result)
105109
}),
106-
CStr => unreachable!(),
110+
ByteStr | CStr => unreachable!(),
107111
}
108112
}
109113

@@ -142,18 +146,19 @@ impl From<u8> for MixedUnit {
142146
/// a sequence of escaped characters or errors.
143147
///
144148
/// Values are returned by invoking `callback`.
145-
pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
149+
pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
146150
where
147151
F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
148152
{
149153
match mode {
154+
ByteStr => unescape_non_raw_common(src, mode, callback),
150155
CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| {
151156
if let Ok(MixedUnit::Char('\0')) = result {
152157
result = Err(EscapeError::NulInCStr);
153158
}
154159
callback(r, result)
155160
}),
156-
Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(),
161+
Char | Byte | Str | RawStr | RawByteStr | RawCStr => unreachable!(),
157162
}
158163
}
159164

@@ -169,6 +174,15 @@ pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
169174
unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char)
170175
}
171176

177+
/// Used to indicate if rfc3349 (mixed-utf8-literals) was required for the
178+
/// literal to be valid. Once rfc3349 is stabilized this type can be removed.
179+
#[derive(Debug, PartialEq)]
180+
#[must_use]
181+
pub enum Rfc3349 {
182+
Used,
183+
Unused,
184+
}
185+
172186
/// What kind of literal do we parse.
173187
#[derive(Debug, Clone, Copy, PartialEq)]
174188
pub enum Mode {
@@ -205,17 +219,25 @@ impl Mode {
205219

206220
/// Are unicode (non-ASCII) chars allowed?
207221
#[inline]
208-
fn allow_unicode_chars(self) -> bool {
222+
fn allow_unicode_chars(self, rfc3349: &mut Rfc3349) -> bool {
209223
match self {
210-
Byte | ByteStr | RawByteStr => false,
224+
Byte => false,
225+
ByteStr | RawByteStr => {
226+
*rfc3349 = Rfc3349::Used;
227+
true
228+
}
211229
Char | Str | RawStr | CStr | RawCStr => true,
212230
}
213231
}
214232

215233
/// Are unicode escapes (`\u`) allowed?
216-
fn allow_unicode_escapes(self) -> bool {
234+
fn allow_unicode_escapes(self, rfc3349: &mut Rfc3349) -> bool {
217235
match self {
218-
Byte | ByteStr => false,
236+
Byte => false,
237+
ByteStr => {
238+
*rfc3349 = Rfc3349::Used;
239+
true
240+
}
219241
Char | Str | CStr => true,
220242
RawByteStr | RawStr | RawCStr => unreachable!(),
221243
}
@@ -233,6 +255,7 @@ impl Mode {
233255
fn scan_escape<T: From<char> + From<u8>>(
234256
chars: &mut Chars<'_>,
235257
mode: Mode,
258+
rfc3349: &mut Rfc3349,
236259
) -> Result<T, EscapeError> {
237260
// Previous character was '\\', unescape what follows.
238261
let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? {
@@ -262,13 +285,17 @@ fn scan_escape<T: From<char> + From<u8>>(
262285
Ok(T::from(value as u8))
263286
};
264287
}
265-
'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from),
288+
'u' => return scan_unicode(chars, mode, rfc3349).map(T::from),
266289
_ => return Err(EscapeError::InvalidEscape),
267290
};
268291
Ok(T::from(res))
269292
}
270293

271-
fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
294+
fn scan_unicode(
295+
chars: &mut Chars<'_>,
296+
mode: Mode,
297+
rfc3349: &mut Rfc3349,
298+
) -> Result<char, EscapeError> {
272299
// We've parsed '\u', now we have to parse '{..}'.
273300

274301
if chars.next() != Some('{') {
@@ -296,7 +323,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
296323

297324
// Incorrect syntax has higher priority for error reporting
298325
// than unallowed value for a literal.
299-
if !allow_unicode_escapes {
326+
if !mode.allow_unicode_escapes(rfc3349) {
300327
return Err(EscapeError::UnicodeEscapeInByte);
301328
}
302329

@@ -322,18 +349,28 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
322349
}
323350

324351
#[inline]
325-
fn ascii_check(c: char, allow_unicode_chars: bool) -> Result<char, EscapeError> {
326-
if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) }
352+
fn ascii_check(c: char, mode: Mode, rfc3349: &mut Rfc3349) -> Result<char, EscapeError> {
353+
// We must check `is_ascii` first, to avoid setting `rfc3349` unnecessarily.
354+
if c.is_ascii() || mode.allow_unicode_chars(rfc3349) {
355+
Ok(c)
356+
} else {
357+
Err(EscapeError::NonAsciiCharInByte)
358+
}
327359
}
328360

329361
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
330362
let c = chars.next().ok_or(EscapeError::ZeroChars)?;
363+
let mut rfc3349 = Rfc3349::Unused;
331364
let res = match c {
332-
'\\' => scan_escape(chars, mode),
365+
'\\' => scan_escape(chars, mode, &mut rfc3349),
333366
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
334367
'\r' => Err(EscapeError::BareCarriageReturn),
335-
_ => ascii_check(c, mode.allow_unicode_chars()),
368+
_ => ascii_check(c, mode, &mut rfc3349),
336369
}?;
370+
371+
// rfc3349 is not relevant for char or byte literals.
372+
assert_eq!(rfc3349, Rfc3349::Unused);
373+
337374
if chars.next().is_some() {
338375
return Err(EscapeError::MoreThanOneChar);
339376
}
@@ -342,12 +379,16 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
342379

343380
/// Takes a contents of a string literal (without quotes) and produces a
344381
/// sequence of escaped characters or errors.
345-
fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
382+
fn unescape_non_raw_common<F, T: From<char> + From<u8>>(
383+
src: &str,
384+
mode: Mode,
385+
callback: &mut F,
386+
) -> Rfc3349
346387
where
347388
F: FnMut(Range<usize>, Result<T, EscapeError>),
348389
{
349390
let mut chars = src.chars();
350-
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
391+
let mut rfc3349 = Rfc3349::Unused;
351392

352393
// The `start` and `end` computation here is complicated because
353394
`skip_ascii_whitespace` makes us skip over chars without counting
@@ -367,16 +408,17 @@ where
367408
});
368409
continue;
369410
}
370-
_ => scan_escape::<T>(&mut chars, mode),
411+
_ => scan_escape::<T>(&mut chars, mode, &mut rfc3349),
371412
}
372413
}
373414
'"' => Err(EscapeError::EscapeOnlyChar),
374415
'\r' => Err(EscapeError::BareCarriageReturn),
375-
_ => ascii_check(c, allow_unicode_chars).map(T::from),
416+
_ => ascii_check(c, mode, &mut rfc3349).map(T::from),
376417
};
377418
let end = src.len() - chars.as_str().len();
378419
callback(start..end, res);
379420
}
421+
rfc3349
380422
}
381423

382424
fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
@@ -409,12 +451,12 @@ where
409451
/// sequence of characters or errors.
410452
/// NOTE: Raw strings do not perform any explicit character escaping, here we
411453
/// only produce errors on bare CR.
412-
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
454+
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
413455
where
414456
F: FnMut(Range<usize>, Result<char, EscapeError>),
415457
{
416458
let mut chars = src.chars();
417-
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
459+
let mut rfc3349 = Rfc3349::Unused;
418460

419461
// The `start` and `end` computation here matches the one in
420462
// `unescape_non_raw_common` for consistency, even though this function
@@ -423,16 +465,17 @@ where
423465
let start = src.len() - chars.as_str().len() - c.len_utf8();
424466
let res = match c {
425467
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
426-
_ => ascii_check(c, allow_unicode_chars),
468+
_ => ascii_check(c, mode, &mut rfc3349),
427469
};
428470
let end = src.len() - chars.as_str().len();
429471
callback(start..end, res);
430472
}
473+
rfc3349
431474
}
432475

433476
#[inline]
434-
pub fn byte_from_char(c: char) -> u8 {
477+
pub(crate) fn byte_from_char(c: char) -> u8 {
435478
let res = c as u32;
436-
debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
479+
debug_assert!(res <= u8::MAX as u32, "guaranteed because of Byte");
437480
res as u8
438481
}

compiler/rustc_lexer/src/unescape/tests.rs

+22-10
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,9 @@ fn test_unescape_char_good() {
100100
fn test_unescape_str_warn() {
101101
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
102102
let mut unescaped = Vec::with_capacity(literal.len());
103-
unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res)));
103+
let rfc3349 =
104+
unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res)));
105+
assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::Str`
104106
assert_eq!(unescaped, expected);
105107
}
106108

@@ -124,14 +126,15 @@ fn test_unescape_str_warn() {
124126
fn test_unescape_str_good() {
125127
fn check(literal_text: &str, expected: &str) {
126128
let mut buf = Ok(String::with_capacity(literal_text.len()));
127-
unescape_unicode(literal_text, Mode::Str, &mut |range, c| {
129+
let rfc3349 = unescape_unicode(literal_text, Mode::Str, &mut |range, c| {
128130
if let Ok(b) = &mut buf {
129131
match c {
130132
Ok(c) => b.push(c),
131133
Err(e) => buf = Err((range, e)),
132134
}
133135
}
134136
});
137+
assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::Str`
135138
assert_eq!(buf.as_deref(), Ok(expected))
136139
}
137140

@@ -240,16 +243,20 @@ fn test_unescape_byte_good() {
240243
#[test]
241244
fn test_unescape_byte_str_good() {
242245
fn check(literal_text: &str, expected: &[u8]) {
243-
let mut buf = Ok(Vec::with_capacity(literal_text.len()));
244-
unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| {
245-
if let Ok(b) = &mut buf {
246+
let mut buf_res = Ok(Vec::with_capacity(literal_text.len()));
247+
let rfc3349 = unescape_mixed(literal_text, Mode::ByteStr, &mut |range, c| {
248+
if let Ok(buf) = &mut buf_res {
246249
match c {
247-
Ok(c) => b.push(byte_from_char(c)),
248-
Err(e) => buf = Err((range, e)),
250+
Ok(MixedUnit::Char(c)) => {
251+
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
252+
}
253+
Ok(MixedUnit::HighByte(b)) => buf.push(b),
254+
Err(e) => buf_res = Err((range, e)),
249255
}
250256
}
251257
});
252-
assert_eq!(buf.as_deref(), Ok(expected))
258+
assert_eq!(rfc3349, Rfc3349::Unused); // njn: should have examples where this isn't true
259+
assert_eq!(buf_res.as_deref(), Ok(expected))
253260
}
254261

255262
check("foo", b"foo");
@@ -264,7 +271,9 @@ fn test_unescape_byte_str_good() {
264271
fn test_unescape_raw_str() {
265272
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
266273
let mut unescaped = Vec::with_capacity(literal.len());
267-
unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res)));
274+
let rfc3349 =
275+
unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res)));
276+
assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::RawStr`
268277
assert_eq!(unescaped, expected);
269278
}
270279

@@ -276,7 +285,10 @@ fn test_unescape_raw_str() {
276285
fn test_unescape_raw_byte_str() {
277286
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
278287
let mut unescaped = Vec::with_capacity(literal.len());
279-
unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res)));
288+
let rfc3349 = unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| {
289+
unescaped.push((range, res))
290+
});
291+
assert_eq!(rfc3349, Rfc3349::Unused); // njn: todo
280292
assert_eq!(unescaped, expected);
281293
}
282294

0 commit comments

Comments
 (0)