@@ -9,6 +9,9 @@ use Mode::*;
9
9
#[ cfg( test) ]
10
10
mod tests;
11
11
12
+ // FIXME(njn): need to add tests in tests/ui/mixed-utf8-literals/; see
13
+ // tests/ui/try-block/ for an example to follow
14
+
12
15
/// Errors and warnings that can occur during string unescaping. They mostly
13
16
/// relate to malformed escape sequences, but there are a few that are about
14
17
/// other problems.
@@ -85,7 +88,7 @@ impl EscapeError {
85
88
///
86
89
/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
87
90
/// the callback will be called exactly once.
88
- pub fn unescape_unicode < F > ( src : & str , mode : Mode , callback : & mut F )
91
+ pub fn unescape_unicode < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
89
92
where
90
93
F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
91
94
{
@@ -94,16 +97,17 @@ where
94
97
let mut chars = src. chars ( ) ;
95
98
let res = unescape_char_or_byte ( & mut chars, mode) ;
96
99
callback ( 0 ..( src. len ( ) - chars. as_str ( ) . len ( ) ) , res) ;
100
+ Rfc3349 :: Unused // rfc3349 is not relevant for char or byte literals
97
101
}
98
- Str | ByteStr => unescape_non_raw_common ( src, mode, callback) ,
102
+ Str => unescape_non_raw_common ( src, mode, callback) ,
99
103
RawStr | RawByteStr => check_raw_common ( src, mode, callback) ,
100
104
RawCStr => check_raw_common ( src, mode, & mut |r, mut result| {
101
105
if let Ok ( '\0' ) = result {
102
106
result = Err ( EscapeError :: NulInCStr ) ;
103
107
}
104
108
callback ( r, result)
105
109
} ) ,
106
- CStr => unreachable ! ( ) ,
110
+ ByteStr | CStr => unreachable ! ( ) ,
107
111
}
108
112
}
109
113
@@ -142,18 +146,19 @@ impl From<u8> for MixedUnit {
142
146
/// a sequence of escaped characters or errors.
143
147
///
144
148
/// Values are returned by invoking `callback`.
145
- pub fn unescape_mixed < F > ( src : & str , mode : Mode , callback : & mut F )
149
+ pub fn unescape_mixed < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
146
150
where
147
151
F : FnMut ( Range < usize > , Result < MixedUnit , EscapeError > ) ,
148
152
{
149
153
match mode {
154
+ ByteStr => unescape_non_raw_common ( src, mode, callback) ,
150
155
CStr => unescape_non_raw_common ( src, mode, & mut |r, mut result| {
151
156
if let Ok ( MixedUnit :: Char ( '\0' ) ) = result {
152
157
result = Err ( EscapeError :: NulInCStr ) ;
153
158
}
154
159
callback ( r, result)
155
160
} ) ,
156
- Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable ! ( ) ,
161
+ Char | Byte | Str | RawStr | RawByteStr | RawCStr => unreachable ! ( ) ,
157
162
}
158
163
}
159
164
@@ -169,6 +174,15 @@ pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
169
174
unescape_char_or_byte ( & mut src. chars ( ) , Byte ) . map ( byte_from_char)
170
175
}
171
176
177
+ /// Used to indicate if rfc3349 (mixed-utf8-literals) was required for the
178
+ /// literal to be valid. Once rfc3349 is stabilized, this type can be removed.
179
+ #[ derive( Debug , PartialEq ) ]
180
+ #[ must_use]
181
+ pub enum Rfc3349 {
182
+ Used ,
183
+ Unused ,
184
+ }
185
+
172
186
/// What kind of literal do we parse.
173
187
#[ derive( Debug , Clone , Copy , PartialEq ) ]
174
188
pub enum Mode {
@@ -205,17 +219,25 @@ impl Mode {
205
219
206
220
/// Are unicode (non-ASCII) chars allowed?
207
221
#[ inline]
208
- fn allow_unicode_chars ( self ) -> bool {
222
+ fn allow_unicode_chars ( self , rfc3349 : & mut Rfc3349 ) -> bool {
209
223
match self {
210
- Byte | ByteStr | RawByteStr => false ,
224
+ Byte => false ,
225
+ ByteStr | RawByteStr => {
226
+ * rfc3349 = Rfc3349 :: Used ;
227
+ true
228
+ }
211
229
Char | Str | RawStr | CStr | RawCStr => true ,
212
230
}
213
231
}
214
232
215
233
/// Are unicode escapes (`\u`) allowed?
216
- fn allow_unicode_escapes ( self ) -> bool {
234
+ fn allow_unicode_escapes ( self , rfc3349 : & mut Rfc3349 ) -> bool {
217
235
match self {
218
- Byte | ByteStr => false ,
236
+ Byte => false ,
237
+ ByteStr => {
238
+ * rfc3349 = Rfc3349 :: Used ;
239
+ true
240
+ }
219
241
Char | Str | CStr => true ,
220
242
RawByteStr | RawStr | RawCStr => unreachable ! ( ) ,
221
243
}
@@ -233,6 +255,7 @@ impl Mode {
233
255
fn scan_escape < T : From < char > + From < u8 > > (
234
256
chars : & mut Chars < ' _ > ,
235
257
mode : Mode ,
258
+ rfc3349 : & mut Rfc3349 ,
236
259
) -> Result < T , EscapeError > {
237
260
// Previous character was '\\', unescape what follows.
238
261
let res: char = match chars. next ( ) . ok_or ( EscapeError :: LoneSlash ) ? {
@@ -262,13 +285,17 @@ fn scan_escape<T: From<char> + From<u8>>(
262
285
Ok ( T :: from ( value as u8 ) )
263
286
} ;
264
287
}
265
- 'u' => return scan_unicode ( chars, mode. allow_unicode_escapes ( ) ) . map ( T :: from) ,
288
+ 'u' => return scan_unicode ( chars, mode, rfc3349 ) . map ( T :: from) ,
266
289
_ => return Err ( EscapeError :: InvalidEscape ) ,
267
290
} ;
268
291
Ok ( T :: from ( res) )
269
292
}
270
293
271
- fn scan_unicode ( chars : & mut Chars < ' _ > , allow_unicode_escapes : bool ) -> Result < char , EscapeError > {
294
+ fn scan_unicode (
295
+ chars : & mut Chars < ' _ > ,
296
+ mode : Mode ,
297
+ rfc3349 : & mut Rfc3349 ,
298
+ ) -> Result < char , EscapeError > {
272
299
// We've parsed '\u', now we have to parse '{..}'.
273
300
274
301
if chars. next ( ) != Some ( '{' ) {
@@ -296,7 +323,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
296
323
297
324
// Incorrect syntax has higher priority for error reporting
298
325
// than a disallowed value for a literal.
299
- if !allow_unicode_escapes {
326
+ if !mode . allow_unicode_escapes ( rfc3349 ) {
300
327
return Err ( EscapeError :: UnicodeEscapeInByte ) ;
301
328
}
302
329
@@ -322,18 +349,28 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
322
349
}
323
350
324
351
#[ inline]
325
- fn ascii_check ( c : char , allow_unicode_chars : bool ) -> Result < char , EscapeError > {
326
- if allow_unicode_chars || c. is_ascii ( ) { Ok ( c) } else { Err ( EscapeError :: NonAsciiCharInByte ) }
352
+ fn ascii_check ( c : char , mode : Mode , rfc3349 : & mut Rfc3349 ) -> Result < char , EscapeError > {
353
+ // We must check `is_ascii` first, to avoid setting `rfc3349` unnecessarily.
354
+ if c. is_ascii ( ) || mode. allow_unicode_chars ( rfc3349) {
355
+ Ok ( c)
356
+ } else {
357
+ Err ( EscapeError :: NonAsciiCharInByte )
358
+ }
327
359
}
328
360
329
361
fn unescape_char_or_byte ( chars : & mut Chars < ' _ > , mode : Mode ) -> Result < char , EscapeError > {
330
362
let c = chars. next ( ) . ok_or ( EscapeError :: ZeroChars ) ?;
363
+ let mut rfc3349 = Rfc3349 :: Unused ;
331
364
let res = match c {
332
- '\\' => scan_escape ( chars, mode) ,
365
+ '\\' => scan_escape ( chars, mode, & mut rfc3349 ) ,
333
366
'\n' | '\t' | '\'' => Err ( EscapeError :: EscapeOnlyChar ) ,
334
367
'\r' => Err ( EscapeError :: BareCarriageReturn ) ,
335
- _ => ascii_check ( c, mode. allow_unicode_chars ( ) ) ,
368
+ _ => ascii_check ( c, mode, & mut rfc3349 ) ,
336
369
} ?;
370
+
371
+ // rfc3349 is not relevant for char or byte literals.
372
+ assert_eq ! ( rfc3349, Rfc3349 :: Unused ) ;
373
+
337
374
if chars. next ( ) . is_some ( ) {
338
375
return Err ( EscapeError :: MoreThanOneChar ) ;
339
376
}
@@ -342,12 +379,16 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
342
379
343
380
/// Takes the contents of a string literal (without quotes) and produces a
344
381
/// sequence of escaped characters or errors.
345
- fn unescape_non_raw_common < F , T : From < char > + From < u8 > > ( src : & str , mode : Mode , callback : & mut F )
382
+ fn unescape_non_raw_common < F , T : From < char > + From < u8 > > (
383
+ src : & str ,
384
+ mode : Mode ,
385
+ callback : & mut F ,
386
+ ) -> Rfc3349
346
387
where
347
388
F : FnMut ( Range < usize > , Result < T , EscapeError > ) ,
348
389
{
349
390
let mut chars = src. chars ( ) ;
350
- let allow_unicode_chars = mode . allow_unicode_chars ( ) ; // get this outside the loop
391
+ let mut rfc3349 = Rfc3349 :: Unused ;
351
392
352
393
// The `start` and `end` computation here is complicated because
353
394
// `skip_ascii_whitespace` makes us skip over chars without counting
@@ -367,16 +408,17 @@ where
367
408
} ) ;
368
409
continue ;
369
410
}
370
- _ => scan_escape :: < T > ( & mut chars, mode) ,
411
+ _ => scan_escape :: < T > ( & mut chars, mode, & mut rfc3349 ) ,
371
412
}
372
413
}
373
414
'"' => Err ( EscapeError :: EscapeOnlyChar ) ,
374
415
'\r' => Err ( EscapeError :: BareCarriageReturn ) ,
375
- _ => ascii_check ( c, allow_unicode_chars ) . map ( T :: from) ,
416
+ _ => ascii_check ( c, mode , & mut rfc3349 ) . map ( T :: from) ,
376
417
} ;
377
418
let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
378
419
callback ( start..end, res) ;
379
420
}
421
+ rfc3349
380
422
}
381
423
382
424
fn skip_ascii_whitespace < F > ( chars : & mut Chars < ' _ > , start : usize , callback : & mut F )
@@ -409,12 +451,12 @@ where
409
451
/// sequence of characters or errors.
410
452
/// NOTE: Raw strings do not perform any explicit character escaping; here we
411
453
/// only produce errors on bare CR.
412
- fn check_raw_common < F > ( src : & str , mode : Mode , callback : & mut F )
454
+ fn check_raw_common < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
413
455
where
414
456
F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
415
457
{
416
458
let mut chars = src. chars ( ) ;
417
- let allow_unicode_chars = mode . allow_unicode_chars ( ) ; // get this outside the loop
459
+ let mut rfc3349 = Rfc3349 :: Unused ;
418
460
419
461
// The `start` and `end` computation here matches the one in
420
462
// `unescape_non_raw_common` for consistency, even though this function
@@ -423,16 +465,17 @@ where
423
465
let start = src. len ( ) - chars. as_str ( ) . len ( ) - c. len_utf8 ( ) ;
424
466
let res = match c {
425
467
'\r' => Err ( EscapeError :: BareCarriageReturnInRawString ) ,
426
- _ => ascii_check ( c, allow_unicode_chars ) ,
468
+ _ => ascii_check ( c, mode , & mut rfc3349 ) ,
427
469
} ;
428
470
let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
429
471
callback ( start..end, res) ;
430
472
}
473
+ rfc3349
431
474
}
432
475
433
476
#[ inline]
434
- pub fn byte_from_char ( c : char ) -> u8 {
477
+ pub ( crate ) fn byte_from_char ( c : char ) -> u8 {
435
478
let res = c as u32 ;
436
- debug_assert ! ( res <= u8 :: MAX as u32 , "guaranteed because of ByteStr " ) ;
479
+ debug_assert ! ( res <= u8 :: MAX as u32 , "guaranteed because of Byte " ) ;
437
480
res as u8
438
481
}
0 commit comments