@@ -85,7 +85,7 @@ impl EscapeError {
85
85
///
86
86
/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
87
87
/// the callback will be called exactly once.
88
- pub fn unescape_unicode < F > ( src : & str , mode : Mode , callback : & mut F )
88
+ pub fn unescape_unicode < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
89
89
where
90
90
F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
91
91
{
@@ -94,16 +94,17 @@ where
94
94
let mut chars = src. chars ( ) ;
95
95
let res = unescape_char_or_byte ( & mut chars, mode) ;
96
96
callback ( 0 ..( src. len ( ) - chars. as_str ( ) . len ( ) ) , res) ;
97
+ Rfc3349 :: Unused // rfc3349 not relevant for `Mode::{Char,Byte}`
97
98
}
98
- Str | ByteStr => unescape_non_raw_common ( src, mode, callback) ,
99
+ Str => unescape_non_raw_common ( src, mode, callback) ,
99
100
RawStr | RawByteStr => check_raw_common ( src, mode, callback) ,
100
101
RawCStr => check_raw_common ( src, mode, & mut |r, mut result| {
101
102
if let Ok ( '\0' ) = result {
102
103
result = Err ( EscapeError :: NulInCStr ) ;
103
104
}
104
105
callback ( r, result)
105
106
} ) ,
106
- CStr => unreachable ! ( ) ,
107
+ ByteStr | CStr => unreachable ! ( ) ,
107
108
}
108
109
}
109
110
@@ -142,18 +143,19 @@ impl From<u8> for MixedUnit {
142
143
/// a sequence of escaped characters or errors.
143
144
///
144
145
/// Values are returned by invoking `callback`.
///
/// On the `+` side of this change, `ByteStr` and `CStr` are the only modes
/// handled here; every other mode reaches `unreachable!()`. For `CStr`, a
/// result of `MixedUnit::Char('\0')` is replaced by
/// `Err(EscapeError::NulInCStr)` before it is passed to `callback`.
///
/// The `+` signature returns the `Rfc3349` value produced by
/// `unescape_non_raw_common`.
145
- pub fn unescape_mixed < F > ( src : & str , mode : Mode , callback : & mut F )
146
+ pub fn unescape_mixed < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
146
147
where
147
148
F : FnMut ( Range < usize > , Result < MixedUnit , EscapeError > ) ,
148
149
{
149
150
match mode {
// Byte strings need no post-processing, so the caller's callback is used as-is.
151
+ ByteStr => unescape_non_raw_common ( src, mode, callback) ,
150
152
// C strings wrap the callback so that a NUL char is reported as an error.
CStr => unescape_non_raw_common ( src, mode, & mut |r, mut result| {
151
153
if let Ok ( MixedUnit :: Char ( '\0' ) ) = result {
152
154
result = Err ( EscapeError :: NulInCStr ) ;
153
155
}
154
156
callback ( r, result)
155
157
} ) ,
156
- Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable ! ( ) ,
158
+ Char | Byte | Str | RawStr | RawByteStr | RawCStr => unreachable ! ( ) ,
157
159
}
158
160
}
159
161
@@ -169,6 +171,15 @@ pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
169
171
unescape_char_or_byte ( & mut src. chars ( ) , Byte ) . map ( byte_from_char)
170
172
}
171
173
174
+ /// Used to indicate if rfc3349 (mixed-utf8-literals) was required for the
175
+ /// literal to be valid. Once rfc3349 is stabilized this type can be removed.
// The public unescape functions in this file return this value; `#[must_use]`
// makes the compiler warn when a caller drops it without inspecting it.
176
+ #[ derive( Debug , PartialEq ) ]
177
+ #[ must_use]
178
+ pub enum Rfc3349 {
// Recorded by `Mode::allow_unicode_chars` for `ByteStr`/`RawByteStr` and by
// `Mode::allow_unicode_escapes` for `ByteStr`.
179
+ Used ,
// Initial state: nothing in the literal needed rfc3349. Also returned for
// `Mode::{Char,Byte}`, where rfc3349 is not relevant.
180
+ Unused ,
181
+ }
182
+
172
183
/// What kind of literal do we parse.
173
184
#[ derive( Debug , Clone , Copy , PartialEq ) ]
174
185
pub enum Mode {
@@ -205,17 +216,25 @@ impl Mode {
205
216
206
217
/// Are unicode (non-ASCII) chars allowed?
///
/// On the `+` side of this change, answering `true` for `ByteStr` or
/// `RawByteStr` also writes `Rfc3349::Used` into `rfc3349`. Because of that
/// side effect, callers should only ask once they know the char in question
/// is non-ASCII (see `ascii_check`).
207
218
#[ inline]
208
- fn allow_unicode_chars ( self ) -> bool {
219
+ fn allow_unicode_chars ( self , rfc3349 : & mut Rfc3349 ) -> bool {
209
220
match self {
210
- Byte | ByteStr | RawByteStr => false ,
221
+ Byte => false ,
222
+ ByteStr | RawByteStr => {
// Record that this literal is only valid because of rfc3349.
223
+ * rfc3349 = Rfc3349 :: Used ;
224
+ true
225
+ }
211
226
// These modes accept non-ASCII chars and leave `rfc3349` untouched.
Char | Str | RawStr | CStr | RawCStr => true ,
212
227
}
213
228
}
214
229
215
230
/// Are unicode escapes (`\u`) allowed?
216
- fn allow_unicode_escapes ( self ) -> bool {
231
+ fn allow_unicode_escapes ( self , rfc3349 : & mut Rfc3349 ) -> bool {
217
232
match self {
218
- Byte | ByteStr => false ,
233
+ Byte => false ,
234
+ ByteStr => {
235
+ * rfc3349 = Rfc3349 :: Used ;
236
+ true
237
+ }
219
238
Char | Str | CStr => true ,
220
239
RawByteStr | RawStr | RawCStr => unreachable ! ( ) ,
221
240
}
@@ -233,6 +252,7 @@ impl Mode {
233
252
fn scan_escape < T : From < char > + From < u8 > > (
234
253
chars : & mut Chars < ' _ > ,
235
254
mode : Mode ,
255
+ rfc3349 : & mut Rfc3349 ,
236
256
) -> Result < T , EscapeError > {
237
257
// Previous character was '\\', unescape what follows.
238
258
let res: char = match chars. next ( ) . ok_or ( EscapeError :: LoneSlash ) ? {
@@ -262,13 +282,17 @@ fn scan_escape<T: From<char> + From<u8>>(
262
282
Ok ( T :: from ( value as u8 ) )
263
283
} ;
264
284
}
265
- 'u' => return scan_unicode ( chars, mode. allow_unicode_escapes ( ) ) . map ( T :: from) ,
285
+ 'u' => return scan_unicode ( chars, mode, rfc3349 ) . map ( T :: from) ,
266
286
_ => return Err ( EscapeError :: InvalidEscape ) ,
267
287
} ;
268
288
Ok ( T :: from ( res) )
269
289
}
270
290
271
- fn scan_unicode ( chars : & mut Chars < ' _ > , allow_unicode_escapes : bool ) -> Result < char , EscapeError > {
291
+ fn scan_unicode (
292
+ chars : & mut Chars < ' _ > ,
293
+ mode : Mode ,
294
+ rfc3349 : & mut Rfc3349 ,
295
+ ) -> Result < char , EscapeError > {
272
296
// We've parsed '\u', now we have to parse '{..}'.
273
297
274
298
if chars. next ( ) != Some ( '{' ) {
@@ -296,7 +320,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
296
320
297
321
// Incorrect syntax has higher priority for error reporting
298
322
// than unallowed value for a literal.
299
- if !allow_unicode_escapes {
323
+ if !mode . allow_unicode_escapes ( rfc3349 ) {
300
324
return Err ( EscapeError :: UnicodeEscapeInByte ) ;
301
325
}
302
326
@@ -322,18 +346,27 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
322
346
}
323
347
324
348
/// Accepts `c` when it is ASCII or when `mode` allows non-ASCII chars;
/// otherwise returns `Err(EscapeError::NonAsciiCharInByte)`.
///
/// On the `+` side, `rfc3349` may be set to `Rfc3349::Used` by
/// `Mode::allow_unicode_chars`, which is only consulted for non-ASCII `c`.
#[ inline]
325
- fn ascii_check ( c : char , allow_unicode_chars : bool ) -> Result < char , EscapeError > {
326
- if allow_unicode_chars || c. is_ascii ( ) { Ok ( c) } else { Err ( EscapeError :: NonAsciiCharInByte ) }
349
+ fn ascii_check ( c : char , mode : Mode , rfc3349 : & mut Rfc3349 ) -> Result < char , EscapeError > {
350
+ // We must check `is_ascii` first, to avoid setting `rfc3349` unnecessarily.
// (`||` short-circuits, so `allow_unicode_chars` is never called for ASCII chars.)
351
+ if c. is_ascii ( ) || mode. allow_unicode_chars ( rfc3349) {
352
+ Ok ( c)
353
+ } else {
354
+ Err ( EscapeError :: NonAsciiCharInByte )
355
+ }
327
356
}
328
357
329
358
fn unescape_char_or_byte ( chars : & mut Chars < ' _ > , mode : Mode ) -> Result < char , EscapeError > {
330
359
let c = chars. next ( ) . ok_or ( EscapeError :: ZeroChars ) ?;
360
+ let mut rfc3349 = Rfc3349 :: Unused ;
331
361
let res = match c {
332
- '\\' => scan_escape ( chars, mode) ,
362
+ '\\' => scan_escape ( chars, mode, & mut rfc3349 ) ,
333
363
'\n' | '\t' | '\'' => Err ( EscapeError :: EscapeOnlyChar ) ,
334
364
'\r' => Err ( EscapeError :: BareCarriageReturn ) ,
335
- _ => ascii_check ( c, mode. allow_unicode_chars ( ) ) ,
365
+ _ => ascii_check ( c, mode, & mut rfc3349 ) ,
336
366
} ?;
367
+
368
+ assert_eq ! ( rfc3349, Rfc3349 :: Unused ) ; // rfc3349 not relevant for `Mode::{Char,Byte}`
369
+
337
370
if chars. next ( ) . is_some ( ) {
338
371
return Err ( EscapeError :: MoreThanOneChar ) ;
339
372
}
@@ -342,12 +375,16 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
342
375
343
376
/// Takes the contents of a string literal (without quotes) and produces a
344
377
/// sequence of escaped characters or errors.
345
- fn unescape_non_raw_common < F , T : From < char > + From < u8 > > ( src : & str , mode : Mode , callback : & mut F )
378
+ fn unescape_non_raw_common < F , T : From < char > + From < u8 > > (
379
+ src : & str ,
380
+ mode : Mode ,
381
+ callback : & mut F ,
382
+ ) -> Rfc3349
346
383
where
347
384
F : FnMut ( Range < usize > , Result < T , EscapeError > ) ,
348
385
{
349
386
let mut chars = src. chars ( ) ;
350
- let allow_unicode_chars = mode . allow_unicode_chars ( ) ; // get this outside the loop
387
+ let mut rfc3349 = Rfc3349 :: Unused ;
351
388
352
389
// The `start` and `end` computation here is complicated because
353
390
// `skip_ascii_whitespace` makes us skip over chars without counting
@@ -367,16 +404,17 @@ where
367
404
} ) ;
368
405
continue ;
369
406
}
370
- _ => scan_escape :: < T > ( & mut chars, mode) ,
407
+ _ => scan_escape :: < T > ( & mut chars, mode, & mut rfc3349 ) ,
371
408
}
372
409
}
373
410
'"' => Err ( EscapeError :: EscapeOnlyChar ) ,
374
411
'\r' => Err ( EscapeError :: BareCarriageReturn ) ,
375
- _ => ascii_check ( c, allow_unicode_chars ) . map ( T :: from) ,
412
+ _ => ascii_check ( c, mode , & mut rfc3349 ) . map ( T :: from) ,
376
413
} ;
377
414
let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
378
415
callback ( start..end, res) ;
379
416
}
417
+ rfc3349
380
418
}
381
419
382
420
fn skip_ascii_whitespace < F > ( chars : & mut Chars < ' _ > , start : usize , callback : & mut F )
@@ -409,12 +447,12 @@ where
409
447
/// sequence of characters or errors.
410
448
/// NOTE: Raw strings do not perform any explicit character escaping, here we
411
449
/// only produce errors on bare CR.
412
- fn check_raw_common < F > ( src : & str , mode : Mode , callback : & mut F )
450
+ fn check_raw_common < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
413
451
where
414
452
F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
415
453
{
416
454
let mut chars = src. chars ( ) ;
417
- let allow_unicode_chars = mode . allow_unicode_chars ( ) ; // get this outside the loop
455
+ let mut rfc3349 = Rfc3349 :: Unused ;
418
456
419
457
// The `start` and `end` computation here matches the one in
420
458
// `unescape_non_raw_common` for consistency, even though this function
@@ -423,16 +461,17 @@ where
423
461
let start = src. len ( ) - chars. as_str ( ) . len ( ) - c. len_utf8 ( ) ;
424
462
let res = match c {
425
463
'\r' => Err ( EscapeError :: BareCarriageReturnInRawString ) ,
426
- _ => ascii_check ( c, allow_unicode_chars ) ,
464
+ _ => ascii_check ( c, mode , & mut rfc3349 ) ,
427
465
} ;
428
466
let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
429
467
callback ( start..end, res) ;
430
468
}
469
+ rfc3349
431
470
}
432
471
433
472
/// Converts `c` to a `u8` by casting its code point.
///
/// Debug builds assert that the code point is at most `u8::MAX`; without
/// that assertion, the final `as u8` cast truncates larger values.
// NOTE(review): the `+` side narrows visibility from `pub` to `pub(crate)`;
// confirm that nothing outside this crate still calls this function.
#[ inline]
434
- pub fn byte_from_char ( c : char ) -> u8 {
473
+ pub ( crate ) fn byte_from_char ( c : char ) -> u8 {
435
474
let res = c as u32 ;
436
- debug_assert ! ( res <= u8 :: MAX as u32 , "guaranteed because of ByteStr " ) ;
475
+ debug_assert ! ( res <= u8 :: MAX as u32 , "guaranteed because of Byte " ) ;
437
476
res as u8
438
477
}
0 commit comments