1 | //! impl char {} |
2 | |
3 | use super::*; |
4 | use crate::panic::const_panic; |
5 | use crate::slice; |
6 | use crate::str::from_utf8_unchecked_mut; |
7 | use crate::ub_checks::assert_unsafe_precondition; |
8 | use crate::unicode::printable::is_printable; |
9 | use crate::unicode::{self, conversions}; |
10 | |
11 | impl char { |
12 | /// The lowest valid code point a `char` can have, `'\0'`. |
13 | /// |
14 | /// Unlike integer types, `char` actually has a gap in the middle, |
15 | /// meaning that the range of possible `char`s is smaller than you |
16 | /// might expect. Ranges of `char` will automatically hop this gap |
17 | /// for you: |
18 | /// |
19 | /// ``` |
20 | /// let dist = u32::from(char::MAX) - u32::from(char::MIN); |
21 | /// let size = (char::MIN..=char::MAX).count() as u32; |
22 | /// assert!(size < dist); |
23 | /// ``` |
24 | /// |
25 | /// Despite this gap, the `MIN` and [`MAX`] values can be used as bounds for |
26 | /// all `char` values. |
27 | /// |
28 | /// [`MAX`]: char::MAX |
29 | /// |
30 | /// # Examples |
31 | /// |
32 | /// ``` |
33 | /// # fn something_which_returns_char() -> char { 'a' } |
34 | /// let c: char = something_which_returns_char(); |
35 | /// assert!(char::MIN <= c); |
36 | /// |
37 | /// let value_at_min = u32::from(char::MIN); |
/// assert_eq!(char::from_u32(value_at_min), Some('\0'));
39 | /// ``` |
40 | #[stable (feature = "char_min" , since = "1.83.0" )] |
// U+0000 (NUL) is the smallest code point, and therefore the lower bound of `char`.
pub const MIN: char = '\0';
42 | |
43 | /// The highest valid code point a `char` can have, `'\u{10FFFF}'`. |
44 | /// |
45 | /// Unlike integer types, `char` actually has a gap in the middle, |
46 | /// meaning that the range of possible `char`s is smaller than you |
47 | /// might expect. Ranges of `char` will automatically hop this gap |
48 | /// for you: |
49 | /// |
50 | /// ``` |
51 | /// let dist = u32::from(char::MAX) - u32::from(char::MIN); |
52 | /// let size = (char::MIN..=char::MAX).count() as u32; |
53 | /// assert!(size < dist); |
54 | /// ``` |
55 | /// |
56 | /// Despite this gap, the [`MIN`] and `MAX` values can be used as bounds for |
57 | /// all `char` values. |
58 | /// |
59 | /// [`MIN`]: char::MIN |
60 | /// |
61 | /// # Examples |
62 | /// |
63 | /// ``` |
64 | /// # fn something_which_returns_char() -> char { 'a' } |
65 | /// let c: char = something_which_returns_char(); |
66 | /// assert!(c <= char::MAX); |
67 | /// |
68 | /// let value_at_max = u32::from(char::MAX); |
/// assert_eq!(char::from_u32(value_at_max), Some('\u{10FFFF}'));
70 | /// assert_eq!(char::from_u32(value_at_max + 1), None); |
71 | /// ``` |
72 | #[stable (feature = "assoc_char_consts" , since = "1.52.0" )] |
// U+10FFFF is the highest code point a `char` can hold; anything above it is rejected
// by `char::from_u32`.
pub const MAX: char = '\u{10FFFF}';
74 | |
75 | /// The maximum number of bytes required to [encode](char::encode_utf8) a `char` to |
76 | /// UTF-8 encoding. |
77 | #[unstable (feature = "char_max_len" , issue = "121714" )] |
// Upper bound, in bytes, on the UTF-8 encoding of a single `char`; use it to size
// buffers passed to `encode_utf8`.
pub const MAX_LEN_UTF8: usize = 4;
79 | |
80 | /// The maximum number of two-byte units required to [encode](char::encode_utf16) a `char` |
81 | /// to UTF-16 encoding. |
82 | #[unstable (feature = "char_max_len" , issue = "121714" )] |
// Upper bound, in `u16` code units, on the UTF-16 encoding of a single `char`; use it
// to size buffers passed to `encode_utf16`.
pub const MAX_LEN_UTF16: usize = 2;
84 | |
85 | /// `U+FFFD REPLACEMENT CHARACTER` (�) is used in Unicode to represent a |
86 | /// decoding error. |
87 | /// |
88 | /// It can occur, for example, when giving ill-formed UTF-8 bytes to |
89 | /// [`String::from_utf8_lossy`](../std/string/struct.String.html#method.from_utf8_lossy). |
90 | #[stable (feature = "assoc_char_consts" , since = "1.52.0" )] |
// U+FFFD, the code point substituted for input that could not be decoded.
pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}';
92 | |
93 | /// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of |
94 | /// `char` and `str` methods are based on. |
95 | /// |
96 | /// New versions of Unicode are released regularly and subsequently all methods |
97 | /// in the standard library depending on Unicode are updated. Therefore the |
98 | /// behavior of some `char` and `str` methods and the value of this constant |
99 | /// changes over time. This is *not* considered to be a breaking change. |
100 | /// |
101 | /// The version numbering scheme is explained in |
102 | /// [Unicode 11.0 or later, Section 3.1 Versions of the Unicode Standard](https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf#page=4). |
103 | #[stable (feature = "assoc_char_consts" , since = "1.52.0" )] |
104 | pub const UNICODE_VERSION: (u8, u8, u8) = crate::unicode::UNICODE_VERSION; |
105 | |
106 | /// Creates an iterator over the native endian UTF-16 encoded code points in `iter`, |
107 | /// returning unpaired surrogates as `Err`s. |
108 | /// |
109 | /// # Examples |
110 | /// |
111 | /// Basic usage: |
112 | /// |
113 | /// ``` |
114 | /// // 𝄞mus<invalid>ic<invalid> |
115 | /// let v = [ |
116 | /// 0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0xDD1E, 0x0069, 0x0063, 0xD834, |
117 | /// ]; |
118 | /// |
119 | /// assert_eq!( |
120 | /// char::decode_utf16(v) |
121 | /// .map(|r| r.map_err(|e| e.unpaired_surrogate())) |
122 | /// .collect::<Vec<_>>(), |
123 | /// vec![ |
124 | /// Ok('𝄞' ), |
125 | /// Ok('m' ), Ok('u' ), Ok('s' ), |
126 | /// Err(0xDD1E), |
127 | /// Ok('i' ), Ok('c' ), |
128 | /// Err(0xD834) |
129 | /// ] |
130 | /// ); |
131 | /// ``` |
132 | /// |
133 | /// A lossy decoder can be obtained by replacing `Err` results with the replacement character: |
134 | /// |
135 | /// ``` |
136 | /// // 𝄞mus<invalid>ic<invalid> |
137 | /// let v = [ |
138 | /// 0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0xDD1E, 0x0069, 0x0063, 0xD834, |
139 | /// ]; |
140 | /// |
141 | /// assert_eq!( |
142 | /// char::decode_utf16(v) |
143 | /// .map(|r| r.unwrap_or(char::REPLACEMENT_CHARACTER)) |
144 | /// .collect::<String>(), |
145 | /// "𝄞mus�ic�" |
146 | /// ); |
147 | /// ``` |
148 | #[stable (feature = "assoc_char_funcs" , since = "1.52.0" )] |
149 | #[inline ] |
150 | pub fn decode_utf16<I: IntoIterator<Item = u16>>(iter: I) -> DecodeUtf16<I::IntoIter> { |
151 | super::decode::decode_utf16(iter) |
152 | } |
153 | |
154 | /// Converts a `u32` to a `char`. |
155 | /// |
156 | /// Note that all `char`s are valid [`u32`]s, and can be cast to one with |
157 | /// [`as`](../std/keyword.as.html): |
158 | /// |
159 | /// ``` |
160 | /// let c = '💯' ; |
161 | /// let i = c as u32; |
162 | /// |
163 | /// assert_eq!(128175, i); |
164 | /// ``` |
165 | /// |
166 | /// However, the reverse is not true: not all valid [`u32`]s are valid |
167 | /// `char`s. `from_u32()` will return `None` if the input is not a valid value |
168 | /// for a `char`. |
169 | /// |
170 | /// For an unsafe version of this function which ignores these checks, see |
171 | /// [`from_u32_unchecked`]. |
172 | /// |
173 | /// [`from_u32_unchecked`]: #method.from_u32_unchecked |
174 | /// |
175 | /// # Examples |
176 | /// |
177 | /// Basic usage: |
178 | /// |
179 | /// ``` |
180 | /// let c = char::from_u32(0x2764); |
181 | /// |
182 | /// assert_eq!(Some('❤' ), c); |
183 | /// ``` |
184 | /// |
185 | /// Returning `None` when the input is not a valid `char`: |
186 | /// |
187 | /// ``` |
188 | /// let c = char::from_u32(0x110000); |
189 | /// |
190 | /// assert_eq!(None, c); |
191 | /// ``` |
192 | #[stable (feature = "assoc_char_funcs" , since = "1.52.0" )] |
193 | #[rustc_const_stable (feature = "const_char_convert" , since = "1.67.0" )] |
194 | #[must_use ] |
195 | #[inline ] |
196 | pub const fn from_u32(i: u32) -> Option<char> { |
197 | super::convert::from_u32(i) |
198 | } |
199 | |
200 | /// Converts a `u32` to a `char`, ignoring validity. |
201 | /// |
202 | /// Note that all `char`s are valid [`u32`]s, and can be cast to one with |
203 | /// `as`: |
204 | /// |
205 | /// ``` |
206 | /// let c = '💯' ; |
207 | /// let i = c as u32; |
208 | /// |
209 | /// assert_eq!(128175, i); |
210 | /// ``` |
211 | /// |
212 | /// However, the reverse is not true: not all valid [`u32`]s are valid |
213 | /// `char`s. `from_u32_unchecked()` will ignore this, and blindly cast to |
214 | /// `char`, possibly creating an invalid one. |
215 | /// |
216 | /// # Safety |
217 | /// |
218 | /// This function is unsafe, as it may construct invalid `char` values. |
219 | /// |
220 | /// For a safe version of this function, see the [`from_u32`] function. |
221 | /// |
222 | /// [`from_u32`]: #method.from_u32 |
223 | /// |
224 | /// # Examples |
225 | /// |
226 | /// Basic usage: |
227 | /// |
228 | /// ``` |
229 | /// let c = unsafe { char::from_u32_unchecked(0x2764) }; |
230 | /// |
231 | /// assert_eq!('❤' , c); |
232 | /// ``` |
233 | #[stable (feature = "assoc_char_funcs" , since = "1.52.0" )] |
234 | #[rustc_const_stable (feature = "const_char_from_u32_unchecked" , since = "1.81.0" )] |
235 | #[must_use ] |
236 | #[inline ] |
237 | pub const unsafe fn from_u32_unchecked(i: u32) -> char { |
238 | // SAFETY: the safety contract must be upheld by the caller. |
239 | unsafe { super::convert::from_u32_unchecked(i) } |
240 | } |
241 | |
242 | /// Converts a digit in the given radix to a `char`. |
243 | /// |
244 | /// A 'radix' here is sometimes also called a 'base'. A radix of two |
245 | /// indicates a binary number, a radix of ten, decimal, and a radix of |
246 | /// sixteen, hexadecimal, to give some common values. Arbitrary |
247 | /// radices are supported. |
248 | /// |
249 | /// `from_digit()` will return `None` if the input is not a digit in |
250 | /// the given radix. |
251 | /// |
252 | /// # Panics |
253 | /// |
254 | /// Panics if given a radix larger than 36. |
255 | /// |
256 | /// # Examples |
257 | /// |
258 | /// Basic usage: |
259 | /// |
260 | /// ``` |
261 | /// let c = char::from_digit(4, 10); |
262 | /// |
263 | /// assert_eq!(Some('4' ), c); |
264 | /// |
265 | /// // Decimal 11 is a single digit in base 16 |
266 | /// let c = char::from_digit(11, 16); |
267 | /// |
268 | /// assert_eq!(Some('b' ), c); |
269 | /// ``` |
270 | /// |
271 | /// Returning `None` when the input is not a digit: |
272 | /// |
273 | /// ``` |
274 | /// let c = char::from_digit(20, 10); |
275 | /// |
276 | /// assert_eq!(None, c); |
277 | /// ``` |
278 | /// |
279 | /// Passing a large radix, causing a panic: |
280 | /// |
281 | /// ```should_panic |
282 | /// // this panics |
283 | /// let _c = char::from_digit(1, 37); |
284 | /// ``` |
285 | #[stable (feature = "assoc_char_funcs" , since = "1.52.0" )] |
286 | #[rustc_const_stable (feature = "const_char_convert" , since = "1.67.0" )] |
287 | #[must_use ] |
288 | #[inline ] |
289 | pub const fn from_digit(num: u32, radix: u32) -> Option<char> { |
290 | super::convert::from_digit(num, radix) |
291 | } |
292 | |
293 | /// Checks if a `char` is a digit in the given radix. |
294 | /// |
295 | /// A 'radix' here is sometimes also called a 'base'. A radix of two |
296 | /// indicates a binary number, a radix of ten, decimal, and a radix of |
297 | /// sixteen, hexadecimal, to give some common values. Arbitrary |
298 | /// radices are supported. |
299 | /// |
300 | /// Compared to [`is_numeric()`], this function only recognizes the characters |
301 | /// `0-9`, `a-z` and `A-Z`. |
302 | /// |
303 | /// 'Digit' is defined to be only the following characters: |
304 | /// |
305 | /// * `0-9` |
306 | /// * `a-z` |
307 | /// * `A-Z` |
308 | /// |
309 | /// For a more comprehensive understanding of 'digit', see [`is_numeric()`]. |
310 | /// |
311 | /// [`is_numeric()`]: #method.is_numeric |
312 | /// |
313 | /// # Panics |
314 | /// |
315 | /// Panics if given a radix smaller than 2 or larger than 36. |
316 | /// |
317 | /// # Examples |
318 | /// |
319 | /// Basic usage: |
320 | /// |
321 | /// ``` |
322 | /// assert!('1' .is_digit(10)); |
323 | /// assert!('f' .is_digit(16)); |
324 | /// assert!(!'f' .is_digit(10)); |
325 | /// ``` |
326 | /// |
327 | /// Passing a large radix, causing a panic: |
328 | /// |
329 | /// ```should_panic |
330 | /// // this panics |
331 | /// '1' .is_digit(37); |
332 | /// ``` |
333 | /// |
334 | /// Passing a small radix, causing a panic: |
335 | /// |
336 | /// ```should_panic |
337 | /// // this panics |
338 | /// '1' .is_digit(1); |
339 | /// ``` |
340 | #[stable (feature = "rust1" , since = "1.0.0" )] |
341 | #[rustc_const_stable (feature = "const_char_classify" , since = "1.87.0" )] |
342 | #[inline ] |
343 | pub const fn is_digit(self, radix: u32) -> bool { |
344 | self.to_digit(radix).is_some() |
345 | } |
346 | |
347 | /// Converts a `char` to a digit in the given radix. |
348 | /// |
349 | /// A 'radix' here is sometimes also called a 'base'. A radix of two |
350 | /// indicates a binary number, a radix of ten, decimal, and a radix of |
351 | /// sixteen, hexadecimal, to give some common values. Arbitrary |
352 | /// radices are supported. |
353 | /// |
354 | /// 'Digit' is defined to be only the following characters: |
355 | /// |
356 | /// * `0-9` |
357 | /// * `a-z` |
358 | /// * `A-Z` |
359 | /// |
360 | /// # Errors |
361 | /// |
362 | /// Returns `None` if the `char` does not refer to a digit in the given radix. |
363 | /// |
364 | /// # Panics |
365 | /// |
366 | /// Panics if given a radix smaller than 2 or larger than 36. |
367 | /// |
368 | /// # Examples |
369 | /// |
370 | /// Basic usage: |
371 | /// |
372 | /// ``` |
373 | /// assert_eq!('1' .to_digit(10), Some(1)); |
374 | /// assert_eq!('f' .to_digit(16), Some(15)); |
375 | /// ``` |
376 | /// |
377 | /// Passing a non-digit results in failure: |
378 | /// |
379 | /// ``` |
380 | /// assert_eq!('f' .to_digit(10), None); |
381 | /// assert_eq!('z' .to_digit(16), None); |
382 | /// ``` |
383 | /// |
384 | /// Passing a large radix, causing a panic: |
385 | /// |
386 | /// ```should_panic |
387 | /// // this panics |
388 | /// let _ = '1' .to_digit(37); |
389 | /// ``` |
///
/// Passing a small radix, causing a panic:
391 | /// |
392 | /// ```should_panic |
393 | /// // this panics |
394 | /// let _ = '1' .to_digit(1); |
395 | /// ``` |
396 | #[stable (feature = "rust1" , since = "1.0.0" )] |
397 | #[rustc_const_stable (feature = "const_char_convert" , since = "1.67.0" )] |
398 | #[must_use = "this returns the result of the operation, \ |
399 | without modifying the original" ] |
400 | #[inline ] |
401 | pub const fn to_digit(self, radix: u32) -> Option<u32> { |
402 | assert!( |
403 | radix >= 2 && radix <= 36, |
404 | "to_digit: invalid radix -- radix must be in the range 2 to 36 inclusive" |
405 | ); |
406 | // check radix to remove letter handling code when radix is a known constant |
407 | let value = if self > '9' && radix > 10 { |
408 | // mask to convert ASCII letters to uppercase |
409 | const TO_UPPERCASE_MASK: u32 = !0b0010_0000; |
410 | // Converts an ASCII letter to its corresponding integer value: |
411 | // A-Z => 10-35, a-z => 10-35. Other characters produce values >= 36. |
412 | // |
413 | // Add Overflow Safety: |
414 | // By applying the mask after the subtraction, the first addendum is |
415 | // constrained such that it never exceeds u32::MAX - 0x20. |
416 | ((self as u32).wrapping_sub('A' as u32) & TO_UPPERCASE_MASK) + 10 |
417 | } else { |
418 | // convert digit to value, non-digits wrap to values > 36 |
419 | (self as u32).wrapping_sub('0' as u32) |
420 | }; |
421 | // FIXME(const-hack): once then_some is const fn, use it here |
422 | if value < radix { Some(value) } else { None } |
423 | } |
424 | |
425 | /// Returns an iterator that yields the hexadecimal Unicode escape of a |
426 | /// character as `char`s. |
427 | /// |
428 | /// This will escape characters with the Rust syntax of the form |
429 | /// `\u{NNNNNN}` where `NNNNNN` is a hexadecimal representation. |
430 | /// |
431 | /// # Examples |
432 | /// |
433 | /// As an iterator: |
434 | /// |
435 | /// ``` |
436 | /// for c in '❤' .escape_unicode() { |
437 | /// print!("{c}" ); |
438 | /// } |
439 | /// println!(); |
440 | /// ``` |
441 | /// |
442 | /// Using `println!` directly: |
443 | /// |
444 | /// ``` |
445 | /// println!("{}" , '❤' .escape_unicode()); |
446 | /// ``` |
447 | /// |
448 | /// Both are equivalent to: |
449 | /// |
450 | /// ``` |
/// println!("\\u{{2764}}");
452 | /// ``` |
453 | /// |
454 | /// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string): |
455 | /// |
456 | /// ``` |
/// assert_eq!('❤'.escape_unicode().to_string(), "\\u{2764}");
458 | /// ``` |
459 | #[must_use = "this returns the escaped char as an iterator, \ |
460 | without modifying the original" ] |
461 | #[stable (feature = "rust1" , since = "1.0.0" )] |
462 | #[inline ] |
463 | pub fn escape_unicode(self) -> EscapeUnicode { |
464 | EscapeUnicode::new(self) |
465 | } |
466 | |
467 | /// An extended version of `escape_debug` that optionally permits escaping |
468 | /// Extended Grapheme codepoints, single quotes, and double quotes. This |
469 | /// allows us to format characters like nonspacing marks better when they're |
470 | /// at the start of a string, and allows escaping single quotes in |
471 | /// characters, and double quotes in strings. |
472 | #[inline ] |
473 | pub(crate) fn escape_debug_ext(self, args: EscapeDebugExtArgs) -> EscapeDebug { |
474 | match self { |
475 | ' \0' => EscapeDebug::backslash(ascii::Char::Digit0), |
476 | ' \t' => EscapeDebug::backslash(ascii::Char::SmallT), |
477 | ' \r' => EscapeDebug::backslash(ascii::Char::SmallR), |
478 | ' \n' => EscapeDebug::backslash(ascii::Char::SmallN), |
479 | ' \\' => EscapeDebug::backslash(ascii::Char::ReverseSolidus), |
480 | ' \"' if args.escape_double_quote => EscapeDebug::backslash(ascii::Char::QuotationMark), |
481 | ' \'' if args.escape_single_quote => EscapeDebug::backslash(ascii::Char::Apostrophe), |
482 | _ if args.escape_grapheme_extended && self.is_grapheme_extended() => { |
483 | EscapeDebug::unicode(self) |
484 | } |
485 | _ if is_printable(self) => EscapeDebug::printable(self), |
486 | _ => EscapeDebug::unicode(self), |
487 | } |
488 | } |
489 | |
490 | /// Returns an iterator that yields the literal escape code of a character |
491 | /// as `char`s. |
492 | /// |
493 | /// This will escape the characters similar to the [`Debug`](core::fmt::Debug) implementations |
494 | /// of `str` or `char`. |
495 | /// |
496 | /// # Examples |
497 | /// |
498 | /// As an iterator: |
499 | /// |
500 | /// ``` |
/// for c in '\n'.escape_debug() {
502 | /// print!("{c}" ); |
503 | /// } |
504 | /// println!(); |
505 | /// ``` |
506 | /// |
507 | /// Using `println!` directly: |
508 | /// |
509 | /// ``` |
/// println!("{}", '\n'.escape_debug());
511 | /// ``` |
512 | /// |
513 | /// Both are equivalent to: |
514 | /// |
515 | /// ``` |
/// println!("\\n");
517 | /// ``` |
518 | /// |
519 | /// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string): |
520 | /// |
521 | /// ``` |
/// assert_eq!('\n'.escape_debug().to_string(), "\\n");
523 | /// ``` |
524 | #[must_use = "this returns the escaped char as an iterator, \ |
525 | without modifying the original" ] |
526 | #[stable (feature = "char_escape_debug" , since = "1.20.0" )] |
527 | #[inline ] |
528 | pub fn escape_debug(self) -> EscapeDebug { |
529 | self.escape_debug_ext(EscapeDebugExtArgs::ESCAPE_ALL) |
530 | } |
531 | |
532 | /// Returns an iterator that yields the literal escape code of a character |
533 | /// as `char`s. |
534 | /// |
535 | /// The default is chosen with a bias toward producing literals that are |
536 | /// legal in a variety of languages, including C++11 and similar C-family |
537 | /// languages. The exact rules are: |
538 | /// |
539 | /// * Tab is escaped as `\t`. |
540 | /// * Carriage return is escaped as `\r`. |
541 | /// * Line feed is escaped as `\n`. |
542 | /// * Single quote is escaped as `\'`. |
543 | /// * Double quote is escaped as `\"`. |
544 | /// * Backslash is escaped as `\\`. |
545 | /// * Any character in the 'printable ASCII' range `0x20` .. `0x7e` |
546 | /// inclusive is not escaped. |
547 | /// * All other characters are given hexadecimal Unicode escapes; see |
548 | /// [`escape_unicode`]. |
549 | /// |
550 | /// [`escape_unicode`]: #method.escape_unicode |
551 | /// |
552 | /// # Examples |
553 | /// |
554 | /// As an iterator: |
555 | /// |
556 | /// ``` |
557 | /// for c in '"' .escape_default() { |
558 | /// print!("{c}" ); |
559 | /// } |
560 | /// println!(); |
561 | /// ``` |
562 | /// |
563 | /// Using `println!` directly: |
564 | /// |
565 | /// ``` |
566 | /// println!("{}" , '"' .escape_default()); |
567 | /// ``` |
568 | /// |
569 | /// Both are equivalent to: |
570 | /// |
571 | /// ``` |
/// println!("\\\"");
573 | /// ``` |
574 | /// |
575 | /// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string): |
576 | /// |
577 | /// ``` |
/// assert_eq!('"'.escape_default().to_string(), "\\\"");
579 | /// ``` |
580 | #[must_use = "this returns the escaped char as an iterator, \ |
581 | without modifying the original" ] |
582 | #[stable (feature = "rust1" , since = "1.0.0" )] |
583 | #[inline ] |
584 | pub fn escape_default(self) -> EscapeDefault { |
585 | match self { |
586 | ' \t' => EscapeDefault::backslash(ascii::Char::SmallT), |
587 | ' \r' => EscapeDefault::backslash(ascii::Char::SmallR), |
588 | ' \n' => EscapeDefault::backslash(ascii::Char::SmallN), |
589 | ' \\' | ' \'' | ' \"' => EscapeDefault::backslash(self.as_ascii().unwrap()), |
590 | ' \x20' ..=' \x7e' => EscapeDefault::printable(self.as_ascii().unwrap()), |
591 | _ => EscapeDefault::unicode(self), |
592 | } |
593 | } |
594 | |
595 | /// Returns the number of bytes this `char` would need if encoded in UTF-8. |
596 | /// |
597 | /// That number of bytes is always between 1 and 4, inclusive. |
598 | /// |
599 | /// # Examples |
600 | /// |
601 | /// Basic usage: |
602 | /// |
603 | /// ``` |
604 | /// let len = 'A' .len_utf8(); |
605 | /// assert_eq!(len, 1); |
606 | /// |
607 | /// let len = 'ß' .len_utf8(); |
608 | /// assert_eq!(len, 2); |
609 | /// |
610 | /// let len = 'ℝ' .len_utf8(); |
611 | /// assert_eq!(len, 3); |
612 | /// |
613 | /// let len = '💣' .len_utf8(); |
614 | /// assert_eq!(len, 4); |
615 | /// ``` |
616 | /// |
617 | /// The `&str` type guarantees that its contents are UTF-8, and so we can compare the length it |
618 | /// would take if each code point was represented as a `char` vs in the `&str` itself: |
619 | /// |
620 | /// ``` |
621 | /// // as chars |
622 | /// let eastern = '東' ; |
623 | /// let capital = '京' ; |
624 | /// |
625 | /// // both can be represented as three bytes |
626 | /// assert_eq!(3, eastern.len_utf8()); |
627 | /// assert_eq!(3, capital.len_utf8()); |
628 | /// |
629 | /// // as a &str, these two are encoded in UTF-8 |
630 | /// let tokyo = "東京" ; |
631 | /// |
632 | /// let len = eastern.len_utf8() + capital.len_utf8(); |
633 | /// |
634 | /// // we can see that they take six bytes total... |
635 | /// assert_eq!(6, tokyo.len()); |
636 | /// |
637 | /// // ... just like the &str |
638 | /// assert_eq!(len, tokyo.len()); |
639 | /// ``` |
640 | #[stable (feature = "rust1" , since = "1.0.0" )] |
641 | #[rustc_const_stable (feature = "const_char_len_utf" , since = "1.52.0" )] |
642 | #[inline ] |
643 | #[must_use ] |
644 | pub const fn len_utf8(self) -> usize { |
645 | len_utf8(self as u32) |
646 | } |
647 | |
648 | /// Returns the number of 16-bit code units this `char` would need if |
649 | /// encoded in UTF-16. |
650 | /// |
651 | /// That number of code units is always either 1 or 2, for unicode scalar values in |
652 | /// the [basic multilingual plane] or [supplementary planes] respectively. |
653 | /// |
654 | /// See the documentation for [`len_utf8()`] for more explanation of this |
655 | /// concept. This function is a mirror, but for UTF-16 instead of UTF-8. |
656 | /// |
657 | /// [basic multilingual plane]: http://www.unicode.org/glossary/#basic_multilingual_plane |
658 | /// [supplementary planes]: http://www.unicode.org/glossary/#supplementary_planes |
659 | /// [`len_utf8()`]: #method.len_utf8 |
660 | /// |
661 | /// # Examples |
662 | /// |
663 | /// Basic usage: |
664 | /// |
665 | /// ``` |
666 | /// let n = 'ß' .len_utf16(); |
667 | /// assert_eq!(n, 1); |
668 | /// |
669 | /// let len = '💣' .len_utf16(); |
670 | /// assert_eq!(len, 2); |
671 | /// ``` |
672 | #[stable (feature = "rust1" , since = "1.0.0" )] |
673 | #[rustc_const_stable (feature = "const_char_len_utf" , since = "1.52.0" )] |
674 | #[inline ] |
675 | #[must_use ] |
676 | pub const fn len_utf16(self) -> usize { |
677 | len_utf16(self as u32) |
678 | } |
679 | |
680 | /// Encodes this character as UTF-8 into the provided byte buffer, |
681 | /// and then returns the subslice of the buffer that contains the encoded character. |
682 | /// |
683 | /// # Panics |
684 | /// |
685 | /// Panics if the buffer is not large enough. |
686 | /// A buffer of length four is large enough to encode any `char`. |
687 | /// |
688 | /// # Examples |
689 | /// |
690 | /// In both of these examples, 'ß' takes two bytes to encode. |
691 | /// |
692 | /// ``` |
693 | /// let mut b = [0; 2]; |
694 | /// |
695 | /// let result = 'ß' .encode_utf8(&mut b); |
696 | /// |
697 | /// assert_eq!(result, "ß" ); |
698 | /// |
699 | /// assert_eq!(result.len(), 2); |
700 | /// ``` |
701 | /// |
702 | /// A buffer that's too small: |
703 | /// |
704 | /// ```should_panic |
705 | /// let mut b = [0; 1]; |
706 | /// |
707 | /// // this panics |
708 | /// 'ß' .encode_utf8(&mut b); |
709 | /// ``` |
710 | #[stable (feature = "unicode_encode_char" , since = "1.15.0" )] |
711 | #[rustc_const_stable (feature = "const_char_encode_utf8" , since = "1.83.0" )] |
712 | #[inline ] |
713 | pub const fn encode_utf8(self, dst: &mut [u8]) -> &mut str { |
714 | // SAFETY: `char` is not a surrogate, so this is valid UTF-8. |
715 | unsafe { from_utf8_unchecked_mut(encode_utf8_raw(self as u32, dst)) } |
716 | } |
717 | |
718 | /// Encodes this character as native endian UTF-16 into the provided `u16` buffer, |
719 | /// and then returns the subslice of the buffer that contains the encoded character. |
720 | /// |
721 | /// # Panics |
722 | /// |
723 | /// Panics if the buffer is not large enough. |
724 | /// A buffer of length 2 is large enough to encode any `char`. |
725 | /// |
726 | /// # Examples |
727 | /// |
728 | /// In both of these examples, '𝕊' takes two `u16`s to encode. |
729 | /// |
730 | /// ``` |
731 | /// let mut b = [0; 2]; |
732 | /// |
733 | /// let result = '𝕊' .encode_utf16(&mut b); |
734 | /// |
735 | /// assert_eq!(result.len(), 2); |
736 | /// ``` |
737 | /// |
738 | /// A buffer that's too small: |
739 | /// |
740 | /// ```should_panic |
741 | /// let mut b = [0; 1]; |
742 | /// |
743 | /// // this panics |
744 | /// '𝕊' .encode_utf16(&mut b); |
745 | /// ``` |
746 | #[stable (feature = "unicode_encode_char" , since = "1.15.0" )] |
747 | #[rustc_const_stable (feature = "const_char_encode_utf16" , since = "1.84.0" )] |
748 | #[inline ] |
749 | pub const fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { |
750 | encode_utf16_raw(self as u32, dst) |
751 | } |
752 | |
753 | /// Returns `true` if this `char` has the `Alphabetic` property. |
754 | /// |
755 | /// `Alphabetic` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and |
756 | /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. |
757 | /// |
758 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
759 | /// [ucd]: https://www.unicode.org/reports/tr44/ |
760 | /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt |
761 | /// |
762 | /// # Examples |
763 | /// |
764 | /// Basic usage: |
765 | /// |
766 | /// ``` |
767 | /// assert!('a' .is_alphabetic()); |
768 | /// assert!('京' .is_alphabetic()); |
769 | /// |
770 | /// let c = '💝' ; |
771 | /// // love is many things, but it is not alphabetic |
772 | /// assert!(!c.is_alphabetic()); |
773 | /// ``` |
774 | #[must_use ] |
775 | #[stable (feature = "rust1" , since = "1.0.0" )] |
776 | #[inline ] |
777 | pub fn is_alphabetic(self) -> bool { |
778 | match self { |
779 | 'a' ..='z' | 'A' ..='Z' => true, |
780 | c => c > ' \x7f' && unicode::Alphabetic(c), |
781 | } |
782 | } |
783 | |
784 | /// Returns `true` if this `char` has the `Lowercase` property. |
785 | /// |
786 | /// `Lowercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and |
787 | /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. |
788 | /// |
789 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
790 | /// [ucd]: https://www.unicode.org/reports/tr44/ |
791 | /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt |
792 | /// |
793 | /// # Examples |
794 | /// |
795 | /// Basic usage: |
796 | /// |
797 | /// ``` |
798 | /// assert!('a' .is_lowercase()); |
799 | /// assert!('δ' .is_lowercase()); |
800 | /// assert!(!'A' .is_lowercase()); |
801 | /// assert!(!'Δ' .is_lowercase()); |
802 | /// |
803 | /// // The various Chinese scripts and punctuation do not have case, and so: |
804 | /// assert!(!'中' .is_lowercase()); |
805 | /// assert!(!' ' .is_lowercase()); |
806 | /// ``` |
807 | /// |
808 | /// In a const context: |
809 | /// |
810 | /// ``` |
811 | /// const CAPITAL_DELTA_IS_LOWERCASE: bool = 'Δ' .is_lowercase(); |
812 | /// assert!(!CAPITAL_DELTA_IS_LOWERCASE); |
813 | /// ``` |
814 | #[must_use ] |
815 | #[stable (feature = "rust1" , since = "1.0.0" )] |
816 | #[rustc_const_stable (feature = "const_unicode_case_lookup" , since = "1.84.0" )] |
817 | #[inline ] |
818 | pub const fn is_lowercase(self) -> bool { |
819 | match self { |
820 | 'a' ..='z' => true, |
821 | c => c > ' \x7f' && unicode::Lowercase(c), |
822 | } |
823 | } |
824 | |
825 | /// Returns `true` if this `char` has the `Uppercase` property. |
826 | /// |
827 | /// `Uppercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and |
828 | /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. |
829 | /// |
830 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
831 | /// [ucd]: https://www.unicode.org/reports/tr44/ |
832 | /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt |
833 | /// |
834 | /// # Examples |
835 | /// |
836 | /// Basic usage: |
837 | /// |
838 | /// ``` |
839 | /// assert!(!'a' .is_uppercase()); |
840 | /// assert!(!'δ' .is_uppercase()); |
841 | /// assert!('A' .is_uppercase()); |
842 | /// assert!('Δ' .is_uppercase()); |
843 | /// |
844 | /// // The various Chinese scripts and punctuation do not have case, and so: |
845 | /// assert!(!'中' .is_uppercase()); |
846 | /// assert!(!' ' .is_uppercase()); |
847 | /// ``` |
848 | /// |
849 | /// In a const context: |
850 | /// |
851 | /// ``` |
852 | /// const CAPITAL_DELTA_IS_UPPERCASE: bool = 'Δ' .is_uppercase(); |
853 | /// assert!(CAPITAL_DELTA_IS_UPPERCASE); |
854 | /// ``` |
855 | #[must_use ] |
856 | #[stable (feature = "rust1" , since = "1.0.0" )] |
857 | #[rustc_const_stable (feature = "const_unicode_case_lookup" , since = "1.84.0" )] |
858 | #[inline ] |
859 | pub const fn is_uppercase(self) -> bool { |
860 | match self { |
861 | 'A' ..='Z' => true, |
862 | c => c > ' \x7f' && unicode::Uppercase(c), |
863 | } |
864 | } |
865 | |
866 | /// Returns `true` if this `char` has the `White_Space` property. |
867 | /// |
868 | /// `White_Space` is specified in the [Unicode Character Database][ucd] [`PropList.txt`]. |
869 | /// |
870 | /// [ucd]: https://www.unicode.org/reports/tr44/ |
871 | /// [`PropList.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt |
872 | /// |
873 | /// # Examples |
874 | /// |
875 | /// Basic usage: |
876 | /// |
877 | /// ``` |
878 | /// assert!(' ' .is_whitespace()); |
879 | /// |
880 | /// // line break |
/// assert!('\n'.is_whitespace());
882 | /// |
883 | /// // a non-breaking space |
/// assert!('\u{A0}'.is_whitespace());
885 | /// |
886 | /// assert!(!'越' .is_whitespace()); |
887 | /// ``` |
888 | #[must_use ] |
889 | #[stable (feature = "rust1" , since = "1.0.0" )] |
890 | #[rustc_const_stable (feature = "const_char_classify" , since = "1.87.0" )] |
891 | #[inline ] |
892 | pub const fn is_whitespace(self) -> bool { |
893 | match self { |
894 | ' ' | ' \x09' ..=' \x0d' => true, |
895 | c => c > ' \x7f' && unicode::White_Space(c), |
896 | } |
897 | } |
898 | |
899 | /// Returns `true` if this `char` satisfies either [`is_alphabetic()`] or [`is_numeric()`]. |
900 | /// |
901 | /// [`is_alphabetic()`]: #method.is_alphabetic |
902 | /// [`is_numeric()`]: #method.is_numeric |
903 | /// |
904 | /// # Examples |
905 | /// |
906 | /// Basic usage: |
907 | /// |
908 | /// ``` |
909 | /// assert!('٣' .is_alphanumeric()); |
910 | /// assert!('7' .is_alphanumeric()); |
911 | /// assert!('৬' .is_alphanumeric()); |
912 | /// assert!('¾' .is_alphanumeric()); |
913 | /// assert!('①' .is_alphanumeric()); |
914 | /// assert!('K' .is_alphanumeric()); |
915 | /// assert!('و' .is_alphanumeric()); |
916 | /// assert!('藏' .is_alphanumeric()); |
917 | /// ``` |
918 | #[must_use ] |
919 | #[stable (feature = "rust1" , since = "1.0.0" )] |
920 | #[inline ] |
921 | pub fn is_alphanumeric(self) -> bool { |
922 | self.is_alphabetic() || self.is_numeric() |
923 | } |
924 | |
925 | /// Returns `true` if this `char` has the general category for control codes. |
926 | /// |
927 | /// Control codes (code points with the general category of `Cc`) are described in Chapter 4 |
928 | /// (Character Properties) of the [Unicode Standard] and specified in the [Unicode Character |
929 | /// Database][ucd] [`UnicodeData.txt`]. |
930 | /// |
931 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
932 | /// [ucd]: https://www.unicode.org/reports/tr44/ |
933 | /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt |
934 | /// |
935 | /// # Examples |
936 | /// |
937 | /// Basic usage: |
938 | /// |
939 | /// ``` |
940 | /// // U+009C, STRING TERMINATOR |
941 | /// assert!('' .is_control()); |
942 | /// assert!(!'q' .is_control()); |
943 | /// ``` |
944 | #[must_use ] |
945 | #[stable (feature = "rust1" , since = "1.0.0" )] |
946 | #[inline ] |
947 | pub fn is_control(self) -> bool { |
948 | unicode::Cc(self) |
949 | } |
950 | |
951 | /// Returns `true` if this `char` has the `Grapheme_Extend` property. |
952 | /// |
953 | /// `Grapheme_Extend` is described in [Unicode Standard Annex #29 (Unicode Text |
954 | /// Segmentation)][uax29] and specified in the [Unicode Character Database][ucd] |
955 | /// [`DerivedCoreProperties.txt`]. |
956 | /// |
957 | /// [uax29]: https://www.unicode.org/reports/tr29/ |
958 | /// [ucd]: https://www.unicode.org/reports/tr44/ |
959 | /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt |
960 | #[must_use ] |
961 | #[inline ] |
962 | pub(crate) fn is_grapheme_extended(self) -> bool { |
963 | unicode::Grapheme_Extend(self) |
964 | } |
965 | |
966 | /// Returns `true` if this `char` has one of the general categories for numbers. |
967 | /// |
968 | /// The general categories for numbers (`Nd` for decimal digits, `Nl` for letter-like numeric |
969 | /// characters, and `No` for other numeric characters) are specified in the [Unicode Character |
970 | /// Database][ucd] [`UnicodeData.txt`]. |
971 | /// |
972 | /// This method doesn't cover everything that could be considered a number, e.g. ideographic numbers like '三'. |
973 | /// If you want everything including characters with overlapping purposes then you might want to use |
974 | /// a unicode or language-processing library that exposes the appropriate character properties instead |
975 | /// of looking at the unicode categories. |
976 | /// |
977 | /// If you want to parse ASCII decimal digits (0-9) or ASCII base-N, use |
978 | /// `is_ascii_digit` or `is_digit` instead. |
979 | /// |
980 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
981 | /// [ucd]: https://www.unicode.org/reports/tr44/ |
982 | /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt |
983 | /// |
984 | /// # Examples |
985 | /// |
986 | /// Basic usage: |
987 | /// |
988 | /// ``` |
989 | /// assert!('٣' .is_numeric()); |
990 | /// assert!('7' .is_numeric()); |
991 | /// assert!('৬' .is_numeric()); |
992 | /// assert!('¾' .is_numeric()); |
993 | /// assert!('①' .is_numeric()); |
994 | /// assert!(!'K' .is_numeric()); |
995 | /// assert!(!'و' .is_numeric()); |
996 | /// assert!(!'藏' .is_numeric()); |
997 | /// assert!(!'三' .is_numeric()); |
998 | /// ``` |
999 | #[must_use ] |
1000 | #[stable (feature = "rust1" , since = "1.0.0" )] |
1001 | #[inline ] |
1002 | pub fn is_numeric(self) -> bool { |
1003 | match self { |
1004 | '0' ..='9' => true, |
1005 | c => c > ' \x7f' && unicode::N(c), |
1006 | } |
1007 | } |
1008 | |
1009 | /// Returns an iterator that yields the lowercase mapping of this `char` as one or more |
1010 | /// `char`s. |
1011 | /// |
1012 | /// If this `char` does not have a lowercase mapping, the iterator yields the same `char`. |
1013 | /// |
1014 | /// If this `char` has a one-to-one lowercase mapping given by the [Unicode Character |
1015 | /// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`. |
1016 | /// |
1017 | /// [ucd]: https://www.unicode.org/reports/tr44/ |
1018 | /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt |
1019 | /// |
1020 | /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields |
1021 | /// the `char`(s) given by [`SpecialCasing.txt`]. |
1022 | /// |
1023 | /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt |
1024 | /// |
1025 | /// This operation performs an unconditional mapping without tailoring. That is, the conversion |
1026 | /// is independent of context and language. |
1027 | /// |
1028 | /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in |
1029 | /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion. |
1030 | /// |
1031 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
1032 | /// |
1033 | /// # Examples |
1034 | /// |
1035 | /// As an iterator: |
1036 | /// |
1037 | /// ``` |
1038 | /// for c in 'İ' .to_lowercase() { |
1039 | /// print!("{c}" ); |
1040 | /// } |
1041 | /// println!(); |
1042 | /// ``` |
1043 | /// |
1044 | /// Using `println!` directly: |
1045 | /// |
1046 | /// ``` |
1047 | /// println!("{}" , 'İ' .to_lowercase()); |
1048 | /// ``` |
1049 | /// |
1050 | /// Both are equivalent to: |
1051 | /// |
1052 | /// ``` |
1053 | /// println!("i \u{307}" ); |
1054 | /// ``` |
1055 | /// |
1056 | /// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string): |
1057 | /// |
1058 | /// ``` |
1059 | /// assert_eq!('C' .to_lowercase().to_string(), "c" ); |
1060 | /// |
1061 | /// // Sometimes the result is more than one character: |
1062 | /// assert_eq!('İ' .to_lowercase().to_string(), "i \u{307}" ); |
1063 | /// |
1064 | /// // Characters that do not have both uppercase and lowercase |
1065 | /// // convert into themselves. |
1066 | /// assert_eq!('山' .to_lowercase().to_string(), "山" ); |
1067 | /// ``` |
1068 | #[must_use = "this returns the lowercase character as a new iterator, \ |
1069 | without modifying the original" ] |
1070 | #[stable (feature = "rust1" , since = "1.0.0" )] |
1071 | #[inline ] |
1072 | pub fn to_lowercase(self) -> ToLowercase { |
1073 | ToLowercase(CaseMappingIter::new(conversions::to_lower(self))) |
1074 | } |
1075 | |
1076 | /// Returns an iterator that yields the uppercase mapping of this `char` as one or more |
1077 | /// `char`s. |
1078 | /// |
1079 | /// If this `char` does not have an uppercase mapping, the iterator yields the same `char`. |
1080 | /// |
1081 | /// If this `char` has a one-to-one uppercase mapping given by the [Unicode Character |
1082 | /// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`. |
1083 | /// |
1084 | /// [ucd]: https://www.unicode.org/reports/tr44/ |
1085 | /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt |
1086 | /// |
1087 | /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields |
1088 | /// the `char`(s) given by [`SpecialCasing.txt`]. |
1089 | /// |
1090 | /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt |
1091 | /// |
1092 | /// This operation performs an unconditional mapping without tailoring. That is, the conversion |
1093 | /// is independent of context and language. |
1094 | /// |
1095 | /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in |
1096 | /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion. |
1097 | /// |
1098 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
1099 | /// |
1100 | /// # Examples |
1101 | /// |
1102 | /// As an iterator: |
1103 | /// |
1104 | /// ``` |
1105 | /// for c in 'ß' .to_uppercase() { |
1106 | /// print!("{c}" ); |
1107 | /// } |
1108 | /// println!(); |
1109 | /// ``` |
1110 | /// |
1111 | /// Using `println!` directly: |
1112 | /// |
1113 | /// ``` |
1114 | /// println!("{}" , 'ß' .to_uppercase()); |
1115 | /// ``` |
1116 | /// |
1117 | /// Both are equivalent to: |
1118 | /// |
1119 | /// ``` |
1120 | /// println!("SS" ); |
1121 | /// ``` |
1122 | /// |
1123 | /// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string): |
1124 | /// |
1125 | /// ``` |
1126 | /// assert_eq!('c' .to_uppercase().to_string(), "C" ); |
1127 | /// |
1128 | /// // Sometimes the result is more than one character: |
1129 | /// assert_eq!('ß' .to_uppercase().to_string(), "SS" ); |
1130 | /// |
1131 | /// // Characters that do not have both uppercase and lowercase |
1132 | /// // convert into themselves. |
1133 | /// assert_eq!('山' .to_uppercase().to_string(), "山" ); |
1134 | /// ``` |
1135 | /// |
1136 | /// # Note on locale |
1137 | /// |
1138 | /// In Turkish, the equivalent of 'i' in Latin has five forms instead of two: |
1139 | /// |
1140 | /// * 'Dotless': I / ı, sometimes written ï |
1141 | /// * 'Dotted': İ / i |
1142 | /// |
1143 | /// Note that the lowercase dotted 'i' is the same as the Latin. Therefore: |
1144 | /// |
1145 | /// ``` |
1146 | /// let upper_i = 'i' .to_uppercase().to_string(); |
1147 | /// ``` |
1148 | /// |
1149 | /// The value of `upper_i` here relies on the language of the text: if we're |
1150 | /// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should |
1151 | /// be `"İ"`. `to_uppercase()` does not take this into account, and so: |
1152 | /// |
1153 | /// ``` |
1154 | /// let upper_i = 'i' .to_uppercase().to_string(); |
1155 | /// |
1156 | /// assert_eq!(upper_i, "I" ); |
1157 | /// ``` |
1158 | /// |
1159 | /// holds across languages. |
1160 | #[must_use = "this returns the uppercase character as a new iterator, \ |
1161 | without modifying the original" ] |
1162 | #[stable (feature = "rust1" , since = "1.0.0" )] |
1163 | #[inline ] |
1164 | pub fn to_uppercase(self) -> ToUppercase { |
1165 | ToUppercase(CaseMappingIter::new(conversions::to_upper(self))) |
1166 | } |
1167 | |
1168 | /// Checks if the value is within the ASCII range. |
1169 | /// |
1170 | /// # Examples |
1171 | /// |
1172 | /// ``` |
1173 | /// let ascii = 'a' ; |
1174 | /// let non_ascii = '❤' ; |
1175 | /// |
1176 | /// assert!(ascii.is_ascii()); |
1177 | /// assert!(!non_ascii.is_ascii()); |
1178 | /// ``` |
1179 | #[must_use ] |
1180 | #[stable (feature = "ascii_methods_on_intrinsics" , since = "1.23.0" )] |
1181 | #[rustc_const_stable (feature = "const_char_is_ascii" , since = "1.32.0" )] |
1182 | #[rustc_diagnostic_item = "char_is_ascii" ] |
1183 | #[inline ] |
1184 | pub const fn is_ascii(&self) -> bool { |
1185 | *self as u32 <= 0x7F |
1186 | } |
1187 | |
1188 | /// Returns `Some` if the value is within the ASCII range, |
1189 | /// or `None` if it's not. |
1190 | /// |
1191 | /// This is preferred to [`Self::is_ascii`] when you're passing the value |
1192 | /// along to something else that can take [`ascii::Char`] rather than |
1193 | /// needing to check again for itself whether the value is in ASCII. |
1194 | #[must_use ] |
1195 | #[unstable (feature = "ascii_char" , issue = "110998" )] |
1196 | #[inline ] |
1197 | pub const fn as_ascii(&self) -> Option<ascii::Char> { |
1198 | if self.is_ascii() { |
1199 | // SAFETY: Just checked that this is ASCII. |
1200 | Some(unsafe { ascii::Char::from_u8_unchecked(*self as u8) }) |
1201 | } else { |
1202 | None |
1203 | } |
1204 | } |
1205 | |
1206 | /// Converts this char into an [ASCII character](`ascii::Char`), without |
1207 | /// checking whether it is valid. |
1208 | /// |
1209 | /// # Safety |
1210 | /// |
1211 | /// This char must be within the ASCII range, or else this is UB. |
1212 | #[must_use ] |
1213 | #[unstable (feature = "ascii_char" , issue = "110998" )] |
1214 | #[inline ] |
1215 | pub const unsafe fn as_ascii_unchecked(&self) -> ascii::Char { |
1216 | assert_unsafe_precondition!( |
1217 | check_library_ub, |
1218 | "as_ascii_unchecked requires that the char is valid ASCII" , |
1219 | (it: &char = self) => it.is_ascii() |
1220 | ); |
1221 | |
1222 | // SAFETY: the caller promised that this char is ASCII. |
1223 | unsafe { ascii::Char::from_u8_unchecked(*self as u8) } |
1224 | } |
1225 | |
1226 | /// Makes a copy of the value in its ASCII upper case equivalent. |
1227 | /// |
1228 | /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', |
1229 | /// but non-ASCII letters are unchanged. |
1230 | /// |
1231 | /// To uppercase the value in-place, use [`make_ascii_uppercase()`]. |
1232 | /// |
1233 | /// To uppercase ASCII characters in addition to non-ASCII characters, use |
1234 | /// [`to_uppercase()`]. |
1235 | /// |
1236 | /// # Examples |
1237 | /// |
1238 | /// ``` |
1239 | /// let ascii = 'a' ; |
1240 | /// let non_ascii = '❤' ; |
1241 | /// |
1242 | /// assert_eq!('A' , ascii.to_ascii_uppercase()); |
1243 | /// assert_eq!('❤' , non_ascii.to_ascii_uppercase()); |
1244 | /// ``` |
1245 | /// |
1246 | /// [`make_ascii_uppercase()`]: #method.make_ascii_uppercase |
1247 | /// [`to_uppercase()`]: #method.to_uppercase |
1248 | #[must_use = "to uppercase the value in-place, use `make_ascii_uppercase()`" ] |
1249 | #[stable (feature = "ascii_methods_on_intrinsics" , since = "1.23.0" )] |
1250 | #[rustc_const_stable (feature = "const_ascii_methods_on_intrinsics" , since = "1.52.0" )] |
1251 | #[inline ] |
1252 | pub const fn to_ascii_uppercase(&self) -> char { |
1253 | if self.is_ascii_lowercase() { |
1254 | (*self as u8).ascii_change_case_unchecked() as char |
1255 | } else { |
1256 | *self |
1257 | } |
1258 | } |
1259 | |
1260 | /// Makes a copy of the value in its ASCII lower case equivalent. |
1261 | /// |
1262 | /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', |
1263 | /// but non-ASCII letters are unchanged. |
1264 | /// |
1265 | /// To lowercase the value in-place, use [`make_ascii_lowercase()`]. |
1266 | /// |
1267 | /// To lowercase ASCII characters in addition to non-ASCII characters, use |
1268 | /// [`to_lowercase()`]. |
1269 | /// |
1270 | /// # Examples |
1271 | /// |
1272 | /// ``` |
1273 | /// let ascii = 'A' ; |
1274 | /// let non_ascii = '❤' ; |
1275 | /// |
1276 | /// assert_eq!('a' , ascii.to_ascii_lowercase()); |
1277 | /// assert_eq!('❤' , non_ascii.to_ascii_lowercase()); |
1278 | /// ``` |
1279 | /// |
1280 | /// [`make_ascii_lowercase()`]: #method.make_ascii_lowercase |
1281 | /// [`to_lowercase()`]: #method.to_lowercase |
1282 | #[must_use = "to lowercase the value in-place, use `make_ascii_lowercase()`" ] |
1283 | #[stable (feature = "ascii_methods_on_intrinsics" , since = "1.23.0" )] |
1284 | #[rustc_const_stable (feature = "const_ascii_methods_on_intrinsics" , since = "1.52.0" )] |
1285 | #[inline ] |
1286 | pub const fn to_ascii_lowercase(&self) -> char { |
1287 | if self.is_ascii_uppercase() { |
1288 | (*self as u8).ascii_change_case_unchecked() as char |
1289 | } else { |
1290 | *self |
1291 | } |
1292 | } |
1293 | |
1294 | /// Checks that two values are an ASCII case-insensitive match. |
1295 | /// |
1296 | /// Equivalent to <code>[to_ascii_lowercase]\(a) == [to_ascii_lowercase]\(b)</code>. |
1297 | /// |
1298 | /// # Examples |
1299 | /// |
1300 | /// ``` |
1301 | /// let upper_a = 'A' ; |
1302 | /// let lower_a = 'a' ; |
1303 | /// let lower_z = 'z' ; |
1304 | /// |
1305 | /// assert!(upper_a.eq_ignore_ascii_case(&lower_a)); |
1306 | /// assert!(upper_a.eq_ignore_ascii_case(&upper_a)); |
1307 | /// assert!(!upper_a.eq_ignore_ascii_case(&lower_z)); |
1308 | /// ``` |
1309 | /// |
1310 | /// [to_ascii_lowercase]: #method.to_ascii_lowercase |
1311 | #[stable (feature = "ascii_methods_on_intrinsics" , since = "1.23.0" )] |
1312 | #[rustc_const_stable (feature = "const_ascii_methods_on_intrinsics" , since = "1.52.0" )] |
1313 | #[inline ] |
1314 | pub const fn eq_ignore_ascii_case(&self, other: &char) -> bool { |
1315 | self.to_ascii_lowercase() == other.to_ascii_lowercase() |
1316 | } |
1317 | |
1318 | /// Converts this type to its ASCII upper case equivalent in-place. |
1319 | /// |
1320 | /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', |
1321 | /// but non-ASCII letters are unchanged. |
1322 | /// |
1323 | /// To return a new uppercased value without modifying the existing one, use |
1324 | /// [`to_ascii_uppercase()`]. |
1325 | /// |
1326 | /// # Examples |
1327 | /// |
1328 | /// ``` |
1329 | /// let mut ascii = 'a' ; |
1330 | /// |
1331 | /// ascii.make_ascii_uppercase(); |
1332 | /// |
1333 | /// assert_eq!('A' , ascii); |
1334 | /// ``` |
1335 | /// |
1336 | /// [`to_ascii_uppercase()`]: #method.to_ascii_uppercase |
1337 | #[stable (feature = "ascii_methods_on_intrinsics" , since = "1.23.0" )] |
1338 | #[rustc_const_stable (feature = "const_make_ascii" , since = "1.84.0" )] |
1339 | #[inline ] |
1340 | pub const fn make_ascii_uppercase(&mut self) { |
1341 | *self = self.to_ascii_uppercase(); |
1342 | } |
1343 | |
1344 | /// Converts this type to its ASCII lower case equivalent in-place. |
1345 | /// |
1346 | /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', |
1347 | /// but non-ASCII letters are unchanged. |
1348 | /// |
1349 | /// To return a new lowercased value without modifying the existing one, use |
1350 | /// [`to_ascii_lowercase()`]. |
1351 | /// |
1352 | /// # Examples |
1353 | /// |
1354 | /// ``` |
1355 | /// let mut ascii = 'A' ; |
1356 | /// |
1357 | /// ascii.make_ascii_lowercase(); |
1358 | /// |
1359 | /// assert_eq!('a' , ascii); |
1360 | /// ``` |
1361 | /// |
1362 | /// [`to_ascii_lowercase()`]: #method.to_ascii_lowercase |
1363 | #[stable (feature = "ascii_methods_on_intrinsics" , since = "1.23.0" )] |
1364 | #[rustc_const_stable (feature = "const_make_ascii" , since = "1.84.0" )] |
1365 | #[inline ] |
1366 | pub const fn make_ascii_lowercase(&mut self) { |
1367 | *self = self.to_ascii_lowercase(); |
1368 | } |
1369 | |
1370 | /// Checks if the value is an ASCII alphabetic character: |
1371 | /// |
1372 | /// - U+0041 'A' ..= U+005A 'Z', or |
1373 | /// - U+0061 'a' ..= U+007A 'z'. |
1374 | /// |
1375 | /// # Examples |
1376 | /// |
1377 | /// ``` |
1378 | /// let uppercase_a = 'A' ; |
1379 | /// let uppercase_g = 'G' ; |
1380 | /// let a = 'a' ; |
1381 | /// let g = 'g' ; |
1382 | /// let zero = '0' ; |
1383 | /// let percent = '%' ; |
1384 | /// let space = ' ' ; |
1385 | /// let lf = ' \n' ; |
1386 | /// let esc = ' \x1b' ; |
1387 | /// |
1388 | /// assert!(uppercase_a.is_ascii_alphabetic()); |
1389 | /// assert!(uppercase_g.is_ascii_alphabetic()); |
1390 | /// assert!(a.is_ascii_alphabetic()); |
1391 | /// assert!(g.is_ascii_alphabetic()); |
1392 | /// assert!(!zero.is_ascii_alphabetic()); |
1393 | /// assert!(!percent.is_ascii_alphabetic()); |
1394 | /// assert!(!space.is_ascii_alphabetic()); |
1395 | /// assert!(!lf.is_ascii_alphabetic()); |
1396 | /// assert!(!esc.is_ascii_alphabetic()); |
1397 | /// ``` |
1398 | #[must_use ] |
1399 | #[stable (feature = "ascii_ctype_on_intrinsics" , since = "1.24.0" )] |
1400 | #[rustc_const_stable (feature = "const_ascii_ctype_on_intrinsics" , since = "1.47.0" )] |
1401 | #[inline ] |
1402 | pub const fn is_ascii_alphabetic(&self) -> bool { |
1403 | matches!(*self, 'A' ..='Z' | 'a' ..='z' ) |
1404 | } |
1405 | |
1406 | /// Checks if the value is an ASCII uppercase character: |
1407 | /// U+0041 'A' ..= U+005A 'Z'. |
1408 | /// |
1409 | /// # Examples |
1410 | /// |
1411 | /// ``` |
1412 | /// let uppercase_a = 'A' ; |
1413 | /// let uppercase_g = 'G' ; |
1414 | /// let a = 'a' ; |
1415 | /// let g = 'g' ; |
1416 | /// let zero = '0' ; |
1417 | /// let percent = '%' ; |
1418 | /// let space = ' ' ; |
1419 | /// let lf = ' \n' ; |
1420 | /// let esc = ' \x1b' ; |
1421 | /// |
1422 | /// assert!(uppercase_a.is_ascii_uppercase()); |
1423 | /// assert!(uppercase_g.is_ascii_uppercase()); |
1424 | /// assert!(!a.is_ascii_uppercase()); |
1425 | /// assert!(!g.is_ascii_uppercase()); |
1426 | /// assert!(!zero.is_ascii_uppercase()); |
1427 | /// assert!(!percent.is_ascii_uppercase()); |
1428 | /// assert!(!space.is_ascii_uppercase()); |
1429 | /// assert!(!lf.is_ascii_uppercase()); |
1430 | /// assert!(!esc.is_ascii_uppercase()); |
1431 | /// ``` |
1432 | #[must_use ] |
1433 | #[stable (feature = "ascii_ctype_on_intrinsics" , since = "1.24.0" )] |
1434 | #[rustc_const_stable (feature = "const_ascii_ctype_on_intrinsics" , since = "1.47.0" )] |
1435 | #[inline ] |
1436 | pub const fn is_ascii_uppercase(&self) -> bool { |
1437 | matches!(*self, 'A' ..='Z' ) |
1438 | } |
1439 | |
1440 | /// Checks if the value is an ASCII lowercase character: |
1441 | /// U+0061 'a' ..= U+007A 'z'. |
1442 | /// |
1443 | /// # Examples |
1444 | /// |
1445 | /// ``` |
1446 | /// let uppercase_a = 'A' ; |
1447 | /// let uppercase_g = 'G' ; |
1448 | /// let a = 'a' ; |
1449 | /// let g = 'g' ; |
1450 | /// let zero = '0' ; |
1451 | /// let percent = '%' ; |
1452 | /// let space = ' ' ; |
1453 | /// let lf = ' \n' ; |
1454 | /// let esc = ' \x1b' ; |
1455 | /// |
1456 | /// assert!(!uppercase_a.is_ascii_lowercase()); |
1457 | /// assert!(!uppercase_g.is_ascii_lowercase()); |
1458 | /// assert!(a.is_ascii_lowercase()); |
1459 | /// assert!(g.is_ascii_lowercase()); |
1460 | /// assert!(!zero.is_ascii_lowercase()); |
1461 | /// assert!(!percent.is_ascii_lowercase()); |
1462 | /// assert!(!space.is_ascii_lowercase()); |
1463 | /// assert!(!lf.is_ascii_lowercase()); |
1464 | /// assert!(!esc.is_ascii_lowercase()); |
1465 | /// ``` |
1466 | #[must_use ] |
1467 | #[stable (feature = "ascii_ctype_on_intrinsics" , since = "1.24.0" )] |
1468 | #[rustc_const_stable (feature = "const_ascii_ctype_on_intrinsics" , since = "1.47.0" )] |
1469 | #[inline ] |
1470 | pub const fn is_ascii_lowercase(&self) -> bool { |
1471 | matches!(*self, 'a' ..='z' ) |
1472 | } |
1473 | |
1474 | /// Checks if the value is an ASCII alphanumeric character: |
1475 | /// |
1476 | /// - U+0041 'A' ..= U+005A 'Z', or |
1477 | /// - U+0061 'a' ..= U+007A 'z', or |
1478 | /// - U+0030 '0' ..= U+0039 '9'. |
1479 | /// |
1480 | /// # Examples |
1481 | /// |
1482 | /// ``` |
1483 | /// let uppercase_a = 'A' ; |
1484 | /// let uppercase_g = 'G' ; |
1485 | /// let a = 'a' ; |
1486 | /// let g = 'g' ; |
1487 | /// let zero = '0' ; |
1488 | /// let percent = '%' ; |
1489 | /// let space = ' ' ; |
1490 | /// let lf = ' \n' ; |
1491 | /// let esc = ' \x1b' ; |
1492 | /// |
1493 | /// assert!(uppercase_a.is_ascii_alphanumeric()); |
1494 | /// assert!(uppercase_g.is_ascii_alphanumeric()); |
1495 | /// assert!(a.is_ascii_alphanumeric()); |
1496 | /// assert!(g.is_ascii_alphanumeric()); |
1497 | /// assert!(zero.is_ascii_alphanumeric()); |
1498 | /// assert!(!percent.is_ascii_alphanumeric()); |
1499 | /// assert!(!space.is_ascii_alphanumeric()); |
1500 | /// assert!(!lf.is_ascii_alphanumeric()); |
1501 | /// assert!(!esc.is_ascii_alphanumeric()); |
1502 | /// ``` |
1503 | #[must_use ] |
1504 | #[stable (feature = "ascii_ctype_on_intrinsics" , since = "1.24.0" )] |
1505 | #[rustc_const_stable (feature = "const_ascii_ctype_on_intrinsics" , since = "1.47.0" )] |
1506 | #[inline ] |
1507 | pub const fn is_ascii_alphanumeric(&self) -> bool { |
1508 | matches!(*self, '0' ..='9' ) | matches!(*self, 'A' ..='Z' ) | matches!(*self, 'a' ..='z' ) |
1509 | } |
1510 | |
1511 | /// Checks if the value is an ASCII decimal digit: |
1512 | /// U+0030 '0' ..= U+0039 '9'. |
1513 | /// |
1514 | /// # Examples |
1515 | /// |
1516 | /// ``` |
1517 | /// let uppercase_a = 'A' ; |
1518 | /// let uppercase_g = 'G' ; |
1519 | /// let a = 'a' ; |
1520 | /// let g = 'g' ; |
1521 | /// let zero = '0' ; |
1522 | /// let percent = '%' ; |
1523 | /// let space = ' ' ; |
1524 | /// let lf = ' \n' ; |
1525 | /// let esc = ' \x1b' ; |
1526 | /// |
1527 | /// assert!(!uppercase_a.is_ascii_digit()); |
1528 | /// assert!(!uppercase_g.is_ascii_digit()); |
1529 | /// assert!(!a.is_ascii_digit()); |
1530 | /// assert!(!g.is_ascii_digit()); |
1531 | /// assert!(zero.is_ascii_digit()); |
1532 | /// assert!(!percent.is_ascii_digit()); |
1533 | /// assert!(!space.is_ascii_digit()); |
1534 | /// assert!(!lf.is_ascii_digit()); |
1535 | /// assert!(!esc.is_ascii_digit()); |
1536 | /// ``` |
1537 | #[must_use ] |
1538 | #[stable (feature = "ascii_ctype_on_intrinsics" , since = "1.24.0" )] |
1539 | #[rustc_const_stable (feature = "const_ascii_ctype_on_intrinsics" , since = "1.47.0" )] |
1540 | #[inline ] |
1541 | pub const fn is_ascii_digit(&self) -> bool { |
1542 | matches!(*self, '0' ..='9' ) |
1543 | } |
1544 | |
1545 | /// Checks if the value is an ASCII octal digit: |
1546 | /// U+0030 '0' ..= U+0037 '7'. |
1547 | /// |
1548 | /// # Examples |
1549 | /// |
1550 | /// ``` |
1551 | /// #![feature(is_ascii_octdigit)] |
1552 | /// |
1553 | /// let uppercase_a = 'A' ; |
1554 | /// let a = 'a' ; |
1555 | /// let zero = '0' ; |
1556 | /// let seven = '7' ; |
1557 | /// let nine = '9' ; |
1558 | /// let percent = '%' ; |
1559 | /// let lf = ' \n' ; |
1560 | /// |
1561 | /// assert!(!uppercase_a.is_ascii_octdigit()); |
1562 | /// assert!(!a.is_ascii_octdigit()); |
1563 | /// assert!(zero.is_ascii_octdigit()); |
1564 | /// assert!(seven.is_ascii_octdigit()); |
1565 | /// assert!(!nine.is_ascii_octdigit()); |
1566 | /// assert!(!percent.is_ascii_octdigit()); |
1567 | /// assert!(!lf.is_ascii_octdigit()); |
1568 | /// ``` |
1569 | #[must_use ] |
1570 | #[unstable (feature = "is_ascii_octdigit" , issue = "101288" )] |
1571 | #[inline ] |
1572 | pub const fn is_ascii_octdigit(&self) -> bool { |
1573 | matches!(*self, '0' ..='7' ) |
1574 | } |
1575 | |
1576 | /// Checks if the value is an ASCII hexadecimal digit: |
1577 | /// |
1578 | /// - U+0030 '0' ..= U+0039 '9', or |
1579 | /// - U+0041 'A' ..= U+0046 'F', or |
1580 | /// - U+0061 'a' ..= U+0066 'f'. |
1581 | /// |
1582 | /// # Examples |
1583 | /// |
1584 | /// ``` |
1585 | /// let uppercase_a = 'A' ; |
1586 | /// let uppercase_g = 'G' ; |
1587 | /// let a = 'a' ; |
1588 | /// let g = 'g' ; |
1589 | /// let zero = '0' ; |
1590 | /// let percent = '%' ; |
1591 | /// let space = ' ' ; |
1592 | /// let lf = ' \n' ; |
1593 | /// let esc = ' \x1b' ; |
1594 | /// |
1595 | /// assert!(uppercase_a.is_ascii_hexdigit()); |
1596 | /// assert!(!uppercase_g.is_ascii_hexdigit()); |
1597 | /// assert!(a.is_ascii_hexdigit()); |
1598 | /// assert!(!g.is_ascii_hexdigit()); |
1599 | /// assert!(zero.is_ascii_hexdigit()); |
1600 | /// assert!(!percent.is_ascii_hexdigit()); |
1601 | /// assert!(!space.is_ascii_hexdigit()); |
1602 | /// assert!(!lf.is_ascii_hexdigit()); |
1603 | /// assert!(!esc.is_ascii_hexdigit()); |
1604 | /// ``` |
1605 | #[must_use ] |
1606 | #[stable (feature = "ascii_ctype_on_intrinsics" , since = "1.24.0" )] |
1607 | #[rustc_const_stable (feature = "const_ascii_ctype_on_intrinsics" , since = "1.47.0" )] |
1608 | #[inline ] |
1609 | pub const fn is_ascii_hexdigit(&self) -> bool { |
1610 | matches!(*self, '0' ..='9' ) | matches!(*self, 'A' ..='F' ) | matches!(*self, 'a' ..='f' ) |
1611 | } |
1612 | |
1613 | /// Checks if the value is an ASCII punctuation character: |
1614 | /// |
1615 | /// - U+0021 ..= U+002F `! " # $ % & ' ( ) * + , - . /`, or |
1616 | /// - U+003A ..= U+0040 `: ; < = > ? @`, or |
1617 | /// - U+005B ..= U+0060 ``[ \ ] ^ _ ` ``, or |
1618 | /// - U+007B ..= U+007E `{ | } ~` |
1619 | /// |
1620 | /// # Examples |
1621 | /// |
1622 | /// ``` |
1623 | /// let uppercase_a = 'A' ; |
1624 | /// let uppercase_g = 'G' ; |
1625 | /// let a = 'a' ; |
1626 | /// let g = 'g' ; |
1627 | /// let zero = '0' ; |
1628 | /// let percent = '%' ; |
1629 | /// let space = ' ' ; |
1630 | /// let lf = ' \n' ; |
1631 | /// let esc = ' \x1b' ; |
1632 | /// |
1633 | /// assert!(!uppercase_a.is_ascii_punctuation()); |
1634 | /// assert!(!uppercase_g.is_ascii_punctuation()); |
1635 | /// assert!(!a.is_ascii_punctuation()); |
1636 | /// assert!(!g.is_ascii_punctuation()); |
1637 | /// assert!(!zero.is_ascii_punctuation()); |
1638 | /// assert!(percent.is_ascii_punctuation()); |
1639 | /// assert!(!space.is_ascii_punctuation()); |
1640 | /// assert!(!lf.is_ascii_punctuation()); |
1641 | /// assert!(!esc.is_ascii_punctuation()); |
1642 | /// ``` |
1643 | #[must_use ] |
1644 | #[stable (feature = "ascii_ctype_on_intrinsics" , since = "1.24.0" )] |
1645 | #[rustc_const_stable (feature = "const_ascii_ctype_on_intrinsics" , since = "1.47.0" )] |
1646 | #[inline ] |
1647 | pub const fn is_ascii_punctuation(&self) -> bool { |
1648 | matches!(*self, '!' ..='/' ) |
1649 | | matches!(*self, ':' ..='@' ) |
1650 | | matches!(*self, '[' ..='`' ) |
1651 | | matches!(*self, '{' ..='~' ) |
1652 | } |
1653 | |
1654 | /// Checks if the value is an ASCII graphic character: |
1655 | /// U+0021 '!' ..= U+007E '~'. |
1656 | /// |
1657 | /// # Examples |
1658 | /// |
1659 | /// ``` |
1660 | /// let uppercase_a = 'A' ; |
1661 | /// let uppercase_g = 'G' ; |
1662 | /// let a = 'a' ; |
1663 | /// let g = 'g' ; |
1664 | /// let zero = '0' ; |
1665 | /// let percent = '%' ; |
1666 | /// let space = ' ' ; |
1667 | /// let lf = ' \n' ; |
1668 | /// let esc = ' \x1b' ; |
1669 | /// |
1670 | /// assert!(uppercase_a.is_ascii_graphic()); |
1671 | /// assert!(uppercase_g.is_ascii_graphic()); |
1672 | /// assert!(a.is_ascii_graphic()); |
1673 | /// assert!(g.is_ascii_graphic()); |
1674 | /// assert!(zero.is_ascii_graphic()); |
1675 | /// assert!(percent.is_ascii_graphic()); |
1676 | /// assert!(!space.is_ascii_graphic()); |
1677 | /// assert!(!lf.is_ascii_graphic()); |
1678 | /// assert!(!esc.is_ascii_graphic()); |
1679 | /// ``` |
1680 | #[must_use ] |
1681 | #[stable (feature = "ascii_ctype_on_intrinsics" , since = "1.24.0" )] |
1682 | #[rustc_const_stable (feature = "const_ascii_ctype_on_intrinsics" , since = "1.47.0" )] |
1683 | #[inline ] |
1684 | pub const fn is_ascii_graphic(&self) -> bool { |
1685 | matches!(*self, '!' ..='~' ) |
1686 | } |
1687 | |
1688 | /// Checks if the value is an ASCII whitespace character: |
1689 | /// U+0020 SPACE, U+0009 HORIZONTAL TAB, U+000A LINE FEED, |
1690 | /// U+000C FORM FEED, or U+000D CARRIAGE RETURN. |
1691 | /// |
1692 | /// Rust uses the WhatWG Infra Standard's [definition of ASCII |
1693 | /// whitespace][infra-aw]. There are several other definitions in |
1694 | /// wide use. For instance, [the POSIX locale][pct] includes |
1695 | /// U+000B VERTICAL TAB as well as all the above characters, |
1696 | /// but—from the very same specification—[the default rule for |
1697 | /// "field splitting" in the Bourne shell][bfs] considers *only* |
1698 | /// SPACE, HORIZONTAL TAB, and LINE FEED as whitespace. |
1699 | /// |
1700 | /// If you are writing a program that will process an existing |
1701 | /// file format, check what that format's definition of whitespace is |
1702 | /// before using this function. |
1703 | /// |
1704 | /// [infra-aw]: https://infra.spec.whatwg.org/#ascii-whitespace |
1705 | /// [pct]: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01 |
1706 | /// [bfs]: https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_06_05 |
1707 | /// |
1708 | /// # Examples |
1709 | /// |
1710 | /// ``` |
1711 | /// let uppercase_a = 'A' ; |
1712 | /// let uppercase_g = 'G' ; |
1713 | /// let a = 'a' ; |
1714 | /// let g = 'g' ; |
1715 | /// let zero = '0' ; |
1716 | /// let percent = '%' ; |
1717 | /// let space = ' ' ; |
1718 | /// let lf = ' \n' ; |
1719 | /// let esc = ' \x1b' ; |
1720 | /// |
1721 | /// assert!(!uppercase_a.is_ascii_whitespace()); |
1722 | /// assert!(!uppercase_g.is_ascii_whitespace()); |
1723 | /// assert!(!a.is_ascii_whitespace()); |
1724 | /// assert!(!g.is_ascii_whitespace()); |
1725 | /// assert!(!zero.is_ascii_whitespace()); |
1726 | /// assert!(!percent.is_ascii_whitespace()); |
1727 | /// assert!(space.is_ascii_whitespace()); |
1728 | /// assert!(lf.is_ascii_whitespace()); |
1729 | /// assert!(!esc.is_ascii_whitespace()); |
1730 | /// ``` |
1731 | #[must_use ] |
1732 | #[stable (feature = "ascii_ctype_on_intrinsics" , since = "1.24.0" )] |
1733 | #[rustc_const_stable (feature = "const_ascii_ctype_on_intrinsics" , since = "1.47.0" )] |
1734 | #[inline ] |
1735 | pub const fn is_ascii_whitespace(&self) -> bool { |
1736 | matches!(*self, ' \t' | ' \n' | ' \x0C' | ' \r' | ' ' ) |
1737 | } |
1738 | |
1739 | /// Checks if the value is an ASCII control character: |
1740 | /// U+0000 NUL ..= U+001F UNIT SEPARATOR, or U+007F DELETE. |
1741 | /// Note that most ASCII whitespace characters are control |
1742 | /// characters, but SPACE is not. |
1743 | /// |
1744 | /// # Examples |
1745 | /// |
1746 | /// ``` |
1747 | /// let uppercase_a = 'A' ; |
1748 | /// let uppercase_g = 'G' ; |
1749 | /// let a = 'a' ; |
1750 | /// let g = 'g' ; |
1751 | /// let zero = '0' ; |
1752 | /// let percent = '%' ; |
1753 | /// let space = ' ' ; |
1754 | /// let lf = ' \n' ; |
1755 | /// let esc = ' \x1b' ; |
1756 | /// |
1757 | /// assert!(!uppercase_a.is_ascii_control()); |
1758 | /// assert!(!uppercase_g.is_ascii_control()); |
1759 | /// assert!(!a.is_ascii_control()); |
1760 | /// assert!(!g.is_ascii_control()); |
1761 | /// assert!(!zero.is_ascii_control()); |
1762 | /// assert!(!percent.is_ascii_control()); |
1763 | /// assert!(!space.is_ascii_control()); |
1764 | /// assert!(lf.is_ascii_control()); |
1765 | /// assert!(esc.is_ascii_control()); |
1766 | /// ``` |
1767 | #[must_use ] |
1768 | #[stable (feature = "ascii_ctype_on_intrinsics" , since = "1.24.0" )] |
1769 | #[rustc_const_stable (feature = "const_ascii_ctype_on_intrinsics" , since = "1.47.0" )] |
1770 | #[inline ] |
1771 | pub const fn is_ascii_control(&self) -> bool { |
1772 | matches!(*self, ' \0' ..=' \x1F' | ' \x7F' ) |
1773 | } |
1774 | } |
1775 | |
/// Switches that select which optional escapes are applied.
pub(crate) struct EscapeDebugExtArgs {
    /// Whether Extended Grapheme codepoints are escaped.
    pub(crate) escape_grapheme_extended: bool,

    /// Whether single quotes are escaped.
    pub(crate) escape_single_quote: bool,

    /// Whether double quotes are escaped.
    pub(crate) escape_double_quote: bool,
}

impl EscapeDebugExtArgs {
    /// Configuration in which every optional escape is switched on.
    pub(crate) const ESCAPE_ALL: EscapeDebugExtArgs = EscapeDebugExtArgs {
        escape_double_quote: true,
        escape_single_quote: true,
        escape_grapheme_extended: true,
    };
}
1794 | |
1795 | #[inline ] |
1796 | #[must_use ] |
1797 | const fn len_utf8(code: u32) -> usize { |
1798 | match code { |
1799 | ..MAX_ONE_B => 1, |
1800 | ..MAX_TWO_B => 2, |
1801 | ..MAX_THREE_B => 3, |
1802 | _ => 4, |
1803 | } |
1804 | } |
1805 | |
/// Returns how many `u16` units the UTF-16 encoding of the raw code point `code` occupies:
/// 1 when the value fits in 16 bits, 2 otherwise.
#[inline]
#[must_use]
const fn len_utf16(code: u32) -> usize {
    match code {
        0..=0xFFFF => 1,
        _ => 2,
    }
}
1811 | |
1812 | /// Encodes a raw `u32` value as UTF-8 into the provided byte buffer, |
1813 | /// and then returns the subslice of the buffer that contains the encoded character. |
1814 | /// |
1815 | /// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range. |
1816 | /// (Creating a `char` in the surrogate range is UB.) |
1817 | /// The result is valid [generalized UTF-8] but not valid UTF-8. |
1818 | /// |
1819 | /// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8 |
1820 | /// |
1821 | /// # Panics |
1822 | /// |
1823 | /// Panics if the buffer is not large enough. |
1824 | /// A buffer of length four is large enough to encode any `char`. |
1825 | #[unstable (feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" )] |
1826 | #[doc (hidden)] |
1827 | #[inline ] |
1828 | pub const fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] { |
1829 | let len: usize = len_utf8(code); |
1830 | if dst.len() < len { |
1831 | const_panic!( |
1832 | "encode_utf8: buffer does not have enough bytes to encode code point" , |
1833 | "encode_utf8: need {len} bytes to encode U+ {code:04X} but buffer has just {dst_len}" , |
1834 | code: u32 = code, |
1835 | len: usize = len, |
1836 | dst_len: usize = dst.len(), |
1837 | ); |
1838 | } |
1839 | |
1840 | // SAFETY: `dst` is checked to be at least the length needed to encode the codepoint. |
1841 | unsafe { encode_utf8_raw_unchecked(code, dst.as_mut_ptr()) }; |
1842 | |
1843 | // SAFETY: `<&mut [u8]>::as_mut_ptr` is guaranteed to return a valid pointer and `len` has been tested to be within bounds. |
1844 | unsafe { slice::from_raw_parts_mut(data:dst.as_mut_ptr(), len) } |
1845 | } |
1846 | |
1847 | /// Encodes a raw `u32` value as UTF-8 into the byte buffer pointed to by `dst`. |
1848 | /// |
1849 | /// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range. |
1850 | /// (Creating a `char` in the surrogate range is UB.) |
1851 | /// The result is valid [generalized UTF-8] but not valid UTF-8. |
1852 | /// |
1853 | /// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8 |
1854 | /// |
1855 | /// # Safety |
1856 | /// |
1857 | /// The behavior is undefined if the buffer pointed to by `dst` is not |
1858 | /// large enough to hold the encoded codepoint. A buffer of length four |
1859 | /// is large enough to encode any `char`. |
1860 | /// |
1861 | /// For a safe version of this function, see the [`encode_utf8_raw`] function. |
1862 | #[unstable (feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" )] |
1863 | #[doc (hidden)] |
1864 | #[inline ] |
1865 | pub const unsafe fn encode_utf8_raw_unchecked(code: u32, dst: *mut u8) { |
1866 | let len = len_utf8(code); |
1867 | // SAFETY: The caller must guarantee that the buffer pointed to by `dst` |
1868 | // is at least `len` bytes long. |
1869 | unsafe { |
1870 | match len { |
1871 | 1 => { |
1872 | *dst = code as u8; |
1873 | } |
1874 | 2 => { |
1875 | *dst = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; |
1876 | *dst.add(1) = (code & 0x3F) as u8 | TAG_CONT; |
1877 | } |
1878 | 3 => { |
1879 | *dst = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; |
1880 | *dst.add(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT; |
1881 | *dst.add(2) = (code & 0x3F) as u8 | TAG_CONT; |
1882 | } |
1883 | 4 => { |
1884 | *dst = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; |
1885 | *dst.add(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT; |
1886 | *dst.add(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT; |
1887 | *dst.add(3) = (code & 0x3F) as u8 | TAG_CONT; |
1888 | } |
1889 | // SAFETY: `char` always takes between 1 and 4 bytes to encode in UTF-8. |
1890 | _ => crate::hint::unreachable_unchecked(), |
1891 | } |
1892 | } |
1893 | } |
1894 | |
1895 | /// Encodes a raw `u32` value as native endian UTF-16 into the provided `u16` buffer, |
1896 | /// and then returns the subslice of the buffer that contains the encoded character. |
1897 | /// |
1898 | /// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range. |
1899 | /// (Creating a `char` in the surrogate range is UB.) |
1900 | /// |
1901 | /// # Panics |
1902 | /// |
1903 | /// Panics if the buffer is not large enough. |
1904 | /// A buffer of length 2 is large enough to encode any `char`. |
1905 | #[unstable (feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" )] |
1906 | #[doc (hidden)] |
1907 | #[inline ] |
1908 | pub const fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] { |
1909 | let len: usize = len_utf16(code); |
1910 | match (len, &mut *dst) { |
1911 | (1, [a: &mut u16, ..]) => { |
1912 | *a = code as u16; |
1913 | } |
1914 | (2, [a: &mut u16, b: &mut u16, ..]) => { |
1915 | code -= 0x1_0000; |
1916 | *a = (code >> 10) as u16 | 0xD800; |
1917 | *b = (code & 0x3FF) as u16 | 0xDC00; |
1918 | } |
1919 | _ => { |
1920 | const_panic!( |
1921 | "encode_utf16: buffer does not have enough bytes to encode code point" , |
1922 | "encode_utf16: need {len} bytes to encode U+ {code:04X} but buffer has just {dst_len}" , |
1923 | code: u32 = code, |
1924 | len: usize = len, |
1925 | dst_len: usize = dst.len(), |
1926 | ) |
1927 | } |
1928 | }; |
1929 | // SAFETY: `<&mut [u16]>::as_mut_ptr` is guaranteed to return a valid pointer and `len` has been tested to be within bounds. |
1930 | unsafe { slice::from_raw_parts_mut(data:dst.as_mut_ptr(), len) } |
1931 | } |
1932 | |