1 | //! Utilities for the `char` primitive type. |
2 | //! |
3 | //! *[See also the `char` primitive type](primitive@char).* |
4 | //! |
5 | //! The `char` type represents a single character. More specifically, since |
6 | //! 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode |
7 | //! scalar value]', which is similar to, but not the same as, a '[Unicode code |
8 | //! point]'. |
9 | //! |
10 | //! [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value |
11 | //! [Unicode code point]: https://www.unicode.org/glossary/#code_point |
12 | //! |
//! This module exists for technical reasons; the primary documentation for
//! `char` is directly on [the `char` primitive type][char] itself.
15 | //! |
16 | //! This module is the home of the iterator implementations for the iterators |
17 | //! implemented on `char`, as well as some useful constants and conversion |
18 | //! functions that convert various types to `char`. |
19 | |
20 | #![allow (non_snake_case)] |
21 | #![stable (feature = "core_char" , since = "1.2.0" )] |
22 | |
23 | mod convert; |
24 | mod decode; |
25 | mod methods; |
26 | |
27 | // stable re-exports |
28 | #[stable (feature = "try_from" , since = "1.34.0" )] |
29 | pub use self::convert::CharTryFromError; |
30 | #[stable (feature = "char_from_str" , since = "1.20.0" )] |
31 | pub use self::convert::ParseCharError; |
32 | #[stable (feature = "decode_utf16" , since = "1.9.0" )] |
33 | pub use self::decode::{DecodeUtf16, DecodeUtf16Error}; |
34 | |
35 | // perma-unstable re-exports |
36 | #[unstable (feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" )] |
37 | pub use self::methods::encode_utf16_raw; |
38 | #[unstable (feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" )] |
39 | pub use self::methods::encode_utf8_raw; |
40 | |
41 | use crate::ascii; |
42 | use crate::error::Error; |
43 | use crate::escape; |
44 | use crate::fmt::{self, Write}; |
45 | use crate::iter::FusedIterator; |
46 | use crate::num::NonZeroUsize; |
47 | |
48 | pub(crate) use self::methods::EscapeDebugExtArgs; |
49 | |
// UTF-8 ranges and tags for encoding characters.
//
// Tag bytes, written in hex; the bit pattern is shown next to each value.
const TAG_CONT: u8 = 0x80; // 0b1000_0000
const TAG_TWO_B: u8 = 0xC0; // 0b1100_0000
const TAG_THREE_B: u8 = 0xE0; // 0b1110_0000
const TAG_FOUR_B: u8 = 0xF0; // 0b1111_0000
// Range limits, written as powers of two.
const MAX_ONE_B: u32 = 1 << 7; // 0x80
const MAX_TWO_B: u32 = 1 << 11; // 0x800
const MAX_THREE_B: u32 = 1 << 16; // 0x10000
58 | |
59 | /* |
60 | Lu Uppercase_Letter an uppercase letter |
61 | Ll Lowercase_Letter a lowercase letter |
62 | Lt Titlecase_Letter a digraphic character, with first part uppercase |
63 | Lm Modifier_Letter a modifier letter |
64 | Lo Other_Letter other letters, including syllables and ideographs |
65 | Mn Nonspacing_Mark a nonspacing combining mark (zero advance width) |
66 | Mc Spacing_Mark a spacing combining mark (positive advance width) |
67 | Me Enclosing_Mark an enclosing combining mark |
68 | Nd Decimal_Number a decimal digit |
69 | Nl Letter_Number a letterlike numeric character |
70 | No Other_Number a numeric character of other type |
71 | Pc Connector_Punctuation a connecting punctuation mark, like a tie |
72 | Pd Dash_Punctuation a dash or hyphen punctuation mark |
73 | Ps Open_Punctuation an opening punctuation mark (of a pair) |
74 | Pe Close_Punctuation a closing punctuation mark (of a pair) |
75 | Pi Initial_Punctuation an initial quotation mark |
76 | Pf Final_Punctuation a final quotation mark |
77 | Po Other_Punctuation a punctuation mark of other type |
78 | Sm Math_Symbol a symbol of primarily mathematical use |
79 | Sc Currency_Symbol a currency sign |
80 | Sk Modifier_Symbol a non-letterlike modifier symbol |
81 | So Other_Symbol a symbol of other type |
82 | Zs Space_Separator a space character (of various non-zero widths) |
83 | Zl Line_Separator U+2028 LINE SEPARATOR only |
84 | Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only |
85 | Cc Control a C0 or C1 control code |
86 | Cf Format a format control character |
87 | Cs Surrogate a surrogate code point |
88 | Co Private_Use a private-use character |
89 | Cn Unassigned a reserved unassigned code point or a noncharacter |
90 | */ |
91 | |
/// The highest valid code point a `char` can have, `'\u{10FFFF}'`. Use [`char::MAX`] instead.
///
/// This module-level constant is an alias: its value is taken directly from
/// [`char::MAX`].
#[stable (feature = "rust1" , since = "1.0.0" )]
pub const MAX: char = char::MAX;

/// `U+FFFD REPLACEMENT CHARACTER` (�) is used in Unicode to represent a
/// decoding error. Use [`char::REPLACEMENT_CHARACTER`] instead.
///
/// This module-level constant is an alias: its value is taken directly from
/// [`char::REPLACEMENT_CHARACTER`].
#[stable (feature = "decode_utf16" , since = "1.9.0" )]
pub const REPLACEMENT_CHARACTER: char = char::REPLACEMENT_CHARACTER;

/// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
/// `char` and `str` methods are based on. Use [`char::UNICODE_VERSION`] instead.
///
/// The value is a three-element tuple of `u8`s, taken directly from
/// [`char::UNICODE_VERSION`].
#[stable (feature = "unicode_version" , since = "1.45.0" )]
pub const UNICODE_VERSION: (u8, u8, u8) = char::UNICODE_VERSION;
105 | |
106 | /// Creates an iterator over the UTF-16 encoded code points in `iter`, returning |
107 | /// unpaired surrogates as `Err`s. Use [`char::decode_utf16`] instead. |
108 | #[stable (feature = "decode_utf16" , since = "1.9.0" )] |
109 | #[inline ] |
110 | pub fn decode_utf16<I: IntoIterator<Item = u16>>(iter: I) -> DecodeUtf16<I::IntoIter> { |
111 | self::decode::decode_utf16(iter) |
112 | } |
113 | |
114 | /// Converts a `u32` to a `char`. Use [`char::from_u32`] instead. |
115 | #[stable (feature = "rust1" , since = "1.0.0" )] |
116 | #[rustc_const_stable (feature = "const_char_convert" , since = "1.67.0" )] |
117 | #[must_use ] |
118 | #[inline ] |
119 | pub const fn from_u32(i: u32) -> Option<char> { |
120 | self::convert::from_u32(i) |
121 | } |
122 | |
123 | /// Converts a `u32` to a `char`, ignoring validity. Use [`char::from_u32_unchecked`]. |
124 | /// instead. |
125 | #[stable (feature = "char_from_unchecked" , since = "1.5.0" )] |
126 | #[rustc_const_unstable (feature = "const_char_from_u32_unchecked" , issue = "89259" )] |
127 | #[must_use ] |
128 | #[inline ] |
129 | pub const unsafe fn from_u32_unchecked(i: u32) -> char { |
130 | // SAFETY: the safety contract must be upheld by the caller. |
131 | unsafe { self::convert::from_u32_unchecked(i) } |
132 | } |
133 | |
134 | /// Converts a digit in the given radix to a `char`. Use [`char::from_digit`] instead. |
135 | #[stable (feature = "rust1" , since = "1.0.0" )] |
136 | #[rustc_const_stable (feature = "const_char_convert" , since = "1.67.0" )] |
137 | #[must_use ] |
138 | #[inline ] |
139 | pub const fn from_digit(num: u32, radix: u32) -> Option<char> { |
140 | self::convert::from_digit(num, radix) |
141 | } |
142 | |
143 | /// Returns an iterator that yields the hexadecimal Unicode escape of a |
144 | /// character, as `char`s. |
145 | /// |
146 | /// This `struct` is created by the [`escape_unicode`] method on [`char`]. See |
147 | /// its documentation for more. |
148 | /// |
149 | /// [`escape_unicode`]: char::escape_unicode |
150 | #[derive (Clone, Debug)] |
151 | #[stable (feature = "rust1" , since = "1.0.0" )] |
152 | pub struct EscapeUnicode(escape::EscapeIterInner<10>); |
153 | |
154 | impl EscapeUnicode { |
155 | fn new(chr: char) -> Self { |
156 | let mut data: [AsciiChar; 10] = [ascii::Char::Null; 10]; |
157 | let range: Range = escape::escape_unicode_into(&mut data, ch:chr); |
158 | Self(escape::EscapeIterInner::new(data, alive:range)) |
159 | } |
160 | } |
161 | |
162 | #[stable (feature = "rust1" , since = "1.0.0" )] |
163 | impl Iterator for EscapeUnicode { |
164 | type Item = char; |
165 | |
166 | #[inline ] |
167 | fn next(&mut self) -> Option<char> { |
168 | self.0.next().map(char::from) |
169 | } |
170 | |
171 | #[inline ] |
172 | fn size_hint(&self) -> (usize, Option<usize>) { |
173 | let n = self.0.len(); |
174 | (n, Some(n)) |
175 | } |
176 | |
177 | #[inline ] |
178 | fn count(self) -> usize { |
179 | self.0.len() |
180 | } |
181 | |
182 | #[inline ] |
183 | fn last(mut self) -> Option<char> { |
184 | self.0.next_back().map(char::from) |
185 | } |
186 | |
187 | #[inline ] |
188 | fn advance_by(&mut self, n: usize) -> Result<(), NonZeroUsize> { |
189 | self.0.advance_by(n) |
190 | } |
191 | } |
192 | |
193 | #[stable (feature = "exact_size_escape" , since = "1.11.0" )] |
194 | impl ExactSizeIterator for EscapeUnicode { |
195 | #[inline ] |
196 | fn len(&self) -> usize { |
197 | self.0.len() |
198 | } |
199 | } |
200 | |
201 | #[stable (feature = "fused" , since = "1.26.0" )] |
202 | impl FusedIterator for EscapeUnicode {} |
203 | |
204 | #[stable (feature = "char_struct_display" , since = "1.16.0" )] |
205 | impl fmt::Display for EscapeUnicode { |
206 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
207 | f.write_str(self.0.as_str()) |
208 | } |
209 | } |
210 | |
211 | /// An iterator that yields the literal escape code of a `char`. |
212 | /// |
213 | /// This `struct` is created by the [`escape_default`] method on [`char`]. See |
214 | /// its documentation for more. |
215 | /// |
216 | /// [`escape_default`]: char::escape_default |
217 | #[derive (Clone, Debug)] |
218 | #[stable (feature = "rust1" , since = "1.0.0" )] |
219 | pub struct EscapeDefault(escape::EscapeIterInner<10>); |
220 | |
221 | impl EscapeDefault { |
222 | fn printable(chr: ascii::Char) -> Self { |
223 | let data: [AsciiChar; 1] = [chr]; |
224 | Self(escape::EscapeIterInner::from_array(data)) |
225 | } |
226 | |
227 | fn backslash(chr: ascii::Char) -> Self { |
228 | let data: [AsciiChar; 2] = [ascii::Char::ReverseSolidus, chr]; |
229 | Self(escape::EscapeIterInner::from_array(data)) |
230 | } |
231 | |
232 | fn from_unicode(esc: EscapeUnicode) -> Self { |
233 | Self(esc.0) |
234 | } |
235 | } |
236 | |
237 | #[stable (feature = "rust1" , since = "1.0.0" )] |
238 | impl Iterator for EscapeDefault { |
239 | type Item = char; |
240 | |
241 | #[inline ] |
242 | fn next(&mut self) -> Option<char> { |
243 | self.0.next().map(char::from) |
244 | } |
245 | |
246 | #[inline ] |
247 | fn size_hint(&self) -> (usize, Option<usize>) { |
248 | let n = self.0.len(); |
249 | (n, Some(n)) |
250 | } |
251 | |
252 | #[inline ] |
253 | fn count(self) -> usize { |
254 | self.0.len() |
255 | } |
256 | |
257 | #[inline ] |
258 | fn last(mut self) -> Option<char> { |
259 | self.0.next_back().map(char::from) |
260 | } |
261 | |
262 | #[inline ] |
263 | fn advance_by(&mut self, n: usize) -> Result<(), NonZeroUsize> { |
264 | self.0.advance_by(n) |
265 | } |
266 | } |
267 | |
268 | #[stable (feature = "exact_size_escape" , since = "1.11.0" )] |
269 | impl ExactSizeIterator for EscapeDefault { |
270 | #[inline ] |
271 | fn len(&self) -> usize { |
272 | self.0.len() |
273 | } |
274 | } |
275 | |
276 | #[stable (feature = "fused" , since = "1.26.0" )] |
277 | impl FusedIterator for EscapeDefault {} |
278 | |
279 | #[stable (feature = "char_struct_display" , since = "1.16.0" )] |
280 | impl fmt::Display for EscapeDefault { |
281 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
282 | f.write_str(self.0.as_str()) |
283 | } |
284 | } |
285 | |
286 | /// An iterator that yields the literal escape code of a `char`. |
287 | /// |
288 | /// This `struct` is created by the [`escape_debug`] method on [`char`]. See its |
289 | /// documentation for more. |
290 | /// |
291 | /// [`escape_debug`]: char::escape_debug |
292 | #[stable (feature = "char_escape_debug" , since = "1.20.0" )] |
293 | #[derive (Clone, Debug)] |
294 | pub struct EscapeDebug(EscapeDebugInner); |
295 | |
296 | #[derive (Clone, Debug)] |
297 | // Note: It’s possible to manually encode the EscapeDebugInner inside of |
298 | // EscapeIterInner (e.g. with alive=254..255 indicating that data[0..4] holds |
299 | // a char) which would likely result in a more optimised code. For now we use |
300 | // the option easier to implement. |
301 | enum EscapeDebugInner { |
302 | Bytes(escape::EscapeIterInner<10>), |
303 | Char(char), |
304 | } |
305 | |
306 | impl EscapeDebug { |
307 | fn printable(chr: char) -> Self { |
308 | Self(EscapeDebugInner::Char(chr)) |
309 | } |
310 | |
311 | fn backslash(chr: ascii::Char) -> Self { |
312 | let data: [AsciiChar; 2] = [ascii::Char::ReverseSolidus, chr]; |
313 | let iter: EscapeIterInner<10> = escape::EscapeIterInner::from_array(data); |
314 | Self(EscapeDebugInner::Bytes(iter)) |
315 | } |
316 | |
317 | fn from_unicode(esc: EscapeUnicode) -> Self { |
318 | Self(EscapeDebugInner::Bytes(esc.0)) |
319 | } |
320 | |
321 | fn clear(&mut self) { |
322 | let bytes: EscapeIterInner<10> = escape::EscapeIterInner::from_array([]); |
323 | self.0 = EscapeDebugInner::Bytes(bytes); |
324 | } |
325 | } |
326 | |
327 | #[stable (feature = "char_escape_debug" , since = "1.20.0" )] |
328 | impl Iterator for EscapeDebug { |
329 | type Item = char; |
330 | |
331 | #[inline ] |
332 | fn next(&mut self) -> Option<char> { |
333 | match self.0 { |
334 | EscapeDebugInner::Bytes(ref mut bytes: &mut EscapeIterInner<10>) => bytes.next().map(char::from), |
335 | EscapeDebugInner::Char(chr: char) => { |
336 | self.clear(); |
337 | Some(chr) |
338 | } |
339 | } |
340 | } |
341 | |
342 | fn size_hint(&self) -> (usize, Option<usize>) { |
343 | let n: usize = self.len(); |
344 | (n, Some(n)) |
345 | } |
346 | |
347 | #[inline ] |
348 | fn count(self) -> usize { |
349 | self.len() |
350 | } |
351 | } |
352 | |
353 | #[stable (feature = "char_escape_debug" , since = "1.20.0" )] |
354 | impl ExactSizeIterator for EscapeDebug { |
355 | fn len(&self) -> usize { |
356 | match &self.0 { |
357 | EscapeDebugInner::Bytes(bytes: &EscapeIterInner<10>) => bytes.len(), |
358 | EscapeDebugInner::Char(_) => 1, |
359 | } |
360 | } |
361 | } |
362 | |
363 | #[stable (feature = "fused" , since = "1.26.0" )] |
364 | impl FusedIterator for EscapeDebug {} |
365 | |
366 | #[stable (feature = "char_escape_debug" , since = "1.20.0" )] |
367 | impl fmt::Display for EscapeDebug { |
368 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
369 | match &self.0 { |
370 | EscapeDebugInner::Bytes(bytes: &EscapeIterInner<10>) => f.write_str(data:bytes.as_str()), |
371 | EscapeDebugInner::Char(chr: &char) => f.write_char(*chr), |
372 | } |
373 | } |
374 | } |
375 | |
376 | /// Returns an iterator that yields the lowercase equivalent of a `char`. |
377 | /// |
378 | /// This `struct` is created by the [`to_lowercase`] method on [`char`]. See |
379 | /// its documentation for more. |
380 | /// |
381 | /// [`to_lowercase`]: char::to_lowercase |
382 | #[stable (feature = "rust1" , since = "1.0.0" )] |
383 | #[derive (Debug, Clone)] |
384 | pub struct ToLowercase(CaseMappingIter); |
385 | |
386 | #[stable (feature = "rust1" , since = "1.0.0" )] |
387 | impl Iterator for ToLowercase { |
388 | type Item = char; |
389 | fn next(&mut self) -> Option<char> { |
390 | self.0.next() |
391 | } |
392 | fn size_hint(&self) -> (usize, Option<usize>) { |
393 | self.0.size_hint() |
394 | } |
395 | } |
396 | |
397 | #[stable (feature = "case_mapping_double_ended" , since = "1.59.0" )] |
398 | impl DoubleEndedIterator for ToLowercase { |
399 | fn next_back(&mut self) -> Option<char> { |
400 | self.0.next_back() |
401 | } |
402 | } |
403 | |
404 | #[stable (feature = "fused" , since = "1.26.0" )] |
405 | impl FusedIterator for ToLowercase {} |
406 | |
407 | #[stable (feature = "exact_size_case_mapping_iter" , since = "1.35.0" )] |
408 | impl ExactSizeIterator for ToLowercase {} |
409 | |
410 | /// Returns an iterator that yields the uppercase equivalent of a `char`. |
411 | /// |
412 | /// This `struct` is created by the [`to_uppercase`] method on [`char`]. See |
413 | /// its documentation for more. |
414 | /// |
415 | /// [`to_uppercase`]: char::to_uppercase |
416 | #[stable (feature = "rust1" , since = "1.0.0" )] |
417 | #[derive (Debug, Clone)] |
418 | pub struct ToUppercase(CaseMappingIter); |
419 | |
420 | #[stable (feature = "rust1" , since = "1.0.0" )] |
421 | impl Iterator for ToUppercase { |
422 | type Item = char; |
423 | fn next(&mut self) -> Option<char> { |
424 | self.0.next() |
425 | } |
426 | fn size_hint(&self) -> (usize, Option<usize>) { |
427 | self.0.size_hint() |
428 | } |
429 | } |
430 | |
431 | #[stable (feature = "case_mapping_double_ended" , since = "1.59.0" )] |
432 | impl DoubleEndedIterator for ToUppercase { |
433 | fn next_back(&mut self) -> Option<char> { |
434 | self.0.next_back() |
435 | } |
436 | } |
437 | |
438 | #[stable (feature = "fused" , since = "1.26.0" )] |
439 | impl FusedIterator for ToUppercase {} |
440 | |
441 | #[stable (feature = "exact_size_case_mapping_iter" , since = "1.35.0" )] |
442 | impl ExactSizeIterator for ToUppercase {} |
443 | |
/// Iterator over the one to three `char`s of a case mapping.
///
/// The variant records how many characters are still to be yielded; each call
/// to `next` or `next_back` moves to the next smaller variant.
#[derive(Debug, Clone)]
enum CaseMappingIter {
    Three(char, char, char),
    Two(char, char),
    One(char),
    Zero,
}

impl CaseMappingIter {
    /// Builds the iterator from a three-element array in which trailing
    /// `'\0'` entries mark unused slots.
    ///
    /// The first element is always kept, so an all-`'\0'` array yields a
    /// single `'\0'`.
    fn new(chars: [char; 3]) -> CaseMappingIter {
        if chars[2] == '\0' {
            if chars[1] == '\0' {
                CaseMappingIter::One(chars[0]) // Including if chars[0] == '\0'
            } else {
                CaseMappingIter::Two(chars[0], chars[1])
            }
        } else {
            CaseMappingIter::Three(chars[0], chars[1], chars[2])
        }
    }
}

impl Iterator for CaseMappingIter {
    type Item = char;

    /// Yields the front character and shrinks the state by one.
    fn next(&mut self) -> Option<char> {
        match *self {
            CaseMappingIter::Three(a, b, c) => {
                *self = CaseMappingIter::Two(b, c);
                Some(a)
            }
            CaseMappingIter::Two(b, c) => {
                *self = CaseMappingIter::One(c);
                Some(b)
            }
            CaseMappingIter::One(c) => {
                *self = CaseMappingIter::Zero;
                Some(c)
            }
            CaseMappingIter::Zero => None,
        }
    }

    /// Exact: both bounds equal the number of characters in the current
    /// variant.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let size = match self {
            CaseMappingIter::Three(..) => 3,
            CaseMappingIter::Two(..) => 2,
            CaseMappingIter::One(_) => 1,
            CaseMappingIter::Zero => 0,
        };
        (size, Some(size))
    }
}

impl DoubleEndedIterator for CaseMappingIter {
    /// Yields the back character and shrinks the state by one.
    fn next_back(&mut self) -> Option<char> {
        match *self {
            CaseMappingIter::Three(a, b, c) => {
                *self = CaseMappingIter::Two(a, b);
                Some(c)
            }
            CaseMappingIter::Two(b, c) => {
                *self = CaseMappingIter::One(b);
                Some(c)
            }
            CaseMappingIter::One(c) => {
                *self = CaseMappingIter::Zero;
                Some(c)
            }
            CaseMappingIter::Zero => None,
        }
    }
}

impl fmt::Display for CaseMappingIter {
    /// Writes the remaining characters in front-to-back order without
    /// consuming the iterator.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            CaseMappingIter::Three(a, b, c) => {
                f.write_char(a)?;
                f.write_char(b)?;
                f.write_char(c)
            }
            CaseMappingIter::Two(b, c) => {
                f.write_char(b)?;
                f.write_char(c)
            }
            CaseMappingIter::One(c) => f.write_char(c),
            CaseMappingIter::Zero => Ok(()),
        }
    }
}
534 | |
535 | #[stable (feature = "char_struct_display" , since = "1.16.0" )] |
536 | impl fmt::Display for ToLowercase { |
537 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
538 | fmt::Display::fmt(&self.0, f) |
539 | } |
540 | } |
541 | |
542 | #[stable (feature = "char_struct_display" , since = "1.16.0" )] |
543 | impl fmt::Display for ToUppercase { |
544 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
545 | fmt::Display::fmt(&self.0, f) |
546 | } |
547 | } |
548 | |
549 | /// The error type returned when a checked char conversion fails. |
550 | #[stable (feature = "u8_from_char" , since = "1.59.0" )] |
551 | #[derive (Debug, Copy, Clone, PartialEq, Eq)] |
552 | pub struct TryFromCharError(pub(crate) ()); |
553 | |
554 | #[stable (feature = "u8_from_char" , since = "1.59.0" )] |
555 | impl fmt::Display for TryFromCharError { |
556 | fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { |
557 | "unicode code point out of range" .fmt(fmt) |
558 | } |
559 | } |
560 | |
561 | #[stable (feature = "u8_from_char" , since = "1.59.0" )] |
562 | impl Error for TryFromCharError {} |
563 | |