1 | // Copyright 2013 The rust-url developers. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
4 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
5 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
6 | // option. This file may not be copied, modified, or distributed |
7 | // except according to those terms. |
8 | |
9 | //! Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation. |
10 | //! |
//! Since Punycode fundamentally works on Unicode code points,
//! `encode` takes a slice of `char` and `decode` returns a vector of `char`.
13 | //! `encode_str` and `decode_to_string` provide convenience wrappers |
14 | //! that convert from and to Rust’s UTF-8 based `str` and `String` types. |
15 | |
16 | use alloc::{string::String, vec::Vec}; |
17 | use core::char; |
18 | use core::fmt::Write; |
19 | use core::marker::PhantomData; |
20 | |
// Bootstring parameters for Punycode (RFC 3492).

/// Number of distinct digit values; digits are `a`..`z` (0..=25) and `0`..`9` (26..=35).
const BASE: u32 = 36;
/// Lower clamp of the per-digit threshold `t`.
const T_MIN: u32 = 1;
/// Upper clamp of the per-digit threshold `t`.
const T_MAX: u32 = 26;
/// Skew term used in the final step of bias adaptation.
const SKEW: u32 = 38;
/// Divisor applied to the very first delta during bias adaptation.
const DAMP: u32 = 700;
/// Bias value both the encoder and the decoder start with.
const INITIAL_BIAS: u32 = 72;
/// First non-basic code point; the encoder and decoder state starts here.
const INITIAL_N: u32 = 0x80;
29 | |
30 | #[inline ] |
31 | fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 { |
32 | delta /= if first_time { DAMP } else { 2 }; |
33 | delta += delta / num_points; |
34 | let mut k: u32 = 0; |
35 | while delta > ((BASE - T_MIN) * T_MAX) / 2 { |
36 | delta /= BASE - T_MIN; |
37 | k += BASE; |
38 | } |
39 | k + (((BASE - T_MIN + 1) * delta) / (delta + SKEW)) |
40 | } |
41 | |
42 | /// Convert Punycode to an Unicode `String`. |
43 | /// |
44 | /// Return None on malformed input or overflow. |
45 | /// Overflow can only happen on inputs that take more than |
46 | /// 63 encoded bytes, the DNS limit on domain name labels. |
47 | #[inline ] |
48 | pub fn decode_to_string(input: &str) -> Option<String> { |
49 | Some( |
50 | DecoderDecode<'_, u8, ExternalCaller>::default() |
51 | .decode::<u8, ExternalCaller>(input.as_bytes()) |
52 | .ok()? |
53 | .collect(), |
54 | ) |
55 | } |
56 | |
57 | /// Convert Punycode to Unicode. |
58 | /// |
59 | /// Return None on malformed input or overflow. |
60 | /// Overflow can only happen on inputs that take more than |
61 | /// 63 encoded bytes, the DNS limit on domain name labels. |
62 | pub fn decode(input: &str) -> Option<Vec<char>> { |
63 | Some( |
64 | DecoderDecode<'_, u8, ExternalCaller>::default() |
65 | .decode::<u8, ExternalCaller>(input.as_bytes()) |
66 | .ok()? |
67 | .collect(), |
68 | ) |
69 | } |
70 | |
/// Compile-time marker distinguishing internal from external callers, so
/// that the old public API behavior is retained while internal callers get
/// tweaked behavior.
///
/// Encoding: external callers need overflow checks, internal callers do
/// not, because `PUNYCODE_ENCODE_MAX_INPUT_LENGTH` is set to 1000 and, per
/// RFC 3492 section 6.4, the integer variable does not need to represent
/// values larger than
/// (char::MAX - INITIAL_N) * (PUNYCODE_ENCODE_MAX_INPUT_LENGTH + 1),
/// which is less than u32::MAX.
///
/// Decoding: external callers need upper-case ASCII to be handled, internal
/// callers do not, because the internal code calls the decoder only with
/// lower-case inputs.
pub(crate) trait PunycodeCaller {
    /// `true` for callers coming through the public API.
    const EXTERNAL_CALLER: bool;
}
87 | |
88 | pub(crate) struct InternalCaller; |
89 | |
90 | impl PunycodeCaller for InternalCaller { |
91 | const EXTERNAL_CALLER: bool = false; |
92 | } |
93 | |
94 | struct ExternalCaller; |
95 | |
96 | impl PunycodeCaller for ExternalCaller { |
97 | const EXTERNAL_CALLER: bool = true; |
98 | } |
99 | |
/// A single unit of Punycode input (implemented in this file for `u8` and
/// `char`).
pub(crate) trait PunycodeCodeUnit {
    /// Whether this unit is the `-` delimiter.
    fn is_delimiter(&self) -> bool;

    /// Whether this unit is ASCII.
    fn is_ascii(&self) -> bool;

    /// The Punycode digit value of this unit, or `None` if it is not a digit.
    fn digit(&self) -> Option<u32>;

    /// This unit converted to a `char`.
    fn char(&self) -> char;

    /// This unit as a `char` for callers that expect lower-case ASCII.
    fn char_ascii_lower_case(&self) -> char;
}
107 | |
108 | impl PunycodeCodeUnit for u8 { |
109 | fn is_delimiter(&self) -> bool { |
110 | *self == b'-' |
111 | } |
112 | fn is_ascii(&self) -> bool { |
113 | *self < 0x80 |
114 | } |
115 | fn digit(&self) -> Option<u32> { |
116 | let byte: u8 = *self; |
117 | Some(match byte { |
118 | byte: u8 @ b'0' ..=b'9' => byte - b'0' + 26, |
119 | byte: u8 @ b'A' ..=b'Z' => byte - b'A' , |
120 | byte: u8 @ b'a' ..=b'z' => byte - b'a' , |
121 | _ => return None, |
122 | } as u32) |
123 | } |
124 | fn char(&self) -> char { |
125 | char::from(*self) |
126 | } |
127 | fn char_ascii_lower_case(&self) -> char { |
128 | char::from(self.to_ascii_lowercase()) |
129 | } |
130 | } |
131 | |
132 | impl PunycodeCodeUnit for char { |
133 | fn is_delimiter(&self) -> bool { |
134 | *self == '-' |
135 | } |
136 | fn is_ascii(&self) -> bool { |
137 | debug_assert!(false); // Unused |
138 | true |
139 | } |
140 | fn digit(&self) -> Option<u32> { |
141 | let byte = *self; |
142 | Some(match byte { |
143 | byte @ '0' ..='9' => u32::from(byte) - u32::from('0' ) + 26, |
144 | // byte @ 'A'..='Z' => u32::from(byte) - u32::from('A'), // XXX not needed if no public input |
145 | byte @ 'a' ..='z' => u32::from(byte) - u32::from('a' ), |
146 | _ => return None, |
147 | }) |
148 | } |
149 | fn char(&self) -> char { |
150 | debug_assert!(false); // Unused |
151 | *self |
152 | } |
153 | fn char_ascii_lower_case(&self) -> char { |
154 | // No need to actually lower-case! |
155 | *self |
156 | } |
157 | } |
158 | |
159 | #[derive (Default)] |
160 | pub(crate) struct Decoder { |
161 | insertions: smallvec::SmallVec<[(usize, char); 59]>, |
162 | } |
163 | |
164 | impl Decoder { |
165 | /// Split the input iterator and return a Vec with insertions of encoded characters |
166 | pub(crate) fn decode<'a, T: PunycodeCodeUnit + Copy, C: PunycodeCaller>( |
167 | &'a mut self, |
168 | input: &'a [T], |
169 | ) -> Result<Decode<'a, T, C>, ()> { |
170 | self.insertions.clear(); |
171 | // Handle "basic" (ASCII) code points. |
172 | // They are encoded as-is before the last delimiter, if any. |
173 | let (base, input) = if let Some(position) = input.iter().rposition(|c| c.is_delimiter()) { |
174 | ( |
175 | &input[..position], |
176 | if position > 0 { |
177 | &input[position + 1..] |
178 | } else { |
179 | input |
180 | }, |
181 | ) |
182 | } else { |
183 | (&input[..0], input) |
184 | }; |
185 | |
186 | if C::EXTERNAL_CALLER && !base.iter().all(|c| c.is_ascii()) { |
187 | return Err(()); |
188 | } |
189 | |
190 | let base_len = base.len(); |
191 | let mut length = base_len as u32; |
192 | let mut code_point = INITIAL_N; |
193 | let mut bias = INITIAL_BIAS; |
194 | let mut i = 0u32; |
195 | let mut iter = input.iter(); |
196 | loop { |
197 | let previous_i = i; |
198 | let mut weight = 1; |
199 | let mut k = BASE; |
200 | let mut byte = match iter.next() { |
201 | None => break, |
202 | Some(byte) => byte, |
203 | }; |
204 | |
205 | // Decode a generalized variable-length integer into delta, |
206 | // which gets added to i. |
207 | loop { |
208 | let digit = if let Some(digit) = byte.digit() { |
209 | digit |
210 | } else { |
211 | return Err(()); |
212 | }; |
213 | let product = digit.checked_mul(weight).ok_or(())?; |
214 | i = i.checked_add(product).ok_or(())?; |
215 | let t = if k <= bias { |
216 | T_MIN |
217 | } else if k >= bias + T_MAX { |
218 | T_MAX |
219 | } else { |
220 | k - bias |
221 | }; |
222 | if digit < t { |
223 | break; |
224 | } |
225 | weight = weight.checked_mul(BASE - t).ok_or(())?; |
226 | k += BASE; |
227 | byte = match iter.next() { |
228 | None => return Err(()), // End of input before the end of this delta |
229 | Some(byte) => byte, |
230 | }; |
231 | } |
232 | |
233 | bias = adapt(i - previous_i, length + 1, previous_i == 0); |
234 | |
235 | // i was supposed to wrap around from length+1 to 0, |
236 | // incrementing code_point each time. |
237 | code_point = code_point.checked_add(i / (length + 1)).ok_or(())?; |
238 | i %= length + 1; |
239 | let c = match char::from_u32(code_point) { |
240 | Some(c) => c, |
241 | None => return Err(()), |
242 | }; |
243 | |
244 | // Move earlier insertions farther out in the string |
245 | for (idx, _) in &mut self.insertions { |
246 | if *idx >= i as usize { |
247 | *idx += 1; |
248 | } |
249 | } |
250 | self.insertions.push((i as usize, c)); |
251 | length += 1; |
252 | i += 1; |
253 | } |
254 | |
255 | self.insertions.sort_by_key(|(i, _)| *i); |
256 | Ok(Decode { |
257 | base: base.iter(), |
258 | insertions: &self.insertions, |
259 | inserted: 0, |
260 | position: 0, |
261 | len: base_len + self.insertions.len(), |
262 | phantom: PhantomData::<C>, |
263 | }) |
264 | } |
265 | } |
266 | |
267 | pub(crate) struct Decode<'a, T, C> |
268 | where |
269 | T: PunycodeCodeUnit + Copy, |
270 | C: PunycodeCaller, |
271 | { |
272 | base: core::slice::Iter<'a, T>, |
273 | pub(crate) insertions: &'a [(usize, char)], |
274 | inserted: usize, |
275 | position: usize, |
276 | len: usize, |
277 | phantom: PhantomData<C>, |
278 | } |
279 | |
280 | impl<'a, T: PunycodeCodeUnit + Copy, C: PunycodeCaller> Iterator for Decode<'a, T, C> { |
281 | type Item = char; |
282 | |
283 | fn next(&mut self) -> Option<Self::Item> { |
284 | loop { |
285 | match self.insertions.get(self.inserted) { |
286 | Some((pos, c)) if *pos == self.position => { |
287 | self.inserted += 1; |
288 | self.position += 1; |
289 | return Some(*c); |
290 | } |
291 | _ => {} |
292 | } |
293 | if let Some(c) = self.base.next() { |
294 | self.position += 1; |
295 | return Some(if C::EXTERNAL_CALLER { |
296 | c.char() |
297 | } else { |
298 | c.char_ascii_lower_case() |
299 | }); |
300 | } else if self.inserted >= self.insertions.len() { |
301 | return None; |
302 | } |
303 | } |
304 | } |
305 | |
306 | fn size_hint(&self) -> (usize, Option<usize>) { |
307 | let len = self.len - self.position; |
308 | (len, Some(len)) |
309 | } |
310 | } |
311 | |
312 | impl<'a, T: PunycodeCodeUnit + Copy, C: PunycodeCaller> ExactSizeIterator for Decode<'a, T, C> { |
313 | fn len(&self) -> usize { |
314 | self.len - self.position |
315 | } |
316 | } |
317 | |
318 | /// Convert an Unicode `str` to Punycode. |
319 | /// |
320 | /// This is a convenience wrapper around `encode`. |
321 | #[inline ] |
322 | pub fn encode_str(input: &str) -> Option<String> { |
323 | if input.len() > u32::MAX as usize { |
324 | return None; |
325 | } |
326 | let mut buf: String = String::with_capacity(input.len()); |
327 | encode_intoOption<()>::<_, _, ExternalCaller>(input.chars(), &mut buf) |
328 | .ok() |
329 | .map(|()| buf) |
330 | } |
331 | |
332 | /// Convert Unicode to Punycode. |
333 | /// |
334 | /// Return None on overflow, which can only happen on inputs that would take more than |
335 | /// 63 encoded bytes, the DNS limit on domain name labels. |
336 | pub fn encode(input: &[char]) -> Option<String> { |
337 | if input.len() > u32::MAX as usize { |
338 | return None; |
339 | } |
340 | let mut buf: String = String::with_capacity(input.len()); |
341 | encode_intoOption<()>::<_, _, ExternalCaller>(input.iter().copied(), &mut buf) |
342 | .ok() |
343 | .map(|()| buf) |
344 | } |
345 | |
/// Reasons why `encode_into` can fail.
pub(crate) enum PunycodeEncodeError {
    /// An arithmetic overflow was detected while encoding.
    Overflow,
    /// Writing to the output sink failed.
    Sink,
}
350 | |
351 | impl From<core::fmt::Error> for PunycodeEncodeError { |
352 | fn from(_: core::fmt::Error) -> Self { |
353 | PunycodeEncodeError::Sink |
354 | } |
355 | } |
356 | |
357 | pub(crate) fn encode_into<I, W, C>(input: I, output: &mut W) -> Result<(), PunycodeEncodeError> |
358 | where |
359 | I: Iterator<Item = char> + Clone, |
360 | W: Write + ?Sized, |
361 | C: PunycodeCaller, |
362 | { |
363 | // Handle "basic" (ASCII) code points. They are encoded as-is. |
364 | let (mut input_length, mut basic_length) = (0u32, 0); |
365 | for c in input.clone() { |
366 | input_length = input_length |
367 | .checked_add(1) |
368 | .ok_or(PunycodeEncodeError::Overflow)?; |
369 | if c.is_ascii() { |
370 | output.write_char(c)?; |
371 | basic_length += 1; |
372 | } |
373 | } |
374 | |
375 | if !C::EXTERNAL_CALLER { |
376 | // We should never get an overflow here with the internal caller being |
377 | // length-limited, but let's check anyway once here trusting the math |
378 | // from RFC 3492 section 6.4 and then omit the overflow checks in the |
379 | // loop below. |
380 | let len_plus_one = input_length |
381 | .checked_add(1) |
382 | .ok_or(PunycodeEncodeError::Overflow)?; |
383 | len_plus_one |
384 | .checked_mul(u32::from(char::MAX) - INITIAL_N) |
385 | .ok_or(PunycodeEncodeError::Overflow)?; |
386 | } |
387 | |
388 | if basic_length > 0 { |
389 | output.write_char('-' )?; |
390 | } |
391 | let mut code_point = INITIAL_N; |
392 | let mut delta = 0u32; |
393 | let mut bias = INITIAL_BIAS; |
394 | let mut processed = basic_length; |
395 | while processed < input_length { |
396 | // All code points < code_point have been handled already. |
397 | // Find the next larger one. |
398 | let min_code_point = input |
399 | .clone() |
400 | .map(|c| c as u32) |
401 | .filter(|&c| c >= code_point) |
402 | .min() |
403 | .unwrap(); |
404 | // Increase delta to advance the decoder’s <code_point,i> state to <min_code_point,0> |
405 | if C::EXTERNAL_CALLER { |
406 | let product = (min_code_point - code_point) |
407 | .checked_mul(processed + 1) |
408 | .ok_or(PunycodeEncodeError::Overflow)?; |
409 | delta = delta |
410 | .checked_add(product) |
411 | .ok_or(PunycodeEncodeError::Overflow)?; |
412 | } else { |
413 | delta += (min_code_point - code_point) * (processed + 1); |
414 | } |
415 | code_point = min_code_point; |
416 | for c in input.clone() { |
417 | let c = c as u32; |
418 | if c < code_point { |
419 | if C::EXTERNAL_CALLER { |
420 | delta = delta.checked_add(1).ok_or(PunycodeEncodeError::Overflow)?; |
421 | } else { |
422 | delta += 1; |
423 | } |
424 | } |
425 | if c == code_point { |
426 | // Represent delta as a generalized variable-length integer: |
427 | let mut q = delta; |
428 | let mut k = BASE; |
429 | loop { |
430 | let t = if k <= bias { |
431 | T_MIN |
432 | } else if k >= bias + T_MAX { |
433 | T_MAX |
434 | } else { |
435 | k - bias |
436 | }; |
437 | if q < t { |
438 | break; |
439 | } |
440 | let value = t + ((q - t) % (BASE - t)); |
441 | output.write_char(value_to_digit(value))?; |
442 | q = (q - t) / (BASE - t); |
443 | k += BASE; |
444 | } |
445 | output.write_char(value_to_digit(q))?; |
446 | bias = adapt(delta, processed + 1, processed == basic_length); |
447 | delta = 0; |
448 | processed += 1; |
449 | } |
450 | } |
451 | delta += 1; |
452 | code_point += 1; |
453 | } |
454 | Ok(()) |
455 | } |
456 | |
/// Map a digit value to its Punycode character: 0..=25 become `a`..`z` and
/// 26..=35 become `0`..`9`.
///
/// # Panics
///
/// Panics if `value` is greater than 35.
#[inline]
fn value_to_digit(value: u32) -> char {
    if value < 26 {
        // a..z
        char::from(b'a' + value as u8)
    } else if value < 36 {
        // 0..9
        char::from(b'0' + (value as u8 - 26))
    } else {
        panic!()
    }
}
465 | |
/// Feeding more than `u32::MAX` code points to `encode_into` must fail,
/// and since 'ß' is not ASCII nothing may have been written to the buffer.
#[test]
#[ignore = "slow"]
#[cfg(target_pointer_width = "64")]
fn huge_encode() {
    let mut buf = String::new();
    let oversized_input = core::iter::repeat('ß').take(u32::MAX as usize + 1);
    let outcome = encode_into::<_, _, ExternalCaller>(oversized_input, &mut buf);
    assert!(outcome.is_err());
    assert_eq!(buf.len(), 0);
}
478 | |