// punycode.rs source code [crates/idna/src/punycode.rs]

// Copyright 2013 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
8
//! Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation.
//!
//! Since Punycode fundamentally works on Unicode code points,
//! `encode` takes a slice of `char` and `decode` returns a vector of `char`.
//! `encode_str` and `decode_to_string` provide convenience wrappers
//! that convert from and to Rust’s UTF-8 based `str` and `String` types.
15
16	use alloc::{string::String, vec::Vec};
17	use core::char;
18	use core::fmt::Write;
19	use core::marker::PhantomData;
20
// Bootstring parameters for Punycode (values fixed by RFC 3492, section 5).

/// Number of distinct digit values (`a`..=`z` and `0`..=`9`).
const BASE: u32 = 36;
/// Lower clamp for the per-digit threshold `t`.
const T_MIN: u32 = 1;
/// Upper clamp for the per-digit threshold `t`.
const T_MAX: u32 = 26;
/// Skew term used by `adapt` when computing the next bias.
const SKEW: u32 = 38;
/// Divisor applied to the very first delta in `adapt`.
const DAMP: u32 = 700;
/// Bias in effect before the first delta has been processed.
const INITIAL_BIAS: u32 = 72;
/// First non-basic code point (everything below `0x80` is ASCII).
const INITIAL_N: u32 = 0x80;
29
30	#[inline]
31	fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {
32	delta /= if first_time { DAMP } else { `2` };
33	delta += delta / num_points;
34	let mut k: u32 = `0`;
35	while delta > ((BASE - T_MIN) * T_MAX) / `2` {
36	delta /= BASE - T_MIN;
37	k += BASE;
38	}
39	k + (((BASE - T_MIN + `1`) * delta) / (delta + SKEW))
40	}
41
42	/// Convert Punycode to an Unicode `String`.
43	///
44	/// Return None on malformed input or overflow.
45	/// Overflow can only happen on inputs that take more than
46	/// 63 encoded bytes, the DNS limit on domain name labels.
47	#[inline]
48	pub fn decode_to_string(input: &str) -> Option<String> {
49	Some(
50	DecoderDecode<'_, u8, ExternalCaller>::default()
51	.decode::<u8, ExternalCaller>(input.as_bytes())
52	.ok()?
53	.collect(),
54	)
55	}
56
57	/// Convert Punycode to Unicode.
58	///
59	/// Return None on malformed input or overflow.
60	/// Overflow can only happen on inputs that take more than
61	/// 63 encoded bytes, the DNS limit on domain name labels.
62	pub fn decode(input: &str) -> Option<Vec<char>> {
63	Some(
64	DecoderDecode<'_, u8, ExternalCaller>::default()
65	.decode::<u8, ExternalCaller>(input.as_bytes())
66	.ok()?
67	.collect(),
68	)
69	}
70
/// Marker for internal vs. external caller to retain old API behavior
/// while tweaking behavior for internal callers.
///
/// External callers need overflow checks when encoding, but internal
/// callers don't, because `PUNYCODE_ENCODE_MAX_INPUT_LENGTH` is set
/// to 1000, and per RFC 3492 section 6.4, the integer variable does
/// not need to be able to represent values larger than
/// (char::MAX - INITIAL_N) * (PUNYCODE_ENCODE_MAX_INPUT_LENGTH + 1),
/// which is less than u32::MAX.
///
/// External callers need to handle upper-case ASCII when decoding,
/// but internal callers don't, because the internal code calls the
/// decoder only with lower-case inputs.
pub(crate) trait PunycodeCaller {
    /// `true` when the call comes in through the public API.
    const EXTERNAL_CALLER: bool;
}
87
88	pub(crate) struct InternalCaller;
89
90	impl PunycodeCaller for InternalCaller {
91	const EXTERNAL_CALLER: bool = `false`;
92	}
93
94	struct ExternalCaller;
95
96	impl PunycodeCaller for ExternalCaller {
97	const EXTERNAL_CALLER: bool = `true`;
98	}
99
/// A single unit of Punycode input that the decoder can consume.
///
/// Implemented in this file for `u8` and for `char`.
pub(crate) trait PunycodeCodeUnit {
    /// Whether this unit is the `-` that separates basic code points from
    /// the encoded deltas.
    fn is_delimiter(&self) -> bool;
    /// Whether this unit is in the ASCII range.
    fn is_ascii(&self) -> bool;
    /// The Punycode digit value of this unit, or `None` if it is not a digit.
    fn digit(&self) -> Option<u32>;
    /// This unit as a `char`, unchanged.
    fn char(&self) -> char;
    /// This unit as a `char` for callers that expect lower-case output.
    fn char_ascii_lower_case(&self) -> char;
}
107
108	impl PunycodeCodeUnit for u8 {
109	fn is_delimiter(&self) -> bool {
110	*self == b'-'
111	}
112	fn is_ascii(&self) -> bool {
113	*self < `0x80`
114	}
115	fn digit(&self) -> Option<u32> {
116	let byte: u8 = *self;
117	Some(match byte {
118	byte: u8 @ b'0'..=b'9' => byte - b'0' + `26`,
119	byte: u8 @ b'A'..=b'Z' => byte - b'A',
120	byte: u8 @ b'a'..=b'z' => byte - b'a',
121	_ => return None,
122	} as u32)
123	}
124	fn char(&self) -> char {
125	char::from(*self)
126	}
127	fn char_ascii_lower_case(&self) -> char {
128	char::from(self.to_ascii_lowercase())
129	}
130	}
131
132	impl PunycodeCodeUnit for char {
133	fn is_delimiter(&self) -> bool {
134	*self == '-'
135	}
136	fn is_ascii(&self) -> bool {
137	debug_assert!(`false`); // Unused
138	`true`
139	}
140	fn digit(&self) -> Option<u32> {
141	let byte = *self;
142	Some(match byte {
143	byte @ '0'..='9' => u32::from(byte) - u32::from('0') + `26`,
144	// byte @ 'A'..='Z' => u32::from(byte) - u32::from('A'), // XXX not needed if no public input
145	byte @ 'a'..='z' => u32::from(byte) - u32::from('a'),
146	_ => return None,
147	})
148	}
149	fn char(&self) -> char {
150	debug_assert!(`false`); // Unused
151	*self
152	}
153	fn char_ascii_lower_case(&self) -> char {
154	// No need to actually lower-case!
155	*self
156	}
157	}
158
159	#[derive(Default)]
160	pub(crate) struct Decoder {
161	insertions: smallvec::SmallVec<[(usize, char); `59`]>,
162	}
163
164	impl Decoder {
165	/// Split the input iterator and return a Vec with insertions of encoded characters
166	pub(crate) fn decode<'a, T: PunycodeCodeUnit + Copy, C: PunycodeCaller>(
167	&'a mut self,
168	input: &'a [T],
169	) -> Result<Decode<'a, T, C>, ()> {
170	self.insertions.clear();
171	// Handle "basic" (ASCII) code points.
172	// They are encoded as-is before the last delimiter, if any.
173	let (base, input) = if let Some(position) = input.iter().rposition(\|c\| c.is_delimiter()) {
174	(
175	&input[..position],
176	if position > `0` {
177	&input[position + `1`..]
178	} else {
179	input
180	},
181	)
182	} else {
183	(&input[..`0`], input)
184	};
185
186	if C::EXTERNAL_CALLER && !base.iter().all(\|c\| c.is_ascii()) {
187	return Err(());
188	}
189
190	let base_len = base.len();
191	let mut length = base_len as u32;
192	let mut code_point = INITIAL_N;
193	let mut bias = INITIAL_BIAS;
194	let mut i = `0u32`;
195	let mut iter = input.iter();
196	loop {
197	let previous_i = i;
198	let mut weight = `1`;
199	let mut k = BASE;
200	let mut byte = match iter.next() {
201	None => break,
202	Some(byte) => byte,
203	};
204
205	// Decode a generalized variable-length integer into delta,
206	// which gets added to i.
207	loop {
208	let digit = if let Some(digit) = byte.digit() {
209	digit
210	} else {
211	return Err(());
212	};
213	let product = digit.checked_mul(weight).ok_or(())?;
214	i = i.checked_add(product).ok_or(())?;
215	let t = if k <= bias {
216	T_MIN
217	} else if k >= bias + T_MAX {
218	T_MAX
219	} else {
220	k - bias
221	};
222	if digit < t {
223	break;
224	}
225	weight = weight.checked_mul(BASE - t).ok_or(())?;
226	k += BASE;
227	byte = match iter.next() {
228	None => return Err(()), // End of input before the end of this delta
229	Some(byte) => byte,
230	};
231	}
232
233	bias = adapt(i - previous_i, length + `1`, previous_i == `0`);
234
235	// i was supposed to wrap around from length+1 to 0,
236	// incrementing code_point each time.
237	code_point = code_point.checked_add(i / (length + `1`)).ok_or(())?;
238	i %= length + `1`;
239	let c = match char::from_u32(code_point) {
240	Some(c) => c,
241	None => return Err(()),
242	};
243
244	// Move earlier insertions farther out in the string
245	for (idx, _) in &mut self.insertions {
246	if idx >= i as usize* {
247	*idx += `1`;
248	}
249	}
250	self.insertions.push((i as usize, c));
251	length += `1`;
252	i += `1`;
253	}
254
255	self.insertions.sort_by_key(\|(i, _)\| *i);
256	Ok(Decode {
257	base: base.iter(),
258	insertions: &self.insertions,
259	inserted: `0`,
260	position: `0`,
261	len: base_len + self.insertions.len(),
262	phantom: PhantomData::<C>,
263	})
264	}
265	}
266
267	pub(crate) struct Decode<'a, T, C>
268	where
269	T: PunycodeCodeUnit + Copy,
270	C: PunycodeCaller,
271	{
272	base: core::slice::Iter<'a, T>,
273	pub(crate) insertions: &'a [(usize, char)],
274	inserted: usize,
275	position: usize,
276	len: usize,
277	phantom: PhantomData<C>,
278	}
279
280	impl<'a, T: PunycodeCodeUnit + Copy, C: PunycodeCaller> Iterator for Decode<'a, T, C> {
281	type Item = char;
282
283	fn next(&mut self) -> Option<Self::Item> {
284	loop {
285	match self.insertions.get(self.inserted) {
286	Some((pos, c)) if *pos == self.position => {
287	self.inserted += `1`;
288	self.position += `1`;
289	return Some(*c);
290	}
291	_ => {}
292	}
293	if let Some(c) = self.base.next() {
294	self.position += `1`;
295	return Some(if C::EXTERNAL_CALLER {
296	c.char()
297	} else {
298	c.char_ascii_lower_case()
299	});
300	} else if self.inserted >= self.insertions.len() {
301	return None;
302	}
303	}
304	}
305
306	fn size_hint(&self) -> (usize, Option<usize>) {
307	let len = self.len - self.position;
308	(len, Some(len))
309	}
310	}
311
312	impl<'a, T: PunycodeCodeUnit + Copy, C: PunycodeCaller> ExactSizeIterator for Decode<'a, T, C> {
313	fn len(&self) -> usize {
314	self.len - self.position
315	}
316	}
317
318	/// Convert an Unicode `str` to Punycode.
319	///
320	/// This is a convenience wrapper around `encode`.
321	#[inline]
322	pub fn encode_str(input: &str) -> Option<String> {
323	if input.len() > u32::MAX as usize {
324	return None;
325	}
326	let mut buf: String = String::with_capacity(input.len());
327	encode_intoOption<()>::<_, _, ExternalCaller>(input.chars(), &mut buf)
328	.ok()
329	.map(\|()\| buf)
330	}
331
332	/// Convert Unicode to Punycode.
333	///
334	/// Return None on overflow, which can only happen on inputs that would take more than
335	/// 63 encoded bytes, the DNS limit on domain name labels.
336	pub fn encode(input: &[char]) -> Option<String> {
337	if input.len() > u32::MAX as usize {
338	return None;
339	}
340	let mut buf: String = String::with_capacity(input.len());
341	encode_intoOption<()>::<_, _, ExternalCaller>(input.iter().copied(), &mut buf)
342	.ok()
343	.map(\|()\| buf)
344	}
345
/// Reasons why `encode_into` can fail.
pub(crate) enum PunycodeEncodeError {
    /// An intermediate value did not fit in `u32`.
    Overflow,
    /// Writing to the output sink failed.
    Sink,
}

/// Lets `?` turn a failed `write_char` into `PunycodeEncodeError::Sink`.
impl From<core::fmt::Error> for PunycodeEncodeError {
    fn from(_: core::fmt::Error) -> Self {
        PunycodeEncodeError::Sink
    }
}
356
357	pub(crate) fn encode_into<I, W, C>(input: I, output: &mut W) -> Result<(), PunycodeEncodeError>
358	where
359	I: Iterator<Item = char> + Clone,
360	W: Write + ?Sized,
361	C: PunycodeCaller,
362	{
363	// Handle "basic" (ASCII) code points. They are encoded as-is.
364	let (mut input_length, mut basic_length) = (`0u32`, `0`);
365	for c in input.clone() {
366	input_length = input_length
367	.checked_add(`1`)
368	.ok_or(PunycodeEncodeError::Overflow)?;
369	if c.is_ascii() {
370	output.write_char(c)?;
371	basic_length += `1`;
372	}
373	}
374
375	if !C::EXTERNAL_CALLER {
376	// We should never get an overflow here with the internal caller being
377	// length-limited, but let's check anyway once here trusting the math
378	// from RFC 3492 section 6.4 and then omit the overflow checks in the
379	// loop below.
380	let len_plus_one = input_length
381	.checked_add(`1`)
382	.ok_or(PunycodeEncodeError::Overflow)?;
383	len_plus_one
384	.checked_mul(u32::from(char::MAX) - INITIAL_N)
385	.ok_or(PunycodeEncodeError::Overflow)?;
386	}
387
388	if basic_length > `0` {
389	output.write_char('-')?;
390	}
391	let mut code_point = INITIAL_N;
392	let mut delta = `0u32`;
393	let mut bias = INITIAL_BIAS;
394	let mut processed = basic_length;
395	while processed < input_length {
396	// All code points < code_point have been handled already.
397	// Find the next larger one.
398	let min_code_point = input
399	.clone()
400	.map(\|c\| c as u32)
401	.filter(\|&c\| c >= code_point)
402	.min()
403	.unwrap();
404	// Increase delta to advance the decoder’s <code_point,i> state to <min_code_point,0>
405	if C::EXTERNAL_CALLER {
406	let product = (min_code_point - code_point)
407	.checked_mul(processed + `1`)
408	.ok_or(PunycodeEncodeError::Overflow)?;
409	delta = delta
410	.checked_add(product)
411	.ok_or(PunycodeEncodeError::Overflow)?;
412	} else {
413	delta += (min_code_point - code_point) * (processed + `1`);
414	}
415	code_point = min_code_point;
416	for c in input.clone() {
417	let c = c as u32;
418	if c < code_point {
419	if C::EXTERNAL_CALLER {
420	delta = delta.checked_add(`1`).ok_or(PunycodeEncodeError::Overflow)?;
421	} else {
422	delta += `1`;
423	}
424	}
425	if c == code_point {
426	// Represent delta as a generalized variable-length integer:
427	let mut q = delta;
428	let mut k = BASE;
429	loop {
430	let t = if k <= bias {
431	T_MIN
432	} else if k >= bias + T_MAX {
433	T_MAX
434	} else {
435	k - bias
436	};
437	if q < t {
438	break;
439	}
440	let value = t + ((q - t) % (BASE - t));
441	output.write_char(value_to_digit(value))?;
442	q = (q - t) / (BASE - t);
443	k += BASE;
444	}
445	output.write_char(value_to_digit(q))?;
446	bias = adapt(delta, processed + `1`, processed == basic_length);
447	delta = `0`;
448	processed += `1`;
449	}
450	}
451	delta += `1`;
452	code_point += `1`;
453	}
454	Ok(())
455	}
456
/// Map a Punycode digit value to its ASCII character.
///
/// `0..=25` become `a..=z` and `26..=35` become `0..=9`.
///
/// # Panics
///
/// Panics if `value` is 36 or larger.
#[inline]
fn value_to_digit(value: u32) -> char {
    match value {
        0..=25 => (value as u8 + b'a') as char,      // a..z
        26..=35 => (value as u8 - 26 + b'0') as char, // 0..9
        _ => panic!("invalid Punycode digit value: {}", value),
    }
}
465
/// Feeding more than `u32::MAX` code points must fail, and because the input
/// contains no ASCII, nothing may have been written to the buffer.
#[test]
#[ignore = "slow"]
#[cfg(target_pointer_width = "64")]
fn huge_encode() {
    let mut buf = String::new();
    assert!(encode_into::<_, _, ExternalCaller>(
        core::iter::repeat('ß').take(u32::MAX as usize + 1),
        &mut buf
    )
    .is_err());
    assert_eq!(buf.len(), 0);
}
478