| 1 | #![deny ( |
| 2 | missing_copy_implementations, |
| 3 | missing_debug_implementations, |
| 4 | missing_docs, |
| 5 | trivial_casts, |
| 6 | trivial_numeric_casts, |
| 7 | unsafe_code, |
| 8 | unused_import_braces, |
| 9 | unused_qualifications, |
| 10 | )] |
| 11 | |
| 12 | #![cfg_attr (feature = "dev" , feature(plugin))] |
| 13 | #![cfg_attr (feature = "dev" , plugin(clippy))] |
| 14 | #![cfg_attr (feature = "dev" , deny(clippy))] |
| 15 | |
| 16 | //! Functions to decode and encode [RFC-3492 Punycode](https://tools.ietf.org/html/rfc3492). |
| 17 | |
// Bootstring parameters for Punycode.
// See [RFC-3492, section 5](https://tools.ietf.org/html/rfc3492#section-5).

/// Number of distinct digits (`a`-`z` and `0`-`9`).
const BASE: u32 = 36;
/// Lower bound of the per-position digit threshold.
const TMIN: u32 = 1;
/// Upper bound of the per-position digit threshold.
const TMAX: u32 = 26;
/// Skew term used by the bias adaptation function.
const SKEW: u32 = 38;
/// Damping divisor applied to the very first delta during bias adaptation.
const DAMP: u32 = 700;
/// Bias used before any delta has been processed.
const INITIAL_BIAS: u32 = 72;
/// First non-basic code point (everything below is plain ASCII).
const INITIAL_N: u32 = 128;
/// Separator between the literal ASCII part and the encoded deltas.
const DELIMITER: char = '-';
| 27 | |
| 28 | /// Decode the string as Punycode. The string should not contain the initial `xn--` and must |
| 29 | /// contain only ASCII characters. |
| 30 | /// # Example |
| 31 | /// ``` |
| 32 | /// assert_eq!( |
| 33 | /// punycode::decode("acadmie-franaise-npb1a" ).unwrap(), |
| 34 | /// "académie-française" |
| 35 | /// ); |
| 36 | /// ``` |
| 37 | pub fn decode(input: &str) -> Result<String, ()> { |
| 38 | if !input.is_ascii() { |
| 39 | return Err(()); |
| 40 | } |
| 41 | |
| 42 | let mut n = INITIAL_N; |
| 43 | let mut i = 0; |
| 44 | let mut bias = INITIAL_BIAS; |
| 45 | |
| 46 | let (mut output, input) = if let Some(i) = input.rfind(DELIMITER) { |
| 47 | (input[0..i].chars().collect(), &input[i+1..]) |
| 48 | } |
| 49 | else { |
| 50 | (vec![], &input[..]) |
| 51 | }; |
| 52 | |
| 53 | let mut it = input.chars().peekable(); |
| 54 | while it.peek() != None { |
| 55 | let oldi = i; |
| 56 | let mut w = 1; |
| 57 | |
| 58 | for k in 1.. { |
| 59 | let c = if let Some(c) = it.next() { |
| 60 | c |
| 61 | } |
| 62 | else { |
| 63 | return Err(()); |
| 64 | }; |
| 65 | |
| 66 | let k = k*BASE; |
| 67 | |
| 68 | let digit = decode_digit(c); |
| 69 | |
| 70 | if digit == BASE { |
| 71 | return Err(()); |
| 72 | } |
| 73 | |
| 74 | // overflow check |
| 75 | if digit > (std::u32::MAX - i) / w { |
| 76 | return Err(()); |
| 77 | } |
| 78 | i += digit * w; |
| 79 | |
| 80 | let t = clamped_sub(TMIN, k, bias, TMAX); |
| 81 | if digit < t { |
| 82 | break; |
| 83 | } |
| 84 | |
| 85 | // overflow check |
| 86 | if BASE > (std::u32::MAX - t) / w { |
| 87 | return Err(()); |
| 88 | } |
| 89 | w *= BASE - t; |
| 90 | } |
| 91 | |
| 92 | let len = (output.len() + 1) as u32; |
| 93 | bias = adapt(i - oldi, len, oldi == 0); |
| 94 | |
| 95 | let il = i / len; |
| 96 | // overflow check |
| 97 | if n > std::u32::MAX - il { |
| 98 | return Err(()); |
| 99 | } |
| 100 | n += il; |
| 101 | i %= len; |
| 102 | |
| 103 | if let Some(c) = std::char::from_u32(n) { |
| 104 | output.insert(i as usize, c); |
| 105 | } |
| 106 | else { |
| 107 | return Err(()); |
| 108 | } |
| 109 | |
| 110 | i += 1; |
| 111 | } |
| 112 | |
| 113 | Ok(output.iter().cloned().collect()) |
| 114 | } |
| 115 | |
| 116 | /// Encode a string as punycode. The result string will contain only ASCII characters. The result |
| 117 | /// string does not start with `xn--`. |
| 118 | /// # Example |
| 119 | /// ``` |
| 120 | /// assert_eq!( |
| 121 | /// punycode::encode("académie-française" ).unwrap(), |
| 122 | /// "acadmie-franaise-npb1a" |
| 123 | /// ); |
| 124 | /// ``` |
| 125 | pub fn encode(input: &str) -> Result<String, ()> { |
| 126 | encode_slice(&input.chars().collect::<Vec<char>>()) |
| 127 | } |
| 128 | |
| 129 | fn encode_slice(input: &[char]) -> Result<String, ()> { |
| 130 | let mut n = INITIAL_N; |
| 131 | let mut delta = 0; |
| 132 | let mut bias = INITIAL_BIAS; |
| 133 | |
| 134 | let mut output : String = input.iter().filter(|&&c| c.is_ascii()).cloned().collect(); |
| 135 | let mut h = output.len() as u32; |
| 136 | let b = h; |
| 137 | |
| 138 | if b > 0 { |
| 139 | output.push(DELIMITER) |
| 140 | } |
| 141 | |
| 142 | while h < input.len() as u32 { |
| 143 | let m = *input.iter().filter(|&&c| (c as u32) >= n).min().unwrap() as u32; |
| 144 | |
| 145 | if m - n > (std::u32::MAX - delta) / (h + 1) { |
| 146 | return Err(()); |
| 147 | } |
| 148 | delta += (m - n) * (h + 1); |
| 149 | |
| 150 | n = m; |
| 151 | |
| 152 | for c in input { |
| 153 | let c = *c as u32; |
| 154 | if c < n { |
| 155 | delta += 1; |
| 156 | } |
| 157 | else if c == n { |
| 158 | let mut q = delta; |
| 159 | |
| 160 | for k in 1.. { |
| 161 | let k = k*BASE; |
| 162 | |
| 163 | let t = clamped_sub(TMIN, k, bias, TMAX); |
| 164 | |
| 165 | if q < t { |
| 166 | break; |
| 167 | } |
| 168 | |
| 169 | output.push(encode_digit(t + (q - t) % (BASE - t))); |
| 170 | |
| 171 | q = (q - t) / (BASE - t); |
| 172 | } |
| 173 | |
| 174 | output.push(encode_digit(q)); |
| 175 | |
| 176 | bias = adapt(delta, h+1, h == b); |
| 177 | delta = 0; |
| 178 | h += 1; |
| 179 | } |
| 180 | } |
| 181 | |
| 182 | delta += 1; |
| 183 | n += 1; |
| 184 | } |
| 185 | |
| 186 | Ok(output) |
| 187 | } |
| 188 | |
| 189 | fn adapt(delta: u32, numpoint: u32, firsttime: bool) -> u32 { |
| 190 | let mut delta: u32 = if firsttime { |
| 191 | delta / DAMP |
| 192 | } |
| 193 | else { |
| 194 | delta / 2 |
| 195 | }; |
| 196 | |
| 197 | delta += delta / numpoint; |
| 198 | let mut k: u32 = 0; |
| 199 | |
| 200 | while delta > (BASE - TMIN) * TMAX / 2 { |
| 201 | delta /= BASE - TMIN; |
| 202 | k += BASE |
| 203 | } |
| 204 | |
| 205 | k + (BASE - TMIN + 1) * delta / (delta + SKEW) |
| 206 | } |
| 207 | |
/// Compute `lhs - rhs`, clamped to the range `min..=max`.
///
/// The comparisons are made on `min + rhs` and `max + rhs`, so the subtraction itself is only
/// evaluated when its result lies strictly between `min` and `max`.
fn clamped_sub<T>(min: T, lhs: T, rhs: T, max: T) -> T
where
    T: Ord + std::ops::Add<Output = T> + std::ops::Sub<Output = T> + Copy,
{
    let floor = min + rhs;
    if lhs <= floor {
        return min;
    }

    let ceiling = max + rhs;
    if lhs >= ceiling {
        return max;
    }

    lhs - rhs
}
| 219 | |
| 220 | fn decode_digit(c: char) -> u32 { |
| 221 | let cp: u32 = c as u32; |
| 222 | |
| 223 | match c { |
| 224 | '0' ... '9' => cp - ('0' as u32) + 26, |
| 225 | 'A' ... 'Z' => cp - ('A' as u32), |
| 226 | 'a' ... 'z' => cp - ('a' as u32), |
| 227 | _ => BASE, |
| 228 | } |
| 229 | } |
| 230 | |
/// Map a digit value to its Punycode character: 0 to 25 become `a`-`z`, 26 to 35 become
/// `0`-`9`.
///
/// # Panics
/// Panics if the resulting character is neither a lowercase ASCII letter nor an ASCII decimal
/// digit, i.e. when `d` is not a valid digit value.
fn encode_digit(d: u32) -> char {
    // 97 is `'a'`; 22 is `'0'` (48) minus the 26 letters that come first.
    let offset = if d < 26 { 97 } else { 22 };
    let r = char::from((d + offset) as u8);

    assert!(r.is_ascii_digit() || r.is_ascii_lowercase(), "r = {}", r);

    r
}
| 238 | |
/// Test vectors shared by the encoding and decoding tests, as `(unicode, punycode)` pairs.
///
/// The Unicode side of the RFC vectors must start directly with the first code point of the
/// sample: any extra leading character would be a basic code point and change the encoding.
#[cfg(test)]
static TESTS: &'static [(&'static str, &'static str)] = &[
    // examples taken from [RFC-3492, section 7.1](https://tools.ietf.org/html/rfc3492#section-7.1)
    ("\u{0644}\u{064A}\u{0647}\u{0645}\u{0627}\u{0628}\u{062A}\u{0643}\u{0644}\
      \u{0645}\u{0648}\u{0634}\u{0639}\u{0631}\u{0628}\u{064A}\u{061F}",
     "egbpdaj6bu4bxfgehfvwxn"),

    ("\u{4ED6}\u{4EEC}\u{4E3A}\u{4EC0}\u{4E48}\u{4E0D}\u{8BF4}\u{4E2D}\u{6587}",
     "ihqwcrb4cv8a8dqg056pqjye"),

    ("\u{4ED6}\u{5011}\u{7232}\u{4EC0}\u{9EBD}\u{4E0D}\u{8AAA}\u{4E2D}\u{6587}",
     "ihqwctvzc91f659drss3x8bo0yb"),

    ("\u{0050}\u{0072}\u{006F}\u{010D}\u{0070}\u{0072}\u{006F}\u{0073}\u{0074}\
      \u{011B}\u{006E}\u{0065}\u{006D}\u{006C}\u{0075}\u{0076}\u{00ED}\u{010D}\
      \u{0065}\u{0073}\u{006B}\u{0079}",
     "Proprostnemluvesky-uyb24dma41a"),

    ("\u{05DC}\u{05DE}\u{05D4}\u{05D4}\u{05DD}\u{05E4}\u{05E9}\u{05D5}\u{05D8}\
      \u{05DC}\u{05D0}\u{05DE}\u{05D3}\u{05D1}\u{05E8}\u{05D9}\u{05DD}\u{05E2}\
      \u{05D1}\u{05E8}\u{05D9}\u{05EA}",
     "4dbcagdahymbxekheh6e0a7fei0b"),

    ("\u{092F}\u{0939}\u{0932}\u{094B}\u{0917}\u{0939}\u{093F}\u{0928}\u{094D}\
      \u{0926}\u{0940}\u{0915}\u{094D}\u{092F}\u{094B}\u{0902}\u{0928}\u{0939}\
      \u{0940}\u{0902}\u{092C}\u{094B}\u{0932}\u{0938}\u{0915}\u{0924}\u{0947}\
      \u{0939}\u{0948}\u{0902}",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    ("\u{306A}\u{305C}\u{307F}\u{3093}\u{306A}\u{65E5}\u{672C}\u{8A9E}\u{3092}\
      \u{8A71}\u{3057}\u{3066}\u{304F}\u{308C}\u{306A}\u{3044}\u{306E}\u{304B}",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    ("\u{C138}\u{ACC4}\u{C758}\u{BAA8}\u{B4E0}\u{C0AC}\u{B78C}\u{B4E4}\u{C774}\
      \u{D55C}\u{AD6D}\u{C5B4}\u{B97C}\u{C774}\u{D574}\u{D55C}\u{B2E4}\u{BA74}\
      \u{C5BC}\u{B9C8}\u{B098}\u{C88B}\u{C744}\u{AE4C}",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c"),

    ("\u{043F}\u{043E}\u{0447}\u{0435}\u{043C}\u{0443}\u{0436}\u{0435}\u{043E}\
      \u{043D}\u{0438}\u{043D}\u{0435}\u{0433}\u{043E}\u{0432}\u{043E}\u{0440}\
      \u{044F}\u{0442}\u{043F}\u{043E}\u{0440}\u{0443}\u{0441}\u{0441}\u{043A}\
      \u{0438}",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    ("\u{0050}\u{006F}\u{0072}\u{0071}\u{0075}\u{00E9}\u{006E}\u{006F}\u{0070}\
      \u{0075}\u{0065}\u{0064}\u{0065}\u{006E}\u{0073}\u{0069}\u{006D}\u{0070}\
      \u{006C}\u{0065}\u{006D}\u{0065}\u{006E}\u{0074}\u{0065}\u{0068}\u{0061}\
      \u{0062}\u{006C}\u{0061}\u{0072}\u{0065}\u{006E}\u{0045}\u{0073}\u{0070}\
      \u{0061}\u{00F1}\u{006F}\u{006C}",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    ("\u{0054}\u{1EA1}\u{0069}\u{0073}\u{0061}\u{006F}\u{0068}\u{1ECD}\u{006B}\
      \u{0068}\u{00F4}\u{006E}\u{0067}\u{0074}\u{0068}\u{1EC3}\u{0063}\u{0068}\
      \u{1EC9}\u{006E}\u{00F3}\u{0069}\u{0074}\u{0069}\u{1EBF}\u{006E}\u{0067}\
      \u{0056}\u{0069}\u{1EC7}\u{0074}",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    ("\u{0033}\u{5E74}\u{0042}\u{7D44}\u{91D1}\u{516B}\u{5148}\u{751F}",
     "3B-ww4c5e180e575a65lsy2b"),

    ("\u{5B89}\u{5BA4}\u{5948}\u{7F8E}\u{6075}\u{002D}\u{0077}\u{0069}\u{0074}\
      \u{0068}\u{002D}\u{0053}\u{0055}\u{0050}\u{0045}\u{0052}\u{002D}\u{004D}\
      \u{004F}\u{004E}\u{004B}\u{0045}\u{0059}\u{0053}",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    ("\u{0048}\u{0065}\u{006C}\u{006C}\u{006F}\u{002D}\u{0041}\u{006E}\u{006F}\
      \u{0074}\u{0068}\u{0065}\u{0072}\u{002D}\u{0057}\u{0061}\u{0079}\u{002D}\
      \u{305D}\u{308C}\u{305E}\u{308C}\u{306E}\u{5834}\u{6240}",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    ("\u{3072}\u{3068}\u{3064}\u{5C4B}\u{6839}\u{306E}\u{4E0B}\u{0032}",
     "2-u9tlzr9756bt3uc0v"),

    ("\u{004D}\u{0061}\u{006A}\u{0069}\u{3067}\u{004B}\u{006F}\u{0069}\u{3059}\
      \u{308B}\u{0035}\u{79D2}\u{524D}",
     "MajiKoi5-783gue6qz075azm5e"),

    ("\u{30D1}\u{30D5}\u{30A3}\u{30FC}\u{0064}\u{0065}\u{30EB}\u{30F3}\u{30D0}",
     "de-jg4avhby1noc0d"),

    ("\u{305D}\u{306E}\u{30B9}\u{30D4}\u{30FC}\u{30C9}\u{3067}",
     "d9juau41awczczp"),

    ("\u{002D}\u{003E}\u{0020}\u{0024}\u{0031}\u{002E}\u{0030}\u{0030}\u{0020}\
      \u{003C}\u{002D}",
     "-> $1.00 <--"),

    // some real-life examples
    ("académie-française", "acadmie-franaise-npb1a"),
    ("bücher", "bcher-kva"),
    ("république-numérique", "rpublique-numrique-bwbm"),

    // some real-life TLD
    ("бг", "90ae"),
    ("рф", "p1ai"),
    ("укр", "j1amh"),
    ("السعودية", "mgberp4a5d4ar"),
    ("امارات", "mgbaam7a8h"),
    ("مصر", "wgbh1c"),
    ("中国", "fiqs8s"),
    ("中國", "fiqz9s"),
    ("台湾", "kprw13d"),
    ("台灣", "kpry57d"),
    ("香港", "j6w193g"),

    // other
    ("", ""),
    ("a", "a-"),
    ("0", "0-"),
    ("A", "A-"),
    ("é", "9ca"),
    (" \n", " \n-"),
];
| 352 | |
/// Every test vector must decode from its Punycode form back to the Unicode form.
#[test]
fn test_decode() {
    for &(unicode, punycode) in TESTS {
        assert_eq!(decode(punycode), Ok(unicode.to_string()));
    }
}
| 359 | |
/// Every test vector must encode to its Punycode form; the comparison ignores case because
/// some RFC samples use uppercase digits.
#[test]
fn test_encode() {
    for &(unicode, punycode) in TESTS {
        let encoded = encode(unicode).unwrap();
        assert_eq!(encoded.to_lowercase(), punycode.to_lowercase());
    }
}
| 366 | |
/// Malformed inputs (invalid digits, non-ASCII characters, overflowing values) must be
/// rejected by the decoder.
#[test]
fn test_fail_decode() {
    let malformed = ["bcher-kva.ch", "+", " \\", "é", "99999999"];

    for input in &malformed {
        assert_eq!(decode(input), Err(()));
    }
}
| 375 | |