| 1 | // Copyright 2012-2016 The Rust Project Developers. See the COPYRIGHT | 
| 2 | // file at the top-level directory of this distribution and at | 
|---|
| 3 | // http://rust-lang.org/COPYRIGHT. | 
|---|
| 4 | // | 
|---|
| 5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | 
|---|
| 6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | 
|---|
| 7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | 
|---|
| 8 | // option. This file may not be copied, modified, or distributed | 
|---|
| 9 | // except according to those terms. | 
|---|
| 10 | // | 
|---|
| 11 | // Original authors: alexcrichton, bluss | 
|---|
| 12 |  | 
|---|
// UTF-8 byte tags and code point limits used when encoding characters.

/// Tag bits of a continuation byte (`10xx_xxxx`).
const TAG_CONT: u8 = 0x80;
/// Tag bits of the first byte of a two-byte sequence (`110x_xxxx`).
const TAG_TWO_B: u8 = 0xC0;
/// Tag bits of the first byte of a three-byte sequence (`1110_xxxx`).
const TAG_THREE_B: u8 = 0xE0;
/// Tag bits of the first byte of a four-byte sequence (`1111_0xxx`).
const TAG_FOUR_B: u8 = 0xF0;
/// Code points strictly below this limit are encoded in one byte.
const MAX_ONE_B: u32 = 1 << 7;
/// Code points strictly below this limit are encoded in at most two bytes.
const MAX_TWO_B: u32 = 1 << 11;
/// Code points strictly below this limit are encoded in at most three bytes.
const MAX_THREE_B: u32 = 1 << 16;
|---|
| 21 |  | 
|---|
/// Error returned by `encode_utf8` when the destination buffer is too short
/// to hold the UTF-8 encoding of the character.
///
/// The type carries no data; it only signals that encoding did not happen.
pub struct EncodeUtf8Error;
|---|
| 24 |  | 
|---|
| 25 | /// Encode a char into buf using UTF-8. | 
|---|
| 26 | /// | 
|---|
| 27 | /// On success, return the byte length of the encoding (1, 2, 3 or 4).<br> | 
|---|
| 28 | /// On error, return `EncodeUtf8Error` if the buffer was too short for the char. | 
|---|
| 29 | /// | 
|---|
| 30 | /// Safety: `ptr` must be writable for `len` bytes. | 
|---|
| 31 | #[ inline] | 
|---|
| 32 | pub unsafe fn encode_utf8(ch: char, ptr: *mut u8, len: usize) -> Result<usize, EncodeUtf8Error> | 
|---|
| 33 | { | 
|---|
| 34 | let code: u32 = ch as u32; | 
|---|
| 35 | if code < MAX_ONE_B && len >= 1 { | 
|---|
| 36 | ptr.add(0).write(val:code as u8); | 
|---|
| 37 | return Ok(1); | 
|---|
| 38 | } else if code < MAX_TWO_B && len >= 2 { | 
|---|
| 39 | ptr.add(0).write((code >> 6 & 0x1F) as u8 | TAG_TWO_B); | 
|---|
| 40 | ptr.add(1).write((code & 0x3F) as u8 | TAG_CONT); | 
|---|
| 41 | return Ok(2); | 
|---|
| 42 | } else if code < MAX_THREE_B && len >= 3 { | 
|---|
| 43 | ptr.add(0).write((code >> 12 & 0x0F) as u8 | TAG_THREE_B); | 
|---|
| 44 | ptr.add(1).write((code >>  6 & 0x3F) as u8 | TAG_CONT); | 
|---|
| 45 | ptr.add(2).write((code & 0x3F) as u8 | TAG_CONT); | 
|---|
| 46 | return Ok(3); | 
|---|
| 47 | } else if len >= 4 { | 
|---|
| 48 | ptr.add(0).write((code >> 18 & 0x07) as u8 | TAG_FOUR_B); | 
|---|
| 49 | ptr.add(1).write((code >> 12 & 0x3F) as u8 | TAG_CONT); | 
|---|
| 50 | ptr.add(2).write((code >>  6 & 0x3F) as u8 | TAG_CONT); | 
|---|
| 51 | ptr.add(3).write((code & 0x3F) as u8 | TAG_CONT); | 
|---|
| 52 | return Ok(4); | 
|---|
| 53 | }; | 
|---|
| 54 | Err(EncodeUtf8Error) | 
|---|
| 55 | } | 
|---|
| 56 |  | 
|---|
| 57 |  | 
|---|
/// Encodes every Unicode scalar value with `encode_utf8` and checks both the
/// reported length and that the written bytes decode back to the same char.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_encode_utf8() {
    let mut buf = [0u8; 16];
    // `from_u32` rejects the values in the range that are not chars.
    let scalars = (0..=(std::char::MAX as u32)).filter_map(std::char::from_u32);
    for ch in scalars {
        // Start from an all-zero buffer so bytes of a previous, longer
        // encoding cannot follow the current one.
        buf.iter_mut().for_each(|byte| *byte = 0);
        let written = unsafe { encode_utf8(ch, buf.as_mut_ptr(), buf.len()) }
            .ok()
            .unwrap();
        assert_eq!(written, ch.len_utf8());
        // The whole buffer (encoding plus zero padding) must be valid UTF-8
        // and must start with the char that was encoded.
        let decoded = std::str::from_utf8(&buf).unwrap();
        assert_eq!(decoded.chars().next(), Some(ch));
    }
}
|---|
| 77 |  | 
|---|
/// Checks that `encode_utf8` reports an error when the buffer is one byte too
/// short for the char, and succeeds when the buffer is exactly long enough.
#[test]
fn test_encode_utf8_oob() {
    let mut data = [0u8; 16];
    // One sample char per encoded length, from 1 to 4 bytes. U+FFFD is written
    // as an escape so the literal cannot be mistaken for a decoding artifact.
    let chars = ['a', 'α', '\u{FFFD}', '𐍈'];
    for (len, &ch) in (1..=4).zip(&chars) {
        assert_eq!(len, ch.len_utf8(), "Len of ch={}", ch);
        let ptr = data.as_mut_ptr();
        // SAFETY: `data` is 16 bytes long and `len` never exceeds 4, so `ptr`
        // is writable for every length passed here.
        unsafe {
            assert!(encode_utf8(ch, ptr, len - 1).is_err());
            assert!(encode_utf8(ch, ptr, len).is_ok());
        }
    }
}
|---|
| 92 |  | 
|---|
| 93 |  | 
|---|