| 1 | /* | 
| 2 |  * Copyright (C) 2007 Apple Inc.  All rights reserved. | 
| 3 |  * | 
| 4 |  * Redistribution and use in source and binary forms, with or without | 
| 5 |  * modification, are permitted provided that the following conditions | 
| 6 |  * are met: | 
| 7 |  * 1. Redistributions of source code must retain the above copyright | 
| 8 |  *    notice, this list of conditions and the following disclaimer. | 
| 9 |  * 2. Redistributions in binary form must reproduce the above copyright | 
| 10 |  *    notice, this list of conditions and the following disclaimer in the | 
| 11 |  *    documentation and/or other materials provided with the distribution. | 
| 12 |  * | 
| 13 |  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY | 
| 14 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
| 15 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 
| 16 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR | 
| 17 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | 
| 18 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | 
| 19 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 
| 20 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | 
| 21 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
| 22 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
| 23 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  | 
| 24 |  */ | 
| 25 |  | 
| 26 | #include "config.h" | 
| 27 | #include "UTF8.h" | 
| 28 |  | 
| 29 | namespace WTF { | 
| 30 | namespace Unicode { | 
| 31 |  | 
// Classifies a UTF-8 lead byte by its value and returns the total length of
// the sequence it introduces:
//   0xC0..0xDF -> 2, 0xE0..0xEF -> 3, 0xF0..0xF7 -> 4.
// Every other byte (anything below 0xC0, and 0xF8..0xFF) yields 0, meaning
// "this byte cannot start a multi-byte sequence".
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    // Work on the unsigned byte value so the result does not depend on
    // whether plain char is signed.
    const int lead = b0 & 0xFF;

    if (lead < 0xC0)
        return 0;
    if (lead < 0xE0)
        return 2;
    if (lead < 0xF0)
        return 3;
    if (lead < 0xF8)
        return 4;
    return 0;
}
| 44 |  | 
| 45 | inline int inlineUTF8SequenceLength(char b0) | 
| 46 | { | 
| 47 |     return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); | 
| 48 | } | 
| 49 |  | 
| 50 | int UTF8SequenceLength(char b0) | 
| 51 | { | 
| 52 |     return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); | 
| 53 | } | 
| 54 |  | 
| 55 | int decodeUTF8Sequence(const char* sequence) | 
| 56 | { | 
| 57 |     // Handle 0-byte sequences (never valid). | 
| 58 |     const unsigned char b0 = sequence[0]; | 
| 59 |     const int length = inlineUTF8SequenceLength(b0); | 
| 60 |     if (length == 0) | 
| 61 |         return -1; | 
| 62 |  | 
| 63 |     // Handle 1-byte sequences (plain ASCII). | 
| 64 |     const unsigned char b1 = sequence[1]; | 
| 65 |     if (length == 1) { | 
| 66 |         if (b1) | 
| 67 |             return -1; | 
| 68 |         return b0; | 
| 69 |     } | 
| 70 |  | 
| 71 |     // Handle 2-byte sequences. | 
| 72 |     if ((b1 & 0xC0) != 0x80) | 
| 73 |         return -1; | 
| 74 |     const unsigned char b2 = sequence[2]; | 
| 75 |     if (length == 2) { | 
| 76 |         if (b2) | 
| 77 |             return -1; | 
| 78 |         const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F); | 
| 79 |         if (c < 0x80) | 
| 80 |             return -1; | 
| 81 |         return c; | 
| 82 |     } | 
| 83 |  | 
| 84 |     // Handle 3-byte sequences. | 
| 85 |     if ((b2 & 0xC0) != 0x80) | 
| 86 |         return -1; | 
| 87 |     const unsigned char b3 = sequence[3]; | 
| 88 |     if (length == 3) { | 
| 89 |         if (b3) | 
| 90 |             return -1; | 
| 91 |         const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); | 
| 92 |         if (c < 0x800) | 
| 93 |             return -1; | 
| 94 |         // UTF-16 surrogates should never appear in UTF-8 data. | 
| 95 |         if (c >= 0xD800 && c <= 0xDFFF) | 
| 96 |             return -1; | 
| 97 |         return c; | 
| 98 |     } | 
| 99 |  | 
| 100 |     // Handle 4-byte sequences. | 
| 101 |     if ((b3 & 0xC0) != 0x80) | 
| 102 |         return -1; | 
| 103 |     const unsigned char b4 = sequence[4]; | 
| 104 |     if (length == 4) { | 
| 105 |         if (b4) | 
| 106 |             return -1; | 
| 107 |         const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); | 
| 108 |         if (c < 0x10000 || c > 0x10FFFF) | 
| 109 |             return -1; | 
| 110 |         return c; | 
| 111 |     } | 
| 112 |  | 
| 113 |     return -1; | 
| 114 | } | 
| 115 |  | 
// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte. The table is indexed by the total number of bytes
// in the sequence (convertUTF16ToUTF8 indexes it with bytesToWrite), so
// index 0 is a placeholder and index 1 adds no marker bits. There are
// as many entries in this table as there are UTF-8 sequence types
// (i.e., one-byte sequence, two-byte, etc.). Remember that sequences
// for *legal* UTF-8 will be 4 or fewer bytes total, so entries 5 and 6
// do not correspond to legal UTF-8.
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
| 122 |  | 
| 123 | ConversionResult convertUTF16ToUTF8( | 
| 124 |     const UChar** sourceStart, const UChar* sourceEnd,  | 
| 125 |     char** targetStart, char* targetEnd, bool strict) | 
| 126 | { | 
| 127 |     ConversionResult result = conversionOK; | 
| 128 |     const UChar* source = *sourceStart; | 
| 129 |     char* target = *targetStart; | 
| 130 |     while (source < sourceEnd) { | 
| 131 |         UChar32 ch; | 
| 132 |         unsigned short bytesToWrite = 0; | 
| 133 |         const UChar32 byteMask = 0xBF; | 
| 134 |         const UChar32 byteMark = 0x80;  | 
| 135 |         const UChar* oldSource = source; // In case we have to back up because of target overflow. | 
| 136 |         ch = static_cast<unsigned short>(*source++); | 
| 137 |         // If we have a surrogate pair, convert to UChar32 first. | 
| 138 |         if (ch >= 0xD800 && ch <= 0xDBFF) { | 
| 139 |             // If the 16 bits following the high surrogate are in the source buffer... | 
| 140 |             if (source < sourceEnd) { | 
| 141 |                 UChar32 ch2 = static_cast<unsigned short>(*source); | 
| 142 |                 // If it's a low surrogate, convert to UChar32. | 
| 143 |                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { | 
| 144 |                     ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; | 
| 145 |                     ++source; | 
| 146 |                 } else if (strict) { // it's an unpaired high surrogate | 
| 147 |                     --source; // return to the illegal value itself | 
| 148 |                     result = sourceIllegal; | 
| 149 |                     break; | 
| 150 |                 } | 
| 151 |             } else { // We don't have the 16 bits following the high surrogate. | 
| 152 |                 --source; // return to the high surrogate | 
| 153 |                 result = sourceExhausted; | 
| 154 |                 break; | 
| 155 |             } | 
| 156 |         } else if (strict) { | 
| 157 |             // UTF-16 surrogate values are illegal in UTF-32 | 
| 158 |             if (ch >= 0xDC00 && ch <= 0xDFFF) { | 
| 159 |                 --source; // return to the illegal value itself | 
| 160 |                 result = sourceIllegal; | 
| 161 |                 break; | 
| 162 |             } | 
| 163 |         } | 
| 164 |         // Figure out how many bytes the result will require | 
| 165 |         if (ch < (UChar32)0x80) { | 
| 166 |             bytesToWrite = 1; | 
| 167 |         } else if (ch < (UChar32)0x800) { | 
| 168 |             bytesToWrite = 2; | 
| 169 |         } else if (ch < (UChar32)0x10000) { | 
| 170 |             bytesToWrite = 3; | 
| 171 |         } else if (ch < (UChar32)0x110000) { | 
| 172 |             bytesToWrite = 4; | 
| 173 |         } else { | 
| 174 |             bytesToWrite = 3; | 
| 175 |             ch = 0xFFFD; | 
| 176 |         } | 
| 177 |  | 
| 178 |         target += bytesToWrite; | 
| 179 |         if (target > targetEnd) { | 
| 180 |             source = oldSource; // Back up source pointer! | 
| 181 |             target -= bytesToWrite; | 
| 182 |             result = targetExhausted; | 
| 183 |             break; | 
| 184 |         } | 
| 185 |         switch (bytesToWrite) { // note: everything falls through. | 
| 186 |             case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; | 
| 187 |             case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; | 
| 188 |             case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; | 
| 189 |             case 1: *--target =  (char)(ch | firstByteMark[bytesToWrite]); | 
| 190 |         } | 
| 191 |         target += bytesToWrite; | 
| 192 |     } | 
| 193 |     *sourceStart = source; | 
| 194 |     *targetStart = target; | 
| 195 |     return result; | 
| 196 | } | 
| 197 |  | 
// Checks whether the `length` bytes starting at `source` form a well-formed
// UTF-8 sequence.
//
// This must be called with the length pre-determined by the first byte; the
// function does not verify that the lead byte and `length` agree. Lengths
// outside 1..4 return false without reading any byte: the Unicode definition
// of UTF-8 goes up to 4-byte sequences.
//
// Rules enforced:
//   - every trailing byte is in 0x80..0xBF;
//   - the second byte is narrowed further after the lead bytes 0xE0 and 0xF0
//     (overlong forms), 0xED (surrogates) and 0xF4 (above U+10FFFF);
//   - the lead byte is not in 0x80..0xC1 and not above 0xF4.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    if (length < 1 || length > 4)
        return false;

    const unsigned char lead = source[0];

    // Third and fourth bytes, when present, are plain continuation bytes.
    for (int i = length - 1; i >= 2; --i) {
        if (source[i] < 0x80 || source[i] > 0xBF)
            return false;
    }

    if (length >= 2) {
        const unsigned char second = source[1];

        // The second byte must be a continuation byte whatever the lead byte
        // is. Checking the lower bound here matters for 0xED and 0xF4, whose
        // own rule below only restricts the upper end of the range.
        if (second < 0x80 || second > 0xBF)
            return false;

        switch (lead) {
        case 0xE0:
            if (second < 0xA0)
                return false;
            break;
        case 0xED:
            if (second > 0x9F)
                return false;
            break;
        case 0xF0:
            if (second < 0x90)
                return false;
            break;
        case 0xF4:
            if (second > 0x8F)
                return false;
            break;
        default:
            break;
        }
    }

    // 0x80..0xBF are continuation bytes; 0xC0 and 0xC1 only start overlong
    // 2-byte forms.
    if (lead >= 0x80 && lead < 0xC2)
        return false;
    if (lead > 0xF4)
        return false;
    return true;
}
| 227 |  | 
// Magic values subtracted from a buffer value during UTF-8 conversion.
// convertUTF8ToUTF16 accumulates the raw bytes of a sequence (marker bits
// included), shifting by 6 between bytes, and then subtracts the entry
// selected by the number of trailing bytes to strip those marker bits.
// For example, entry 1 is (0xC0 << 6) + 0x80 == 0x3080 and entry 2 is
// (0xE0 << 12) + (0x80 << 6) + 0x80 == 0xE2080.
// This table contains as many values as there might be trailing bytes
// in a UTF-8 sequence.
static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 
            0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };
| 233 |  | 
| 234 | ConversionResult convertUTF8ToUTF16( | 
| 235 |     const char** sourceStart, const char* sourceEnd,  | 
| 236 |     UChar** targetStart, UChar* targetEnd, bool strict) | 
| 237 | { | 
| 238 |     ConversionResult result = conversionOK; | 
| 239 |     const char* source = *sourceStart; | 
| 240 |     UChar* target = *targetStart; | 
| 241 |     while (source < sourceEnd) { | 
| 242 |         UChar32 ch = 0; | 
| 243 |         int  = UTF8SequenceLength(b0: *source) - 1; | 
| 244 |         if (source + extraBytesToRead >= sourceEnd) { | 
| 245 |             result = sourceExhausted; | 
| 246 |             break; | 
| 247 |         } | 
| 248 |         // Do this check whether lenient or strict | 
| 249 |         if (!isLegalUTF8(source: reinterpret_cast<const unsigned char*>(source), length: extraBytesToRead + 1)) { | 
| 250 |             result = sourceIllegal; | 
| 251 |             break; | 
| 252 |         } | 
| 253 |         // The cases all fall through. | 
| 254 |         switch (extraBytesToRead) { | 
| 255 |             case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8 | 
| 256 |             case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8 | 
| 257 |             case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6; | 
| 258 |             case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6; | 
| 259 |             case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6; | 
| 260 |             case 0: ch += static_cast<unsigned char>(*source++); | 
| 261 |         } | 
| 262 |         ch -= offsetsFromUTF8[extraBytesToRead]; | 
| 263 |  | 
| 264 |         if (target >= targetEnd) { | 
| 265 |             source -= (extraBytesToRead + 1); // Back up source pointer! | 
| 266 |             result = targetExhausted; break; | 
| 267 |         } | 
| 268 |         if (ch <= 0xFFFF) { | 
| 269 |             // UTF-16 surrogate values are illegal in UTF-32 | 
| 270 |             if (ch >= 0xD800 && ch <= 0xDFFF) { | 
| 271 |                 if (strict) { | 
| 272 |                     source -= (extraBytesToRead + 1); // return to the illegal value itself | 
| 273 |                     result = sourceIllegal; | 
| 274 |                     break; | 
| 275 |                 } else | 
| 276 |                     *target++ = 0xFFFD; | 
| 277 |             } else | 
| 278 |                 *target++ = (UChar)ch; // normal case | 
| 279 |         } else if (ch > 0x10FFFF) { | 
| 280 |             if (strict) { | 
| 281 |                 result = sourceIllegal; | 
| 282 |                 source -= (extraBytesToRead + 1); // return to the start | 
| 283 |                 break; // Bail out; shouldn't continue | 
| 284 |             } else | 
| 285 |                 *target++ = 0xFFFD; | 
| 286 |         } else { | 
| 287 |             // target is a character in range 0xFFFF - 0x10FFFF | 
| 288 |             if (target + 1 >= targetEnd) { | 
| 289 |                 source -= (extraBytesToRead + 1); // Back up source pointer! | 
| 290 |                 result = targetExhausted; | 
| 291 |                 break; | 
| 292 |             } | 
| 293 |             ch -= 0x0010000UL; | 
| 294 |             *target++ = (UChar)((ch >> 10) + 0xD800); | 
| 295 |             *target++ = (UChar)((ch & 0x03FF) + 0xDC00); | 
| 296 |         } | 
| 297 |     } | 
| 298 |     *sourceStart = source; | 
| 299 |     *targetStart = target; | 
| 300 |     return result; | 
| 301 | } | 
| 302 |  | 
| 303 | } | 
| 304 | } | 
| 305 |  |