| 1 | /* | 
| 2 |  * Copyright (C) 2007 Apple Inc.  All rights reserved. | 
| 3 |  * | 
| 4 |  * Redistribution and use in source and binary forms, with or without | 
| 5 |  * modification, are permitted provided that the following conditions | 
| 6 |  * are met: | 
| 7 |  * 1. Redistributions of source code must retain the above copyright | 
| 8 |  *    notice, this list of conditions and the following disclaimer. | 
| 9 |  * 2. Redistributions in binary form must reproduce the above copyright | 
| 10 |  *    notice, this list of conditions and the following disclaimer in the | 
| 11 |  *    documentation and/or other materials provided with the distribution. | 
| 12 |  * | 
| 13 |  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY | 
| 14 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
| 15 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 
| 16 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR | 
| 17 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | 
| 18 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | 
| 19 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 
| 20 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | 
| 21 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
| 22 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
| 23 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  | 
| 24 |  */ | 
| 25 |  | 
| 26 | #ifndef WTF_UTF8_h | 
| 27 | #define WTF_UTF8_h | 
| 28 |  | 
| 29 | #include "Unicode.h" | 
| 30 |  | 
| 31 | namespace WTF { | 
| 32 |   namespace Unicode { | 
| 33 |  | 
| 34 |     // Maps the first byte of a UTF-8 sequence to the length of the sequence that byte begins. | 
| 35 |     // Returns 0 when the byte is not a legal first byte of a UTF-8 sequence. | 
| 36 |     // Sequences are limited to 4 bytes, which is enough for every Unicode character (U+0000 to U+10FFFF). | 
| 37 |     int UTF8SequenceLength(char); | 
| 38 |  | 
| 39 |     // Decodes a null-terminated C-style string holding a single UTF-8 sequence and returns the character. | 
| 40 |     // Only Unicode characters (U+0000 to U+10FFFF) are accepted. | 
| 41 |     // Returns -1 when the sequence is not valid; extra bytes after the sequence also make it invalid. | 
| 42 |     int decodeUTF8Sequence(const char*); | 
| 43 |  | 
| 44 |     typedef enum { | 
| 45 |             conversionOK,       // the conversion completed successfully | 
| 46 |             sourceExhausted,    // the source ended in the middle of a character | 
| 47 |             targetExhausted,    // not enough room in the target for the conversion | 
| 48 |             sourceIllegal       // the source sequence is illegal or malformed | 
| 49 |     } ConversionResult; | 
| 50 |  | 
| 51 |     // Both conversion functions below take a "strict" argument (default: true). | 
| 52 |     // When strict is true, irregular sequences and isolated surrogates are | 
| 53 |     // reported as errors. When strict is false (lenient), irregular sequences | 
| 54 |     // and isolated surrogates are converted instead. | 
| 55 |     //  | 
| 56 |     // Whether the flag is strict or lenient, every illegal sequence causes an | 
| 57 |     // error return. Illegal input includes sequences such as <F4 90 80 80>, | 
| 58 |     // <C0 80>, or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. | 
| 59 |     // Conformant code must check for illegal sequences. | 
| 60 |     //  | 
| 61 |     // In lenient mode, characters over 0x10FFFF are converted to the replacement character; in strict mode they are an error. | 
| 62 |     // NOTE(review): that seems to conflict with the previous paragraph, which lists values above 0x10FFFF as always illegal - confirm against the implementation. | 
| 63 |     // NOTE(review): sourceStart and targetStart are pointers to pointers, presumably so the callee can report how far it got - confirm against the implementation. | 
| 64 |  | 
| 65 |     ConversionResult convertUTF8ToUTF16( | 
| 66 |                     const char** sourceStart, const char* sourceEnd,  | 
| 67 |                     UChar** targetStart, UChar* targetEnd, bool strict = true); | 
| 68 |  | 
| 69 |     ConversionResult convertUTF16ToUTF8( | 
| 70 |                     const UChar** sourceStart, const UChar* sourceEnd,  | 
| 71 |                     char** targetStart, char* targetEnd, bool strict = true); | 
| 72 |   } | 
| 73 | } | 
| 74 |  | 
| 75 | #endif // WTF_UTF8_h | 
| 76 |  |