1 | /* |
2 | * Copyright (C) 2007 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #include "config.h" |
27 | #include "UTF8.h" |
28 | |
29 | namespace WTF { |
30 | namespace Unicode { |
31 | |
// Returns the total length (2, 3 or 4) of the UTF-8 sequence introduced by the
// non-ASCII lead byte b0, judged only by the lead byte's high-order bits.
// Returns 0 when b0 cannot start a multi-byte sequence: continuation bytes
// (10xxxxxx), ASCII bytes, and bytes with the top five bits set (0xF8 and up).
// Lead bytes that can only begin overlong or out-of-range sequences (0xC0,
// 0xC1, 0xF5-0xF7) are not rejected here; they still report 2 or 4.
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    // Work on the unsigned value so the ranges below read naturally even
    // where plain char is signed.
    const unsigned char lead = static_cast<unsigned char>(b0);

    if (lead < 0xC0)
        return 0; // ASCII or a continuation byte.
    if (lead < 0xE0)
        return 2; // 110xxxxx
    if (lead < 0xF0)
        return 3; // 1110xxxx
    if (lead < 0xF8)
        return 4; // 11110xxx
    return 0; // 11111xxx never starts a sequence.
}
44 | |
45 | inline int inlineUTF8SequenceLength(char b0) |
46 | { |
47 | return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); |
48 | } |
49 | |
50 | int UTF8SequenceLength(char b0) |
51 | { |
52 | return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); |
53 | } |
54 | |
55 | int decodeUTF8Sequence(const char* sequence) |
56 | { |
57 | // Handle 0-byte sequences (never valid). |
58 | const unsigned char b0 = sequence[0]; |
59 | const int length = inlineUTF8SequenceLength(b0); |
60 | if (length == 0) |
61 | return -1; |
62 | |
63 | // Handle 1-byte sequences (plain ASCII). |
64 | const unsigned char b1 = sequence[1]; |
65 | if (length == 1) { |
66 | if (b1) |
67 | return -1; |
68 | return b0; |
69 | } |
70 | |
71 | // Handle 2-byte sequences. |
72 | if ((b1 & 0xC0) != 0x80) |
73 | return -1; |
74 | const unsigned char b2 = sequence[2]; |
75 | if (length == 2) { |
76 | if (b2) |
77 | return -1; |
78 | const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F); |
79 | if (c < 0x80) |
80 | return -1; |
81 | return c; |
82 | } |
83 | |
84 | // Handle 3-byte sequences. |
85 | if ((b2 & 0xC0) != 0x80) |
86 | return -1; |
87 | const unsigned char b3 = sequence[3]; |
88 | if (length == 3) { |
89 | if (b3) |
90 | return -1; |
91 | const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); |
92 | if (c < 0x800) |
93 | return -1; |
94 | // UTF-16 surrogates should never appear in UTF-8 data. |
95 | if (c >= 0xD800 && c <= 0xDFFF) |
96 | return -1; |
97 | return c; |
98 | } |
99 | |
100 | // Handle 4-byte sequences. |
101 | if ((b3 & 0xC0) != 0x80) |
102 | return -1; |
103 | const unsigned char b4 = sequence[4]; |
104 | if (length == 4) { |
105 | if (b4) |
106 | return -1; |
107 | const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); |
108 | if (c < 0x10000 || c > 0x10FFFF) |
109 | return -1; |
110 | return c; |
111 | } |
112 | |
113 | return -1; |
114 | } |
115 | |
// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte, depending on how many bytes follow. The table is
// indexed by the total number of bytes in the sequence, and there are
// as many entries in this table as there are UTF-8 sequence types
// (i.e., one-byte sequence, two-byte... etc.). Remember that sequences
// for *legal* UTF-8 will be 4 or fewer bytes total.
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
122 | |
123 | ConversionResult convertUTF16ToUTF8( |
124 | const UChar** sourceStart, const UChar* sourceEnd, |
125 | char** targetStart, char* targetEnd, bool strict) |
126 | { |
127 | ConversionResult result = conversionOK; |
128 | const UChar* source = *sourceStart; |
129 | char* target = *targetStart; |
130 | while (source < sourceEnd) { |
131 | UChar32 ch; |
132 | unsigned short bytesToWrite = 0; |
133 | const UChar32 byteMask = 0xBF; |
134 | const UChar32 byteMark = 0x80; |
135 | const UChar* oldSource = source; // In case we have to back up because of target overflow. |
136 | ch = static_cast<unsigned short>(*source++); |
137 | // If we have a surrogate pair, convert to UChar32 first. |
138 | if (ch >= 0xD800 && ch <= 0xDBFF) { |
139 | // If the 16 bits following the high surrogate are in the source buffer... |
140 | if (source < sourceEnd) { |
141 | UChar32 ch2 = static_cast<unsigned short>(*source); |
142 | // If it's a low surrogate, convert to UChar32. |
143 | if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { |
144 | ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; |
145 | ++source; |
146 | } else if (strict) { // it's an unpaired high surrogate |
147 | --source; // return to the illegal value itself |
148 | result = sourceIllegal; |
149 | break; |
150 | } |
151 | } else { // We don't have the 16 bits following the high surrogate. |
152 | --source; // return to the high surrogate |
153 | result = sourceExhausted; |
154 | break; |
155 | } |
156 | } else if (strict) { |
157 | // UTF-16 surrogate values are illegal in UTF-32 |
158 | if (ch >= 0xDC00 && ch <= 0xDFFF) { |
159 | --source; // return to the illegal value itself |
160 | result = sourceIllegal; |
161 | break; |
162 | } |
163 | } |
164 | // Figure out how many bytes the result will require |
165 | if (ch < (UChar32)0x80) { |
166 | bytesToWrite = 1; |
167 | } else if (ch < (UChar32)0x800) { |
168 | bytesToWrite = 2; |
169 | } else if (ch < (UChar32)0x10000) { |
170 | bytesToWrite = 3; |
171 | } else if (ch < (UChar32)0x110000) { |
172 | bytesToWrite = 4; |
173 | } else { |
174 | bytesToWrite = 3; |
175 | ch = 0xFFFD; |
176 | } |
177 | |
178 | target += bytesToWrite; |
179 | if (target > targetEnd) { |
180 | source = oldSource; // Back up source pointer! |
181 | target -= bytesToWrite; |
182 | result = targetExhausted; |
183 | break; |
184 | } |
185 | switch (bytesToWrite) { // note: everything falls through. |
186 | case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; |
187 | case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; |
188 | case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; |
189 | case 1: *--target = (char)(ch | firstByteMark[bytesToWrite]); |
190 | } |
191 | target += bytesToWrite; |
192 | } |
193 | *sourceStart = source; |
194 | *targetStart = target; |
195 | return result; |
196 | } |
197 | |
// Reports whether the `length` bytes starting at `source` form a well-formed
// UTF-8 sequence.
//
// This must be called with the length pre-determined by the first byte; the
// function does not check that the first byte and the length agree. If
// presented with a length outside 1-4 it returns false: the Unicode
// definition of UTF-8 goes up to 4-byte sequences.
//
// Rejected are trailing bytes outside 0x80-0xBF, overlong encodings (lead
// bytes 0xC0/0xC1, 0xE0 followed by less than 0xA0, 0xF0 followed by less
// than 0x90), UTF-16 surrogates (0xED followed by more than 0x9F) and code
// points above U+10FFFF (0xF4 followed by more than 0x8F, or a lead byte
// above 0xF4).
static bool isLegalUTF8(const unsigned char* source, int length)
{
    if (length < 1 || length > 4)
        return false;

    // Third and fourth bytes, checked from the end, must be plain
    // continuation bytes.
    for (int i = length - 1; i >= 2; --i) {
        if (source[i] < 0x80 || source[i] > 0xBF)
            return false;
    }

    const unsigned char lead = source[0];

    if (length >= 2) {
        // The second byte's valid range depends on the lead byte. The lower
        // bound of 0x80 applies to every lead byte, including 0xED and 0xF4,
        // whose special cases only tighten the upper bound.
        unsigned char lowest = 0x80;
        unsigned char highest = 0xBF;
        switch (lead) {
        case 0xE0: lowest = 0xA0; break;  // Below this is an overlong 3-byte form.
        case 0xED: highest = 0x9F; break; // Above this is a UTF-16 surrogate.
        case 0xF0: lowest = 0x90; break;  // Below this is an overlong 4-byte form.
        case 0xF4: highest = 0x8F; break; // Above this exceeds U+10FFFF.
        default: break;
        }
        const unsigned char second = source[1];
        if (second < lowest || second > highest)
            return false;
    }

    // Continuation bytes and the overlong lead bytes 0xC0/0xC1 cannot start a
    // sequence; lead bytes above 0xF4 would encode values beyond U+10FFFF.
    if (lead >= 0x80 && lead < 0xC2)
        return false;
    if (lead > 0xF4)
        return false;
    return true;
}
227 | |
// Magic values subtracted from a buffer value during UTF8 conversion.
// This table contains as many values as there might be trailing bytes
// in a UTF-8 sequence, and is indexed by the number of trailing bytes.
// The decoder sums the raw bytes of a sequence, shifting left by 6 between
// bytes, so the marker bits of every byte end up in the sum; each entry is
// the accumulated value of those marker bits. For example, for one trailing
// byte: (0xC0 << 6) + 0x80 == 0x3080.
static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };
233 | |
234 | ConversionResult convertUTF8ToUTF16( |
235 | const char** sourceStart, const char* sourceEnd, |
236 | UChar** targetStart, UChar* targetEnd, bool strict) |
237 | { |
238 | ConversionResult result = conversionOK; |
239 | const char* source = *sourceStart; |
240 | UChar* target = *targetStart; |
241 | while (source < sourceEnd) { |
242 | UChar32 ch = 0; |
243 | int = UTF8SequenceLength(b0: *source) - 1; |
244 | if (source + extraBytesToRead >= sourceEnd) { |
245 | result = sourceExhausted; |
246 | break; |
247 | } |
248 | // Do this check whether lenient or strict |
249 | if (!isLegalUTF8(source: reinterpret_cast<const unsigned char*>(source), length: extraBytesToRead + 1)) { |
250 | result = sourceIllegal; |
251 | break; |
252 | } |
253 | // The cases all fall through. |
254 | switch (extraBytesToRead) { |
255 | case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8 |
256 | case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8 |
257 | case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6; |
258 | case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6; |
259 | case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6; |
260 | case 0: ch += static_cast<unsigned char>(*source++); |
261 | } |
262 | ch -= offsetsFromUTF8[extraBytesToRead]; |
263 | |
264 | if (target >= targetEnd) { |
265 | source -= (extraBytesToRead + 1); // Back up source pointer! |
266 | result = targetExhausted; break; |
267 | } |
268 | if (ch <= 0xFFFF) { |
269 | // UTF-16 surrogate values are illegal in UTF-32 |
270 | if (ch >= 0xD800 && ch <= 0xDFFF) { |
271 | if (strict) { |
272 | source -= (extraBytesToRead + 1); // return to the illegal value itself |
273 | result = sourceIllegal; |
274 | break; |
275 | } else |
276 | *target++ = 0xFFFD; |
277 | } else |
278 | *target++ = (UChar)ch; // normal case |
279 | } else if (ch > 0x10FFFF) { |
280 | if (strict) { |
281 | result = sourceIllegal; |
282 | source -= (extraBytesToRead + 1); // return to the start |
283 | break; // Bail out; shouldn't continue |
284 | } else |
285 | *target++ = 0xFFFD; |
286 | } else { |
287 | // target is a character in range 0xFFFF - 0x10FFFF |
288 | if (target + 1 >= targetEnd) { |
289 | source -= (extraBytesToRead + 1); // Back up source pointer! |
290 | result = targetExhausted; |
291 | break; |
292 | } |
293 | ch -= 0x0010000UL; |
294 | *target++ = (UChar)((ch >> 10) + 0xD800); |
295 | *target++ = (UChar)((ch & 0x03FF) + 0xDC00); |
296 | } |
297 | } |
298 | *sourceStart = source; |
299 | *targetStart = target; |
300 | return result; |
301 | } |
302 | |
303 | } |
304 | } |
305 | |