UTF8.cpp source code [qtscript/src/3rdparty/javascriptcore/JavaScriptCore/wtf/unicode/UTF8.cpp]

1	/*
2	* Copyright (C) 2007 Apple Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions
6	* are met:
7	* 1. Redistributions of source code must retain the above copyright
8	* notice, this list of conditions and the following disclaimer.
9	* 2. Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	*
13	* THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24	*/
25
26	#include "config.h"
27	#include "UTF8.h"
28
29	namespace WTF {
30	namespace Unicode {
31
// Classifies a UTF-8 lead byte by its high bits.
// Returns 2, 3 or 4 for lead bytes of the form 110xxxxx, 1110xxxx and
// 11110xxx respectively. Returns 0 for every other byte: anything whose top
// two bits are not both set (including continuation bytes 10xxxxxx) and
// bytes of the form 11111xxx.
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    if ((b0 & 0xC0) != 0xC0)
        return 0;
    if ((b0 & 0xE0) == 0xC0)
        return 2;
    if ((b0 & 0xF0) == 0xE0)
        return 3;
    if ((b0 & 0xF8) == 0xF0)
        return 4;
    return 0;
}
44
45	inline int inlineUTF8SequenceLength(char b0)
46	{
47	return (b0 & `0x80`) == `0` ? `1` : inlineUTF8SequenceLengthNonASCII(b0);
48	}
49
50	int UTF8SequenceLength(char b0)
51	{
52	return (b0 & `0x80`) == `0` ? `1` : inlineUTF8SequenceLengthNonASCII(b0);
53	}
54
55	int decodeUTF8Sequence(const char* sequence)
56	{
57	// Handle 0-byte sequences (never valid).
58	const unsigned char b0 = sequence[`0`];
59	const int length = inlineUTF8SequenceLength(b0);
60	if (length == `0`)
61	return -`1`;
62
63	// Handle 1-byte sequences (plain ASCII).
64	const unsigned char b1 = sequence[`1`];
65	if (length == `1`) {
66	if (b1)
67	return -`1`;
68	return b0;
69	}
70
71	// Handle 2-byte sequences.
72	if ((b1 & `0xC0`) != `0x80`)
73	return -`1`;
74	const unsigned char b2 = sequence[`2`];
75	if (length == `2`) {
76	if (b2)
77	return -`1`;
78	const int c = ((b0 & `0x1F`) << `6`) \| (b1 & `0x3F`);
79	if (c < `0x80`)
80	return -`1`;
81	return c;
82	}
83
84	// Handle 3-byte sequences.
85	if ((b2 & `0xC0`) != `0x80`)
86	return -`1`;
87	const unsigned char b3 = sequence[`3`];
88	if (length == `3`) {
89	if (b3)
90	return -`1`;
91	const int c = ((b0 & `0xF`) << `12`) \| ((b1 & `0x3F`) << `6`) \| (b2 & `0x3F`);
92	if (c < `0x800`)
93	return -`1`;
94	// UTF-16 surrogates should never appear in UTF-8 data.
95	if (c >= `0xD800` && c <= `0xDFFF`)
96	return -`1`;
97	return c;
98	}
99
100	// Handle 4-byte sequences.
101	if ((b3 & `0xC0`) != `0x80`)
102	return -`1`;
103	const unsigned char b4 = sequence[`4`];
104	if (length == `4`) {
105	if (b4)
106	return -`1`;
107	const int c = ((b0 & `0x7`) << `18`) \| ((b1 & `0x3F`) << `12`) \| ((b2 & `0x3F`) << `6`) \| (b3 & `0x3F`);
108	if (c < `0x10000` \|\| c > `0x10FFFF`)
109	return -`1`;
110	return c;
111	}
112
113	return -`1`;
114	}
115
// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte, depending on how many bytes the sequence has in
// total. The table is indexed by that total byte count; index 0 is unused.
// Remember that sequences for *legal* UTF-8 will be 4 or fewer bytes total,
// and convertUTF16ToUTF8 only ever indexes this table with 1 through 4.
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
122
123	ConversionResult convertUTF16ToUTF8(
124	const UChar** sourceStart, const UChar* sourceEnd,
125	char** targetStart, char* targetEnd, bool strict)
126	{
127	ConversionResult result = conversionOK;
128	const UChar* source = *sourceStart;
129	char* target = *targetStart;
130	while (source < sourceEnd) {
131	UChar32 ch;
132	unsigned short bytesToWrite = `0`;
133	const UChar32 byteMask = `0xBF`;
134	const UChar32 byteMark = `0x80`;
135	const UChar* oldSource = source; // In case we have to back up because of target overflow.
136	ch = static_cast<unsigned short>(*source++);
137	// If we have a surrogate pair, convert to UChar32 first.
138	if (ch >= `0xD800` && ch <= `0xDBFF`) {
139	// If the 16 bits following the high surrogate are in the source buffer...
140	if (source < sourceEnd) {
141	UChar32 ch2 = static_cast<unsigned short>(*source);
142	// If it's a low surrogate, convert to UChar32.
143	if (ch2 >= `0xDC00` && ch2 <= `0xDFFF`) {
144	ch = ((ch - `0xD800`) << `10`) + (ch2 - `0xDC00`) + `0x0010000`;
145	++source;
146	} else if (strict) { // it's an unpaired high surrogate
147	--source; // return to the illegal value itself
148	result = sourceIllegal;
149	break;
150	}
151	} else { // We don't have the 16 bits following the high surrogate.
152	--source; // return to the high surrogate
153	result = sourceExhausted;
154	break;
155	}
156	} else if (strict) {
157	// UTF-16 surrogate values are illegal in UTF-32
158	if (ch >= `0xDC00` && ch <= `0xDFFF`) {
159	--source; // return to the illegal value itself
160	result = sourceIllegal;
161	break;
162	}
163	}
164	// Figure out how many bytes the result will require
165	if (ch < (UChar32)`0x80`) {
166	bytesToWrite = `1`;
167	} else if (ch < (UChar32)`0x800`) {
168	bytesToWrite = `2`;
169	} else if (ch < (UChar32)`0x10000`) {
170	bytesToWrite = `3`;
171	} else if (ch < (UChar32)`0x110000`) {
172	bytesToWrite = `4`;
173	} else {
174	bytesToWrite = `3`;
175	ch = `0xFFFD`;
176	}
177
178	target += bytesToWrite;
179	if (target > targetEnd) {
180	source = oldSource; // Back up source pointer!
181	target -= bytesToWrite;
182	result = targetExhausted;
183	break;
184	}
185	switch (bytesToWrite) { // note: everything falls through.
186	case `4`: --target = (char*)((ch \| byteMark) & byteMask); ch >>= `6`;
187	case `3`: --target = (char*)((ch \| byteMark) & byteMask); ch >>= `6`;
188	case `2`: --target = (char*)((ch \| byteMark) & byteMask); ch >>= `6`;
189	case `1`: --target = (char*)(ch \| firstByteMark[bytesToWrite]);
190	}
191	target += bytesToWrite;
192	}
193	*sourceStart = source;
194	*targetStart = target;
195	return result;
196	}
197
// Checks whether the `length` bytes starting at `source` form one legal
// UTF-8 sequence.
//
// This must be called with the length pre-determined by the first byte.
// If presented with a length > 4 (or < 1), this returns false. The Unicode
// definition of UTF-8 goes up to 4-byte sequences.
//
// Rejected in addition to malformed continuation bytes: lead bytes
// 0x80-0xC1 and anything above 0xF4, plus the second-byte ranges excluded
// for lead bytes 0xE0, 0xED, 0xF0 and 0xF4.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    unsigned char a;
    // Walk backwards from one past the last byte of the sequence.
    const unsigned char* srcptr = source + length;
    switch (length) {
    default: return false;
    // Everything else falls through when "true"...
    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    // fall through
    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    // fall through
    case 2: if ((a = (*--srcptr)) > 0xBF) return false;

        // `a` now holds the second byte; its lower bound depends on the
        // lead byte.
        switch (*source) {
        // no fall-through in this inner switch
        case 0xE0: if (a < 0xA0) return false; break;
        case 0xED: if (a > 0x9F) return false; break;
        case 0xF0: if (a < 0x90) return false; break;
        case 0xF4: if (a > 0x8F) return false; break;
        default: if (a < 0x80) return false;
        }

    // fall through
    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
    }
    if (*source > 0xF4)
        return false;
    return true;
}
227
228	// Magic values subtracted from a buffer value during UTF8 conversion.
229	// This table contains as many values as there might be trailing bytes
230	// in a UTF-8 sequence.
231	static const UChar32 offsetsFromUTF8[`6`] = { `0x00000000UL`, `0x00003080UL`, `0x000E2080UL`,
232	`0x03C82080UL`, static_cast<UChar32>(`0xFA082080UL`), static_cast<UChar32>(`0x82082080UL`) };
233
234	ConversionResult convertUTF8ToUTF16(
235	const char** sourceStart, const char* sourceEnd,
236	UChar** targetStart, UChar* targetEnd, bool strict)
237	{
238	ConversionResult result = conversionOK;
239	const char* source = *sourceStart;
240	UChar* target = *targetStart;
241	while (source < sourceEnd) {
242	UChar32 ch = `0`;
243	int extraBytesToRead = UTF8SequenceLength(b0: *source) - `1`;
244	if (source + extraBytesToRead >= sourceEnd) {
245	result = sourceExhausted;
246	break;
247	}
248	// Do this check whether lenient or strict
249	if (!isLegalUTF8(source: reinterpret_cast<const unsigned char*>(source), length: extraBytesToRead + `1`)) {
250	result = sourceIllegal;
251	break;
252	}
253	// The cases all fall through.
254	switch (extraBytesToRead) {
255	case `5`: ch += static_cast<unsigned char>(source++); ch <<= `6`; // remember, illegal UTF-8*
256	case `4`: ch += static_cast<unsigned char>(source++); ch <<= `6`; // remember, illegal UTF-8*
257	case `3`: ch += static_cast<unsigned char>(*source++); ch <<= `6`;
258	case `2`: ch += static_cast<unsigned char>(*source++); ch <<= `6`;
259	case `1`: ch += static_cast<unsigned char>(*source++); ch <<= `6`;
260	case `0`: ch += static_cast<unsigned char>(*source++);
261	}
262	ch -= offsetsFromUTF8[extraBytesToRead];
263
264	if (target >= targetEnd) {
265	source -= (extraBytesToRead + `1`); // Back up source pointer!
266	result = targetExhausted; break;
267	}
268	if (ch <= `0xFFFF`) {
269	// UTF-16 surrogate values are illegal in UTF-32
270	if (ch >= `0xD800` && ch <= `0xDFFF`) {
271	if (strict) {
272	source -= (extraBytesToRead + `1`); // return to the illegal value itself
273	result = sourceIllegal;
274	break;
275	} else
276	*target++ = `0xFFFD`;
277	} else
278	target++ = (UChar)ch; // normal case*
279	} else if (ch > `0x10FFFF`) {
280	if (strict) {
281	result = sourceIllegal;
282	source -= (extraBytesToRead + `1`); // return to the start
283	break; // Bail out; shouldn't continue
284	} else
285	*target++ = `0xFFFD`;
286	} else {
287	// target is a character in range 0xFFFF - 0x10FFFF
288	if (target + `1` >= targetEnd) {
289	source -= (extraBytesToRead + `1`); // Back up source pointer!
290	result = targetExhausted;
291	break;
292	}
293	ch -= `0x0010000UL`;
294	*target++ = (UChar)((ch >> `10`) + `0xD800`);
295	*target++ = (UChar)((ch & `0x03FF`) + `0xDC00`);
296	}
297	}
298	*sourceStart = source;
299	*targetStart = target;
300	return result;
301	}
302
303	}
304	}
305

source code of qtscript/src/3rdparty/javascriptcore/JavaScriptCore/wtf/unicode/UTF8.cpp