1/* Pango
2 * pango-script.c: Script tag handling
3 *
4 * Copyright (C) 2002 Red Hat Software
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 *
21 * Implementation of pango_script_iter is derived from ICU:
22 *
23 * icu/sources/common/usc_impl.c
24 *
25 **********************************************************************
26 * Copyright (C) 1999-2002, International Business Machines
27 * Corporation and others. All Rights Reserved.
28 **********************************************************************
29 *
30 * Permission is hereby granted, free of charge, to any person obtaining a
31 * copy of this software and associated documentation files (the
32 * "Software"), to deal in the Software without restriction, including
33 * without limitation the rights to use, copy, modify, merge, publish,
34 * distribute, and/or sell copies of the Software, and to permit persons
35 * to whom the Software is furnished to do so, provided that the above
36 * copyright notice(s) and this permission notice appear in all copies of
37 * the Software and that both the above copyright notice(s) and this
38 * permission notice appear in supporting documentation.
39 *
40 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
41 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
42 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
43 * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
44 * HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
45 * INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
46 * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
47 * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
48 * WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
49 *
50 * Except as contained in this notice, the name of a copyright holder
51 * shall not be used in advertising or otherwise to promote the sale, use
52 * or other dealings in this Software without prior written authorization
53 * of the copyright holder.
54 */
55
56#include "config.h"
57#include <stdlib.h>
58#include <string.h>
59
60#include "pango-script.h"
61#include "pango-script-private.h"
62
63/**
64 * pango_script_for_unichar:
65 * @ch: a Unicode character
66 *
67 * Looks up the script for a particular character.
68 *
69 * The script of a character is defined by
70 * [Unicode Standard Annex 24: Script names](http://www.unicode.org/reports/tr24/).
71 *
72 * No check is made for @ch being a valid Unicode character; if you pass
73 * in invalid character, the result is undefined.
74 *
75 * Note that while the return type of this function is declared
76 * as `PangoScript`, as of Pango 1.18, this function simply returns
77 * the return value of [func@GLib.unichar_get_script]. Callers must be
78 * prepared to handle unknown values.
79 *
80 * Return value: the `PangoScript` for the character.
81 *
82 * Since: 1.4
83 * Deprecated: 1.44. Use g_unichar_get_script()
84 **/
85PangoScript
86pango_script_for_unichar (gunichar ch)
87{
88 return (PangoScript)g_unichar_get_script (ch);
89}
90
91/**********************************************************************/
92
93static PangoScriptIter *pango_script_iter_copy (PangoScriptIter *iter);
94
95G_DEFINE_BOXED_TYPE (PangoScriptIter,
96 pango_script_iter,
97 pango_script_iter_copy,
98 pango_script_iter_free)
99
100PangoScriptIter *
101_pango_script_iter_init (PangoScriptIter *iter,
102 const char *text,
103 int length)
104{
105 iter->text_start = text;
106 if (length >= 0)
107 iter->text_end = text + length;
108 else
109 iter->text_end = text + strlen (s: text);
110
111 iter->script_start = text;
112 iter->script_end = text;
113 iter->script_code = PANGO_SCRIPT_COMMON;
114
115 iter->paren_sp = -1;
116
117 pango_script_iter_next (iter);
118
119 return iter;
120}
121
122/**
123 * pango_script_iter_new:
124 * @text: a UTF-8 string
125 * @length: length of @text, or -1 if @text is nul-terminated
126 *
127 * Create a new `PangoScriptIter`, used to break a string of
128 * Unicode text into runs by Unicode script.
129 *
130 * No copy is made of @text, so the caller needs to make
131 * sure it remains valid until the iterator is freed with
132 * [method@Pango.ScriptIter.free].
133 *
134 * Return value: the new script iterator, initialized
135 * to point at the first range in the text, which should be
136 * freed with [method@Pango.ScriptIter.free]. If the string is
137 * empty, it will point at an empty range.
138 *
139 * Since: 1.4
140 **/
141PangoScriptIter *
142pango_script_iter_new (const char *text,
143 int length)
144{
145 return _pango_script_iter_init (g_slice_new (PangoScriptIter), text, length);
146}
147
148static PangoScriptIter *
149pango_script_iter_copy (PangoScriptIter *iter)
150{
151 return g_slice_dup (PangoScriptIter, iter);
152}
153
154void
155_pango_script_iter_fini (PangoScriptIter *iter)
156{
157}
158
159/**
160 * pango_script_iter_free:
161 * @iter: a `PangoScriptIter`
162 *
163 * Frees a `PangoScriptIter`.
164 *
165 * Since: 1.4
166 */
167void
168pango_script_iter_free (PangoScriptIter *iter)
169{
170 _pango_script_iter_fini (iter);
171 g_slice_free (PangoScriptIter, iter);
172}
173
174/**
175 * pango_script_iter_get_range:
176 * @iter: a `PangoScriptIter`
177 * @start: (out) (optional): location to store start position of the range
178 * @end: (out) (optional): location to store end position of the range
179 * @script: (out) (optional): location to store script for range
180 *
181 * Gets information about the range to which @iter currently points.
182 *
183 * The range is the set of locations p where *start <= p < *end.
184 * (That is, it doesn't include the character stored at *end)
185 *
186 * Note that while the type of the @script argument is declared
187 * as `PangoScript`, as of Pango 1.18, this function simply returns
188 * `GUnicodeScript` values. Callers must be prepared to handle unknown
189 * values.
190 *
191 * Since: 1.4
192 */
193void
194pango_script_iter_get_range (PangoScriptIter *iter,
195 const char **start,
196 const char **end,
197 PangoScript *script)
198{
199 if (start)
200 *start = iter->script_start;
201 if (end)
202 *end = iter->script_end;
203 if (script)
204 *script = iter->script_code;
205}
206
207static const gunichar paired_chars[] = {
208 0x0028, 0x0029, /* ascii paired punctuation */
209 0x003c, 0x003e,
210 0x005b, 0x005d,
211 0x007b, 0x007d,
212 0x00ab, 0x00bb, /* guillemets */
213 0x0f3a, 0x0f3b, /* tibetan */
214 0x0f3c, 0x0f3d,
215 0x169b, 0x169c, /* ogham */
216 0x2018, 0x2019, /* general punctuation */
217 0x201c, 0x201d,
218 0x2039, 0x203a,
219 0x2045, 0x2046,
220 0x207d, 0x207e,
221 0x208d, 0x208e,
222 0x27e6, 0x27e7, /* math */
223 0x27e8, 0x27e9,
224 0x27ea, 0x27eb,
225 0x27ec, 0x27ed,
226 0x27ee, 0x27ef,
227 0x2983, 0x2984,
228 0x2985, 0x2986,
229 0x2987, 0x2988,
230 0x2989, 0x298a,
231 0x298b, 0x298c,
232 0x298d, 0x298e,
233 0x298f, 0x2990,
234 0x2991, 0x2992,
235 0x2993, 0x2994,
236 0x2995, 0x2996,
237 0x2997, 0x2998,
238 0x29fc, 0x29fd,
239 0x2e02, 0x2e03,
240 0x2e04, 0x2e05,
241 0x2e09, 0x2e0a,
242 0x2e0c, 0x2e0d,
243 0x2e1c, 0x2e1d,
244 0x2e20, 0x2e21,
245 0x2e22, 0x2e23,
246 0x2e24, 0x2e25,
247 0x2e26, 0x2e27,
248 0x2e28, 0x2e29,
249 0x3008, 0x3009, /* chinese paired punctuation */
250 0x300a, 0x300b,
251 0x300c, 0x300d,
252 0x300e, 0x300f,
253 0x3010, 0x3011,
254 0x3014, 0x3015,
255 0x3016, 0x3017,
256 0x3018, 0x3019,
257 0x301a, 0x301b,
258 0xfe59, 0xfe5a,
259 0xfe5b, 0xfe5c,
260 0xfe5d, 0xfe5e,
261 0xff08, 0xff09,
262 0xff3b, 0xff3d,
263 0xff5b, 0xff5d,
264 0xff5f, 0xff60,
265 0xff62, 0xff63
266};
267
268static int
269get_pair_index (gunichar ch)
270{
271 int lower = 0;
272 int upper = G_N_ELEMENTS (paired_chars) - 1;
273
274 while (lower <= upper)
275 {
276 int mid = (lower + upper) / 2;
277
278 if (ch < paired_chars[mid])
279 upper = mid - 1;
280 else if (ch > paired_chars[mid])
281 lower = mid + 1;
282 else
283 return mid;
284 }
285
286 return -1;
287}
288
/* duplicated in pango-language.c */
/* True for a script code that is neither PANGO_SCRIPT_UNKNOWN nor
 * less than or equal to PANGO_SCRIPT_INHERITED.
 */
#define REAL_SCRIPT(script) \
  ((script) != PANGO_SCRIPT_UNKNOWN && (script) > PANGO_SCRIPT_INHERITED)

/* Two script codes are compatible if they are equal, or if at least
 * one of them is not a "real" script (see REAL_SCRIPT).
 */
#define SAME_SCRIPT(script1, script2) \
  ((script1) == (script2) || !REAL_SCRIPT (script1) || !REAL_SCRIPT (script2))

/* Even indices into paired_chars are opening characters. */
#define IS_OPEN(pair_index) (!((pair_index) & 1))
297
298/**
299 * pango_script_iter_next:
300 * @iter: a `PangoScriptIter`
301 *
302 * Advances a `PangoScriptIter` to the next range.
303 *
304 * If @iter is already at the end, it is left unchanged
305 * and %FALSE is returned.
306 *
307 * Return value: %TRUE if @iter was successfully advanced
308 *
309 * Since: 1.4
310 */
311gboolean
312pango_script_iter_next (PangoScriptIter *iter)
313{
314 int start_sp;
315
316 if (iter->script_end == iter->text_end)
317 return FALSE;
318
319 start_sp = iter->paren_sp;
320 iter->script_code = PANGO_SCRIPT_COMMON;
321 iter->script_start = iter->script_end;
322
323 for (; iter->script_end < iter->text_end; iter->script_end = g_utf8_next_char (iter->script_end))
324 {
325 gunichar ch = g_utf8_get_char (p: iter->script_end);
326 PangoScript sc;
327 int pair_index;
328
329 sc = (PangoScript)g_unichar_get_script (ch);
330 if (sc != PANGO_SCRIPT_COMMON)
331 pair_index = -1;
332 else
333 pair_index = get_pair_index (ch);
334
335 /*
336 * Paired character handling:
337 *
338 * if it's an open character, push it onto the stack.
339 * if it's a close character, find the matching open on the
340 * stack, and use that script code. Any non-matching open
341 * characters above it on the stack will be poped.
342 */
343 if (pair_index >= 0)
344 {
345 if (IS_OPEN (pair_index))
346 {
347 /*
348 * If the paren stack is full, empty it. This
349 * means that deeply nested paired punctuation
350 * characters will be ignored, but that's an unusual
351 * case, and it's better to ignore them than to
352 * write off the end of the stack...
353 */
354 if (++iter->paren_sp >= PAREN_STACK_DEPTH)
355 iter->paren_sp = 0;
356
357 iter->paren_stack[iter->paren_sp].pair_index = pair_index;
358 iter->paren_stack[iter->paren_sp].script_code = iter->script_code;
359 }
360 else if (iter->paren_sp >= 0)
361 {
362 int pi = pair_index & ~1;
363
364 while (iter->paren_sp >= 0 && iter->paren_stack[iter->paren_sp].pair_index != pi)
365 iter->paren_sp--;
366
367 if (iter->paren_sp < start_sp)
368 start_sp = iter->paren_sp;
369
370 if (iter->paren_sp >= 0)
371 sc = iter->paren_stack[iter->paren_sp].script_code;
372 }
373 }
374
375 if (SAME_SCRIPT (iter->script_code, sc))
376 {
377 if (!REAL_SCRIPT (iter->script_code) && REAL_SCRIPT (sc))
378 {
379 iter->script_code = sc;
380
381 /*
382 * now that we have a final script code, fix any open
383 * characters we pushed before we knew the script code.
384 */
385 while (start_sp < iter->paren_sp)
386 iter->paren_stack[++start_sp].script_code = iter->script_code;
387 }
388
389 /*
390 * if this character is a close paired character,
391 * pop it from the stack
392 */
393 if (pair_index >= 0 && !IS_OPEN (pair_index) && iter->paren_sp >= 0)
394 {
395 iter->paren_sp--;
396
397 if (iter->paren_sp < start_sp)
398 start_sp = iter->paren_sp;
399 }
400 }
401 else
402 {
403 /* Different script, we're done */
404 break;
405 }
406 }
407
408 return TRUE;
409}
410
411/**********************************************************
412 * End of code from ICU
413 **********************************************************/
414

source code of gtk/subprojects/pango/pango/pango-script.c