1 | /* Pango |
2 | * pango-script.c: Script tag handling |
3 | * |
4 | * Copyright (C) 2002 Red Hat Software |
5 | * |
6 | * This library is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Library General Public |
8 | * License as published by the Free Software Foundation; either |
9 | * version 2 of the License, or (at your option) any later version. |
10 | * |
11 | * This library is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Library General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Library General Public |
17 | * License along with this library; if not, write to the |
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
19 | * Boston, MA 02111-1307, USA. |
20 | * |
21 | * Implementation of pango_script_iter is derived from ICU: |
22 | * |
23 | * icu/sources/common/usc_impl.c |
24 | * |
25 | ********************************************************************** |
26 | * Copyright (C) 1999-2002, International Business Machines |
27 | * Corporation and others. All Rights Reserved. |
28 | ********************************************************************** |
29 | * |
30 | * Permission is hereby granted, free of charge, to any person obtaining a |
31 | * copy of this software and associated documentation files (the |
32 | * "Software"), to deal in the Software without restriction, including |
33 | * without limitation the rights to use, copy, modify, merge, publish, |
34 | * distribute, and/or sell copies of the Software, and to permit persons |
35 | * to whom the Software is furnished to do so, provided that the above |
36 | * copyright notice(s) and this permission notice appear in all copies of |
37 | * the Software and that both the above copyright notice(s) and this |
38 | * permission notice appear in supporting documentation. |
39 | * |
40 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
41 | * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
42 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT |
43 | * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
44 | * HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL |
45 | * INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING |
46 | * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, |
47 | * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION |
48 | * WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
49 | * |
50 | * Except as contained in this notice, the name of a copyright holder |
51 | * shall not be used in advertising or otherwise to promote the sale, use |
52 | * or other dealings in this Software without prior written authorization |
53 | * of the copyright holder. |
54 | */ |
55 | |
56 | #include "config.h" |
57 | #include <stdlib.h> |
58 | #include <string.h> |
59 | |
60 | #include "pango-script.h" |
61 | #include "pango-script-private.h" |
62 | |
63 | /** |
64 | * pango_script_for_unichar: |
65 | * @ch: a Unicode character |
66 | * |
67 | * Looks up the script for a particular character. |
68 | * |
69 | * The script of a character is defined by |
70 | * [Unicode Standard Annex 24: Script names](http://www.unicode.org/reports/tr24/). |
71 | * |
72 | * No check is made for @ch being a valid Unicode character; if you pass |
73 | * in invalid character, the result is undefined. |
74 | * |
75 | * Note that while the return type of this function is declared |
76 | * as `PangoScript`, as of Pango 1.18, this function simply returns |
77 | * the return value of [func@GLib.unichar_get_script]. Callers must be |
78 | * prepared to handle unknown values. |
79 | * |
80 | * Return value: the `PangoScript` for the character. |
81 | * |
82 | * Since: 1.4 |
83 | * Deprecated: 1.44. Use g_unichar_get_script() |
84 | **/ |
85 | PangoScript |
86 | pango_script_for_unichar (gunichar ch) |
87 | { |
88 | return (PangoScript)g_unichar_get_script (ch); |
89 | } |
90 | |
91 | /**********************************************************************/ |
92 | |
93 | static PangoScriptIter *pango_script_iter_copy (PangoScriptIter *iter); |
94 | |
95 | G_DEFINE_BOXED_TYPE (PangoScriptIter, |
96 | pango_script_iter, |
97 | pango_script_iter_copy, |
98 | pango_script_iter_free) |
99 | |
100 | PangoScriptIter * |
101 | _pango_script_iter_init (PangoScriptIter *iter, |
102 | const char *text, |
103 | int length) |
104 | { |
105 | iter->text_start = text; |
106 | if (length >= 0) |
107 | iter->text_end = text + length; |
108 | else |
109 | iter->text_end = text + strlen (s: text); |
110 | |
111 | iter->script_start = text; |
112 | iter->script_end = text; |
113 | iter->script_code = PANGO_SCRIPT_COMMON; |
114 | |
115 | iter->paren_sp = -1; |
116 | |
117 | pango_script_iter_next (iter); |
118 | |
119 | return iter; |
120 | } |
121 | |
122 | /** |
123 | * pango_script_iter_new: |
124 | * @text: a UTF-8 string |
125 | * @length: length of @text, or -1 if @text is nul-terminated |
126 | * |
127 | * Create a new `PangoScriptIter`, used to break a string of |
128 | * Unicode text into runs by Unicode script. |
129 | * |
130 | * No copy is made of @text, so the caller needs to make |
131 | * sure it remains valid until the iterator is freed with |
132 | * [method@Pango.ScriptIter.free]. |
133 | * |
134 | * Return value: the new script iterator, initialized |
135 | * to point at the first range in the text, which should be |
136 | * freed with [method@Pango.ScriptIter.free]. If the string is |
137 | * empty, it will point at an empty range. |
138 | * |
139 | * Since: 1.4 |
140 | **/ |
141 | PangoScriptIter * |
142 | pango_script_iter_new (const char *text, |
143 | int length) |
144 | { |
145 | return _pango_script_iter_init (g_slice_new (PangoScriptIter), text, length); |
146 | } |
147 | |
148 | static PangoScriptIter * |
149 | pango_script_iter_copy (PangoScriptIter *iter) |
150 | { |
151 | return g_slice_dup (PangoScriptIter, iter); |
152 | } |
153 | |
154 | void |
155 | _pango_script_iter_fini (PangoScriptIter *iter) |
156 | { |
157 | } |
158 | |
159 | /** |
160 | * pango_script_iter_free: |
161 | * @iter: a `PangoScriptIter` |
162 | * |
163 | * Frees a `PangoScriptIter`. |
164 | * |
165 | * Since: 1.4 |
166 | */ |
167 | void |
168 | pango_script_iter_free (PangoScriptIter *iter) |
169 | { |
170 | _pango_script_iter_fini (iter); |
171 | g_slice_free (PangoScriptIter, iter); |
172 | } |
173 | |
174 | /** |
175 | * pango_script_iter_get_range: |
176 | * @iter: a `PangoScriptIter` |
177 | * @start: (out) (optional): location to store start position of the range |
178 | * @end: (out) (optional): location to store end position of the range |
179 | * @script: (out) (optional): location to store script for range |
180 | * |
181 | * Gets information about the range to which @iter currently points. |
182 | * |
183 | * The range is the set of locations p where *start <= p < *end. |
184 | * (That is, it doesn't include the character stored at *end) |
185 | * |
186 | * Note that while the type of the @script argument is declared |
187 | * as `PangoScript`, as of Pango 1.18, this function simply returns |
188 | * `GUnicodeScript` values. Callers must be prepared to handle unknown |
189 | * values. |
190 | * |
191 | * Since: 1.4 |
192 | */ |
193 | void |
194 | pango_script_iter_get_range (PangoScriptIter *iter, |
195 | const char **start, |
196 | const char **end, |
197 | PangoScript *script) |
198 | { |
199 | if (start) |
200 | *start = iter->script_start; |
201 | if (end) |
202 | *end = iter->script_end; |
203 | if (script) |
204 | *script = iter->script_code; |
205 | } |
206 | |
207 | static const gunichar paired_chars[] = { |
208 | 0x0028, 0x0029, /* ascii paired punctuation */ |
209 | 0x003c, 0x003e, |
210 | 0x005b, 0x005d, |
211 | 0x007b, 0x007d, |
212 | 0x00ab, 0x00bb, /* guillemets */ |
213 | 0x0f3a, 0x0f3b, /* tibetan */ |
214 | 0x0f3c, 0x0f3d, |
215 | 0x169b, 0x169c, /* ogham */ |
216 | 0x2018, 0x2019, /* general punctuation */ |
217 | 0x201c, 0x201d, |
218 | 0x2039, 0x203a, |
219 | 0x2045, 0x2046, |
220 | 0x207d, 0x207e, |
221 | 0x208d, 0x208e, |
222 | 0x27e6, 0x27e7, /* math */ |
223 | 0x27e8, 0x27e9, |
224 | 0x27ea, 0x27eb, |
225 | 0x27ec, 0x27ed, |
226 | 0x27ee, 0x27ef, |
227 | 0x2983, 0x2984, |
228 | 0x2985, 0x2986, |
229 | 0x2987, 0x2988, |
230 | 0x2989, 0x298a, |
231 | 0x298b, 0x298c, |
232 | 0x298d, 0x298e, |
233 | 0x298f, 0x2990, |
234 | 0x2991, 0x2992, |
235 | 0x2993, 0x2994, |
236 | 0x2995, 0x2996, |
237 | 0x2997, 0x2998, |
238 | 0x29fc, 0x29fd, |
239 | 0x2e02, 0x2e03, |
240 | 0x2e04, 0x2e05, |
241 | 0x2e09, 0x2e0a, |
242 | 0x2e0c, 0x2e0d, |
243 | 0x2e1c, 0x2e1d, |
244 | 0x2e20, 0x2e21, |
245 | 0x2e22, 0x2e23, |
246 | 0x2e24, 0x2e25, |
247 | 0x2e26, 0x2e27, |
248 | 0x2e28, 0x2e29, |
249 | 0x3008, 0x3009, /* chinese paired punctuation */ |
250 | 0x300a, 0x300b, |
251 | 0x300c, 0x300d, |
252 | 0x300e, 0x300f, |
253 | 0x3010, 0x3011, |
254 | 0x3014, 0x3015, |
255 | 0x3016, 0x3017, |
256 | 0x3018, 0x3019, |
257 | 0x301a, 0x301b, |
258 | 0xfe59, 0xfe5a, |
259 | 0xfe5b, 0xfe5c, |
260 | 0xfe5d, 0xfe5e, |
261 | 0xff08, 0xff09, |
262 | 0xff3b, 0xff3d, |
263 | 0xff5b, 0xff5d, |
264 | 0xff5f, 0xff60, |
265 | 0xff62, 0xff63 |
266 | }; |
267 | |
268 | static int |
269 | get_pair_index (gunichar ch) |
270 | { |
271 | int lower = 0; |
272 | int upper = G_N_ELEMENTS (paired_chars) - 1; |
273 | |
274 | while (lower <= upper) |
275 | { |
276 | int mid = (lower + upper) / 2; |
277 | |
278 | if (ch < paired_chars[mid]) |
279 | upper = mid - 1; |
280 | else if (ch > paired_chars[mid]) |
281 | lower = mid + 1; |
282 | else |
283 | return mid; |
284 | } |
285 | |
286 | return -1; |
287 | } |
288 | |
/* True for script values above PANGO_SCRIPT_INHERITED other than
 * PANGO_SCRIPT_UNKNOWN.
 *
 * duplicated in pango-language.c
 */
#define REAL_SCRIPT(script) \
  ((script) > PANGO_SCRIPT_INHERITED && (script) != PANGO_SCRIPT_UNKNOWN)

/* Two scripts may share a run unless both are "real" scripts and differ. */
#define SAME_SCRIPT(script1, script2) \
  (!(REAL_SCRIPT (script1) && REAL_SCRIPT (script2)) || (script1) == (script2))

/* Opening characters occupy the even slots of paired_chars. */
#define IS_OPEN(pair_index) (!((pair_index) & 1))
297 | |
298 | /** |
299 | * pango_script_iter_next: |
300 | * @iter: a `PangoScriptIter` |
301 | * |
302 | * Advances a `PangoScriptIter` to the next range. |
303 | * |
304 | * If @iter is already at the end, it is left unchanged |
305 | * and %FALSE is returned. |
306 | * |
307 | * Return value: %TRUE if @iter was successfully advanced |
308 | * |
309 | * Since: 1.4 |
310 | */ |
311 | gboolean |
312 | pango_script_iter_next (PangoScriptIter *iter) |
313 | { |
314 | int start_sp; |
315 | |
316 | if (iter->script_end == iter->text_end) |
317 | return FALSE; |
318 | |
319 | start_sp = iter->paren_sp; |
320 | iter->script_code = PANGO_SCRIPT_COMMON; |
321 | iter->script_start = iter->script_end; |
322 | |
323 | for (; iter->script_end < iter->text_end; iter->script_end = g_utf8_next_char (iter->script_end)) |
324 | { |
325 | gunichar ch = g_utf8_get_char (p: iter->script_end); |
326 | PangoScript sc; |
327 | int pair_index; |
328 | |
329 | sc = (PangoScript)g_unichar_get_script (ch); |
330 | if (sc != PANGO_SCRIPT_COMMON) |
331 | pair_index = -1; |
332 | else |
333 | pair_index = get_pair_index (ch); |
334 | |
335 | /* |
336 | * Paired character handling: |
337 | * |
338 | * if it's an open character, push it onto the stack. |
339 | * if it's a close character, find the matching open on the |
340 | * stack, and use that script code. Any non-matching open |
341 | * characters above it on the stack will be poped. |
342 | */ |
343 | if (pair_index >= 0) |
344 | { |
345 | if (IS_OPEN (pair_index)) |
346 | { |
347 | /* |
348 | * If the paren stack is full, empty it. This |
349 | * means that deeply nested paired punctuation |
350 | * characters will be ignored, but that's an unusual |
351 | * case, and it's better to ignore them than to |
352 | * write off the end of the stack... |
353 | */ |
354 | if (++iter->paren_sp >= PAREN_STACK_DEPTH) |
355 | iter->paren_sp = 0; |
356 | |
357 | iter->paren_stack[iter->paren_sp].pair_index = pair_index; |
358 | iter->paren_stack[iter->paren_sp].script_code = iter->script_code; |
359 | } |
360 | else if (iter->paren_sp >= 0) |
361 | { |
362 | int pi = pair_index & ~1; |
363 | |
364 | while (iter->paren_sp >= 0 && iter->paren_stack[iter->paren_sp].pair_index != pi) |
365 | iter->paren_sp--; |
366 | |
367 | if (iter->paren_sp < start_sp) |
368 | start_sp = iter->paren_sp; |
369 | |
370 | if (iter->paren_sp >= 0) |
371 | sc = iter->paren_stack[iter->paren_sp].script_code; |
372 | } |
373 | } |
374 | |
375 | if (SAME_SCRIPT (iter->script_code, sc)) |
376 | { |
377 | if (!REAL_SCRIPT (iter->script_code) && REAL_SCRIPT (sc)) |
378 | { |
379 | iter->script_code = sc; |
380 | |
381 | /* |
382 | * now that we have a final script code, fix any open |
383 | * characters we pushed before we knew the script code. |
384 | */ |
385 | while (start_sp < iter->paren_sp) |
386 | iter->paren_stack[++start_sp].script_code = iter->script_code; |
387 | } |
388 | |
389 | /* |
390 | * if this character is a close paired character, |
391 | * pop it from the stack |
392 | */ |
393 | if (pair_index >= 0 && !IS_OPEN (pair_index) && iter->paren_sp >= 0) |
394 | { |
395 | iter->paren_sp--; |
396 | |
397 | if (iter->paren_sp < start_sp) |
398 | start_sp = iter->paren_sp; |
399 | } |
400 | } |
401 | else |
402 | { |
403 | /* Different script, we're done */ |
404 | break; |
405 | } |
406 | } |
407 | |
408 | return TRUE; |
409 | } |
410 | |
411 | /********************************************************** |
412 | * End of code from ICU |
413 | **********************************************************/ |
414 | |