1/* Pango
2 * break.c:
3 *
4 * Copyright (C) 1999 Red Hat Software
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#include "config.h"
23
24#include "pango-break.h"
25#include "pango-script-private.h"
26#include "pango-emoji-private.h"
27#include "pango-attributes-private.h"
28#include "pango-break-table.h"
29#include "pango-impl-utils.h"
30#include <string.h>
31
32/* {{{ Unicode line breaking and segmentation */
33
34#define PARAGRAPH_SEPARATOR 0x2029
35
36/* See http://www.unicode.org/unicode/reports/tr14/ if you hope
37 * to understand the line breaking code.
38 */
39
/* Kind of break opportunity between two adjacent characters. */
typedef enum
{
  /* didn't use the table */
  BREAK_ALREADY_HANDLED = 0,

  /* no break, even if spaces intervene */
  BREAK_PROHIBITED = 1,

  /* "indirect break": only if there are spaces */
  BREAK_IF_SPACES = 2,

  /* "direct break": can always break here */
  BREAK_ALLOWED = 3

  /* TR 14 defines two further break-opportunity classes,
   * "indirect break opportunity for combining marks following a space"
   * and "prohibited break for combining marks".
   * They have no enumerator here; they are handled inline in the code.
   */
} BreakOpportunity;
52
/* Clamp a break type to the range this file knows about: any value above
 * G_UNICODE_BREAK_ZERO_WIDTH_JOINER is mapped to G_UNICODE_BREAK_UNKNOWN,
 * everything else is passed through unchanged.
 *
 * The upper bound needs to be kept in sync with glib/gunicode.h.
 * Note: btype is evaluated more than once.
 */
#define BREAK_TYPE_SAFE(btype)                    \
  ((btype) > G_UNICODE_BREAK_ZERO_WIDTH_JOINER    \
     ? G_UNICODE_BREAK_UNKNOWN                    \
     : (btype))
56
57
58/*
59 * Hangul Conjoining Jamo handling.
60 *
61 * The way we implement it is just a bit different from TR14,
62 * but produces the same results.
63 * The same algorithm is also used in TR29 for cluster boundaries.
64 *
65 */
66
67
/* States of the Hangul syllable system.
 *
 * The first five enumerators follow the order of the corresponding
 * G_UNICODE_BREAK_HANGUL_* values, because JAMO_TYPE() obtains a JamoType
 * by subtracting G_UNICODE_BREAK_HANGUL_L_JAMO from the break type.
 */
typedef enum
{
  JAMO_L   = 0, /* G_UNICODE_BREAK_HANGUL_L_JAMO */
  JAMO_V   = 1, /* G_UNICODE_BREAK_HANGUL_V_JAMO */
  JAMO_T   = 2, /* G_UNICODE_BREAK_HANGUL_T_JAMO */
  JAMO_LV  = 3, /* G_UNICODE_BREAK_HANGUL_LV_SYLLABLE */
  JAMO_LVT = 4, /* G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE */
  NO_JAMO  = 5  /* Other */
} JamoType;

/* Some Hangul syllables are encoded as single characters that behave like
 * a sequence of Jamos. Each character is therefore described by the
 * JamoType it begins with and the JamoType it finishes with, which reduces
 * JAMO_LV and JAMO_LVT to the simple Jamo types. For instance, a character
 * whose LineBreak type is G_UNICODE_BREAK_HANGUL_LV_SYLLABLE has
 * start=JAMO_L and end=JAMO_V.
 */
typedef struct _CharJamoProps
{
  JamoType start;
  JamoType end;
} CharJamoProps;

/* Lookup table, indexed by JamoType, giving the CharJamoProps of that type.
 * Entries contain only simple JamoTypes (never LV or LVT), or NO_JAMO.
 */
static const CharJamoProps HangulJamoProps[] = {
  [JAMO_L]   = { JAMO_L,  JAMO_L  },
  [JAMO_V]   = { JAMO_V,  JAMO_V  },
  [JAMO_T]   = { JAMO_T,  JAMO_T  },
  [JAMO_LV]  = { JAMO_L,  JAMO_V  },
  [JAMO_LVT] = { JAMO_L,  JAMO_T  },
  [NO_JAMO]  = { NO_JAMO, NO_JAMO }
};
103
104/* A character forms a syllable with the previous character if and only if:
105 * JamoType(this) is not NO_JAMO and:
106 *
107 * HangulJamoProps[JamoType(prev)].end and
108 * HangulJamoProps[JamoType(this)].start are equal,
109 * or the former is one less than the latter.
110 */
111
/* IS_JAMO(btype): non-zero when btype lies in the inclusive range
 * G_UNICODE_BREAK_HANGUL_L_JAMO .. G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE.
 *
 * JAMO_TYPE(btype): for a break type accepted by IS_JAMO, its offset from
 * G_UNICODE_BREAK_HANGUL_L_JAMO (usable as a JamoType); otherwise NO_JAMO.
 *
 * The argument is parenthesized at every use so that compound expressions
 * (e.g. a conditional expression) expand correctly. It is still evaluated
 * more than once, so it must not have side effects.
 */
#define IS_JAMO(btype) \
  (((btype) >= G_UNICODE_BREAK_HANGUL_L_JAMO) && \
   ((btype) <= G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
#define JAMO_TYPE(btype) \
  (IS_JAMO (btype) ? ((btype) - G_UNICODE_BREAK_HANGUL_L_JAMO) : NO_JAMO)
117
/* Code point range tests. Each macro yields non-zero when wc falls inside
 * the inclusive range(s) listed. wc is evaluated more than once.
 */

/* Types of Japanese characters */
#define JAPANESE(wc) (0x2F00 <= (wc) && (wc) <= 0x30FF)
#define KANJI(wc)    (0x2F00 <= (wc) && (wc) <= 0x2FDF)
#define HIRAGANA(wc) (0x3040 <= (wc) && (wc) <= 0x309F)
#define KATAKANA(wc) (0x30A0 <= (wc) && (wc) <= 0x30FF)

/* Ranges consulted by BACKSPACE_DELETES_CHARACTER() */
#define LATIN(wc)    ((0x0020 <= (wc) && (wc) <= 0x02AF) || \
                      (0x1E00 <= (wc) && (wc) <= 0x1EFF))
#define CYRILLIC(wc) (0x0400 <= (wc) && (wc) <= 0x052F)
#define GREEK(wc)    ((0x0370 <= (wc) && (wc) <= 0x03FF) || \
                      (0x1F00 <= (wc) && (wc) <= 0x1FFF))
#define KANA(wc)     (0x3040 <= (wc) && (wc) <= 0x30FF)
#define HANGUL(wc)   (0xAC00 <= (wc) && (wc) <= 0xD7A3)
#define EMOJI(wc)    (_pango_Is_Emoji_Base_Character (wc))

/* True when wc matches none of the classes above. */
#define BACKSPACE_DELETES_CHARACTER(wc) \
  (!(LATIN (wc) || CYRILLIC (wc) || GREEK (wc) || \
     KANA (wc) || HANGUL (wc) || EMOJI (wc)))
131
/* Kind of word currently being scanned.
 *
 * Previously "123foo" was treated as two words. Following UAX 29 of
 * Unicode, we now don't break words between consecutive letters and
 * numbers.
 */
typedef enum
{
  WordNone    = 0,
  WordLetters = 1,
  WordNumbers = 2
} WordType;
141
142static void
143default_break (const char *text,
144 int length,
145 PangoAnalysis *analysis G_GNUC_UNUSED,
146 PangoLogAttr *attrs,
147 int attrs_len G_GNUC_UNUSED)
148{
149 /* The rationale for all this is in section 5.15 of the Unicode 3.0 book,
150 * the line breaking stuff is also in TR14 on unicode.org
151 */
152
153 /* This is a default break implementation that should work for nearly all
154 * languages. Language engines can override it optionally.
155 */
156
157 /* FIXME one cheesy optimization here would be to memset attrs to 0
158 * before we start, and then never assign %FALSE to anything
159 */
160
161 const gchar *next;
162 gint i;
163
164 gunichar prev_wc;
165 gunichar next_wc;
166
167 JamoType prev_jamo;
168
169 GUnicodeBreakType next_break_type;
170 GUnicodeBreakType prev_break_type;
171 GUnicodeBreakType prev_prev_break_type;
172
173 PangoScript prev_script;
174
175 /* See Grapheme_Cluster_Break Property Values table of UAX#29 */
176 typedef enum
177 {
178 GB_Other,
179 GB_ControlCRLF,
180 GB_Extend,
181 GB_ZWJ,
182 GB_Prepend,
183 GB_SpacingMark,
184 GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */
185 /* Use state machine to handle emoji sequence */
186 /* Rule GB12 and GB13 */
187 GB_RI_Odd, /* Meets odd number of RI */
188 GB_RI_Even, /* Meets even number of RI */
189 } GraphemeBreakType;
190 GraphemeBreakType prev_GB_type = GB_Other;
191 gboolean met_Extended_Pictographic = FALSE;
192
193 /* See Word_Break Property Values table of UAX#29 */
194 typedef enum
195 {
196 WB_Other,
197 WB_NewlineCRLF,
198 WB_ExtendFormat,
199 WB_Katakana,
200 WB_Hebrew_Letter,
201 WB_ALetter,
202 WB_MidNumLet,
203 WB_MidLetter,
204 WB_MidNum,
205 WB_Numeric,
206 WB_ExtendNumLet,
207 WB_RI_Odd,
208 WB_RI_Even,
209 WB_WSegSpace,
210 } WordBreakType;
211 WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
212 gint prev_WB_i = -1;
213
214 /* See Sentence_Break Property Values table of UAX#29 */
215 typedef enum
216 {
217 SB_Other,
218 SB_ExtendFormat,
219 SB_ParaSep,
220 SB_Sp,
221 SB_Lower,
222 SB_Upper,
223 SB_OLetter,
224 SB_Numeric,
225 SB_ATerm,
226 SB_SContinue,
227 SB_STerm,
228 SB_Close,
229 /* Rules SB8 and SB8a */
230 SB_ATerm_Close_Sp,
231 SB_STerm_Close_Sp,
232 } SentenceBreakType;
233 SentenceBreakType prev_prev_SB_type = SB_Other, prev_SB_type = SB_Other;
234 gint prev_SB_i = -1;
235
236 /* Rule LB25 with Example 7 of Customization */
237 typedef enum
238 {
239 LB_Other,
240 LB_Numeric,
241 LB_Numeric_Close,
242 LB_RI_Odd,
243 LB_RI_Even,
244 } LineBreakType;
245 LineBreakType prev_LB_type = LB_Other;
246
247 WordType current_word_type = WordNone;
248 gunichar last_word_letter = 0;
249 gunichar base_character = 0;
250
251 gint last_sentence_start = -1;
252 gint last_non_space = -1;
253
254 gboolean prev_space_or_hyphen;
255
256 gboolean almost_done = FALSE;
257 gboolean done = FALSE;
258
259 g_return_if_fail (length == 0 || text != NULL);
260 g_return_if_fail (attrs != NULL);
261
262 next = text;
263
264 prev_break_type = G_UNICODE_BREAK_UNKNOWN;
265 prev_prev_break_type = G_UNICODE_BREAK_UNKNOWN;
266 prev_wc = 0;
267 prev_script = PANGO_SCRIPT_COMMON;
268 prev_jamo = NO_JAMO;
269 prev_space_or_hyphen = FALSE;
270
271 if (length == 0 || *text == '\0')
272 {
273 next_wc = PARAGRAPH_SEPARATOR;
274 almost_done = TRUE;
275 }
276 else
277 next_wc = g_utf8_get_char (p: next);
278
279 next_break_type = g_unichar_break_type (c: next_wc);
280 next_break_type = BREAK_TYPE_SAFE (next_break_type);
281
282 for (i = 0; !done ; i++)
283 {
284 GUnicodeType type;
285 gunichar wc;
286 GUnicodeBreakType break_type;
287 GUnicodeBreakType row_break_type;
288 BreakOpportunity break_op;
289 JamoType jamo;
290 gboolean makes_hangul_syllable;
291
292 /* UAX#29 boundaries */
293 gboolean is_grapheme_boundary;
294 gboolean is_word_boundary;
295 gboolean is_sentence_boundary;
296
297 /* Emoji extended pictographics */
298 gboolean is_Extended_Pictographic;
299
300 PangoScript script;
301
302 wc = next_wc;
303 break_type = next_break_type;
304
305 if (almost_done)
306 {
307 /*
308 * If we have already reached the end of @text g_utf8_next_char()
309 * may not increment next
310 */
311 next_wc = 0;
312 next_break_type = G_UNICODE_BREAK_UNKNOWN;
313 done = TRUE;
314 }
315 else
316 {
317 next = g_utf8_next_char (next);
318
319 if ((length >= 0 && next >= text + length) || *next == '\0')
320 {
321 /* This is how we fill in the last element (end position) of the
322 * attr array - assume there's a paragraph separators off the end
323 * of @text.
324 */
325 next_wc = PARAGRAPH_SEPARATOR;
326 almost_done = TRUE;
327 }
328 else
329 next_wc = g_utf8_get_char (p: next);
330
331 next_break_type = g_unichar_break_type (c: next_wc);
332 next_break_type = BREAK_TYPE_SAFE (next_break_type);
333 }
334
335 type = g_unichar_type (c: wc);
336 jamo = JAMO_TYPE (break_type);
337
338 /* Determine wheter this forms a Hangul syllable with prev. */
339 if (jamo == NO_JAMO)
340 makes_hangul_syllable = FALSE;
341 else
342 {
343 JamoType prev_end = HangulJamoProps[prev_jamo].end ;
344 JamoType this_start = HangulJamoProps[ jamo].start;
345
346 /* See comments before IS_JAMO */
347 makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start);
348 }
349
350 switch ((int)type)
351 {
352 case G_UNICODE_SPACE_SEPARATOR:
353 case G_UNICODE_LINE_SEPARATOR:
354 case G_UNICODE_PARAGRAPH_SEPARATOR:
355 attrs[i].is_white = TRUE;
356 break;
357 case G_UNICODE_CONTROL:
358 if (wc == '\t' || wc == '\n' || wc == '\r' || wc == '\f')
359 attrs[i].is_white = TRUE;
360 else
361 attrs[i].is_white = FALSE;
362 break;
363 default:
364 attrs[i].is_white = FALSE;
365 break;
366 }
367
368 /* Just few spaces have variable width. So explicitly mark them.
369 */
370 attrs[i].is_expandable_space = (0x0020 == wc || 0x00A0 == wc);
371 is_Extended_Pictographic =
372 _pango_Is_Emoji_Extended_Pictographic (ch: wc);
373
374
375 /* ---- UAX#29 Grapheme Boundaries ---- */
376 {
377 GraphemeBreakType GB_type;
378
379 /* Find the GraphemeBreakType of wc */
380 GB_type = GB_Other;
381 switch ((int)type)
382 {
383 case G_UNICODE_FORMAT:
384 if (G_UNLIKELY (wc == 0x200C))
385 {
386 GB_type = GB_Extend;
387 break;
388 }
389 if (G_UNLIKELY (wc == 0x200D))
390 {
391 GB_type = GB_ZWJ;
392 break;
393 }
394 if (G_UNLIKELY((wc >= 0x600 && wc <= 0x605) ||
395 wc == 0x6DD ||
396 wc == 0x70F ||
397 wc == 0x8E2 ||
398 wc == 0x110BD ||
399 wc == 0x110CD))
400 {
401 GB_type = GB_Prepend;
402 break;
403 }
404 /* Tag chars */
405 if (wc >= 0xE0020 && wc <= 0xE00FF)
406 {
407 GB_type = GB_Extend;
408 break;
409 }
410 G_GNUC_FALLTHROUGH;
411 case G_UNICODE_CONTROL:
412 case G_UNICODE_LINE_SEPARATOR:
413 case G_UNICODE_PARAGRAPH_SEPARATOR:
414 case G_UNICODE_SURROGATE:
415 GB_type = GB_ControlCRLF;
416 break;
417
418 case G_UNICODE_UNASSIGNED:
419 /* Unassigned default ignorables */
420 if ((wc >= 0xFFF0 && wc <= 0xFFF8) ||
421 (wc >= 0xE0000 && wc <= 0xE0FFF))
422 {
423 GB_type = GB_ControlCRLF;
424 break;
425 }
426 G_GNUC_FALLTHROUGH;
427
428 case G_UNICODE_OTHER_LETTER:
429 if (makes_hangul_syllable)
430 GB_type = GB_InHangulSyllable;
431
432 if (_pango_is_Consonant_Preceding_Repha (wc) ||
433 _pango_is_Consonant_Prefixed (wc))
434 GB_type = GB_Prepend;
435 break;
436
437 case G_UNICODE_MODIFIER_LETTER:
438 if (wc >= 0xFF9E && wc <= 0xFF9F)
439 GB_type = GB_Extend; /* Other_Grapheme_Extend */
440 break;
441
442 case G_UNICODE_SPACING_MARK:
443 GB_type = GB_SpacingMark; /* SpacingMark */
444 if (wc >= 0x0900)
445 {
446 if (wc == 0x09BE || wc == 0x09D7 ||
447 wc == 0x0B3E || wc == 0x0B57 || wc == 0x0BBE || wc == 0x0BD7 ||
448 wc == 0x0CC2 || wc == 0x0CD5 || wc == 0x0CD6 ||
449 wc == 0x0D3E || wc == 0x0D57 || wc == 0x0DCF || wc == 0x0DDF ||
450 wc == 0x1D165 || (wc >= 0x1D16E && wc <= 0x1D172))
451 GB_type = GB_Extend; /* Other_Grapheme_Extend */
452 }
453 break;
454
455 case G_UNICODE_ENCLOSING_MARK:
456 case G_UNICODE_NON_SPACING_MARK:
457 GB_type = GB_Extend; /* Grapheme_Extend */
458 break;
459
460 case G_UNICODE_OTHER_SYMBOL:
461 if (G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF))
462 {
463 if (prev_GB_type == GB_RI_Odd)
464 GB_type = GB_RI_Even;
465 else
466 GB_type = GB_RI_Odd;
467 break;
468 }
469 break;
470
471 case G_UNICODE_MODIFIER_SYMBOL:
472 /* Fitzpatrick modifiers */
473 if (wc >= 0x1F3FB && wc <= 0x1F3FF)
474 GB_type = GB_Extend;
475 break;
476
477 default:
478 break;
479 }
480
481 /* Rule GB11 */
482 if (met_Extended_Pictographic)
483 {
484 if (GB_type == GB_Extend)
485 met_Extended_Pictographic = TRUE;
486 else if (_pango_Is_Emoji_Extended_Pictographic (ch: prev_wc) &&
487 GB_type == GB_ZWJ)
488 met_Extended_Pictographic = TRUE;
489 else if (prev_GB_type == GB_Extend && GB_type == GB_ZWJ)
490 met_Extended_Pictographic = TRUE;
491 else if (prev_GB_type == GB_ZWJ && is_Extended_Pictographic)
492 met_Extended_Pictographic = TRUE;
493 else
494 met_Extended_Pictographic = FALSE;
495 }
496
497 /* Grapheme Cluster Boundary Rules */
498 is_grapheme_boundary = TRUE; /* Rule GB999 */
499
500 /* We apply Rules GB1 and GB2 at the end of the function */
501 if (wc == '\n' && prev_wc == '\r')
502 is_grapheme_boundary = FALSE; /* Rule GB3 */
503 else if (prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF)
504 is_grapheme_boundary = TRUE; /* Rules GB4 and GB5 */
505 else if (GB_type == GB_InHangulSyllable)
506 is_grapheme_boundary = FALSE; /* Rules GB6, GB7, GB8 */
507 else if (GB_type == GB_Extend)
508 is_grapheme_boundary = FALSE; /* Rule GB9 */
509 else if (GB_type == GB_ZWJ)
510 is_grapheme_boundary = FALSE; /* Rule GB9 */
511 else if (GB_type == GB_SpacingMark)
512 is_grapheme_boundary = FALSE; /* Rule GB9a */
513 else if (prev_GB_type == GB_Prepend)
514 is_grapheme_boundary = FALSE; /* Rule GB9b */
515 else if (is_Extended_Pictographic)
516 { /* Rule GB11 */
517 if (prev_GB_type == GB_ZWJ && met_Extended_Pictographic)
518 is_grapheme_boundary = FALSE;
519 }
520 else if (prev_GB_type == GB_RI_Odd && GB_type == GB_RI_Even)
521 is_grapheme_boundary = FALSE; /* Rule GB12 and GB13 */
522
523 if (is_Extended_Pictographic)
524 met_Extended_Pictographic = TRUE;
525
526 attrs[i].is_cursor_position = is_grapheme_boundary;
527 /* If this is a grapheme boundary, we have to decide if backspace
528 * deletes a character or the whole grapheme cluster */
529 if (is_grapheme_boundary)
530 {
531 attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);
532
533 /* Dependent Vowels for Indic language */
534 if (_pango_is_Virama (wc: prev_wc) ||
535 _pango_is_Vowel_Dependent (wc: prev_wc))
536 attrs[i].backspace_deletes_character = TRUE;
537 }
538 else
539 attrs[i].backspace_deletes_character = FALSE;
540
541 prev_GB_type = GB_type;
542 }
543
544 script = (PangoScript)g_unichar_get_script (ch: wc);
545 /* ---- UAX#29 Word Boundaries ---- */
546 {
547 is_word_boundary = FALSE;
548 if (is_grapheme_boundary ||
549 G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF)) /* Rules WB3 and WB4 */
550 {
551 WordBreakType WB_type;
552
553 /* Find the WordBreakType of wc */
554 WB_type = WB_Other;
555
556 if (script == PANGO_SCRIPT_KATAKANA)
557 WB_type = WB_Katakana;
558
559 if (script == PANGO_SCRIPT_HEBREW && type == G_UNICODE_OTHER_LETTER)
560 WB_type = WB_Hebrew_Letter;
561
562 if (WB_type == WB_Other)
563 switch (wc >> 8)
564 {
565 case 0x30:
566 if (wc == 0x3031 || wc == 0x3032 || wc == 0x3033 || wc == 0x3034 || wc == 0x3035 ||
567 wc == 0x309b || wc == 0x309c || wc == 0x30a0 || wc == 0x30fc)
568 WB_type = WB_Katakana; /* Katakana exceptions */
569 break;
570 case 0xFF:
571 if (wc == 0xFF70)
572 WB_type = WB_Katakana; /* Katakana exceptions */
573 else if (wc >= 0xFF9E && wc <= 0xFF9F)
574 WB_type = WB_ExtendFormat; /* Other_Grapheme_Extend */
575 break;
576 case 0x05:
577 if (wc == 0x058A)
578 WB_type = WB_ALetter; /* ALetter exceptions */
579 break;
580 default:
581 break;
582 }
583
584 if (WB_type == WB_Other)
585 switch ((int) break_type)
586 {
587 case G_UNICODE_BREAK_NUMERIC:
588 if (wc != 0x066C)
589 WB_type = WB_Numeric; /* Numeric */
590 break;
591 case G_UNICODE_BREAK_INFIX_SEPARATOR:
592 if (wc != 0x003A && wc != 0xFE13 && wc != 0x002E)
593 WB_type = WB_MidNum; /* MidNum */
594 break;
595 default:
596 break;
597 }
598
599 if (WB_type == WB_Other)
600 switch ((int) type)
601 {
602 case G_UNICODE_CONTROL:
603 if (wc != 0x000D && wc != 0x000A && wc != 0x000B && wc != 0x000C && wc != 0x0085)
604 break;
605 G_GNUC_FALLTHROUGH;
606 case G_UNICODE_LINE_SEPARATOR:
607 case G_UNICODE_PARAGRAPH_SEPARATOR:
608 WB_type = WB_NewlineCRLF; /* CR, LF, Newline */
609 break;
610
611 case G_UNICODE_FORMAT:
612 case G_UNICODE_SPACING_MARK:
613 case G_UNICODE_ENCLOSING_MARK:
614 case G_UNICODE_NON_SPACING_MARK:
615 WB_type = WB_ExtendFormat; /* Extend, Format */
616 break;
617
618 case G_UNICODE_CONNECT_PUNCTUATION:
619 WB_type = WB_ExtendNumLet; /* ExtendNumLet */
620 break;
621
622 case G_UNICODE_INITIAL_PUNCTUATION:
623 case G_UNICODE_FINAL_PUNCTUATION:
624 if (wc == 0x2018 || wc == 0x2019)
625 WB_type = WB_MidNumLet; /* MidNumLet */
626 break;
627 case G_UNICODE_OTHER_PUNCTUATION:
628 if ((wc >= 0x055a && wc <= 0x055c) ||
629 wc == 0x055e || wc == 0x05f3)
630 WB_type = WB_ALetter; /* ALetter */
631 else if (wc == 0x0027 || wc == 0x002e || wc == 0x2024 ||
632 wc == 0xfe52 || wc == 0xff07 || wc == 0xff0e)
633 WB_type = WB_MidNumLet; /* MidNumLet */
634 else if (wc == 0x00b7 || wc == 0x05f4 || wc == 0x2027 ||
635 wc == 0x003a || wc == 0x0387 || wc == 0x055f ||
636 wc == 0xfe13 || wc == 0xfe55 || wc == 0xff1a)
637 WB_type = WB_MidLetter; /* MidLetter */
638 else if (wc == 0x066c ||
639 wc == 0xfe50 || wc == 0xfe54 || wc == 0xff0c || wc == 0xff1b)
640 WB_type = WB_MidNum; /* MidNum */
641 break;
642
643 case G_UNICODE_OTHER_SYMBOL:
644 if (wc >= 0x24B6 && wc <= 0x24E9) /* Other_Alphabetic */
645 goto Alphabetic;
646
647 if (G_UNLIKELY(wc >= 0x1F1E6 && wc <= 0x1F1FF))
648 {
649 if (prev_WB_type == WB_RI_Odd)
650 WB_type = WB_RI_Even;
651 else
652 WB_type = WB_RI_Odd;
653 }
654
655 break;
656
657 case G_UNICODE_OTHER_LETTER:
658 case G_UNICODE_LETTER_NUMBER:
659 if (wc == 0x3006 || wc == 0x3007 ||
660 (wc >= 0x3021 && wc <= 0x3029) ||
661 (wc >= 0x3038 && wc <= 0x303A) ||
662 (wc >= 0x3400 && wc <= 0x4DB5) ||
663 (wc >= 0x4E00 && wc <= 0x9FC3) ||
664 (wc >= 0xF900 && wc <= 0xFA2D) ||
665 (wc >= 0xFA30 && wc <= 0xFA6A) ||
666 (wc >= 0xFA70 && wc <= 0xFAD9) ||
667 (wc >= 0x20000 && wc <= 0x2A6D6) ||
668 (wc >= 0x2F800 && wc <= 0x2FA1D))
669 break; /* ALetter exceptions: Ideographic */
670 goto Alphabetic;
671
672 case G_UNICODE_LOWERCASE_LETTER:
673 case G_UNICODE_MODIFIER_LETTER:
674 case G_UNICODE_TITLECASE_LETTER:
675 case G_UNICODE_UPPERCASE_LETTER:
676 Alphabetic:
677 if (break_type != G_UNICODE_BREAK_COMPLEX_CONTEXT && script != PANGO_SCRIPT_HIRAGANA)
678 WB_type = WB_ALetter; /* ALetter */
679 break;
680 default:
681 break;
682 }
683
684 if (WB_type == WB_Other)
685 {
686 if (type == G_UNICODE_SPACE_SEPARATOR &&
687 break_type != G_UNICODE_BREAK_NON_BREAKING_GLUE)
688 WB_type = WB_WSegSpace;
689 }
690
691 /* Word Cluster Boundary Rules */
692
693 /* We apply Rules WB1 and WB2 at the end of the function */
694
695 if (prev_wc == 0x3031 && wc == 0x41)
696 g_debug ("Y %d %d", prev_WB_type, WB_type);
697 if (prev_WB_type == WB_NewlineCRLF && prev_WB_i + 1 == i)
698 {
699 /* The extra check for prev_WB_i is to correctly handle sequences like
700 * Newline ÷ Extend × Extend
701 * since we have not skipped ExtendFormat yet.
702 */
703 is_word_boundary = TRUE; /* Rule WB3a */
704 }
705 else if (WB_type == WB_NewlineCRLF)
706 is_word_boundary = TRUE; /* Rule WB3b */
707 else if (prev_wc == 0x200D && is_Extended_Pictographic)
708 is_word_boundary = FALSE; /* Rule WB3c */
709 else if (prev_WB_type == WB_WSegSpace &&
710 WB_type == WB_WSegSpace && prev_WB_i + 1 == i)
711 is_word_boundary = FALSE; /* Rule WB3d */
712 else if (WB_type == WB_ExtendFormat)
713 is_word_boundary = FALSE; /* Rules WB4? */
714 else if ((prev_WB_type == WB_ALetter ||
715 prev_WB_type == WB_Hebrew_Letter ||
716 prev_WB_type == WB_Numeric) &&
717 (WB_type == WB_ALetter ||
718 WB_type == WB_Hebrew_Letter ||
719 WB_type == WB_Numeric))
720 is_word_boundary = FALSE; /* Rules WB5, WB8, WB9, WB10 */
721 else if (prev_WB_type == WB_Katakana && WB_type == WB_Katakana)
722 is_word_boundary = FALSE; /* Rule WB13 */
723 else if ((prev_WB_type == WB_ALetter ||
724 prev_WB_type == WB_Hebrew_Letter ||
725 prev_WB_type == WB_Numeric ||
726 prev_WB_type == WB_Katakana ||
727 prev_WB_type == WB_ExtendNumLet) &&
728 WB_type == WB_ExtendNumLet)
729 is_word_boundary = FALSE; /* Rule WB13a */
730 else if (prev_WB_type == WB_ExtendNumLet &&
731 (WB_type == WB_ALetter ||
732 WB_type == WB_Hebrew_Letter ||
733 WB_type == WB_Numeric ||
734 WB_type == WB_Katakana))
735 is_word_boundary = FALSE; /* Rule WB13b */
736 else if (((prev_prev_WB_type == WB_ALetter ||
737 prev_prev_WB_type == WB_Hebrew_Letter) &&
738 (WB_type == WB_ALetter ||
739 WB_type == WB_Hebrew_Letter)) &&
740 (prev_WB_type == WB_MidLetter ||
741 prev_WB_type == WB_MidNumLet ||
742 prev_wc == 0x0027))
743 {
744 attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB6 */
745 is_word_boundary = FALSE; /* Rule WB7 */
746 }
747 else if (prev_WB_type == WB_Hebrew_Letter && wc == 0x0027)
748 is_word_boundary = FALSE; /* Rule WB7a */
749 else if (prev_prev_WB_type == WB_Hebrew_Letter && prev_wc == 0x0022 &&
750 WB_type == WB_Hebrew_Letter) {
751 attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB7b */
752 is_word_boundary = FALSE; /* Rule WB7c */
753 }
754 else if ((prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) &&
755 (prev_WB_type == WB_MidNum || prev_WB_type == WB_MidNumLet ||
756 prev_wc == 0x0027))
757 {
758 is_word_boundary = FALSE; /* Rule WB11 */
759 attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB12 */
760 }
761 else if (prev_WB_type == WB_RI_Odd && WB_type == WB_RI_Even)
762 is_word_boundary = FALSE; /* Rule WB15 and WB16 */
763 else
764 is_word_boundary = TRUE; /* Rule WB999 */
765
766 if (WB_type != WB_ExtendFormat)
767 {
768 prev_prev_WB_type = prev_WB_type;
769 prev_WB_type = WB_type;
770 prev_WB_i = i;
771 }
772 }
773
774 attrs[i].is_word_boundary = is_word_boundary;
775 }
776
777 /* ---- UAX#29 Sentence Boundaries ---- */
778 {
779 is_sentence_boundary = FALSE;
780 if (is_word_boundary ||
781 wc == '\r' || wc == '\n') /* Rules SB3 and SB5 */
782 {
783 SentenceBreakType SB_type;
784
785 /* Find the SentenceBreakType of wc */
786 SB_type = SB_Other;
787
788 if (break_type == G_UNICODE_BREAK_NUMERIC)
789 SB_type = SB_Numeric; /* Numeric */
790
791 if (SB_type == SB_Other)
792 switch ((int) type)
793 {
794 case G_UNICODE_CONTROL:
795 if (wc == '\r' || wc == '\n')
796 SB_type = SB_ParaSep;
797 else if (wc == 0x0009 || wc == 0x000B || wc == 0x000C)
798 SB_type = SB_Sp;
799 else if (wc == 0x0085)
800 SB_type = SB_ParaSep;
801 break;
802
803 case G_UNICODE_SPACE_SEPARATOR:
804 if (wc == 0x0020 || wc == 0x00A0 || wc == 0x1680 ||
805 (wc >= 0x2000 && wc <= 0x200A) ||
806 wc == 0x202F || wc == 0x205F || wc == 0x3000)
807 SB_type = SB_Sp;
808 break;
809
810 case G_UNICODE_LINE_SEPARATOR:
811 case G_UNICODE_PARAGRAPH_SEPARATOR:
812 SB_type = SB_ParaSep;
813 break;
814
815 case G_UNICODE_FORMAT:
816 case G_UNICODE_SPACING_MARK:
817 case G_UNICODE_ENCLOSING_MARK:
818 case G_UNICODE_NON_SPACING_MARK:
819 SB_type = SB_ExtendFormat; /* Extend, Format */
820 break;
821
822 case G_UNICODE_MODIFIER_LETTER:
823 if (wc >= 0xFF9E && wc <= 0xFF9F)
824 SB_type = SB_ExtendFormat; /* Other_Grapheme_Extend */
825 break;
826
827 case G_UNICODE_TITLECASE_LETTER:
828 SB_type = SB_Upper;
829 break;
830
831 case G_UNICODE_DASH_PUNCTUATION:
832 if (wc == 0x002D ||
833 (wc >= 0x2013 && wc <= 0x2014) ||
834 (wc >= 0xFE31 && wc <= 0xFE32) ||
835 wc == 0xFE58 ||
836 wc == 0xFE63 ||
837 wc == 0xFF0D)
838 SB_type = SB_SContinue;
839 break;
840
841 case G_UNICODE_OTHER_PUNCTUATION:
842 if (wc == 0x05F3)
843 SB_type = SB_OLetter;
844 else if (wc == 0x002E || wc == 0x2024 ||
845 wc == 0xFE52 || wc == 0xFF0E)
846 SB_type = SB_ATerm;
847
848 if (wc == 0x002C ||
849 wc == 0x003A ||
850 wc == 0x055D ||
851 (wc >= 0x060C && wc <= 0x060D) ||
852 wc == 0x07F8 ||
853 wc == 0x1802 ||
854 wc == 0x1808 ||
855 wc == 0x3001 ||
856 (wc >= 0xFE10 && wc <= 0xFE11) ||
857 wc == 0xFE13 ||
858 (wc >= 0xFE50 && wc <= 0xFE51) ||
859 wc == 0xFE55 ||
860 wc == 0xFF0C ||
861 wc == 0xFF1A ||
862 wc == 0xFF64)
863 SB_type = SB_SContinue;
864
865 if (_pango_is_STerm(wc))
866 SB_type = SB_STerm;
867
868 break;
869
870 default:
871 break;
872 }
873
874 if (SB_type == SB_Other)
875 {
876 if (type == G_UNICODE_LOWERCASE_LETTER)
877 SB_type = SB_Lower;
878 else if (type == G_UNICODE_UPPERCASE_LETTER)
879 SB_type = SB_Upper;
880 else if (type == G_UNICODE_TITLECASE_LETTER ||
881 type == G_UNICODE_MODIFIER_LETTER ||
882 type == G_UNICODE_OTHER_LETTER)
883 SB_type = SB_OLetter;
884
885 if (type == G_UNICODE_OPEN_PUNCTUATION ||
886 type == G_UNICODE_CLOSE_PUNCTUATION ||
887 break_type == G_UNICODE_BREAK_QUOTATION)
888 SB_type = SB_Close;
889 }
890
891 /* Sentence Boundary Rules */
892
893 /* We apply Rules SB1 and SB2 at the end of the function */
894
895#define IS_OTHER_TERM(SB_type) \
896 /* not in (OLetter | Upper | Lower | ParaSep | SATerm) */ \
897 !(SB_type == SB_OLetter || \
898 SB_type == SB_Upper || SB_type == SB_Lower || \
899 SB_type == SB_ParaSep || \
900 SB_type == SB_ATerm || SB_type == SB_STerm || \
901 SB_type == SB_ATerm_Close_Sp || \
902 SB_type == SB_STerm_Close_Sp)
903
904
905 if (wc == '\n' && prev_wc == '\r')
906 is_sentence_boundary = FALSE; /* Rule SB3 */
907 else if (prev_SB_type == SB_ParaSep && prev_SB_i + 1 == i)
908 {
909 /* The extra check for prev_SB_i is to correctly handle sequences like
910 * ParaSep ÷ Extend × Extend
911 * since we have not skipped ExtendFormat yet.
912 */
913
914 is_sentence_boundary = TRUE; /* Rule SB4 */
915 }
916 else if (SB_type == SB_ExtendFormat)
917 is_sentence_boundary = FALSE; /* Rule SB5? */
918 else if (prev_SB_type == SB_ATerm && SB_type == SB_Numeric)
919 is_sentence_boundary = FALSE; /* Rule SB6 */
920 else if ((prev_prev_SB_type == SB_Upper ||
921 prev_prev_SB_type == SB_Lower) &&
922 prev_SB_type == SB_ATerm &&
923 SB_type == SB_Upper)
924 is_sentence_boundary = FALSE; /* Rule SB7 */
925 else if (prev_SB_type == SB_ATerm && SB_type == SB_Close)
926 SB_type = SB_ATerm;
927 else if (prev_SB_type == SB_STerm && SB_type == SB_Close)
928 SB_type = SB_STerm;
929 else if (prev_SB_type == SB_ATerm && SB_type == SB_Sp)
930 SB_type = SB_ATerm_Close_Sp;
931 else if (prev_SB_type == SB_STerm && SB_type == SB_Sp)
932 SB_type = SB_STerm_Close_Sp;
933 /* Rule SB8 */
934 else if ((prev_SB_type == SB_ATerm ||
935 prev_SB_type == SB_ATerm_Close_Sp) &&
936 SB_type == SB_Lower)
937 is_sentence_boundary = FALSE;
938 else if ((prev_prev_SB_type == SB_ATerm ||
939 prev_prev_SB_type == SB_ATerm_Close_Sp) &&
940 IS_OTHER_TERM(prev_SB_type) &&
941 SB_type == SB_Lower)
942 {
943 attrs[prev_SB_i].is_sentence_boundary = FALSE;
944 attrs[prev_SB_i].is_sentence_end = FALSE;
945 last_sentence_start = -1;
946 for (int j = prev_SB_i - 1; j >= 0; j--)
947 {
948 attrs[j].is_sentence_end = FALSE;
949 if (attrs[j].is_sentence_boundary)
950 {
951 last_sentence_start = j;
952 break;
953 }
954 }
955 }
956 else if ((prev_SB_type == SB_ATerm ||
957 prev_SB_type == SB_ATerm_Close_Sp ||
958 prev_SB_type == SB_STerm ||
959 prev_SB_type == SB_STerm_Close_Sp) &&
960 (SB_type == SB_SContinue ||
961 SB_type == SB_ATerm || SB_type == SB_STerm))
962 is_sentence_boundary = FALSE; /* Rule SB8a */
963 else if ((prev_SB_type == SB_ATerm ||
964 prev_SB_type == SB_STerm) &&
965 (SB_type == SB_Close || SB_type == SB_Sp ||
966 SB_type == SB_ParaSep))
967 is_sentence_boundary = FALSE; /* Rule SB9 */
968 else if ((prev_SB_type == SB_ATerm ||
969 prev_SB_type == SB_ATerm_Close_Sp ||
970 prev_SB_type == SB_STerm ||
971 prev_SB_type == SB_STerm_Close_Sp) &&
972 (SB_type == SB_Sp || SB_type == SB_ParaSep))
973 is_sentence_boundary = FALSE; /* Rule SB10 */
974 else if ((prev_SB_type == SB_ATerm ||
975 prev_SB_type == SB_ATerm_Close_Sp ||
976 prev_SB_type == SB_STerm ||
977 prev_SB_type == SB_STerm_Close_Sp) &&
978 SB_type != SB_ParaSep)
979 is_sentence_boundary = TRUE; /* Rule SB11 */
980 else
981 is_sentence_boundary = FALSE; /* Rule SB998 */
982
983 if (SB_type != SB_ExtendFormat &&
984 !((prev_prev_SB_type == SB_ATerm ||
985 prev_prev_SB_type == SB_ATerm_Close_Sp) &&
986 IS_OTHER_TERM(prev_SB_type) &&
987 IS_OTHER_TERM(SB_type)))
988 {
989 prev_prev_SB_type = prev_SB_type;
990 prev_SB_type = SB_type;
991 prev_SB_i = i;
992 }
993
994#undef IS_OTHER_TERM
995
996 }
997
998 if (i == 0 || done)
999 is_sentence_boundary = TRUE; /* Rules SB1 and SB2 */
1000
1001 attrs[i].is_sentence_boundary = is_sentence_boundary;
1002 }
1003
1004 /* ---- Line breaking ---- */
1005
1006 break_op = BREAK_ALREADY_HANDLED;
1007
1008 row_break_type = prev_break_type == G_UNICODE_BREAK_SPACE ?
1009 prev_prev_break_type : prev_break_type;
1010 g_assert (row_break_type != G_UNICODE_BREAK_SPACE);
1011
1012 attrs[i].is_char_break = FALSE;
1013 attrs[i].is_line_break = FALSE;
1014 attrs[i].is_mandatory_break = FALSE;
1015
1016 /* Rule LB1:
1017 assign a line breaking class to each code point of the input. */
1018 switch ((int)break_type)
1019 {
1020 case G_UNICODE_BREAK_AMBIGUOUS:
1021 case G_UNICODE_BREAK_SURROGATE:
1022 case G_UNICODE_BREAK_UNKNOWN:
1023 break_type = G_UNICODE_BREAK_ALPHABETIC;
1024 break;
1025
1026 case G_UNICODE_BREAK_COMPLEX_CONTEXT:
1027 if (type == G_UNICODE_NON_SPACING_MARK ||
1028 type == G_UNICODE_SPACING_MARK)
1029 break_type = G_UNICODE_BREAK_COMBINING_MARK;
1030 else
1031 break_type = G_UNICODE_BREAK_ALPHABETIC;
1032 break;
1033
1034 case G_UNICODE_BREAK_CONDITIONAL_JAPANESE_STARTER:
1035 break_type = G_UNICODE_BREAK_NON_STARTER;
1036 break;
1037
1038 default:
1039 break;
1040 }
1041
1042 /* If it's not a grapheme boundary, it's not a line break either */
1043 if (attrs[i].is_cursor_position ||
1044 break_type == G_UNICODE_BREAK_COMBINING_MARK ||
1045 break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER ||
1046 break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
1047 break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
1048 break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
1049 break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
1050 break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE ||
1051 break_type == G_UNICODE_BREAK_EMOJI_MODIFIER ||
1052 break_type == G_UNICODE_BREAK_REGIONAL_INDICATOR)
1053 {
1054 LineBreakType LB_type;
1055
1056 /* Find the LineBreakType of wc */
1057 LB_type = LB_Other;
1058
1059 if (break_type == G_UNICODE_BREAK_NUMERIC)
1060 LB_type = LB_Numeric;
1061
1062 if (break_type == G_UNICODE_BREAK_SYMBOL ||
1063 break_type == G_UNICODE_BREAK_INFIX_SEPARATOR)
1064 {
1065 if (!(prev_LB_type == LB_Numeric))
1066 LB_type = LB_Other;
1067 }
1068
1069 if (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
1070 break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS)
1071 {
1072 if (prev_LB_type == LB_Numeric)
1073 LB_type = LB_Numeric_Close;
1074 else
1075 LB_type = LB_Other;
1076 }
1077
1078 if (break_type == G_UNICODE_BREAK_REGIONAL_INDICATOR)
1079 {
1080 if (prev_LB_type == LB_RI_Odd)
1081 LB_type = LB_RI_Even;
1082 else
1083 LB_type = LB_RI_Odd;
1084 }
1085
1086 attrs[i].is_line_break = TRUE; /* Rule LB31 */
1087 /* Unicode doesn't specify char wrap;
1088 we wrap around all chars currently. */
1089 if (attrs[i].is_cursor_position)
1090 attrs[i].is_char_break = TRUE;
1091
1092 /* Make any necessary replacements first */
1093 if (row_break_type == G_UNICODE_BREAK_UNKNOWN)
1094 row_break_type = G_UNICODE_BREAK_ALPHABETIC;
1095
1096 /* add the line break rules in reverse order to override
1097 the lower priority rules. */
1098
1099 /* Rule LB30 */
1100 if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
1101 prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER ||
1102 prev_break_type == G_UNICODE_BREAK_NUMERIC) &&
1103 break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION &&
1104 !_pango_is_EastAsianWide (wc))
1105 break_op = BREAK_PROHIBITED;
1106
1107 if (prev_break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS &&
1108 !_pango_is_EastAsianWide (wc: prev_wc)&&
1109 (break_type == G_UNICODE_BREAK_ALPHABETIC ||
1110 break_type == G_UNICODE_BREAK_HEBREW_LETTER ||
1111 break_type == G_UNICODE_BREAK_NUMERIC))
1112 break_op = BREAK_PROHIBITED;
1113
1114 /* Rule LB30a */
1115 if (prev_LB_type == LB_RI_Odd && LB_type == LB_RI_Even)
1116 break_op = BREAK_PROHIBITED;
1117
1118 /* Rule LB30b */
1119 if (prev_break_type == G_UNICODE_BREAK_EMOJI_BASE &&
1120 break_type == G_UNICODE_BREAK_EMOJI_MODIFIER)
1121 break_op = BREAK_PROHIBITED;
1122
1123 if ((_pango_Is_Emoji_Extended_Pictographic (ch: prev_wc) &&
1124 g_unichar_type (c: prev_wc) == G_UNICODE_UNASSIGNED) &&
1125 break_type == G_UNICODE_BREAK_EMOJI_MODIFIER)
1126 break_op = BREAK_PROHIBITED;
1127
1128 /* Rule LB29 */
1129 if (prev_break_type == G_UNICODE_BREAK_INFIX_SEPARATOR &&
1130 (break_type == G_UNICODE_BREAK_ALPHABETIC ||
1131 break_type == G_UNICODE_BREAK_HEBREW_LETTER))
1132 break_op = BREAK_PROHIBITED;
1133
1134 /* Rule LB28 */
1135 if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
1136 prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
1137 (break_type == G_UNICODE_BREAK_ALPHABETIC ||
1138 break_type == G_UNICODE_BREAK_HEBREW_LETTER))
1139 break_op = BREAK_PROHIBITED;
1140
1141 /* Rule LB27 */
1142 if ((prev_break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
1143 prev_break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
1144 prev_break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
1145 prev_break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
1146 prev_break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE) &&
1147 break_type == G_UNICODE_BREAK_POSTFIX)
1148 break_op = BREAK_PROHIBITED;
1149
1150 if (prev_break_type == G_UNICODE_BREAK_PREFIX &&
1151 (break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
1152 break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
1153 break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
1154 break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
1155 break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
1156 break_op = BREAK_PROHIBITED;
1157
1158 /* Rule LB26 */
1159 if (prev_break_type == G_UNICODE_BREAK_HANGUL_L_JAMO &&
1160 (break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
1161 break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
1162 break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
1163 break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
1164 break_op = BREAK_PROHIBITED;
1165
1166 if ((prev_break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
1167 prev_break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE) &&
1168 (break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
1169 break_type == G_UNICODE_BREAK_HANGUL_T_JAMO))
1170 break_op = BREAK_PROHIBITED;
1171
1172 if ((prev_break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
1173 prev_break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE) &&
1174 break_type == G_UNICODE_BREAK_HANGUL_T_JAMO)
1175 break_op = BREAK_PROHIBITED;
1176
1177 /* Rule LB25 with Example 7 of Customization */
1178 if ((prev_break_type == G_UNICODE_BREAK_PREFIX ||
1179 prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
1180 break_type == G_UNICODE_BREAK_NUMERIC)
1181 break_op = BREAK_PROHIBITED;
1182
1183 if ((prev_break_type == G_UNICODE_BREAK_PREFIX ||
1184 prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
1185 (break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION ||
1186 break_type == G_UNICODE_BREAK_HYPHEN) &&
1187 next_break_type == G_UNICODE_BREAK_NUMERIC)
1188 break_op = BREAK_PROHIBITED;
1189
1190 if ((prev_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION ||
1191 prev_break_type == G_UNICODE_BREAK_HYPHEN) &&
1192 break_type == G_UNICODE_BREAK_NUMERIC)
1193 break_op = BREAK_PROHIBITED;
1194
1195 if (prev_break_type == G_UNICODE_BREAK_NUMERIC &&
1196 (break_type == G_UNICODE_BREAK_NUMERIC ||
1197 break_type == G_UNICODE_BREAK_SYMBOL ||
1198 break_type == G_UNICODE_BREAK_INFIX_SEPARATOR))
1199 break_op = BREAK_PROHIBITED;
1200
1201 if (prev_LB_type == LB_Numeric &&
1202 (break_type == G_UNICODE_BREAK_NUMERIC ||
1203 break_type == G_UNICODE_BREAK_SYMBOL ||
1204 break_type == G_UNICODE_BREAK_INFIX_SEPARATOR ||
1205 break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
1206 break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS))
1207 break_op = BREAK_PROHIBITED;
1208
1209 if ((prev_LB_type == LB_Numeric ||
1210 prev_LB_type == LB_Numeric_Close) &&
1211 (break_type == G_UNICODE_BREAK_POSTFIX ||
1212 break_type == G_UNICODE_BREAK_PREFIX))
1213 break_op = BREAK_PROHIBITED;
1214
1215 /* Rule LB24 */
1216 if ((prev_break_type == G_UNICODE_BREAK_PREFIX ||
1217 prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
1218 (break_type == G_UNICODE_BREAK_ALPHABETIC ||
1219 break_type == G_UNICODE_BREAK_HEBREW_LETTER))
1220 break_op = BREAK_PROHIBITED;
1221
1222 if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
1223 prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
1224 (break_type == G_UNICODE_BREAK_PREFIX ||
1225 break_type == G_UNICODE_BREAK_POSTFIX))
1226 break_op = BREAK_PROHIBITED;
1227
1228 /* Rule LB23 */
1229 if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
1230 prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
1231 break_type == G_UNICODE_BREAK_NUMERIC)
1232 break_op = BREAK_PROHIBITED;
1233
1234 if (prev_break_type == G_UNICODE_BREAK_NUMERIC &&
1235 (break_type == G_UNICODE_BREAK_ALPHABETIC ||
1236 break_type == G_UNICODE_BREAK_HEBREW_LETTER))
1237 break_op = BREAK_PROHIBITED;
1238
1239 /* Rule LB23a */
1240 if (prev_break_type == G_UNICODE_BREAK_PREFIX &&
1241 (break_type == G_UNICODE_BREAK_IDEOGRAPHIC ||
1242 break_type == G_UNICODE_BREAK_EMOJI_BASE ||
1243 break_type == G_UNICODE_BREAK_EMOJI_MODIFIER))
1244 break_op = BREAK_PROHIBITED;
1245
1246 if ((prev_break_type == G_UNICODE_BREAK_IDEOGRAPHIC ||
1247 prev_break_type == G_UNICODE_BREAK_EMOJI_BASE ||
1248 prev_break_type == G_UNICODE_BREAK_EMOJI_MODIFIER) &&
1249 break_type == G_UNICODE_BREAK_POSTFIX)
1250 break_op = BREAK_PROHIBITED;
1251
1252 /* Rule LB22 */
1253 if (break_type == G_UNICODE_BREAK_INSEPARABLE)
1254 break_op = BREAK_PROHIBITED;
1255
1256 if (break_type == G_UNICODE_BREAK_AFTER ||
1257 break_type == G_UNICODE_BREAK_HYPHEN ||
1258 break_type == G_UNICODE_BREAK_NON_STARTER ||
1259 prev_break_type == G_UNICODE_BREAK_BEFORE)
1260 break_op = BREAK_PROHIBITED; /* Rule LB21 */
1261
1262 if (prev_prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER &&
1263 (prev_break_type == G_UNICODE_BREAK_HYPHEN ||
1264 prev_break_type == G_UNICODE_BREAK_AFTER))
1265 break_op = BREAK_PROHIBITED; /* Rule LB21a */
1266
1267 if (prev_break_type == G_UNICODE_BREAK_SYMBOL &&
1268 break_type == G_UNICODE_BREAK_HEBREW_LETTER)
1269 break_op = BREAK_PROHIBITED; /* Rule LB21b */
1270
1271 if (prev_break_type == G_UNICODE_BREAK_CONTINGENT ||
1272 break_type == G_UNICODE_BREAK_CONTINGENT)
1273 break_op = BREAK_ALLOWED; /* Rule LB20 */
1274
1275 if (prev_break_type == G_UNICODE_BREAK_QUOTATION ||
1276 break_type == G_UNICODE_BREAK_QUOTATION)
1277 break_op = BREAK_PROHIBITED; /* Rule LB19 */
1278
1279 /* handle related rules for Space as state machine here,
1280 and override the pair table result. */
1281 if (prev_break_type == G_UNICODE_BREAK_SPACE) /* Rule LB18 */
1282 break_op = BREAK_ALLOWED;
1283
1284 if (row_break_type == G_UNICODE_BREAK_BEFORE_AND_AFTER &&
1285 break_type == G_UNICODE_BREAK_BEFORE_AND_AFTER)
1286 break_op = BREAK_PROHIBITED; /* Rule LB17 */
1287
1288 if ((row_break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
1289 row_break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS) &&
1290 break_type == G_UNICODE_BREAK_NON_STARTER)
1291 break_op = BREAK_PROHIBITED; /* Rule LB16 */
1292
1293 if (row_break_type == G_UNICODE_BREAK_QUOTATION &&
1294 break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION)
1295 break_op = BREAK_PROHIBITED; /* Rule LB15 */
1296
1297 if (row_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION)
1298 break_op = BREAK_PROHIBITED; /* Rule LB14 */
1299
1300 /* Rule LB13 with Example 7 of Customization */
1301 if (break_type == G_UNICODE_BREAK_EXCLAMATION)
1302 break_op = BREAK_PROHIBITED;
1303
1304 if (prev_break_type != G_UNICODE_BREAK_NUMERIC &&
1305 (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
1306 break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS ||
1307 break_type == G_UNICODE_BREAK_INFIX_SEPARATOR ||
1308 break_type == G_UNICODE_BREAK_SYMBOL))
1309 break_op = BREAK_PROHIBITED;
1310
1311 if (prev_break_type == G_UNICODE_BREAK_NON_BREAKING_GLUE)
1312 break_op = BREAK_PROHIBITED; /* Rule LB12 */
1313
1314 if (break_type == G_UNICODE_BREAK_NON_BREAKING_GLUE &&
1315 (prev_break_type != G_UNICODE_BREAK_SPACE &&
1316 prev_break_type != G_UNICODE_BREAK_AFTER &&
1317 prev_break_type != G_UNICODE_BREAK_HYPHEN))
1318 break_op = BREAK_PROHIBITED; /* Rule LB12a */
1319
1320 if (prev_break_type == G_UNICODE_BREAK_WORD_JOINER ||
1321 break_type == G_UNICODE_BREAK_WORD_JOINER)
1322 break_op = BREAK_PROHIBITED; /* Rule LB11 */
1323
1324
1325 /* Rule LB9 */
1326 if (break_type == G_UNICODE_BREAK_COMBINING_MARK ||
1327 break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER)
1328 {
1329 if (!(prev_break_type == G_UNICODE_BREAK_MANDATORY ||
1330 prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN ||
1331 prev_break_type == G_UNICODE_BREAK_LINE_FEED ||
1332 prev_break_type == G_UNICODE_BREAK_NEXT_LINE ||
1333 prev_break_type == G_UNICODE_BREAK_SPACE ||
1334 prev_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE))
1335 break_op = BREAK_PROHIBITED;
1336 }
1337
1338 if (row_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
1339 break_op = BREAK_ALLOWED; /* Rule LB8 */
1340
1341 if (prev_wc == 0x200D)
1342 break_op = BREAK_PROHIBITED; /* Rule LB8a */
1343
1344 if (break_type == G_UNICODE_BREAK_SPACE ||
1345 break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
1346 break_op = BREAK_PROHIBITED; /* Rule LB7 */
1347
1348 /* Rule LB6 */
1349 if (break_type == G_UNICODE_BREAK_MANDATORY ||
1350 break_type == G_UNICODE_BREAK_CARRIAGE_RETURN ||
1351 break_type == G_UNICODE_BREAK_LINE_FEED ||
1352 break_type == G_UNICODE_BREAK_NEXT_LINE)
1353 break_op = BREAK_PROHIBITED;
1354
1355 /* Rules LB4 and LB5 */
1356 if (prev_break_type == G_UNICODE_BREAK_MANDATORY ||
1357 (prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN &&
1358 wc != '\n') ||
1359 prev_break_type == G_UNICODE_BREAK_LINE_FEED ||
1360 prev_break_type == G_UNICODE_BREAK_NEXT_LINE)
1361 {
1362 attrs[i].is_mandatory_break = TRUE;
1363 break_op = BREAK_ALLOWED;
1364 }
1365
1366 switch (break_op)
1367 {
1368 case BREAK_PROHIBITED:
1369 /* can't break here */
1370 attrs[i].is_line_break = FALSE;
1371 break;
1372
1373 case BREAK_IF_SPACES:
1374 /* break if prev char was space */
1375 if (prev_break_type != G_UNICODE_BREAK_SPACE)
1376 attrs[i].is_line_break = FALSE;
1377 break;
1378
1379 case BREAK_ALLOWED:
1380 attrs[i].is_line_break = TRUE;
1381 break;
1382
1383 case BREAK_ALREADY_HANDLED:
1384 break;
1385
1386 default:
1387 g_assert_not_reached ();
1388 break;
1389 }
1390
1391 /* Rule LB9 */
1392 if (!(break_type == G_UNICODE_BREAK_COMBINING_MARK ||
1393 break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER))
1394 {
1395 /* Rule LB25 with Example 7 of Customization */
1396 if (break_type == G_UNICODE_BREAK_NUMERIC ||
1397 break_type == G_UNICODE_BREAK_SYMBOL ||
1398 break_type == G_UNICODE_BREAK_INFIX_SEPARATOR)
1399 {
1400 if (prev_LB_type != LB_Numeric)
1401 prev_LB_type = LB_type;
1402 /* else don't change the prev_LB_type */
1403 }
1404 else
1405 {
1406 prev_LB_type = LB_type;
1407 }
1408 }
1409 /* else don't change the prev_LB_type for Rule LB9 */
1410 }
1411
1412 if (break_type != G_UNICODE_BREAK_SPACE)
1413 {
1414 /* Rule LB9 */
1415 if (break_type == G_UNICODE_BREAK_COMBINING_MARK ||
1416 break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER)
1417 {
1418 if (i == 0 /* start of text */ ||
1419 prev_break_type == G_UNICODE_BREAK_MANDATORY ||
1420 prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN ||
1421 prev_break_type == G_UNICODE_BREAK_LINE_FEED ||
1422 prev_break_type == G_UNICODE_BREAK_NEXT_LINE ||
1423 prev_break_type == G_UNICODE_BREAK_SPACE ||
1424 prev_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
1425 prev_break_type = G_UNICODE_BREAK_ALPHABETIC; /* Rule LB10 */
1426 /* else don't change the prev_break_type for Rule LB9 */
1427 }
1428 else
1429 {
1430 prev_prev_break_type = prev_break_type;
1431 prev_break_type = break_type;
1432 }
1433
1434 prev_jamo = jamo;
1435 }
1436 else
1437 {
1438 if (prev_break_type != G_UNICODE_BREAK_SPACE)
1439 {
1440 prev_prev_break_type = prev_break_type;
1441 prev_break_type = break_type;
1442 }
1443 /* else don't change the prev_break_type */
1444 }
1445
1446 /* ---- Word breaks ---- */
1447
1448 /* default to not a word start/end */
1449 attrs[i].is_word_start = FALSE;
1450 attrs[i].is_word_end = FALSE;
1451
1452 if (current_word_type != WordNone)
1453 {
1454 /* Check for a word end */
1455 switch ((int) type)
1456 {
1457 case G_UNICODE_SPACING_MARK:
1458 case G_UNICODE_ENCLOSING_MARK:
1459 case G_UNICODE_NON_SPACING_MARK:
1460 case G_UNICODE_FORMAT:
1461 /* nothing, we just eat these up as part of the word */
1462 break;
1463
1464 case G_UNICODE_LOWERCASE_LETTER:
1465 case G_UNICODE_MODIFIER_LETTER:
1466 case G_UNICODE_OTHER_LETTER:
1467 case G_UNICODE_TITLECASE_LETTER:
1468 case G_UNICODE_UPPERCASE_LETTER:
1469 if (current_word_type == WordLetters)
1470 {
1471 /* Japanese special cases for ending the word */
1472 if (JAPANESE (last_word_letter) ||
1473 JAPANESE (wc))
1474 {
1475 if ((HIRAGANA (last_word_letter) &&
1476 !HIRAGANA (wc)) ||
1477 (KATAKANA (last_word_letter) &&
1478 !(KATAKANA (wc) || HIRAGANA (wc))) ||
1479 (KANJI (last_word_letter) &&
1480 !(HIRAGANA (wc) || KANJI (wc))) ||
1481 (JAPANESE (last_word_letter) &&
1482 !JAPANESE (wc)) ||
1483 (!JAPANESE (last_word_letter) &&
1484 JAPANESE (wc)))
1485 attrs[i].is_word_end = TRUE;
1486 }
1487 }
1488 last_word_letter = wc;
1489 break;
1490
1491 case G_UNICODE_DECIMAL_NUMBER:
1492 case G_UNICODE_LETTER_NUMBER:
1493 case G_UNICODE_OTHER_NUMBER:
1494 last_word_letter = wc;
1495 break;
1496
1497 default:
1498 /* Punctuation, control/format chars, etc. all end a word. */
1499 attrs[i].is_word_end = TRUE;
1500 current_word_type = WordNone;
1501 break;
1502 }
1503 }
1504 else
1505 {
1506 /* Check for a word start */
1507 switch ((int) type)
1508 {
1509 case G_UNICODE_LOWERCASE_LETTER:
1510 case G_UNICODE_MODIFIER_LETTER:
1511 case G_UNICODE_OTHER_LETTER:
1512 case G_UNICODE_TITLECASE_LETTER:
1513 case G_UNICODE_UPPERCASE_LETTER:
1514 current_word_type = WordLetters;
1515 last_word_letter = wc;
1516 attrs[i].is_word_start = TRUE;
1517 break;
1518
1519 case G_UNICODE_DECIMAL_NUMBER:
1520 case G_UNICODE_LETTER_NUMBER:
1521 case G_UNICODE_OTHER_NUMBER:
1522 current_word_type = WordNumbers;
1523 last_word_letter = wc;
1524 attrs[i].is_word_start = TRUE;
1525 break;
1526
1527 default:
1528 /* No word here */
1529 break;
1530 }
1531 }
1532
1533 /* ---- Sentence breaks ---- */
1534 {
1535
1536 /* default to not a sentence start/end */
1537 attrs[i].is_sentence_start = FALSE;
1538 attrs[i].is_sentence_end = FALSE;
1539
1540 /* maybe start sentence */
1541 if (last_sentence_start == -1 && !is_sentence_boundary)
1542 last_sentence_start = i - 1;
1543
1544 /* remember last non space character position */
1545 if (i > 0 && !attrs[i - 1].is_white)
1546 last_non_space = i;
1547
1548 /* meets sentence end, mark both sentence start and end */
1549 if (last_sentence_start != -1 && is_sentence_boundary) {
1550 if (last_non_space >= last_sentence_start) {
1551 attrs[last_sentence_start].is_sentence_start = TRUE;
1552 attrs[last_non_space].is_sentence_end = TRUE;
1553 }
1554
1555 last_sentence_start = -1;
1556 last_non_space = -1;
1557 }
1558
1559 /* meets space character, move sentence start */
1560 if (last_sentence_start != -1 &&
1561 last_sentence_start == i - 1 &&
1562 attrs[i - 1].is_white) {
1563 last_sentence_start++;
1564 }
1565 }
1566
1567 /* --- Hyphens --- */
1568
1569 {
1570 gboolean insert_hyphens;
1571 gboolean space_or_hyphen = FALSE;
1572
1573 attrs[i].break_inserts_hyphen = FALSE;
1574 attrs[i].break_removes_preceding = FALSE;
1575
1576 switch ((int)prev_script)
1577 {
1578 case PANGO_SCRIPT_COMMON:
1579 insert_hyphens = prev_wc == 0x00ad;
1580 break;
1581 case PANGO_SCRIPT_HAN:
1582 case PANGO_SCRIPT_HANGUL:
1583 case PANGO_SCRIPT_HIRAGANA:
1584 case PANGO_SCRIPT_KATAKANA:
1585 insert_hyphens = FALSE;
1586 break;
1587 default:
1588 insert_hyphens = TRUE;
1589 break;
1590 }
1591
1592 switch ((int)type)
1593 {
1594 case G_UNICODE_SPACE_SEPARATOR:
1595 case G_UNICODE_LINE_SEPARATOR:
1596 case G_UNICODE_PARAGRAPH_SEPARATOR:
1597 space_or_hyphen = TRUE;
1598 break;
1599 case G_UNICODE_CONTROL:
1600 if (wc == '\t' || wc == '\n' || wc == '\r' || wc == '\f')
1601 space_or_hyphen = TRUE;
1602 break;
1603 default:
1604 break;
1605 }
1606
1607 if (!space_or_hyphen)
1608 {
1609 if (wc == '-' || /* Hyphen-minus */
1610 wc == 0x058a || /* Armenian hyphen */
1611 wc == 0x1400 || /* Canadian syllabics hyphen */
1612 wc == 0x1806 || /* Mongolian todo hyphen */
1613 wc == 0x2010 || /* Hyphen */
1614 wc == 0x2e17 || /* Double oblique hyphen */
1615 wc == 0x2e40 || /* Double hyphen */
1616 wc == 0x30a0 || /* Katakana-Hiragana double hyphen */
1617 wc == 0xfe63 || /* Small hyphen-minus */
1618 wc == 0xff0d) /* Fullwidth hyphen-minus */
1619 space_or_hyphen = TRUE;
1620 }
1621
1622 if (attrs[i].is_word_boundary)
1623 attrs[i].break_inserts_hyphen = FALSE;
1624 else if (prev_space_or_hyphen)
1625 attrs[i].break_inserts_hyphen = FALSE;
1626 else if (space_or_hyphen)
1627 attrs[i].break_inserts_hyphen = FALSE;
1628 else
1629 attrs[i].break_inserts_hyphen = insert_hyphens;
1630
1631 if (prev_wc == 0x2027) /* Hyphenation point */
1632 {
1633 attrs[i].break_inserts_hyphen = TRUE;
1634 attrs[i].break_removes_preceding = TRUE;
1635 }
1636
1637 prev_space_or_hyphen = space_or_hyphen;
1638 }
1639
1640 prev_wc = wc;
1641 prev_script = script;
1642
1643 /* wc might not be a valid Unicode base character, but really all we
1644 * need to know is the last non-combining character */
1645 if (type != G_UNICODE_SPACING_MARK &&
1646 type != G_UNICODE_ENCLOSING_MARK &&
1647 type != G_UNICODE_NON_SPACING_MARK)
1648 base_character = wc;
1649 }
1650
1651 i--;
1652
1653 attrs[0].is_cursor_position = TRUE; /* Rule GB1 */
1654 attrs[i].is_cursor_position = TRUE; /* Rule GB2 */
1655
1656 attrs[0].is_word_boundary = TRUE; /* Rule WB1 */
1657 attrs[i].is_word_boundary = TRUE; /* Rule WB2 */
1658
1659 attrs[0].is_line_break = FALSE; /* Rule LB2 */
1660 attrs[i].is_line_break = TRUE; /* Rule LB3 */
1661 attrs[i].is_mandatory_break = TRUE; /* Rule LB3 */
1662}
1663
1664/* }}} */
1665/* {{{ Tailoring */
1666/* {{{ Script-specific tailoring */
1667
1668#include "break-arabic.c"
1669#include "break-indic.c"
1670#include "break-thai.c"
1671#include "break-latin.c"
1672
1673static gboolean
1674break_script (const char *item_text,
1675 unsigned int item_length,
1676 const PangoAnalysis *analysis,
1677 PangoLogAttr *attrs,
1678 int attrs_len)
1679{
1680 switch (analysis->script)
1681 {
1682 case PANGO_SCRIPT_ARABIC:
1683 break_arabic (text: item_text, length: item_length, analysis, attrs, attrs_len);
1684 break;
1685
1686 case PANGO_SCRIPT_DEVANAGARI:
1687 case PANGO_SCRIPT_BENGALI:
1688 case PANGO_SCRIPT_GURMUKHI:
1689 case PANGO_SCRIPT_GUJARATI:
1690 case PANGO_SCRIPT_ORIYA:
1691 case PANGO_SCRIPT_TAMIL:
1692 case PANGO_SCRIPT_TELUGU:
1693 case PANGO_SCRIPT_KANNADA:
1694 case PANGO_SCRIPT_MALAYALAM:
1695 case PANGO_SCRIPT_SINHALA:
1696 break_indic (text: item_text, length: item_length, analysis, attrs, attrs_len);
1697 break;
1698
1699 case PANGO_SCRIPT_THAI:
1700 break_thai (text: item_text, len: item_length, analysis, attrs, attrs_len);
1701 break;
1702
1703 case PANGO_SCRIPT_LATIN:
1704 break_latin (text: item_text, length: item_length, analysis, attrs, attrs_len);
1705 break;
1706
1707 default:
1708 return FALSE;
1709 }
1710
1711 return TRUE;
1712}
1713
1714/* }}} */
1715/* {{{ Attribute-based customization */
1716
1717/* We allow customizing log attrs in two ways:
1718 *
1719 * - You can directly remove breaks from a range, using allow_breaks=false.
1720 * We preserve the non-tailorable rules from UAX #14, so mandatory breaks
1721 * and breaks after ZWS remain. We also preserve break opportunities after
1722 * hyphens and visible word dividers.
1723 *
1724 * - You can tweak the segmentation by marking ranges as word or sentence.
1725 * When doing so, we split adjacent segments to preserve alternating
1726 * starts and ends. We add a line break opportunity before each word that
1727 * is created in this way, and we remove line break opportunities inside
1728 * the word in the same way as for a range marked as allow_breaks=false,
1729 * except that we don't remove char break opportunities.
1730 *
 * Note that UAX #29 does not guarantee that words fall neatly into
1732 * sentences, so we don't do extra work to enforce that.
1733 */
1734
1735static void
1736remove_breaks_from_range (const char *text,
1737 int start,
1738 PangoLogAttr *log_attrs,
1739 int start_pos,
1740 int end_pos)
1741{
1742 int pos;
1743 const char *p;
1744 gunichar ch;
1745 int bt;
1746 gboolean after_zws;
1747 gboolean after_hyphen;
1748
1749 /* Assume our range doesn't start after a hyphen or in a zws sequence */
1750 after_zws = FALSE;
1751 after_hyphen = FALSE;
1752 for (pos = start_pos + 1, p = g_utf8_next_char (text + start);
1753 pos < end_pos;
1754 pos++, p = g_utf8_next_char (p))
1755 {
1756 /* Mandatory breaks aren't tailorable */
1757 if (!log_attrs[pos].is_mandatory_break)
1758 log_attrs[pos].is_line_break = FALSE;
1759
1760 ch = g_utf8_get_char (p);
1761 bt = g_unichar_break_type (c: ch);
1762
1763 /* Hyphens and visible word dividers */
1764 if (after_hyphen)
1765 log_attrs[pos].is_line_break = TRUE;
1766
1767 after_hyphen = ch == 0x00ad || /* Soft Hyphen */
1768 ch == 0x05A0 || ch == 0x2010 || /* Breaking Hyphens */
1769 ch == 0x2012 || ch == 0x2013 ||
1770 ch == 0x05BE || ch == 0x0F0B || /* Visible word dividers */
1771 ch == 0x1361 || ch == 0x17D8 ||
1772 ch == 0x17DA || ch == 0x2027 ||
1773 ch == 0x007C;
1774
1775 /* ZWS sequence */
1776 if (after_zws && bt != G_UNICODE_BREAK_SPACE)
1777 log_attrs[pos].is_line_break = TRUE;
1778
1779 after_zws = bt == G_UNICODE_BREAK_ZERO_WIDTH_SPACE ||
1780 (bt == G_UNICODE_BREAK_SPACE && after_zws);
1781 }
1782}
1783
1784static gboolean
1785handle_allow_breaks (const char *text,
1786 int length,
1787 PangoAttrList *attrs,
1788 int offset,
1789 PangoLogAttr *log_attrs,
1790 int log_attrs_len)
1791{
1792 PangoAttrIterator iter;
1793 gboolean tailored = FALSE;
1794
1795 _pango_attr_list_get_iterator (list: attrs, iterator: &iter);
1796
1797 do
1798 {
1799 const PangoAttribute *attr = pango_attr_iterator_get (iterator: &iter, type: PANGO_ATTR_ALLOW_BREAKS);
1800
1801 if (!attr)
1802 continue;
1803
1804 if (!((PangoAttrInt*)attr)->value)
1805 {
1806 int start, end;
1807 int start_pos, end_pos;
1808 int pos;
1809
1810 start = attr->start_index;
1811 end = attr->end_index;
1812 if (start < offset)
1813 start_pos = 0;
1814 else
1815 start_pos = g_utf8_pointer_to_offset (str: text, pos: text + start - offset);
1816 if (end >= offset + length)
1817 end_pos = log_attrs_len;
1818 else
1819 end_pos = g_utf8_pointer_to_offset (str: text, pos: text + end - offset);
1820
1821 for (pos = start_pos + 1; pos < end_pos; pos++)
1822 log_attrs[pos].is_char_break = FALSE;
1823
1824 remove_breaks_from_range (text, MAX (start - offset, 0), log_attrs, start_pos, end_pos);
1825
1826 tailored = TRUE;
1827 }
1828 }
1829 while (pango_attr_iterator_next (iterator: &iter));
1830
1831 _pango_attr_iterator_destroy (iterator: &iter);
1832
1833 return tailored;
1834}
1835
1836
1837static gboolean
1838handle_words (const char *text,
1839 int length,
1840 PangoAttrList *attrs,
1841 int offset,
1842 PangoLogAttr *log_attrs,
1843 int log_attrs_len)
1844{
1845 PangoAttrIterator iter;
1846 gboolean tailored = FALSE;
1847
1848 _pango_attr_list_get_iterator (list: attrs, iterator: &iter);
1849
1850 do
1851 {
1852 const PangoAttribute *attr = pango_attr_iterator_get (iterator: &iter, type: PANGO_ATTR_WORD);
1853 int start, end;
1854 int start_pos, end_pos;
1855 int pos;
1856
1857 if (!attr)
1858 continue;
1859
1860 start = attr->start_index;
1861 end = attr->end_index;
1862 if (start < offset)
1863 start_pos = 0;
1864 else
1865 start_pos = g_utf8_pointer_to_offset (str: text, pos: text + start - offset);
1866 if (end >= offset + length)
1867 end_pos = log_attrs_len;
1868 else
1869 end_pos = g_utf8_pointer_to_offset (str: text, pos: text + end - offset);
1870
1871 for (pos = start_pos + 1; pos < end_pos; pos++)
1872 {
1873 log_attrs[pos].is_word_start = FALSE;
1874 log_attrs[pos].is_word_end = FALSE;
1875 log_attrs[pos].is_word_boundary = FALSE;
1876 }
1877
1878 remove_breaks_from_range (text, MAX (start - offset, 0), log_attrs,
1879 start_pos, end_pos);
1880
1881 if (start >= offset)
1882 {
1883 gboolean in_word = FALSE;
1884 for (pos = start_pos; pos >= 0; pos--)
1885 {
1886 if (log_attrs[pos].is_word_end)
1887 {
1888 in_word = pos == start_pos;
1889 break;
1890 }
1891 if (pos < start_pos && log_attrs[pos].is_word_start)
1892 {
1893 in_word = TRUE;
1894 break;
1895 }
1896 }
1897 log_attrs[start_pos].is_word_start = TRUE;
1898 log_attrs[start_pos].is_word_end = in_word;
1899 log_attrs[start_pos].is_word_boundary = TRUE;
1900
1901 /* Allow line breaks before words */
1902 if (start_pos > 0)
1903 log_attrs[start_pos].is_line_break = TRUE;
1904
1905 tailored = TRUE;
1906 }
1907
1908 if (end < offset + length)
1909 {
1910 gboolean in_word = FALSE;
1911 for (pos = end_pos; pos < log_attrs_len; pos++)
1912 {
1913 if (log_attrs[pos].is_word_start)
1914 {
1915 in_word = pos == end_pos;
1916 break;
1917 }
1918 if (pos > end_pos && log_attrs[pos].is_word_end)
1919 {
1920 in_word = TRUE;
1921 break;
1922 }
1923 }
1924 log_attrs[end_pos].is_word_start = in_word;
1925 log_attrs[end_pos].is_word_end = TRUE;
1926 log_attrs[end_pos].is_word_boundary = TRUE;
1927
1928 /* Allow line breaks before words */
1929 if (in_word)
1930 log_attrs[end_pos].is_line_break = TRUE;
1931
1932 tailored = TRUE;
1933 }
1934 }
1935 while (pango_attr_iterator_next (iterator: &iter));
1936
1937 _pango_attr_iterator_destroy (iterator: &iter);
1938
1939 return tailored;
1940}
1941
1942static gboolean
1943handle_sentences (const char *text,
1944 int length,
1945 PangoAttrList *attrs,
1946 int offset,
1947 PangoLogAttr *log_attrs,
1948 int log_attrs_len)
1949{
1950 PangoAttrIterator iter;
1951 gboolean tailored = FALSE;
1952
1953 _pango_attr_list_get_iterator (list: attrs, iterator: &iter);
1954
1955 do
1956 {
1957 const PangoAttribute *attr = pango_attr_iterator_get (iterator: &iter, type: PANGO_ATTR_SENTENCE);
1958 int start, end;
1959 int start_pos, end_pos;
1960 int pos;
1961
1962 if (!attr)
1963 continue;
1964
1965 start = attr->start_index;
1966 end = attr->end_index;
1967 if (start < offset)
1968 start_pos = 0;
1969 else
1970 start_pos = g_utf8_pointer_to_offset (str: text, pos: text + start - offset);
1971 if (end >= offset + length)
1972 end_pos = log_attrs_len;
1973 else
1974 end_pos = g_utf8_pointer_to_offset (str: text, pos: text + end - offset);
1975
1976 for (pos = start_pos + 1; pos < end_pos; pos++)
1977 {
1978 log_attrs[pos].is_sentence_start = FALSE;
1979 log_attrs[pos].is_sentence_end = FALSE;
1980 log_attrs[pos].is_sentence_boundary = FALSE;
1981
1982 tailored = TRUE;
1983 }
1984 if (start >= offset)
1985 {
1986 gboolean in_sentence = FALSE;
1987 for (pos = start_pos - 1; pos >= 0; pos--)
1988 {
1989 if (log_attrs[pos].is_sentence_end)
1990 break;
1991 if (log_attrs[pos].is_sentence_start)
1992 {
1993 in_sentence = TRUE;
1994 break;
1995 }
1996 }
1997 log_attrs[start_pos].is_sentence_start = TRUE;
1998 log_attrs[start_pos].is_sentence_end = in_sentence;
1999 log_attrs[start_pos].is_sentence_boundary = TRUE;
2000
2001 tailored = TRUE;
2002 }
2003 if (end < offset + length)
2004 {
2005 gboolean in_sentence = FALSE;
2006 for (pos = end_pos + 1; end_pos < log_attrs_len; pos++)
2007 {
2008 if (log_attrs[pos].is_sentence_start)
2009 break;
2010 if (log_attrs[pos].is_sentence_end)
2011 {
2012 in_sentence = TRUE;
2013 break;
2014 }
2015 }
2016 log_attrs[end_pos].is_sentence_start = in_sentence;
2017 log_attrs[end_pos].is_sentence_end = TRUE;
2018 log_attrs[end_pos].is_sentence_boundary = TRUE;
2019
2020 tailored = TRUE;
2021 }
2022 }
2023 while (pango_attr_iterator_next (iterator: &iter));
2024
2025 _pango_attr_iterator_destroy (iterator: &iter);
2026
2027 return tailored;
2028}
2029
2030static gboolean
2031handle_hyphens (const char *text,
2032 int length,
2033 PangoAttrList *attrs,
2034 int offset,
2035 PangoLogAttr *log_attrs,
2036 int log_attrs_len)
2037{
2038 PangoAttrIterator iter;
2039 gboolean tailored = FALSE;
2040
2041 _pango_attr_list_get_iterator (list: attrs, iterator: &iter);
2042
2043 do {
2044 const PangoAttribute *attr = pango_attr_iterator_get (iterator: &iter, type: PANGO_ATTR_INSERT_HYPHENS);
2045
2046 if (attr && ((PangoAttrInt*)attr)->value == 0)
2047 {
2048 int start, end;
2049 int start_pos, end_pos;
2050 int pos;
2051
2052 pango_attr_iterator_range (iterator: &iter, start: &start, end: &end);
2053 if (start < offset)
2054 start_pos = 0;
2055 else
2056 start_pos = g_utf8_pointer_to_offset (str: text, pos: text + start - offset);
2057 if (end >= offset + length)
2058 end_pos = log_attrs_len;
2059 else
2060 end_pos = g_utf8_pointer_to_offset (str: text, pos: text + end - offset);
2061
2062 for (pos = start_pos + 1; pos < end_pos; pos++)
2063 {
2064 if (!log_attrs[pos].break_removes_preceding)
2065 {
2066 log_attrs[pos].break_inserts_hyphen = FALSE;
2067
2068 tailored = TRUE;
2069 }
2070 }
2071 }
2072 } while (pango_attr_iterator_next (iterator: &iter));
2073
2074 _pango_attr_iterator_destroy (iterator: &iter);
2075
2076 return tailored;
2077}
2078
2079static gboolean
2080break_attrs (const char *text,
2081 int length,
2082 GSList *attributes,
2083 int offset,
2084 PangoLogAttr *log_attrs,
2085 int log_attrs_len)
2086{
2087 PangoAttrList allow_breaks;
2088 PangoAttrList words;
2089 PangoAttrList sentences;
2090 PangoAttrList hyphens;
2091 GSList *l;
2092 gboolean tailored = FALSE;
2093
2094 _pango_attr_list_init (list: &allow_breaks);
2095 _pango_attr_list_init (list: &words);
2096 _pango_attr_list_init (list: &sentences);
2097 _pango_attr_list_init (list: &hyphens);
2098
2099 for (l = attributes; l; l = l->next)
2100 {
2101 PangoAttribute *attr = l->data;
2102
2103 if (attr->klass->type == PANGO_ATTR_ALLOW_BREAKS)
2104 pango_attr_list_insert (list: &allow_breaks, attr: pango_attribute_copy (attr));
2105 else if (attr->klass->type == PANGO_ATTR_WORD)
2106 pango_attr_list_insert (list: &words, attr: pango_attribute_copy (attr));
2107 else if (attr->klass->type == PANGO_ATTR_SENTENCE)
2108 pango_attr_list_insert (list: &sentences, attr: pango_attribute_copy (attr));
2109 else if (attr->klass->type == PANGO_ATTR_INSERT_HYPHENS)
2110 pango_attr_list_insert (list: &hyphens, attr: pango_attribute_copy (attr));
2111 }
2112
2113 tailored |= handle_words (text, length, attrs: &words, offset,
2114 log_attrs, log_attrs_len);
2115
2116 tailored |= handle_sentences (text, length, attrs: &words, offset,
2117 log_attrs, log_attrs_len);
2118
2119 tailored |= handle_hyphens (text, length, attrs: &hyphens, offset,
2120 log_attrs, log_attrs_len);
2121
2122 tailored |= handle_allow_breaks (text, length, attrs: &allow_breaks, offset,
2123 log_attrs, log_attrs_len);
2124
2125 _pango_attr_list_destroy (list: &allow_breaks);
2126 _pango_attr_list_destroy (list: &words);
2127 _pango_attr_list_destroy (list: &sentences);
2128 _pango_attr_list_destroy (list: &hyphens);
2129
2130 return tailored;
2131}
2132
2133/* }}} */
2134
2135static gboolean
2136tailor_break (const char *text,
2137 int length,
2138 PangoAnalysis *analysis,
2139 int item_offset,
2140 PangoLogAttr *attrs,
2141 int attrs_len)
2142{
2143 gboolean res;
2144
2145 if (length < 0)
2146 length = strlen (s: text);
2147 else if (text == NULL)
2148 text = "";
2149
2150 res = break_script (item_text: text, item_length: length, analysis, attrs, attrs_len);
2151
2152 if (item_offset >= 0 && analysis->extra_attrs)
2153 res |= break_attrs (text, length, attributes: analysis->extra_attrs, offset: item_offset, log_attrs: attrs, log_attrs_len: attrs_len);
2154
2155 return res;
2156}
2157
2158/* }}} */
2159/* {{{ Public API */
2160
2161/**
2162 * pango_default_break:
2163 * @text: text to break. Must be valid UTF-8
2164 * @length: length of text in bytes (may be -1 if @text is nul-terminated)
2165 * @analysis: (nullable): a `PangoAnalysis` structure for the @text
2166 * @attrs: logical attributes to fill in
2167 * @attrs_len: size of the array passed as @attrs
2168 *
2169 * This is the default break algorithm.
2170 *
2171 * It applies rules from the [Unicode Line Breaking Algorithm](http://www.unicode.org/unicode/reports/tr14/)
2172 * without language-specific tailoring, therefore the @analyis argument is unused
2173 * and can be %NULL.
2174 *
2175 * See [func@Pango.tailor_break] for language-specific breaks.
2176 *
2177 * See [func@Pango.attr_break] for attribute-based customization.
2178 */
2179void
2180pango_default_break (const char *text,
2181 int length,
2182 PangoAnalysis *analysis G_GNUC_UNUSED,
2183 PangoLogAttr *attrs,
2184 int attrs_len G_GNUC_UNUSED)
2185{
2186 PangoLogAttr before = *attrs;
2187
2188 default_break (text, length, analysis, attrs, attrs_len);
2189
2190 attrs->is_line_break |= before.is_line_break;
2191 attrs->is_mandatory_break |= before.is_mandatory_break;
2192 attrs->is_cursor_position |= before.is_cursor_position;
2193}
2194
2195/**
2196 * pango_break:
2197 * @text: the text to process. Must be valid UTF-8
2198 * @length: length of @text in bytes (may be -1 if @text is nul-terminated)
2199 * @analysis: `PangoAnalysis` structure for @text
2200 * @attrs: (array length=attrs_len): an array to store character information in
2201 * @attrs_len: size of the array passed as @attrs
2202 *
2203 * Determines possible line, word, and character breaks
2204 * for a string of Unicode text with a single analysis.
2205 *
2206 * For most purposes you may want to use [func@Pango.get_log_attrs].
2207 *
2208 * Deprecated: 1.44: Use [func@Pango.default_break],
2209 * [func@Pango.tailor_break] and [func@Pango.attr_break].
2210 */
2211void
2212pango_break (const char *text,
2213 gint length,
2214 PangoAnalysis *analysis,
2215 PangoLogAttr *attrs,
2216 int attrs_len)
2217{
2218 g_return_if_fail (analysis != NULL);
2219 g_return_if_fail (attrs != NULL);
2220
2221 default_break (text, length, analysis, attrs, attrs_len);
2222 tailor_break (text, length, analysis, item_offset: -1, attrs, attrs_len);
2223}
2224
2225/**
2226 * pango_tailor_break:
2227 * @text: text to process. Must be valid UTF-8
2228 * @length: length in bytes of @text
2229 * @analysis: `PangoAnalysis` for @text
2230 * @offset: Byte offset of @text from the beginning of the
2231 * paragraph, or -1 to ignore attributes from @analysis
2232 * @attrs: (array length=attrs_len): array with one `PangoLogAttr`
2233 * per character in @text, plus one extra, to be filled in
2234 * @attrs_len: length of @attrs array
2235 *
2236 * Apply language-specific tailoring to the breaks in @attrs.
2237 *
2238 * The line breaks are assumed to have been produced by [func@Pango.default_break].
2239 *
2240 * If @offset is not -1, it is used to apply attributes from @analysis that are
2241 * relevant to line breaking.
2242 *
2243 * Note that it is better to pass -1 for @offset and use [func@Pango.attr_break]
2244 * to apply attributes to the whole paragraph.
2245 *
2246 * Since: 1.44
2247 */
2248void
2249pango_tailor_break (const char *text,
2250 int length,
2251 PangoAnalysis *analysis,
2252 int offset,
2253 PangoLogAttr *attrs,
2254 int attrs_len)
2255{
2256 PangoLogAttr *start = attrs;
2257 PangoLogAttr attr_before = *start;
2258
2259 if (tailor_break (text, length, analysis, item_offset: offset, attrs, attrs_len))
2260 {
2261 /* if tailored, we enforce some of the attrs from before
2262 * tailoring at the boundary
2263 */
2264
2265 start->backspace_deletes_character = attr_before.backspace_deletes_character;
2266
2267 start->is_line_break |= attr_before.is_line_break;
2268 start->is_mandatory_break |= attr_before.is_mandatory_break;
2269 start->is_cursor_position |= attr_before.is_cursor_position;
2270 }
2271}
2272
2273/**
2274 * pango_attr_break:
2275 * @text: text to break. Must be valid UTF-8
2276 * @length: length of text in bytes (may be -1 if @text is nul-terminated)
2277 * @attr_list: `PangoAttrList` to apply
2278 * @offset: Byte offset of @text from the beginning of the paragraph
2279 * @attrs: (array length=attrs_len): array with one `PangoLogAttr`
2280 * per character in @text, plus one extra, to be filled in
2281 * @attrs_len: length of @attrs array
2282 *
2283 * Apply customization from attributes to the breaks in @attrs.
2284 *
2285 * The line breaks are assumed to have been produced
2286 * by [func@Pango.default_break] and [func@Pango.tailor_break].
2287 *
2288 * Since: 1.50
2289 */
2290void
2291pango_attr_break (const char *text,
2292 int length,
2293 PangoAttrList *attr_list,
2294 int offset,
2295 PangoLogAttr *attrs,
2296 int attrs_len)
2297{
2298 PangoLogAttr *start = attrs;
2299 PangoLogAttr attr_before = *start;
2300 GSList *attributes;
2301
2302 attributes = pango_attr_list_get_attributes (list: attr_list);
2303 if (break_attrs (text, length, attributes, offset, log_attrs: attrs, log_attrs_len: attrs_len))
2304 {
2305 /* if tailored, we enforce some of the attrs from before
2306 * tailoring at the boundary
2307 */
2308
2309 start->backspace_deletes_character = attr_before.backspace_deletes_character;
2310
2311 start->is_line_break |= attr_before.is_line_break;
2312 start->is_mandatory_break |= attr_before.is_mandatory_break;
2313 start->is_cursor_position |= attr_before.is_cursor_position;
2314 }
2315
2316 g_slist_free_full (list: attributes, free_func: (GDestroyNotify)pango_attribute_destroy);
2317}
2318
2319/**
2320 * pango_get_log_attrs:
2321 * @text: text to process. Must be valid UTF-8
2322 * @length: length in bytes of @text
2323 * @level: embedding level, or -1 if unknown
2324 * @language: language tag
2325 * @attrs: (array length=attrs_len): array with one `PangoLogAttr`
2326 * per character in @text, plus one extra, to be filled in
2327 * @attrs_len: length of @attrs array
2328 *
2329 * Computes a `PangoLogAttr` for each character in @text.
2330 *
2331 * The @attrs array must have one `PangoLogAttr` for
2332 * each position in @text; if @text contains N characters,
2333 * it has N+1 positions, including the last position at the
2334 * end of the text. @text should be an entire paragraph;
2335 * logical attributes can't be computed without context
2336 * (for example you need to see spaces on either side of
2337 * a word to know the word is a word).
2338 */
2339void
2340pango_get_log_attrs (const char *text,
2341 int length,
2342 int level,
2343 PangoLanguage *language,
2344 PangoLogAttr *attrs,
2345 int attrs_len)
2346{
2347 int chars_broken;
2348 PangoAnalysis analysis = { NULL };
2349 PangoScriptIter iter;
2350
2351 g_return_if_fail (length == 0 || text != NULL);
2352 g_return_if_fail (attrs != NULL);
2353
2354 analysis.level = level;
2355 analysis.language = language;
2356
2357 pango_default_break (text, length, analysis: &analysis, attrs, attrs_len);
2358
2359 chars_broken = 0;
2360
2361 _pango_script_iter_init (iter: &iter, text, length);
2362 do
2363 {
2364 const char *run_start, *run_end;
2365 PangoScript script;
2366 int chars_in_range;
2367
2368 pango_script_iter_get_range (iter: &iter, start: &run_start, end: &run_end, script: &script);
2369 analysis.script = script;
2370
2371 chars_in_range = pango_utf8_strlen (p: run_start, max: run_end - run_start);
2372
2373 pango_tailor_break (text: run_start,
2374 length: run_end - run_start,
2375 analysis: &analysis,
2376 offset: -1,
2377 attrs: attrs + chars_broken,
2378 attrs_len: chars_in_range + 1);
2379
2380 chars_broken += chars_in_range;
2381 }
2382 while (pango_script_iter_next (iter: &iter));
2383 _pango_script_iter_fini (iter: &iter);
2384
2385 if (chars_broken + 1 > attrs_len)
2386 g_warning ("pango_get_log_attrs: attrs_len should have been at least %d, but was %d. Expect corrupted memory.",
2387 chars_broken + 1,
2388 attrs_len);
2389}
2390
2391/* }}} */
2392
2393/* vim:set foldmethod=marker expandtab: */
2394

source code of gtk/subprojects/pango/pango/break.c