1 | /* Pango |
2 | * break.c: |
3 | * |
4 | * Copyright (C) 1999 Red Hat Software |
5 | * |
6 | * This library is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Library General Public |
8 | * License as published by the Free Software Foundation; either |
9 | * version 2 of the License, or (at your option) any later version. |
10 | * |
11 | * This library is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Library General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Library General Public |
17 | * License along with this library; if not, write to the |
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
19 | * Boston, MA 02111-1307, USA. |
20 | */ |
21 | |
22 | #include "config.h" |
23 | |
24 | #include "pango-break.h" |
25 | #include "pango-script-private.h" |
26 | #include "pango-emoji-private.h" |
27 | #include "pango-attributes-private.h" |
28 | #include "pango-break-table.h" |
29 | #include "pango-impl-utils.h" |
30 | #include <string.h> |
31 | |
32 | /* {{{ Unicode line breaking and segmentation */ |
33 | |
34 | #define PARAGRAPH_SEPARATOR 0x2029 |
35 | |
36 | /* See http://www.unicode.org/unicode/reports/tr14/ if you hope |
37 | * to understand the line breaking code. |
38 | */ |
39 | |
/* Line-break opportunity classes used by the line breaking code.
 *
 * TR 14 describes two further classes, "indirect break opportunity for
 * combining marks following a space" and "prohibited break for combining
 * marks".  They have no enumerator here because they are handled inline
 * in the code.
 */
typedef enum
{
  BREAK_ALREADY_HANDLED = 0, /* the table was not used */
  BREAK_PROHIBITED      = 1, /* no break, even when spaces intervene */
  BREAK_IF_SPACES       = 2, /* "indirect break": only when there are spaces */
  BREAK_ALLOWED         = 3  /* "direct break": can always break here */
} BreakOpportunity;
52 | |
/* Clamp a break type to the range this file handles: any value greater
 * than G_UNICODE_BREAK_ZERO_WIDTH_JOINER is mapped to
 * G_UNICODE_BREAK_UNKNOWN, everything else is passed through unchanged.
 * The upper bound needs to be kept in sync with the break range in
 * glib/gunicode.h.  The argument may be evaluated twice.
 */
#define BREAK_TYPE_SAFE(btype) \
	 ((btype) > G_UNICODE_BREAK_ZERO_WIDTH_JOINER ? G_UNICODE_BREAK_UNKNOWN : (btype))
56 | |
57 | |
58 | /* |
59 | * Hangul Conjoining Jamo handling. |
60 | * |
61 | * The way we implement it is just a bit different from TR14, |
62 | * but produces the same results. |
63 | * The same algorithm is also used in TR29 for cluster boundaries. |
64 | * |
65 | */ |
66 | |
67 | |
/* States of the Hangul syllable system.  The first five enumerators are
 * listed next to the GUnicodeBreakType value each one stands for;
 * NO_JAMO covers every other character.
 */
typedef enum
{
  JAMO_L   = 0,	/* G_UNICODE_BREAK_HANGUL_L_JAMO */
  JAMO_V   = 1,	/* G_UNICODE_BREAK_HANGUL_V_JAMO */
  JAMO_T   = 2,	/* G_UNICODE_BREAK_HANGUL_T_JAMO */
  JAMO_LV  = 3,	/* G_UNICODE_BREAK_HANGUL_LV_SYLLABLE */
  JAMO_LVT = 4,	/* G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE */
  NO_JAMO  = 5	/* Other */
} JamoType;

/* Some Hangul syllables are encoded as single characters that behave
 * like a sequence of Jamos.  Every character is therefore described by
 * the JamoType it starts with and the JamoType it ends with, which
 * reduces JAMO_LV and JAMO_LVT to the simple Jamo types.  For example,
 * a character whose LineBreak type is G_UNICODE_BREAK_HANGUL_LV_SYLLABLE
 * has start=JAMO_L and end=JAMO_V.
 */
typedef struct _CharJamoProps
{
  JamoType start;	/* simple Jamo type the character begins with */
  JamoType end;		/* simple Jamo type the character finishes with */
} CharJamoProps;

/* Lookup table indexed by JamoType.  Each entry holds only simple
 * JamoTypes (never LV or LVT), or NO_JAMO for non-Hangul characters.
 */
static const CharJamoProps HangulJamoProps[] = {
  [JAMO_L]   = { .start = JAMO_L,  .end = JAMO_L  },
  [JAMO_V]   = { .start = JAMO_V,  .end = JAMO_V  },
  [JAMO_T]   = { .start = JAMO_T,  .end = JAMO_T  },
  [JAMO_LV]  = { .start = JAMO_L,  .end = JAMO_V  },
  [JAMO_LVT] = { .start = JAMO_L,  .end = JAMO_T  },
  [NO_JAMO]  = { .start = NO_JAMO, .end = NO_JAMO }
};
103 | |
104 | /* A character forms a syllable with the previous character if and only if: |
105 | * JamoType(this) is not NO_JAMO and: |
106 | * |
107 | * HangulJamoProps[JamoType(prev)].end and |
108 | * HangulJamoProps[JamoType(this)].start are equal, |
109 | * or the former is one less than the latter. |
110 | */ |
111 | |
/* IS_JAMO(btype): non-zero when btype lies in the range
 * G_UNICODE_BREAK_HANGUL_L_JAMO .. G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE.
 *
 * JAMO_TYPE(btype): for a Hangul break type, its offset from
 * G_UNICODE_BREAK_HANGUL_L_JAMO (this relies on JamoType listing its
 * first five values in the same order as those break types); NO_JAMO
 * for anything else.
 *
 * The argument is parenthesized at every use so that compound
 * expressions (e.g. "x & mask") are grouped as the caller wrote them.
 * Both macros still evaluate the argument more than once, so it must be
 * free of side effects.
 */
#define IS_JAMO(btype)					\
	(((btype) >= G_UNICODE_BREAK_HANGUL_L_JAMO) &&	\
	 ((btype) <= G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
#define JAMO_TYPE(btype) \
	(IS_JAMO (btype) ? ((btype) - G_UNICODE_BREAK_HANGUL_L_JAMO) : NO_JAMO)
117 | |
/* Inclusive code point range test shared by the classification macros
 * below.  The wc argument is evaluated up to twice per range.
 */
#define WC_IN_RANGE(wc, lo, hi) ((wc) >= (lo) && (wc) <= (hi))

/* Types of Japanese characters */
#define JAPANESE(wc) (WC_IN_RANGE (wc, 0x2F00, 0x30FF))
#define KANJI(wc)    (WC_IN_RANGE (wc, 0x2F00, 0x2FDF))
#define HIRAGANA(wc) (WC_IN_RANGE (wc, 0x3040, 0x309F))
#define KATAKANA(wc) (WC_IN_RANGE (wc, 0x30A0, 0x30FF))

/* Code point ranges consulted by BACKSPACE_DELETES_CHARACTER */
#define LATIN(wc)    (WC_IN_RANGE (wc, 0x0020, 0x02AF) || WC_IN_RANGE (wc, 0x1E00, 0x1EFF))
#define CYRILLIC(wc) ((WC_IN_RANGE (wc, 0x0400, 0x052F)))
#define GREEK(wc)    (WC_IN_RANGE (wc, 0x0370, 0x03FF) || WC_IN_RANGE (wc, 0x1F00, 0x1FFF))
#define KANA(wc)     (WC_IN_RANGE (wc, 0x3040, 0x30FF))
#define HANGUL(wc)   (WC_IN_RANGE (wc, 0xAC00, 0xD7A3))
#define EMOJI(wc)    (_pango_Is_Emoji_Base_Character (wc))

/* True when wc falls in none of the LATIN, CYRILLIC, GREEK, KANA or
 * HANGUL ranges and is not an emoji base character.  The tests are
 * tried in that order and stop at the first one that matches.
 */
#define BACKSPACE_DELETES_CHARACTER(wc) \
	(!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA (wc) && !HANGUL (wc) && !EMOJI (wc))
131 | |
/* Kind of word being tracked while scanning the text.
 *
 * "123foo" used to count as two words, but following UAX 29 of Unicode
 * we now do not break words between consecutive letters and numbers.
 */
typedef enum
{
  WordNone    = 0,
  WordLetters = 1,
  WordNumbers = 2
} WordType;
141 | |
142 | static void |
143 | default_break (const char *text, |
144 | int length, |
145 | PangoAnalysis *analysis G_GNUC_UNUSED, |
146 | PangoLogAttr *attrs, |
147 | int attrs_len G_GNUC_UNUSED) |
148 | { |
149 | /* The rationale for all this is in section 5.15 of the Unicode 3.0 book, |
150 | * the line breaking stuff is also in TR14 on unicode.org |
151 | */ |
152 | |
153 | /* This is a default break implementation that should work for nearly all |
154 | * languages. Language engines can override it optionally. |
155 | */ |
156 | |
157 | /* FIXME one cheesy optimization here would be to memset attrs to 0 |
158 | * before we start, and then never assign %FALSE to anything |
159 | */ |
160 | |
161 | const gchar *next; |
162 | gint i; |
163 | |
164 | gunichar prev_wc; |
165 | gunichar next_wc; |
166 | |
167 | JamoType prev_jamo; |
168 | |
169 | GUnicodeBreakType next_break_type; |
170 | GUnicodeBreakType prev_break_type; |
171 | GUnicodeBreakType prev_prev_break_type; |
172 | |
173 | PangoScript prev_script; |
174 | |
175 | /* See Grapheme_Cluster_Break Property Values table of UAX#29 */ |
176 | typedef enum |
177 | { |
178 | GB_Other, |
179 | GB_ControlCRLF, |
180 | GB_Extend, |
181 | GB_ZWJ, |
182 | GB_Prepend, |
183 | GB_SpacingMark, |
184 | GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */ |
185 | /* Use state machine to handle emoji sequence */ |
186 | /* Rule GB12 and GB13 */ |
187 | GB_RI_Odd, /* Meets odd number of RI */ |
188 | GB_RI_Even, /* Meets even number of RI */ |
189 | } GraphemeBreakType; |
190 | GraphemeBreakType prev_GB_type = GB_Other; |
191 | gboolean met_Extended_Pictographic = FALSE; |
192 | |
193 | /* See Word_Break Property Values table of UAX#29 */ |
194 | typedef enum |
195 | { |
196 | WB_Other, |
197 | WB_NewlineCRLF, |
198 | WB_ExtendFormat, |
199 | WB_Katakana, |
200 | WB_Hebrew_Letter, |
201 | WB_ALetter, |
202 | WB_MidNumLet, |
203 | WB_MidLetter, |
204 | WB_MidNum, |
205 | WB_Numeric, |
206 | WB_ExtendNumLet, |
207 | WB_RI_Odd, |
208 | WB_RI_Even, |
209 | WB_WSegSpace, |
210 | } WordBreakType; |
211 | WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other; |
212 | gint prev_WB_i = -1; |
213 | |
214 | /* See Sentence_Break Property Values table of UAX#29 */ |
215 | typedef enum |
216 | { |
217 | SB_Other, |
218 | SB_ExtendFormat, |
219 | SB_ParaSep, |
220 | SB_Sp, |
221 | SB_Lower, |
222 | SB_Upper, |
223 | SB_OLetter, |
224 | SB_Numeric, |
225 | SB_ATerm, |
226 | SB_SContinue, |
227 | SB_STerm, |
228 | SB_Close, |
229 | /* Rules SB8 and SB8a */ |
230 | SB_ATerm_Close_Sp, |
231 | SB_STerm_Close_Sp, |
232 | } SentenceBreakType; |
233 | SentenceBreakType prev_prev_SB_type = SB_Other, prev_SB_type = SB_Other; |
234 | gint prev_SB_i = -1; |
235 | |
236 | /* Rule LB25 with Example 7 of Customization */ |
237 | typedef enum |
238 | { |
239 | LB_Other, |
240 | LB_Numeric, |
241 | LB_Numeric_Close, |
242 | LB_RI_Odd, |
243 | LB_RI_Even, |
244 | } LineBreakType; |
245 | LineBreakType prev_LB_type = LB_Other; |
246 | |
247 | WordType current_word_type = WordNone; |
248 | gunichar last_word_letter = 0; |
249 | gunichar base_character = 0; |
250 | |
251 | gint last_sentence_start = -1; |
252 | gint last_non_space = -1; |
253 | |
254 | gboolean prev_space_or_hyphen; |
255 | |
256 | gboolean almost_done = FALSE; |
257 | gboolean done = FALSE; |
258 | |
259 | g_return_if_fail (length == 0 || text != NULL); |
260 | g_return_if_fail (attrs != NULL); |
261 | |
262 | next = text; |
263 | |
264 | prev_break_type = G_UNICODE_BREAK_UNKNOWN; |
265 | prev_prev_break_type = G_UNICODE_BREAK_UNKNOWN; |
266 | prev_wc = 0; |
267 | prev_script = PANGO_SCRIPT_COMMON; |
268 | prev_jamo = NO_JAMO; |
269 | prev_space_or_hyphen = FALSE; |
270 | |
271 | if (length == 0 || *text == '\0') |
272 | { |
273 | next_wc = PARAGRAPH_SEPARATOR; |
274 | almost_done = TRUE; |
275 | } |
276 | else |
277 | next_wc = g_utf8_get_char (p: next); |
278 | |
279 | next_break_type = g_unichar_break_type (c: next_wc); |
280 | next_break_type = BREAK_TYPE_SAFE (next_break_type); |
281 | |
282 | for (i = 0; !done ; i++) |
283 | { |
284 | GUnicodeType type; |
285 | gunichar wc; |
286 | GUnicodeBreakType break_type; |
287 | GUnicodeBreakType row_break_type; |
288 | BreakOpportunity break_op; |
289 | JamoType jamo; |
290 | gboolean makes_hangul_syllable; |
291 | |
292 | /* UAX#29 boundaries */ |
293 | gboolean is_grapheme_boundary; |
294 | gboolean is_word_boundary; |
295 | gboolean is_sentence_boundary; |
296 | |
297 | /* Emoji extended pictographics */ |
298 | gboolean is_Extended_Pictographic; |
299 | |
300 | PangoScript script; |
301 | |
302 | wc = next_wc; |
303 | break_type = next_break_type; |
304 | |
305 | if (almost_done) |
306 | { |
307 | /* |
308 | * If we have already reached the end of @text g_utf8_next_char() |
309 | * may not increment next |
310 | */ |
311 | next_wc = 0; |
312 | next_break_type = G_UNICODE_BREAK_UNKNOWN; |
313 | done = TRUE; |
314 | } |
315 | else |
316 | { |
317 | next = g_utf8_next_char (next); |
318 | |
319 | if ((length >= 0 && next >= text + length) || *next == '\0') |
320 | { |
321 | /* This is how we fill in the last element (end position) of the |
322 | * attr array - assume there's a paragraph separators off the end |
323 | * of @text. |
324 | */ |
325 | next_wc = PARAGRAPH_SEPARATOR; |
326 | almost_done = TRUE; |
327 | } |
328 | else |
329 | next_wc = g_utf8_get_char (p: next); |
330 | |
331 | next_break_type = g_unichar_break_type (c: next_wc); |
332 | next_break_type = BREAK_TYPE_SAFE (next_break_type); |
333 | } |
334 | |
335 | type = g_unichar_type (c: wc); |
336 | jamo = JAMO_TYPE (break_type); |
337 | |
338 | /* Determine wheter this forms a Hangul syllable with prev. */ |
339 | if (jamo == NO_JAMO) |
340 | makes_hangul_syllable = FALSE; |
341 | else |
342 | { |
343 | JamoType prev_end = HangulJamoProps[prev_jamo].end ; |
344 | JamoType this_start = HangulJamoProps[ jamo].start; |
345 | |
346 | /* See comments before IS_JAMO */ |
347 | makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start); |
348 | } |
349 | |
350 | switch ((int)type) |
351 | { |
352 | case G_UNICODE_SPACE_SEPARATOR: |
353 | case G_UNICODE_LINE_SEPARATOR: |
354 | case G_UNICODE_PARAGRAPH_SEPARATOR: |
355 | attrs[i].is_white = TRUE; |
356 | break; |
357 | case G_UNICODE_CONTROL: |
358 | if (wc == '\t' || wc == '\n' || wc == '\r' || wc == '\f') |
359 | attrs[i].is_white = TRUE; |
360 | else |
361 | attrs[i].is_white = FALSE; |
362 | break; |
363 | default: |
364 | attrs[i].is_white = FALSE; |
365 | break; |
366 | } |
367 | |
368 | /* Just few spaces have variable width. So explicitly mark them. |
369 | */ |
370 | attrs[i].is_expandable_space = (0x0020 == wc || 0x00A0 == wc); |
371 | is_Extended_Pictographic = |
372 | _pango_Is_Emoji_Extended_Pictographic (ch: wc); |
373 | |
374 | |
375 | /* ---- UAX#29 Grapheme Boundaries ---- */ |
376 | { |
377 | GraphemeBreakType GB_type; |
378 | |
379 | /* Find the GraphemeBreakType of wc */ |
380 | GB_type = GB_Other; |
381 | switch ((int)type) |
382 | { |
383 | case G_UNICODE_FORMAT: |
384 | if (G_UNLIKELY (wc == 0x200C)) |
385 | { |
386 | GB_type = GB_Extend; |
387 | break; |
388 | } |
389 | if (G_UNLIKELY (wc == 0x200D)) |
390 | { |
391 | GB_type = GB_ZWJ; |
392 | break; |
393 | } |
394 | if (G_UNLIKELY((wc >= 0x600 && wc <= 0x605) || |
395 | wc == 0x6DD || |
396 | wc == 0x70F || |
397 | wc == 0x8E2 || |
398 | wc == 0x110BD || |
399 | wc == 0x110CD)) |
400 | { |
401 | GB_type = GB_Prepend; |
402 | break; |
403 | } |
404 | /* Tag chars */ |
405 | if (wc >= 0xE0020 && wc <= 0xE00FF) |
406 | { |
407 | GB_type = GB_Extend; |
408 | break; |
409 | } |
410 | G_GNUC_FALLTHROUGH; |
411 | case G_UNICODE_CONTROL: |
412 | case G_UNICODE_LINE_SEPARATOR: |
413 | case G_UNICODE_PARAGRAPH_SEPARATOR: |
414 | case G_UNICODE_SURROGATE: |
415 | GB_type = GB_ControlCRLF; |
416 | break; |
417 | |
418 | case G_UNICODE_UNASSIGNED: |
419 | /* Unassigned default ignorables */ |
420 | if ((wc >= 0xFFF0 && wc <= 0xFFF8) || |
421 | (wc >= 0xE0000 && wc <= 0xE0FFF)) |
422 | { |
423 | GB_type = GB_ControlCRLF; |
424 | break; |
425 | } |
426 | G_GNUC_FALLTHROUGH; |
427 | |
428 | case G_UNICODE_OTHER_LETTER: |
429 | if (makes_hangul_syllable) |
430 | GB_type = GB_InHangulSyllable; |
431 | |
432 | if (_pango_is_Consonant_Preceding_Repha (wc) || |
433 | _pango_is_Consonant_Prefixed (wc)) |
434 | GB_type = GB_Prepend; |
435 | break; |
436 | |
437 | case G_UNICODE_MODIFIER_LETTER: |
438 | if (wc >= 0xFF9E && wc <= 0xFF9F) |
439 | GB_type = GB_Extend; /* Other_Grapheme_Extend */ |
440 | break; |
441 | |
442 | case G_UNICODE_SPACING_MARK: |
443 | GB_type = GB_SpacingMark; /* SpacingMark */ |
444 | if (wc >= 0x0900) |
445 | { |
446 | if (wc == 0x09BE || wc == 0x09D7 || |
447 | wc == 0x0B3E || wc == 0x0B57 || wc == 0x0BBE || wc == 0x0BD7 || |
448 | wc == 0x0CC2 || wc == 0x0CD5 || wc == 0x0CD6 || |
449 | wc == 0x0D3E || wc == 0x0D57 || wc == 0x0DCF || wc == 0x0DDF || |
450 | wc == 0x1D165 || (wc >= 0x1D16E && wc <= 0x1D172)) |
451 | GB_type = GB_Extend; /* Other_Grapheme_Extend */ |
452 | } |
453 | break; |
454 | |
455 | case G_UNICODE_ENCLOSING_MARK: |
456 | case G_UNICODE_NON_SPACING_MARK: |
457 | GB_type = GB_Extend; /* Grapheme_Extend */ |
458 | break; |
459 | |
460 | case G_UNICODE_OTHER_SYMBOL: |
461 | if (G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF)) |
462 | { |
463 | if (prev_GB_type == GB_RI_Odd) |
464 | GB_type = GB_RI_Even; |
465 | else |
466 | GB_type = GB_RI_Odd; |
467 | break; |
468 | } |
469 | break; |
470 | |
471 | case G_UNICODE_MODIFIER_SYMBOL: |
472 | /* Fitzpatrick modifiers */ |
473 | if (wc >= 0x1F3FB && wc <= 0x1F3FF) |
474 | GB_type = GB_Extend; |
475 | break; |
476 | |
477 | default: |
478 | break; |
479 | } |
480 | |
481 | /* Rule GB11 */ |
482 | if (met_Extended_Pictographic) |
483 | { |
484 | if (GB_type == GB_Extend) |
485 | met_Extended_Pictographic = TRUE; |
486 | else if (_pango_Is_Emoji_Extended_Pictographic (ch: prev_wc) && |
487 | GB_type == GB_ZWJ) |
488 | met_Extended_Pictographic = TRUE; |
489 | else if (prev_GB_type == GB_Extend && GB_type == GB_ZWJ) |
490 | met_Extended_Pictographic = TRUE; |
491 | else if (prev_GB_type == GB_ZWJ && is_Extended_Pictographic) |
492 | met_Extended_Pictographic = TRUE; |
493 | else |
494 | met_Extended_Pictographic = FALSE; |
495 | } |
496 | |
497 | /* Grapheme Cluster Boundary Rules */ |
498 | is_grapheme_boundary = TRUE; /* Rule GB999 */ |
499 | |
500 | /* We apply Rules GB1 and GB2 at the end of the function */ |
501 | if (wc == '\n' && prev_wc == '\r') |
502 | is_grapheme_boundary = FALSE; /* Rule GB3 */ |
503 | else if (prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF) |
504 | is_grapheme_boundary = TRUE; /* Rules GB4 and GB5 */ |
505 | else if (GB_type == GB_InHangulSyllable) |
506 | is_grapheme_boundary = FALSE; /* Rules GB6, GB7, GB8 */ |
507 | else if (GB_type == GB_Extend) |
508 | is_grapheme_boundary = FALSE; /* Rule GB9 */ |
509 | else if (GB_type == GB_ZWJ) |
510 | is_grapheme_boundary = FALSE; /* Rule GB9 */ |
511 | else if (GB_type == GB_SpacingMark) |
512 | is_grapheme_boundary = FALSE; /* Rule GB9a */ |
513 | else if (prev_GB_type == GB_Prepend) |
514 | is_grapheme_boundary = FALSE; /* Rule GB9b */ |
515 | else if (is_Extended_Pictographic) |
516 | { /* Rule GB11 */ |
517 | if (prev_GB_type == GB_ZWJ && met_Extended_Pictographic) |
518 | is_grapheme_boundary = FALSE; |
519 | } |
520 | else if (prev_GB_type == GB_RI_Odd && GB_type == GB_RI_Even) |
521 | is_grapheme_boundary = FALSE; /* Rule GB12 and GB13 */ |
522 | |
523 | if (is_Extended_Pictographic) |
524 | met_Extended_Pictographic = TRUE; |
525 | |
526 | attrs[i].is_cursor_position = is_grapheme_boundary; |
527 | /* If this is a grapheme boundary, we have to decide if backspace |
528 | * deletes a character or the whole grapheme cluster */ |
529 | if (is_grapheme_boundary) |
530 | { |
531 | attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character); |
532 | |
533 | /* Dependent Vowels for Indic language */ |
534 | if (_pango_is_Virama (wc: prev_wc) || |
535 | _pango_is_Vowel_Dependent (wc: prev_wc)) |
536 | attrs[i].backspace_deletes_character = TRUE; |
537 | } |
538 | else |
539 | attrs[i].backspace_deletes_character = FALSE; |
540 | |
541 | prev_GB_type = GB_type; |
542 | } |
543 | |
544 | script = (PangoScript)g_unichar_get_script (ch: wc); |
545 | /* ---- UAX#29 Word Boundaries ---- */ |
546 | { |
547 | is_word_boundary = FALSE; |
548 | if (is_grapheme_boundary || |
549 | G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF)) /* Rules WB3 and WB4 */ |
550 | { |
551 | WordBreakType WB_type; |
552 | |
553 | /* Find the WordBreakType of wc */ |
554 | WB_type = WB_Other; |
555 | |
556 | if (script == PANGO_SCRIPT_KATAKANA) |
557 | WB_type = WB_Katakana; |
558 | |
559 | if (script == PANGO_SCRIPT_HEBREW && type == G_UNICODE_OTHER_LETTER) |
560 | WB_type = WB_Hebrew_Letter; |
561 | |
562 | if (WB_type == WB_Other) |
563 | switch (wc >> 8) |
564 | { |
565 | case 0x30: |
566 | if (wc == 0x3031 || wc == 0x3032 || wc == 0x3033 || wc == 0x3034 || wc == 0x3035 || |
567 | wc == 0x309b || wc == 0x309c || wc == 0x30a0 || wc == 0x30fc) |
568 | WB_type = WB_Katakana; /* Katakana exceptions */ |
569 | break; |
570 | case 0xFF: |
571 | if (wc == 0xFF70) |
572 | WB_type = WB_Katakana; /* Katakana exceptions */ |
573 | else if (wc >= 0xFF9E && wc <= 0xFF9F) |
574 | WB_type = WB_ExtendFormat; /* Other_Grapheme_Extend */ |
575 | break; |
576 | case 0x05: |
577 | if (wc == 0x058A) |
578 | WB_type = WB_ALetter; /* ALetter exceptions */ |
579 | break; |
580 | default: |
581 | break; |
582 | } |
583 | |
584 | if (WB_type == WB_Other) |
585 | switch ((int) break_type) |
586 | { |
587 | case G_UNICODE_BREAK_NUMERIC: |
588 | if (wc != 0x066C) |
589 | WB_type = WB_Numeric; /* Numeric */ |
590 | break; |
591 | case G_UNICODE_BREAK_INFIX_SEPARATOR: |
592 | if (wc != 0x003A && wc != 0xFE13 && wc != 0x002E) |
593 | WB_type = WB_MidNum; /* MidNum */ |
594 | break; |
595 | default: |
596 | break; |
597 | } |
598 | |
599 | if (WB_type == WB_Other) |
600 | switch ((int) type) |
601 | { |
602 | case G_UNICODE_CONTROL: |
603 | if (wc != 0x000D && wc != 0x000A && wc != 0x000B && wc != 0x000C && wc != 0x0085) |
604 | break; |
605 | G_GNUC_FALLTHROUGH; |
606 | case G_UNICODE_LINE_SEPARATOR: |
607 | case G_UNICODE_PARAGRAPH_SEPARATOR: |
608 | WB_type = WB_NewlineCRLF; /* CR, LF, Newline */ |
609 | break; |
610 | |
611 | case G_UNICODE_FORMAT: |
612 | case G_UNICODE_SPACING_MARK: |
613 | case G_UNICODE_ENCLOSING_MARK: |
614 | case G_UNICODE_NON_SPACING_MARK: |
615 | WB_type = WB_ExtendFormat; /* Extend, Format */ |
616 | break; |
617 | |
618 | case G_UNICODE_CONNECT_PUNCTUATION: |
619 | WB_type = WB_ExtendNumLet; /* ExtendNumLet */ |
620 | break; |
621 | |
622 | case G_UNICODE_INITIAL_PUNCTUATION: |
623 | case G_UNICODE_FINAL_PUNCTUATION: |
624 | if (wc == 0x2018 || wc == 0x2019) |
625 | WB_type = WB_MidNumLet; /* MidNumLet */ |
626 | break; |
627 | case G_UNICODE_OTHER_PUNCTUATION: |
628 | if ((wc >= 0x055a && wc <= 0x055c) || |
629 | wc == 0x055e || wc == 0x05f3) |
630 | WB_type = WB_ALetter; /* ALetter */ |
631 | else if (wc == 0x0027 || wc == 0x002e || wc == 0x2024 || |
632 | wc == 0xfe52 || wc == 0xff07 || wc == 0xff0e) |
633 | WB_type = WB_MidNumLet; /* MidNumLet */ |
634 | else if (wc == 0x00b7 || wc == 0x05f4 || wc == 0x2027 || |
635 | wc == 0x003a || wc == 0x0387 || wc == 0x055f || |
636 | wc == 0xfe13 || wc == 0xfe55 || wc == 0xff1a) |
637 | WB_type = WB_MidLetter; /* MidLetter */ |
638 | else if (wc == 0x066c || |
639 | wc == 0xfe50 || wc == 0xfe54 || wc == 0xff0c || wc == 0xff1b) |
640 | WB_type = WB_MidNum; /* MidNum */ |
641 | break; |
642 | |
643 | case G_UNICODE_OTHER_SYMBOL: |
644 | if (wc >= 0x24B6 && wc <= 0x24E9) /* Other_Alphabetic */ |
645 | goto Alphabetic; |
646 | |
647 | if (G_UNLIKELY(wc >= 0x1F1E6 && wc <= 0x1F1FF)) |
648 | { |
649 | if (prev_WB_type == WB_RI_Odd) |
650 | WB_type = WB_RI_Even; |
651 | else |
652 | WB_type = WB_RI_Odd; |
653 | } |
654 | |
655 | break; |
656 | |
657 | case G_UNICODE_OTHER_LETTER: |
658 | case G_UNICODE_LETTER_NUMBER: |
659 | if (wc == 0x3006 || wc == 0x3007 || |
660 | (wc >= 0x3021 && wc <= 0x3029) || |
661 | (wc >= 0x3038 && wc <= 0x303A) || |
662 | (wc >= 0x3400 && wc <= 0x4DB5) || |
663 | (wc >= 0x4E00 && wc <= 0x9FC3) || |
664 | (wc >= 0xF900 && wc <= 0xFA2D) || |
665 | (wc >= 0xFA30 && wc <= 0xFA6A) || |
666 | (wc >= 0xFA70 && wc <= 0xFAD9) || |
667 | (wc >= 0x20000 && wc <= 0x2A6D6) || |
668 | (wc >= 0x2F800 && wc <= 0x2FA1D)) |
669 | break; /* ALetter exceptions: Ideographic */ |
670 | goto Alphabetic; |
671 | |
672 | case G_UNICODE_LOWERCASE_LETTER: |
673 | case G_UNICODE_MODIFIER_LETTER: |
674 | case G_UNICODE_TITLECASE_LETTER: |
675 | case G_UNICODE_UPPERCASE_LETTER: |
676 | Alphabetic: |
677 | if (break_type != G_UNICODE_BREAK_COMPLEX_CONTEXT && script != PANGO_SCRIPT_HIRAGANA) |
678 | WB_type = WB_ALetter; /* ALetter */ |
679 | break; |
680 | default: |
681 | break; |
682 | } |
683 | |
684 | if (WB_type == WB_Other) |
685 | { |
686 | if (type == G_UNICODE_SPACE_SEPARATOR && |
687 | break_type != G_UNICODE_BREAK_NON_BREAKING_GLUE) |
688 | WB_type = WB_WSegSpace; |
689 | } |
690 | |
691 | /* Word Cluster Boundary Rules */ |
692 | |
693 | /* We apply Rules WB1 and WB2 at the end of the function */ |
694 | |
695 | if (prev_wc == 0x3031 && wc == 0x41) |
696 | g_debug ("Y %d %d" , prev_WB_type, WB_type); |
697 | if (prev_WB_type == WB_NewlineCRLF && prev_WB_i + 1 == i) |
698 | { |
699 | /* The extra check for prev_WB_i is to correctly handle sequences like |
700 | * Newline ÷ Extend × Extend |
701 | * since we have not skipped ExtendFormat yet. |
702 | */ |
703 | is_word_boundary = TRUE; /* Rule WB3a */ |
704 | } |
705 | else if (WB_type == WB_NewlineCRLF) |
706 | is_word_boundary = TRUE; /* Rule WB3b */ |
707 | else if (prev_wc == 0x200D && is_Extended_Pictographic) |
708 | is_word_boundary = FALSE; /* Rule WB3c */ |
709 | else if (prev_WB_type == WB_WSegSpace && |
710 | WB_type == WB_WSegSpace && prev_WB_i + 1 == i) |
711 | is_word_boundary = FALSE; /* Rule WB3d */ |
712 | else if (WB_type == WB_ExtendFormat) |
713 | is_word_boundary = FALSE; /* Rules WB4? */ |
714 | else if ((prev_WB_type == WB_ALetter || |
715 | prev_WB_type == WB_Hebrew_Letter || |
716 | prev_WB_type == WB_Numeric) && |
717 | (WB_type == WB_ALetter || |
718 | WB_type == WB_Hebrew_Letter || |
719 | WB_type == WB_Numeric)) |
720 | is_word_boundary = FALSE; /* Rules WB5, WB8, WB9, WB10 */ |
721 | else if (prev_WB_type == WB_Katakana && WB_type == WB_Katakana) |
722 | is_word_boundary = FALSE; /* Rule WB13 */ |
723 | else if ((prev_WB_type == WB_ALetter || |
724 | prev_WB_type == WB_Hebrew_Letter || |
725 | prev_WB_type == WB_Numeric || |
726 | prev_WB_type == WB_Katakana || |
727 | prev_WB_type == WB_ExtendNumLet) && |
728 | WB_type == WB_ExtendNumLet) |
729 | is_word_boundary = FALSE; /* Rule WB13a */ |
730 | else if (prev_WB_type == WB_ExtendNumLet && |
731 | (WB_type == WB_ALetter || |
732 | WB_type == WB_Hebrew_Letter || |
733 | WB_type == WB_Numeric || |
734 | WB_type == WB_Katakana)) |
735 | is_word_boundary = FALSE; /* Rule WB13b */ |
736 | else if (((prev_prev_WB_type == WB_ALetter || |
737 | prev_prev_WB_type == WB_Hebrew_Letter) && |
738 | (WB_type == WB_ALetter || |
739 | WB_type == WB_Hebrew_Letter)) && |
740 | (prev_WB_type == WB_MidLetter || |
741 | prev_WB_type == WB_MidNumLet || |
742 | prev_wc == 0x0027)) |
743 | { |
744 | attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB6 */ |
745 | is_word_boundary = FALSE; /* Rule WB7 */ |
746 | } |
747 | else if (prev_WB_type == WB_Hebrew_Letter && wc == 0x0027) |
748 | is_word_boundary = FALSE; /* Rule WB7a */ |
749 | else if (prev_prev_WB_type == WB_Hebrew_Letter && prev_wc == 0x0022 && |
750 | WB_type == WB_Hebrew_Letter) { |
751 | attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB7b */ |
752 | is_word_boundary = FALSE; /* Rule WB7c */ |
753 | } |
754 | else if ((prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) && |
755 | (prev_WB_type == WB_MidNum || prev_WB_type == WB_MidNumLet || |
756 | prev_wc == 0x0027)) |
757 | { |
758 | is_word_boundary = FALSE; /* Rule WB11 */ |
759 | attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB12 */ |
760 | } |
761 | else if (prev_WB_type == WB_RI_Odd && WB_type == WB_RI_Even) |
762 | is_word_boundary = FALSE; /* Rule WB15 and WB16 */ |
763 | else |
764 | is_word_boundary = TRUE; /* Rule WB999 */ |
765 | |
766 | if (WB_type != WB_ExtendFormat) |
767 | { |
768 | prev_prev_WB_type = prev_WB_type; |
769 | prev_WB_type = WB_type; |
770 | prev_WB_i = i; |
771 | } |
772 | } |
773 | |
774 | attrs[i].is_word_boundary = is_word_boundary; |
775 | } |
776 | |
777 | /* ---- UAX#29 Sentence Boundaries ---- */ |
778 | { |
779 | is_sentence_boundary = FALSE; |
780 | if (is_word_boundary || |
781 | wc == '\r' || wc == '\n') /* Rules SB3 and SB5 */ |
782 | { |
783 | SentenceBreakType SB_type; |
784 | |
785 | /* Find the SentenceBreakType of wc */ |
786 | SB_type = SB_Other; |
787 | |
788 | if (break_type == G_UNICODE_BREAK_NUMERIC) |
789 | SB_type = SB_Numeric; /* Numeric */ |
790 | |
791 | if (SB_type == SB_Other) |
792 | switch ((int) type) |
793 | { |
794 | case G_UNICODE_CONTROL: |
795 | if (wc == '\r' || wc == '\n') |
796 | SB_type = SB_ParaSep; |
797 | else if (wc == 0x0009 || wc == 0x000B || wc == 0x000C) |
798 | SB_type = SB_Sp; |
799 | else if (wc == 0x0085) |
800 | SB_type = SB_ParaSep; |
801 | break; |
802 | |
803 | case G_UNICODE_SPACE_SEPARATOR: |
804 | if (wc == 0x0020 || wc == 0x00A0 || wc == 0x1680 || |
805 | (wc >= 0x2000 && wc <= 0x200A) || |
806 | wc == 0x202F || wc == 0x205F || wc == 0x3000) |
807 | SB_type = SB_Sp; |
808 | break; |
809 | |
810 | case G_UNICODE_LINE_SEPARATOR: |
811 | case G_UNICODE_PARAGRAPH_SEPARATOR: |
812 | SB_type = SB_ParaSep; |
813 | break; |
814 | |
815 | case G_UNICODE_FORMAT: |
816 | case G_UNICODE_SPACING_MARK: |
817 | case G_UNICODE_ENCLOSING_MARK: |
818 | case G_UNICODE_NON_SPACING_MARK: |
819 | SB_type = SB_ExtendFormat; /* Extend, Format */ |
820 | break; |
821 | |
822 | case G_UNICODE_MODIFIER_LETTER: |
823 | if (wc >= 0xFF9E && wc <= 0xFF9F) |
824 | SB_type = SB_ExtendFormat; /* Other_Grapheme_Extend */ |
825 | break; |
826 | |
827 | case G_UNICODE_TITLECASE_LETTER: |
828 | SB_type = SB_Upper; |
829 | break; |
830 | |
831 | case G_UNICODE_DASH_PUNCTUATION: |
832 | if (wc == 0x002D || |
833 | (wc >= 0x2013 && wc <= 0x2014) || |
834 | (wc >= 0xFE31 && wc <= 0xFE32) || |
835 | wc == 0xFE58 || |
836 | wc == 0xFE63 || |
837 | wc == 0xFF0D) |
838 | SB_type = SB_SContinue; |
839 | break; |
840 | |
841 | case G_UNICODE_OTHER_PUNCTUATION: |
842 | if (wc == 0x05F3) |
843 | SB_type = SB_OLetter; |
844 | else if (wc == 0x002E || wc == 0x2024 || |
845 | wc == 0xFE52 || wc == 0xFF0E) |
846 | SB_type = SB_ATerm; |
847 | |
848 | if (wc == 0x002C || |
849 | wc == 0x003A || |
850 | wc == 0x055D || |
851 | (wc >= 0x060C && wc <= 0x060D) || |
852 | wc == 0x07F8 || |
853 | wc == 0x1802 || |
854 | wc == 0x1808 || |
855 | wc == 0x3001 || |
856 | (wc >= 0xFE10 && wc <= 0xFE11) || |
857 | wc == 0xFE13 || |
858 | (wc >= 0xFE50 && wc <= 0xFE51) || |
859 | wc == 0xFE55 || |
860 | wc == 0xFF0C || |
861 | wc == 0xFF1A || |
862 | wc == 0xFF64) |
863 | SB_type = SB_SContinue; |
864 | |
865 | if (_pango_is_STerm(wc)) |
866 | SB_type = SB_STerm; |
867 | |
868 | break; |
869 | |
870 | default: |
871 | break; |
872 | } |
873 | |
874 | if (SB_type == SB_Other) |
875 | { |
876 | if (type == G_UNICODE_LOWERCASE_LETTER) |
877 | SB_type = SB_Lower; |
878 | else if (type == G_UNICODE_UPPERCASE_LETTER) |
879 | SB_type = SB_Upper; |
880 | else if (type == G_UNICODE_TITLECASE_LETTER || |
881 | type == G_UNICODE_MODIFIER_LETTER || |
882 | type == G_UNICODE_OTHER_LETTER) |
883 | SB_type = SB_OLetter; |
884 | |
885 | if (type == G_UNICODE_OPEN_PUNCTUATION || |
886 | type == G_UNICODE_CLOSE_PUNCTUATION || |
887 | break_type == G_UNICODE_BREAK_QUOTATION) |
888 | SB_type = SB_Close; |
889 | } |
890 | |
891 | /* Sentence Boundary Rules */ |
892 | |
893 | /* We apply Rules SB1 and SB2 at the end of the function */ |
894 | |
895 | #define IS_OTHER_TERM(SB_type) \ |
896 | /* not in (OLetter | Upper | Lower | ParaSep | SATerm) */ \ |
897 | !(SB_type == SB_OLetter || \ |
898 | SB_type == SB_Upper || SB_type == SB_Lower || \ |
899 | SB_type == SB_ParaSep || \ |
900 | SB_type == SB_ATerm || SB_type == SB_STerm || \ |
901 | SB_type == SB_ATerm_Close_Sp || \ |
902 | SB_type == SB_STerm_Close_Sp) |
903 | |
904 | |
905 | if (wc == '\n' && prev_wc == '\r') |
906 | is_sentence_boundary = FALSE; /* Rule SB3 */ |
907 | else if (prev_SB_type == SB_ParaSep && prev_SB_i + 1 == i) |
908 | { |
909 | /* The extra check for prev_SB_i is to correctly handle sequences like |
910 | * ParaSep ÷ Extend × Extend |
911 | * since we have not skipped ExtendFormat yet. |
912 | */ |
913 | |
914 | is_sentence_boundary = TRUE; /* Rule SB4 */ |
915 | } |
916 | else if (SB_type == SB_ExtendFormat) |
917 | is_sentence_boundary = FALSE; /* Rule SB5? */ |
918 | else if (prev_SB_type == SB_ATerm && SB_type == SB_Numeric) |
919 | is_sentence_boundary = FALSE; /* Rule SB6 */ |
920 | else if ((prev_prev_SB_type == SB_Upper || |
921 | prev_prev_SB_type == SB_Lower) && |
922 | prev_SB_type == SB_ATerm && |
923 | SB_type == SB_Upper) |
924 | is_sentence_boundary = FALSE; /* Rule SB7 */ |
925 | else if (prev_SB_type == SB_ATerm && SB_type == SB_Close) |
926 | SB_type = SB_ATerm; |
927 | else if (prev_SB_type == SB_STerm && SB_type == SB_Close) |
928 | SB_type = SB_STerm; |
929 | else if (prev_SB_type == SB_ATerm && SB_type == SB_Sp) |
930 | SB_type = SB_ATerm_Close_Sp; |
931 | else if (prev_SB_type == SB_STerm && SB_type == SB_Sp) |
932 | SB_type = SB_STerm_Close_Sp; |
933 | /* Rule SB8 */ |
934 | else if ((prev_SB_type == SB_ATerm || |
935 | prev_SB_type == SB_ATerm_Close_Sp) && |
936 | SB_type == SB_Lower) |
937 | is_sentence_boundary = FALSE; |
938 | else if ((prev_prev_SB_type == SB_ATerm || |
939 | prev_prev_SB_type == SB_ATerm_Close_Sp) && |
940 | IS_OTHER_TERM(prev_SB_type) && |
941 | SB_type == SB_Lower) |
942 | { |
943 | attrs[prev_SB_i].is_sentence_boundary = FALSE; |
944 | attrs[prev_SB_i].is_sentence_end = FALSE; |
945 | last_sentence_start = -1; |
946 | for (int j = prev_SB_i - 1; j >= 0; j--) |
947 | { |
948 | attrs[j].is_sentence_end = FALSE; |
949 | if (attrs[j].is_sentence_boundary) |
950 | { |
951 | last_sentence_start = j; |
952 | break; |
953 | } |
954 | } |
955 | } |
956 | else if ((prev_SB_type == SB_ATerm || |
957 | prev_SB_type == SB_ATerm_Close_Sp || |
958 | prev_SB_type == SB_STerm || |
959 | prev_SB_type == SB_STerm_Close_Sp) && |
960 | (SB_type == SB_SContinue || |
961 | SB_type == SB_ATerm || SB_type == SB_STerm)) |
962 | is_sentence_boundary = FALSE; /* Rule SB8a */ |
963 | else if ((prev_SB_type == SB_ATerm || |
964 | prev_SB_type == SB_STerm) && |
965 | (SB_type == SB_Close || SB_type == SB_Sp || |
966 | SB_type == SB_ParaSep)) |
967 | is_sentence_boundary = FALSE; /* Rule SB9 */ |
968 | else if ((prev_SB_type == SB_ATerm || |
969 | prev_SB_type == SB_ATerm_Close_Sp || |
970 | prev_SB_type == SB_STerm || |
971 | prev_SB_type == SB_STerm_Close_Sp) && |
972 | (SB_type == SB_Sp || SB_type == SB_ParaSep)) |
973 | is_sentence_boundary = FALSE; /* Rule SB10 */ |
974 | else if ((prev_SB_type == SB_ATerm || |
975 | prev_SB_type == SB_ATerm_Close_Sp || |
976 | prev_SB_type == SB_STerm || |
977 | prev_SB_type == SB_STerm_Close_Sp) && |
978 | SB_type != SB_ParaSep) |
979 | is_sentence_boundary = TRUE; /* Rule SB11 */ |
980 | else |
981 | is_sentence_boundary = FALSE; /* Rule SB998 */ |
982 | |
983 | if (SB_type != SB_ExtendFormat && |
984 | !((prev_prev_SB_type == SB_ATerm || |
985 | prev_prev_SB_type == SB_ATerm_Close_Sp) && |
986 | IS_OTHER_TERM(prev_SB_type) && |
987 | IS_OTHER_TERM(SB_type))) |
988 | { |
989 | prev_prev_SB_type = prev_SB_type; |
990 | prev_SB_type = SB_type; |
991 | prev_SB_i = i; |
992 | } |
993 | |
994 | #undef IS_OTHER_TERM |
995 | |
996 | } |
997 | |
998 | if (i == 0 || done) |
999 | is_sentence_boundary = TRUE; /* Rules SB1 and SB2 */ |
1000 | |
1001 | attrs[i].is_sentence_boundary = is_sentence_boundary; |
1002 | } |
1003 | |
1004 | /* ---- Line breaking ---- */ |
1005 | |
1006 | break_op = BREAK_ALREADY_HANDLED; |
1007 | |
1008 | row_break_type = prev_break_type == G_UNICODE_BREAK_SPACE ? |
1009 | prev_prev_break_type : prev_break_type; |
1010 | g_assert (row_break_type != G_UNICODE_BREAK_SPACE); |
1011 | |
1012 | attrs[i].is_char_break = FALSE; |
1013 | attrs[i].is_line_break = FALSE; |
1014 | attrs[i].is_mandatory_break = FALSE; |
1015 | |
1016 | /* Rule LB1: |
1017 | assign a line breaking class to each code point of the input. */ |
1018 | switch ((int)break_type) |
1019 | { |
1020 | case G_UNICODE_BREAK_AMBIGUOUS: |
1021 | case G_UNICODE_BREAK_SURROGATE: |
1022 | case G_UNICODE_BREAK_UNKNOWN: |
1023 | break_type = G_UNICODE_BREAK_ALPHABETIC; |
1024 | break; |
1025 | |
1026 | case G_UNICODE_BREAK_COMPLEX_CONTEXT: |
1027 | if (type == G_UNICODE_NON_SPACING_MARK || |
1028 | type == G_UNICODE_SPACING_MARK) |
1029 | break_type = G_UNICODE_BREAK_COMBINING_MARK; |
1030 | else |
1031 | break_type = G_UNICODE_BREAK_ALPHABETIC; |
1032 | break; |
1033 | |
1034 | case G_UNICODE_BREAK_CONDITIONAL_JAPANESE_STARTER: |
1035 | break_type = G_UNICODE_BREAK_NON_STARTER; |
1036 | break; |
1037 | |
1038 | default: |
1039 | break; |
1040 | } |
1041 | |
1042 | /* If it's not a grapheme boundary, it's not a line break either */ |
1043 | if (attrs[i].is_cursor_position || |
1044 | break_type == G_UNICODE_BREAK_COMBINING_MARK || |
1045 | break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER || |
1046 | break_type == G_UNICODE_BREAK_HANGUL_L_JAMO || |
1047 | break_type == G_UNICODE_BREAK_HANGUL_V_JAMO || |
1048 | break_type == G_UNICODE_BREAK_HANGUL_T_JAMO || |
1049 | break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE || |
1050 | break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE || |
1051 | break_type == G_UNICODE_BREAK_EMOJI_MODIFIER || |
1052 | break_type == G_UNICODE_BREAK_REGIONAL_INDICATOR) |
1053 | { |
1054 | LineBreakType LB_type; |
1055 | |
1056 | /* Find the LineBreakType of wc */ |
1057 | LB_type = LB_Other; |
1058 | |
1059 | if (break_type == G_UNICODE_BREAK_NUMERIC) |
1060 | LB_type = LB_Numeric; |
1061 | |
1062 | if (break_type == G_UNICODE_BREAK_SYMBOL || |
1063 | break_type == G_UNICODE_BREAK_INFIX_SEPARATOR) |
1064 | { |
1065 | if (!(prev_LB_type == LB_Numeric)) |
1066 | LB_type = LB_Other; |
1067 | } |
1068 | |
1069 | if (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION || |
1070 | break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS) |
1071 | { |
1072 | if (prev_LB_type == LB_Numeric) |
1073 | LB_type = LB_Numeric_Close; |
1074 | else |
1075 | LB_type = LB_Other; |
1076 | } |
1077 | |
1078 | if (break_type == G_UNICODE_BREAK_REGIONAL_INDICATOR) |
1079 | { |
1080 | if (prev_LB_type == LB_RI_Odd) |
1081 | LB_type = LB_RI_Even; |
1082 | else |
1083 | LB_type = LB_RI_Odd; |
1084 | } |
1085 | |
1086 | attrs[i].is_line_break = TRUE; /* Rule LB31 */ |
1087 | /* Unicode doesn't specify char wrap; |
1088 | we wrap around all chars currently. */ |
1089 | if (attrs[i].is_cursor_position) |
1090 | attrs[i].is_char_break = TRUE; |
1091 | |
1092 | /* Make any necessary replacements first */ |
1093 | if (row_break_type == G_UNICODE_BREAK_UNKNOWN) |
1094 | row_break_type = G_UNICODE_BREAK_ALPHABETIC; |
1095 | |
1096 | /* add the line break rules in reverse order to override |
1097 | the lower priority rules. */ |
1098 | |
1099 | /* Rule LB30 */ |
1100 | if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC || |
1101 | prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER || |
1102 | prev_break_type == G_UNICODE_BREAK_NUMERIC) && |
1103 | break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION && |
1104 | !_pango_is_EastAsianWide (wc)) |
1105 | break_op = BREAK_PROHIBITED; |
1106 | |
1107 | if (prev_break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS && |
1108 | !_pango_is_EastAsianWide (wc: prev_wc)&& |
1109 | (break_type == G_UNICODE_BREAK_ALPHABETIC || |
1110 | break_type == G_UNICODE_BREAK_HEBREW_LETTER || |
1111 | break_type == G_UNICODE_BREAK_NUMERIC)) |
1112 | break_op = BREAK_PROHIBITED; |
1113 | |
1114 | /* Rule LB30a */ |
1115 | if (prev_LB_type == LB_RI_Odd && LB_type == LB_RI_Even) |
1116 | break_op = BREAK_PROHIBITED; |
1117 | |
1118 | /* Rule LB30b */ |
1119 | if (prev_break_type == G_UNICODE_BREAK_EMOJI_BASE && |
1120 | break_type == G_UNICODE_BREAK_EMOJI_MODIFIER) |
1121 | break_op = BREAK_PROHIBITED; |
1122 | |
1123 | if ((_pango_Is_Emoji_Extended_Pictographic (ch: prev_wc) && |
1124 | g_unichar_type (c: prev_wc) == G_UNICODE_UNASSIGNED) && |
1125 | break_type == G_UNICODE_BREAK_EMOJI_MODIFIER) |
1126 | break_op = BREAK_PROHIBITED; |
1127 | |
1128 | /* Rule LB29 */ |
1129 | if (prev_break_type == G_UNICODE_BREAK_INFIX_SEPARATOR && |
1130 | (break_type == G_UNICODE_BREAK_ALPHABETIC || |
1131 | break_type == G_UNICODE_BREAK_HEBREW_LETTER)) |
1132 | break_op = BREAK_PROHIBITED; |
1133 | |
1134 | /* Rule LB28 */ |
1135 | if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC || |
1136 | prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) && |
1137 | (break_type == G_UNICODE_BREAK_ALPHABETIC || |
1138 | break_type == G_UNICODE_BREAK_HEBREW_LETTER)) |
1139 | break_op = BREAK_PROHIBITED; |
1140 | |
1141 | /* Rule LB27 */ |
1142 | if ((prev_break_type == G_UNICODE_BREAK_HANGUL_L_JAMO || |
1143 | prev_break_type == G_UNICODE_BREAK_HANGUL_V_JAMO || |
1144 | prev_break_type == G_UNICODE_BREAK_HANGUL_T_JAMO || |
1145 | prev_break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE || |
1146 | prev_break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE) && |
1147 | break_type == G_UNICODE_BREAK_POSTFIX) |
1148 | break_op = BREAK_PROHIBITED; |
1149 | |
1150 | if (prev_break_type == G_UNICODE_BREAK_PREFIX && |
1151 | (break_type == G_UNICODE_BREAK_HANGUL_L_JAMO || |
1152 | break_type == G_UNICODE_BREAK_HANGUL_V_JAMO || |
1153 | break_type == G_UNICODE_BREAK_HANGUL_T_JAMO || |
1154 | break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE || |
1155 | break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE)) |
1156 | break_op = BREAK_PROHIBITED; |
1157 | |
1158 | /* Rule LB26 */ |
1159 | if (prev_break_type == G_UNICODE_BREAK_HANGUL_L_JAMO && |
1160 | (break_type == G_UNICODE_BREAK_HANGUL_L_JAMO || |
1161 | break_type == G_UNICODE_BREAK_HANGUL_V_JAMO || |
1162 | break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE || |
1163 | break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE)) |
1164 | break_op = BREAK_PROHIBITED; |
1165 | |
1166 | if ((prev_break_type == G_UNICODE_BREAK_HANGUL_V_JAMO || |
1167 | prev_break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE) && |
1168 | (break_type == G_UNICODE_BREAK_HANGUL_V_JAMO || |
1169 | break_type == G_UNICODE_BREAK_HANGUL_T_JAMO)) |
1170 | break_op = BREAK_PROHIBITED; |
1171 | |
1172 | if ((prev_break_type == G_UNICODE_BREAK_HANGUL_T_JAMO || |
1173 | prev_break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE) && |
1174 | break_type == G_UNICODE_BREAK_HANGUL_T_JAMO) |
1175 | break_op = BREAK_PROHIBITED; |
1176 | |
1177 | /* Rule LB25 with Example 7 of Customization */ |
1178 | if ((prev_break_type == G_UNICODE_BREAK_PREFIX || |
1179 | prev_break_type == G_UNICODE_BREAK_POSTFIX) && |
1180 | break_type == G_UNICODE_BREAK_NUMERIC) |
1181 | break_op = BREAK_PROHIBITED; |
1182 | |
1183 | if ((prev_break_type == G_UNICODE_BREAK_PREFIX || |
1184 | prev_break_type == G_UNICODE_BREAK_POSTFIX) && |
1185 | (break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION || |
1186 | break_type == G_UNICODE_BREAK_HYPHEN) && |
1187 | next_break_type == G_UNICODE_BREAK_NUMERIC) |
1188 | break_op = BREAK_PROHIBITED; |
1189 | |
1190 | if ((prev_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION || |
1191 | prev_break_type == G_UNICODE_BREAK_HYPHEN) && |
1192 | break_type == G_UNICODE_BREAK_NUMERIC) |
1193 | break_op = BREAK_PROHIBITED; |
1194 | |
1195 | if (prev_break_type == G_UNICODE_BREAK_NUMERIC && |
1196 | (break_type == G_UNICODE_BREAK_NUMERIC || |
1197 | break_type == G_UNICODE_BREAK_SYMBOL || |
1198 | break_type == G_UNICODE_BREAK_INFIX_SEPARATOR)) |
1199 | break_op = BREAK_PROHIBITED; |
1200 | |
1201 | if (prev_LB_type == LB_Numeric && |
1202 | (break_type == G_UNICODE_BREAK_NUMERIC || |
1203 | break_type == G_UNICODE_BREAK_SYMBOL || |
1204 | break_type == G_UNICODE_BREAK_INFIX_SEPARATOR || |
1205 | break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION || |
1206 | break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS)) |
1207 | break_op = BREAK_PROHIBITED; |
1208 | |
1209 | if ((prev_LB_type == LB_Numeric || |
1210 | prev_LB_type == LB_Numeric_Close) && |
1211 | (break_type == G_UNICODE_BREAK_POSTFIX || |
1212 | break_type == G_UNICODE_BREAK_PREFIX)) |
1213 | break_op = BREAK_PROHIBITED; |
1214 | |
1215 | /* Rule LB24 */ |
1216 | if ((prev_break_type == G_UNICODE_BREAK_PREFIX || |
1217 | prev_break_type == G_UNICODE_BREAK_POSTFIX) && |
1218 | (break_type == G_UNICODE_BREAK_ALPHABETIC || |
1219 | break_type == G_UNICODE_BREAK_HEBREW_LETTER)) |
1220 | break_op = BREAK_PROHIBITED; |
1221 | |
1222 | if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC || |
1223 | prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) && |
1224 | (break_type == G_UNICODE_BREAK_PREFIX || |
1225 | break_type == G_UNICODE_BREAK_POSTFIX)) |
1226 | break_op = BREAK_PROHIBITED; |
1227 | |
1228 | /* Rule LB23 */ |
1229 | if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC || |
1230 | prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) && |
1231 | break_type == G_UNICODE_BREAK_NUMERIC) |
1232 | break_op = BREAK_PROHIBITED; |
1233 | |
1234 | if (prev_break_type == G_UNICODE_BREAK_NUMERIC && |
1235 | (break_type == G_UNICODE_BREAK_ALPHABETIC || |
1236 | break_type == G_UNICODE_BREAK_HEBREW_LETTER)) |
1237 | break_op = BREAK_PROHIBITED; |
1238 | |
1239 | /* Rule LB23a */ |
1240 | if (prev_break_type == G_UNICODE_BREAK_PREFIX && |
1241 | (break_type == G_UNICODE_BREAK_IDEOGRAPHIC || |
1242 | break_type == G_UNICODE_BREAK_EMOJI_BASE || |
1243 | break_type == G_UNICODE_BREAK_EMOJI_MODIFIER)) |
1244 | break_op = BREAK_PROHIBITED; |
1245 | |
1246 | if ((prev_break_type == G_UNICODE_BREAK_IDEOGRAPHIC || |
1247 | prev_break_type == G_UNICODE_BREAK_EMOJI_BASE || |
1248 | prev_break_type == G_UNICODE_BREAK_EMOJI_MODIFIER) && |
1249 | break_type == G_UNICODE_BREAK_POSTFIX) |
1250 | break_op = BREAK_PROHIBITED; |
1251 | |
1252 | /* Rule LB22 */ |
1253 | if (break_type == G_UNICODE_BREAK_INSEPARABLE) |
1254 | break_op = BREAK_PROHIBITED; |
1255 | |
1256 | if (break_type == G_UNICODE_BREAK_AFTER || |
1257 | break_type == G_UNICODE_BREAK_HYPHEN || |
1258 | break_type == G_UNICODE_BREAK_NON_STARTER || |
1259 | prev_break_type == G_UNICODE_BREAK_BEFORE) |
1260 | break_op = BREAK_PROHIBITED; /* Rule LB21 */ |
1261 | |
1262 | if (prev_prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER && |
1263 | (prev_break_type == G_UNICODE_BREAK_HYPHEN || |
1264 | prev_break_type == G_UNICODE_BREAK_AFTER)) |
1265 | break_op = BREAK_PROHIBITED; /* Rule LB21a */ |
1266 | |
1267 | if (prev_break_type == G_UNICODE_BREAK_SYMBOL && |
1268 | break_type == G_UNICODE_BREAK_HEBREW_LETTER) |
1269 | break_op = BREAK_PROHIBITED; /* Rule LB21b */ |
1270 | |
1271 | if (prev_break_type == G_UNICODE_BREAK_CONTINGENT || |
1272 | break_type == G_UNICODE_BREAK_CONTINGENT) |
1273 | break_op = BREAK_ALLOWED; /* Rule LB20 */ |
1274 | |
1275 | if (prev_break_type == G_UNICODE_BREAK_QUOTATION || |
1276 | break_type == G_UNICODE_BREAK_QUOTATION) |
1277 | break_op = BREAK_PROHIBITED; /* Rule LB19 */ |
1278 | |
1279 | /* handle related rules for Space as state machine here, |
1280 | and override the pair table result. */ |
1281 | if (prev_break_type == G_UNICODE_BREAK_SPACE) /* Rule LB18 */ |
1282 | break_op = BREAK_ALLOWED; |
1283 | |
1284 | if (row_break_type == G_UNICODE_BREAK_BEFORE_AND_AFTER && |
1285 | break_type == G_UNICODE_BREAK_BEFORE_AND_AFTER) |
1286 | break_op = BREAK_PROHIBITED; /* Rule LB17 */ |
1287 | |
1288 | if ((row_break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION || |
1289 | row_break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS) && |
1290 | break_type == G_UNICODE_BREAK_NON_STARTER) |
1291 | break_op = BREAK_PROHIBITED; /* Rule LB16 */ |
1292 | |
1293 | if (row_break_type == G_UNICODE_BREAK_QUOTATION && |
1294 | break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION) |
1295 | break_op = BREAK_PROHIBITED; /* Rule LB15 */ |
1296 | |
1297 | if (row_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION) |
1298 | break_op = BREAK_PROHIBITED; /* Rule LB14 */ |
1299 | |
1300 | /* Rule LB13 with Example 7 of Customization */ |
1301 | if (break_type == G_UNICODE_BREAK_EXCLAMATION) |
1302 | break_op = BREAK_PROHIBITED; |
1303 | |
1304 | if (prev_break_type != G_UNICODE_BREAK_NUMERIC && |
1305 | (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION || |
1306 | break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS || |
1307 | break_type == G_UNICODE_BREAK_INFIX_SEPARATOR || |
1308 | break_type == G_UNICODE_BREAK_SYMBOL)) |
1309 | break_op = BREAK_PROHIBITED; |
1310 | |
1311 | if (prev_break_type == G_UNICODE_BREAK_NON_BREAKING_GLUE) |
1312 | break_op = BREAK_PROHIBITED; /* Rule LB12 */ |
1313 | |
1314 | if (break_type == G_UNICODE_BREAK_NON_BREAKING_GLUE && |
1315 | (prev_break_type != G_UNICODE_BREAK_SPACE && |
1316 | prev_break_type != G_UNICODE_BREAK_AFTER && |
1317 | prev_break_type != G_UNICODE_BREAK_HYPHEN)) |
1318 | break_op = BREAK_PROHIBITED; /* Rule LB12a */ |
1319 | |
1320 | if (prev_break_type == G_UNICODE_BREAK_WORD_JOINER || |
1321 | break_type == G_UNICODE_BREAK_WORD_JOINER) |
1322 | break_op = BREAK_PROHIBITED; /* Rule LB11 */ |
1323 | |
1324 | |
1325 | /* Rule LB9 */ |
1326 | if (break_type == G_UNICODE_BREAK_COMBINING_MARK || |
1327 | break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER) |
1328 | { |
1329 | if (!(prev_break_type == G_UNICODE_BREAK_MANDATORY || |
1330 | prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN || |
1331 | prev_break_type == G_UNICODE_BREAK_LINE_FEED || |
1332 | prev_break_type == G_UNICODE_BREAK_NEXT_LINE || |
1333 | prev_break_type == G_UNICODE_BREAK_SPACE || |
1334 | prev_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)) |
1335 | break_op = BREAK_PROHIBITED; |
1336 | } |
1337 | |
1338 | if (row_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE) |
1339 | break_op = BREAK_ALLOWED; /* Rule LB8 */ |
1340 | |
1341 | if (prev_wc == 0x200D) |
1342 | break_op = BREAK_PROHIBITED; /* Rule LB8a */ |
1343 | |
1344 | if (break_type == G_UNICODE_BREAK_SPACE || |
1345 | break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE) |
1346 | break_op = BREAK_PROHIBITED; /* Rule LB7 */ |
1347 | |
1348 | /* Rule LB6 */ |
1349 | if (break_type == G_UNICODE_BREAK_MANDATORY || |
1350 | break_type == G_UNICODE_BREAK_CARRIAGE_RETURN || |
1351 | break_type == G_UNICODE_BREAK_LINE_FEED || |
1352 | break_type == G_UNICODE_BREAK_NEXT_LINE) |
1353 | break_op = BREAK_PROHIBITED; |
1354 | |
1355 | /* Rules LB4 and LB5 */ |
1356 | if (prev_break_type == G_UNICODE_BREAK_MANDATORY || |
1357 | (prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN && |
1358 | wc != '\n') || |
1359 | prev_break_type == G_UNICODE_BREAK_LINE_FEED || |
1360 | prev_break_type == G_UNICODE_BREAK_NEXT_LINE) |
1361 | { |
1362 | attrs[i].is_mandatory_break = TRUE; |
1363 | break_op = BREAK_ALLOWED; |
1364 | } |
1365 | |
1366 | switch (break_op) |
1367 | { |
1368 | case BREAK_PROHIBITED: |
1369 | /* can't break here */ |
1370 | attrs[i].is_line_break = FALSE; |
1371 | break; |
1372 | |
1373 | case BREAK_IF_SPACES: |
1374 | /* break if prev char was space */ |
1375 | if (prev_break_type != G_UNICODE_BREAK_SPACE) |
1376 | attrs[i].is_line_break = FALSE; |
1377 | break; |
1378 | |
1379 | case BREAK_ALLOWED: |
1380 | attrs[i].is_line_break = TRUE; |
1381 | break; |
1382 | |
1383 | case BREAK_ALREADY_HANDLED: |
1384 | break; |
1385 | |
1386 | default: |
1387 | g_assert_not_reached (); |
1388 | break; |
1389 | } |
1390 | |
1391 | /* Rule LB9 */ |
1392 | if (!(break_type == G_UNICODE_BREAK_COMBINING_MARK || |
1393 | break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER)) |
1394 | { |
1395 | /* Rule LB25 with Example 7 of Customization */ |
1396 | if (break_type == G_UNICODE_BREAK_NUMERIC || |
1397 | break_type == G_UNICODE_BREAK_SYMBOL || |
1398 | break_type == G_UNICODE_BREAK_INFIX_SEPARATOR) |
1399 | { |
1400 | if (prev_LB_type != LB_Numeric) |
1401 | prev_LB_type = LB_type; |
1402 | /* else don't change the prev_LB_type */ |
1403 | } |
1404 | else |
1405 | { |
1406 | prev_LB_type = LB_type; |
1407 | } |
1408 | } |
1409 | /* else don't change the prev_LB_type for Rule LB9 */ |
1410 | } |
1411 | |
1412 | if (break_type != G_UNICODE_BREAK_SPACE) |
1413 | { |
1414 | /* Rule LB9 */ |
1415 | if (break_type == G_UNICODE_BREAK_COMBINING_MARK || |
1416 | break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER) |
1417 | { |
1418 | if (i == 0 /* start of text */ || |
1419 | prev_break_type == G_UNICODE_BREAK_MANDATORY || |
1420 | prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN || |
1421 | prev_break_type == G_UNICODE_BREAK_LINE_FEED || |
1422 | prev_break_type == G_UNICODE_BREAK_NEXT_LINE || |
1423 | prev_break_type == G_UNICODE_BREAK_SPACE || |
1424 | prev_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE) |
1425 | prev_break_type = G_UNICODE_BREAK_ALPHABETIC; /* Rule LB10 */ |
1426 | /* else don't change the prev_break_type for Rule LB9 */ |
1427 | } |
1428 | else |
1429 | { |
1430 | prev_prev_break_type = prev_break_type; |
1431 | prev_break_type = break_type; |
1432 | } |
1433 | |
1434 | prev_jamo = jamo; |
1435 | } |
1436 | else |
1437 | { |
1438 | if (prev_break_type != G_UNICODE_BREAK_SPACE) |
1439 | { |
1440 | prev_prev_break_type = prev_break_type; |
1441 | prev_break_type = break_type; |
1442 | } |
1443 | /* else don't change the prev_break_type */ |
1444 | } |
1445 | |
1446 | /* ---- Word breaks ---- */ |
1447 | |
1448 | /* default to not a word start/end */ |
1449 | attrs[i].is_word_start = FALSE; |
1450 | attrs[i].is_word_end = FALSE; |
1451 | |
1452 | if (current_word_type != WordNone) |
1453 | { |
1454 | /* Check for a word end */ |
1455 | switch ((int) type) |
1456 | { |
1457 | case G_UNICODE_SPACING_MARK: |
1458 | case G_UNICODE_ENCLOSING_MARK: |
1459 | case G_UNICODE_NON_SPACING_MARK: |
1460 | case G_UNICODE_FORMAT: |
1461 | /* nothing, we just eat these up as part of the word */ |
1462 | break; |
1463 | |
1464 | case G_UNICODE_LOWERCASE_LETTER: |
1465 | case G_UNICODE_MODIFIER_LETTER: |
1466 | case G_UNICODE_OTHER_LETTER: |
1467 | case G_UNICODE_TITLECASE_LETTER: |
1468 | case G_UNICODE_UPPERCASE_LETTER: |
1469 | if (current_word_type == WordLetters) |
1470 | { |
1471 | /* Japanese special cases for ending the word */ |
1472 | if (JAPANESE (last_word_letter) || |
1473 | JAPANESE (wc)) |
1474 | { |
1475 | if ((HIRAGANA (last_word_letter) && |
1476 | !HIRAGANA (wc)) || |
1477 | (KATAKANA (last_word_letter) && |
1478 | !(KATAKANA (wc) || HIRAGANA (wc))) || |
1479 | (KANJI (last_word_letter) && |
1480 | !(HIRAGANA (wc) || KANJI (wc))) || |
1481 | (JAPANESE (last_word_letter) && |
1482 | !JAPANESE (wc)) || |
1483 | (!JAPANESE (last_word_letter) && |
1484 | JAPANESE (wc))) |
1485 | attrs[i].is_word_end = TRUE; |
1486 | } |
1487 | } |
1488 | last_word_letter = wc; |
1489 | break; |
1490 | |
1491 | case G_UNICODE_DECIMAL_NUMBER: |
1492 | case G_UNICODE_LETTER_NUMBER: |
1493 | case G_UNICODE_OTHER_NUMBER: |
1494 | last_word_letter = wc; |
1495 | break; |
1496 | |
1497 | default: |
1498 | /* Punctuation, control/format chars, etc. all end a word. */ |
1499 | attrs[i].is_word_end = TRUE; |
1500 | current_word_type = WordNone; |
1501 | break; |
1502 | } |
1503 | } |
1504 | else |
1505 | { |
1506 | /* Check for a word start */ |
1507 | switch ((int) type) |
1508 | { |
1509 | case G_UNICODE_LOWERCASE_LETTER: |
1510 | case G_UNICODE_MODIFIER_LETTER: |
1511 | case G_UNICODE_OTHER_LETTER: |
1512 | case G_UNICODE_TITLECASE_LETTER: |
1513 | case G_UNICODE_UPPERCASE_LETTER: |
1514 | current_word_type = WordLetters; |
1515 | last_word_letter = wc; |
1516 | attrs[i].is_word_start = TRUE; |
1517 | break; |
1518 | |
1519 | case G_UNICODE_DECIMAL_NUMBER: |
1520 | case G_UNICODE_LETTER_NUMBER: |
1521 | case G_UNICODE_OTHER_NUMBER: |
1522 | current_word_type = WordNumbers; |
1523 | last_word_letter = wc; |
1524 | attrs[i].is_word_start = TRUE; |
1525 | break; |
1526 | |
1527 | default: |
1528 | /* No word here */ |
1529 | break; |
1530 | } |
1531 | } |
1532 | |
1533 | /* ---- Sentence breaks ---- */ |
1534 | { |
1535 | |
1536 | /* default to not a sentence start/end */ |
1537 | attrs[i].is_sentence_start = FALSE; |
1538 | attrs[i].is_sentence_end = FALSE; |
1539 | |
1540 | /* maybe start sentence */ |
1541 | if (last_sentence_start == -1 && !is_sentence_boundary) |
1542 | last_sentence_start = i - 1; |
1543 | |
1544 | /* remember last non space character position */ |
1545 | if (i > 0 && !attrs[i - 1].is_white) |
1546 | last_non_space = i; |
1547 | |
1548 | /* meets sentence end, mark both sentence start and end */ |
1549 | if (last_sentence_start != -1 && is_sentence_boundary) { |
1550 | if (last_non_space >= last_sentence_start) { |
1551 | attrs[last_sentence_start].is_sentence_start = TRUE; |
1552 | attrs[last_non_space].is_sentence_end = TRUE; |
1553 | } |
1554 | |
1555 | last_sentence_start = -1; |
1556 | last_non_space = -1; |
1557 | } |
1558 | |
1559 | /* meets space character, move sentence start */ |
1560 | if (last_sentence_start != -1 && |
1561 | last_sentence_start == i - 1 && |
1562 | attrs[i - 1].is_white) { |
1563 | last_sentence_start++; |
1564 | } |
1565 | } |
1566 | |
1567 | /* --- Hyphens --- */ |
1568 | |
1569 | { |
1570 | gboolean insert_hyphens; |
1571 | gboolean space_or_hyphen = FALSE; |
1572 | |
1573 | attrs[i].break_inserts_hyphen = FALSE; |
1574 | attrs[i].break_removes_preceding = FALSE; |
1575 | |
1576 | switch ((int)prev_script) |
1577 | { |
1578 | case PANGO_SCRIPT_COMMON: |
1579 | insert_hyphens = prev_wc == 0x00ad; |
1580 | break; |
1581 | case PANGO_SCRIPT_HAN: |
1582 | case PANGO_SCRIPT_HANGUL: |
1583 | case PANGO_SCRIPT_HIRAGANA: |
1584 | case PANGO_SCRIPT_KATAKANA: |
1585 | insert_hyphens = FALSE; |
1586 | break; |
1587 | default: |
1588 | insert_hyphens = TRUE; |
1589 | break; |
1590 | } |
1591 | |
1592 | switch ((int)type) |
1593 | { |
1594 | case G_UNICODE_SPACE_SEPARATOR: |
1595 | case G_UNICODE_LINE_SEPARATOR: |
1596 | case G_UNICODE_PARAGRAPH_SEPARATOR: |
1597 | space_or_hyphen = TRUE; |
1598 | break; |
1599 | case G_UNICODE_CONTROL: |
1600 | if (wc == '\t' || wc == '\n' || wc == '\r' || wc == '\f') |
1601 | space_or_hyphen = TRUE; |
1602 | break; |
1603 | default: |
1604 | break; |
1605 | } |
1606 | |
1607 | if (!space_or_hyphen) |
1608 | { |
1609 | if (wc == '-' || /* Hyphen-minus */ |
1610 | wc == 0x058a || /* Armenian hyphen */ |
1611 | wc == 0x1400 || /* Canadian syllabics hyphen */ |
1612 | wc == 0x1806 || /* Mongolian todo hyphen */ |
1613 | wc == 0x2010 || /* Hyphen */ |
1614 | wc == 0x2e17 || /* Double oblique hyphen */ |
1615 | wc == 0x2e40 || /* Double hyphen */ |
1616 | wc == 0x30a0 || /* Katakana-Hiragana double hyphen */ |
1617 | wc == 0xfe63 || /* Small hyphen-minus */ |
1618 | wc == 0xff0d) /* Fullwidth hyphen-minus */ |
1619 | space_or_hyphen = TRUE; |
1620 | } |
1621 | |
1622 | if (attrs[i].is_word_boundary) |
1623 | attrs[i].break_inserts_hyphen = FALSE; |
1624 | else if (prev_space_or_hyphen) |
1625 | attrs[i].break_inserts_hyphen = FALSE; |
1626 | else if (space_or_hyphen) |
1627 | attrs[i].break_inserts_hyphen = FALSE; |
1628 | else |
1629 | attrs[i].break_inserts_hyphen = insert_hyphens; |
1630 | |
1631 | if (prev_wc == 0x2027) /* Hyphenation point */ |
1632 | { |
1633 | attrs[i].break_inserts_hyphen = TRUE; |
1634 | attrs[i].break_removes_preceding = TRUE; |
1635 | } |
1636 | |
1637 | prev_space_or_hyphen = space_or_hyphen; |
1638 | } |
1639 | |
1640 | prev_wc = wc; |
1641 | prev_script = script; |
1642 | |
1643 | /* wc might not be a valid Unicode base character, but really all we |
1644 | * need to know is the last non-combining character */ |
1645 | if (type != G_UNICODE_SPACING_MARK && |
1646 | type != G_UNICODE_ENCLOSING_MARK && |
1647 | type != G_UNICODE_NON_SPACING_MARK) |
1648 | base_character = wc; |
1649 | } |
1650 | |
1651 | i--; |
1652 | |
1653 | attrs[0].is_cursor_position = TRUE; /* Rule GB1 */ |
1654 | attrs[i].is_cursor_position = TRUE; /* Rule GB2 */ |
1655 | |
1656 | attrs[0].is_word_boundary = TRUE; /* Rule WB1 */ |
1657 | attrs[i].is_word_boundary = TRUE; /* Rule WB2 */ |
1658 | |
1659 | attrs[0].is_line_break = FALSE; /* Rule LB2 */ |
1660 | attrs[i].is_line_break = TRUE; /* Rule LB3 */ |
1661 | attrs[i].is_mandatory_break = TRUE; /* Rule LB3 */ |
1662 | } |
1663 | |
1664 | /* }}} */ |
1665 | /* {{{ Tailoring */ |
1666 | /* {{{ Script-specific tailoring */ |
1667 | |
1668 | #include "break-arabic.c" |
1669 | #include "break-indic.c" |
1670 | #include "break-thai.c" |
1671 | #include "break-latin.c" |
1672 | |
1673 | static gboolean |
1674 | break_script (const char *item_text, |
1675 | unsigned int item_length, |
1676 | const PangoAnalysis *analysis, |
1677 | PangoLogAttr *attrs, |
1678 | int attrs_len) |
1679 | { |
1680 | switch (analysis->script) |
1681 | { |
1682 | case PANGO_SCRIPT_ARABIC: |
1683 | break_arabic (text: item_text, length: item_length, analysis, attrs, attrs_len); |
1684 | break; |
1685 | |
1686 | case PANGO_SCRIPT_DEVANAGARI: |
1687 | case PANGO_SCRIPT_BENGALI: |
1688 | case PANGO_SCRIPT_GURMUKHI: |
1689 | case PANGO_SCRIPT_GUJARATI: |
1690 | case PANGO_SCRIPT_ORIYA: |
1691 | case PANGO_SCRIPT_TAMIL: |
1692 | case PANGO_SCRIPT_TELUGU: |
1693 | case PANGO_SCRIPT_KANNADA: |
1694 | case PANGO_SCRIPT_MALAYALAM: |
1695 | case PANGO_SCRIPT_SINHALA: |
1696 | break_indic (text: item_text, length: item_length, analysis, attrs, attrs_len); |
1697 | break; |
1698 | |
1699 | case PANGO_SCRIPT_THAI: |
1700 | break_thai (text: item_text, len: item_length, analysis, attrs, attrs_len); |
1701 | break; |
1702 | |
1703 | case PANGO_SCRIPT_LATIN: |
1704 | break_latin (text: item_text, length: item_length, analysis, attrs, attrs_len); |
1705 | break; |
1706 | |
1707 | default: |
1708 | return FALSE; |
1709 | } |
1710 | |
1711 | return TRUE; |
1712 | } |
1713 | |
1714 | /* }}} */ |
1715 | /* {{{ Attribute-based customization */ |
1716 | |
1717 | /* We allow customizing log attrs in two ways: |
1718 | * |
1719 | * - You can directly remove breaks from a range, using allow_breaks=false. |
1720 | * We preserve the non-tailorable rules from UAX #14, so mandatory breaks |
1721 | * and breaks after ZWS remain. We also preserve break opportunities after |
1722 | * hyphens and visible word dividers. |
1723 | * |
1724 | * - You can tweak the segmentation by marking ranges as word or sentence. |
1725 | * When doing so, we split adjacent segments to preserve alternating |
1726 | * starts and ends. We add a line break opportunity before each word that |
1727 | * is created in this way, and we remove line break opportunities inside |
1728 | * the word in the same way as for a range marked as allow_breaks=false, |
1729 | * except that we don't remove char break opportunities. |
1730 | * |
 * Note that UAX #29 does not guarantee that words fall neatly into
1732 | * sentences, so we don't do extra work to enforce that. |
1733 | */ |
1734 | |
1735 | static void |
1736 | remove_breaks_from_range (const char *text, |
1737 | int start, |
1738 | PangoLogAttr *log_attrs, |
1739 | int start_pos, |
1740 | int end_pos) |
1741 | { |
1742 | int pos; |
1743 | const char *p; |
1744 | gunichar ch; |
1745 | int bt; |
1746 | gboolean after_zws; |
1747 | gboolean after_hyphen; |
1748 | |
1749 | /* Assume our range doesn't start after a hyphen or in a zws sequence */ |
1750 | after_zws = FALSE; |
1751 | after_hyphen = FALSE; |
1752 | for (pos = start_pos + 1, p = g_utf8_next_char (text + start); |
1753 | pos < end_pos; |
1754 | pos++, p = g_utf8_next_char (p)) |
1755 | { |
1756 | /* Mandatory breaks aren't tailorable */ |
1757 | if (!log_attrs[pos].is_mandatory_break) |
1758 | log_attrs[pos].is_line_break = FALSE; |
1759 | |
1760 | ch = g_utf8_get_char (p); |
1761 | bt = g_unichar_break_type (c: ch); |
1762 | |
1763 | /* Hyphens and visible word dividers */ |
1764 | if (after_hyphen) |
1765 | log_attrs[pos].is_line_break = TRUE; |
1766 | |
1767 | after_hyphen = ch == 0x00ad || /* Soft Hyphen */ |
1768 | ch == 0x05A0 || ch == 0x2010 || /* Breaking Hyphens */ |
1769 | ch == 0x2012 || ch == 0x2013 || |
1770 | ch == 0x05BE || ch == 0x0F0B || /* Visible word dividers */ |
1771 | ch == 0x1361 || ch == 0x17D8 || |
1772 | ch == 0x17DA || ch == 0x2027 || |
1773 | ch == 0x007C; |
1774 | |
1775 | /* ZWS sequence */ |
1776 | if (after_zws && bt != G_UNICODE_BREAK_SPACE) |
1777 | log_attrs[pos].is_line_break = TRUE; |
1778 | |
1779 | after_zws = bt == G_UNICODE_BREAK_ZERO_WIDTH_SPACE || |
1780 | (bt == G_UNICODE_BREAK_SPACE && after_zws); |
1781 | } |
1782 | } |
1783 | |
1784 | static gboolean |
1785 | handle_allow_breaks (const char *text, |
1786 | int length, |
1787 | PangoAttrList *attrs, |
1788 | int offset, |
1789 | PangoLogAttr *log_attrs, |
1790 | int log_attrs_len) |
1791 | { |
1792 | PangoAttrIterator iter; |
1793 | gboolean tailored = FALSE; |
1794 | |
1795 | _pango_attr_list_get_iterator (list: attrs, iterator: &iter); |
1796 | |
1797 | do |
1798 | { |
1799 | const PangoAttribute *attr = pango_attr_iterator_get (iterator: &iter, type: PANGO_ATTR_ALLOW_BREAKS); |
1800 | |
1801 | if (!attr) |
1802 | continue; |
1803 | |
1804 | if (!((PangoAttrInt*)attr)->value) |
1805 | { |
1806 | int start, end; |
1807 | int start_pos, end_pos; |
1808 | int pos; |
1809 | |
1810 | start = attr->start_index; |
1811 | end = attr->end_index; |
1812 | if (start < offset) |
1813 | start_pos = 0; |
1814 | else |
1815 | start_pos = g_utf8_pointer_to_offset (str: text, pos: text + start - offset); |
1816 | if (end >= offset + length) |
1817 | end_pos = log_attrs_len; |
1818 | else |
1819 | end_pos = g_utf8_pointer_to_offset (str: text, pos: text + end - offset); |
1820 | |
1821 | for (pos = start_pos + 1; pos < end_pos; pos++) |
1822 | log_attrs[pos].is_char_break = FALSE; |
1823 | |
1824 | remove_breaks_from_range (text, MAX (start - offset, 0), log_attrs, start_pos, end_pos); |
1825 | |
1826 | tailored = TRUE; |
1827 | } |
1828 | } |
1829 | while (pango_attr_iterator_next (iterator: &iter)); |
1830 | |
1831 | _pango_attr_iterator_destroy (iterator: &iter); |
1832 | |
1833 | return tailored; |
1834 | } |
1835 | |
1836 | |
1837 | static gboolean |
1838 | handle_words (const char *text, |
1839 | int length, |
1840 | PangoAttrList *attrs, |
1841 | int offset, |
1842 | PangoLogAttr *log_attrs, |
1843 | int log_attrs_len) |
1844 | { |
1845 | PangoAttrIterator iter; |
1846 | gboolean tailored = FALSE; |
1847 | |
1848 | _pango_attr_list_get_iterator (list: attrs, iterator: &iter); |
1849 | |
1850 | do |
1851 | { |
1852 | const PangoAttribute *attr = pango_attr_iterator_get (iterator: &iter, type: PANGO_ATTR_WORD); |
1853 | int start, end; |
1854 | int start_pos, end_pos; |
1855 | int pos; |
1856 | |
1857 | if (!attr) |
1858 | continue; |
1859 | |
1860 | start = attr->start_index; |
1861 | end = attr->end_index; |
1862 | if (start < offset) |
1863 | start_pos = 0; |
1864 | else |
1865 | start_pos = g_utf8_pointer_to_offset (str: text, pos: text + start - offset); |
1866 | if (end >= offset + length) |
1867 | end_pos = log_attrs_len; |
1868 | else |
1869 | end_pos = g_utf8_pointer_to_offset (str: text, pos: text + end - offset); |
1870 | |
1871 | for (pos = start_pos + 1; pos < end_pos; pos++) |
1872 | { |
1873 | log_attrs[pos].is_word_start = FALSE; |
1874 | log_attrs[pos].is_word_end = FALSE; |
1875 | log_attrs[pos].is_word_boundary = FALSE; |
1876 | } |
1877 | |
1878 | remove_breaks_from_range (text, MAX (start - offset, 0), log_attrs, |
1879 | start_pos, end_pos); |
1880 | |
1881 | if (start >= offset) |
1882 | { |
1883 | gboolean in_word = FALSE; |
1884 | for (pos = start_pos; pos >= 0; pos--) |
1885 | { |
1886 | if (log_attrs[pos].is_word_end) |
1887 | { |
1888 | in_word = pos == start_pos; |
1889 | break; |
1890 | } |
1891 | if (pos < start_pos && log_attrs[pos].is_word_start) |
1892 | { |
1893 | in_word = TRUE; |
1894 | break; |
1895 | } |
1896 | } |
1897 | log_attrs[start_pos].is_word_start = TRUE; |
1898 | log_attrs[start_pos].is_word_end = in_word; |
1899 | log_attrs[start_pos].is_word_boundary = TRUE; |
1900 | |
1901 | /* Allow line breaks before words */ |
1902 | if (start_pos > 0) |
1903 | log_attrs[start_pos].is_line_break = TRUE; |
1904 | |
1905 | tailored = TRUE; |
1906 | } |
1907 | |
1908 | if (end < offset + length) |
1909 | { |
1910 | gboolean in_word = FALSE; |
1911 | for (pos = end_pos; pos < log_attrs_len; pos++) |
1912 | { |
1913 | if (log_attrs[pos].is_word_start) |
1914 | { |
1915 | in_word = pos == end_pos; |
1916 | break; |
1917 | } |
1918 | if (pos > end_pos && log_attrs[pos].is_word_end) |
1919 | { |
1920 | in_word = TRUE; |
1921 | break; |
1922 | } |
1923 | } |
1924 | log_attrs[end_pos].is_word_start = in_word; |
1925 | log_attrs[end_pos].is_word_end = TRUE; |
1926 | log_attrs[end_pos].is_word_boundary = TRUE; |
1927 | |
1928 | /* Allow line breaks before words */ |
1929 | if (in_word) |
1930 | log_attrs[end_pos].is_line_break = TRUE; |
1931 | |
1932 | tailored = TRUE; |
1933 | } |
1934 | } |
1935 | while (pango_attr_iterator_next (iterator: &iter)); |
1936 | |
1937 | _pango_attr_iterator_destroy (iterator: &iter); |
1938 | |
1939 | return tailored; |
1940 | } |
1941 | |
1942 | static gboolean |
1943 | handle_sentences (const char *text, |
1944 | int length, |
1945 | PangoAttrList *attrs, |
1946 | int offset, |
1947 | PangoLogAttr *log_attrs, |
1948 | int log_attrs_len) |
1949 | { |
1950 | PangoAttrIterator iter; |
1951 | gboolean tailored = FALSE; |
1952 | |
1953 | _pango_attr_list_get_iterator (list: attrs, iterator: &iter); |
1954 | |
1955 | do |
1956 | { |
1957 | const PangoAttribute *attr = pango_attr_iterator_get (iterator: &iter, type: PANGO_ATTR_SENTENCE); |
1958 | int start, end; |
1959 | int start_pos, end_pos; |
1960 | int pos; |
1961 | |
1962 | if (!attr) |
1963 | continue; |
1964 | |
1965 | start = attr->start_index; |
1966 | end = attr->end_index; |
1967 | if (start < offset) |
1968 | start_pos = 0; |
1969 | else |
1970 | start_pos = g_utf8_pointer_to_offset (str: text, pos: text + start - offset); |
1971 | if (end >= offset + length) |
1972 | end_pos = log_attrs_len; |
1973 | else |
1974 | end_pos = g_utf8_pointer_to_offset (str: text, pos: text + end - offset); |
1975 | |
1976 | for (pos = start_pos + 1; pos < end_pos; pos++) |
1977 | { |
1978 | log_attrs[pos].is_sentence_start = FALSE; |
1979 | log_attrs[pos].is_sentence_end = FALSE; |
1980 | log_attrs[pos].is_sentence_boundary = FALSE; |
1981 | |
1982 | tailored = TRUE; |
1983 | } |
1984 | if (start >= offset) |
1985 | { |
1986 | gboolean in_sentence = FALSE; |
1987 | for (pos = start_pos - 1; pos >= 0; pos--) |
1988 | { |
1989 | if (log_attrs[pos].is_sentence_end) |
1990 | break; |
1991 | if (log_attrs[pos].is_sentence_start) |
1992 | { |
1993 | in_sentence = TRUE; |
1994 | break; |
1995 | } |
1996 | } |
1997 | log_attrs[start_pos].is_sentence_start = TRUE; |
1998 | log_attrs[start_pos].is_sentence_end = in_sentence; |
1999 | log_attrs[start_pos].is_sentence_boundary = TRUE; |
2000 | |
2001 | tailored = TRUE; |
2002 | } |
2003 | if (end < offset + length) |
2004 | { |
2005 | gboolean in_sentence = FALSE; |
2006 | for (pos = end_pos + 1; end_pos < log_attrs_len; pos++) |
2007 | { |
2008 | if (log_attrs[pos].is_sentence_start) |
2009 | break; |
2010 | if (log_attrs[pos].is_sentence_end) |
2011 | { |
2012 | in_sentence = TRUE; |
2013 | break; |
2014 | } |
2015 | } |
2016 | log_attrs[end_pos].is_sentence_start = in_sentence; |
2017 | log_attrs[end_pos].is_sentence_end = TRUE; |
2018 | log_attrs[end_pos].is_sentence_boundary = TRUE; |
2019 | |
2020 | tailored = TRUE; |
2021 | } |
2022 | } |
2023 | while (pango_attr_iterator_next (iterator: &iter)); |
2024 | |
2025 | _pango_attr_iterator_destroy (iterator: &iter); |
2026 | |
2027 | return tailored; |
2028 | } |
2029 | |
2030 | static gboolean |
2031 | handle_hyphens (const char *text, |
2032 | int length, |
2033 | PangoAttrList *attrs, |
2034 | int offset, |
2035 | PangoLogAttr *log_attrs, |
2036 | int log_attrs_len) |
2037 | { |
2038 | PangoAttrIterator iter; |
2039 | gboolean tailored = FALSE; |
2040 | |
2041 | _pango_attr_list_get_iterator (list: attrs, iterator: &iter); |
2042 | |
2043 | do { |
2044 | const PangoAttribute *attr = pango_attr_iterator_get (iterator: &iter, type: PANGO_ATTR_INSERT_HYPHENS); |
2045 | |
2046 | if (attr && ((PangoAttrInt*)attr)->value == 0) |
2047 | { |
2048 | int start, end; |
2049 | int start_pos, end_pos; |
2050 | int pos; |
2051 | |
2052 | pango_attr_iterator_range (iterator: &iter, start: &start, end: &end); |
2053 | if (start < offset) |
2054 | start_pos = 0; |
2055 | else |
2056 | start_pos = g_utf8_pointer_to_offset (str: text, pos: text + start - offset); |
2057 | if (end >= offset + length) |
2058 | end_pos = log_attrs_len; |
2059 | else |
2060 | end_pos = g_utf8_pointer_to_offset (str: text, pos: text + end - offset); |
2061 | |
2062 | for (pos = start_pos + 1; pos < end_pos; pos++) |
2063 | { |
2064 | if (!log_attrs[pos].break_removes_preceding) |
2065 | { |
2066 | log_attrs[pos].break_inserts_hyphen = FALSE; |
2067 | |
2068 | tailored = TRUE; |
2069 | } |
2070 | } |
2071 | } |
2072 | } while (pango_attr_iterator_next (iterator: &iter)); |
2073 | |
2074 | _pango_attr_iterator_destroy (iterator: &iter); |
2075 | |
2076 | return tailored; |
2077 | } |
2078 | |
2079 | static gboolean |
2080 | break_attrs (const char *text, |
2081 | int length, |
2082 | GSList *attributes, |
2083 | int offset, |
2084 | PangoLogAttr *log_attrs, |
2085 | int log_attrs_len) |
2086 | { |
2087 | PangoAttrList allow_breaks; |
2088 | PangoAttrList words; |
2089 | PangoAttrList sentences; |
2090 | PangoAttrList hyphens; |
2091 | GSList *l; |
2092 | gboolean tailored = FALSE; |
2093 | |
2094 | _pango_attr_list_init (list: &allow_breaks); |
2095 | _pango_attr_list_init (list: &words); |
2096 | _pango_attr_list_init (list: &sentences); |
2097 | _pango_attr_list_init (list: &hyphens); |
2098 | |
2099 | for (l = attributes; l; l = l->next) |
2100 | { |
2101 | PangoAttribute *attr = l->data; |
2102 | |
2103 | if (attr->klass->type == PANGO_ATTR_ALLOW_BREAKS) |
2104 | pango_attr_list_insert (list: &allow_breaks, attr: pango_attribute_copy (attr)); |
2105 | else if (attr->klass->type == PANGO_ATTR_WORD) |
2106 | pango_attr_list_insert (list: &words, attr: pango_attribute_copy (attr)); |
2107 | else if (attr->klass->type == PANGO_ATTR_SENTENCE) |
2108 | pango_attr_list_insert (list: &sentences, attr: pango_attribute_copy (attr)); |
2109 | else if (attr->klass->type == PANGO_ATTR_INSERT_HYPHENS) |
2110 | pango_attr_list_insert (list: &hyphens, attr: pango_attribute_copy (attr)); |
2111 | } |
2112 | |
2113 | tailored |= handle_words (text, length, attrs: &words, offset, |
2114 | log_attrs, log_attrs_len); |
2115 | |
2116 | tailored |= handle_sentences (text, length, attrs: &words, offset, |
2117 | log_attrs, log_attrs_len); |
2118 | |
2119 | tailored |= handle_hyphens (text, length, attrs: &hyphens, offset, |
2120 | log_attrs, log_attrs_len); |
2121 | |
2122 | tailored |= handle_allow_breaks (text, length, attrs: &allow_breaks, offset, |
2123 | log_attrs, log_attrs_len); |
2124 | |
2125 | _pango_attr_list_destroy (list: &allow_breaks); |
2126 | _pango_attr_list_destroy (list: &words); |
2127 | _pango_attr_list_destroy (list: &sentences); |
2128 | _pango_attr_list_destroy (list: &hyphens); |
2129 | |
2130 | return tailored; |
2131 | } |
2132 | |
2133 | /* }}} */ |
2134 | |
2135 | static gboolean |
2136 | tailor_break (const char *text, |
2137 | int length, |
2138 | PangoAnalysis *analysis, |
2139 | int item_offset, |
2140 | PangoLogAttr *attrs, |
2141 | int attrs_len) |
2142 | { |
2143 | gboolean res; |
2144 | |
2145 | if (length < 0) |
2146 | length = strlen (s: text); |
2147 | else if (text == NULL) |
2148 | text = "" ; |
2149 | |
2150 | res = break_script (item_text: text, item_length: length, analysis, attrs, attrs_len); |
2151 | |
2152 | if (item_offset >= 0 && analysis->extra_attrs) |
2153 | res |= break_attrs (text, length, attributes: analysis->extra_attrs, offset: item_offset, log_attrs: attrs, log_attrs_len: attrs_len); |
2154 | |
2155 | return res; |
2156 | } |
2157 | |
2158 | /* }}} */ |
2159 | /* {{{ Public API */ |
2160 | |
2161 | /** |
2162 | * pango_default_break: |
2163 | * @text: text to break. Must be valid UTF-8 |
2164 | * @length: length of text in bytes (may be -1 if @text is nul-terminated) |
2165 | * @analysis: (nullable): a `PangoAnalysis` structure for the @text |
2166 | * @attrs: logical attributes to fill in |
2167 | * @attrs_len: size of the array passed as @attrs |
2168 | * |
2169 | * This is the default break algorithm. |
2170 | * |
2171 | * It applies rules from the [Unicode Line Breaking Algorithm](http://www.unicode.org/unicode/reports/tr14/) |
 * without language-specific tailoring, therefore the @analysis argument is unused
2173 | * and can be %NULL. |
2174 | * |
2175 | * See [func@Pango.tailor_break] for language-specific breaks. |
2176 | * |
2177 | * See [func@Pango.attr_break] for attribute-based customization. |
2178 | */ |
2179 | void |
2180 | pango_default_break (const char *text, |
2181 | int length, |
2182 | PangoAnalysis *analysis G_GNUC_UNUSED, |
2183 | PangoLogAttr *attrs, |
2184 | int attrs_len G_GNUC_UNUSED) |
2185 | { |
2186 | PangoLogAttr before = *attrs; |
2187 | |
2188 | default_break (text, length, analysis, attrs, attrs_len); |
2189 | |
2190 | attrs->is_line_break |= before.is_line_break; |
2191 | attrs->is_mandatory_break |= before.is_mandatory_break; |
2192 | attrs->is_cursor_position |= before.is_cursor_position; |
2193 | } |
2194 | |
2195 | /** |
2196 | * pango_break: |
2197 | * @text: the text to process. Must be valid UTF-8 |
2198 | * @length: length of @text in bytes (may be -1 if @text is nul-terminated) |
2199 | * @analysis: `PangoAnalysis` structure for @text |
2200 | * @attrs: (array length=attrs_len): an array to store character information in |
2201 | * @attrs_len: size of the array passed as @attrs |
2202 | * |
2203 | * Determines possible line, word, and character breaks |
2204 | * for a string of Unicode text with a single analysis. |
2205 | * |
2206 | * For most purposes you may want to use [func@Pango.get_log_attrs]. |
2207 | * |
2208 | * Deprecated: 1.44: Use [func@Pango.default_break], |
2209 | * [func@Pango.tailor_break] and [func@Pango.attr_break]. |
2210 | */ |
2211 | void |
2212 | pango_break (const char *text, |
2213 | gint length, |
2214 | PangoAnalysis *analysis, |
2215 | PangoLogAttr *attrs, |
2216 | int attrs_len) |
2217 | { |
2218 | g_return_if_fail (analysis != NULL); |
2219 | g_return_if_fail (attrs != NULL); |
2220 | |
2221 | default_break (text, length, analysis, attrs, attrs_len); |
2222 | tailor_break (text, length, analysis, item_offset: -1, attrs, attrs_len); |
2223 | } |
2224 | |
2225 | /** |
2226 | * pango_tailor_break: |
2227 | * @text: text to process. Must be valid UTF-8 |
2228 | * @length: length in bytes of @text |
2229 | * @analysis: `PangoAnalysis` for @text |
2230 | * @offset: Byte offset of @text from the beginning of the |
2231 | * paragraph, or -1 to ignore attributes from @analysis |
2232 | * @attrs: (array length=attrs_len): array with one `PangoLogAttr` |
2233 | * per character in @text, plus one extra, to be filled in |
2234 | * @attrs_len: length of @attrs array |
2235 | * |
2236 | * Apply language-specific tailoring to the breaks in @attrs. |
2237 | * |
2238 | * The line breaks are assumed to have been produced by [func@Pango.default_break]. |
2239 | * |
2240 | * If @offset is not -1, it is used to apply attributes from @analysis that are |
2241 | * relevant to line breaking. |
2242 | * |
2243 | * Note that it is better to pass -1 for @offset and use [func@Pango.attr_break] |
2244 | * to apply attributes to the whole paragraph. |
2245 | * |
2246 | * Since: 1.44 |
2247 | */ |
2248 | void |
2249 | pango_tailor_break (const char *text, |
2250 | int length, |
2251 | PangoAnalysis *analysis, |
2252 | int offset, |
2253 | PangoLogAttr *attrs, |
2254 | int attrs_len) |
2255 | { |
2256 | PangoLogAttr *start = attrs; |
2257 | PangoLogAttr attr_before = *start; |
2258 | |
2259 | if (tailor_break (text, length, analysis, item_offset: offset, attrs, attrs_len)) |
2260 | { |
2261 | /* if tailored, we enforce some of the attrs from before |
2262 | * tailoring at the boundary |
2263 | */ |
2264 | |
2265 | start->backspace_deletes_character = attr_before.backspace_deletes_character; |
2266 | |
2267 | start->is_line_break |= attr_before.is_line_break; |
2268 | start->is_mandatory_break |= attr_before.is_mandatory_break; |
2269 | start->is_cursor_position |= attr_before.is_cursor_position; |
2270 | } |
2271 | } |
2272 | |
2273 | /** |
2274 | * pango_attr_break: |
2275 | * @text: text to break. Must be valid UTF-8 |
2276 | * @length: length of text in bytes (may be -1 if @text is nul-terminated) |
2277 | * @attr_list: `PangoAttrList` to apply |
2278 | * @offset: Byte offset of @text from the beginning of the paragraph |
2279 | * @attrs: (array length=attrs_len): array with one `PangoLogAttr` |
2280 | * per character in @text, plus one extra, to be filled in |
2281 | * @attrs_len: length of @attrs array |
2282 | * |
2283 | * Apply customization from attributes to the breaks in @attrs. |
2284 | * |
2285 | * The line breaks are assumed to have been produced |
2286 | * by [func@Pango.default_break] and [func@Pango.tailor_break]. |
2287 | * |
2288 | * Since: 1.50 |
2289 | */ |
2290 | void |
2291 | pango_attr_break (const char *text, |
2292 | int length, |
2293 | PangoAttrList *attr_list, |
2294 | int offset, |
2295 | PangoLogAttr *attrs, |
2296 | int attrs_len) |
2297 | { |
2298 | PangoLogAttr *start = attrs; |
2299 | PangoLogAttr attr_before = *start; |
2300 | GSList *attributes; |
2301 | |
2302 | attributes = pango_attr_list_get_attributes (list: attr_list); |
2303 | if (break_attrs (text, length, attributes, offset, log_attrs: attrs, log_attrs_len: attrs_len)) |
2304 | { |
2305 | /* if tailored, we enforce some of the attrs from before |
2306 | * tailoring at the boundary |
2307 | */ |
2308 | |
2309 | start->backspace_deletes_character = attr_before.backspace_deletes_character; |
2310 | |
2311 | start->is_line_break |= attr_before.is_line_break; |
2312 | start->is_mandatory_break |= attr_before.is_mandatory_break; |
2313 | start->is_cursor_position |= attr_before.is_cursor_position; |
2314 | } |
2315 | |
2316 | g_slist_free_full (list: attributes, free_func: (GDestroyNotify)pango_attribute_destroy); |
2317 | } |
2318 | |
2319 | /** |
2320 | * pango_get_log_attrs: |
2321 | * @text: text to process. Must be valid UTF-8 |
2322 | * @length: length in bytes of @text |
2323 | * @level: embedding level, or -1 if unknown |
2324 | * @language: language tag |
2325 | * @attrs: (array length=attrs_len): array with one `PangoLogAttr` |
2326 | * per character in @text, plus one extra, to be filled in |
2327 | * @attrs_len: length of @attrs array |
2328 | * |
2329 | * Computes a `PangoLogAttr` for each character in @text. |
2330 | * |
2331 | * The @attrs array must have one `PangoLogAttr` for |
2332 | * each position in @text; if @text contains N characters, |
2333 | * it has N+1 positions, including the last position at the |
2334 | * end of the text. @text should be an entire paragraph; |
2335 | * logical attributes can't be computed without context |
2336 | * (for example you need to see spaces on either side of |
2337 | * a word to know the word is a word). |
2338 | */ |
2339 | void |
2340 | pango_get_log_attrs (const char *text, |
2341 | int length, |
2342 | int level, |
2343 | PangoLanguage *language, |
2344 | PangoLogAttr *attrs, |
2345 | int attrs_len) |
2346 | { |
2347 | int chars_broken; |
2348 | PangoAnalysis analysis = { NULL }; |
2349 | PangoScriptIter iter; |
2350 | |
2351 | g_return_if_fail (length == 0 || text != NULL); |
2352 | g_return_if_fail (attrs != NULL); |
2353 | |
2354 | analysis.level = level; |
2355 | analysis.language = language; |
2356 | |
2357 | pango_default_break (text, length, analysis: &analysis, attrs, attrs_len); |
2358 | |
2359 | chars_broken = 0; |
2360 | |
2361 | _pango_script_iter_init (iter: &iter, text, length); |
2362 | do |
2363 | { |
2364 | const char *run_start, *run_end; |
2365 | PangoScript script; |
2366 | int chars_in_range; |
2367 | |
2368 | pango_script_iter_get_range (iter: &iter, start: &run_start, end: &run_end, script: &script); |
2369 | analysis.script = script; |
2370 | |
2371 | chars_in_range = pango_utf8_strlen (p: run_start, max: run_end - run_start); |
2372 | |
2373 | pango_tailor_break (text: run_start, |
2374 | length: run_end - run_start, |
2375 | analysis: &analysis, |
2376 | offset: -1, |
2377 | attrs: attrs + chars_broken, |
2378 | attrs_len: chars_in_range + 1); |
2379 | |
2380 | chars_broken += chars_in_range; |
2381 | } |
2382 | while (pango_script_iter_next (iter: &iter)); |
2383 | _pango_script_iter_fini (iter: &iter); |
2384 | |
2385 | if (chars_broken + 1 > attrs_len) |
2386 | g_warning ("pango_get_log_attrs: attrs_len should have been at least %d, but was %d. Expect corrupted memory." , |
2387 | chars_broken + 1, |
2388 | attrs_len); |
2389 | } |
2390 | |
2391 | /* }}} */ |
2392 | |
2393 | /* vim:set foldmethod=marker expandtab: */ |
2394 | |