break.c source code [gtk/subprojects/pango/pango/break.c]

1	/ Pango*
2	* break.c:
3	*
4	* Copyright (C) 1999 Red Hat Software
5	*
6	* This library is free software; you can redistribute it and/or
7	* modify it under the terms of the GNU Library General Public
8	* License as published by the Free Software Foundation; either
9	* version 2 of the License, or (at your option) any later version.
10	*
11	* This library is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	* Library General Public License for more details.
15	*
16	* You should have received a copy of the GNU Library General Public
17	* License along with this library; if not, write to the
18	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19	* Boston, MA 02111-1307, USA.
20	*/
21
22	#include "config.h"
23
24	#include "pango-break.h"
25	#include "pango-script-private.h"
26	#include "pango-emoji-private.h"
27	#include "pango-attributes-private.h"
28	#include "pango-break-table.h"
29	#include "pango-impl-utils.h"
30	#include <string.h>
31
/* {{{ Unicode line breaking and segmentation */
33
/* Code point U+2029. default_break() substitutes it for the character
 * "after the end" of the text so the final attr slot can be filled in.
 */
#define PARAGRAPH_SEPARATOR 0x2029
35
/* See http://www.unicode.org/unicode/reports/tr14/ if you hope
 * to understand the line breaking code.
 */
39
/* Kinds of line-break opportunity between two adjacent characters. */
typedef enum
{
  BREAK_ALREADY_HANDLED,  /* didn't use the table */
  BREAK_PROHIBITED,       /* no break, even if spaces intervene */
  BREAK_IF_SPACES,        /* "indirect break" (only if there are spaces) */
  BREAK_ALLOWED           /* "direct break" (can always break here) */
  /* TR 14 has two more break-opportunity classes,
   * "indirect break opportunity for combining marks following a space"
   * and "prohibited break for combining marks"
   * but we handle that inline in the code.
   */
} BreakOpportunity;
52
/* Clamp a break type to the range this file handles: any value greater
 * than G_UNICODE_BREAK_ZERO_WIDTH_JOINER is mapped to
 * G_UNICODE_BREAK_UNKNOWN.
 *
 * Need to sync the break range to glib/gunicode.h.
 *
 * Note: btype is evaluated twice, so pass an expression without side
 * effects.
 */
#define BREAK_TYPE_SAFE(btype) \
  ((btype) <= G_UNICODE_BREAK_ZERO_WIDTH_JOINER ? (btype) : G_UNICODE_BREAK_UNKNOWN)
56
57
58	/*
59	* Hangul Conjoining Jamo handling.
60	*
61	* The way we implement it is just a bit different from TR14,
62	* but produces the same results.
63	* The same algorithm is also used in TR29 for cluster boundaries.
64	*
65	*/
66
67
/* An enum that works as the states of the Hangul syllables system.
 *
 * The enumerator order matters: JAMO_TYPE() derives a JamoType by
 * subtracting G_UNICODE_BREAK_HANGUL_L_JAMO from a break type, and
 * HangulJamoProps[] is indexed by these values.
 */
typedef enum
{
  JAMO_L,    /* G_UNICODE_BREAK_HANGUL_L_JAMO */
  JAMO_V,    /* G_UNICODE_BREAK_HANGUL_V_JAMO */
  JAMO_T,    /* G_UNICODE_BREAK_HANGUL_T_JAMO */
  JAMO_LV,   /* G_UNICODE_BREAK_HANGUL_LV_SYLLABLE */
  JAMO_LVT,  /* G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE */
  NO_JAMO    /* Other */
} JamoType;
79
80	/ There are Hangul syllables encoded as characters, that act like a*
81	* sequence of Jamos. For each character we define a JamoType
82	* that the character starts with, and one that it ends with. This
83	* decomposes JAMO_LV and JAMO_LVT to simple other JAMOs. So for
84	* example, a character with LineBreak type
85	* G_UNICODE_BREAK_HANGUL_LV_SYLLABLE has start=JAMO_L and end=JAMO_V.
86	*/
87	typedef struct _CharJamoProps
88	{
89	JamoType start, end;
90	} CharJamoProps;
91
92	/ Map from JamoType to CharJamoProps that hold only simple*
93	* JamoTypes (no LV or LVT) or none.
94	*/
95	static const CharJamoProps HangulJamoProps[] = {
96	{JAMO_L, JAMO_L}, / JAMO_L /
97	{JAMO_V, JAMO_V}, / JAMO_V /
98	{JAMO_T, JAMO_T}, / JAMO_T /
99	{JAMO_L, JAMO_V}, / JAMO_LV /
100	{JAMO_L, JAMO_T}, / JAMO_LVT /
101	{NO_JAMO, NO_JAMO} / NO_JAMO /
102	};
103
/* A character forms a syllable with the previous character if and only if:
 *   JamoType(this) is not NO_JAMO and:
 *
 *   HangulJamoProps[JamoType(prev)].end and
 *   HangulJamoProps[JamoType(this)].start are equal,
 *   or the former is one less than the latter.
 */

/* True when btype lies in the contiguous range
 * G_UNICODE_BREAK_HANGUL_L_JAMO .. G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE.
 * The argument is parenthesized so that expression arguments (for
 * example a conditional expression) expand correctly; it is evaluated
 * more than once, so it must not have side effects.
 */
#define IS_JAMO(btype) \
  (((btype) >= G_UNICODE_BREAK_HANGUL_L_JAMO) && \
   ((btype) <= G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))

/* Convert a break type to the matching JamoType value (its offset from
 * G_UNICODE_BREAK_HANGUL_L_JAMO), or NO_JAMO when it is not a Hangul
 * break type.
 */
#define JAMO_TYPE(btype) \
  (IS_JAMO (btype) ? ((btype) - G_UNICODE_BREAK_HANGUL_L_JAMO) : NO_JAMO)
117
/* Types of Japanese characters.
 * All of the range macros below evaluate wc more than once, so the
 * argument must not have side effects.
 */
#define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF)
#define KANJI(wc)    ((wc) >= 0x2F00 && (wc) <= 0x2FDF)
#define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F)
#define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF)

/* Code point range tests for other scripts. */
#define LATIN(wc)    (((wc) >= 0x0020 && (wc) <= 0x02AF) || ((wc) >= 0x1E00 && (wc) <= 0x1EFF))
#define CYRILLIC(wc) (((wc) >= 0x0400 && (wc) <= 0x052F))
#define GREEK(wc)    (((wc) >= 0x0370 && (wc) <= 0x3FF) || ((wc) >= 0x1F00 && (wc) <= 0x1FFF))
#define KANA(wc)     ((wc) >= 0x3040 && (wc) <= 0x30FF)
#define HANGUL(wc)   ((wc) >= 0xAC00 && (wc) <= 0xD7A3)
#define EMOJI(wc)    (_pango_Is_Emoji_Base_Character (wc))

/* True when wc is in none of the ranges above (Latin, Cyrillic, Greek,
 * kana, Hangul) and is not an emoji base character.
 */
#define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA (wc) && !HANGUL (wc) && !EMOJI (wc))
131
/* Previously "123foo" was two words. But in UAX 29 of Unicode,
 * we now don't break words between consecutive letters and numbers.
 */
typedef enum
{
  WordNone,
  WordLetters,
  WordNumbers
} WordType;
141
142	static void
143	default_break (const char *text,
144	int length,
145	PangoAnalysis *analysis G_GNUC_UNUSED,
146	PangoLogAttr *attrs,
147	int attrs_len G_GNUC_UNUSED)
148	{
149	/ The rationale for all this is in section 5.15 of the Unicode 3.0 book,*
150	* the line breaking stuff is also in TR14 on unicode.org
151	*/
152
153	/ This is a default break implementation that should work for nearly all*
154	* languages. Language engines can override it optionally.
155	*/
156
157	/ FIXME one cheesy optimization here would be to memset attrs to 0*
158	* before we start, and then never assign %FALSE to anything
159	*/
160
161	const gchar *next;
162	gint i;
163
164	gunichar prev_wc;
165	gunichar next_wc;
166
167	JamoType prev_jamo;
168
169	GUnicodeBreakType next_break_type;
170	GUnicodeBreakType prev_break_type;
171	GUnicodeBreakType prev_prev_break_type;
172
173	PangoScript prev_script;
174
175	/ See Grapheme_Cluster_Break Property Values table of UAX#29 /
176	typedef enum
177	{
178	GB_Other,
179	GB_ControlCRLF,
180	GB_Extend,
181	GB_ZWJ,
182	GB_Prepend,
183	GB_SpacingMark,
184	GB_InHangulSyllable, / Handles all of L, V, T, LV, LVT rules /
185	/ Use state machine to handle emoji sequence /
186	/ Rule GB12 and GB13 /
187	GB_RI_Odd, / Meets odd number of RI /
188	GB_RI_Even, / Meets even number of RI /
189	} GraphemeBreakType;
190	GraphemeBreakType prev_GB_type = GB_Other;
191	gboolean met_Extended_Pictographic = FALSE;
192
193	/ See Word_Break Property Values table of UAX#29 /
194	typedef enum
195	{
196	WB_Other,
197	WB_NewlineCRLF,
198	WB_ExtendFormat,
199	WB_Katakana,
200	WB_Hebrew_Letter,
201	WB_ALetter,
202	WB_MidNumLet,
203	WB_MidLetter,
204	WB_MidNum,
205	WB_Numeric,
206	WB_ExtendNumLet,
207	WB_RI_Odd,
208	WB_RI_Even,
209	WB_WSegSpace,
210	} WordBreakType;
211	WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
212	gint prev_WB_i = -`1`;
213
214	/ See Sentence_Break Property Values table of UAX#29 /
215	typedef enum
216	{
217	SB_Other,
218	SB_ExtendFormat,
219	SB_ParaSep,
220	SB_Sp,
221	SB_Lower,
222	SB_Upper,
223	SB_OLetter,
224	SB_Numeric,
225	SB_ATerm,
226	SB_SContinue,
227	SB_STerm,
228	SB_Close,
229	/ Rules SB8 and SB8a /
230	SB_ATerm_Close_Sp,
231	SB_STerm_Close_Sp,
232	} SentenceBreakType;
233	SentenceBreakType prev_prev_SB_type = SB_Other, prev_SB_type = SB_Other;
234	gint prev_SB_i = -`1`;
235
236	/ Rule LB25 with Example 7 of Customization /
237	typedef enum
238	{
239	LB_Other,
240	LB_Numeric,
241	LB_Numeric_Close,
242	LB_RI_Odd,
243	LB_RI_Even,
244	} LineBreakType;
245	LineBreakType prev_LB_type = LB_Other;
246
247	WordType current_word_type = WordNone;
248	gunichar last_word_letter = `0`;
249	gunichar base_character = `0`;
250
251	gint last_sentence_start = -`1`;
252	gint last_non_space = -`1`;
253
254	gboolean prev_space_or_hyphen;
255
256	gboolean almost_done = FALSE;
257	gboolean done = FALSE;
258
259	g_return_if_fail (length == `0` \|\| text != NULL);
260	g_return_if_fail (attrs != NULL);
261
262	next = text;
263
264	prev_break_type = G_UNICODE_BREAK_UNKNOWN;
265	prev_prev_break_type = G_UNICODE_BREAK_UNKNOWN;
266	prev_wc = `0`;
267	prev_script = PANGO_SCRIPT_COMMON;
268	prev_jamo = NO_JAMO;
269	prev_space_or_hyphen = FALSE;
270
271	if (length == `0` \|\| *text == `'\0'`)
272	{
273	next_wc = PARAGRAPH_SEPARATOR;
274	almost_done = TRUE;
275	}
276	else
277	next_wc = g_utf8_get_char (p: next);
278
279	next_break_type = g_unichar_break_type (c: next_wc);
280	next_break_type = BREAK_TYPE_SAFE (next_break_type);
281
282	for (i = `0`; !done ; i++)
283	{
284	GUnicodeType type;
285	gunichar wc;
286	GUnicodeBreakType break_type;
287	GUnicodeBreakType row_break_type;
288	BreakOpportunity break_op;
289	JamoType jamo;
290	gboolean makes_hangul_syllable;
291
292	/ UAX#29 boundaries /
293	gboolean is_grapheme_boundary;
294	gboolean is_word_boundary;
295	gboolean is_sentence_boundary;
296
297	/ Emoji extended pictographics /
298	gboolean is_Extended_Pictographic;
299
300	PangoScript script;
301
302	wc = next_wc;
303	break_type = next_break_type;
304
305	if (almost_done)
306	{
307	/*
308	* If we have already reached the end of @text g_utf8_next_char()
309	* may not increment next
310	*/
311	next_wc = `0`;
312	next_break_type = G_UNICODE_BREAK_UNKNOWN;
313	done = TRUE;
314	}
315	else
316	{
317	next = g_utf8_next_char (next);
318
319	if ((length >= `0` && next >= text + length) \|\| *next == `'\0'`)
320	{
321	/ This is how we fill in the last element (end position) of the*
322	* attr array - assume there's a paragraph separators off the end
323	* of @text.
324	*/
325	next_wc = PARAGRAPH_SEPARATOR;
326	almost_done = TRUE;
327	}
328	else
329	next_wc = g_utf8_get_char (p: next);
330
331	next_break_type = g_unichar_break_type (c: next_wc);
332	next_break_type = BREAK_TYPE_SAFE (next_break_type);
333	}
334
335	type = g_unichar_type (c: wc);
336	jamo = JAMO_TYPE (break_type);
337
      /* Determine whether this forms a Hangul syllable with prev. */
339	if (jamo == NO_JAMO)
340	makes_hangul_syllable = FALSE;
341	else
342	{
343	JamoType prev_end = HangulJamoProps[prev_jamo].end ;
344	JamoType this_start = HangulJamoProps[ jamo].start;
345
346	/ See comments before IS_JAMO /
347	makes_hangul_syllable = (prev_end == this_start) \|\| (prev_end + `1` == this_start);
348	}
349
350	switch ((int)type)
351	{
352	case G_UNICODE_SPACE_SEPARATOR:
353	case G_UNICODE_LINE_SEPARATOR:
354	case G_UNICODE_PARAGRAPH_SEPARATOR:
355	attrs[i].is_white = TRUE;
356	break;
357	case G_UNICODE_CONTROL:
358	if (wc == `'\t'` \|\| wc == `'\n'` \|\| wc == `'\r'` \|\| wc == `'\f'`)
359	attrs[i].is_white = TRUE;
360	else
361	attrs[i].is_white = FALSE;
362	break;
363	default:
364	attrs[i].is_white = FALSE;
365	break;
366	}
367
368	/ Just few spaces have variable width. So explicitly mark them.*
369	*/
370	attrs[i].is_expandable_space = (`0x0020` == wc \|\| `0x00A0` == wc);
371	is_Extended_Pictographic =
372	_pango_Is_Emoji_Extended_Pictographic (ch: wc);
373
374
375	/ ---- UAX#29 Grapheme Boundaries ---- /
376	{
377	GraphemeBreakType GB_type;
378
379	/ Find the GraphemeBreakType of wc /
380	GB_type = GB_Other;
381	switch ((int)type)
382	{
383	case G_UNICODE_FORMAT:
384	if (G_UNLIKELY (wc == `0x200C`))
385	{
386	GB_type = GB_Extend;
387	break;
388	}
389	if (G_UNLIKELY (wc == `0x200D`))
390	{
391	GB_type = GB_ZWJ;
392	break;
393	}
394	if (G_UNLIKELY((wc >= `0x600` && wc <= `0x605`) \|\|
395	wc == `0x6DD` \|\|
396	wc == `0x70F` \|\|
397	wc == `0x8E2` \|\|
398	wc == `0x110BD` \|\|
399	wc == `0x110CD`))
400	{
401	GB_type = GB_Prepend;
402	break;
403	}
404	/ Tag chars /
405	if (wc >= `0xE0020` && wc <= `0xE00FF`)
406	{
407	GB_type = GB_Extend;
408	break;
409	}
410	G_GNUC_FALLTHROUGH;
411	case G_UNICODE_CONTROL:
412	case G_UNICODE_LINE_SEPARATOR:
413	case G_UNICODE_PARAGRAPH_SEPARATOR:
414	case G_UNICODE_SURROGATE:
415	GB_type = GB_ControlCRLF;
416	break;
417
418	case G_UNICODE_UNASSIGNED:
419	/ Unassigned default ignorables /
420	if ((wc >= `0xFFF0` && wc <= `0xFFF8`) \|\|
421	(wc >= `0xE0000` && wc <= `0xE0FFF`))
422	{
423	GB_type = GB_ControlCRLF;
424	break;
425	}
426	G_GNUC_FALLTHROUGH;
427
428	case G_UNICODE_OTHER_LETTER:
429	if (makes_hangul_syllable)
430	GB_type = GB_InHangulSyllable;
431
432	if (_pango_is_Consonant_Preceding_Repha (wc) \|\|
433	_pango_is_Consonant_Prefixed (wc))
434	GB_type = GB_Prepend;
435	break;
436
437	case G_UNICODE_MODIFIER_LETTER:
438	if (wc >= `0xFF9E` && wc <= `0xFF9F`)
439	GB_type = GB_Extend; / Other_Grapheme_Extend /
440	break;
441
442	case G_UNICODE_SPACING_MARK:
443	GB_type = GB_SpacingMark; / SpacingMark /
444	if (wc >= `0x0900`)
445	{
446	if (wc == `0x09BE` \|\| wc == `0x09D7` \|\|
447	wc == `0x0B3E` \|\| wc == `0x0B57` \|\| wc == `0x0BBE` \|\| wc == `0x0BD7` \|\|
448	wc == `0x0CC2` \|\| wc == `0x0CD5` \|\| wc == `0x0CD6` \|\|
449	wc == `0x0D3E` \|\| wc == `0x0D57` \|\| wc == `0x0DCF` \|\| wc == `0x0DDF` \|\|
450	wc == `0x1D165` \|\| (wc >= `0x1D16E` && wc <= `0x1D172`))
451	GB_type = GB_Extend; / Other_Grapheme_Extend /
452	}
453	break;
454
455	case G_UNICODE_ENCLOSING_MARK:
456	case G_UNICODE_NON_SPACING_MARK:
457	GB_type = GB_Extend; / Grapheme_Extend /
458	break;
459
460	case G_UNICODE_OTHER_SYMBOL:
461	if (G_UNLIKELY(wc >=`0x1F1E6` && wc <=`0x1F1FF`))
462	{
463	if (prev_GB_type == GB_RI_Odd)
464	GB_type = GB_RI_Even;
465	else
466	GB_type = GB_RI_Odd;
467	break;
468	}
469	break;
470
471	case G_UNICODE_MODIFIER_SYMBOL:
472	/ Fitzpatrick modifiers /
473	if (wc >= `0x1F3FB` && wc <= `0x1F3FF`)
474	GB_type = GB_Extend;
475	break;
476
477	default:
478	break;
479	}
480
481	/ Rule GB11 /
482	if (met_Extended_Pictographic)
483	{
484	if (GB_type == GB_Extend)
485	met_Extended_Pictographic = TRUE;
486	else if (_pango_Is_Emoji_Extended_Pictographic (ch: prev_wc) &&
487	GB_type == GB_ZWJ)
488	met_Extended_Pictographic = TRUE;
489	else if (prev_GB_type == GB_Extend && GB_type == GB_ZWJ)
490	met_Extended_Pictographic = TRUE;
491	else if (prev_GB_type == GB_ZWJ && is_Extended_Pictographic)
492	met_Extended_Pictographic = TRUE;
493	else
494	met_Extended_Pictographic = FALSE;
495	}
496
497	/ Grapheme Cluster Boundary Rules /
498	is_grapheme_boundary = TRUE; / Rule GB999 /
499
500	/ We apply Rules GB1 and GB2 at the end of the function /
501	if (wc == `'\n'` && prev_wc == `'\r'`)
502	is_grapheme_boundary = FALSE; / Rule GB3 /
503	else if (prev_GB_type == GB_ControlCRLF \|\| GB_type == GB_ControlCRLF)
504	is_grapheme_boundary = TRUE; / Rules GB4 and GB5 /
505	else if (GB_type == GB_InHangulSyllable)
506	is_grapheme_boundary = FALSE; / Rules GB6, GB7, GB8 /
507	else if (GB_type == GB_Extend)
508	is_grapheme_boundary = FALSE; / Rule GB9 /
509	else if (GB_type == GB_ZWJ)
510	is_grapheme_boundary = FALSE; / Rule GB9 /
511	else if (GB_type == GB_SpacingMark)
512	is_grapheme_boundary = FALSE; / Rule GB9a /
513	else if (prev_GB_type == GB_Prepend)
514	is_grapheme_boundary = FALSE; / Rule GB9b /
515	else if (is_Extended_Pictographic)
516	{ / Rule GB11 /
517	if (prev_GB_type == GB_ZWJ && met_Extended_Pictographic)
518	is_grapheme_boundary = FALSE;
519	}
520	else if (prev_GB_type == GB_RI_Odd && GB_type == GB_RI_Even)
521	is_grapheme_boundary = FALSE; / Rule GB12 and GB13 /
522
523	if (is_Extended_Pictographic)
524	met_Extended_Pictographic = TRUE;
525
526	attrs[i].is_cursor_position = is_grapheme_boundary;
527	/ If this is a grapheme boundary, we have to decide if backspace*
528	* deletes a character or the whole grapheme cluster */
529	if (is_grapheme_boundary)
530	{
531	attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);
532
533	/ Dependent Vowels for Indic language /
534	if (_pango_is_Virama (wc: prev_wc) \|\|
535	_pango_is_Vowel_Dependent (wc: prev_wc))
536	attrs[i].backspace_deletes_character = TRUE;
537	}
538	else
539	attrs[i].backspace_deletes_character = FALSE;
540
541	prev_GB_type = GB_type;
542	}
543
544	script = (PangoScript)g_unichar_get_script (ch: wc);
545	/ ---- UAX#29 Word Boundaries ---- /
546	{
547	is_word_boundary = FALSE;
548	if (is_grapheme_boundary \|\|
549	G_UNLIKELY(wc >=`0x1F1E6` && wc <=`0x1F1FF`)) / Rules WB3 and WB4 /
550	{
551	WordBreakType WB_type;
552
553	/ Find the WordBreakType of wc /
554	WB_type = WB_Other;
555
556	if (script == PANGO_SCRIPT_KATAKANA)
557	WB_type = WB_Katakana;
558
559	if (script == PANGO_SCRIPT_HEBREW && type == G_UNICODE_OTHER_LETTER)
560	WB_type = WB_Hebrew_Letter;
561
562	if (WB_type == WB_Other)
563	switch (wc >> `8`)
564	{
565	case `0x30`:
566	if (wc == `0x3031` \|\| wc == `0x3032` \|\| wc == `0x3033` \|\| wc == `0x3034` \|\| wc == `0x3035` \|\|
567	wc == `0x309b` \|\| wc == `0x309c` \|\| wc == `0x30a0` \|\| wc == `0x30fc`)
568	WB_type = WB_Katakana; / Katakana exceptions /
569	break;
570	case `0xFF`:
571	if (wc == `0xFF70`)
572	WB_type = WB_Katakana; / Katakana exceptions /
573	else if (wc >= `0xFF9E` && wc <= `0xFF9F`)
574	WB_type = WB_ExtendFormat; / Other_Grapheme_Extend /
575	break;
576	case `0x05`:
577	if (wc == `0x058A`)
578	WB_type = WB_ALetter; / ALetter exceptions /
579	break;
580	default:
581	break;
582	}
583
584	if (WB_type == WB_Other)
585	switch ((int) break_type)
586	{
587	case G_UNICODE_BREAK_NUMERIC:
588	if (wc != `0x066C`)
589	WB_type = WB_Numeric; / Numeric /
590	break;
591	case G_UNICODE_BREAK_INFIX_SEPARATOR:
592	if (wc != `0x003A` && wc != `0xFE13` && wc != `0x002E`)
593	WB_type = WB_MidNum; / MidNum /
594	break;
595	default:
596	break;
597	}
598
599	if (WB_type == WB_Other)
600	switch ((int) type)
601	{
602	case G_UNICODE_CONTROL:
603	if (wc != `0x000D` && wc != `0x000A` && wc != `0x000B` && wc != `0x000C` && wc != `0x0085`)
604	break;
605	G_GNUC_FALLTHROUGH;
606	case G_UNICODE_LINE_SEPARATOR:
607	case G_UNICODE_PARAGRAPH_SEPARATOR:
608	WB_type = WB_NewlineCRLF; / CR, LF, Newline /
609	break;
610
611	case G_UNICODE_FORMAT:
612	case G_UNICODE_SPACING_MARK:
613	case G_UNICODE_ENCLOSING_MARK:
614	case G_UNICODE_NON_SPACING_MARK:
615	WB_type = WB_ExtendFormat; / Extend, Format /
616	break;
617
618	case G_UNICODE_CONNECT_PUNCTUATION:
619	WB_type = WB_ExtendNumLet; / ExtendNumLet /
620	break;
621
622	case G_UNICODE_INITIAL_PUNCTUATION:
623	case G_UNICODE_FINAL_PUNCTUATION:
624	if (wc == `0x2018` \|\| wc == `0x2019`)
625	WB_type = WB_MidNumLet; / MidNumLet /
626	break;
627	case G_UNICODE_OTHER_PUNCTUATION:
628	if ((wc >= `0x055a` && wc <= `0x055c`) \|\|
629	wc == `0x055e` \|\| wc == `0x05f3`)
630	WB_type = WB_ALetter; / ALetter /
631	else if (wc == `0x0027` \|\| wc == `0x002e` \|\| wc == `0x2024` \|\|
632	wc == `0xfe52` \|\| wc == `0xff07` \|\| wc == `0xff0e`)
633	WB_type = WB_MidNumLet; / MidNumLet /
634	else if (wc == `0x00b7` \|\| wc == `0x05f4` \|\| wc == `0x2027` \|\|
635	wc == `0x003a` \|\| wc == `0x0387` \|\| wc == `0x055f` \|\|
636	wc == `0xfe13` \|\| wc == `0xfe55` \|\| wc == `0xff1a`)
637	WB_type = WB_MidLetter; / MidLetter /
638	else if (wc == `0x066c` \|\|
639	wc == `0xfe50` \|\| wc == `0xfe54` \|\| wc == `0xff0c` \|\| wc == `0xff1b`)
640	WB_type = WB_MidNum; / MidNum /
641	break;
642
643	case G_UNICODE_OTHER_SYMBOL:
644	if (wc >= `0x24B6` && wc <= `0x24E9`) / Other_Alphabetic /
645	goto Alphabetic;
646
647	if (G_UNLIKELY(wc >= `0x1F1E6` && wc <= `0x1F1FF`))
648	{
649	if (prev_WB_type == WB_RI_Odd)
650	WB_type = WB_RI_Even;
651	else
652	WB_type = WB_RI_Odd;
653	}
654
655	break;
656
657	case G_UNICODE_OTHER_LETTER:
658	case G_UNICODE_LETTER_NUMBER:
659	if (wc == `0x3006` \|\| wc == `0x3007` \|\|
660	(wc >= `0x3021` && wc <= `0x3029`) \|\|
661	(wc >= `0x3038` && wc <= `0x303A`) \|\|
662	(wc >= `0x3400` && wc <= `0x4DB5`) \|\|
663	(wc >= `0x4E00` && wc <= `0x9FC3`) \|\|
664	(wc >= `0xF900` && wc <= `0xFA2D`) \|\|
665	(wc >= `0xFA30` && wc <= `0xFA6A`) \|\|
666	(wc >= `0xFA70` && wc <= `0xFAD9`) \|\|
667	(wc >= `0x20000` && wc <= `0x2A6D6`) \|\|
668	(wc >= `0x2F800` && wc <= `0x2FA1D`))
669	break; / ALetter exceptions: Ideographic /
670	goto Alphabetic;
671
672	case G_UNICODE_LOWERCASE_LETTER:
673	case G_UNICODE_MODIFIER_LETTER:
674	case G_UNICODE_TITLECASE_LETTER:
675	case G_UNICODE_UPPERCASE_LETTER:
676	Alphabetic:
677	if (break_type != G_UNICODE_BREAK_COMPLEX_CONTEXT && script != PANGO_SCRIPT_HIRAGANA)
678	WB_type = WB_ALetter; / ALetter /
679	break;
680	default:
681	break;
682	}
683
684	if (WB_type == WB_Other)
685	{
686	if (type == G_UNICODE_SPACE_SEPARATOR &&
687	break_type != G_UNICODE_BREAK_NON_BREAKING_GLUE)
688	WB_type = WB_WSegSpace;
689	}
690
691	/ Word Cluster Boundary Rules /
692
693	/ We apply Rules WB1 and WB2 at the end of the function /
694
695	if (prev_wc == `0x3031` && wc == `0x41`)
696	g_debug ("Y %d %d", prev_WB_type, WB_type);
697	if (prev_WB_type == WB_NewlineCRLF && prev_WB_i + `1` == i)
698	{
699	/ The extra check for prev_WB_i is to correctly handle sequences like*
700	* Newline ÷ Extend × Extend
701	* since we have not skipped ExtendFormat yet.
702	*/
703	is_word_boundary = TRUE; / Rule WB3a /
704	}
705	else if (WB_type == WB_NewlineCRLF)
706	is_word_boundary = TRUE; / Rule WB3b /
707	else if (prev_wc == `0x200D` && is_Extended_Pictographic)
708	is_word_boundary = FALSE; / Rule WB3c /
709	else if (prev_WB_type == WB_WSegSpace &&
710	WB_type == WB_WSegSpace && prev_WB_i + `1` == i)
711	is_word_boundary = FALSE; / Rule WB3d /
712	else if (WB_type == WB_ExtendFormat)
713	is_word_boundary = FALSE; / Rules WB4? /
714	else if ((prev_WB_type == WB_ALetter \|\|
715	prev_WB_type == WB_Hebrew_Letter \|\|
716	prev_WB_type == WB_Numeric) &&
717	(WB_type == WB_ALetter \|\|
718	WB_type == WB_Hebrew_Letter \|\|
719	WB_type == WB_Numeric))
720	is_word_boundary = FALSE; / Rules WB5, WB8, WB9, WB10 /
721	else if (prev_WB_type == WB_Katakana && WB_type == WB_Katakana)
722	is_word_boundary = FALSE; / Rule WB13 /
723	else if ((prev_WB_type == WB_ALetter \|\|
724	prev_WB_type == WB_Hebrew_Letter \|\|
725	prev_WB_type == WB_Numeric \|\|
726	prev_WB_type == WB_Katakana \|\|
727	prev_WB_type == WB_ExtendNumLet) &&
728	WB_type == WB_ExtendNumLet)
729	is_word_boundary = FALSE; / Rule WB13a /
730	else if (prev_WB_type == WB_ExtendNumLet &&
731	(WB_type == WB_ALetter \|\|
732	WB_type == WB_Hebrew_Letter \|\|
733	WB_type == WB_Numeric \|\|
734	WB_type == WB_Katakana))
735	is_word_boundary = FALSE; / Rule WB13b /
736	else if (((prev_prev_WB_type == WB_ALetter \|\|
737	prev_prev_WB_type == WB_Hebrew_Letter) &&
738	(WB_type == WB_ALetter \|\|
739	WB_type == WB_Hebrew_Letter)) &&
740	(prev_WB_type == WB_MidLetter \|\|
741	prev_WB_type == WB_MidNumLet \|\|
742	prev_wc == `0x0027`))
743	{
744	attrs[prev_WB_i].is_word_boundary = FALSE; / Rule WB6 /
745	is_word_boundary = FALSE; / Rule WB7 /
746	}
747	else if (prev_WB_type == WB_Hebrew_Letter && wc == `0x0027`)
748	is_word_boundary = FALSE; / Rule WB7a /
749	else if (prev_prev_WB_type == WB_Hebrew_Letter && prev_wc == `0x0022` &&
750	WB_type == WB_Hebrew_Letter) {
751	attrs[prev_WB_i].is_word_boundary = FALSE; / Rule WB7b /
752	is_word_boundary = FALSE; / Rule WB7c /
753	}
754	else if ((prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) &&
755	(prev_WB_type == WB_MidNum \|\| prev_WB_type == WB_MidNumLet \|\|
756	prev_wc == `0x0027`))
757	{
758	is_word_boundary = FALSE; / Rule WB11 /
759	attrs[prev_WB_i].is_word_boundary = FALSE; / Rule WB12 /
760	}
761	else if (prev_WB_type == WB_RI_Odd && WB_type == WB_RI_Even)
762	is_word_boundary = FALSE; / Rule WB15 and WB16 /
763	else
764	is_word_boundary = TRUE; / Rule WB999 /
765
766	if (WB_type != WB_ExtendFormat)
767	{
768	prev_prev_WB_type = prev_WB_type;
769	prev_WB_type = WB_type;
770	prev_WB_i = i;
771	}
772	}
773
774	attrs[i].is_word_boundary = is_word_boundary;
775	}
776
777	/ ---- UAX#29 Sentence Boundaries ---- /
778	{
779	is_sentence_boundary = FALSE;
780	if (is_word_boundary \|\|
781	wc == `'\r'` \|\| wc == `'\n'`) / Rules SB3 and SB5 /
782	{
783	SentenceBreakType SB_type;
784
785	/ Find the SentenceBreakType of wc /
786	SB_type = SB_Other;
787
788	if (break_type == G_UNICODE_BREAK_NUMERIC)
789	SB_type = SB_Numeric; / Numeric /
790
791	if (SB_type == SB_Other)
792	switch ((int) type)
793	{
794	case G_UNICODE_CONTROL:
795	if (wc == `'\r'` \|\| wc == `'\n'`)
796	SB_type = SB_ParaSep;
797	else if (wc == `0x0009` \|\| wc == `0x000B` \|\| wc == `0x000C`)
798	SB_type = SB_Sp;
799	else if (wc == `0x0085`)
800	SB_type = SB_ParaSep;
801	break;
802
803	case G_UNICODE_SPACE_SEPARATOR:
804	if (wc == `0x0020` \|\| wc == `0x00A0` \|\| wc == `0x1680` \|\|
805	(wc >= `0x2000` && wc <= `0x200A`) \|\|
806	wc == `0x202F` \|\| wc == `0x205F` \|\| wc == `0x3000`)
807	SB_type = SB_Sp;
808	break;
809
810	case G_UNICODE_LINE_SEPARATOR:
811	case G_UNICODE_PARAGRAPH_SEPARATOR:
812	SB_type = SB_ParaSep;
813	break;
814
815	case G_UNICODE_FORMAT:
816	case G_UNICODE_SPACING_MARK:
817	case G_UNICODE_ENCLOSING_MARK:
818	case G_UNICODE_NON_SPACING_MARK:
819	SB_type = SB_ExtendFormat; / Extend, Format /
820	break;
821
822	case G_UNICODE_MODIFIER_LETTER:
823	if (wc >= `0xFF9E` && wc <= `0xFF9F`)
824	SB_type = SB_ExtendFormat; / Other_Grapheme_Extend /
825	break;
826
827	case G_UNICODE_TITLECASE_LETTER:
828	SB_type = SB_Upper;
829	break;
830
831	case G_UNICODE_DASH_PUNCTUATION:
832	if (wc == `0x002D` \|\|
833	(wc >= `0x2013` && wc <= `0x2014`) \|\|
834	(wc >= `0xFE31` && wc <= `0xFE32`) \|\|
835	wc == `0xFE58` \|\|
836	wc == `0xFE63` \|\|
837	wc == `0xFF0D`)
838	SB_type = SB_SContinue;
839	break;
840
841	case G_UNICODE_OTHER_PUNCTUATION:
842	if (wc == `0x05F3`)
843	SB_type = SB_OLetter;
844	else if (wc == `0x002E` \|\| wc == `0x2024` \|\|
845	wc == `0xFE52` \|\| wc == `0xFF0E`)
846	SB_type = SB_ATerm;
847
848	if (wc == `0x002C` \|\|
849	wc == `0x003A` \|\|
850	wc == `0x055D` \|\|
851	(wc >= `0x060C` && wc <= `0x060D`) \|\|
852	wc == `0x07F8` \|\|
853	wc == `0x1802` \|\|
854	wc == `0x1808` \|\|
855	wc == `0x3001` \|\|
856	(wc >= `0xFE10` && wc <= `0xFE11`) \|\|
857	wc == `0xFE13` \|\|
858	(wc >= `0xFE50` && wc <= `0xFE51`) \|\|
859	wc == `0xFE55` \|\|
860	wc == `0xFF0C` \|\|
861	wc == `0xFF1A` \|\|
862	wc == `0xFF64`)
863	SB_type = SB_SContinue;
864
865	if (_pango_is_STerm(wc))
866	SB_type = SB_STerm;
867
868	break;
869
870	default:
871	break;
872	}
873
874	if (SB_type == SB_Other)
875	{
876	if (type == G_UNICODE_LOWERCASE_LETTER)
877	SB_type = SB_Lower;
878	else if (type == G_UNICODE_UPPERCASE_LETTER)
879	SB_type = SB_Upper;
880	else if (type == G_UNICODE_TITLECASE_LETTER \|\|
881	type == G_UNICODE_MODIFIER_LETTER \|\|
882	type == G_UNICODE_OTHER_LETTER)
883	SB_type = SB_OLetter;
884
885	if (type == G_UNICODE_OPEN_PUNCTUATION \|\|
886	type == G_UNICODE_CLOSE_PUNCTUATION \|\|
887	break_type == G_UNICODE_BREAK_QUOTATION)
888	SB_type = SB_Close;
889	}
890
891	/ Sentence Boundary Rules /
892
893	/ We apply Rules SB1 and SB2 at the end of the function /
894
895	#define IS_OTHER_TERM(SB_type) \
896	/* not in (OLetter \| Upper \| Lower \| ParaSep \| SATerm) */ \
897	!(SB_type == SB_OLetter \|\| \
898	SB_type == SB_Upper \|\| SB_type == SB_Lower \|\| \
899	SB_type == SB_ParaSep \|\| \
900	SB_type == SB_ATerm \|\| SB_type == SB_STerm \|\| \
901	SB_type == SB_ATerm_Close_Sp \|\| \
902	SB_type == SB_STerm_Close_Sp)
903
904
905	if (wc == `'\n'` && prev_wc == `'\r'`)
906	is_sentence_boundary = FALSE; / Rule SB3 /
907	else if (prev_SB_type == SB_ParaSep && prev_SB_i + `1` == i)
908	{
909	/ The extra check for prev_SB_i is to correctly handle sequences like*
910	* ParaSep ÷ Extend × Extend
911	* since we have not skipped ExtendFormat yet.
912	*/
913
914	is_sentence_boundary = TRUE; / Rule SB4 /
915	}
916	else if (SB_type == SB_ExtendFormat)
917	is_sentence_boundary = FALSE; / Rule SB5? /
918	else if (prev_SB_type == SB_ATerm && SB_type == SB_Numeric)
919	is_sentence_boundary = FALSE; / Rule SB6 /
920	else if ((prev_prev_SB_type == SB_Upper \|\|
921	prev_prev_SB_type == SB_Lower) &&
922	prev_SB_type == SB_ATerm &&
923	SB_type == SB_Upper)
924	is_sentence_boundary = FALSE; / Rule SB7 /
925	else if (prev_SB_type == SB_ATerm && SB_type == SB_Close)
926	SB_type = SB_ATerm;
927	else if (prev_SB_type == SB_STerm && SB_type == SB_Close)
928	SB_type = SB_STerm;
929	else if (prev_SB_type == SB_ATerm && SB_type == SB_Sp)
930	SB_type = SB_ATerm_Close_Sp;
931	else if (prev_SB_type == SB_STerm && SB_type == SB_Sp)
932	SB_type = SB_STerm_Close_Sp;
933	/ Rule SB8 /
934	else if ((prev_SB_type == SB_ATerm \|\|
935	prev_SB_type == SB_ATerm_Close_Sp) &&
936	SB_type == SB_Lower)
937	is_sentence_boundary = FALSE;
938	else if ((prev_prev_SB_type == SB_ATerm \|\|
939	prev_prev_SB_type == SB_ATerm_Close_Sp) &&
940	IS_OTHER_TERM(prev_SB_type) &&
941	SB_type == SB_Lower)
942	{
943	attrs[prev_SB_i].is_sentence_boundary = FALSE;
944	attrs[prev_SB_i].is_sentence_end = FALSE;
945	last_sentence_start = -`1`;
946	for (int j = prev_SB_i - `1`; j >= `0`; j--)
947	{
948	attrs[j].is_sentence_end = FALSE;
949	if (attrs[j].is_sentence_boundary)
950	{
951	last_sentence_start = j;
952	break;
953	}
954	}
955	}
956	else if ((prev_SB_type == SB_ATerm \|\|
957	prev_SB_type == SB_ATerm_Close_Sp \|\|
958	prev_SB_type == SB_STerm \|\|
959	prev_SB_type == SB_STerm_Close_Sp) &&
960	(SB_type == SB_SContinue \|\|
961	SB_type == SB_ATerm \|\| SB_type == SB_STerm))
962	is_sentence_boundary = FALSE; / Rule SB8a /
963	else if ((prev_SB_type == SB_ATerm \|\|
964	prev_SB_type == SB_STerm) &&
965	(SB_type == SB_Close \|\| SB_type == SB_Sp \|\|
966	SB_type == SB_ParaSep))
967	is_sentence_boundary = FALSE; / Rule SB9 /
968	else if ((prev_SB_type == SB_ATerm \|\|
969	prev_SB_type == SB_ATerm_Close_Sp \|\|
970	prev_SB_type == SB_STerm \|\|
971	prev_SB_type == SB_STerm_Close_Sp) &&
972	(SB_type == SB_Sp \|\| SB_type == SB_ParaSep))
973	is_sentence_boundary = FALSE; / Rule SB10 /
974	else if ((prev_SB_type == SB_ATerm \|\|
975	prev_SB_type == SB_ATerm_Close_Sp \|\|
976	prev_SB_type == SB_STerm \|\|
977	prev_SB_type == SB_STerm_Close_Sp) &&
978	SB_type != SB_ParaSep)
979	is_sentence_boundary = TRUE; / Rule SB11 /
980	else
981	is_sentence_boundary = FALSE; / Rule SB998 /
982
983	if (SB_type != SB_ExtendFormat &&
984	!((prev_prev_SB_type == SB_ATerm \|\|
985	prev_prev_SB_type == SB_ATerm_Close_Sp) &&
986	IS_OTHER_TERM(prev_SB_type) &&
987	IS_OTHER_TERM(SB_type)))
988	{
989	prev_prev_SB_type = prev_SB_type;
990	prev_SB_type = SB_type;
991	prev_SB_i = i;
992	}
993
994	#undef IS_OTHER_TERM
995
996	}
997
998	if (i == `0` \|\| done)
999	is_sentence_boundary = TRUE; / Rules SB1 and SB2 /
1000
1001	attrs[i].is_sentence_boundary = is_sentence_boundary;
1002	}
1003
1004	/ ---- Line breaking ---- /
1005
1006	break_op = BREAK_ALREADY_HANDLED;
1007
1008	row_break_type = prev_break_type == G_UNICODE_BREAK_SPACE ?
1009	prev_prev_break_type : prev_break_type;
1010	g_assert (row_break_type != G_UNICODE_BREAK_SPACE);
1011
1012	attrs[i].is_char_break = FALSE;
1013	attrs[i].is_line_break = FALSE;
1014	attrs[i].is_mandatory_break = FALSE;
1015
1016	/ Rule LB1:*
1017	assign a line breaking class to each code point of the input. /*
1018	switch ((int)break_type)
1019	{
1020	case G_UNICODE_BREAK_AMBIGUOUS:
1021	case G_UNICODE_BREAK_SURROGATE:
1022	case G_UNICODE_BREAK_UNKNOWN:
1023	break_type = G_UNICODE_BREAK_ALPHABETIC;
1024	break;
1025
1026	case G_UNICODE_BREAK_COMPLEX_CONTEXT:
1027	if (type == G_UNICODE_NON_SPACING_MARK \|\|
1028	type == G_UNICODE_SPACING_MARK)
1029	break_type = G_UNICODE_BREAK_COMBINING_MARK;
1030	else
1031	break_type = G_UNICODE_BREAK_ALPHABETIC;
1032	break;
1033
1034	case G_UNICODE_BREAK_CONDITIONAL_JAPANESE_STARTER:
1035	break_type = G_UNICODE_BREAK_NON_STARTER;
1036	break;
1037
1038	default:
1039	break;
1040	}
1041
1042	/ If it's not a grapheme boundary, it's not a line break either /
1043	if (attrs[i].is_cursor_position \|\|
1044	break_type == G_UNICODE_BREAK_COMBINING_MARK \|\|
1045	break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER \|\|
1046	break_type == G_UNICODE_BREAK_HANGUL_L_JAMO \|\|
1047	break_type == G_UNICODE_BREAK_HANGUL_V_JAMO \|\|
1048	break_type == G_UNICODE_BREAK_HANGUL_T_JAMO \|\|
1049	break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE \|\|
1050	break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE \|\|
1051	break_type == G_UNICODE_BREAK_EMOJI_MODIFIER \|\|
1052	break_type == G_UNICODE_BREAK_REGIONAL_INDICATOR)
1053	{
1054	LineBreakType LB_type;
1055
1056	/ Find the LineBreakType of wc /
1057	LB_type = LB_Other;
1058
1059	if (break_type == G_UNICODE_BREAK_NUMERIC)
1060	LB_type = LB_Numeric;
1061
1062	if (break_type == G_UNICODE_BREAK_SYMBOL \|\|
1063	break_type == G_UNICODE_BREAK_INFIX_SEPARATOR)
1064	{
1065	if (!(prev_LB_type == LB_Numeric))
1066	LB_type = LB_Other;
1067	}
1068
1069	if (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION \|\|
1070	break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS)
1071	{
1072	if (prev_LB_type == LB_Numeric)
1073	LB_type = LB_Numeric_Close;
1074	else
1075	LB_type = LB_Other;
1076	}
1077
1078	if (break_type == G_UNICODE_BREAK_REGIONAL_INDICATOR)
1079	{
1080	if (prev_LB_type == LB_RI_Odd)
1081	LB_type = LB_RI_Even;
1082	else
1083	LB_type = LB_RI_Odd;
1084	}
1085
1086	attrs[i].is_line_break = TRUE; / Rule LB31 /
1087	/ Unicode doesn't specify char wrap;*
1088	we wrap around all chars currently. /*
1089	if (attrs[i].is_cursor_position)
1090	attrs[i].is_char_break = TRUE;
1091
1092	/ Make any necessary replacements first /
1093	if (row_break_type == G_UNICODE_BREAK_UNKNOWN)
1094	row_break_type = G_UNICODE_BREAK_ALPHABETIC;
1095
1096	/ add the line break rules in reverse order to override*
1097	the lower priority rules. /*
1098
1099	/ Rule LB30 /
1100	if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC \|\|
1101	prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER \|\|
1102	prev_break_type == G_UNICODE_BREAK_NUMERIC) &&
1103	break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION &&
1104	!_pango_is_EastAsianWide (wc))
1105	break_op = BREAK_PROHIBITED;
1106
1107	if (prev_break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS &&
1108	!_pango_is_EastAsianWide (wc: prev_wc)&&
1109	(break_type == G_UNICODE_BREAK_ALPHABETIC \|\|
1110	break_type == G_UNICODE_BREAK_HEBREW_LETTER \|\|
1111	break_type == G_UNICODE_BREAK_NUMERIC))
1112	break_op = BREAK_PROHIBITED;
1113
1114	/ Rule LB30a /
1115	if (prev_LB_type == LB_RI_Odd && LB_type == LB_RI_Even)
1116	break_op = BREAK_PROHIBITED;
1117
1118	/ Rule LB30b /
1119	if (prev_break_type == G_UNICODE_BREAK_EMOJI_BASE &&
1120	break_type == G_UNICODE_BREAK_EMOJI_MODIFIER)
1121	break_op = BREAK_PROHIBITED;
1122
1123	if ((_pango_Is_Emoji_Extended_Pictographic (ch: prev_wc) &&
1124	g_unichar_type (c: prev_wc) == G_UNICODE_UNASSIGNED) &&
1125	break_type == G_UNICODE_BREAK_EMOJI_MODIFIER)
1126	break_op = BREAK_PROHIBITED;
1127
1128	/ Rule LB29 /
1129	if (prev_break_type == G_UNICODE_BREAK_INFIX_SEPARATOR &&
1130	(break_type == G_UNICODE_BREAK_ALPHABETIC \|\|
1131	break_type == G_UNICODE_BREAK_HEBREW_LETTER))
1132	break_op = BREAK_PROHIBITED;
1133
1134	/ Rule LB28 /
1135	if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC \|\|
1136	prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
1137	(break_type == G_UNICODE_BREAK_ALPHABETIC \|\|
1138	break_type == G_UNICODE_BREAK_HEBREW_LETTER))
1139	break_op = BREAK_PROHIBITED;
1140
1141	/ Rule LB27 /
1142	if ((prev_break_type == G_UNICODE_BREAK_HANGUL_L_JAMO \|\|
1143	prev_break_type == G_UNICODE_BREAK_HANGUL_V_JAMO \|\|
1144	prev_break_type == G_UNICODE_BREAK_HANGUL_T_JAMO \|\|
1145	prev_break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE \|\|
1146	prev_break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE) &&
1147	break_type == G_UNICODE_BREAK_POSTFIX)
1148	break_op = BREAK_PROHIBITED;
1149
1150	if (prev_break_type == G_UNICODE_BREAK_PREFIX &&
1151	(break_type == G_UNICODE_BREAK_HANGUL_L_JAMO \|\|
1152	break_type == G_UNICODE_BREAK_HANGUL_V_JAMO \|\|
1153	break_type == G_UNICODE_BREAK_HANGUL_T_JAMO \|\|
1154	break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE \|\|
1155	break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
1156	break_op = BREAK_PROHIBITED;
1157
1158	/ Rule LB26 /
1159	if (prev_break_type == G_UNICODE_BREAK_HANGUL_L_JAMO &&
1160	(break_type == G_UNICODE_BREAK_HANGUL_L_JAMO \|\|
1161	break_type == G_UNICODE_BREAK_HANGUL_V_JAMO \|\|
1162	break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE \|\|
1163	break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
1164	break_op = BREAK_PROHIBITED;
1165
1166	if ((prev_break_type == G_UNICODE_BREAK_HANGUL_V_JAMO \|\|
1167	prev_break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE) &&
1168	(break_type == G_UNICODE_BREAK_HANGUL_V_JAMO \|\|
1169	break_type == G_UNICODE_BREAK_HANGUL_T_JAMO))
1170	break_op = BREAK_PROHIBITED;
1171
1172	if ((prev_break_type == G_UNICODE_BREAK_HANGUL_T_JAMO \|\|
1173	prev_break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE) &&
1174	break_type == G_UNICODE_BREAK_HANGUL_T_JAMO)
1175	break_op = BREAK_PROHIBITED;
1176
1177	/ Rule LB25 with Example 7 of Customization /
1178	if ((prev_break_type == G_UNICODE_BREAK_PREFIX \|\|
1179	prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
1180	break_type == G_UNICODE_BREAK_NUMERIC)
1181	break_op = BREAK_PROHIBITED;
1182
1183	if ((prev_break_type == G_UNICODE_BREAK_PREFIX \|\|
1184	prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
1185	(break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION \|\|
1186	break_type == G_UNICODE_BREAK_HYPHEN) &&
1187	next_break_type == G_UNICODE_BREAK_NUMERIC)
1188	break_op = BREAK_PROHIBITED;
1189
1190	if ((prev_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION \|\|
1191	prev_break_type == G_UNICODE_BREAK_HYPHEN) &&
1192	break_type == G_UNICODE_BREAK_NUMERIC)
1193	break_op = BREAK_PROHIBITED;
1194
1195	if (prev_break_type == G_UNICODE_BREAK_NUMERIC &&
1196	(break_type == G_UNICODE_BREAK_NUMERIC \|\|
1197	break_type == G_UNICODE_BREAK_SYMBOL \|\|
1198	break_type == G_UNICODE_BREAK_INFIX_SEPARATOR))
1199	break_op = BREAK_PROHIBITED;
1200
1201	if (prev_LB_type == LB_Numeric &&
1202	(break_type == G_UNICODE_BREAK_NUMERIC \|\|
1203	break_type == G_UNICODE_BREAK_SYMBOL \|\|
1204	break_type == G_UNICODE_BREAK_INFIX_SEPARATOR \|\|
1205	break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION \|\|
1206	break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS))
1207	break_op = BREAK_PROHIBITED;
1208
1209	if ((prev_LB_type == LB_Numeric \|\|
1210	prev_LB_type == LB_Numeric_Close) &&
1211	(break_type == G_UNICODE_BREAK_POSTFIX \|\|
1212	break_type == G_UNICODE_BREAK_PREFIX))
1213	break_op = BREAK_PROHIBITED;
1214
1215	/ Rule LB24 /
1216	if ((prev_break_type == G_UNICODE_BREAK_PREFIX \|\|
1217	prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
1218	(break_type == G_UNICODE_BREAK_ALPHABETIC \|\|
1219	break_type == G_UNICODE_BREAK_HEBREW_LETTER))
1220	break_op = BREAK_PROHIBITED;
1221
1222	if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC \|\|
1223	prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
1224	(break_type == G_UNICODE_BREAK_PREFIX \|\|
1225	break_type == G_UNICODE_BREAK_POSTFIX))
1226	break_op = BREAK_PROHIBITED;
1227
1228	/ Rule LB23 /
1229	if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC \|\|
1230	prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
1231	break_type == G_UNICODE_BREAK_NUMERIC)
1232	break_op = BREAK_PROHIBITED;
1233
1234	if (prev_break_type == G_UNICODE_BREAK_NUMERIC &&
1235	(break_type == G_UNICODE_BREAK_ALPHABETIC \|\|
1236	break_type == G_UNICODE_BREAK_HEBREW_LETTER))
1237	break_op = BREAK_PROHIBITED;
1238
1239	/ Rule LB23a /
1240	if (prev_break_type == G_UNICODE_BREAK_PREFIX &&
1241	(break_type == G_UNICODE_BREAK_IDEOGRAPHIC \|\|
1242	break_type == G_UNICODE_BREAK_EMOJI_BASE \|\|
1243	break_type == G_UNICODE_BREAK_EMOJI_MODIFIER))
1244	break_op = BREAK_PROHIBITED;
1245
1246	if ((prev_break_type == G_UNICODE_BREAK_IDEOGRAPHIC \|\|
1247	prev_break_type == G_UNICODE_BREAK_EMOJI_BASE \|\|
1248	prev_break_type == G_UNICODE_BREAK_EMOJI_MODIFIER) &&
1249	break_type == G_UNICODE_BREAK_POSTFIX)
1250	break_op = BREAK_PROHIBITED;
1251
1252	/ Rule LB22 /
1253	if (break_type == G_UNICODE_BREAK_INSEPARABLE)
1254	break_op = BREAK_PROHIBITED;
1255
1256	if (break_type == G_UNICODE_BREAK_AFTER \|\|
1257	break_type == G_UNICODE_BREAK_HYPHEN \|\|
1258	break_type == G_UNICODE_BREAK_NON_STARTER \|\|
1259	prev_break_type == G_UNICODE_BREAK_BEFORE)
1260	break_op = BREAK_PROHIBITED; / Rule LB21 /
1261
1262	if (prev_prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER &&
1263	(prev_break_type == G_UNICODE_BREAK_HYPHEN \|\|
1264	prev_break_type == G_UNICODE_BREAK_AFTER))
1265	break_op = BREAK_PROHIBITED; / Rule LB21a /
1266
1267	if (prev_break_type == G_UNICODE_BREAK_SYMBOL &&
1268	break_type == G_UNICODE_BREAK_HEBREW_LETTER)
1269	break_op = BREAK_PROHIBITED; / Rule LB21b /
1270
1271	if (prev_break_type == G_UNICODE_BREAK_CONTINGENT \|\|
1272	break_type == G_UNICODE_BREAK_CONTINGENT)
1273	break_op = BREAK_ALLOWED; / Rule LB20 /
1274
1275	if (prev_break_type == G_UNICODE_BREAK_QUOTATION \|\|
1276	break_type == G_UNICODE_BREAK_QUOTATION)
1277	break_op = BREAK_PROHIBITED; / Rule LB19 /
1278
1279	/ handle related rules for Space as state machine here,*
1280	and override the pair table result. /*
1281	if (prev_break_type == G_UNICODE_BREAK_SPACE) / Rule LB18 /
1282	break_op = BREAK_ALLOWED;
1283
1284	if (row_break_type == G_UNICODE_BREAK_BEFORE_AND_AFTER &&
1285	break_type == G_UNICODE_BREAK_BEFORE_AND_AFTER)
1286	break_op = BREAK_PROHIBITED; / Rule LB17 /
1287
1288	if ((row_break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION \|\|
1289	row_break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS) &&
1290	break_type == G_UNICODE_BREAK_NON_STARTER)
1291	break_op = BREAK_PROHIBITED; / Rule LB16 /
1292
1293	if (row_break_type == G_UNICODE_BREAK_QUOTATION &&
1294	break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION)
1295	break_op = BREAK_PROHIBITED; / Rule LB15 /
1296
1297	if (row_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION)
1298	break_op = BREAK_PROHIBITED; / Rule LB14 /
1299
1300	/ Rule LB13 with Example 7 of Customization /
1301	if (break_type == G_UNICODE_BREAK_EXCLAMATION)
1302	break_op = BREAK_PROHIBITED;
1303
1304	if (prev_break_type != G_UNICODE_BREAK_NUMERIC &&
1305	(break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION \|\|
1306	break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS \|\|
1307	break_type == G_UNICODE_BREAK_INFIX_SEPARATOR \|\|
1308	break_type == G_UNICODE_BREAK_SYMBOL))
1309	break_op = BREAK_PROHIBITED;
1310
1311	if (prev_break_type == G_UNICODE_BREAK_NON_BREAKING_GLUE)
1312	break_op = BREAK_PROHIBITED; / Rule LB12 /
1313
1314	if (break_type == G_UNICODE_BREAK_NON_BREAKING_GLUE &&
1315	(prev_break_type != G_UNICODE_BREAK_SPACE &&
1316	prev_break_type != G_UNICODE_BREAK_AFTER &&
1317	prev_break_type != G_UNICODE_BREAK_HYPHEN))
1318	break_op = BREAK_PROHIBITED; / Rule LB12a /
1319
1320	if (prev_break_type == G_UNICODE_BREAK_WORD_JOINER \|\|
1321	break_type == G_UNICODE_BREAK_WORD_JOINER)
1322	break_op = BREAK_PROHIBITED; / Rule LB11 /
1323
1324
1325	/ Rule LB9 /
1326	if (break_type == G_UNICODE_BREAK_COMBINING_MARK \|\|
1327	break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER)
1328	{
1329	if (!(prev_break_type == G_UNICODE_BREAK_MANDATORY \|\|
1330	prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN \|\|
1331	prev_break_type == G_UNICODE_BREAK_LINE_FEED \|\|
1332	prev_break_type == G_UNICODE_BREAK_NEXT_LINE \|\|
1333	prev_break_type == G_UNICODE_BREAK_SPACE \|\|
1334	prev_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE))
1335	break_op = BREAK_PROHIBITED;
1336	}
1337
1338	if (row_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
1339	break_op = BREAK_ALLOWED; / Rule LB8 /
1340
1341	if (prev_wc == `0x200D`)
1342	break_op = BREAK_PROHIBITED; / Rule LB8a /
1343
1344	if (break_type == G_UNICODE_BREAK_SPACE \|\|
1345	break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
1346	break_op = BREAK_PROHIBITED; / Rule LB7 /
1347
1348	/ Rule LB6 /
1349	if (break_type == G_UNICODE_BREAK_MANDATORY \|\|
1350	break_type == G_UNICODE_BREAK_CARRIAGE_RETURN \|\|
1351	break_type == G_UNICODE_BREAK_LINE_FEED \|\|
1352	break_type == G_UNICODE_BREAK_NEXT_LINE)
1353	break_op = BREAK_PROHIBITED;
1354
1355	/ Rules LB4 and LB5 /
1356	if (prev_break_type == G_UNICODE_BREAK_MANDATORY \|\|
1357	(prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN &&
1358	wc != `'\n'`) \|\|
1359	prev_break_type == G_UNICODE_BREAK_LINE_FEED \|\|
1360	prev_break_type == G_UNICODE_BREAK_NEXT_LINE)
1361	{
1362	attrs[i].is_mandatory_break = TRUE;
1363	break_op = BREAK_ALLOWED;
1364	}
1365
1366	switch (break_op)
1367	{
1368	case BREAK_PROHIBITED:
1369	/ can't break here /
1370	attrs[i].is_line_break = FALSE;
1371	break;
1372
1373	case BREAK_IF_SPACES:
1374	/ break if prev char was space /
1375	if (prev_break_type != G_UNICODE_BREAK_SPACE)
1376	attrs[i].is_line_break = FALSE;
1377	break;
1378
1379	case BREAK_ALLOWED:
1380	attrs[i].is_line_break = TRUE;
1381	break;
1382
1383	case BREAK_ALREADY_HANDLED:
1384	break;
1385
1386	default:
1387	g_assert_not_reached ();
1388	break;
1389	}
1390
1391	/ Rule LB9 /
1392	if (!(break_type == G_UNICODE_BREAK_COMBINING_MARK \|\|
1393	break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER))
1394	{
1395	/ Rule LB25 with Example 7 of Customization /
1396	if (break_type == G_UNICODE_BREAK_NUMERIC \|\|
1397	break_type == G_UNICODE_BREAK_SYMBOL \|\|
1398	break_type == G_UNICODE_BREAK_INFIX_SEPARATOR)
1399	{
1400	if (prev_LB_type != LB_Numeric)
1401	prev_LB_type = LB_type;
1402	/ else don't change the prev_LB_type /
1403	}
1404	else
1405	{
1406	prev_LB_type = LB_type;
1407	}
1408	}
1409	/ else don't change the prev_LB_type for Rule LB9 /
1410	}
1411
1412	if (break_type != G_UNICODE_BREAK_SPACE)
1413	{
1414	/ Rule LB9 /
1415	if (break_type == G_UNICODE_BREAK_COMBINING_MARK \|\|
1416	break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER)
1417	{
1418	if (i == `0` / start of text / \|\|
1419	prev_break_type == G_UNICODE_BREAK_MANDATORY \|\|
1420	prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN \|\|
1421	prev_break_type == G_UNICODE_BREAK_LINE_FEED \|\|
1422	prev_break_type == G_UNICODE_BREAK_NEXT_LINE \|\|
1423	prev_break_type == G_UNICODE_BREAK_SPACE \|\|
1424	prev_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
1425	prev_break_type = G_UNICODE_BREAK_ALPHABETIC; / Rule LB10 /
1426	/ else don't change the prev_break_type for Rule LB9 /
1427	}
1428	else
1429	{
1430	prev_prev_break_type = prev_break_type;
1431	prev_break_type = break_type;
1432	}
1433
1434	prev_jamo = jamo;
1435	}
1436	else
1437	{
1438	if (prev_break_type != G_UNICODE_BREAK_SPACE)
1439	{
1440	prev_prev_break_type = prev_break_type;
1441	prev_break_type = break_type;
1442	}
1443	/ else don't change the prev_break_type /
1444	}
1445
1446	/ ---- Word breaks ---- /
1447
1448	/ default to not a word start/end /
1449	attrs[i].is_word_start = FALSE;
1450	attrs[i].is_word_end = FALSE;
1451
1452	if (current_word_type != WordNone)
1453	{
1454	/ Check for a word end /
1455	switch ((int) type)
1456	{
1457	case G_UNICODE_SPACING_MARK:
1458	case G_UNICODE_ENCLOSING_MARK:
1459	case G_UNICODE_NON_SPACING_MARK:
1460	case G_UNICODE_FORMAT:
1461	/ nothing, we just eat these up as part of the word /
1462	break;
1463
1464	case G_UNICODE_LOWERCASE_LETTER:
1465	case G_UNICODE_MODIFIER_LETTER:
1466	case G_UNICODE_OTHER_LETTER:
1467	case G_UNICODE_TITLECASE_LETTER:
1468	case G_UNICODE_UPPERCASE_LETTER:
1469	if (current_word_type == WordLetters)
1470	{
1471	/ Japanese special cases for ending the word /
1472	if (JAPANESE (last_word_letter) \|\|
1473	JAPANESE (wc))
1474	{
1475	if ((HIRAGANA (last_word_letter) &&
1476	!HIRAGANA (wc)) \|\|
1477	(KATAKANA (last_word_letter) &&
1478	!(KATAKANA (wc) \|\| HIRAGANA (wc))) \|\|
1479	(KANJI (last_word_letter) &&
1480	!(HIRAGANA (wc) \|\| KANJI (wc))) \|\|
1481	(JAPANESE (last_word_letter) &&
1482	!JAPANESE (wc)) \|\|
1483	(!JAPANESE (last_word_letter) &&
1484	JAPANESE (wc)))
1485	attrs[i].is_word_end = TRUE;
1486	}
1487	}
1488	last_word_letter = wc;
1489	break;
1490
1491	case G_UNICODE_DECIMAL_NUMBER:
1492	case G_UNICODE_LETTER_NUMBER:
1493	case G_UNICODE_OTHER_NUMBER:
1494	last_word_letter = wc;
1495	break;
1496
1497	default:
1498	/ Punctuation, control/format chars, etc. all end a word. /
1499	attrs[i].is_word_end = TRUE;
1500	current_word_type = WordNone;
1501	break;
1502	}
1503	}
1504	else
1505	{
1506	/ Check for a word start /
1507	switch ((int) type)
1508	{
1509	case G_UNICODE_LOWERCASE_LETTER:
1510	case G_UNICODE_MODIFIER_LETTER:
1511	case G_UNICODE_OTHER_LETTER:
1512	case G_UNICODE_TITLECASE_LETTER:
1513	case G_UNICODE_UPPERCASE_LETTER:
1514	current_word_type = WordLetters;
1515	last_word_letter = wc;
1516	attrs[i].is_word_start = TRUE;
1517	break;
1518
1519	case G_UNICODE_DECIMAL_NUMBER:
1520	case G_UNICODE_LETTER_NUMBER:
1521	case G_UNICODE_OTHER_NUMBER:
1522	current_word_type = WordNumbers;
1523	last_word_letter = wc;
1524	attrs[i].is_word_start = TRUE;
1525	break;
1526
1527	default:
1528	/ No word here /
1529	break;
1530	}
1531	}
1532
1533	/ ---- Sentence breaks ---- /
1534	{
1535
1536	/ default to not a sentence start/end /
1537	attrs[i].is_sentence_start = FALSE;
1538	attrs[i].is_sentence_end = FALSE;
1539
1540	/ maybe start sentence /
1541	if (last_sentence_start == -`1` && !is_sentence_boundary)
1542	last_sentence_start = i - `1`;
1543
1544	/ remember last non space character position /
1545	if (i > `0` && !attrs[i - `1`].is_white)
1546	last_non_space = i;
1547
1548	/ meets sentence end, mark both sentence start and end /
1549	if (last_sentence_start != -`1` && is_sentence_boundary) {
1550	if (last_non_space >= last_sentence_start) {
1551	attrs[last_sentence_start].is_sentence_start = TRUE;
1552	attrs[last_non_space].is_sentence_end = TRUE;
1553	}
1554
1555	last_sentence_start = -`1`;
1556	last_non_space = -`1`;
1557	}
1558
1559	/ meets space character, move sentence start /
1560	if (last_sentence_start != -`1` &&
1561	last_sentence_start == i - `1` &&
1562	attrs[i - `1`].is_white) {
1563	last_sentence_start++;
1564	}
1565	}
1566
1567	/ --- Hyphens --- /
1568
1569	{
1570	gboolean insert_hyphens;
1571	gboolean space_or_hyphen = FALSE;
1572
1573	attrs[i].break_inserts_hyphen = FALSE;
1574	attrs[i].break_removes_preceding = FALSE;
1575
1576	switch ((int)prev_script)
1577	{
1578	case PANGO_SCRIPT_COMMON:
1579	insert_hyphens = prev_wc == `0x00ad`;
1580	break;
1581	case PANGO_SCRIPT_HAN:
1582	case PANGO_SCRIPT_HANGUL:
1583	case PANGO_SCRIPT_HIRAGANA:
1584	case PANGO_SCRIPT_KATAKANA:
1585	insert_hyphens = FALSE;
1586	break;
1587	default:
1588	insert_hyphens = TRUE;
1589	break;
1590	}
1591
1592	switch ((int)type)
1593	{
1594	case G_UNICODE_SPACE_SEPARATOR:
1595	case G_UNICODE_LINE_SEPARATOR:
1596	case G_UNICODE_PARAGRAPH_SEPARATOR:
1597	space_or_hyphen = TRUE;
1598	break;
1599	case G_UNICODE_CONTROL:
1600	if (wc == `'\t'` \|\| wc == `'\n'` \|\| wc == `'\r'` \|\| wc == `'\f'`)
1601	space_or_hyphen = TRUE;
1602	break;
1603	default:
1604	break;
1605	}
1606
1607	if (!space_or_hyphen)
1608	{
1609	if (wc == `'-'` \|\| / Hyphen-minus /
1610	wc == `0x058a` \|\| / Armenian hyphen /
1611	wc == `0x1400` \|\| / Canadian syllabics hyphen /
1612	wc == `0x1806` \|\| / Mongolian todo hyphen /
1613	wc == `0x2010` \|\| / Hyphen /
1614	wc == `0x2e17` \|\| / Double oblique hyphen /
1615	wc == `0x2e40` \|\| / Double hyphen /
1616	wc == `0x30a0` \|\| / Katakana-Hiragana double hyphen /
1617	wc == `0xfe63` \|\| / Small hyphen-minus /
1618	wc == `0xff0d`) / Fullwidth hyphen-minus /
1619	space_or_hyphen = TRUE;
1620	}
1621
1622	if (attrs[i].is_word_boundary)
1623	attrs[i].break_inserts_hyphen = FALSE;
1624	else if (prev_space_or_hyphen)
1625	attrs[i].break_inserts_hyphen = FALSE;
1626	else if (space_or_hyphen)
1627	attrs[i].break_inserts_hyphen = FALSE;
1628	else
1629	attrs[i].break_inserts_hyphen = insert_hyphens;
1630
1631	if (prev_wc == `0x2027`) / Hyphenation point /
1632	{
1633	attrs[i].break_inserts_hyphen = TRUE;
1634	attrs[i].break_removes_preceding = TRUE;
1635	}
1636
1637	prev_space_or_hyphen = space_or_hyphen;
1638	}
1639
1640	prev_wc = wc;
1641	prev_script = script;
1642
1643	/ wc might not be a valid Unicode base character, but really all we*
1644	* need to know is the last non-combining character */
1645	if (type != G_UNICODE_SPACING_MARK &&
1646	type != G_UNICODE_ENCLOSING_MARK &&
1647	type != G_UNICODE_NON_SPACING_MARK)
1648	base_character = wc;
1649	}
1650
1651	i--;
1652
1653	attrs[`0`].is_cursor_position = TRUE; / Rule GB1 /
1654	attrs[i].is_cursor_position = TRUE; / Rule GB2 /
1655
1656	attrs[`0`].is_word_boundary = TRUE; / Rule WB1 /
1657	attrs[i].is_word_boundary = TRUE; / Rule WB2 /
1658
1659	attrs[`0`].is_line_break = FALSE; / Rule LB2 /
1660	attrs[i].is_line_break = TRUE; / Rule LB3 /
1661	attrs[i].is_mandatory_break = TRUE; / Rule LB3 /
1662	}
1663
1664	/ }}} /
1665	/ {{{ Tailoring /
1666	/ {{{ Script-specific tailoring /
1667
1668	#include "break-arabic.c"
1669	#include "break-indic.c"
1670	#include "break-thai.c"
1671	#include "break-latin.c"
1672
1673	static gboolean
1674	break_script (const char *item_text,
1675	unsigned int item_length,
1676	const PangoAnalysis *analysis,
1677	PangoLogAttr *attrs,
1678	int attrs_len)
1679	{
1680	switch (analysis->script)
1681	{
1682	case PANGO_SCRIPT_ARABIC:
1683	break_arabic (text: item_text, length: item_length, analysis, attrs, attrs_len);
1684	break;
1685
1686	case PANGO_SCRIPT_DEVANAGARI:
1687	case PANGO_SCRIPT_BENGALI:
1688	case PANGO_SCRIPT_GURMUKHI:
1689	case PANGO_SCRIPT_GUJARATI:
1690	case PANGO_SCRIPT_ORIYA:
1691	case PANGO_SCRIPT_TAMIL:
1692	case PANGO_SCRIPT_TELUGU:
1693	case PANGO_SCRIPT_KANNADA:
1694	case PANGO_SCRIPT_MALAYALAM:
1695	case PANGO_SCRIPT_SINHALA:
1696	break_indic (text: item_text, length: item_length, analysis, attrs, attrs_len);
1697	break;
1698
1699	case PANGO_SCRIPT_THAI:
1700	break_thai (text: item_text, len: item_length, analysis, attrs, attrs_len);
1701	break;
1702
1703	case PANGO_SCRIPT_LATIN:
1704	break_latin (text: item_text, length: item_length, analysis, attrs, attrs_len);
1705	break;
1706
1707	default:
1708	return FALSE;
1709	}
1710
1711	return TRUE;
1712	}
1713
1714	/ }}} /
1715	/ {{{ Attribute-based customization /
1716
/* We allow customizing log attrs in two ways:
 *
 * - You can directly remove breaks from a range, using allow_breaks=false.
 *   We preserve the non-tailorable rules from UAX #14, so mandatory breaks
 *   and breaks after ZWS remain. We also preserve break opportunities after
 *   hyphens and visible word dividers.
 *
 * - You can tweak the segmentation by marking ranges as word or sentence.
 *   When doing so, we split adjacent segments to preserve alternating
 *   starts and ends. We add a line break opportunity before each word that
 *   is created in this way, and we remove line break opportunities inside
 *   the word in the same way as for a range marked as allow_breaks=false,
 *   except that we don't remove char break opportunities.
 *
 * Note that UAX #29 does not guarantee that words fall neatly into
 * sentences, so we don't do extra work to enforce that.
 */
1734
1735	static void
1736	remove_breaks_from_range (const char *text,
1737	int start,
1738	PangoLogAttr *log_attrs,
1739	int start_pos,
1740	int end_pos)
1741	{
1742	int pos;
1743	const char *p;
1744	gunichar ch;
1745	int bt;
1746	gboolean after_zws;
1747	gboolean after_hyphen;
1748
1749	/ Assume our range doesn't start after a hyphen or in a zws sequence /
1750	after_zws = FALSE;
1751	after_hyphen = FALSE;
1752	for (pos = start_pos + `1`, p = g_utf8_next_char (text + start);
1753	pos < end_pos;
1754	pos++, p = g_utf8_next_char (p))
1755	{
1756	/ Mandatory breaks aren't tailorable /
1757	if (!log_attrs[pos].is_mandatory_break)
1758	log_attrs[pos].is_line_break = FALSE;
1759
1760	ch = g_utf8_get_char (p);
1761	bt = g_unichar_break_type (c: ch);
1762
1763	/ Hyphens and visible word dividers /
1764	if (after_hyphen)
1765	log_attrs[pos].is_line_break = TRUE;
1766
1767	after_hyphen = ch == `0x00ad` \|\| / Soft Hyphen /
1768	ch == `0x05A0` \|\| ch == `0x2010` \|\| / Breaking Hyphens /
1769	ch == `0x2012` \|\| ch == `0x2013` \|\|
1770	ch == `0x05BE` \|\| ch == `0x0F0B` \|\| / Visible word dividers /
1771	ch == `0x1361` \|\| ch == `0x17D8` \|\|
1772	ch == `0x17DA` \|\| ch == `0x2027` \|\|
1773	ch == `0x007C`;
1774
1775	/ ZWS sequence /
1776	if (after_zws && bt != G_UNICODE_BREAK_SPACE)
1777	log_attrs[pos].is_line_break = TRUE;
1778
1779	after_zws = bt == G_UNICODE_BREAK_ZERO_WIDTH_SPACE \|\|
1780	(bt == G_UNICODE_BREAK_SPACE && after_zws);
1781	}
1782	}
1783
1784	static gboolean
1785	handle_allow_breaks (const char *text,
1786	int length,
1787	PangoAttrList *attrs,
1788	int offset,
1789	PangoLogAttr *log_attrs,
1790	int log_attrs_len)
1791	{
1792	PangoAttrIterator iter;
1793	gboolean tailored = FALSE;
1794
1795	_pango_attr_list_get_iterator (list: attrs, iterator: &iter);
1796
1797	do
1798	{
1799	const PangoAttribute *attr = pango_attr_iterator_get (iterator: &iter, type: PANGO_ATTR_ALLOW_BREAKS);
1800
1801	if (!attr)
1802	continue;
1803
1804	if (!((PangoAttrInt*)attr)->value)
1805	{
1806	int start, end;
1807	int start_pos, end_pos;
1808	int pos;
1809
1810	start = attr->start_index;
1811	end = attr->end_index;
1812	if (start < offset)
1813	start_pos = `0`;
1814	else
1815	start_pos = g_utf8_pointer_to_offset (str: text, pos: text + start - offset);
1816	if (end >= offset + length)
1817	end_pos = log_attrs_len;
1818	else
1819	end_pos = g_utf8_pointer_to_offset (str: text, pos: text + end - offset);
1820
1821	for (pos = start_pos + `1`; pos < end_pos; pos++)
1822	log_attrs[pos].is_char_break = FALSE;
1823
1824	remove_breaks_from_range (text, MAX (start - offset, `0`), log_attrs, start_pos, end_pos);
1825
1826	tailored = TRUE;
1827	}
1828	}
1829	while (pango_attr_iterator_next (iterator: &iter));
1830
1831	_pango_attr_iterator_destroy (iterator: &iter);
1832
1833	return tailored;
1834	}
1835
1836
1837	static gboolean
1838	handle_words (const char *text,
1839	int length,
1840	PangoAttrList *attrs,
1841	int offset,
1842	PangoLogAttr *log_attrs,
1843	int log_attrs_len)
1844	{
1845	PangoAttrIterator iter;
1846	gboolean tailored = FALSE;
1847
1848	_pango_attr_list_get_iterator (list: attrs, iterator: &iter);
1849
1850	do
1851	{
1852	const PangoAttribute *attr = pango_attr_iterator_get (iterator: &iter, type: PANGO_ATTR_WORD);
1853	int start, end;
1854	int start_pos, end_pos;
1855	int pos;
1856
1857	if (!attr)
1858	continue;
1859
1860	start = attr->start_index;
1861	end = attr->end_index;
1862	if (start < offset)
1863	start_pos = `0`;
1864	else
1865	start_pos = g_utf8_pointer_to_offset (str: text, pos: text + start - offset);
1866	if (end >= offset + length)
1867	end_pos = log_attrs_len;
1868	else
1869	end_pos = g_utf8_pointer_to_offset (str: text, pos: text + end - offset);
1870
1871	for (pos = start_pos + `1`; pos < end_pos; pos++)
1872	{
1873	log_attrs[pos].is_word_start = FALSE;
1874	log_attrs[pos].is_word_end = FALSE;
1875	log_attrs[pos].is_word_boundary = FALSE;
1876	}
1877
1878	remove_breaks_from_range (text, MAX (start - offset, `0`), log_attrs,
1879	start_pos, end_pos);
1880
1881	if (start >= offset)
1882	{
1883	gboolean in_word = FALSE;
1884	for (pos = start_pos; pos >= `0`; pos--)
1885	{
1886	if (log_attrs[pos].is_word_end)
1887	{
1888	in_word = pos == start_pos;
1889	break;
1890	}
1891	if (pos < start_pos && log_attrs[pos].is_word_start)
1892	{
1893	in_word = TRUE;
1894	break;
1895	}
1896	}
1897	log_attrs[start_pos].is_word_start = TRUE;
1898	log_attrs[start_pos].is_word_end = in_word;
1899	log_attrs[start_pos].is_word_boundary = TRUE;
1900
1901	/ Allow line breaks before words /
1902	if (start_pos > `0`)
1903	log_attrs[start_pos].is_line_break = TRUE;
1904
1905	tailored = TRUE;
1906	}
1907
1908	if (end < offset + length)
1909	{
1910	gboolean in_word = FALSE;
1911	for (pos = end_pos; pos < log_attrs_len; pos++)
1912	{
1913	if (log_attrs[pos].is_word_start)
1914	{
1915	in_word = pos == end_pos;
1916	break;
1917	}
1918	if (pos > end_pos && log_attrs[pos].is_word_end)
1919	{
1920	in_word = TRUE;
1921	break;
1922	}
1923	}
1924	log_attrs[end_pos].is_word_start = in_word;
1925	log_attrs[end_pos].is_word_end = TRUE;
1926	log_attrs[end_pos].is_word_boundary = TRUE;
1927
1928	/ Allow line breaks before words /
1929	if (in_word)
1930	log_attrs[end_pos].is_line_break = TRUE;
1931
1932	tailored = TRUE;
1933	}
1934	}
1935	while (pango_attr_iterator_next (iterator: &iter));
1936
1937	_pango_attr_iterator_destroy (iterator: &iter);
1938
1939	return tailored;
1940	}
1941
1942	static gboolean
1943	handle_sentences (const char *text,
1944	int length,
1945	PangoAttrList *attrs,
1946	int offset,
1947	PangoLogAttr *log_attrs,
1948	int log_attrs_len)
1949	{
1950	PangoAttrIterator iter;
1951	gboolean tailored = FALSE;
1952
1953	_pango_attr_list_get_iterator (list: attrs, iterator: &iter);
1954
1955	do
1956	{
1957	const PangoAttribute *attr = pango_attr_iterator_get (iterator: &iter, type: PANGO_ATTR_SENTENCE);
1958	int start, end;
1959	int start_pos, end_pos;
1960	int pos;
1961
1962	if (!attr)
1963	continue;
1964
1965	start = attr->start_index;
1966	end = attr->end_index;
1967	if (start < offset)
1968	start_pos = `0`;
1969	else
1970	start_pos = g_utf8_pointer_to_offset (str: text, pos: text + start - offset);
1971	if (end >= offset + length)
1972	end_pos = log_attrs_len;
1973	else
1974	end_pos = g_utf8_pointer_to_offset (str: text, pos: text + end - offset);
1975
1976	for (pos = start_pos + `1`; pos < end_pos; pos++)
1977	{
1978	log_attrs[pos].is_sentence_start = FALSE;
1979	log_attrs[pos].is_sentence_end = FALSE;
1980	log_attrs[pos].is_sentence_boundary = FALSE;
1981
1982	tailored = TRUE;
1983	}
1984	if (start >= offset)
1985	{
1986	gboolean in_sentence = FALSE;
1987	for (pos = start_pos - `1`; pos >= `0`; pos--)
1988	{
1989	if (log_attrs[pos].is_sentence_end)
1990	break;
1991	if (log_attrs[pos].is_sentence_start)
1992	{
1993	in_sentence = TRUE;
1994	break;
1995	}
1996	}
1997	log_attrs[start_pos].is_sentence_start = TRUE;
1998	log_attrs[start_pos].is_sentence_end = in_sentence;
1999	log_attrs[start_pos].is_sentence_boundary = TRUE;
2000
2001	tailored = TRUE;
2002	}
2003	if (end < offset + length)
2004	{
2005	gboolean in_sentence = FALSE;
2006	for (pos = end_pos + `1`; end_pos < log_attrs_len; pos++)
2007	{
2008	if (log_attrs[pos].is_sentence_start)
2009	break;
2010	if (log_attrs[pos].is_sentence_end)
2011	{
2012	in_sentence = TRUE;
2013	break;
2014	}
2015	}
2016	log_attrs[end_pos].is_sentence_start = in_sentence;
2017	log_attrs[end_pos].is_sentence_end = TRUE;
2018	log_attrs[end_pos].is_sentence_boundary = TRUE;
2019
2020	tailored = TRUE;
2021	}
2022	}
2023	while (pango_attr_iterator_next (iterator: &iter));
2024
2025	_pango_attr_iterator_destroy (iterator: &iter);
2026
2027	return tailored;
2028	}
2029
2030	static gboolean
2031	handle_hyphens (const char *text,
2032	int length,
2033	PangoAttrList *attrs,
2034	int offset,
2035	PangoLogAttr *log_attrs,
2036	int log_attrs_len)
2037	{
2038	PangoAttrIterator iter;
2039	gboolean tailored = FALSE;
2040
2041	_pango_attr_list_get_iterator (list: attrs, iterator: &iter);
2042
2043	do {
2044	const PangoAttribute *attr = pango_attr_iterator_get (iterator: &iter, type: PANGO_ATTR_INSERT_HYPHENS);
2045
2046	if (attr && ((PangoAttrInt*)attr)->value == `0`)
2047	{
2048	int start, end;
2049	int start_pos, end_pos;
2050	int pos;
2051
2052	pango_attr_iterator_range (iterator: &iter, start: &start, end: &end);
2053	if (start < offset)
2054	start_pos = `0`;
2055	else
2056	start_pos = g_utf8_pointer_to_offset (str: text, pos: text + start - offset);
2057	if (end >= offset + length)
2058	end_pos = log_attrs_len;
2059	else
2060	end_pos = g_utf8_pointer_to_offset (str: text, pos: text + end - offset);
2061
2062	for (pos = start_pos + `1`; pos < end_pos; pos++)
2063	{
2064	if (!log_attrs[pos].break_removes_preceding)
2065	{
2066	log_attrs[pos].break_inserts_hyphen = FALSE;
2067
2068	tailored = TRUE;
2069	}
2070	}
2071	}
2072	} while (pango_attr_iterator_next (iterator: &iter));
2073
2074	_pango_attr_iterator_destroy (iterator: &iter);
2075
2076	return tailored;
2077	}
2078
2079	static gboolean
2080	break_attrs (const char *text,
2081	int length,
2082	GSList *attributes,
2083	int offset,
2084	PangoLogAttr *log_attrs,
2085	int log_attrs_len)
2086	{
2087	PangoAttrList allow_breaks;
2088	PangoAttrList words;
2089	PangoAttrList sentences;
2090	PangoAttrList hyphens;
2091	GSList *l;
2092	gboolean tailored = FALSE;
2093
2094	_pango_attr_list_init (list: &allow_breaks);
2095	_pango_attr_list_init (list: &words);
2096	_pango_attr_list_init (list: &sentences);
2097	_pango_attr_list_init (list: &hyphens);
2098
2099	for (l = attributes; l; l = l->next)
2100	{
2101	PangoAttribute *attr = l->data;
2102
2103	if (attr->klass->type == PANGO_ATTR_ALLOW_BREAKS)
2104	pango_attr_list_insert (list: &allow_breaks, attr: pango_attribute_copy (attr));
2105	else if (attr->klass->type == PANGO_ATTR_WORD)
2106	pango_attr_list_insert (list: &words, attr: pango_attribute_copy (attr));
2107	else if (attr->klass->type == PANGO_ATTR_SENTENCE)
2108	pango_attr_list_insert (list: &sentences, attr: pango_attribute_copy (attr));
2109	else if (attr->klass->type == PANGO_ATTR_INSERT_HYPHENS)
2110	pango_attr_list_insert (list: &hyphens, attr: pango_attribute_copy (attr));
2111	}
2112
2113	tailored \|= handle_words (text, length, attrs: &words, offset,
2114	log_attrs, log_attrs_len);
2115
2116	tailored \|= handle_sentences (text, length, attrs: &words, offset,
2117	log_attrs, log_attrs_len);
2118
2119	tailored \|= handle_hyphens (text, length, attrs: &hyphens, offset,
2120	log_attrs, log_attrs_len);
2121
2122	tailored \|= handle_allow_breaks (text, length, attrs: &allow_breaks, offset,
2123	log_attrs, log_attrs_len);
2124
2125	_pango_attr_list_destroy (list: &allow_breaks);
2126	_pango_attr_list_destroy (list: &words);
2127	_pango_attr_list_destroy (list: &sentences);
2128	_pango_attr_list_destroy (list: &hyphens);
2129
2130	return tailored;
2131	}
2132
/* }}} */
2134
2135	static gboolean
2136	tailor_break (const char *text,
2137	int length,
2138	PangoAnalysis *analysis,
2139	int item_offset,
2140	PangoLogAttr *attrs,
2141	int attrs_len)
2142	{
2143	gboolean res;
2144
2145	if (length < `0`)
2146	length = strlen (s: text);
2147	else if (text == NULL)
2148	text = "";
2149
2150	res = break_script (item_text: text, item_length: length, analysis, attrs, attrs_len);
2151
2152	if (item_offset >= `0` && analysis->extra_attrs)
2153	res \|= break_attrs (text, length, attributes: analysis->extra_attrs, offset: item_offset, log_attrs: attrs, log_attrs_len: attrs_len);
2154
2155	return res;
2156	}
2157
/* }}} */
/* {{{ Public API */
2160
2161	/**
2162	* pango_default_break:
2163	* @text: text to break. Must be valid UTF-8
2164	* @length: length of text in bytes (may be -1 if @text is nul-terminated)
2165	* @analysis: (nullable): a `PangoAnalysis` structure for the @text
2166	* @attrs: logical attributes to fill in
2167	* @attrs_len: size of the array passed as @attrs
2168	*
2169	* This is the default break algorithm.
2170	*
2171	* It applies rules from the [Unicode Line Breaking Algorithm](http://www.unicode.org/unicode/reports/tr14/)
2172	* without language-specific tailoring, therefore the @analyis argument is unused
2173	* and can be %NULL.
2174	*
2175	* See [func@Pango.tailor_break] for language-specific breaks.
2176	*
2177	* See [func@Pango.attr_break] for attribute-based customization.
2178	*/
2179	void
2180	pango_default_break (const char *text,
2181	int length,
2182	PangoAnalysis *analysis G_GNUC_UNUSED,
2183	PangoLogAttr *attrs,
2184	int attrs_len G_GNUC_UNUSED)
2185	{
2186	PangoLogAttr before = *attrs;
2187
2188	default_break (text, length, analysis, attrs, attrs_len);
2189
2190	attrs->is_line_break \|= before.is_line_break;
2191	attrs->is_mandatory_break \|= before.is_mandatory_break;
2192	attrs->is_cursor_position \|= before.is_cursor_position;
2193	}
2194
2195	/**
2196	* pango_break:
2197	* @text: the text to process. Must be valid UTF-8
2198	* @length: length of @text in bytes (may be -1 if @text is nul-terminated)
2199	* @analysis: `PangoAnalysis` structure for @text
2200	* @attrs: (array length=attrs_len): an array to store character information in
2201	* @attrs_len: size of the array passed as @attrs
2202	*
2203	* Determines possible line, word, and character breaks
2204	* for a string of Unicode text with a single analysis.
2205	*
2206	* For most purposes you may want to use [func@Pango.get_log_attrs].
2207	*
2208	* Deprecated: 1.44: Use [func@Pango.default_break],
2209	* [func@Pango.tailor_break] and [func@Pango.attr_break].
2210	*/
2211	void
2212	pango_break (const char *text,
2213	gint length,
2214	PangoAnalysis *analysis,
2215	PangoLogAttr *attrs,
2216	int attrs_len)
2217	{
2218	g_return_if_fail (analysis != NULL);
2219	g_return_if_fail (attrs != NULL);
2220
2221	default_break (text, length, analysis, attrs, attrs_len);
2222	tailor_break (text, length, analysis, item_offset: -`1`, attrs, attrs_len);
2223	}
2224
2225	/**
2226	* pango_tailor_break:
2227	* @text: text to process. Must be valid UTF-8
2228	* @length: length in bytes of @text
2229	* @analysis: `PangoAnalysis` for @text
2230	* @offset: Byte offset of @text from the beginning of the
2231	* paragraph, or -1 to ignore attributes from @analysis
2232	* @attrs: (array length=attrs_len): array with one `PangoLogAttr`
2233	* per character in @text, plus one extra, to be filled in
2234	* @attrs_len: length of @attrs array
2235	*
2236	* Apply language-specific tailoring to the breaks in @attrs.
2237	*
2238	* The line breaks are assumed to have been produced by [func@Pango.default_break].
2239	*
2240	* If @offset is not -1, it is used to apply attributes from @analysis that are
2241	* relevant to line breaking.
2242	*
2243	* Note that it is better to pass -1 for @offset and use [func@Pango.attr_break]
2244	* to apply attributes to the whole paragraph.
2245	*
2246	* Since: 1.44
2247	*/
2248	void
2249	pango_tailor_break (const char *text,
2250	int length,
2251	PangoAnalysis *analysis,
2252	int offset,
2253	PangoLogAttr *attrs,
2254	int attrs_len)
2255	{
2256	PangoLogAttr *start = attrs;
2257	PangoLogAttr attr_before = *start;
2258
2259	if (tailor_break (text, length, analysis, item_offset: offset, attrs, attrs_len))
2260	{
2261	/ if tailored, we enforce some of the attrs from before*
2262	* tailoring at the boundary
2263	*/
2264
2265	start->backspace_deletes_character = attr_before.backspace_deletes_character;
2266
2267	start->is_line_break \|= attr_before.is_line_break;
2268	start->is_mandatory_break \|= attr_before.is_mandatory_break;
2269	start->is_cursor_position \|= attr_before.is_cursor_position;
2270	}
2271	}
2272
2273	/**
2274	* pango_attr_break:
2275	* @text: text to break. Must be valid UTF-8
2276	* @length: length of text in bytes (may be -1 if @text is nul-terminated)
2277	* @attr_list: `PangoAttrList` to apply
2278	* @offset: Byte offset of @text from the beginning of the paragraph
2279	* @attrs: (array length=attrs_len): array with one `PangoLogAttr`
2280	* per character in @text, plus one extra, to be filled in
2281	* @attrs_len: length of @attrs array
2282	*
2283	* Apply customization from attributes to the breaks in @attrs.
2284	*
2285	* The line breaks are assumed to have been produced
2286	* by [func@Pango.default_break] and [func@Pango.tailor_break].
2287	*
2288	* Since: 1.50
2289	*/
2290	void
2291	pango_attr_break (const char *text,
2292	int length,
2293	PangoAttrList *attr_list,
2294	int offset,
2295	PangoLogAttr *attrs,
2296	int attrs_len)
2297	{
2298	PangoLogAttr *start = attrs;
2299	PangoLogAttr attr_before = *start;
2300	GSList *attributes;
2301
2302	attributes = pango_attr_list_get_attributes (list: attr_list);
2303	if (break_attrs (text, length, attributes, offset, log_attrs: attrs, log_attrs_len: attrs_len))
2304	{
2305	/ if tailored, we enforce some of the attrs from before*
2306	* tailoring at the boundary
2307	*/
2308
2309	start->backspace_deletes_character = attr_before.backspace_deletes_character;
2310
2311	start->is_line_break \|= attr_before.is_line_break;
2312	start->is_mandatory_break \|= attr_before.is_mandatory_break;
2313	start->is_cursor_position \|= attr_before.is_cursor_position;
2314	}
2315
2316	g_slist_free_full (list: attributes, free_func: (GDestroyNotify)pango_attribute_destroy);
2317	}
2318
2319	/**
2320	* pango_get_log_attrs:
2321	* @text: text to process. Must be valid UTF-8
2322	* @length: length in bytes of @text
2323	* @level: embedding level, or -1 if unknown
2324	* @language: language tag
2325	* @attrs: (array length=attrs_len): array with one `PangoLogAttr`
2326	* per character in @text, plus one extra, to be filled in
2327	* @attrs_len: length of @attrs array
2328	*
2329	* Computes a `PangoLogAttr` for each character in @text.
2330	*
2331	* The @attrs array must have one `PangoLogAttr` for
2332	* each position in @text; if @text contains N characters,
2333	* it has N+1 positions, including the last position at the
2334	* end of the text. @text should be an entire paragraph;
2335	* logical attributes can't be computed without context
2336	* (for example you need to see spaces on either side of
2337	* a word to know the word is a word).
2338	*/
2339	void
2340	pango_get_log_attrs (const char *text,
2341	int length,
2342	int level,
2343	PangoLanguage *language,
2344	PangoLogAttr *attrs,
2345	int attrs_len)
2346	{
2347	int chars_broken;
2348	PangoAnalysis analysis = { NULL };
2349	PangoScriptIter iter;
2350
2351	g_return_if_fail (length == `0` \|\| text != NULL);
2352	g_return_if_fail (attrs != NULL);
2353
2354	analysis.level = level;
2355	analysis.language = language;
2356
2357	pango_default_break (text, length, analysis: &analysis, attrs, attrs_len);
2358
2359	chars_broken = `0`;
2360
2361	_pango_script_iter_init (iter: &iter, text, length);
2362	do
2363	{
2364	const char run_start, run_end;
2365	PangoScript script;
2366	int chars_in_range;
2367
2368	pango_script_iter_get_range (iter: &iter, start: &run_start, end: &run_end, script: &script);
2369	analysis.script = script;
2370
2371	chars_in_range = pango_utf8_strlen (p: run_start, max: run_end - run_start);
2372
2373	pango_tailor_break (text: run_start,
2374	length: run_end - run_start,
2375	analysis: &analysis,
2376	offset: -`1`,
2377	attrs: attrs + chars_broken,
2378	attrs_len: chars_in_range + `1`);
2379
2380	chars_broken += chars_in_range;
2381	}
2382	while (pango_script_iter_next (iter: &iter));
2383	_pango_script_iter_fini (iter: &iter);
2384
2385	if (chars_broken + `1` > attrs_len)
2386	g_warning ("pango_get_log_attrs: attrs_len should have been at least %d, but was %d. Expect corrupted memory.",
2387	chars_broken + `1`,
2388	attrs_len);
2389	}
2390
/* }}} */

/* vim:set foldmethod=marker expandtab: */
2394

source code of gtk/subprojects/pango/pango/break.c