1 | /* Pango |
2 | * testboundaries.c: Test text boundary algorithms |
3 | * |
4 | * Copyright (C) 1999-2000 Red Hat Software |
5 | * |
6 | * This library is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Library General Public |
8 | * License as published by the Free Software Foundation; either |
9 | * version 2 of the License, or (at your option) any later version. |
10 | * |
11 | * This library is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Library General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Library General Public |
17 | * License along with this library; if not, write to the |
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
19 | * Boston, MA 02111-1307, USA. |
20 | */ |
21 | |
22 | #include <string.h> |
23 | #include <stdlib.h> |
24 | #include <stdio.h> |
25 | |
26 | #include <glib.h> |
27 | #include <pango/pango.h> |
28 | |
29 | #ifndef G_OS_WIN32 |
30 | #include <unistd.h> |
31 | #endif |
32 | |
33 | #define CHFORMAT "%0#6x" |
34 | |
35 | /* FIXME for now this just tests that the breaking of some sample |
36 | * text conforms to certain rules and invariants. But eventually |
37 | * we should also have test-result pairs, i.e. a string and some |
38 | * encoding of the correct way to break the string, to check |
39 | * more precisely that things worked |
40 | */ |
41 | |
42 | |
/* Position bookkeeping maintained by log_attr_foreach() while it walks the
 * text.  In the part of the file reviewed here these variables are only
 * written, never read.
 */
static int offset = 0;                 /* 0-based index of the current character within its line */
static int line = 0;                   /* 1-based line number; bumped after each '\n' */
static gunichar current_wc = 0;        /* character currently being handed to the callback */
static const char *line_start = NULL;  /* first byte of the current line */
static const char *line_end = NULL;    /* byte just past the current character */
48 | |
/* Callback invoked by log_attr_foreach() once per character of the text.
 *
 * wc, prev_wc, next_wc:       the current character and its neighbours;
 *                             prev_wc is 0 for the first character and
 *                             next_wc is 0 for the last one.
 * type, prev_type, next_type: g_unichar_type() of the same characters;
 *                             prev_type is (GUnicodeType) -1 for the first
 *                             character, and next_type is only meaningful
 *                             while next_wc is nonzero.
 * attr, prev_attr, next_attr: log attributes at the matching positions;
 *                             prev_attr is NULL for the first character and
 *                             next_attr is NULL for the last one.
 * data:                       opaque pointer forwarded from the caller.
 */
typedef void (* CharForeachFunc) (gunichar      wc,
                                  gunichar      prev_wc,
                                  gunichar      next_wc,
                                  GUnicodeType  type,
                                  GUnicodeType  prev_type,
                                  GUnicodeType  next_type,
                                  PangoLogAttr *attr,
                                  PangoLogAttr *prev_attr,
                                  PangoLogAttr *next_attr,
                                  gpointer      data);
59 | |
60 | static void |
61 | log_attr_foreach (const char *text, |
62 | PangoLogAttr *attrs, |
63 | CharForeachFunc func, |
64 | gpointer data) |
65 | { |
66 | const gchar *next = text; |
67 | gint length = strlen (s: text); |
68 | const gchar *end = text + length; |
69 | gint i = 0; |
70 | gunichar prev_wc; |
71 | gunichar next_wc; |
72 | GUnicodeType prev_type; |
73 | GUnicodeType next_type; |
74 | |
75 | if (next == end) |
76 | return; |
77 | |
78 | offset = 0; |
79 | line = 1; |
80 | |
81 | prev_type = (GUnicodeType) -1; |
82 | prev_wc = 0; |
83 | |
84 | next_wc = g_utf8_get_char (p: next); |
85 | next_type = g_unichar_type (c: next_wc); |
86 | |
87 | line_start = text; |
88 | line_end = text; |
89 | |
90 | while (next_wc != 0) |
91 | { |
92 | GUnicodeType type; |
93 | gunichar wc; |
94 | |
95 | wc = next_wc; |
96 | type = next_type; |
97 | |
98 | current_wc = wc; |
99 | |
100 | next = g_utf8_next_char (next); |
101 | line_end = next; |
102 | |
103 | if (next >= end) |
104 | next_wc = 0; |
105 | else |
106 | next_wc = g_utf8_get_char (p: next); |
107 | |
108 | if (next_wc) |
109 | next_type = g_unichar_type (c: next_wc); |
110 | |
111 | (* func) (wc, prev_wc, next_wc, |
112 | type, prev_type, next_type, |
113 | &attrs[i], |
114 | i != 0 ? &attrs[i-1] : NULL, |
115 | next_wc != 0 ? &attrs[i+1] : NULL, |
116 | data); |
117 | |
118 | prev_type = type; |
119 | prev_wc = wc; |
120 | ++i; |
121 | ++offset; |
122 | if (wc == '\n') |
123 | { |
124 | ++line; |
125 | offset = 0; |
126 | line_start = next; |
127 | line_end = next; |
128 | } |
129 | } |
130 | } |
131 | |
132 | static void |
133 | check_line_char (gunichar wc, |
134 | gunichar prev_wc, |
135 | gunichar next_wc, |
136 | GUnicodeType type, |
137 | GUnicodeType prev_type, |
138 | GUnicodeType next_type, |
139 | PangoLogAttr *attr, |
140 | PangoLogAttr *prev_attr, |
141 | PangoLogAttr *next_attr, |
142 | gpointer data) |
143 | { |
144 | GUnicodeBreakType break_type; |
145 | GUnicodeBreakType prev_break_type; |
146 | |
147 | break_type = g_unichar_break_type (c: wc); |
148 | if (prev_wc) |
149 | prev_break_type = g_unichar_break_type (c: prev_wc); |
150 | else |
151 | prev_break_type = G_UNICODE_BREAK_UNKNOWN; |
152 | |
153 | if (wc == '\n') |
154 | { |
155 | if (prev_wc == '\r') |
156 | { |
157 | if (g_test_verbose ()) if (g_test_verbose ()) g_test_message (format: "Do not line break between \\r and \\n" ); |
158 | g_assert_false (attr->is_line_break); |
159 | } |
160 | |
161 | if (next_attr != NULL) |
162 | { |
163 | if (g_test_verbose ()) g_test_message (format: "Line break after \\n" ); |
164 | g_assert_true (next_attr->is_line_break); |
165 | } |
166 | } |
167 | |
168 | if (attr->is_line_break) |
169 | { |
170 | if (g_test_verbose ()) g_test_message (format: "first char in string should not be marked as a line break" ); |
171 | g_assert_false (prev_wc == 0); |
172 | } |
173 | |
174 | if (break_type == G_UNICODE_BREAK_SPACE) |
175 | { |
176 | if (g_test_verbose ()) g_test_message (format: "can't break lines before a space unless a mandatory break char precedes it or a combining mark follows; prev char was: " CHFORMAT, prev_wc); |
177 | g_assert_false (attr->is_line_break && prev_attr != NULL && |
178 | !attr->is_mandatory_break && |
179 | !(next_wc && g_unichar_break_type (next_wc) == G_UNICODE_BREAK_COMBINING_MARK)); |
180 | } |
181 | |
182 | if (attr->is_mandatory_break) |
183 | { |
184 | if (g_test_verbose ()) g_test_message (format: "mandatory breaks must also be marked as regular breaks" ); |
185 | g_assert_true (attr->is_line_break); |
186 | } |
187 | |
188 | |
189 | /* FIXME use the break tables from break.c to automatically |
190 | * check invariants for each cell in the table. Shouldn't |
191 | * be that hard to do. |
192 | */ |
193 | |
194 | if (g_test_verbose ()) g_test_message (format: "can't break between two open punctuation chars" ); |
195 | g_assert_false (break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION && |
196 | prev_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION && |
197 | attr->is_line_break && |
198 | !attr->is_mandatory_break); |
199 | |
200 | if (g_test_verbose ()) g_test_message (format: "can't break between two close punctuation chars" ); |
201 | g_assert_false (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION && |
202 | prev_break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION && |
203 | attr->is_line_break && |
204 | !attr->is_mandatory_break); |
205 | |
206 | if (g_test_verbose ()) g_test_message (format: "can't break letter-quotemark sequence" ); |
207 | g_assert_false (break_type == G_UNICODE_BREAK_QUOTATION && |
208 | prev_break_type == G_UNICODE_BREAK_ALPHABETIC && |
209 | attr->is_line_break && |
210 | !attr->is_mandatory_break); |
211 | } |
212 | |
213 | static void |
214 | check_line_invariants (const char *text, |
215 | PangoLogAttr *attrs) |
216 | { |
217 | log_attr_foreach (text, attrs, func: check_line_char, NULL); |
218 | } |
219 | |
220 | static void |
221 | check_word_invariants (const char *text, |
222 | PangoLogAttr *attrs) |
223 | { |
224 | |
225 | |
226 | } |
227 | |
228 | static void |
229 | check_sentence_invariants (const char *text, |
230 | PangoLogAttr *attrs) |
231 | { |
232 | |
233 | |
234 | } |
235 | |
236 | static void |
237 | check_grapheme_invariants (const char *text, |
238 | PangoLogAttr *attrs) |
239 | { |
240 | |
241 | |
242 | } |
243 | |
#if 0
/* Debugging aid (compiled out): prints the text of @text split at every
 * position whose entry in @attrs has is_sentence_boundary set, one chunk
 * per line on stdout.
 */
static void
print_sentences (const char   *text,
                 PangoLogAttr *attrs)
{
  const char *p = text;
  const char *last = text;
  int i = 0;

  while (*p)
    {
      if (attrs[i].is_sentence_boundary)
        {
          char *s = g_strndup (last, p - last);
          printf ("%s\n", s);
          g_free (s);
          last = p;
        }

      p = g_utf8_next_char (p);
      ++i;
    }

  /* The loop stops at the terminating NUL without looking at the final
   * position, so emit whatever follows the last boundary seen; otherwise
   * the closing sentence would never be printed.
   */
  if (p != last)
    {
      char *s = g_strndup (last, p - last);
      printf ("%s\n", s);
      g_free (s);
    }
}
#endif
273 | |
274 | static void |
275 | check_invariants (const char *text) |
276 | { |
277 | int len; |
278 | PangoLogAttr *attrs; |
279 | |
280 | g_assert_true (g_utf8_validate (text, -1, NULL)); |
281 | |
282 | len = g_utf8_strlen (p: text, max: -1); |
283 | attrs = g_new0 (PangoLogAttr, len + 1); |
284 | |
285 | pango_get_log_attrs (text, |
286 | length: -1, |
287 | level: 0, |
288 | language: pango_language_from_string (language: "C" ), |
289 | attrs, |
290 | attrs_len: len + 1); |
291 | |
292 | check_line_invariants (text, attrs); |
293 | check_sentence_invariants (text, attrs); |
294 | check_grapheme_invariants (text, attrs); |
295 | check_word_invariants (text, attrs); |
296 | |
297 | #if 0 |
298 | print_sentences (text, attrs); |
299 | #endif |
300 | |
301 | g_free (mem: attrs); |
302 | } |
303 | |
304 | static void |
305 | test_boundaries (void) |
306 | { |
307 | const char *filename; |
308 | GError *error = NULL; |
309 | char *text; |
310 | |
311 | filename = g_test_get_filename (file_type: G_TEST_DIST, first_path: "boundaries.utf8" , NULL); |
312 | |
313 | if (g_test_verbose ()) g_test_message (format: "sample file: %s\n" , filename); |
314 | |
315 | g_file_get_contents (filename, contents: &text, NULL, error: &error); |
316 | g_assert_no_error (error); |
317 | |
318 | check_invariants (text); |
319 | |
320 | g_free (mem: text); |
321 | } |
322 | |
323 | int |
324 | main (int argc, char *argv[]) |
325 | { |
326 | g_test_init (argc: &argc, argv: &argv, NULL); |
327 | |
328 | g_test_add_func (testpath: "/text/boundaries" , test_func: test_boundaries); |
329 | |
330 | return g_test_run (); |
331 | } |
332 | |