1 | /* |
2 | This file is part of the KDE libraries |
3 | |
4 | SPDX-FileCopyrightText: 1999 Lars Knoll <knoll@kde.org> |
5 | SPDX-FileCopyrightText: 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org> |
6 | SPDX-FileCopyrightText: 2007 Nick Shaforostoff <shafff@ukr.net> |
7 | |
8 | SPDX-License-Identifier: LGPL-2.0-or-later |
9 | */ |
10 | #include "kcharsets.h" |
11 | #include "kcharsets_p.h" |
12 | #include "kcodecs_debug.h" |
13 | |
14 | #include <QHash> |
15 | |
16 | #include <algorithm> |
17 | #include <assert.h> |
18 | |
19 | /* |
20 | * The encoding names (like "ISO 8859-1") in this list are user-visible, |
21 | * and should be mostly uppercase. |
22 | * Generate with generate_string_table.pl (located in kde-dev-scripts), |
23 | * input data: |
24 | ISO 8859-1 |
25 | i18n:Western European |
26 | ISO 8859-15 |
27 | i18n:Western European |
28 | ISO 8859-14 |
29 | i18n:Western European |
30 | cp 1252 |
31 | i18n:Western European |
32 | IBM850 |
33 | i18n:Western European |
34 | ISO 8859-2 |
35 | i18n:Central European |
36 | ISO 8859-3 |
37 | i18n:Central European |
38 | ISO 8859-4 |
39 | i18n:Baltic |
40 | ISO 8859-13 |
41 | i18n:Baltic |
42 | ISO 8859-16 |
43 | i18n:South-Eastern Europe |
44 | cp 1250 |
45 | i18n:Central European |
46 | cp 1254 |
47 | i18n:Turkish |
48 | cp 1257 |
49 | i18n:Baltic |
50 | KOI8-R |
51 | i18n:Cyrillic |
52 | ISO 8859-5 |
53 | i18n:Cyrillic |
54 | cp 1251 |
55 | i18n:Cyrillic |
56 | KOI8-U |
57 | i18n:Cyrillic |
58 | IBM866 |
59 | i18n:Cyrillic |
60 | Big5 |
61 | i18n:Chinese Traditional |
62 | Big5-HKSCS |
63 | i18n:Chinese Traditional |
64 | GB18030 |
65 | i18n:Chinese Simplified |
66 | GBK |
67 | i18n:Chinese Simplified |
68 | GB2312 |
69 | i18n:Chinese Simplified |
70 | EUC-KR |
71 | i18n:Korean |
72 | windows-949 |
73 | i18n:Korean |
74 | sjis |
75 | i18n:Japanese |
76 | ISO-2022-JP |
77 | i18n:Japanese |
78 | EUC-JP |
79 | i18n:Japanese |
80 | ISO 8859-7 |
81 | i18n:Greek |
82 | cp 1253 |
83 | i18n:Greek |
84 | ISO 8859-6 |
85 | i18n:Arabic |
86 | cp 1256 |
87 | i18n:Arabic |
88 | ISO 8859-8 |
89 | i18n:Hebrew |
90 | ISO 8859-8-I |
91 | i18n:Hebrew |
92 | cp 1255 |
93 | i18n:Hebrew |
94 | ISO 8859-9 |
95 | i18n:Turkish |
96 | TIS620 |
97 | i18n:Thai |
98 | ISO 8859-11 |
99 | i18n:Thai |
100 | UTF-8 |
101 | i18n:Unicode |
102 | UTF-16 |
103 | i18n:Unicode |
104 | utf7 |
105 | i18n:Unicode |
106 | ucs2 |
107 | i18n:Unicode |
108 | ISO 10646-UCS-2 |
109 | i18n:Unicode |
110 | windows-1258 |
111 | i18n:Other |
112 | IBM874 |
113 | i18n:Other |
114 | TSCII |
115 | i18n:Other |
116 | */ |
117 | /* |
118 | * Notes about the table: |
119 | * |
120 | * - The following entries were disabled and removed from the table: |
121 | ibm852 |
122 | i18n:Central European |
123 | pt 154 |
124 | i18n:Cyrillic // ### TODO "PT 154" seems to have been removed from Qt |
125 | * |
126 | * - ISO 8859-11 is the deprecated name of TIS-620 |
127 | * - utf7 is not in Qt |
128 | * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2" |
129 | * - windows-1258: TODO |
130 | * - IBM874: TODO |
131 | * - TSCII: TODO |
132 | */ |
133 | |
134 | /* |
135 | * This redefines the QT_TRANSLATE_NOOP3 macro provided by Qt to indicate that |
136 | * statically initialised text should be translated so that it expands to just |
137 | * the string that should be translated, making it possible to use it in the |
138 | * single string construct below. |
139 | */ |
140 | #undef QT_TRANSLATE_NOOP3 |
141 | #define QT_TRANSLATE_NOOP3(a, b, c) b |
142 | |
143 | /* |
144 | * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND. |
145 | * The script used was generate_string_table.pl which can be found in kde-dev-scripts. |
146 | * It was then edited to use QT_TRANSLATE_NOOP3 instead of I18N_NOOP. |
147 | */ |
148 | |
149 | static const char language_for_encoding_string[] = |
150 | "ISO 8859-1\0" |
151 | QT_TRANSLATE_NOOP3("KCharsets" , "Western European" , "@item Text character set" )"\0" |
152 | "ISO 8859-15\0" |
153 | "ISO 8859-14\0" |
154 | "cp 1252\0" |
155 | "IBM850\0" |
156 | "ISO 8859-2\0" |
157 | QT_TRANSLATE_NOOP3("KCharsets" , "Central European" , "@item Text character set" )"\0" |
158 | "ISO 8859-3\0" |
159 | "ISO 8859-4\0" |
160 | QT_TRANSLATE_NOOP3("KCharsets" , "Baltic" , "@item Text character set" )"\0" |
161 | "ISO 8859-13\0" |
162 | "ISO 8859-16\0" |
163 | QT_TRANSLATE_NOOP3("KCharsets" , "South-Eastern Europe" , "@item Text character set" )"\0" |
164 | "cp 1250\0" |
165 | "cp 1254\0" |
166 | QT_TRANSLATE_NOOP3("KCharsets" , "Turkish" , "@item Text character set" )"\0" |
167 | "cp 1257\0" |
168 | "KOI8-R\0" |
169 | QT_TRANSLATE_NOOP3("KCharsets" , "Cyrillic" , "@item Text character set" )"\0" |
170 | "ISO 8859-5\0" |
171 | "cp 1251\0" |
172 | "KOI8-U\0" |
173 | "IBM866\0" |
174 | "Big5\0" |
175 | QT_TRANSLATE_NOOP3("KCharsets" , "Chinese Traditional" , "@item Text character set" )"\0" |
176 | "Big5-HKSCS\0" |
177 | "GB18030\0" |
178 | QT_TRANSLATE_NOOP3("KCharsets" , "Chinese Simplified" , "@item Text character set" )"\0" |
179 | "GBK\0" |
180 | "GB2312\0" |
181 | "EUC-KR\0" |
182 | QT_TRANSLATE_NOOP3("KCharsets" , "Korean" , "@item Text character set" )"\0" |
183 | "windows-949\0" |
184 | "sjis\0" |
185 | QT_TRANSLATE_NOOP3("KCharsets" , "Japanese" , "@item Text character set" )"\0" |
186 | "ISO-2022-JP\0" |
187 | "EUC-JP\0" |
188 | "ISO 8859-7\0" |
189 | QT_TRANSLATE_NOOP3("KCharsets" , "Greek" , "@item Text character set" )"\0" |
190 | "cp 1253\0" |
191 | "ISO 8859-6\0" |
192 | QT_TRANSLATE_NOOP3("KCharsets" , "Arabic" , "@item Text character set" )"\0" |
193 | "cp 1256\0" |
194 | "ISO 8859-8\0" |
195 | QT_TRANSLATE_NOOP3("KCharsets" , "Hebrew" , "@item Text character set" )"\0" |
196 | "ISO 8859-8-I\0" |
197 | "cp 1255\0" |
198 | "ISO 8859-9\0" |
199 | "TIS620\0" |
200 | QT_TRANSLATE_NOOP3("KCharsets" , "Thai" , "@item Text character set" )"\0" |
201 | "ISO 8859-11\0" |
202 | "UTF-8\0" |
203 | QT_TRANSLATE_NOOP3("KCharsets" , "Unicode" , "@item Text character set" )"\0" |
204 | "UTF-16\0" |
205 | "utf7\0" |
206 | "ucs2\0" |
207 | "ISO 10646-UCS-2\0" |
208 | "windows-1258\0" |
209 | QT_TRANSLATE_NOOP3("KCharsets" , "Other" , "@item Text character set" )"\0" |
210 | "IBM874\0" |
211 | "TSCII\0" |
212 | "\0" ; |
213 | |
214 | static const int language_for_encoding_indices[] = { |
215 | 0, 11, 28, 11, 40, 11, 52, 11, 60, 11, 67, 78, 95, 78, 106, 117, 124, 117, 136, 148, 169, 78, 177, 185, 193, 117, 201, 208, 217, 208, 228, |
216 | 208, 236, 208, 243, 208, 250, 255, 275, 255, 286, 294, 313, 294, 317, 294, 324, 331, 338, 331, 350, 355, 364, 355, 376, 355, 383, 394, 400, 394, 408, 419, |
217 | 426, 419, 434, 445, 452, 445, 465, 445, 473, 185, 484, 491, 496, 491, 508, 514, 522, 514, 529, 514, 534, 514, 539, 514, 555, 568, 574, 568, 581, 568, -1}; |
218 | |
219 | /* |
220 | * GENERATED CODE ENDS HERE |
221 | */ |
222 | |
223 | struct KCharsetsSingletonPrivate { |
224 | KCharsets instance; |
225 | }; |
226 | |
227 | Q_GLOBAL_STATIC(KCharsetsSingletonPrivate, globalCharsets) |
228 | |
229 | // search an array of items index/data, find first matching index |
230 | // and return data, or return 0 |
231 | static inline const char *kcharsets_array_search(const char *start, const int *indices, const char *entry) |
232 | { |
233 | for (int i = 0; indices[i] != -1; i += 2) { |
234 | if (qstrcmp(str1: start + indices[i], str2: entry) == 0) { |
235 | return start + indices[i + 1]; |
236 | } |
237 | } |
238 | return nullptr; |
239 | } |
240 | |
241 | // -------------------------------------------------------------------------- |
242 | |
243 | KCharsets::KCharsets() |
244 | : d(new KCharsetsPrivate) |
245 | { |
246 | } |
247 | |
248 | KCharsets::~KCharsets() = default; |
249 | |
// Table of named character entities, looked up with a binary search.
//
// The table MUST be sorted in plain byte order of the names (the order
// std::strncmp produces, i.e. every upper-case letter sorts before every
// lower-case one), because the lookup uses std::lower_bound with a
// strncmp-based comparison. A static_assert below enforces this.

// Maximum number of characters in an entity name.
constexpr inline auto MAX_CODE_SIZE = 8;

// One entity: its name, zero-padded to MAX_CODE_SIZE bytes (a name of exactly
// MAX_CODE_SIZE characters has no terminating NUL), and its code point.
struct Entity {
    template<std::size_t N>
    constexpr inline Entity(const char (&n)[N], uint32_t c)
        : name{} // zero-fill, so shorter names are NUL-padded
        , code(c)
    {
        static_assert(N >= 1 && N - 1 <= static_cast<std::size_t>(MAX_CODE_SIZE), "entity name is longer than MAX_CODE_SIZE");
        // N counts the literal's terminating NUL, which is not copied.
        for (std::size_t i = 0; i + 1 < N; ++i) {
            name[i] = n[i];
        }
    }

    char name[MAX_CODE_SIZE];
    uint32_t code;
};

static constexpr inline const Entity entities[] = {
    {"AElig", 0x00c6},   {"AMP", 38},         {"Aacute", 0x00c1},  {"Acirc", 0x00c2},   {"Agrave", 0x00c0},  {"Alpha", 0x0391},   {"Aring", 0x00c5},
    {"Atilde", 0x00c3},  {"Auml", 0x00c4},    {"Beta", 0x0392},    {"Ccaron", 0x010c},  {"Ccedil", 0x00c7},  {"Chi", 0x03a7},     {"Dagger", 0x2021},
    {"Dcaron", 0x010e},  {"Delta", 0x0394},   {"ETH", 0x00d0},     {"Eacute", 0x00c9},  {"Eague", 0x00c9},   {"Ecaron", 0x011a},  {"Ecirc", 0x00ca},
    {"Egrave", 0x00c8},  {"Epsilon", 0x0395}, {"Eta", 0x0397},     {"Euml", 0x00cb},    {"GT", 62},          {"Gamma", 0x0393},   {"Iacute", 0x00cd},
    {"Icirc", 0x00ce},   {"Igrave", 0x00cc},  {"Iota", 0x0399},    {"Iuml", 0x00cf},    {"Kappa", 0x039a},   {"LT", 60},          {"Lambda", 0x039b},
    {"Mu", 0x039c},      {"Ncaron", 0x0147},  {"Ntilde", 0x00d1},  {"Nu", 0x039d},      {"OElig", 0x0152},   {"Oacute", 0x00d3},  {"Ocirc", 0x00d4},
    {"Ograve", 0x00d2},  {"Omega", 0x03a9},   {"Omicron", 0x039f}, {"Oslash", 0x00d8},  {"Otilde", 0x00d5},  {"Ouml", 0x00d6},    {"Phi", 0x03a6},
    {"Pi", 0x03a0},      {"Prime", 0x2033},   {"Psi", 0x03a8},     {"QUOT", 34},        {"Rcaron", 0x0158},  {"Rho", 0x03a1},     {"Scaron", 0x0160},
    {"Sigma", 0x03a3},   {"THORN", 0x00de},   {"Tau", 0x03a4},     {"Tcaron", 0x0164},  {"Theta", 0x0398},   {"Uacute", 0x00da},  {"Ucirc", 0x00db},
    {"Ugrave", 0x00d9},  {"Upsilon", 0x03a5}, {"Uring", 0x016e},   {"Uuml", 0x00dc},    {"Xi", 0x039e},      {"Yacute", 0x00dd},  {"Yuml", 0x0178},
    {"Zcaron", 0x017d},  {"Zeta", 0x0396},    {"aacute", 0x00e1},  {"acirc", 0x00e2},   {"acute", 0x00b4},   {"aelig", 0x00e6},   {"agrave", 0x00e0},
    {"alefsym", 0x2135}, {"alpha", 0x03b1},   {"amp", 38},         {"and", 0x2227},     {"ang", 0x2220},     {"apos", 0x0027},    {"aring", 0x00e5},
    {"asymp", 0x2248},   {"atilde", 0x00e3},  {"auml", 0x00e4},    {"bdquo", 0x201e},   {"beta", 0x03b2},    {"brvbar", 0x00a6},  {"bull", 0x2022},
    {"cap", 0x2229},     {"ccaron", 0x010d},  {"ccedil", 0x00e7},  {"cedil", 0x00b8},   {"cent", 0x00a2},    {"chi", 0x03c7},     {"circ", 0x02c6},
    {"clubs", 0x2663},   {"cong", 0x2245},    {"copy", 0x00a9},    {"crarr", 0x21b5},   {"cup", 0x222a},     {"curren", 0x00a4},  {"dArr", 0x21d3},
    {"dagger", 0x2020},  {"darr", 0x2193},    {"dcaron", 0x10f},   {"deg", 0x00b0},     {"delta", 0x03b4},   {"diams", 0x2666},   {"divide", 0x00f7},
    {"dol", 0x0024},     {"dollar", 0x0024},  {"eacute", 0x00e9},  {"eague", 0x00e9},   {"ecaron", 0x011b},  {"ecirc", 0x00ea},   {"egrave", 0x00e8},
    {"emdash", 0x2014},  {"empty", 0x2205},   {"emsp", 0x2003},    {"endash", 0x2013},  {"ensp", 0x2002},    {"epsilon", 0x03b5}, {"equiv", 0x2261},
    {"eta", 0x03b7},     {"eth", 0x00f0},     {"euml", 0x00eb},    {"euro", 0x20ac},    {"exist", 0x2203},   {"fnof", 0x0192},    {"forall", 0x2200},
    {"frac12", 0x00bd},  {"frac14", 0x00bc},  {"frac34", 0x00be},  {"frasl", 0x2044},   {"gamma", 0x03b3},   {"ge", 0x2265},      {"gt", 62},
    {"hArr", 0x21d4},    {"harr", 0x2194},    {"hearts", 0x2665},  {"hellip", 0x2026},  {"iacute", 0x00ed},  {"icirc", 0x00ee},   {"iexcl", 0x00a1},
    {"igrave", 0x00ec},  {"image", 0x2111},   {"infin", 0x221e},   {"int", 0x222b},     {"iota", 0x03b9},    {"iquest", 0x00bf},  {"isin", 0x2208},
    {"iuml", 0x00ef},    {"kappa", 0x03ba},   {"lArr", 0x21d0},    {"lambda", 0x03bb},  {"lang", 0x2329},    {"laquo", 0x00ab},   {"larr", 0x2190},
    {"lceil", 0x2308},   {"ldquo", 0x201c},   {"le", 0x2264},      {"lfloor", 0x230a},  {"lowast", 0x2217},  {"loz", 0x25ca},     {"lrm", 0x200e},
    {"lsaquo", 0x2039},  {"lsquo", 0x2018},   {"lt", 60},          {"macr", 0x00af},    {"mdash", 0x2014},   {"micro", 0x00b5},   {"middot", 0x00b7},
    {"minus", 0x2212},   {"mu", 0x03bc},      {"nabla", 0x2207},   {"nbsp", 0x00a0},    {"ncaron", 0x0148},  {"ndash", 0x2013},   {"ne", 0x2260},
    {"ni", 0x220b},      {"not", 0x00ac},     {"notin", 0x2209},   {"nsub", 0x2284},    {"ntilde", 0x00f1},  {"nu", 0x03bd},      {"oacute", 0x00f3},
    {"ocirc", 0x00f4},   {"oelig", 0x0153},   {"ograve", 0x00f2},  {"oline", 0x203e},   {"omega", 0x03c9},   {"omicron", 0x03bf}, {"oplus", 0x2295},
    {"or", 0x2228},      {"ordf", 0x00aa},    {"ordm", 0x00ba},    {"oslash", 0x00f8},  {"otilde", 0x00f5},  {"otimes", 0x2297},  {"ouml", 0x00f6},
    {"para", 0x00b6},    {"part", 0x2202},    {"percnt", 0x0025},  {"permil", 0x2030},  {"perp", 0x22a5},    {"phi", 0x03c6},     {"pi", 0x03c0},
    {"piv", 0x03d6},     {"plusmn", 0x00b1},  {"pound", 0x00a3},   {"prime", 0x2032},   {"prod", 0x220f},    {"prop", 0x221d},    {"psi", 0x03c8},
    {"quot", 34},        {"rArr", 0x21d2},    {"radic", 0x221a},   {"rang", 0x232a},    {"raquo", 0x00bb},   {"rarr", 0x2192},    {"rcaron", 0x0159},
    {"rceil", 0x2309},   {"rdquo", 0x201d},   {"real", 0x211c},    {"reg", 0x00ae},     {"rfloor", 0x230b},  {"rho", 0x03c1},     {"rlm", 0x200f},
    {"rsaquo", 0x203a},  {"rsquo", 0x2019},   {"sbquo", 0x201a},   {"scaron", 0x0161},  {"sdot", 0x22c5},    {"sect", 0x00a7},    {"shy", 0x00ad},
    {"sigma", 0x03c3},   {"sigmaf", 0x03c2},  {"sim", 0x223c},     {"spades", 0x2660},  {"sub", 0x2282},     {"sube", 0x2286},    {"sum", 0x2211},
    {"sup", 0x2283},     {"sup1", 0x00b9},    {"sup2", 0x00b2},    {"sup3", 0x00b3},    {"supe", 0x2287},    {"supl", 0x00b9},    {"szlig", 0x00df},
    {"tau", 0x03c4},     {"tcaron", 0x0165},  {"there4", 0x2234},  {"theta", 0x03b8},   {"thetasym", 0x03d1}, {"thinsp", 0x2009}, {"thorn", 0x00fe},
    {"tilde", 0x02dc},   {"times", 0x00d7},   {"trade", 0x2122},   {"uArr", 0x21d1},    {"uacute", 0x00fa},  {"uarr", 0x2191},    {"ucirc", 0x00fb},
    {"ugrave", 0x00f9},  {"uml", 0x00a8},     {"upsih", 0x03d2},   {"upsilon", 0x03c5}, {"uring", 0x016f},   {"uuml", 0x00fc},    {"weierp", 0x2118},
    {"xi", 0x03be},      {"yacute", 0x00fd},  {"yen", 0x00a5},     {"yuml", 0x00ff},    {"zcaron", 0x017e},  {"zeta", 0x03b6},    {"zwj", 0x200d},
    {"zwnj", 0x200c}};

// Byte-wise "sorts before" on two zero-padded names; usable at compile time.
// Because the names are zero-padded, this yields the same order as
// std::strncmp(lhs.name, rhs.name, MAX_CODE_SIZE) < 0.
static constexpr bool entityNamePrecedes(const Entity &lhs, const Entity &rhs)
{
    for (int i = 0; i < MAX_CODE_SIZE; ++i) {
        const auto l = static_cast<unsigned char>(lhs.name[i]);
        const auto r = static_cast<unsigned char>(rhs.name[i]);
        if (l != r) {
            return l < r;
        }
    }
    return false;
}

// True when every entry sorts strictly before its successor.
static constexpr bool entitiesAreStrictlySorted()
{
    constexpr std::size_t count = sizeof(entities) / sizeof(entities[0]);
    for (std::size_t i = 1; i < count; ++i) {
        if (!entityNamePrecedes(entities[i - 1], entities[i])) {
            return false;
        }
    }
    return true;
}

static_assert(entitiesAreStrictlySorted(), "entities[] must be sorted in byte order without duplicates; the binary search depends on it");
311 | |
312 | [[nodiscard]] static bool operator<(const Entity &lhs, const QByteArray &rhs) |
313 | { |
314 | return std::strncmp(s1: lhs.name, s2: rhs.constData(), n: MAX_CODE_SIZE) < 0; |
315 | } |
316 | |
317 | QChar KCharsets::fromEntity(QStringView str) |
318 | { |
319 | QChar res = QChar::Null; |
320 | |
321 | if (str.isEmpty()) { |
322 | return QChar::Null; |
323 | } |
324 | |
325 | int pos = 0; |
326 | if (str[pos] == QLatin1Char('&')) { |
327 | pos++; |
328 | } |
329 | |
330 | // Check for '�' or '�' sequence |
331 | if (str[pos] == QLatin1Char('#') && str.length() - pos > 1) { |
332 | bool ok; |
333 | pos++; |
334 | if (str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X')) { |
335 | pos++; |
336 | // '�', hexadecimal character reference |
337 | const auto tmp = str.mid(pos); |
338 | res = QChar(tmp.toInt(ok: &ok, base: 16)); |
339 | } else { |
340 | // '�', decimal character reference |
341 | const auto tmp = str.mid(pos); |
342 | res = QChar(tmp.toInt(ok: &ok, base: 10)); |
343 | } |
344 | if (ok) { |
345 | return res; |
346 | } else { |
347 | return QChar::Null; |
348 | } |
349 | } |
350 | |
351 | const QByteArray raw(str.toLatin1()); |
352 | const auto e = std::lower_bound(first: std::begin(arr: entities), last: std::end(arr: entities), val: raw); |
353 | |
354 | if (e == std::end(arr: entities) || raw.size() > MAX_CODE_SIZE || std::strncmp(s1: e->name, s2: raw.constData(), n: MAX_CODE_SIZE) != 0) { |
355 | return QChar::Null; |
356 | } |
357 | |
358 | return QChar(e->code); |
359 | } |
360 | |
361 | QChar KCharsets::fromEntity(QStringView str, int &len) |
362 | { |
363 | // entities are never longer than 8 chars... we start from |
364 | // that length and work backwards... |
365 | len = 8; |
366 | while (len > 0) { |
367 | const auto tmp = str.left(n: len); |
368 | QChar res = fromEntity(str: tmp); |
369 | if (res != QChar::Null) { |
370 | return res; |
371 | } |
372 | len--; |
373 | } |
374 | return QChar::Null; |
375 | } |
376 | |
377 | QString KCharsets::toEntity(const QChar &ch) |
378 | { |
379 | return QString::asprintf(format: "�x%x;" , ch.unicode()); |
380 | } |
381 | |
382 | QString KCharsets::resolveEntities(const QString &input) |
383 | { |
384 | QString text = input; |
385 | const QChar *p = text.unicode(); |
386 | const QChar *end = p + text.length(); |
387 | const QChar *ampersand = nullptr; |
388 | bool scanForSemicolon = false; |
389 | |
390 | for (; p < end; ++p) { |
391 | const QChar ch = *p; |
392 | |
393 | if (ch == QLatin1Char('&')) { |
394 | ampersand = p; |
395 | scanForSemicolon = true; |
396 | continue; |
397 | } |
398 | |
399 | if (ch != QLatin1Char(';') || scanForSemicolon == false) { |
400 | continue; |
401 | } |
402 | |
403 | assert(ampersand); |
404 | |
405 | scanForSemicolon = false; |
406 | |
407 | const QChar *entityBegin = ampersand + 1; |
408 | |
409 | const uint entityLength = p - entityBegin; |
410 | if (entityLength == 0) { |
411 | continue; |
412 | } |
413 | |
414 | const QChar entityValue = KCharsets::fromEntity(str: QStringView(entityBegin, entityLength)); |
415 | if (entityValue.isNull()) { |
416 | continue; |
417 | } |
418 | |
419 | const uint ampersandPos = ampersand - text.unicode(); |
420 | |
421 | text[(int)ampersandPos] = entityValue; |
422 | text.remove(i: ampersandPos + 1, len: entityLength + 1); |
423 | p = text.unicode() + ampersandPos; |
424 | end = text.unicode() + text.length(); |
425 | ampersand = nullptr; |
426 | } |
427 | |
428 | return text; |
429 | } |
430 | |
431 | QStringList KCharsets::availableEncodingNames() const |
432 | { |
433 | QStringList available; |
434 | for (const int *p = language_for_encoding_indices; *p != -1; p += 2) { |
435 | available.append(t: QString::fromUtf8(utf8: language_for_encoding_string + *p)); |
436 | } |
437 | available.sort(); |
438 | return available; |
439 | } |
440 | |
441 | QString KCharsets::descriptionForEncoding(QStringView encoding) const |
442 | { |
443 | const char *lang = kcharsets_array_search(start: language_for_encoding_string, indices: language_for_encoding_indices, entry: encoding.toUtf8().data()); |
444 | if (lang) { |
445 | return tr(sourceText: "%1 ( %2 )" , disambiguation: "@item %1 character set, %2 encoding" ).arg(args: tr(sourceText: lang, disambiguation: "@item Text character set" ), args&: encoding); |
446 | } else { |
447 | return tr(sourceText: "Other encoding (%1)" , disambiguation: "@item" ).arg(a: encoding); |
448 | } |
449 | } |
450 | |
451 | QString KCharsets::encodingForName(const QString &descriptiveName) const |
452 | { |
453 | const int left = descriptiveName.lastIndexOf(c: QLatin1Char('(')); |
454 | |
455 | if (left < 0) { // No parenthesis, so assume it is a normal encoding name |
456 | return descriptiveName.trimmed(); |
457 | } |
458 | |
459 | QString name(descriptiveName.mid(position: left + 1)); |
460 | |
461 | const int right = name.lastIndexOf(c: QLatin1Char(')')); |
462 | |
463 | if (right < 0) { |
464 | return name; |
465 | } |
466 | |
467 | return name.left(n: right).trimmed(); |
468 | } |
469 | |
470 | QStringList KCharsets::descriptiveEncodingNames() const |
471 | { |
472 | QStringList encodings; |
473 | for (const int *p = language_for_encoding_indices; *p != -1; p += 2) { |
474 | const QString name = QString::fromUtf8(utf8: language_for_encoding_string + p[0]); |
475 | const QString description = tr(sourceText: language_for_encoding_string + p[1], disambiguation: "@item Text character set" ); |
476 | encodings.append(t: tr(sourceText: "%1 ( %2 )" , disambiguation: "@item Text encoding: %1 character set, %2 encoding" ).arg(args: description, args: name)); |
477 | } |
478 | encodings.sort(); |
479 | return encodings; |
480 | } |
481 | |
482 | QList<QStringList> KCharsets::encodingsByScript() const |
483 | { |
484 | if (!d->encodingsByScript.isEmpty()) { |
485 | return d->encodingsByScript; |
486 | } |
487 | int i; |
488 | for (const int *p = language_for_encoding_indices; *p != -1; p += 2) { |
489 | const QString name = QString::fromUtf8(utf8: language_for_encoding_string + p[0]); |
490 | const QString description = tr(sourceText: language_for_encoding_string + p[1], disambiguation: "@item Text character set" ); |
491 | |
492 | for (i = 0; i < d->encodingsByScript.size(); ++i) { |
493 | if (d->encodingsByScript.at(i).at(i: 0) == description) { |
494 | d->encodingsByScript[i].append(t: name); |
495 | break; |
496 | } |
497 | } |
498 | |
499 | if (i == d->encodingsByScript.size()) { |
500 | d->encodingsByScript.append(t: QStringList() << description << name); |
501 | } |
502 | } |
503 | return d->encodingsByScript; |
504 | } |
505 | |
506 | KCharsets *KCharsets::charsets() |
507 | { |
508 | return &globalCharsets()->instance; |
509 | } |
510 | |