| 1 | /* |
| 2 | This file is part of the KDE libraries |
| 3 | |
| 4 | SPDX-FileCopyrightText: 1999 Lars Knoll <knoll@kde.org> |
| 5 | SPDX-FileCopyrightText: 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org> |
| 6 | SPDX-FileCopyrightText: 2007 Nick Shaforostoff <shafff@ukr.net> |
| 7 | |
| 8 | SPDX-License-Identifier: LGPL-2.0-or-later |
| 9 | */ |
| 10 | #include "kcharsets.h" |
| 11 | #include "kcharsets_p.h" |
| 12 | #include "kcodecs_debug.h" |
| 13 | |
| 14 | #include <QHash> |
| 15 | |
| 16 | #include <algorithm> |
| 17 | #include <assert.h> |
| 18 | |
| 19 | /* |
| 20 | * The encoding names (like "ISO 8859-1") in this list are user-visible, |
| 21 | * and should be mostly uppercase. |
| 22 | * Generate with generate_string_table.pl (located in kde-dev-scripts), |
| 23 | * input data: |
| 24 | ISO 8859-1 |
| 25 | i18n:Western European |
| 26 | ISO 8859-15 |
| 27 | i18n:Western European |
| 28 | ISO 8859-14 |
| 29 | i18n:Western European |
| 30 | cp 1252 |
| 31 | i18n:Western European |
| 32 | IBM850 |
| 33 | i18n:Western European |
| 34 | ISO 8859-2 |
| 35 | i18n:Central European |
| 36 | ISO 8859-3 |
| 37 | i18n:Central European |
| 38 | ISO 8859-4 |
| 39 | i18n:Baltic |
| 40 | ISO 8859-13 |
| 41 | i18n:Baltic |
| 42 | ISO 8859-16 |
| 43 | i18n:South-Eastern Europe |
| 44 | cp 1250 |
| 45 | i18n:Central European |
| 46 | cp 1254 |
| 47 | i18n:Turkish |
| 48 | cp 1257 |
| 49 | i18n:Baltic |
| 50 | KOI8-R |
| 51 | i18n:Cyrillic |
| 52 | ISO 8859-5 |
| 53 | i18n:Cyrillic |
| 54 | cp 1251 |
| 55 | i18n:Cyrillic |
| 56 | KOI8-U |
| 57 | i18n:Cyrillic |
| 58 | IBM866 |
| 59 | i18n:Cyrillic |
| 60 | Big5 |
| 61 | i18n:Chinese Traditional |
| 62 | Big5-HKSCS |
| 63 | i18n:Chinese Traditional |
| 64 | GB18030 |
| 65 | i18n:Chinese Simplified |
| 66 | GBK |
| 67 | i18n:Chinese Simplified |
| 68 | GB2312 |
| 69 | i18n:Chinese Simplified |
| 70 | EUC-KR |
| 71 | i18n:Korean |
| 72 | windows-949 |
| 73 | i18n:Korean |
| 74 | sjis |
| 75 | i18n:Japanese |
| 76 | ISO-2022-JP |
| 77 | i18n:Japanese |
| 78 | EUC-JP |
| 79 | i18n:Japanese |
| 80 | ISO 8859-7 |
| 81 | i18n:Greek |
| 82 | cp 1253 |
| 83 | i18n:Greek |
| 84 | ISO 8859-6 |
| 85 | i18n:Arabic |
| 86 | cp 1256 |
| 87 | i18n:Arabic |
| 88 | ISO 8859-8 |
| 89 | i18n:Hebrew |
| 90 | ISO 8859-8-I |
| 91 | i18n:Hebrew |
| 92 | cp 1255 |
| 93 | i18n:Hebrew |
| 94 | ISO 8859-9 |
| 95 | i18n:Turkish |
| 96 | TIS620 |
| 97 | i18n:Thai |
| 98 | ISO 8859-11 |
| 99 | i18n:Thai |
| 100 | UTF-8 |
| 101 | i18n:Unicode |
| 102 | UTF-16 |
| 103 | i18n:Unicode |
| 104 | utf7 |
| 105 | i18n:Unicode |
| 106 | ucs2 |
| 107 | i18n:Unicode |
| 108 | ISO 10646-UCS-2 |
| 109 | i18n:Unicode |
| 110 | windows-1258 |
| 111 | i18n:Other |
| 112 | IBM874 |
| 113 | i18n:Other |
| 114 | TSCII |
| 115 | i18n:Other |
| 116 | */ |
| 117 | /* |
| 118 | * Notes about the table: |
| 119 | * |
| 120 | * - The following entries were disabled and removed from the table: |
| 121 | ibm852 |
| 122 | i18n:Central European |
| 123 | pt 154 |
| 124 | i18n:Cyrillic // ### TODO "PT 154" seems to have been removed from Qt |
| 125 | * |
| 126 | * - ISO 8859-11 is the deprecated name of TIS-620 |
| 127 | * - utf7 is not in Qt |
| 128 | * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2" |
| 129 | * - windows-1258: TODO |
| 130 | * - IBM874: TODO |
| 131 | * - TSCII: TODO |
| 132 | */ |
| 133 | |
| 134 | /* |
| 135 | * This redefines the QT_TRANSLATE_NOOP3 macro provided by Qt to indicate that |
| 136 | * statically initialised text should be translated so that it expands to just |
| 137 | * the string that should be translated, making it possible to use it in the |
| 138 | * single string construct below. |
| 139 | */ |
| 140 | #undef QT_TRANSLATE_NOOP3 |
| 141 | #define QT_TRANSLATE_NOOP3(a, b, c) b |
| 142 | |
| 143 | /* |
| 144 | * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND. |
| 145 | * The script used was generate_string_table.pl which can be found in kde-dev-scripts. |
| 146 | * It was then edited to use QT_TRANSLATE_NOOP3 instead of I18N_NOOP. |
| 147 | */ |
| 148 | |
| 149 | static const char language_for_encoding_string[] = |
| 150 | "ISO 8859-1\0" |
| 151 | QT_TRANSLATE_NOOP3("KCharsets" , "Western European" , "@item Text character set" )"\0" |
| 152 | "ISO 8859-15\0" |
| 153 | "ISO 8859-14\0" |
| 154 | "cp 1252\0" |
| 155 | "IBM850\0" |
| 156 | "ISO 8859-2\0" |
| 157 | QT_TRANSLATE_NOOP3("KCharsets" , "Central European" , "@item Text character set" )"\0" |
| 158 | "ISO 8859-3\0" |
| 159 | "ISO 8859-4\0" |
| 160 | QT_TRANSLATE_NOOP3("KCharsets" , "Baltic" , "@item Text character set" )"\0" |
| 161 | "ISO 8859-13\0" |
| 162 | "ISO 8859-16\0" |
| 163 | QT_TRANSLATE_NOOP3("KCharsets" , "South-Eastern Europe" , "@item Text character set" )"\0" |
| 164 | "cp 1250\0" |
| 165 | "cp 1254\0" |
| 166 | QT_TRANSLATE_NOOP3("KCharsets" , "Turkish" , "@item Text character set" )"\0" |
| 167 | "cp 1257\0" |
| 168 | "KOI8-R\0" |
| 169 | QT_TRANSLATE_NOOP3("KCharsets" , "Cyrillic" , "@item Text character set" )"\0" |
| 170 | "ISO 8859-5\0" |
| 171 | "cp 1251\0" |
| 172 | "KOI8-U\0" |
| 173 | "IBM866\0" |
| 174 | "Big5\0" |
| 175 | QT_TRANSLATE_NOOP3("KCharsets" , "Chinese Traditional" , "@item Text character set" )"\0" |
| 176 | "Big5-HKSCS\0" |
| 177 | "GB18030\0" |
| 178 | QT_TRANSLATE_NOOP3("KCharsets" , "Chinese Simplified" , "@item Text character set" )"\0" |
| 179 | "GBK\0" |
| 180 | "GB2312\0" |
| 181 | "EUC-KR\0" |
| 182 | QT_TRANSLATE_NOOP3("KCharsets" , "Korean" , "@item Text character set" )"\0" |
| 183 | "windows-949\0" |
| 184 | "sjis\0" |
| 185 | QT_TRANSLATE_NOOP3("KCharsets" , "Japanese" , "@item Text character set" )"\0" |
| 186 | "ISO-2022-JP\0" |
| 187 | "EUC-JP\0" |
| 188 | "ISO 8859-7\0" |
| 189 | QT_TRANSLATE_NOOP3("KCharsets" , "Greek" , "@item Text character set" )"\0" |
| 190 | "cp 1253\0" |
| 191 | "ISO 8859-6\0" |
| 192 | QT_TRANSLATE_NOOP3("KCharsets" , "Arabic" , "@item Text character set" )"\0" |
| 193 | "cp 1256\0" |
| 194 | "ISO 8859-8\0" |
| 195 | QT_TRANSLATE_NOOP3("KCharsets" , "Hebrew" , "@item Text character set" )"\0" |
| 196 | "ISO 8859-8-I\0" |
| 197 | "cp 1255\0" |
| 198 | "ISO 8859-9\0" |
| 199 | "TIS620\0" |
| 200 | QT_TRANSLATE_NOOP3("KCharsets" , "Thai" , "@item Text character set" )"\0" |
| 201 | "ISO 8859-11\0" |
| 202 | "UTF-8\0" |
| 203 | QT_TRANSLATE_NOOP3("KCharsets" , "Unicode" , "@item Text character set" )"\0" |
| 204 | "UTF-16\0" |
| 205 | "utf7\0" |
| 206 | "ucs2\0" |
| 207 | "ISO 10646-UCS-2\0" |
| 208 | "windows-1258\0" |
| 209 | QT_TRANSLATE_NOOP3("KCharsets" , "Other" , "@item Text character set" )"\0" |
| 210 | "IBM874\0" |
| 211 | "TSCII\0" |
| 212 | "\0" ; |
| 213 | |
| 214 | static const int language_for_encoding_indices[] = { |
| 215 | 0, 11, 28, 11, 40, 11, 52, 11, 60, 11, 67, 78, 95, 78, 106, 117, 124, 117, 136, 148, 169, 78, 177, 185, 193, 117, 201, 208, 217, 208, 228, |
| 216 | 208, 236, 208, 243, 208, 250, 255, 275, 255, 286, 294, 313, 294, 317, 294, 324, 331, 338, 331, 350, 355, 364, 355, 376, 355, 383, 394, 400, 394, 408, 419, |
| 217 | 426, 419, 434, 445, 452, 445, 465, 445, 473, 185, 484, 491, 496, 491, 508, 514, 522, 514, 529, 514, 534, 514, 539, 514, 555, 568, 574, 568, 581, 568, -1}; |
| 218 | |
| 219 | /* |
| 220 | * GENERATED CODE ENDS HERE |
| 221 | */ |
| 222 | |
| 223 | struct KCharsetsSingletonPrivate { |
| 224 | KCharsets instance; |
| 225 | }; |
| 226 | |
| 227 | Q_GLOBAL_STATIC(KCharsetsSingletonPrivate, globalCharsets) |
| 228 | |
| 229 | // search an array of items index/data, find first matching index |
| 230 | // and return data, or return 0 |
| 231 | static inline const char *kcharsets_array_search(const char *start, const int *indices, const char *entry) |
| 232 | { |
| 233 | for (int i = 0; indices[i] != -1; i += 2) { |
| 234 | if (qstrcmp(str1: start + indices[i], str2: entry) == 0) { |
| 235 | return start + indices[i + 1]; |
| 236 | } |
| 237 | } |
| 238 | return nullptr; |
| 239 | } |
| 240 | |
| 241 | // -------------------------------------------------------------------------- |
| 242 | |
| 243 | KCharsets::KCharsets() |
| 244 | : d(new KCharsetsPrivate) |
| 245 | { |
| 246 | } |
| 247 | |
| 248 | KCharsets::~KCharsets() = default; |
| 249 | |
| 250 | // sorted entities list for lookup |
| 251 | constexpr inline auto MAX_CODE_SIZE = 8; |
| 252 | |
| 253 | struct Entity { |
| 254 | template<std::size_t N> |
| 255 | constexpr inline Entity(const char (&n)[N], uint32_t c) |
| 256 | : code(c) |
| 257 | { |
| 258 | for (std::size_t i = 0; i < N - 1; ++i) { |
| 259 | name[i] = n[i]; |
| 260 | } |
| 261 | for (std::size_t i = N - 1; i < MAX_CODE_SIZE; ++i) { |
| 262 | name[i] = '\0'; |
| 263 | } |
| 264 | } |
| 265 | |
| 266 | char name[MAX_CODE_SIZE]; |
| 267 | uint32_t code; |
| 268 | }; |
| 269 | static constexpr inline const Entity entities[] = { |
| 270 | {"AElig" , 0x00c6}, {"Aacute" , 0x00c1}, {"Acirc" , 0x00c2}, {"Agrave" , 0x00c0}, {"Alpha" , 0x0391}, {"AMP" , 38}, {"Aring" , 0x00c5}, |
| 271 | {"Atilde" , 0x00c3}, {"Auml" , 0x00c4}, {"Beta" , 0x0392}, {"Ccaron" , 0x010c}, {"Ccedil" , 0x00c7}, {"Chi" , 0x03a7}, {"Dagger" , 0x2021}, |
| 272 | {"Dcaron" , 0x010e}, {"Delta" , 0x0394}, {"ETH" , 0x00d0}, {"Eacute" , 0x00c9}, {"Ecaron" , 0x011a}, {"Ecirc" , 0x00ca}, {"Egrave" , 0x00c8}, |
| 273 | {"Eague" , 0x00c9}, {"Epsilon" , 0x0395}, {"Eta" , 0x0397}, {"Euml" , 0x00cb}, {"Gamma" , 0x0393}, {"GT" , 62}, {"Iacute" , 0x00cd}, |
| 274 | {"Icirc" , 0x00ce}, {"Igrave" , 0x00cc}, {"Iota" , 0x0399}, {"Iuml" , 0x00cf}, {"Kappa" , 0x039a}, {"Lambda" , 0x039b}, {"LT" , 60}, |
| 275 | {"Mu" , 0x039c}, {"Ncaron" , 0x0147}, {"Ntilde" , 0x00d1}, {"Nu" , 0x039d}, {"OElig" , 0x0152}, {"Oacute" , 0x00d3}, {"Ocirc" , 0x00d4}, |
| 276 | {"Ograve" , 0x00d2}, {"Omega" , 0x03a9}, {"Omicron" , 0x039f}, {"Oslash" , 0x00d8}, {"Otilde" , 0x00d5}, {"Ouml" , 0x00d6}, {"Phi" , 0x03a6}, |
| 277 | {"Pi" , 0x03a0}, {"Prime" , 0x2033}, {"Psi" , 0x03a8}, {"QUOT" , 34}, {"Rcaron" , 0x0158}, {"Rho" , 0x03a1}, {"Scaron" , 0x0160}, |
| 278 | {"Sigma" , 0x03a3}, {"THORN" , 0x00de}, {"Tau" , 0x03a4}, {"Tcaron" , 0x0164}, {"Theta" , 0x0398}, {"Uacute" , 0x00da}, {"Ucirc" , 0x00db}, |
| 279 | {"Ugrave" , 0x00d9}, {"Upsilon" , 0x03a5}, {"Uring" , 0x016e}, {"Uuml" , 0x00dc}, {"Xi" , 0x039e}, {"Yacute" , 0x00dd}, {"Yuml" , 0x0178}, |
| 280 | {"Zcaron" , 0x017d}, {"Zeta" , 0x0396}, {"aacute" , 0x00e1}, {"acirc" , 0x00e2}, {"acute" , 0x00b4}, {"aelig" , 0x00e6}, {"agrave" , 0x00e0}, |
| 281 | {"alefsym" , 0x2135}, {"alpha" , 0x03b1}, {"amp" , 38}, {"and" , 0x2227}, {"ang" , 0x2220}, {"apos" , 0x0027}, {"aring" , 0x00e5}, |
| 282 | {"asymp" , 0x2248}, {"atilde" , 0x00e3}, {"auml" , 0x00e4}, {"bdquo" , 0x201e}, {"beta" , 0x03b2}, {"brvbar" , 0x00a6}, {"bull" , 0x2022}, |
| 283 | {"cap" , 0x2229}, {"ccaron" , 0x010d}, {"ccedil" , 0x00e7}, {"cedil" , 0x00b8}, {"cent" , 0x00a2}, {"chi" , 0x03c7}, {"circ" , 0x02c6}, |
| 284 | {"clubs" , 0x2663}, {"cong" , 0x2245}, {"copy" , 0x00a9}, {"crarr" , 0x21b5}, {"cup" , 0x222a}, {"curren" , 0x00a4}, {"dArr" , 0x21d3}, |
| 285 | {"dagger" , 0x2020}, {"darr" , 0x2193}, {"dcaron" , 0x10f}, {"deg" , 0x00b0}, {"delta" , 0x03b4}, {"diams" , 0x2666}, {"divide" , 0x00f7}, |
| 286 | {"dol" , 0x0024}, {"dollar" , 0x0024}, {"eacute" , 0x00e9}, {"ecaron" , 0x011b}, {"eague" , 0x00e9}, {"ecirc" , 0x00ea}, {"egrave" , 0x00e8}, |
| 287 | {"emdash" , 0x2014}, {"empty" , 0x2205}, {"emsp" , 0x2003}, {"endash" , 0x2013}, {"ensp" , 0x2002}, {"epsilon" , 0x03b5}, {"equiv" , 0x2261}, |
| 288 | {"eta" , 0x03b7}, {"eth" , 0x00f0}, {"euml" , 0x00eb}, {"euro" , 0x20ac}, {"exist" , 0x2203}, {"fnof" , 0x0192}, {"forall" , 0x2200}, |
| 289 | {"frac12" , 0x00bd}, {"frac14" , 0x00bc}, {"frac34" , 0x00be}, {"frasl" , 0x2044}, {"gamma" , 0x03b3}, {"ge" , 0x2265}, {"gt" , 62}, |
| 290 | {"hArr" , 0x21d4}, {"harr" , 0x2194}, {"hearts" , 0x2665}, {"hellip" , 0x2026}, {"iacute" , 0x00ed}, {"icirc" , 0x00ee}, {"iexcl" , 0x00a1}, |
| 291 | {"igrave" , 0x00ec}, {"image" , 0x2111}, {"infin" , 0x221e}, {"int" , 0x222b}, {"iota" , 0x03b9}, {"iquest" , 0x00bf}, {"isin" , 0x2208}, |
| 292 | {"iuml" , 0x00ef}, {"kappa" , 0x03ba}, {"lArr" , 0x21d0}, {"lambda" , 0x03bb}, {"lang" , 0x2329}, {"laquo" , 0x00ab}, {"larr" , 0x2190}, |
| 293 | {"lceil" , 0x2308}, {"ldquo" , 0x201c}, {"le" , 0x2264}, {"lfloor" , 0x230a}, {"lowast" , 0x2217}, {"loz" , 0x25ca}, {"lrm" , 0x200e}, |
| 294 | {"lsaquo" , 0x2039}, {"lsquo" , 0x2018}, {"lt" , 60}, {"macr" , 0x00af}, {"mdash" , 0x2014}, {"micro" , 0x00b5}, {"middot" , 0x00b7}, |
| 295 | {"minus" , 0x2212}, {"mu" , 0x03bc}, {"nabla" , 0x2207}, {"nbsp" , 0x00a0}, {"ncaron" , 0x0148}, {"ndash" , 0x2013}, {"ne" , 0x2260}, |
| 296 | {"ni" , 0x220b}, {"not" , 0x00ac}, {"notin" , 0x2209}, {"nsub" , 0x2284}, {"ntilde" , 0x00f1}, {"nu" , 0x03bd}, {"oacute" , 0x00f3}, |
| 297 | {"ocirc" , 0x00f4}, {"oelig" , 0x0153}, {"ograve" , 0x00f2}, {"oline" , 0x203e}, {"omega" , 0x03c9}, {"omicron" , 0x03bf}, {"oplus" , 0x2295}, |
| 298 | {"or" , 0x2228}, {"ordf" , 0x00aa}, {"ordm" , 0x00ba}, {"oslash" , 0x00f8}, {"otilde" , 0x00f5}, {"otimes" , 0x2297}, {"ouml" , 0x00f6}, |
| 299 | {"para" , 0x00b6}, {"part" , 0x2202}, {"percnt" , 0x0025}, {"permil" , 0x2030}, {"perp" , 0x22a5}, {"phi" , 0x03c6}, {"pi" , 0x03c0}, |
| 300 | {"piv" , 0x03d6}, {"plusmn" , 0x00b1}, {"pound" , 0x00a3}, {"prime" , 0x2032}, {"prod" , 0x220f}, {"prop" , 0x221d}, {"psi" , 0x03c8}, |
| 301 | {"quot" , 34}, {"rArr" , 0x21d2}, {"radic" , 0x221a}, {"rang" , 0x232a}, {"raquo" , 0x00bb}, {"rarr" , 0x2192}, {"rcaron" , 0x0159}, |
| 302 | {"rceil" , 0x2309}, {"rdquo" , 0x201d}, {"real" , 0x211c}, {"reg" , 0x00ae}, {"rfloor" , 0x230b}, {"rho" , 0x03c1}, {"rlm" , 0x200f}, |
| 303 | {"rsaquo" , 0x203a}, {"rsquo" , 0x2019}, {"sbquo" , 0x201a}, {"scaron" , 0x0161}, {"sdot" , 0x22c5}, {"sect" , 0x00a7}, {"shy" , 0x00ad}, |
| 304 | {"sigma" , 0x03c3}, {"sigmaf" , 0x03c2}, {"sim" , 0x223c}, {"spades" , 0x2660}, {"sub" , 0x2282}, {"sube" , 0x2286}, {"sum" , 0x2211}, |
| 305 | {"sup1" , 0x00b9}, {"supl" , 0x00b9}, {"sup2" , 0x00b2}, {"sup3" , 0x00b3}, {"sup" , 0x2283}, {"supe" , 0x2287}, {"szlig" , 0x00df}, |
| 306 | {"tau" , 0x03c4}, {"tcaron" , 0x0165}, {"there4" , 0x2234}, {"theta" , 0x03b8}, {"thetasym" , 0x03d1}, {"thinsp" , 0x2009}, {"thorn" , 0x00fe}, |
| 307 | {"tilde" , 0x02dc}, {"times" , 0x00d7}, {"trade" , 0x2122}, {"uArr" , 0x21d1}, {"uacute" , 0x00fa}, {"uarr" , 0x2191}, {"ucirc" , 0x00fb}, |
| 308 | {"ugrave" , 0x00f9}, {"uml" , 0x00a8}, {"upsih" , 0x03d2}, {"upsilon" , 0x03c5}, {"uring" , 0x016f}, {"uuml" , 0x00fc}, {"weierp" , 0x2118}, |
| 309 | {"xi" , 0x03be}, {"yacute" , 0x00fd}, {"yen" , 0x00a5}, {"yuml" , 0x00ff}, {"zcaron" , 0x017e}, {"zeta" , 0x03b6}, {"zwj" , 0x200d}, |
| 310 | {"zwnj" , 0x200c}}; |
| 311 | |
| 312 | [[nodiscard]] static bool operator<(const Entity &lhs, const QByteArray &rhs) |
| 313 | { |
| 314 | return std::strncmp(s1: lhs.name, s2: rhs.constData(), n: MAX_CODE_SIZE) < 0; |
| 315 | } |
| 316 | |
| 317 | QChar KCharsets::fromEntity(QStringView str) |
| 318 | { |
| 319 | QChar res = QChar::Null; |
| 320 | |
| 321 | if (str.isEmpty()) { |
| 322 | return QChar::Null; |
| 323 | } |
| 324 | |
| 325 | int pos = 0; |
| 326 | if (str[pos] == QLatin1Char('&')) { |
| 327 | pos++; |
| 328 | } |
| 329 | |
| 330 | // Check for '�' or '�' sequence |
| 331 | if (str[pos] == QLatin1Char('#') && str.length() - pos > 1) { |
| 332 | bool ok; |
| 333 | pos++; |
| 334 | if (str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X')) { |
| 335 | pos++; |
| 336 | // '�', hexadecimal character reference |
| 337 | const auto tmp = str.mid(pos); |
| 338 | res = QChar(tmp.toInt(ok: &ok, base: 16)); |
| 339 | } else { |
| 340 | // '�', decimal character reference |
| 341 | const auto tmp = str.mid(pos); |
| 342 | res = QChar(tmp.toInt(ok: &ok, base: 10)); |
| 343 | } |
| 344 | if (ok) { |
| 345 | return res; |
| 346 | } else { |
| 347 | return QChar::Null; |
| 348 | } |
| 349 | } |
| 350 | |
| 351 | const QByteArray raw(str.toLatin1()); |
| 352 | const auto e = std::lower_bound(first: std::begin(arr: entities), last: std::end(arr: entities), val: raw); |
| 353 | |
| 354 | if (e == std::end(arr: entities) || raw.size() > MAX_CODE_SIZE || std::strncmp(s1: e->name, s2: raw.constData(), n: MAX_CODE_SIZE) != 0) { |
| 355 | return QChar::Null; |
| 356 | } |
| 357 | |
| 358 | return QChar(e->code); |
| 359 | } |
| 360 | |
| 361 | QChar KCharsets::fromEntity(QStringView str, int &len) |
| 362 | { |
| 363 | // entities are never longer than 8 chars... we start from |
| 364 | // that length and work backwards... |
| 365 | len = 8; |
| 366 | while (len > 0) { |
| 367 | const auto tmp = str.left(n: len); |
| 368 | QChar res = fromEntity(str: tmp); |
| 369 | if (res != QChar::Null) { |
| 370 | return res; |
| 371 | } |
| 372 | len--; |
| 373 | } |
| 374 | return QChar::Null; |
| 375 | } |
| 376 | |
| 377 | QString KCharsets::toEntity(const QChar &ch) |
| 378 | { |
| 379 | return QString::asprintf(format: "�x%x;" , ch.unicode()); |
| 380 | } |
| 381 | |
| 382 | QString KCharsets::resolveEntities(const QString &input) |
| 383 | { |
| 384 | QString text = input; |
| 385 | const QChar *p = text.unicode(); |
| 386 | const QChar *end = p + text.length(); |
| 387 | const QChar *ampersand = nullptr; |
| 388 | bool scanForSemicolon = false; |
| 389 | |
| 390 | for (; p < end; ++p) { |
| 391 | const QChar ch = *p; |
| 392 | |
| 393 | if (ch == QLatin1Char('&')) { |
| 394 | ampersand = p; |
| 395 | scanForSemicolon = true; |
| 396 | continue; |
| 397 | } |
| 398 | |
| 399 | if (ch != QLatin1Char(';') || scanForSemicolon == false) { |
| 400 | continue; |
| 401 | } |
| 402 | |
| 403 | assert(ampersand); |
| 404 | |
| 405 | scanForSemicolon = false; |
| 406 | |
| 407 | const QChar *entityBegin = ampersand + 1; |
| 408 | |
| 409 | const uint entityLength = p - entityBegin; |
| 410 | if (entityLength == 0) { |
| 411 | continue; |
| 412 | } |
| 413 | |
| 414 | const QChar entityValue = KCharsets::fromEntity(str: QStringView(entityBegin, entityLength)); |
| 415 | if (entityValue.isNull()) { |
| 416 | continue; |
| 417 | } |
| 418 | |
| 419 | const uint ampersandPos = ampersand - text.unicode(); |
| 420 | |
| 421 | text[(int)ampersandPos] = entityValue; |
| 422 | text.remove(i: ampersandPos + 1, len: entityLength + 1); |
| 423 | p = text.unicode() + ampersandPos; |
| 424 | end = text.unicode() + text.length(); |
| 425 | ampersand = nullptr; |
| 426 | } |
| 427 | |
| 428 | return text; |
| 429 | } |
| 430 | |
| 431 | QStringList KCharsets::availableEncodingNames() const |
| 432 | { |
| 433 | QStringList available; |
| 434 | for (const int *p = language_for_encoding_indices; *p != -1; p += 2) { |
| 435 | available.append(t: QString::fromUtf8(utf8: language_for_encoding_string + *p)); |
| 436 | } |
| 437 | available.sort(); |
| 438 | return available; |
| 439 | } |
| 440 | |
| 441 | QString KCharsets::descriptionForEncoding(QStringView encoding) const |
| 442 | { |
| 443 | const char *lang = kcharsets_array_search(start: language_for_encoding_string, indices: language_for_encoding_indices, entry: encoding.toUtf8().data()); |
| 444 | if (lang) { |
| 445 | return tr(sourceText: "%1 ( %2 )" , disambiguation: "@item %1 character set, %2 encoding" ).arg(args: tr(sourceText: lang, disambiguation: "@item Text character set" ), args&: encoding); |
| 446 | } else { |
| 447 | return tr(sourceText: "Other encoding (%1)" , disambiguation: "@item" ).arg(a: encoding); |
| 448 | } |
| 449 | } |
| 450 | |
| 451 | QString KCharsets::encodingForName(const QString &descriptiveName) const |
| 452 | { |
| 453 | const int left = descriptiveName.lastIndexOf(c: QLatin1Char('(')); |
| 454 | |
| 455 | if (left < 0) { // No parenthesis, so assume it is a normal encoding name |
| 456 | return descriptiveName.trimmed(); |
| 457 | } |
| 458 | |
| 459 | QString name(descriptiveName.mid(position: left + 1)); |
| 460 | |
| 461 | const int right = name.lastIndexOf(c: QLatin1Char(')')); |
| 462 | |
| 463 | if (right < 0) { |
| 464 | return name; |
| 465 | } |
| 466 | |
| 467 | return name.left(n: right).trimmed(); |
| 468 | } |
| 469 | |
| 470 | QStringList KCharsets::descriptiveEncodingNames() const |
| 471 | { |
| 472 | QStringList encodings; |
| 473 | for (const int *p = language_for_encoding_indices; *p != -1; p += 2) { |
| 474 | const QString name = QString::fromUtf8(utf8: language_for_encoding_string + p[0]); |
| 475 | const QString description = tr(sourceText: language_for_encoding_string + p[1], disambiguation: "@item Text character set" ); |
| 476 | encodings.append(t: tr(sourceText: "%1 ( %2 )" , disambiguation: "@item Text encoding: %1 character set, %2 encoding" ).arg(args: description, args: name)); |
| 477 | } |
| 478 | encodings.sort(); |
| 479 | return encodings; |
| 480 | } |
| 481 | |
| 482 | QList<QStringList> KCharsets::encodingsByScript() const |
| 483 | { |
| 484 | if (!d->encodingsByScript.isEmpty()) { |
| 485 | return d->encodingsByScript; |
| 486 | } |
| 487 | int i; |
| 488 | for (const int *p = language_for_encoding_indices; *p != -1; p += 2) { |
| 489 | const QString name = QString::fromUtf8(utf8: language_for_encoding_string + p[0]); |
| 490 | const QString description = tr(sourceText: language_for_encoding_string + p[1], disambiguation: "@item Text character set" ); |
| 491 | |
| 492 | for (i = 0; i < d->encodingsByScript.size(); ++i) { |
| 493 | if (d->encodingsByScript.at(i).at(i: 0) == description) { |
| 494 | d->encodingsByScript[i].append(t: name); |
| 495 | break; |
| 496 | } |
| 497 | } |
| 498 | |
| 499 | if (i == d->encodingsByScript.size()) { |
| 500 | d->encodingsByScript.append(t: QStringList() << description << name); |
| 501 | } |
| 502 | } |
| 503 | return d->encodingsByScript; |
| 504 | } |
| 505 | |
| 506 | KCharsets *KCharsets::charsets() |
| 507 | { |
| 508 | return &globalCharsets()->instance; |
| 509 | } |
| 510 | |