1 | /* |
2 | This file is part of the KDE libraries |
3 | SPDX-FileCopyrightText: 2007 Daniel Laidig <d.laidig@gmx.de> |
4 | |
5 | SPDX-License-Identifier: LGPL-2.0-or-later |
6 | */ |
7 | |
8 | #include "kcharselectdata_p.h" |
9 | |
10 | #include <QCoreApplication> |
11 | #include <QFile> |
12 | #include <QFutureInterface> |
13 | #include <QRegularExpression> |
14 | #include <QRunnable> |
15 | #include <QStringList> |
16 | #include <QThreadPool> |
17 | #include <qendian.h> |
18 | |
19 | #include <../test-config.h> |
20 | #include <qstandardpaths.h> |
21 | #include <string.h> |
22 | |
/*
 * Constants for Hangul syllable (de)composition, see UAX #15.
 *
 * Typed constants instead of object-like macros: they obey scoping rules and
 * cannot silently rewrite an unrelated identifier of the same name.
 */
static constexpr int SBase = 0xAC00; // first code point handled by the Hangul syllable arithmetic
static constexpr int LBase = 0x1100; // base of the leading consonants (Choseong)
static constexpr int VBase = 0x1161; // base of the vowels (Jungseong)
static constexpr int TBase = 0x11A7; // base of the trailing consonants (Jongsung); TBase itself means "none"
static constexpr int LCount = 19; // number of leading consonants
static constexpr int VCount = 21; // number of vowels
static constexpr int TCount = 28; // number of trailing consonants, including "none"
static constexpr int NCount = VCount * TCount; // syllables per leading consonant
static constexpr int SCount = LCount * NCount; // total number of precomposed syllables
33 | |
34 | class RunIndexCreation : public QFutureInterface<Index>, public QRunnable |
35 | { |
36 | public: |
37 | RunIndexCreation(KCharSelectData *data, const QByteArray &dataFile) |
38 | : m_data(data) |
39 | , m_dataFile(dataFile) |
40 | { |
41 | } |
42 | |
43 | QFuture<Index> start() |
44 | { |
45 | setRunnable(this); |
46 | reportStarted(); |
47 | QFuture<Index> f = this->future(); |
48 | QThreadPool::globalInstance()->start(runnable: this); |
49 | return f; |
50 | } |
51 | |
52 | void run() override |
53 | { |
54 | Index index = m_data->createIndex(dataFile: m_dataFile); |
55 | reportResult(result: index); |
56 | reportFinished(result: nullptr); |
57 | } |
58 | |
59 | private: |
60 | KCharSelectData *const m_data; |
61 | const QByteArray m_dataFile; |
62 | }; |
63 | |
// clang-format off
/* Name fragments of the leading consonants; name() indexes this with LIndex. */
static const char JAMO_L_TABLE[][4] = {
    "G",  "GG", "N",  "D",  "DD", "R",  "M",  "B",  "BB", "S",
    "SS", "",   "J",  "JJ", "C",  "K",  "T",  "P",  "H",
};

/* Name fragments of the vowels; name() indexes this with VIndex. */
static const char JAMO_V_TABLE[][4] = {
    "A",   "AE", "YA", "YAE", "EO",  "E",  "YEO",
    "YE",  "O",  "WA", "WAE", "OE",  "YO", "U",
    "WEO", "WE", "WI", "YU",  "EU",  "YI", "I",
};

/* Name fragments of the trailing consonants; entry 0 is the empty string. */
static const char JAMO_T_TABLE[][4] = {
    "",   "G",  "GG", "GS", "N",  "NJ", "NH",
    "D",  "L",  "LG", "LM", "LB", "LS", "LT",
    "LP", "LH", "M",  "B",  "BS", "S",  "SS",
    "NG", "J",  "C",  "K",  "T",  "P",  "H",
};
// clang-format on
82 | |
83 | bool KCharSelectData::openDataFile() |
84 | { |
85 | if (!dataFile.isEmpty()) { |
86 | return true; |
87 | } else { |
88 | QFile file(QStringLiteral(":/kf6/kcharselect/kcharselect-data" )); |
89 | file.open(flags: QIODevice::ReadOnly); |
90 | dataFile = file.readAll(); |
91 | file.close(); |
92 | if (dataFile.size() < 40) { |
93 | dataFile.clear(); |
94 | return false; |
95 | } |
96 | const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); |
97 | const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 20); |
98 | const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 24); |
99 | uint blocks = (offsetEnd - offsetBegin) / 4; |
100 | if (blocks <= 167) { // maximum possible number of blocks in BMP |
101 | // no remapping |
102 | remapType = -1; |
103 | } else if (blocks >= 174 && blocks <= 180) { |
104 | // remapping introduced in 5.25 |
105 | remapType = 0; |
106 | } else { |
107 | // unknown remapping, abort |
108 | dataFile.clear(); |
109 | return false; |
110 | } |
111 | futureIndex = (new RunIndexCreation(this, dataFile))->start(); |
112 | return true; |
113 | } |
114 | } |
115 | |
116 | // Temporary remapping code points <-> 16 bit database codes |
117 | // See kcharselect-generate-datafile.py for details |
118 | |
119 | quint16 KCharSelectData::mapCodePointToDataBase(uint code) const |
120 | { |
121 | if (remapType == 0) { |
122 | if (code >= 0xE000 && code <= 0xEFFF) { |
123 | return 0xFFFF; |
124 | } |
125 | if (code >= 0xF000 && code <= 0xFFFF) { |
126 | return code - 0x1000; |
127 | } |
128 | if (code >= 0x1F000 && code <= 0x1FFFF) { |
129 | return code - 0x10000; |
130 | } |
131 | } |
132 | if (code >= 0x10000) { |
133 | return 0xFFFF; |
134 | } |
135 | return code; |
136 | } |
137 | |
138 | uint KCharSelectData::mapDataBaseToCodePoint(quint16 code) const |
139 | { |
140 | if (remapType == 0) { |
141 | if (code >= 0xE000 && code <= 0xEFFF) { |
142 | return code + 0x1000; |
143 | } |
144 | if (code >= 0xF000) { |
145 | return code + 0x10000; |
146 | } |
147 | } |
148 | return code; |
149 | } |
150 | |
151 | quint32 KCharSelectData::getDetailIndex(uint c) const |
152 | { |
153 | const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); |
154 | // Convert from little-endian, so that this code works on PPC too. |
155 | // http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=482286 |
156 | const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 12); |
157 | const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 16); |
158 | |
159 | int min = 0; |
160 | int mid; |
161 | int max = ((offsetEnd - offsetBegin) / 27) - 1; |
162 | |
163 | quint16 unicode = mapCodePointToDataBase(code: c); |
164 | if (unicode == 0xFFFF) { |
165 | return 0; |
166 | } |
167 | |
168 | static quint16 most_recent_searched; |
169 | static quint32 most_recent_result; |
170 | |
171 | if (unicode == most_recent_searched) { |
172 | return most_recent_result; |
173 | } |
174 | |
175 | most_recent_searched = unicode; |
176 | |
177 | while (max >= min) { |
178 | mid = (min + max) / 2; |
179 | const quint16 midUnicode = qFromLittleEndian<quint16>(src: data + offsetBegin + mid * 27); |
180 | if (unicode > midUnicode) { |
181 | min = mid + 1; |
182 | } else if (unicode < midUnicode) { |
183 | max = mid - 1; |
184 | } else { |
185 | most_recent_result = offsetBegin + mid * 27; |
186 | |
187 | return most_recent_result; |
188 | } |
189 | } |
190 | |
191 | most_recent_result = 0; |
192 | return 0; |
193 | } |
194 | |
195 | QString KCharSelectData::formatCode(uint code, int length, const QString &prefix, int base) |
196 | { |
197 | QString s = QString::number(code, base).toUpper(); |
198 | while (s.size() < length) { |
199 | s.prepend(c: QLatin1Char('0')); |
200 | } |
201 | s.prepend(s: prefix); |
202 | return s; |
203 | } |
204 | |
205 | QList<uint> KCharSelectData::blockContents(int block) |
206 | { |
207 | if (!openDataFile()) { |
208 | return QList<uint>(); |
209 | } |
210 | |
211 | const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); |
212 | const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 20); |
213 | const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 24); |
214 | |
215 | int max = ((offsetEnd - offsetBegin) / 4) - 1; |
216 | |
217 | QList<uint> res; |
218 | |
219 | if (block > max) { |
220 | return res; |
221 | } |
222 | |
223 | quint16 unicodeBegin = qFromLittleEndian<quint16>(src: data + offsetBegin + block * 4); |
224 | quint16 unicodeEnd = qFromLittleEndian<quint16>(src: data + offsetBegin + block * 4 + 2); |
225 | |
226 | while (unicodeBegin < unicodeEnd) { |
227 | res.append(t: mapDataBaseToCodePoint(code: unicodeBegin)); |
228 | unicodeBegin++; |
229 | } |
230 | res.append(t: mapDataBaseToCodePoint(code: unicodeBegin)); // Be careful when unicodeEnd==0xffff |
231 | |
232 | return res; |
233 | } |
234 | |
235 | QList<int> KCharSelectData::sectionContents(int section) |
236 | { |
237 | section -= 1; |
238 | if (!openDataFile()) { |
239 | return QList<int>(); |
240 | } |
241 | |
242 | const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); |
243 | const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 28); |
244 | const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 32); |
245 | |
246 | int max = ((offsetEnd - offsetBegin) / 4) - 1; |
247 | |
248 | QList<int> res; |
249 | |
250 | if (section > max) { |
251 | return res; |
252 | } |
253 | |
254 | for (int i = 0; i <= max; i++) { |
255 | const quint16 currSection = qFromLittleEndian<quint16>(src: data + offsetBegin + i * 4); |
256 | if (currSection == section || section < 0) { |
257 | res.append(t: qFromLittleEndian<quint16>(src: data + offsetBegin + i * 4 + 2)); |
258 | } |
259 | } |
260 | |
261 | return res; |
262 | } |
263 | |
264 | QStringList KCharSelectData::sectionList() |
265 | { |
266 | if (!openDataFile()) { |
267 | return QStringList(); |
268 | } |
269 | |
270 | const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); |
271 | const quint32 stringBegin = qFromLittleEndian<quint32>(src: udata + 24); |
272 | const quint32 stringEnd = qFromLittleEndian<quint32>(src: udata + 28); |
273 | |
274 | const char *data = dataFile.constData(); |
275 | QStringList list; |
276 | quint32 i = stringBegin; |
277 | list.append(t: QCoreApplication::translate(context: "KCharSelectData" , key: "All" , disambiguation: "KCharSelect section name" )); |
278 | while (i < stringEnd) { |
279 | list.append(t: QCoreApplication::translate(context: "KCharSelectData" , key: data + i, disambiguation: "KCharSelect section name" )); |
280 | i += qstrlen(str: data + i) + 1; |
281 | } |
282 | |
283 | return list; |
284 | } |
285 | |
286 | QString KCharSelectData::block(uint c) |
287 | { |
288 | return blockName(index: blockIndex(c)); |
289 | } |
290 | |
291 | QString KCharSelectData::section(uint c) |
292 | { |
293 | return sectionName(index: sectionIndex(block: blockIndex(c))); |
294 | } |
295 | |
296 | QString KCharSelectData::name(uint c) |
297 | { |
298 | if (!openDataFile()) { |
299 | return QString(); |
300 | } |
301 | |
302 | if ((c & 0xFFFE) == 0xFFFE || (c >= 0xFDD0 && c <= 0xFDEF)) { |
303 | return QCoreApplication::translate(context: "KCharSelectData" , key: "<noncharacter>" ); |
304 | } else if ((c >= 0x3400 && c <= 0x4DBF) || (c >= 0x4E00 && c <= 0x9FFF) || (c >= 0x20000 && c <= 0x2F7FF)) { |
305 | return QLatin1String("CJK UNIFIED IDEOGRAPH-" ) + formatCode(code: c, length: 4, prefix: QString()); |
306 | } else if (c >= 0xAC00 && c <= 0xD7AF) { |
307 | /* compute hangul syllable name as per UAX #15 */ |
308 | int SIndex = c - SBase; |
309 | int LIndex; |
310 | int VIndex; |
311 | int TIndex; |
312 | |
313 | if (SIndex < 0 || SIndex >= SCount) { |
314 | return QString(); |
315 | } |
316 | |
317 | LIndex = SIndex / NCount; |
318 | VIndex = (SIndex % NCount) / TCount; |
319 | TIndex = SIndex % TCount; |
320 | |
321 | return QLatin1String("HANGUL SYLLABLE " ) + QLatin1String(JAMO_L_TABLE[LIndex]) + QLatin1String(JAMO_V_TABLE[VIndex]) |
322 | + QLatin1String(JAMO_T_TABLE[TIndex]); |
323 | } else if (c >= 0xD800 && c <= 0xDB7F) { |
324 | return QCoreApplication::translate(context: "KCharSelectData" , key: "<Non Private Use High Surrogate>" ); |
325 | } else if (c >= 0xDB80 && c <= 0xDBFF) { |
326 | return QCoreApplication::translate(context: "KCharSelectData" , key: "<Private Use High Surrogate>" ); |
327 | } else if (c >= 0xDC00 && c <= 0xDFFF) { |
328 | return QCoreApplication::translate(context: "KCharSelectData" , key: "<Low Surrogate>" ); |
329 | } else if ((c >= 0xE000 && c <= 0xF8FF) || c >= 0xF0000) { |
330 | return QCoreApplication::translate(context: "KCharSelectData" , key: "<Private Use>" ); |
331 | } else if ((c >= 0xF900 && c <= 0xFAFF) || (c >= 0x2F800 && c <= 0x2FFFF)) { |
332 | return QLatin1String("CJK COMPATIBILITY IDEOGRAPH-" ) + formatCode(code: c, length: 4, prefix: QString()); |
333 | } |
334 | quint16 unicode = mapCodePointToDataBase(code: c); |
335 | if (unicode == 0xFFFF) { |
336 | return QLatin1String("NON-BMP-CHARACTER-" ) + formatCode(code: c, length: 4, prefix: QString()); |
337 | } else { |
338 | const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); |
339 | const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 4); |
340 | const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 8); |
341 | |
342 | int min = 0; |
343 | int mid; |
344 | int max = ((offsetEnd - offsetBegin) / 6) - 1; |
345 | QString s; |
346 | |
347 | while (max >= min) { |
348 | mid = (min + max) / 2; |
349 | const quint16 midUnicode = qFromLittleEndian<quint16>(src: data + offsetBegin + mid * 6); |
350 | if (unicode > midUnicode) { |
351 | min = mid + 1; |
352 | } else if (unicode < midUnicode) { |
353 | max = mid - 1; |
354 | } else { |
355 | quint32 offset = qFromLittleEndian<quint32>(src: data + offsetBegin + mid * 6 + 2); |
356 | s = QString::fromUtf8(utf8: dataFile.constData() + offset + 1); |
357 | break; |
358 | } |
359 | } |
360 | |
361 | if (s.isNull()) { |
362 | return QCoreApplication::translate(context: "KCharSelectData" , key: "<not assigned>" ); |
363 | } else { |
364 | return s; |
365 | } |
366 | } |
367 | } |
368 | |
369 | int KCharSelectData::blockIndex(uint c) |
370 | { |
371 | if (!openDataFile()) { |
372 | return 0; |
373 | } |
374 | |
375 | const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); |
376 | const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 20); |
377 | const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 24); |
378 | const quint16 unicode = mapCodePointToDataBase(code: c); |
379 | if (unicode == 0xFFFF) { |
380 | return 0; |
381 | } |
382 | |
383 | int max = ((offsetEnd - offsetBegin) / 4) - 1; |
384 | |
385 | int i = 0; |
386 | |
387 | while (unicode > qFromLittleEndian<quint16>(src: data + offsetBegin + i * 4 + 2) && i < max) { |
388 | i++; |
389 | } |
390 | |
391 | return i; |
392 | } |
393 | |
394 | int KCharSelectData::sectionIndex(int block) |
395 | { |
396 | if (!openDataFile()) { |
397 | return 0; |
398 | } |
399 | |
400 | const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); |
401 | const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 28); |
402 | const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 32); |
403 | |
404 | int max = ((offsetEnd - offsetBegin) / 4) - 1; |
405 | |
406 | for (int i = 0; i <= max; i++) { |
407 | if (qFromLittleEndian<quint16>(src: data + offsetBegin + i * 4 + 2) == block) { |
408 | return qFromLittleEndian<quint16>(src: data + offsetBegin + i * 4) + 1; |
409 | } |
410 | } |
411 | |
412 | return 0; |
413 | } |
414 | |
415 | QString KCharSelectData::blockName(int index) |
416 | { |
417 | if (!openDataFile()) { |
418 | return QString(); |
419 | } |
420 | |
421 | const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); |
422 | const quint32 stringBegin = qFromLittleEndian<quint32>(src: udata + 16); |
423 | const quint32 stringEnd = qFromLittleEndian<quint32>(src: udata + 20); |
424 | |
425 | quint32 i = stringBegin; |
426 | int currIndex = 0; |
427 | |
428 | const char *data = dataFile.constData(); |
429 | while (i < stringEnd && currIndex < index) { |
430 | i += qstrlen(str: data + i) + 1; |
431 | currIndex++; |
432 | } |
433 | |
434 | return QCoreApplication::translate(context: "KCharSelectData" , key: data + i, disambiguation: "KCharselect unicode block name" ); |
435 | } |
436 | |
437 | QString KCharSelectData::sectionName(int index) |
438 | { |
439 | if (index == 0) { |
440 | return QCoreApplication::translate(context: "KCharSelectData" , key: "All" , disambiguation: "KCharselect unicode section name" ); |
441 | } |
442 | if (!openDataFile()) { |
443 | return QString(); |
444 | } |
445 | |
446 | index -= 1; |
447 | |
448 | const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); |
449 | const quint32 stringBegin = qFromLittleEndian<quint32>(src: udata + 24); |
450 | const quint32 stringEnd = qFromLittleEndian<quint32>(src: udata + 28); |
451 | |
452 | quint32 i = stringBegin; |
453 | int currIndex = 0; |
454 | |
455 | const char *data = dataFile.constData(); |
456 | while (i < stringEnd && currIndex < index) { |
457 | i += qstrlen(str: data + i) + 1; |
458 | currIndex++; |
459 | } |
460 | |
461 | return QCoreApplication::translate(context: "KCharSelectData" , key: data + i, disambiguation: "KCharselect unicode section name" ); |
462 | } |
463 | |
464 | QStringList KCharSelectData::aliases(uint c) |
465 | { |
466 | if (!openDataFile()) { |
467 | return QStringList(); |
468 | } |
469 | const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); |
470 | const int detailIndex = getDetailIndex(c); |
471 | if (detailIndex == 0) { |
472 | return QStringList(); |
473 | } |
474 | |
475 | const quint8 count = *(quint8 *)(udata + detailIndex + 6); |
476 | quint32 offset = qFromLittleEndian<quint32>(src: udata + detailIndex + 2); |
477 | |
478 | QStringList aliases; |
479 | aliases.reserve(asize: count); |
480 | |
481 | const char *data = dataFile.constData(); |
482 | for (int i = 0; i < count; i++) { |
483 | aliases.append(t: QString::fromUtf8(utf8: data + offset)); |
484 | offset += qstrlen(str: data + offset) + 1; |
485 | } |
486 | return aliases; |
487 | } |
488 | |
489 | QStringList KCharSelectData::notes(uint c) |
490 | { |
491 | if (!openDataFile()) { |
492 | return QStringList(); |
493 | } |
494 | const int detailIndex = getDetailIndex(c); |
495 | if (detailIndex == 0) { |
496 | return QStringList(); |
497 | } |
498 | |
499 | const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); |
500 | const quint8 count = *(quint8 *)(udata + detailIndex + 11); |
501 | quint32 offset = qFromLittleEndian<quint32>(src: udata + detailIndex + 7); |
502 | |
503 | QStringList notes; |
504 | notes.reserve(asize: count); |
505 | |
506 | const char *data = dataFile.constData(); |
507 | for (int i = 0; i < count; i++) { |
508 | notes.append(t: QString::fromUtf8(utf8: data + offset)); |
509 | offset += qstrlen(str: data + offset) + 1; |
510 | } |
511 | |
512 | return notes; |
513 | } |
514 | |
515 | QList<uint> KCharSelectData::seeAlso(uint c) |
516 | { |
517 | if (!openDataFile()) { |
518 | return QList<uint>(); |
519 | } |
520 | const int detailIndex = getDetailIndex(c); |
521 | if (detailIndex == 0) { |
522 | return QList<uint>(); |
523 | } |
524 | |
525 | const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); |
526 | const quint8 count = *(quint8 *)(udata + detailIndex + 26); |
527 | quint32 offset = qFromLittleEndian<quint32>(src: udata + detailIndex + 22); |
528 | |
529 | QList<uint> seeAlso; |
530 | seeAlso.reserve(asize: count); |
531 | |
532 | for (int i = 0; i < count; i++) { |
533 | seeAlso.append(t: mapDataBaseToCodePoint(code: qFromLittleEndian<quint16>(src: udata + offset))); |
534 | offset += 2; |
535 | } |
536 | |
537 | return seeAlso; |
538 | } |
539 | |
540 | QStringList KCharSelectData::equivalents(uint c) |
541 | { |
542 | if (!openDataFile()) { |
543 | return QStringList(); |
544 | } |
545 | const int detailIndex = getDetailIndex(c); |
546 | if (detailIndex == 0) { |
547 | return QStringList(); |
548 | } |
549 | |
550 | const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); |
551 | const quint8 count = *(quint8 *)(udata + detailIndex + 21); |
552 | quint32 offset = qFromLittleEndian<quint32>(src: udata + detailIndex + 17); |
553 | |
554 | QStringList equivalents; |
555 | equivalents.reserve(asize: count); |
556 | |
557 | const char *data = dataFile.constData(); |
558 | for (int i = 0; i < count; i++) { |
559 | equivalents.append(t: QString::fromUtf8(utf8: data + offset)); |
560 | offset += qstrlen(str: data + offset) + 1; |
561 | } |
562 | |
563 | return equivalents; |
564 | } |
565 | |
566 | QStringList KCharSelectData::approximateEquivalents(uint c) |
567 | { |
568 | if (!openDataFile()) { |
569 | return QStringList(); |
570 | } |
571 | const int detailIndex = getDetailIndex(c); |
572 | if (detailIndex == 0) { |
573 | return QStringList(); |
574 | } |
575 | |
576 | const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); |
577 | const quint8 count = *(quint8 *)(udata + detailIndex + 16); |
578 | quint32 offset = qFromLittleEndian<quint32>(src: udata + detailIndex + 12); |
579 | |
580 | QStringList approxEquivalents; |
581 | approxEquivalents.reserve(asize: count); |
582 | |
583 | const char *data = dataFile.constData(); |
584 | for (int i = 0; i < count; i++) { |
585 | approxEquivalents.append(t: QString::fromUtf8(utf8: data + offset)); |
586 | offset += qstrlen(str: data + offset) + 1; |
587 | } |
588 | |
589 | return approxEquivalents; |
590 | } |
591 | |
592 | QList<uint> KCharSelectData::decomposition(uint c) |
593 | { |
594 | // for now, only decompose Hangul Syllable into Hangul Jamo |
595 | uint SIndex = c - SBase; |
596 | if (SIndex >= SCount) { |
597 | return QList<uint>(); |
598 | } |
599 | |
600 | uint L = LBase + SIndex / NCount; // Choseong |
601 | uint V = VBase + (SIndex % NCount) / TCount; // Jungseong |
602 | uint T = TBase + SIndex % TCount; // Jongsung |
603 | QList<uint> jamoList; |
604 | jamoList.append(t: L); |
605 | jamoList.append(t: V); |
606 | if (T != TBase) { |
607 | jamoList.append(t: T); |
608 | } |
609 | return jamoList; |
610 | } |
611 | |
612 | QStringList KCharSelectData::unihanInfo(uint c) |
613 | { |
614 | if (!openDataFile()) { |
615 | return QStringList(); |
616 | } |
617 | |
618 | quint16 unicode = mapCodePointToDataBase(code: c); |
619 | if (unicode == 0xFFFF) { |
620 | return QStringList(); |
621 | } |
622 | |
623 | const char *data = dataFile.constData(); |
624 | const uchar *udata = reinterpret_cast<const uchar *>(data); |
625 | const quint32 offsetBegin = qFromLittleEndian<quint32>(src: udata + 36); |
626 | const quint32 offsetEnd = dataFile.size(); |
627 | |
628 | int min = 0; |
629 | int mid; |
630 | int max = ((offsetEnd - offsetBegin) / 30) - 1; |
631 | |
632 | while (max >= min) { |
633 | mid = (min + max) / 2; |
634 | const quint16 midUnicode = qFromLittleEndian<quint16>(src: udata + offsetBegin + mid * 30); |
635 | if (unicode > midUnicode) { |
636 | min = mid + 1; |
637 | } else if (unicode < midUnicode) { |
638 | max = mid - 1; |
639 | } else { |
640 | QStringList res; |
641 | res.reserve(asize: 7); |
642 | for (int i = 0; i < 7; i++) { |
643 | quint32 offset = qFromLittleEndian<quint32>(src: udata + offsetBegin + mid * 30 + 2 + i * 4); |
644 | if (offset != 0) { |
645 | res.append(t: QString::fromUtf8(utf8: data + offset)); |
646 | } else { |
647 | res.append(t: QString()); |
648 | } |
649 | } |
650 | return res; |
651 | } |
652 | } |
653 | |
654 | return QStringList(); |
655 | } |
656 | |
657 | QChar::Category KCharSelectData::category(uint c) |
658 | { |
659 | if (!openDataFile()) { |
660 | return QChar::category(ucs4: c); |
661 | } |
662 | |
663 | ushort unicode = mapCodePointToDataBase(code: c); |
664 | if (unicode == 0xFFFF) { |
665 | return QChar::category(ucs4: c); |
666 | } |
667 | |
668 | const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); |
669 | const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 4); |
670 | const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 8); |
671 | |
672 | int min = 0; |
673 | int mid; |
674 | int max = ((offsetEnd - offsetBegin) / 6) - 1; |
675 | |
676 | while (max >= min) { |
677 | mid = (min + max) / 2; |
678 | const quint16 midUnicode = qFromLittleEndian<quint16>(src: data + offsetBegin + mid * 6); |
679 | if (unicode > midUnicode) { |
680 | min = mid + 1; |
681 | } else if (unicode < midUnicode) { |
682 | max = mid - 1; |
683 | } else { |
684 | quint32 offset = qFromLittleEndian<quint32>(src: data + offsetBegin + mid * 6 + 2); |
685 | uchar categoryCode = *(data + offset); |
686 | Q_ASSERT(categoryCode > 0); |
687 | categoryCode--; /* Qt5 changed QChar::Category enum to start from 0 instead of 1 |
688 | See QtBase commit d17c76feee9eece4 */ |
689 | return QChar::Category(categoryCode); |
690 | } |
691 | } |
692 | |
693 | return QChar::category(ucs4: c); |
694 | } |
695 | |
696 | bool KCharSelectData::isPrint(uint c) |
697 | { |
698 | QChar::Category cat = category(c); |
699 | return !(cat == QChar::Other_Control || cat == QChar::Other_NotAssigned); |
700 | } |
701 | |
702 | bool KCharSelectData::isDisplayable(uint c) |
703 | { |
704 | // Qt internally uses U+FDD0 and U+FDD1 to mark the beginning and the end of frames. |
705 | // They should be seen as non-printable characters, as trying to display them leads |
706 | // to a crash caused by a Qt "noBlockInString" assertion. |
707 | if (c == 0xFDD0 || c == 0xFDD1) { |
708 | return false; |
709 | } |
710 | |
711 | return !isIgnorable(c) && isPrint(c); |
712 | } |
713 | |
714 | bool KCharSelectData::isIgnorable(uint c) |
715 | { |
716 | /* |
717 | * According to the Unicode standard, Default Ignorable Code Points |
718 | * should be ignored unless explicitly supported. For example, U+202E |
719 | * RIGHT-TO-LEFT-OVERRIDE ir printable according to Qt, but displaying |
720 | * it gives the undesired effect of all text being turned RTL. We do not |
721 | * have a way to "explicitly" support it, so we will treat it as |
722 | * non-printable. |
723 | * |
724 | * There is a list of these on |
725 | * http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt under the |
726 | * property Default_Ignorable_Code_Point. |
727 | */ |
728 | |
729 | // NOTE: not very nice to hardcode these here; is it worth it to modify |
730 | // the binary data file to hold them? |
731 | // clang-format off |
732 | return c == 0x00AD || c == 0x034F || c == 0x115F || c == 0x1160 || |
733 | c == 0x17B4 || c == 0x17B5 || (c >= 0x180B && c <= 0x180D) || |
734 | (c >= 0x200B && c <= 0x200F) || (c >= 0x202A && c <= 0x202E) || |
735 | (c >= 0x2060 && c <= 0x206F) || c == 0x3164 || |
736 | (c >= 0xFE00 && c <= 0xFE0F) || c == 0xFEFF || c == 0xFFA0 || |
737 | (c >= 0xFFF0 && c <= 0xFFF8); |
738 | // clang-format on |
739 | } |
740 | |
741 | bool KCharSelectData::isCombining(uint c) |
742 | { |
743 | return section(c) == QCoreApplication::translate(context: "KCharSelectData" , key: "Combining Diacritics" , disambiguation: "KCharSelect section name" ); |
744 | // FIXME: this is an imperfect test. There are many combining characters |
745 | // that are outside of this section. See Grapheme_Extend in |
746 | // http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt |
747 | } |
748 | |
749 | QString KCharSelectData::display(uint c, const QFont &font) |
750 | { |
751 | if (!isDisplayable(c)) { |
752 | return QLatin1String("<b>" ) + QCoreApplication::translate(context: "KCharSelectData" , key: "Non-printable" ) + QLatin1String("</b>" ); |
753 | } else { |
754 | QString s = QLatin1String("<font size=\"+4\" face=\"" ) + font.family() + QLatin1String("\">" ); |
755 | if (isCombining(c)) { |
756 | s += displayCombining(c); |
757 | } else { |
758 | s += QLatin1String("&#" ) + QString::number(c) + QLatin1Char(';'); |
759 | } |
760 | s += QLatin1String("</font>" ); |
761 | return s; |
762 | } |
763 | } |
764 | |
765 | QString KCharSelectData::displayCombining(uint c) |
766 | { |
767 | /* |
768 | * The purpose of this is to make it easier to see how a combining |
769 | * character affects the text around it. |
770 | * The initial plan was to use U+25CC DOTTED CIRCLE for this purpose, |
771 | * as seen in pdfs from Unicode, but there seem to be a lot of alignment |
772 | * problems with that. |
773 | * |
774 | * Eventually, it would be nice to determine whether the character |
775 | * combines to the left or to the right, etc. |
776 | */ |
777 | QString s = QLatin1String(" &#" ) + QString::number(c) + QLatin1String("; " ) + QLatin1String(" (ab&#" ) + QString::number(c) + QLatin1String(";c)" ); |
778 | return s; |
779 | } |
780 | |
781 | QString KCharSelectData::categoryText(QChar::Category category) |
782 | { |
783 | switch (category) { |
784 | case QChar::Other_Control: |
785 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Other, Control" ); |
786 | case QChar::Other_Format: |
787 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Other, Format" ); |
788 | case QChar::Other_NotAssigned: |
789 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Other, Not Assigned" ); |
790 | case QChar::Other_PrivateUse: |
791 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Other, Private Use" ); |
792 | case QChar::Other_Surrogate: |
793 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Other, Surrogate" ); |
794 | case QChar::Letter_Lowercase: |
795 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Letter, Lowercase" ); |
796 | case QChar::Letter_Modifier: |
797 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Letter, Modifier" ); |
798 | case QChar::Letter_Other: |
799 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Letter, Other" ); |
800 | case QChar::Letter_Titlecase: |
801 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Letter, Titlecase" ); |
802 | case QChar::Letter_Uppercase: |
803 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Letter, Uppercase" ); |
804 | case QChar::Mark_SpacingCombining: |
805 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Mark, Spacing Combining" ); |
806 | case QChar::Mark_Enclosing: |
807 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Mark, Enclosing" ); |
808 | case QChar::Mark_NonSpacing: |
809 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Mark, Non-Spacing" ); |
810 | case QChar::Number_DecimalDigit: |
811 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Number, Decimal Digit" ); |
812 | case QChar::Number_Letter: |
813 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Number, Letter" ); |
814 | case QChar::Number_Other: |
815 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Number, Other" ); |
816 | case QChar::Punctuation_Connector: |
817 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Punctuation, Connector" ); |
818 | case QChar::Punctuation_Dash: |
819 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Punctuation, Dash" ); |
820 | case QChar::Punctuation_Close: |
821 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Punctuation, Close" ); |
822 | case QChar::Punctuation_FinalQuote: |
823 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Punctuation, Final Quote" ); |
824 | case QChar::Punctuation_InitialQuote: |
825 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Punctuation, Initial Quote" ); |
826 | case QChar::Punctuation_Other: |
827 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Punctuation, Other" ); |
828 | case QChar::Punctuation_Open: |
829 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Punctuation, Open" ); |
830 | case QChar::Symbol_Currency: |
831 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Symbol, Currency" ); |
832 | case QChar::Symbol_Modifier: |
833 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Symbol, Modifier" ); |
834 | case QChar::Symbol_Math: |
835 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Symbol, Math" ); |
836 | case QChar::Symbol_Other: |
837 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Symbol, Other" ); |
838 | case QChar::Separator_Line: |
839 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Separator, Line" ); |
840 | case QChar::Separator_Paragraph: |
841 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Separator, Paragraph" ); |
842 | case QChar::Separator_Space: |
843 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Separator, Space" ); |
844 | default: |
845 | return QCoreApplication::translate(context: "KCharSelectData" , key: "Unknown" ); |
846 | } |
847 | } |
848 | |
849 | QList<uint> KCharSelectData::find(const QString &needle) |
850 | { |
851 | QSet<uint> result; |
852 | |
853 | QList<uint> returnRes; |
854 | QString simplified = needle.length() > 1 ? needle.simplified() : needle; |
855 | QStringList searchStrings; |
856 | |
857 | static const QRegularExpression octalExp(QStringLiteral("^\\\\[0-7][0-7\\\\]*$" )); |
858 | if (octalExp.match(subject: simplified).hasMatch()) { |
859 | // search for C octal escaped UTF-8 |
860 | QByteArray utf8; |
861 | int byte = -1; |
862 | for (int i = 0; i <= simplified.length(); ++i) { |
863 | int c = simplified.at(i).unicode(); |
864 | if (c >= '0' && c <= '7') { |
865 | byte = 8 * byte + c - '0'; |
866 | } else if (byte == -1) { |
867 | byte = 0; |
868 | } else if (byte >= 0x00 && byte <= 0xFF) { |
869 | utf8.append(c: (char)byte); |
870 | byte = 0; |
871 | } |
872 | } |
873 | simplified = QString::fromUtf8(ba: utf8); |
874 | } |
875 | |
876 | if (simplified.length() <= 2) { |
877 | QList<uint> ucs4 = simplified.toUcs4(); |
878 | if (ucs4.size() == 1) { |
879 | // search for hex representation of the character |
880 | searchStrings = QStringList(formatCode(code: ucs4.at(i: 0))); |
881 | } else { |
882 | searchStrings = splitString(s: simplified); |
883 | } |
884 | } else { |
885 | searchStrings = splitString(s: simplified); |
886 | } |
887 | |
888 | if (searchStrings.isEmpty()) { |
889 | return returnRes; |
890 | } |
891 | |
892 | static const QRegularExpression hexExp(QStringLiteral("^(?:|u\\+|U\\+|0x|0X)([A-Fa-f0-9]{4,5})$" )); |
893 | for (const QString &s : std::as_const(t&: searchStrings)) { |
894 | const QRegularExpressionMatch match = hexExp.match(subject: s); |
895 | if (match.hasMatch()) { |
896 | const QString cap = match.captured(nth: 1); |
897 | returnRes.append(t: cap.toInt(ok: nullptr, base: 16)); |
898 | // search for "1234" instead of "0x1234" |
899 | if (s.length() == 6 || s.length() == 7) { |
900 | searchStrings[searchStrings.indexOf(str: s)] = cap; |
901 | } |
902 | } |
903 | // try to parse string as decimal number |
904 | bool ok; |
905 | int unicode = s.toInt(ok: &ok); |
906 | if (ok && unicode >= 0 && unicode <= QChar::LastValidCodePoint) { |
907 | returnRes.append(t: unicode); |
908 | } |
909 | } |
910 | |
911 | bool firstSubString = true; |
912 | for (const QString &s : std::as_const(t&: searchStrings)) { |
913 | QSet<uint> partResult = getMatchingChars(s: s.toLower()); |
914 | if (firstSubString) { |
915 | result = partResult; |
916 | firstSubString = false; |
917 | } else { |
918 | result = result.intersect(other: partResult); |
919 | } |
920 | } |
921 | |
922 | // remove results found by matching the code point to prevent duplicate results |
923 | // while letting these characters stay at the beginning |
924 | for (uint c : std::as_const(t&: returnRes)) { |
925 | result.remove(value: c); |
926 | } |
927 | |
928 | QList<uint> sortedResult; |
929 | sortedResult.reserve(asize: result.count()); |
930 | for (auto c : std::as_const(t&: result)) { |
931 | sortedResult.append(t: c); |
932 | } |
933 | std::sort(first: sortedResult.begin(), last: sortedResult.end()); |
934 | |
935 | returnRes += sortedResult; |
936 | return returnRes; |
937 | } |
938 | |
939 | QSet<uint> KCharSelectData::getMatchingChars(const QString &s) |
940 | { |
941 | if (dataFile.isEmpty()) { |
942 | return QSet<uint>(); |
943 | } |
944 | futureIndex.waitForFinished(); |
945 | const Index index = futureIndex.result(); |
946 | Index::const_iterator pos = index.lowerBound(key: s); |
947 | QSet<uint> result; |
948 | |
949 | while (pos != index.constEnd() && pos.key().startsWith(s)) { |
950 | for (quint16 c : pos.value()) { |
951 | result.insert(value: mapDataBaseToCodePoint(code: c)); |
952 | } |
953 | ++pos; |
954 | } |
955 | |
956 | return result; |
957 | } |
958 | |
959 | QStringList KCharSelectData::splitString(const QString &s) |
960 | { |
961 | QStringList result; |
962 | int start = 0; |
963 | int end = 0; |
964 | int length = s.length(); |
965 | while (end < length) { |
966 | while (end < length && (s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) { |
967 | end++; |
968 | } |
969 | if (start != end) { |
970 | result.append(t: s.mid(position: start, n: end - start)); |
971 | } |
972 | start = end; |
973 | while (end < length && !(s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) { |
974 | end++; |
975 | start++; |
976 | } |
977 | } |
978 | return result; |
979 | } |
980 | |
981 | void KCharSelectData::appendToIndex(Index *index, quint16 unicode, const QString &s) |
982 | { |
983 | const QStringList strings = splitString(s); |
984 | for (const QString &s : strings) { |
985 | (*index)[s.toLower()].append(t: unicode); |
986 | } |
987 | } |
988 | |
989 | Index KCharSelectData::createIndex(const QByteArray &dataFile) |
990 | { |
991 | Index i; |
992 | |
993 | // character names |
994 | const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); |
995 | const char *data = dataFile.constData(); |
996 | const quint32 nameOffsetBegin = qFromLittleEndian<quint32>(src: udata + 4); |
997 | const quint32 nameOffsetEnd = qFromLittleEndian<quint32>(src: udata + 8); |
998 | |
999 | int max = ((nameOffsetEnd - nameOffsetBegin) / 6) - 1; |
1000 | |
1001 | for (int pos = 0; pos <= max; pos++) { |
1002 | const quint16 unicode = qFromLittleEndian<quint16>(src: udata + nameOffsetBegin + pos * 6); |
1003 | quint32 offset = qFromLittleEndian<quint32>(src: udata + nameOffsetBegin + pos * 6 + 2); |
1004 | appendToIndex(index: &i, unicode, s: QString::fromUtf8(utf8: data + offset + 1)); |
1005 | } |
1006 | |
1007 | // details |
1008 | const quint32 detailsOffsetBegin = qFromLittleEndian<quint32>(src: udata + 12); |
1009 | const quint32 detailsOffsetEnd = qFromLittleEndian<quint32>(src: udata + 16); |
1010 | |
1011 | max = ((detailsOffsetEnd - detailsOffsetBegin) / 27) - 1; |
1012 | |
1013 | for (int pos = 0; pos <= max; pos++) { |
1014 | const quint16 unicode = qFromLittleEndian<quint16>(src: udata + detailsOffsetBegin + pos * 27); |
1015 | |
1016 | // aliases |
1017 | const quint8 aliasCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 6); |
1018 | quint32 aliasOffset = qFromLittleEndian<quint32>(src: udata + detailsOffsetBegin + pos * 27 + 2); |
1019 | |
1020 | for (int j = 0; j < aliasCount; j++) { |
1021 | appendToIndex(index: &i, unicode, s: QString::fromUtf8(utf8: data + aliasOffset)); |
1022 | aliasOffset += qstrlen(str: data + aliasOffset) + 1; |
1023 | } |
1024 | |
1025 | // notes |
1026 | const quint8 notesCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 11); |
1027 | quint32 notesOffset = qFromLittleEndian<quint32>(src: udata + detailsOffsetBegin + pos * 27 + 7); |
1028 | |
1029 | for (int j = 0; j < notesCount; j++) { |
1030 | appendToIndex(index: &i, unicode, s: QString::fromUtf8(utf8: data + notesOffset)); |
1031 | notesOffset += qstrlen(str: data + notesOffset) + 1; |
1032 | } |
1033 | |
1034 | // approximate equivalents |
1035 | const quint8 apprCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 16); |
1036 | quint32 apprOffset = qFromLittleEndian<quint32>(src: udata + detailsOffsetBegin + pos * 27 + 12); |
1037 | |
1038 | for (int j = 0; j < apprCount; j++) { |
1039 | appendToIndex(index: &i, unicode, s: QString::fromUtf8(utf8: data + apprOffset)); |
1040 | apprOffset += qstrlen(str: data + apprOffset) + 1; |
1041 | } |
1042 | |
1043 | // equivalents |
1044 | const quint8 equivCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 21); |
1045 | quint32 equivOffset = qFromLittleEndian<quint32>(src: udata + detailsOffsetBegin + pos * 27 + 17); |
1046 | |
1047 | for (int j = 0; j < equivCount; j++) { |
1048 | appendToIndex(index: &i, unicode, s: QString::fromUtf8(utf8: data + equivOffset)); |
1049 | equivOffset += qstrlen(str: data + equivOffset) + 1; |
1050 | } |
1051 | |
1052 | // see also - convert to string (hex) |
1053 | const quint8 seeAlsoCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 26); |
1054 | quint32 seeAlsoOffset = qFromLittleEndian<quint32>(src: udata + detailsOffsetBegin + pos * 27 + 22); |
1055 | |
1056 | for (int j = 0; j < seeAlsoCount; j++) { |
1057 | quint16 seeAlso = qFromLittleEndian<quint16>(src: udata + seeAlsoOffset); |
1058 | appendToIndex(index: &i, unicode, s: formatCode(code: seeAlso, length: 4, prefix: QString())); |
1059 | equivOffset += qstrlen(str: data + equivOffset) + 1; |
1060 | } |
1061 | } |
1062 | |
1063 | // unihan data |
1064 | // temporary disabled due to the huge amount of data |
1065 | // const quint32 unihanOffsetBegin = qFromLittleEndian<quint32>(udata+36); |
1066 | // const quint32 unihanOffsetEnd = dataFile.size(); |
1067 | // max = ((unihanOffsetEnd - unihanOffsetBegin) / 30) - 1; |
1068 | // |
1069 | // for (int pos = 0; pos <= max; pos++) { |
1070 | // const quint16 unicode = qFromLittleEndian<quint16>(udata + unihanOffsetBegin + pos*30); |
1071 | // for(int j = 0; j < 7; j++) { |
1072 | // quint32 offset = qFromLittleEndian<quint32>(udata + unihanOffsetBegin + pos*30 + 2 + j*4); |
1073 | // if(offset != 0) { |
1074 | // appendToIndex(&i, unicode, QString::fromUtf8(data + offset)); |
1075 | // } |
1076 | // } |
1077 | // } |
1078 | |
1079 | return i; |
1080 | } |
1081 | |