1/*
2 This file is part of the KDE libraries
3 SPDX-FileCopyrightText: 2007 Daniel Laidig <d.laidig@gmx.de>
4
5 SPDX-License-Identifier: LGPL-2.0-or-later
6*/
7
8#include "kcharselectdata_p.h"
9
10#include <QCoreApplication>
11#include <QFile>
12#include <QFutureInterface>
13#include <QRegularExpression>
14#include <QRunnable>
15#include <QStringList>
16#include <QThreadPool>
17#include <qendian.h>
18
19#include <../test-config.h>
20#include <qstandardpaths.h>
21#include <string.h>
22
/* constants for hangul (de)composition, see UAX #15 */
// Typed, file-local constants; they replace preprocessor macros so the names
// obey normal scoping rules and show up with a type in diagnostics.
static constexpr int SBase = 0xAC00;
static constexpr int LBase = 0x1100;
static constexpr int VBase = 0x1161;
static constexpr int TBase = 0x11A7;
static constexpr int LCount = 19;
static constexpr int VCount = 21;
static constexpr int TCount = 28;
static constexpr int NCount = VCount * TCount;
static constexpr int SCount = LCount * NCount;
33
34class RunIndexCreation : public QFutureInterface<Index>, public QRunnable
35{
36public:
37 RunIndexCreation(KCharSelectData *data, const QByteArray &dataFile)
38 : m_data(data)
39 , m_dataFile(dataFile)
40 {
41 }
42
43 QFuture<Index> start()
44 {
45 setRunnable(this);
46 reportStarted();
47 QFuture<Index> f = this->future();
48 QThreadPool::globalInstance()->start(runnable: this);
49 return f;
50 }
51
52 void run() override
53 {
54 Index index = m_data->createIndex(dataFile: m_dataFile);
55 reportResult(result: index);
56 reportFinished(result: nullptr);
57 }
58
59private:
60 KCharSelectData *const m_data;
61 const QByteArray m_dataFile;
62};
63
// clang-format off
// Name fragments for Hangul syllable names; each entry holds at most three
// letters plus the terminating NUL.

// Fragments indexed by LIndex (19 entries).
static const char JAMO_L_TABLE[][4] = {
    "G", "GG", "N", "D", "DD",
    "R", "M", "B", "BB", "S",
    "SS", "", "J", "JJ", "C",
    "K", "T", "P", "H"
};

// Fragments indexed by VIndex (21 entries).
static const char JAMO_V_TABLE[][4] = {
    "A", "AE", "YA", "YAE", "EO", "E", "YEO",
    "YE", "O", "WA", "WAE", "OE", "YO", "U",
    "WEO", "WE", "WI", "YU", "EU", "YI", "I"
};

// Fragments indexed by TIndex (28 entries); index 0 is the empty fragment.
static const char JAMO_T_TABLE[][4] = {
    "", "G", "GG", "GS", "N", "NJ", "NH",
    "D", "L", "LG", "LM", "LB", "LS", "LT",
    "LP", "LH", "M", "B", "BS", "S", "SS",
    "NG", "J", "C", "K", "T", "P", "H"
};
// clang-format on
82
83bool KCharSelectData::openDataFile()
84{
85 if (!dataFile.isEmpty()) {
86 return true;
87 } else {
88 QFile file(QStringLiteral(":/kf6/kcharselect/kcharselect-data"));
89 file.open(flags: QIODevice::ReadOnly);
90 dataFile = file.readAll();
91 file.close();
92 if (dataFile.size() < 40) {
93 dataFile.clear();
94 return false;
95 }
96 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
97 const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 20);
98 const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 24);
99 uint blocks = (offsetEnd - offsetBegin) / 4;
100 if (blocks <= 167) { // maximum possible number of blocks in BMP
101 // no remapping
102 remapType = -1;
103 } else if (blocks >= 174 && blocks <= 180) {
104 // remapping introduced in 5.25
105 remapType = 0;
106 } else {
107 // unknown remapping, abort
108 dataFile.clear();
109 return false;
110 }
111 futureIndex = (new RunIndexCreation(this, dataFile))->start();
112 return true;
113 }
114}
115
116// Temporary remapping code points <-> 16 bit database codes
117// See kcharselect-generate-datafile.py for details
118
119quint16 KCharSelectData::mapCodePointToDataBase(uint code) const
120{
121 if (remapType == 0) {
122 if (code >= 0xE000 && code <= 0xEFFF) {
123 return 0xFFFF;
124 }
125 if (code >= 0xF000 && code <= 0xFFFF) {
126 return code - 0x1000;
127 }
128 if (code >= 0x1F000 && code <= 0x1FFFF) {
129 return code - 0x10000;
130 }
131 }
132 if (code >= 0x10000) {
133 return 0xFFFF;
134 }
135 return code;
136}
137
138uint KCharSelectData::mapDataBaseToCodePoint(quint16 code) const
139{
140 if (remapType == 0) {
141 if (code >= 0xE000 && code <= 0xEFFF) {
142 return code + 0x1000;
143 }
144 if (code >= 0xF000) {
145 return code + 0x10000;
146 }
147 }
148 return code;
149}
150
151quint32 KCharSelectData::getDetailIndex(uint c) const
152{
153 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
154 // Convert from little-endian, so that this code works on PPC too.
155 // http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=482286
156 const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 12);
157 const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 16);
158
159 int min = 0;
160 int mid;
161 int max = ((offsetEnd - offsetBegin) / 27) - 1;
162
163 quint16 unicode = mapCodePointToDataBase(code: c);
164 if (unicode == 0xFFFF) {
165 return 0;
166 }
167
168 static quint16 most_recent_searched;
169 static quint32 most_recent_result;
170
171 if (unicode == most_recent_searched) {
172 return most_recent_result;
173 }
174
175 most_recent_searched = unicode;
176
177 while (max >= min) {
178 mid = (min + max) / 2;
179 const quint16 midUnicode = qFromLittleEndian<quint16>(src: data + offsetBegin + mid * 27);
180 if (unicode > midUnicode) {
181 min = mid + 1;
182 } else if (unicode < midUnicode) {
183 max = mid - 1;
184 } else {
185 most_recent_result = offsetBegin + mid * 27;
186
187 return most_recent_result;
188 }
189 }
190
191 most_recent_result = 0;
192 return 0;
193}
194
195QString KCharSelectData::formatCode(uint code, int length, const QString &prefix, int base)
196{
197 QString s = QString::number(code, base).toUpper();
198 while (s.size() < length) {
199 s.prepend(c: QLatin1Char('0'));
200 }
201 s.prepend(s: prefix);
202 return s;
203}
204
205QList<uint> KCharSelectData::blockContents(int block)
206{
207 if (!openDataFile()) {
208 return QList<uint>();
209 }
210
211 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
212 const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 20);
213 const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 24);
214
215 int max = ((offsetEnd - offsetBegin) / 4) - 1;
216
217 QList<uint> res;
218
219 if (block > max) {
220 return res;
221 }
222
223 quint16 unicodeBegin = qFromLittleEndian<quint16>(src: data + offsetBegin + block * 4);
224 quint16 unicodeEnd = qFromLittleEndian<quint16>(src: data + offsetBegin + block * 4 + 2);
225
226 while (unicodeBegin < unicodeEnd) {
227 res.append(t: mapDataBaseToCodePoint(code: unicodeBegin));
228 unicodeBegin++;
229 }
230 res.append(t: mapDataBaseToCodePoint(code: unicodeBegin)); // Be careful when unicodeEnd==0xffff
231
232 return res;
233}
234
235QList<int> KCharSelectData::sectionContents(int section)
236{
237 section -= 1;
238 if (!openDataFile()) {
239 return QList<int>();
240 }
241
242 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
243 const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 28);
244 const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 32);
245
246 int max = ((offsetEnd - offsetBegin) / 4) - 1;
247
248 QList<int> res;
249
250 if (section > max) {
251 return res;
252 }
253
254 for (int i = 0; i <= max; i++) {
255 const quint16 currSection = qFromLittleEndian<quint16>(src: data + offsetBegin + i * 4);
256 if (currSection == section || section < 0) {
257 res.append(t: qFromLittleEndian<quint16>(src: data + offsetBegin + i * 4 + 2));
258 }
259 }
260
261 return res;
262}
263
264QStringList KCharSelectData::sectionList()
265{
266 if (!openDataFile()) {
267 return QStringList();
268 }
269
270 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
271 const quint32 stringBegin = qFromLittleEndian<quint32>(src: udata + 24);
272 const quint32 stringEnd = qFromLittleEndian<quint32>(src: udata + 28);
273
274 const char *data = dataFile.constData();
275 QStringList list;
276 quint32 i = stringBegin;
277 list.append(t: QCoreApplication::translate(context: "KCharSelectData", key: "All", disambiguation: "KCharSelect section name"));
278 while (i < stringEnd) {
279 list.append(t: QCoreApplication::translate(context: "KCharSelectData", key: data + i, disambiguation: "KCharSelect section name"));
280 i += qstrlen(str: data + i) + 1;
281 }
282
283 return list;
284}
285
286QString KCharSelectData::block(uint c)
287{
288 return blockName(index: blockIndex(c));
289}
290
291QString KCharSelectData::section(uint c)
292{
293 return sectionName(index: sectionIndex(block: blockIndex(c)));
294}
295
296QString KCharSelectData::name(uint c)
297{
298 if (!openDataFile()) {
299 return QString();
300 }
301
302 if ((c & 0xFFFE) == 0xFFFE || (c >= 0xFDD0 && c <= 0xFDEF)) {
303 return QCoreApplication::translate(context: "KCharSelectData", key: "<noncharacter>");
304 } else if ((c >= 0x3400 && c <= 0x4DBF) || (c >= 0x4E00 && c <= 0x9FFF) || (c >= 0x20000 && c <= 0x2F7FF)) {
305 return QLatin1String("CJK UNIFIED IDEOGRAPH-") + formatCode(code: c, length: 4, prefix: QString());
306 } else if (c >= 0xAC00 && c <= 0xD7AF) {
307 /* compute hangul syllable name as per UAX #15 */
308 int SIndex = c - SBase;
309 int LIndex;
310 int VIndex;
311 int TIndex;
312
313 if (SIndex < 0 || SIndex >= SCount) {
314 return QString();
315 }
316
317 LIndex = SIndex / NCount;
318 VIndex = (SIndex % NCount) / TCount;
319 TIndex = SIndex % TCount;
320
321 return QLatin1String("HANGUL SYLLABLE ") + QLatin1String(JAMO_L_TABLE[LIndex]) + QLatin1String(JAMO_V_TABLE[VIndex])
322 + QLatin1String(JAMO_T_TABLE[TIndex]);
323 } else if (c >= 0xD800 && c <= 0xDB7F) {
324 return QCoreApplication::translate(context: "KCharSelectData", key: "<Non Private Use High Surrogate>");
325 } else if (c >= 0xDB80 && c <= 0xDBFF) {
326 return QCoreApplication::translate(context: "KCharSelectData", key: "<Private Use High Surrogate>");
327 } else if (c >= 0xDC00 && c <= 0xDFFF) {
328 return QCoreApplication::translate(context: "KCharSelectData", key: "<Low Surrogate>");
329 } else if ((c >= 0xE000 && c <= 0xF8FF) || c >= 0xF0000) {
330 return QCoreApplication::translate(context: "KCharSelectData", key: "<Private Use>");
331 } else if ((c >= 0xF900 && c <= 0xFAFF) || (c >= 0x2F800 && c <= 0x2FFFF)) {
332 return QLatin1String("CJK COMPATIBILITY IDEOGRAPH-") + formatCode(code: c, length: 4, prefix: QString());
333 }
334 quint16 unicode = mapCodePointToDataBase(code: c);
335 if (unicode == 0xFFFF) {
336 return QLatin1String("NON-BMP-CHARACTER-") + formatCode(code: c, length: 4, prefix: QString());
337 } else {
338 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
339 const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 4);
340 const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 8);
341
342 int min = 0;
343 int mid;
344 int max = ((offsetEnd - offsetBegin) / 6) - 1;
345 QString s;
346
347 while (max >= min) {
348 mid = (min + max) / 2;
349 const quint16 midUnicode = qFromLittleEndian<quint16>(src: data + offsetBegin + mid * 6);
350 if (unicode > midUnicode) {
351 min = mid + 1;
352 } else if (unicode < midUnicode) {
353 max = mid - 1;
354 } else {
355 quint32 offset = qFromLittleEndian<quint32>(src: data + offsetBegin + mid * 6 + 2);
356 s = QString::fromUtf8(utf8: dataFile.constData() + offset + 1);
357 break;
358 }
359 }
360
361 if (s.isNull()) {
362 return QCoreApplication::translate(context: "KCharSelectData", key: "<not assigned>");
363 } else {
364 return s;
365 }
366 }
367}
368
369int KCharSelectData::blockIndex(uint c)
370{
371 if (!openDataFile()) {
372 return 0;
373 }
374
375 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
376 const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 20);
377 const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 24);
378 const quint16 unicode = mapCodePointToDataBase(code: c);
379 if (unicode == 0xFFFF) {
380 return 0;
381 }
382
383 int max = ((offsetEnd - offsetBegin) / 4) - 1;
384
385 int i = 0;
386
387 while (unicode > qFromLittleEndian<quint16>(src: data + offsetBegin + i * 4 + 2) && i < max) {
388 i++;
389 }
390
391 return i;
392}
393
394int KCharSelectData::sectionIndex(int block)
395{
396 if (!openDataFile()) {
397 return 0;
398 }
399
400 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
401 const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 28);
402 const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 32);
403
404 int max = ((offsetEnd - offsetBegin) / 4) - 1;
405
406 for (int i = 0; i <= max; i++) {
407 if (qFromLittleEndian<quint16>(src: data + offsetBegin + i * 4 + 2) == block) {
408 return qFromLittleEndian<quint16>(src: data + offsetBegin + i * 4) + 1;
409 }
410 }
411
412 return 0;
413}
414
415QString KCharSelectData::blockName(int index)
416{
417 if (!openDataFile()) {
418 return QString();
419 }
420
421 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
422 const quint32 stringBegin = qFromLittleEndian<quint32>(src: udata + 16);
423 const quint32 stringEnd = qFromLittleEndian<quint32>(src: udata + 20);
424
425 quint32 i = stringBegin;
426 int currIndex = 0;
427
428 const char *data = dataFile.constData();
429 while (i < stringEnd && currIndex < index) {
430 i += qstrlen(str: data + i) + 1;
431 currIndex++;
432 }
433
434 return QCoreApplication::translate(context: "KCharSelectData", key: data + i, disambiguation: "KCharselect unicode block name");
435}
436
437QString KCharSelectData::sectionName(int index)
438{
439 if (index == 0) {
440 return QCoreApplication::translate(context: "KCharSelectData", key: "All", disambiguation: "KCharselect unicode section name");
441 }
442 if (!openDataFile()) {
443 return QString();
444 }
445
446 index -= 1;
447
448 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
449 const quint32 stringBegin = qFromLittleEndian<quint32>(src: udata + 24);
450 const quint32 stringEnd = qFromLittleEndian<quint32>(src: udata + 28);
451
452 quint32 i = stringBegin;
453 int currIndex = 0;
454
455 const char *data = dataFile.constData();
456 while (i < stringEnd && currIndex < index) {
457 i += qstrlen(str: data + i) + 1;
458 currIndex++;
459 }
460
461 return QCoreApplication::translate(context: "KCharSelectData", key: data + i, disambiguation: "KCharselect unicode section name");
462}
463
464QStringList KCharSelectData::aliases(uint c)
465{
466 if (!openDataFile()) {
467 return QStringList();
468 }
469 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
470 const int detailIndex = getDetailIndex(c);
471 if (detailIndex == 0) {
472 return QStringList();
473 }
474
475 const quint8 count = *(quint8 *)(udata + detailIndex + 6);
476 quint32 offset = qFromLittleEndian<quint32>(src: udata + detailIndex + 2);
477
478 QStringList aliases;
479 aliases.reserve(asize: count);
480
481 const char *data = dataFile.constData();
482 for (int i = 0; i < count; i++) {
483 aliases.append(t: QString::fromUtf8(utf8: data + offset));
484 offset += qstrlen(str: data + offset) + 1;
485 }
486 return aliases;
487}
488
489QStringList KCharSelectData::notes(uint c)
490{
491 if (!openDataFile()) {
492 return QStringList();
493 }
494 const int detailIndex = getDetailIndex(c);
495 if (detailIndex == 0) {
496 return QStringList();
497 }
498
499 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
500 const quint8 count = *(quint8 *)(udata + detailIndex + 11);
501 quint32 offset = qFromLittleEndian<quint32>(src: udata + detailIndex + 7);
502
503 QStringList notes;
504 notes.reserve(asize: count);
505
506 const char *data = dataFile.constData();
507 for (int i = 0; i < count; i++) {
508 notes.append(t: QString::fromUtf8(utf8: data + offset));
509 offset += qstrlen(str: data + offset) + 1;
510 }
511
512 return notes;
513}
514
515QList<uint> KCharSelectData::seeAlso(uint c)
516{
517 if (!openDataFile()) {
518 return QList<uint>();
519 }
520 const int detailIndex = getDetailIndex(c);
521 if (detailIndex == 0) {
522 return QList<uint>();
523 }
524
525 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
526 const quint8 count = *(quint8 *)(udata + detailIndex + 26);
527 quint32 offset = qFromLittleEndian<quint32>(src: udata + detailIndex + 22);
528
529 QList<uint> seeAlso;
530 seeAlso.reserve(asize: count);
531
532 for (int i = 0; i < count; i++) {
533 seeAlso.append(t: mapDataBaseToCodePoint(code: qFromLittleEndian<quint16>(src: udata + offset)));
534 offset += 2;
535 }
536
537 return seeAlso;
538}
539
540QStringList KCharSelectData::equivalents(uint c)
541{
542 if (!openDataFile()) {
543 return QStringList();
544 }
545 const int detailIndex = getDetailIndex(c);
546 if (detailIndex == 0) {
547 return QStringList();
548 }
549
550 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
551 const quint8 count = *(quint8 *)(udata + detailIndex + 21);
552 quint32 offset = qFromLittleEndian<quint32>(src: udata + detailIndex + 17);
553
554 QStringList equivalents;
555 equivalents.reserve(asize: count);
556
557 const char *data = dataFile.constData();
558 for (int i = 0; i < count; i++) {
559 equivalents.append(t: QString::fromUtf8(utf8: data + offset));
560 offset += qstrlen(str: data + offset) + 1;
561 }
562
563 return equivalents;
564}
565
566QStringList KCharSelectData::approximateEquivalents(uint c)
567{
568 if (!openDataFile()) {
569 return QStringList();
570 }
571 const int detailIndex = getDetailIndex(c);
572 if (detailIndex == 0) {
573 return QStringList();
574 }
575
576 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
577 const quint8 count = *(quint8 *)(udata + detailIndex + 16);
578 quint32 offset = qFromLittleEndian<quint32>(src: udata + detailIndex + 12);
579
580 QStringList approxEquivalents;
581 approxEquivalents.reserve(asize: count);
582
583 const char *data = dataFile.constData();
584 for (int i = 0; i < count; i++) {
585 approxEquivalents.append(t: QString::fromUtf8(utf8: data + offset));
586 offset += qstrlen(str: data + offset) + 1;
587 }
588
589 return approxEquivalents;
590}
591
592QList<uint> KCharSelectData::decomposition(uint c)
593{
594 // for now, only decompose Hangul Syllable into Hangul Jamo
595 uint SIndex = c - SBase;
596 if (SIndex >= SCount) {
597 return QList<uint>();
598 }
599
600 uint L = LBase + SIndex / NCount; // Choseong
601 uint V = VBase + (SIndex % NCount) / TCount; // Jungseong
602 uint T = TBase + SIndex % TCount; // Jongsung
603 QList<uint> jamoList;
604 jamoList.append(t: L);
605 jamoList.append(t: V);
606 if (T != TBase) {
607 jamoList.append(t: T);
608 }
609 return jamoList;
610}
611
612QStringList KCharSelectData::unihanInfo(uint c)
613{
614 if (!openDataFile()) {
615 return QStringList();
616 }
617
618 quint16 unicode = mapCodePointToDataBase(code: c);
619 if (unicode == 0xFFFF) {
620 return QStringList();
621 }
622
623 const char *data = dataFile.constData();
624 const uchar *udata = reinterpret_cast<const uchar *>(data);
625 const quint32 offsetBegin = qFromLittleEndian<quint32>(src: udata + 36);
626 const quint32 offsetEnd = dataFile.size();
627
628 int min = 0;
629 int mid;
630 int max = ((offsetEnd - offsetBegin) / 30) - 1;
631
632 while (max >= min) {
633 mid = (min + max) / 2;
634 const quint16 midUnicode = qFromLittleEndian<quint16>(src: udata + offsetBegin + mid * 30);
635 if (unicode > midUnicode) {
636 min = mid + 1;
637 } else if (unicode < midUnicode) {
638 max = mid - 1;
639 } else {
640 QStringList res;
641 res.reserve(asize: 7);
642 for (int i = 0; i < 7; i++) {
643 quint32 offset = qFromLittleEndian<quint32>(src: udata + offsetBegin + mid * 30 + 2 + i * 4);
644 if (offset != 0) {
645 res.append(t: QString::fromUtf8(utf8: data + offset));
646 } else {
647 res.append(t: QString());
648 }
649 }
650 return res;
651 }
652 }
653
654 return QStringList();
655}
656
657QChar::Category KCharSelectData::category(uint c)
658{
659 if (!openDataFile()) {
660 return QChar::category(ucs4: c);
661 }
662
663 ushort unicode = mapCodePointToDataBase(code: c);
664 if (unicode == 0xFFFF) {
665 return QChar::category(ucs4: c);
666 }
667
668 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
669 const quint32 offsetBegin = qFromLittleEndian<quint32>(src: data + 4);
670 const quint32 offsetEnd = qFromLittleEndian<quint32>(src: data + 8);
671
672 int min = 0;
673 int mid;
674 int max = ((offsetEnd - offsetBegin) / 6) - 1;
675
676 while (max >= min) {
677 mid = (min + max) / 2;
678 const quint16 midUnicode = qFromLittleEndian<quint16>(src: data + offsetBegin + mid * 6);
679 if (unicode > midUnicode) {
680 min = mid + 1;
681 } else if (unicode < midUnicode) {
682 max = mid - 1;
683 } else {
684 quint32 offset = qFromLittleEndian<quint32>(src: data + offsetBegin + mid * 6 + 2);
685 uchar categoryCode = *(data + offset);
686 Q_ASSERT(categoryCode > 0);
687 categoryCode--; /* Qt5 changed QChar::Category enum to start from 0 instead of 1
688 See QtBase commit d17c76feee9eece4 */
689 return QChar::Category(categoryCode);
690 }
691 }
692
693 return QChar::category(ucs4: c);
694}
695
696bool KCharSelectData::isPrint(uint c)
697{
698 QChar::Category cat = category(c);
699 return !(cat == QChar::Other_Control || cat == QChar::Other_NotAssigned);
700}
701
702bool KCharSelectData::isDisplayable(uint c)
703{
704 // Qt internally uses U+FDD0 and U+FDD1 to mark the beginning and the end of frames.
705 // They should be seen as non-printable characters, as trying to display them leads
706 // to a crash caused by a Qt "noBlockInString" assertion.
707 if (c == 0xFDD0 || c == 0xFDD1) {
708 return false;
709 }
710
711 return !isIgnorable(c) && isPrint(c);
712}
713
714bool KCharSelectData::isIgnorable(uint c)
715{
716 /*
717 * According to the Unicode standard, Default Ignorable Code Points
718 * should be ignored unless explicitly supported. For example, U+202E
719 * RIGHT-TO-LEFT-OVERRIDE ir printable according to Qt, but displaying
720 * it gives the undesired effect of all text being turned RTL. We do not
721 * have a way to "explicitly" support it, so we will treat it as
722 * non-printable.
723 *
724 * There is a list of these on
725 * http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt under the
726 * property Default_Ignorable_Code_Point.
727 */
728
729 // NOTE: not very nice to hardcode these here; is it worth it to modify
730 // the binary data file to hold them?
731 // clang-format off
732 return c == 0x00AD || c == 0x034F || c == 0x115F || c == 0x1160 ||
733 c == 0x17B4 || c == 0x17B5 || (c >= 0x180B && c <= 0x180D) ||
734 (c >= 0x200B && c <= 0x200F) || (c >= 0x202A && c <= 0x202E) ||
735 (c >= 0x2060 && c <= 0x206F) || c == 0x3164 ||
736 (c >= 0xFE00 && c <= 0xFE0F) || c == 0xFEFF || c == 0xFFA0 ||
737 (c >= 0xFFF0 && c <= 0xFFF8);
738 // clang-format on
739}
740
741bool KCharSelectData::isCombining(uint c)
742{
743 return section(c) == QCoreApplication::translate(context: "KCharSelectData", key: "Combining Diacritics", disambiguation: "KCharSelect section name");
744 // FIXME: this is an imperfect test. There are many combining characters
745 // that are outside of this section. See Grapheme_Extend in
746 // http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
747}
748
749QString KCharSelectData::display(uint c, const QFont &font)
750{
751 if (!isDisplayable(c)) {
752 return QLatin1String("<b>") + QCoreApplication::translate(context: "KCharSelectData", key: "Non-printable") + QLatin1String("</b>");
753 } else {
754 QString s = QLatin1String("<font size=\"+4\" face=\"") + font.family() + QLatin1String("\">");
755 if (isCombining(c)) {
756 s += displayCombining(c);
757 } else {
758 s += QLatin1String("&#") + QString::number(c) + QLatin1Char(';');
759 }
760 s += QLatin1String("</font>");
761 return s;
762 }
763}
764
765QString KCharSelectData::displayCombining(uint c)
766{
767 /*
768 * The purpose of this is to make it easier to see how a combining
769 * character affects the text around it.
770 * The initial plan was to use U+25CC DOTTED CIRCLE for this purpose,
771 * as seen in pdfs from Unicode, but there seem to be a lot of alignment
772 * problems with that.
773 *
774 * Eventually, it would be nice to determine whether the character
775 * combines to the left or to the right, etc.
776 */
777 QString s = QLatin1String("&nbsp;&#") + QString::number(c) + QLatin1String(";&nbsp;") + QLatin1String(" (ab&#") + QString::number(c) + QLatin1String(";c)");
778 return s;
779}
780
781QString KCharSelectData::categoryText(QChar::Category category)
782{
783 switch (category) {
784 case QChar::Other_Control:
785 return QCoreApplication::translate(context: "KCharSelectData", key: "Other, Control");
786 case QChar::Other_Format:
787 return QCoreApplication::translate(context: "KCharSelectData", key: "Other, Format");
788 case QChar::Other_NotAssigned:
789 return QCoreApplication::translate(context: "KCharSelectData", key: "Other, Not Assigned");
790 case QChar::Other_PrivateUse:
791 return QCoreApplication::translate(context: "KCharSelectData", key: "Other, Private Use");
792 case QChar::Other_Surrogate:
793 return QCoreApplication::translate(context: "KCharSelectData", key: "Other, Surrogate");
794 case QChar::Letter_Lowercase:
795 return QCoreApplication::translate(context: "KCharSelectData", key: "Letter, Lowercase");
796 case QChar::Letter_Modifier:
797 return QCoreApplication::translate(context: "KCharSelectData", key: "Letter, Modifier");
798 case QChar::Letter_Other:
799 return QCoreApplication::translate(context: "KCharSelectData", key: "Letter, Other");
800 case QChar::Letter_Titlecase:
801 return QCoreApplication::translate(context: "KCharSelectData", key: "Letter, Titlecase");
802 case QChar::Letter_Uppercase:
803 return QCoreApplication::translate(context: "KCharSelectData", key: "Letter, Uppercase");
804 case QChar::Mark_SpacingCombining:
805 return QCoreApplication::translate(context: "KCharSelectData", key: "Mark, Spacing Combining");
806 case QChar::Mark_Enclosing:
807 return QCoreApplication::translate(context: "KCharSelectData", key: "Mark, Enclosing");
808 case QChar::Mark_NonSpacing:
809 return QCoreApplication::translate(context: "KCharSelectData", key: "Mark, Non-Spacing");
810 case QChar::Number_DecimalDigit:
811 return QCoreApplication::translate(context: "KCharSelectData", key: "Number, Decimal Digit");
812 case QChar::Number_Letter:
813 return QCoreApplication::translate(context: "KCharSelectData", key: "Number, Letter");
814 case QChar::Number_Other:
815 return QCoreApplication::translate(context: "KCharSelectData", key: "Number, Other");
816 case QChar::Punctuation_Connector:
817 return QCoreApplication::translate(context: "KCharSelectData", key: "Punctuation, Connector");
818 case QChar::Punctuation_Dash:
819 return QCoreApplication::translate(context: "KCharSelectData", key: "Punctuation, Dash");
820 case QChar::Punctuation_Close:
821 return QCoreApplication::translate(context: "KCharSelectData", key: "Punctuation, Close");
822 case QChar::Punctuation_FinalQuote:
823 return QCoreApplication::translate(context: "KCharSelectData", key: "Punctuation, Final Quote");
824 case QChar::Punctuation_InitialQuote:
825 return QCoreApplication::translate(context: "KCharSelectData", key: "Punctuation, Initial Quote");
826 case QChar::Punctuation_Other:
827 return QCoreApplication::translate(context: "KCharSelectData", key: "Punctuation, Other");
828 case QChar::Punctuation_Open:
829 return QCoreApplication::translate(context: "KCharSelectData", key: "Punctuation, Open");
830 case QChar::Symbol_Currency:
831 return QCoreApplication::translate(context: "KCharSelectData", key: "Symbol, Currency");
832 case QChar::Symbol_Modifier:
833 return QCoreApplication::translate(context: "KCharSelectData", key: "Symbol, Modifier");
834 case QChar::Symbol_Math:
835 return QCoreApplication::translate(context: "KCharSelectData", key: "Symbol, Math");
836 case QChar::Symbol_Other:
837 return QCoreApplication::translate(context: "KCharSelectData", key: "Symbol, Other");
838 case QChar::Separator_Line:
839 return QCoreApplication::translate(context: "KCharSelectData", key: "Separator, Line");
840 case QChar::Separator_Paragraph:
841 return QCoreApplication::translate(context: "KCharSelectData", key: "Separator, Paragraph");
842 case QChar::Separator_Space:
843 return QCoreApplication::translate(context: "KCharSelectData", key: "Separator, Space");
844 default:
845 return QCoreApplication::translate(context: "KCharSelectData", key: "Unknown");
846 }
847}
848
849QList<uint> KCharSelectData::find(const QString &needle)
850{
851 QSet<uint> result;
852
853 QList<uint> returnRes;
854 QString simplified = needle.length() > 1 ? needle.simplified() : needle;
855 QStringList searchStrings;
856
857 static const QRegularExpression octalExp(QStringLiteral("^\\\\[0-7][0-7\\\\]*$"));
858 if (octalExp.match(subject: simplified).hasMatch()) {
859 // search for C octal escaped UTF-8
860 QByteArray utf8;
861 int byte = -1;
862 for (int i = 0; i <= simplified.length(); ++i) {
863 int c = simplified.at(i).unicode();
864 if (c >= '0' && c <= '7') {
865 byte = 8 * byte + c - '0';
866 } else if (byte == -1) {
867 byte = 0;
868 } else if (byte >= 0x00 && byte <= 0xFF) {
869 utf8.append(c: (char)byte);
870 byte = 0;
871 }
872 }
873 simplified = QString::fromUtf8(ba: utf8);
874 }
875
876 if (simplified.length() <= 2) {
877 QList<uint> ucs4 = simplified.toUcs4();
878 if (ucs4.size() == 1) {
879 // search for hex representation of the character
880 searchStrings = QStringList(formatCode(code: ucs4.at(i: 0)));
881 } else {
882 searchStrings = splitString(s: simplified);
883 }
884 } else {
885 searchStrings = splitString(s: simplified);
886 }
887
888 if (searchStrings.isEmpty()) {
889 return returnRes;
890 }
891
892 static const QRegularExpression hexExp(QStringLiteral("^(?:|u\\+|U\\+|0x|0X)([A-Fa-f0-9]{4,5})$"));
893 for (const QString &s : std::as_const(t&: searchStrings)) {
894 const QRegularExpressionMatch match = hexExp.match(subject: s);
895 if (match.hasMatch()) {
896 const QString cap = match.captured(nth: 1);
897 returnRes.append(t: cap.toInt(ok: nullptr, base: 16));
898 // search for "1234" instead of "0x1234"
899 if (s.length() == 6 || s.length() == 7) {
900 searchStrings[searchStrings.indexOf(str: s)] = cap;
901 }
902 }
903 // try to parse string as decimal number
904 bool ok;
905 int unicode = s.toInt(ok: &ok);
906 if (ok && unicode >= 0 && unicode <= QChar::LastValidCodePoint) {
907 returnRes.append(t: unicode);
908 }
909 }
910
911 bool firstSubString = true;
912 for (const QString &s : std::as_const(t&: searchStrings)) {
913 QSet<uint> partResult = getMatchingChars(s: s.toLower());
914 if (firstSubString) {
915 result = partResult;
916 firstSubString = false;
917 } else {
918 result = result.intersect(other: partResult);
919 }
920 }
921
922 // remove results found by matching the code point to prevent duplicate results
923 // while letting these characters stay at the beginning
924 for (uint c : std::as_const(t&: returnRes)) {
925 result.remove(value: c);
926 }
927
928 QList<uint> sortedResult;
929 sortedResult.reserve(asize: result.count());
930 for (auto c : std::as_const(t&: result)) {
931 sortedResult.append(t: c);
932 }
933 std::sort(first: sortedResult.begin(), last: sortedResult.end());
934
935 returnRes += sortedResult;
936 return returnRes;
937}
938
939QSet<uint> KCharSelectData::getMatchingChars(const QString &s)
940{
941 if (dataFile.isEmpty()) {
942 return QSet<uint>();
943 }
944 futureIndex.waitForFinished();
945 const Index index = futureIndex.result();
946 Index::const_iterator pos = index.lowerBound(key: s);
947 QSet<uint> result;
948
949 while (pos != index.constEnd() && pos.key().startsWith(s)) {
950 for (quint16 c : pos.value()) {
951 result.insert(value: mapDataBaseToCodePoint(code: c));
952 }
953 ++pos;
954 }
955
956 return result;
957}
958
959QStringList KCharSelectData::splitString(const QString &s)
960{
961 QStringList result;
962 int start = 0;
963 int end = 0;
964 int length = s.length();
965 while (end < length) {
966 while (end < length && (s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) {
967 end++;
968 }
969 if (start != end) {
970 result.append(t: s.mid(position: start, n: end - start));
971 }
972 start = end;
973 while (end < length && !(s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) {
974 end++;
975 start++;
976 }
977 }
978 return result;
979}
980
981void KCharSelectData::appendToIndex(Index *index, quint16 unicode, const QString &s)
982{
983 const QStringList strings = splitString(s);
984 for (const QString &s : strings) {
985 (*index)[s.toLower()].append(t: unicode);
986 }
987}
988
989Index KCharSelectData::createIndex(const QByteArray &dataFile)
990{
991 Index i;
992
993 // character names
994 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
995 const char *data = dataFile.constData();
996 const quint32 nameOffsetBegin = qFromLittleEndian<quint32>(src: udata + 4);
997 const quint32 nameOffsetEnd = qFromLittleEndian<quint32>(src: udata + 8);
998
999 int max = ((nameOffsetEnd - nameOffsetBegin) / 6) - 1;
1000
1001 for (int pos = 0; pos <= max; pos++) {
1002 const quint16 unicode = qFromLittleEndian<quint16>(src: udata + nameOffsetBegin + pos * 6);
1003 quint32 offset = qFromLittleEndian<quint32>(src: udata + nameOffsetBegin + pos * 6 + 2);
1004 appendToIndex(index: &i, unicode, s: QString::fromUtf8(utf8: data + offset + 1));
1005 }
1006
1007 // details
1008 const quint32 detailsOffsetBegin = qFromLittleEndian<quint32>(src: udata + 12);
1009 const quint32 detailsOffsetEnd = qFromLittleEndian<quint32>(src: udata + 16);
1010
1011 max = ((detailsOffsetEnd - detailsOffsetBegin) / 27) - 1;
1012
1013 for (int pos = 0; pos <= max; pos++) {
1014 const quint16 unicode = qFromLittleEndian<quint16>(src: udata + detailsOffsetBegin + pos * 27);
1015
1016 // aliases
1017 const quint8 aliasCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 6);
1018 quint32 aliasOffset = qFromLittleEndian<quint32>(src: udata + detailsOffsetBegin + pos * 27 + 2);
1019
1020 for (int j = 0; j < aliasCount; j++) {
1021 appendToIndex(index: &i, unicode, s: QString::fromUtf8(utf8: data + aliasOffset));
1022 aliasOffset += qstrlen(str: data + aliasOffset) + 1;
1023 }
1024
1025 // notes
1026 const quint8 notesCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 11);
1027 quint32 notesOffset = qFromLittleEndian<quint32>(src: udata + detailsOffsetBegin + pos * 27 + 7);
1028
1029 for (int j = 0; j < notesCount; j++) {
1030 appendToIndex(index: &i, unicode, s: QString::fromUtf8(utf8: data + notesOffset));
1031 notesOffset += qstrlen(str: data + notesOffset) + 1;
1032 }
1033
1034 // approximate equivalents
1035 const quint8 apprCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 16);
1036 quint32 apprOffset = qFromLittleEndian<quint32>(src: udata + detailsOffsetBegin + pos * 27 + 12);
1037
1038 for (int j = 0; j < apprCount; j++) {
1039 appendToIndex(index: &i, unicode, s: QString::fromUtf8(utf8: data + apprOffset));
1040 apprOffset += qstrlen(str: data + apprOffset) + 1;
1041 }
1042
1043 // equivalents
1044 const quint8 equivCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 21);
1045 quint32 equivOffset = qFromLittleEndian<quint32>(src: udata + detailsOffsetBegin + pos * 27 + 17);
1046
1047 for (int j = 0; j < equivCount; j++) {
1048 appendToIndex(index: &i, unicode, s: QString::fromUtf8(utf8: data + equivOffset));
1049 equivOffset += qstrlen(str: data + equivOffset) + 1;
1050 }
1051
1052 // see also - convert to string (hex)
1053 const quint8 seeAlsoCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 26);
1054 quint32 seeAlsoOffset = qFromLittleEndian<quint32>(src: udata + detailsOffsetBegin + pos * 27 + 22);
1055
1056 for (int j = 0; j < seeAlsoCount; j++) {
1057 quint16 seeAlso = qFromLittleEndian<quint16>(src: udata + seeAlsoOffset);
1058 appendToIndex(index: &i, unicode, s: formatCode(code: seeAlso, length: 4, prefix: QString()));
1059 equivOffset += qstrlen(str: data + equivOffset) + 1;
1060 }
1061 }
1062
1063 // unihan data
1064 // temporary disabled due to the huge amount of data
1065 // const quint32 unihanOffsetBegin = qFromLittleEndian<quint32>(udata+36);
1066 // const quint32 unihanOffsetEnd = dataFile.size();
1067 // max = ((unihanOffsetEnd - unihanOffsetBegin) / 30) - 1;
1068 //
1069 // for (int pos = 0; pos <= max; pos++) {
1070 // const quint16 unicode = qFromLittleEndian<quint16>(udata + unihanOffsetBegin + pos*30);
1071 // for(int j = 0; j < 7; j++) {
1072 // quint32 offset = qFromLittleEndian<quint32>(udata + unihanOffsetBegin + pos*30 + 2 + j*4);
1073 // if(offset != 0) {
1074 // appendToIndex(&i, unicode, QString::fromUtf8(data + offset));
1075 // }
1076 // }
1077 // }
1078
1079 return i;
1080}
1081

source code of kwidgetsaddons/src/kcharselectdata.cpp