/* This file is part of the KDE libraries
    SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net>
    SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl>
    SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org>

    SPDX-License-Identifier: LGPL-2.0-or-later
*/
8
#include <QCoreApplication>
#include <QDataStream>
#include <QFile>
#include <QLocale>
#include <QStandardPaths>

#include "core_debug.h"
#include "guesslanguage.h"
#include "loader_p.h"
#include "speller.h"
#include "spellerplugin_p.h"
#include "tokenizer_p.h"

#include <algorithm>
#include <utility>
21
/*
All language tags should be valid according to IETF BCP 47, as codified in RFC 4646.
ISO 639-1 codes should be used for the language part except for cases where there
exists no code, then 639-3 codes should be used. Country codes should only be used
in special cases. Scripts can be differentiated by IANA subtags, available here:
http://www.iana.org/assignments/language-subtag-registry
The script tags correspond to ISO 15924

An overview of the best practices concerning language tagging is available here:
http://www.w3.org/International/articles/language-tags/Overview.en.php

lang tags should use underscores (_) rather than hyphens (-) to separate subsections.

EXCEPTIONS:
For cases of known differences from the above tagging scheme and major
spellcheckers such as aspell/hunspell/myspell, the scheme used by the spell checkers
shall be used. All exceptions shall be noted here:

BCP SPELLCHECK
az-Latn az

*/
44
45namespace Sonnet
46{
47class GuessLanguagePrivate
48{
49public:
50 GuessLanguagePrivate();
51 // language trigram score
52 static QHash<QString, QHash<QString, int>> s_knownModels;
53
54 void loadModels();
55 QList<QChar::Script> findRuns(const QString &text);
56 QList<QString> createOrderedModel(const QString &content);
57 int distance(const QList<QString> &model, const QHash<QString, int> &knownModel);
58 QStringList guessFromTrigrams(const QString &sample, const QStringList &langs);
59 QStringList identify(const QString &sample, const QList<QChar::Script> &scripts);
60 QString guessFromDictionaries(const QString &sentence, const QStringList &candidates);
61
62 static QSet<QString> s_knownDictionaries;
63 static QMultiHash<QChar::Script, QString> s_scriptLanguages;
64 static QMap<QString, QString> s_dictionaryNameMap;
65
66 const int MIN_LENGTH;
67 int m_maxItems;
68 double m_minConfidence;
69};
70
71QHash<QString, QHash<QString, int>> GuessLanguagePrivate::s_knownModels;
72QSet<QString> GuessLanguagePrivate::s_knownDictionaries;
73QMultiHash<QChar::Script, QString> GuessLanguagePrivate::s_scriptLanguages;
74QMap<QString, QString> GuessLanguagePrivate::s_dictionaryNameMap;
75
76QStringList getNames(QLocale::Script script)
77{
78 QStringList locales;
79 const auto matchingLocales = QLocale::matchingLocales(language: QLocale::AnyLanguage, script, territory: QLocale::AnyCountry);
80 locales.reserve(asize: matchingLocales.size());
81 for (const QLocale &locale : matchingLocales) {
82 locales << locale.name();
83 }
84 return locales;
85}
86
87GuessLanguagePrivate::GuessLanguagePrivate()
88 : MIN_LENGTH(5)
89 , m_maxItems(1)
90 , m_minConfidence(0)
91{
92 if (!s_scriptLanguages.isEmpty()) {
93 return;
94 }
95
96 const QStringList languages = Loader::openLoader()->languages();
97 s_knownDictionaries = QSet<QString>(languages.begin(), languages.end());
98 QSet<QString> dictionaryLanguages;
99 for (const QString &dictName : std::as_const(t&: s_knownDictionaries)) {
100 QString languageName = QLocale(dictName).name();
101 if (languageName.isEmpty()) {
102 qCWarning(SONNET_LOG_CORE) << "Unable to parse name for dictionary" << dictName;
103 continue;
104 }
105 dictionaryLanguages.insert(value: languageName);
106 }
107
108 QSet<QString> allLanguages;
109 for (int i = 0; i < int(QChar::ScriptCount); i++) {
110 QChar::Script script = static_cast<QChar::Script>(i);
111 QStringList names;
112 switch (script) {
113 case QChar::Script_Latin:
114 names = getNames(script: QLocale::LatinScript);
115 break;
116 case QChar::Script_Greek:
117 names = getNames(script: QLocale::GreekScript);
118 break;
119 case QChar::Script_Cyrillic:
120 names = getNames(script: QLocale::CyrillicScript);
121 break;
122 case QChar::Script_Armenian:
123 names = getNames(script: QLocale::ArmenianScript);
124 break;
125 case QChar::Script_Hebrew:
126 names = getNames(script: QLocale::HebrewScript);
127 break;
128 case QChar::Script_Arabic:
129 names = getNames(script: QLocale::ArabicScript);
130 break;
131 case QChar::Script_Syriac:
132 names = getNames(script: QLocale::SyriacScript);
133 break;
134 case QChar::Script_Thaana:
135 names = getNames(script: QLocale::ThaanaScript);
136 break;
137 case QChar::Script_Devanagari:
138 names = getNames(script: QLocale::DevanagariScript);
139 break;
140 case QChar::Script_Bengali:
141 names = getNames(script: QLocale::BengaliScript);
142 break;
143 case QChar::Script_Gurmukhi:
144 names = getNames(script: QLocale::GurmukhiScript);
145 break;
146 case QChar::Script_Gujarati:
147 names = getNames(script: QLocale::GujaratiScript);
148 break;
149 case QChar::Script_Oriya:
150 names = getNames(script: QLocale::OriyaScript);
151 break;
152 case QChar::Script_Tamil:
153 names = getNames(script: QLocale::TamilScript);
154 break;
155 case QChar::Script_Telugu:
156 names = getNames(script: QLocale::TeluguScript);
157 break;
158 case QChar::Script_Kannada:
159 names = getNames(script: QLocale::KannadaScript);
160 break;
161 case QChar::Script_Malayalam:
162 names = getNames(script: QLocale::MalayalamScript);
163 break;
164 case QChar::Script_Sinhala:
165 names = getNames(script: QLocale::SinhalaScript);
166 break;
167 case QChar::Script_Thai:
168 names = getNames(script: QLocale::ThaiScript);
169 break;
170 case QChar::Script_Lao:
171 names = getNames(script: QLocale::LaoScript);
172 break;
173 case QChar::Script_Tibetan:
174 names = getNames(script: QLocale::TibetanScript);
175 break;
176 case QChar::Script_Myanmar:
177 names = getNames(script: QLocale::MyanmarScript);
178 break;
179 case QChar::Script_Georgian:
180 names = getNames(script: QLocale::GeorgianScript);
181 break;
182 case QChar::Script_Hangul:
183 names = getNames(script: QLocale::HangulScript);
184 break;
185 case QChar::Script_Ethiopic:
186 names = getNames(script: QLocale::EthiopicScript);
187 break;
188 case QChar::Script_Cherokee:
189 names = getNames(script: QLocale::CherokeeScript);
190 break;
191 case QChar::Script_CanadianAboriginal:
192 names = getNames(script: QLocale::CanadianAboriginalScript);
193 break;
194 case QChar::Script_Ogham:
195 names = getNames(script: QLocale::OghamScript);
196 break;
197 case QChar::Script_Runic:
198 names = getNames(script: QLocale::RunicScript);
199 break;
200 case QChar::Script_Khmer:
201 names = getNames(script: QLocale::KhmerScript);
202 break;
203 case QChar::Script_Mongolian:
204 names = getNames(script: QLocale::MongolianScript);
205 break;
206 case QChar::Script_Hiragana:
207 names = getNames(script: QLocale::HiraganaScript);
208 break;
209 case QChar::Script_Katakana:
210 names = getNames(script: QLocale::KatakanaScript);
211 break;
212 case QChar::Script_Bopomofo:
213 names = getNames(script: QLocale::BopomofoScript);
214 break;
215 case QChar::Script_Han:
216 names = getNames(script: QLocale::HanScript);
217 break;
218 case QChar::Script_Yi:
219 names = getNames(script: QLocale::YiScript);
220 break;
221 case QChar::Script_OldItalic:
222 names = getNames(script: QLocale::OldItalicScript);
223 break;
224 case QChar::Script_Gothic:
225 names = getNames(script: QLocale::GothicScript);
226 break;
227 case QChar::Script_Deseret:
228 names = getNames(script: QLocale::DeseretScript);
229 break;
230 case QChar::Script_Tagalog:
231 names = getNames(script: QLocale::TagalogScript);
232 break;
233 case QChar::Script_Hanunoo:
234 names = getNames(script: QLocale::HanunooScript);
235 break;
236 case QChar::Script_Buhid:
237 names = getNames(script: QLocale::BuhidScript);
238 break;
239 case QChar::Script_Tagbanwa:
240 names = getNames(script: QLocale::TagbanwaScript);
241 break;
242 case QChar::Script_Coptic:
243 names = getNames(script: QLocale::CopticScript);
244 break;
245 case QChar::Script_Limbu:
246 names = getNames(script: QLocale::LimbuScript);
247 break;
248 case QChar::Script_TaiLe:
249 names = getNames(script: QLocale::TaiLeScript);
250 break;
251 case QChar::Script_LinearB:
252 names = getNames(script: QLocale::LinearBScript);
253 break;
254 case QChar::Script_Ugaritic:
255 names = getNames(script: QLocale::UgariticScript);
256 break;
257 case QChar::Script_Shavian:
258 names = getNames(script: QLocale::ShavianScript);
259 break;
260 case QChar::Script_Osmanya:
261 names = getNames(script: QLocale::OsmanyaScript);
262 break;
263 case QChar::Script_Cypriot:
264 names = getNames(script: QLocale::CypriotScript);
265 break;
266 case QChar::Script_Braille:
267 names = getNames(script: QLocale::BrailleScript);
268 break;
269 case QChar::Script_Buginese:
270 names = getNames(script: QLocale::BugineseScript);
271 break;
272 case QChar::Script_NewTaiLue:
273 names = getNames(script: QLocale::NewTaiLueScript);
274 break;
275 case QChar::Script_Glagolitic:
276 names = getNames(script: QLocale::GlagoliticScript);
277 break;
278 case QChar::Script_Tifinagh:
279 names = getNames(script: QLocale::TifinaghScript);
280 break;
281 case QChar::Script_SylotiNagri:
282 names = getNames(script: QLocale::SylotiNagriScript);
283 break;
284 case QChar::Script_OldPersian:
285 names = getNames(script: QLocale::OldPersianScript);
286 break;
287 case QChar::Script_Kharoshthi:
288 names = getNames(script: QLocale::KharoshthiScript);
289 break;
290 case QChar::Script_Balinese:
291 names = getNames(script: QLocale::BalineseScript);
292 break;
293 case QChar::Script_Cuneiform:
294 names = getNames(script: QLocale::CuneiformScript);
295 break;
296 case QChar::Script_Phoenician:
297 names = getNames(script: QLocale::PhoenicianScript);
298 break;
299 case QChar::Script_PhagsPa:
300 names = getNames(script: QLocale::PhagsPaScript);
301 break;
302 case QChar::Script_Nko:
303 names = getNames(script: QLocale::NkoScript);
304 break;
305 case QChar::Script_Sundanese:
306 names = getNames(script: QLocale::SundaneseScript);
307 break;
308 case QChar::Script_Lepcha:
309 names = getNames(script: QLocale::LepchaScript);
310 break;
311 case QChar::Script_OlChiki:
312 names = getNames(script: QLocale::OlChikiScript);
313 break;
314 case QChar::Script_Vai:
315 names = getNames(script: QLocale::VaiScript);
316 break;
317 case QChar::Script_Saurashtra:
318 names = getNames(script: QLocale::SaurashtraScript);
319 break;
320 case QChar::Script_KayahLi:
321 names = getNames(script: QLocale::KayahLiScript);
322 break;
323 case QChar::Script_Rejang:
324 names = getNames(script: QLocale::RejangScript);
325 break;
326 case QChar::Script_Lycian:
327 names = getNames(script: QLocale::LycianScript);
328 break;
329 case QChar::Script_Carian:
330 names = getNames(script: QLocale::CarianScript);
331 break;
332 case QChar::Script_Lydian:
333 names = getNames(script: QLocale::LydianScript);
334 break;
335 case QChar::Script_Cham:
336 names = getNames(script: QLocale::ChamScript);
337 break;
338 case QChar::Script_TaiTham:
339 names = getNames(script: QLocale::LannaScript);
340 break;
341 case QChar::Script_TaiViet:
342 names = getNames(script: QLocale::TaiVietScript);
343 break;
344 case QChar::Script_Avestan:
345 names = getNames(script: QLocale::AvestanScript);
346 break;
347 case QChar::Script_EgyptianHieroglyphs:
348 names = getNames(script: QLocale::EgyptianHieroglyphsScript);
349 break;
350 case QChar::Script_Samaritan:
351 names = getNames(script: QLocale::SamaritanScript);
352 break;
353 case QChar::Script_Lisu:
354 names = getNames(script: QLocale::FraserScript);
355 break;
356 case QChar::Script_Bamum:
357 names = getNames(script: QLocale::BamumScript);
358 break;
359 case QChar::Script_Javanese:
360 names = getNames(script: QLocale::JavaneseScript);
361 break;
362 case QChar::Script_MeeteiMayek:
363 names = getNames(script: QLocale::MeiteiMayekScript);
364 break;
365 case QChar::Script_ImperialAramaic:
366 names = getNames(script: QLocale::ImperialAramaicScript);
367 break;
368 case QChar::Script_OldSouthArabian:
369 names = getNames(script: QLocale::OldSouthArabianScript);
370 break;
371 case QChar::Script_InscriptionalParthian:
372 names = getNames(script: QLocale::InscriptionalParthianScript);
373 break;
374 case QChar::Script_InscriptionalPahlavi:
375 names = getNames(script: QLocale::InscriptionalPahlaviScript);
376 break;
377 case QChar::Script_Kaithi:
378 names = getNames(script: QLocale::KaithiScript);
379 break;
380 case QChar::Script_Batak:
381 names = getNames(script: QLocale::BatakScript);
382 break;
383 case QChar::Script_Brahmi:
384 names = getNames(script: QLocale::BrahmiScript);
385 break;
386 case QChar::Script_Mandaic:
387 names = getNames(script: QLocale::MandaeanScript);
388 break;
389 case QChar::Script_Chakma:
390 names = getNames(script: QLocale::ChakmaScript);
391 break;
392 case QChar::Script_MeroiticCursive:
393 case QChar::Script_MeroiticHieroglyphs:
394 names = getNames(script: QLocale::MeroiticCursiveScript);
395 names.append(other: getNames(script: QLocale::MeroiticScript));
396 break;
397 case QChar::Script_Miao:
398 names = getNames(script: QLocale::PollardPhoneticScript);
399 break;
400 case QChar::Script_Sharada:
401 names = getNames(script: QLocale::SharadaScript);
402 break;
403 case QChar::Script_SoraSompeng:
404 names = getNames(script: QLocale::SoraSompengScript);
405 break;
406 case QChar::Script_Takri:
407 names = getNames(script: QLocale::TakriScript);
408 break;
409 case QChar::Script_CaucasianAlbanian:
410 names = getNames(script: QLocale::CaucasianAlbanianScript);
411 break;
412 case QChar::Script_BassaVah:
413 names = getNames(script: QLocale::BassaVahScript);
414 break;
415 case QChar::Script_Duployan:
416 names = getNames(script: QLocale::DuployanScript);
417 break;
418 case QChar::Script_Elbasan:
419 names = getNames(script: QLocale::ElbasanScript);
420 break;
421 case QChar::Script_Grantha:
422 names = getNames(script: QLocale::GranthaScript);
423 break;
424 case QChar::Script_PahawhHmong:
425 names = getNames(script: QLocale::PahawhHmongScript);
426 break;
427 case QChar::Script_Khojki:
428 names = getNames(script: QLocale::KhojkiScript);
429 break;
430 case QChar::Script_LinearA:
431 names = getNames(script: QLocale::LinearAScript);
432 break;
433 case QChar::Script_Mahajani:
434 names = getNames(script: QLocale::MahajaniScript);
435 break;
436 case QChar::Script_Manichaean:
437 names = getNames(script: QLocale::ManichaeanScript);
438 break;
439 case QChar::Script_MendeKikakui:
440 names = getNames(script: QLocale::MendeKikakuiScript);
441 break;
442 case QChar::Script_Modi:
443 names = getNames(script: QLocale::ModiScript);
444 break;
445 case QChar::Script_Mro:
446 names = getNames(script: QLocale::MroScript);
447 break;
448 case QChar::Script_OldNorthArabian:
449 names = getNames(script: QLocale::OldNorthArabianScript);
450 break;
451 case QChar::Script_Nabataean:
452 names = getNames(script: QLocale::NabataeanScript);
453 break;
454 case QChar::Script_Palmyrene:
455 names = getNames(script: QLocale::PalmyreneScript);
456 break;
457 case QChar::Script_PauCinHau:
458 names = getNames(script: QLocale::PauCinHauScript);
459 break;
460 case QChar::Script_OldPermic:
461 names = getNames(script: QLocale::OldPermicScript);
462 break;
463 case QChar::Script_PsalterPahlavi:
464 names = getNames(script: QLocale::PsalterPahlaviScript);
465 break;
466 case QChar::Script_Siddham:
467 names = getNames(script: QLocale::SiddhamScript);
468 break;
469 case QChar::Script_Khudawadi:
470 names = getNames(script: QLocale::KhudawadiScript);
471 break;
472 case QChar::Script_Tirhuta:
473 names = getNames(script: QLocale::TirhutaScript);
474 break;
475 case QChar::Script_WarangCiti:
476 names = getNames(script: QLocale::VarangKshitiScript);
477 break;
478 case QChar::Script_Ahom:
479 names = getNames(script: QLocale::AhomScript);
480 break;
481 case QChar::Script_AnatolianHieroglyphs:
482 names = getNames(script: QLocale::AnatolianHieroglyphsScript);
483 break;
484 case QChar::Script_Hatran:
485 names = getNames(script: QLocale::HatranScript);
486 break;
487 case QChar::Script_Multani:
488 names = getNames(script: QLocale::MultaniScript);
489 break;
490 case QChar::Script_OldHungarian:
491 names = getNames(script: QLocale::OldHungarianScript);
492 break;
493 case QChar::Script_Unknown:
494 case QChar::Script_Inherited:
495 case QChar::Script_Common:
496 case QChar::Script_OldTurkic:
497 case QChar::Script_SignWriting:
498 break;
499 default:
500 qCDebug(SONNET_LOG_CORE) << "Unhandled script" << script;
501 break;
502 }
503 allLanguages.unite(other: QSet<QString>(names.constBegin(), names.constEnd()));
504
505 { // Remove unknown languages
506 QStringList pruned;
507 for (const QString &name : std::as_const(t&: names)) {
508 if (!dictionaryLanguages.contains(value: name)) {
509 continue;
510 }
511 pruned.append(t: name);
512 }
513 names = pruned;
514 }
515
516 if (names.isEmpty()) {
517 continue;
518 }
519
520 for (const QString &name : std::as_const(t&: names)) {
521 s_scriptLanguages.insert(key: script, value: name);
522 }
523 }
524
525 // Try to handle some badly named dictionaries
526 if (!allLanguages.contains(other: s_knownDictionaries)) {
527 QSet<QString> dicts(s_knownDictionaries);
528 dicts.subtract(other: allLanguages);
529 for (const QString &dictName : std::as_const(t&: dicts)) {
530 QString languageName = QLocale(dictName).name();
531 if (languageName.isEmpty()) {
532 qCWarning(SONNET_LOG_CORE) << "Unable to parse language name" << dictName;
533 continue;
534 }
535 s_dictionaryNameMap[languageName] = dictName;
536 if (std::find(first: s_scriptLanguages.cbegin(), last: s_scriptLanguages.cend(), val: languageName) == s_scriptLanguages.cend()) {
537 qCWarning(SONNET_LOG_CORE) << "Unable to handle language from dictionary" << dictName << languageName;
538 }
539 }
540 }
541}
542
543GuessLanguage::GuessLanguage()
544 : d(new GuessLanguagePrivate)
545{
546}
547
548GuessLanguage::~GuessLanguage() = default;
549
550QString GuessLanguage::identify(const QString &text, const QStringList &suggestionsListIn) const
551{
552 if (text.isEmpty()) {
553 return QString();
554 }
555
556 // Filter for available dictionaries
557 QStringList suggestionsList;
558 for (const QString &suggestion : suggestionsListIn) {
559 if (d->s_knownDictionaries.contains(value: suggestion) && !suggestionsList.contains(str: suggestion)) {
560 suggestionsList.append(t: suggestion);
561 }
562 }
563
564 // Load the model on demand
565 if (d->s_knownModels.isEmpty()) {
566 d->loadModels();
567 }
568
569 const QList<QChar::Script> scriptsList = d->findRuns(text);
570
571 QStringList candidateLanguages = d->identify(sample: text, scripts: scriptsList);
572
573 // if guessing from trigrams fail
574 if (candidateLanguages.isEmpty()) {
575 for (const QChar::Script script : scriptsList) {
576 const auto languagesList = d->s_scriptLanguages.values(key: script);
577 for (const QString &lang : languagesList) {
578 if (!d->s_knownModels.contains(key: lang)) {
579 candidateLanguages.append(t: lang);
580 }
581 }
582 }
583 }
584
585 // Hack for some bad dictionary names
586 for (int i = 0; i < candidateLanguages.count(); i++) {
587 if (d->s_dictionaryNameMap.contains(key: candidateLanguages[i])) {
588 candidateLanguages[i] = d->s_dictionaryNameMap.value(key: candidateLanguages[i]);
589 }
590 }
591
592 if (candidateLanguages.count() == 1) {
593 return candidateLanguages.first();
594 }
595
596 // Wasn't able to get a good guess with the trigrams, try checking all
597 // dictionaries for the suggested languages.
598 candidateLanguages.append(l: suggestionsList);
599 candidateLanguages.removeDuplicates();
600 QString identified = d->guessFromDictionaries(sentence: text, candidates: candidateLanguages);
601 if (!identified.isEmpty()) {
602 return identified;
603 }
604
605 qCDebug(SONNET_LOG_CORE()) << "Unable to identify string with dictionaries:" << text;
606
607 // None of our methods worked, just return the best suggestion
608 if (!suggestionsList.isEmpty()) {
609 return suggestionsList.first();
610 }
611
612 qCDebug(SONNET_LOG_CORE) << "Unable to find any suggestion for" << text;
613
614 // Not even any suggestions, give up
615 return QString();
616}
617
618void GuessLanguage::setLimits(int maxItems, double minConfidence)
619{
620 d->m_maxItems = maxItems;
621 d->m_minConfidence = minConfidence;
622}
623
624void GuessLanguagePrivate::loadModels()
625{
626 // use trigrams from resource file, easy to deploy on all platforms
627 const QString triMapFile = QStringLiteral(":/org.kde.sonnet/trigrams.map");
628 qCDebug(SONNET_LOG_CORE) << "Loading trigrams from" << triMapFile;
629
630 QFile sin(triMapFile);
631 if (!sin.open(flags: QIODevice::ReadOnly)) {
632 qCWarning(SONNET_LOG_CORE) << "Sonnet: Unable to load trigram models from file" << triMapFile;
633 return;
634 }
635
636 QDataStream in(&sin);
637 in >> s_knownModels;
638
639 // Sanity check
640 QSet<QString> availableLanguages;
641 QHashIterator<QString, QHash<QString, int>> iterator(s_knownModels);
642 while (iterator.hasNext()) {
643 iterator.next();
644 if (iterator.value().count() < MAXGRAMS) {
645 qCWarning(SONNET_LOG_CORE) << iterator.key() << "is has only" << iterator.value().count() << "trigrams, expected" << MAXGRAMS;
646 }
647 availableLanguages.insert(value: iterator.key());
648 }
649 QSet<QString> knownLanguages(s_scriptLanguages.constBegin(), s_scriptLanguages.constEnd());
650 knownLanguages.subtract(other: availableLanguages);
651 if (!knownLanguages.isEmpty()) {
652 qCDebug(SONNET_LOG_CORE) << "Missing trigrams for languages:" << knownLanguages;
653 }
654}
655
656QList<QChar::Script> GuessLanguagePrivate::findRuns(const QString &text)
657{
658 QHash<QChar::Script, int> scriptCounts;
659
660 int totalCount = 0;
661
662 for (const QChar c : text) {
663 const QChar::Script script = c.script();
664
665 if (script == QChar::Script_Common || script == QChar::Script_Inherited) {
666 continue;
667 }
668
669 if (!c.isLetter()) {
670 continue;
671 }
672
673 scriptCounts[script]++;
674 totalCount++;
675 }
676
677 QList<QChar::Script> relevantScripts;
678
679 if (totalCount == 0) {
680 return relevantScripts;
681 }
682
683 if (scriptCounts.size() == 1) {
684 return {scriptCounts.cbegin().key()};
685 }
686
687 for (auto it = scriptCounts.cbegin(); it != scriptCounts.cend(); ++it) {
688 // return run types that used for 40% or more of the string
689 const int scriptCount = it.value();
690 const auto currentScript = it.key();
691 if (scriptCount * 100 / totalCount >= 40) {
692 relevantScripts << currentScript;
693 // always return basic latin if found more than 15%.
694 } else if (currentScript == QChar::Script_Latin && scriptCount * 100 / totalCount >= 15) {
695 relevantScripts << currentScript;
696 }
697 }
698
699 return relevantScripts;
700}
701
702QStringList GuessLanguagePrivate::identify(const QString &sample, const QList<QChar::Script> &scripts)
703{
704 if (sample.size() < MIN_LENGTH) {
705 return QStringList();
706 }
707
708 QStringList guesses;
709 for (const QChar::Script script : scripts) {
710 guesses.append(other: guessFromTrigrams(sample, langs: s_scriptLanguages.values(key: script)));
711 }
712
713 return guesses;
714}
715
716QStringList GuessLanguagePrivate::guessFromTrigrams(const QString &sample, const QStringList &languages)
717{
718 QStringList ret;
719
720 const QList<QString> sampleTrigrams = createOrderedModel(content: sample);
721
722 // Sort by score
723 QMultiMap<int, QString> scores;
724 for (const QString &language : languages) {
725 if (s_knownModels.contains(key: language)) {
726 scores.insert(key: distance(model: sampleTrigrams, knownModel: s_knownModels[language]), value: language);
727 }
728 }
729
730 // Skip if either no results or best result is completely unknown (distance >= maxdistance)
731 if (scores.isEmpty() || scores.firstKey() >= MAXGRAMS * sampleTrigrams.size()) {
732 qCDebug(SONNET_LOG_CORE) << "No scores for" << sample;
733 return ret;
734 }
735
736 int counter = 0;
737 double confidence = 0;
738
739 QMultiMapIterator<int, QString> it(scores);
740 it.next();
741
742 QString prevItem = it.value();
743 int prevScore = it.key();
744
745 while (it.hasNext() && counter < m_maxItems && confidence < m_minConfidence) {
746 it.next();
747 counter++;
748 confidence += (it.key() - prevScore) / (double)it.key();
749 ret += prevItem;
750 prevItem = it.value();
751 prevScore = it.key();
752 }
753 if (counter < m_maxItems && confidence < m_minConfidence) {
754 ret += prevItem;
755 }
756
757 return ret;
758}
759
760QList<QString> GuessLanguagePrivate::createOrderedModel(const QString &content)
761{
762 QHash<QString, int> trigramCounts;
763
764 // collect trigrams
765 trigramCounts.reserve(size: content.size() - 2);
766 for (int i = 0; i < (content.size() - 2); ++i) {
767 QString tri = content.mid(position: i, n: 3).toLower();
768 trigramCounts[tri]++;
769 }
770
771 // invert the map <freq, trigram>
772 QList<QPair<int, QString>> trigramFrequencyList;
773 trigramFrequencyList.reserve(asize: trigramCounts.size());
774
775 auto it = trigramCounts.constBegin();
776 for (; it != trigramCounts.constEnd(); ++it) {
777 const QChar *data = it.key().constData();
778 bool hasTwoSpaces = (data[1].isSpace() && (data[0].isSpace() || data[2].isSpace()));
779
780 if (!hasTwoSpaces) {
781 const int freq = it.value();
782 const QString &trigram = it.key();
783 trigramFrequencyList.append(t: {freq, trigram});
784 }
785 }
786
787 // sort descending by frequency
788 std::sort(first: trigramFrequencyList.begin(), last: trigramFrequencyList.end(), comp: [](const QPair<int, QString> &a, const QPair<int, QString> &b) {
789 return a.first > b.first;
790 });
791
792 QList<QString> orderedTrigrams;
793 orderedTrigrams.reserve(asize: trigramFrequencyList.size());
794 for (const auto &tri : std::as_const(t&: trigramFrequencyList)) {
795 orderedTrigrams.append(t: tri.second);
796 }
797
798 return orderedTrigrams;
799}
800
801int GuessLanguagePrivate::distance(const QList<QString> &model, const QHash<QString, int> &knownModel)
802{
803 int counter = -1;
804 int dist = 0;
805
806 for (const QString &trigram : model) {
807 const int val = knownModel.value(key: trigram, defaultValue: -1);
808 if (val != -1) {
809 dist += qAbs(t: ++counter - val);
810 } else {
811 dist += MAXGRAMS;
812 }
813
814 if (counter == (MAXGRAMS - 1)) {
815 break;
816 }
817 }
818
819 return dist;
820}
821
822QString GuessLanguagePrivate::guessFromDictionaries(const QString &sentence, const QStringList &candidates)
823{
824 // Try to see how many languages we can get spell checking for
825 QList<QSharedPointer<SpellerPlugin>> spellers;
826 for (const QString &lang : candidates) {
827 if (!Loader::openLoader()->languages().contains(str: lang)) {
828 qCWarning(SONNET_LOG_CORE) << "Dictionary asked for invalid speller" << lang;
829 continue;
830 }
831 QSharedPointer<SpellerPlugin> plugin = Loader::openLoader()->cachedSpeller(language: lang);
832 if (!plugin.isNull()) {
833 spellers.append(t: plugin);
834 }
835 }
836
837 // If there's no spell checkers, give up
838 if (spellers.isEmpty()) {
839 return QString();
840 }
841
842 QMap<QString, int> correctHits;
843
844 WordTokenizer tokenizer(sentence);
845 while (tokenizer.hasNext()) {
846 Token word = tokenizer.next();
847 if (!tokenizer.isSpellcheckable()) {
848 continue;
849 }
850
851 for (int i = 0; i < spellers.count(); ++i) {
852 if (spellers[i]->isCorrect(word: word.toString())) {
853 correctHits[spellers[i]->language()]++;
854 }
855 }
856 }
857
858 if (correctHits.isEmpty()) {
859 return QString();
860 }
861
862 QMap<QString, int>::const_iterator max = correctHits.constBegin();
863 for (QMap<QString, int>::const_iterator itr = correctHits.constBegin(); itr != correctHits.constEnd(); ++itr) {
864 if (itr.value() > max.value()) {
865 max = itr;
866 }
867 }
868 return max.key();
869}
870}
871

// source code of sonnet/src/core/guesslanguage.cpp