| 1 | /* This file is part of the KDE libraries |
| 2 | SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net> |
| 3 | SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl> |
| 4 | SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org> |
| 5 | |
| 6 | SPDX-License-Identifier: LGPL-2.0-or-later |
| 7 | */ |
| 8 | |
| 9 | #include <QCoreApplication> |
| 10 | #include <QDataStream> |
| 11 | #include <QFile> |
| 12 | #include <QLocale> |
| 13 | #include <QStandardPaths> |
| 14 | |
| 15 | #include "core_debug.h" |
| 16 | #include "guesslanguage.h" |
| 17 | #include "loader_p.h" |
| 18 | #include "speller.h" |
| 19 | #include "spellerplugin_p.h" |
| 20 | #include "tokenizer_p.h" |
| 21 | |
| 22 | /* |
| 23 | All language tags should be valid according to IETF BCP 47, as codified in RFC 4646. |
| 24 | ISO 639-1 codes should be used for the language part except for cases where there |
| 25 | exists no code, then 639-3 codes should be used. Country codes should only be used |
| 26 | in special cases. Scripts can be differentiated by IANA subtags, available here: |
| 27 | http://www.iana.org/assignments/language-subtag-registry |
| 28 | The script tags correspond to ISO 15924 |
| 29 | |
| 30 | An overview of the best practices concerning language tagging is available here: |
| 31 | http://www.w3.org/International/articles/language-tags/Overview.en.php |
| 32 | |
| 33 | lang tags should use underscores (_) rather than hyphens (-) to separate subsections. |
| 34 | |
| 35 | EXCEPTIONS: |
| 36 | For cases of known differences between the above tagging scheme and major |
| 37 | spellcheckers such as aspell/hunspell/myspell, the scheme used by the spell checkers |
| 38 | shall be used. All exceptions shall be noted here: |
| 39 | |
| 40 | BCP SPELLCHECK |
| 41 | az-Latn az |
| 42 | |
| 43 | */ |
| 44 | |
| 45 | namespace Sonnet |
| 46 | { |
| 47 | class GuessLanguagePrivate |
| 48 | { |
| 49 | public: |
| 50 | GuessLanguagePrivate(); |
| 51 | // language trigram score |
| 52 | static QHash<QString, QHash<QString, int>> s_knownModels; |
| 53 | |
| 54 | void loadModels(); |
| 55 | QList<QChar::Script> findRuns(const QString &text); |
| 56 | QList<QString> createOrderedModel(const QString &content); |
| 57 | int distance(const QList<QString> &model, const QHash<QString, int> &knownModel); |
| 58 | QStringList guessFromTrigrams(const QString &sample, const QStringList &langs); |
| 59 | QStringList identify(const QString &sample, const QList<QChar::Script> &scripts); |
| 60 | QString guessFromDictionaries(const QString &sentence, const QStringList &candidates); |
| 61 | |
| 62 | static QSet<QString> s_knownDictionaries; |
| 63 | static QMultiHash<QChar::Script, QString> s_scriptLanguages; |
| 64 | static QMap<QString, QString> s_dictionaryNameMap; |
| 65 | |
| 66 | const int MIN_LENGTH; |
| 67 | int m_maxItems; |
| 68 | double m_minConfidence; |
| 69 | }; |
| 70 | |
| 71 | QHash<QString, QHash<QString, int>> GuessLanguagePrivate::s_knownModels; |
| 72 | QSet<QString> GuessLanguagePrivate::s_knownDictionaries; |
| 73 | QMultiHash<QChar::Script, QString> GuessLanguagePrivate::s_scriptLanguages; |
| 74 | QMap<QString, QString> GuessLanguagePrivate::s_dictionaryNameMap; |
| 75 | |
| 76 | QStringList getNames(QLocale::Script script) |
| 77 | { |
| 78 | QStringList locales; |
| 79 | const auto matchingLocales = QLocale::matchingLocales(language: QLocale::AnyLanguage, script, territory: QLocale::AnyCountry); |
| 80 | locales.reserve(asize: matchingLocales.size()); |
| 81 | for (const QLocale &locale : matchingLocales) { |
| 82 | locales << locale.name(); |
| 83 | } |
| 84 | return locales; |
| 85 | } |
| 86 | |
| 87 | GuessLanguagePrivate::GuessLanguagePrivate() |
| 88 | : MIN_LENGTH(5) |
| 89 | , m_maxItems(1) |
| 90 | , m_minConfidence(0) |
| 91 | { |
| 92 | if (!s_scriptLanguages.isEmpty()) { |
| 93 | return; |
| 94 | } |
| 95 | |
| 96 | const QStringList languages = Loader::openLoader()->languages(); |
| 97 | s_knownDictionaries = QSet<QString>(languages.begin(), languages.end()); |
| 98 | QSet<QString> dictionaryLanguages; |
| 99 | for (const QString &dictName : std::as_const(t&: s_knownDictionaries)) { |
| 100 | QString languageName = QLocale(dictName).name(); |
| 101 | if (languageName.isEmpty()) { |
| 102 | qCWarning(SONNET_LOG_CORE) << "Unable to parse name for dictionary" << dictName; |
| 103 | continue; |
| 104 | } |
| 105 | dictionaryLanguages.insert(value: languageName); |
| 106 | } |
| 107 | |
| 108 | QSet<QString> allLanguages; |
| 109 | for (int i = 0; i < int(QChar::ScriptCount); i++) { |
| 110 | QChar::Script script = static_cast<QChar::Script>(i); |
| 111 | QStringList names; |
| 112 | switch (script) { |
| 113 | case QChar::Script_Latin: |
| 114 | names = getNames(script: QLocale::LatinScript); |
| 115 | break; |
| 116 | case QChar::Script_Greek: |
| 117 | names = getNames(script: QLocale::GreekScript); |
| 118 | break; |
| 119 | case QChar::Script_Cyrillic: |
| 120 | names = getNames(script: QLocale::CyrillicScript); |
| 121 | break; |
| 122 | case QChar::Script_Armenian: |
| 123 | names = getNames(script: QLocale::ArmenianScript); |
| 124 | break; |
| 125 | case QChar::Script_Hebrew: |
| 126 | names = getNames(script: QLocale::HebrewScript); |
| 127 | break; |
| 128 | case QChar::Script_Arabic: |
| 129 | names = getNames(script: QLocale::ArabicScript); |
| 130 | break; |
| 131 | case QChar::Script_Syriac: |
| 132 | names = getNames(script: QLocale::SyriacScript); |
| 133 | break; |
| 134 | case QChar::Script_Thaana: |
| 135 | names = getNames(script: QLocale::ThaanaScript); |
| 136 | break; |
| 137 | case QChar::Script_Devanagari: |
| 138 | names = getNames(script: QLocale::DevanagariScript); |
| 139 | break; |
| 140 | case QChar::Script_Bengali: |
| 141 | names = getNames(script: QLocale::BengaliScript); |
| 142 | break; |
| 143 | case QChar::Script_Gurmukhi: |
| 144 | names = getNames(script: QLocale::GurmukhiScript); |
| 145 | break; |
| 146 | case QChar::Script_Gujarati: |
| 147 | names = getNames(script: QLocale::GujaratiScript); |
| 148 | break; |
| 149 | case QChar::Script_Oriya: |
| 150 | names = getNames(script: QLocale::OriyaScript); |
| 151 | break; |
| 152 | case QChar::Script_Tamil: |
| 153 | names = getNames(script: QLocale::TamilScript); |
| 154 | break; |
| 155 | case QChar::Script_Telugu: |
| 156 | names = getNames(script: QLocale::TeluguScript); |
| 157 | break; |
| 158 | case QChar::Script_Kannada: |
| 159 | names = getNames(script: QLocale::KannadaScript); |
| 160 | break; |
| 161 | case QChar::Script_Malayalam: |
| 162 | names = getNames(script: QLocale::MalayalamScript); |
| 163 | break; |
| 164 | case QChar::Script_Sinhala: |
| 165 | names = getNames(script: QLocale::SinhalaScript); |
| 166 | break; |
| 167 | case QChar::Script_Thai: |
| 168 | names = getNames(script: QLocale::ThaiScript); |
| 169 | break; |
| 170 | case QChar::Script_Lao: |
| 171 | names = getNames(script: QLocale::LaoScript); |
| 172 | break; |
| 173 | case QChar::Script_Tibetan: |
| 174 | names = getNames(script: QLocale::TibetanScript); |
| 175 | break; |
| 176 | case QChar::Script_Myanmar: |
| 177 | names = getNames(script: QLocale::MyanmarScript); |
| 178 | break; |
| 179 | case QChar::Script_Georgian: |
| 180 | names = getNames(script: QLocale::GeorgianScript); |
| 181 | break; |
| 182 | case QChar::Script_Hangul: |
| 183 | names = getNames(script: QLocale::HangulScript); |
| 184 | break; |
| 185 | case QChar::Script_Ethiopic: |
| 186 | names = getNames(script: QLocale::EthiopicScript); |
| 187 | break; |
| 188 | case QChar::Script_Cherokee: |
| 189 | names = getNames(script: QLocale::CherokeeScript); |
| 190 | break; |
| 191 | case QChar::Script_CanadianAboriginal: |
| 192 | names = getNames(script: QLocale::CanadianAboriginalScript); |
| 193 | break; |
| 194 | case QChar::Script_Ogham: |
| 195 | names = getNames(script: QLocale::OghamScript); |
| 196 | break; |
| 197 | case QChar::Script_Runic: |
| 198 | names = getNames(script: QLocale::RunicScript); |
| 199 | break; |
| 200 | case QChar::Script_Khmer: |
| 201 | names = getNames(script: QLocale::KhmerScript); |
| 202 | break; |
| 203 | case QChar::Script_Mongolian: |
| 204 | names = getNames(script: QLocale::MongolianScript); |
| 205 | break; |
| 206 | case QChar::Script_Hiragana: |
| 207 | names = getNames(script: QLocale::HiraganaScript); |
| 208 | break; |
| 209 | case QChar::Script_Katakana: |
| 210 | names = getNames(script: QLocale::KatakanaScript); |
| 211 | break; |
| 212 | case QChar::Script_Bopomofo: |
| 213 | names = getNames(script: QLocale::BopomofoScript); |
| 214 | break; |
| 215 | case QChar::Script_Han: |
| 216 | names = getNames(script: QLocale::HanScript); |
| 217 | break; |
| 218 | case QChar::Script_Yi: |
| 219 | names = getNames(script: QLocale::YiScript); |
| 220 | break; |
| 221 | case QChar::Script_OldItalic: |
| 222 | names = getNames(script: QLocale::OldItalicScript); |
| 223 | break; |
| 224 | case QChar::Script_Gothic: |
| 225 | names = getNames(script: QLocale::GothicScript); |
| 226 | break; |
| 227 | case QChar::Script_Deseret: |
| 228 | names = getNames(script: QLocale::DeseretScript); |
| 229 | break; |
| 230 | case QChar::Script_Tagalog: |
| 231 | names = getNames(script: QLocale::TagalogScript); |
| 232 | break; |
| 233 | case QChar::Script_Hanunoo: |
| 234 | names = getNames(script: QLocale::HanunooScript); |
| 235 | break; |
| 236 | case QChar::Script_Buhid: |
| 237 | names = getNames(script: QLocale::BuhidScript); |
| 238 | break; |
| 239 | case QChar::Script_Tagbanwa: |
| 240 | names = getNames(script: QLocale::TagbanwaScript); |
| 241 | break; |
| 242 | case QChar::Script_Coptic: |
| 243 | names = getNames(script: QLocale::CopticScript); |
| 244 | break; |
| 245 | case QChar::Script_Limbu: |
| 246 | names = getNames(script: QLocale::LimbuScript); |
| 247 | break; |
| 248 | case QChar::Script_TaiLe: |
| 249 | names = getNames(script: QLocale::TaiLeScript); |
| 250 | break; |
| 251 | case QChar::Script_LinearB: |
| 252 | names = getNames(script: QLocale::LinearBScript); |
| 253 | break; |
| 254 | case QChar::Script_Ugaritic: |
| 255 | names = getNames(script: QLocale::UgariticScript); |
| 256 | break; |
| 257 | case QChar::Script_Shavian: |
| 258 | names = getNames(script: QLocale::ShavianScript); |
| 259 | break; |
| 260 | case QChar::Script_Osmanya: |
| 261 | names = getNames(script: QLocale::OsmanyaScript); |
| 262 | break; |
| 263 | case QChar::Script_Cypriot: |
| 264 | names = getNames(script: QLocale::CypriotScript); |
| 265 | break; |
| 266 | case QChar::Script_Braille: |
| 267 | names = getNames(script: QLocale::BrailleScript); |
| 268 | break; |
| 269 | case QChar::Script_Buginese: |
| 270 | names = getNames(script: QLocale::BugineseScript); |
| 271 | break; |
| 272 | case QChar::Script_NewTaiLue: |
| 273 | names = getNames(script: QLocale::NewTaiLueScript); |
| 274 | break; |
| 275 | case QChar::Script_Glagolitic: |
| 276 | names = getNames(script: QLocale::GlagoliticScript); |
| 277 | break; |
| 278 | case QChar::Script_Tifinagh: |
| 279 | names = getNames(script: QLocale::TifinaghScript); |
| 280 | break; |
| 281 | case QChar::Script_SylotiNagri: |
| 282 | names = getNames(script: QLocale::SylotiNagriScript); |
| 283 | break; |
| 284 | case QChar::Script_OldPersian: |
| 285 | names = getNames(script: QLocale::OldPersianScript); |
| 286 | break; |
| 287 | case QChar::Script_Kharoshthi: |
| 288 | names = getNames(script: QLocale::KharoshthiScript); |
| 289 | break; |
| 290 | case QChar::Script_Balinese: |
| 291 | names = getNames(script: QLocale::BalineseScript); |
| 292 | break; |
| 293 | case QChar::Script_Cuneiform: |
| 294 | names = getNames(script: QLocale::CuneiformScript); |
| 295 | break; |
| 296 | case QChar::Script_Phoenician: |
| 297 | names = getNames(script: QLocale::PhoenicianScript); |
| 298 | break; |
| 299 | case QChar::Script_PhagsPa: |
| 300 | names = getNames(script: QLocale::PhagsPaScript); |
| 301 | break; |
| 302 | case QChar::Script_Nko: |
| 303 | names = getNames(script: QLocale::NkoScript); |
| 304 | break; |
| 305 | case QChar::Script_Sundanese: |
| 306 | names = getNames(script: QLocale::SundaneseScript); |
| 307 | break; |
| 308 | case QChar::Script_Lepcha: |
| 309 | names = getNames(script: QLocale::LepchaScript); |
| 310 | break; |
| 311 | case QChar::Script_OlChiki: |
| 312 | names = getNames(script: QLocale::OlChikiScript); |
| 313 | break; |
| 314 | case QChar::Script_Vai: |
| 315 | names = getNames(script: QLocale::VaiScript); |
| 316 | break; |
| 317 | case QChar::Script_Saurashtra: |
| 318 | names = getNames(script: QLocale::SaurashtraScript); |
| 319 | break; |
| 320 | case QChar::Script_KayahLi: |
| 321 | names = getNames(script: QLocale::KayahLiScript); |
| 322 | break; |
| 323 | case QChar::Script_Rejang: |
| 324 | names = getNames(script: QLocale::RejangScript); |
| 325 | break; |
| 326 | case QChar::Script_Lycian: |
| 327 | names = getNames(script: QLocale::LycianScript); |
| 328 | break; |
| 329 | case QChar::Script_Carian: |
| 330 | names = getNames(script: QLocale::CarianScript); |
| 331 | break; |
| 332 | case QChar::Script_Lydian: |
| 333 | names = getNames(script: QLocale::LydianScript); |
| 334 | break; |
| 335 | case QChar::Script_Cham: |
| 336 | names = getNames(script: QLocale::ChamScript); |
| 337 | break; |
| 338 | case QChar::Script_TaiTham: |
| 339 | names = getNames(script: QLocale::LannaScript); |
| 340 | break; |
| 341 | case QChar::Script_TaiViet: |
| 342 | names = getNames(script: QLocale::TaiVietScript); |
| 343 | break; |
| 344 | case QChar::Script_Avestan: |
| 345 | names = getNames(script: QLocale::AvestanScript); |
| 346 | break; |
| 347 | case QChar::Script_EgyptianHieroglyphs: |
| 348 | names = getNames(script: QLocale::EgyptianHieroglyphsScript); |
| 349 | break; |
| 350 | case QChar::Script_Samaritan: |
| 351 | names = getNames(script: QLocale::SamaritanScript); |
| 352 | break; |
| 353 | case QChar::Script_Lisu: |
| 354 | names = getNames(script: QLocale::FraserScript); |
| 355 | break; |
| 356 | case QChar::Script_Bamum: |
| 357 | names = getNames(script: QLocale::BamumScript); |
| 358 | break; |
| 359 | case QChar::Script_Javanese: |
| 360 | names = getNames(script: QLocale::JavaneseScript); |
| 361 | break; |
| 362 | case QChar::Script_MeeteiMayek: |
| 363 | names = getNames(script: QLocale::MeiteiMayekScript); |
| 364 | break; |
| 365 | case QChar::Script_ImperialAramaic: |
| 366 | names = getNames(script: QLocale::ImperialAramaicScript); |
| 367 | break; |
| 368 | case QChar::Script_OldSouthArabian: |
| 369 | names = getNames(script: QLocale::OldSouthArabianScript); |
| 370 | break; |
| 371 | case QChar::Script_InscriptionalParthian: |
| 372 | names = getNames(script: QLocale::InscriptionalParthianScript); |
| 373 | break; |
| 374 | case QChar::Script_InscriptionalPahlavi: |
| 375 | names = getNames(script: QLocale::InscriptionalPahlaviScript); |
| 376 | break; |
| 377 | case QChar::Script_Kaithi: |
| 378 | names = getNames(script: QLocale::KaithiScript); |
| 379 | break; |
| 380 | case QChar::Script_Batak: |
| 381 | names = getNames(script: QLocale::BatakScript); |
| 382 | break; |
| 383 | case QChar::Script_Brahmi: |
| 384 | names = getNames(script: QLocale::BrahmiScript); |
| 385 | break; |
| 386 | case QChar::Script_Mandaic: |
| 387 | names = getNames(script: QLocale::MandaeanScript); |
| 388 | break; |
| 389 | case QChar::Script_Chakma: |
| 390 | names = getNames(script: QLocale::ChakmaScript); |
| 391 | break; |
| 392 | case QChar::Script_MeroiticCursive: |
| 393 | case QChar::Script_MeroiticHieroglyphs: |
| 394 | names = getNames(script: QLocale::MeroiticCursiveScript); |
| 395 | names.append(other: getNames(script: QLocale::MeroiticScript)); |
| 396 | break; |
| 397 | case QChar::Script_Miao: |
| 398 | names = getNames(script: QLocale::PollardPhoneticScript); |
| 399 | break; |
| 400 | case QChar::Script_Sharada: |
| 401 | names = getNames(script: QLocale::SharadaScript); |
| 402 | break; |
| 403 | case QChar::Script_SoraSompeng: |
| 404 | names = getNames(script: QLocale::SoraSompengScript); |
| 405 | break; |
| 406 | case QChar::Script_Takri: |
| 407 | names = getNames(script: QLocale::TakriScript); |
| 408 | break; |
| 409 | case QChar::Script_CaucasianAlbanian: |
| 410 | names = getNames(script: QLocale::CaucasianAlbanianScript); |
| 411 | break; |
| 412 | case QChar::Script_BassaVah: |
| 413 | names = getNames(script: QLocale::BassaVahScript); |
| 414 | break; |
| 415 | case QChar::Script_Duployan: |
| 416 | names = getNames(script: QLocale::DuployanScript); |
| 417 | break; |
| 418 | case QChar::Script_Elbasan: |
| 419 | names = getNames(script: QLocale::ElbasanScript); |
| 420 | break; |
| 421 | case QChar::Script_Grantha: |
| 422 | names = getNames(script: QLocale::GranthaScript); |
| 423 | break; |
| 424 | case QChar::Script_PahawhHmong: |
| 425 | names = getNames(script: QLocale::PahawhHmongScript); |
| 426 | break; |
| 427 | case QChar::Script_Khojki: |
| 428 | names = getNames(script: QLocale::KhojkiScript); |
| 429 | break; |
| 430 | case QChar::Script_LinearA: |
| 431 | names = getNames(script: QLocale::LinearAScript); |
| 432 | break; |
| 433 | case QChar::Script_Mahajani: |
| 434 | names = getNames(script: QLocale::MahajaniScript); |
| 435 | break; |
| 436 | case QChar::Script_Manichaean: |
| 437 | names = getNames(script: QLocale::ManichaeanScript); |
| 438 | break; |
| 439 | case QChar::Script_MendeKikakui: |
| 440 | names = getNames(script: QLocale::MendeKikakuiScript); |
| 441 | break; |
| 442 | case QChar::Script_Modi: |
| 443 | names = getNames(script: QLocale::ModiScript); |
| 444 | break; |
| 445 | case QChar::Script_Mro: |
| 446 | names = getNames(script: QLocale::MroScript); |
| 447 | break; |
| 448 | case QChar::Script_OldNorthArabian: |
| 449 | names = getNames(script: QLocale::OldNorthArabianScript); |
| 450 | break; |
| 451 | case QChar::Script_Nabataean: |
| 452 | names = getNames(script: QLocale::NabataeanScript); |
| 453 | break; |
| 454 | case QChar::Script_Palmyrene: |
| 455 | names = getNames(script: QLocale::PalmyreneScript); |
| 456 | break; |
| 457 | case QChar::Script_PauCinHau: |
| 458 | names = getNames(script: QLocale::PauCinHauScript); |
| 459 | break; |
| 460 | case QChar::Script_OldPermic: |
| 461 | names = getNames(script: QLocale::OldPermicScript); |
| 462 | break; |
| 463 | case QChar::Script_PsalterPahlavi: |
| 464 | names = getNames(script: QLocale::PsalterPahlaviScript); |
| 465 | break; |
| 466 | case QChar::Script_Siddham: |
| 467 | names = getNames(script: QLocale::SiddhamScript); |
| 468 | break; |
| 469 | case QChar::Script_Khudawadi: |
| 470 | names = getNames(script: QLocale::KhudawadiScript); |
| 471 | break; |
| 472 | case QChar::Script_Tirhuta: |
| 473 | names = getNames(script: QLocale::TirhutaScript); |
| 474 | break; |
| 475 | case QChar::Script_WarangCiti: |
| 476 | names = getNames(script: QLocale::VarangKshitiScript); |
| 477 | break; |
| 478 | case QChar::Script_Ahom: |
| 479 | names = getNames(script: QLocale::AhomScript); |
| 480 | break; |
| 481 | case QChar::Script_AnatolianHieroglyphs: |
| 482 | names = getNames(script: QLocale::AnatolianHieroglyphsScript); |
| 483 | break; |
| 484 | case QChar::Script_Hatran: |
| 485 | names = getNames(script: QLocale::HatranScript); |
| 486 | break; |
| 487 | case QChar::Script_Multani: |
| 488 | names = getNames(script: QLocale::MultaniScript); |
| 489 | break; |
| 490 | case QChar::Script_OldHungarian: |
| 491 | names = getNames(script: QLocale::OldHungarianScript); |
| 492 | break; |
| 493 | case QChar::Script_Unknown: |
| 494 | case QChar::Script_Inherited: |
| 495 | case QChar::Script_Common: |
| 496 | case QChar::Script_OldTurkic: |
| 497 | case QChar::Script_SignWriting: |
| 498 | break; |
| 499 | default: |
| 500 | qCDebug(SONNET_LOG_CORE) << "Unhandled script" << script; |
| 501 | break; |
| 502 | } |
| 503 | allLanguages.unite(other: QSet<QString>(names.constBegin(), names.constEnd())); |
| 504 | |
| 505 | { // Remove unknown languages |
| 506 | QStringList pruned; |
| 507 | for (const QString &name : std::as_const(t&: names)) { |
| 508 | if (!dictionaryLanguages.contains(value: name)) { |
| 509 | continue; |
| 510 | } |
| 511 | pruned.append(t: name); |
| 512 | } |
| 513 | names = pruned; |
| 514 | } |
| 515 | |
| 516 | if (names.isEmpty()) { |
| 517 | continue; |
| 518 | } |
| 519 | |
| 520 | for (const QString &name : std::as_const(t&: names)) { |
| 521 | s_scriptLanguages.insert(key: script, value: name); |
| 522 | } |
| 523 | } |
| 524 | |
| 525 | // Try to handle some badly named dictionaries |
| 526 | if (!allLanguages.contains(other: s_knownDictionaries)) { |
| 527 | QSet<QString> dicts(s_knownDictionaries); |
| 528 | dicts.subtract(other: allLanguages); |
| 529 | for (const QString &dictName : std::as_const(t&: dicts)) { |
| 530 | QString languageName = QLocale(dictName).name(); |
| 531 | if (languageName.isEmpty()) { |
| 532 | qCWarning(SONNET_LOG_CORE) << "Unable to parse language name" << dictName; |
| 533 | continue; |
| 534 | } |
| 535 | s_dictionaryNameMap[languageName] = dictName; |
| 536 | if (std::find(first: s_scriptLanguages.cbegin(), last: s_scriptLanguages.cend(), val: languageName) == s_scriptLanguages.cend()) { |
| 537 | qCWarning(SONNET_LOG_CORE) << "Unable to handle language from dictionary" << dictName << languageName; |
| 538 | } |
| 539 | } |
| 540 | } |
| 541 | } |
| 542 | |
| 543 | GuessLanguage::GuessLanguage() |
| 544 | : d(new GuessLanguagePrivate) |
| 545 | { |
| 546 | } |
| 547 | |
| 548 | GuessLanguage::~GuessLanguage() = default; |
| 549 | |
| 550 | QString GuessLanguage::identify(const QString &text, const QStringList &suggestionsListIn) const |
| 551 | { |
| 552 | if (text.isEmpty()) { |
| 553 | return QString(); |
| 554 | } |
| 555 | |
| 556 | // Filter for available dictionaries |
| 557 | QStringList suggestionsList; |
| 558 | for (const QString &suggestion : suggestionsListIn) { |
| 559 | if (d->s_knownDictionaries.contains(value: suggestion) && !suggestionsList.contains(str: suggestion)) { |
| 560 | suggestionsList.append(t: suggestion); |
| 561 | } |
| 562 | } |
| 563 | |
| 564 | // Load the model on demand |
| 565 | if (d->s_knownModels.isEmpty()) { |
| 566 | d->loadModels(); |
| 567 | } |
| 568 | |
| 569 | const QList<QChar::Script> scriptsList = d->findRuns(text); |
| 570 | |
| 571 | QStringList candidateLanguages = d->identify(sample: text, scripts: scriptsList); |
| 572 | |
| 573 | // if guessing from trigrams fail |
| 574 | if (candidateLanguages.isEmpty()) { |
| 575 | for (const QChar::Script script : scriptsList) { |
| 576 | const auto languagesList = d->s_scriptLanguages.values(key: script); |
| 577 | for (const QString &lang : languagesList) { |
| 578 | if (!d->s_knownModels.contains(key: lang)) { |
| 579 | candidateLanguages.append(t: lang); |
| 580 | } |
| 581 | } |
| 582 | } |
| 583 | } |
| 584 | |
| 585 | // Hack for some bad dictionary names |
| 586 | for (int i = 0; i < candidateLanguages.count(); i++) { |
| 587 | if (d->s_dictionaryNameMap.contains(key: candidateLanguages[i])) { |
| 588 | candidateLanguages[i] = d->s_dictionaryNameMap.value(key: candidateLanguages[i]); |
| 589 | } |
| 590 | } |
| 591 | |
| 592 | if (candidateLanguages.count() == 1) { |
| 593 | return candidateLanguages.first(); |
| 594 | } |
| 595 | |
| 596 | // Wasn't able to get a good guess with the trigrams, try checking all |
| 597 | // dictionaries for the suggested languages. |
| 598 | candidateLanguages.append(l: suggestionsList); |
| 599 | candidateLanguages.removeDuplicates(); |
| 600 | QString identified = d->guessFromDictionaries(sentence: text, candidates: candidateLanguages); |
| 601 | if (!identified.isEmpty()) { |
| 602 | return identified; |
| 603 | } |
| 604 | |
| 605 | qCDebug(SONNET_LOG_CORE()) << "Unable to identify string with dictionaries:" << text; |
| 606 | |
| 607 | // None of our methods worked, just return the best suggestion |
| 608 | if (!suggestionsList.isEmpty()) { |
| 609 | return suggestionsList.first(); |
| 610 | } |
| 611 | |
| 612 | qCDebug(SONNET_LOG_CORE) << "Unable to find any suggestion for" << text; |
| 613 | |
| 614 | // Not even any suggestions, give up |
| 615 | return QString(); |
| 616 | } |
| 617 | |
| 618 | void GuessLanguage::setLimits(int maxItems, double minConfidence) |
| 619 | { |
| 620 | d->m_maxItems = maxItems; |
| 621 | d->m_minConfidence = minConfidence; |
| 622 | } |
| 623 | |
| 624 | void GuessLanguagePrivate::loadModels() |
| 625 | { |
| 626 | // use trigrams from resource file, easy to deploy on all platforms |
| 627 | const QString triMapFile = QStringLiteral(":/org.kde.sonnet/trigrams.map" ); |
| 628 | qCDebug(SONNET_LOG_CORE) << "Loading trigrams from" << triMapFile; |
| 629 | |
| 630 | QFile sin(triMapFile); |
| 631 | if (!sin.open(flags: QIODevice::ReadOnly)) { |
| 632 | qCWarning(SONNET_LOG_CORE) << "Sonnet: Unable to load trigram models from file" << triMapFile; |
| 633 | return; |
| 634 | } |
| 635 | |
| 636 | QDataStream in(&sin); |
| 637 | in >> s_knownModels; |
| 638 | |
| 639 | // Sanity check |
| 640 | QSet<QString> availableLanguages; |
| 641 | QHashIterator<QString, QHash<QString, int>> iterator(s_knownModels); |
| 642 | while (iterator.hasNext()) { |
| 643 | iterator.next(); |
| 644 | if (iterator.value().count() < MAXGRAMS) { |
| 645 | qCWarning(SONNET_LOG_CORE) << iterator.key() << "is has only" << iterator.value().count() << "trigrams, expected" << MAXGRAMS; |
| 646 | } |
| 647 | availableLanguages.insert(value: iterator.key()); |
| 648 | } |
| 649 | QSet<QString> knownLanguages(s_scriptLanguages.constBegin(), s_scriptLanguages.constEnd()); |
| 650 | knownLanguages.subtract(other: availableLanguages); |
| 651 | if (!knownLanguages.isEmpty()) { |
| 652 | qCDebug(SONNET_LOG_CORE) << "Missing trigrams for languages:" << knownLanguages; |
| 653 | } |
| 654 | } |
| 655 | |
| 656 | QList<QChar::Script> GuessLanguagePrivate::findRuns(const QString &text) |
| 657 | { |
| 658 | QHash<QChar::Script, int> scriptCounts; |
| 659 | |
| 660 | int totalCount = 0; |
| 661 | |
| 662 | for (const QChar c : text) { |
| 663 | const QChar::Script script = c.script(); |
| 664 | |
| 665 | if (script == QChar::Script_Common || script == QChar::Script_Inherited) { |
| 666 | continue; |
| 667 | } |
| 668 | |
| 669 | if (!c.isLetter()) { |
| 670 | continue; |
| 671 | } |
| 672 | |
| 673 | scriptCounts[script]++; |
| 674 | totalCount++; |
| 675 | } |
| 676 | |
| 677 | QList<QChar::Script> relevantScripts; |
| 678 | |
| 679 | if (totalCount == 0) { |
| 680 | return relevantScripts; |
| 681 | } |
| 682 | |
| 683 | if (scriptCounts.size() == 1) { |
| 684 | return {scriptCounts.cbegin().key()}; |
| 685 | } |
| 686 | |
| 687 | for (auto it = scriptCounts.cbegin(); it != scriptCounts.cend(); ++it) { |
| 688 | // return run types that used for 40% or more of the string |
| 689 | const int scriptCount = it.value(); |
| 690 | const auto currentScript = it.key(); |
| 691 | if (scriptCount * 100 / totalCount >= 40) { |
| 692 | relevantScripts << currentScript; |
| 693 | // always return basic latin if found more than 15%. |
| 694 | } else if (currentScript == QChar::Script_Latin && scriptCount * 100 / totalCount >= 15) { |
| 695 | relevantScripts << currentScript; |
| 696 | } |
| 697 | } |
| 698 | |
| 699 | return relevantScripts; |
| 700 | } |
| 701 | |
| 702 | QStringList GuessLanguagePrivate::identify(const QString &sample, const QList<QChar::Script> &scripts) |
| 703 | { |
| 704 | if (sample.size() < MIN_LENGTH) { |
| 705 | return QStringList(); |
| 706 | } |
| 707 | |
| 708 | QStringList guesses; |
| 709 | for (const QChar::Script script : scripts) { |
| 710 | guesses.append(other: guessFromTrigrams(sample, langs: s_scriptLanguages.values(key: script))); |
| 711 | } |
| 712 | |
| 713 | return guesses; |
| 714 | } |
| 715 | |
| 716 | QStringList GuessLanguagePrivate::guessFromTrigrams(const QString &sample, const QStringList &languages) |
| 717 | { |
| 718 | QStringList ret; |
| 719 | |
| 720 | const QList<QString> sampleTrigrams = createOrderedModel(content: sample); |
| 721 | |
| 722 | // Sort by score |
| 723 | QMultiMap<int, QString> scores; |
| 724 | for (const QString &language : languages) { |
| 725 | if (s_knownModels.contains(key: language)) { |
| 726 | scores.insert(key: distance(model: sampleTrigrams, knownModel: s_knownModels[language]), value: language); |
| 727 | } |
| 728 | } |
| 729 | |
| 730 | // Skip if either no results or best result is completely unknown (distance >= maxdistance) |
| 731 | if (scores.isEmpty() || scores.firstKey() >= MAXGRAMS * sampleTrigrams.size()) { |
| 732 | qCDebug(SONNET_LOG_CORE) << "No scores for" << sample; |
| 733 | return ret; |
| 734 | } |
| 735 | |
| 736 | int counter = 0; |
| 737 | double confidence = 0; |
| 738 | |
| 739 | QMultiMapIterator<int, QString> it(scores); |
| 740 | it.next(); |
| 741 | |
| 742 | QString prevItem = it.value(); |
| 743 | int prevScore = it.key(); |
| 744 | |
| 745 | while (it.hasNext() && counter < m_maxItems && confidence < m_minConfidence) { |
| 746 | it.next(); |
| 747 | counter++; |
| 748 | confidence += (it.key() - prevScore) / (double)it.key(); |
| 749 | ret += prevItem; |
| 750 | prevItem = it.value(); |
| 751 | prevScore = it.key(); |
| 752 | } |
| 753 | if (counter < m_maxItems && confidence < m_minConfidence) { |
| 754 | ret += prevItem; |
| 755 | } |
| 756 | |
| 757 | return ret; |
| 758 | } |
| 759 | |
| 760 | QList<QString> GuessLanguagePrivate::createOrderedModel(const QString &content) |
| 761 | { |
| 762 | QHash<QString, int> trigramCounts; |
| 763 | |
| 764 | // collect trigrams |
| 765 | trigramCounts.reserve(size: content.size() - 2); |
| 766 | for (int i = 0; i < (content.size() - 2); ++i) { |
| 767 | QString tri = content.mid(position: i, n: 3).toLower(); |
| 768 | trigramCounts[tri]++; |
| 769 | } |
| 770 | |
| 771 | // invert the map <freq, trigram> |
| 772 | QList<QPair<int, QString>> trigramFrequencyList; |
| 773 | trigramFrequencyList.reserve(asize: trigramCounts.size()); |
| 774 | |
| 775 | auto it = trigramCounts.constBegin(); |
| 776 | for (; it != trigramCounts.constEnd(); ++it) { |
| 777 | const QChar *data = it.key().constData(); |
| 778 | bool hasTwoSpaces = (data[1].isSpace() && (data[0].isSpace() || data[2].isSpace())); |
| 779 | |
| 780 | if (!hasTwoSpaces) { |
| 781 | const int freq = it.value(); |
| 782 | const QString &trigram = it.key(); |
| 783 | trigramFrequencyList.append(t: {freq, trigram}); |
| 784 | } |
| 785 | } |
| 786 | |
| 787 | // sort descending by frequency |
| 788 | std::sort(first: trigramFrequencyList.begin(), last: trigramFrequencyList.end(), comp: [](const QPair<int, QString> &a, const QPair<int, QString> &b) { |
| 789 | return a.first > b.first; |
| 790 | }); |
| 791 | |
| 792 | QList<QString> orderedTrigrams; |
| 793 | orderedTrigrams.reserve(asize: trigramFrequencyList.size()); |
| 794 | for (const auto &tri : std::as_const(t&: trigramFrequencyList)) { |
| 795 | orderedTrigrams.append(t: tri.second); |
| 796 | } |
| 797 | |
| 798 | return orderedTrigrams; |
| 799 | } |
| 800 | |
| 801 | int GuessLanguagePrivate::distance(const QList<QString> &model, const QHash<QString, int> &knownModel) |
| 802 | { |
| 803 | int counter = -1; |
| 804 | int dist = 0; |
| 805 | |
| 806 | for (const QString &trigram : model) { |
| 807 | const int val = knownModel.value(key: trigram, defaultValue: -1); |
| 808 | if (val != -1) { |
| 809 | dist += qAbs(t: ++counter - val); |
| 810 | } else { |
| 811 | dist += MAXGRAMS; |
| 812 | } |
| 813 | |
| 814 | if (counter == (MAXGRAMS - 1)) { |
| 815 | break; |
| 816 | } |
| 817 | } |
| 818 | |
| 819 | return dist; |
| 820 | } |
| 821 | |
| 822 | QString GuessLanguagePrivate::guessFromDictionaries(const QString &sentence, const QStringList &candidates) |
| 823 | { |
| 824 | // Try to see how many languages we can get spell checking for |
| 825 | QList<QSharedPointer<SpellerPlugin>> spellers; |
| 826 | for (const QString &lang : candidates) { |
| 827 | if (!Loader::openLoader()->languages().contains(str: lang)) { |
| 828 | qCWarning(SONNET_LOG_CORE) << "Dictionary asked for invalid speller" << lang; |
| 829 | continue; |
| 830 | } |
| 831 | QSharedPointer<SpellerPlugin> plugin = Loader::openLoader()->cachedSpeller(language: lang); |
| 832 | if (!plugin.isNull()) { |
| 833 | spellers.append(t: plugin); |
| 834 | } |
| 835 | } |
| 836 | |
| 837 | // If there's no spell checkers, give up |
| 838 | if (spellers.isEmpty()) { |
| 839 | return QString(); |
| 840 | } |
| 841 | |
| 842 | QMap<QString, int> correctHits; |
| 843 | |
| 844 | WordTokenizer tokenizer(sentence); |
| 845 | while (tokenizer.hasNext()) { |
| 846 | Token word = tokenizer.next(); |
| 847 | if (!tokenizer.isSpellcheckable()) { |
| 848 | continue; |
| 849 | } |
| 850 | |
| 851 | for (int i = 0; i < spellers.count(); ++i) { |
| 852 | if (spellers[i]->isCorrect(word: word.toString())) { |
| 853 | correctHits[spellers[i]->language()]++; |
| 854 | } |
| 855 | } |
| 856 | } |
| 857 | |
| 858 | if (correctHits.isEmpty()) { |
| 859 | return QString(); |
| 860 | } |
| 861 | |
| 862 | QMap<QString, int>::const_iterator max = correctHits.constBegin(); |
| 863 | for (QMap<QString, int>::const_iterator itr = correctHits.constBegin(); itr != correctHits.constEnd(); ++itr) { |
| 864 | if (itr.value() > max.value()) { |
| 865 | max = itr; |
| 866 | } |
| 867 | } |
| 868 | return max.key(); |
| 869 | } |
| 870 | } |
| 871 | |