1 | /* This file is part of the KDE libraries |
2 | SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net> |
3 | SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl> |
4 | SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org> |
5 | |
6 | SPDX-License-Identifier: LGPL-2.0-or-later |
7 | */ |
8 | |
9 | #include <QCoreApplication> |
10 | #include <QDataStream> |
11 | #include <QFile> |
12 | #include <QLocale> |
13 | #include <QStandardPaths> |
14 | |
15 | #include "core_debug.h" |
16 | #include "guesslanguage.h" |
17 | #include "loader_p.h" |
18 | #include "speller.h" |
19 | #include "spellerplugin_p.h" |
20 | #include "tokenizer_p.h" |
21 | |
22 | /* |
23 | All language tags should be valid according to IETF BCP 47, as codified in RFC 4646. |
24 | ISO 639-1 codes should be used for the language part except for cases where there |
25 | exists no code, then 639-3 codes should be used. Country codes should only be used |
26 | in special cases. Scripts can be differentiated by IANA subtags, available here: |
27 | http://www.iana.org/assignments/language-subtag-registry |
28 | The script tags correspond to ISO 15924 |
29 | |
30 | An overview of the best practices concerning language tagging is available here: |
31 | http://www.w3.org/International/articles/language-tags/Overview.en.php |
32 | |
33 | lang tags should use underscores (_) rather than hyphens (-) to separate subsections. |
34 | |
35 | EXCEPTIONS: |
36 | For cases of known differences from the above tagging scheme and major |
37 | spellcheckers such as aspell/hunspell/myspell, the scheme used by the spell checkers |
38 | shall be used. All exceptions shall be noted here: |
39 | |
40 | BCP SPELLCHECK |
41 | az-Latn az |
42 | |
43 | */ |
44 | |
45 | namespace Sonnet |
46 | { |
47 | class GuessLanguagePrivate |
48 | { |
49 | public: |
50 | GuessLanguagePrivate(); |
51 | // language trigram score |
52 | static QHash<QString, QHash<QString, int>> s_knownModels; |
53 | |
54 | void loadModels(); |
55 | QList<QChar::Script> findRuns(const QString &text); |
56 | QList<QString> createOrderedModel(const QString &content); |
57 | int distance(const QList<QString> &model, const QHash<QString, int> &knownModel); |
58 | QStringList guessFromTrigrams(const QString &sample, const QStringList &langs); |
59 | QStringList identify(const QString &sample, const QList<QChar::Script> &scripts); |
60 | QString guessFromDictionaries(const QString &sentence, const QStringList &candidates); |
61 | |
62 | static QSet<QString> s_knownDictionaries; |
63 | static QMultiHash<QChar::Script, QString> s_scriptLanguages; |
64 | static QMap<QString, QString> s_dictionaryNameMap; |
65 | |
66 | const int MIN_LENGTH; |
67 | int m_maxItems; |
68 | double m_minConfidence; |
69 | }; |
70 | |
71 | QHash<QString, QHash<QString, int>> GuessLanguagePrivate::s_knownModels; |
72 | QSet<QString> GuessLanguagePrivate::s_knownDictionaries; |
73 | QMultiHash<QChar::Script, QString> GuessLanguagePrivate::s_scriptLanguages; |
74 | QMap<QString, QString> GuessLanguagePrivate::s_dictionaryNameMap; |
75 | |
76 | QStringList getNames(QLocale::Script script) |
77 | { |
78 | QStringList locales; |
79 | const auto matchingLocales = QLocale::matchingLocales(language: QLocale::AnyLanguage, script, territory: QLocale::AnyCountry); |
80 | locales.reserve(asize: matchingLocales.size()); |
81 | for (const QLocale &locale : matchingLocales) { |
82 | locales << locale.name(); |
83 | } |
84 | return locales; |
85 | } |
86 | |
87 | GuessLanguagePrivate::GuessLanguagePrivate() |
88 | : MIN_LENGTH(5) |
89 | , m_maxItems(1) |
90 | , m_minConfidence(0) |
91 | { |
92 | if (!s_scriptLanguages.isEmpty()) { |
93 | return; |
94 | } |
95 | |
96 | const QStringList languages = Loader::openLoader()->languages(); |
97 | s_knownDictionaries = QSet<QString>(languages.begin(), languages.end()); |
98 | QSet<QString> dictionaryLanguages; |
99 | for (const QString &dictName : std::as_const(t&: s_knownDictionaries)) { |
100 | QString languageName = QLocale(dictName).name(); |
101 | if (languageName.isEmpty()) { |
102 | qCWarning(SONNET_LOG_CORE) << "Unable to parse name for dictionary" << dictName; |
103 | continue; |
104 | } |
105 | dictionaryLanguages.insert(value: languageName); |
106 | } |
107 | |
108 | QSet<QString> allLanguages; |
109 | for (int i = 0; i < int(QChar::ScriptCount); i++) { |
110 | QChar::Script script = static_cast<QChar::Script>(i); |
111 | QStringList names; |
112 | switch (script) { |
113 | case QChar::Script_Latin: |
114 | names = getNames(script: QLocale::LatinScript); |
115 | break; |
116 | case QChar::Script_Greek: |
117 | names = getNames(script: QLocale::GreekScript); |
118 | break; |
119 | case QChar::Script_Cyrillic: |
120 | names = getNames(script: QLocale::CyrillicScript); |
121 | break; |
122 | case QChar::Script_Armenian: |
123 | names = getNames(script: QLocale::ArmenianScript); |
124 | break; |
125 | case QChar::Script_Hebrew: |
126 | names = getNames(script: QLocale::HebrewScript); |
127 | break; |
128 | case QChar::Script_Arabic: |
129 | names = getNames(script: QLocale::ArabicScript); |
130 | break; |
131 | case QChar::Script_Syriac: |
132 | names = getNames(script: QLocale::SyriacScript); |
133 | break; |
134 | case QChar::Script_Thaana: |
135 | names = getNames(script: QLocale::ThaanaScript); |
136 | break; |
137 | case QChar::Script_Devanagari: |
138 | names = getNames(script: QLocale::DevanagariScript); |
139 | break; |
140 | case QChar::Script_Bengali: |
141 | names = getNames(script: QLocale::BengaliScript); |
142 | break; |
143 | case QChar::Script_Gurmukhi: |
144 | names = getNames(script: QLocale::GurmukhiScript); |
145 | break; |
146 | case QChar::Script_Gujarati: |
147 | names = getNames(script: QLocale::GujaratiScript); |
148 | break; |
149 | case QChar::Script_Oriya: |
150 | names = getNames(script: QLocale::OriyaScript); |
151 | break; |
152 | case QChar::Script_Tamil: |
153 | names = getNames(script: QLocale::TamilScript); |
154 | break; |
155 | case QChar::Script_Telugu: |
156 | names = getNames(script: QLocale::TeluguScript); |
157 | break; |
158 | case QChar::Script_Kannada: |
159 | names = getNames(script: QLocale::KannadaScript); |
160 | break; |
161 | case QChar::Script_Malayalam: |
162 | names = getNames(script: QLocale::MalayalamScript); |
163 | break; |
164 | case QChar::Script_Sinhala: |
165 | names = getNames(script: QLocale::SinhalaScript); |
166 | break; |
167 | case QChar::Script_Thai: |
168 | names = getNames(script: QLocale::ThaiScript); |
169 | break; |
170 | case QChar::Script_Lao: |
171 | names = getNames(script: QLocale::LaoScript); |
172 | break; |
173 | case QChar::Script_Tibetan: |
174 | names = getNames(script: QLocale::TibetanScript); |
175 | break; |
176 | case QChar::Script_Myanmar: |
177 | names = getNames(script: QLocale::MyanmarScript); |
178 | break; |
179 | case QChar::Script_Georgian: |
180 | names = getNames(script: QLocale::GeorgianScript); |
181 | break; |
182 | case QChar::Script_Hangul: |
183 | names = getNames(script: QLocale::HangulScript); |
184 | break; |
185 | case QChar::Script_Ethiopic: |
186 | names = getNames(script: QLocale::EthiopicScript); |
187 | break; |
188 | case QChar::Script_Cherokee: |
189 | names = getNames(script: QLocale::CherokeeScript); |
190 | break; |
191 | case QChar::Script_CanadianAboriginal: |
192 | names = getNames(script: QLocale::CanadianAboriginalScript); |
193 | break; |
194 | case QChar::Script_Ogham: |
195 | names = getNames(script: QLocale::OghamScript); |
196 | break; |
197 | case QChar::Script_Runic: |
198 | names = getNames(script: QLocale::RunicScript); |
199 | break; |
200 | case QChar::Script_Khmer: |
201 | names = getNames(script: QLocale::KhmerScript); |
202 | break; |
203 | case QChar::Script_Mongolian: |
204 | names = getNames(script: QLocale::MongolianScript); |
205 | break; |
206 | case QChar::Script_Hiragana: |
207 | names = getNames(script: QLocale::HiraganaScript); |
208 | break; |
209 | case QChar::Script_Katakana: |
210 | names = getNames(script: QLocale::KatakanaScript); |
211 | break; |
212 | case QChar::Script_Bopomofo: |
213 | names = getNames(script: QLocale::BopomofoScript); |
214 | break; |
215 | case QChar::Script_Han: |
216 | names = getNames(script: QLocale::HanScript); |
217 | break; |
218 | case QChar::Script_Yi: |
219 | names = getNames(script: QLocale::YiScript); |
220 | break; |
221 | case QChar::Script_OldItalic: |
222 | names = getNames(script: QLocale::OldItalicScript); |
223 | break; |
224 | case QChar::Script_Gothic: |
225 | names = getNames(script: QLocale::GothicScript); |
226 | break; |
227 | case QChar::Script_Deseret: |
228 | names = getNames(script: QLocale::DeseretScript); |
229 | break; |
230 | case QChar::Script_Tagalog: |
231 | names = getNames(script: QLocale::TagalogScript); |
232 | break; |
233 | case QChar::Script_Hanunoo: |
234 | names = getNames(script: QLocale::HanunooScript); |
235 | break; |
236 | case QChar::Script_Buhid: |
237 | names = getNames(script: QLocale::BuhidScript); |
238 | break; |
239 | case QChar::Script_Tagbanwa: |
240 | names = getNames(script: QLocale::TagbanwaScript); |
241 | break; |
242 | case QChar::Script_Coptic: |
243 | names = getNames(script: QLocale::CopticScript); |
244 | break; |
245 | case QChar::Script_Limbu: |
246 | names = getNames(script: QLocale::LimbuScript); |
247 | break; |
248 | case QChar::Script_TaiLe: |
249 | names = getNames(script: QLocale::TaiLeScript); |
250 | break; |
251 | case QChar::Script_LinearB: |
252 | names = getNames(script: QLocale::LinearBScript); |
253 | break; |
254 | case QChar::Script_Ugaritic: |
255 | names = getNames(script: QLocale::UgariticScript); |
256 | break; |
257 | case QChar::Script_Shavian: |
258 | names = getNames(script: QLocale::ShavianScript); |
259 | break; |
260 | case QChar::Script_Osmanya: |
261 | names = getNames(script: QLocale::OsmanyaScript); |
262 | break; |
263 | case QChar::Script_Cypriot: |
264 | names = getNames(script: QLocale::CypriotScript); |
265 | break; |
266 | case QChar::Script_Braille: |
267 | names = getNames(script: QLocale::BrailleScript); |
268 | break; |
269 | case QChar::Script_Buginese: |
270 | names = getNames(script: QLocale::BugineseScript); |
271 | break; |
272 | case QChar::Script_NewTaiLue: |
273 | names = getNames(script: QLocale::NewTaiLueScript); |
274 | break; |
275 | case QChar::Script_Glagolitic: |
276 | names = getNames(script: QLocale::GlagoliticScript); |
277 | break; |
278 | case QChar::Script_Tifinagh: |
279 | names = getNames(script: QLocale::TifinaghScript); |
280 | break; |
281 | case QChar::Script_SylotiNagri: |
282 | names = getNames(script: QLocale::SylotiNagriScript); |
283 | break; |
284 | case QChar::Script_OldPersian: |
285 | names = getNames(script: QLocale::OldPersianScript); |
286 | break; |
287 | case QChar::Script_Kharoshthi: |
288 | names = getNames(script: QLocale::KharoshthiScript); |
289 | break; |
290 | case QChar::Script_Balinese: |
291 | names = getNames(script: QLocale::BalineseScript); |
292 | break; |
293 | case QChar::Script_Cuneiform: |
294 | names = getNames(script: QLocale::CuneiformScript); |
295 | break; |
296 | case QChar::Script_Phoenician: |
297 | names = getNames(script: QLocale::PhoenicianScript); |
298 | break; |
299 | case QChar::Script_PhagsPa: |
300 | names = getNames(script: QLocale::PhagsPaScript); |
301 | break; |
302 | case QChar::Script_Nko: |
303 | names = getNames(script: QLocale::NkoScript); |
304 | break; |
305 | case QChar::Script_Sundanese: |
306 | names = getNames(script: QLocale::SundaneseScript); |
307 | break; |
308 | case QChar::Script_Lepcha: |
309 | names = getNames(script: QLocale::LepchaScript); |
310 | break; |
311 | case QChar::Script_OlChiki: |
312 | names = getNames(script: QLocale::OlChikiScript); |
313 | break; |
314 | case QChar::Script_Vai: |
315 | names = getNames(script: QLocale::VaiScript); |
316 | break; |
317 | case QChar::Script_Saurashtra: |
318 | names = getNames(script: QLocale::SaurashtraScript); |
319 | break; |
320 | case QChar::Script_KayahLi: |
321 | names = getNames(script: QLocale::KayahLiScript); |
322 | break; |
323 | case QChar::Script_Rejang: |
324 | names = getNames(script: QLocale::RejangScript); |
325 | break; |
326 | case QChar::Script_Lycian: |
327 | names = getNames(script: QLocale::LycianScript); |
328 | break; |
329 | case QChar::Script_Carian: |
330 | names = getNames(script: QLocale::CarianScript); |
331 | break; |
332 | case QChar::Script_Lydian: |
333 | names = getNames(script: QLocale::LydianScript); |
334 | break; |
335 | case QChar::Script_Cham: |
336 | names = getNames(script: QLocale::ChamScript); |
337 | break; |
338 | case QChar::Script_TaiTham: |
339 | names = getNames(script: QLocale::LannaScript); |
340 | break; |
341 | case QChar::Script_TaiViet: |
342 | names = getNames(script: QLocale::TaiVietScript); |
343 | break; |
344 | case QChar::Script_Avestan: |
345 | names = getNames(script: QLocale::AvestanScript); |
346 | break; |
347 | case QChar::Script_EgyptianHieroglyphs: |
348 | names = getNames(script: QLocale::EgyptianHieroglyphsScript); |
349 | break; |
350 | case QChar::Script_Samaritan: |
351 | names = getNames(script: QLocale::SamaritanScript); |
352 | break; |
353 | case QChar::Script_Lisu: |
354 | names = getNames(script: QLocale::FraserScript); |
355 | break; |
356 | case QChar::Script_Bamum: |
357 | names = getNames(script: QLocale::BamumScript); |
358 | break; |
359 | case QChar::Script_Javanese: |
360 | names = getNames(script: QLocale::JavaneseScript); |
361 | break; |
362 | case QChar::Script_MeeteiMayek: |
363 | names = getNames(script: QLocale::MeiteiMayekScript); |
364 | break; |
365 | case QChar::Script_ImperialAramaic: |
366 | names = getNames(script: QLocale::ImperialAramaicScript); |
367 | break; |
368 | case QChar::Script_OldSouthArabian: |
369 | names = getNames(script: QLocale::OldSouthArabianScript); |
370 | break; |
371 | case QChar::Script_InscriptionalParthian: |
372 | names = getNames(script: QLocale::InscriptionalParthianScript); |
373 | break; |
374 | case QChar::Script_InscriptionalPahlavi: |
375 | names = getNames(script: QLocale::InscriptionalPahlaviScript); |
376 | break; |
377 | case QChar::Script_Kaithi: |
378 | names = getNames(script: QLocale::KaithiScript); |
379 | break; |
380 | case QChar::Script_Batak: |
381 | names = getNames(script: QLocale::BatakScript); |
382 | break; |
383 | case QChar::Script_Brahmi: |
384 | names = getNames(script: QLocale::BrahmiScript); |
385 | break; |
386 | case QChar::Script_Mandaic: |
387 | names = getNames(script: QLocale::MandaeanScript); |
388 | break; |
389 | case QChar::Script_Chakma: |
390 | names = getNames(script: QLocale::ChakmaScript); |
391 | break; |
392 | case QChar::Script_MeroiticCursive: |
393 | case QChar::Script_MeroiticHieroglyphs: |
394 | names = getNames(script: QLocale::MeroiticCursiveScript); |
395 | names.append(other: getNames(script: QLocale::MeroiticScript)); |
396 | break; |
397 | case QChar::Script_Miao: |
398 | names = getNames(script: QLocale::PollardPhoneticScript); |
399 | break; |
400 | case QChar::Script_Sharada: |
401 | names = getNames(script: QLocale::SharadaScript); |
402 | break; |
403 | case QChar::Script_SoraSompeng: |
404 | names = getNames(script: QLocale::SoraSompengScript); |
405 | break; |
406 | case QChar::Script_Takri: |
407 | names = getNames(script: QLocale::TakriScript); |
408 | break; |
409 | case QChar::Script_CaucasianAlbanian: |
410 | names = getNames(script: QLocale::CaucasianAlbanianScript); |
411 | break; |
412 | case QChar::Script_BassaVah: |
413 | names = getNames(script: QLocale::BassaVahScript); |
414 | break; |
415 | case QChar::Script_Duployan: |
416 | names = getNames(script: QLocale::DuployanScript); |
417 | break; |
418 | case QChar::Script_Elbasan: |
419 | names = getNames(script: QLocale::ElbasanScript); |
420 | break; |
421 | case QChar::Script_Grantha: |
422 | names = getNames(script: QLocale::GranthaScript); |
423 | break; |
424 | case QChar::Script_PahawhHmong: |
425 | names = getNames(script: QLocale::PahawhHmongScript); |
426 | break; |
427 | case QChar::Script_Khojki: |
428 | names = getNames(script: QLocale::KhojkiScript); |
429 | break; |
430 | case QChar::Script_LinearA: |
431 | names = getNames(script: QLocale::LinearAScript); |
432 | break; |
433 | case QChar::Script_Mahajani: |
434 | names = getNames(script: QLocale::MahajaniScript); |
435 | break; |
436 | case QChar::Script_Manichaean: |
437 | names = getNames(script: QLocale::ManichaeanScript); |
438 | break; |
439 | case QChar::Script_MendeKikakui: |
440 | names = getNames(script: QLocale::MendeKikakuiScript); |
441 | break; |
442 | case QChar::Script_Modi: |
443 | names = getNames(script: QLocale::ModiScript); |
444 | break; |
445 | case QChar::Script_Mro: |
446 | names = getNames(script: QLocale::MroScript); |
447 | break; |
448 | case QChar::Script_OldNorthArabian: |
449 | names = getNames(script: QLocale::OldNorthArabianScript); |
450 | break; |
451 | case QChar::Script_Nabataean: |
452 | names = getNames(script: QLocale::NabataeanScript); |
453 | break; |
454 | case QChar::Script_Palmyrene: |
455 | names = getNames(script: QLocale::PalmyreneScript); |
456 | break; |
457 | case QChar::Script_PauCinHau: |
458 | names = getNames(script: QLocale::PauCinHauScript); |
459 | break; |
460 | case QChar::Script_OldPermic: |
461 | names = getNames(script: QLocale::OldPermicScript); |
462 | break; |
463 | case QChar::Script_PsalterPahlavi: |
464 | names = getNames(script: QLocale::PsalterPahlaviScript); |
465 | break; |
466 | case QChar::Script_Siddham: |
467 | names = getNames(script: QLocale::SiddhamScript); |
468 | break; |
469 | case QChar::Script_Khudawadi: |
470 | names = getNames(script: QLocale::KhudawadiScript); |
471 | break; |
472 | case QChar::Script_Tirhuta: |
473 | names = getNames(script: QLocale::TirhutaScript); |
474 | break; |
475 | case QChar::Script_WarangCiti: |
476 | names = getNames(script: QLocale::VarangKshitiScript); |
477 | break; |
478 | case QChar::Script_Ahom: |
479 | names = getNames(script: QLocale::AhomScript); |
480 | break; |
481 | case QChar::Script_AnatolianHieroglyphs: |
482 | names = getNames(script: QLocale::AnatolianHieroglyphsScript); |
483 | break; |
484 | case QChar::Script_Hatran: |
485 | names = getNames(script: QLocale::HatranScript); |
486 | break; |
487 | case QChar::Script_Multani: |
488 | names = getNames(script: QLocale::MultaniScript); |
489 | break; |
490 | case QChar::Script_OldHungarian: |
491 | names = getNames(script: QLocale::OldHungarianScript); |
492 | break; |
493 | case QChar::Script_Unknown: |
494 | case QChar::Script_Inherited: |
495 | case QChar::Script_Common: |
496 | case QChar::Script_OldTurkic: |
497 | case QChar::Script_SignWriting: |
498 | break; |
499 | default: |
500 | qCDebug(SONNET_LOG_CORE) << "Unhandled script" << script; |
501 | break; |
502 | } |
503 | allLanguages.unite(other: QSet<QString>(names.constBegin(), names.constEnd())); |
504 | |
505 | { // Remove unknown languages |
506 | QStringList pruned; |
507 | for (const QString &name : std::as_const(t&: names)) { |
508 | if (!dictionaryLanguages.contains(value: name)) { |
509 | continue; |
510 | } |
511 | pruned.append(t: name); |
512 | } |
513 | names = pruned; |
514 | } |
515 | |
516 | if (names.isEmpty()) { |
517 | continue; |
518 | } |
519 | |
520 | for (const QString &name : std::as_const(t&: names)) { |
521 | s_scriptLanguages.insert(key: script, value: name); |
522 | } |
523 | } |
524 | |
525 | // Try to handle some badly named dictionaries |
526 | if (!allLanguages.contains(other: s_knownDictionaries)) { |
527 | QSet<QString> dicts(s_knownDictionaries); |
528 | dicts.subtract(other: allLanguages); |
529 | for (const QString &dictName : std::as_const(t&: dicts)) { |
530 | QString languageName = QLocale(dictName).name(); |
531 | if (languageName.isEmpty()) { |
532 | qCWarning(SONNET_LOG_CORE) << "Unable to parse language name" << dictName; |
533 | continue; |
534 | } |
535 | s_dictionaryNameMap[languageName] = dictName; |
536 | if (std::find(first: s_scriptLanguages.cbegin(), last: s_scriptLanguages.cend(), val: languageName) == s_scriptLanguages.cend()) { |
537 | qCWarning(SONNET_LOG_CORE) << "Unable to handle language from dictionary" << dictName << languageName; |
538 | } |
539 | } |
540 | } |
541 | } |
542 | |
543 | GuessLanguage::GuessLanguage() |
544 | : d(new GuessLanguagePrivate) |
545 | { |
546 | } |
547 | |
548 | GuessLanguage::~GuessLanguage() = default; |
549 | |
550 | QString GuessLanguage::identify(const QString &text, const QStringList &suggestionsListIn) const |
551 | { |
552 | if (text.isEmpty()) { |
553 | return QString(); |
554 | } |
555 | |
556 | // Filter for available dictionaries |
557 | QStringList suggestionsList; |
558 | for (const QString &suggestion : suggestionsListIn) { |
559 | if (d->s_knownDictionaries.contains(value: suggestion) && !suggestionsList.contains(str: suggestion)) { |
560 | suggestionsList.append(t: suggestion); |
561 | } |
562 | } |
563 | |
564 | // Load the model on demand |
565 | if (d->s_knownModels.isEmpty()) { |
566 | d->loadModels(); |
567 | } |
568 | |
569 | const QList<QChar::Script> scriptsList = d->findRuns(text); |
570 | |
571 | QStringList candidateLanguages = d->identify(sample: text, scripts: scriptsList); |
572 | |
573 | // if guessing from trigrams fail |
574 | if (candidateLanguages.isEmpty()) { |
575 | for (const QChar::Script script : scriptsList) { |
576 | const auto languagesList = d->s_scriptLanguages.values(key: script); |
577 | for (const QString &lang : languagesList) { |
578 | if (!d->s_knownModels.contains(key: lang)) { |
579 | candidateLanguages.append(t: lang); |
580 | } |
581 | } |
582 | } |
583 | } |
584 | |
585 | // Hack for some bad dictionary names |
586 | for (int i = 0; i < candidateLanguages.count(); i++) { |
587 | if (d->s_dictionaryNameMap.contains(key: candidateLanguages[i])) { |
588 | candidateLanguages[i] = d->s_dictionaryNameMap.value(key: candidateLanguages[i]); |
589 | } |
590 | } |
591 | |
592 | if (candidateLanguages.count() == 1) { |
593 | return candidateLanguages.first(); |
594 | } |
595 | |
596 | // Wasn't able to get a good guess with the trigrams, try checking all |
597 | // dictionaries for the suggested languages. |
598 | candidateLanguages.append(l: suggestionsList); |
599 | candidateLanguages.removeDuplicates(); |
600 | QString identified = d->guessFromDictionaries(sentence: text, candidates: candidateLanguages); |
601 | if (!identified.isEmpty()) { |
602 | return identified; |
603 | } |
604 | |
605 | qCDebug(SONNET_LOG_CORE()) << "Unable to identify string with dictionaries:" << text; |
606 | |
607 | // None of our methods worked, just return the best suggestion |
608 | if (!suggestionsList.isEmpty()) { |
609 | return suggestionsList.first(); |
610 | } |
611 | |
612 | qCDebug(SONNET_LOG_CORE) << "Unable to find any suggestion for" << text; |
613 | |
614 | // Not even any suggestions, give up |
615 | return QString(); |
616 | } |
617 | |
618 | void GuessLanguage::setLimits(int maxItems, double minConfidence) |
619 | { |
620 | d->m_maxItems = maxItems; |
621 | d->m_minConfidence = minConfidence; |
622 | } |
623 | |
624 | void GuessLanguagePrivate::loadModels() |
625 | { |
626 | // use trigrams from resource file, easy to deploy on all platforms |
627 | const QString triMapFile = QStringLiteral(":/org.kde.sonnet/trigrams.map" ); |
628 | qCDebug(SONNET_LOG_CORE) << "Loading trigrams from" << triMapFile; |
629 | |
630 | QFile sin(triMapFile); |
631 | if (!sin.open(flags: QIODevice::ReadOnly)) { |
632 | qCWarning(SONNET_LOG_CORE) << "Sonnet: Unable to load trigram models from file" << triMapFile; |
633 | return; |
634 | } |
635 | |
636 | QDataStream in(&sin); |
637 | in >> s_knownModels; |
638 | |
639 | // Sanity check |
640 | QSet<QString> availableLanguages; |
641 | QHashIterator<QString, QHash<QString, int>> iterator(s_knownModels); |
642 | while (iterator.hasNext()) { |
643 | iterator.next(); |
644 | if (iterator.value().count() < MAXGRAMS) { |
645 | qCWarning(SONNET_LOG_CORE) << iterator.key() << "is has only" << iterator.value().count() << "trigrams, expected" << MAXGRAMS; |
646 | } |
647 | availableLanguages.insert(value: iterator.key()); |
648 | } |
649 | QSet<QString> knownLanguages(s_scriptLanguages.constBegin(), s_scriptLanguages.constEnd()); |
650 | knownLanguages.subtract(other: availableLanguages); |
651 | if (!knownLanguages.isEmpty()) { |
652 | qCDebug(SONNET_LOG_CORE) << "Missing trigrams for languages:" << knownLanguages; |
653 | } |
654 | } |
655 | |
656 | QList<QChar::Script> GuessLanguagePrivate::findRuns(const QString &text) |
657 | { |
658 | QHash<QChar::Script, int> scriptCounts; |
659 | |
660 | int totalCount = 0; |
661 | |
662 | for (const QChar c : text) { |
663 | const QChar::Script script = c.script(); |
664 | |
665 | if (script == QChar::Script_Common || script == QChar::Script_Inherited) { |
666 | continue; |
667 | } |
668 | |
669 | if (!c.isLetter()) { |
670 | continue; |
671 | } |
672 | |
673 | scriptCounts[script]++; |
674 | totalCount++; |
675 | } |
676 | |
677 | QList<QChar::Script> relevantScripts; |
678 | |
679 | if (totalCount == 0) { |
680 | return relevantScripts; |
681 | } |
682 | |
683 | if (scriptCounts.size() == 1) { |
684 | return {scriptCounts.cbegin().key()}; |
685 | } |
686 | |
687 | for (auto it = scriptCounts.cbegin(); it != scriptCounts.cend(); ++it) { |
688 | // return run types that used for 40% or more of the string |
689 | const int scriptCount = it.value(); |
690 | const auto currentScript = it.key(); |
691 | if (scriptCount * 100 / totalCount >= 40) { |
692 | relevantScripts << currentScript; |
693 | // always return basic latin if found more than 15%. |
694 | } else if (currentScript == QChar::Script_Latin && scriptCount * 100 / totalCount >= 15) { |
695 | relevantScripts << currentScript; |
696 | } |
697 | } |
698 | |
699 | return relevantScripts; |
700 | } |
701 | |
702 | QStringList GuessLanguagePrivate::identify(const QString &sample, const QList<QChar::Script> &scripts) |
703 | { |
704 | if (sample.size() < MIN_LENGTH) { |
705 | return QStringList(); |
706 | } |
707 | |
708 | QStringList guesses; |
709 | for (const QChar::Script script : scripts) { |
710 | guesses.append(other: guessFromTrigrams(sample, langs: s_scriptLanguages.values(key: script))); |
711 | } |
712 | |
713 | return guesses; |
714 | } |
715 | |
716 | QStringList GuessLanguagePrivate::guessFromTrigrams(const QString &sample, const QStringList &languages) |
717 | { |
718 | QStringList ret; |
719 | |
720 | const QList<QString> sampleTrigrams = createOrderedModel(content: sample); |
721 | |
722 | // Sort by score |
723 | QMultiMap<int, QString> scores; |
724 | for (const QString &language : languages) { |
725 | if (s_knownModels.contains(key: language)) { |
726 | scores.insert(key: distance(model: sampleTrigrams, knownModel: s_knownModels[language]), value: language); |
727 | } |
728 | } |
729 | |
730 | // Skip if either no results or best result is completely unknown (distance >= maxdistance) |
731 | if (scores.isEmpty() || scores.firstKey() >= MAXGRAMS * sampleTrigrams.size()) { |
732 | qCDebug(SONNET_LOG_CORE) << "No scores for" << sample; |
733 | return ret; |
734 | } |
735 | |
736 | int counter = 0; |
737 | double confidence = 0; |
738 | |
739 | QMultiMapIterator<int, QString> it(scores); |
740 | it.next(); |
741 | |
742 | QString prevItem = it.value(); |
743 | int prevScore = it.key(); |
744 | |
745 | while (it.hasNext() && counter < m_maxItems && confidence < m_minConfidence) { |
746 | it.next(); |
747 | counter++; |
748 | confidence += (it.key() - prevScore) / (double)it.key(); |
749 | ret += prevItem; |
750 | prevItem = it.value(); |
751 | prevScore = it.key(); |
752 | } |
753 | if (counter < m_maxItems && confidence < m_minConfidence) { |
754 | ret += prevItem; |
755 | } |
756 | |
757 | return ret; |
758 | } |
759 | |
760 | QList<QString> GuessLanguagePrivate::createOrderedModel(const QString &content) |
761 | { |
762 | QHash<QString, int> trigramCounts; |
763 | |
764 | // collect trigrams |
765 | trigramCounts.reserve(size: content.size() - 2); |
766 | for (int i = 0; i < (content.size() - 2); ++i) { |
767 | QString tri = content.mid(position: i, n: 3).toLower(); |
768 | trigramCounts[tri]++; |
769 | } |
770 | |
771 | // invert the map <freq, trigram> |
772 | QList<QPair<int, QString>> trigramFrequencyList; |
773 | trigramFrequencyList.reserve(asize: trigramCounts.size()); |
774 | |
775 | auto it = trigramCounts.constBegin(); |
776 | for (; it != trigramCounts.constEnd(); ++it) { |
777 | const QChar *data = it.key().constData(); |
778 | bool hasTwoSpaces = (data[1].isSpace() && (data[0].isSpace() || data[2].isSpace())); |
779 | |
780 | if (!hasTwoSpaces) { |
781 | const int freq = it.value(); |
782 | const QString &trigram = it.key(); |
783 | trigramFrequencyList.append(t: {freq, trigram}); |
784 | } |
785 | } |
786 | |
787 | // sort descending by frequency |
788 | std::sort(first: trigramFrequencyList.begin(), last: trigramFrequencyList.end(), comp: [](const QPair<int, QString> &a, const QPair<int, QString> &b) { |
789 | return a.first > b.first; |
790 | }); |
791 | |
792 | QList<QString> orderedTrigrams; |
793 | orderedTrigrams.reserve(asize: trigramFrequencyList.size()); |
794 | for (const auto &tri : std::as_const(t&: trigramFrequencyList)) { |
795 | orderedTrigrams.append(t: tri.second); |
796 | } |
797 | |
798 | return orderedTrigrams; |
799 | } |
800 | |
801 | int GuessLanguagePrivate::distance(const QList<QString> &model, const QHash<QString, int> &knownModel) |
802 | { |
803 | int counter = -1; |
804 | int dist = 0; |
805 | |
806 | for (const QString &trigram : model) { |
807 | const int val = knownModel.value(key: trigram, defaultValue: -1); |
808 | if (val != -1) { |
809 | dist += qAbs(t: ++counter - val); |
810 | } else { |
811 | dist += MAXGRAMS; |
812 | } |
813 | |
814 | if (counter == (MAXGRAMS - 1)) { |
815 | break; |
816 | } |
817 | } |
818 | |
819 | return dist; |
820 | } |
821 | |
822 | QString GuessLanguagePrivate::guessFromDictionaries(const QString &sentence, const QStringList &candidates) |
823 | { |
824 | // Try to see how many languages we can get spell checking for |
825 | QList<QSharedPointer<SpellerPlugin>> spellers; |
826 | for (const QString &lang : candidates) { |
827 | if (!Loader::openLoader()->languages().contains(str: lang)) { |
828 | qCWarning(SONNET_LOG_CORE) << "Dictionary asked for invalid speller" << lang; |
829 | continue; |
830 | } |
831 | QSharedPointer<SpellerPlugin> plugin = Loader::openLoader()->cachedSpeller(language: lang); |
832 | if (!plugin.isNull()) { |
833 | spellers.append(t: plugin); |
834 | } |
835 | } |
836 | |
837 | // If there's no spell checkers, give up |
838 | if (spellers.isEmpty()) { |
839 | return QString(); |
840 | } |
841 | |
842 | QMap<QString, int> correctHits; |
843 | |
844 | WordTokenizer tokenizer(sentence); |
845 | while (tokenizer.hasNext()) { |
846 | Token word = tokenizer.next(); |
847 | if (!tokenizer.isSpellcheckable()) { |
848 | continue; |
849 | } |
850 | |
851 | for (int i = 0; i < spellers.count(); ++i) { |
852 | if (spellers[i]->isCorrect(word: word.toString())) { |
853 | correctHits[spellers[i]->language()]++; |
854 | } |
855 | } |
856 | } |
857 | |
858 | if (correctHits.isEmpty()) { |
859 | return QString(); |
860 | } |
861 | |
862 | QMap<QString, int>::const_iterator max = correctHits.constBegin(); |
863 | for (QMap<QString, int>::const_iterator itr = correctHits.constBegin(); itr != correctHits.constEnd(); ++itr) { |
864 | if (itr.value() > max.value()) { |
865 | max = itr; |
866 | } |
867 | } |
868 | return max.key(); |
869 | } |
870 | } |
871 | |