| 1 | /* This file is part of the KDE libraries |
| 2 | SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net> |
| 3 | |
| 4 | SPDX-License-Identifier: LGPL-2.0-or-later |
| 5 | */ |
| 6 | |
| 7 | #ifndef GUESSLANGUAGE_H |
| 8 | #define GUESSLANGUAGE_H |
| 9 | |
| 10 | #include <QString> |
| 11 | #include <QStringList> |
| 12 | |
| 13 | #include "sonnetcore_export.h" |
| 14 | |
| 15 | #include <memory> |
| 16 | |
| 17 | namespace Sonnet |
| 18 | { |
| 19 | // Amount of trigrams in each file |
| 20 | static const int MAXGRAMS = 300; |
| 21 | |
| 22 | class GuessLanguagePrivate; |
| 23 | |
| 24 | /*! |
| 25 | * \class Sonnet::GuessLanguage |
| 26 | * \inheaderfile Sonnet/GuessLanguage |
| 27 | * \inmodule SonnetCore |
| 28 | * |
| 29 | * \brief GuessLanguage determines the language of a given text. |
| 30 | * |
| 31 | * GuessLanguage can determine the difference between ~75 languages for a given string. It is |
| 32 | * based off a Perl script originally written by Maciej Ceglowski |
| 33 | * called Languid. His script used a 2 part heuristic to determine language. First the text |
| 34 | * is checked for the scripts it contains, then for each set of languages using those |
| 35 | * scripts a n-gram frequency model of a given language is compared to a model of the text. |
| 36 | * The most similar language model is assumed to be the language. If no language is found |
| 37 | * an empty string is returned. |
| 38 | * |
| 39 | * \since 4.3 |
| 40 | */ |
| 41 | class SONNETCORE_EXPORT GuessLanguage |
| 42 | { |
| 43 | public: |
| 44 | /*! |
| 45 | * Constructor |
| 46 | * |
| 47 | * Creates a new GuessLanguage instance. |
| 48 | */ |
| 49 | GuessLanguage(); |
| 50 | |
| 51 | ~GuessLanguage(); |
| 52 | |
| 53 | GuessLanguage(const GuessLanguage &) = delete; |
| 54 | GuessLanguage &operator=(const GuessLanguage &) = delete; |
| 55 | |
| 56 | /*! |
| 57 | * Sets limits to number of languages returned by identify(). The confidence for each language is computed |
| 58 | * as difference between this and next language on the list normalized to 0-1 range. Reasonable value to get |
| 59 | * fairly sure result is 0.1 . Default is returning best guess without caring about confidence - exactly |
| 60 | * as after call to setLimits(1,0). |
| 61 | * |
| 62 | * \a maxItems The list returned by identify() will never have more than maxItems item |
| 63 | * |
| 64 | * \a minConfidence The list will have only enough items for their summary confidence equal |
| 65 | * or exceed minConfidence. |
| 66 | */ |
| 67 | void setLimits(int maxItems, double minConfidence); |
| 68 | |
| 69 | /*! |
| 70 | * Returns the 2 digit ISO 639-1 code for the language of the currently |
| 71 | * set text and. |
| 72 | * |
| 73 | * Three digits are returned only in the case where a 2 digit |
| 74 | * code does not exist. If \a text isn't empty, set the text to checked. |
| 75 | * \a text to be identified |
| 76 | * |
| 77 | * Returns list of the presumed languages of the text, sorted by decreasing confidence. Empty list means |
| 78 | * it is impossible to determine language with confidence required by setLimits |
| 79 | */ |
| 80 | QString identify(const QString &text, const QStringList &suggestions = QStringList()) const; |
| 81 | |
| 82 | private: |
| 83 | std::unique_ptr<GuessLanguagePrivate> const d; |
| 84 | }; |
| 85 | } |
| 86 | |
| 87 | #endif |
| 88 | |