1/* This file is part of the KDE libraries
2 SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#ifndef GUESSLANGUAGE_H
8#define GUESSLANGUAGE_H
9
10#include <QString>
11#include <QStringList>
12
13#include "sonnetcore_export.h"
14
15#include <memory>
16
17namespace Sonnet
18{
// Number of trigrams in each file
static constexpr int MAXGRAMS = 300;
21
22class GuessLanguagePrivate;
23
24/*!
25 * \class Sonnet::GuessLanguage
26 * \inheaderfile Sonnet/GuessLanguage
27 * \inmodule SonnetCore
28 *
29 * \brief GuessLanguage determines the language of a given text.
30 *
31 * GuessLanguage can determine the difference between ~75 languages for a given string. It is
32 * based off a Perl script originally written by Maciej Ceglowski
33 * called Languid. His script used a 2 part heuristic to determine language. First the text
34 * is checked for the scripts it contains, then for each set of languages using those
35 * scripts a n-gram frequency model of a given language is compared to a model of the text.
36 * The most similar language model is assumed to be the language. If no language is found
37 * an empty string is returned.
38 *
39 * \since 4.3
40 */
41class SONNETCORE_EXPORT GuessLanguage
42{
43public:
44 /*!
45 * Constructor
46 *
47 * Creates a new GuessLanguage instance.
48 */
49 GuessLanguage();
50
51 ~GuessLanguage();
52
53 GuessLanguage(const GuessLanguage &) = delete;
54 GuessLanguage &operator=(const GuessLanguage &) = delete;
55
56 /*!
57 * Sets limits to number of languages returned by identify(). The confidence for each language is computed
58 * as difference between this and next language on the list normalized to 0-1 range. Reasonable value to get
59 * fairly sure result is 0.1 . Default is returning best guess without caring about confidence - exactly
60 * as after call to setLimits(1,0).
61 *
62 * \a maxItems The list returned by identify() will never have more than maxItems item
63 *
64 * \a minConfidence The list will have only enough items for their summary confidence equal
65 * or exceed minConfidence.
66 */
67 void setLimits(int maxItems, double minConfidence);
68
69 /*!
70 * Returns the 2 digit ISO 639-1 code for the language of the currently
71 * set text and.
72 *
73 * Three digits are returned only in the case where a 2 digit
74 * code does not exist. If \a text isn't empty, set the text to checked.
75 * \a text to be identified
76 *
77 * Returns list of the presumed languages of the text, sorted by decreasing confidence. Empty list means
78 * it is impossible to determine language with confidence required by setLimits
79 */
80 QString identify(const QString &text, const QStringList &suggestions = QStringList()) const;
81
82private:
83 std::unique_ptr<GuessLanguagePrivate> const d;
84};
85}
86
87#endif
88

// Source: sonnet/src/core/guesslanguage.h