1/* This file is part of the KDE libraries
2 SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#ifndef GUESSLANGUAGE_H
8#define GUESSLANGUAGE_H
9
10#include <QString>
11#include <QStringList>
12
13#include "sonnetcore_export.h"
14
15#include <memory>
16
17namespace Sonnet
18{
// Number of trigrams in each file
static constexpr int MAXGRAMS = 300;

// Forward declaration only; GuessLanguage holds a pointer to this type.
class GuessLanguagePrivate;
23
24/**
25 * @class Sonnet::GuessLanguage guesslanguage.h <Sonnet/GuessLanguage>
26 *
27 * @short GuessLanguage determines the language of a given text.
28 *
29 * GuessLanguage can determine the difference between ~75 languages for a given string. It is
30 * based off a Perl script originally written by Maciej Ceglowski <maciej@ceglowski.com>
31 * called Languid. His script used a 2 part heuristic to determine language. First the text
32 * is checked for the scripts it contains, then for each set of languages using those
33 * scripts a n-gram frequency model of a given language is compared to a model of the text.
34 * The most similar language model is assumed to be the language. If no language is found
35 * an empty string is returned.
36 *
37 *
38 * @author Jacob Rideout <kde@jacobrideout.net>
39 * @since 4.3
40 */
41class SONNETCORE_EXPORT GuessLanguage
42{
43public:
44 /** Constructor
45 * Creates a new GuessLanguage instance. If @p text is specified,
46 * it sets the text to be checked.
47 * @param text the text that is to be checked
48 */
49 GuessLanguage();
50
51 /** Destructor
52 */
53 ~GuessLanguage();
54
55 GuessLanguage(const GuessLanguage &) = delete;
56 GuessLanguage &operator=(const GuessLanguage &) = delete;
57
58 /**
59 * Sets limits to number of languages returned by identify(). The confidence for each language is computed
60 * as difference between this and next language on the list normalized to 0-1 range. Reasonable value to get
61 * fairly sure result is 0.1 . Default is returning best guess without caring about confidence - exactly
62 * as after call to setLimits(1,0).
63 * @param maxItems The list returned by identify() will never have more than maxItems item
64 * @param minConfidence The list will have only enough items for their summary confidence equal
65 * or exceed minConfidence.
66 */
67 void setLimits(int maxItems, double minConfidence);
68
69 /**
70 * Returns the 2 digit ISO 639-1 code for the language of the currently
71 * set text and. Three digits are returned only in the case where a 2 digit
72 * code does not exist. If @p text isn't empty, set the text to checked.
73 * @param text to be identified
74 * @return list of the presumed languages of the text, sorted by decreasing confidence. Empty list means
75 * it is impossible to determine language with confidence required by setLimits
76 */
77 QString identify(const QString &text, const QStringList &suggestions = QStringList()) const;
78
79private:
80 std::unique_ptr<GuessLanguagePrivate> const d;
81};
82}
83
84#endif
85

// source code of sonnet/src/core/guesslanguage.h