1 | /* This file is part of the KDE libraries |
2 | SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net> |
3 | |
4 | SPDX-License-Identifier: LGPL-2.0-or-later |
5 | */ |
6 | |
7 | #ifndef GUESSLANGUAGE_H |
8 | #define GUESSLANGUAGE_H |
9 | |
10 | #include <QString> |
11 | #include <QStringList> |
12 | |
13 | #include "sonnetcore_export.h" |
14 | |
15 | #include <memory> |
16 | |
17 | namespace Sonnet |
18 | { |
// Number of trigrams in each file.
// constexpr at namespace scope implies const and internal linkage, so each
// translation unit including this header gets its own compile-time copy.
constexpr int MAXGRAMS = 300;
21 | |
22 | class GuessLanguagePrivate; |
23 | |
24 | /** |
25 | * @class Sonnet::GuessLanguage guesslanguage.h <Sonnet/GuessLanguage> |
26 | * |
27 | * @short GuessLanguage determines the language of a given text. |
28 | * |
29 | * GuessLanguage can determine the difference between ~75 languages for a given string. It is |
30 | * based off a Perl script originally written by Maciej Ceglowski <maciej@ceglowski.com> |
31 | * called Languid. His script used a 2 part heuristic to determine language. First the text |
32 | * is checked for the scripts it contains, then for each set of languages using those |
33 | * scripts a n-gram frequency model of a given language is compared to a model of the text. |
34 | * The most similar language model is assumed to be the language. If no language is found |
35 | * an empty string is returned. |
36 | * |
37 | * |
38 | * @author Jacob Rideout <kde@jacobrideout.net> |
39 | * @since 4.3 |
40 | */ |
41 | class SONNETCORE_EXPORT GuessLanguage |
42 | { |
43 | public: |
44 | /** Constructor |
45 | * Creates a new GuessLanguage instance. If @p text is specified, |
46 | * it sets the text to be checked. |
47 | * @param text the text that is to be checked |
48 | */ |
49 | GuessLanguage(); |
50 | |
51 | /** Destructor |
52 | */ |
53 | ~GuessLanguage(); |
54 | |
55 | GuessLanguage(const GuessLanguage &) = delete; |
56 | GuessLanguage &operator=(const GuessLanguage &) = delete; |
57 | |
58 | /** |
59 | * Sets limits to number of languages returned by identify(). The confidence for each language is computed |
60 | * as difference between this and next language on the list normalized to 0-1 range. Reasonable value to get |
61 | * fairly sure result is 0.1 . Default is returning best guess without caring about confidence - exactly |
62 | * as after call to setLimits(1,0). |
63 | * @param maxItems The list returned by identify() will never have more than maxItems item |
64 | * @param minConfidence The list will have only enough items for their summary confidence equal |
65 | * or exceed minConfidence. |
66 | */ |
67 | void setLimits(int maxItems, double minConfidence); |
68 | |
69 | /** |
70 | * Returns the 2 digit ISO 639-1 code for the language of the currently |
71 | * set text and. Three digits are returned only in the case where a 2 digit |
72 | * code does not exist. If @p text isn't empty, set the text to checked. |
73 | * @param text to be identified |
74 | * @return list of the presumed languages of the text, sorted by decreasing confidence. Empty list means |
75 | * it is impossible to determine language with confidence required by setLimits |
76 | */ |
77 | QString identify(const QString &text, const QStringList &suggestions = QStringList()) const; |
78 | |
79 | private: |
80 | std::unique_ptr<GuessLanguagePrivate> const d; |
81 | }; |
82 | } |
83 | |
84 | #endif |
85 | |