1 | /* This file is part of the KDE libraries |
2 | SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net> |
3 | |
4 | SPDX-License-Identifier: LGPL-2.0-or-later |
5 | */ |
6 | |
7 | #ifndef GUESSLANGUAGE_H |
8 | #define GUESSLANGUAGE_H |
9 | |
10 | #include <QString> |
11 | #include <QStringList> |
12 | |
13 | #include "sonnetcore_export.h" |
14 | |
15 | #include <memory> |
16 | |
17 | namespace Sonnet |
18 | { |
// Number of trigrams in each file.
static constexpr int MAXGRAMS = 300;

// Private implementation type: only forward-declared in this header.
class GuessLanguagePrivate;
23 | |
24 | /*! |
25 | * \class Sonnet::GuessLanguage |
26 | * \inheaderfile Sonnet/GuessLanguage |
27 | * \inmodule SonnetCore |
28 | * |
29 | * \brief GuessLanguage determines the language of a given text. |
30 | * |
31 | * GuessLanguage can determine the difference between ~75 languages for a given string. It is |
32 | * based off a Perl script originally written by Maciej Ceglowski |
33 | * called Languid. His script used a 2 part heuristic to determine language. First the text |
34 | * is checked for the scripts it contains, then for each set of languages using those |
35 | * scripts a n-gram frequency model of a given language is compared to a model of the text. |
36 | * The most similar language model is assumed to be the language. If no language is found |
37 | * an empty string is returned. |
38 | * |
39 | * \since 4.3 |
40 | */ |
41 | class SONNETCORE_EXPORT GuessLanguage |
42 | { |
43 | public: |
44 | /*! |
45 | * Constructor |
46 | * |
47 | * Creates a new GuessLanguage instance. |
48 | */ |
49 | GuessLanguage(); |
50 | |
51 | ~GuessLanguage(); |
52 | |
53 | GuessLanguage(const GuessLanguage &) = delete; |
54 | GuessLanguage &operator=(const GuessLanguage &) = delete; |
55 | |
56 | /*! |
57 | * Sets limits to number of languages returned by identify(). The confidence for each language is computed |
58 | * as difference between this and next language on the list normalized to 0-1 range. Reasonable value to get |
59 | * fairly sure result is 0.1 . Default is returning best guess without caring about confidence - exactly |
60 | * as after call to setLimits(1,0). |
61 | * |
62 | * \a maxItems The list returned by identify() will never have more than maxItems item |
63 | * |
64 | * \a minConfidence The list will have only enough items for their summary confidence equal |
65 | * or exceed minConfidence. |
66 | */ |
67 | void setLimits(int maxItems, double minConfidence); |
68 | |
69 | /*! |
70 | * Returns the 2 digit ISO 639-1 code for the language of the currently |
71 | * set text and. |
72 | * |
73 | * Three digits are returned only in the case where a 2 digit |
74 | * code does not exist. If \a text isn't empty, set the text to checked. |
75 | * \a text to be identified |
76 | * |
77 | * Returns list of the presumed languages of the text, sorted by decreasing confidence. Empty list means |
78 | * it is impossible to determine language with confidence required by setLimits |
79 | */ |
80 | QString identify(const QString &text, const QStringList &suggestions = QStringList()) const; |
81 | |
82 | private: |
83 | std::unique_ptr<GuessLanguagePrivate> const d; |
84 | }; |
85 | } |
86 | |
87 | #endif |
88 | |