guesslanguage.h source code [sonnet/src/core/guesslanguage.h]

1	/*  This file is part of the KDE libraries
2	SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net>
3
4	SPDX-License-Identifier: LGPL-2.0-or-later
5	*/
6
7	#ifndef GUESSLANGUAGE_H
8	#define GUESSLANGUAGE_H
9
10	#include <QString>
11	#include <QStringList>
12
13	#include "sonnetcore_export.h"
14
15	#include <memory>
16
17	namespace Sonnet
18	{
// Number of trigrams in each file.
// (The stray backticks that surrounded the literal were not valid C++.)
static constexpr int MAXGRAMS = 300;
21
22	class GuessLanguagePrivate;
23
24	/!*
25	* \class Sonnet::GuessLanguage
26	* \inheaderfile Sonnet/GuessLanguage
27	* \inmodule SonnetCore
28	*
29	* \brief GuessLanguage determines the language of a given text.
30	*
31	* GuessLanguage can determine the difference between ~75 languages for a given string. It is
32	* based off a Perl script originally written by Maciej Ceglowski
33	* called Languid. His script used a 2 part heuristic to determine language. First the text
34	* is checked for the scripts it contains, then for each set of languages using those
35	* scripts a n-gram frequency model of a given language is compared to a model of the text.
36	* The most similar language model is assumed to be the language. If no language is found
37	* an empty string is returned.
38	*
39	* \since 4.3
40	*/
41	class SONNETCORE_EXPORT GuessLanguage
42	{
43	public:
44	/!*
45	* Constructor
46	*
47	* Creates a new GuessLanguage instance.
48	*/
49	GuessLanguage();
50
51	~GuessLanguage();
52
53	GuessLanguage(const GuessLanguage &) = delete;
54	GuessLanguage &operator=(const GuessLanguage &) = delete;
55
56	/!*
57	* Sets limits to number of languages returned by identify(). The confidence for each language is computed
58	* as difference between this and next language on the list normalized to 0-1 range. Reasonable value to get
59	* fairly sure result is 0.1 . Default is returning best guess without caring about confidence - exactly
60	* as after call to setLimits(1,0).
61	*
62	* \a maxItems The list returned by identify() will never have more than maxItems item
63	*
64	* \a minConfidence The list will have only enough items for their summary confidence equal
65	* or exceed minConfidence.
66	*/
67	void setLimits(int maxItems, double minConfidence);
68
69	/!*
70	* Returns the 2 digit ISO 639-1 code for the language of the currently
71	* set text and.
72	*
73	* Three digits are returned only in the case where a 2 digit
74	* code does not exist. If \a text isn't empty, set the text to checked.
75	* \a text to be identified
76	*
77	* Returns list of the presumed languages of the text, sorted by decreasing confidence. Empty list means
78	* it is impossible to determine language with confidence required by setLimits
79	*/
80	QString identify(const QString &text, const QStringList &suggestions = QStringList()) const;
81
82	private:
83	std::unique_ptr<GuessLanguagePrivate> const d;
84	};
85	}
86
87	#endif
88

source code of sonnet/src/core/guesslanguage.h