1/*
2 This file is part of the KDE libraries
3
4 SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com>
5
6 SPDX-License-Identifier: LGPL-2.0-or-later
7*/
8#ifndef KENCODINGPROBER_H
9#define KENCODINGPROBER_H
10
11#include <kcodecs_export.h>
12
13#include <QCoreApplication>
14#include <QString>
15#include <memory>
16
17class KEncodingProberPrivate;
18
19/*!
20 * \class KEncodingProber
21 * \inmodule KCodecs
22 *
23 * \brief Provides encoding detection(probe) capabilities.
24 *
25 * Probe the encoding of raw data only.
 * In case it cannot determine the encoding for sure, it returns the most probable encoding it has guessed.
27 *
 * A Unicode probe is always performed, regardless of the ProberType.
29 *
30 * Feed data to it several times with feed() until ProberState changes to FoundIt/NotMe,
31 * or confidence() returns a value you find acceptable.
32 *
33 * Intended lifetime of the object: one instance per ProberType.
34 *
35 * Typical use:
36 * \code
37 * QByteArray data, moredata;
38 * ...
 * KEncodingProber prober(KEncodingProber::ChineseSimplified);
40 * prober.feed(data);
41 * prober.feed(moredata);
42 * if (prober.confidence() > 0.6)
43 * encoding = prober.encoding();
44 * \endcode
45 *
46 * At least 256 characters are needed to change the ProberState from Probing to FoundIt.
47 * If you don't have so many characters to probe,
 * decide for yourself, based on confidence(), whether to accept the encoding it has guessed so far.
49 *
50 */
51class KCODECS_EXPORT KEncodingProber
52{
53 Q_DECLARE_TR_FUNCTIONS(KEncodingProber)
54
55public:
56 /*!
57 * \value FoundIt Sure find the encoding
58 * \value NotMe Sure not included in current ProberType's all supported encodings
59 * \value Probing Need more data to make a decision
60 */
61 enum ProberState {
62 FoundIt,
63 NotMe,
64 Probing,
65 };
66
67 /*!
68 * \value None
69 * \value Universal
70 * \value Arabic
71 * \value Baltic
72 * \value CentralEuropean
73 * \value ChineseSimplified
74 * \value ChineseTraditional
75 * \value Cyrillic
76 * \value Greek
77 * \value Hebrew
78 * \value Japanese
79 * \value Korean
80 * \value NorthernSaami
81 * \value Other
82 * \value SouthEasternEurope
83 * \value Thai
84 * \value Turkish
85 * \value Unicode
86 * \value WesternEuropean
87 */
88 enum ProberType {
89 None,
90 Universal,
91 Arabic,
92 Baltic,
93 CentralEuropean,
94 ChineseSimplified,
95 ChineseTraditional,
96 Cyrillic,
97 Greek,
98 Hebrew,
99 Japanese,
100 Korean,
101 NorthernSaami,
102 Other,
103 SouthEasternEurope,
104 Thai,
105 Turkish,
106 Unicode,
107 WesternEuropean,
108 };
109
110 /*!
111 * Default ProberType is Universal(detect all possible encodings)
112 */
113 KEncodingProber(ProberType proberType = Universal);
114
115 ~KEncodingProber();
116
117 KEncodingProber(const KEncodingProber &) = delete;
118 KEncodingProber &operator=(const KEncodingProber &) = delete;
119
120 /*!
121 * reset the prober's internal state and data.
122 */
123 void reset();
124
125 /*!
126 * The main class method
127 *
128 * Feed \a data to the prober
129 *
130 * Returns the ProberState after probing the fed data.
131 */
132 ProberState feed(QByteArrayView data);
133 // for API compatibility
134 inline ProberState feed(const char *data, qsizetype len)
135 {
136 return feed(data: {data, len});
137 }
138
139 /*!
140 * Returns the prober's current ProberState
141 *
142 */
143 ProberState state() const;
144
145 /*!
146 * Returns a QByteArray with the name of the best encoding it has guessed so far
147 * \since 4.2.2
148 */
149 QByteArray encoding() const;
150
151 /*!
152 * Returns the confidence(sureness) of encoding it guessed so far (0.0 ~ 0.99), not very reliable for single byte encodings
153 */
154 float confidence() const;
155
156 ProberType proberType() const;
157
158 /*!
159 * change current prober's ProberType and reset the prober
160 *
161 * \a proberType the new type
162 */
163 void setProberType(ProberType proberType);
164
165 /*!
166 * Returns the ProberType for \a lang (e.g. proberTypeForName("Chinese Simplified") will return KEncodingProber::ChineseSimplified
167 */
168 static ProberType proberTypeForName(const QString &lang);
169
170 /*!
171 * map ProberType to language string
172 *
173 * \a proberType the proper type
174 *
175 * Returns the language string
176 */
177 static QString nameForProberType(ProberType proberType);
178
179private:
180 std::unique_ptr<KEncodingProberPrivate> const d;
181};
182
183#endif
184

source code of kcodecs/src/kencodingprober.h