1/*
2 This file is part of the KDE libraries
3
4 SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com>
5
6 SPDX-License-Identifier: LGPL-2.0-or-later
7*/
8#ifndef KENCODINGPROBER_H
9#define KENCODINGPROBER_H
10
11// enable debug of private probers
12// #define DEBUG_PROBE
13
14#include <kcodecs_export.h>
15
16#ifdef DEBUG_PROBE
17#include <QDebug>
18#endif
19
20#include <QCoreApplication>
21#include <QString>
22#include <memory>
23
24class KEncodingProberPrivate;
25
26/*!
27 * \class KEncodingProber
28 * \inmodule KCodecs
29 *
30 * \brief Provides encoding detection(probe) capabilities.
31 *
32 * Probe the encoding of raw data only.
33 * In the case it can't find it, return the most possible encoding it guessed.
34 *
35 * Always do Unicode probe regardless the ProberType
36 *
37 * Feed data to it several times with feed() until ProberState changes to FoundIt/NotMe,
38 * or confidence() returns a value you find acceptable.
39 *
40 * Intended lifetime of the object: one instance per ProberType.
41 *
42 * Typical use:
43 * \code
44 * QByteArray data, moredata;
45 * ...
46 * KEncodingProber prober(KEncodingProber::Chinese);
47 * prober.feed(data);
48 * prober.feed(moredata);
49 * if (prober.confidence() > 0.6)
50 * encoding = prober.encoding();
51 * \endcode
52 *
53 * At least 256 characters are needed to change the ProberState from Probing to FoundIt.
54 * If you don't have so many characters to probe,
55 * decide whether to accept the encoding it guessed so far according to the Confidence by yourself.
56 *
57 */
58class KCODECS_EXPORT KEncodingProber
59{
60 Q_DECLARE_TR_FUNCTIONS(KEncodingProber)
61
62public:
63 /*!
64 * \value FoundIt Sure find the encoding
65 * \value NotMe Sure not included in current ProberType's all supported encodings
66 * \value Probing Need more data to make a decision
67 */
68 enum ProberState {
69 FoundIt,
70 NotMe,
71 Probing,
72 };
73
74 /*!
75 * \value None
76 * \value Universal
77 * \value Arabic
78 * \value Baltic
79 * \value CentralEuropean
80 * \value ChineseSimplified
81 * \value ChineseTraditional
82 * \value Cyrillic
83 * \value Greek
84 * \value Hebrew
85 * \value Japanese
86 * \value Korean
87 * \value NorthernSaami
88 * \value Other
89 * \value SouthEasternEurope
90 * \value Thai
91 * \value Turkish
92 * \value Unicode
93 * \value WesternEuropean
94 */
95 enum ProberType {
96 None,
97 Universal,
98 Arabic,
99 Baltic,
100 CentralEuropean,
101 ChineseSimplified,
102 ChineseTraditional,
103 Cyrillic,
104 Greek,
105 Hebrew,
106 Japanese,
107 Korean,
108 NorthernSaami,
109 Other,
110 SouthEasternEurope,
111 Thai,
112 Turkish,
113 Unicode,
114 WesternEuropean,
115 };
116
117 /*!
118 * Default ProberType is Universal(detect all possible encodings)
119 */
120 KEncodingProber(ProberType proberType = Universal);
121
122 ~KEncodingProber();
123
124 KEncodingProber(const KEncodingProber &) = delete;
125 KEncodingProber &operator=(const KEncodingProber &) = delete;
126
127 /*!
128 * reset the prober's internal state and data.
129 */
130 void reset();
131
132 /*!
133 * The main class method
134 *
135 * Feed \a data to the prober
136 *
137 * Returns the ProberState after probing the fed data.
138 */
139 ProberState feed(QByteArrayView data);
140 // for API compatibility
141 inline ProberState feed(const char *data, qsizetype len)
142 {
143 return feed(data: {data, len});
144 }
145
146 /*!
147 * Returns the prober's current ProberState
148 *
149 */
150 ProberState state() const;
151
152 /*!
153 * Returns a QByteArray with the name of the best encoding it has guessed so far
154 * \since 4.2.2
155 */
156 QByteArray encoding() const;
157
158 /*!
159 * Returns the confidence(sureness) of encoding it guessed so far (0.0 ~ 0.99), not very reliable for single byte encodings
160 */
161 float confidence() const;
162
163 ProberType proberType() const;
164
165 /*!
166 * change current prober's ProberType and reset the prober
167 *
168 * \a proberType the new type
169 */
170 void setProberType(ProberType proberType);
171
172 /*!
173 * Returns the ProberType for \a lang (e.g. proberTypeForName("Chinese Simplified") will return KEncodingProber::ChineseSimplified
174 */
175 static ProberType proberTypeForName(const QString &lang);
176
177 /*!
178 * map ProberType to language string
179 *
180 * \a proberType the proper type
181 *
182 * Returns the language string
183 */
184 static QString nameForProberType(ProberType proberType);
185
186private:
187 std::unique_ptr<KEncodingProberPrivate> const d;
188};
189
190#endif
191

// source code of kcodecs/src/kencodingprober.h