1 | /* |
2 | This file is part of the KDE libraries |
3 | |
4 | SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com> |
5 | |
6 | SPDX-License-Identifier: LGPL-2.0-or-later |
7 | */ |
8 | #ifndef KENCODINGPROBER_H |
9 | #define KENCODINGPROBER_H |
10 | |
11 | // enable debug of private probers |
12 | // #define DEBUG_PROBE |
13 | |
14 | #include <kcodecs_export.h> |
15 | |
16 | #ifdef DEBUG_PROBE |
17 | #include <QDebug> |
18 | #endif |
19 | |
20 | #include <QCoreApplication> |
21 | #include <QString> |
22 | #include <memory> |
23 | |
24 | class KEncodingProberPrivate; |
25 | |
26 | /*! |
27 | * \class KEncodingProber |
28 | * \inmodule KCodecs |
29 | * |
30 | * \brief Provides encoding detection(probe) capabilities. |
31 | * |
32 | * Probe the encoding of raw data only. |
33 | * In the case it can't find it, return the most possible encoding it guessed. |
34 | * |
35 | * Always do Unicode probe regardless the ProberType |
36 | * |
37 | * Feed data to it several times with feed() until ProberState changes to FoundIt/NotMe, |
38 | * or confidence() returns a value you find acceptable. |
39 | * |
40 | * Intended lifetime of the object: one instance per ProberType. |
41 | * |
42 | * Typical use: |
43 | * \code |
44 | * QByteArray data, moredata; |
45 | * ... |
46 | * KEncodingProber prober(KEncodingProber::Chinese); |
47 | * prober.feed(data); |
48 | * prober.feed(moredata); |
49 | * if (prober.confidence() > 0.6) |
50 | * encoding = prober.encoding(); |
51 | * \endcode |
52 | * |
53 | * At least 256 characters are needed to change the ProberState from Probing to FoundIt. |
54 | * If you don't have so many characters to probe, |
55 | * decide whether to accept the encoding it guessed so far according to the Confidence by yourself. |
56 | * |
57 | */ |
58 | class KCODECS_EXPORT KEncodingProber |
59 | { |
60 | Q_DECLARE_TR_FUNCTIONS(KEncodingProber) |
61 | |
62 | public: |
63 | /*! |
64 | * \value FoundIt Sure find the encoding |
65 | * \value NotMe Sure not included in current ProberType's all supported encodings |
66 | * \value Probing Need more data to make a decision |
67 | */ |
68 | enum ProberState { |
69 | FoundIt, |
70 | NotMe, |
71 | Probing, |
72 | }; |
73 | |
74 | /*! |
75 | * \value None |
76 | * \value Universal |
77 | * \value Arabic |
78 | * \value Baltic |
79 | * \value CentralEuropean |
80 | * \value ChineseSimplified |
81 | * \value ChineseTraditional |
82 | * \value Cyrillic |
83 | * \value Greek |
84 | * \value Hebrew |
85 | * \value Japanese |
86 | * \value Korean |
87 | * \value NorthernSaami |
88 | * \value Other |
89 | * \value SouthEasternEurope |
90 | * \value Thai |
91 | * \value Turkish |
92 | * \value Unicode |
93 | * \value WesternEuropean |
94 | */ |
95 | enum ProberType { |
96 | None, |
97 | Universal, |
98 | Arabic, |
99 | Baltic, |
100 | CentralEuropean, |
101 | ChineseSimplified, |
102 | ChineseTraditional, |
103 | Cyrillic, |
104 | Greek, |
105 | Hebrew, |
106 | Japanese, |
107 | Korean, |
108 | NorthernSaami, |
109 | Other, |
110 | SouthEasternEurope, |
111 | Thai, |
112 | Turkish, |
113 | Unicode, |
114 | WesternEuropean, |
115 | }; |
116 | |
117 | /*! |
118 | * Default ProberType is Universal(detect all possible encodings) |
119 | */ |
120 | KEncodingProber(ProberType proberType = Universal); |
121 | |
122 | ~KEncodingProber(); |
123 | |
124 | KEncodingProber(const KEncodingProber &) = delete; |
125 | KEncodingProber &operator=(const KEncodingProber &) = delete; |
126 | |
127 | /*! |
128 | * reset the prober's internal state and data. |
129 | */ |
130 | void reset(); |
131 | |
132 | /*! |
133 | * The main class method |
134 | * |
135 | * Feed \a data to the prober |
136 | * |
137 | * Returns the ProberState after probing the fed data. |
138 | */ |
139 | ProberState feed(QByteArrayView data); |
140 | // for API compatibility |
141 | inline ProberState feed(const char *data, qsizetype len) |
142 | { |
143 | return feed(data: {data, len}); |
144 | } |
145 | |
146 | /*! |
147 | * Returns the prober's current ProberState |
148 | * |
149 | */ |
150 | ProberState state() const; |
151 | |
152 | /*! |
153 | * Returns a QByteArray with the name of the best encoding it has guessed so far |
154 | * \since 4.2.2 |
155 | */ |
156 | QByteArray encoding() const; |
157 | |
158 | /*! |
159 | * Returns the confidence(sureness) of encoding it guessed so far (0.0 ~ 0.99), not very reliable for single byte encodings |
160 | */ |
161 | float confidence() const; |
162 | |
163 | ProberType proberType() const; |
164 | |
165 | /*! |
166 | * change current prober's ProberType and reset the prober |
167 | * |
168 | * \a proberType the new type |
169 | */ |
170 | void setProberType(ProberType proberType); |
171 | |
172 | /*! |
173 | * Returns the ProberType for \a lang (e.g. proberTypeForName("Chinese Simplified") will return KEncodingProber::ChineseSimplified |
174 | */ |
175 | static ProberType proberTypeForName(const QString &lang); |
176 | |
177 | /*! |
178 | * map ProberType to language string |
179 | * |
180 | * \a proberType the proper type |
181 | * |
182 | * Returns the language string |
183 | */ |
184 | static QString nameForProberType(ProberType proberType); |
185 | |
186 | private: |
187 | std::unique_ptr<KEncodingProberPrivate> const d; |
188 | }; |
189 | |
190 | #endif |
191 | |