1/*
2 This file is part of the KDE libraries
3
4 SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com>
5
6 SPDX-License-Identifier: LGPL-2.0-or-later
7*/
8
9#include "kencodingprober.h"
10
11#include "probers/ChineseGroupProber.h"
12#include "probers/JapaneseGroupProber.h"
13#include "probers/UnicodeGroupProber.h"
14#include "probers/nsCharSetProber.h"
15#include "probers/nsMBCSGroupProber.h"
16#include "probers/nsSBCSGroupProber.h"
17#include "probers/nsUniversalDetector.h"
18
19#include <string.h>
20
21class KEncodingProberPrivate
22{
23public:
24 KEncodingProberPrivate()
25 : mProber(nullptr)
26 , mStart(true)
27 {
28 }
29 ~KEncodingProberPrivate()
30 {
31 delete mProber;
32 }
33 void setProberType(KEncodingProber::ProberType pType)
34 {
35 mProberType = pType;
36 /* handle multi-byte encodings carefully , because they're hard to detect,
37 * and have to use some Stastics methods.
38 * for single-byte encodings (most western encodings), nsSBCSGroupProber is ok,
39 * because encoding state machine can detect many such encodings.
40 */
41
42 delete mProber;
43
44 switch (mProberType) {
45 case KEncodingProber::None:
46 mProber = nullptr;
47 break;
48 case KEncodingProber::Arabic:
49 case KEncodingProber::Baltic:
50 case KEncodingProber::CentralEuropean:
51 case KEncodingProber::Cyrillic:
52 case KEncodingProber::Greek:
53 case KEncodingProber::Hebrew:
54 case KEncodingProber::NorthernSaami:
55 case KEncodingProber::Other:
56 case KEncodingProber::SouthEasternEurope:
57 case KEncodingProber::Thai:
58 case KEncodingProber::Turkish:
59 case KEncodingProber::WesternEuropean:
60 mProber = new kencodingprober::nsSBCSGroupProber();
61 break;
62 case KEncodingProber::ChineseSimplified:
63 case KEncodingProber::ChineseTraditional:
64 mProber = new kencodingprober::ChineseGroupProber();
65 break;
66 case KEncodingProber::Japanese:
67 mProber = new kencodingprober::JapaneseGroupProber();
68 break;
69 case KEncodingProber::Korean:
70 mProber = new kencodingprober::nsMBCSGroupProber();
71 break;
72 case KEncodingProber::Unicode:
73 mProber = new kencodingprober::UnicodeGroupProber();
74 break;
75 case KEncodingProber::Universal:
76 mProber = new kencodingprober::nsUniversalDetector();
77 break;
78 default:
79 mProber = nullptr;
80 }
81 }
82 void unicodeTest(const char *aBuf, int aLen)
83 {
84 if (mStart) {
85 mStart = false;
86 if (aLen > 3) {
87 switch (aBuf[0]) {
88 case '\xEF':
89 if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
90 // EF BB BF UTF-8 encoded BOM
91 {
92 mProberState = KEncodingProber::FoundIt;
93 }
94 break;
95 case '\xFE':
96 if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
97 // FE FF 00 00 UCS-4, unusual octet order BOM (3412)
98 {
99 mProberState = KEncodingProber::FoundIt;
100 } else if ('\xFF' == aBuf[1])
101 // FE FF UTF-16, big endian BOM
102 {
103 mProberState = KEncodingProber::FoundIt;
104 }
105 break;
106 case '\x00':
107 if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
108 // 00 00 FE FF UTF-32, big-endian BOM
109 {
110 mProberState = KEncodingProber::FoundIt;
111 } else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
112 // 00 00 FF FE UCS-4, unusual octet order BOM (2143)
113 {
114 mProberState = KEncodingProber::FoundIt;
115 }
116 break;
117 case '\xFF':
118 if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
119 // FF FE 00 00 UTF-32, little-endian BOM
120 {
121 mProberState = KEncodingProber::FoundIt;
122 } else if ('\xFE' == aBuf[1])
123 // FF FE UTF-16, little endian BOM
124 {
125 mProberState = KEncodingProber::FoundIt;
126 }
127 break;
128 } // switch
129 }
130 }
131 }
132 KEncodingProber::ProberType mProberType;
133 KEncodingProber::ProberState mProberState;
134 kencodingprober::nsCharSetProber *mProber;
135 bool mStart;
136};
137
138KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType)
139 : d(new KEncodingProberPrivate())
140{
141 setProberType(proberType);
142}
143
144KEncodingProber::~KEncodingProber() = default;
145
146void KEncodingProber::reset()
147{
148 d->mProberState = KEncodingProber::Probing;
149 d->mStart = true;
150}
151
152KEncodingProber::ProberState KEncodingProber::feed(QByteArrayView data)
153{
154 if (!d->mProber) {
155 return d->mProberState;
156 }
157 if (d->mProberState == Probing) {
158 if (d->mStart) {
159 d->unicodeTest(aBuf: data.constData(), aLen: data.size());
160 if (d->mProberState == FoundIt) {
161 return d->mProberState;
162 }
163 }
164 d->mProber->HandleData(aBuf: data.constData(), aLen: data.size());
165 switch (d->mProber->GetState()) {
166 case kencodingprober::eNotMe:
167 d->mProberState = NotMe;
168 break;
169 case kencodingprober::eFoundIt:
170 d->mProberState = FoundIt;
171 break;
172 default:
173 d->mProberState = Probing;
174 break;
175 }
176 }
177#ifdef DEBUG_PROBE
178 d->mProber->DumpStatus();
179#endif
180 return d->mProberState;
181}
182
183KEncodingProber::ProberState KEncodingProber::state() const
184{
185 return d->mProberState;
186}
187
188QByteArray KEncodingProber::encoding() const
189{
190 if (!d->mProber) {
191 return QByteArray("UTF-8");
192 }
193
194 return QByteArray(d->mProber->GetCharSetName());
195}
196
197float KEncodingProber::confidence() const
198{
199 if (!d->mProber) {
200 return 0.0;
201 }
202
203 return d->mProber->GetConfidence();
204}
205
206KEncodingProber::ProberType KEncodingProber::proberType() const
207{
208 return d->mProberType;
209}
210
211void KEncodingProber::setProberType(KEncodingProber::ProberType proberType)
212{
213 d->setProberType(proberType);
214 reset();
215}
216
217KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString &lang)
218{
219 if (lang.isEmpty()) {
220 return KEncodingProber::Universal;
221 } else if (lang == tr(sourceText: "Disabled", disambiguation: "@item Text character set")) {
222 return KEncodingProber::None;
223 } else if (lang == tr(sourceText: "Universal", disambiguation: "@item Text character set")) {
224 return KEncodingProber::Universal;
225 } else if (lang == tr(sourceText: "Unicode", disambiguation: "@item Text character set")) {
226 return KEncodingProber::Unicode;
227 } else if (lang == tr(sourceText: "Cyrillic", disambiguation: "@item Text character set")) {
228 return KEncodingProber::Cyrillic;
229 } else if (lang == tr(sourceText: "Western European", disambiguation: "@item Text character set")) {
230 return KEncodingProber::WesternEuropean;
231 } else if (lang == tr(sourceText: "Central European", disambiguation: "@item Text character set")) {
232 return KEncodingProber::CentralEuropean;
233 } else if (lang == tr(sourceText: "Greek", disambiguation: "@item Text character set")) {
234 return KEncodingProber::Greek;
235 } else if (lang == tr(sourceText: "Hebrew", disambiguation: "@item Text character set")) {
236 return KEncodingProber::Hebrew;
237 } else if (lang == tr(sourceText: "Turkish", disambiguation: "@item Text character set")) {
238 return KEncodingProber::Turkish;
239 } else if (lang == tr(sourceText: "Japanese", disambiguation: "@item Text character set")) {
240 return KEncodingProber::Japanese;
241 } else if (lang == tr(sourceText: "Baltic", disambiguation: "@item Text character set")) {
242 return KEncodingProber::Baltic;
243 } else if (lang == tr(sourceText: "Chinese Traditional", disambiguation: "@item Text character set")) {
244 return KEncodingProber::ChineseTraditional;
245 } else if (lang == tr(sourceText: "Chinese Simplified", disambiguation: "@item Text character set")) {
246 return KEncodingProber::ChineseSimplified;
247 } else if (lang == tr(sourceText: "Korean", disambiguation: "@item Text character set")) {
248 return KEncodingProber::Korean;
249 } else if (lang == tr(sourceText: "Thai", disambiguation: "@item Text character set")) {
250 return KEncodingProber::Thai;
251 } else if (lang == tr(sourceText: "Arabic", disambiguation: "@item Text character set")) {
252 return KEncodingProber::Arabic;
253 }
254
255 return KEncodingProber::Universal;
256}
257
258QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType)
259{
260 switch (proberType) {
261 case KEncodingProber::None:
262 return tr(sourceText: "Disabled", disambiguation: "@item Text character set");
263 break;
264 case KEncodingProber::Universal:
265 return tr(sourceText: "Universal", disambiguation: "@item Text character set");
266 break;
267 case KEncodingProber::Arabic:
268 return tr(sourceText: "Arabic", disambiguation: "@item Text character set");
269 break;
270 case KEncodingProber::Baltic:
271 return tr(sourceText: "Baltic", disambiguation: "@item Text character set");
272 break;
273 case KEncodingProber::CentralEuropean:
274 return tr(sourceText: "Central European", disambiguation: "@item Text character set");
275 break;
276 case KEncodingProber::Cyrillic:
277 return tr(sourceText: "Cyrillic", disambiguation: "@item Text character set");
278 break;
279 case KEncodingProber::Greek:
280 return tr(sourceText: "Greek", disambiguation: "@item Text character set");
281 break;
282 case KEncodingProber::Hebrew:
283 return tr(sourceText: "Hebrew", disambiguation: "@item Text character set");
284 break;
285 case KEncodingProber::Japanese:
286 return tr(sourceText: "Japanese", disambiguation: "@item Text character set");
287 break;
288 case KEncodingProber::Turkish:
289 return tr(sourceText: "Turkish", disambiguation: "@item Text character set");
290 break;
291 case KEncodingProber::WesternEuropean:
292 return tr(sourceText: "Western European", disambiguation: "@item Text character set");
293 break;
294 case KEncodingProber::ChineseTraditional:
295 return tr(sourceText: "Chinese Traditional", disambiguation: "@item Text character set");
296 break;
297 case KEncodingProber::ChineseSimplified:
298 return tr(sourceText: "Chinese Simplified", disambiguation: "@item Text character set");
299 break;
300 case KEncodingProber::Korean:
301 return tr(sourceText: "Korean", disambiguation: "@item Text character set");
302 break;
303 case KEncodingProber::Thai:
304 return tr(sourceText: "Thai", disambiguation: "@item Text character set");
305 break;
306 case KEncodingProber::Unicode:
307 return tr(sourceText: "Unicode", disambiguation: "@item Text character set");
308 break;
309 default:
310 return QString();
311 }
312}
313

// source code of kcodecs/src/kencodingprober.cpp