1/*
2 This file is part of the KDE libraries
3
4 SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com>
5
6 SPDX-License-Identifier: LGPL-2.0-or-later
7*/
8
#include "kencodingprober.h"
#include "kencodingprober_p.h"

#include "probers/UnicodeGroupProber.h"
#include "probers/nsCharSetProber.h"
#include "probers/nsMBCSGroupProber.h"
#include "probers/nsSBCSGroupProber.h"
#include "probers/nsUniversalDetector.h"

#include <algorithm>
#include <array>
#include <memory>
#include <span>
#include <string.h>
20
21namespace
22{
23using Prober = kencodingprober::nsMBCSGroupProber::Prober;
24static const std::array ChineseMSBCProbers{
25 Prober::Unicode,
26 Prober::GB18030,
27 Prober::Big5,
28};
29static const std::array JapaneseMSBCProbers{
30 Prober::Unicode,
31 Prober::SJIS,
32 Prober::EUCJP,
33};
34constexpr const char *checkBom(std::span<const char, 4> buf)
35{
36 switch (buf[0]) {
37 case '\xEF':
38 if (('\xBB' == buf[1]) && ('\xBF' == buf[2])) {
39 // EF BB BF UTF-8 encoded BOM
40 return "UTF-8";
41 }
42 break;
43 case '\xFE':
44 if ('\xFF' == buf[1]) {
45 // FE FF UTF-16, big endian BOM
46 return "UTF-16BE";
47 }
48 break;
49 case '\x00':
50 if (('\x00' == buf[1]) && ('\xFE' == buf[2]) && ('\xFF' == buf[3])) {
51 // 00 00 FE FF UTF-32, big-endian BOM
52 return "UTF-32BE";
53 }
54 break;
55 case '\xFF':
56 if (('\xFE' == buf[1]) && ('\x00' == buf[2]) && ('\x00' == buf[3])) {
57 // FF FE 00 00 UTF-32, little-endian BOM
58 return "UTF-32LE";
59 } else if ('\xFE' == buf[1]) {
60 // FF FE UTF-16, little endian BOM
61 return "UTF-16LE";
62 }
63 break;
64 } // switch
65 return "";
66}
67static_assert(checkBom(buf: std::array{'\0', '\0', '\0', '\0'})[0] == '\0');
68static_assert(checkBom(buf: std::array{'\xEF', '\xBB', '\xBF', '\0'})[4] == '8'); // UTF-8
69static_assert(checkBom(buf: std::array{'\xFF', '\xFE', ' ', ' '})[4] == '1'); // UTF-16LE
70static_assert(checkBom(buf: std::array{'\xFF', '\xFE', '\0', '\0'})[4] == '3'); // UTF-32LE
71} // namespace <anonymous>
72
73class KEncodingProberPrivate
74{
75public:
76 KEncodingProberPrivate()
77 : mProber(nullptr)
78 {
79 }
80 ~KEncodingProberPrivate()
81 {
82 delete mProber;
83 }
84 void setProberType(KEncodingProber::ProberType pType)
85 {
86 mProberType = pType;
87 /* handle multi-byte encodings carefully , because they're hard to detect,
88 * and have to use some Stastics methods.
89 * for single-byte encodings (most western encodings), nsSBCSGroupProber is ok,
90 * because encoding state machine can detect many such encodings.
91 */
92
93 delete mProber;
94
95 switch (mProberType) {
96 case KEncodingProber::None:
97 mProber = nullptr;
98 break;
99 case KEncodingProber::Arabic:
100 case KEncodingProber::Baltic:
101 case KEncodingProber::CentralEuropean:
102 case KEncodingProber::Cyrillic:
103 case KEncodingProber::Greek:
104 case KEncodingProber::Hebrew:
105 case KEncodingProber::NorthernSaami:
106 case KEncodingProber::Other:
107 case KEncodingProber::SouthEasternEurope:
108 case KEncodingProber::Thai:
109 case KEncodingProber::Turkish:
110 case KEncodingProber::WesternEuropean:
111 mProber = new kencodingprober::nsSBCSGroupProber();
112 break;
113 case KEncodingProber::ChineseSimplified:
114 case KEncodingProber::ChineseTraditional:
115 mProber = new kencodingprober::nsMBCSGroupProber(ChineseMSBCProbers);
116 break;
117 case KEncodingProber::Japanese:
118 mProber = new kencodingprober::nsMBCSGroupProber(JapaneseMSBCProbers);
119 break;
120 case KEncodingProber::Korean:
121 mProber = new kencodingprober::nsMBCSGroupProber();
122 break;
123 case KEncodingProber::Unicode:
124 mProber = new kencodingprober::UnicodeGroupProber();
125 break;
126 case KEncodingProber::Universal:
127 mProber = new kencodingprober::nsUniversalDetector();
128 break;
129 default:
130 mProber = nullptr;
131 }
132 }
133 KEncodingProber::ProberType mProberType;
134 KEncodingProber::ProberState mProberState;
135 kencodingprober::nsCharSetProber *mProber;
136 char mBom[4] = {0};
137 unsigned int mBomLen = 0;
138};
139
140KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType)
141 : d(new KEncodingProberPrivate())
142{
143 setProberType(proberType);
144}
145
146KEncodingProber::~KEncodingProber() = default;
147
148void KEncodingProber::reset()
149{
150 d->mProberState = KEncodingProber::Probing;
151 d->mBomLen = 0;
152}
153
154KEncodingProber::ProberState KEncodingProber::feed(QByteArrayView data)
155{
156 if (!d->mProber) {
157 return d->mProberState;
158 }
159 if (d->mProberState == Probing) {
160 if (d->mBomLen < 4) {
161 auto remainder = std::min<unsigned int>(a: (4 - d->mBomLen), b: data.size());
162 memcpy(dest: &d->mBom[d->mBomLen], src: data.constData(), n: remainder);
163 d->mBomLen += remainder;
164 if ((d->mBomLen == 4) && (checkBom(buf: d->mBom)[0] != '\0')) {
165 d->mProberState = FoundIt;
166 return d->mProberState;
167 }
168 }
169 d->mProber->HandleData(aBuf: data.constData(), aLen: data.size());
170 switch (d->mProber->GetState()) {
171 case kencodingprober::eNotMe:
172 d->mProberState = NotMe;
173 break;
174 case kencodingprober::eFoundIt:
175 d->mBomLen = 0;
176 d->mProberState = FoundIt;
177 break;
178 default:
179 d->mProberState = Probing;
180 break;
181 }
182 }
183#ifdef DEBUG_PROBE
184 d->mProber->DumpStatus();
185#endif
186 return d->mProberState;
187}
188
189KEncodingProber::ProberState KEncodingProber::state() const
190{
191 return d->mProberState;
192}
193
194QByteArray KEncodingProber::encoding() const
195{
196 if (!d->mProber) {
197 return QByteArray("UTF-8");
198 }
199 if ((d->mProberState == FoundIt) && (d->mBomLen == 4)) {
200 if (auto bomResult = checkBom(buf: d->mBom); bomResult[0] != '\0') {
201 return {bomResult};
202 }
203 }
204
205 return QByteArray(d->mProber->GetCharSetName());
206}
207
208float KEncodingProber::confidence() const
209{
210 if (!d->mProber) {
211 return 0.0;
212 }
213 if ((d->mProberState == FoundIt) && (d->mBomLen == 4)) {
214 if (auto bomResult = checkBom(buf: d->mBom); bomResult[0] != '\0') {
215 return 0.99;
216 }
217 }
218
219 return d->mProber->GetConfidence();
220}
221
222KEncodingProber::ProberType KEncodingProber::proberType() const
223{
224 return d->mProberType;
225}
226
227void KEncodingProber::setProberType(KEncodingProber::ProberType proberType)
228{
229 d->setProberType(proberType);
230 reset();
231}
232
233KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString &lang)
234{
235 if (lang.isEmpty()) {
236 return KEncodingProber::Universal;
237 } else if (lang == tr(sourceText: "Disabled", disambiguation: "@item Text character set")) {
238 return KEncodingProber::None;
239 } else if (lang == tr(sourceText: "Universal", disambiguation: "@item Text character set")) {
240 return KEncodingProber::Universal;
241 } else if (lang == tr(sourceText: "Unicode", disambiguation: "@item Text character set")) {
242 return KEncodingProber::Unicode;
243 } else if (lang == tr(sourceText: "Cyrillic", disambiguation: "@item Text character set")) {
244 return KEncodingProber::Cyrillic;
245 } else if (lang == tr(sourceText: "Western European", disambiguation: "@item Text character set")) {
246 return KEncodingProber::WesternEuropean;
247 } else if (lang == tr(sourceText: "Central European", disambiguation: "@item Text character set")) {
248 return KEncodingProber::CentralEuropean;
249 } else if (lang == tr(sourceText: "Greek", disambiguation: "@item Text character set")) {
250 return KEncodingProber::Greek;
251 } else if (lang == tr(sourceText: "Hebrew", disambiguation: "@item Text character set")) {
252 return KEncodingProber::Hebrew;
253 } else if (lang == tr(sourceText: "Turkish", disambiguation: "@item Text character set")) {
254 return KEncodingProber::Turkish;
255 } else if (lang == tr(sourceText: "Japanese", disambiguation: "@item Text character set")) {
256 return KEncodingProber::Japanese;
257 } else if (lang == tr(sourceText: "Baltic", disambiguation: "@item Text character set")) {
258 return KEncodingProber::Baltic;
259 } else if (lang == tr(sourceText: "Chinese Traditional", disambiguation: "@item Text character set")) {
260 return KEncodingProber::ChineseTraditional;
261 } else if (lang == tr(sourceText: "Chinese Simplified", disambiguation: "@item Text character set")) {
262 return KEncodingProber::ChineseSimplified;
263 } else if (lang == tr(sourceText: "Korean", disambiguation: "@item Text character set")) {
264 return KEncodingProber::Korean;
265 } else if (lang == tr(sourceText: "Thai", disambiguation: "@item Text character set")) {
266 return KEncodingProber::Thai;
267 } else if (lang == tr(sourceText: "Arabic", disambiguation: "@item Text character set")) {
268 return KEncodingProber::Arabic;
269 }
270
271 return KEncodingProber::Universal;
272}
273
274QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType)
275{
276 switch (proberType) {
277 case KEncodingProber::None:
278 return tr(sourceText: "Disabled", disambiguation: "@item Text character set");
279 break;
280 case KEncodingProber::Universal:
281 return tr(sourceText: "Universal", disambiguation: "@item Text character set");
282 break;
283 case KEncodingProber::Arabic:
284 return tr(sourceText: "Arabic", disambiguation: "@item Text character set");
285 break;
286 case KEncodingProber::Baltic:
287 return tr(sourceText: "Baltic", disambiguation: "@item Text character set");
288 break;
289 case KEncodingProber::CentralEuropean:
290 return tr(sourceText: "Central European", disambiguation: "@item Text character set");
291 break;
292 case KEncodingProber::Cyrillic:
293 return tr(sourceText: "Cyrillic", disambiguation: "@item Text character set");
294 break;
295 case KEncodingProber::Greek:
296 return tr(sourceText: "Greek", disambiguation: "@item Text character set");
297 break;
298 case KEncodingProber::Hebrew:
299 return tr(sourceText: "Hebrew", disambiguation: "@item Text character set");
300 break;
301 case KEncodingProber::Japanese:
302 return tr(sourceText: "Japanese", disambiguation: "@item Text character set");
303 break;
304 case KEncodingProber::Turkish:
305 return tr(sourceText: "Turkish", disambiguation: "@item Text character set");
306 break;
307 case KEncodingProber::WesternEuropean:
308 return tr(sourceText: "Western European", disambiguation: "@item Text character set");
309 break;
310 case KEncodingProber::ChineseTraditional:
311 return tr(sourceText: "Chinese Traditional", disambiguation: "@item Text character set");
312 break;
313 case KEncodingProber::ChineseSimplified:
314 return tr(sourceText: "Chinese Simplified", disambiguation: "@item Text character set");
315 break;
316 case KEncodingProber::Korean:
317 return tr(sourceText: "Korean", disambiguation: "@item Text character set");
318 break;
319 case KEncodingProber::Thai:
320 return tr(sourceText: "Thai", disambiguation: "@item Text character set");
321 break;
322 case KEncodingProber::Unicode:
323 return tr(sourceText: "Unicode", disambiguation: "@item Text character set");
324 break;
325 default:
326 return QString();
327 }
328}
329

// Source: kcodecs/src/kencodingprober.cpp