1 | /* |
2 | This file is part of the KDE libraries |
3 | |
4 | SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com> |
5 | |
6 | SPDX-License-Identifier: LGPL-2.0-or-later |
7 | */ |
8 | |
9 | #include "kencodingprober.h" |
10 | |
11 | #include "probers/ChineseGroupProber.h" |
12 | #include "probers/JapaneseGroupProber.h" |
13 | #include "probers/UnicodeGroupProber.h" |
14 | #include "probers/nsCharSetProber.h" |
15 | #include "probers/nsMBCSGroupProber.h" |
16 | #include "probers/nsSBCSGroupProber.h" |
17 | #include "probers/nsUniversalDetector.h" |
18 | |
19 | #include <string.h> |
20 | |
21 | class KEncodingProberPrivate |
22 | { |
23 | public: |
24 | KEncodingProberPrivate() |
25 | : mProber(nullptr) |
26 | , mStart(true) |
27 | { |
28 | } |
29 | ~KEncodingProberPrivate() |
30 | { |
31 | delete mProber; |
32 | } |
33 | void setProberType(KEncodingProber::ProberType pType) |
34 | { |
35 | mProberType = pType; |
36 | /* handle multi-byte encodings carefully , because they're hard to detect, |
37 | * and have to use some Stastics methods. |
38 | * for single-byte encodings (most western encodings), nsSBCSGroupProber is ok, |
39 | * because encoding state machine can detect many such encodings. |
40 | */ |
41 | |
42 | delete mProber; |
43 | |
44 | switch (mProberType) { |
45 | case KEncodingProber::None: |
46 | mProber = nullptr; |
47 | break; |
48 | case KEncodingProber::Arabic: |
49 | case KEncodingProber::Baltic: |
50 | case KEncodingProber::CentralEuropean: |
51 | case KEncodingProber::Cyrillic: |
52 | case KEncodingProber::Greek: |
53 | case KEncodingProber::Hebrew: |
54 | case KEncodingProber::NorthernSaami: |
55 | case KEncodingProber::Other: |
56 | case KEncodingProber::SouthEasternEurope: |
57 | case KEncodingProber::Thai: |
58 | case KEncodingProber::Turkish: |
59 | case KEncodingProber::WesternEuropean: |
60 | mProber = new kencodingprober::nsSBCSGroupProber(); |
61 | break; |
62 | case KEncodingProber::ChineseSimplified: |
63 | case KEncodingProber::ChineseTraditional: |
64 | mProber = new kencodingprober::ChineseGroupProber(); |
65 | break; |
66 | case KEncodingProber::Japanese: |
67 | mProber = new kencodingprober::JapaneseGroupProber(); |
68 | break; |
69 | case KEncodingProber::Korean: |
70 | mProber = new kencodingprober::nsMBCSGroupProber(); |
71 | break; |
72 | case KEncodingProber::Unicode: |
73 | mProber = new kencodingprober::UnicodeGroupProber(); |
74 | break; |
75 | case KEncodingProber::Universal: |
76 | mProber = new kencodingprober::nsUniversalDetector(); |
77 | break; |
78 | default: |
79 | mProber = nullptr; |
80 | } |
81 | } |
82 | void unicodeTest(const char *aBuf, int aLen) |
83 | { |
84 | if (mStart) { |
85 | mStart = false; |
86 | if (aLen > 3) { |
87 | switch (aBuf[0]) { |
88 | case '\xEF': |
89 | if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) |
90 | // EF BB BF UTF-8 encoded BOM |
91 | { |
92 | mProberState = KEncodingProber::FoundIt; |
93 | } |
94 | break; |
95 | case '\xFE': |
96 | if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) |
97 | // FE FF 00 00 UCS-4, unusual octet order BOM (3412) |
98 | { |
99 | mProberState = KEncodingProber::FoundIt; |
100 | } else if ('\xFF' == aBuf[1]) |
101 | // FE FF UTF-16, big endian BOM |
102 | { |
103 | mProberState = KEncodingProber::FoundIt; |
104 | } |
105 | break; |
106 | case '\x00': |
107 | if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3])) |
108 | // 00 00 FE FF UTF-32, big-endian BOM |
109 | { |
110 | mProberState = KEncodingProber::FoundIt; |
111 | } else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3])) |
112 | // 00 00 FF FE UCS-4, unusual octet order BOM (2143) |
113 | { |
114 | mProberState = KEncodingProber::FoundIt; |
115 | } |
116 | break; |
117 | case '\xFF': |
118 | if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) |
119 | // FF FE 00 00 UTF-32, little-endian BOM |
120 | { |
121 | mProberState = KEncodingProber::FoundIt; |
122 | } else if ('\xFE' == aBuf[1]) |
123 | // FF FE UTF-16, little endian BOM |
124 | { |
125 | mProberState = KEncodingProber::FoundIt; |
126 | } |
127 | break; |
128 | } // switch |
129 | } |
130 | } |
131 | } |
132 | KEncodingProber::ProberType mProberType; |
133 | KEncodingProber::ProberState mProberState; |
134 | kencodingprober::nsCharSetProber *mProber; |
135 | bool mStart; |
136 | }; |
137 | |
138 | KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType) |
139 | : d(new KEncodingProberPrivate()) |
140 | { |
141 | setProberType(proberType); |
142 | } |
143 | |
144 | KEncodingProber::~KEncodingProber() = default; |
145 | |
146 | void KEncodingProber::reset() |
147 | { |
148 | d->mProberState = KEncodingProber::Probing; |
149 | d->mStart = true; |
150 | } |
151 | |
152 | KEncodingProber::ProberState KEncodingProber::feed(QByteArrayView data) |
153 | { |
154 | if (!d->mProber) { |
155 | return d->mProberState; |
156 | } |
157 | if (d->mProberState == Probing) { |
158 | if (d->mStart) { |
159 | d->unicodeTest(aBuf: data.constData(), aLen: data.size()); |
160 | if (d->mProberState == FoundIt) { |
161 | return d->mProberState; |
162 | } |
163 | } |
164 | d->mProber->HandleData(aBuf: data.constData(), aLen: data.size()); |
165 | switch (d->mProber->GetState()) { |
166 | case kencodingprober::eNotMe: |
167 | d->mProberState = NotMe; |
168 | break; |
169 | case kencodingprober::eFoundIt: |
170 | d->mProberState = FoundIt; |
171 | break; |
172 | default: |
173 | d->mProberState = Probing; |
174 | break; |
175 | } |
176 | } |
177 | #ifdef DEBUG_PROBE |
178 | d->mProber->DumpStatus(); |
179 | #endif |
180 | return d->mProberState; |
181 | } |
182 | |
183 | KEncodingProber::ProberState KEncodingProber::state() const |
184 | { |
185 | return d->mProberState; |
186 | } |
187 | |
188 | QByteArray KEncodingProber::encoding() const |
189 | { |
190 | if (!d->mProber) { |
191 | return QByteArray("UTF-8" ); |
192 | } |
193 | |
194 | return QByteArray(d->mProber->GetCharSetName()); |
195 | } |
196 | |
197 | float KEncodingProber::confidence() const |
198 | { |
199 | if (!d->mProber) { |
200 | return 0.0; |
201 | } |
202 | |
203 | return d->mProber->GetConfidence(); |
204 | } |
205 | |
206 | KEncodingProber::ProberType KEncodingProber::proberType() const |
207 | { |
208 | return d->mProberType; |
209 | } |
210 | |
211 | void KEncodingProber::setProberType(KEncodingProber::ProberType proberType) |
212 | { |
213 | d->setProberType(proberType); |
214 | reset(); |
215 | } |
216 | |
217 | KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString &lang) |
218 | { |
219 | if (lang.isEmpty()) { |
220 | return KEncodingProber::Universal; |
221 | } else if (lang == tr(sourceText: "Disabled" , disambiguation: "@item Text character set" )) { |
222 | return KEncodingProber::None; |
223 | } else if (lang == tr(sourceText: "Universal" , disambiguation: "@item Text character set" )) { |
224 | return KEncodingProber::Universal; |
225 | } else if (lang == tr(sourceText: "Unicode" , disambiguation: "@item Text character set" )) { |
226 | return KEncodingProber::Unicode; |
227 | } else if (lang == tr(sourceText: "Cyrillic" , disambiguation: "@item Text character set" )) { |
228 | return KEncodingProber::Cyrillic; |
229 | } else if (lang == tr(sourceText: "Western European" , disambiguation: "@item Text character set" )) { |
230 | return KEncodingProber::WesternEuropean; |
231 | } else if (lang == tr(sourceText: "Central European" , disambiguation: "@item Text character set" )) { |
232 | return KEncodingProber::CentralEuropean; |
233 | } else if (lang == tr(sourceText: "Greek" , disambiguation: "@item Text character set" )) { |
234 | return KEncodingProber::Greek; |
235 | } else if (lang == tr(sourceText: "Hebrew" , disambiguation: "@item Text character set" )) { |
236 | return KEncodingProber::Hebrew; |
237 | } else if (lang == tr(sourceText: "Turkish" , disambiguation: "@item Text character set" )) { |
238 | return KEncodingProber::Turkish; |
239 | } else if (lang == tr(sourceText: "Japanese" , disambiguation: "@item Text character set" )) { |
240 | return KEncodingProber::Japanese; |
241 | } else if (lang == tr(sourceText: "Baltic" , disambiguation: "@item Text character set" )) { |
242 | return KEncodingProber::Baltic; |
243 | } else if (lang == tr(sourceText: "Chinese Traditional" , disambiguation: "@item Text character set" )) { |
244 | return KEncodingProber::ChineseTraditional; |
245 | } else if (lang == tr(sourceText: "Chinese Simplified" , disambiguation: "@item Text character set" )) { |
246 | return KEncodingProber::ChineseSimplified; |
247 | } else if (lang == tr(sourceText: "Korean" , disambiguation: "@item Text character set" )) { |
248 | return KEncodingProber::Korean; |
249 | } else if (lang == tr(sourceText: "Thai" , disambiguation: "@item Text character set" )) { |
250 | return KEncodingProber::Thai; |
251 | } else if (lang == tr(sourceText: "Arabic" , disambiguation: "@item Text character set" )) { |
252 | return KEncodingProber::Arabic; |
253 | } |
254 | |
255 | return KEncodingProber::Universal; |
256 | } |
257 | |
258 | QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType) |
259 | { |
260 | switch (proberType) { |
261 | case KEncodingProber::None: |
262 | return tr(sourceText: "Disabled" , disambiguation: "@item Text character set" ); |
263 | break; |
264 | case KEncodingProber::Universal: |
265 | return tr(sourceText: "Universal" , disambiguation: "@item Text character set" ); |
266 | break; |
267 | case KEncodingProber::Arabic: |
268 | return tr(sourceText: "Arabic" , disambiguation: "@item Text character set" ); |
269 | break; |
270 | case KEncodingProber::Baltic: |
271 | return tr(sourceText: "Baltic" , disambiguation: "@item Text character set" ); |
272 | break; |
273 | case KEncodingProber::CentralEuropean: |
274 | return tr(sourceText: "Central European" , disambiguation: "@item Text character set" ); |
275 | break; |
276 | case KEncodingProber::Cyrillic: |
277 | return tr(sourceText: "Cyrillic" , disambiguation: "@item Text character set" ); |
278 | break; |
279 | case KEncodingProber::Greek: |
280 | return tr(sourceText: "Greek" , disambiguation: "@item Text character set" ); |
281 | break; |
282 | case KEncodingProber::Hebrew: |
283 | return tr(sourceText: "Hebrew" , disambiguation: "@item Text character set" ); |
284 | break; |
285 | case KEncodingProber::Japanese: |
286 | return tr(sourceText: "Japanese" , disambiguation: "@item Text character set" ); |
287 | break; |
288 | case KEncodingProber::Turkish: |
289 | return tr(sourceText: "Turkish" , disambiguation: "@item Text character set" ); |
290 | break; |
291 | case KEncodingProber::WesternEuropean: |
292 | return tr(sourceText: "Western European" , disambiguation: "@item Text character set" ); |
293 | break; |
294 | case KEncodingProber::ChineseTraditional: |
295 | return tr(sourceText: "Chinese Traditional" , disambiguation: "@item Text character set" ); |
296 | break; |
297 | case KEncodingProber::ChineseSimplified: |
298 | return tr(sourceText: "Chinese Simplified" , disambiguation: "@item Text character set" ); |
299 | break; |
300 | case KEncodingProber::Korean: |
301 | return tr(sourceText: "Korean" , disambiguation: "@item Text character set" ); |
302 | break; |
303 | case KEncodingProber::Thai: |
304 | return tr(sourceText: "Thai" , disambiguation: "@item Text character set" ); |
305 | break; |
306 | case KEncodingProber::Unicode: |
307 | return tr(sourceText: "Unicode" , disambiguation: "@item Text character set" ); |
308 | break; |
309 | default: |
310 | return QString(); |
311 | } |
312 | } |
313 | |