1/* -*- C++ -*-
2 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
3
4 SPDX-License-Identifier: MIT
5*/
6
7#ifndef CharDistribution_h__
8#define CharDistribution_h__
9
10#include "kcodecs_export.h"
11
12#include <qcompilerdetection.h>
13
14#define ENOUGH_DATA_THRESHOLD 256
15
16namespace kencodingprober
17{
18class KCODECS_NO_EXPORT CharDistributionAnalysis
19{
20public:
21 CharDistributionAnalysis()
22 {
23 Reset();
24 }
25 virtual ~CharDistributionAnalysis()
26 {
27 }
28
29 // feed a block of data and do distribution analysis
30 void HandleData(const char * /* aBuf */, unsigned int /* aLen */)
31 {
32 }
33
34 // Feed a character with known length
35 void HandleOneChar(const char *aStr, unsigned int aCharLen)
36 {
37 int order;
38
39 // we only care about 2-bytes character in our distribution analysis
40 order = (aCharLen == 2) ? GetOrder(aStr) : -1;
41
42 if (order >= 0) {
43 mTotalChars++;
44 // order is valid
45 if ((unsigned int)order < mTableSize) {
46 if (512 > mCharToFreqOrder[order]) {
47 mFreqChars++;
48 }
49 }
50 }
51 }
52
53 // return confidence base on existing data
54 float GetConfidence();
55
56 // Reset analyser, clear any state
57 void Reset(void)
58 {
59 mDone = false;
60 mTotalChars = 0;
61 mFreqChars = 0;
62 }
63
64 // It is not necessary to receive all data to draw conclusion. For charset detection,
65 // certain amount of data is enough
66 bool GotEnoughData()
67 {
68 return mTotalChars > ENOUGH_DATA_THRESHOLD;
69 }
70
71protected:
72 // we do not handle character base on its original encoding string, but
73 // convert this encoding string to a number, here called order.
74 // This allows multiple encodings of a language to share one frequency table
75 virtual int GetOrder(const char * /* str */)
76 {
77 return -1;
78 }
79
80 // If this flag is set to true, detection is done and conclusion has been made
81 bool mDone;
82
83 // The number of characters whose frequency order is less than 512
84 unsigned int mFreqChars;
85
86 // Total character encountered.
87 unsigned int mTotalChars;
88
89 // Mapping table to get frequency order from char order (get from GetOrder())
90 const short *mCharToFreqOrder;
91
92 // Size of above table
93 unsigned int mTableSize;
94
95 // This is a constant value varies from language to language, it is used in
96 // calculating confidence. See my paper for further detail.
97 float mTypicalDistributionRatio;
98};
99
100class KCODECS_NO_EXPORT EUCKRDistributionAnalysis : public CharDistributionAnalysis
101{
102public:
103 EUCKRDistributionAnalysis();
104
105protected:
106 // for euc-KR encoding, we are interested
107 // first byte range: 0xb0 -- 0xfe
108 // second byte range: 0xa1 -- 0xfe
109 // no validation needed here. State machine has done that
110 int GetOrder(const char *str) override
111 {
112 if ((unsigned char)*str >= (unsigned char)0xb0) {
113 return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
114 } else {
115 return -1;
116 }
117 }
118};
119
120class KCODECS_NO_EXPORT GB2312DistributionAnalysis : public CharDistributionAnalysis
121{
122public:
123 GB2312DistributionAnalysis();
124
125protected:
126 // for GB2312 encoding, we are interested
127 // first byte range: 0xb0 -- 0xfe
128 // second byte range: 0xa1 -- 0xfe
129 // no validation needed here. State machine has done that
130 int GetOrder(const char *str) override
131 {
132 if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) {
133 return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
134 } else {
135 return -1;
136 }
137 }
138};
139
140class KCODECS_NO_EXPORT Big5DistributionAnalysis : public CharDistributionAnalysis
141{
142public:
143 Big5DistributionAnalysis();
144
145protected:
146 // for big5 encoding, we are interested
147 // first byte range: 0xa4 -- 0xfe
148 // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
149 // no validation needed here. State machine has done that
150 int GetOrder(const char *str) override
151 {
152 if ((unsigned char)*str >= (unsigned char)0xa4)
153 if ((unsigned char)str[1] >= (unsigned char)0xa1) {
154 return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 + 63;
155 } else {
156 return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
157 }
158 else {
159 return -1;
160 }
161 }
162};
163
164class KCODECS_NO_EXPORT SJISDistributionAnalysis : public CharDistributionAnalysis
165{
166public:
167 SJISDistributionAnalysis();
168
169protected:
170 // for sjis encoding, we are interested
171 // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
172 // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
173 // no validation needed here. State machine has done that
174 int GetOrder(const char *str) override
175 {
176 int order;
177 if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) {
178 order = 188 * ((unsigned char)str[0] - (unsigned char)0x81);
179 } else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) {
180 order = 188 * ((unsigned char)str[0] - (unsigned char)0xe0 + 31);
181 } else {
182 return -1;
183 }
184 order += (unsigned char)*(str + 1) - 0x40;
185 if ((unsigned char)str[1] > (unsigned char)0x7f) {
186 order--;
187 }
188 return order;
189 }
190};
191
192class KCODECS_NO_EXPORT EUCJPDistributionAnalysis : public CharDistributionAnalysis
193{
194public:
195 EUCJPDistributionAnalysis();
196
197protected:
198 // for euc-JP encoding, we are interested
199 // first byte range: 0xa0 -- 0xfe
200 // second byte range: 0xa1 -- 0xfe
201 // no validation needed here. State machine has done that
202 int GetOrder(const char *str) override
203 {
204 if ((unsigned char)*str >= (unsigned char)0xa0) {
205 return 94 * ((unsigned char)str[0] - (unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
206 } else {
207 return -1;
208 }
209 }
210};
211}
212#endif // CharDistribution_h__
213

// source code of kcodecs/src/probers/CharDistribution.h