1 | /* -*- C++ -*- |
2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
3 | |
4 | SPDX-License-Identifier: MIT |
5 | */ |
6 | |
7 | #ifndef CharDistribution_h__ |
8 | #define CharDistribution_h__ |
9 | |
10 | #include "kcodecs_export.h" |
11 | |
12 | #include <qcompilerdetection.h> |
13 | |
14 | #define ENOUGH_DATA_THRESHOLD 256 |
15 | |
16 | namespace kencodingprober |
17 | { |
18 | class KCODECS_NO_EXPORT CharDistributionAnalysis |
19 | { |
20 | public: |
21 | CharDistributionAnalysis() |
22 | { |
23 | Reset(); |
24 | } |
25 | virtual ~CharDistributionAnalysis() |
26 | { |
27 | } |
28 | |
29 | // feed a block of data and do distribution analysis |
30 | void HandleData(const char * /* aBuf */, unsigned int /* aLen */) |
31 | { |
32 | } |
33 | |
34 | // Feed a character with known length |
35 | void HandleOneChar(const char *aStr, unsigned int aCharLen) |
36 | { |
37 | int order; |
38 | |
39 | // we only care about 2-bytes character in our distribution analysis |
40 | order = (aCharLen == 2) ? GetOrder(aStr) : -1; |
41 | |
42 | if (order >= 0) { |
43 | mTotalChars++; |
44 | // order is valid |
45 | if ((unsigned int)order < mTableSize) { |
46 | if (512 > mCharToFreqOrder[order]) { |
47 | mFreqChars++; |
48 | } |
49 | } |
50 | } |
51 | } |
52 | |
53 | // return confidence base on existing data |
54 | float GetConfidence(); |
55 | |
56 | // Reset analyser, clear any state |
57 | void Reset(void) |
58 | { |
59 | mDone = false; |
60 | mTotalChars = 0; |
61 | mFreqChars = 0; |
62 | } |
63 | |
64 | // This function is for future extension. Caller can use this function to control |
65 | // analyser's behavior |
66 | void SetOpion() |
67 | { |
68 | } |
69 | |
70 | // It is not necessary to receive all data to draw conclusion. For charset detection, |
71 | // certain amount of data is enough |
72 | bool GotEnoughData() |
73 | { |
74 | return mTotalChars > ENOUGH_DATA_THRESHOLD; |
75 | } |
76 | |
77 | protected: |
78 | // we do not handle character base on its original encoding string, but |
79 | // convert this encoding string to a number, here called order. |
80 | // This allows multiple encodings of a language to share one frequency table |
81 | virtual int GetOrder(const char * /* str */) |
82 | { |
83 | return -1; |
84 | } |
85 | |
86 | // If this flag is set to true, detection is done and conclusion has been made |
87 | bool mDone; |
88 | |
89 | // The number of characters whose frequency order is less than 512 |
90 | unsigned int mFreqChars; |
91 | |
92 | // Total character encountered. |
93 | unsigned int mTotalChars; |
94 | |
95 | // Mapping table to get frequency order from char order (get from GetOrder()) |
96 | const short *mCharToFreqOrder; |
97 | |
98 | // Size of above table |
99 | unsigned int mTableSize; |
100 | |
101 | // This is a constant value varies from language to language, it is used in |
102 | // calculating confidence. See my paper for further detail. |
103 | float mTypicalDistributionRatio; |
104 | }; |
105 | |
106 | class KCODECS_NO_EXPORT EUCKRDistributionAnalysis : public CharDistributionAnalysis |
107 | { |
108 | public: |
109 | EUCKRDistributionAnalysis(); |
110 | |
111 | protected: |
112 | // for euc-KR encoding, we are interested |
113 | // first byte range: 0xb0 -- 0xfe |
114 | // second byte range: 0xa1 -- 0xfe |
115 | // no validation needed here. State machine has done that |
116 | int GetOrder(const char *str) override |
117 | { |
118 | if ((unsigned char)*str >= (unsigned char)0xb0) { |
119 | return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; |
120 | } else { |
121 | return -1; |
122 | } |
123 | } |
124 | }; |
125 | |
126 | class KCODECS_NO_EXPORT GB2312DistributionAnalysis : public CharDistributionAnalysis |
127 | { |
128 | public: |
129 | GB2312DistributionAnalysis(); |
130 | |
131 | protected: |
132 | // for GB2312 encoding, we are interested |
133 | // first byte range: 0xb0 -- 0xfe |
134 | // second byte range: 0xa1 -- 0xfe |
135 | // no validation needed here. State machine has done that |
136 | int GetOrder(const char *str) override |
137 | { |
138 | if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) { |
139 | return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; |
140 | } else { |
141 | return -1; |
142 | } |
143 | } |
144 | }; |
145 | |
146 | class KCODECS_NO_EXPORT Big5DistributionAnalysis : public CharDistributionAnalysis |
147 | { |
148 | public: |
149 | Big5DistributionAnalysis(); |
150 | |
151 | protected: |
152 | // for big5 encoding, we are interested |
153 | // first byte range: 0xa4 -- 0xfe |
154 | // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe |
155 | // no validation needed here. State machine has done that |
156 | int GetOrder(const char *str) override |
157 | { |
158 | if ((unsigned char)*str >= (unsigned char)0xa4) |
159 | if ((unsigned char)str[1] >= (unsigned char)0xa1) { |
160 | return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 + 63; |
161 | } else { |
162 | return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40; |
163 | } |
164 | else { |
165 | return -1; |
166 | } |
167 | } |
168 | }; |
169 | |
170 | class KCODECS_NO_EXPORT SJISDistributionAnalysis : public CharDistributionAnalysis |
171 | { |
172 | public: |
173 | SJISDistributionAnalysis(); |
174 | |
175 | protected: |
176 | // for sjis encoding, we are interested |
177 | // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe |
178 | // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe |
179 | // no validation needed here. State machine has done that |
180 | int GetOrder(const char *str) override |
181 | { |
182 | int order; |
183 | if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) { |
184 | order = 188 * ((unsigned char)str[0] - (unsigned char)0x81); |
185 | } else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) { |
186 | order = 188 * ((unsigned char)str[0] - (unsigned char)0xe0 + 31); |
187 | } else { |
188 | return -1; |
189 | } |
190 | order += (unsigned char)*(str + 1) - 0x40; |
191 | if ((unsigned char)str[1] > (unsigned char)0x7f) { |
192 | order--; |
193 | } |
194 | return order; |
195 | } |
196 | }; |
197 | |
198 | class KCODECS_NO_EXPORT EUCJPDistributionAnalysis : public CharDistributionAnalysis |
199 | { |
200 | public: |
201 | EUCJPDistributionAnalysis(); |
202 | |
203 | protected: |
204 | // for euc-JP encoding, we are interested |
205 | // first byte range: 0xa0 -- 0xfe |
206 | // second byte range: 0xa1 -- 0xfe |
207 | // no validation needed here. State machine has done that |
208 | int GetOrder(const char *str) override |
209 | { |
210 | if ((unsigned char)*str >= (unsigned char)0xa0) { |
211 | return 94 * ((unsigned char)str[0] - (unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1; |
212 | } else { |
213 | return -1; |
214 | } |
215 | } |
216 | }; |
217 | } |
218 | #endif // CharDistribution_h__ |
219 | |