1/* -*- C++ -*-
2 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
3
4 SPDX-License-Identifier: MIT
5*/
6
7#ifndef CharDistribution_h__
8#define CharDistribution_h__
9
10#include "kcodecs_export.h"
11
12#include <qcompilerdetection.h>
13
14#define ENOUGH_DATA_THRESHOLD 256
15
16namespace kencodingprober
17{
18class KCODECS_NO_EXPORT CharDistributionAnalysis
19{
20public:
21 CharDistributionAnalysis()
22 {
23 Reset();
24 }
25 virtual ~CharDistributionAnalysis()
26 {
27 }
28
29 // feed a block of data and do distribution analysis
30 void HandleData(const char * /* aBuf */, unsigned int /* aLen */)
31 {
32 }
33
34 // Feed a character with known length
35 void HandleOneChar(const char *aStr, unsigned int aCharLen)
36 {
37 int order;
38
39 // we only care about 2-bytes character in our distribution analysis
40 order = (aCharLen == 2) ? GetOrder(aStr) : -1;
41
42 if (order >= 0) {
43 mTotalChars++;
44 // order is valid
45 if ((unsigned int)order < mTableSize) {
46 if (512 > mCharToFreqOrder[order]) {
47 mFreqChars++;
48 }
49 }
50 }
51 }
52
53 // return confidence base on existing data
54 float GetConfidence();
55
56 // Reset analyser, clear any state
57 void Reset(void)
58 {
59 mDone = false;
60 mTotalChars = 0;
61 mFreqChars = 0;
62 }
63
64 // This function is for future extension. Caller can use this function to control
65 // analyser's behavior
66 void SetOpion()
67 {
68 }
69
70 // It is not necessary to receive all data to draw conclusion. For charset detection,
71 // certain amount of data is enough
72 bool GotEnoughData()
73 {
74 return mTotalChars > ENOUGH_DATA_THRESHOLD;
75 }
76
77protected:
78 // we do not handle character base on its original encoding string, but
79 // convert this encoding string to a number, here called order.
80 // This allows multiple encodings of a language to share one frequency table
81 virtual int GetOrder(const char * /* str */)
82 {
83 return -1;
84 }
85
86 // If this flag is set to true, detection is done and conclusion has been made
87 bool mDone;
88
89 // The number of characters whose frequency order is less than 512
90 unsigned int mFreqChars;
91
92 // Total character encountered.
93 unsigned int mTotalChars;
94
95 // Mapping table to get frequency order from char order (get from GetOrder())
96 const short *mCharToFreqOrder;
97
98 // Size of above table
99 unsigned int mTableSize;
100
101 // This is a constant value varies from language to language, it is used in
102 // calculating confidence. See my paper for further detail.
103 float mTypicalDistributionRatio;
104};
105
106class KCODECS_NO_EXPORT EUCKRDistributionAnalysis : public CharDistributionAnalysis
107{
108public:
109 EUCKRDistributionAnalysis();
110
111protected:
112 // for euc-KR encoding, we are interested
113 // first byte range: 0xb0 -- 0xfe
114 // second byte range: 0xa1 -- 0xfe
115 // no validation needed here. State machine has done that
116 int GetOrder(const char *str) override
117 {
118 if ((unsigned char)*str >= (unsigned char)0xb0) {
119 return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
120 } else {
121 return -1;
122 }
123 }
124};
125
126class KCODECS_NO_EXPORT GB2312DistributionAnalysis : public CharDistributionAnalysis
127{
128public:
129 GB2312DistributionAnalysis();
130
131protected:
132 // for GB2312 encoding, we are interested
133 // first byte range: 0xb0 -- 0xfe
134 // second byte range: 0xa1 -- 0xfe
135 // no validation needed here. State machine has done that
136 int GetOrder(const char *str) override
137 {
138 if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) {
139 return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
140 } else {
141 return -1;
142 }
143 }
144};
145
146class KCODECS_NO_EXPORT Big5DistributionAnalysis : public CharDistributionAnalysis
147{
148public:
149 Big5DistributionAnalysis();
150
151protected:
152 // for big5 encoding, we are interested
153 // first byte range: 0xa4 -- 0xfe
154 // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
155 // no validation needed here. State machine has done that
156 int GetOrder(const char *str) override
157 {
158 if ((unsigned char)*str >= (unsigned char)0xa4)
159 if ((unsigned char)str[1] >= (unsigned char)0xa1) {
160 return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 + 63;
161 } else {
162 return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
163 }
164 else {
165 return -1;
166 }
167 }
168};
169
170class KCODECS_NO_EXPORT SJISDistributionAnalysis : public CharDistributionAnalysis
171{
172public:
173 SJISDistributionAnalysis();
174
175protected:
176 // for sjis encoding, we are interested
177 // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
178 // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
179 // no validation needed here. State machine has done that
180 int GetOrder(const char *str) override
181 {
182 int order;
183 if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) {
184 order = 188 * ((unsigned char)str[0] - (unsigned char)0x81);
185 } else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) {
186 order = 188 * ((unsigned char)str[0] - (unsigned char)0xe0 + 31);
187 } else {
188 return -1;
189 }
190 order += (unsigned char)*(str + 1) - 0x40;
191 if ((unsigned char)str[1] > (unsigned char)0x7f) {
192 order--;
193 }
194 return order;
195 }
196};
197
198class KCODECS_NO_EXPORT EUCJPDistributionAnalysis : public CharDistributionAnalysis
199{
200public:
201 EUCJPDistributionAnalysis();
202
203protected:
204 // for euc-JP encoding, we are interested
205 // first byte range: 0xa0 -- 0xfe
206 // second byte range: 0xa1 -- 0xfe
207 // no validation needed here. State machine has done that
208 int GetOrder(const char *str) override
209 {
210 if ((unsigned char)*str >= (unsigned char)0xa0) {
211 return 94 * ((unsigned char)str[0] - (unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
212 } else {
213 return -1;
214 }
215 }
216};
217}
218#endif // CharDistribution_h__
219

// Source: kcodecs/src/probers/CharDistribution.h