CharDistribution.h source code [kcodecs/src/probers/CharDistribution.h]

/*  -*- C++ -*-
    SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>

    SPDX-License-Identifier: MIT
*/
6
7	#ifndef CharDistribution_h__
8	#define CharDistribution_h__
9
10	#include "kcodecs_export.h"
11
12	#include <qcompilerdetection.h>
13
14	#define ENOUGH_DATA_THRESHOLD 256
15
16	namespace kencodingprober
17	{
18	class KCODECS_NO_EXPORT CharDistributionAnalysis
19	{
20	public:
21	CharDistributionAnalysis()
22	{
23	Reset();
24	}
25	virtual ~CharDistributionAnalysis()
26	{
27	}
28
29	// feed a block of data and do distribution analysis
30	void HandleData(const char * / aBuf /, unsigned int / aLen /)
31	{
32	}
33
34	// Feed a character with known length
35	void HandleOneChar(const char aStr, unsigned* int aCharLen)
36	{
37	int order;
38
39	// we only care about 2-bytes character in our distribution analysis
40	order = (aCharLen == `2`) ? GetOrder(aStr) : -`1`;
41
42	if (order >= `0`) {
43	mTotalChars++;
44	// order is valid
45	if ((unsigned int)order < mTableSize) {
46	if (`512` > mCharToFreqOrder[order]) {
47	mFreqChars++;
48	}
49	}
50	}
51	}
52
53	// return confidence base on existing data
54	float GetConfidence();
55
56	// Reset analyser, clear any state
57	void Reset(void)
58	{
59	mDone = false;
60	mTotalChars = `0`;
61	mFreqChars = `0`;
62	}
63
64	// It is not necessary to receive all data to draw conclusion. For charset detection,
65	// certain amount of data is enough
66	bool GotEnoughData()
67	{
68	return mTotalChars > ENOUGH_DATA_THRESHOLD;
69	}
70
71	protected:
72	// we do not handle character base on its original encoding string, but
73	// convert this encoding string to a number, here called order.
74	// This allows multiple encodings of a language to share one frequency table
75	virtual int GetOrder(const char * / str /)
76	{
77	return -`1`;
78	}
79
80	// If this flag is set to true, detection is done and conclusion has been made
81	bool mDone;
82
83	// The number of characters whose frequency order is less than 512
84	unsigned int mFreqChars;
85
86	// Total character encountered.
87	unsigned int mTotalChars;
88
89	// Mapping table to get frequency order from char order (get from GetOrder())
90	const short *mCharToFreqOrder;
91
92	// Size of above table
93	unsigned int mTableSize;
94
95	// This is a constant value varies from language to language, it is used in
96	// calculating confidence. See my paper for further detail.
97	float mTypicalDistributionRatio;
98	};
99
100	class KCODECS_NO_EXPORT EUCKRDistributionAnalysis : public CharDistributionAnalysis
101	{
102	public:
103	EUCKRDistributionAnalysis();
104
105	protected:
106	// for euc-KR encoding, we are interested
107	// first byte range: 0xb0 -- 0xfe
108	// second byte range: 0xa1 -- 0xfe
109	// no validation needed here. State machine has done that
110	int GetOrder(const char *str) override
111	{
112	if ((unsigned char)str >= (unsigned* char)`0xb0`) {
113	return `94` * ((unsigned char)str[`0`] - (unsigned char)`0xb0`) + (unsigned char)str[`1`] - (unsigned char)`0xa1`;
114	} else {
115	return -`1`;
116	}
117	}
118	};
119
120	class KCODECS_NO_EXPORT GB2312DistributionAnalysis : public CharDistributionAnalysis
121	{
122	public:
123	GB2312DistributionAnalysis();
124
125	protected:
126	// for GB2312 encoding, we are interested
127	// first byte range: 0xb0 -- 0xfe
128	// second byte range: 0xa1 -- 0xfe
129	// no validation needed here. State machine has done that
130	int GetOrder(const char *str) override
131	{
132	if ((unsigned char)str >= (unsigned* char)`0xb0` && (unsigned char)str[`1`] >= (unsigned char)`0xa1`) {
133	return `94` * ((unsigned char)str[`0`] - (unsigned char)`0xb0`) + (unsigned char)str[`1`] - (unsigned char)`0xa1`;
134	} else {
135	return -`1`;
136	}
137	}
138	};
139
140	class KCODECS_NO_EXPORT Big5DistributionAnalysis : public CharDistributionAnalysis
141	{
142	public:
143	Big5DistributionAnalysis();
144
145	protected:
146	// for big5 encoding, we are interested
147	// first byte range: 0xa4 -- 0xfe
148	// second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
149	// no validation needed here. State machine has done that
150	int GetOrder(const char *str) override
151	{
152	if ((unsigned char)str >= (unsigned* char)`0xa4`)
153	if ((unsigned char)str[`1`] >= (unsigned char)`0xa1`) {
154	return `157` * ((unsigned char)str[`0`] - (unsigned char)`0xa4`) + (unsigned char)str[`1`] - (unsigned char)`0xa1` + `63`;
155	} else {
156	return `157` * ((unsigned char)str[`0`] - (unsigned char)`0xa4`) + (unsigned char)str[`1`] - (unsigned char)`0x40`;
157	}
158	else {
159	return -`1`;
160	}
161	}
162	};
163
164	class KCODECS_NO_EXPORT SJISDistributionAnalysis : public CharDistributionAnalysis
165	{
166	public:
167	SJISDistributionAnalysis();
168
169	protected:
170	// for sjis encoding, we are interested
171	// first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
172	// second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
173	// no validation needed here. State machine has done that
174	int GetOrder(const char *str) override
175	{
176	int order;
177	if ((unsigned char)str >= (unsigned* char)`0x81` && (unsigned char)str <= (unsigned* char)`0x9f`) {
178	order = `188` * ((unsigned char)str[`0`] - (unsigned char)`0x81`);
179	} else if ((unsigned char)str >= (unsigned* char)`0xe0` && (unsigned char)str <= (unsigned* char)`0xef`) {
180	order = `188` * ((unsigned char)str[`0`] - (unsigned char)`0xe0` + `31`);
181	} else {
182	return -`1`;
183	}
184	order += (unsigned char)*(str + `1`) - `0x40`;
185	if ((unsigned char)str[`1`] > (unsigned char)`0x7f`) {
186	order--;
187	}
188	return order;
189	}
190	};
191
192	class KCODECS_NO_EXPORT EUCJPDistributionAnalysis : public CharDistributionAnalysis
193	{
194	public:
195	EUCJPDistributionAnalysis();
196
197	protected:
198	// for euc-JP encoding, we are interested
199	// first byte range: 0xa0 -- 0xfe
200	// second byte range: 0xa1 -- 0xfe
201	// no validation needed here. State machine has done that
202	int GetOrder(const char *str) override
203	{
204	if ((unsigned char)str >= (unsigned* char)`0xa0`) {
205	return `94` * ((unsigned char)str[`0`] - (unsigned char)`0xa1`) + (unsigned char)str[`1`] - (unsigned char)`0xa1`;
206	} else {
207	return -`1`;
208	}
209	}
210	};
211	}
212	#endif // CharDistribution_h__
213

source code of kcodecs/src/probers/CharDistribution.h