1 | /* -*- C++ -*- |
2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
3 | |
4 | SPDX-License-Identifier: MIT |
5 | */ |
6 | |
7 | #include "nsCodingStateMachine.h" |
8 | |
/*
Modification from Frank Tang's original work:
. 0x00 is allowed as a legal character, since some web pages contain this
  char in the text stream.
*/
14 | |
15 | // BIG5 |
16 | |
17 | namespace kencodingprober |
18 | { |
19 | static const unsigned int BIG5_cls[256 / 8] = { |
20 | // PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 |
21 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07 //allow 0x00 as legal value |
22 | PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f |
23 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17 |
24 | PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f |
25 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27 |
26 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f |
27 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37 |
28 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f |
29 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 40 - 47 |
30 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 48 - 4f |
31 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 50 - 57 |
32 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 58 - 5f |
33 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 60 - 67 |
34 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 68 - 6f |
35 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 70 - 77 |
36 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 1), // 78 - 7f |
37 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 80 - 87 |
38 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 88 - 8f |
39 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 90 - 97 |
40 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 98 - 9f |
41 | PCK4BITS(4, 3, 3, 3, 3, 3, 3, 3), // a0 - a7 |
42 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // a8 - af |
43 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // b0 - b7 |
44 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // b8 - bf |
45 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // c0 - c7 |
46 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // c8 - cf |
47 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // d0 - d7 |
48 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // d8 - df |
49 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // e0 - e7 |
50 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // e8 - ef |
51 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // f0 - f7 |
52 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 0) // f8 - ff |
53 | }; |
54 | |
55 | static const unsigned int BIG5_st[3] = { |
56 | PCK4BITS(eError, eStart, eStart, 3, eError, eError, eError, eError), // 00-07 |
57 | PCK4BITS(eError, eError, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eError), // 08-0f |
58 | PCK4BITS(eError, eStart, eStart, eStart, eStart, eStart, eStart, eStart) // 10-17 |
59 | }; |
60 | |
// Per-class character length table for Big5: one entry for each of the five
// character classes (0-4) used by BIG5_cls.
static const unsigned int Big5CharLenTable[] = {0, 1, 1, 2, 0};
62 | |
63 | const SMModel Big5SMModel = { |
64 | .classTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: BIG5_cls}, |
65 | .classFactor: 5, |
66 | .stateTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: BIG5_st}, |
67 | .charLenTable: Big5CharLenTable, |
68 | .name: "Big5" , |
69 | }; |
70 | |
71 | static const unsigned int EUCJP_cls[256 / 8] = { |
72 | // PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07 |
73 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 00 - 07 |
74 | PCK4BITS(4, 4, 4, 4, 4, 4, 5, 5), // 08 - 0f |
75 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 10 - 17 |
76 | PCK4BITS(4, 4, 4, 5, 4, 4, 4, 4), // 18 - 1f |
77 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 20 - 27 |
78 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 28 - 2f |
79 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 30 - 37 |
80 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 38 - 3f |
81 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 40 - 47 |
82 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 48 - 4f |
83 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 50 - 57 |
84 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 58 - 5f |
85 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 60 - 67 |
86 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 68 - 6f |
87 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 70 - 77 |
88 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 78 - 7f |
89 | PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // 80 - 87 |
90 | PCK4BITS(5, 5, 5, 5, 5, 5, 1, 3), // 88 - 8f |
91 | PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // 90 - 97 |
92 | PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // 98 - 9f |
93 | PCK4BITS(5, 2, 2, 2, 2, 2, 2, 2), // a0 - a7 |
94 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af |
95 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7 |
96 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf |
97 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7 |
98 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf |
99 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7 |
100 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df |
101 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e0 - e7 |
102 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e8 - ef |
103 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // f0 - f7 |
104 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 5) // f8 - ff |
105 | }; |
106 | |
107 | static const unsigned int EUCJP_st[5] = { |
108 | PCK4BITS(3, 4, 3, 5, eStart, eError, eError, eError), // 00-07 |
109 | PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eItsMe, eItsMe), // 08-0f |
110 | PCK4BITS(eItsMe, eItsMe, eStart, eError, eStart, eError, eError, eError), // 10-17 |
111 | PCK4BITS(eError, eError, eStart, eError, eError, eError, 3, eError), // 18-1f |
112 | PCK4BITS(3, eError, eError, eError, eStart, eStart, eStart, eStart) // 20-27 |
113 | }; |
114 | |
// Per-class character length table for EUC-JP: one entry for each of the six
// character classes (0-5) used by EUCJP_cls.
static const unsigned int EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0};
116 | |
117 | const SMModel EUCJPSMModel = { |
118 | .classTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: EUCJP_cls}, |
119 | .classFactor: 6, |
120 | .stateTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: EUCJP_st}, |
121 | .charLenTable: EUCJPCharLenTable, |
122 | .name: "EUC-JP" , |
123 | }; |
124 | |
125 | static const unsigned int EUCKR_cls[256 / 8] = { |
126 | // PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 |
127 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07 |
128 | PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f |
129 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17 |
130 | PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f |
131 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27 |
132 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f |
133 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37 |
134 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f |
135 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 40 - 47 |
136 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 48 - 4f |
137 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 50 - 57 |
138 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 58 - 5f |
139 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 60 - 67 |
140 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 68 - 6f |
141 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 70 - 77 |
142 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 78 - 7f |
143 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 80 - 87 |
144 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 88 - 8f |
145 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 90 - 97 |
146 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 98 - 9f |
147 | PCK4BITS(0, 2, 2, 2, 2, 2, 2, 2), // a0 - a7 |
148 | PCK4BITS(2, 2, 2, 2, 2, 3, 3, 3), // a8 - af |
149 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7 |
150 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf |
151 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7 |
152 | PCK4BITS(2, 3, 2, 2, 2, 2, 2, 2), // c8 - cf |
153 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7 |
154 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df |
155 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // e0 - e7 |
156 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // e8 - ef |
157 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // f0 - f7 |
158 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 0) // f8 - ff |
159 | }; |
160 | |
161 | static const unsigned int EUCKR_st[2] = { |
162 | PCK4BITS(eError, eStart, 3, eError, eError, eError, eError, eError), // 00-07 |
163 | PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eError, eError, eStart, eStart) // 08-0f |
164 | }; |
165 | |
// Per-class character length table for EUC-KR: one entry for each of the four
// character classes (0-3) used by EUCKR_cls.
static const unsigned int EUCKRCharLenTable[] = {0, 1, 2, 0};
167 | |
168 | const SMModel EUCKRSMModel = { |
169 | .classTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: EUCKR_cls}, |
170 | .classFactor: 4, |
171 | .stateTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: EUCKR_st}, |
172 | .charLenTable: EUCKRCharLenTable, |
173 | .name: "EUC-KR" , |
174 | }; |
175 | |
176 | /* obsolete GB2312 by gb18030 |
177 | static unsigned int GB2312_cls [ 256 / 8 ] = { |
178 | //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 |
179 | PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 |
180 | PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f |
181 | PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 |
182 | PCK4BITS(1,1,1,0,1,1,1,1), // 18 - 1f |
183 | PCK4BITS(1,1,1,1,1,1,1,1), // 20 - 27 |
184 | PCK4BITS(1,1,1,1,1,1,1,1), // 28 - 2f |
185 | PCK4BITS(1,1,1,1,1,1,1,1), // 30 - 37 |
186 | PCK4BITS(1,1,1,1,1,1,1,1), // 38 - 3f |
187 | PCK4BITS(1,1,1,1,1,1,1,1), // 40 - 47 |
188 | PCK4BITS(1,1,1,1,1,1,1,1), // 48 - 4f |
189 | PCK4BITS(1,1,1,1,1,1,1,1), // 50 - 57 |
190 | PCK4BITS(1,1,1,1,1,1,1,1), // 58 - 5f |
191 | PCK4BITS(1,1,1,1,1,1,1,1), // 60 - 67 |
192 | PCK4BITS(1,1,1,1,1,1,1,1), // 68 - 6f |
193 | PCK4BITS(1,1,1,1,1,1,1,1), // 70 - 77 |
194 | PCK4BITS(1,1,1,1,1,1,1,1), // 78 - 7f |
195 | PCK4BITS(1,0,0,0,0,0,0,0), // 80 - 87 |
196 | PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f |
197 | PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 |
198 | PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f |
199 | PCK4BITS(0,2,2,2,2,2,2,2), // a0 - a7 |
200 | PCK4BITS(2,2,3,3,3,3,3,3), // a8 - af |
201 | PCK4BITS(2,2,2,2,2,2,2,2), // b0 - b7 |
202 | PCK4BITS(2,2,2,2,2,2,2,2), // b8 - bf |
203 | PCK4BITS(2,2,2,2,2,2,2,2), // c0 - c7 |
204 | PCK4BITS(2,2,2,2,2,2,2,2), // c8 - cf |
205 | PCK4BITS(2,2,2,2,2,2,2,2), // d0 - d7 |
206 | PCK4BITS(2,2,2,2,2,2,2,2), // d8 - df |
207 | PCK4BITS(2,2,2,2,2,2,2,2), // e0 - e7 |
208 | PCK4BITS(2,2,2,2,2,2,2,2), // e8 - ef |
209 | PCK4BITS(2,2,2,2,2,2,2,2), // f0 - f7 |
210 | PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff |
211 | }; |
212 | |
213 | static unsigned int GB2312_st [ 2] = { |
214 | PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07 |
215 | PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f |
216 | }; |
217 | |
218 | static const unsigned int GB2312CharLenTable[] = {0, 1, 2, 0}; |
219 | |
220 | SMModel GB2312SMModel = { |
221 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB2312_cls }, |
222 | 4, |
223 | {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB2312_st }, |
224 | GB2312CharLenTable, |
225 | "GB2312", |
226 | }; |
227 | */ |
228 | |
229 | // the following state machine data was created by perl script in |
230 | // intl/chardet/tools. It should be the same as in PSM detector. |
231 | static const unsigned int GB18030_cls[256 / 8] = { |
232 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07 |
233 | PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f |
234 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17 |
235 | PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f |
236 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27 |
237 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f |
238 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 30 - 37 |
239 | PCK4BITS(3, 3, 1, 1, 1, 1, 1, 1), // 38 - 3f |
240 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 40 - 47 |
241 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 48 - 4f |
242 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 50 - 57 |
243 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 58 - 5f |
244 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 60 - 67 |
245 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 68 - 6f |
246 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 70 - 77 |
247 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 4), // 78 - 7f |
248 | PCK4BITS(5, 6, 6, 6, 6, 6, 6, 6), // 80 - 87 |
249 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // 88 - 8f |
250 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // 90 - 97 |
251 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // 98 - 9f |
252 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // a0 - a7 |
253 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // a8 - af |
254 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // b0 - b7 |
255 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // b8 - bf |
256 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // c0 - c7 |
257 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // c8 - cf |
258 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // d0 - d7 |
259 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // d8 - df |
260 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // e0 - e7 |
261 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // e8 - ef |
262 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // f0 - f7 |
263 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 0) // f8 - ff |
264 | }; |
265 | |
266 | static const unsigned int GB18030_st[6] = { |
267 | PCK4BITS(eError, eStart, eStart, eStart, eStart, eStart, 3, eError), // 00-07 |
268 | PCK4BITS(eError, eError, eError, eError, eError, eError, eItsMe, eItsMe), // 08-0f |
269 | PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eError, eError, eStart), // 10-17 |
270 | PCK4BITS(4, eError, eStart, eStart, eError, eError, eError, eError), // 18-1f |
271 | PCK4BITS(eError, eError, 5, eError, eError, eError, eItsMe, eError), // 20-27 |
272 | PCK4BITS(eError, eError, eStart, eStart, eStart, eStart, eStart, eStart) // 28-2f |
273 | }; |
274 | |
// Per-class character length table for GB18030 (seven classes, 0-6).
// To be accurate, the length of class 6 can be either 2 or 4.
// But it is not necessary to discriminate between the two since
// it is used for frequency analysis only, and we are validating
// each code range there as well. So it is safe to set it to be
// 2 here.
static const unsigned int GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2};
281 | |
282 | const SMModel GB18030SMModel = { |
283 | .classTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: GB18030_cls}, |
284 | .classFactor: 7, |
285 | .stateTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: GB18030_st}, |
286 | .charLenTable: GB18030CharLenTable, |
287 | .name: "GB18030" , |
288 | }; |
289 | |
290 | // sjis |
291 | |
292 | static const unsigned int SJIS_cls[256 / 8] = { |
293 | // PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 |
294 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07 |
295 | PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f |
296 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17 |
297 | PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f |
298 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27 |
299 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f |
300 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37 |
301 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f |
302 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 40 - 47 |
303 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 48 - 4f |
304 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 50 - 57 |
305 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 58 - 5f |
306 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 60 - 67 |
307 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 68 - 6f |
308 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 70 - 77 |
309 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 1), // 78 - 7f |
310 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 80 - 87 |
311 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 88 - 8f |
312 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 90 - 97 |
313 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 98 - 9f |
314 | // 0xa0 is illegal in sjis encoding, but some pages does |
315 | // contain such byte. We need to be more error forgiven. |
316 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // a0 - a7 |
317 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af |
318 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7 |
319 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf |
320 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7 |
321 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf |
322 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7 |
323 | PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df |
324 | PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // e0 - e7 |
325 | PCK4BITS(3, 3, 3, 3, 3, 4, 4, 4), // e8 - ef |
326 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // f0 - f7 |
327 | PCK4BITS(4, 4, 4, 4, 4, 0, 0, 0) // f8 - ff |
328 | }; |
329 | |
330 | static const unsigned int SJIS_st[3] = { |
331 | PCK4BITS(eError, eStart, eStart, 3, eError, eError, eError, eError), // 00-07 |
332 | PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eItsMe, eItsMe), // 08-0f |
333 | PCK4BITS(eItsMe, eItsMe, eError, eError, eStart, eStart, eStart, eStart) // 10-17 |
334 | }; |
335 | |
// Per-class character length table for Shift_JIS: six entries, matching the
// classFactor of the Shift_JIS model.
static const unsigned int SJISCharLenTable[] = {0, 1, 1, 2, 0, 0};
337 | |
338 | const SMModel SJISSMModel = { |
339 | .classTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: SJIS_cls}, |
340 | .classFactor: 6, |
341 | .stateTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: SJIS_st}, |
342 | .charLenTable: SJISCharLenTable, |
343 | .name: "Shift_JIS" , |
344 | }; |
345 | |
346 | static const unsigned int UCS2BE_cls[256 / 8] = { |
347 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 00 - 07 |
348 | PCK4BITS(0, 0, 1, 0, 0, 2, 0, 0), // 08 - 0f |
349 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17 |
350 | PCK4BITS(0, 0, 0, 3, 0, 0, 0, 0), // 18 - 1f |
351 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 20 - 27 |
352 | PCK4BITS(0, 3, 3, 3, 3, 3, 0, 0), // 28 - 2f |
353 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37 |
354 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f |
355 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 40 - 47 |
356 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 48 - 4f |
357 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57 |
358 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f |
359 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67 |
360 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f |
361 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77 |
362 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f |
363 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 80 - 87 |
364 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 88 - 8f |
365 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 90 - 97 |
366 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 98 - 9f |
367 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // a0 - a7 |
368 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // a8 - af |
369 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // b0 - b7 |
370 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // b8 - bf |
371 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // c0 - c7 |
372 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // c8 - cf |
373 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // d0 - d7 |
374 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // d8 - df |
375 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e0 - e7 |
376 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e8 - ef |
377 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // f0 - f7 |
378 | PCK4BITS(0, 0, 0, 0, 0, 0, 4, 5) // f8 - ff |
379 | }; |
380 | |
381 | static const unsigned int UCS2BE_st[7] = { |
382 | PCK4BITS(5, 7, 7, eError, 4, 3, eError, eError), // 00-07 |
383 | PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eItsMe, eItsMe), // 08-0f |
384 | PCK4BITS(eItsMe, eItsMe, 6, 6, 6, 6, eError, eError), // 10-17 |
385 | PCK4BITS(6, 6, 6, 6, 6, eItsMe, 6, 6), // 18-1f |
386 | PCK4BITS(6, 6, 6, 6, 5, 7, 7, eError), // 20-27 |
387 | PCK4BITS(5, 8, 6, 6, eError, 6, 6, 6), // 28-2f |
388 | PCK4BITS(6, 6, 6, 6, eError, eError, eStart, eStart) // 30-37 |
389 | }; |
390 | |
// Per-class character length table for UTF-16BE: one entry for each of the six
// character classes (0-5) used by UCS2BE_cls.
static const unsigned int UCS2BECharLenTable[] = {2, 2, 2, 0, 2, 2};
392 | |
393 | const SMModel UCS2BESMModel = { |
394 | .classTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: UCS2BE_cls}, |
395 | .classFactor: 6, |
396 | .stateTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: UCS2BE_st}, |
397 | .charLenTable: UCS2BECharLenTable, |
398 | .name: "UTF-16BE" , |
399 | }; |
400 | |
401 | static const unsigned int UCS2LE_cls[256 / 8] = { |
402 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 00 - 07 |
403 | PCK4BITS(0, 0, 1, 0, 0, 2, 0, 0), // 08 - 0f |
404 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17 |
405 | PCK4BITS(0, 0, 0, 3, 0, 0, 0, 0), // 18 - 1f |
406 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 20 - 27 |
407 | PCK4BITS(0, 3, 3, 3, 3, 3, 0, 0), // 28 - 2f |
408 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37 |
409 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f |
410 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 40 - 47 |
411 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 48 - 4f |
412 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57 |
413 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f |
414 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67 |
415 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f |
416 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77 |
417 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f |
418 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 80 - 87 |
419 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 88 - 8f |
420 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 90 - 97 |
421 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // 98 - 9f |
422 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // a0 - a7 |
423 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // a8 - af |
424 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // b0 - b7 |
425 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // b8 - bf |
426 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // c0 - c7 |
427 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // c8 - cf |
428 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // d0 - d7 |
429 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // d8 - df |
430 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e0 - e7 |
431 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // e8 - ef |
432 | PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0), // f0 - f7 |
433 | PCK4BITS(0, 0, 0, 0, 0, 0, 4, 5) // f8 - ff |
434 | }; |
435 | |
436 | static const unsigned int UCS2LE_st[7] = { |
437 | PCK4BITS(6, 6, 7, 6, 4, 3, eError, eError), // 00-07 |
438 | PCK4BITS(eError, eError, eError, eError, eItsMe, eItsMe, eItsMe, eItsMe), // 08-0f |
439 | PCK4BITS(eItsMe, eItsMe, 5, 5, 5, eError, eItsMe, eError), // 10-17 |
440 | PCK4BITS(5, 5, 5, eError, 5, eError, 6, 6), // 18-1f |
441 | PCK4BITS(7, 6, 8, 8, 5, 5, 5, eError), // 20-27 |
442 | PCK4BITS(5, 5, 5, eError, eError, eError, 5, 5), // 28-2f |
443 | PCK4BITS(5, 5, 5, eError, 5, eError, eStart, eStart) // 30-37 |
444 | }; |
445 | |
// Per-class character length table for UTF-16LE: one entry for each of the six
// character classes (0-5) used by UCS2LE_cls; every class has length 2.
static const unsigned int UCS2LECharLenTable[] = {2, 2, 2, 2, 2, 2};
447 | |
448 | const SMModel UCS2LESMModel = { |
449 | .classTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: UCS2LE_cls}, |
450 | .classFactor: 6, |
451 | .stateTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: UCS2LE_st}, |
452 | .charLenTable: UCS2LECharLenTable, |
453 | .name: "UTF-16LE" , |
454 | }; |
455 | |
456 | static const unsigned int UTF8_cls[256 / 8] = { |
457 | // PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 |
458 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 00 - 07 //allow 0x00 as a legal value |
459 | PCK4BITS(1, 1, 1, 1, 1, 1, 0, 0), // 08 - 0f |
460 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 10 - 17 |
461 | PCK4BITS(1, 1, 1, 0, 1, 1, 1, 1), // 18 - 1f |
462 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 20 - 27 |
463 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 28 - 2f |
464 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 30 - 37 |
465 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 38 - 3f |
466 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 40 - 47 |
467 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 48 - 4f |
468 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 50 - 57 |
469 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 58 - 5f |
470 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 60 - 67 |
471 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 68 - 6f |
472 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 70 - 77 |
473 | PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 78 - 7f |
474 | PCK4BITS(2, 2, 2, 2, 3, 3, 3, 3), // 80 - 87 |
475 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 88 - 8f |
476 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 90 - 97 |
477 | PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 98 - 9f |
478 | PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // a0 - a7 |
479 | PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // a8 - af |
480 | PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // b0 - b7 |
481 | PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // b8 - bf |
482 | PCK4BITS(0, 0, 6, 6, 6, 6, 6, 6), // c0 - c7 |
483 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // c8 - cf |
484 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // d0 - d7 |
485 | PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // d8 - df |
486 | PCK4BITS(7, 8, 8, 8, 8, 8, 8, 8), // e0 - e7 |
487 | PCK4BITS(8, 8, 8, 8, 8, 9, 8, 8), // e8 - ef |
488 | PCK4BITS(10, 11, 11, 11, 11, 11, 11, 11), // f0 - f7 |
489 | PCK4BITS(12, 13, 13, 13, 14, 15, 0, 0) // f8 - ff |
490 | }; |
491 | |
492 | static const unsigned int UTF8_st[26] = { |
493 | PCK4BITS(eError, eStart, eError, eError, eError, eError, 12, 10), // 00-07 |
494 | PCK4BITS(9, 11, 8, 7, 6, 5, 4, 3), // 08-0f |
495 | PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 10-17 |
496 | PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 18-1f |
497 | PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe), // 20-27 |
498 | PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe), // 28-2f |
499 | PCK4BITS(eError, eError, 5, 5, 5, 5, eError, eError), // 30-37 |
500 | PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 38-3f |
501 | PCK4BITS(eError, eError, eError, 5, 5, 5, eError, eError), // 40-47 |
502 | PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 48-4f |
503 | PCK4BITS(eError, eError, 7, 7, 7, 7, eError, eError), // 50-57 |
504 | PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 58-5f |
505 | PCK4BITS(eError, eError, eError, eError, 7, 7, eError, eError), // 60-67 |
506 | PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 68-6f |
507 | PCK4BITS(eError, eError, 9, 9, 9, 9, eError, eError), // 70-77 |
508 | PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 78-7f |
509 | PCK4BITS(eError, eError, eError, eError, eError, 9, eError, eError), // 80-87 |
510 | PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 88-8f |
511 | PCK4BITS(eError, eError, 12, 12, 12, 12, eError, eError), // 90-97 |
512 | PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 98-9f |
513 | PCK4BITS(eError, eError, eError, eError, eError, 12, eError, eError), // a0-a7 |
514 | PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // a8-af |
515 | PCK4BITS(eError, eError, 12, 12, 12, eError, eError, eError), // b0-b7 |
516 | PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // b8-bf |
517 | PCK4BITS(eError, eError, eStart, eStart, eStart, eStart, eError, eError), // c0-c7 |
518 | PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError) // c8-cf |
519 | }; |
520 | |
// Per-class character length table for UTF-8: one entry for each of the
// sixteen character classes (0-15) used by UTF8_cls.
static const unsigned int UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6};
522 | |
523 | const SMModel UTF8SMModel = { |
524 | .classTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: UTF8_cls}, |
525 | .classFactor: 16, |
526 | .stateTable: {.idxsft: eIdxSft4bits, .sftmsk: eSftMsk4bits, .bitsft: eBitSft4bits, .unitmsk: eUnitMsk4bits, .data: UTF8_st}, |
527 | .charLenTable: UTF8CharLenTable, |
528 | .name: "UTF-8" , |
529 | }; |
530 | } |
531 | |