1 | /* -*- C++ -*- |
---|---|
2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
3 | SPDX-FileCopyrightText: 2008 Wang Kai <wkai@gmail.com> |
4 | |
5 | SPDX-License-Identifier: MIT |
6 | */ |
7 | |
8 | #include "nsUniversalDetector.h" |
9 | |
10 | #include "nsEscCharsetProber.h" |
11 | #include "nsLatin1Prober.h" |
12 | #include "nsMBCSGroupProber.h" |
13 | #include "nsSBCSGroupProber.h" |
14 | |
15 | namespace kencodingprober |
16 | { |
17 | nsUniversalDetector::nsUniversalDetector() |
18 | { |
19 | mDone = false; |
20 | mBestGuess = -1; // illegal value as signal |
21 | mInTag = false; |
22 | mEscCharSetProber = nullptr; |
23 | |
24 | mStart = true; |
25 | mDetectedCharset = nullptr; |
26 | mGotData = false; |
27 | mInputState = ePureAscii; |
28 | mLastChar = '\0'; |
29 | |
30 | unsigned int i; |
31 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { |
32 | mCharSetProbers[i] = nullptr; |
33 | } |
34 | } |
35 | |
36 | nsUniversalDetector::~nsUniversalDetector() |
37 | { |
38 | for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { |
39 | delete mCharSetProbers[i]; |
40 | } |
41 | delete mEscCharSetProber; |
42 | } |
43 | |
44 | void nsUniversalDetector::Reset() |
45 | { |
46 | mDone = false; |
47 | mBestGuess = -1; // illegal value as signal |
48 | mInTag = false; |
49 | |
50 | mStart = true; |
51 | mDetectedCharset = nullptr; |
52 | mGotData = false; |
53 | mInputState = ePureAscii; |
54 | mLastChar = '\0'; |
55 | |
56 | if (mEscCharSetProber) { |
57 | mEscCharSetProber->Reset(); |
58 | } |
59 | |
60 | unsigned int i; |
61 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { |
62 | if (mCharSetProbers[i]) { |
63 | mCharSetProbers[i]->Reset(); |
64 | } |
65 | } |
66 | } |
67 | |
68 | //--------------------------------------------------------------------- |
// Confidence cut-offs, expressed as floats.
// SHORTCUT_THRESHOLD is not referenced by the code in this file.
// MINIMUM_THRESHOLD is the confidence a prober has to exceed before
// GetCharSetName()/GetConfidence() report its guess; GetConfidence() also
// returns it as the fallback value.
#define SHORTCUT_THRESHOLD (float)0.95
#define MINIMUM_THRESHOLD (float)0.20
71 | |
72 | nsProbingState nsUniversalDetector::HandleData(const char *aBuf, unsigned int aLen) |
73 | { |
74 | if (mDone) { |
75 | return eFoundIt; |
76 | } |
77 | |
78 | if (aLen > 0) { |
79 | mGotData = true; |
80 | } |
81 | |
82 | unsigned int i; |
83 | for (i = 0; i < aLen; i++) { |
84 | // other than 0xa0, if every other character is ascii, the page is ascii |
85 | if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') { // Since many Ascii only page contains NBSP |
86 | // we got a non-ascii byte (high-byte) |
87 | if (mInputState != eHighbyte) { |
88 | // adjust state |
89 | mInputState = eHighbyte; |
90 | |
91 | // kill mEscCharSetProber if it is active |
92 | delete mEscCharSetProber; |
93 | mEscCharSetProber = nullptr; |
94 | |
95 | // start multibyte and singlebyte charset prober |
96 | if (nullptr == mCharSetProbers[0]) { |
97 | mCharSetProbers[0] = new nsMBCSGroupProber; |
98 | } |
99 | if (nullptr == mCharSetProbers[1]) { |
100 | mCharSetProbers[1] = new nsSBCSGroupProber; |
101 | } |
102 | if (nullptr == mCharSetProbers[2]) { |
103 | mCharSetProbers[2] = new nsLatin1Prober; |
104 | } |
105 | } |
106 | } else { |
107 | // ok, just pure ascii so far |
108 | if (ePureAscii == mInputState && (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~'))) { |
109 | // found escape character or HZ "~{" |
110 | mInputState = eEscAscii; |
111 | } |
112 | |
113 | mLastChar = aBuf[i]; |
114 | } |
115 | } |
116 | |
117 | nsProbingState st = eDetecting; |
118 | switch (mInputState) { |
119 | case eEscAscii: |
120 | if (nullptr == mEscCharSetProber) { |
121 | mEscCharSetProber = new nsEscCharSetProber; |
122 | } |
123 | st = mEscCharSetProber->HandleData(aBuf, aLen); |
124 | if (st == eFoundIt) { |
125 | mDone = true; |
126 | mDetectedCharset = mEscCharSetProber->GetCharSetName(); |
127 | } |
128 | break; |
129 | case eHighbyte: |
130 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; ++i) { |
131 | st = mCharSetProbers[i]->HandleData(aBuf, aLen); |
132 | if (st == eFoundIt) { |
133 | mDone = true; |
134 | mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); |
135 | } |
136 | } |
137 | break; |
138 | |
139 | default: // pure ascii |
140 | mDetectedCharset = "UTF-8"; |
141 | } |
142 | return st; |
143 | } |
144 | |
145 | //--------------------------------------------------------------------- |
146 | const char *nsUniversalDetector::GetCharSetName() |
147 | { |
148 | if (mDetectedCharset) { |
149 | return mDetectedCharset; |
150 | } |
151 | switch (mInputState) { |
152 | case eHighbyte: { |
153 | float proberConfidence; |
154 | float maxProberConfidence = (float)0.0; |
155 | int maxProber = 0; |
156 | |
157 | for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { |
158 | proberConfidence = mCharSetProbers[i]->GetConfidence(); |
159 | if (proberConfidence > maxProberConfidence) { |
160 | maxProberConfidence = proberConfidence; |
161 | maxProber = i; |
162 | } |
163 | } |
164 | // do not report anything because we are not confident of it, that's in fact a negative answer |
165 | if (maxProberConfidence > MINIMUM_THRESHOLD) { |
166 | return mCharSetProbers[maxProber]->GetCharSetName(); |
167 | } |
168 | } |
169 | case eEscAscii: |
170 | break; |
171 | default: // pure ascii |
172 | ; |
173 | } |
174 | return "UTF-8"; |
175 | } |
176 | |
177 | //--------------------------------------------------------------------- |
178 | float nsUniversalDetector::GetConfidence() |
179 | { |
180 | if (!mGotData) { |
181 | // we haven't got any data yet, return immediately |
182 | // caller program sometimes call DataEnd before anything has been sent to detector |
183 | return MINIMUM_THRESHOLD; |
184 | } |
185 | if (mDetectedCharset) { |
186 | return 0.99f; |
187 | } |
188 | switch (mInputState) { |
189 | case eHighbyte: { |
190 | float proberConfidence; |
191 | float maxProberConfidence = (float)0.0; |
192 | int maxProber = 0; |
193 | |
194 | for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { |
195 | proberConfidence = mCharSetProbers[i]->GetConfidence(); |
196 | if (proberConfidence > maxProberConfidence) { |
197 | maxProberConfidence = proberConfidence; |
198 | maxProber = i; |
199 | } |
200 | } |
201 | // do not report anything because we are not confident of it, that's in fact a negative answer |
202 | if (maxProberConfidence > MINIMUM_THRESHOLD) { |
203 | return mCharSetProbers[maxProber]->GetConfidence(); |
204 | } |
205 | } |
206 | case eEscAscii: |
207 | break; |
208 | default: // pure ascii |
209 | ; |
210 | } |
211 | return MINIMUM_THRESHOLD; |
212 | } |
213 | |
214 | nsProbingState nsUniversalDetector::GetState() |
215 | { |
216 | if (mDone) { |
217 | return eFoundIt; |
218 | } else { |
219 | return eDetecting; |
220 | } |
221 | } |
222 | } |
223 |