1 | /* -*- C++ -*- |
---|---|
2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
3 | SPDX-FileCopyrightText: 2008 Wang Kai <wkai@gmail.com> |
4 | |
5 | SPDX-License-Identifier: MIT |
6 | */ |
7 | |
8 | #include "nsUniversalDetector.h" |
9 | |
10 | #include "nsEscCharsetProber.h" |
11 | #include "nsLatin1Prober.h" |
12 | #include "nsMBCSGroupProber.h" |
13 | #include "nsSBCSGroupProber.h" |
14 | |
15 | namespace kencodingprober |
16 | { |
17 | nsUniversalDetector::nsUniversalDetector() |
18 | { |
19 | mDone = false; |
20 | mBestGuess = -1; // illegal value as signal |
21 | mEscCharSetProber = nullptr; |
22 | |
23 | mDetectedCharset = nullptr; |
24 | mGotData = false; |
25 | mInputState = ePureAscii; |
26 | mLastChar = '\0'; |
27 | |
28 | unsigned int i; |
29 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { |
30 | mCharSetProbers[i] = nullptr; |
31 | } |
32 | } |
33 | |
34 | nsUniversalDetector::~nsUniversalDetector() |
35 | { |
36 | for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { |
37 | delete mCharSetProbers[i]; |
38 | } |
39 | delete mEscCharSetProber; |
40 | } |
41 | |
42 | void nsUniversalDetector::Reset() |
43 | { |
44 | mDone = false; |
45 | mBestGuess = -1; // illegal value as signal |
46 | |
47 | mDetectedCharset = nullptr; |
48 | mGotData = false; |
49 | mInputState = ePureAscii; |
50 | mLastChar = '\0'; |
51 | |
52 | if (mEscCharSetProber) { |
53 | mEscCharSetProber->Reset(); |
54 | } |
55 | |
56 | unsigned int i; |
57 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { |
58 | if (mCharSetProbers[i]) { |
59 | mCharSetProbers[i]->Reset(); |
60 | } |
61 | } |
62 | } |
63 | |
64 | //--------------------------------------------------------------------- |
// Confidence thresholds, both of type float.
// SHORTCUT_THRESHOLD is not referenced by the code visible in this file.
#define SHORTCUT_THRESHOLD static_cast<float>(0.95)
// A prober's answer is only reported when its confidence exceeds this value;
// it is also the confidence returned when nothing better is known.
#define MINIMUM_THRESHOLD static_cast<float>(0.20)
67 | |
68 | nsProbingState nsUniversalDetector::HandleData(const char *aBuf, unsigned int aLen) |
69 | { |
70 | if (mDone) { |
71 | return eFoundIt; |
72 | } |
73 | |
74 | if (aLen > 0) { |
75 | mGotData = true; |
76 | } |
77 | |
78 | unsigned int i; |
79 | for (i = 0; i < aLen; i++) { |
80 | // other than 0xa0, if every other character is ascii, the page is ascii |
81 | if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') { // Since many Ascii only page contains NBSP |
82 | // we got a non-ascii byte (high-byte) |
83 | if (mInputState != eHighbyte) { |
84 | // adjust state |
85 | mInputState = eHighbyte; |
86 | |
87 | // kill mEscCharSetProber if it is active |
88 | delete mEscCharSetProber; |
89 | mEscCharSetProber = nullptr; |
90 | |
91 | // start multibyte and singlebyte charset prober |
92 | if (nullptr == mCharSetProbers[0]) { |
93 | mCharSetProbers[0] = new nsMBCSGroupProber; |
94 | } |
95 | if (nullptr == mCharSetProbers[1]) { |
96 | mCharSetProbers[1] = new nsSBCSGroupProber; |
97 | } |
98 | if (nullptr == mCharSetProbers[2]) { |
99 | mCharSetProbers[2] = new nsLatin1Prober; |
100 | } |
101 | } |
102 | } else { |
103 | // ok, just pure ascii so far |
104 | if (ePureAscii == mInputState && (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~'))) { |
105 | // found escape character or HZ "~{" |
106 | mInputState = eEscAscii; |
107 | } |
108 | |
109 | mLastChar = aBuf[i]; |
110 | } |
111 | } |
112 | |
113 | nsProbingState st = eDetecting; |
114 | switch (mInputState) { |
115 | case eEscAscii: |
116 | if (nullptr == mEscCharSetProber) { |
117 | mEscCharSetProber = new nsEscCharSetProber; |
118 | } |
119 | st = mEscCharSetProber->HandleData(aBuf, aLen); |
120 | if (st == eFoundIt) { |
121 | mDone = true; |
122 | mDetectedCharset = mEscCharSetProber->GetCharSetName(); |
123 | } |
124 | break; |
125 | case eHighbyte: |
126 | for (i = 0; i < NUM_OF_CHARSET_PROBERS; ++i) { |
127 | st = mCharSetProbers[i]->HandleData(aBuf, aLen); |
128 | if (st == eFoundIt) { |
129 | mDone = true; |
130 | mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); |
131 | } |
132 | } |
133 | break; |
134 | |
135 | default: // pure ascii |
136 | mDetectedCharset = "UTF-8"; |
137 | } |
138 | return st; |
139 | } |
140 | |
141 | //--------------------------------------------------------------------- |
142 | const char *nsUniversalDetector::GetCharSetName() |
143 | { |
144 | if (mDetectedCharset) { |
145 | return mDetectedCharset; |
146 | } |
147 | switch (mInputState) { |
148 | case eHighbyte: { |
149 | float proberConfidence; |
150 | float maxProberConfidence = (float)0.0; |
151 | int maxProber = 0; |
152 | |
153 | for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { |
154 | proberConfidence = mCharSetProbers[i]->GetConfidence(); |
155 | if (proberConfidence > maxProberConfidence) { |
156 | maxProberConfidence = proberConfidence; |
157 | maxProber = i; |
158 | } |
159 | } |
160 | // do not report anything because we are not confident of it, that's in fact a negative answer |
161 | if (maxProberConfidence > MINIMUM_THRESHOLD) { |
162 | return mCharSetProbers[maxProber]->GetCharSetName(); |
163 | } |
164 | } |
165 | case eEscAscii: |
166 | break; |
167 | default: // pure ascii |
168 | ; |
169 | } |
170 | return "UTF-8"; |
171 | } |
172 | |
173 | //--------------------------------------------------------------------- |
174 | float nsUniversalDetector::GetConfidence() |
175 | { |
176 | if (!mGotData) { |
177 | // we haven't got any data yet, return immediately |
178 | // caller program sometimes call DataEnd before anything has been sent to detector |
179 | return MINIMUM_THRESHOLD; |
180 | } |
181 | if (mDetectedCharset) { |
182 | return 0.99f; |
183 | } |
184 | switch (mInputState) { |
185 | case eHighbyte: { |
186 | float proberConfidence; |
187 | float maxProberConfidence = (float)0.0; |
188 | int maxProber = 0; |
189 | |
190 | for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { |
191 | proberConfidence = mCharSetProbers[i]->GetConfidence(); |
192 | if (proberConfidence > maxProberConfidence) { |
193 | maxProberConfidence = proberConfidence; |
194 | maxProber = i; |
195 | } |
196 | } |
197 | // do not report anything because we are not confident of it, that's in fact a negative answer |
198 | if (maxProberConfidence > MINIMUM_THRESHOLD) { |
199 | return mCharSetProbers[maxProber]->GetConfidence(); |
200 | } |
201 | } |
202 | case eEscAscii: |
203 | break; |
204 | default: // pure ascii |
205 | ; |
206 | } |
207 | return MINIMUM_THRESHOLD; |
208 | } |
209 | |
210 | nsProbingState nsUniversalDetector::GetState() |
211 | { |
212 | if (mDone) { |
213 | return eFoundIt; |
214 | } else { |
215 | return eDetecting; |
216 | } |
217 | } |
218 | } |
219 |