1/* -*- C++ -*-
2 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
3 SPDX-FileCopyrightText: 2008 Wang Kai <wkai@gmail.com>
4
5 SPDX-License-Identifier: MIT
6*/
7
8#include "nsUniversalDetector.h"
9
10#include "nsEscCharsetProber.h"
11#include "nsLatin1Prober.h"
12#include "nsMBCSGroupProber.h"
13#include "nsSBCSGroupProber.h"
14
15namespace kencodingprober
16{
17nsUniversalDetector::nsUniversalDetector()
18{
19 mDone = false;
20 mBestGuess = -1; // illegal value as signal
21 mEscCharSetProber = nullptr;
22
23 mDetectedCharset = nullptr;
24 mGotData = false;
25 mInputState = ePureAscii;
26 mLastChar = '\0';
27
28 unsigned int i;
29 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
30 mCharSetProbers[i] = nullptr;
31 }
32}
33
34nsUniversalDetector::~nsUniversalDetector()
35{
36 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
37 delete mCharSetProbers[i];
38 }
39 delete mEscCharSetProber;
40}
41
42void nsUniversalDetector::Reset()
43{
44 mDone = false;
45 mBestGuess = -1; // illegal value as signal
46
47 mDetectedCharset = nullptr;
48 mGotData = false;
49 mInputState = ePureAscii;
50 mLastChar = '\0';
51
52 if (mEscCharSetProber) {
53 mEscCharSetProber->Reset();
54 }
55
56 unsigned int i;
57 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
58 if (mCharSetProbers[i]) {
59 mCharSetProbers[i]->Reset();
60 }
61 }
62}
63
64//---------------------------------------------------------------------
// Confidence cut-offs.
// SHORTCUT_THRESHOLD is not referenced by the code visible in this file.
// NOTE(review): an included header may define these macros as well; if so the
// definitions must stay token-identical - confirm before editing them.
#define SHORTCUT_THRESHOLD (float)0.95
// MINIMUM_THRESHOLD: a prober's confidence has to exceed this value before
// GetCharSetName()/GetConfidence() report that prober's answer; it is also the
// confidence returned when there is nothing better to report.
#define MINIMUM_THRESHOLD (float)0.20
67
68nsProbingState nsUniversalDetector::HandleData(const char *aBuf, unsigned int aLen)
69{
70 if (mDone) {
71 return eFoundIt;
72 }
73
74 if (aLen > 0) {
75 mGotData = true;
76 }
77
78 unsigned int i;
79 for (i = 0; i < aLen; i++) {
80 // other than 0xa0, if every other character is ascii, the page is ascii
81 if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') { // Since many Ascii only page contains NBSP
82 // we got a non-ascii byte (high-byte)
83 if (mInputState != eHighbyte) {
84 // adjust state
85 mInputState = eHighbyte;
86
87 // kill mEscCharSetProber if it is active
88 delete mEscCharSetProber;
89 mEscCharSetProber = nullptr;
90
91 // start multibyte and singlebyte charset prober
92 if (nullptr == mCharSetProbers[0]) {
93 mCharSetProbers[0] = new nsMBCSGroupProber;
94 }
95 if (nullptr == mCharSetProbers[1]) {
96 mCharSetProbers[1] = new nsSBCSGroupProber;
97 }
98 if (nullptr == mCharSetProbers[2]) {
99 mCharSetProbers[2] = new nsLatin1Prober;
100 }
101 }
102 } else {
103 // ok, just pure ascii so far
104 if (ePureAscii == mInputState && (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~'))) {
105 // found escape character or HZ "~{"
106 mInputState = eEscAscii;
107 }
108
109 mLastChar = aBuf[i];
110 }
111 }
112
113 nsProbingState st = eDetecting;
114 switch (mInputState) {
115 case eEscAscii:
116 if (nullptr == mEscCharSetProber) {
117 mEscCharSetProber = new nsEscCharSetProber;
118 }
119 st = mEscCharSetProber->HandleData(aBuf, aLen);
120 if (st == eFoundIt) {
121 mDone = true;
122 mDetectedCharset = mEscCharSetProber->GetCharSetName();
123 }
124 break;
125 case eHighbyte:
126 for (i = 0; i < NUM_OF_CHARSET_PROBERS; ++i) {
127 st = mCharSetProbers[i]->HandleData(aBuf, aLen);
128 if (st == eFoundIt) {
129 mDone = true;
130 mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
131 }
132 }
133 break;
134
135 default: // pure ascii
136 mDetectedCharset = "UTF-8";
137 }
138 return st;
139}
140
141//---------------------------------------------------------------------
142const char *nsUniversalDetector::GetCharSetName()
143{
144 if (mDetectedCharset) {
145 return mDetectedCharset;
146 }
147 switch (mInputState) {
148 case eHighbyte: {
149 float proberConfidence;
150 float maxProberConfidence = (float)0.0;
151 int maxProber = 0;
152
153 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
154 proberConfidence = mCharSetProbers[i]->GetConfidence();
155 if (proberConfidence > maxProberConfidence) {
156 maxProberConfidence = proberConfidence;
157 maxProber = i;
158 }
159 }
160 // do not report anything because we are not confident of it, that's in fact a negative answer
161 if (maxProberConfidence > MINIMUM_THRESHOLD) {
162 return mCharSetProbers[maxProber]->GetCharSetName();
163 }
164 }
165 case eEscAscii:
166 break;
167 default: // pure ascii
168 ;
169 }
170 return "UTF-8";
171}
172
173//---------------------------------------------------------------------
174float nsUniversalDetector::GetConfidence()
175{
176 if (!mGotData) {
177 // we haven't got any data yet, return immediately
178 // caller program sometimes call DataEnd before anything has been sent to detector
179 return MINIMUM_THRESHOLD;
180 }
181 if (mDetectedCharset) {
182 return 0.99f;
183 }
184 switch (mInputState) {
185 case eHighbyte: {
186 float proberConfidence;
187 float maxProberConfidence = (float)0.0;
188 int maxProber = 0;
189
190 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
191 proberConfidence = mCharSetProbers[i]->GetConfidence();
192 if (proberConfidence > maxProberConfidence) {
193 maxProberConfidence = proberConfidence;
194 maxProber = i;
195 }
196 }
197 // do not report anything because we are not confident of it, that's in fact a negative answer
198 if (maxProberConfidence > MINIMUM_THRESHOLD) {
199 return mCharSetProbers[maxProber]->GetConfidence();
200 }
201 }
202 case eEscAscii:
203 break;
204 default: // pure ascii
205 ;
206 }
207 return MINIMUM_THRESHOLD;
208}
209
210nsProbingState nsUniversalDetector::GetState()
211{
212 if (mDone) {
213 return eFoundIt;
214 } else {
215 return eDetecting;
216 }
217}
218}
219

source code of kcodecs/src/probers/nsUniversalDetector.cpp