1/* -*- C++ -*-
2 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
3
4 SPDX-License-Identifier: MIT
5*/
6
7#include "nsMBCSGroupProber.h"
8
9#include "UnicodeGroupProber.h"
10#include "nsBig5Prober.h"
11#include "nsEUCJPProber.h"
12#include "nsEUCKRProber.h"
13#include "nsGB2312Prober.h"
14#include "nsSJISProber.h"
15
16#include <stdio.h>
17#include <stdlib.h>
18
19namespace kencodingprober
20{
21#ifdef DEBUG_PROBE
// Human-readable labels used by the debug dump. Entry i labels mProbers[i],
// so the order here follows the order in which the constructor fills the slots.
static constexpr const char *ProberName[] = {
    "Unicode", // slot 0
    "SJIS", // slot 1
    "EUCJP", // slot 2
    "GB18030", // slot 3
    "EUCKR", // slot 4
    "Big5", // slot 5
};
30
31#endif
32
33namespace
34{
35using Prober = nsMBCSGroupProber::Prober;
36constexpr std::array<bool, 6> fromSelectedList(std::span<const Prober> selected)
37{
38 std::array<bool, 6> isSelected{false};
39 for (auto p : selected) {
40 const auto i = static_cast<std::underlying_type_t<Prober>>(p);
41 if (i >= NUM_OF_PROBERS) {
42 continue;
43 }
44 isSelected[i] = true;
45 }
46 return isSelected;
47}
48static_assert(fromSelectedList(selected: {})[0] == false);
49static_assert(fromSelectedList(selected: {})[5] == false);
50static_assert(fromSelectedList(selected: std::array{Prober::Unicode})[0] == true);
51static_assert(fromSelectedList(selected: std::array{Prober::Unicode})[5] == false);
52static_assert(fromSelectedList(selected: std::array{Prober::SJIS, Prober::Big5})[0] == false);
53static_assert(fromSelectedList(selected: std::array{Prober::SJIS, Prober::Big5})[1] == true);
54static_assert(fromSelectedList(selected: std::array{Prober::SJIS, Prober::Big5})[2] == false);
55static_assert(fromSelectedList(selected: std::array{Prober::SJIS, Prober::Big5})[3] == false);
56static_assert(fromSelectedList(selected: std::array{Prober::SJIS, Prober::Big5})[4] == false);
57static_assert(fromSelectedList(selected: std::array{Prober::SJIS, Prober::Big5})[5] == true);
58
59} // namespace <anonymous>
60
61nsMBCSGroupProber::nsMBCSGroupProber(std::span<const Prober> selected)
62 : mIsSelected(fromSelectedList(selected))
63{
64 mProbers[0] = new UnicodeGroupProber();
65 mProbers[1] = new nsSJISProber();
66 mProbers[2] = new nsEUCJPProber();
67 mProbers[3] = new nsGB18030Prober();
68 mProbers[4] = new nsEUCKRProber();
69 mProbers[5] = new nsBig5Prober();
70 Reset();
71}
72
73nsMBCSGroupProber::nsMBCSGroupProber()
74 : nsMBCSGroupProber(std::array{
75 Prober::Unicode,
76 Prober::SJIS,
77 Prober::EUCJP,
78 Prober::GB18030,
79 Prober::EUCKR,
80 Prober::Big5,
81 })
82{
83}
84
85nsMBCSGroupProber::~nsMBCSGroupProber()
86{
87 for (unsigned int i = 0; i < NUM_OF_PROBERS; i++) {
88 delete mProbers[i];
89 }
90}
91
92const char *nsMBCSGroupProber::GetCharSetName()
93{
94 if (mBestGuess == -1) {
95 GetConfidence();
96 if (mBestGuess == -1) {
97 mBestGuess = 0;
98 }
99 }
100 return mProbers[mBestGuess]->GetCharSetName();
101}
102
103void nsMBCSGroupProber::Reset(void)
104{
105 mActiveNum = 0;
106 for (unsigned int i = 0; i < NUM_OF_PROBERS; i++) {
107 if (mProbers[i] && mIsSelected[i]) {
108 mProbers[i]->Reset();
109 mIsActive[i] = true;
110 ++mActiveNum;
111 } else {
112 mIsActive[i] = false;
113 }
114 }
115 mBestGuess = -1;
116 mState = eDetecting;
117}
118
119nsProbingState nsMBCSGroupProber::HandleData(const char *aBuf, unsigned int aLen)
120{
121 nsProbingState st;
122 unsigned int i;
123
124 // do filtering to reduce load to probers
125 char *highbyteBuf;
126 char *hptr;
127 bool keepNext = true; // assume previous is not ascii, it will do no harm except add some noise
128 hptr = highbyteBuf = (char *)malloc(size: aLen);
129 if (!hptr) {
130 return mState;
131 }
132 for (i = 0; i < aLen; ++i) {
133 if (aBuf[i] & 0x80) {
134 *hptr++ = aBuf[i];
135 keepNext = true;
136 } else {
137 // if previous is highbyte, keep this even it is a ASCII
138 if (keepNext) {
139 *hptr++ = aBuf[i];
140 keepNext = false;
141 }
142 }
143 }
144
145 for (i = 0; i < NUM_OF_PROBERS; ++i) {
146 if (!mIsActive[i]) {
147 continue;
148 }
149 st = mProbers[i]->HandleData(aBuf: highbyteBuf, aLen: hptr - highbyteBuf);
150 if (st == eFoundIt) {
151 mBestGuess = i;
152 mState = eFoundIt;
153 break;
154 } else if (st == eNotMe) {
155 mIsActive[i] = false;
156 mActiveNum--;
157 if (mActiveNum == 0) {
158 mState = eNotMe;
159 break;
160 }
161 }
162 }
163
164 free(ptr: highbyteBuf);
165
166 return mState;
167}
168
169float nsMBCSGroupProber::GetConfidence(void)
170{
171 unsigned int i;
172 float bestConf = 0.0;
173 float cf;
174
175 switch (mState) {
176 case eFoundIt:
177 return (float)0.99;
178 case eNotMe:
179 return (float)0.01;
180 default:
181 for (i = 0; i < NUM_OF_PROBERS; ++i) {
182 if (!mIsActive[i]) {
183 continue;
184 }
185 cf = mProbers[i]->GetConfidence();
186 if (bestConf < cf) {
187 bestConf = cf;
188 mBestGuess = i;
189 }
190 }
191 }
192 return bestConf;
193}
194
195#ifdef DEBUG_PROBE
196void nsMBCSGroupProber::DumpStatus()
197{
198 unsigned int i;
199 float cf;
200
201 GetConfidence();
202 for (i = 0; i < NUM_OF_PROBERS; i++) {
203 if (!mIsSelected[i]) {
204 printf(" MBCS deselected: [%s][%s]\r\n", ProberName[i], mProbers[i]->GetCharSetName());
205 } else if (!mIsActive[i]) {
206 printf(" MBCS inactive: [%s][%s] (confidence is too low).\r\n", ProberName[i], mProbers[i]->GetCharSetName());
207 } else {
208 cf = mProbers[i]->GetConfidence();
209 printf(" MBCS %1.3f: [%s][%s]\r\n", cf, ProberName[i], mProbers[i]->GetCharSetName());
210 mProbers[i]->DumpStatus();
211 }
212 }
213}
214#endif
215}
216

// Source code of kcodecs/src/probers/nsMBCSGroupProber.cpp