1 | /* -*- C++ -*- |
2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
3 | |
4 | SPDX-License-Identifier: MIT |
5 | */ |
6 | |
7 | #include "nsMBCSGroupProber.h" |
8 | |
9 | #include <stdio.h> |
10 | #include <stdlib.h> |
11 | |
12 | namespace kencodingprober |
13 | { |
14 | #ifdef DEBUG_PROBE |
// Display labels used by the debug dump. Entry i names mProbers[i], so the
// order here has to follow the order in which the constructor fills mProbers.
static const char *const ProberName[] = {"Unicode", "SJIS", "EUCJP", "GB18030", "EUCKR", "Big5"};
23 | |
24 | #endif |
25 | |
26 | nsMBCSGroupProber::nsMBCSGroupProber() |
27 | { |
28 | mProbers[0] = new UnicodeGroupProber(); |
29 | mProbers[1] = new nsSJISProber(); |
30 | mProbers[2] = new nsEUCJPProber(); |
31 | mProbers[3] = new nsGB18030Prober(); |
32 | mProbers[4] = new nsEUCKRProber(); |
33 | mProbers[5] = new nsBig5Prober(); |
34 | Reset(); |
35 | } |
36 | |
37 | nsMBCSGroupProber::~nsMBCSGroupProber() |
38 | { |
39 | for (unsigned int i = 0; i < NUM_OF_PROBERS; i++) { |
40 | delete mProbers[i]; |
41 | } |
42 | } |
43 | |
44 | const char *nsMBCSGroupProber::GetCharSetName() |
45 | { |
46 | if (mBestGuess == -1) { |
47 | GetConfidence(); |
48 | if (mBestGuess == -1) { |
49 | mBestGuess = 0; |
50 | } |
51 | } |
52 | return mProbers[mBestGuess]->GetCharSetName(); |
53 | } |
54 | |
55 | void nsMBCSGroupProber::Reset(void) |
56 | { |
57 | mActiveNum = 0; |
58 | for (unsigned int i = 0; i < NUM_OF_PROBERS; i++) { |
59 | if (mProbers[i]) { |
60 | mProbers[i]->Reset(); |
61 | mIsActive[i] = true; |
62 | ++mActiveNum; |
63 | } else { |
64 | mIsActive[i] = false; |
65 | } |
66 | } |
67 | mBestGuess = -1; |
68 | mState = eDetecting; |
69 | } |
70 | |
71 | nsProbingState nsMBCSGroupProber::HandleData(const char *aBuf, unsigned int aLen) |
72 | { |
73 | nsProbingState st; |
74 | unsigned int i; |
75 | |
76 | // do filtering to reduce load to probers |
77 | char *highbyteBuf; |
78 | char *hptr; |
79 | bool keepNext = true; // assume previous is not ascii, it will do no harm except add some noise |
80 | hptr = highbyteBuf = (char *)malloc(size: aLen); |
81 | if (!hptr) { |
82 | return mState; |
83 | } |
84 | for (i = 0; i < aLen; ++i) { |
85 | if (aBuf[i] & 0x80) { |
86 | *hptr++ = aBuf[i]; |
87 | keepNext = true; |
88 | } else { |
89 | // if previous is highbyte, keep this even it is a ASCII |
90 | if (keepNext) { |
91 | *hptr++ = aBuf[i]; |
92 | keepNext = false; |
93 | } |
94 | } |
95 | } |
96 | |
97 | for (i = 0; i < NUM_OF_PROBERS; ++i) { |
98 | if (!mIsActive[i]) { |
99 | continue; |
100 | } |
101 | st = mProbers[i]->HandleData(aBuf: highbyteBuf, aLen: hptr - highbyteBuf); |
102 | if (st == eFoundIt) { |
103 | mBestGuess = i; |
104 | mState = eFoundIt; |
105 | break; |
106 | } else if (st == eNotMe) { |
107 | mIsActive[i] = false; |
108 | mActiveNum--; |
109 | if (mActiveNum == 0) { |
110 | mState = eNotMe; |
111 | break; |
112 | } |
113 | } |
114 | } |
115 | |
116 | free(ptr: highbyteBuf); |
117 | |
118 | return mState; |
119 | } |
120 | |
121 | float nsMBCSGroupProber::GetConfidence(void) |
122 | { |
123 | unsigned int i; |
124 | float bestConf = 0.0; |
125 | float cf; |
126 | |
127 | switch (mState) { |
128 | case eFoundIt: |
129 | return (float)0.99; |
130 | case eNotMe: |
131 | return (float)0.01; |
132 | default: |
133 | for (i = 0; i < NUM_OF_PROBERS; ++i) { |
134 | if (!mIsActive[i]) { |
135 | continue; |
136 | } |
137 | cf = mProbers[i]->GetConfidence(); |
138 | if (bestConf < cf) { |
139 | bestConf = cf; |
140 | mBestGuess = i; |
141 | } |
142 | } |
143 | } |
144 | return bestConf; |
145 | } |
146 | |
147 | #ifdef DEBUG_PROBE |
148 | void nsMBCSGroupProber::DumpStatus() |
149 | { |
150 | unsigned int i; |
151 | float cf; |
152 | |
153 | GetConfidence(); |
154 | for (i = 0; i < NUM_OF_PROBERS; i++) { |
155 | if (!mIsActive[i]) { |
156 | printf(" MBCS inactive: [%s] (confidence is too low).\r\n" , ProberName[i]); |
157 | } else { |
158 | cf = mProbers[i]->GetConfidence(); |
159 | printf(" MBCS %1.3f: [%s]\r\n" , cf, ProberName[i]); |
160 | } |
161 | } |
162 | } |
163 | #endif |
164 | } |
165 | |