1 | /* -*- C++ -*- |
2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
3 | |
4 | SPDX-License-Identifier: MIT |
5 | */ |
6 | |
7 | #include "nsSBCSGroupProber.h" |
8 | |
9 | #include "UnicodeGroupProber.h" |
10 | #include "nsHebrewProber.h" |
11 | #include "nsSBCharSetProber.h" |
12 | |
13 | #include <stdio.h> |
14 | #include <stdlib.h> |
15 | |
16 | namespace kencodingprober |
17 | { |
18 | nsSBCSGroupProber::nsSBCSGroupProber() |
19 | { |
20 | mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model); |
21 | mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel); |
22 | mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model); |
23 | mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel); |
24 | mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model); |
25 | mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model); |
26 | mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model); |
27 | mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model); |
28 | mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); |
29 | mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); |
30 | |
31 | nsHebrewProber *hebprober = new nsHebrewProber(); |
32 | // Notice: Any change in these indexes - 10,11,12 must be reflected |
33 | // in the code below as well. |
34 | mProbers[10] = hebprober; |
35 | mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, false, hebprober); // Logical Hebrew |
36 | mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, true, hebprober); // Visual Hebrew |
37 | mProbers[13] = new UnicodeGroupProber(); |
38 | |
39 | // Tell the Hebrew prober about the logical and visual probers |
40 | if (mProbers[10] && mProbers[11] && mProbers[12]) { // all are not null |
41 | hebprober->SetModelProbers(logicalPrb: mProbers[11], visualPrb: mProbers[12]); |
42 | } else { // One or more is null. avoid any Hebrew probing, null them all |
43 | for (unsigned int i = 10; i <= 12; ++i) { |
44 | delete mProbers[i]; |
45 | mProbers[i] = nullptr; |
46 | } |
47 | } |
48 | |
49 | // disable latin2 before latin1 is available, otherwise all latin1 |
50 | // will be detected as latin2 because of their similarity. |
51 | // mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel); |
52 | // mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel); |
53 | |
54 | Reset(); |
55 | } |
56 | |
57 | nsSBCSGroupProber::~nsSBCSGroupProber() |
58 | { |
59 | for (unsigned int i = 0; i < NUM_OF_SBCS_PROBERS; i++) { |
60 | delete mProbers[i]; |
61 | } |
62 | } |
63 | |
64 | const char *nsSBCSGroupProber::GetCharSetName() |
65 | { |
66 | // if we have no answer yet |
67 | if (mBestGuess == -1) { |
68 | GetConfidence(); |
69 | // no charset seems positive |
70 | if (mBestGuess == -1) |
71 | // we will use default. |
72 | { |
73 | mBestGuess = 0; |
74 | } |
75 | } |
76 | return mProbers[mBestGuess]->GetCharSetName(); |
77 | } |
78 | |
79 | void nsSBCSGroupProber::Reset(void) |
80 | { |
81 | mActiveNum = 0; |
82 | for (unsigned int i = 0; i < NUM_OF_SBCS_PROBERS; i++) { |
83 | if (mProbers[i]) { // not null |
84 | mProbers[i]->Reset(); |
85 | mIsActive[i] = true; |
86 | ++mActiveNum; |
87 | } else { |
88 | mIsActive[i] = false; |
89 | } |
90 | } |
91 | mBestGuess = -1; |
92 | mState = eDetecting; |
93 | } |
94 | |
95 | nsProbingState nsSBCSGroupProber::HandleData(const char *aBuf, unsigned int aLen) |
96 | { |
97 | nsProbingState st; |
98 | unsigned int i; |
99 | char *newBuf1 = nullptr; |
100 | unsigned int newLen1 = 0; |
101 | |
102 | // apply filter to original buffer, and we got new buffer back |
103 | // depend on what script it is, we will feed them the new buffer |
104 | // we got after applying proper filter |
105 | // this is done without any consideration to KeepEnglishLetters |
106 | // of each prober since as of now, there are no probers here which |
107 | // recognize languages with English characters. |
108 | if (!FilterWithoutEnglishLetters(aBuf, aLen, newBuf: &newBuf1, newLen&: newLen1)) { |
109 | goto done; |
110 | } |
111 | |
112 | if (newLen1 == 0) { |
113 | goto done; // Nothing to see here, move on. |
114 | } |
115 | |
116 | for (i = 0; i < NUM_OF_SBCS_PROBERS; ++i) { |
117 | if (!mIsActive[i]) { |
118 | continue; |
119 | } |
120 | st = mProbers[i]->HandleData(aBuf: newBuf1, aLen: newLen1); |
121 | if (st == eFoundIt) { |
122 | mBestGuess = i; |
123 | mState = eFoundIt; |
124 | break; |
125 | } else if (st == eNotMe) { |
126 | mIsActive[i] = false; |
127 | mActiveNum--; |
128 | if (mActiveNum == 0) { |
129 | mState = eNotMe; |
130 | break; |
131 | } |
132 | } |
133 | } |
134 | |
135 | done: |
136 | free(ptr: newBuf1); |
137 | |
138 | return mState; |
139 | } |
140 | |
141 | float nsSBCSGroupProber::GetConfidence(void) |
142 | { |
143 | unsigned int i; |
144 | float bestConf = 0.0; |
145 | float cf; |
146 | |
147 | switch (mState) { |
148 | case eFoundIt: |
149 | return (float)0.99; // sure yes |
150 | case eNotMe: |
151 | return (float)0.01; // sure no |
152 | default: |
153 | for (i = 0; i < NUM_OF_SBCS_PROBERS; ++i) { |
154 | if (!mIsActive[i]) { |
155 | continue; |
156 | } |
157 | cf = mProbers[i]->GetConfidence(); |
158 | if (bestConf < cf) { |
159 | bestConf = cf; |
160 | mBestGuess = i; |
161 | } |
162 | } |
163 | } |
164 | return bestConf; |
165 | } |
166 | |
167 | #ifdef DEBUG_PROBE |
168 | void nsSBCSGroupProber::DumpStatus() |
169 | { |
170 | unsigned int i; |
171 | float cf; |
172 | |
173 | cf = GetConfidence(); |
174 | printf(" SBCS Group Prober --------begin status \r\n" ); |
175 | for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) { |
176 | if (!mIsActive[i]) { |
177 | printf(" inactive: [%s] (i.e. confidence is too low).\r\n" , mProbers[i]->GetCharSetName()); |
178 | } else { |
179 | mProbers[i]->DumpStatus(); |
180 | } |
181 | } |
182 | printf(" SBCS Group found best match [%s] confidence %f.\r\n" , mProbers[mBestGuess]->GetCharSetName(), cf); |
183 | } |
184 | #endif |
185 | } |
186 | |