1 | /* -*- C++ -*- |
2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
3 | |
4 | SPDX-License-Identifier: MIT |
5 | */ |
6 | |
7 | #include "ChineseGroupProber.h" |
8 | |
9 | #include "UnicodeGroupProber.h" |
10 | #include "nsBig5Prober.h" |
11 | #include "nsGB2312Prober.h" |
12 | |
13 | #include <stdio.h> |
14 | #include <stdlib.h> |
15 | |
16 | namespace kencodingprober |
17 | { |
18 | #ifdef DEBUG_PROBE |
// Debug-only labels printed by DumpStatus(). Entry N names the prober held in
// slot N of mProbers, so the order here has to follow the constructor.
static const char *const ProberName[] = {"Unicode", "GB18030", "Big5"};
24 | |
25 | #endif |
26 | |
27 | ChineseGroupProber::ChineseGroupProber() |
28 | { |
29 | mProbers[0] = new UnicodeGroupProber(); |
30 | mProbers[1] = new nsGB18030Prober(); |
31 | mProbers[2] = new nsBig5Prober(); |
32 | Reset(); |
33 | } |
34 | |
35 | ChineseGroupProber::~ChineseGroupProber() |
36 | { |
37 | for (unsigned int i = 0; i < CN_NUM_OF_PROBERS; i++) { |
38 | delete mProbers[i]; |
39 | } |
40 | } |
41 | |
42 | const char *ChineseGroupProber::GetCharSetName() |
43 | { |
44 | if (mBestGuess == -1) { |
45 | GetConfidence(); |
46 | if (mBestGuess == -1) { |
47 | mBestGuess = 1; // assume it's GB18030 |
48 | } |
49 | } |
50 | return mProbers[mBestGuess]->GetCharSetName(); |
51 | } |
52 | |
53 | void ChineseGroupProber::Reset(void) |
54 | { |
55 | mActiveNum = 0; |
56 | for (unsigned int i = 0; i < CN_NUM_OF_PROBERS; i++) { |
57 | if (mProbers[i]) { |
58 | mProbers[i]->Reset(); |
59 | mIsActive[i] = true; |
60 | ++mActiveNum; |
61 | } else { |
62 | mIsActive[i] = false; |
63 | } |
64 | } |
65 | mBestGuess = -1; |
66 | mState = eDetecting; |
67 | } |
68 | |
69 | nsProbingState ChineseGroupProber::HandleData(const char *aBuf, unsigned int aLen) |
70 | { |
71 | nsProbingState st; |
72 | unsigned int i; |
73 | |
74 | // do filtering to reduce load to probers |
75 | char *highbyteBuf; |
76 | char *hptr; |
77 | bool keepNext = true; // assume previous is not ascii, it will do no harm except add some noise |
78 | hptr = highbyteBuf = (char *)malloc(size: aLen); |
79 | if (!hptr) { |
80 | return mState; |
81 | } |
82 | for (i = 0; i < aLen; ++i) { |
83 | if (aBuf[i] & 0x80) { |
84 | *hptr++ = aBuf[i]; |
85 | keepNext = true; |
86 | } else { |
87 | // if previous is highbyte, keep this even it is an ASCII |
88 | if (keepNext) { |
89 | *hptr++ = aBuf[i]; |
90 | keepNext = false; |
91 | } |
92 | } |
93 | } |
94 | |
95 | for (i = 0; i < CN_NUM_OF_PROBERS; ++i) { |
96 | if (!mIsActive[i]) { |
97 | continue; |
98 | } |
99 | st = mProbers[i]->HandleData(aBuf: highbyteBuf, aLen: hptr - highbyteBuf); |
100 | if (st == eFoundIt) { |
101 | mBestGuess = i; |
102 | mState = eFoundIt; |
103 | break; |
104 | } else if (st == eNotMe) { |
105 | mIsActive[i] = false; |
106 | --mActiveNum; |
107 | if (mActiveNum == 0) { |
108 | mState = eNotMe; |
109 | break; |
110 | } |
111 | } |
112 | } |
113 | |
114 | free(ptr: highbyteBuf); |
115 | |
116 | return mState; |
117 | } |
118 | |
119 | float ChineseGroupProber::GetConfidence(void) |
120 | { |
121 | unsigned int i; |
122 | float bestConf = 0.0; |
123 | float cf; |
124 | |
125 | switch (mState) { |
126 | case eFoundIt: |
127 | return (float)0.99; |
128 | case eNotMe: |
129 | return (float)0.01; |
130 | default: |
131 | for (i = 0; i < CN_NUM_OF_PROBERS; ++i) { |
132 | if (!mIsActive[i]) { |
133 | continue; |
134 | } |
135 | cf = mProbers[i]->GetConfidence(); |
136 | if (bestConf < cf) { |
137 | bestConf = cf; |
138 | mBestGuess = i; |
139 | } |
140 | } |
141 | } |
142 | return bestConf; |
143 | } |
144 | |
145 | #ifdef DEBUG_PROBE |
146 | void ChineseGroupProber::DumpStatus() |
147 | { |
148 | unsigned int i; |
149 | float cf; |
150 | |
151 | GetConfidence(); |
152 | for (i = 0; i < CN_NUM_OF_PROBERS; i++) { |
153 | if (!mIsActive[i]) { |
154 | printf(" Chinese group inactive: [%s] (confidence is too low).\r\n" , ProberName[i]); |
155 | } else { |
156 | cf = mProbers[i]->GetConfidence(); |
157 | printf(" Chinese group %1.3f: [%s]\r\n" , cf, ProberName[i]); |
158 | } |
159 | } |
160 | } |
161 | #endif |
162 | } |
163 | |