1 | /* -*- C++ -*- |
---|---|
2 | SPDX-FileCopyrightText: 2008 Wang Kai <wkai@gmail.com> |
3 | |
4 | SPDX-License-Identifier: MIT |
5 | */ |
6 | |
7 | #include "UnicodeGroupProber.h" |
8 | |
9 | #include <QChar> |
10 | #include <math.h> |
11 | |
12 | namespace kencodingprober |
13 | { |
14 | UnicodeGroupProber::UnicodeGroupProber(void) |
15 | { |
16 | mCodingSM[0] = new nsCodingStateMachine(&UTF8SMModel); |
17 | mCodingSM[1] = new nsCodingStateMachine(&UCS2LESMModel); |
18 | mCodingSM[2] = new nsCodingStateMachine(&UCS2BESMModel); |
19 | mActiveSM = NUM_OF_UNICODE_CHARSETS; |
20 | mState = eDetecting; |
21 | mDetectedCharset = "UTF-8"; |
22 | } |
23 | |
24 | UnicodeGroupProber::~UnicodeGroupProber(void) |
25 | { |
26 | for (unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) { |
27 | delete mCodingSM[i]; |
28 | } |
29 | } |
30 | |
31 | void UnicodeGroupProber::Reset(void) |
32 | { |
33 | mState = eDetecting; |
34 | for (unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) { |
35 | mCodingSM[i]->Reset(); |
36 | } |
37 | mActiveSM = NUM_OF_UNICODE_CHARSETS; |
38 | mDetectedCharset = "UTF-8"; |
39 | } |
40 | |
41 | nsProbingState UnicodeGroupProber::HandleData(const char *aBuf, unsigned int aLen) |
42 | { |
43 | nsSMState codingState; |
44 | static bool disableUTF16LE = false; |
45 | static bool disableUTF16BE = false; |
46 | |
47 | if (mActiveSM == 0 || aLen < 2) { |
48 | mState = eNotMe; |
49 | return mState; |
50 | } |
51 | |
52 | if (!(disableUTF16LE || disableUTF16BE)) { |
53 | if (aLen % 2 != 0) { |
54 | disableUTF16LE = true; |
55 | disableUTF16BE = true; |
56 | } |
57 | const uint weight_BOM = sqrt(x: (double)aLen) + aLen / 10.0; |
58 | uint counts[5] = {0, 0, 0, 0, 0}; |
59 | for (uint i = 0; i < 5; i++) { |
60 | counts[i] = std::count(first: aBuf, last: aBuf + aLen, value: char(i)); |
61 | } |
62 | const double weight_zero = (2.0 * (counts[0] + counts[1] + counts[2] + counts[3] + counts[4]) + weight_BOM) / aLen; |
63 | if (weight_zero < log(x: 1.4142)) { |
64 | disableUTF16LE = true; |
65 | disableUTF16BE = true; |
66 | } |
67 | if (4 >= aBuf[1] && aBuf[1] >= 0 && QChar::isPrint(ucs4: static_cast<uint>(aBuf[0]))) { |
68 | disableUTF16BE = true; |
69 | } else { |
70 | disableUTF16LE = true; |
71 | } |
72 | if (disableUTF16BE) { |
73 | mActiveSM--; |
74 | } |
75 | if (disableUTF16LE) { |
76 | nsCodingStateMachine *t; |
77 | t = mCodingSM[1]; |
78 | mCodingSM[1] = mCodingSM[2]; |
79 | mCodingSM[2] = t; |
80 | mActiveSM--; |
81 | } |
82 | } |
83 | |
84 | for (uint i = 0; i < aLen; ++i) { |
85 | for (int j = mActiveSM - 1; j >= 0; --j) { |
86 | // byte is feed to all active state machine |
87 | codingState = mCodingSM[j]->NextState(c: aBuf[i]); |
88 | if (codingState == eError) { |
89 | // got negative answer for this state machine, make it inactive |
90 | mActiveSM--; |
91 | if (mActiveSM == 0) { |
92 | mState = eNotMe; |
93 | return mState; |
94 | } else if (j != (int)mActiveSM) { |
95 | nsCodingStateMachine *t; |
96 | t = mCodingSM[mActiveSM]; |
97 | mCodingSM[mActiveSM] = mCodingSM[j]; |
98 | mCodingSM[j] = t; |
99 | } |
100 | } else if (codingState == eItsMe) { |
101 | mState = eFoundIt; |
102 | mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); |
103 | return mState; |
104 | } else if (mState == eDetecting) { |
105 | mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); |
106 | }; |
107 | } |
108 | } |
109 | return mState; |
110 | } |
111 | |
112 | float UnicodeGroupProber::GetConfidence() |
113 | { |
114 | if (mState == eFoundIt) { |
115 | return 0.99f; |
116 | } else { |
117 | return 0.0f; |
118 | } |
119 | } |
120 | |
121 | #ifdef DEBUG_PROBE |
122 | void UnicodeGroupProber::DumpStatus() |
123 | { |
124 | GetConfidence(); |
125 | for (uint i = 0; i < mActiveSM; i++) { |
126 | qDebug() << "Unicode group"<< mCodingSM[i]->DumpCurrentState() << mCodingSM[i]->GetCodingStateMachine(); |
127 | } |
128 | } |
129 | #endif |
130 | |
131 | } |
132 |