1/* -*- C++ -*-
2 SPDX-FileCopyrightText: 2008 Wang Kai <wkai@gmail.com>
3
4 SPDX-License-Identifier: MIT
5*/
6
7#include "UnicodeGroupProber.h"
8
9#include <QChar>
10#include <math.h>
11
12namespace kencodingprober
13{
14UnicodeGroupProber::UnicodeGroupProber(void)
15{
16 mCodingSM[0] = new nsCodingStateMachine(&UTF8SMModel);
17 mCodingSM[1] = new nsCodingStateMachine(&UCS2LESMModel);
18 mCodingSM[2] = new nsCodingStateMachine(&UCS2BESMModel);
19 mActiveSM = NUM_OF_UNICODE_CHARSETS;
20 mState = eDetecting;
21 mDetectedCharset = "UTF-8";
22}
23
24UnicodeGroupProber::~UnicodeGroupProber(void)
25{
26 for (unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) {
27 delete mCodingSM[i];
28 }
29}
30
31void UnicodeGroupProber::Reset(void)
32{
33 mState = eDetecting;
34 for (unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) {
35 mCodingSM[i]->Reset();
36 }
37 mActiveSM = NUM_OF_UNICODE_CHARSETS;
38 mDetectedCharset = "UTF-8";
39}
40
41nsProbingState UnicodeGroupProber::HandleData(const char *aBuf, unsigned int aLen)
42{
43 nsSMState codingState;
44 static bool disableUTF16LE = false;
45 static bool disableUTF16BE = false;
46
47 if (mActiveSM == 0 || aLen < 2) {
48 mState = eNotMe;
49 return mState;
50 }
51
52 if (!(disableUTF16LE || disableUTF16BE)) {
53 if (aLen % 2 != 0) {
54 disableUTF16LE = true;
55 disableUTF16BE = true;
56 }
57 const uint weight_BOM = sqrt(x: (double)aLen) + aLen / 10.0;
58 uint counts[5] = {0, 0, 0, 0, 0};
59 for (uint i = 0; i < 5; i++) {
60 counts[i] = std::count(first: aBuf, last: aBuf + aLen, value: char(i));
61 }
62 const double weight_zero = (2.0 * (counts[0] + counts[1] + counts[2] + counts[3] + counts[4]) + weight_BOM) / aLen;
63 if (weight_zero < log(x: 1.4142)) {
64 disableUTF16LE = true;
65 disableUTF16BE = true;
66 }
67 if (4 >= aBuf[1] && aBuf[1] >= 0 && QChar::isPrint(ucs4: static_cast<uint>(aBuf[0]))) {
68 disableUTF16BE = true;
69 } else {
70 disableUTF16LE = true;
71 }
72 if (disableUTF16BE) {
73 mActiveSM--;
74 }
75 if (disableUTF16LE) {
76 nsCodingStateMachine *t;
77 t = mCodingSM[1];
78 mCodingSM[1] = mCodingSM[2];
79 mCodingSM[2] = t;
80 mActiveSM--;
81 }
82 }
83
84 for (uint i = 0; i < aLen; ++i) {
85 for (int j = mActiveSM - 1; j >= 0; --j) {
86 // byte is feed to all active state machine
87 codingState = mCodingSM[j]->NextState(c: aBuf[i]);
88 if (codingState == eError) {
89 // got negative answer for this state machine, make it inactive
90 mActiveSM--;
91 if (mActiveSM == 0) {
92 mState = eNotMe;
93 return mState;
94 } else if (j != (int)mActiveSM) {
95 nsCodingStateMachine *t;
96 t = mCodingSM[mActiveSM];
97 mCodingSM[mActiveSM] = mCodingSM[j];
98 mCodingSM[j] = t;
99 }
100 } else if (codingState == eItsMe) {
101 mState = eFoundIt;
102 mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
103 return mState;
104 } else if (mState == eDetecting) {
105 mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
106 };
107 }
108 }
109 return mState;
110}
111
112float UnicodeGroupProber::GetConfidence()
113{
114 if (mState == eFoundIt) {
115 return 0.99f;
116 } else {
117 return 0.0f;
118 }
119}
120
121#ifdef DEBUG_PROBE
122void UnicodeGroupProber::DumpStatus()
123{
124 GetConfidence();
125 for (uint i = 0; i < mActiveSM; i++) {
126 qDebug() << "Unicode group" << mCodingSM[i]->DumpCurrentState() << mCodingSM[i]->GetCodingStateMachine();
127 }
128}
129#endif
130
131}
132

source code of kcodecs/src/probers/UnicodeGroupProber.cpp