1/* -*- C++ -*-
2 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
3
4 SPDX-License-Identifier: MIT
5*/
6
// For Japanese encodings, we rely on the following characteristics:
// 1. kana characters (or hankaku?) appear with high frequency
// 2. kana characters often occur in groups
// 3. certain combinations of kana are never used in the Japanese language
11
12#include "nsEUCJPProber.h"
13
14namespace kencodingprober
15{
16void nsEUCJPProber::Reset(void)
17{
18 mCodingSM->Reset();
19 mState = eDetecting;
20 mContextAnalyser.Reset();
21 mDistributionAnalyser.Reset();
22}
23
24nsProbingState nsEUCJPProber::HandleData(const char *aBuf, unsigned int aLen)
25{
26 if (aLen == 0) {
27 return mState;
28 }
29
30 for (unsigned int i = 0; i < aLen; i++) {
31 const nsSMState codingState = mCodingSM->NextState(c: aBuf[i]);
32 if (codingState == eError) {
33 mState = eNotMe;
34 break;
35 }
36 if (codingState == eItsMe) {
37 mState = eFoundIt;
38 break;
39 }
40 if (codingState == eStart) {
41 unsigned int charLen = mCodingSM->GetCurrentCharLen();
42
43 if (i == 0) {
44 mLastChar[1] = aBuf[0];
45 mContextAnalyser.HandleOneChar(aStr: mLastChar, aCharLen: charLen);
46 mDistributionAnalyser.HandleOneChar(aStr: mLastChar, aCharLen: charLen);
47 } else {
48 mContextAnalyser.HandleOneChar(aStr: aBuf + i - 1, aCharLen: charLen);
49 mDistributionAnalyser.HandleOneChar(aStr: aBuf + i - 1, aCharLen: charLen);
50 }
51 }
52 }
53
54 mLastChar[0] = aBuf[aLen - 1];
55
56 if (mState == eDetecting) {
57 if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) {
58 mState = eFoundIt;
59 }
60 }
61
62 return mState;
63}
64
65float nsEUCJPProber::GetConfidence(void)
66{
67 float contxtCf = mContextAnalyser.GetConfidence();
68 float distribCf = mDistributionAnalyser.GetConfidence();
69
70 return (contxtCf > distribCf ? contxtCf : distribCf);
71}
72}
73

// Source: kcodecs/src/probers/nsEUCJPProber.cpp