1 | /* -*- C++ -*- |
---|---|
2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
3 | |
4 | SPDX-License-Identifier: MIT |
5 | */ |
6 | |
7 | #include "nsSBCharSetProber.h" |
8 | |
9 | #include <stdio.h> |
10 | |
11 | namespace kencodingprober |
12 | { |
13 | nsProbingState nsSingleByteCharSetProber::HandleData(const char *aBuf, unsigned int aLen) |
14 | { |
15 | for (unsigned int i = 0; i < aLen; i++) { |
16 | const unsigned char order = mModel->charToOrderMap[(unsigned char)aBuf[i]]; |
17 | |
18 | if (order < SYMBOL_CAT_ORDER) { |
19 | mTotalChar++; |
20 | } |
21 | if (order < SAMPLE_SIZE) { |
22 | mFreqChar++; |
23 | |
24 | if (mLastOrder < SAMPLE_SIZE) { |
25 | mTotalSeqs++; |
26 | if (!mReversed) { |
27 | ++(mSeqCounters[(int)mModel->precedenceMatrix[mLastOrder * SAMPLE_SIZE + order]]); |
28 | } else { // reverse the order of the letters in the lookup |
29 | ++(mSeqCounters[(int)mModel->precedenceMatrix[order * SAMPLE_SIZE + mLastOrder]]); |
30 | } |
31 | } |
32 | } |
33 | mLastOrder = order; |
34 | } |
35 | |
36 | if (mState == eDetecting) { |
37 | if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD) { |
38 | float cf = GetConfidence(); |
39 | if (cf > POSITIVE_SHORTCUT_THRESHOLD) { |
40 | mState = eFoundIt; |
41 | } else if (cf < NEGATIVE_SHORTCUT_THRESHOLD) { |
42 | mState = eNotMe; |
43 | } |
44 | } |
45 | } |
46 | |
47 | return mState; |
48 | } |
49 | |
50 | void nsSingleByteCharSetProber::Reset(void) |
51 | { |
52 | mState = eDetecting; |
53 | mLastOrder = 255; |
54 | for (unsigned int i = 0; i < NUMBER_OF_SEQ_CAT; i++) { |
55 | mSeqCounters[i] = 0; |
56 | } |
57 | mTotalSeqs = 0; |
58 | mTotalChar = 0; |
59 | mFreqChar = 0; |
60 | } |
61 | |
62 | //#define NEGATIVE_APPROACH 1 |
63 | |
64 | float nsSingleByteCharSetProber::GetConfidence(void) |
65 | { |
66 | #ifdef NEGATIVE_APPROACH |
67 | if (mTotalSeqs > 0) |
68 | if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT] * 10) { |
69 | return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT] * 10)) / mTotalSeqs * mFreqChar / mTotalChar; |
70 | } |
71 | return (float)0.01; |
72 | #else // POSITIVE_APPROACH |
73 | float r; |
74 | |
75 | if (mTotalSeqs > 0) { |
76 | r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio; |
77 | r = r * mFreqChar / mTotalChar; |
78 | if (r >= (float)1.00) { |
79 | r = (float)0.99; |
80 | } |
81 | return r; |
82 | } |
83 | return (float)0.01; |
84 | #endif |
85 | } |
86 | |
87 | const char *nsSingleByteCharSetProber::GetCharSetName() |
88 | { |
89 | if (!mNameProber) { |
90 | return mModel->charsetName; |
91 | } |
92 | return mNameProber->GetCharSetName(); |
93 | } |
94 | |
95 | #ifdef DEBUG_PROBE |
96 | void nsSingleByteCharSetProber::DumpStatus() |
97 | { |
98 | printf(" SBCS: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); |
99 | } |
100 | #endif |
101 | } |
102 |