1 | /* -*- C++ -*- |
2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
3 | |
4 | SPDX-License-Identifier: MIT |
5 | */ |
6 | |
7 | #ifndef NSSBCHARSETPROBER_H |
8 | #define NSSBCHARSETPROBER_H |
9 | |
10 | #include "nsCharSetProber.h" |
11 | |
// Tuning constants for the single-byte charset prober.

// Dimension of the square precedence matrix referenced by SequenceModel
// ([SAMPLE_SIZE][SAMPLE_SIZE]).
#define SAMPLE_SIZE 64
// NOTE(review): presumably the number of observed sequences considered
// "enough" to draw a conclusion; the use site is not in this header - confirm.
#define SB_ENOUGH_REL_THRESHOLD 1024
// NOTE(review): presumably confidence bounds above/below which probing stops
// early with a positive/negative verdict - confirm in the implementation.
#define POSITIVE_SHORTCUT_THRESHOLD (float)0.95
#define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05
// NOTE(review): presumably the char order value at or above which a byte is
// treated as a symbol rather than a letter - confirm in the implementation.
#define SYMBOL_CAT_ORDER 250
// Number of sequence categories; sizes the mSeqCounters array of the prober.
#define NUMBER_OF_SEQ_CAT 4
// Index of the last category (NUMBER_OF_SEQ_CAT - 1).
#define POSITIVE_CAT (NUMBER_OF_SEQ_CAT - 1)
// Index of the first category.
#define NEGATIVE_CAT 0
20 | |
21 | namespace kencodingprober |
22 | { |
/**
 * Language model describing one single-byte charset.
 *
 * NOTE(review): the models declared extern in this header are presumably
 * brace-initialized in other files, so the member order must stay as it is.
 */
struct SequenceModel {
    /// Table with 256 entries, used to find the order of a character.
    const unsigned char *charToOrderMap;
    /// [SAMPLE_SIZE][SAMPLE_SIZE] table, used to find the frequency of a
    /// two-character sequence.
    const char *precedenceMatrix;
    /// Equals freqSeqs / totalSeqs.
    float mTypicalPositiveRatio;
    /// Tells whether this script contains English characters (not implemented).
    bool keepEnglishLetter;
    /// Name of the charset this model describes.
    const char *charsetName;
};
30 | |
31 | class KCODECS_NO_EXPORT nsSingleByteCharSetProber : public nsCharSetProber |
32 | { |
33 | public: |
34 | explicit nsSingleByteCharSetProber(const SequenceModel *model) |
35 | : mModel(model) |
36 | , mReversed(false) |
37 | , mNameProber(nullptr) |
38 | { |
39 | Reset(); |
40 | } |
41 | nsSingleByteCharSetProber(const SequenceModel *model, bool reversed, nsCharSetProber *nameProber) |
42 | : mModel(model) |
43 | , mReversed(reversed) |
44 | , mNameProber(nameProber) |
45 | { |
46 | Reset(); |
47 | } |
48 | |
49 | const char *GetCharSetName() override; |
50 | nsProbingState HandleData(const char *aBuf, unsigned int aLen) override; |
51 | nsProbingState GetState(void) override |
52 | { |
53 | return mState; |
54 | } |
55 | void Reset(void) override; |
56 | float GetConfidence(void) override; |
57 | void SetOpion() override |
58 | { |
59 | } |
60 | |
61 | // This feature is not implemented yet. any current language model |
62 | // contain this parameter as false. No one is looking at this |
63 | // parameter or calling this method. |
64 | // Moreover, the nsSBCSGroupProber which calls the HandleData of this |
65 | // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid |
66 | // of the English letters. |
67 | bool KeepEnglishLetters() |
68 | { |
69 | return mModel->keepEnglishLetter; |
70 | } // (not implemented) |
71 | |
72 | #ifdef DEBUG_PROBE |
73 | void DumpStatus() override; |
74 | #endif |
75 | |
76 | protected: |
77 | nsProbingState mState; |
78 | const SequenceModel *mModel; |
79 | const bool mReversed; // true if we need to reverse every pair in the model lookup |
80 | |
81 | // char order of last character |
82 | unsigned char mLastOrder; |
83 | |
84 | unsigned int mTotalSeqs; |
85 | unsigned int mSeqCounters[NUMBER_OF_SEQ_CAT]; |
86 | |
87 | unsigned int mTotalChar; |
88 | // characters that fall in our sampling range |
89 | unsigned int mFreqChar; |
90 | |
91 | // Optional auxiliary prober for name decision. created and destroyed by the GroupProber |
92 | nsCharSetProber *mNameProber; |
93 | }; |
94 | |
// Language models usable with nsSingleByteCharSetProber. These are
// declarations only; the model objects are defined elsewhere.

// NOTE(review): presumably Cyrillic (Russian) models - confirm against the
// model definitions.
extern const SequenceModel Koi8rModel;
extern const SequenceModel Win1251Model;
extern const SequenceModel Latin5Model;
extern const SequenceModel MacCyrillicModel;
extern const SequenceModel Ibm866Model;
extern const SequenceModel Ibm855Model;
// NOTE(review): presumably Greek models - confirm.
extern const SequenceModel Latin7Model;
extern const SequenceModel Win1253Model;
// Bulgarian models.
extern const SequenceModel Latin5BulgarianModel;
extern const SequenceModel Win1251BulgarianModel;
// Hungarian models.
extern const SequenceModel Latin2HungarianModel;
extern const SequenceModel Win1250HungarianModel;
// NOTE(review): presumably a Hebrew model - confirm.
extern const SequenceModel Win1255Model;
108 | } |
109 | #endif /* NSSBCHARSETPROBER_H */ |
110 | |