| 1 | /* -*- C++ -*- |
| 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
| 3 | |
| 4 | SPDX-License-Identifier: MIT |
| 5 | */ |
| 6 | |
| 7 | #ifndef CharDistribution_h__ |
| 8 | #define CharDistribution_h__ |
| 9 | |
| 10 | #include "kcodecs_export.h" |
| 11 | |
| 12 | #include <qcompilerdetection.h> |
| 13 | |
// Number of counted characters (mTotalChars) that must be exceeded before
// CharDistributionAnalysis::GotEnoughData() returns true.
#define ENOUGH_DATA_THRESHOLD 256
| 15 | |
| 16 | namespace kencodingprober |
| 17 | { |
| 18 | class KCODECS_NO_EXPORT CharDistributionAnalysis |
| 19 | { |
| 20 | public: |
| 21 | CharDistributionAnalysis() |
| 22 | { |
| 23 | Reset(); |
| 24 | } |
| 25 | virtual ~CharDistributionAnalysis() |
| 26 | { |
| 27 | } |
| 28 | |
| 29 | // feed a block of data and do distribution analysis |
| 30 | void HandleData(const char * /* aBuf */, unsigned int /* aLen */) |
| 31 | { |
| 32 | } |
| 33 | |
| 34 | // Feed a character with known length |
| 35 | void HandleOneChar(const char *aStr, unsigned int aCharLen) |
| 36 | { |
| 37 | int order; |
| 38 | |
| 39 | // we only care about 2-bytes character in our distribution analysis |
| 40 | order = (aCharLen == 2) ? GetOrder(aStr) : -1; |
| 41 | |
| 42 | if (order >= 0) { |
| 43 | mTotalChars++; |
| 44 | // order is valid |
| 45 | if ((unsigned int)order < mTableSize) { |
| 46 | if (512 > mCharToFreqOrder[order]) { |
| 47 | mFreqChars++; |
| 48 | } |
| 49 | } |
| 50 | } |
| 51 | } |
| 52 | |
| 53 | // return confidence base on existing data |
| 54 | float GetConfidence(); |
| 55 | |
| 56 | // Reset analyser, clear any state |
| 57 | void Reset(void) |
| 58 | { |
| 59 | mDone = false; |
| 60 | mTotalChars = 0; |
| 61 | mFreqChars = 0; |
| 62 | } |
| 63 | |
| 64 | // It is not necessary to receive all data to draw conclusion. For charset detection, |
| 65 | // certain amount of data is enough |
| 66 | bool GotEnoughData() |
| 67 | { |
| 68 | return mTotalChars > ENOUGH_DATA_THRESHOLD; |
| 69 | } |
| 70 | |
| 71 | protected: |
| 72 | // we do not handle character base on its original encoding string, but |
| 73 | // convert this encoding string to a number, here called order. |
| 74 | // This allows multiple encodings of a language to share one frequency table |
| 75 | virtual int GetOrder(const char * /* str */) |
| 76 | { |
| 77 | return -1; |
| 78 | } |
| 79 | |
| 80 | // If this flag is set to true, detection is done and conclusion has been made |
| 81 | bool mDone; |
| 82 | |
| 83 | // The number of characters whose frequency order is less than 512 |
| 84 | unsigned int mFreqChars; |
| 85 | |
| 86 | // Total character encountered. |
| 87 | unsigned int mTotalChars; |
| 88 | |
| 89 | // Mapping table to get frequency order from char order (get from GetOrder()) |
| 90 | const short *mCharToFreqOrder; |
| 91 | |
| 92 | // Size of above table |
| 93 | unsigned int mTableSize; |
| 94 | |
| 95 | // This is a constant value varies from language to language, it is used in |
| 96 | // calculating confidence. See my paper for further detail. |
| 97 | float mTypicalDistributionRatio; |
| 98 | }; |
| 99 | |
| 100 | class KCODECS_NO_EXPORT EUCKRDistributionAnalysis : public CharDistributionAnalysis |
| 101 | { |
| 102 | public: |
| 103 | EUCKRDistributionAnalysis(); |
| 104 | |
| 105 | protected: |
| 106 | // for euc-KR encoding, we are interested |
| 107 | // first byte range: 0xb0 -- 0xfe |
| 108 | // second byte range: 0xa1 -- 0xfe |
| 109 | // no validation needed here. State machine has done that |
| 110 | int GetOrder(const char *str) override |
| 111 | { |
| 112 | if ((unsigned char)*str >= (unsigned char)0xb0) { |
| 113 | return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; |
| 114 | } else { |
| 115 | return -1; |
| 116 | } |
| 117 | } |
| 118 | }; |
| 119 | |
| 120 | class KCODECS_NO_EXPORT GB2312DistributionAnalysis : public CharDistributionAnalysis |
| 121 | { |
| 122 | public: |
| 123 | GB2312DistributionAnalysis(); |
| 124 | |
| 125 | protected: |
| 126 | // for GB2312 encoding, we are interested |
| 127 | // first byte range: 0xb0 -- 0xfe |
| 128 | // second byte range: 0xa1 -- 0xfe |
| 129 | // no validation needed here. State machine has done that |
| 130 | int GetOrder(const char *str) override |
| 131 | { |
| 132 | if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) { |
| 133 | return 94 * ((unsigned char)str[0] - (unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; |
| 134 | } else { |
| 135 | return -1; |
| 136 | } |
| 137 | } |
| 138 | }; |
| 139 | |
| 140 | class KCODECS_NO_EXPORT Big5DistributionAnalysis : public CharDistributionAnalysis |
| 141 | { |
| 142 | public: |
| 143 | Big5DistributionAnalysis(); |
| 144 | |
| 145 | protected: |
| 146 | // for big5 encoding, we are interested |
| 147 | // first byte range: 0xa4 -- 0xfe |
| 148 | // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe |
| 149 | // no validation needed here. State machine has done that |
| 150 | int GetOrder(const char *str) override |
| 151 | { |
| 152 | if ((unsigned char)*str >= (unsigned char)0xa4) |
| 153 | if ((unsigned char)str[1] >= (unsigned char)0xa1) { |
| 154 | return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 + 63; |
| 155 | } else { |
| 156 | return 157 * ((unsigned char)str[0] - (unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40; |
| 157 | } |
| 158 | else { |
| 159 | return -1; |
| 160 | } |
| 161 | } |
| 162 | }; |
| 163 | |
| 164 | class KCODECS_NO_EXPORT SJISDistributionAnalysis : public CharDistributionAnalysis |
| 165 | { |
| 166 | public: |
| 167 | SJISDistributionAnalysis(); |
| 168 | |
| 169 | protected: |
| 170 | // for sjis encoding, we are interested |
| 171 | // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe |
| 172 | // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe |
| 173 | // no validation needed here. State machine has done that |
| 174 | int GetOrder(const char *str) override |
| 175 | { |
| 176 | int order; |
| 177 | if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) { |
| 178 | order = 188 * ((unsigned char)str[0] - (unsigned char)0x81); |
| 179 | } else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) { |
| 180 | order = 188 * ((unsigned char)str[0] - (unsigned char)0xe0 + 31); |
| 181 | } else { |
| 182 | return -1; |
| 183 | } |
| 184 | order += (unsigned char)*(str + 1) - 0x40; |
| 185 | if ((unsigned char)str[1] > (unsigned char)0x7f) { |
| 186 | order--; |
| 187 | } |
| 188 | return order; |
| 189 | } |
| 190 | }; |
| 191 | |
| 192 | class KCODECS_NO_EXPORT EUCJPDistributionAnalysis : public CharDistributionAnalysis |
| 193 | { |
| 194 | public: |
| 195 | EUCJPDistributionAnalysis(); |
| 196 | |
| 197 | protected: |
| 198 | // for euc-JP encoding, we are interested |
| 199 | // first byte range: 0xa0 -- 0xfe |
| 200 | // second byte range: 0xa1 -- 0xfe |
| 201 | // no validation needed here. State machine has done that |
| 202 | int GetOrder(const char *str) override |
| 203 | { |
| 204 | if ((unsigned char)*str >= (unsigned char)0xa0) { |
| 205 | return 94 * ((unsigned char)str[0] - (unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1; |
| 206 | } else { |
| 207 | return -1; |
| 208 | } |
| 209 | } |
| 210 | }; |
| 211 | } |
| 212 | #endif // CharDistribution_h__ |
| 213 | |