| 1 | /* -*- C++ -*- |
| 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
| 3 | |
| 4 | SPDX-License-Identifier: MIT |
| 5 | */ |
| 6 | |
| 7 | #include "nsLatin1Prober.h" |
| 8 | #include <stdio.h> |
| 9 | #include <stdlib.h> |
| 10 | |
// Character classes assigned to each byte value by the Latin-1 lookup table.
// The numeric values double as row/column indices into the class-pair model,
// so they must stay contiguous and start at zero.
#define UDF       0 // undefined byte value
#define OTH       1 // anything that is not a letter ("other")
#define ASC       2 // ASCII capital letter
#define ASS       3 // ASCII small letter
#define ACV       4 // accented capital vowel
#define ACO       5 // accented capital, non-vowel
#define ASV       6 // accented small vowel
#define ASO       7 // accented small, non-vowel
#define CLASS_NUM 8 // number of classes defined above
| 20 | |
| 21 | namespace kencodingprober |
| 22 | { |
| 23 | static const unsigned char Latin1_CharToClass[] = { |
| 24 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07 |
| 25 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F |
| 26 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17 |
| 27 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F |
| 28 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27 |
| 29 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F |
| 30 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37 |
| 31 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F |
| 32 | OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47 |
| 33 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F |
| 34 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57 |
| 35 | ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F |
| 36 | OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67 |
| 37 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F |
| 38 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77 |
| 39 | ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F |
| 40 | OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87 |
| 41 | OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F |
| 42 | UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97 |
| 43 | OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F |
| 44 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7 |
| 45 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF |
| 46 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7 |
| 47 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF |
| 48 | ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7 |
| 49 | ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF |
| 50 | ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7 |
| 51 | ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF |
| 52 | ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7 |
| 53 | ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF |
| 54 | ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7 |
| 55 | ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF |
| 56 | }; |
| 57 | |
| 58 | /* 0 : illegal |
| 59 | 1 : very unlikely |
| 60 | 2 : normal |
| 61 | 3 : very likely |
| 62 | */ |
| 63 | static const unsigned char Latin1ClassModel[] = { |
| 64 | /* UDF OTH ASC ASS ACV ACO ASV ASO */ |
| 65 | /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0, |
| 66 | /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3, |
| 67 | /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3, |
| 68 | /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3, |
| 69 | /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2, |
| 70 | /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3, |
| 71 | /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3, |
| 72 | /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3, |
| 73 | }; |
| 74 | |
| 75 | void nsLatin1Prober::Reset(void) |
| 76 | { |
| 77 | mState = eDetecting; |
| 78 | mLastCharClass = OTH; |
| 79 | for (int i = 0; i < FREQ_CAT_NUM; i++) { |
| 80 | mFreqCounter[i] = 0; |
| 81 | } |
| 82 | } |
| 83 | |
| 84 | nsProbingState nsLatin1Prober::HandleData(const char *aBuf, unsigned int aLen) |
| 85 | { |
| 86 | char *newBuf1 = nullptr; |
| 87 | unsigned int newLen1 = 0; |
| 88 | |
| 89 | if (!FilterWithEnglishLetters(aBuf, aLen, newBuf: &newBuf1, newLen&: newLen1)) { |
| 90 | newBuf1 = (char *)aBuf; |
| 91 | newLen1 = aLen; |
| 92 | } |
| 93 | |
| 94 | for (unsigned int i = 0; i < newLen1; i++) { |
| 95 | const unsigned char charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]]; |
| 96 | const unsigned char freq = Latin1ClassModel[mLastCharClass * CLASS_NUM + charClass]; |
| 97 | if (freq == 0) { |
| 98 | mState = eNotMe; |
| 99 | break; |
| 100 | } |
| 101 | mFreqCounter[freq]++; |
| 102 | mLastCharClass = charClass; |
| 103 | } |
| 104 | |
| 105 | if (newBuf1 != aBuf) { |
| 106 | free(ptr: newBuf1); |
| 107 | } |
| 108 | |
| 109 | return mState; |
| 110 | } |
| 111 | |
| 112 | float nsLatin1Prober::GetConfidence(void) |
| 113 | { |
| 114 | if (mState == eNotMe) { |
| 115 | return 0.01f; |
| 116 | } |
| 117 | |
| 118 | float confidence; |
| 119 | unsigned int total = 0; |
| 120 | for (int i = 0; i < FREQ_CAT_NUM; i++) { |
| 121 | total += mFreqCounter[i]; |
| 122 | } |
| 123 | |
| 124 | if (!total) { |
| 125 | confidence = 0.0f; |
| 126 | } else { |
| 127 | confidence = mFreqCounter[3] * 1.0f / total; |
| 128 | confidence -= mFreqCounter[1] * 20.0f / total; |
| 129 | } |
| 130 | |
| 131 | if (confidence < 0.0f) { |
| 132 | confidence = 0.0f; |
| 133 | } |
| 134 | |
| 135 | // lower the confidence of latin1 so that other more accurate detector |
| 136 | // can take priority. |
| 137 | confidence *= 0.50f; |
| 138 | |
| 139 | return confidence; |
| 140 | } |
| 141 | |
#ifdef DEBUG_PROBE
// Debug-only helper: prints the current confidence and the charset name to
// stdout. Compiled in only when DEBUG_PROBE is defined.
void nsLatin1Prober::DumpStatus()
{
    printf(" Latin1Prober: %1.3f [%s]\r\n",
           GetConfidence(),
           GetCharSetName());
}
#endif
| 148 | } |
| 149 | |