1 | /* -*- C++ -*- |
2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
3 | |
4 | SPDX-License-Identifier: MIT |
5 | */ |
6 | |
7 | #include "nsLatin1Prober.h" |
8 | #include <stdio.h> |
9 | #include <stdlib.h> |
10 | |
// Character classes produced by Latin1_CharToClass. The numeric values double
// as row/column indices into Latin1ClassModel, so they have to stay contiguous
// and start at 0, with CLASS_NUM equal to the number of classes.
#define UDF 0 // undefined; every transition involving it is rated illegal
#define OTH 1 // other (anything that is not classified as a letter)
#define ASC 2 // ASCII capital letter
#define ASS 3 // ASCII small letter
#define ACV 4 // accented capital vowel
#define ACO 5 // accented capital, other than a vowel
#define ASV 6 // accented small vowel
#define ASO 7 // accented small, other than a vowel
#define CLASS_NUM 8 // total number of classes (row width of Latin1ClassModel)
20 | |
21 | namespace kencodingprober |
22 | { |
23 | static const unsigned char Latin1_CharToClass[] = { |
24 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07 |
25 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F |
26 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17 |
27 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F |
28 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27 |
29 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F |
30 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37 |
31 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F |
32 | OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47 |
33 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F |
34 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57 |
35 | ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F |
36 | OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67 |
37 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F |
38 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77 |
39 | ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F |
40 | OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87 |
41 | OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F |
42 | UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97 |
43 | OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F |
44 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7 |
45 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF |
46 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7 |
47 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF |
48 | ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7 |
49 | ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF |
50 | ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7 |
51 | ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF |
52 | ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7 |
53 | ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF |
54 | ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7 |
55 | ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF |
56 | }; |
57 | |
58 | /* 0 : illegal |
59 | 1 : very unlikely |
60 | 2 : normal |
61 | 3 : very likely |
62 | */ |
63 | static const unsigned char Latin1ClassModel[] = { |
64 | /* UDF OTH ASC ASS ACV ACO ASV ASO */ |
65 | /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0, |
66 | /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3, |
67 | /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3, |
68 | /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3, |
69 | /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2, |
70 | /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3, |
71 | /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3, |
72 | /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3, |
73 | }; |
74 | |
75 | void nsLatin1Prober::Reset(void) |
76 | { |
77 | mState = eDetecting; |
78 | mLastCharClass = OTH; |
79 | for (int i = 0; i < FREQ_CAT_NUM; i++) { |
80 | mFreqCounter[i] = 0; |
81 | } |
82 | } |
83 | |
84 | nsProbingState nsLatin1Prober::HandleData(const char *aBuf, unsigned int aLen) |
85 | { |
86 | char *newBuf1 = nullptr; |
87 | unsigned int newLen1 = 0; |
88 | |
89 | if (!FilterWithEnglishLetters(aBuf, aLen, newBuf: &newBuf1, newLen&: newLen1)) { |
90 | newBuf1 = (char *)aBuf; |
91 | newLen1 = aLen; |
92 | } |
93 | |
94 | for (unsigned int i = 0; i < newLen1; i++) { |
95 | const unsigned char charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]]; |
96 | const unsigned char freq = Latin1ClassModel[mLastCharClass * CLASS_NUM + charClass]; |
97 | if (freq == 0) { |
98 | mState = eNotMe; |
99 | break; |
100 | } |
101 | mFreqCounter[freq]++; |
102 | mLastCharClass = charClass; |
103 | } |
104 | |
105 | if (newBuf1 != aBuf) { |
106 | free(ptr: newBuf1); |
107 | } |
108 | |
109 | return mState; |
110 | } |
111 | |
112 | float nsLatin1Prober::GetConfidence(void) |
113 | { |
114 | if (mState == eNotMe) { |
115 | return 0.01f; |
116 | } |
117 | |
118 | float confidence; |
119 | unsigned int total = 0; |
120 | for (int i = 0; i < FREQ_CAT_NUM; i++) { |
121 | total += mFreqCounter[i]; |
122 | } |
123 | |
124 | if (!total) { |
125 | confidence = 0.0f; |
126 | } else { |
127 | confidence = mFreqCounter[3] * 1.0f / total; |
128 | confidence -= mFreqCounter[1] * 20.0f / total; |
129 | } |
130 | |
131 | if (confidence < 0.0f) { |
132 | confidence = 0.0f; |
133 | } |
134 | |
135 | // lower the confidence of latin1 so that other more accurate detector |
136 | // can take priority. |
137 | confidence *= 0.50f; |
138 | |
139 | return confidence; |
140 | } |
141 | |
#ifdef DEBUG_PROBE
/* Debug helper (only built when DEBUG_PROBE is defined): prints the current
   confidence and the charset name to stdout. */
void nsLatin1Prober::DumpStatus()
{
    const float confidence = GetConfidence();
    const auto charsetName = GetCharSetName();
    printf(" Latin1Prober: %1.3f [%s]\r\n", confidence, charsetName);
}
#endif
148 | } |
149 | |