1/* -*- C++ -*-
2 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
3
4 SPDX-License-Identifier: MIT
5*/
6
7#include "nsLatin1Prober.h"
8#include <stdio.h>
9#include <stdlib.h>
10
// Character classes used by this prober. Latin1_CharToClass assigns one of
// these to every byte value, and the numeric values double as row/column
// indices into Latin1ClassModel, so they must stay dense and start at 0.
#define UDF 0 // undefined
#define OTH 1 // other (anything that is not a letter class below)
#define ASC 2 // ascii capital letter
#define ASS 3 // ascii small letter
#define ACV 4 // accent capital vowel
#define ACO 5 // accent capital other
#define ASV 6 // accent small vowel
#define ASO 7 // accent small other
#define CLASS_NUM 8 // total classes; also the row width of Latin1ClassModel
20
21namespace kencodingprober
22{
23static const unsigned char Latin1_CharToClass[] = {
24 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
25 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
26 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17
27 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F
28 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27
29 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F
30 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37
31 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F
32 OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47
33 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F
34 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57
35 ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F
36 OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67
37 ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F
38 ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77
39 ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F
40 OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87
41 OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F
42 UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97
43 OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F
44 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7
45 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF
46 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7
47 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF
48 ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7
49 ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF
50 ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7
51 ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF
52 ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7
53 ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF
54 ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7
55 ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF
56};
57
58/* 0 : illegal
59 1 : very unlikely
60 2 : normal
61 3 : very likely
62*/
63static const unsigned char Latin1ClassModel[] = {
64 /* UDF OTH ASC ASS ACV ACO ASV ASO */
65 /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,
66 /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3,
67 /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3,
68 /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3,
69 /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2,
70 /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3,
71 /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3,
72 /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3,
73};
74
75void nsLatin1Prober::Reset(void)
76{
77 mState = eDetecting;
78 mLastCharClass = OTH;
79 for (int i = 0; i < FREQ_CAT_NUM; i++) {
80 mFreqCounter[i] = 0;
81 }
82}
83
84nsProbingState nsLatin1Prober::HandleData(const char *aBuf, unsigned int aLen)
85{
86 char *newBuf1 = nullptr;
87 unsigned int newLen1 = 0;
88
89 if (!FilterWithEnglishLetters(aBuf, aLen, newBuf: &newBuf1, newLen&: newLen1)) {
90 newBuf1 = (char *)aBuf;
91 newLen1 = aLen;
92 }
93
94 for (unsigned int i = 0; i < newLen1; i++) {
95 const unsigned char charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
96 const unsigned char freq = Latin1ClassModel[mLastCharClass * CLASS_NUM + charClass];
97 if (freq == 0) {
98 mState = eNotMe;
99 break;
100 }
101 mFreqCounter[freq]++;
102 mLastCharClass = charClass;
103 }
104
105 if (newBuf1 != aBuf) {
106 free(ptr: newBuf1);
107 }
108
109 return mState;
110}
111
112float nsLatin1Prober::GetConfidence(void)
113{
114 if (mState == eNotMe) {
115 return 0.01f;
116 }
117
118 float confidence;
119 unsigned int total = 0;
120 for (int i = 0; i < FREQ_CAT_NUM; i++) {
121 total += mFreqCounter[i];
122 }
123
124 if (!total) {
125 confidence = 0.0f;
126 } else {
127 confidence = mFreqCounter[3] * 1.0f / total;
128 confidence -= mFreqCounter[1] * 20.0f / total;
129 }
130
131 if (confidence < 0.0f) {
132 confidence = 0.0f;
133 }
134
135 // lower the confidence of latin1 so that other more accurate detector
136 // can take priority.
137 confidence *= 0.50f;
138
139 return confidence;
140}
141
142#ifdef DEBUG_PROBE
143void nsLatin1Prober::DumpStatus()
144{
145 printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName());
146}
147#endif
148}
149

// Source code of kcodecs/src/probers/nsLatin1Prober.cpp