1/* -*- C++ -*-
2 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
3
4 SPDX-License-Identifier: MIT
5*/
6
7#include "nsHebrewProber.h"
8#include "nsSBCharSetProber.h"
9#include <stdio.h>
10
// Single-byte codes, as they occur in windows-1255 / ISO-8859-8 text, of the
// Hebrew letters Kaf, Mem, Nun, Pe and Tsadi. Each letter is listed as a pair:
// the form used at the end of a word (FINAL_*) and the regular form (NORMAL_*).
#define FINAL_KAF '\xea'
#define NORMAL_KAF '\xeb'
#define FINAL_MEM '\xed'
#define NORMAL_MEM '\xee'
#define FINAL_NUN '\xef'
#define NORMAL_NUN '\xf0'
#define FINAL_PE '\xf3'
#define NORMAL_PE '\xf4'
#define FINAL_TSADI '\xf5'
// Defined for completeness; isNonFinal() deliberately does not test for it.
#define NORMAL_TSADI '\xf6'

// Smallest lead one final letter score (logical or visual) must have over the
// other for GetCharSetName() to decide on the final letter evidence alone.
#define MIN_FINAL_CHAR_DISTANCE 5

// Smallest difference between the logical and the visual model confidence that
// GetCharSetName() treats as meaningful. Smaller differences are ignored.
#define MIN_MODEL_DISTANCE 0.01

// Charset names reported by GetCharSetName() for the two possible verdicts.
#define VISUAL_HEBREW_NAME "ISO-8859-8"
#define LOGICAL_HEBREW_NAME "windows-1255"
33
34namespace
35{
36bool isFinal(char c)
37{
38 return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI));
39}
40
41bool isNonFinal(char c)
42{
43 return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE));
44 // The normal Tsadi is not a good Non-Final letter due to words like
45 // 'lechotet' (to chat) containing an apostrophe after the tsadi. This
46 // apostrophe is converted to a space in FilterWithoutEnglishLetters causing
47 // the Non-Final tsadi to appear at an end of a word even though this is not
48 // the case in the original text.
49 // The letters Pe and Kaf rarely display a related behavior of not being a
50 // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
51 // example legally end with a Non-Final Pe or Kaf. However, the benefit of
52 // these letters as Non-Final letters outweighs the damage since these words
53 // are quite rare.
54}
55} // namespace <anonymous>
56
57namespace kencodingprober
58{
59nsHebrewProber::nsHebrewProber()
60 : mLogicalProb(new nsSingleByteCharSetProber<false>(&Win1255Model))
61 , mVisualProb(new nsSingleByteCharSetProber<true>(&Win1255Model))
62{
63}
64/** HandleData
65 * Final letter analysis for logical-visual decision.
66 * Look for evidence that the received buffer is either logical Hebrew or
67 * visual Hebrew.
68 * The following cases are checked:
69 * 1) A word longer than 1 letter, ending with a final letter. This is an
70 * indication that the text is laid out "naturally" since the final letter
71 * really appears at the end. +1 for logical score.
72 * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
73 * Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
74 * the Non-Final form of that letter. Exceptions to this rule are mentioned
75 * above in isNonFinal(). This is an indication that the text is laid out
76 * backwards. +1 for visual score
77 * 3) A word longer than 1 letter, starting with a final letter. Final letters
78 * should not appear at the beginning of a word. This is an indication that
79 * the text is laid out backwards. +1 for visual score.
80 *
81 * The visual score and logical score are accumulated throughout the text and
82 * are finally checked against each other in GetCharSetName().
83 * No checking for final letters in the middle of words is done since that case
84 * is not an indication for either Logical or Visual text.
85 *
86 * The input buffer should not contain any white spaces that are not (' ')
87 * or any low-ascii punctuation marks.
88 */
89nsProbingState nsHebrewProber::HandleData(const char *aBuf, unsigned int aLen)
90{
91 mLogicalProb->HandleData(aBuf, aLen);
92 mVisualProb->HandleData(aBuf, aLen);
93
94 // Both model probers say it's not them. No reason to continue.
95 if (GetState() == eNotMe) {
96 return eNotMe;
97 }
98
99 const char *curPtr;
100 const char *endPtr = aBuf + aLen;
101
102 for (curPtr = aBuf; curPtr < endPtr; ++curPtr) {
103 char cur = *curPtr;
104 if (cur == ' ') { // We stand on a space - a word just ended
105 if (mBeforePrev != ' ') { // *(curPtr-2) was not a space so prev is not a 1 letter word
106 if (isFinal(c: mPrev)) { // case (1) [-2:not space][-1:final letter][cur:space]
107 ++mFinalCharLogicalScore;
108 } else if (isNonFinal(c: mPrev)) { // case (2) [-2:not space][-1:Non-Final letter][cur:space]
109 ++mFinalCharVisualScore;
110 }
111 }
112 } else { // Not standing on a space
113 if ((mBeforePrev == ' ') && (isFinal(c: mPrev)) && (cur != ' ')) { // case (3) [-2:space][-1:final letter][cur:not space]
114 ++mFinalCharVisualScore;
115 }
116 }
117 mBeforePrev = mPrev;
118 mPrev = cur;
119 }
120
121 // Forever detecting, till the end or until both model probers return eNotMe (handled above).
122 return eDetecting;
123}
124
125float nsHebrewProber::GetConfidence()
126{
127 if (GetState() == eNotMe) {
128 return 0.01f;
129 }
130
131 int finalsub = mFinalCharLogicalScore - mFinalCharVisualScore;
132 auto logicalConfidence = mLogicalProb->GetConfidence();
133 auto visualConfidence = mVisualProb->GetConfidence();
134
135 if ((logicalConfidence - 0.1 > visualConfidence) && (finalsub >= 0)) {
136 return logicalConfidence;
137 } else if ((visualConfidence - 0.1 > logicalConfidence) && (finalsub <= 0)) {
138 return visualConfidence;
139 } else {
140 return 0.01f;
141 }
142}
143
144// Make the decision: is it Logical or Visual?
145const char *nsHebrewProber::GetCharSetName()
146{
147 // If the final letter score distance is dominant enough, rely on it.
148 int finalsub = mFinalCharLogicalScore - mFinalCharVisualScore;
149 if (finalsub >= MIN_FINAL_CHAR_DISTANCE) {
150 return LOGICAL_HEBREW_NAME;
151 }
152 if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) {
153 return VISUAL_HEBREW_NAME;
154 }
155
156 // It's not dominant enough, try to rely on the model scores instead.
157 float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence();
158 if (modelsub > MIN_MODEL_DISTANCE) {
159 return LOGICAL_HEBREW_NAME;
160 }
161 if (modelsub < -(MIN_MODEL_DISTANCE)) {
162 return VISUAL_HEBREW_NAME;
163 }
164
165 // Still no good, back to final letter distance, maybe it'll save the day.
166 if (finalsub < 0) {
167 return VISUAL_HEBREW_NAME;
168 }
169
170 // (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
171 return LOGICAL_HEBREW_NAME;
172}
173
174void nsHebrewProber::Reset(void)
175{
176 mFinalCharLogicalScore = 0;
177 mFinalCharVisualScore = 0;
178
179 // mPrev and mBeforePrev are initialized to space in order to simulate a word
180 // delimiter at the beginning of the data
181 mPrev = ' ';
182 mBeforePrev = ' ';
183}
184
185nsProbingState nsHebrewProber::GetState(void)
186{
187 // Remain active as long as any of the model probers are active.
188 if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) {
189 return eNotMe;
190 }
191 return eDetecting;
192}
193
#ifdef DEBUG_PROBE
// Debug aid, only compiled when DEBUG_PROBE is defined: prints the current
// confidence and both final letter scores, then lets each model prober dump
// its own status.
void nsHebrewProber::DumpStatus()
{
    const float confidence = GetConfidence();
    printf(" HEB: [%.3f] %d - %d [Logical-Visual score]:\r\n", confidence, mFinalCharLogicalScore, mFinalCharVisualScore);

    mLogicalProb->DumpStatus();
    mVisualProb->DumpStatus();
}
#endif
202}
203

source code of kcodecs/src/probers/nsHebrewProber.cpp