1 | /* -*- C++ -*- |
2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
3 | |
4 | SPDX-License-Identifier: MIT |
5 | */ |
6 | |
7 | #include "nsHebrewProber.h" |
8 | #include <stdio.h> |
9 | |
// File-local tuning constants and code points. They are typed constexpr
// values rather than preprocessor macros so that they obey normal scoping
// and type checking; the names and values are what the rest of this file uses.
namespace
{
// windows-1255 / ISO-8859-8 code points of interest
constexpr char FINAL_KAF = '\xea';
constexpr char NORMAL_KAF = '\xeb';
constexpr char FINAL_MEM = '\xed';
constexpr char NORMAL_MEM = '\xee';
constexpr char FINAL_NUN = '\xef';
constexpr char NORMAL_NUN = '\xf0';
constexpr char FINAL_PE = '\xf3';
constexpr char NORMAL_PE = '\xf4';
constexpr char FINAL_TSADI = '\xf5';
// Kept for completeness of the table; isNonFinal() deliberately does not
// test for it, so it may be unreferenced.
[[maybe_unused]] constexpr char NORMAL_TSADI = '\xf6';

// Minimum Visual vs Logical final letter score difference.
// If the difference is below this, don't rely solely on the final letter score distance.
constexpr int MIN_FINAL_CHAR_DISTANCE = 5;

// Minimum Visual vs Logical model score difference.
// If the difference is below this, don't rely at all on the model score distance.
constexpr double MIN_MODEL_DISTANCE = 0.01;

// Charset names returned by GetCharSetName() for the two layouts.
constexpr const char *VISUAL_HEBREW_NAME = "ISO-8859-8";
constexpr const char *LOGICAL_HEBREW_NAME = "windows-1255";
} // namespace
32 | |
33 | namespace kencodingprober |
34 | { |
35 | bool nsHebrewProber::isFinal(char c) |
36 | { |
37 | return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI)); |
38 | } |
39 | |
40 | bool nsHebrewProber::isNonFinal(char c) |
41 | { |
42 | return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE)); |
43 | // The normal Tsadi is not a good Non-Final letter due to words like |
44 | // 'lechotet' (to chat) containing an apostrophe after the tsadi. This |
45 | // apostrophe is converted to a space in FilterWithoutEnglishLetters causing |
46 | // the Non-Final tsadi to appear at an end of a word even though this is not |
47 | // the case in the original text. |
48 | // The letters Pe and Kaf rarely display a related behavior of not being a |
49 | // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for |
50 | // example legally end with a Non-Final Pe or Kaf. However, the benefit of |
51 | // these letters as Non-Final letters outweighs the damage since these words |
52 | // are quite rare. |
53 | } |
54 | |
55 | /** HandleData |
56 | * Final letter analysis for logical-visual decision. |
57 | * Look for evidence that the received buffer is either logical Hebrew or |
58 | * visual Hebrew. |
59 | * The following cases are checked: |
60 | * 1) A word longer than 1 letter, ending with a final letter. This is an |
61 | * indication that the text is laid out "naturally" since the final letter |
62 | * really appears at the end. +1 for logical score. |
63 | * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal |
64 | * Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with |
65 | * the Non-Final form of that letter. Exceptions to this rule are mentioned |
66 | * above in isNonFinal(). This is an indication that the text is laid out |
67 | * backwards. +1 for visual score |
68 | * 3) A word longer than 1 letter, starting with a final letter. Final letters |
69 | * should not appear at the beginning of a word. This is an indication that |
70 | * the text is laid out backwards. +1 for visual score. |
71 | * |
72 | * The visual score and logical score are accumulated throughout the text and |
73 | * are finally checked against each other in GetCharSetName(). |
74 | * No checking for final letters in the middle of words is done since that case |
75 | * is not an indication for either Logical or Visual text. |
76 | * |
77 | * The input buffer should not contain any white spaces that are not (' ') |
78 | * or any low-ascii punctuation marks. |
79 | */ |
80 | nsProbingState nsHebrewProber::HandleData(const char *aBuf, unsigned int aLen) |
81 | { |
82 | // Both model probers say it's not them. No reason to continue. |
83 | if (GetState() == eNotMe) { |
84 | return eNotMe; |
85 | } |
86 | |
87 | const char *curPtr; |
88 | const char *endPtr = aBuf + aLen; |
89 | |
90 | for (curPtr = (char *)aBuf; curPtr < endPtr; ++curPtr) { |
91 | char cur = *curPtr; |
92 | if (cur == ' ') { // We stand on a space - a word just ended |
93 | if (mBeforePrev != ' ') { // *(curPtr-2) was not a space so prev is not a 1 letter word |
94 | if (isFinal(c: mPrev)) { // case (1) [-2:not space][-1:final letter][cur:space] |
95 | ++mFinalCharLogicalScore; |
96 | } else if (isNonFinal(c: mPrev)) { // case (2) [-2:not space][-1:Non-Final letter][cur:space] |
97 | ++mFinalCharVisualScore; |
98 | } |
99 | } |
100 | } else { // Not standing on a space |
101 | if ((mBeforePrev == ' ') && (isFinal(c: mPrev)) && (cur != ' ')) { // case (3) [-2:space][-1:final letter][cur:not space] |
102 | ++mFinalCharVisualScore; |
103 | } |
104 | } |
105 | mBeforePrev = mPrev; |
106 | mPrev = cur; |
107 | } |
108 | |
109 | // Forever detecting, till the end or until both model probers return eNotMe (handled above). |
110 | return eDetecting; |
111 | } |
112 | |
113 | // Make the decision: is it Logical or Visual? |
114 | const char *nsHebrewProber::GetCharSetName() |
115 | { |
116 | // If the final letter score distance is dominant enough, rely on it. |
117 | int finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; |
118 | if (finalsub >= MIN_FINAL_CHAR_DISTANCE) { |
119 | return LOGICAL_HEBREW_NAME; |
120 | } |
121 | if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) { |
122 | return VISUAL_HEBREW_NAME; |
123 | } |
124 | |
125 | // It's not dominant enough, try to rely on the model scores instead. |
126 | float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); |
127 | if (modelsub > MIN_MODEL_DISTANCE) { |
128 | return LOGICAL_HEBREW_NAME; |
129 | } |
130 | if (modelsub < -(MIN_MODEL_DISTANCE)) { |
131 | return VISUAL_HEBREW_NAME; |
132 | } |
133 | |
134 | // Still no good, back to final letter distance, maybe it'll save the day. |
135 | if (finalsub < 0) { |
136 | return VISUAL_HEBREW_NAME; |
137 | } |
138 | |
139 | // (finalsub > 0 - Logical) or (don't know what to do) default to Logical. |
140 | return LOGICAL_HEBREW_NAME; |
141 | } |
142 | |
143 | void nsHebrewProber::Reset(void) |
144 | { |
145 | mFinalCharLogicalScore = 0; |
146 | mFinalCharVisualScore = 0; |
147 | |
148 | // mPrev and mBeforePrev are initialized to space in order to simulate a word |
149 | // delimiter at the beginning of the data |
150 | mPrev = ' '; |
151 | mBeforePrev = ' '; |
152 | } |
153 | |
154 | nsProbingState nsHebrewProber::GetState(void) |
155 | { |
156 | // Remain active as long as any of the model probers are active. |
157 | if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) { |
158 | return eNotMe; |
159 | } |
160 | return eDetecting; |
161 | } |
162 | |
163 | #ifdef DEBUG_PROBE |
164 | void nsHebrewProber::DumpStatus() |
165 | { |
166 | printf(" HEB: %d - %d [Logical-Visual score]\r\n" , mFinalCharLogicalScore, mFinalCharVisualScore); |
167 | } |
168 | #endif |
169 | } |
170 | |