| 1 | /* -*- C++ -*- |
| 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
| 3 | |
| 4 | SPDX-License-Identifier: MIT |
| 5 | */ |
| 6 | |
| 7 | #include "nsHebrewProber.h" |
| 8 | #include "nsSBCharSetProber.h" |
| 9 | #include <stdio.h> |
| 10 | |
// Hebrew letter byte values in windows-1255 / ISO-8859-8.
// Only the five letters that have a distinct word-final form are listed.

// Word-final forms.
#define FINAL_KAF ('\xea')
#define FINAL_MEM ('\xed')
#define FINAL_NUN ('\xef')
#define FINAL_PE ('\xf3')
#define FINAL_TSADI ('\xf5')

// Regular (non-final) forms of the same letters.
#define NORMAL_KAF ('\xeb')
#define NORMAL_MEM ('\xee')
#define NORMAL_NUN ('\xf0')
#define NORMAL_PE ('\xf4')
#define NORMAL_TSADI ('\xf6')

// Smallest gap between the logical and visual final-letter scores that is
// considered decisive on its own. A smaller gap means the final-letter scores
// must not be the only basis for the decision.
#define MIN_FINAL_CHAR_DISTANCE (5)

// Smallest gap between the logical and visual model confidences that is taken
// into account. A smaller gap means the model confidences are ignored.
#define MIN_MODEL_DISTANCE (0.01)

// Charset names reported for each text layout.
#define LOGICAL_HEBREW_NAME ("windows-1255")
#define VISUAL_HEBREW_NAME ("ISO-8859-8")
| 33 | |
| 34 | namespace |
| 35 | { |
| 36 | bool isFinal(char c) |
| 37 | { |
| 38 | return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI)); |
| 39 | } |
| 40 | |
| 41 | bool isNonFinal(char c) |
| 42 | { |
| 43 | return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE)); |
| 44 | // The normal Tsadi is not a good Non-Final letter due to words like |
| 45 | // 'lechotet' (to chat) containing an apostrophe after the tsadi. This |
| 46 | // apostrophe is converted to a space in FilterWithoutEnglishLetters causing |
| 47 | // the Non-Final tsadi to appear at an end of a word even though this is not |
| 48 | // the case in the original text. |
| 49 | // The letters Pe and Kaf rarely display a related behavior of not being a |
| 50 | // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for |
| 51 | // example legally end with a Non-Final Pe or Kaf. However, the benefit of |
| 52 | // these letters as Non-Final letters outweighs the damage since these words |
| 53 | // are quite rare. |
| 54 | } |
| 55 | } // namespace <anonymous> |
| 56 | |
| 57 | namespace kencodingprober |
| 58 | { |
| 59 | nsHebrewProber::nsHebrewProber() |
| 60 | : mLogicalProb(new nsSingleByteCharSetProber<false>(&Win1255Model)) |
| 61 | , mVisualProb(new nsSingleByteCharSetProber<true>(&Win1255Model)) |
| 62 | { |
| 63 | } |
| 64 | /** HandleData |
| 65 | * Final letter analysis for logical-visual decision. |
| 66 | * Look for evidence that the received buffer is either logical Hebrew or |
| 67 | * visual Hebrew. |
| 68 | * The following cases are checked: |
| 69 | * 1) A word longer than 1 letter, ending with a final letter. This is an |
| 70 | * indication that the text is laid out "naturally" since the final letter |
| 71 | * really appears at the end. +1 for logical score. |
| 72 | * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal |
| 73 | * Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with |
| 74 | * the Non-Final form of that letter. Exceptions to this rule are mentioned |
| 75 | * above in isNonFinal(). This is an indication that the text is laid out |
| 76 | * backwards. +1 for visual score |
| 77 | * 3) A word longer than 1 letter, starting with a final letter. Final letters |
| 78 | * should not appear at the beginning of a word. This is an indication that |
| 79 | * the text is laid out backwards. +1 for visual score. |
| 80 | * |
| 81 | * The visual score and logical score are accumulated throughout the text and |
| 82 | * are finally checked against each other in GetCharSetName(). |
| 83 | * No checking for final letters in the middle of words is done since that case |
| 84 | * is not an indication for either Logical or Visual text. |
| 85 | * |
| 86 | * The input buffer should not contain any white spaces that are not (' ') |
| 87 | * or any low-ascii punctuation marks. |
| 88 | */ |
| 89 | nsProbingState nsHebrewProber::HandleData(const char *aBuf, unsigned int aLen) |
| 90 | { |
| 91 | mLogicalProb->HandleData(aBuf, aLen); |
| 92 | mVisualProb->HandleData(aBuf, aLen); |
| 93 | |
| 94 | // Both model probers say it's not them. No reason to continue. |
| 95 | if (GetState() == eNotMe) { |
| 96 | return eNotMe; |
| 97 | } |
| 98 | |
| 99 | const char *curPtr; |
| 100 | const char *endPtr = aBuf + aLen; |
| 101 | |
| 102 | for (curPtr = aBuf; curPtr < endPtr; ++curPtr) { |
| 103 | char cur = *curPtr; |
| 104 | if (cur == ' ') { // We stand on a space - a word just ended |
| 105 | if (mBeforePrev != ' ') { // *(curPtr-2) was not a space so prev is not a 1 letter word |
| 106 | if (isFinal(c: mPrev)) { // case (1) [-2:not space][-1:final letter][cur:space] |
| 107 | ++mFinalCharLogicalScore; |
| 108 | } else if (isNonFinal(c: mPrev)) { // case (2) [-2:not space][-1:Non-Final letter][cur:space] |
| 109 | ++mFinalCharVisualScore; |
| 110 | } |
| 111 | } |
| 112 | } else { // Not standing on a space |
| 113 | if ((mBeforePrev == ' ') && (isFinal(c: mPrev)) && (cur != ' ')) { // case (3) [-2:space][-1:final letter][cur:not space] |
| 114 | ++mFinalCharVisualScore; |
| 115 | } |
| 116 | } |
| 117 | mBeforePrev = mPrev; |
| 118 | mPrev = cur; |
| 119 | } |
| 120 | |
| 121 | // Forever detecting, till the end or until both model probers return eNotMe (handled above). |
| 122 | return eDetecting; |
| 123 | } |
| 124 | |
| 125 | float nsHebrewProber::GetConfidence() |
| 126 | { |
| 127 | if (GetState() == eNotMe) { |
| 128 | return 0.01f; |
| 129 | } |
| 130 | |
| 131 | int finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; |
| 132 | auto logicalConfidence = mLogicalProb->GetConfidence(); |
| 133 | auto visualConfidence = mVisualProb->GetConfidence(); |
| 134 | |
| 135 | if ((logicalConfidence - 0.1 > visualConfidence) && (finalsub >= 0)) { |
| 136 | return logicalConfidence; |
| 137 | } else if ((visualConfidence - 0.1 > logicalConfidence) && (finalsub <= 0)) { |
| 138 | return visualConfidence; |
| 139 | } else { |
| 140 | return 0.01f; |
| 141 | } |
| 142 | } |
| 143 | |
| 144 | // Make the decision: is it Logical or Visual? |
| 145 | const char *nsHebrewProber::GetCharSetName() |
| 146 | { |
| 147 | // If the final letter score distance is dominant enough, rely on it. |
| 148 | int finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; |
| 149 | if (finalsub >= MIN_FINAL_CHAR_DISTANCE) { |
| 150 | return LOGICAL_HEBREW_NAME; |
| 151 | } |
| 152 | if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) { |
| 153 | return VISUAL_HEBREW_NAME; |
| 154 | } |
| 155 | |
| 156 | // It's not dominant enough, try to rely on the model scores instead. |
| 157 | float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); |
| 158 | if (modelsub > MIN_MODEL_DISTANCE) { |
| 159 | return LOGICAL_HEBREW_NAME; |
| 160 | } |
| 161 | if (modelsub < -(MIN_MODEL_DISTANCE)) { |
| 162 | return VISUAL_HEBREW_NAME; |
| 163 | } |
| 164 | |
| 165 | // Still no good, back to final letter distance, maybe it'll save the day. |
| 166 | if (finalsub < 0) { |
| 167 | return VISUAL_HEBREW_NAME; |
| 168 | } |
| 169 | |
| 170 | // (finalsub > 0 - Logical) or (don't know what to do) default to Logical. |
| 171 | return LOGICAL_HEBREW_NAME; |
| 172 | } |
| 173 | |
| 174 | void nsHebrewProber::Reset(void) |
| 175 | { |
| 176 | mFinalCharLogicalScore = 0; |
| 177 | mFinalCharVisualScore = 0; |
| 178 | |
| 179 | // mPrev and mBeforePrev are initialized to space in order to simulate a word |
| 180 | // delimiter at the beginning of the data |
| 181 | mPrev = ' '; |
| 182 | mBeforePrev = ' '; |
| 183 | } |
| 184 | |
| 185 | nsProbingState nsHebrewProber::GetState(void) |
| 186 | { |
| 187 | // Remain active as long as any of the model probers are active. |
| 188 | if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) { |
| 189 | return eNotMe; |
| 190 | } |
| 191 | return eDetecting; |
| 192 | } |
| 193 | |
#ifdef DEBUG_PROBE
/**
 * Debug helper: prints this prober's confidence and both final-letter scores,
 * then lets each model prober print its own status.
 */
void nsHebrewProber::DumpStatus()
{
    const float confidence = GetConfidence();
    printf(" HEB: [%.3f] %d - %d [Logical-Visual score]:\r\n",
           confidence,
           mFinalCharLogicalScore,
           mFinalCharVisualScore);

    mLogicalProb->DumpStatus();
    mVisualProb->DumpStatus();
}
#endif
| 202 | } |
| 203 | |