| 1 | /* -*- C++ -*- |
| 2 | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
| 3 | |
| 4 | SPDX-License-Identifier: MIT |
| 5 | */ |
| 6 | |
| 7 | #include "nsHebrewProber.h" |
| 8 | #include <stdio.h> |
| 9 | |
// File-local constants.
// These are typed constants rather than preprocessor macros so that the names
// obey normal scoping rules and carry a real type. They have internal linkage
// and keep the exact names and values the rest of this file relies on.
namespace
{
// windows-1255 / ISO-8859-8 code points of interest
const char FINAL_KAF = '\xea';
const char NORMAL_KAF = '\xeb';
const char FINAL_MEM = '\xed';
const char NORMAL_MEM = '\xee';
const char FINAL_NUN = '\xef';
const char NORMAL_NUN = '\xf0';
const char FINAL_PE = '\xf3';
const char NORMAL_PE = '\xf4';
const char FINAL_TSADI = '\xf5';
const char NORMAL_TSADI = '\xf6';

// Minimum Visual vs Logical final letter score difference.
// If the difference is below this, don't rely solely on the final letter score distance.
const int MIN_FINAL_CHAR_DISTANCE = 5;

// Minimum Visual vs Logical model score difference.
// If the difference is below this, don't rely at all on the model score distance.
// Kept as a double so comparisons against it behave exactly like comparisons
// against the literal 0.01.
const double MIN_MODEL_DISTANCE = 0.01;

// Charset names reported for the two possible Hebrew text layouts.
const char *const VISUAL_HEBREW_NAME = "ISO-8859-8";
const char *const LOGICAL_HEBREW_NAME = "windows-1255";
} // namespace
| 32 | |
| 33 | namespace kencodingprober |
| 34 | { |
| 35 | bool nsHebrewProber::isFinal(char c) |
| 36 | { |
| 37 | return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI)); |
| 38 | } |
| 39 | |
| 40 | bool nsHebrewProber::isNonFinal(char c) |
| 41 | { |
| 42 | return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE)); |
| 43 | // The normal Tsadi is not a good Non-Final letter due to words like |
| 44 | // 'lechotet' (to chat) containing an apostrophe after the tsadi. This |
| 45 | // apostrophe is converted to a space in FilterWithoutEnglishLetters causing |
| 46 | // the Non-Final tsadi to appear at an end of a word even though this is not |
| 47 | // the case in the original text. |
| 48 | // The letters Pe and Kaf rarely display a related behavior of not being a |
| 49 | // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for |
| 50 | // example legally end with a Non-Final Pe or Kaf. However, the benefit of |
| 51 | // these letters as Non-Final letters outweighs the damage since these words |
| 52 | // are quite rare. |
| 53 | } |
| 54 | |
| 55 | /** HandleData |
| 56 | * Final letter analysis for logical-visual decision. |
| 57 | * Look for evidence that the received buffer is either logical Hebrew or |
| 58 | * visual Hebrew. |
| 59 | * The following cases are checked: |
| 60 | * 1) A word longer than 1 letter, ending with a final letter. This is an |
| 61 | * indication that the text is laid out "naturally" since the final letter |
| 62 | * really appears at the end. +1 for logical score. |
| 63 | * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal |
| 64 | * Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with |
| 65 | * the Non-Final form of that letter. Exceptions to this rule are mentioned |
| 66 | * above in isNonFinal(). This is an indication that the text is laid out |
| 67 | * backwards. +1 for visual score |
| 68 | * 3) A word longer than 1 letter, starting with a final letter. Final letters |
| 69 | * should not appear at the beginning of a word. This is an indication that |
| 70 | * the text is laid out backwards. +1 for visual score. |
| 71 | * |
| 72 | * The visual score and logical score are accumulated throughout the text and |
| 73 | * are finally checked against each other in GetCharSetName(). |
| 74 | * No checking for final letters in the middle of words is done since that case |
| 75 | * is not an indication for either Logical or Visual text. |
| 76 | * |
| 77 | * The input buffer should not contain any white spaces that are not (' ') |
| 78 | * or any low-ascii punctuation marks. |
| 79 | */ |
| 80 | nsProbingState nsHebrewProber::HandleData(const char *aBuf, unsigned int aLen) |
| 81 | { |
| 82 | // Both model probers say it's not them. No reason to continue. |
| 83 | if (GetState() == eNotMe) { |
| 84 | return eNotMe; |
| 85 | } |
| 86 | |
| 87 | const char *curPtr; |
| 88 | const char *endPtr = aBuf + aLen; |
| 89 | |
| 90 | for (curPtr = (char *)aBuf; curPtr < endPtr; ++curPtr) { |
| 91 | char cur = *curPtr; |
| 92 | if (cur == ' ') { // We stand on a space - a word just ended |
| 93 | if (mBeforePrev != ' ') { // *(curPtr-2) was not a space so prev is not a 1 letter word |
| 94 | if (isFinal(c: mPrev)) { // case (1) [-2:not space][-1:final letter][cur:space] |
| 95 | ++mFinalCharLogicalScore; |
| 96 | } else if (isNonFinal(c: mPrev)) { // case (2) [-2:not space][-1:Non-Final letter][cur:space] |
| 97 | ++mFinalCharVisualScore; |
| 98 | } |
| 99 | } |
| 100 | } else { // Not standing on a space |
| 101 | if ((mBeforePrev == ' ') && (isFinal(c: mPrev)) && (cur != ' ')) { // case (3) [-2:space][-1:final letter][cur:not space] |
| 102 | ++mFinalCharVisualScore; |
| 103 | } |
| 104 | } |
| 105 | mBeforePrev = mPrev; |
| 106 | mPrev = cur; |
| 107 | } |
| 108 | |
| 109 | // Forever detecting, till the end or until both model probers return eNotMe (handled above). |
| 110 | return eDetecting; |
| 111 | } |
| 112 | |
| 113 | // Make the decision: is it Logical or Visual? |
| 114 | const char *nsHebrewProber::GetCharSetName() |
| 115 | { |
| 116 | // If the final letter score distance is dominant enough, rely on it. |
| 117 | int finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; |
| 118 | if (finalsub >= MIN_FINAL_CHAR_DISTANCE) { |
| 119 | return LOGICAL_HEBREW_NAME; |
| 120 | } |
| 121 | if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) { |
| 122 | return VISUAL_HEBREW_NAME; |
| 123 | } |
| 124 | |
| 125 | // It's not dominant enough, try to rely on the model scores instead. |
| 126 | float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); |
| 127 | if (modelsub > MIN_MODEL_DISTANCE) { |
| 128 | return LOGICAL_HEBREW_NAME; |
| 129 | } |
| 130 | if (modelsub < -(MIN_MODEL_DISTANCE)) { |
| 131 | return VISUAL_HEBREW_NAME; |
| 132 | } |
| 133 | |
| 134 | // Still no good, back to final letter distance, maybe it'll save the day. |
| 135 | if (finalsub < 0) { |
| 136 | return VISUAL_HEBREW_NAME; |
| 137 | } |
| 138 | |
| 139 | // (finalsub > 0 - Logical) or (don't know what to do) default to Logical. |
| 140 | return LOGICAL_HEBREW_NAME; |
| 141 | } |
| 142 | |
| 143 | void nsHebrewProber::Reset(void) |
| 144 | { |
| 145 | mFinalCharLogicalScore = 0; |
| 146 | mFinalCharVisualScore = 0; |
| 147 | |
| 148 | // mPrev and mBeforePrev are initialized to space in order to simulate a word |
| 149 | // delimiter at the beginning of the data |
| 150 | mPrev = ' '; |
| 151 | mBeforePrev = ' '; |
| 152 | } |
| 153 | |
| 154 | nsProbingState nsHebrewProber::GetState(void) |
| 155 | { |
| 156 | // Remain active as long as any of the model probers are active. |
| 157 | if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) { |
| 158 | return eNotMe; |
| 159 | } |
| 160 | return eDetecting; |
| 161 | } |
| 162 | |
#ifdef DEBUG_PROBE
// Debug aid, compiled only when DEBUG_PROBE is defined: prints the accumulated
// logical and visual final-letter scores to standard output.
void nsHebrewProber::DumpStatus()
{
    printf(" HEB: %d - %d [Logical-Visual score]\r\n",
           mFinalCharLogicalScore,
           mFinalCharVisualScore);
}
#endif
| 169 | } |
| 170 | |