LCOV - code coverage report
Current view: directory - extensions/universalchardet/src/base - nsHebrewProber.cpp (source / functions) Found Hit Coverage
Test: app.info Lines: 45 0 0.0 %
Date: 2012-06-02 Functions: 6 0 0.0 %

       1                 : /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2                 : /* ***** BEGIN LICENSE BLOCK *****
       3                 :  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
       4                 :  *
       5                 :  * The contents of this file are subject to the Mozilla Public License Version
       6                 :  * 1.1 (the "License"); you may not use this file except in compliance with
       7                 :  * the License. You may obtain a copy of the License at
       8                 :  * http://www.mozilla.org/MPL/
       9                 :  *
      10                 :  * Software distributed under the License is distributed on an "AS IS" basis,
      11                 :  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
      12                 :  * for the specific language governing rights and limitations under the
      13                 :  * License.
      14                 :  *
      15                 :  * The Original Code is Mozilla Universal charset detector code.
      16                 :  *
      17                 :  * The Initial Developer of the Original Code is
      18                 :  *          Shy Shalom <shooshX@gmail.com>
      19                 :  * Portions created by the Initial Developer are Copyright (C) 2005
      20                 :  * the Initial Developer. All Rights Reserved.
      21                 :  *
      22                 :  * Contributor(s):
      23                 :  *
      24                 :  * Alternatively, the contents of this file may be used under the terms of
      25                 :  * either the GNU General Public License Version 2 or later (the "GPL"), or
      26                 :  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
      27                 :  * in which case the provisions of the GPL or the LGPL are applicable instead
      28                 :  * of those above. If you wish to allow use of your version of this file only
      29                 :  * under the terms of either the GPL or the LGPL, and not to allow others to
      30                 :  * use your version of this file under the terms of the MPL, indicate your
      31                 :  * decision by deleting the provisions above and replace them with the notice
      32                 :  * and other provisions required by the GPL or the LGPL. If you do not delete
      33                 :  * the provisions above, a recipient may use your version of this file under
      34                 :  * the terms of any one of the MPL, the GPL or the LGPL.
      35                 :  *
      36                 :  * ***** END LICENSE BLOCK ***** */
      37                 : 
      38                 : #include "nsHebrewProber.h"
      39                 : #include <stdio.h>
      40                 : 
      41                 : // windows-1255 / ISO-8859-8 code points of interest: final and normal (non-final) letter forms
      42                 : #define FINAL_KAF ('\xea')
      43                 : #define NORMAL_KAF ('\xeb')
      44                 : #define FINAL_MEM ('\xed')
      45                 : #define NORMAL_MEM ('\xee')
      46                 : #define FINAL_NUN ('\xef')
      47                 : #define NORMAL_NUN ('\xf0')
      48                 : #define FINAL_PE ('\xf3')
      49                 : #define NORMAL_PE ('\xf4')
      50                 : #define FINAL_TSADI ('\xf5')
      51                 : #define NORMAL_TSADI ('\xf6')
      52                 : 
      53                 : // Minimum difference between the Logical and Visual final letter scores.
      54                 : // GetCharSetName() decides on the final letter scores alone only if the difference reaches this value.
      55                 : #define MIN_FINAL_CHAR_DISTANCE (5)
      56                 : 
      57                 : // Minimum difference between the Logical and Visual model confidences.
      58                 : // GetCharSetName() ignores the model confidences unless the difference exceeds this value.
      59                 : #define MIN_MODEL_DISTANCE (0.01)
      60                 : 
      61                 : #define VISUAL_HEBREW_NAME ("ISO-8859-8") // name GetCharSetName() returns for visual Hebrew
      62                 : #define LOGICAL_HEBREW_NAME ("windows-1255") // name GetCharSetName() returns for logical Hebrew
      63                 : 
      64               0 : bool nsHebrewProber::isFinal(char c) // true iff c equals one of the five FINAL_* code points
      65                 : {
      66               0 :   return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI));
      67                 : }
      68                 : 
      69               0 : bool nsHebrewProber::isNonFinal(char c) // true iff c is NORMAL_KAF, NORMAL_MEM, NORMAL_NUN or NORMAL_PE
      70                 : {
      71               0 :   return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE));
      72                 :   // NORMAL_TSADI is deliberately left out of the test above: the normal Tsadi
      73                 :   // is not a good Non-Final letter due to words like 'lechotet' (to chat)
      74                 :   // containing an apostrophe after the tsadi. This apostrophe is converted to
      75                 :   // a space in FilterWithoutEnglishLetters, causing the Non-Final tsadi to
      76                 :   // appear at the end of a word even though this is not the case in the original text.
      77                 :   // The letters Pe and Kaf occasionally show a related weakness as
      78                 :   // Non-Final letters. Words like 'Pop', 'Winamp' and 'Mubarak' for
      79                 :   // example legally end with a Non-Final Pe or Kaf. However, the benefit of
      80                 :   // these letters as Non-Final letters outweighs the damage since these words
      81                 :   // are quite rare.
      82                 : }
      83                 : 
      84                 : /** HandleData
      85                 :  * Final letter analysis for the logical-visual decision.
      86                 :  * Looks for evidence that the received buffer is either logical Hebrew or
      87                 :  * visual Hebrew.
      88                 :  * The following cases are checked:
      89                 :  * 1) A word longer than 1 letter, ending with a final letter. This is an
      90                 :  *    indication that the text is laid out "naturally" since the final letter
      91                 :  *    really appears at the end. +1 for logical score.
      92                 :  * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
      93                 :  *    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
      94                 :  *    the Non-Final form of that letter. Exceptions to this rule are mentioned
      95                 :  *    above in isNonFinal(). This is an indication that the text is laid out
      96                 :  *    backwards. +1 for visual score.
      97                 :  * 3) A word longer than 1 letter, starting with a final letter. Final letters
      98                 :  *    should not appear at the beginning of a word. This is an indication that
      99                 :  *    the text is laid out backwards. +1 for visual score.
     100                 :  *
     101                 :  * The visual score and logical score are accumulated throughout the text and
     102                 :  * are finally checked against each other in GetCharSetName().
     103                 :  * No checking for final letters in the middle of words is done since that case
     104                 :  * is not an indication for either Logical or Visual text.
     105                 :  *
     106                 :  * The input buffer should not contain any white space other than (' '),
     107                 :  * nor any low-ascii punctuation marks.
     108                 :  * Returns eNotMe when GetState() reports eNotMe, otherwise eDetecting. */
     109               0 : nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen)
     110                 : {
     111                 :   // Both model probers say it's not them. No reason to continue.
     112               0 :   if (GetState() == eNotMe)
     113               0 :     return eNotMe;
     114                 : 
     115               0 :   const char *curPtr, *endPtr = aBuf+aLen; // endPtr: one past the last byte to scan
     116                 :   char cur;
     117                 : 
     118               0 :   for (curPtr = (char*)aBuf; curPtr < endPtr; ++curPtr) // NOTE(review): the (char*) cast is redundant, curPtr is a const char*
     119                 :   {
     120               0 :     cur = *curPtr;
     121               0 :     if (cur == ' ') // We stand on a space - a word just ended
     122                 :     {
     123               0 :       if (mBeforePrev != ' ') // the character two positions back was not a space, so prev is not a 1 letter word
     124                 :       {
     125               0 :         if (isFinal(mPrev)) // case (1) [-2:not space][-1:final letter][cur:space]
     126               0 :           ++mFinalCharLogicalScore;
     127               0 :         else if (isNonFinal(mPrev)) // case (2) [-2:not space][-1:Non-Final letter][cur:space]
     128               0 :           ++mFinalCharVisualScore;
     129                 :       }
     130                 :     }
     131                 :     else  // Not standing on a space
     132                 :     {
     133               0 :       if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) // case (3) [-2:space][-1:final letter][cur:not space]; (cur != ' ') always holds in this else branch
     134               0 :         ++mFinalCharVisualScore;
     135                 :     }
     136               0 :     mBeforePrev = mPrev; // shift the two-character history; it is not local to this function, so it carries over to the next call
     137               0 :     mPrev = cur;
     138                 :   }
     139                 : 
     140                 :   // Forever detecting, till the end or until both model probers return eNotMe (handled above).
     141               0 :   return eDetecting;
     142                 : }
     143                 : 
     144                 : // Make the decision: is it Logical or Visual? Returns LOGICAL_HEBREW_NAME or VISUAL_HEBREW_NAME.
     145               0 : const char* nsHebrewProber::GetCharSetName()
     146                 : {
     147                 :   // Stage 1: if the final letter score distance is dominant enough, rely on it.
     148               0 :   PRInt32 finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; // positive favours Logical, negative favours Visual
     149               0 :   if (finalsub >= MIN_FINAL_CHAR_DISTANCE) 
     150               0 :     return LOGICAL_HEBREW_NAME;
     151               0 :   if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE))
     152               0 :     return VISUAL_HEBREW_NAME;
     153                 : 
     154                 :   // Stage 2: it's not dominant enough, try to rely on the model confidences instead.
     155               0 :   float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); // positive favours Logical, negative favours Visual
     156               0 :   if (modelsub > MIN_MODEL_DISTANCE)
     157               0 :     return LOGICAL_HEBREW_NAME;
     158               0 :   if (modelsub < -(MIN_MODEL_DISTANCE))
     159               0 :     return VISUAL_HEBREW_NAME;
     160                 : 
     161                 :   // Stage 3: still no good, back to the final letter distance; any negative value now selects Visual.
     162               0 :   if (finalsub < 0) 
     163               0 :     return VISUAL_HEBREW_NAME;
     164                 : 
     165                 :   // Either finalsub > 0 (Logical) or finalsub == 0 (don't know what to do): default to Logical.
     166               0 :   return LOGICAL_HEBREW_NAME;
     167                 : }
     168                 : 
     169                 : 
     170               0 : void nsHebrewProber::Reset(void) // zeroes both final letter scores and resets the two-character history
     171                 : {
     172               0 :   mFinalCharLogicalScore = 0;
     173               0 :   mFinalCharVisualScore = 0;
     174                 : 
     175                 :   // mPrev and mBeforePrev are set to space in order to simulate a word
     176                 :   // delimiter at the beginning of the data
     177               0 :   mPrev = ' ';
     178               0 :   mBeforePrev = ' ';
     179               0 : }
     180                 : 
     181               0 : nsProbingState nsHebrewProber::GetState(void) 
     182                 : {
     183                 :   // Report eNotMe only when both model probers report eNotMe; otherwise remain active (eDetecting).
     184               0 :   if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe))
     185               0 :     return eNotMe;
     186               0 :   return eDetecting;
     187                 : }
     188                 : 
     189                 : #ifdef DEBUG_chardet
     190                 : void  nsHebrewProber::DumpStatus() // compiled only when DEBUG_chardet is defined
     191                 : {
     192                 :   printf("  HEB: %d - %d [Logical-Visual score]\r\n", mFinalCharLogicalScore, mFinalCharVisualScore); // prints the logical, then the visual, final letter score
     193                 : }
     194                 : #endif

Generated by: LCOV version 1.7