LCOV - code coverage report
Current view: directory - extensions/universalchardet/src/base - nsUniversalDetector.cpp (source / functions) Found Hit Coverage
Test: app.info Lines: 118 0 0.0 %
Date: 2012-06-02 Functions: 6 0 0.0 %

       1                 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2                 : /* ***** BEGIN LICENSE BLOCK *****
       3                 :  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
       4                 :  *
       5                 :  * The contents of this file are subject to the Mozilla Public License Version
       6                 :  * 1.1 (the "License"); you may not use this file except in compliance with
       7                 :  * the License. You may obtain a copy of the License at
       8                 :  * http://www.mozilla.org/MPL/
       9                 :  *
      10                 :  * Software distributed under the License is distributed on an "AS IS" basis,
      11                 :  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
      12                 :  * for the specific language governing rights and limitations under the
      13                 :  * License.
      14                 :  *
      15                 :  * The Original Code is Mozilla Universal charset detector code.
      16                 :  *
      17                 :  * The Initial Developer of the Original Code is
      18                 :  * Netscape Communications Corporation.
      19                 :  * Portions created by the Initial Developer are Copyright (C) 2001
      20                 :  * the Initial Developer. All Rights Reserved.
      21                 :  *
      22                 :  * Contributor(s):
      23                 :  *          Shy Shalom <shooshX@gmail.com>
      24                 :  *
      25                 :  * Alternatively, the contents of this file may be used under the terms of
      26                 :  * either the GNU General Public License Version 2 or later (the "GPL"), or
      27                 :  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
      28                 :  * in which case the provisions of the GPL or the LGPL are applicable instead
      29                 :  * of those above. If you wish to allow use of your version of this file only
      30                 :  * under the terms of either the GPL or the LGPL, and not to allow others to
      31                 :  * use your version of this file under the terms of the MPL, indicate your
      32                 :  * decision by deleting the provisions above and replace them with the notice
      33                 :  * and other provisions required by the GPL or the LGPL. If you do not delete
      34                 :  * the provisions above, a recipient may use your version of this file under
      35                 :  * the terms of any one of the MPL, the GPL or the LGPL.
      36                 :  *
      37                 :  * ***** END LICENSE BLOCK ***** */
      38                 : 
      39                 : #include "nscore.h"
      40                 : 
      41                 : #include "nsUniversalDetector.h"
      42                 : 
      43                 : #include "nsMBCSGroupProber.h"
      44                 : #include "nsSBCSGroupProber.h"
      45                 : #include "nsEscCharsetProber.h"
      46                 : #include "nsLatin1Prober.h"
      47                 : 
      48               0 : nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)  // Puts the detector in its initial state; no prober is allocated here.
      49                 : {
      50               0 :   mDone = false;  // HandleData() returns immediately once this becomes true
      51               0 :   mBestGuess = -1;   // illegal value as signal (not read by the code shown in this file)
      52               0 :   mInTag = false;  // not read by the code shown in this file
      53               0 :   mEscCharSetProber = nsnull;  // created in HandleData() when the input state is eEscAscii
      54                 : 
      55               0 :   mStart = true;  // makes the next HandleData() call run the BOM check
      56               0 :   mDetectedCharset = nsnull;
      57               0 :   mGotData = false;  // set once HandleData() receives a non-empty buffer
      58               0 :   mInputState = ePureAscii;
      59               0 :   mLastChar = '\0';
      60               0 :   mLanguageFilter = aLanguageFilter;  // passed to the MBCS and escape probers; also decides whether the SBCS prober is created
      61                 : 
      62                 :   PRUint32 i;
      63               0 :   for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
      64               0 :     mCharSetProbers[i] = nsnull;  // slots are filled in HandleData() when the first high byte is seen
      65               0 : }
      66                 : 
      67               0 : nsUniversalDetector::~nsUniversalDetector()  // Frees every prober object this detector allocated.
      68                 : {
      69               0 :   for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
      70               0 :     delete mCharSetProbers[i];  // slots never filled still hold nsnull from the constructor (presumably a null pointer, for which delete is a no-op)
      71                 : 
      72               0 :   delete mEscCharSetProber;  // nsnull if it was never created or if HandleData() already deleted it
      73               0 : }
      74                 : 
      75                 : void 
      76               0 : nsUniversalDetector::Reset()  // Restores the constructor's state flags and resets, rather than frees, the existing probers.
      77                 : {
      78               0 :   mDone = false;
      79               0 :   mBestGuess = -1;   // illegal value as signal
      80               0 :   mInTag = false;
      81                 : 
      82               0 :   mStart = true;  // re-arms the BOM check in HandleData()
      83               0 :   mDetectedCharset = nsnull;
      84               0 :   mGotData = false;
      85               0 :   mInputState = ePureAscii;
      86               0 :   mLastChar = '\0';  // mLanguageFilter is deliberately left as the constructor set it
      87                 : 
      88               0 :   if (mEscCharSetProber)
      89               0 :     mEscCharSetProber->Reset();  // the prober object is kept and reused
      90                 : 
      91                 :   PRUint32 i;
      92               0 :   for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
      93               0 :     if (mCharSetProbers[i])  // skip slots that were never allocated
      94               0 :       mCharSetProbers[i]->Reset();
      95               0 : }
      96                 : 
      97                 : //---------------------------------------------------------------------
      98                 : #define SHORTCUT_THRESHOLD      (float)0.95  // not referenced by the code shown in this file
      99                 : #define MINIMUM_THRESHOLD      (float)0.20  // DataEnd() reports a prober's charset only if the best confidence is above this
     100                 : 
     101               0 : nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)  // Feeds one buffer to the detector; returns NS_OK, or NS_ERROR_OUT_OF_MEMORY when a prober allocation yields nsnull.
     102                 : {
     103               0 :   if(mDone) 
     104               0 :     return NS_OK;  // a charset has already been decided; further data is ignored
     105                 : 
     106               0 :   if (aLen > 0)
     107               0 :     mGotData = true;  // DataEnd() does nothing unless this was set
     108                 : 
     109                 :   //If the data starts with BOM, we know it is UTF
     110               0 :   if (mStart)
     111                 :   {
     112               0 :     mStart = false;  // cleared on the first call whatever aLen is, so the BOM check runs at most once per Reset()
     113               0 :     if (aLen > 2)  // NOTE(review): also gates the 2-byte UTF-16 checks, so a first buffer of exactly two bytes is never checked
     114               0 :       switch (aBuf[0])
     115                 :         {
     116                 :         case '\xEF':
     117               0 :           if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
     118                 :             // EF BB BF  UTF-8 encoded BOM
     119               0 :             mDetectedCharset = "UTF-8";
     120               0 :         break;
     121                 :         case '\xFE':
     122               0 :           if ('\xFF' == aBuf[1])
     123                 :             // FE FF  UTF-16, big endian BOM
     124               0 :             mDetectedCharset = "UTF-16";
     125               0 :         break;
     126                 :         case '\xFF':
     127               0 :           if ('\xFE' == aBuf[1])
     128                 :             // FF FE  UTF-16, little endian BOM
     129               0 :             mDetectedCharset = "UTF-16";
     130               0 :         break;
     131                 :       }  // switch
     132                 : 
     133               0 :       if (mDetectedCharset)  // despite the indentation this is not governed by the aLen check; only the switch is
     134                 :       {
     135               0 :         mDone = true;
     136               0 :         return NS_OK;
     137                 :       }
     138                 :   }
     139                 :   
     140                 :   PRUint32 i;
     141               0 :   for (i = 0; i < aLen; i++)  // scan the buffer to update mInputState and create probers on demand
     142                 :   {
     143                 :     //other than 0xa0, if every other character is ascii, the page is ascii
     144               0 :     if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')  //Since many Ascii only page contains NBSP 
     145                 :     {
     146                 :       //we got a non-ascii byte (high-byte)
     147               0 :       if (mInputState != eHighbyte)
     148                 :       {
     149                 :         //adjust state
     150               0 :         mInputState = eHighbyte;  // nothing in this function moves the state away from eHighbyte again
     151                 : 
     152                 :         //kill mEscCharSetProber if it is active
     153               0 :         if (mEscCharSetProber) {
     154               0 :           delete mEscCharSetProber;
     155               0 :           mEscCharSetProber = nsnull;
     156                 :         }
     157                 : 
     158                 :         //start multibyte and singlebyte charset prober
     159               0 :         if (nsnull == mCharSetProbers[0])  // probers that survived a Reset() are reused rather than re-created
     160                 :         {
     161               0 :           mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
     162               0 :           if (nsnull == mCharSetProbers[0])
     163               0 :             return NS_ERROR_OUT_OF_MEMORY;
     164                 :         }
     165               0 :         if (nsnull == mCharSetProbers[1] &&
     166                 :             (mLanguageFilter & NS_FILTER_NON_CJK))  // the single-byte group prober is created only when the filter has this bit set
     167                 :         {
     168               0 :           mCharSetProbers[1] = new nsSBCSGroupProber;
     169               0 :           if (nsnull == mCharSetProbers[1])
     170               0 :             return NS_ERROR_OUT_OF_MEMORY;
     171                 :         }
     172               0 :         if (nsnull == mCharSetProbers[2])
     173                 :         {
     174               0 :           mCharSetProbers[2] = new nsLatin1Prober; 
     175               0 :           if (nsnull == mCharSetProbers[2])
     176               0 :             return NS_ERROR_OUT_OF_MEMORY;
     177                 :         }
     178                 :       }
     179                 :     }
     180                 :     else
     181                 :     {
     182                 :       //ok, just pure ascii so far
     183               0 :       if ( ePureAscii == mInputState &&
     184               0 :         (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )  // mLastChar lets "~{" match even when split across two buffers
     185                 :       {
     186                 :         //found escape character or HZ "~{"
     187               0 :         mInputState = eEscAscii;
     188                 :       }
     189               0 :       mLastChar = aBuf[i];  // updated only in this branch, i.e. for bytes without the high bit and for 0xA0
     190                 :     }
     191                 :   }
     192                 : 
     193                 :   nsProbingState st;
     194               0 :   switch (mInputState)  // feed the buffer to the prober(s) matching the state reached by the scan above
     195                 :   {
     196                 :   case eEscAscii:
     197               0 :     if (nsnull == mEscCharSetProber) {
     198               0 :       mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
     199               0 :       if (nsnull == mEscCharSetProber)
     200               0 :         return NS_ERROR_OUT_OF_MEMORY;
     201                 :     }
     202               0 :     st = mEscCharSetProber->HandleData(aBuf, aLen);  // the prober receives the whole buffer, not only the part after the escape
     203               0 :     if (st == eFoundIt)
     204                 :     {
     205               0 :       mDone = true;
     206               0 :       mDetectedCharset = mEscCharSetProber->GetCharSetName();
     207                 :     }
     208               0 :     break;
     209                 :   case eHighbyte:
     210               0 :     for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
     211                 :     {
     212               0 :       if (mCharSetProbers[i])
     213                 :       {
     214               0 :         st = mCharSetProbers[i]->HandleData(aBuf, aLen);
     215               0 :         if (st == eFoundIt) 
     216                 :         {
     217               0 :           mDone = true;
     218               0 :           mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
     219               0 :           return NS_OK;  // the first prober to report eFoundIt wins; later slots do not see this buffer
     220                 :         }
     221                 :       } 
     222                 :     }
     223               0 :     break;
     224                 : 
     225                 :   default:  //pure ascii
     226                 :     ;//do nothing here
     227                 :   }
     228               0 :   return NS_OK;
     229                 : }
     230                 : 
     231                 : 
     232                 : //---------------------------------------------------------------------
     233               0 : void nsUniversalDetector::DataEnd()  // Finishes detection: passes the decided charset, or else the most confident prober's charset, to Report().
     234                 : {
     235               0 :   if (!mGotData)
     236                 :   {
     237                 :     // we haven't got any data yet, return immediately 
     238                 :     // the calling program sometimes calls DataEnd() before anything has been sent to the detector
     239               0 :     return;
     240                 :   }
     241                 : 
     242               0 :   if (mDetectedCharset)  // set in HandleData() by the BOM check or by a prober that returned eFoundIt
     243                 :   {
     244               0 :     mDone = true;
     245               0 :     Report(mDetectedCharset);
     246               0 :     return;
     247                 :   }
     248                 :   
     249               0 :   switch (mInputState)
     250                 :   {
     251                 :   case eHighbyte:
     252                 :     {
     253                 :       float proberConfidence;
     254               0 :       float maxProberConfidence = (float)0.0;
     255               0 :       PRInt32 maxProber = 0;  // index of the most confident prober found so far
     256                 : 
     257               0 :       for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
     258                 :       {
     259               0 :         if (mCharSetProbers[i])
     260                 :         {
     261               0 :           proberConfidence = mCharSetProbers[i]->GetConfidence();
     262               0 :           if (proberConfidence > maxProberConfidence)  // strict comparison: on a tie the lower-indexed prober is kept
     263                 :           {
     264               0 :             maxProberConfidence = proberConfidence;
     265               0 :             maxProber = i;
     266                 :           }
     267                 :         }
     268                 :       }
     269                 :       //do not report anything because we are not confident of it, that's in fact a negative answer
     270               0 :       if (maxProberConfidence > MINIMUM_THRESHOLD)  // only an existing prober can raise the maximum above 0, so maxProber indexes a non-null slot here
     271               0 :         Report(mCharSetProbers[maxProber]->GetCharSetName());
     272                 :     }
     273               0 :     break;
     274                 :   case eEscAscii:  // nothing to report: an escape-prober match would have set mDetectedCharset and been handled above
     275               0 :     break;
     276                 :   default:  // remaining states (e.g. pure ascii): nothing is reported
     277                 :     ;
     278                 :   }
     279               0 :   return;
     280                 : }

Generated by: LCOV version 1.7