LCOV - code coverage report
Current view: directory - intl/uconv/ucvcn - nsGBKToUnicode.cpp (source / functions) Found Hit Coverage
Test: app.info Lines: 144 118 81.9 %
Date: 2012-06-02 Functions: 18 12 66.7 %

       1                 : /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2                 : /* ***** BEGIN LICENSE BLOCK *****
       3                 :  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
       4                 :  *
       5                 :  * The contents of this file are subject to the Mozilla Public License Version
       6                 :  * 1.1 (the "License"); you may not use this file except in compliance with
       7                 :  * the License. You may obtain a copy of the License at
       8                 :  * http://www.mozilla.org/MPL/
       9                 :  *
      10                 :  * Software distributed under the License is distributed on an "AS IS" basis,
      11                 :  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
      12                 :  * for the specific language governing rights and limitations under the
      13                 :  * License.
      14                 :  *
      15                 :  * The Original Code is mozilla.org code.
      16                 :  *
      17                 :  * The Initial Developer of the Original Code is
      18                 :  * Netscape Communications Corporation.
      19                 :  * Portions created by the Initial Developer are Copyright (C) 1998
      20                 :  * the Initial Developer. All Rights Reserved.
      21                 :  *
      22                 :  * Contributor(s):
      23                 :  *
      24                 :  * Alternatively, the contents of this file may be used under the terms of
      25                 :  * either of the GNU General Public License Version 2 or later (the "GPL"),
      26                 :  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
      27                 :  * in which case the provisions of the GPL or the LGPL are applicable instead
      28                 :  * of those above. If you wish to allow use of your version of this file only
      29                 :  * under the terms of either the GPL or the LGPL, and not to allow others to
      30                 :  * use your version of this file under the terms of the MPL, indicate your
      31                 :  * decision by deleting the provisions above and replace them with the notice
      32                 :  * and other provisions required by the GPL or the LGPL. If you do not delete
      33                 :  * the provisions above, a recipient may use your version of this file under
      34                 :  * the terms of any one of the MPL, the GPL or the LGPL.
      35                 :  *
      36                 :  * ***** END LICENSE BLOCK ***** */
      37                 : /**
      38                 :  * A character set converter from GBK to Unicode.
      39                 :  * 
      40                 :  *
      41                 :  * @created         07/Sept/1999
      42                 :  * @author  Yueheng Xu, Yueheng.Xu@intel.com
      43                 :  */
      44                 : 
      45                 : #include "nsGBKToUnicode.h"
      46                 : #include "nsUCvCnDll.h"
      47                 : #include "gbku.h"
      48                 : 
      49                 : 
      50                 : //------------------------------------------------------------
      51                 : // nsGBKUnique2BytesToUnicode
      52                 : //------------------------------------------------------------
      53                 : class nsGBKUnique2BytesToUnicode : public nsTableDecoderSupport 
      54                 : {
      55                 : public:
      56                 :   nsGBKUnique2BytesToUnicode();
      57               0 :   virtual ~nsGBKUnique2BytesToUnicode() 
      58               0 :     { }
      59                 : protected:
      60                 : };
      61                 : 
      62                 : static const PRUint16 g_utGBKUnique2Bytes[] = {
      63                 : #include "gbkuniq2b.ut"
      64                 : };
      65               0 : nsGBKUnique2BytesToUnicode::nsGBKUnique2BytesToUnicode() 
      66                 :   : nsTableDecoderSupport(u2BytesCharset, nsnull,
      67               0 :         (uMappingTable*) &g_utGBKUnique2Bytes, 1) 
      68                 : {
      69               0 : }
      70                 : 
      71                 : //------------------------------------------------------------
      72                 : // nsGB18030Unique2BytesToUnicode
      73                 : //------------------------------------------------------------
      74                 : class nsGB18030Unique2BytesToUnicode : public nsTableDecoderSupport 
      75                 : {
      76                 : public:
      77                 :   nsGB18030Unique2BytesToUnicode();
      78               2 :   virtual ~nsGB18030Unique2BytesToUnicode() 
      79               4 :     { }
      80                 : protected:
      81                 : };
      82                 : 
      83                 : static const PRUint16 g_utGB18030Unique2Bytes[] = {
      84                 : #include "gb18030uniq2b.ut"
      85                 : };
      86               1 : nsGB18030Unique2BytesToUnicode::nsGB18030Unique2BytesToUnicode() 
      87                 :   : nsTableDecoderSupport(u2BytesCharset, nsnull,
      88               1 :         (uMappingTable*) &g_utGB18030Unique2Bytes, 1) 
      89                 : {
      90               1 : }
      91                 : 
      92                 : //------------------------------------------------------------
      93                 : // nsGB18030Unique4BytesToUnicode
      94                 : //------------------------------------------------------------
      95                 : class nsGB18030Unique4BytesToUnicode : public nsTableDecoderSupport 
      96                 : {
      97                 : public:
      98                 :   nsGB18030Unique4BytesToUnicode();
      99               2 :   virtual ~nsGB18030Unique4BytesToUnicode() 
     100               4 :     { }
     101                 : protected:
     102                 : };
     103                 : 
     104                 : static const PRUint16 g_utGB18030Unique4Bytes[] = {
     105                 : #include "gb180304bytes.ut"
     106                 : };
     107               1 : nsGB18030Unique4BytesToUnicode::nsGB18030Unique4BytesToUnicode() 
     108                 :   : nsTableDecoderSupport(u4BytesGB18030Charset, nsnull,
     109               1 :         (uMappingTable*) &g_utGB18030Unique4Bytes, 1) 
     110                 : {
     111               1 : }
     112                 : 
     113                 : 
     114                 : //----------------------------------------------------------------------
     115                 : // Class nsGBKToUnicode [implementation]
     116                 : 
     117                 : //----------------------------------------------------------------------
     118                 : // Subclassing of nsTablesDecoderSupport class [implementation]
     119                 : 
     120                 : #define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c)  \
     121                 :       (UINT8_IN_RANGE(0x81, (c), 0xFE))
     122                 : #define FIRST_BYTE_IS_SURROGATE(c)  \
     123                 :       (UINT8_IN_RANGE(0x90, (c), 0xFE))
     124                 : #define LEGAL_GBK_2BYTE_SECOND_BYTE(c) \
     125                 :       (UINT8_IN_RANGE(0x40, (c), 0x7E)|| UINT8_IN_RANGE(0x80, (c), 0xFE))
     126                 : #define LEGAL_GBK_4BYTE_SECOND_BYTE(c) \
     127                 :       (UINT8_IN_RANGE(0x30, (c), 0x39))
     128                 : #define LEGAL_GBK_4BYTE_THIRD_BYTE(c)  \
     129                 :       (UINT8_IN_RANGE(0x81, (c), 0xFE))
     130                 : #define LEGAL_GBK_4BYTE_FORTH_BYTE(c) \
     131                 :       (UINT8_IN_RANGE(0x30, (c), 0x39))
     132                 : 
     133           66231 : NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,
     134                 :                                             PRInt32 * aSrcLength,
     135                 :                                             PRUnichar *aDest,
     136                 :                                             PRInt32 * aDestLength)
     137                 : {
     138           66231 :   PRInt32 i=0;
     139           66231 :   PRInt32 iSrcLength = (*aSrcLength);
     140           66231 :   PRInt32 iDestlen = 0;
     141           66231 :   nsresult rv=NS_OK;
     142           66231 :   *aSrcLength = 0;
     143                 :   
     144          219907 :   for (i=0;i<iSrcLength;i++)
     145                 :   {
     146          155755 :     if ( iDestlen >= (*aDestLength) )
     147                 :     {
     148            1260 :       rv = NS_OK_UDEC_MOREOUTPUT;
     149            1260 :       break;
     150                 :     }
     151                 :     // The valid range for the 1st byte is [0x81,0xFE] 
     152          154495 :     if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc))
     153                 :     {
     154           48967 :       if(i+1 >= iSrcLength) 
     155                 :       {
     156             189 :         rv = NS_OK_UDEC_MOREINPUT;
     157             189 :         break;
     158                 :       }
     159                 :       // To make sure, the second byte has to be checked as well.
     160                 :       // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE]
     161           96926 :       if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1]))
     162                 :       {
     163                 :         // Valid GBK code
     164           40572 :         *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
     165           40572 :         if(UCS2_NO_MAPPING == *aDest)
     166                 :         { 
     167                 :           // We cannot map in the common mapping, let's call the
     168                 :           // delegate 2 byte decoder to decode the gbk or gb18030 unique 
     169                 :           // 2 byte mapping
     170             255 :           if(! TryExtensionDecoder(aSrc, aDest))
     171                 :           {
     172               0 :             *aDest = UCS2_NO_MAPPING;
     173                 :           }
     174                 :         }
     175           40572 :         aSrc += 2;
     176           40572 :         i++;
     177                 :       }
     178            8206 :       else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
     179                 :       {
     180                 :         // from the first 2 bytes, it looks like a 4 byte GB18030
     181            1276 :         if(i+3 >= iSrcLength)  // make sure we got 4 bytes
     182                 :         {
     183             630 :           rv = NS_OK_UDEC_MOREINPUT;
     184             630 :           break;
     185                 :         }
     186                 :         // 4 bytes patten
     187                 :         // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39]
     188                 :         // preset the 
     189                 :  
     190            1324 :         if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) &&
     191              32 :             LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
     192                 :         {
     193              16 :            if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0])) 
     194                 :            {
     195                 :              // let's call the delegated 4 byte gb18030 converter to convert it
     196              16 :              if(! Try4BytesDecoder(aSrc, aDest))
     197               0 :                *aDest = UCS2_NO_MAPPING;
     198                 :            } else {
     199                 :               // let's try supplement mapping
     200               8 :              if ( (iDestlen+1) < (*aDestLength) )
     201                 :              {
     202               8 :                if(DecodeToSurrogate(aSrc, aDest))
     203                 :                {
     204                 :                  // surrogte two PRUnichar
     205               2 :                  iDestlen++;
     206               2 :                  aDest++;
     207                 :                }  else {
     208               6 :                  *aDest = UCS2_NO_MAPPING;
     209                 :               }
     210                 :              } else {
     211               0 :                if (*aDestLength < 2) {
     212               0 :                  NS_ERROR("insufficient space in output buffer");
     213               0 :                  *aDest = UCS2_NO_MAPPING;
     214                 :                } else {
     215               0 :                  rv = NS_OK_UDEC_MOREOUTPUT;
     216               0 :                  break;
     217                 :                }
     218                 :              }
     219                 :            }
     220              16 :            aSrc += 4;
     221              16 :            i += 3;
     222                 :         } else {
     223             630 :           *aDest = UCS2_NO_MAPPING; 
     224                 :           // If the third and fourth bytes are not in the legal ranges for
     225                 :           // a four-byte sequnce, resynchronize on the second byte
     226                 :           // (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE,
     227                 :           //  0x30-0x39)
     228             630 :           aSrc++;
     229                 :         }
     230                 :       }
     231            6930 :       else if ((PRUint8) aSrc[0] == (PRUint8)0xA0 )
     232                 :       {
     233                 :         // stand-alone (not followed by a valid second byte) 0xA0 !
     234                 :         // treat it as valid a la Netscape 4.x
     235              55 :         *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
     236              55 :         aSrc++;
     237                 :       } else {
     238                 :         // Invalid GBK code point (second byte should be 0x40 or higher)
     239            6875 :         *aDest = UCS2_NO_MAPPING;
     240            6875 :         aSrc++;
     241                 :       }
     242                 :     } else {
     243          105528 :       if(IS_ASCII(*aSrc))
     244                 :       {
     245                 :         // The source is an ASCII
     246          104628 :         *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
     247          104628 :         aSrc++;
     248                 :       } else {
     249             900 :         if(IS_GBK_EURO(*aSrc)) {
     250             387 :           *aDest = UCS2_EURO;
     251                 :         } else {
     252             513 :           *aDest = UCS2_NO_MAPPING;
     253                 :         }
     254             900 :         aSrc++;
     255                 :       }
     256                 :     }
     257          153676 :     iDestlen++;
     258          153676 :     aDest++;
     259          153676 :     *aSrcLength = i+1;
     260                 :   }
     261           66231 :   *aDestLength = iDestlen;
     262           66231 :   return rv;
     263                 : }
     264                 : 
     265                 : 
     266               0 : void nsGBKToUnicode::CreateExtensionDecoder()
     267                 : {
     268               0 :   mExtensionDecoder = new nsGBKUnique2BytesToUnicode();
     269               0 : }
     270               0 : void nsGBKToUnicode::Create4BytesDecoder()
     271                 : {
     272               0 :   m4BytesDecoder =  nsnull;
     273               0 : }
     274               1 : void nsGB18030ToUnicode::CreateExtensionDecoder()
     275                 : {
     276               1 :   mExtensionDecoder = new nsGB18030Unique2BytesToUnicode();
     277               1 : }
     278               1 : void nsGB18030ToUnicode::Create4BytesDecoder()
     279                 : {
     280               1 :   m4BytesDecoder = new nsGB18030Unique4BytesToUnicode();
     281               1 : }
     282               8 : bool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, PRUnichar* aOut)
     283                 : {
     284               8 :   NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]),       "illegal first byte");
     285               8 :   NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]),   "illegal second byte");
     286               8 :   NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]),    "illegal third byte");
     287               8 :   NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]),    "illegal forth byte");
     288               8 :   if(! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
     289               0 :     return false;
     290               8 :   if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
     291               0 :     return false;
     292               8 :   if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]))
     293               0 :     return false;
     294               8 :   if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
     295               0 :     return false;
     296                 : 
     297               8 :   PRUint8 a1 = (PRUint8) aSrc[0];
     298               8 :   PRUint8 a2 = (PRUint8) aSrc[1];
     299               8 :   PRUint8 a3 = (PRUint8) aSrc[2];
     300               8 :   PRUint8 a4 = (PRUint8) aSrc[3];
     301               8 :   a1 -= (PRUint8)0x90;
     302               8 :   a2 -= (PRUint8)0x30;
     303               8 :   a3 -= (PRUint8)0x81;
     304               8 :   a4 -= (PRUint8)0x30;
     305               8 :   PRUint32 idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4;
     306                 :   // idx == ucs4Codepoint - 0x10000
     307               8 :   if (idx > 0x000FFFFF)
     308               6 :     return false;
     309                 : 
     310               2 :   *aOut++ = 0xD800 | (idx >> 10);
     311               2 :   *aOut = 0xDC00 | (0x000003FF & idx);
     312                 : 
     313               2 :   return true;
     314                 : }
     315             255 : bool nsGBKToUnicode::TryExtensionDecoder(const char* aSrc, PRUnichar* aOut)
     316                 : {
     317             255 :   if(!mExtensionDecoder)
     318               1 :     CreateExtensionDecoder();
     319             255 :   NS_ASSERTION(mExtensionDecoder, "cannot creqte 2 bytes unique converter");
     320             255 :   if(mExtensionDecoder)
     321                 :   {
     322             255 :     nsresult res = mExtensionDecoder->Reset();
     323             255 :     NS_ASSERTION(NS_SUCCEEDED(res), "2 bytes unique conversoin reset failed");
     324             255 :     PRInt32 len = 2;
     325             255 :     PRInt32 dstlen = 1;
     326             255 :     res = mExtensionDecoder->Convert(aSrc,&len, aOut, &dstlen); 
     327             255 :     NS_ASSERTION(NS_FAILED(res) || ((len==2) && (dstlen == 1)), 
     328                 :        "some strange conversion result");
     329                 :      // if we failed, we then just use the 0xfffd 
     330                 :      // therefore, we ignore the res here. 
     331             255 :     if(NS_SUCCEEDED(res)) 
     332             255 :       return true;
     333                 :   }
     334               0 :   return  false;
     335                 : }
     336               0 : bool nsGBKToUnicode::DecodeToSurrogate(const char* aSrc, PRUnichar* aOut)
     337                 : {
     338               0 :   return false;
     339                 : }
     340               8 : bool nsGBKToUnicode::Try4BytesDecoder(const char* aSrc, PRUnichar* aOut)
     341                 : {
     342               8 :   if(!m4BytesDecoder)
     343               1 :     Create4BytesDecoder();
     344               8 :   if(m4BytesDecoder)
     345                 :   {
     346               8 :     nsresult res = m4BytesDecoder->Reset();
     347               8 :     NS_ASSERTION(NS_SUCCEEDED(res), "4 bytes unique conversoin reset failed");
     348               8 :     PRInt32 len = 4;
     349               8 :     PRInt32 dstlen = 1;
     350               8 :     res = m4BytesDecoder->Convert(aSrc,&len, aOut, &dstlen); 
     351               8 :     NS_ASSERTION(NS_FAILED(res) || ((len==4) && (dstlen == 1)), 
     352                 :        "some strange conversion result");
     353                 :      // if we failed, we then just use the 0xfffd 
     354                 :      // therefore, we ignore the res here. 
     355               8 :     if(NS_SUCCEEDED(res)) 
     356               8 :       return true;
     357                 :   }
     358               0 :   return  false;
     359                 : }

Generated by: LCOV version 1.7