LCOV - code coverage report
Current view: directory - intl/uconv/ucvlatin - nsUCS2BEToUnicode.cpp (source / functions) Found Hit Coverage
Test: app.info Lines: 141 113 80.1 %
Date: 2012-06-02 Functions: 7 7 100.0 %

       1                 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2                 : /* ***** BEGIN LICENSE BLOCK *****
       3                 :  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
       4                 :  *
       5                 :  * The contents of this file are subject to the Mozilla Public License Version
       6                 :  * 1.1 (the "License"); you may not use this file except in compliance with
       7                 :  * the License. You may obtain a copy of the License at
       8                 :  * http://www.mozilla.org/MPL/
       9                 :  *
      10                 :  * Software distributed under the License is distributed on an "AS IS" basis,
      11                 :  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
      12                 :  * for the specific language governing rights and limitations under the
      13                 :  * License.
      14                 :  *
      15                 :  * The Original Code is Mozilla Communicator client code.
      16                 :  *
      17                 :  * The Initial Developer of the Original Code is
      18                 :  * Netscape Communications Corporation.
      19                 :  * Portions created by the Initial Developer are Copyright (C) 1998
      20                 :  * the Initial Developer. All Rights Reserved.
      21                 :  *
      22                 :  * Contributor(s):
      23                 :  *
      24                 :  * Alternatively, the contents of this file may be used under the terms of
      25                 :  * either of the GNU General Public License Version 2 or later (the "GPL"),
      26                 :  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
      27                 :  * in which case the provisions of the GPL or the LGPL are applicable instead
      28                 :  * of those above. If you wish to allow use of your version of this file only
      29                 :  * under the terms of either the GPL or the LGPL, and not to allow others to
      30                 :  * use your version of this file under the terms of the MPL, indicate your
      31                 :  * decision by deleting the provisions above and replace them with the notice
      32                 :  * and other provisions required by the GPL or the LGPL. If you do not delete
      33                 :  * the provisions above, a recipient may use your version of this file under
      34                 :  * the terms of any one of the MPL, the GPL or the LGPL.
      35                 :  *
      36                 :  * ***** END LICENSE BLOCK ***** */
      37                 : 
      38                 : #include "nsUCConstructors.h"
      39                 : #include "nsUCS2BEToUnicode.h"
      40                 : #include "nsUCvLatinDll.h"
      41                 : #include "nsCharTraits.h"
      42                 : #include <string.h>
      43                 : #include "prtypes.h"
      44                 : 
       45                 : #define STATE_NORMAL             0  // steady state: no BOM handling or saved input is pending
       46                 : #define STATE_HALF_CODE_POINT    1  // first byte of a 16-bit unit is saved; its second byte comes with the next buffer
       47                 : #define STATE_FIRST_CALL         2  // set by Reset(): no input has been examined for a BOM yet
       48                 : #define STATE_FOUND_BOM          3  // Convert() recognised a BOM; the decoder skips the first two bytes
       49                 : #define STATE_ODD_SURROGATE_PAIR 4  // a decoded surrogate pair is saved because the output had no room for both halves
      50                 : 
      51                 : static nsresult
      52             351 : UTF16ConvertToUnicode(PRUint8& aState, PRUint8& aOddByte,
      53                 :                       PRUnichar& aOddHighSurrogate, PRUnichar& aOddLowSurrogate,
      54                 :                       const char * aSrc,
      55                 :                       PRInt32 * aSrcLength, PRUnichar * aDest,
      56                 :                       PRInt32 * aDestLength,
      57                 :                       bool aSwapBytes)
      58                 : {
      59             351 :   const char* src = aSrc;
      60             351 :   const char* srcEnd = aSrc + *aSrcLength;
      61             351 :   PRUnichar* dest = aDest;
      62             351 :   PRUnichar* destEnd = aDest + *aDestLength;
      63                 : 
      64             351 :   switch(aState) {
      65                 :     case STATE_FOUND_BOM:
      66              19 :       NS_ASSERTION(*aSrcLength > 1, "buffer too short");
      67              19 :       src+=2;
      68              19 :       aState = STATE_NORMAL;
      69              19 :       break;
      70                 : 
      71                 :     case STATE_FIRST_CALL: // first time called
      72              60 :       NS_ASSERTION(*aSrcLength > 1, "buffer too short");
      73                 :       // Eliminate BOM (0xFEFF). Note that different endian case is taken care
      74                 :       // of in |Convert| of LE and BE converters. Here, we only have to
      75                 :       // deal with the same endian case. That is, 0xFFFE (byte-swapped BOM) is
      76                 :       // illegal.
      77              60 :       if(0xFEFF == *((PRUnichar*)src)) {
      78               2 :         src+=2;
      79              58 :       } else if(0xFFFE == *((PRUnichar*)src)) {
      80               0 :         *aSrcLength=0;
      81               0 :         *aDestLength=0;
      82               0 :         return NS_ERROR_ILLEGAL_INPUT;
      83                 :       }  
      84              60 :       aState = STATE_NORMAL;
      85              60 :       break;
      86                 : 
      87                 :     case STATE_ODD_SURROGATE_PAIR:
      88               0 :       if (*aDestLength < 2)
      89               0 :         *dest++ = UCS2_REPLACEMENT_CHAR;
      90                 :       else {
      91               0 :         *dest++ = aOddHighSurrogate;
      92               0 :         *dest++ = aOddLowSurrogate;
      93               0 :         aOddHighSurrogate = aOddLowSurrogate = 0;
      94               0 :         aState = STATE_NORMAL;
      95                 :       }
      96               0 :       break;
      97                 : 
      98                 :     case STATE_NORMAL:
      99                 :     case STATE_HALF_CODE_POINT:
     100                 :     default:
     101             272 :       break;
     102                 :   }
     103                 : 
     104             351 :   if (src == srcEnd) {
     105               0 :     *aDestLength = dest - aDest;
     106               0 :     return NS_OK;
     107                 :   }
     108                 : 
     109             351 :   PRUnichar oddHighSurrogate = aOddHighSurrogate;
     110                 : 
     111                 :   const char* srcEvenEnd;
     112                 : 
     113                 :   PRUnichar u;
     114             351 :   if (aState == STATE_HALF_CODE_POINT) {
     115                 :     // the 1st byte of a 16-bit code unit was stored in |aOddByte| in the
     116                 :     // previous run while the 2nd byte has to come from |*src|.
     117              56 :     aState = STATE_NORMAL;
     118                 : #ifdef IS_BIG_ENDIAN
     119                 :     u = (aOddByte << 8) | *src++; // safe, we know we have at least one byte.
     120                 : #else
     121              56 :     u = (*src++ << 8) | aOddByte; // safe, we know we have at least one byte.
     122                 : #endif
     123              56 :     srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
     124              56 :     goto have_codepoint;
     125                 :   } else {
     126             295 :     srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
     127                 :   }
     128                 : 
     129           39608 :   while (src != srcEvenEnd) {
     130           38962 :     if (dest == destEnd)
     131               0 :       goto error;
     132                 : 
     133                 : #if !defined(__sparc__) && !defined(__arm__)
     134           38962 :     u = *(const PRUnichar*)src;
     135                 : #else
     136                 :     memcpy(&u, src, 2);
     137                 : #endif
     138           38962 :     src += 2;
     139                 : 
     140                 : have_codepoint:
     141           39018 :     if (aSwapBytes)
     142           13700 :       u = u << 8 | u >> 8;
     143                 : 
     144           39018 :     if (!IS_SURROGATE(u)) {
     145           38642 :       if (oddHighSurrogate) {
     146              32 :         *dest++ = UCS2_REPLACEMENT_CHAR;
     147              32 :         if (dest == destEnd)
     148               0 :           goto error;
     149              32 :         oddHighSurrogate = 0;
     150                 :       }
     151           38642 :       *dest++ = u;
     152             376 :     } else if (NS_IS_HIGH_SURROGATE(u)) {
     153             192 :       if (oddHighSurrogate) {
     154              16 :         *dest++ = UCS2_REPLACEMENT_CHAR;
     155              16 :         if (dest == destEnd)
     156               0 :           goto error;
     157                 :       }
     158             192 :       oddHighSurrogate = u;
     159                 :     }
     160                 :     else /* if (NS_IS_LOW_SURROGATE(u)) */ {
     161             184 :       if (oddHighSurrogate && *aDestLength > 1) {
     162             136 :         if (dest + 1 >= destEnd) {
     163               0 :           aOddLowSurrogate = u;
     164               0 :           aOddHighSurrogate = oddHighSurrogate;
     165               0 :           aState = STATE_ODD_SURROGATE_PAIR;
     166               0 :           goto error;
     167                 :         }
     168             136 :         *dest++ = oddHighSurrogate;
     169             136 :         *dest++ = u;
     170                 :       } else {
     171              48 :         *dest++ = UCS2_REPLACEMENT_CHAR;
     172                 :       }
     173             184 :       oddHighSurrogate = 0;
     174                 :     }
     175                 :   }
     176             351 :   if (src != srcEnd) {
     177                 :     // store the lead byte of a 16-bit unit for the next run.
     178              72 :     aOddByte = *src++;
     179              72 :     aState = STATE_HALF_CODE_POINT;
     180                 :   }
     181                 : 
     182             351 :   aOddHighSurrogate = oddHighSurrogate;
     183                 : 
     184             351 :   *aDestLength = dest - aDest;
     185             351 :   *aSrcLength =  src  - aSrc; 
     186             351 :   return NS_OK;
     187                 : 
     188                 : error:
     189               0 :   *aDestLength = dest - aDest;
     190               0 :   *aSrcLength =  src  - aSrc; 
     191               0 :   return  NS_OK_UDEC_MOREOUTPUT;
     192                 : }
     193                 : 
     194                 : NS_IMETHODIMP
     195             167 : nsUTF16ToUnicodeBase::Reset()
     196                 : {
     197             167 :   mState = STATE_FIRST_CALL;
     198             167 :   mOddByte = 0;
     199             167 :   mOddHighSurrogate = 0;
     200             167 :   mOddLowSurrogate = 0;
     201             167 :   return NS_OK;
     202                 : }
     203                 : 
     204                 : NS_IMETHODIMP
     205             385 : nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, PRInt32 aSrcLength, 
     206                 :                                    PRInt32 * aDestLength)
     207                 : {
     208                 :   // the left-over data of the previous run have to be taken into account.
     209             385 :   *aDestLength = (aSrcLength + ((STATE_HALF_CODE_POINT == mState) ? 1 : 0)) / 2;
     210             385 :   if (mOddHighSurrogate)
     211               0 :     (*aDestLength)++;
     212             385 :   if (mOddLowSurrogate)
     213               0 :     (*aDestLength)++;
     214             385 :   return NS_OK;
     215                 : }
     216                 : 
     217                 : 
     218                 : NS_IMETHODIMP
     219             286 : nsUTF16BEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
     220                 :                             PRUnichar * aDest, PRInt32 * aDestLength)
     221                 : {
     222             286 :     if(STATE_FIRST_CALL == mState && *aSrcLength < 2)
     223                 :     {
     224             129 :       nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
     225             129 :       *aSrcLength=0;
     226             129 :       *aDestLength=0;
     227             129 :       return res;
     228                 :     }
     229                 : #ifdef IS_LITTLE_ENDIAN
     230                 :     // Remove the BOM if we're little-endian. The 'same endian' case with the
     231                 :     // leading BOM will be taken care of by |UTF16ConvertToUnicode|.
     232             157 :     if(STATE_FIRST_CALL == mState) // Called for the first time.
     233                 :     {
     234              60 :       mState = STATE_NORMAL;
     235              60 :       if(0xFFFE == *((PRUnichar*)aSrc)) {
     236                 :         // eliminate BOM (on LE machines, BE BOM is 0xFFFE)
     237               2 :         mState = STATE_FOUND_BOM;
     238              58 :       } else if(0xFEFF == *((PRUnichar*)aSrc)) {
     239               0 :         *aSrcLength=0;
     240               0 :         *aDestLength=0;
     241               0 :         return NS_ERROR_ILLEGAL_INPUT;
     242                 :       }
     243                 :     }
     244                 : #endif
     245                 : 
     246                 :   nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
     247                 :                                       mOddLowSurrogate,
     248                 :                                       aSrc, aSrcLength, aDest, aDestLength,
     249                 : #ifdef IS_LITTLE_ENDIAN
     250                 :                                       true
     251                 : #else
     252                 :                                       false
     253                 : #endif
     254             157 :                                       );
     255             157 :   return rv;
     256                 : }
     257                 : 
     258                 : NS_IMETHODIMP
     259             285 : nsUTF16LEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
     260                 :                             PRUnichar * aDest, PRInt32 * aDestLength)
     261                 : {
     262             285 :     if(STATE_FIRST_CALL == mState && *aSrcLength < 2)
     263                 :     {
     264             128 :       nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
     265             128 :       *aSrcLength=0;
     266             128 :       *aDestLength=0;
     267             128 :       return res;
     268                 :     }
     269                 : #ifdef IS_BIG_ENDIAN
     270                 :     // Remove the BOM if we're big-endian. The 'same endian' case with the
     271                 :     // leading BOM will be taken care of by |UTF16ConvertToUnicode|.
     272                 :     if(STATE_FIRST_CALL == mState) // first time called
     273                 :     {
     274                 :       mState = STATE_NORMAL;
     275                 :       if(0xFFFE == *((PRUnichar*)aSrc)) {
     276                 :         // eliminate BOM (on BE machines, LE BOM is 0xFFFE)
     277                 :         mState = STATE_FOUND_BOM;
     278                 :       } else if(0xFEFF == *((PRUnichar*)aSrc)) {
     279                 :         *aSrcLength=0;
     280                 :         *aDestLength=0;
     281                 :         return NS_ERROR_ILLEGAL_INPUT;
     282                 :       }
     283                 :     }
     284                 : #endif
     285                 :     
     286                 :   nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
     287                 :                                       mOddLowSurrogate,
     288                 :                                       aSrc, aSrcLength, aDest, aDestLength,
     289                 : #ifdef IS_BIG_ENDIAN
     290                 :                                       true
     291                 : #else
     292                 :                                       false
     293                 : #endif
     294             157 :                                       );
     295             157 :   return rv;
     296                 : }
     297                 : 
     298                 : NS_IMETHODIMP
     299              22 : nsUTF16ToUnicode::Reset()
     300                 : {
     301              22 :   mEndian = kUnknown;
     302              22 :   mFoundBOM = false;
     303              22 :   return nsUTF16ToUnicodeBase::Reset();
     304                 : }
     305                 : 
     306                 : NS_IMETHODIMP
     307             165 : nsUTF16ToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
     308                 :                           PRUnichar * aDest, PRInt32 * aDestLength)
     309                 : {
     310             165 :     if(STATE_FIRST_CALL == mState && *aSrcLength < 2)
     311                 :     {
     312             128 :       nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
     313             128 :       *aSrcLength=0;
     314             128 :       *aDestLength=0;
     315             128 :       return res;
     316                 :     }
     317              37 :     if(STATE_FIRST_CALL == mState) // first time called
     318                 :     {
     319              21 :       mState = STATE_NORMAL;
     320                 :       // check if BOM (0xFEFF) is at the beginning, remove it if found, and
     321                 :       // set mEndian accordingly.
     322              21 :       if(0xFF == PRUint8(aSrc[0]) && 0xFE == PRUint8(aSrc[1])) {
     323              15 :         mState = STATE_FOUND_BOM;
     324              15 :         mEndian = kLittleEndian;
     325              15 :         mFoundBOM = true;
     326                 :       }
     327               6 :       else if(0xFE == PRUint8(aSrc[0]) && 0xFF == PRUint8(aSrc[1])) {
     328               2 :         mState = STATE_FOUND_BOM;
     329               2 :         mEndian = kBigEndian;
     330               2 :         mFoundBOM = true;
     331                 :       }
     332                 :       // BOM is not found, but we can use a simple heuristic to determine
     333                 :       // the endianness. Assume the first character is [U+0001, U+00FF].
     334                 :       // Not always valid, but it's very likely to hold for html/xml/css. 
     335               4 :       else if(!aSrc[0] && aSrc[1]) {  // 0x00 0xhh (hh != 00)
     336               2 :         mEndian = kBigEndian;
     337                 :       }
     338               2 :       else if(aSrc[0] && !aSrc[1]) {  // 0xhh 0x00 (hh != 00)
     339               2 :         mEndian = kLittleEndian;
     340                 :       }
     341                 :       else { // Neither BOM nor 'plausible' byte patterns at the beginning.
     342                 :              // Just assume it's BE (following Unicode standard)
     343                 :              // and let the garbage show up in the browser. (security concern?)
     344                 :              // (bug 246194)
     345               0 :         mEndian = kBigEndian;
     346                 :       }
     347                 :     }
     348                 :     
     349                 :     nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
     350                 :                                         mOddLowSurrogate,
     351                 :                                         aSrc, aSrcLength, aDest, aDestLength,
     352                 : #ifdef IS_BIG_ENDIAN
     353                 :                                         (mEndian == kLittleEndian)
     354                 : #elif defined(IS_LITTLE_ENDIAN)
     355                 :                                         (mEndian == kBigEndian)
     356                 : #else
     357                 :     #error "Unknown endianness"
     358                 : #endif
     359              37 :                                         );
     360                 : 
     361                 :     // If BOM is not found and we're to return NS_OK, signal that BOM
     362                 :     // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode|
     363              37 :     return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv;
     364                 : }

Generated by: LCOV version 1.7