LCOV - code coverage report
Current view: directory - intl/uconv/src - nsUTF8ToUnicode.cpp (source / functions) Found Hit Coverage
Test: app.info Lines: 118 102 86.4 %
Date: 2012-06-02 Functions: 6 6 100.0 %

       1                 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2                 : /* ***** BEGIN LICENSE BLOCK *****
       3                 :  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
       4                 :  *
       5                 :  * The contents of this file are subject to the Mozilla Public License Version
       6                 :  * 1.1 (the "License"); you may not use this file except in compliance with
       7                 :  * the License. You may obtain a copy of the License at
       8                 :  * http://www.mozilla.org/MPL/
       9                 :  *
      10                 :  * Software distributed under the License is distributed on an "AS IS" basis,
      11                 :  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
      12                 :  * for the specific language governing rights and limitations under the
      13                 :  * License.
      14                 :  *
      15                 :  * The Original Code is Mozilla Communicator client code.
      16                 :  *
      17                 :  * The Initial Developer of the Original Code is
      18                 :  * Netscape Communications Corporation.
      19                 :  * Portions created by the Initial Developer are Copyright (C) 1998
      20                 :  * the Initial Developer. All Rights Reserved.
      21                 :  *
      22                 :  * Contributor(s):
      23                 :  *
      24                 :  * Alternatively, the contents of this file may be used under the terms of
      25                 :  * either of the GNU General Public License Version 2 or later (the "GPL"),
      26                 :  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
      27                 :  * in which case the provisions of the GPL or the LGPL are applicable instead
      28                 :  * of those above. If you wish to allow use of your version of this file only
      29                 :  * under the terms of either the GPL or the LGPL, and not to allow others to
      30                 :  * use your version of this file under the terms of the MPL, indicate your
      31                 :  * decision by deleting the provisions above and replace them with the notice
      32                 :  * and other provisions required by the GPL or the LGPL. If you do not delete
      33                 :  * the provisions above, a recipient may use your version of this file under
      34                 :  * the terms of any one of the MPL, the GPL or the LGPL.
      35                 :  *
      36                 :  * ***** END LICENSE BLOCK ***** */
      37                 : 
      38                 : #include "nsAlgorithm.h"
      39                 : #include "nsUCSupport.h"
      40                 : #include "nsUTF8ToUnicode.h"
      41                 : #include "mozilla/SSE.h"
      42                 : 
      43                 : #define UNICODE_BYTE_ORDER_MARK    0xFEFF
      44                 : /**
                         :  * Encodes a supplementary code point as a UTF-16 surrogate pair.
                         :  *
                         :  * @param ucs4  code point to encode; asserted to be greater than 0xFFFF.
                         :  * @param aDest output position; two code units are stored starting here.
                         :  * @return aDest advanced past the two code units that were written.
                         :  */
       45              47 : static PRUnichar* EmitSurrogatePair(PRUint32 ucs4, PRUnichar* aDest)
      46                 : {
       47              47 :   NS_ASSERTION(ucs4 > 0xFFFF, "Should be a supplementary character");
       48              47 :   ucs4 -= 0x00010000; // remove the 0x10000 offset before splitting into two 10-bit halves
       49              47 :   *aDest++ = 0xD800 | (0x000003FF & (ucs4 >> 10)); // first unit: 0xD800 plus bits 10-19
       50              47 :   *aDest++ = 0xDC00 | (0x000003FF & ucs4); // second unit: 0xDC00 plus bits 0-9
       51              47 :   return aDest;
      52                 : }
      53                 : 
      54                 : //----------------------------------------------------------------------
      55                 : // Class nsUTF8ToUnicode [implementation]
      56                 : 
       57            5188 : nsUTF8ToUnicode::nsUTF8ToUnicode()
       58            5188 : : nsBasicDecoderSupport()
      59                 : { // The constructor's only work is to put the decoder into its initial state.
       60            5188 :   Reset(); // zeroes mUcs4/mState, sets mBytes to 1 and mFirst to true
       61            5188 : }
      62                 : 
      63                 : //----------------------------------------------------------------------
      64                 : // Subclassing of nsBasicDecoderSupport class [implementation]
      65                 : 
      66                 : /**
      67                 :  * Normally the maximum length of the output of the UTF8 decoder in UTF16
      68                 :  *  code units is the same as the length of the input in UTF8 code units,
      69                 :  *  since 1-byte, 2-byte and 3-byte UTF-8 sequences decode to a single
      70                 :  *  UTF-16 character, and 4-byte UTF-8 sequences decode to a surrogate pair.
      71                 :  *
      72                 :  * However, there is an edge case where the output can be longer than the
      73                 :  *  input: if the previous buffer ended with an incomplete multi-byte
      74                 :  *  sequence and this buffer does not begin with a valid continuation
      75                 :  *  byte, we will return NS_ERROR_ILLEGAL_INPUT and the caller may insert a
      76                 :  *  replacement character in the output buffer which corresponds to no
      77                 :  *  character in the input buffer. So in the worst case the destination
      78                 :  *  will need to be one code unit longer than the source.
      79                 :  *  See bug 301797.
                         :  *
                         :  * @param aSrc        source buffer; not examined by this method.
                         :  * @param aSrcLength  length of the source in UTF-8 code units.
                         :  * @param aDestLength receives aSrcLength + 1.
                         :  * @return NS_OK in all cases.
      80                 :  */
       81           16302 : NS_IMETHODIMP nsUTF8ToUnicode::GetMaxLength(const char * aSrc,
      82                 :                                             PRInt32 aSrcLength,
      83                 :                                             PRInt32 * aDestLength)
      84                 : {
       85           16302 :   *aDestLength = aSrcLength + 1; // the extra unit covers the replacement-character edge case
       86           16302 :   return NS_OK;
      87                 : }
      88                 : 
      89                 : 
      90                 : //----------------------------------------------------------------------
      91                 : // Subclassing of nsBasicDecoderSupport class [implementation]
      92                 : 
       93            5512 : NS_IMETHODIMP nsUTF8ToUnicode::Reset()
      94                 : {
      95                 :   // Return the decoder to its initial state: no sequence in progress. Always returns NS_OK.
       96            5512 :   mUcs4  = 0;     // code point bits accumulated for the sequence in progress
       97            5512 :   mState = 0;     // cached expected number of octets after the current octet
      98                 :                   // until the beginning of the next UTF8 character sequence
       99            5512 :   mBytes = 1;     // cached expected number of octets in the current sequence
      100            5512 :   mFirst = true;  // nothing decoded yet; Convert() skips a BOM decoded while this is set
     101                 : 
      102            5512 :   return NS_OK;
     103                 : 
     104                 : }
     105                 : 
     106                 : //----------------------------------------------------------------------
     107                 : // Subclassing of nsBasicDecoderSupport class [implementation]
     108                 : 
     109                 : // Fast ASCII -> UTF16 inner loop implementations
     110                 : //
     111                 : // Convert_ascii_run will update src and dst to the new values, and
     112                 : // len must be the maximum number of ASCII chars that it would be valid
     113                 : // to take from src and place into dst.  (That is, the minimum of the
     114                 : // number of bytes left in src and the number of unichars available in
     115                 : // dst.)
     116                 : 
     117                 : #if defined(__arm__) || defined(_M_ARM)
     118                 : 
     119                 : // ARM variant: to avoid byte/halfword reads/writes, convert a word (four
     120                 : // source bytes) at a time for as long as we can; stops at the first non-ASCII byte
     121                 : static inline void
     122                 : Convert_ascii_run (const char *&src,  // in/out: advanced past the bytes converted
     123                 :                    PRUnichar *&dst,   // in/out: advanced past the units written
     124                 :                    PRInt32 len)       // upper bound on the number of chars to convert
     125                 : {
     126                 :   const PRUint32 *src32;
     127                 :   PRUint32 *dst32;
     128                 : 
     129                 :   // Each char moves src by one byte and dst by two, so both pointers can only
     130                 :   // become 4-byte aligned together from these two starting alignments; otherwise use the plain loop
     131                 :   if ((((NS_PTR_TO_UINT32(dst) & 3) == 0) && ((NS_PTR_TO_UINT32(src) & 1) == 0)) ||
     132                 :       (((NS_PTR_TO_UINT32(dst) & 3) == 2) && ((NS_PTR_TO_UINT32(src) & 1) == 1)))
     133                 :   {
     134                 :     while (((NS_PTR_TO_UINT32(src) & 3) || // byte-wise prelude until both pointers are word aligned
     135                 :             (NS_PTR_TO_UINT32(dst) & 3)) &&
     136                 :            len > 0)
     137                 :     {
     138                 :       if (*src & 0x80U)
     139                 :         return; // non-ASCII byte: src/dst already reflect everything converted so far
     140                 :       *dst++ = (PRUnichar) *src++;
     141                 :       len--;
     142                 :     }
     143                 :   } else {
     144                 :     goto finish; // word alignment unreachable: do the whole run in the byte loop
     145                 :   }
     146                 : 
     147                 :   // then go 4 bytes at a time
     148                 :   src32 = (const PRUint32*) src;
     149                 :   dst32 = (PRUint32*) dst;
     150                 : 
     151                 :   while (len > 4) { // a remainder of up to 4 chars is left for the tail loop
     152                 :     PRUint32 in = *src32++;
     153                 : 
     154                 :     if (in & 0x80808080U) { // at least one of the four bytes has its high bit set
     155                 :       src32--; // un-read the word; the tail loop copies up to the non-ASCII byte
     156                 :       break;
     157                 :     }
     158                 : 
     159                 :     // NOTE(review): the low byte of the word is treated as the first char, which presumably relies on little-endian byte order -- confirm
     160                 :     *dst32++ = ((in & 0x000000ff) >>  0) | ((in & 0x0000ff00) << 8); // bytes 0 and 1 widened to 16 bits each
     161                 :     *dst32++ = ((in & 0x00ff0000) >> 16) | ((in & 0xff000000) >> 8); // bytes 2 and 3 widened to 16 bits each
     162                 :     len -= 4;
     163                 :   }
     164                 : 
     165                 :   src = (const char *) src32; // publish word-loop progress through the reference parameters
     166                 :   dst = (PRUnichar *) dst32;
     167                 : 
     168                 : finish:
     169                 :   while (len-- > 0 && (*src & 0x80U) == 0) { // byte-wise tail: until len is used up or a non-ASCII byte
     170                 :     *dst++ = (PRUnichar) *src++;
     171                 :   }
     172                 : }
     173                 : 
     174                 : #else
     175                 : 
     176                 : #ifdef MOZILLA_MAY_SUPPORT_SSE2
     177                 : namespace mozilla {
     178                 : namespace SSE2 {
     179                 : 
     180                 : void Convert_ascii_run(const char *&src, PRUnichar *&dst, PRInt32 len); // SSE2 variant; only declared in this file
     181                 : 
     182                 : } // namespace SSE2
     183                 : } // namespace mozilla
     184                 : #endif
     185                 : 
     186                 : static inline void
      187           17715 : Convert_ascii_run (const char *&src,  // in/out: advanced past the bytes converted
     188                 :                    PRUnichar *&dst,   // in/out: advanced past the units written
     189                 :                    PRInt32 len)       // upper bound on the number of chars to convert
     190                 : {
     191                 : #ifdef MOZILLA_MAY_SUPPORT_SSE2
      192           17715 :   if (mozilla::supports_sse2()) { // runtime check; the SSE2 routine then handles the whole run
      193           17715 :     mozilla::SSE2::Convert_ascii_run(src, dst, len);
      194           17715 :     return;
     195                 :   }
     196                 : #endif
     197                 : 
      198               0 :   while (len-- > 0 && (*src & 0x80U) == 0) { // scalar fallback: until len is used up or a non-ASCII byte
      199               0 :     *dst++ = (PRUnichar) *src++; // widen one ASCII byte to one UTF-16 code unit
     200                 :   }
     201                 : }
     202                 : 
     203                 : #endif
     204                 : /**
                         :  * Decodes UTF-8 octets from aSrc into UTF-16 code units in aDest. Decoder
                         :  * state (mUcs4, mState, mBytes, mFirst) lives in members, so a multi-octet
                         :  * sequence may be split across calls.
                         :  *
                         :  * A U+FEFF decoded while mFirst is still set is not written to the output.
                         :  *
                         :  * @param aSrc        input octets.
                         :  * @param aSrcLength  in: number of octets available; out: number consumed.
                         :  * @param aDest       output buffer.
                         :  * @param aDestLength in: capacity in code units; out: number written.
                         :  * @return NS_OK when the input was consumed and no sequence is pending;
                         :  *         NS_OK_UDEC_MOREINPUT when the input ended inside a sequence;
                         :  *         NS_OK_UDEC_MOREOUTPUT when the output buffer filled up first;
                         :  *         NS_ERROR_ILLEGAL_INPUT on an invalid lead octet, a missing
                         :  *         continuation octet, or a non-shortest-form, surrogate or
                         :  *         out-of-range code point.
                         :  *
                         :  * NOTE(review): the early NS_ERROR_ILLEGAL_INPUT return taken when a
                         :  * pending supplementary character does not fit leaves *aSrcLength and
                         :  * *aDestLength unmodified -- confirm that callers cope with this.
                         :  */
      205           17210 : NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
     206                 :                                        PRInt32 * aSrcLength,
     207                 :                                        PRUnichar * aDest,
     208                 :                                        PRInt32 * aDestLength)
     209                 : {
      210           17210 :   PRUint32 aSrcLen   = (PRUint32) (*aSrcLength); // lengths are treated as unsigned from here on
      211           17210 :   PRUint32 aDestLen = (PRUint32) (*aDestLength);
     212                 : 
     213                 :   const char *in, *inend;
      214           17210 :   inend = aSrc + aSrcLen;
     215                 : 
     216                 :   PRUnichar *out, *outend;
      217           17210 :   outend = aDest + aDestLen;
     218                 : 
      219           17210 :   nsresult res = NS_OK; // conversion result
     220                 : 
      221           17210 :   out = aDest;
      222           17210 :   if (mState == 0xFF) { // 0xFF is the marker stored below when a surrogate pair did not fit
     223                 :     // Emit supplementary character left over from the previous call. If the
     224                 :     // buffer size is insufficient, treat it as an illegal character.
      225               0 :     if (aDestLen < 2) {
      226               0 :       NS_ERROR("Output buffer insufficient to hold supplementary character");
      227               0 :       mState = 0; // the pending character is given up
      228               0 :       return NS_ERROR_ILLEGAL_INPUT; // NOTE(review): *aSrcLength / *aDestLength are not written on this path
     229                 :     }
      230               0 :     out = EmitSurrogatePair(mUcs4, out);
      231               0 :     mUcs4 = 0;
      232               0 :     mState = 0;
      233               0 :     mBytes = 1;
      234               0 :     mFirst = false;
     235                 :   }
     236                 : 
     237                 :   // alias these locally for speed; the locals shadow the members and are written back before returning
      238           17210 :   PRInt32 mUcs4 = this->mUcs4; // NOTE(review): signed local, yet values are built through PRUint32 casts -- confirm against the member's type
      239           17210 :   PRUint8 mState = this->mState;
      240           17210 :   PRUint8 mBytes = this->mBytes;
      241           17210 :   bool mFirst = this->mFirst;
     242                 : 
     243                 :   // If the buffer starts with an ASCII octet, clear mFirst now, because the
     244                 :   // ASCII branch within the loop never touches it.
      245           17210 :   if (mFirst && aSrcLen && (0 == (0x80 & (*aSrc))))
      246            4360 :     mFirst = false;
     247                 : 
      248           40911 :   for (in = aSrc; ((in < inend) && (out < outend)); ++in) { // one octet per pass, except ASCII runs
      249           24165 :     if (0 == mState) {
     250                 :       // When mState is zero we expect either a US-ASCII character or a
     251                 :       // multi-octet sequence.
      252           20707 :       if (0 == (0x80 & (*in))) {
      253           17715 :         PRInt32 max_loops = NS_MIN(inend - in, outend - out); // limited by remaining input and remaining output
      254           17715 :         Convert_ascii_run(in, out, max_loops); // advances in and out past the ASCII run
      255           17715 :         --in; // match the rest of the cases: the loop's ++in lands on the first unconverted octet
      256           17715 :         mBytes = 1;
      257            2992 :       } else if (0xC0 == (0xE0 & (*in))) {
     258                 :         // First octet of 2 octet sequence
      259            2131 :         mUcs4 = (PRUint32)(*in);
      260            2131 :         mUcs4 = (mUcs4 & 0x1F) << 6;
      261            2131 :         mState = 1;
      262            2131 :         mBytes = 2;
      263             861 :       } else if (0xE0 == (0xF0 & (*in))) {
     264                 :         // First octet of 3 octet sequence
      265             542 :         mUcs4 = (PRUint32)(*in);
      266             542 :         mUcs4 = (mUcs4 & 0x0F) << 12;
      267             542 :         mState = 2;
      268             542 :         mBytes = 3;
      269             319 :       } else if (0xF0 == (0xF8 & (*in))) {
     270                 :         // First octet of 4 octet sequence
      271              82 :         mUcs4 = (PRUint32)(*in);
      272              82 :         mUcs4 = (mUcs4 & 0x07) << 18;
      273              82 :         mState = 3;
      274              82 :         mBytes = 4;
      275             237 :       } else if (0xF8 == (0xFC & (*in))) {
     276                 :         /* First octet of 5 octet sequence.
     277                 :          *
     278                 :          * This is illegal because the encoded codepoint must be either
     279                 :          * (a) not the shortest form or
     280                 :          * (b) outside the Unicode range of 0-0x10FFFF.
     281                 :          * Rather than trying to resynchronize, we will carry on until the end
     282                 :          * of the sequence and let the later error handling code catch it.
     283                 :          */
      284              11 :         mUcs4 = (PRUint32)(*in);
      285              11 :         mUcs4 = (mUcs4 & 0x03) << 24;
      286              11 :         mState = 4;
      287              11 :         mBytes = 5;
      288             226 :       } else if (0xFC == (0xFE & (*in))) {
     289                 :         // First octet of 6 octet sequence, see comments for 5 octet sequence.
      290               1 :         mUcs4 = (PRUint32)(*in);
      291               1 :         mUcs4 = (mUcs4 & 1) << 30;
      292               1 :         mState = 5;
      293               1 :         mBytes = 6;
     294                 :       } else {
     295                 :         /* Current octet is neither in the US-ASCII range nor a legal first
     296                 :          * octet of a multi-octet sequence.
     297                 :          *
     298                 :          * Return an error condition. Caller is responsible for flushing and
     299                 :          * refilling the buffer and resetting state.
     300                 :          */
      301             225 :         res = NS_ERROR_ILLEGAL_INPUT;
      302             225 :         break; // in still points at the offending octet, so it is not counted as consumed
     303                 :       }
     304                 :     } else {
     305                 :       // When mState is non-zero, we expect a continuation of the multi-octet
     306                 :       // sequence
      307            3458 :       if (0x80 == (0xC0 & (*in))) {
     308                 :         // Legal continuation.
      309            3247 :         PRUint32 shift = (mState - 1) * 6; // each continuation octet still to come supplies 6 lower bits
      310            3247 :         PRUint32 tmp = *in;
      311            3247 :         tmp = (tmp & 0x0000003FL) << shift;
      312            3247 :         mUcs4 |= tmp;
     313                 : 
      314            3247 :         if (0 == --mState) {
     315                 :           /* End of the multi-octet sequence. mUcs4 now contains the final
     316                 :            * Unicode codepoint to be output
     317                 :            *
     318                 :            * Check for illegal sequences and codepoints.
     319                 :            */
     320                 : 
     321                 :           // From Unicode 3.1, non-shortest form is illegal
      322            2618 :           if (((2 == mBytes) && (mUcs4 < 0x0080)) ||
     323                 :               ((3 == mBytes) && (mUcs4 < 0x0800)) ||
     324                 :               ((4 == mBytes) && (mUcs4 < 0x10000)) ||
     325                 :               (4 < mBytes) ||
     326                 :               // From Unicode 3.2, surrogate characters are illegal
     327                 :               ((mUcs4 & 0xFFFFF800) == 0xD800) ||
     328                 :               // Codepoints outside the Unicode range are illegal
     329                 :               (mUcs4 > 0x10FFFF)) {
      330              28 :             res = NS_ERROR_ILLEGAL_INPUT;
      331              28 :             break; // mState is already 0 here; in points at the last octet of the bad sequence
     332                 :           }
      333            2590 :           if (mUcs4 > 0xFFFF) {
     334                 :             // mUcs4 is in the range 0x10000 - 0x10FFFF. Output a UTF-16 pair
      335              47 :             if (out + 2 > outend) {
     336                 :               // insufficient space left in the buffer. Keep mUcs4 for the
     337                 :               // next call, which emits it first thing.
      338               0 :               mState = 0xFF;
      339               0 :               ++in; // the whole sequence counts as consumed even though nothing was written
      340               0 :               res = NS_OK_UDEC_MOREOUTPUT;
      341               0 :               break;
     342                 :             }
      343              47 :             out = EmitSurrogatePair(mUcs4, out);
      344            2543 :           } else if (UNICODE_BYTE_ORDER_MARK != mUcs4 || !mFirst) {
     345                 :             // Output the character unless it is a BOM decoded as the first character
      346            2543 :             *out++ = mUcs4;
     347                 :           }
     348                 :           // reset the cached decoder state for the next sequence
      349            2590 :           mUcs4  = 0;
      350            2590 :           mState = 0;
      351            2590 :           mBytes = 1;
      352            2590 :           mFirst = false;
     353                 :         }
     354                 :       } else {
     355                 :         /* ((0xC0 & (*in)) != 0x80) && (mState != 0)
     356                 :          *
     357                 :          * Incomplete multi-octet sequence. Unconsume this
     358                 :          * octet and return an error condition. Caller is responsible
     359                 :          * for flushing and refilling the buffer and resetting state.
                         :          *
                         :          * If this is the first octet of the buffer (the sequence began in
                         :          * an earlier call), in ends up one before aSrc and *aSrcLength is
                         :          * reported as -1.
     360                 :          */
      361             211 :         in--;
      362             211 :         res = NS_ERROR_ILLEGAL_INPUT;
      363             211 :         break;
     364                 :       }
     365                 :     }
     366                 :   }
     367                 : 
     368                 :   // output not finished, output buffer too short
      369           17210 :   if ((NS_OK == res) && (in < inend) && (out >= outend))
      370               0 :     res = NS_OK_UDEC_MOREOUTPUT;
     371                 : 
     372                 :   // last UCS4 is incomplete, make sure the caller
     373                 :   // returns with properly aligned continuation of the buffer
      374           17210 :   if ((NS_OK == res) && (mState != 0))
      375               4 :     res = NS_OK_UDEC_MOREINPUT;
     376                 : 
      377           17210 :   *aSrcLength = in - aSrc; // octets consumed
      378           17210 :   *aDestLength = out - aDest; // code units written
     379                 : 
      380           17210 :   this->mUcs4 = mUcs4; // write the local copies back so the next call resumes from here
      381           17210 :   this->mState = mState;
      382           17210 :   this->mBytes = mBytes;
      383           17210 :   this->mFirst = mFirst;
     384                 : 
      385           17210 :   return(res);
     386                 : }

Generated by: LCOV version 1.7