LCOV - code coverage report
Current view: directory - objdir/dist/include - nsUTF8Utils.h (source / functions) Found Hit Coverage
Test: app.info Lines: 244 135 55.3 %
Date: 2012-06-02 Functions: 28 27 96.4 %

       1                 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2                 : /* ***** BEGIN LICENSE BLOCK *****
       3                 :  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
       4                 :  *
       5                 :  * The contents of this file are subject to the Mozilla Public License Version
       6                 :  * 1.1 (the "License"); you may not use this file except in compliance with
       7                 :  * the License. You may obtain a copy of the License at
       8                 :  * http://www.mozilla.org/MPL/
       9                 :  *
      10                 :  * Software distributed under the License is distributed on an "AS IS" basis,
      11                 :  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
      12                 :  * for the specific language governing rights and limitations under the
      13                 :  * License.
      14                 :  *
      15                 :  * The Original Code is mozilla.org code.
      16                 :  *
      17                 :  * The Initial Developer of the Original Code is
      18                 :  * Netscape Communications Corporation.
      19                 :  * Portions created by the Initial Developer are Copyright (C) 2001
      20                 :  * the Initial Developer. All Rights Reserved.
      21                 :  *
      22                 :  * Contributor(s):
      23                 :  *   Peter Annema <jaggernaut@netscape.com> (original author)
      24                 :  *
      25                 :  * Alternatively, the contents of this file may be used under the terms of
      26                 :  * either of the GNU General Public License Version 2 or later (the "GPL"),
      27                 :  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
      28                 :  * in which case the provisions of the GPL or the LGPL are applicable instead
      29                 :  * of those above. If you wish to allow use of your version of this file only
      30                 :  * under the terms of either the GPL or the LGPL, and not to allow others to
      31                 :  * use your version of this file under the terms of the MPL, indicate your
      32                 :  * decision by deleting the provisions above and replace them with the notice
      33                 :  * and other provisions required by the GPL or the LGPL. If you do not delete
      34                 :  * the provisions above, a recipient may use your version of this file under
      35                 :  * the terms of any one of the MPL, the GPL or the LGPL.
      36                 :  *
      37                 :  * ***** END LICENSE BLOCK ***** */
      38                 : #ifndef nsUTF8Utils_h_
      39                 : #define nsUTF8Utils_h_
      40                 : 
      41                 : // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
      42                 : // file will provide signatures for the Mozilla abstract string types. It will
      43                 : // use XPCOM assertion/debugging macros, etc.
      44                 : 
      45                 : #include "nscore.h"
      46                 : #include "mozilla/SSE.h"
      47                 : 
      48                 : #include "nsCharTraits.h"
      49                 : 
      50                 : class UTF8traits
      51                 :   {
      52                 :     public:
      53        69981944 :       static bool isASCII(char c) { return (c & 0x80) == 0x00; }
      54          372914 :       static bool isInSeq(char c) { return (c & 0xC0) == 0x80; }
      55          421568 :       static bool is2byte(char c) { return (c & 0xE0) == 0xC0; }
      56          325000 :       static bool is3byte(char c) { return (c & 0xF0) == 0xE0; }
      57               2 :       static bool is4byte(char c) { return (c & 0xF8) == 0xF0; }
      58               1 :       static bool is5byte(char c) { return (c & 0xFC) == 0xF8; }
      59               1 :       static bool is6byte(char c) { return (c & 0xFE) == 0xFC; }
      60                 :   };
      61                 : 
      62                 : /**
      63                 :  * Extract the next UCS-4 character from the buffer and return it.  The
      64                 :  * pointer passed in is advanced to the start of the next character in the
      65                 :  * buffer.  The err parameter must be non-null: it is set to true if an
      66                 :  * error occurred (0 is returned and the pointer is not advanced) and to
      67                 :  * false otherwise.  Overlong sequences yield the replacement character.
      68                 :  */
      69                 : 
      70                 : class UTF8CharEnumerator
      71                 : {
      72                 : public:
      73        32568304 :   static PRUint32 NextChar(const char **buffer, const char *end,
      74                 :                            bool *err)
      75                 :   {
      76        32568304 :     NS_ASSERTION(buffer && *buffer, "null buffer!");
      77                 : 
      78        32568304 :     const char *p = *buffer;
      79        32568304 :     *err = false;
      80                 : 
      81        32568304 :     if (p >= end)
      82                 :       {
      83               0 :         *err = true;
      84                 : 
      85               0 :         return 0;
      86                 :       }
      87                 : 
      88        32568304 :     char c = *p++;
      89                 : 
      90        32568304 :     if ( UTF8traits::isASCII(c) )
      91                 :       {
      92        32357793 :         *buffer = p;
      93        32357793 :         return c;
      94                 :       }
      95                 : 
      96                 :     PRUint32 ucs4;
      97                 :     PRUint32 minUcs4;
      98          210511 :     PRInt32 state = 0;
      99                 : 
     100          210511 :     if (!CalcState(c, ucs4, minUcs4, state)) {
     101               0 :         NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
     102               0 :         *err = true;
     103                 : 
     104               0 :         return 0;
     105                 :     }
     106                 : 
     107          793781 :     while ( state-- )
     108                 :       {
     109          372759 :         if (p == end)
     110                 :           {
     111               0 :             *err = true;
     112                 : 
     113               0 :             return 0;
     114                 :           }
     115                 : 
     116          372759 :         c = *p++;
     117                 : 
     118          372759 :         if (!AddByte(c, state, ucs4))
     119                 :           {
     120               0 :             *err = true;
     121                 : 
     122               0 :             return 0;
     123                 :           }
     124                 :       }
     125                 : 
     126          210511 :       if ( ucs4 < minUcs4 )
     127                 :         {
     128                 :           // Overlong sequence
     129               0 :           ucs4 = UCS2_REPLACEMENT_CHAR;
     130                 :         }
     131          210511 :       else if ( ucs4 >= 0xD800 &&
     132                 :                 (ucs4 <= 0xDFFF || ucs4 >= UCS_END))
     133                 :         {
     134                 :           // Surrogates and code points outside the Unicode range.
     135               0 :           ucs4 = UCS2_REPLACEMENT_CHAR;
     136                 :         }
     137                 : 
     138          210511 :     *buffer = p;
     139          210511 :     return ucs4;
     140                 :   }
     141                 : 
     142                 : private:
     143          210511 :   static bool CalcState(char c, PRUint32& ucs4, PRUint32& minUcs4,
     144                 :                           PRInt32& state)
     145                 :   {
     146          210511 :     if ( UTF8traits::is2byte(c) )
     147                 :       {
     148           48263 :         ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
     149           48263 :         state = 1;
     150           48263 :         minUcs4 = 0x00000080;
     151                 :       }
     152          162248 :     else if ( UTF8traits::is3byte(c) )
     153                 :       {
     154          162248 :         ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
     155          162248 :         state = 2;
     156          162248 :         minUcs4 = 0x00000800;
     157                 :       }
     158               0 :     else if ( UTF8traits::is4byte(c) )
     159                 :       {
     160               0 :         ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
     161               0 :         state = 3;
     162               0 :         minUcs4 = 0x00010000;
     163                 :       }
     164               0 :     else if ( UTF8traits::is5byte(c) )
     165                 :       {
     166               0 :         ucs4 = (PRUint32(c) << 24) & 0x03000000L;
     167               0 :         state = 4;
     168               0 :         minUcs4 = 0x00200000;
     169                 :       }
     170               0 :     else if ( UTF8traits::is6byte(c) )
     171                 :       {
     172               0 :         ucs4 = (PRUint32(c) << 30) & 0x40000000L;
     173               0 :         state = 5;
     174               0 :         minUcs4 = 0x04000000;
     175                 :       }
     176                 :     else
     177                 :       {
     178               0 :         return false;
     179                 :       }
     180                 : 
     181          210511 :     return true;
     182                 :   }
     183                 : 
     184          372759 :   static bool AddByte(char c, PRInt32 state, PRUint32& ucs4)
     185                 :   {
     186          372759 :     if ( UTF8traits::isInSeq(c) )
     187                 :       {
     188          372759 :         PRInt32 shift = state * 6;
     189          372759 :         ucs4 |= (PRUint32(c) & 0x3F) << shift;
     190          372759 :         return true;
     191                 :       }
     192                 : 
     193               0 :     return false;
     194                 :   }
     195                 : };
     196                 : 
     197                 : 
     198                 : /**
     199                 :  * Extract the next UCS-4 character from the buffer and return it.  The
     200                 :  * pointer passed in is advanced to the start of the next character in the
     201                 :  * buffer.  If non-null, err is set to true on error and false otherwise.
     202                 :  */
     203                 : 
     204                 : 
     205                 : class UTF16CharEnumerator
     206                 : {
     207                 : public:
     208               0 :   static PRUint32 NextChar(const PRUnichar **buffer, const PRUnichar *end,
     209                 :                            bool *err = nsnull)
     210                 :   {
     211               0 :     NS_ASSERTION(buffer && *buffer, "null buffer!");
     212                 : 
     213               0 :     const PRUnichar *p = *buffer;
     214                 : 
     215               0 :     if (p >= end)
     216                 :       {
     217               0 :         NS_ERROR("No input to work with");
     218               0 :         if (err)
     219               0 :           *err = true;
     220                 : 
     221               0 :         return 0;
     222                 :       }
     223                 : 
     224               0 :     PRUnichar c = *p++;
     225                 : 
     226               0 :     if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
     227                 :       {
     228               0 :         if (err)
     229               0 :           *err = false;
     230               0 :         *buffer = p;
     231               0 :         return c;
     232                 :       }
     233               0 :     else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
     234                 :       {
     235               0 :         if (p == end)
     236                 :           {
     237                 :             // Found a high surrogate at the end of the buffer. Flag this
     238                 :             // as an error and return the Unicode replacement
     239                 :             // character 0xFFFD.
     240                 : 
     241               0 :             NS_WARNING("Unexpected end of buffer after high surrogate");
     242                 : 
     243               0 :             if (err)
     244               0 :               *err = true;
     245               0 :             *buffer = p;
     246               0 :             return 0xFFFD;
     247                 :           }
     248                 : 
     249                 :         // D800- DBFF - High Surrogate
     250               0 :         PRUnichar h = c;
     251                 : 
     252               0 :         c = *p++;
     253                 : 
     254               0 :         if (NS_IS_LOW_SURROGATE(c))
     255                 :           {
     256                 :             // DC00- DFFF - Low Surrogate
     257                 :             // N = (H - D800) *400 + 10000 + (L - DC00)
     258               0 :             PRUint32 ucs4 = SURROGATE_TO_UCS4(h, c);
     259               0 :             if (err)
     260               0 :               *err = false;
     261               0 :             *buffer = p;
     262               0 :             return ucs4;
     263                 :           }
     264                 :         else
     265                 :           {
     266                 :             // Found a high surrogate followed by something other than
     267                 :             // a low surrogate. Flag this as an error and return the
     268                 :             // Unicode replacement character 0xFFFD.  Note that the
     269                 :             // pointer to the next character points to the second 16-bit
     270                 :             // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
     271                 :             // only the first code unit of an illegal sequence must be
     272                 :             // treated as an illegally terminated code unit sequence
     273                 :             // (also Chapter 3 D91, "isolated [not paired and ill-formed]
     274                 :             // UTF-16 code units in the range D800..DFFF are ill-formed").
     275               0 :             NS_WARNING("got a High Surrogate but no low surrogate");
     276                 : 
     277               0 :             if (err)
     278               0 :               *err = true;
     279               0 :             *buffer = p - 1;
     280               0 :             return 0xFFFD;
     281                 :           }
     282                 :       }
     283                 :     else // U+DC00 - U+DFFF
     284                 :       {
     285                 :         // DC00- DFFF - Low Surrogate
     286                 : 
     287                 :         // Found a low surrogate w/o a preceding high surrogate. Flag
     288                 :         // this as an error and return the Unicode replacement
     289                 :         // character 0xFFFD.
     290                 : 
     291               0 :         NS_WARNING("got a low Surrogate but no high surrogate");
     292               0 :         if (err)
     293               0 :           *err = true;
     294               0 :         *buffer = p;
     295               0 :         return 0xFFFD;
     296                 :       }
     297                 : 
     298                 :     if (err)
     299                 :       *err = true;
     300                 :     return 0;
     301                 :   }
     302                 : };
     303                 : 
     304                 : 
     305                 : /**
     306                 :  * A character sink (see |copy_string| in nsAlgorithm.h) for converting
     307                 :  * UTF-8 to UTF-16
     308                 :  */
     309                 : class ConvertUTF8toUTF16
     310                 :   {
     311                 :     public:
     312                 :       typedef char      value_type;
     313                 :       typedef PRUnichar buffer_type;
     314                 : 
     315          218840 :     ConvertUTF8toUTF16( buffer_type* aBuffer )
     316          218840 :         : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false) {}
     317                 : 
     318          218840 :     size_t Length() const { return mBuffer - mStart; }
     319                 : 
     320          223426 :     bool ErrorEncountered() const { return mErrorEncountered; }
     321                 : 
     322                 :     void NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
     323                 :       {
     324          218840 :         if ( mErrorEncountered )
     325                 :           return;
     326                 : 
     327                 :         // algorithm assumes utf8 units won't
     328                 :         // be spread across fragments
     329          218840 :         const value_type* p = start;
     330          218840 :         const value_type* end = start + N;
     331          218840 :         buffer_type* out = mBuffer;
     332        32214847 :         for ( ; p != end /* && *p */; )
     333                 :           {
     334                 :             bool err;
     335        31996007 :             PRUint32 ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
     336                 : 
     337        31996007 :             if ( err )
     338                 :               {
     339               0 :                 mErrorEncountered = true;
     340               0 :                 mBuffer = out;
     341                 :                 return;
     342                 :               }
     343                 : 
     344        31996007 :             if ( ucs4 >= PLANE1_BASE )
     345                 :               {
     346               0 :                 *out++ = (buffer_type)H_SURROGATE(ucs4);
     347               0 :                 *out++ = (buffer_type)L_SURROGATE(ucs4);
     348                 :               }
     349                 :             else
     350                 :               {
     351        31996007 :                 *out++ = ucs4;
     352                 :               }
     353                 :           }
     354          218840 :         mBuffer = out;
     355                 :       }
     356                 : 
     357          102931 :     void write_terminator()
     358                 :       {
     359          102931 :         *mBuffer = buffer_type(0);
     360          102931 :       }
     361                 : 
     362                 :     private:
     363                 :       buffer_type* const mStart;
     364                 :       buffer_type* mBuffer;
     365                 :       bool mErrorEncountered;
     366                 :   };
     367                 : 
     368                 : /**
     369                 :  * A character sink (see |copy_string| in nsAlgorithm.h) for computing
     370                 :  * the length of the UTF-16 string equivalent to a UTF-8 string.
     371                 :  */
     372                 : class CalculateUTF8Length
     373                 :   {
     374                 :     public:
     375                 :       typedef char value_type;
     376                 : 
     377          215601 :     CalculateUTF8Length() : mLength(0), mErrorEncountered(false) { }
     378                 : 
     379          421207 :     size_t Length() const { return mLength; }
     380                 : 
     381                 :     void NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
     382                 :       {
     383                 :           // ignore any further requests
     384          215601 :         if ( mErrorEncountered )
     385                 :             return;
     386                 : 
     387                 :         // algorithm assumes utf8 units won't
     388                 :         // be spread across fragments
     389          215601 :         const value_type* p = start;
     390          215601 :         const value_type* end = start + N;
     391         9265874 :         for ( ; p < end /* && *p */; ++mLength )
     392                 :           {
     393         9265874 :             if ( UTF8traits::isASCII(*p) )
     394         9055586 :                 p += 1;
     395          210288 :             else if ( UTF8traits::is2byte(*p) )
     396           48261 :                 p += 2;
     397          162027 :             else if ( UTF8traits::is3byte(*p) )
     398          162027 :                 p += 3;
     399               0 :             else if ( UTF8traits::is4byte(*p) ) {
     400                 :                 // Because a UTF-8 sequence of 4 bytes represents a codepoint
     401                 :                 // greater than 0xFFFF, it will become a surrogate pair in the
     402                 :                 // UTF-16 string, so add 1 more to mLength.
     403                 :                 // This doesn't happen with is5byte and is6byte because they
     404                 :                 // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
     405                 :                 // converted to a single replacement character.
     406                 : 
     407                 :                 // However, there is one case when a 4 byte UTF-8 sequence will
     408                 :                 // only generate 2 UTF-16 bytes. If we have a properly encoded
     409                 :                 // sequence, but with an invalid value (too small or too big),
     410                 :                 // that will result in a replacement character being written.
     411                 :                 // This replacement character is encoded as just 1 single
     412                 :                 // UTF-16 character, which is 2 bytes.
     413                 : 
     414                 :                 // The below code therefore only adds 1 to mLength if the UTF8
     415                 :                 // data will produce a decoded character which is greater than
     416                 :                 // or equal to 0x010000 and less than 0x110000.
     417                 : 
     418                 :                 // A 4byte UTF8 character is encoded as
     419                 :                 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     420                 :                 // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
     421                 :                 // map to bit 17-21 in the final result. If these bits are
     422                 :                 // between 0x01 and 0x11, that means that the final result is
     423                 :                 // between 0x010000 and 0x110000. The below code reads these
     424                 :                 // bits out and assigns them to c, but shifted up 4 bits to
     425                 :                 // avoid having to shift twice.
     426                 : 
     427                 :                 // It doesn't matter what we do in the case where p + 4 > end
     428                 :                 // since no UTF16 characters will be written in that case by
     429                 :                 // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
     430                 :                 // any of the surrogate bits are wrong since no UTF16
     431                 :                 // characters will be written in that case either.
     432                 : 
     433               0 :                 if (p + 4 <= end) {
     434               0 :                   PRUint32 c = ((PRUint32)(p[0] & 0x07)) << 6 |
     435               0 :                                ((PRUint32)(p[1] & 0x30));
     436               0 :                   if (c >= 0x010 && c < 0x110)
     437               0 :                     ++mLength;
     438                 :                 }
     439                 : 
     440               0 :                 p += 4;
     441                 :             }
     442               0 :             else if ( UTF8traits::is5byte(*p) )
     443               0 :                 p += 5;
     444               0 :             else if ( UTF8traits::is6byte(*p) )
     445               0 :                 p += 6;
     446                 :             else // error
     447                 :               {
     448               0 :                 ++mLength; // to account for the decrement below
     449                 :                 break;
     450                 :               }
     451                 :           }
     452          215601 :         if ( p != end )
     453                 :           {
     454               0 :             NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
     455               0 :             --mLength; // The last multi-byte char wasn't complete, discard it.
     456               0 :             mErrorEncountered = true;
     457                 :           }
     458                 :       }
     459                 : 
     460                 :     private:
     461                 :       size_t mLength;
     462                 :       bool mErrorEncountered;
     463                 :   };
     464                 : 
     465                 : /**
     466                 :  * A character sink (see |copy_string| in nsAlgorithm.h) for
     467                 :  * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
     468                 :  * (0xEFBFBD in UTF-8).
     469                 :  */
     470                 : class ConvertUTF16toUTF8
     471                 :   {
     472                 :     public:
     473                 :       typedef PRUnichar value_type;
     474                 :       typedef char      buffer_type;
     475                 : 
     476                 :     // The error handling here is more lenient than that in
     477                 :     // |ConvertUTF8toUTF16|, but it's that way for backwards
     478                 :     // compatibility.
     479                 : 
     480         1379912 :     ConvertUTF16toUTF8( buffer_type* aBuffer )
     481         1379912 :         : mStart(aBuffer), mBuffer(aBuffer) {}
     482                 : 
     483         1379912 :     size_t Size() const { return mBuffer - mStart; }
     484                 : 
     485                 :     void NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
     486                 :       {
     487         1379912 :         buffer_type *out = mBuffer; // gcc isn't smart enough to do this!
     488                 : 
     489        47953169 :         for (const value_type *p = start, *end = start + N; p < end; ++p )
     490                 :           {
     491        46573257 :             value_type c = *p;
     492        46573257 :             if (! (c & 0xFF80)) // U+0000 - U+007F
     493                 :               {
     494        46410211 :                 *out++ = (char)c;
     495                 :               }
     496          163046 :             else if (! (c & 0xF800)) // U+0080 - U+07FF
     497                 :               {
     498           27864 :                 *out++ = 0xC0 | (char)(c >> 6);
     499           27864 :                 *out++ = 0x80 | (char)(0x003F & c);
     500                 :               }
     501          135182 :             else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
     502                 :               {
     503          135176 :                 *out++ = 0xE0 | (char)(c >> 12);
     504          135176 :                 *out++ = 0x80 | (char)(0x003F & (c >> 6));
     505          135176 :                 *out++ = 0x80 | (char)(0x003F & c );
     506                 :               }
     507               6 :             else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
     508                 :               {
     509                 :                 // D800- DBFF - High Surrogate
     510               6 :                 value_type h = c;
     511                 : 
     512               6 :                 ++p;
     513               6 :                 if (p == end)
     514                 :                   {
     515                 :                     // Treat broken characters as the Unicode
     516                 :                     // replacement character 0xFFFD (0xEFBFBD in
     517                 :                     // UTF-8)
     518               0 :                     *out++ = '\xEF';
     519               0 :                     *out++ = '\xBF';
     520               0 :                     *out++ = '\xBD';
     521                 : 
     522               0 :                     NS_WARNING("String ending in half a surrogate pair!");
     523                 : 
     524                 :                     break;
     525                 :                   }
     526               6 :                 c = *p;
     527                 : 
     528               6 :                 if (NS_IS_LOW_SURROGATE(c))
     529                 :                   {
     530                 :                     // DC00- DFFF - Low Surrogate
     531                 :                     // N = (H - D800) *400 + 10000 + ( L - DC00 )
     532               6 :                     PRUint32 ucs4 = SURROGATE_TO_UCS4(h, c);
     533                 : 
     534                 :                     // 0001 0000-001F FFFF
     535               6 :                     *out++ = 0xF0 | (char)(ucs4 >> 18);
     536               6 :                     *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
     537               6 :                     *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
     538               6 :                     *out++ = 0x80 | (char)(0x003F & ucs4);
     539                 :                   }
     540                 :                 else
     541                 :                   {
     542                 :                     // Treat broken characters as the Unicode
     543                 :                     // replacement character 0xFFFD (0xEFBFBD in
     544                 :                     // UTF-8)
     545               0 :                     *out++ = '\xEF';
     546               0 :                     *out++ = '\xBF';
     547               0 :                     *out++ = '\xBD';
     548                 : 
     549                 :                     // The pointer to the next character points to the second
     550                 :                     // 16-bit value, not beyond it, as per Unicode 5.0.0
     551                 :                     // Chapter 3 C10, only the first code unit of an illegal
     552                 :                     // sequence must be treated as an illegally terminated
     553                 :                     // code unit sequence (also Chapter 3 D91, "isolated [not
     554                 :                     // paired and ill-formed] UTF-16 code units in the range
     555                 :                     // D800..DFFF are ill-formed").
     556               0 :                     p--;
     557                 : 
     558               0 :                     NS_WARNING("got a High Surrogate but no low surrogate");
     559                 :                   }
     560                 :               }
     561                 :             else // U+DC00 - U+DFFF
     562                 :               {
     563                 :                 // Treat broken characters as the Unicode replacement
     564                 :                 // character 0xFFFD (0xEFBFBD in UTF-8)
     565               0 :                 *out++ = '\xEF';
     566               0 :                 *out++ = '\xBF';
     567               0 :                 *out++ = '\xBD';
     568                 : 
     569                 :                 // DC00- DFFF - Low Surrogate
     570               0 :                 NS_WARNING("got a low Surrogate but no high surrogate");
     571                 :               }
     572                 :           }
     573                 : 
     574         1379912 :         mBuffer = out;
     575                 :       }
     576                 : 
     577            2088 :     void write_terminator()  // stores a zero buffer_type at the current write position
     578                 :       {
     579            2088 :         *mBuffer = buffer_type(0);  // mBuffer itself is not advanced past the terminator
     580            2088 :       }
     581                 : 
     582                 :     private:
     583                 :       buffer_type* const mStart;
     584                 :       buffer_type* mBuffer;
     585                 :   };
     586                 : 
     587                 : /**
     588                 :  * A character sink (see |copy_string| in nsAlgorithm.h) for computing
     589                 :  * the number of bytes a UTF-16 string would occupy in UTF-8. Each unpaired
     590                 :  * surrogate is counted as U+FFFD (0xEFBFBD in UTF-8, i.e. 3 bytes).
     591                 :  */
     592                 : class CalculateUTF8Size
     593                 :   {
     594                 :     public:
     595                 :       typedef PRUnichar value_type;
     596                 : 
     597         1231520 :     CalculateUTF8Size()
     598         1231520 :       : mSize(0) { }  // the byte count starts at zero
     599                 : 
     600         1233608 :     size_t Size() const { return mSize; }  // total bytes accumulated by the write() calls so far
     601                 : 
     602                 :     void NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )  // adds the UTF-8 length of the N code units at |start| to mSize
     603                 :       {
     604                 :         // Assume UTF-16 surrogate pairs won't be spread across fragments; a high surrogate that ends a fragment is counted as U+FFFD.
     605        47652730 :         for (const value_type *p = start, *end = start + N; p < end; ++p )
     606                 :           {
     607        46421210 :             value_type c = *p;
     608        46421210 :             if (! (c & 0xFF80)) // U+0000 - U+007F
     609        46408790 :               mSize += 1;
     610           12420 :             else if (! (c & 0xF800)) // U+0080 - U+07FF
     611             865 :               mSize += 2;
     612           11555 :             else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
     613           11549 :               mSize += 3;
     614               6 :             else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF (high surrogate)
     615                 :               {
     616               6 :                 ++p;  // step to the code unit that follows the high surrogate
     617               6 :                 if (p == end)
     618                 :                   {
     619                 :                     // Treat broken characters as the Unicode
     620                 :                     // replacement character 0xFFFD (0xEFBFBD in
     621                 :                     // UTF-8)
     622               0 :                     mSize += 3;
     623                 : 
     624               0 :                     NS_WARNING("String ending in half a surrogate pair!");
     625                 : 
     626                 :                     break;  // p == end here, so there is nothing left to examine
     627                 :                   }
     628               6 :                 c = *p;
     629                 : 
     630               6 :                 if (0xDC00 == (0xFC00 & c))  // U+DC00 - U+DFFF (low surrogate) completes the pair
     631               6 :                   mSize += 4;  // a valid surrogate pair takes 4 bytes in UTF-8
     632                 :                 else
     633                 :                   {
     634                 :                     // Treat broken characters as the Unicode
     635                 :                     // replacement character 0xFFFD (0xEFBFBD in
     636                 :                     // UTF-8)
     637               0 :                     mSize += 3;
     638                 : 
     639                 :                     // The next code unit is the second 16-bit value, not
     640                 :                     // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
     641                 :                     // only the first code unit of an illegal sequence must
     642                 :                     // be treated as an illegally terminated code unit
     643                 :                     // sequence (also Chapter 3 D91, "isolated [not paired and
     644                 :                     // ill-formed] UTF-16 code units in the range D800..DFFF
     645                 :                     // are ill-formed").
     646               0 :                     p--;  // undo the ++p above so the loop's own ++p revisits this code unit
     647                 : 
     648               0 :                     NS_WARNING("got a high Surrogate but no low surrogate");
     649                 :                   }
     650                 :               }
     651                 :             else // U+DC00 - U+DFFF
     652                 :               {
     653                 :                 // Treat broken characters as the Unicode replacement
     654                 :                 // character 0xFFFD (0xEFBFBD in UTF-8)
     655               0 :                 mSize += 3;
     656                 : 
     657               0 :                 NS_WARNING("got a low Surrogate but no high surrogate");
     658                 :               }
     659                 :           }
     660                 :       }
     661                 : 
     662                 :     private:
     663                 :       size_t mSize;  // running UTF-8 byte count
     664                 :   };
     665                 : 
     666                 : #ifdef MOZILLA_INTERNAL_API
     667                 : /**
     668                 :  * A character sink that performs a |reinterpret_cast|-style conversion
     669                 :  * from char to PRUnichar: the scalar path widens each byte, read as unsigned char, to one PRUnichar.
     670                 :  */
     671                 : class LossyConvertEncoding8to16
     672                 :   {
     673                 :     public:
     674                 :       typedef char      value_type;
     675                 :       typedef char      input_type;
     676                 :       typedef PRUnichar output_type;
     677                 : 
     678                 :     public:
     679         1352778 :       LossyConvertEncoding8to16( PRUnichar* aDestination ) :
     680         1352778 :         mDestination(aDestination) { }  // output is written starting at aDestination
     681                 : 
     682                 :       void
     683         1352778 :       write( const char* aSource, PRUint32 aSourceLength )  // converts aSourceLength chars starting at aSource
     684                 :         {
     685                 : #ifdef MOZILLA_MAY_SUPPORT_SSE2
     686         1352778 :           if (mozilla::supports_sse2())  // hand the whole run to write_sse2 when supports_sse2() reports true
     687                 :             {
     688         1352778 :               write_sse2(aSource, aSourceLength);
     689         1352778 :               return;
     690                 :             }
     691                 : #endif
     692               0 :           const char* done_writing = aSource + aSourceLength;  // scalar fallback: one past the last input char
     693               0 :           while ( aSource < done_writing )
     694               0 :             *mDestination++ = (PRUnichar)(unsigned char)(*aSource++);  // via unsigned char so bytes >= 0x80 are not sign-extended
     695                 :         }
     696                 : 
     697                 :       void
     698                 :       write_sse2( const char* aSource, PRUint32 aSourceLength );  // defined elsewhere. NOTE(review): declared unconditionally, unlike LossyConvertEncoding16to8 which guards it with MOZILLA_MAY_SUPPORT_SSE2
     699                 : 
     700                 :       void
     701          768686 :       write_terminator()
     702                 :         {
     703          768686 :           *mDestination = (PRUnichar)(0);  // zero code unit at the current position; mDestination is not advanced
     704          768686 :         }
     705                 : 
     706                 :     private:
     707                 :       PRUnichar* mDestination;  // next output position
     708                 :   };
     709                 : 
     710                 : /**
     711                 :  * A character sink that performs a |reinterpret_cast|-style conversion
     712                 :  * from PRUnichar to char: the scalar path narrows each code unit with a plain (char) cast, no range check.
     713                 :  */
     714                 : class LossyConvertEncoding16to8
     715                 :   {
     716                 :     public:
     717                 :       typedef PRUnichar value_type;
     718                 :       typedef PRUnichar input_type;
     719                 :       typedef char      output_type;
     720                 : 
     721           26553 :       LossyConvertEncoding16to8( char* aDestination ) : mDestination(aDestination) { }  // output is written starting at aDestination
     722                 : 
     723                 :       void
     724           26553 :       write( const PRUnichar* aSource, PRUint32 aSourceLength)  // converts aSourceLength code units starting at aSource
     725                 :         {
     726                 : #ifdef MOZILLA_MAY_SUPPORT_SSE2
     727           26553 :           if (mozilla::supports_sse2())  // hand the whole run to write_sse2 when supports_sse2() reports true
     728                 :             {
     729           26553 :               write_sse2(aSource, aSourceLength);
     730           26553 :               return;
     731                 :             }
     732                 : #endif
     733               0 :             const PRUnichar* done_writing = aSource + aSourceLength;  // scalar fallback: one past the last input code unit
     734               0 :             while ( aSource < done_writing )
     735               0 :               *mDestination++ = (char)(*aSource++);  // narrowing cast; code units that do not fit in a char are not checked
     736                 :         }
     737                 : 
     738                 : #ifdef MOZILLA_MAY_SUPPORT_SSE2
     739                 :       void
     740                 :       write_sse2( const PRUnichar* aSource, PRUint32 aSourceLength );  // SSE2 variant, defined elsewhere
     741                 : #endif
     742                 : 
     743                 :       void
     744              36 :       write_terminator()
     745                 :         {
     746              36 :           *mDestination = '\0';  // NUL at the current position; mDestination is not advanced
     747              36 :         }
     748                 : 
     749                 :     private:
     750                 :       char *mDestination;  // next output position
     751                 :   };
     752                 : #endif // MOZILLA_INTERNAL_API
     753                 : 
     754                 : #endif /* !defined(nsUTF8Utils_h_) */

Generated by: LCOV version 1.7