LCOV - code coverage report
Current view: directory - toolkit/crashreporter/google-breakpad/src/common - convert_UTF.c (source / functions) Found Hit Coverage
Test: app.info Lines: 247 0 0.0 %
Date: 2012-06-02 Functions: 8 0 0.0 %

       1                 : /*
       2                 :  * Copyright 2001-2004 Unicode, Inc.
       3                 :  *
       4                 :  * Disclaimer
       5                 :  *
       6                 :  * This source code is provided as is by Unicode, Inc. No claims are
       7                 :  * made as to fitness for any particular purpose. No warranties of any
       8                 :  * kind are expressed or implied. The recipient agrees to determine
       9                 :  * applicability of information provided. If this file has been
      10                 :  * purchased on magnetic or optical media from Unicode, Inc., the
      11                 :  * sole remedy for any claim will be exchange of defective media
      12                 :  * within 90 days of receipt.
      13                 :  *
      14                 :  * Limitations on Rights to Redistribute This Code
      15                 :  *
      16                 :  * Unicode, Inc. hereby grants the right to freely use the information
      17                 :  * supplied in this file in the creation of products supporting the
      18                 :  * Unicode Standard, and to make copies of this file in any form
      19                 :  * for internal or external distribution as long as this notice
      20                 :  * remains attached.
      21                 :  */
      22                 : 
      23                 : /* ---------------------------------------------------------------------
      24                 : 
      25                 : Conversions between UTF32, UTF-16, and UTF-8. Source code file.
      26                 : Author: Mark E. Davis, 1994.
      27                 : Rev History: Rick McGowan, fixes & updates May 2001.
      28                 : Sept 2001: fixed const & error conditions per
      29                 : mods suggested by S. Parent & A. Lillich.
      30                 : June 2002: Tim Dodd added detection and handling of incomplete
      31                 : source sequences, enhanced error detection, added casts
      32                 : to eliminate compiler warnings.
      33                 : July 2003: slight mods to back out aggressive FFFE detection.
      34                 : Jan 2004: updated switches in from-UTF8 conversions.
      35                 : Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
      36                 : 
      37                 : See the header file "convert_UTF.h" for complete documentation.
      38                 : 
      39                 : ------------------------------------------------------------------------ */
      40                 : 
      41                 : 
      42                 : #include "convert_UTF.h"
      43                 : #ifdef CVTUTF_DEBUG
      44                 : #include <stdio.h>
      45                 : #endif
      46                 : 
      47                 : static const int halfShift  = 10; /* used for shifting by 10 bits */
      48                 : 
      49                 : static const UTF32 halfBase = 0x0010000UL;
      50                 : static const UTF32 halfMask = 0x3FFUL;
      51                 : 
      52                 : #define UNI_SUR_HIGH_START  (UTF32)0xD800
      53                 : #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
      54                 : #define UNI_SUR_LOW_START   (UTF32)0xDC00
      55                 : #define UNI_SUR_LOW_END     (UTF32)0xDFFF
      56                 : #define false      0
      57                 : #define true        1
      58                 : 
      59                 : /* --------------------------------------------------------------------- */
      60                 : 
      61               0 : ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* sourceEnd,
      62                 :                                       UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
      63               0 :   ConversionResult result = conversionOK;
      64               0 :   const UTF32* source = *sourceStart;
      65               0 :   UTF16* target = *targetStart;
      66               0 :   while (source < sourceEnd) {
      67                 :     UTF32 ch;
      68               0 :     if (target >= targetEnd) {
      69               0 :             result = targetExhausted; break;
      70                 :     }
      71               0 :     ch = *source++;
      72               0 :     if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
      73                 :             /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
      74               0 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
      75               0 :         if (flags == strictConversion) {
      76               0 :           --source; /* return to the illegal value itself */
      77               0 :           result = sourceIllegal;
      78               0 :           break;
      79                 :         } else {
      80               0 :           *target++ = UNI_REPLACEMENT_CHAR;
      81                 :         }
      82                 :             } else {
      83               0 :         *target++ = (UTF16)ch; /* normal case */
      84                 :             }
      85               0 :     } else if (ch > UNI_MAX_LEGAL_UTF32) {
      86               0 :             if (flags == strictConversion) {
      87               0 :         result = sourceIllegal;
      88                 :             } else {
      89               0 :         *target++ = UNI_REPLACEMENT_CHAR;
      90                 :             }
      91                 :     } else {
      92                 :             /* target is a character in range 0x10000 - 0x10FFFF. */
      93               0 :             if (target + 1 >= targetEnd) {
      94               0 :         --source; /* Back up source pointer! */
      95               0 :         result = targetExhausted; break;
      96                 :             }
      97               0 :             ch -= halfBase;
      98               0 :             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
      99               0 :             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
     100                 :     }
     101                 :   }
     102               0 : *sourceStart = source;
     103               0 : *targetStart = target;
     104               0 : return result;
     105                 : }
     106                 : 
     107                 : /* --------------------------------------------------------------------- */
     108                 : 
     109               0 : ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* sourceEnd,
     110                 :                                       UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
     111               0 :   ConversionResult result = conversionOK;
     112               0 :   const UTF16* source = *sourceStart;
     113               0 :   UTF32* target = *targetStart;
     114                 :   UTF32 ch, ch2;
     115               0 :   while (source < sourceEnd) {
     116               0 :     const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
     117               0 :     ch = *source++;
     118                 :     /* If we have a surrogate pair, convert to UTF32 first. */
     119               0 :     if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
     120                 :             /* If the 16 bits following the high surrogate are in the source buffer... */
     121               0 :             if (source < sourceEnd) {
     122               0 :         ch2 = *source;
     123                 :         /* If it's a low surrogate, convert to UTF32. */
     124               0 :         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
     125               0 :           ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
     126               0 :           + (ch2 - UNI_SUR_LOW_START) + halfBase;
     127               0 :           ++source;
     128               0 :         } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
     129               0 :           --source; /* return to the illegal value itself */
     130               0 :           result = sourceIllegal;
     131               0 :           break;
     132                 :         }
     133                 :             } else { /* We don't have the 16 bits following the high surrogate. */
     134               0 :         --source; /* return to the high surrogate */
     135               0 :         result = sourceExhausted;
     136               0 :         break;
     137                 :             }
     138               0 :     } else if (flags == strictConversion) {
     139                 :             /* UTF-16 surrogate values are illegal in UTF-32 */
     140               0 :             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
     141               0 :         --source; /* return to the illegal value itself */
     142               0 :         result = sourceIllegal;
     143               0 :         break;
     144                 :             }
     145                 :     }
     146               0 :     if (target >= targetEnd) {
     147               0 :             source = oldSource; /* Back up source pointer! */
     148               0 :             result = targetExhausted; break;
     149                 :     }
     150               0 :     *target++ = ch;
     151                 :   }
     152               0 :   *sourceStart = source;
     153               0 :   *targetStart = target;
     154                 : #ifdef CVTUTF_DEBUG
     155                 :   if (result == sourceIllegal) {
     156                 :     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
     157                 :     fflush(stderr);
     158                 :   }
     159                 : #endif
     160               0 :   return result;
     161                 : }
     162                 : 
     163                 : /* --------------------------------------------------------------------- */
     164                 : 
     165                 : /*
     166                 :  * Index into the table below with the first byte of a UTF-8 sequence to
     167                 :  * get the number of trailing bytes that are supposed to follow it.
     168                 :  * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
     169                 :  * left as-is for anyone who may want to do such conversion, which was
     170                 :  * allowed in earlier algorithms.
     171                 :  */
     172                 : static const char trailingBytesForUTF8[256] = {
     173                 :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     174                 :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     175                 :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     176                 :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     177                 :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     178                 :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     179                 :   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     180                 :   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
     181                 : };
     182                 : 
     183                 : /*
     184                 :  * Magic values subtracted from a buffer value during UTF8 conversion.
     185                 :  * This table contains as many values as there might be trailing bytes
     186                 :  * in a UTF-8 sequence.
     187                 :  */
     188                 : static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
     189                 :   0x03C82080UL, 0xFA082080UL, 0x82082080UL };
     190                 : 
     191                 : /*
     192                 :  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
     193                 :  * into the first byte, depending on how many bytes follow.  There are
     194                 :  * as many entries in this table as there are UTF-8 sequence types.
     195                 :  * (I.e., one byte sequence, two byte... etc.). Remember that sequences
     196                 :  * for *legal* UTF-8 will be 4 or fewer bytes total.
     197                 :  */
     198                 : static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
     199                 : 
     200                 : /* --------------------------------------------------------------------- */
     201                 : 
     202                 : /* The interface converts a whole buffer to avoid function-call overhead.
     203                 : * Constants have been gathered. Loops & conditionals have been removed as
     204                 : * much as possible for efficiency, in favor of drop-through switches.
     205                 : * (See "Note A" at the bottom of the file for equivalent code.)
     206                 : * If your compiler supports it, the "isLegalUTF8" call can be turned
     207                 : * into an inline function.
     208                 : */
     209                 : 
     210                 : /* --------------------------------------------------------------------- */
     211                 : 
     212               0 : ConversionResult ConvertUTF16toUTF8 (const UTF16** sourceStart, const UTF16* sourceEnd,
     213                 :                                      UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
     214               0 :   ConversionResult result = conversionOK;
     215               0 :   const UTF16* source = *sourceStart;
     216               0 :   UTF8* target = *targetStart;
     217               0 :   while (source < sourceEnd) {
     218                 :     UTF32 ch;
     219               0 :     unsigned short bytesToWrite = 0;
     220               0 :     const UTF32 byteMask = 0xBF;
     221               0 :     const UTF32 byteMark = 0x80;
     222               0 :     const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
     223               0 :     ch = *source++;
     224                 :     /* If we have a surrogate pair, convert to UTF32 first. */
     225               0 :     if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
     226                 :             /* If the 16 bits following the high surrogate are in the source buffer... */
     227               0 :             if (source < sourceEnd) {
     228               0 :         UTF32 ch2 = *source;
     229                 :         /* If it's a low surrogate, convert to UTF32. */
     230               0 :         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
     231               0 :           ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
     232               0 :           + (ch2 - UNI_SUR_LOW_START) + halfBase;
     233               0 :           ++source;
     234               0 :         } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
     235               0 :           --source; /* return to the illegal value itself */
     236               0 :           result = sourceIllegal;
     237               0 :           break;
     238                 :         }
     239                 :             } else { /* We don't have the 16 bits following the high surrogate. */
     240               0 :         --source; /* return to the high surrogate */
     241               0 :         result = sourceExhausted;
     242               0 :         break;
     243                 :             }
     244               0 :     } else if (flags == strictConversion) {
     245                 :             /* UTF-16 surrogate values are illegal in UTF-32 */
     246               0 :             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
     247               0 :         --source; /* return to the illegal value itself */
     248               0 :         result = sourceIllegal;
     249               0 :         break;
     250                 :             }
     251                 :     }
     252                 :     /* Figure out how many bytes the result will require */
     253               0 :     if (ch < (UTF32)0x80) {       bytesToWrite = 1;
     254               0 :     } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
     255               0 :     } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
     256               0 :     } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
     257               0 :     } else {                        bytesToWrite = 3;
     258               0 :       ch = UNI_REPLACEMENT_CHAR;
     259                 :     }
     260                 : 
     261               0 :     target += bytesToWrite;
     262               0 :     if (target > targetEnd) {
     263               0 :             source = oldSource; /* Back up source pointer! */
     264               0 :             target -= bytesToWrite; result = targetExhausted; break;
     265                 :     }
     266               0 :     switch (bytesToWrite) { /* note: everything falls through. */
     267               0 :             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     268               0 :             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     269               0 :             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     270               0 :             case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
     271                 :     }
     272               0 :     target += bytesToWrite;
     273                 :   }
     274               0 : *sourceStart = source;
     275               0 : *targetStart = target;
     276               0 : return result;
     277                 : }
     278                 : 
     279                 : /* --------------------------------------------------------------------- */
     280                 : 
     281                 : /*
     282                 :  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
     283                 :  * This must be called with the length pre-determined by the first byte.
     284                 :  * If not calling this from ConvertUTF8to*, then the length can be set by:
     285                 :  *  length = trailingBytesForUTF8[*source]+1;
     286                 :  * and the sequence is illegal right away if there aren't that many bytes
     287                 :  * available.
     288                 :  * If presented with a length > 4, this returns false.  The Unicode
     289                 :  * definition of UTF-8 goes up to 4-byte sequences.
     290                 :  */
     291                 : 
     292               0 : static Boolean isLegalUTF8(const UTF8 *source, int length) {
     293                 :   UTF8 a;
     294               0 :   const UTF8 *srcptr = source+length;
     295               0 :   switch (length) {
     296               0 :     default: return false;
     297                 :       /* Everything else falls through when "true"... */
     298               0 :     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
     299               0 :     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
     300               0 :     case 2: if ((a = (*--srcptr)) > 0xBF) return false;
     301                 : 
     302               0 :       switch (*source) {
     303                 :         /* no fall-through in this inner switch */
     304               0 :         case 0xE0: if (a < 0xA0) return false; break;
     305               0 :         case 0xED: if (a > 0x9F) return false; break;
     306               0 :         case 0xF0: if (a < 0x90) return false; break;
     307               0 :         case 0xF4: if (a > 0x8F) return false; break;
     308               0 :         default:   if (a < 0x80) return false;
     309                 :       }
     310                 : 
     311               0 :       case 1: if (*source >= 0x80 && *source < 0xC2) return false;
     312                 :   }
     313               0 :   if (*source > 0xF4) return false;
     314               0 :   return true;
     315                 : }
     316                 : 
     317                 : /* --------------------------------------------------------------------- */
     318                 : 
     319                 : /*
     320                 :  * Exported function to return whether a UTF-8 sequence is legal or not.
     321                 :  * This is not used here; it's just exported.
     322                 :  */
     323               0 : Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
     324               0 :   int length = trailingBytesForUTF8[*source]+1;
     325               0 :   if (source+length > sourceEnd) {
     326               0 :     return false;
     327                 :   }
     328               0 :   return isLegalUTF8(source, length);
     329                 : }
     330                 : 
     331                 : /* --------------------------------------------------------------------- */
     332                 : 
     333               0 : ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd,
     334                 :                                      UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
     335               0 :   ConversionResult result = conversionOK;
     336               0 :   const UTF8* source = *sourceStart;
     337               0 :   UTF16* target = *targetStart;
     338               0 :   while (source < sourceEnd) {
     339               0 :     UTF32 ch = 0;
     340               0 :     unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
     341               0 :     if (source + extraBytesToRead >= sourceEnd) {
     342               0 :             result = sourceExhausted; break;
     343                 :     }
     344                 :     /* Do this check whether lenient or strict */
     345               0 :     if (! isLegalUTF8(source, extraBytesToRead+1)) {
     346               0 :             result = sourceIllegal;
     347               0 :             break;
     348                 :     }
     349                 :     /*
     350                 :      * The cases all fall through. See "Note A" below.
     351                 :      */
     352               0 :     switch (extraBytesToRead) {
     353               0 :             case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
     354               0 :             case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
     355               0 :             case 3: ch += *source++; ch <<= 6;
     356               0 :             case 2: ch += *source++; ch <<= 6;
     357               0 :             case 1: ch += *source++; ch <<= 6;
     358               0 :             case 0: ch += *source++;
     359                 :     }
     360               0 :     ch -= offsetsFromUTF8[extraBytesToRead];
     361                 : 
     362               0 :     if (target >= targetEnd) {
     363               0 :             source -= (extraBytesToRead+1); /* Back up source pointer! */
     364               0 :             result = targetExhausted; break;
     365                 :     }
     366               0 :     if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
     367                 :             /* UTF-16 surrogate values are illegal in UTF-32 */
     368               0 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     369               0 :         if (flags == strictConversion) {
     370               0 :           source -= (extraBytesToRead+1); /* return to the illegal value itself */
     371               0 :           result = sourceIllegal;
     372               0 :           break;
     373                 :         } else {
     374               0 :           *target++ = UNI_REPLACEMENT_CHAR;
     375                 :         }
     376                 :             } else {
     377               0 :         *target++ = (UTF16)ch; /* normal case */
     378                 :             }
     379               0 :     } else if (ch > UNI_MAX_UTF16) {
     380               0 :             if (flags == strictConversion) {
     381               0 :         result = sourceIllegal;
     382               0 :         source -= (extraBytesToRead+1); /* return to the start */
     383               0 :         break; /* Bail out; shouldn't continue */
     384                 :             } else {
     385               0 :         *target++ = UNI_REPLACEMENT_CHAR;
     386                 :             }
     387                 :     } else {
     388                 :             /* target is a character in range 0x10000 - 0x10FFFF. */
     389               0 :             if (target + 1 >= targetEnd) {
     390               0 :         source -= (extraBytesToRead+1); /* Back up source pointer! */
     391               0 :         result = targetExhausted; break;
     392                 :             }
     393               0 :             ch -= halfBase;
     394               0 :             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
     395               0 :             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
     396                 :     }
     397                 :   }
     398               0 : *sourceStart = source;
     399               0 : *targetStart = target;
     400               0 : return result;
     401                 : }
     402                 : 
     403                 : /* --------------------------------------------------------------------- */
     404                 : 
     405               0 : ConversionResult ConvertUTF32toUTF8 (const UTF32** sourceStart, const UTF32* sourceEnd,
     406                 :                                      UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
     407               0 :   ConversionResult result = conversionOK;
     408               0 :   const UTF32* source = *sourceStart;
     409               0 :   UTF8* target = *targetStart;
     410               0 :   while (source < sourceEnd) {
     411                 :     UTF32 ch;
     412               0 :     unsigned short bytesToWrite = 0;
     413               0 :     const UTF32 byteMask = 0xBF;
     414               0 :     const UTF32 byteMark = 0x80;
     415               0 :     ch = *source++;
     416               0 :     if (flags == strictConversion ) {
     417                 :             /* UTF-16 surrogate values are illegal in UTF-32 */
     418               0 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     419               0 :         --source; /* return to the illegal value itself */
     420               0 :         result = sourceIllegal;
     421               0 :         break;
     422                 :             }
     423                 :     }
     424                 :     /*
     425                 :      * Figure out how many bytes the result will require. Turn any
     426                 :      * illegally large UTF32 things (> Plane 17) into replacement chars.
     427                 :      */
     428               0 :     if (ch < (UTF32)0x80) {       bytesToWrite = 1;
     429               0 :     } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
     430               0 :     } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
     431               0 :     } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
     432               0 :     } else {                        bytesToWrite = 3;
     433               0 :       ch = UNI_REPLACEMENT_CHAR;
     434               0 :       result = sourceIllegal;
     435                 :     }
     436                 : 
     437               0 :     target += bytesToWrite;
     438               0 :     if (target > targetEnd) {
     439               0 :             --source; /* Back up source pointer! */
     440               0 :             target -= bytesToWrite; result = targetExhausted; break;
     441                 :     }
     442               0 :     switch (bytesToWrite) { /* note: everything falls through. */
     443               0 :             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     444               0 :             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     445               0 :             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     446               0 :             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
     447                 :     }
     448               0 :     target += bytesToWrite;
     449                 :   }
     450               0 : *sourceStart = source;
     451               0 : *targetStart = target;
     452               0 : return result;
     453                 : }
     454                 : 
     455                 : /* --------------------------------------------------------------------- */
     456                 : 
     457               0 : ConversionResult ConvertUTF8toUTF32 (const UTF8** sourceStart, const UTF8* sourceEnd,
     458                 :                                      UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
     459               0 :   ConversionResult result = conversionOK;
     460               0 :   const UTF8* source = *sourceStart;
     461               0 :   UTF32* target = *targetStart;
     462               0 :   while (source < sourceEnd) {
     463               0 :     UTF32 ch = 0;
     464               0 :     unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
     465               0 :     if (source + extraBytesToRead >= sourceEnd) {
     466               0 :             result = sourceExhausted; break;
     467                 :     }
     468                 :     /* Do this check whether lenient or strict */
     469               0 :     if (! isLegalUTF8(source, extraBytesToRead+1)) {
     470               0 :             result = sourceIllegal;
     471               0 :             break;
     472                 :     }
     473                 :     /*
     474                 :      * The cases all fall through. See "Note A" below.
     475                 :      */
     476               0 :     switch (extraBytesToRead) {
     477               0 :             case 5: ch += *source++; ch <<= 6;
     478               0 :             case 4: ch += *source++; ch <<= 6;
     479               0 :             case 3: ch += *source++; ch <<= 6;
     480               0 :             case 2: ch += *source++; ch <<= 6;
     481               0 :             case 1: ch += *source++; ch <<= 6;
     482               0 :             case 0: ch += *source++;
     483                 :     }
     484               0 :     ch -= offsetsFromUTF8[extraBytesToRead];
     485                 : 
     486               0 :     if (target >= targetEnd) {
     487               0 :             source -= (extraBytesToRead+1); /* Back up the source pointer! */
     488               0 :             result = targetExhausted; break;
     489                 :     }
     490               0 :     if (ch <= UNI_MAX_LEGAL_UTF32) {
     491                 :             /*
     492                 :              * UTF-16 surrogate values are illegal in UTF-32, and anything
     493                 :              * over Plane 17 (> 0x10FFFF) is illegal.
     494                 :              */
     495               0 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     496               0 :         if (flags == strictConversion) {
     497               0 :           source -= (extraBytesToRead+1); /* return to the illegal value itself */
     498               0 :           result = sourceIllegal;
     499               0 :           break;
     500                 :         } else {
     501               0 :           *target++ = UNI_REPLACEMENT_CHAR;
     502                 :         }
     503                 :             } else {
     504               0 :         *target++ = ch;
     505                 :             }
     506                 :     } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
     507               0 :             result = sourceIllegal;
     508               0 :             *target++ = UNI_REPLACEMENT_CHAR;
     509                 :     }
     510                 :   }
     511               0 :   *sourceStart = source;
     512               0 :   *targetStart = target;
     513               0 :   return result;
     514                 : }
     515                 : 
     516                 : /* ---------------------------------------------------------------------
     517                 : 
     518                 : Note A.
     519                 : The fall-through switches in UTF-8 reading code save a
     520                 : temp variable, some decrements & conditionals.  The switches
     521                 : are equivalent to the following loop:
     522                 : {
     523                 :   int tmpBytesToRead = extraBytesToRead+1;
     524                 :   do {
     525                 :                 ch += *source++;
     526                 :                 --tmpBytesToRead;
     527                 :                 if (tmpBytesToRead) ch <<= 6;
     528                 :   } while (tmpBytesToRead > 0);
     529                 : }
     530                 : In UTF-8 writing code, the switches on "bytesToWrite" are
     531                 : similarly unrolled loops.
     532                 : 
     533                 : --------------------------------------------------------------------- */

Generated by: LCOV version 1.7