LCOV - code coverage report
Current view: directory - gfx/cairo/cairo/src - cairo-unicode.c (source / functions) Found Hit Coverage
Test: app.info Lines: 135 0 0.0 %
Date: 2012-06-02 Functions: 6 0 0.0 %

       1                 : /* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
       2                 : /* cairo - a vector graphics library with display and print output
       3                 :  *
       4                 :  * The code in this file is derived from GLib's gutf8.c and
       5                 :  *   ultimately from libunicode. It is relicensed under the
       6                 :  *   dual LGPL/MPL with permission of the original authors.
       7                 :  *
       8                 :  * Copyright © 1999 Tom Tromey
       9                 :  * Copyright © 2005 Red Hat, Inc
      10                 :  *
      11                 :  * This library is free software; you can redistribute it and/or
      12                 :  * modify it either under the terms of the GNU Lesser General Public
      13                 :  * License version 2.1 as published by the Free Software Foundation
      14                 :  * (the "LGPL") or, at your option, under the terms of the Mozilla
      15                 :  * Public License Version 1.1 (the "MPL"). If you do not alter this
      16                 :  * notice, a recipient may use your version of this file under either
      17                 :  * the MPL or the LGPL.
      18                 :  *
      19                 :  * You should have received a copy of the LGPL along with this library
      20                 :  * in the file COPYING-LGPL-2.1; if not, write to the Free Software
      21                 :  * Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA
      22                 :  * You should have received a copy of the MPL along with this library
      23                 :  * in the file COPYING-MPL-1.1
      24                 :  *
      25                 :  * The contents of this file are subject to the Mozilla Public License
      26                 :  * Version 1.1 (the "License"); you may not use this file except in
      27                 :  * compliance with the License. You may obtain a copy of the License at
      28                 :  * http://www.mozilla.org/MPL/
      29                 :  *
      30                 :  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
      31                 :  * OF ANY KIND, either express or implied. See the LGPL or the MPL for
      32                 :  * the specific language governing rights and limitations.
      33                 :  *
      34                 :  * The Original Code is the cairo graphics library.
      35                 :  *
      36                 :  * The Initial Developer of the Original Code is Tom Tromey.
      37                 :  *  and Red Hat, Inc.
      38                 :  *
      39                 :  * Contributor(s):
      40                 :  *      Owen Taylor <otaylor@redhat.com>
      41                 :  */
      42                 : 
      43                 : #include "cairoint.h"
      44                 : #include "cairo-error-private.h"
      45                 : 
      46                 : #define UTF8_COMPUTE(Char, Mask, Len)                                         \
      47                 :   if (Char < 128)                                                          \
      48                 :     {                                                                         \
      49                 :       Len = 1;                                                                \
      50                 :       Mask = 0x7f;                                                            \
      51                 :     }                                                                         \
      52                 :   else if ((Char & 0xe0) == 0xc0)                                         \
      53                 :     {                                                                         \
      54                 :       Len = 2;                                                                \
      55                 :       Mask = 0x1f;                                                            \
      56                 :     }                                                                         \
      57                 :   else if ((Char & 0xf0) == 0xe0)                                         \
      58                 :     {                                                                         \
      59                 :       Len = 3;                                                                \
      60                 :       Mask = 0x0f;                                                            \
      61                 :     }                                                                         \
      62                 :   else if ((Char & 0xf8) == 0xf0)                                         \
      63                 :     {                                                                         \
      64                 :       Len = 4;                                                                \
      65                 :       Mask = 0x07;                                                            \
      66                 :     }                                                                         \
      67                 :   else if ((Char & 0xfc) == 0xf8)                                         \
      68                 :     {                                                                         \
      69                 :       Len = 5;                                                                \
      70                 :       Mask = 0x03;                                                            \
      71                 :     }                                                                         \
      72                 :   else if ((Char & 0xfe) == 0xfc)                                         \
      73                 :     {                                                                         \
      74                 :       Len = 6;                                                                \
      75                 :       Mask = 0x01;                                                            \
      76                 :     }                                                                         \
      77                 :   else                                                                        \
      78                 :     Len = -1;
      79                 : 
      80                 : #define UTF8_LENGTH(Char)              \
      81                 :   ((Char) < 0x80 ? 1 :                 \
      82                 :    ((Char) < 0x800 ? 2 :               \
      83                 :     ((Char) < 0x10000 ? 3 :            \
      84                 :      ((Char) < 0x200000 ? 4 :          \
      85                 :       ((Char) < 0x4000000 ? 5 : 6)))))
      86                 : 
      87                 : #define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
      88                 :   (Result) = (Chars)[0] & (Mask);                                         \
      89                 :   for ((Count) = 1; (Count) < (Len); ++(Count))                                    \
      90                 :     {                                                                         \
      91                 :       if (((Chars)[(Count)] & 0xc0) != 0x80)                                      \
      92                 :         {                                                                     \
      93                 :           (Result) = -1;                                                      \
      94                 :           break;                                                              \
      95                 :         }                                                                     \
      96                 :       (Result) <<= 6;                                                           \
      97                 :       (Result) |= ((Chars)[(Count)] & 0x3f);                                      \
      98                 :     }
      99                 : 
     100                 : #define UNICODE_VALID(Char)                   \
     101                 :     ((Char) < 0x110000 &&                     \
     102                 :      (((Char) & 0xFFFFF800) != 0xD800) &&     \
     103                 :      ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
     104                 :      ((Char) & 0xFFFE) != 0xFFFE)
     105                 : 
     106                 : static const char utf8_skip_data[256] = {
     107                 :     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     108                 :     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     109                 :     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     110                 :     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     111                 :     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     112                 :     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     113                 :     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
     114                 :     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
     115                 : };
     116                 : 
     117                 : #define UTF8_NEXT_CHAR(p) ((p) + utf8_skip_data[*(unsigned char *)(p)])
     118                 : 
     119                 : /* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
     120                 :  * If @p does not point to a valid UTF-8 encoded character, results are
     121                 :  * undefined.
     122                 :  **/
     123                 : static uint32_t
     124               0 : _utf8_get_char (const unsigned char *p)
     125                 : {
     126               0 :     int i, mask = 0, len;
     127                 :     uint32_t result;
     128               0 :     unsigned char c = (unsigned char) *p;
     129                 : 
     130               0 :     UTF8_COMPUTE (c, mask, len);
     131               0 :     if (len == -1)
     132               0 :         return (uint32_t)-1;
     133               0 :     UTF8_GET (result, p, i, mask, len);
     134                 : 
     135               0 :     return result;
     136                 : }
     137                 : 
     138                 : /* Like _utf8_get_char, but take a maximum length
     139                 :  * and return (uint32_t)-2 on incomplete trailing character
     140                 :  */
     141                 : static uint32_t
     142               0 : _utf8_get_char_extended (const unsigned char *p,
     143                 :                          long                 max_len)
     144                 : {
     145                 :     int i, len;
     146               0 :     uint32_t wc = (unsigned char) *p;
     147                 : 
     148               0 :     if (wc < 0x80) {
     149               0 :         return wc;
     150               0 :     } else if (wc < 0xc0) {
     151               0 :         return (uint32_t)-1;
     152               0 :     } else if (wc < 0xe0) {
     153               0 :         len = 2;
     154               0 :         wc &= 0x1f;
     155               0 :     } else if (wc < 0xf0) {
     156               0 :         len = 3;
     157               0 :         wc &= 0x0f;
     158               0 :     } else if (wc < 0xf8) {
     159               0 :         len = 4;
     160               0 :         wc &= 0x07;
     161               0 :     } else if (wc < 0xfc) {
     162               0 :         len = 5;
     163               0 :         wc &= 0x03;
     164               0 :     } else if (wc < 0xfe) {
     165               0 :         len = 6;
     166               0 :         wc &= 0x01;
     167                 :     } else {
     168               0 :         return (uint32_t)-1;
     169                 :     }
     170                 : 
     171               0 :     if (max_len >= 0 && len > max_len) {
     172               0 :         for (i = 1; i < max_len; i++) {
     173               0 :             if ((((unsigned char *)p)[i] & 0xc0) != 0x80)
     174               0 :                 return (uint32_t)-1;
     175                 :         }
     176               0 :         return (uint32_t)-2;
     177                 :     }
     178                 : 
     179               0 :     for (i = 1; i < len; ++i) {
     180               0 :         uint32_t ch = ((unsigned char *)p)[i];
     181                 : 
     182               0 :         if ((ch & 0xc0) != 0x80) {
     183               0 :             if (ch)
     184               0 :                 return (uint32_t)-1;
     185                 :             else
     186               0 :                 return (uint32_t)-2;
     187                 :         }
     188                 : 
     189               0 :         wc <<= 6;
     190               0 :         wc |= (ch & 0x3f);
     191                 :     }
     192                 : 
     193               0 :     if (UTF8_LENGTH(wc) != len)
     194               0 :         return (uint32_t)-1;
     195                 : 
     196               0 :     return wc;
     197                 : }
     198                 : 
     199                 : /**
     200                 :  * _cairo_utf8_get_char_validated:
     201                 :  * @p: a UTF-8 string
     202                 :  * @unicode: location to store one Unicode character
     203                 :  *
     204                 :  * Decodes the first character of a valid UTF-8 string, and returns
     205                 :  * the number of bytes consumed.
     206                 :  *
     207                 :  * Note that the string should be valid.  Do not use this without
     208                 :  * validating the string first.
     209                 :  *
     210                 :  * Returns: the number of bytes forming the character returned.
     211                 :  **/
     212                 : int
     213               0 : _cairo_utf8_get_char_validated (const char *p,
     214                 :                                 uint32_t   *unicode)
     215                 : {
     216               0 :     int i, mask = 0, len;
     217                 :     uint32_t result;
     218               0 :     unsigned char c = (unsigned char) *p;
     219                 : 
     220               0 :     UTF8_COMPUTE (c, mask, len);
     221               0 :     if (len == -1) {
     222               0 :         if (unicode)
     223               0 :             *unicode = (uint32_t)-1;
     224               0 :         return 1;
     225                 :     }
     226               0 :     UTF8_GET (result, p, i, mask, len);
     227                 : 
     228               0 :     if (unicode)
     229               0 :         *unicode = result;
     230               0 :     return len;
     231                 : }
     232                 : 
     233                 : /**
     234                 :  * _cairo_utf8_to_ucs4:
     235                 :  * @str: an UTF-8 string
     236                 :  * @len: length of @str in bytes, or -1 if it is nul-terminated.
     237                 :  *   If @len is supplied and the string has an embedded nul
     238                 :  *   byte, only the portion before the nul byte is converted.
     239                 :  * @result: location to store a pointer to a newly allocated UTF-32
     240                 :  *   string (always native endian), or %NULL. Free with free(). A 0
     241                 :  *   word will be written after the last character.
     242                 :  * @items_written: location to store number of 32-bit words
     243                 :  *   written. (Not including the trailing 0)
     244                 :  *
     245                 :  * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode
     246                 :  * with 1 32-bit word per character. The string is validated to
     247                 :  * consist entirely of valid Unicode characters.
     248                 :  *
     249                 :  * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
     250                 :  *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
     251                 :  *   invalid sequence was found.
     252                 :  **/
     253                 : cairo_status_t
     254               0 : _cairo_utf8_to_ucs4 (const char *str,
     255                 :                      int         len,
     256                 :                      uint32_t  **result,
     257                 :                      int        *items_written)
     258                 : {
     259               0 :     uint32_t *str32 = NULL;
     260                 :     int n_chars, i;
     261                 :     const unsigned char *in;
     262               0 :     const unsigned char * const ustr = (const unsigned char *) str;
     263                 : 
     264               0 :     in = ustr;
     265               0 :     n_chars = 0;
     266               0 :     while ((len < 0 || ustr + len - in > 0) && *in)
     267                 :     {
     268               0 :         uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
     269               0 :         if (wc & 0x80000000 || !UNICODE_VALID (wc))
     270               0 :             return _cairo_error (CAIRO_STATUS_INVALID_STRING);
     271                 : 
     272               0 :         n_chars++;
     273               0 :         if (n_chars == INT_MAX)
     274               0 :             return _cairo_error (CAIRO_STATUS_INVALID_STRING);
     275                 : 
     276               0 :         in = UTF8_NEXT_CHAR (in);
     277                 :     }
     278                 : 
     279               0 :     if (result) {
     280               0 :         str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t));
     281               0 :         if (!str32)
     282               0 :             return _cairo_error (CAIRO_STATUS_NO_MEMORY);
     283                 : 
     284               0 :         in = ustr;
     285               0 :         for (i=0; i < n_chars; i++) {
     286               0 :             str32[i] = _utf8_get_char (in);
     287               0 :             in = UTF8_NEXT_CHAR (in);
     288                 :         }
     289               0 :         str32[i] = 0;
     290                 : 
     291               0 :         *result = str32;
     292                 :     }
     293                 : 
     294               0 :     if (items_written)
     295               0 :         *items_written = n_chars;
     296                 : 
     297               0 :     return CAIRO_STATUS_SUCCESS;
     298                 : }
     299                 : 
     300                 : /**
     301                 :  * _cairo_ucs4_to_utf8:
     302                 :  * @unicode: a UCS-4 character
     303                 :  * @utf8: buffer to write utf8 string into. Must have at least 4 bytes
     304                 :  * space available. Or %NULL.
     305                 :  *
     306                 :  * This space left intentionally blank.
     307                 :  *
     308                 :  * Return value: Number of bytes in the utf8 string or 0 if an invalid
     309                 :  * unicode character
     310                 :  **/
     311                 : int
     312               0 : _cairo_ucs4_to_utf8 (uint32_t  unicode,
     313                 :                      char     *utf8)
     314                 : {
     315                 :     int bytes;
     316                 :     char *p;
     317                 : 
     318               0 :     if (unicode < 0x80) {
     319               0 :         if (utf8)
     320               0 :             *utf8 = unicode;
     321               0 :         return 1;
     322               0 :     } else if (unicode < 0x800) {
     323               0 :         bytes = 2;
     324               0 :     } else if (unicode < 0x10000) {
     325               0 :         bytes = 3;
     326               0 :     } else if (unicode < 0x200000) {
     327               0 :         bytes = 4;
     328                 :     } else {
     329               0 :         return 0;
     330                 :     }
     331                 : 
     332               0 :     if (!utf8)
     333               0 :         return bytes;
     334                 : 
     335               0 :     p = utf8 + bytes;
     336               0 :     while (p > utf8) {
     337               0 :         *--p = 0x80 | (unicode & 0x3f);
     338               0 :         unicode >>= 6;
     339                 :     }
     340               0 :     *p |= 0xf0 << (4 - bytes);
     341                 : 
     342               0 :     return bytes;
     343                 : }
     344                 : 
     345                 : #if CAIRO_HAS_UTF8_TO_UTF16
     346                 : /**
     347                 :  * _cairo_utf8_to_utf16:
     348                 :  * @str: an UTF-8 string
     349                 :  * @len: length of @str in bytes, or -1 if it is nul-terminated.
     350                 :  *   If @len is supplied and the string has an embedded nul
     351                 :  *   byte, only the portion before the nul byte is converted.
     352                 :  * @result: location to store a pointer to a newly allocated UTF-16
     353                 :  *   string (always native endian). Free with free(). A 0
     354                 :  *   word will be written after the last character.
     355                 :  * @items_written: location to store number of 16-bit words
     356                 :  *   written. (Not including the trailing 0)
     357                 :  *
     358                 :  * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
     359                 :  * where characters are represented either as a single 16-bit word, or
     360                 :  * as a pair of 16-bit "surrogates". The string is validated to
     361                 :  * consist entirely of valid Unicode characters.
     362                 :  *
     363                 :  * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
     364                 :  *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
     365                 :  *   an invalid sequence was found.
     366                 :  **/
     367                 : cairo_status_t
     368               0 : _cairo_utf8_to_utf16 (const char *str,
     369                 :                       int         len,
     370                 :                       uint16_t **result,
     371                 :                       int       *items_written)
     372                 : {
     373               0 :     uint16_t *str16 = NULL;
     374                 :     int n16, i;
     375                 :     const unsigned char *in;
     376               0 :     const unsigned char * const ustr = (const unsigned char *) str;
     377                 : 
     378               0 :     in = ustr;
     379               0 :     n16 = 0;
     380               0 :     while ((len < 0 || ustr + len - in > 0) && *in) {
     381               0 :         uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
     382               0 :         if (wc & 0x80000000 || !UNICODE_VALID (wc))
     383               0 :             return _cairo_error (CAIRO_STATUS_INVALID_STRING);
     384                 : 
     385               0 :         if (wc < 0x10000)
     386               0 :             n16 += 1;
     387                 :         else
     388               0 :             n16 += 2;
     389                 : 
     390               0 :         if (n16 == INT_MAX - 1 || n16 == INT_MAX)
     391               0 :             return _cairo_error (CAIRO_STATUS_INVALID_STRING);
     392                 : 
     393               0 :         in = UTF8_NEXT_CHAR (in);
     394                 :     }
     395                 : 
     396               0 :     str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
     397               0 :     if (!str16)
     398               0 :         return _cairo_error (CAIRO_STATUS_NO_MEMORY);
     399                 : 
     400               0 :     in = ustr;
     401               0 :     for (i = 0; i < n16;) {
     402               0 :         uint32_t wc = _utf8_get_char (in);
     403                 : 
     404               0 :         if (wc < 0x10000) {
     405               0 :             str16[i++] = wc;
     406                 :         } else {
     407               0 :             str16[i++] = (wc - 0x10000) / 0x400 + 0xd800;
     408               0 :             str16[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
     409                 :         }
     410                 : 
     411               0 :         in = UTF8_NEXT_CHAR (in);
     412                 :     }
     413                 : 
     414               0 :     str16[i] = 0;
     415                 : 
     416               0 :     *result = str16;
     417               0 :     if (items_written)
     418               0 :         *items_written = n16;
     419                 : 
     420               0 :     return CAIRO_STATUS_SUCCESS;
     421                 : }
     422                 : #endif

Generated by: LCOV version 1.7