LCOV - code coverage report
Current view: directory - intl/unicharutil/src - nsUnicodeProperties.cpp (source / functions) Found Hit Coverage
Test: app.info Lines: 109 0 0.0 %
Date: 2012-06-02 Functions: 10 0 0.0 %

       1                 : /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
       2                 :  * ***** BEGIN LICENSE BLOCK *****
       3                 :  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
       4                 :  *
       5                 :  * The contents of this file are subject to the Mozilla Public License Version
       6                 :  * 1.1 (the "License"); you may not use this file except in compliance with
       7                 :  * the License. You may obtain a copy of the License at
       8                 :  * http://www.mozilla.org/MPL/
       9                 :  *
      10                 :  * Software distributed under the License is distributed on an "AS IS" basis,
      11                 :  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
      12                 :  * for the specific language governing rights and limitations under the
      13                 :  * License.
      14                 :  *
      15                 :  * The Original Code is Mozilla Corporation code.
      16                 :  *
      17                 :  * The Initial Developer of the Original Code is Mozilla Corporation.
      18                 :  * Portions created by the Initial Developer are Copyright (C) 2009-2010
      19                 :  * the Initial Developer. All Rights Reserved.
      20                 :  *
      21                 :  * Contributor(s):
      22                 :  *   Jonathan Kew <jfkthame@gmail.com>
      23                 :  *
      24                 :  * Alternatively, the contents of this file may be used under the terms of
      25                 :  * either the GNU General Public License Version 2 or later (the "GPL"), or
      26                 :  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
      27                 :  * in which case the provisions of the GPL or the LGPL are applicable instead
      28                 :  * of those above. If you wish to allow use of your version of this file only
      29                 :  * under the terms of either the GPL or the LGPL, and not to allow others to
      30                 :  * use your version of this file under the terms of the MPL, indicate your
      31                 :  * decision by deleting the provisions above and replace them with the notice
      32                 :  * and other provisions required by the GPL or the LGPL. If you do not delete
      33                 :  * the provisions above, a recipient may use your version of this file under
      34                 :  * the terms of any one of the MPL, the GPL or the LGPL.
      35                 :  *
      36                 :  * ***** END LICENSE BLOCK ***** */
      37                 : 
      38                 : #include "nsUnicodeProperties.h"
      39                 : #include "nsUnicodeScriptCodes.h"
      40                 : #include "nsUnicodePropertyData.cpp"
      41                 : 
      42                 : #include "mozilla/Util.h"
      43                 : #include "nsMemory.h"
      44                 : #include "nsCharTraits.h"
      45                 : 
      46                 : #include "harfbuzz/hb-unicode.h"
      47                 : 
      48                 : #define UNICODE_BMP_LIMIT 0x10000
      49                 : #define UNICODE_LIMIT     0x110000
      50                 : 
      51                 : namespace mozilla {
      52                 : 
      53                 : namespace unicode {
      54                 : 
      55                 : /*
      56                 : To store properties for a million Unicode codepoints compactly, we use
      57                 : a three-level array structure, with the Unicode values considered as
      58                 : three elements: Plane, Page, and Char.
      59                 : 
      60                 : Space optimization happens because multiple Planes can refer to the same
      61                 : Page array, and multiple Pages can refer to the same Char array holding
      62                 : the actual values. In practice, most of the higher planes are empty and
      63                 : thus share the same data; and within the BMP, there are also many pages
      64                 : that repeat the same data for any given property.
      65                 : 
      66                 : Plane is usually zero, so we skip a lookup in this case, and require
      67                 : that the Plane 0 pages are always the first set of entries in the Page
      68                 : array.
      69                 : 
      70                 : The division of the remaining 16 bits into Page and Char fields is
      71                 : adjusted for each property (by experiment using the generation tool)
      72                 : to provide the most compact storage, depending on the distribution
      73                 : of values.
      74                 : */
      75                 : 
      76                 : nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = {
      77                 :   /*
      78                 :    * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
      79                 :    * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-common.h.
      80                 :    */
      81                 :   /* CONTROL */             nsIUGenCategory::kOther,
      82                 :   /* FORMAT */              nsIUGenCategory::kOther,
      83                 :   /* UNASSIGNED */          nsIUGenCategory::kOther,
      84                 :   /* PRIVATE_USE */         nsIUGenCategory::kOther,
      85                 :   /* SURROGATE */           nsIUGenCategory::kOther,
      86                 :   /* LOWERCASE_LETTER */    nsIUGenCategory::kLetter,
      87                 :   /* MODIFIER_LETTER */     nsIUGenCategory::kLetter,
      88                 :   /* OTHER_LETTER */        nsIUGenCategory::kLetter,
      89                 :   /* TITLECASE_LETTER */    nsIUGenCategory::kLetter,
      90                 :   /* UPPERCASE_LETTER */    nsIUGenCategory::kLetter,
      91                 :   /* COMBINING_MARK */      nsIUGenCategory::kMark,
      92                 :   /* ENCLOSING_MARK */      nsIUGenCategory::kMark,
      93                 :   /* NON_SPACING_MARK */    nsIUGenCategory::kMark,
      94                 :   /* DECIMAL_NUMBER */      nsIUGenCategory::kNumber,
      95                 :   /* LETTER_NUMBER */       nsIUGenCategory::kNumber,
      96                 :   /* OTHER_NUMBER */        nsIUGenCategory::kNumber,
      97                 :   /* CONNECT_PUNCTUATION */ nsIUGenCategory::kPunctuation,
      98                 :   /* DASH_PUNCTUATION */    nsIUGenCategory::kPunctuation,
      99                 :   /* CLOSE_PUNCTUATION */   nsIUGenCategory::kPunctuation,
     100                 :   /* FINAL_PUNCTUATION */   nsIUGenCategory::kPunctuation,
     101                 :   /* INITIAL_PUNCTUATION */ nsIUGenCategory::kPunctuation,
     102                 :   /* OTHER_PUNCTUATION */   nsIUGenCategory::kPunctuation,
     103                 :   /* OPEN_PUNCTUATION */    nsIUGenCategory::kPunctuation,
     104                 :   /* CURRENCY_SYMBOL */     nsIUGenCategory::kSymbol,
     105                 :   /* MODIFIER_SYMBOL */     nsIUGenCategory::kSymbol,
     106                 :   /* MATH_SYMBOL */         nsIUGenCategory::kSymbol,
     107                 :   /* OTHER_SYMBOL */        nsIUGenCategory::kSymbol,
     108                 :   /* LINE_SEPARATOR */      nsIUGenCategory::kSeparator,
     109                 :   /* PARAGRAPH_SEPARATOR */ nsIUGenCategory::kSeparator,
     110                 :   /* SPACE_SEPARATOR */     nsIUGenCategory::kSeparator
     111                 : };
     112                 : 
     113                 : PRUint32
     114               0 : GetMirroredChar(PRUint32 aCh)
     115                 : {
     116                 :     // all mirrored chars are in plane 0
     117               0 :     if (aCh < UNICODE_BMP_LIMIT) {
     118               0 :         int v = sMirrorValues[sMirrorPages[0][aCh >> kMirrorCharBits]]
     119               0 :                              [aCh & ((1 << kMirrorCharBits) - 1)];
     120                 :         // The mirror value is stored as either an offset (if less than
     121                 :         // kSmallMirrorOffset) from the input character code, or as
     122                 :         // an index into the sDistantMirrors list. This allows the
     123                 :         // mirrored codes to be stored as 8-bit values, as most of them
     124                 :         // are references to nearby character codes.
     125               0 :         if (v < kSmallMirrorOffset) {
     126               0 :             return aCh + v;
     127                 :         }
     128               0 :         return sDistantMirrors[v - kSmallMirrorOffset];
     129                 :     }
     130               0 :     return aCh;
     131                 : }
     132                 : 
     133                 : PRUint8
     134               0 : GetCombiningClass(PRUint32 aCh)
     135                 : {
     136               0 :     if (aCh < UNICODE_BMP_LIMIT) {
     137               0 :         return sCClassValues[sCClassPages[0][aCh >> kCClassCharBits]]
     138               0 :                             [aCh & ((1 << kCClassCharBits) - 1)];
     139                 :     }
     140               0 :     if (aCh < UNICODE_LIMIT) {
     141               0 :         return sCClassValues[sCClassPages[sCClassPlanes[(aCh >> 16) - 1]]
     142               0 :                                          [(aCh & 0xffff) >> kCClassCharBits]]
     143               0 :                             [aCh & ((1 << kCClassCharBits) - 1)];
     144                 :     }
     145               0 :     NS_NOTREACHED("invalid Unicode character!");
     146               0 :     return 0;
     147                 : }
     148                 : 
     149                 : PRUint8
     150               0 : GetGeneralCategory(PRUint32 aCh)
     151                 : {
     152               0 :     if (aCh < UNICODE_BMP_LIMIT) {
     153               0 :         return sCatEAWValues[sCatEAWPages[0][aCh >> kCatEAWCharBits]]
     154               0 :                             [aCh & ((1 << kCatEAWCharBits) - 1)].mCategory;
     155                 :     }
     156               0 :     if (aCh < UNICODE_LIMIT) {
     157               0 :         return sCatEAWValues[sCatEAWPages[sCatEAWPlanes[(aCh >> 16) - 1]]
     158               0 :                                          [(aCh & 0xffff) >> kCatEAWCharBits]]
     159               0 :                             [aCh & ((1 << kCatEAWCharBits) - 1)].mCategory;
     160                 :     }
     161               0 :     NS_NOTREACHED("invalid Unicode character!");
     162               0 :     return PRUint8(HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED);
     163                 : }
     164                 : 
     165                 : PRUint8
     166               0 : GetEastAsianWidth(PRUint32 aCh)
     167                 : {
     168               0 :     if (aCh < UNICODE_BMP_LIMIT) {
     169               0 :         return sCatEAWValues[sCatEAWPages[0][aCh >> kCatEAWCharBits]]
     170               0 :                             [aCh & ((1 << kCatEAWCharBits) - 1)].mEAW;
     171                 :     }
     172               0 :     if (aCh < UNICODE_LIMIT) {
     173               0 :         return sCatEAWValues[sCatEAWPages[sCatEAWPlanes[(aCh >> 16) - 1]]
     174               0 :                                          [(aCh & 0xffff) >> kCatEAWCharBits]]
     175               0 :                             [aCh & ((1 << kCatEAWCharBits) - 1)].mEAW;
     176                 :     }
     177               0 :     NS_NOTREACHED("invalid Unicode character!");
     178               0 :     return 0;
     179                 : }
     180                 : 
     181                 : PRInt32
     182               0 : GetScriptCode(PRUint32 aCh)
     183                 : {
     184               0 :     if (aCh < UNICODE_BMP_LIMIT) {
     185               0 :         return sScriptValues[sScriptPages[0][aCh >> kScriptCharBits]]
     186               0 :                             [aCh & ((1 << kScriptCharBits) - 1)];
     187                 :     }
     188               0 :     if (aCh < UNICODE_LIMIT) {
     189               0 :         return sScriptValues[sScriptPages[sScriptPlanes[(aCh >> 16) - 1]]
     190               0 :                                          [(aCh & 0xffff) >> kScriptCharBits]]
     191               0 :                             [aCh & ((1 << kScriptCharBits) - 1)];
     192                 :     }
     193               0 :     NS_NOTREACHED("invalid Unicode character!");
     194               0 :     return MOZ_SCRIPT_UNKNOWN;
     195                 : }
     196                 : 
     197                 : PRUint32
     198               0 : GetScriptTagForCode(PRInt32 aScriptCode)
     199                 : {
     200                 :     // this will safely return 0 for negative script codes, too :)
     201               0 :     if (PRUint32(aScriptCode) > ArrayLength(sScriptCodeToTag)) {
     202               0 :         return 0;
     203                 :     }
     204               0 :     return sScriptCodeToTag[aScriptCode];
     205                 : }
     206                 : 
     207                 : HSType
     208               0 : GetHangulSyllableType(PRUint32 aCh)
     209                 : {
     210                 :     // all Hangul chars are in plane 0
     211               0 :     if (aCh < UNICODE_BMP_LIMIT) {
     212               0 :         return HSType(sHangulValues[sHangulPages[0][aCh >> kHangulCharBits]]
     213               0 :                                    [aCh & ((1 << kHangulCharBits) - 1)]);
     214                 :     }
     215               0 :     return HST_NONE;
     216                 : }
     217                 : 
     218                 : bool
     219               0 : IsClusterExtender(PRUint32 aCh, PRUint8 aCategory)
     220                 : {
     221                 :     return ((aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
     222                 :              aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
     223                 :             (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ
     224               0 :             (aCh >= 0xff9e && aCh <= 0xff9f));  // katakana sound marks
     225                 : }
     226                 : 
     227                 : // TODO: replace this with a properties file or similar;
     228                 : // expect this to evolve as harfbuzz shaping support matures.
     229                 : //
     230                 : // The "shaping type" of each script run, as returned by this
     231                 : // function, is compared to the bits set in the
     232                 : // gfx.font_rendering.harfbuzz.scripts
     233                 : // preference to decide whether to use the harfbuzz shaper.
     234                 : //
     235                 : PRInt32
     236               0 : ScriptShapingType(PRInt32 aScriptCode)
     237                 : {
     238               0 :     switch (aScriptCode) {
     239                 :     default:
     240               0 :         return SHAPING_DEFAULT; // scripts not explicitly listed here are
     241                 :                                 // assumed to just use default shaping
     242                 : 
     243                 :     case MOZ_SCRIPT_ARABIC:
     244                 :     case MOZ_SCRIPT_SYRIAC:
     245                 :     case MOZ_SCRIPT_NKO:
     246                 :     case MOZ_SCRIPT_MANDAIC:
     247               0 :         return SHAPING_ARABIC; // bidi scripts with Arabic-style shaping
     248                 : 
     249                 :     case MOZ_SCRIPT_HEBREW:
     250               0 :         return SHAPING_HEBREW;
     251                 : 
     252                 :     case MOZ_SCRIPT_HANGUL:
     253               0 :         return SHAPING_HANGUL;
     254                 : 
     255                 :     case MOZ_SCRIPT_MONGOLIAN: // to be supported by the Arabic shaper?
     256               0 :         return SHAPING_MONGOLIAN;
     257                 : 
     258                 :     case MOZ_SCRIPT_THAI: // no complex OT features, but MS engines like to do
     259                 :                           // sequence checking
     260               0 :         return SHAPING_THAI;
     261                 : 
     262                 :     case MOZ_SCRIPT_BENGALI:
     263                 :     case MOZ_SCRIPT_DEVANAGARI:
     264                 :     case MOZ_SCRIPT_GUJARATI:
     265                 :     case MOZ_SCRIPT_GURMUKHI:
     266                 :     case MOZ_SCRIPT_KANNADA:
     267                 :     case MOZ_SCRIPT_MALAYALAM:
     268                 :     case MOZ_SCRIPT_ORIYA:
     269                 :     case MOZ_SCRIPT_SINHALA:
     270                 :     case MOZ_SCRIPT_TAMIL:
     271                 :     case MOZ_SCRIPT_TELUGU:
     272                 :     case MOZ_SCRIPT_KHMER:
     273                 :     case MOZ_SCRIPT_LAO:
     274                 :     case MOZ_SCRIPT_TIBETAN:
     275                 :     case MOZ_SCRIPT_NEW_TAI_LUE:
     276                 :     case MOZ_SCRIPT_TAI_LE:
     277                 :     case MOZ_SCRIPT_MYANMAR:
     278                 :     case MOZ_SCRIPT_PHAGS_PA:
     279                 :     case MOZ_SCRIPT_BATAK:
     280                 :     case MOZ_SCRIPT_BRAHMI:
     281               0 :         return SHAPING_INDIC; // scripts that require Indic or other "special" shaping
     282                 :     }
     283                 : }
     284                 : 
     285                 : void
     286               0 : ClusterIterator::Next()
     287                 : {
     288               0 :     if (AtEnd()) {
     289               0 :         NS_WARNING("ClusterIterator has already reached the end");
     290               0 :         return;
     291                 :     }
     292                 : 
     293               0 :     PRUint32 ch = *mPos++;
     294                 : 
     295               0 :     if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit &&
     296                 :         NS_IS_LOW_SURROGATE(*mPos)) {
     297               0 :         ch = SURROGATE_TO_UCS4(ch, *mPos++);
     298               0 :     } else if ((ch & ~0xff) == 0x1100 ||
     299                 :         (ch >= 0xa960 && ch <= 0xa97f) ||
     300                 :         (ch >= 0xac00 && ch <= 0xd7ff)) {
     301                 :         // Handle conjoining Jamo that make Hangul syllables
     302               0 :         HSType hangulState = GetHangulSyllableType(ch);
     303               0 :         while (mPos < mLimit) {
     304               0 :             ch = *mPos;
     305               0 :             HSType hangulType = GetHangulSyllableType(ch);
     306               0 :             switch (hangulType) {
     307                 :             case HST_L:
     308                 :             case HST_LV:
     309                 :             case HST_LVT:
     310               0 :                 if (hangulState == HST_L) {
     311               0 :                     hangulState = hangulType;
     312               0 :                     mPos++;
     313               0 :                     continue;
     314                 :                 }
     315               0 :                 break;
     316                 :             case HST_V:
     317               0 :                 if ((hangulState != HST_NONE) && !(hangulState & HST_T)) {
     318               0 :                     hangulState = hangulType;
     319               0 :                     mPos++;
     320               0 :                     continue;
     321                 :                 }
     322               0 :                 break;
     323                 :             case HST_T:
     324               0 :                 if (hangulState & (HST_V | HST_T)) {
     325               0 :                     hangulState = hangulType;
     326               0 :                     mPos++;
     327               0 :                     continue;
     328                 :                 }
     329               0 :                 break;
     330                 :             default:
     331               0 :                 break;
     332                 :             }
     333               0 :             break;
     334                 :         }
     335                 :     }
     336                 : 
     337               0 :     while (mPos < mLimit) {
     338               0 :         ch = *mPos;
     339                 : 
     340                 :         // Check for surrogate pairs; note that isolated surrogates will just
     341                 :         // be treated as generic (non-cluster-extending) characters here,
     342                 :         // which is fine for cluster-iterating purposes
     343               0 :         if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 &&
     344               0 :             NS_IS_LOW_SURROGATE(*(mPos + 1))) {
     345               0 :             ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
     346                 :         }
     347                 : 
     348               0 :         if (!IsClusterExtender(ch)) {
     349               0 :             break;
     350                 :         }
     351                 : 
     352               0 :         mPos++;
     353               0 :         if (!IS_IN_BMP(ch)) {
     354               0 :             mPos++;
     355                 :         }
     356                 :     }
     357                 : 
     358               0 :     NS_ASSERTION(mText < mPos && mPos <= mLimit,
     359                 :                  "ClusterIterator::Next has overshot the string!");
     360                 : }
     361                 : 
     362                 : } // end namespace unicode
     363                 : 
     364                 : } // end namespace mozilla

Generated by: LCOV version 1.7