LCOV - code coverage report
Current view: directory - js/src/vm - Unicode.h (source / functions) Found Hit Coverage
Test: app.info Lines: 60 46 76.7 %
Date: 2012-06-02 Functions: 17 14 82.4 %

       1                 : /*
       2                 :  * ***** BEGIN LICENSE BLOCK *****
       3                 :  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
       4                 :  *
       5                 :  * The contents of this file are subject to the Mozilla Public License Version
       6                 :  * 1.1 (the "License"); you may not use this file except in compliance with
       7                 :  * the License. You may obtain a copy of the License at
       8                 :  * http://www.mozilla.org/MPL/
       9                 :  *
      10                 :  * Software distributed under the License is distributed on an "AS IS" basis,
      11                 :  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
      12                 :  * for the specific language governing rights and limitations under the
      13                 :  * License.
      14                 :  *
      15                 :  * The Original Code is SpiderMonkey JavaScript engine.
      16                 :  *
      17                 :  * The Initial Developer of the Original Code is
      18                 :  * SpiderMonkey Unicode support code.
      19                 :  * Portions created by the Initial Developer are Copyright (C) 2011
      20                 :  * the Initial Developer. All Rights Reserved.
      21                 :  *
      22                 :  * Contributor(s):
      23                 :  *   Tom Schuster <evilpies@gmail.com>
      24                 :  *
      25                 :  * Alternatively, the contents of this file may be used under the terms of
      26                 :  * either the GNU General Public License Version 2 or later (the "GPL"), or
      27                 :  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
      28                 :  * in which case the provisions of the GPL or the LGPL are applicable instead
      29                 :  * of those above. If you wish to allow use of your version of this file only
      30                 :  * under the terms of either the GPL or the LGPL, and not to allow others to
      31                 :  * use your version of this file under the terms of the MPL, indicate your
      32                 :  * decision by deleting the provisions above and replace them with the notice
      33                 :  * and other provisions required by the GPL or the LGPL. If you do not delete
      34                 :  * the provisions above, a recipient may use your version of this file under
      35                 :  * the terms of any one of the MPL, the GPL or the LGPL.
      36                 :  *
      37                 :  * ***** END LICENSE BLOCK ***** */
      38                 : 
      39                 : #ifndef Unicode_h__
      40                 : #define Unicode_h__
      41                 : 
      42                 : #include "mozilla/StandardInteger.h"
      43                 : 
      44                 : #include "jspubtd.h"
      45                 : 
      46                 : #ifdef DEBUG
      47                 : #include <stdio.h> /* For EOF */
      48                 : #endif
      49                 : 
      50                 : extern const bool js_isidstart[];
      51                 : extern const bool js_isident[];
      52                 : extern const bool js_isspace[];
      53                 : 
      54                 : namespace js {
      55                 : namespace unicode {
      56                 : 
      57                 : /*
      58                 :  * This enum contains all the knowledge required to handle
      59                 :  * Unicode in JavaScript.
      60                 :  *
      61                 :  * SPACE
      62                 :  *   Every character that is either in the ECMA-262 5th Edition
      63                 :  *   class WhiteSpace or LineTerminator.
      64                 :  *
      65                 :  *   WhiteSpace
      66                 :  *    \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF
      67                 :  *    and every other Unicode character with the General Category "Zs".
      68                 :  *    In practice this is every character with the value "Zs" as the third
      69                 :  *    field (after the char code in hex, and the name) called General_Category
      70                 :  *    (see http://www.unicode.org/reports/tr44/#UnicodeData.txt)
      71                 :  *     in the file UnicodeData.txt.
      72                 :  *
      73                 :  *   LineTerminator
      74                 :  *    \u000A, \u000D, \u2028, \u2029
      75                 :  *
      76                 :  * LETTER
      77                 :  *   These are all characters included in UnicodeLetter from ECMA-262.
      78                 :  *   This includes the categories 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'
      79                 :  *
      80                 :  * IDENTIFIER_PART
      81                 :  *   This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation.
      82                 :  *   Aka categories Mn/Mc, Nd, Pc
      83                 :  *   And <ZWNJ> and <ZWJ>.
      84                 :  *   Attention: FLAG_LETTER is _not_ IdentifierStart, but you could build
      85                 :  *   a matcher for the real IdentifierPart like this:
      86                 :  *
      87                 :  *   if isEscapeSequence():
      88                 :  *      handleEscapeSequence()
      89                 :  *      return True
      90                 :  *   if char in ['$', '_']:
      91                 :  *      return True
      92                 :  *   if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER):
      93                 :  *      return True
      94                 :  *
      95                 :  * NO_DELTA
      96                 :  *   See comment in CharacterInfo
      97                 :  *
      98                 :  * ENCLOSING_MARK / COMBINING_SPACING_MARK
      99                 :  *   Used by the XML (E4X) support functions (IsXMLNameStart, IsXMLNamePart, ...).
     100                 :  */
     101                 : 
     102                 : struct CharFlag {
     103                 :     enum temp {
     104                 :         SPACE  = 1 << 0,
     105                 :         LETTER = 1 << 1,
     106                 :         IDENTIFIER_PART = 1 << 2,
     107                 :         NO_DELTA = 1 << 3,
     108                 :         ENCLOSING_MARK = 1 << 4,
     109                 :         COMBINING_SPACING_MARK = 1 << 5
     110                 :     };
     111                 : };
     112                 : 
     113                 : const jschar BYTE_ORDER_MARK2 = 0xFFFE;
     114                 : const jschar NO_BREAK_SPACE  = 0x00A0;
     115                 : 
     116                 : class CharacterInfo {
     117                 :     /*
     118                 :      * upperCase and lowerCase normally store the delta between two
     119                 :      * letters. For example the lower case alpha (a) has the char code
     120                 :      * 97, and the upper case alpha (A) has 65. So for "a" we would
     121                 :      * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase,
     122                 :      * because this char is already in lower case.
     123                 :      * Well, not -32 exactly, but (2**16 - 32) to induce
     124                 :      * unsigned overflow with identical mathematical behavior.
     125                 :      * For upper case alpha, we would store 0 in upperCase and 32 in
     126                 :      * lowerCase (65 + 32 = 97).
     127                 :      *
     128                 :      * If the delta between the chars wouldn't fit in a uint16_t, the flag
     129                 :      * CharFlag::NO_DELTA is set, and you can just use upperCase and lowerCase
     130                 :      * without adding them to the base char. See ToUpperCase().
     131                 :      *
     132                 :      * We use deltas to reuse information for multiple characters. For
     133                 :      * example the whole lower case latin alphabet fits into one entry,
     134                 :      * because it's always a UnicodeLetter and upperCase contains
     135                 :      * -32.
     136                 :      */
     137                 :   public:
     138                 :     uint16_t upperCase;
     139                 :     uint16_t lowerCase;
     140                 :     uint8_t flags;
     141                 : 
     142              49 :     inline bool isSpace() const {
     143              49 :         return flags & CharFlag::SPACE;
     144                 :     }
     145                 : 
     146               0 :     inline bool isLetter() const {
     147               0 :         return flags & CharFlag::LETTER;
     148                 :     }
     149                 : 
     150               0 :     inline bool isIdentifierPart() const {
     151               0 :         return flags & (CharFlag::IDENTIFIER_PART | CharFlag::LETTER);
     152                 :     }
     153                 : 
     154           13627 :     inline bool isEnclosingMark() const {
     155           13627 :         return flags & CharFlag::ENCLOSING_MARK;
     156                 :     }
     157                 : 
     158            7598 :     inline bool isCombiningSpacingMark() const {
     159            7598 :         return flags & CharFlag::COMBINING_SPACING_MARK;
     160                 :     }
     161                 : };
     162                 : 
     163                 : extern const uint8_t index1[];
     164                 : extern const uint8_t index2[];
     165                 : extern const CharacterInfo js_charinfo[];
     166                 : 
     167                 : inline const CharacterInfo&
     168         1857782 : CharInfo(jschar code)
     169                 : {
     170         1857782 :     size_t index = index1[code >> 6];
     171         1857782 :     index = index2[(index << 6) + (code & 0x3f)];
     172                 : 
     173         1857782 :     return js_charinfo[index];
     174                 : }
     175                 : 
     176                 : inline bool
     177         6308671 : IsIdentifierStart(jschar ch)
     178                 : {
     179                 :     /*
     180                 :      * ES5 7.6 IdentifierStart
     181                 :      *  $ (dollar sign)
     182                 :      *  _ (underscore)
     183                 :      *  or any UnicodeLetter.
     184                 :      *
     185                 :      * We use a lookup table for small and thus common characters for speed.
     186                 :      */
     187                 : 
     188         6308671 :     if (ch < 128)
     189         6308671 :         return js_isidstart[ch];
     190                 : 
     191               0 :     return CharInfo(ch).isLetter();
     192                 : }
     193                 : 
     194                 : inline bool
     195       243865896 : IsIdentifierPart(jschar ch)
     196                 : {
     197                 :     /* Matches ES5 7.6 IdentifierPart. */
     198                 : 
     199       243865896 :     if (ch < 128)
     200       243865896 :         return js_isident[ch];
     201                 : 
     202               0 :     return CharInfo(ch).isIdentifierPart();
     203                 : }
     204                 : 
     205                 : inline bool
     206               0 : IsLetter(jschar ch)
     207                 : {
     208               0 :     return CharInfo(ch).isLetter();
     209                 : }
     210                 : 
     211                 : inline bool
     212         1669330 : IsSpace(jschar ch)
     213                 : {
     214                 :     /*
     215                 :      * IsSpace checks if some character is included in the merged set
     216                 :      * of WhiteSpace and LineTerminator, specified by ES5 7.2 and 7.3.
     217                 :      * We combined them, because in practice nearly every
     218                 :      * calling function wants this, except some code in the tokenizer.
     219                 :      *
     220                 :      * We use a lookup table for ASCII-7 characters, because they are
     221                 :      * very common and must be handled quickly in the tokenizer.
     222                 :      * NO-BREAK SPACE is supposed to be the most common character not in
     223                 :      * this range, so we inline this case, too.
     224                 :      */
     225                 : 
     226         1669330 :     if (ch < 128)
     227         1669283 :         return js_isspace[ch];
     228                 : 
     229              47 :     if (ch == NO_BREAK_SPACE)
     230               0 :         return true;
     231                 : 
     232              47 :     return CharInfo(ch).isSpace();
     233                 : }
     234                 : 
     235                 : inline bool
     236              52 : IsSpaceOrBOM2(jschar ch)
     237                 : {
     238              52 :     if (ch < 128)
     239              50 :         return js_isspace[ch];
     240                 : 
     241                 :     /* We accept BOM2 (0xFFFE) for compatibility reasons in the parser. */
     242               2 :     if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2)
     243               0 :         return true;
     244                 : 
     245               2 :     return CharInfo(ch).isSpace();
     246                 : }
     247                 : 
     248                 : inline jschar
     249          103741 : ToUpperCase(jschar ch)
     250                 : {
     251          103741 :     const CharacterInfo &info = CharInfo(ch);
     252                 : 
     253                 :     /*
     254                 :      * The delta didn't fit into a uint16_t, so we had to store the
     255                 :      * actual char code.
     256                 :      */
     257          103741 :     if (info.flags & CharFlag::NO_DELTA)
     258               0 :         return info.upperCase;
     259                 : 
     260          103741 :     return uint16_t(ch) + info.upperCase;
     261                 : }
     262                 : 
     263                 : inline jschar
     264         1732767 : ToLowerCase(jschar ch)
     265                 : {
     266         1732767 :     const CharacterInfo &info = CharInfo(ch);
     267                 : 
     268         1732767 :     if (info.flags & CharFlag::NO_DELTA)
     269               0 :         return info.lowerCase;
     270                 : 
     271         1732767 :     return uint16_t(ch) + info.lowerCase;
     272                 : }
     273                 : 
     274                 : /* XML support functions */
     275                 : 
     276                 : inline bool
     277            9092 : IsXMLSpace(jschar ch)
     278                 : {
     279            9092 :     return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
     280                 : }
     281                 : 
     282                 : inline bool
     283            7760 : IsXMLNamespaceStart(jschar ch)
     284                 : {
     285            7760 :     if (ch == '_')
     286             162 :         return true;
     287                 : 
     288            7598 :     return CharInfo(ch).isCombiningSpacingMark() || IsIdentifierStart(ch);
     289                 : }
     290                 : 
     291                 : inline bool
     292              27 : IsXMLNamespacePart(jschar ch)
     293                 : {
     294              27 :     if (ch == '.' || ch == '-' || ch == '_')
     295               0 :         return true;
     296                 : 
     297              27 :     return CharInfo(ch).isEnclosingMark() || IsIdentifierPart(ch);
     298                 : }
     299                 : 
     300                 : inline bool
     301                 : IsXMLNameStart(jschar ch)
     302                 : {
     303                 :     if (ch == '_' || ch == ':')
     304                 :         return true;
     305                 : 
     306                 :     return CharInfo(ch).isCombiningSpacingMark() || IsIdentifierStart(ch);
     307                 : }
     308                 : 
     309                 : inline bool
     310           13600 : IsXMLNamePart(jschar ch)
     311                 : {
     312           13600 :     if (ch == '.' || ch == '-' || ch == '_' || ch == ':')
     313               0 :         return true;
     314                 : 
     315           13600 :     return CharInfo(ch).isEnclosingMark() || IsIdentifierPart(ch);
     316                 : }
     317                 : 
     318                 : 
     319                 : } /* namespace unicode */
     320                 : } /* namespace js */
     321                 : 
     322                 : #endif /* Unicode_h__ */

Generated by: LCOV version 1.7