1 : /*
2 : * ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is SpiderMonkey JavaScript engine.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * SpiderMonkey Unicode support code.
19 : * Portions created by the Initial Developer are Copyright (C) 2011
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : * Tom Schuster <evilpies@gmail.com>
24 : *
25 : * Alternatively, the contents of this file may be used under the terms of
26 : * either the GNU General Public License Version 2 or later (the "GPL"), or
27 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 : * in which case the provisions of the GPL or the LGPL are applicable instead
29 : * of those above. If you wish to allow use of your version of this file only
30 : * under the terms of either the GPL or the LGPL, and not to allow others to
31 : * use your version of this file under the terms of the MPL, indicate your
32 : * decision by deleting the provisions above and replace them with the notice
33 : * and other provisions required by the GPL or the LGPL. If you do not delete
34 : * the provisions above, a recipient may use your version of this file under
35 : * the terms of any one of the MPL, the GPL or the LGPL.
36 : *
37 : * ***** END LICENSE BLOCK ***** */
38 :
39 : #ifndef Unicode_h__
40 : #define Unicode_h__
41 :
42 : #include "mozilla/StandardInteger.h"
43 :
44 : #include "jspubtd.h"
45 :
46 : #ifdef DEBUG
47 : #include <stdio.h> /* For EOF */
48 : #endif
49 :
/*
 * Per-character lookup tables defined outside this header. The inline
 * helpers in js::unicode index them only with character codes below 128.
 */
extern const bool js_isspace[];
extern const bool js_isident[];
extern const bool js_isidstart[];
53 :
54 : namespace js {
55 : namespace unicode {
56 :
57 : /*
58 : * This enum contains all the knowledge required to handle
59 : * Unicode in JavaScript.
60 : *
61 : * SPACE
62 : * Every character that is either in the ECMA-262 5th Edition
63 : * class WhiteSpace or LineTerminator.
64 : *
65 : * WhiteSpace
66 : * \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF
67 : * and every other Unicode character with the General Category "Zs".
68 : * In practice this is every character with the value "Zs" as the third
69 : * field (after the char code in hex, and the name) called General_Category
70 : * (see http://www.unicode.org/reports/tr44/#UnicodeData.txt)
71 : * in the file UnicodeData.txt.
72 : *
73 : * LineTerminator
74 : * \u000A, \u000D, \u2028, \u2029
75 : *
76 : * LETTER
77 : * These are all characters included in UnicodeLetter from ECMA-262.
78 : * This includes the categories 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'.
79 : *
80 : * IDENTIFIER_PART
81 : * This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation.
82 : * Aka categories Mn/Mc, Nd, Pc
83 : * And <ZWNJ> and <ZWJ>.
84 : * Attention: CharFlag::LETTER is _not_ IdentifierStart, but you could build
85 : * a matcher for the real IdentifierPart like this:
86 : *
87 : * if isEscapeSequence():
88 : * handleEscapeSequence()
89 : * return True
90 : * if char in ['$', '_']:
91 : * return True
92 : * if GetFlag(char) & (CharFlag::IDENTIFIER_PART | CharFlag::LETTER):
93 : * return True
94 : *
95 : * NO_DELTA
96 : * See comment in CharacterInfo
97 : *
98 : * ENCLOSING_MARK / COMBINING_SPACING_MARK
99 : * Something for E4X....
100 : */
101 :
/*
 * Bit masks stored in CharacterInfo::flags. Every enumerator occupies its
 * own bit, so several of them can be combined in one flags byte and tested
 * with a bitwise AND.
 */
struct CharFlag {
    enum temp {
        SPACE                  = 0x01,
        LETTER                 = 0x02,
        IDENTIFIER_PART        = 0x04,
        NO_DELTA               = 0x08,
        ENCLOSING_MARK         = 0x10,
        COMBINING_SPACING_MARK = 0x20
    };
};
112 :
113 : const jschar BYTE_ORDER_MARK2 = 0xFFFE;
114 : const jschar NO_BREAK_SPACE = 0x00A0;
115 :
116 : class CharacterInfo {
117 : /*
118 : * upperCase and loweCase normally store the delta between two
119 : * letters. For example the lower case alpha (a) has the char code
120 : * 97, and the upper case alpha (A) has 65. So for "a" we would
121 : * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase,
122 : * because this char is already in lower case.
123 : * Well, not -32 exactly, but (2**16 - 32) to induce
124 : * unsigned overflow with identical mathematical behavior.
125 : * For upper case alpha, we would store 0 in upperCase and 32 in
126 : * lowerCase (65 + 32 = 97).
127 : *
128 : * If the delta between the chars wouldn't fit in a T, the flag
129 : * FLAG_NO_DELTA is set, and you can just use upperCase and lowerCase
130 : * without adding them the base char. See CharInfo.toUpperCase().
131 : *
132 : * We use deltas to reuse information for multiple characters. For
133 : * example the whole lower case latin alphabet fits into one entry,
134 : * because it's always a UnicodeLetter and upperCase contains
135 : * -32.
136 : */
137 : public:
138 : uint16_t upperCase;
139 : uint16_t lowerCase;
140 : uint8_t flags;
141 :
142 49 : inline bool isSpace() const {
143 49 : return flags & CharFlag::SPACE;
144 : }
145 :
146 0 : inline bool isLetter() const {
147 0 : return flags & CharFlag::LETTER;
148 : }
149 :
150 0 : inline bool isIdentifierPart() const {
151 0 : return flags & (CharFlag::IDENTIFIER_PART | CharFlag::LETTER);
152 : }
153 :
154 13627 : inline bool isEnclosingMark() const {
155 13627 : return flags & CharFlag::ENCLOSING_MARK;
156 : }
157 :
158 7598 : inline bool isCombiningSpacingMark() const {
159 7598 : return flags & CharFlag::COMBINING_SPACING_MARK;
160 : }
161 : };
162 :
163 : extern const uint8_t index1[];
164 : extern const uint8_t index2[];
165 : extern const CharacterInfo js_charinfo[];
166 :
167 : inline const CharacterInfo&
168 1857782 : CharInfo(jschar code)
169 : {
170 1857782 : size_t index = index1[code >> 6];
171 1857782 : index = index2[(index << 6) + (code & 0x3f)];
172 :
173 1857782 : return js_charinfo[index];
174 : }
175 :
176 : inline bool
177 6308671 : IsIdentifierStart(jschar ch)
178 : {
179 : /*
180 : * ES5 7.6 IdentifierStart
181 : * $ (dollar sign)
182 : * _ (underscore)
183 : * or any UnicodeLetter.
184 : *
185 : * We use a lookup table for small and thus common characters for speed.
186 : */
187 :
188 6308671 : if (ch < 128)
189 6308671 : return js_isidstart[ch];
190 :
191 0 : return CharInfo(ch).isLetter();
192 : }
193 :
194 : inline bool
195 243865896 : IsIdentifierPart(jschar ch)
196 : {
197 : /* Matches ES5 7.6 IdentifierPart. */
198 :
199 243865896 : if (ch < 128)
200 243865896 : return js_isident[ch];
201 :
202 0 : return CharInfo(ch).isIdentifierPart();
203 : }
204 :
205 : inline bool
206 0 : IsLetter(jschar ch)
207 : {
208 0 : return CharInfo(ch).isLetter();
209 : }
210 :
211 : inline bool
212 1669330 : IsSpace(jschar ch)
213 : {
214 : /*
215 : * IsSpace checks if some character is included in the merged set
216 : * of WhiteSpace and LineTerminator, specified by ES5 7.2 and 7.3.
217 : * We combined them, because in practice nearly every
218 : * calling function wants this, except some code in the tokenizer.
219 : *
220 : * We use a lookup table for ASCII-7 characters, because they are
221 : * very common and must be handled quickly in the tokenizer.
222 : * NO-BREAK SPACE is supposed to be the most common character not in
223 : * this range, so we inline this case, too.
224 : */
225 :
226 1669330 : if (ch < 128)
227 1669283 : return js_isspace[ch];
228 :
229 47 : if (ch == NO_BREAK_SPACE)
230 0 : return true;
231 :
232 47 : return CharInfo(ch).isSpace();
233 : }
234 :
235 : inline bool
236 52 : IsSpaceOrBOM2(jschar ch)
237 : {
238 52 : if (ch < 128)
239 50 : return js_isspace[ch];
240 :
241 : /* We accept BOM2 (0xFFFE) for compatibility reasons in the parser. */
242 2 : if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2)
243 0 : return true;
244 :
245 2 : return CharInfo(ch).isSpace();
246 : }
247 :
248 : inline jschar
249 103741 : ToUpperCase(jschar ch)
250 : {
251 103741 : const CharacterInfo &info = CharInfo(ch);
252 :
253 : /*
254 : * The delta didn't fit into T, so we had to store the
255 : * actual char code.
256 : */
257 103741 : if (info.flags & CharFlag::NO_DELTA)
258 0 : return info.upperCase;
259 :
260 103741 : return uint16_t(ch) + info.upperCase;
261 : }
262 :
263 : inline jschar
264 1732767 : ToLowerCase(jschar ch)
265 : {
266 1732767 : const CharacterInfo &info = CharInfo(ch);
267 :
268 1732767 : if (info.flags & CharFlag::NO_DELTA)
269 0 : return info.lowerCase;
270 :
271 1732767 : return uint16_t(ch) + info.lowerCase;
272 : }
273 :
274 : /* XML support functions */
275 :
276 : inline bool
277 9092 : IsXMLSpace(jschar ch)
278 : {
279 9092 : return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
280 : }
281 :
282 : inline bool
283 7760 : IsXMLNamespaceStart(jschar ch)
284 : {
285 7760 : if (ch == '_')
286 162 : return true;
287 :
288 7598 : return CharInfo(ch).isCombiningSpacingMark() || IsIdentifierStart(ch);
289 : }
290 :
291 : inline bool
292 27 : IsXMLNamespacePart(jschar ch)
293 : {
294 27 : if (ch == '.' || ch == '-' || ch == '_')
295 0 : return true;
296 :
297 27 : return CharInfo(ch).isEnclosingMark() || IsIdentifierPart(ch);
298 : }
299 :
300 : inline bool
301 : IsXMLNameStart(jschar ch)
302 : {
303 : if (ch == '_' || ch == ':')
304 : return true;
305 :
306 : return CharInfo(ch).isCombiningSpacingMark() || IsIdentifierStart(ch);
307 : }
308 :
309 : inline bool
310 13600 : IsXMLNamePart(jschar ch)
311 : {
312 13600 : if (ch == '.' || ch == '-' || ch == '_' || ch == ':')
313 0 : return true;
314 :
315 13600 : return CharInfo(ch).isEnclosingMark() || IsIdentifierPart(ch);
316 : }
317 :
318 :
319 : } /* namespace unicode */
320 : } /* namespace js */
321 :
322 : #endif /* Unicode_h__ */
|