1 : /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is Mozilla Corporation code.
16 : *
17 : * The Initial Developer of the Original Code is Mozilla Corporation.
18 : * Portions created by the Initial Developer are Copyright (C) 2009-2010
19 : * the Initial Developer. All Rights Reserved.
20 : *
21 : * Contributor(s):
22 : * Jonathan Kew <jfkthame@gmail.com>
23 : *
24 : * Alternatively, the contents of this file may be used under the terms of
25 : * either the GNU General Public License Version 2 or later (the "GPL"), or
26 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 : * in which case the provisions of the GPL or the LGPL are applicable instead
28 : * of those above. If you wish to allow use of your version of this file only
29 : * under the terms of either the GPL or the LGPL, and not to allow others to
30 : * use your version of this file under the terms of the MPL, indicate your
31 : * decision by deleting the provisions above and replace them with the notice
32 : * and other provisions required by the GPL or the LGPL. If you do not delete
33 : * the provisions above, a recipient may use your version of this file under
34 : * the terms of any one of the MPL, the GPL or the LGPL.
35 : *
36 : * ***** END LICENSE BLOCK ***** */
37 :
38 : #include "nsUnicodeProperties.h"
39 : #include "nsUnicodeScriptCodes.h"
40 : #include "nsUnicodePropertyData.cpp"
41 :
42 : #include "mozilla/Util.h"
43 : #include "nsMemory.h"
44 : #include "nsCharTraits.h"
45 :
46 : #include "harfbuzz/hb-unicode.h"
47 :
48 : #define UNICODE_BMP_LIMIT 0x10000
49 : #define UNICODE_LIMIT 0x110000
50 :
51 : namespace mozilla {
52 :
53 : namespace unicode {
54 :
55 : /*
56 : To store properties for a million Unicode codepoints compactly, we use
57 : a three-level array structure, with the Unicode values considered as
58 : three elements: Plane, Page, and Char.
59 :
60 : Space optimization happens because multiple Planes can refer to the same
61 : Page array, and multiple Pages can refer to the same Char array holding
62 : the actual values. In practice, most of the higher planes are empty and
63 : thus share the same data; and within the BMP, there are also many pages
64 : that repeat the same data for any given property.
65 :
66 : Plane is usually zero, so we skip a lookup in this case, and require
67 : that the Plane 0 pages are always the first set of entries in the Page
68 : array.
69 :
70 : The division of the remaining 16 bits into Page and Char fields is
71 : adjusted for each property (by experiment using the generation tool)
72 : to provide the most compact storage, depending on the distribution
73 : of values.
74 : */
75 :
76 : nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = {
77 : /*
78 : * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
79 : * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-common.h.
80 : */
81 : /* CONTROL */ nsIUGenCategory::kOther,
82 : /* FORMAT */ nsIUGenCategory::kOther,
83 : /* UNASSIGNED */ nsIUGenCategory::kOther,
84 : /* PRIVATE_USE */ nsIUGenCategory::kOther,
85 : /* SURROGATE */ nsIUGenCategory::kOther,
86 : /* LOWERCASE_LETTER */ nsIUGenCategory::kLetter,
87 : /* MODIFIER_LETTER */ nsIUGenCategory::kLetter,
88 : /* OTHER_LETTER */ nsIUGenCategory::kLetter,
89 : /* TITLECASE_LETTER */ nsIUGenCategory::kLetter,
90 : /* UPPERCASE_LETTER */ nsIUGenCategory::kLetter,
91 : /* COMBINING_MARK */ nsIUGenCategory::kMark,
92 : /* ENCLOSING_MARK */ nsIUGenCategory::kMark,
93 : /* NON_SPACING_MARK */ nsIUGenCategory::kMark,
94 : /* DECIMAL_NUMBER */ nsIUGenCategory::kNumber,
95 : /* LETTER_NUMBER */ nsIUGenCategory::kNumber,
96 : /* OTHER_NUMBER */ nsIUGenCategory::kNumber,
97 : /* CONNECT_PUNCTUATION */ nsIUGenCategory::kPunctuation,
98 : /* DASH_PUNCTUATION */ nsIUGenCategory::kPunctuation,
99 : /* CLOSE_PUNCTUATION */ nsIUGenCategory::kPunctuation,
100 : /* FINAL_PUNCTUATION */ nsIUGenCategory::kPunctuation,
101 : /* INITIAL_PUNCTUATION */ nsIUGenCategory::kPunctuation,
102 : /* OTHER_PUNCTUATION */ nsIUGenCategory::kPunctuation,
103 : /* OPEN_PUNCTUATION */ nsIUGenCategory::kPunctuation,
104 : /* CURRENCY_SYMBOL */ nsIUGenCategory::kSymbol,
105 : /* MODIFIER_SYMBOL */ nsIUGenCategory::kSymbol,
106 : /* MATH_SYMBOL */ nsIUGenCategory::kSymbol,
107 : /* OTHER_SYMBOL */ nsIUGenCategory::kSymbol,
108 : /* LINE_SEPARATOR */ nsIUGenCategory::kSeparator,
109 : /* PARAGRAPH_SEPARATOR */ nsIUGenCategory::kSeparator,
110 : /* SPACE_SEPARATOR */ nsIUGenCategory::kSeparator
111 : };
112 :
113 : PRUint32
114 0 : GetMirroredChar(PRUint32 aCh)
115 : {
116 : // all mirrored chars are in plane 0
117 0 : if (aCh < UNICODE_BMP_LIMIT) {
118 0 : int v = sMirrorValues[sMirrorPages[0][aCh >> kMirrorCharBits]]
119 0 : [aCh & ((1 << kMirrorCharBits) - 1)];
120 : // The mirror value is stored as either an offset (if less than
121 : // kSmallMirrorOffset) from the input character code, or as
122 : // an index into the sDistantMirrors list. This allows the
123 : // mirrored codes to be stored as 8-bit values, as most of them
124 : // are references to nearby character codes.
125 0 : if (v < kSmallMirrorOffset) {
126 0 : return aCh + v;
127 : }
128 0 : return sDistantMirrors[v - kSmallMirrorOffset];
129 : }
130 0 : return aCh;
131 : }
132 :
133 : PRUint8
134 0 : GetCombiningClass(PRUint32 aCh)
135 : {
136 0 : if (aCh < UNICODE_BMP_LIMIT) {
137 0 : return sCClassValues[sCClassPages[0][aCh >> kCClassCharBits]]
138 0 : [aCh & ((1 << kCClassCharBits) - 1)];
139 : }
140 0 : if (aCh < UNICODE_LIMIT) {
141 0 : return sCClassValues[sCClassPages[sCClassPlanes[(aCh >> 16) - 1]]
142 0 : [(aCh & 0xffff) >> kCClassCharBits]]
143 0 : [aCh & ((1 << kCClassCharBits) - 1)];
144 : }
145 0 : NS_NOTREACHED("invalid Unicode character!");
146 0 : return 0;
147 : }
148 :
149 : PRUint8
150 0 : GetGeneralCategory(PRUint32 aCh)
151 : {
152 0 : if (aCh < UNICODE_BMP_LIMIT) {
153 0 : return sCatEAWValues[sCatEAWPages[0][aCh >> kCatEAWCharBits]]
154 0 : [aCh & ((1 << kCatEAWCharBits) - 1)].mCategory;
155 : }
156 0 : if (aCh < UNICODE_LIMIT) {
157 0 : return sCatEAWValues[sCatEAWPages[sCatEAWPlanes[(aCh >> 16) - 1]]
158 0 : [(aCh & 0xffff) >> kCatEAWCharBits]]
159 0 : [aCh & ((1 << kCatEAWCharBits) - 1)].mCategory;
160 : }
161 0 : NS_NOTREACHED("invalid Unicode character!");
162 0 : return PRUint8(HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED);
163 : }
164 :
165 : PRUint8
166 0 : GetEastAsianWidth(PRUint32 aCh)
167 : {
168 0 : if (aCh < UNICODE_BMP_LIMIT) {
169 0 : return sCatEAWValues[sCatEAWPages[0][aCh >> kCatEAWCharBits]]
170 0 : [aCh & ((1 << kCatEAWCharBits) - 1)].mEAW;
171 : }
172 0 : if (aCh < UNICODE_LIMIT) {
173 0 : return sCatEAWValues[sCatEAWPages[sCatEAWPlanes[(aCh >> 16) - 1]]
174 0 : [(aCh & 0xffff) >> kCatEAWCharBits]]
175 0 : [aCh & ((1 << kCatEAWCharBits) - 1)].mEAW;
176 : }
177 0 : NS_NOTREACHED("invalid Unicode character!");
178 0 : return 0;
179 : }
180 :
181 : PRInt32
182 0 : GetScriptCode(PRUint32 aCh)
183 : {
184 0 : if (aCh < UNICODE_BMP_LIMIT) {
185 0 : return sScriptValues[sScriptPages[0][aCh >> kScriptCharBits]]
186 0 : [aCh & ((1 << kScriptCharBits) - 1)];
187 : }
188 0 : if (aCh < UNICODE_LIMIT) {
189 0 : return sScriptValues[sScriptPages[sScriptPlanes[(aCh >> 16) - 1]]
190 0 : [(aCh & 0xffff) >> kScriptCharBits]]
191 0 : [aCh & ((1 << kScriptCharBits) - 1)];
192 : }
193 0 : NS_NOTREACHED("invalid Unicode character!");
194 0 : return MOZ_SCRIPT_UNKNOWN;
195 : }
196 :
197 : PRUint32
198 0 : GetScriptTagForCode(PRInt32 aScriptCode)
199 : {
200 : // this will safely return 0 for negative script codes, too :)
201 0 : if (PRUint32(aScriptCode) > ArrayLength(sScriptCodeToTag)) {
202 0 : return 0;
203 : }
204 0 : return sScriptCodeToTag[aScriptCode];
205 : }
206 :
207 : HSType
208 0 : GetHangulSyllableType(PRUint32 aCh)
209 : {
210 : // all Hangul chars are in plane 0
211 0 : if (aCh < UNICODE_BMP_LIMIT) {
212 0 : return HSType(sHangulValues[sHangulPages[0][aCh >> kHangulCharBits]]
213 0 : [aCh & ((1 << kHangulCharBits) - 1)]);
214 : }
215 0 : return HST_NONE;
216 : }
217 :
218 : bool
219 0 : IsClusterExtender(PRUint32 aCh, PRUint8 aCategory)
220 : {
221 : return ((aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
222 : aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
223 : (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ
224 0 : (aCh >= 0xff9e && aCh <= 0xff9f)); // katakana sound marks
225 : }
226 :
227 : // TODO: replace this with a properties file or similar;
228 : // expect this to evolve as harfbuzz shaping support matures.
229 : //
230 : // The "shaping type" of each script run, as returned by this
231 : // function, is compared to the bits set in the
232 : // gfx.font_rendering.harfbuzz.scripts
233 : // preference to decide whether to use the harfbuzz shaper.
234 : //
235 : PRInt32
236 0 : ScriptShapingType(PRInt32 aScriptCode)
237 : {
238 0 : switch (aScriptCode) {
239 : default:
240 0 : return SHAPING_DEFAULT; // scripts not explicitly listed here are
241 : // assumed to just use default shaping
242 :
243 : case MOZ_SCRIPT_ARABIC:
244 : case MOZ_SCRIPT_SYRIAC:
245 : case MOZ_SCRIPT_NKO:
246 : case MOZ_SCRIPT_MANDAIC:
247 0 : return SHAPING_ARABIC; // bidi scripts with Arabic-style shaping
248 :
249 : case MOZ_SCRIPT_HEBREW:
250 0 : return SHAPING_HEBREW;
251 :
252 : case MOZ_SCRIPT_HANGUL:
253 0 : return SHAPING_HANGUL;
254 :
255 : case MOZ_SCRIPT_MONGOLIAN: // to be supported by the Arabic shaper?
256 0 : return SHAPING_MONGOLIAN;
257 :
258 : case MOZ_SCRIPT_THAI: // no complex OT features, but MS engines like to do
259 : // sequence checking
260 0 : return SHAPING_THAI;
261 :
262 : case MOZ_SCRIPT_BENGALI:
263 : case MOZ_SCRIPT_DEVANAGARI:
264 : case MOZ_SCRIPT_GUJARATI:
265 : case MOZ_SCRIPT_GURMUKHI:
266 : case MOZ_SCRIPT_KANNADA:
267 : case MOZ_SCRIPT_MALAYALAM:
268 : case MOZ_SCRIPT_ORIYA:
269 : case MOZ_SCRIPT_SINHALA:
270 : case MOZ_SCRIPT_TAMIL:
271 : case MOZ_SCRIPT_TELUGU:
272 : case MOZ_SCRIPT_KHMER:
273 : case MOZ_SCRIPT_LAO:
274 : case MOZ_SCRIPT_TIBETAN:
275 : case MOZ_SCRIPT_NEW_TAI_LUE:
276 : case MOZ_SCRIPT_TAI_LE:
277 : case MOZ_SCRIPT_MYANMAR:
278 : case MOZ_SCRIPT_PHAGS_PA:
279 : case MOZ_SCRIPT_BATAK:
280 : case MOZ_SCRIPT_BRAHMI:
281 0 : return SHAPING_INDIC; // scripts that require Indic or other "special" shaping
282 : }
283 : }
284 :
285 : void
286 0 : ClusterIterator::Next()
287 : {
288 0 : if (AtEnd()) {
289 0 : NS_WARNING("ClusterIterator has already reached the end");
290 0 : return;
291 : }
292 :
293 0 : PRUint32 ch = *mPos++;
294 :
295 0 : if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit &&
296 : NS_IS_LOW_SURROGATE(*mPos)) {
297 0 : ch = SURROGATE_TO_UCS4(ch, *mPos++);
298 0 : } else if ((ch & ~0xff) == 0x1100 ||
299 : (ch >= 0xa960 && ch <= 0xa97f) ||
300 : (ch >= 0xac00 && ch <= 0xd7ff)) {
301 : // Handle conjoining Jamo that make Hangul syllables
302 0 : HSType hangulState = GetHangulSyllableType(ch);
303 0 : while (mPos < mLimit) {
304 0 : ch = *mPos;
305 0 : HSType hangulType = GetHangulSyllableType(ch);
306 0 : switch (hangulType) {
307 : case HST_L:
308 : case HST_LV:
309 : case HST_LVT:
310 0 : if (hangulState == HST_L) {
311 0 : hangulState = hangulType;
312 0 : mPos++;
313 0 : continue;
314 : }
315 0 : break;
316 : case HST_V:
317 0 : if ((hangulState != HST_NONE) && !(hangulState & HST_T)) {
318 0 : hangulState = hangulType;
319 0 : mPos++;
320 0 : continue;
321 : }
322 0 : break;
323 : case HST_T:
324 0 : if (hangulState & (HST_V | HST_T)) {
325 0 : hangulState = hangulType;
326 0 : mPos++;
327 0 : continue;
328 : }
329 0 : break;
330 : default:
331 0 : break;
332 : }
333 0 : break;
334 : }
335 : }
336 :
337 0 : while (mPos < mLimit) {
338 0 : ch = *mPos;
339 :
340 : // Check for surrogate pairs; note that isolated surrogates will just
341 : // be treated as generic (non-cluster-extending) characters here,
342 : // which is fine for cluster-iterating purposes
343 0 : if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 &&
344 0 : NS_IS_LOW_SURROGATE(*(mPos + 1))) {
345 0 : ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
346 : }
347 :
348 0 : if (!IsClusterExtender(ch)) {
349 0 : break;
350 : }
351 :
352 0 : mPos++;
353 0 : if (!IS_IN_BMP(ch)) {
354 0 : mPos++;
355 : }
356 : }
357 :
358 0 : NS_ASSERTION(mText < mPos && mPos <= mLimit,
359 : "ClusterIterator::Next has overshot the string!");
360 : }
361 :
362 : } // end namespace unicode
363 :
364 : } // end namespace mozilla
|