1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is mozilla.org code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 1998
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : *
24 : * Alternatively, the contents of this file may be used under the terms of
25 : * either of the GNU General Public License Version 2 or later (the "GPL"),
26 : * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 : * in which case the provisions of the GPL or the LGPL are applicable instead
28 : * of those above. If you wish to allow use of your version of this file only
29 : * under the terms of either the GPL or the LGPL, and not to allow others to
30 : * use your version of this file under the terms of the MPL, indicate your
31 : * decision by deleting the provisions above and replace them with the notice
32 : * and other provisions required by the GPL or the LGPL. If you do not delete
33 : * the provisions above, a recipient may use your version of this file under
34 : * the terms of any one of the MPL, the GPL or the LGPL.
35 : *
36 : * ***** END LICENSE BLOCK ***** */
37 :
38 : #include "nsUnicodeRange.h"
39 : #include "nsIAtom.h"
40 : #include "gfxAtoms.h"
41 :
42 : // This table depends on unicode range definitions.
43 : // Each item's index must correspond unicode range value
44 : // eg. x-cyrillic = LangGroupTable[kRangeCyrillic]
45 : static nsIAtom **gUnicodeRangeToLangGroupAtomTable[] =
46 : {
47 : &gfxAtoms::x_cyrillic,
48 : &gfxAtoms::el,
49 : &gfxAtoms::tr,
50 : &gfxAtoms::he,
51 : &gfxAtoms::ar,
52 : &gfxAtoms::x_baltic,
53 : &gfxAtoms::th,
54 : &gfxAtoms::ko,
55 : &gfxAtoms::ja,
56 : &gfxAtoms::zh_cn,
57 : &gfxAtoms::zh_tw,
58 : &gfxAtoms::x_devanagari,
59 : &gfxAtoms::x_tamil,
60 : &gfxAtoms::x_armn,
61 : &gfxAtoms::x_beng,
62 : &gfxAtoms::x_cans,
63 : &gfxAtoms::x_ethi,
64 : &gfxAtoms::x_geor,
65 : &gfxAtoms::x_gujr,
66 : &gfxAtoms::x_guru,
67 : &gfxAtoms::x_khmr,
68 : &gfxAtoms::x_mlym,
69 : &gfxAtoms::x_orya,
70 : &gfxAtoms::x_telu,
71 : &gfxAtoms::x_knda,
72 : &gfxAtoms::x_sinh,
73 : &gfxAtoms::x_tibt
74 : };
75 :
76 : /**********************************************************************
77 : * Unicode subranges as defined in unicode 3.0
78 : * x-western, x-central-euro, tr, x-baltic -> latin
79 : * 0000 - 036f
80 : * 1e00 - 1eff
81 : * 2000 - 206f (general punctuation)
82 : * 20a0 - 20cf (currency symbols)
83 : * 2100 - 214f (letterlike symbols)
84 : * 2150 - 218f (Number Forms)
85 : * el -> greek
86 : * 0370 - 03ff
87 : * 1f00 - 1fff
88 : * x-cyrillic -> cyrillic
89 : * 0400 - 04ff
90 : * he -> hebrew
91 : * 0590 - 05ff
92 : * ar -> arabic
93 : * 0600 - 06ff
94 : * fb50 - fdff (arabic presentation forms)
95 : * fe70 - feff (arabic presentation forms b)
96 : * th - thai
97 : * 0e00 - 0e7f
98 : * ko -> korean
99 : * ac00 - d7af (hangul Syllables)
100 : * 1100 - 11ff (jamo)
101 : * 3130 - 318f (hangul compatibility jamo)
102 : * ja
103 : * 3040 - 309f (hiragana)
104 : * 30a0 - 30ff (katakana)
105 : * zh-CN
106 : * zh-TW
107 : *
108 : * CJK
109 : * 3100 - 312f (bopomofo)
110 : * 31a0 - 31bf (bopomofo extended)
111 : * 3000 - 303f (CJK Symbols and Punctuation)
112 : * 2e80 - 2eff (CJK radicals supplement)
113 : * 2f00 - 2fdf (Kangxi Radicals)
114 : * 2ff0 - 2fff (Ideographic Description Characters)
115 : * 3190 - 319f (kanbun)
116 : * 3200 - 32ff (Enclosed CJK letters and Months)
117 : * 3300 - 33ff (CJK compatibility)
118 : * 3400 - 4dbf (CJK Unified Ideographs Extension A)
119 : * 4e00 - 9faf (CJK Unified Ideographs)
120 : * f900 - fa5f (CJK Compatibility Ideographs)
121 : * fe30 - fe4f (CJK compatibility Forms)
122 : * ff00 - ffef (halfwidth and fullwidth forms)
123 : *
124 : * Armenian
125 : * 0530 - 058f
126 : * Syriac
127 : * 0700 - 074f
128 : * Thaana
129 : * 0780 - 07bf
130 : * Devanagari
131 : * 0900 - 097f
132 : * Bengali
133 : * 0980 - 09ff
134 : * Gurmukhi
135 : * 0a00 - 0a7f
136 : * Gujarati
137 : * 0a80 - 0aff
138 : * Oriya
139 : * 0b00 - 0b7f
140 : * Tamil
141 : * 0b80 - 0bff
142 : * Telugu
143 : * 0c00 - 0c7f
144 : * Kannada
145 : * 0c80 - 0cff
146 : * Malayalam
147 : * 0d00 - 0d7f
148 : * Sinhala
149 : * 0d80 - 0def
150 : * Lao
151 : * 0e80 - 0eff
152 : * Tibetan
153 : * 0f00 - 0fbf
154 : * Myanmar
155 : * 1000 - 109f
156 : * Georgian
157 : * 10a0 - 10ff
158 : * Ethiopic
159 : * 1200 - 137f
160 : * Cherokee
161 : * 13a0 - 13ff
162 : * Canadian Aboriginal Syllabics
163 : * 1400 - 167f
164 : * Ogham
165 : * 1680 - 169f
166 : * Runic
167 : * 16a0 - 16ff
168 : * Khmer
169 : * 1780 - 17ff
170 : * Mongolian
171 : * 1800 - 18af
172 : * Misc - superscripts and subscripts
173 : * 2070 - 209f
174 : * Misc - Combining Diacritical Marks for Symbols
175 : * 20d0 - 20ff
176 : * Misc - Arrows
177 : * 2190 - 21ff
178 : * Misc - Mathematical Operators
179 : * 2200 - 22ff
180 : * Misc - Miscellaneous Technical
181 : * 2300 - 23ff
182 : * Misc - Control picture
183 : * 2400 - 243f
184 : * Misc - Optical character recognition
185 : * 2440 - 245f
186 : * Misc - Enclosed Alphanumerics
187 : * 2460 - 24ff
188 : * Misc - Box Drawing
189 : * 2500 - 257f
190 : * Misc - Block Elements
191 : * 2580 - 259f
192 : * Misc - Geometric Shapes
193 : * 25a0 - 25ff
194 : * Misc - Miscellaneous Symbols
195 : * 2600 - 267f
196 : * Misc - Dingbats
197 : * 2700 - 27bf
198 : * Misc - Braille Patterns
199 : * 2800 - 28ff
200 : * Yi Syllables
201 : * a000 - a48f
202 : * Yi radicals
203 : * a490 - a4cf
204 : * Alphabetic Presentation Forms
205 : * fb00 - fb4f
206 : * Misc - Combining half Marks
207 : * fe20 - fe2f
208 : * Misc - small form variants
209 : * fe50 - fe6f
210 : * Misc - Specials
211 : * fff0 - ffff
212 : *********************************************************************/
213 :
214 :
215 :
216 : #define NUM_OF_SUBTABLES 10
217 : #define SUBTABLE_SIZE 16
218 :
219 : static const PRUint8 gUnicodeSubrangeTable[NUM_OF_SUBTABLES][SUBTABLE_SIZE] =
220 : {
221 : { // table for X---
222 : kRangeTableBase+1, //u0xxx
223 : kRangeTableBase+2, //u1xxx
224 : kRangeTableBase+3, //u2xxx
225 : kRangeSetCJK, //u3xxx
226 : kRangeSetCJK, //u4xxx
227 : kRangeSetCJK, //u5xxx
228 : kRangeSetCJK, //u6xxx
229 : kRangeSetCJK, //u7xxx
230 : kRangeSetCJK, //u8xxx
231 : kRangeSetCJK, //u9xxx
232 : kRangeTableBase+4, //uaxxx
233 : kRangeKorean, //ubxxx
234 : kRangeKorean, //ucxxx
235 : kRangeTableBase+5, //udxxx
236 : kRangePrivate, //uexxx
237 : kRangeTableBase+6 //ufxxx
238 : },
239 : { //table for 0X--
240 : kRangeSetLatin, //u00xx
241 : kRangeSetLatin, //u01xx
242 : kRangeSetLatin, //u02xx
243 : kRangeGreek, //u03xx XXX 0300-036f is in fact kRangeCombiningDiacriticalMarks
244 : kRangeCyrillic, //u04xx
245 : kRangeTableBase+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
246 : kRangeArabic, //u06xx
247 : kRangeTertiaryTable, //u07xx
248 : kRangeUnassigned, //u08xx
249 : kRangeTertiaryTable, //u09xx
250 : kRangeTertiaryTable, //u0axx
251 : kRangeTertiaryTable, //u0bxx
252 : kRangeTertiaryTable, //u0cxx
253 : kRangeTertiaryTable, //u0dxx
254 : kRangeTertiaryTable, //u0exx
255 : kRangeTibetan //u0fxx
256 : },
257 : { //table for 1x--
258 : kRangeTertiaryTable, //u10xx
259 : kRangeKorean, //u11xx
260 : kRangeEthiopic, //u12xx
261 : kRangeTertiaryTable, //u13xx
262 : kRangeCanadian, //u14xx
263 : kRangeCanadian, //u15xx
264 : kRangeTertiaryTable, //u16xx
265 : kRangeKhmer, //u17xx
266 : kRangeMongolian, //u18xx
267 : kRangeUnassigned, //u19xx
268 : kRangeUnassigned, //u1axx
269 : kRangeUnassigned, //u1bxx
270 : kRangeUnassigned, //u1cxx
271 : kRangeUnassigned, //u1dxx
272 : kRangeSetLatin, //u1exx
273 : kRangeGreek //u1fxx
274 : },
275 : { //table for 2x--
276 : kRangeSetLatin, //u20xx
277 : kRangeSetLatin, //u21xx
278 : kRangeMathOperators, //u22xx
279 : kRangeMiscTechnical, //u23xx
280 : kRangeControlOpticalEnclose, //u24xx
281 : kRangeBoxBlockGeometrics, //u25xx
282 : kRangeMiscSymbols, //u26xx
283 : kRangeDingbats, //u27xx
284 : kRangeBraillePattern, //u28xx
285 : kRangeUnassigned, //u29xx
286 : kRangeUnassigned, //u2axx
287 : kRangeUnassigned, //u2bxx
288 : kRangeUnassigned, //u2cxx
289 : kRangeUnassigned, //u2dxx
290 : kRangeSetCJK, //u2exx
291 : kRangeSetCJK //u2fxx
292 : },
293 : { //table for ax--
294 : kRangeYi, //ua0xx
295 : kRangeYi, //ua1xx
296 : kRangeYi, //ua2xx
297 : kRangeYi, //ua3xx
298 : kRangeYi, //ua4xx
299 : kRangeUnassigned, //ua5xx
300 : kRangeUnassigned, //ua6xx
301 : kRangeUnassigned, //ua7xx
302 : kRangeUnassigned, //ua8xx
303 : kRangeUnassigned, //ua9xx
304 : kRangeUnassigned, //uaaxx
305 : kRangeUnassigned, //uabxx
306 : kRangeKorean, //uacxx
307 : kRangeKorean, //uadxx
308 : kRangeKorean, //uaexx
309 : kRangeKorean //uafxx
310 : },
311 : { //table for dx--
312 : kRangeKorean, //ud0xx
313 : kRangeKorean, //ud1xx
314 : kRangeKorean, //ud2xx
315 : kRangeKorean, //ud3xx
316 : kRangeKorean, //ud4xx
317 : kRangeKorean, //ud5xx
318 : kRangeKorean, //ud6xx
319 : kRangeKorean, //ud7xx
320 : kRangeSurrogate, //ud8xx
321 : kRangeSurrogate, //ud9xx
322 : kRangeSurrogate, //udaxx
323 : kRangeSurrogate, //udbxx
324 : kRangeSurrogate, //udcxx
325 : kRangeSurrogate, //uddxx
326 : kRangeSurrogate, //udexx
327 : kRangeSurrogate //udfxx
328 : },
329 : { // table for fx--
330 : kRangePrivate, //uf0xx
331 : kRangePrivate, //uf1xx
332 : kRangePrivate, //uf2xx
333 : kRangePrivate, //uf3xx
334 : kRangePrivate, //uf4xx
335 : kRangePrivate, //uf5xx
336 : kRangePrivate, //uf6xx
337 : kRangePrivate, //uf7xx
338 : kRangePrivate, //uf8xx
339 : kRangeSetCJK, //uf9xx
340 : kRangeSetCJK, //ufaxx
341 : kRangeArabic, //ufbxx, includes alphabic presentation form
342 : kRangeArabic, //ufcxx
343 : kRangeArabic, //ufdxx
344 : kRangeTableBase+8, //ufexx
345 : kRangeTableBase+9 //uffxx, halfwidth and fullwidth forms, includes Specials
346 : },
347 : { //table for 0x0500 - 0x05ff
348 : kRangeCyrillic, //u050x
349 : kRangeCyrillic, //u051x
350 : kRangeCyrillic, //u052x
351 : kRangeArmenian, //u053x
352 : kRangeArmenian, //u054x
353 : kRangeArmenian, //u055x
354 : kRangeArmenian, //u056x
355 : kRangeArmenian, //u057x
356 : kRangeArmenian, //u058x
357 : kRangeHebrew, //u059x
358 : kRangeHebrew, //u05ax
359 : kRangeHebrew, //u05bx
360 : kRangeHebrew, //u05cx
361 : kRangeHebrew, //u05dx
362 : kRangeHebrew, //u05ex
363 : kRangeHebrew //u05fx
364 : },
365 : { //table for 0xfe00 - 0xfeff
366 : kRangeSetCJK, //ufe0x
367 : kRangeSetCJK, //ufe1x
368 : kRangeSetCJK, //ufe2x
369 : kRangeSetCJK, //ufe3x
370 : kRangeSetCJK, //ufe4x
371 : kRangeSetCJK, //ufe5x
372 : kRangeSetCJK, //ufe6x
373 : kRangeArabic, //ufe7x
374 : kRangeArabic, //ufe8x
375 : kRangeArabic, //ufe9x
376 : kRangeArabic, //ufeax
377 : kRangeArabic, //ufebx
378 : kRangeArabic, //ufecx
379 : kRangeArabic, //ufedx
380 : kRangeArabic, //ufeex
381 : kRangeArabic //ufefx
382 : },
383 : { //table for 0xff00 - 0xffff
384 : kRangeSetCJK, //uff0x, fullwidth latin
385 : kRangeSetCJK, //uff1x, fullwidth latin
386 : kRangeSetCJK, //uff2x, fullwidth latin
387 : kRangeSetCJK, //uff3x, fullwidth latin
388 : kRangeSetCJK, //uff4x, fullwidth latin
389 : kRangeSetCJK, //uff5x, fullwidth latin
390 : kRangeSetCJK, //uff6x, halfwidth katakana
391 : kRangeSetCJK, //uff7x, halfwidth katakana
392 : kRangeSetCJK, //uff8x, halfwidth katakana
393 : kRangeSetCJK, //uff9x, halfwidth katakana
394 : kRangeSetCJK, //uffax, halfwidth hangul jamo
395 : kRangeSetCJK, //uffbx, halfwidth hangul jamo
396 : kRangeSetCJK, //uffcx, halfwidth hangul jamo
397 : kRangeSetCJK, //uffdx, halfwidth hangul jamo
398 : kRangeSetCJK, //uffex, fullwidth symbols
399 : kRangeSpecials, //ufffx, Specials
400 : },
401 : };
402 :
403 : // Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80)
404 : // code points so that the number of entries in the tertiary range
405 : // table for that range is obtained by dividing (0x1700 - 0x0700) by 128.
406 : // Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal
407 : // syllabaries take multiple chunks and Ogham and Runic share a single chunk.
408 : #define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80)
409 :
410 : static const PRUint8 gUnicodeTertiaryRangeTable[TERTIARY_TABLE_SIZE] =
411 : { //table for 0x0700 - 0x1600
412 : kRangeSyriac, //u070x
413 : kRangeThaana, //u078x
414 : kRangeUnassigned, //u080x place holder(resolved in the 2ndary tab.)
415 : kRangeUnassigned, //u088x place holder(resolved in the 2ndary tab.)
416 : kRangeDevanagari, //u090x
417 : kRangeBengali, //u098x
418 : kRangeGurmukhi, //u0a0x
419 : kRangeGujarati, //u0a8x
420 : kRangeOriya, //u0b0x
421 : kRangeTamil, //u0b8x
422 : kRangeTelugu, //u0c0x
423 : kRangeKannada, //u0c8x
424 : kRangeMalayalam, //u0d0x
425 : kRangeSinhala, //u0d8x
426 : kRangeThai, //u0e0x
427 : kRangeLao, //u0e8x
428 : kRangeTibetan, //u0f0x place holder(resolved in the 2ndary tab.)
429 : kRangeTibetan, //u0f8x place holder(resolved in the 2ndary tab.)
430 : kRangeMyanmar, //u100x
431 : kRangeGeorgian, //u108x
432 : kRangeKorean, //u110x place holder(resolved in the 2ndary tab.)
433 : kRangeKorean, //u118x place holder(resolved in the 2ndary tab.)
434 : kRangeEthiopic, //u120x place holder(resolved in the 2ndary tab.)
435 : kRangeEthiopic, //u128x place holder(resolved in the 2ndary tab.)
436 : kRangeEthiopic, //u130x
437 : kRangeCherokee, //u138x
438 : kRangeCanadian, //u140x place holder(resolved in the 2ndary tab.)
439 : kRangeCanadian, //u148x place holder(resolved in the 2ndary tab.)
440 : kRangeCanadian, //u150x place holder(resolved in the 2ndary tab.)
441 : kRangeCanadian, //u158x place holder(resolved in the 2ndary tab.)
442 : kRangeCanadian, //u160x
443 : kRangeOghamRunic //u168x this contains two scripts, Ogham & Runic
444 : };
445 :
446 : // A two level index is almost enough for locating a range, with the
447 : // exception of u03xx and u05xx. Since we don't really care about range for
448 : // combining diacritical marks in our font application, they are
449 : // not discriminated further. But future adoption of this module for other use
450 : // should be aware of this limitation. The implementation can be extended if
451 : // there is such a need.
452 : // For Indic, Southeast Asian scripts and some other scripts between
453 : // U+0700 and U+16FF, it's extended to the third level.
454 0 : PRUint32 FindCharUnicodeRange(PRUint32 ch)
455 : {
456 : PRUint32 range;
457 :
458 : // aggregate ranges for non-BMP codepoints
459 0 : if (ch > 0xFFFF) {
460 0 : PRUint32 p = (ch >> 16);
461 0 : if (p == 1) {
462 0 : return kRangeSMP;
463 0 : } else if (p == 2) {
464 0 : return kRangeSetCJK;
465 : }
466 0 : return kRangeHigherPlanes;
467 : }
468 :
469 : // lookup explicit range for BMP codepoints
470 : // first general range
471 0 : range = gUnicodeSubrangeTable[0][ch >> 12];
472 :
473 : // if general range is good enough, return that
474 0 : if (range < kRangeTableBase)
475 : // we try to get a specific range
476 0 : return range;
477 :
478 : // otherwise, use subrange tables
479 0 : range = gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x0f00) >> 8];
480 0 : if (range < kRangeTableBase)
481 0 : return range;
482 0 : if (range < kRangeTertiaryTable)
483 0 : return gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x00f0) >> 4];
484 :
485 : // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks
486 0 : return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7];
487 : }
488 :
489 0 : nsIAtom *LangGroupFromUnicodeRange(PRUint8 unicodeRange)
490 : {
491 0 : if (kRangeSpecificItemNum > unicodeRange) {
492 0 : nsIAtom **atom = gUnicodeRangeToLangGroupAtomTable[unicodeRange];
493 0 : return *atom;
494 : }
495 0 : return nsnull;
496 : }
|