1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is Unicode case conversion helpers.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corp..
19 : * Portions created by the Initial Developer are Copyright (C) 2002
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : * Alec Flett <alecf@netscape.com>
24 : * Benjamin Smedberg <benjamin@smedbergs.us>
25 : * Ben Turner <mozilla@songbirdnest.com>
26 : *
27 : * Alternatively, the contents of this file may be used under the terms of
28 : * either the GNU General Public License Version 2 or later (the "GPL"), or
29 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 : * in which case the provisions of the GPL or the LGPL are applicable instead
31 : * of those above. If you wish to allow use of your version of this file only
32 : * under the terms of either the GPL or the LGPL, and not to allow others to
33 : * use your version of this file under the terms of the MPL, indicate your
34 : * decision by deleting the provisions above and replace them with the notice
35 : * and other provisions required by the GPL or the LGPL. If you do not delete
36 : * the provisions above, a recipient may use your version of this file under
37 : * the terms of any one of the MPL, the GPL or the LGPL.
38 : *
39 : * ***** END LICENSE BLOCK ***** */
40 :
41 : #include "nsUnicharUtils.h"
42 : #include "nsUnicharUtilCIID.h"
43 :
44 : #include "nsCRT.h"
45 : #include "nsICaseConversion.h"
46 : #include "nsServiceManagerUtils.h"
47 : #include "nsXPCOMStrings.h"
48 : #include "casetable.h"
49 : #include "nsUTF8Utils.h"
50 : #include "nsHashKeys.h"
51 :
52 : #include <ctype.h>
53 :
54 : // Column indices into gUpperToTitle, which is read as (upper-case, title-case) pairs: entry i lives at gUpperToTitle[i*2 + kUpperIdx] and gUpperToTitle[i*2 + kTitleIdx].
55 : enum {
56 : kUpperIdx =0,
57 : kTitleIdx
58 : };
59 :
60 : // Column indices into the three-unit-wide rows of the compressed case tables read through nsCompressedMap::mTable (gToUpper / gToLower).
// kLowIdx:       first character covered by the row.
// kSizeEveryIdx: high byte = offset from the first to the last covered
//                character; low byte = "every" stride (0 means every
//                character in the range is mapped).
// kDiffIdx:      value added to a character to convert it.
61 : enum {
62 : kLowIdx =0,
63 : kSizeEveryIdx,
64 : kDiffIdx
65 : };
66 :
// ASCII classification helpers. They are plain range/equality tests on the
// numeric value of u; note that each macro may evaluate its argument more
// than once.
67 : #define IS_ASCII(u) ((u) < 0x80)
68 : #define IS_ASCII_UPPER(u) (('A' <= (u)) && ( (u) <= 'Z' ))
69 : #define IS_ASCII_LOWER(u) (('a' <= (u)) && ( (u) <= 'z'))
70 : #define IS_ASCII_ALPHA(u) (IS_ASCII_UPPER(u) || IS_ASCII_LOWER(u))
71 : #define IS_ASCII_SPACE(u) ( ' ' == (u) )
72 :
// True when the bit for u's 256-character block is clear in gCaseBlocks.
// gCaseBlocks is indexed by (u >> 13) and bit ((u >> 8) & 0x1F) selects the
// block inside that element. Callers in this file return such characters
// unchanged without consulting the case tables.
73 : #define IS_NOCASE_CHAR(u) (0==(1&(gCaseBlocks[(u)>>13]>>(0x001F&((u)>>8)))))
74 :
75 : // Size of the per-map result cache (nsCompressedMap::mCache) and the mask that turns a character into a cache slot.
76 :
77 : // Changing these numbers may break UTF-8 caching. Be careful!
78 : #define CASE_MAP_CACHE_SIZE 0x100
79 : #define CASE_MAP_CACHE_MASK 0xFF
80 :
81 : struct nsCompressedMap {
82 : const PRUnichar *mTable;
83 : PRUint32 mSize;
84 : PRUint32 mCache[CASE_MAP_CACHE_SIZE];
85 : PRUint32 mLastBase;
86 :
87 75924 : PRUnichar Map(PRUnichar aChar)
88 : {
89 : // We don't need explicit locking here since the cached values are int32s,
90 : // which are read and written atomically. The following code is threadsafe
91 : // because we never access bits from mCache directly -- we always first
92 : // read the entire entry into a local variable and then mask off the bits
93 : // we're interested in.
94 :
95 : // Check the 256-byte cache first and bail with our answer if we can.
96 75924 : PRUint32 cachedData = mCache[aChar & CASE_MAP_CACHE_MASK];
97 75924 : if (aChar == ((cachedData >> 16) & 0x0000FFFF))
98 74088 : return cachedData & 0x0000FFFF;
99 :
100 : // Now try the last index we looked up, storing it into a local variable
101 : // for thread-safety.
102 1836 : PRUint32 base = mLastBase;
103 1836 : PRUnichar res = 0;
104 :
105 : // Does this character fit in the slot?
106 4301 : if ((aChar <= ((mTable[base+kSizeEveryIdx] >> 8) +
107 1836 : mTable[base+kLowIdx])) &&
108 629 : (mTable[base+kLowIdx] <= aChar)) {
109 :
110 : // This character uses the same base as our last lookup, so the
111 : // conversion is easy.
112 460 : if (((mTable[base+kSizeEveryIdx] & 0x00FF) > 0) &&
113 115 : (0 != ((aChar - mTable[base+kLowIdx]) %
114 115 : (mTable[base+kSizeEveryIdx] & 0x00FF))))
115 : {
116 11 : res = aChar;
117 : } else {
118 104 : res = aChar + mTable[base+kDiffIdx];
119 : }
120 :
121 : } else {
122 : // Do the full lookup.
123 1721 : res = this->Lookup(0, mSize/2, mSize-1, aChar);
124 : }
125 :
126 : // Cache the result and return.
127 1836 : mCache[aChar & CASE_MAP_CACHE_MASK] =
128 1836 : ((aChar << 16) & 0xFFFF0000) | (0x0000FFFF & res);
129 1836 : return res;
130 : }
131 :
132 : // Takes as arguments the left bound, middle, right bound, and character to
133 : // search for. Executes a binary search.
134 12340 : PRUnichar Lookup(PRUint32 l,
135 : PRUint32 m,
136 : PRUint32 r,
137 : PRUnichar aChar)
138 : {
139 12340 : PRUint32 base = m*3; // Every line in the table is 3 units wide.
140 :
141 : // Is aChar past the top of the current table entry? (The upper byte of
142 : // the 'every' entry contains the offset to the end of this entry.)
143 24680 : if (aChar > ((mTable[base+kSizeEveryIdx] >> 8) +
144 12340 : mTable[base+kLowIdx]))
145 : {
146 4273 : if (l > m || l == r)
147 875 : return aChar;
148 : // Advance one round.
149 3398 : PRUint32 newm = (m+r+1)/2;
150 3398 : if (newm == m)
151 0 : newm++;
152 3398 : return this->Lookup(m+1, newm, r, aChar);
153 :
154 : // Is aChar below the bottom of the current table entry?
155 8067 : } else if (mTable[base+kLowIdx] > aChar) {
156 7292 : if (r < m || l == r)
157 71 : return aChar;
158 : // Advance one round
159 7221 : PRUint32 newm = (l+m-1)/2;
160 7221 : if(newm == m)
161 0 : newm++;
162 7221 : return this->Lookup(l, newm, m-1, aChar);
163 :
164 : // We've found the entry aChar should live in.
165 : } else {
166 : // Determine if aChar falls in a gap. (The lower byte of the 'every'
167 : // entry contains n for which every nth character from the base is a
168 : // character of interest.)
169 2319 : if (((mTable[base+kSizeEveryIdx] & 0x00FF) > 0) &&
170 772 : (0 != ((aChar - mTable[base+kLowIdx]) %
171 772 : (mTable[base+kSizeEveryIdx] & 0x00FF))))
172 : {
173 50 : return aChar;
174 : }
175 : // If aChar doesn't fall in the gap, cache and convert.
176 725 : mLastBase = base;
177 725 : return aChar + mTable[base+kDiffIdx];
178 : }
179 : }
180 : };
181 :
// Map used for upper-casing, backed by the gToUpper table (gToUpperItems
// rows).  Only mTable and mSize are given initializers; as a static
// aggregate the remaining members (mCache, mLastBase) start out zeroed.
182 : static nsCompressedMap gUpperMap = {
183 : reinterpret_cast<const PRUnichar*>(&gToUpper[0]),
184 : gToUpperItems
185 : };
186 :
// Map used for lower-casing, backed by the gToLower table (gToLowerItems
// rows).  Only mTable and mSize are given initializers; as a static
// aggregate the remaining members (mCache, mLastBase) start out zeroed.
187 : static nsCompressedMap gLowerMap = {
188 : reinterpret_cast<const PRUnichar*>(&gToLower[0]),
189 : gToLowerItems
190 : };
191 :
192 : // We want ToLowerCase(PRUnichar) and ToLowerCaseASCII(PRUnichar) to be fast
193 : // when they're called from within the case-insensitive comparators, so we
194 : // define inlined versions.
195 : static NS_ALWAYS_INLINE PRUnichar
196 0 : ToLowerCase_inline(PRUnichar aChar)
197 : {
198 753205 : if (IS_ASCII(aChar)) {
199 706863 : return gASCIIToLower[aChar];
200 46342 : } else if (IS_NOCASE_CHAR(aChar)) {
201 5702 : return aChar;
202 : }
203 :
204 40640 : return gLowerMap.Map(aChar);
205 : }
206 :
207 : static NS_ALWAYS_INLINE PRUnichar
208 0 : ToLowerCaseASCII_inline(const PRUnichar aChar)
209 : {
210 0 : if (IS_ASCII(aChar))
211 0 : return gASCIIToLower[aChar];
212 0 : return aChar;
213 : }
214 :
215 : void
216 47693 : ToLowerCase(nsAString& aString)
217 : {
218 47693 : PRUnichar *buf = aString.BeginWriting();
219 47693 : ToLowerCase(buf, buf, aString.Length());
220 47693 : }
221 :
222 : void
223 0 : ToLowerCase(const nsAString& aSource,
224 : nsAString& aDest)
225 : {
226 : const PRUnichar *in;
227 : PRUnichar *out;
228 0 : PRUint32 len = NS_StringGetData(aSource, &in);
229 0 : NS_StringGetMutableData(aDest, len, &out);
230 0 : NS_ASSERTION(out, "Uh...");
231 0 : ToLowerCase(in, out, len);
232 0 : }
233 :
234 : PRUnichar
235 0 : ToLowerCaseASCII(const PRUnichar aChar)
236 : {
237 0 : return ToLowerCaseASCII_inline(aChar);
238 : }
239 :
240 : void
241 32046 : ToUpperCase(nsAString& aString)
242 : {
243 32046 : PRUnichar *buf = aString.BeginWriting();
244 32046 : ToUpperCase(buf, buf, aString.Length());
245 32046 : }
246 :
247 : void
248 0 : ToUpperCase(const nsAString& aSource,
249 : nsAString& aDest)
250 : {
251 : const PRUnichar *in;
252 : PRUnichar *out;
253 0 : PRUint32 len = NS_StringGetData(aSource, &in);
254 0 : NS_StringGetMutableData(aDest, len, &out);
255 0 : NS_ASSERTION(out, "Uh...");
256 0 : ToUpperCase(in, out, len);
257 0 : }
258 :
259 : #ifdef MOZILLA_INTERNAL_API
260 :
261 : PRInt32
262 12120 : nsCaseInsensitiveStringComparator::operator()(const PRUnichar* lhs,
263 : const PRUnichar* rhs,
264 : PRUint32 lLength,
265 : PRUint32 rLength) const
266 : {
267 : return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) :
268 12120 : (lLength > rLength) ? 1 : -1;
269 : }
270 :
271 : PRInt32
272 0 : nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs,
273 : const char* rhs,
274 : PRUint32 lLength,
275 : PRUint32 rLength) const
276 : {
277 0 : return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
278 : }
279 :
280 : PRInt32
281 0 : nsASCIICaseInsensitiveStringComparator::operator()(const PRUnichar* lhs,
282 : const PRUnichar* rhs,
283 : PRUint32 lLength,
284 : PRUint32 rLength) const
285 : {
286 0 : if (lLength != rLength) {
287 0 : if (lLength > rLength)
288 0 : return 1;
289 0 : return -1;
290 : }
291 :
292 0 : while (rLength) {
293 0 : PRUnichar l = *lhs++;
294 0 : PRUnichar r = *rhs++;
295 0 : if (l != r) {
296 0 : l = ToLowerCaseASCII_inline(l);
297 0 : r = ToLowerCaseASCII_inline(r);
298 :
299 0 : if (l > r)
300 0 : return 1;
301 0 : else if (r > l)
302 0 : return -1;
303 : }
304 0 : rLength--;
305 : }
306 :
307 0 : return 0;
308 : }
309 :
310 : #endif // MOZILLA_INTERNAL_API
311 :
312 : PRUnichar
313 731143 : ToLowerCase(PRUnichar aChar)
314 : {
315 731143 : return ToLowerCase_inline(aChar);
316 : }
317 :
318 : void
319 69765 : ToLowerCase(const PRUnichar *aIn, PRUnichar *aOut, PRUint32 aLen)
320 : {
321 705702 : for (PRUint32 i = 0; i < aLen; i++) {
322 635937 : aOut[i] = ToLowerCase(aIn[i]);
323 : }
324 69765 : }
325 :
326 : PRUnichar
327 230793 : ToUpperCase(PRUnichar aChar)
328 : {
329 230793 : if (IS_ASCII(aChar)) {
330 195372 : if (IS_ASCII_LOWER(aChar))
331 156830 : return aChar - 0x20;
332 : else
333 38542 : return aChar;
334 35421 : } else if (IS_NOCASE_CHAR(aChar)) {
335 137 : return aChar;
336 : }
337 :
338 35284 : return gUpperMap.Map(aChar);
339 : }
340 :
341 : void
342 32046 : ToUpperCase(const PRUnichar *aIn, PRUnichar *aOut, PRUint32 aLen)
343 : {
344 179632 : for (PRUint32 i = 0; i < aLen; i++) {
345 147586 : aOut[i] = ToUpperCase(aIn[i]);
346 : }
347 32046 : }
348 :
349 : PRUnichar
350 0 : ToTitleCase(PRUnichar aChar)
351 : {
352 0 : if (IS_ASCII(aChar)) {
353 0 : return ToUpperCase(aChar);
354 0 : } else if (IS_NOCASE_CHAR(aChar)) {
355 0 : return aChar;
356 : }
357 :
358 : // First check for uppercase characters whose titlecase mapping is
359 : // different, like U+01F1 DZ: they must remain unchanged.
360 0 : if (0x01C0 == (aChar & 0xFFC0)) {
361 0 : for (PRUint32 i = 0; i < gUpperToTitleItems; i++) {
362 0 : if (aChar == gUpperToTitle[(i*2)+kUpperIdx]) {
363 0 : return aChar;
364 : }
365 : }
366 : }
367 :
368 0 : PRUnichar upper = gUpperMap.Map(aChar);
369 :
370 0 : if (0x01C0 == ( upper & 0xFFC0)) {
371 0 : for (PRUint32 i = 0 ; i < gUpperToTitleItems; i++) {
372 0 : if (upper == gUpperToTitle[(i*2)+kUpperIdx]) {
373 0 : return gUpperToTitle[(i*2)+kTitleIdx];
374 : }
375 : }
376 : }
377 :
378 0 : return upper;
379 : }
380 :
381 : PRInt32
382 12120 : CaseInsensitiveCompare(const PRUnichar *a,
383 : const PRUnichar *b,
384 : PRUint32 len)
385 : {
386 12120 : NS_ASSERTION(a && b, "Do not pass in invalid pointers!");
387 :
388 12120 : if (len) {
389 6915 : do {
390 17783 : PRUnichar c1 = *a++;
391 17783 : PRUnichar c2 = *b++;
392 :
393 17783 : if (c1 != c2) {
394 11031 : c1 = ToLowerCase_inline(c1);
395 11031 : c2 = ToLowerCase_inline(c2);
396 11031 : if (c1 != c2) {
397 10868 : if (c1 < c2) {
398 8290 : return -1;
399 : }
400 2578 : return 1;
401 : }
402 : }
403 : } while (--len != 0);
404 : }
405 1252 : return 0;
406 : }
407 :
408 : // Calculates the codepoint of the UTF8 sequence starting at aStr. Sets aNext
409 : // to the byte following the end of the sequence.
410 : //
411 : // If the sequence is invalid, or if computing the codepoint would take us off
412 : // the end of the string (as marked by aEnd), returns -1 and does not set
413 : // aNext. Note that this function doesn't check that aStr < aEnd -- it assumes
414 : // you've done that already.
415 : static NS_ALWAYS_INLINE PRUint32
416 0 : GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext)
417 : {
418 : // Convert to unsigned char so that stuffing chars into PRUint32s doesn't
419 : // sign extend.
420 48036 : const unsigned char *str = (unsigned char*)aStr;
421 :
422 48036 : if (UTF8traits::isASCII(str[0])) {
423 : // It's ASCII; just convert to lower-case and return it.
424 47592 : *aNext = aStr + 1;
425 47592 : return gASCIIToLower[*str];
426 : }
427 444 : if (UTF8traits::is2byte(str[0]) && NS_LIKELY(aStr + 1 < aEnd)) {
428 : // It's a two-byte sequence, so it looks like
429 : // 110XXXXX 10XXXXXX.
430 : // This is definitely in the BMP, so we can store straightaway into a
431 : // PRUint16.
432 :
433 : PRUint16 c;
434 0 : c = (str[0] & 0x1F) << 6;
435 0 : c += (str[1] & 0x3F);
436 :
437 0 : if (!IS_NOCASE_CHAR(c))
438 0 : c = gLowerMap.Map(c);
439 :
440 0 : *aNext = aStr + 2;
441 0 : return c;
442 : }
443 444 : if (UTF8traits::is3byte(str[0]) && NS_LIKELY(aStr + 2 < aEnd)) {
444 : // It's a three-byte sequence, so it looks like
445 : // 1110XXXX 10XXXXXX 10XXXXXX.
446 : // This will just barely fit into 16-bits, so store into a PRUint16.
447 :
448 : PRUint16 c;
449 444 : c = (str[0] & 0x0F) << 12;
450 444 : c += (str[1] & 0x3F) << 6;
451 444 : c += (str[2] & 0x3F);
452 :
453 444 : if (!IS_NOCASE_CHAR(c))
454 0 : c = gLowerMap.Map(c);
455 :
456 444 : *aNext = aStr + 3;
457 444 : return c;
458 : }
459 0 : if (UTF8traits::is4byte(str[0]) && NS_LIKELY(aStr + 3 < aEnd)) {
460 : // It's a four-byte sequence, so it looks like
461 : // 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.
462 : // Unless this is an overlong sequence, the codepoint it encodes definitely
463 : // isn't in the BMP, so we don't bother trying to convert it to lower-case.
464 :
465 : PRUint32 c;
466 0 : c = (str[0] & 0x07) << 18;
467 0 : c += (str[1] & 0x3F) << 12;
468 0 : c += (str[2] & 0x3F) << 6;
469 0 : c += (str[3] & 0x3F);
470 :
471 0 : *aNext = aStr + 4;
472 0 : return c;
473 : }
474 :
475 : // Hm, we don't understand this sequence.
476 0 : return -1;
477 : }
478 :
479 0 : PRInt32 CaseInsensitiveCompare(const char *aLeft,
480 : const char *aRight,
481 : PRUint32 aLeftBytes,
482 : PRUint32 aRightBytes)
483 : {
484 0 : const char *leftEnd = aLeft + aLeftBytes;
485 0 : const char *rightEnd = aRight + aRightBytes;
486 :
487 0 : while (aLeft < leftEnd && aRight < rightEnd) {
488 0 : PRUint32 leftChar = GetLowerUTF8Codepoint(aLeft, leftEnd, &aLeft);
489 0 : if (NS_UNLIKELY(leftChar == PRUint32(-1)))
490 0 : return -1;
491 :
492 0 : PRUint32 rightChar = GetLowerUTF8Codepoint(aRight, rightEnd, &aRight);
493 0 : if (NS_UNLIKELY(rightChar == PRUint32(-1)))
494 0 : return -1;
495 :
496 : // Now leftChar and rightChar are lower-case, so we can compare them.
497 0 : if (leftChar != rightChar) {
498 0 : if (leftChar > rightChar)
499 0 : return 1;
500 0 : return -1;
501 : }
502 : }
503 :
504 : // Make sure that if one string is longer than the other we return the
505 : // correct result.
506 0 : if (aLeft < leftEnd)
507 0 : return 1;
508 0 : if (aRight < rightEnd)
509 0 : return -1;
510 :
511 0 : return 0;
512 : }
513 :
514 : bool
515 24018 : CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
516 : const char* aLeftEnd, const char* aRightEnd,
517 : const char** aLeftNext, const char** aRightNext,
518 : bool* aErr)
519 : {
520 24018 : NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
521 24018 : NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
522 24018 : NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
523 24018 : NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
524 24018 : NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");
525 :
526 : PRUint32 leftChar = GetLowerUTF8Codepoint(aLeft, aLeftEnd, aLeftNext);
527 24018 : if (NS_UNLIKELY(leftChar == PRUint32(-1))) {
528 0 : *aErr = true;
529 0 : return false;
530 : }
531 :
532 : PRUint32 rightChar = GetLowerUTF8Codepoint(aRight, aRightEnd, aRightNext);
533 24018 : if (NS_UNLIKELY(rightChar == PRUint32(-1))) {
534 0 : *aErr = true;
535 0 : return false;
536 : }
537 :
538 : // Can't have an error past this point.
539 24018 : *aErr = false;
540 :
541 24018 : return leftChar == rightChar;
542 : }
543 :
544 : namespace mozilla {
545 :
546 : PRUint32
547 69442 : HashUTF8AsUTF16(const char* aUTF8, PRUint32 aLength, bool* aErr)
548 : {
549 69442 : PRUint32 hash = 0;
550 69442 : const char* s = aUTF8;
551 69442 : const char* end = aUTF8 + aLength;
552 :
553 69442 : *aErr = false;
554 :
555 711181 : while (s < end)
556 : {
557 572297 : PRUint32 ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr);
558 572297 : if (*aErr) {
559 0 : return 0;
560 : }
561 :
562 572297 : if (ucs4 < PLANE1_BASE) {
563 572297 : hash = AddToHash(hash, ucs4);
564 : }
565 : else {
566 0 : hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));
567 : }
568 : }
569 :
570 69442 : return hash;
571 : }
572 :
573 : } // namespace mozilla
|