1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is mozilla.org code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 2001
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : * Peter Annema <jaggernaut@netscape.com> (original author)
24 : *
25 : * Alternatively, the contents of this file may be used under the terms of
26 : * either of the GNU General Public License Version 2 or later (the "GPL"),
27 : * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 : * in which case the provisions of the GPL or the LGPL are applicable instead
29 : * of those above. If you wish to allow use of your version of this file only
30 : * under the terms of either the GPL or the LGPL, and not to allow others to
31 : * use your version of this file under the terms of the MPL, indicate your
32 : * decision by deleting the provisions above and replace them with the notice
33 : * and other provisions required by the GPL or the LGPL. If you do not delete
34 : * the provisions above, a recipient may use your version of this file under
35 : * the terms of any one of the MPL, the GPL or the LGPL.
36 : *
37 : * ***** END LICENSE BLOCK ***** */
38 : #ifndef nsUTF8Utils_h_
39 : #define nsUTF8Utils_h_
40 :
41 : // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
42 : // file will provide signatures for the Mozilla abstract string types. It will
43 : // use XPCOM assertion/debugging macros, etc.
44 :
45 : #include "nscore.h"
46 : #include "mozilla/SSE.h"
47 :
48 : #include "nsCharTraits.h"
49 :
/**
 * Classifies a single UTF-8 code unit by the pattern of its leading bits.
 * Every predicate looks only at the one byte it is given.
 */
class UTF8traits
{
public:
  // 0xxxxxxx: a complete one-byte (ASCII) character.
  static bool isASCII(char c) { return TopBits(c, 1) == 0x00; }
  // 10xxxxxx: a continuation byte inside a multi-byte sequence.
  static bool isInSeq(char c) { return TopBits(c, 2) == 0x02; }
  // 110xxxxx: lead byte of a two-byte sequence.
  static bool is2byte(char c) { return TopBits(c, 3) == 0x06; }
  // 1110xxxx: lead byte of a three-byte sequence.
  static bool is3byte(char c) { return TopBits(c, 4) == 0x0E; }
  // 11110xxx: lead byte of a four-byte sequence.
  static bool is4byte(char c) { return TopBits(c, 5) == 0x1E; }
  // 111110xx: lead byte of a five-byte sequence.
  static bool is5byte(char c) { return TopBits(c, 6) == 0x3E; }
  // 1111110x: lead byte of a six-byte sequence.
  static bool is6byte(char c) { return TopBits(c, 7) == 0x7E; }

private:
  // Returns the |count| most significant bits of the (8-bit) byte |c|,
  // right-aligned. The cast keeps the shift free of sign extension when
  // plain char is signed.
  static unsigned TopBits(char c, unsigned count)
  {
    return static_cast<unsigned char>(c) >> (8 - count);
  }
};
61 :
/**
 * Extract the next UCS-4 character from the buffer and return it. On
 * success the pointer passed in is advanced to the start of the next
 * character in the buffer. The err parameter must not be null: it is set
 * to false on success and to true (with 0 returned and the pointer left
 * unchanged) when the input is exhausted or is not well-formed. Overlong
 * sequences, surrogates and out-of-range code points are not reported as
 * errors; they are returned as UCS2_REPLACEMENT_CHAR.
 */
69 :
70 : class UTF8CharEnumerator
71 : {
72 : public:
73 32568304 : static PRUint32 NextChar(const char **buffer, const char *end,
74 : bool *err)
75 : {
76 32568304 : NS_ASSERTION(buffer && *buffer, "null buffer!");
77 :
78 32568304 : const char *p = *buffer;
79 32568304 : *err = false;
80 :
81 32568304 : if (p >= end)
82 : {
83 0 : *err = true;
84 :
85 0 : return 0;
86 : }
87 :
88 32568304 : char c = *p++;
89 :
90 32568304 : if ( UTF8traits::isASCII(c) )
91 : {
92 32357793 : *buffer = p;
93 32357793 : return c;
94 : }
95 :
96 : PRUint32 ucs4;
97 : PRUint32 minUcs4;
98 210511 : PRInt32 state = 0;
99 :
100 210511 : if (!CalcState(c, ucs4, minUcs4, state)) {
101 0 : NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
102 0 : *err = true;
103 :
104 0 : return 0;
105 : }
106 :
107 793781 : while ( state-- )
108 : {
109 372759 : if (p == end)
110 : {
111 0 : *err = true;
112 :
113 0 : return 0;
114 : }
115 :
116 372759 : c = *p++;
117 :
118 372759 : if (!AddByte(c, state, ucs4))
119 : {
120 0 : *err = true;
121 :
122 0 : return 0;
123 : }
124 : }
125 :
126 210511 : if ( ucs4 < minUcs4 )
127 : {
128 : // Overlong sequence
129 0 : ucs4 = UCS2_REPLACEMENT_CHAR;
130 : }
131 210511 : else if ( ucs4 >= 0xD800 &&
132 : (ucs4 <= 0xDFFF || ucs4 >= UCS_END))
133 : {
134 : // Surrogates and code points outside the Unicode range.
135 0 : ucs4 = UCS2_REPLACEMENT_CHAR;
136 : }
137 :
138 210511 : *buffer = p;
139 210511 : return ucs4;
140 : }
141 :
142 : private:
143 210511 : static bool CalcState(char c, PRUint32& ucs4, PRUint32& minUcs4,
144 : PRInt32& state)
145 : {
146 210511 : if ( UTF8traits::is2byte(c) )
147 : {
148 48263 : ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
149 48263 : state = 1;
150 48263 : minUcs4 = 0x00000080;
151 : }
152 162248 : else if ( UTF8traits::is3byte(c) )
153 : {
154 162248 : ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
155 162248 : state = 2;
156 162248 : minUcs4 = 0x00000800;
157 : }
158 0 : else if ( UTF8traits::is4byte(c) )
159 : {
160 0 : ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
161 0 : state = 3;
162 0 : minUcs4 = 0x00010000;
163 : }
164 0 : else if ( UTF8traits::is5byte(c) )
165 : {
166 0 : ucs4 = (PRUint32(c) << 24) & 0x03000000L;
167 0 : state = 4;
168 0 : minUcs4 = 0x00200000;
169 : }
170 0 : else if ( UTF8traits::is6byte(c) )
171 : {
172 0 : ucs4 = (PRUint32(c) << 30) & 0x40000000L;
173 0 : state = 5;
174 0 : minUcs4 = 0x04000000;
175 : }
176 : else
177 : {
178 0 : return false;
179 : }
180 :
181 210511 : return true;
182 : }
183 :
184 372759 : static bool AddByte(char c, PRInt32 state, PRUint32& ucs4)
185 : {
186 372759 : if ( UTF8traits::isInSeq(c) )
187 : {
188 372759 : PRInt32 shift = state * 6;
189 372759 : ucs4 |= (PRUint32(c) & 0x3F) << shift;
190 372759 : return true;
191 : }
192 :
193 0 : return false;
194 : }
195 : };
196 :
197 :
198 : /**
199 : * Extract the next UCS-4 character from the buffer and return it. The
200 : * pointer passed in is advanced to the start of the next character in the
201 : * buffer. If non-null, the err parameter is filled in if an error occurs.
202 : */
203 :
204 :
205 : class UTF16CharEnumerator
206 : {
207 : public:
208 0 : static PRUint32 NextChar(const PRUnichar **buffer, const PRUnichar *end,
209 : bool *err = nsnull)
210 : {
211 0 : NS_ASSERTION(buffer && *buffer, "null buffer!");
212 :
213 0 : const PRUnichar *p = *buffer;
214 :
215 0 : if (p >= end)
216 : {
217 0 : NS_ERROR("No input to work with");
218 0 : if (err)
219 0 : *err = true;
220 :
221 0 : return 0;
222 : }
223 :
224 0 : PRUnichar c = *p++;
225 :
226 0 : if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
227 : {
228 0 : if (err)
229 0 : *err = false;
230 0 : *buffer = p;
231 0 : return c;
232 : }
233 0 : else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
234 : {
235 0 : if (p == end)
236 : {
237 : // Found a high surrogate the end of the buffer. Flag this
238 : // as an error and return the Unicode replacement
239 : // character 0xFFFD.
240 :
241 0 : NS_WARNING("Unexpected end of buffer after high surrogate");
242 :
243 0 : if (err)
244 0 : *err = true;
245 0 : *buffer = p;
246 0 : return 0xFFFD;
247 : }
248 :
249 : // D800- DBFF - High Surrogate
250 0 : PRUnichar h = c;
251 :
252 0 : c = *p++;
253 :
254 0 : if (NS_IS_LOW_SURROGATE(c))
255 : {
256 : // DC00- DFFF - Low Surrogate
257 : // N = (H - D800) *400 + 10000 + (L - DC00)
258 0 : PRUint32 ucs4 = SURROGATE_TO_UCS4(h, c);
259 0 : if (err)
260 0 : *err = false;
261 0 : *buffer = p;
262 0 : return ucs4;
263 : }
264 : else
265 : {
266 : // Found a high surrogate followed by something other than
267 : // a low surrogate. Flag this as an error and return the
268 : // Unicode replacement character 0xFFFD. Note that the
269 : // pointer to the next character points to the second 16-bit
270 : // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
271 : // only the first code unit of an illegal sequence must be
272 : // treated as an illegally terminated code unit sequence
273 : // (also Chapter 3 D91, "isolated [not paired and ill-formed]
274 : // UTF-16 code units in the range D800..DFFF are ill-formed").
275 0 : NS_WARNING("got a High Surrogate but no low surrogate");
276 :
277 0 : if (err)
278 0 : *err = true;
279 0 : *buffer = p - 1;
280 0 : return 0xFFFD;
281 : }
282 : }
283 : else // U+DC00 - U+DFFF
284 : {
285 : // DC00- DFFF - Low Surrogate
286 :
287 : // Found a low surrogate w/o a preceding high surrogate. Flag
288 : // this as an error and return the Unicode replacement
289 : // character 0xFFFD.
290 :
291 0 : NS_WARNING("got a low Surrogate but no high surrogate");
292 0 : if (err)
293 0 : *err = true;
294 0 : *buffer = p;
295 0 : return 0xFFFD;
296 : }
297 :
298 : if (err)
299 : *err = true;
300 : return 0;
301 : }
302 : };
303 :
304 :
305 : /**
306 : * A character sink (see |copy_string| in nsAlgorithm.h) for converting
307 : * UTF-8 to UTF-16
308 : */
309 : class ConvertUTF8toUTF16
310 : {
311 : public:
312 : typedef char value_type;
313 : typedef PRUnichar buffer_type;
314 :
315 218840 : ConvertUTF8toUTF16( buffer_type* aBuffer )
316 218840 : : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false) {}
317 :
318 218840 : size_t Length() const { return mBuffer - mStart; }
319 :
320 223426 : bool ErrorEncountered() const { return mErrorEncountered; }
321 :
322 : void NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
323 : {
324 218840 : if ( mErrorEncountered )
325 : return;
326 :
327 : // algorithm assumes utf8 units won't
328 : // be spread across fragments
329 218840 : const value_type* p = start;
330 218840 : const value_type* end = start + N;
331 218840 : buffer_type* out = mBuffer;
332 32214847 : for ( ; p != end /* && *p */; )
333 : {
334 : bool err;
335 31996007 : PRUint32 ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
336 :
337 31996007 : if ( err )
338 : {
339 0 : mErrorEncountered = true;
340 0 : mBuffer = out;
341 : return;
342 : }
343 :
344 31996007 : if ( ucs4 >= PLANE1_BASE )
345 : {
346 0 : *out++ = (buffer_type)H_SURROGATE(ucs4);
347 0 : *out++ = (buffer_type)L_SURROGATE(ucs4);
348 : }
349 : else
350 : {
351 31996007 : *out++ = ucs4;
352 : }
353 : }
354 218840 : mBuffer = out;
355 : }
356 :
357 102931 : void write_terminator()
358 : {
359 102931 : *mBuffer = buffer_type(0);
360 102931 : }
361 :
362 : private:
363 : buffer_type* const mStart;
364 : buffer_type* mBuffer;
365 : bool mErrorEncountered;
366 : };
367 :
368 : /**
369 : * A character sink (see |copy_string| in nsAlgorithm.h) for computing
370 : * the length of the UTF-16 string equivalent to a UTF-8 string.
371 : */
372 : class CalculateUTF8Length
373 : {
374 : public:
375 : typedef char value_type;
376 :
377 215601 : CalculateUTF8Length() : mLength(0), mErrorEncountered(false) { }
378 :
379 421207 : size_t Length() const { return mLength; }
380 :
381 : void NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
382 : {
383 : // ignore any further requests
384 215601 : if ( mErrorEncountered )
385 : return;
386 :
387 : // algorithm assumes utf8 units won't
388 : // be spread across fragments
389 215601 : const value_type* p = start;
390 215601 : const value_type* end = start + N;
391 9265874 : for ( ; p < end /* && *p */; ++mLength )
392 : {
393 9265874 : if ( UTF8traits::isASCII(*p) )
394 9055586 : p += 1;
395 210288 : else if ( UTF8traits::is2byte(*p) )
396 48261 : p += 2;
397 162027 : else if ( UTF8traits::is3byte(*p) )
398 162027 : p += 3;
399 0 : else if ( UTF8traits::is4byte(*p) ) {
400 : // Because a UTF-8 sequence of 4 bytes represents a codepoint
401 : // greater than 0xFFFF, it will become a surrogate pair in the
402 : // UTF-16 string, so add 1 more to mLength.
403 : // This doesn't happen with is5byte and is6byte because they
404 : // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
405 : // converted to a single replacement character.
406 :
407 : // However, there is one case when a 4 byte UTF-8 sequence will
408 : // only generate 2 UTF-16 bytes. If we have a properly encoded
409 : // sequence, but with an invalid value (too small or too big),
410 : // that will result in a replacement character being written
411 : // This replacement character is encoded as just 1 single
412 : // UTF-16 character, which is 2 bytes.
413 :
414 : // The below code therefore only adds 1 to mLength if the UTF8
415 : // data will produce a decoded character which is greater than
416 : // or equal to 0x010000 and less than 0x0110000.
417 :
418 : // A 4byte UTF8 character is encoded as
419 : // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
420 : // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
421 : // map to bit 17-21 in the final result. If these bits are
422 : // between 0x01 and 0x11, that means that the final result is
423 : // between 0x010000 and 0x110000. The below code reads these
424 : // bits out and assigns them to c, but shifted up 4 bits to
425 : // avoid having to shift twice.
426 :
427 : // It doesn't matter what to do in the case where p + 4 > end
428 : // since no UTF16 characters will be written in that case by
429 : // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
430 : // any of the surrogate bits are wrong since no UTF16
431 : // characters will be written in that case either.
432 :
433 0 : if (p + 4 <= end) {
434 0 : PRUint32 c = ((PRUint32)(p[0] & 0x07)) << 6 |
435 0 : ((PRUint32)(p[1] & 0x30));
436 0 : if (c >= 0x010 && c < 0x110)
437 0 : ++mLength;
438 : }
439 :
440 0 : p += 4;
441 : }
442 0 : else if ( UTF8traits::is5byte(*p) )
443 0 : p += 5;
444 0 : else if ( UTF8traits::is6byte(*p) )
445 0 : p += 6;
446 : else // error
447 : {
448 0 : ++mLength; // to account for the decrement below
449 : break;
450 : }
451 : }
452 215601 : if ( p != end )
453 : {
454 0 : NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
455 0 : --mLength; // The last multi-byte char wasn't complete, discard it.
456 0 : mErrorEncountered = true;
457 : }
458 : }
459 :
460 : private:
461 : size_t mLength;
462 : bool mErrorEncountered;
463 : };
464 :
465 : /**
466 : * A character sink (see |copy_string| in nsAlgorithm.h) for
467 : * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
468 : * (0xEFBFBD in UTF-8).
469 : */
470 : class ConvertUTF16toUTF8
471 : {
472 : public:
473 : typedef PRUnichar value_type;
474 : typedef char buffer_type;
475 :
476 : // The error handling here is more lenient than that in
477 : // |ConvertUTF8toUTF16|, but it's that way for backwards
478 : // compatibility.
479 :
480 1379912 : ConvertUTF16toUTF8( buffer_type* aBuffer )
481 1379912 : : mStart(aBuffer), mBuffer(aBuffer) {}
482 :
483 1379912 : size_t Size() const { return mBuffer - mStart; }
484 :
485 : void NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
486 : {
487 1379912 : buffer_type *out = mBuffer; // gcc isn't smart enough to do this!
488 :
489 47953169 : for (const value_type *p = start, *end = start + N; p < end; ++p )
490 : {
491 46573257 : value_type c = *p;
492 46573257 : if (! (c & 0xFF80)) // U+0000 - U+007F
493 : {
494 46410211 : *out++ = (char)c;
495 : }
496 163046 : else if (! (c & 0xF800)) // U+0100 - U+07FF
497 : {
498 27864 : *out++ = 0xC0 | (char)(c >> 6);
499 27864 : *out++ = 0x80 | (char)(0x003F & c);
500 : }
501 135182 : else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
502 : {
503 135176 : *out++ = 0xE0 | (char)(c >> 12);
504 135176 : *out++ = 0x80 | (char)(0x003F & (c >> 6));
505 135176 : *out++ = 0x80 | (char)(0x003F & c );
506 : }
507 6 : else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
508 : {
509 : // D800- DBFF - High Surrogate
510 6 : value_type h = c;
511 :
512 6 : ++p;
513 6 : if (p == end)
514 : {
515 : // Treat broken characters as the Unicode
516 : // replacement character 0xFFFD (0xEFBFBD in
517 : // UTF-8)
518 0 : *out++ = '\xEF';
519 0 : *out++ = '\xBF';
520 0 : *out++ = '\xBD';
521 :
522 0 : NS_WARNING("String ending in half a surrogate pair!");
523 :
524 : break;
525 : }
526 6 : c = *p;
527 :
528 6 : if (NS_IS_LOW_SURROGATE(c))
529 : {
530 : // DC00- DFFF - Low Surrogate
531 : // N = (H - D800) *400 + 10000 + ( L - DC00 )
532 6 : PRUint32 ucs4 = SURROGATE_TO_UCS4(h, c);
533 :
534 : // 0001 0000-001F FFFF
535 6 : *out++ = 0xF0 | (char)(ucs4 >> 18);
536 6 : *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
537 6 : *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
538 6 : *out++ = 0x80 | (char)(0x003F & ucs4);
539 : }
540 : else
541 : {
542 : // Treat broken characters as the Unicode
543 : // replacement character 0xFFFD (0xEFBFBD in
544 : // UTF-8)
545 0 : *out++ = '\xEF';
546 0 : *out++ = '\xBF';
547 0 : *out++ = '\xBD';
548 :
549 : // The pointer to the next character points to the second
550 : // 16-bit value, not beyond it, as per Unicode 5.0.0
551 : // Chapter 3 C10, only the first code unit of an illegal
552 : // sequence must be treated as an illegally terminated
553 : // code unit sequence (also Chapter 3 D91, "isolated [not
554 : // paired and ill-formed] UTF-16 code units in the range
555 : // D800..DFFF are ill-formed").
556 0 : p--;
557 :
558 0 : NS_WARNING("got a High Surrogate but no low surrogate");
559 : }
560 : }
561 : else // U+DC00 - U+DFFF
562 : {
563 : // Treat broken characters as the Unicode replacement
564 : // character 0xFFFD (0xEFBFBD in UTF-8)
565 0 : *out++ = '\xEF';
566 0 : *out++ = '\xBF';
567 0 : *out++ = '\xBD';
568 :
569 : // DC00- DFFF - Low Surrogate
570 0 : NS_WARNING("got a low Surrogate but no high surrogate");
571 : }
572 : }
573 :
574 1379912 : mBuffer = out;
575 : }
576 :
577 2088 : void write_terminator()
578 : {
579 2088 : *mBuffer = buffer_type(0);
580 2088 : }
581 :
582 : private:
583 : buffer_type* const mStart;
584 : buffer_type* mBuffer;
585 : };
586 :
587 : /**
588 : * A character sink (see |copy_string| in nsAlgorithm.h) for computing
589 : * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid
590 : * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
591 : */
592 : class CalculateUTF8Size
593 : {
594 : public:
595 : typedef PRUnichar value_type;
596 :
597 1231520 : CalculateUTF8Size()
598 1231520 : : mSize(0) { }
599 :
600 1233608 : size_t Size() const { return mSize; }
601 :
602 : void NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
603 : {
604 : // Assume UCS2 surrogate pairs won't be spread across fragments.
605 47652730 : for (const value_type *p = start, *end = start + N; p < end; ++p )
606 : {
607 46421210 : value_type c = *p;
608 46421210 : if (! (c & 0xFF80)) // U+0000 - U+007F
609 46408790 : mSize += 1;
610 12420 : else if (! (c & 0xF800)) // U+0100 - U+07FF
611 865 : mSize += 2;
612 11555 : else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
613 11549 : mSize += 3;
614 6 : else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
615 : {
616 6 : ++p;
617 6 : if (p == end)
618 : {
619 : // Treat broken characters as the Unicode
620 : // replacement character 0xFFFD (0xEFBFBD in
621 : // UTF-8)
622 0 : mSize += 3;
623 :
624 0 : NS_WARNING("String ending in half a surrogate pair!");
625 :
626 : break;
627 : }
628 6 : c = *p;
629 :
630 6 : if (0xDC00 == (0xFC00 & c))
631 6 : mSize += 4;
632 : else
633 : {
634 : // Treat broken characters as the Unicode
635 : // replacement character 0xFFFD (0xEFBFBD in
636 : // UTF-8)
637 0 : mSize += 3;
638 :
639 : // The next code unit is the second 16-bit value, not
640 : // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
641 : // only the first code unit of an illegal sequence must
642 : // be treated as an illegally terminated code unit
643 : // sequence (also Chapter 3 D91, "isolated [not paired and
644 : // ill-formed] UTF-16 code units in the range D800..DFFF
645 : // are ill-formed").
646 0 : p--;
647 :
648 0 : NS_WARNING("got a high Surrogate but no low surrogate");
649 : }
650 : }
651 : else // U+DC00 - U+DFFF
652 : {
653 : // Treat broken characters as the Unicode replacement
654 : // character 0xFFFD (0xEFBFBD in UTF-8)
655 0 : mSize += 3;
656 :
657 0 : NS_WARNING("got a low Surrogate but no high surrogate");
658 : }
659 : }
660 : }
661 :
662 : private:
663 : size_t mSize;
664 : };
665 :
666 : #ifdef MOZILLA_INTERNAL_API
667 : /**
668 : * A character sink that performs a |reinterpret_cast|-style conversion
669 : * from char to PRUnichar.
670 : */
671 : class LossyConvertEncoding8to16
672 : {
673 : public:
674 : typedef char value_type;
675 : typedef char input_type;
676 : typedef PRUnichar output_type;
677 :
678 : public:
679 1352778 : LossyConvertEncoding8to16( PRUnichar* aDestination ) :
680 1352778 : mDestination(aDestination) { }
681 :
682 : void
683 1352778 : write( const char* aSource, PRUint32 aSourceLength )
684 : {
685 : #ifdef MOZILLA_MAY_SUPPORT_SSE2
686 1352778 : if (mozilla::supports_sse2())
687 : {
688 1352778 : write_sse2(aSource, aSourceLength);
689 1352778 : return;
690 : }
691 : #endif
692 0 : const char* done_writing = aSource + aSourceLength;
693 0 : while ( aSource < done_writing )
694 0 : *mDestination++ = (PRUnichar)(unsigned char)(*aSource++);
695 : }
696 :
697 : void
698 : write_sse2( const char* aSource, PRUint32 aSourceLength );
699 :
700 : void
701 768686 : write_terminator()
702 : {
703 768686 : *mDestination = (PRUnichar)(0);
704 768686 : }
705 :
706 : private:
707 : PRUnichar* mDestination;
708 : };
709 :
710 : /**
711 : * A character sink that performs a |reinterpret_cast|-style conversion
712 : * from PRUnichar to char.
713 : */
714 : class LossyConvertEncoding16to8
715 : {
716 : public:
717 : typedef PRUnichar value_type;
718 : typedef PRUnichar input_type;
719 : typedef char output_type;
720 :
721 26553 : LossyConvertEncoding16to8( char* aDestination ) : mDestination(aDestination) { }
722 :
723 : void
724 26553 : write( const PRUnichar* aSource, PRUint32 aSourceLength)
725 : {
726 : #ifdef MOZILLA_MAY_SUPPORT_SSE2
727 26553 : if (mozilla::supports_sse2())
728 : {
729 26553 : write_sse2(aSource, aSourceLength);
730 26553 : return;
731 : }
732 : #endif
733 0 : const PRUnichar* done_writing = aSource + aSourceLength;
734 0 : while ( aSource < done_writing )
735 0 : *mDestination++ = (char)(*aSource++);
736 : }
737 :
738 : #ifdef MOZILLA_MAY_SUPPORT_SSE2
739 : void
740 : write_sse2( const PRUnichar* aSource, PRUint32 aSourceLength );
741 : #endif
742 :
743 : void
744 36 : write_terminator()
745 : {
746 36 : *mDestination = '\0';
747 36 : }
748 :
749 : private:
750 : char *mDestination;
751 : };
752 : #endif // MOZILLA_INTERNAL_API
753 :
754 : #endif /* !defined(nsUTF8Utils_h_) */
|