1 : /* ***** BEGIN LICENSE BLOCK *****
2 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 : *
4 : * The contents of this file are subject to the Mozilla Public License Version
5 : * 1.1 (the "License"); you may not use this file except in compliance with
6 : * the License. You may obtain a copy of the License at
7 : * http://www.mozilla.org/MPL/
8 : *
9 : * Software distributed under the License is distributed on an "AS IS" basis,
10 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 : * for the specific language governing rights and limitations under the
12 : * License.
13 : *
14 : * The Original Code is Mozilla.
15 : *
16 : * The Initial Developer of the Original Code is
17 : * Netscape Communications Corporation.
18 : * Portions created by the Initial Developer are Copyright (C) 2002
19 : * the Initial Developer. All Rights Reserved.
20 : *
21 : * Contributor(s):
22 : * Darin Fisher <darin@netscape.com>
23 : * Brian Stell <bstell@ix.netcom.com>
24 : * Frank Tang <ftang@netscape.com>
25 : * Brendan Eich <brendan@mozilla.org>
26 : * Sergei Dolgov <sergei_d@fi.fi.tartu.ee>
27 : * Jungshik Shin <jshin@i18nl10n.com>
28 : *
29 : * Alternatively, the contents of this file may be used under the terms of
30 : * either the GNU General Public License Version 2 or later (the "GPL"), or
31 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
32 : * in which case the provisions of the GPL or the LGPL are applicable instead
33 : * of those above. If you wish to allow use of your version of this file only
34 : * under the terms of either the GPL or the LGPL, and not to allow others to
35 : * use your version of this file under the terms of the MPL, indicate your
36 : * decision by deleting the provisions above and replace them with the notice
37 : * and other provisions required by the GPL or the LGPL. If you do not delete
38 : * the provisions above, a recipient may use your version of this file under
39 : * the terms of any one of the MPL, the GPL or the LGPL.
40 : *
41 : * ***** END LICENSE BLOCK ***** */
42 :
43 : #include "xpcom-private.h"
44 :
45 : //-----------------------------------------------------------------------------
46 : // XP_MACOSX or ANDROID
47 : //-----------------------------------------------------------------------------
48 : #if defined(XP_MACOSX) || defined(ANDROID)
49 :
50 : #include "nsAString.h"
51 : #include "nsReadableUtils.h"
52 : #include "nsString.h"
53 :
54 : nsresult
55 : NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
56 : {
57 : CopyUTF8toUTF16(input, output);
58 : return NS_OK;
59 : }
60 :
61 : nsresult
62 : NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
63 : {
64 : CopyUTF16toUTF8(input, output);
65 : return NS_OK;
66 : }
67 :
// Startup hook.  Nothing to set up on these platforms; intentionally empty.
void
NS_StartupNativeCharsetUtils()
{
    // no-op
}
72 :
// Shutdown hook.  Nothing to tear down on these platforms; intentionally empty.
void
NS_ShutdownNativeCharsetUtils()
{
    // no-op
}
77 :
78 :
79 : //-----------------------------------------------------------------------------
80 : // XP_UNIX
81 : //-----------------------------------------------------------------------------
82 : #elif defined(XP_UNIX)
83 :
84 : #include <stdlib.h> // mbtowc, wctomb
85 : #include <locale.h> // setlocale
86 : #include "mozilla/Mutex.h"
87 : #include "nscore.h"
88 : #include "nsAString.h"
89 : #include "nsReadableUtils.h"
90 :
91 : using namespace mozilla;
92 :
93 : //
94 : // choose a conversion library. we used to use mbrtowc/wcrtomb under Linux,
95 : // but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
96 : // or not (see bug 206811 and
97 : // news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use
98 : // iconv for all platforms where nl_types.h and langinfo.h are present
99 : // along with iconv.
100 : //
// Pick the conversion backend.  Exactly one of USE_ICONV / USE_STDCONV ends
// up defined; each one enables one of the two nsNativeCharsetConverter
// implementations further down in this file.
101 : #if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
102 : #define USE_ICONV 1
103 : #else
// iconv (or one of its supporting headers) is unavailable: use the
// mb[r]towc/wc[r]tomb based converter instead.
104 : #define USE_STDCONV 1
105 : #endif
106 :
107 : static void
108 0 : isolatin1_to_utf16(const char **input, PRUint32 *inputLeft, PRUnichar **output, PRUint32 *outputLeft)
109 : {
110 0 : while (*inputLeft && *outputLeft) {
111 0 : **output = (unsigned char) **input;
112 0 : (*input)++;
113 0 : (*inputLeft)--;
114 0 : (*output)++;
115 0 : (*outputLeft)--;
116 : }
117 0 : }
118 :
119 : static void
120 0 : utf16_to_isolatin1(const PRUnichar **input, PRUint32 *inputLeft, char **output, PRUint32 *outputLeft)
121 : {
122 0 : while (*inputLeft && *outputLeft) {
123 0 : **output = (unsigned char) **input;
124 0 : (*input)++;
125 0 : (*inputLeft)--;
126 0 : (*output)++;
127 0 : (*outputLeft)--;
128 : }
129 0 : }
130 :
131 : //-----------------------------------------------------------------------------
132 : // conversion using iconv
133 : //-----------------------------------------------------------------------------
134 : #if defined(USE_ICONV)
135 : #include <nl_types.h> // CODESET
136 : #include <langinfo.h> // nl_langinfo
137 : #include <iconv.h> // iconv_open, iconv, iconv_close
138 : #include <errno.h>
139 : #include "plstr.h"
140 :
// ICONV_INPUT adapts a |const char **| input argument to whatever pointer
// type the platform's iconv() prototype declares for its input parameter.
141 : #if defined(HAVE_ICONV_WITH_CONST_INPUT)
142 : #define ICONV_INPUT(x) (x)
143 : #else
144 : #define ICONV_INPUT(x) ((char **)x)
145 : #endif
146 :
147 : // solaris definitely needs this, but we'll enable it by default
148 : // just in case... but we know for sure that iconv(3) in glibc
149 : // doesn't need this.
150 : #if !defined(__GLIBC__)
151 : #define ENABLE_UTF8_FALLBACK_SUPPORT
152 : #endif
153 :
// Sentinel used throughout this file for "no converter available"; compared
// against the result of iconv_open().
154 : #define INVALID_ICONV_T ((iconv_t) -1)
155 :
156 : static inline size_t
157 1371466 : xp_iconv(iconv_t converter,
158 : const char **input,
159 : size_t *inputLeft,
160 : char **output,
161 : size_t *outputLeft)
162 : {
163 1371466 : size_t res, outputAvail = outputLeft ? *outputLeft : 0;
164 1371466 : res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);
165 1371466 : if (res == (size_t) -1) {
166 : // on some platforms (e.g., linux) iconv will fail with
167 : // E2BIG if it cannot convert _all_ of its input. it'll
168 : // still adjust all of the in/out params correctly, so we
169 : // can ignore this error. the assumption is that we will
170 : // be called again to complete the conversion.
171 0 : if ((errno == E2BIG) && (*outputLeft < outputAvail))
172 0 : res = 0;
173 : }
174 1371466 : return res;
175 : }
176 :
177 : static inline void
178 914152 : xp_iconv_reset(iconv_t converter)
179 : {
180 : // NOTE: the man pages on Solaris claim that you can pass NULL
181 : // for all parameter to reset the converter, but beware the
182 : // evil Solaris crash if you go down this route >:-)
183 :
184 914152 : const char *zero_char_in_ptr = NULL;
185 914152 : char *zero_char_out_ptr = NULL;
186 914152 : size_t zero_size_in = 0,
187 914152 : zero_size_out = 0;
188 :
189 : xp_iconv(converter, &zero_char_in_ptr,
190 : &zero_size_in,
191 : &zero_char_out_ptr,
192 914152 : &zero_size_out);
193 914152 : }
194 :
195 : static inline iconv_t
196 2838 : xp_iconv_open(const char **to_list, const char **from_list)
197 : {
198 : iconv_t res;
199 : const char **from_name;
200 : const char **to_name;
201 :
202 : // try all possible combinations to locate a converter.
203 2838 : to_name = to_list;
204 5676 : while (*to_name) {
205 2838 : if (**to_name) {
206 2838 : from_name = from_list;
207 5676 : while (*from_name) {
208 2838 : if (**from_name) {
209 2838 : res = iconv_open(*to_name, *from_name);
210 2838 : if (res != INVALID_ICONV_T)
211 2838 : return res;
212 : }
213 0 : from_name++;
214 : }
215 : }
216 0 : to_name++;
217 : }
218 :
219 0 : return INVALID_ICONV_T;
220 : }
221 :
222 : /*
223 : * PRUnichar[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
224 : * have to use UTF-16 with iconv(3) on platforms where it's supported.
225 : * However, the way UTF-16 and UCS-2 are interpreted varies across platforms
226 : * and implementations of iconv(3). On Tru64, it also depends on the environment
227 : * variable. To avoid the trouble arising from byte-swapping
228 : * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling
229 : * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2
230 : * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
231 : * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
232 : * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
233 : * variable ICONV_BYTEORDER is set to 'big-endian', about which not much
234 : * can be done other than adding a note in the release notes. (bug 206811)
235 : */
// Candidate iconv charset names for UTF-16, in the order they are tried by
// xp_iconv_open().  Endian-specific names come first; the list is
// NULL-terminated.
236 : static const char *UTF_16_NAMES[] = {
237 : #if defined(IS_LITTLE_ENDIAN)
238 : "UTF-16LE",
239 : #if defined(__GLIBC__)
240 : "UNICODELITTLE",
241 : #endif
242 : "UCS-2LE",
243 : #else
244 : "UTF-16BE",
245 : #if defined(__GLIBC__)
246 : "UNICODEBIG",
247 : #endif
248 : "UCS-2BE",
249 : #endif
250 : "UTF-16",
251 : "UCS-2",
252 : "UCS2",
253 : "UCS_2",
254 : "ucs-2",
255 : "ucs2",
256 : "ucs_2",
257 : NULL
258 : };
259 :
260 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
// Candidate iconv charset names for UTF-8 (spelling variants), tried in
// order by xp_iconv_open() for the two-step UTF-8 fallback converters.
// NULL-terminated.
261 : static const char *UTF_8_NAMES[] = {
262 : "UTF-8",
263 : "UTF8",
264 : "UTF_8",
265 : "utf-8",
266 : "utf8",
267 : "utf_8",
268 : NULL
269 : };
270 : #endif
271 :
// Candidate iconv charset names for ISO-8859-1.  LazyInit() uses this list
// as the native charset when nl_langinfo(CODESET) returns null.  Only the
// canonical spelling is listed under glibc.  NULL-terminated.
272 : static const char *ISO_8859_1_NAMES[] = {
273 : "ISO-8859-1",
274 : #if !defined(__GLIBC__)
275 : "ISO8859-1",
276 : "ISO88591",
277 : "ISO_8859_1",
278 : "ISO8859_1",
279 : "iso-8859-1",
280 : "iso8859-1",
281 : "iso88591",
282 : "iso_8859_1",
283 : "iso8859_1",
284 : #endif
285 : NULL
286 : };
287 :
// iconv-backed converter between the native charset and UTF-16.
//
// All conversion state (the iconv handles) is static and shared by every
// instance.  The constructor calls Lock() and performs LazyInit() on first
// use; the destructor resets the converters and calls Unlock().  An instance
// is therefore meant to live on the stack for the duration of one
// conversion.
288 : class nsNativeCharsetConverter
289 : {
290 : public:
291 : nsNativeCharsetConverter();
292 : ~nsNativeCharsetConverter();
293 :
// Both conversion methods take in/out cursors: the pointers are advanced and
// the *Left counts (in chars for native text, in PRUnichar units for UTF-16)
// are decreased to reflect what was consumed and produced.
294 : nsresult NativeToUnicode(const char **input , PRUint32 *inputLeft,
295 : PRUnichar **output, PRUint32 *outputLeft);
296 : nsresult UnicodeToNative(const PRUnichar **input , PRUint32 *inputLeft,
297 : char **output, PRUint32 *outputLeft);
298 :
// GlobalInit() creates the mutex; GlobalShutdown() deletes it and closes
// every open iconv handle.
299 : static void GlobalInit();
300 : static void GlobalShutdown();
301 : static bool IsNativeUTF8();
302 :
303 : private:
// direct native <-> UTF-16 converters (INVALID_ICONV_T when unavailable)
304 : static iconv_t gNativeToUnicode;
305 : static iconv_t gUnicodeToNative;
306 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
// two-step converters that go through UTF-8; opened by LazyInit() only when
// the corresponding direct converter could not be opened.
307 : static iconv_t gNativeToUTF8;
308 : static iconv_t gUTF8ToNative;
309 : static iconv_t gUnicodeToUTF8;
310 : static iconv_t gUTF8ToUnicode;
311 : #endif
312 : static Mutex *gLock;
313 : static bool gInitialized;
314 : static bool gIsNativeUTF8;
315 :
316 : static void LazyInit();
317 :
// Lock/Unlock are no-ops while gLock is null (i.e. before GlobalInit() or
// after GlobalShutdown()).
318 457076 : static void Lock() { if (gLock) gLock->Lock(); }
319 457076 : static void Unlock() { if (gLock) gLock->Unlock(); }
320 : };
321 :
// Storage for the converter's static state.  Every iconv handle starts out
// as INVALID_ICONV_T; the handles are opened by LazyInit().
322 : iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
323 : iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
324 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
325 : iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T;
326 : iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T;
327 : iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T;
328 : iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T;
329 : #endif
// gLock is created in GlobalInit(); gInitialized is set at the end of
// LazyInit() and cleared again by GlobalShutdown().
330 : Mutex *nsNativeCharsetConverter::gLock = nsnull;
331 : bool nsNativeCharsetConverter::gInitialized = false;
332 : bool nsNativeCharsetConverter::gIsNativeUTF8 = false;
333 :
334 : void
335 1419 : nsNativeCharsetConverter::LazyInit()
336 : {
337 1419 : const char *blank_list[] = { "", NULL };
338 1419 : const char **native_charset_list = blank_list;
339 1419 : const char *native_charset = nl_langinfo(CODESET);
340 1419 : if (native_charset == nsnull) {
341 0 : NS_ERROR("native charset is unknown");
342 : // fallback to ISO-8859-1
343 0 : native_charset_list = ISO_8859_1_NAMES;
344 : }
345 : else
346 1419 : native_charset_list[0] = native_charset;
347 :
348 : // Most, if not all, Unixen supporting UTF-8 and nl_langinfo(CODESET)
349 : // return 'UTF-8' (or 'utf-8')
350 1419 : if (!PL_strcasecmp(native_charset, "UTF-8"))
351 1419 : gIsNativeUTF8 = true;
352 :
353 1419 : gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
354 1419 : gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);
355 :
356 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
357 : if (gNativeToUnicode == INVALID_ICONV_T) {
358 : gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
359 : gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
360 : NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
361 : NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
362 : }
363 : if (gUnicodeToNative == INVALID_ICONV_T) {
364 : gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
365 : gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
366 : NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
367 : NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
368 : }
369 : #else
370 1419 : NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
371 1419 : NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
372 : #endif
373 :
374 : /*
375 : * On Solaris 8 (and newer?), the iconv modules converting to UCS-2
376 : * prepend a byte order mark unicode character (BOM, u+FEFF) during
377 : * the first use of the iconv converter. The same is the case of
378 : * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
379 : * However, we use 'UTF-16LE/BE' in both cases, instead so that we
380 : * should be safe. But just in case...
381 : *
382 : * This dummy conversion gets rid of the BOMs and fixes bug 153562.
383 : */
384 1419 : char dummy_input[1] = { ' ' };
385 : char dummy_output[4];
386 :
387 1419 : if (gNativeToUnicode != INVALID_ICONV_T) {
388 1419 : const char *input = dummy_input;
389 1419 : size_t input_left = sizeof(dummy_input);
390 1419 : char *output = dummy_output;
391 1419 : size_t output_left = sizeof(dummy_output);
392 :
393 1419 : xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
394 : }
395 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
396 : if (gUTF8ToUnicode != INVALID_ICONV_T) {
397 : const char *input = dummy_input;
398 : size_t input_left = sizeof(dummy_input);
399 : char *output = dummy_output;
400 : size_t output_left = sizeof(dummy_output);
401 :
402 : xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
403 : }
404 : #endif
405 :
406 1419 : gInitialized = true;
407 1419 : }
408 :
409 : void
410 1419 : nsNativeCharsetConverter::GlobalInit()
411 : {
412 1419 : gLock = new Mutex("nsNativeCharsetConverter.gLock");
413 1419 : }
414 :
415 : void
416 1419 : nsNativeCharsetConverter::GlobalShutdown()
417 : {
418 1419 : if (gLock) {
419 1419 : delete gLock;
420 1419 : gLock = nsnull;
421 : }
422 :
423 1419 : if (gNativeToUnicode != INVALID_ICONV_T) {
424 1419 : iconv_close(gNativeToUnicode);
425 1419 : gNativeToUnicode = INVALID_ICONV_T;
426 : }
427 :
428 1419 : if (gUnicodeToNative != INVALID_ICONV_T) {
429 1419 : iconv_close(gUnicodeToNative);
430 1419 : gUnicodeToNative = INVALID_ICONV_T;
431 : }
432 :
433 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
434 : if (gNativeToUTF8 != INVALID_ICONV_T) {
435 : iconv_close(gNativeToUTF8);
436 : gNativeToUTF8 = INVALID_ICONV_T;
437 : }
438 : if (gUTF8ToNative != INVALID_ICONV_T) {
439 : iconv_close(gUTF8ToNative);
440 : gUTF8ToNative = INVALID_ICONV_T;
441 : }
442 : if (gUnicodeToUTF8 != INVALID_ICONV_T) {
443 : iconv_close(gUnicodeToUTF8);
444 : gUnicodeToUTF8 = INVALID_ICONV_T;
445 : }
446 : if (gUTF8ToUnicode != INVALID_ICONV_T) {
447 : iconv_close(gUTF8ToUnicode);
448 : gUTF8ToUnicode = INVALID_ICONV_T;
449 : }
450 : #endif
451 :
452 1419 : gInitialized = false;
453 1419 : }
454 :
455 457076 : nsNativeCharsetConverter::nsNativeCharsetConverter()
456 : {
457 457076 : Lock();
458 457076 : if (!gInitialized)
459 1419 : LazyInit();
460 457076 : }
461 :
462 457076 : nsNativeCharsetConverter::~nsNativeCharsetConverter()
463 : {
464 : // reset converters for next time
465 457076 : if (gNativeToUnicode != INVALID_ICONV_T)
466 457076 : xp_iconv_reset(gNativeToUnicode);
467 457076 : if (gUnicodeToNative != INVALID_ICONV_T)
468 457076 : xp_iconv_reset(gUnicodeToNative);
469 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
470 : if (gNativeToUTF8 != INVALID_ICONV_T)
471 : xp_iconv_reset(gNativeToUTF8);
472 : if (gUTF8ToNative != INVALID_ICONV_T)
473 : xp_iconv_reset(gUTF8ToNative);
474 : if (gUnicodeToUTF8 != INVALID_ICONV_T)
475 : xp_iconv_reset(gUnicodeToUTF8);
476 : if (gUTF8ToUnicode != INVALID_ICONV_T)
477 : xp_iconv_reset(gUTF8ToUnicode);
478 : #endif
479 457076 : Unlock();
480 457076 : }
481 :
// Converts native-charset bytes to UTF-16.
//
//   input / inputLeft    in/out cursor over the native bytes; advanced past
//                        whatever was consumed.
//   output / outputLeft  in/out cursor over the destination buffer, counted
//                        in PRUnichar units.
//
// Strategy, in order: the direct native -> UTF-16 converter; otherwise (when
// compiled in and available) the two-step native -> UTF-8 -> UTF-16 path;
// and if the chosen path fails or no converter exists, a byte-for-byte
// widening via isolatin1_to_utf16().  Every path returns NS_OK.
482 : nsresult
483 222341 : nsNativeCharsetConverter::NativeToUnicode(const char **input,
484 : PRUint32 *inputLeft,
485 : PRUnichar **output,
486 : PRUint32 *outputLeft)
487 : {
488 222341 : size_t res = 0;
// iconv counts bytes, so the UTF-16 side is scaled by 2 going in and
// divided by 2 coming back out.
489 222341 : size_t inLeft = (size_t) *inputLeft;
490 222341 : size_t outLeft = (size_t) *outputLeft * 2;
491 :
492 222341 : if (gNativeToUnicode != INVALID_ICONV_T) {
493 :
// iconv advances *input and *output itself; only the counts need copying back.
494 222341 : res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);
495 :
496 222341 : *inputLeft = inLeft;
497 222341 : *outputLeft = outLeft / 2;
498 222341 : if (res != (size_t) -1)
499 222341 : return NS_OK;
500 :
501 0 : NS_WARNING("conversion from native to utf-16 failed");
502 :
503 : // reset converter
504 0 : xp_iconv_reset(gNativeToUnicode);
// execution continues at the isolatin1 fallback below, starting from
// wherever the failed conversion left the cursors.
505 : }
506 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
507 : else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
508 : (gUTF8ToUnicode != INVALID_ICONV_T)) {
509 : // convert first to UTF8, then from UTF8 to UCS2
// |in| is a private cursor; *input is only advanced after the loop.
510 : const char *in = *input;
511 :
// staging buffer for the intermediate UTF-8 text
512 : char ubuf[1024];
513 :
514 : // we assume we're always called with enough space in |output|,
515 : // so convert many chars at a time...
516 : while (inLeft) {
517 : char *p = ubuf;
518 : size_t n = sizeof(ubuf);
519 : res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
520 : if (res == (size_t) -1) {
521 : NS_ERROR("conversion from native to utf-8 failed");
522 : break;
523 : }
524 : NS_ASSERTION(outLeft > 0, "bad assumption");
// rewind to the start of the staging buffer; n becomes the number of UTF-8
// bytes just produced.
525 : p = ubuf;
526 : n = sizeof(ubuf) - n;
527 : res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft);
528 : if (res == (size_t) -1) {
529 : NS_ERROR("conversion from utf-8 to utf-16 failed");
530 : break;
531 : }
532 : }
533 :
// publish progress: advance *input by the number of bytes consumed.
534 : (*input) += (*inputLeft - inLeft);
535 : *inputLeft = inLeft;
536 : *outputLeft = outLeft / 2;
537 :
538 : if (res != (size_t) -1)
539 : return NS_OK;
540 :
541 : // reset converters
542 : xp_iconv_reset(gNativeToUTF8);
543 : xp_iconv_reset(gUTF8ToUnicode);
544 : }
545 : #endif
546 :
547 : // fallback: zero-pad and hope for the best
548 : // XXX This is lame and we have to do better.
549 0 : isolatin1_to_utf16(input, inputLeft, output, outputLeft);
550 :
551 0 : return NS_OK;
552 : }
553 :
// Converts UTF-16 to native-charset bytes.
//
//   input / inputLeft    in/out cursor over the UTF-16 text, counted in
//                        PRUnichar units; advanced past whatever was consumed.
//   output / outputLeft  in/out cursor over the destination byte buffer.
//
// Strategy, in order: the direct UTF-16 -> native converter; otherwise (when
// compiled in and available) the two-step UTF-16 -> UTF-8 -> native path;
// and if the chosen path fails or no converter exists, a truncating copy via
// utf16_to_isolatin1().  Every path returns NS_OK.
554 : nsresult
555 233554 : nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
556 : PRUint32 *inputLeft,
557 : char **output,
558 : PRUint32 *outputLeft)
559 : {
560 233554 : size_t res = 0;
// iconv counts bytes, so the UTF-16 side is scaled by 2 going in and
// divided by 2 coming back out.
561 233554 : size_t inLeft = (size_t) *inputLeft * 2;
562 233554 : size_t outLeft = (size_t) *outputLeft;
563 :
564 233554 : if (gUnicodeToNative != INVALID_ICONV_T) {
// iconv advances *input and *output itself; only the counts need copying back.
565 233554 : res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);
566 :
567 233554 : *inputLeft = inLeft / 2;
568 233554 : *outputLeft = outLeft;
569 233554 : if (res != (size_t) -1) {
570 233554 : return NS_OK;
571 : }
572 :
573 0 : NS_ERROR("iconv failed");
574 :
575 : // reset converter
576 0 : xp_iconv_reset(gUnicodeToNative);
// execution continues at the isolatin1 fallback below, starting from
// wherever the failed conversion left the cursors.
577 : }
578 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
579 : else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
580 : (gUTF8ToNative != INVALID_ICONV_T)) {
// |in| is a private byte cursor; *input is only advanced after the loop.
581 : const char *in = (const char *) *input;
582 :
583 : char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)
584 :
585 : // convert one uchar at a time...
// NOTE(review): feeding a single PRUnichar per call presumably cannot
// convert surrogate pairs -- confirm.
586 : while (inLeft && outLeft) {
587 : char *p = ubuf;
588 : size_t n = sizeof(ubuf), one_uchar = sizeof(PRUnichar);
589 : res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
590 : if (res == (size_t) -1) {
591 : NS_ERROR("conversion from utf-16 to utf-8 failed");
592 : break;
593 : }
// rewind to the start of the staging buffer; n becomes the number of UTF-8
// bytes just produced.
594 : p = ubuf;
595 : n = sizeof(ubuf) - n;
596 : res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);
597 : if (res == (size_t) -1) {
598 : if (errno == E2BIG) {
599 : // not enough room for last uchar... back up and return.
600 : in -= sizeof(PRUnichar);
601 : res = 0;
602 : }
603 : else
604 : NS_ERROR("conversion from utf-8 to native failed");
605 : break;
606 : }
// only count the code unit as consumed once it has been fully written out.
607 : inLeft -= sizeof(PRUnichar);
608 : }
609 :
// publish progress: inLeft is in bytes, *inputLeft in PRUnichar units.
610 : (*input) += (*inputLeft - inLeft / 2);
611 : *inputLeft = inLeft / 2;
612 : *outputLeft = outLeft;
613 : if (res != (size_t) -1) {
614 : return NS_OK;
615 : }
616 :
617 : // reset converters
618 : xp_iconv_reset(gUnicodeToUTF8);
619 : xp_iconv_reset(gUTF8ToNative);
620 : }
621 : #endif
622 :
623 : // fallback: truncate and hope for the best
624 : // XXX This is lame and we have to do better.
625 0 : utf16_to_isolatin1(input, inputLeft, output, outputLeft);
626 :
627 0 : return NS_OK;
628 : }
629 :
630 : bool
631 63307 : nsNativeCharsetConverter::IsNativeUTF8()
632 : {
633 63307 : if (!gInitialized) {
634 0 : Lock();
635 0 : if (!gInitialized)
636 0 : LazyInit();
637 0 : Unlock();
638 : }
639 63307 : return gIsNativeUTF8;
640 : }
641 :
642 : #endif // USE_ICONV
643 :
644 : //-----------------------------------------------------------------------------
645 : // conversion using mb[r]towc/wc[r]tomb
646 : //-----------------------------------------------------------------------------
647 : #if defined(USE_STDCONV)
648 : #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
649 : #include <wchar.h> // mbrtowc, wcrtomb
650 : #endif
651 :
// Converter between the native charset and UTF-16 built on the C library's
// mb[r]towc / wc[r]tomb functions.  Used when iconv is not available
// (USE_STDCONV).  Exposes the same interface as the iconv-based class.
652 : class nsNativeCharsetConverter
653 : {
654 : public:
655 : nsNativeCharsetConverter();
656 :
// Both conversion methods take in/out cursors: the pointers are advanced and
// the *Left counts decreased to reflect what was consumed and produced.
657 : nsresult NativeToUnicode(const char **input , PRUint32 *inputLeft,
658 : PRUnichar **output, PRUint32 *outputLeft);
659 : nsresult UnicodeToNative(const PRUnichar **input , PRUint32 *inputLeft,
660 : char **output, PRUint32 *outputLeft);
661 :
// GlobalInit() probes whether wchar_t holds unicode values; there is nothing
// to release on shutdown.
662 : static void GlobalInit();
663 : static void GlobalShutdown() { }
664 : static bool IsNativeUTF8();
665 :
666 : private:
// result of the GlobalInit() probe; when false, both conversions fall back
// to the isolatin1 helpers.
667 : static bool gWCharIsUnicode;
668 :
669 : #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
// per-instance conversion state handed to mbrtowc/wcrtomb
670 : mbstate_t ps;
671 : #endif
672 : };
673 :
// Starts out false; set by GlobalInit() once the wchar_t probe has run.
674 : bool nsNativeCharsetConverter::gWCharIsUnicode = false;
675 :
676 : nsNativeCharsetConverter::nsNativeCharsetConverter()
677 : {
678 : #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
679 : memset(&ps, 0, sizeof(ps));
680 : #endif
681 : }
682 :
683 : void
684 : nsNativeCharsetConverter::GlobalInit()
685 : {
686 : // verify that wchar_t for the current locale is actually unicode.
687 : // if it is not, then we should avoid calling mbtowc/wctomb and
688 : // just fallback on zero-pad/truncation conversion.
689 : //
690 : // this test cannot be done at build time because the encoding of
691 : // wchar_t may depend on the runtime locale. sad, but true!!
692 : //
693 : // so, if wchar_t is unicode then converting an ASCII character
694 : // to wchar_t should not change its numeric value. we'll just
695 : // check what happens with the ASCII 'a' character.
696 : //
697 : // this test is not perfect... obviously, it could yield false
698 : // positives, but then at least ASCII text would be converted
699 : // properly (or maybe just the 'a' character) -- oh well :(
700 :
701 : char a = 'a';
702 : unsigned int w = 0;
703 :
704 : int res = mbtowc((wchar_t *) &w, &a, 1);
705 :
706 : gWCharIsUnicode = (res != -1 && w == 'a');
707 :
708 : #ifdef DEBUG
709 : if (!gWCharIsUnicode)
710 : NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
711 : #endif
712 : }
713 :
714 : nsresult
715 : nsNativeCharsetConverter::NativeToUnicode(const char **input,
716 : PRUint32 *inputLeft,
717 : PRUnichar **output,
718 : PRUint32 *outputLeft)
719 : {
720 : if (gWCharIsUnicode) {
721 : int incr;
722 :
723 : // cannot use wchar_t here since it may have been redefined (e.g.,
724 : // via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP.
725 : unsigned int tmp = 0;
726 : while (*inputLeft && *outputLeft) {
727 : #ifdef HAVE_MBRTOWC
728 : incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps);
729 : #else
730 : // XXX is this thread-safe?
731 : incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft);
732 : #endif
733 : if (incr < 0) {
734 : NS_WARNING("mbtowc failed: possible charset mismatch");
735 : // zero-pad and hope for the best
736 : tmp = (unsigned char) **input;
737 : incr = 1;
738 : }
739 : **output = (PRUnichar) tmp;
740 : (*input) += incr;
741 : (*inputLeft) -= incr;
742 : (*output)++;
743 : (*outputLeft)--;
744 : }
745 : }
746 : else {
747 : // wchar_t isn't unicode, so the best we can do is treat the
748 : // input as if it is isolatin1 :(
749 : isolatin1_to_utf16(input, inputLeft, output, outputLeft);
750 : }
751 :
752 : return NS_OK;
753 : }
754 :
755 : nsresult
756 : nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
757 : PRUint32 *inputLeft,
758 : char **output,
759 : PRUint32 *outputLeft)
760 : {
761 : if (gWCharIsUnicode) {
762 : int incr;
763 :
764 : while (*inputLeft && *outputLeft >= MB_CUR_MAX) {
765 : #ifdef HAVE_WCRTOMB
766 : incr = (int) wcrtomb(*output, (wchar_t) **input, &ps);
767 : #else
768 : // XXX is this thread-safe?
769 : incr = (int) wctomb(*output, (wchar_t) **input);
770 : #endif
771 : if (incr < 0) {
772 : NS_WARNING("mbtowc failed: possible charset mismatch");
773 : **output = (unsigned char) **input; // truncate
774 : incr = 1;
775 : }
776 : // most likely we're dead anyways if this assertion should fire
777 : NS_ASSERTION(PRUint32(incr) <= *outputLeft, "wrote beyond end of string");
778 : (*output) += incr;
779 : (*outputLeft) -= incr;
780 : (*input)++;
781 : (*inputLeft)--;
782 : }
783 : }
784 : else {
785 : // wchar_t isn't unicode, so the best we can do is treat the
786 : // input as if it is isolatin1 :(
787 : utf16_to_isolatin1(input, inputLeft, output, outputLeft);
788 : }
789 :
790 : return NS_OK;
791 : }
792 :
793 : // XXX : for now, return false
794 : bool
795 : nsNativeCharsetConverter::IsNativeUTF8()
796 : {
797 : return false;
798 : }
799 :
800 : #endif // USE_STDCONV
801 :
802 : //-----------------------------------------------------------------------------
803 : // API implementation
804 : //-----------------------------------------------------------------------------
805 :
806 : nsresult
807 222341 : NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
808 : {
809 222341 : output.Truncate();
810 :
811 222341 : PRUint32 inputLen = input.Length();
812 :
813 222341 : nsACString::const_iterator iter;
814 222341 : input.BeginReading(iter);
815 :
816 : //
817 : // OPTIMIZATION: preallocate space for largest possible result; convert
818 : // directly into the result buffer to avoid intermediate buffer copy.
819 : //
820 : // this will generally result in a larger allocation, but that seems
821 : // better than an extra buffer copy.
822 : //
823 222341 : if (!EnsureStringLength(output, inputLen))
824 0 : return NS_ERROR_OUT_OF_MEMORY;
825 222341 : nsAString::iterator out_iter;
826 222341 : output.BeginWriting(out_iter);
827 :
828 222341 : PRUnichar *result = out_iter.get();
829 222341 : PRUint32 resultLeft = inputLen;
830 :
831 222341 : const char *buf = iter.get();
832 222341 : PRUint32 bufLeft = inputLen;
833 :
834 444682 : nsNativeCharsetConverter conv;
835 222341 : nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
836 222341 : if (NS_SUCCEEDED(rv)) {
837 222341 : NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
838 222341 : output.SetLength(inputLen - resultLeft);
839 : }
840 222341 : return rv;
841 : }
842 :
843 : nsresult
844 234735 : NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
845 : {
846 234735 : output.Truncate();
847 :
848 234735 : nsAString::const_iterator iter, end;
849 234735 : input.BeginReading(iter);
850 234735 : input.EndReading(end);
851 :
852 : // cannot easily avoid intermediate buffer copy.
853 : char temp[4096];
854 :
855 469470 : nsNativeCharsetConverter conv;
856 :
857 234735 : const PRUnichar *buf = iter.get();
858 234735 : PRUint32 bufLeft = Distance(iter, end);
859 703024 : while (bufLeft) {
860 233554 : char *p = temp;
861 233554 : PRUint32 tempLeft = sizeof(temp);
862 :
863 233554 : nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
864 233554 : if (NS_FAILED(rv)) return rv;
865 :
866 233554 : if (tempLeft < sizeof(temp))
867 233554 : output.Append(temp, sizeof(temp) - tempLeft);
868 : }
869 234735 : return NS_OK;
870 : }
871 :
872 : bool
873 63307 : NS_IsNativeUTF8()
874 : {
875 63307 : return nsNativeCharsetConverter::IsNativeUTF8();
876 : }
877 :
878 : void
879 1419 : NS_StartupNativeCharsetUtils()
880 : {
881 : //
882 : // need to initialize the locale or else charset conversion will fail.
883 : // better not delay this in case some other component alters the locale
884 : // settings.
885 : //
886 : // XXX we assume that we are called early enough that we should
887 : // always be the first to care about the locale's charset.
888 : //
889 1419 : setlocale(LC_CTYPE, "");
890 :
891 1419 : nsNativeCharsetConverter::GlobalInit();
892 1419 : }
893 :
894 : void
895 1419 : NS_ShutdownNativeCharsetUtils()
896 : {
897 1419 : nsNativeCharsetConverter::GlobalShutdown();
898 1419 : }
899 :
900 : //-----------------------------------------------------------------------------
901 : // XP_WIN
902 : //-----------------------------------------------------------------------------
903 : #elif defined(XP_WIN)
904 :
905 : #include <windows.h>
906 : #include "nsAString.h"
907 : #include "nsReadableUtils.h"
908 :
909 : nsresult
910 : NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
911 : {
912 : PRUint32 inputLen = input.Length();
913 :
914 : nsACString::const_iterator iter;
915 : input.BeginReading(iter);
916 :
917 : const char *buf = iter.get();
918 :
919 : // determine length of result
920 : PRUint32 resultLen = 0;
921 : int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, NULL, 0);
922 : if (n > 0)
923 : resultLen += n;
924 :
925 : // allocate sufficient space
926 : if (!EnsureStringLength(output, resultLen))
927 : return NS_ERROR_OUT_OF_MEMORY;
928 : if (resultLen > 0) {
929 : nsAString::iterator out_iter;
930 : output.BeginWriting(out_iter);
931 :
932 : PRUnichar *result = out_iter.get();
933 :
934 : ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, result, resultLen);
935 : }
936 : return NS_OK;
937 : }
938 :
939 : nsresult
940 : NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
941 : {
942 : PRUint32 inputLen = input.Length();
943 :
944 : nsAString::const_iterator iter;
945 : input.BeginReading(iter);
946 :
947 : const PRUnichar *buf = iter.get();
948 :
949 : // determine length of result
950 : PRUint32 resultLen = 0;
951 :
952 : int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, NULL, 0, NULL, NULL);
953 : if (n > 0)
954 : resultLen += n;
955 :
956 : // allocate sufficient space
957 : if (!EnsureStringLength(output, resultLen))
958 : return NS_ERROR_OUT_OF_MEMORY;
959 : if (resultLen > 0) {
960 : nsACString::iterator out_iter;
961 : output.BeginWriting(out_iter);
962 :
963 : // default "defaultChar" is '?', which is an illegal character on windows
964 : // file system. That will cause file uncreatable. Change it to '_'
965 : const char defaultChar = '_';
966 :
967 : char *result = out_iter.get();
968 :
969 : ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen,
970 : &defaultChar, NULL);
971 : }
972 : return NS_OK;
973 : }
974 :
975 : // moved from widget/windows/nsToolkit.cpp
976 : PRInt32
977 : NS_ConvertAtoW(const char *aStrInA, int aBufferSize, PRUnichar *aStrOutW)
978 : {
979 : return MultiByteToWideChar(CP_ACP, 0, aStrInA, -1, aStrOutW, aBufferSize);
980 : }
981 :
982 : PRInt32
983 : NS_ConvertWtoA(const PRUnichar *aStrInW, int aBufferSizeOut,
984 : char *aStrOutA, const char *aDefault)
985 : {
986 : if ((!aStrInW) || (!aStrOutA) || (aBufferSizeOut <= 0))
987 : return 0;
988 :
989 : int numCharsConverted = WideCharToMultiByte(CP_ACP, 0, aStrInW, -1,
990 : aStrOutA, aBufferSizeOut,
991 : aDefault, NULL);
992 :
993 : if (!numCharsConverted) {
994 : if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
995 : // Overflow, add missing null termination but return 0
996 : aStrOutA[aBufferSizeOut-1] = '\0';
997 : }
998 : else {
999 : // Other error, clear string and return 0
1000 : aStrOutA[0] = '\0';
1001 : }
1002 : }
1003 : else if (numCharsConverted < aBufferSizeOut) {
1004 : // Add 2nd null (really necessary?)
1005 : aStrOutA[numCharsConverted] = '\0';
1006 : }
1007 :
1008 : return numCharsConverted;
1009 : }
1010 :
1011 : //-----------------------------------------------------------------------------
1012 : // XP_OS2
1013 : //-----------------------------------------------------------------------------
1014 : #elif defined(XP_OS2)
1015 :
1016 : #define INCL_DOS
1017 : #include <os2.h>
1018 : #include <uconv.h>
1019 : #include "nsAString.h"
1020 : #include "nsReadableUtils.h"
1021 : #include <ulserrno.h>
1022 : #include "nsNativeCharsetUtils.h"
1023 :
// Conversion object shared by the OS/2 routines below.  It stays NULL until
// NS_StartupNativeCharsetUtils() creates it; the copy functions call that
// routine themselves when they find it unset.
static UconvObject UnicodeConverter = NULL;
1025 :
1026 : nsresult
1027 : NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
1028 : {
1029 : PRUint32 inputLen = input.Length();
1030 :
1031 : nsACString::const_iterator iter;
1032 : input.BeginReading(iter);
1033 : const char *inputStr = iter.get();
1034 :
1035 : // determine length of result
1036 : PRUint32 resultLen = inputLen;
1037 : if (!EnsureStringLength(output, resultLen))
1038 : return NS_ERROR_OUT_OF_MEMORY;
1039 :
1040 : nsAString::iterator out_iter;
1041 : output.BeginWriting(out_iter);
1042 : UniChar *result = (UniChar*)out_iter.get();
1043 :
1044 : size_t cSubs = 0;
1045 : size_t resultLeft = resultLen;
1046 :
1047 : if (!UnicodeConverter)
1048 : NS_StartupNativeCharsetUtils();
1049 :
1050 : int unirc = ::UniUconvToUcs(UnicodeConverter, (void**)&inputStr, &inputLen,
1051 : &result, &resultLeft, &cSubs);
1052 :
1053 : NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
1054 :
1055 : if (unirc != ULS_SUCCESS) {
1056 : output.Truncate();
1057 : return NS_ERROR_FAILURE;
1058 : }
1059 :
1060 : // Need to update string length to reflect how many bytes were actually
1061 : // written.
1062 : output.Truncate(resultLen - resultLeft);
1063 : return NS_OK;
1064 : }
1065 :
1066 : nsresult
1067 : NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1068 : {
1069 : size_t inputLen = input.Length();
1070 :
1071 : nsAString::const_iterator iter;
1072 : input.BeginReading(iter);
1073 : UniChar* inputStr = (UniChar*) const_cast<PRUnichar*>(iter.get());
1074 :
1075 : // maximum length of unicode string of length x converted to native
1076 : // codepage is x*2
1077 : size_t resultLen = inputLen * 2;
1078 : if (!EnsureStringLength(output, resultLen))
1079 : return NS_ERROR_OUT_OF_MEMORY;
1080 :
1081 : nsACString::iterator out_iter;
1082 : output.BeginWriting(out_iter);
1083 : char *result = out_iter.get();
1084 :
1085 : size_t cSubs = 0;
1086 : size_t resultLeft = resultLen;
1087 :
1088 : if (!UnicodeConverter)
1089 : NS_StartupNativeCharsetUtils();
1090 :
1091 : int unirc = ::UniUconvFromUcs(UnicodeConverter, &inputStr, &inputLen,
1092 : (void**)&result, &resultLeft, &cSubs);
1093 :
1094 : NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
1095 :
1096 : if (unirc != ULS_SUCCESS) {
1097 : output.Truncate();
1098 : return NS_ERROR_FAILURE;
1099 : }
1100 :
1101 : // Need to update string length to reflect how many bytes were actually
1102 : // written.
1103 : output.Truncate(resultLen - resultLeft);
1104 : return NS_OK;
1105 : }
1106 :
1107 : void
1108 : NS_StartupNativeCharsetUtils()
1109 : {
1110 : ULONG ulLength;
1111 : ULONG ulCodePage;
1112 : DosQueryCp(sizeof(ULONG), &ulCodePage, &ulLength);
1113 :
1114 : UniChar codepage[20];
1115 : int unirc = ::UniMapCpToUcsCp(ulCodePage, codepage, 20);
1116 : if (unirc == ULS_SUCCESS) {
1117 : unirc = ::UniCreateUconvObject(codepage, &UnicodeConverter);
1118 : if (unirc == ULS_SUCCESS) {
1119 : uconv_attribute_t attr;
1120 : ::UniQueryUconvObject(UnicodeConverter, &attr, sizeof(uconv_attribute_t),
1121 : NULL, NULL, NULL);
1122 : attr.options = UCONV_OPTION_SUBSTITUTE_BOTH;
1123 : attr.subchar_len=1;
1124 : attr.subchar[0]='_';
1125 : ::UniSetUconvObject(UnicodeConverter, &attr);
1126 : }
1127 : }
1128 : }
1129 :
1130 : void
1131 : NS_ShutdownNativeCharsetUtils()
1132 : {
1133 : ::UniFreeUconvObject(UnicodeConverter);
1134 : }
1135 :
1136 : #else
1137 :
1138 : #include "nsReadableUtils.h"
1139 :
/**
 * Fallback used when none of the platform-specific branches above applies:
 * copies |input| into |output| with CopyASCIItoUTF16.
 * NOTE(review): input is presumably expected to be plain ASCII here --
 * confirm how CopyASCIItoUTF16 treats bytes >= 0x80.
 *
 * @return always NS_OK.
 */
nsresult
NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
{
CopyASCIItoUTF16(input, output);
return NS_OK;
}
1146 :
/**
 * Fallback used when none of the platform-specific branches above applies:
 * copies |input| into |output| with LossyCopyUTF16toASCII.
 * NOTE(review): as the helper's name suggests, non-ASCII characters are
 * presumably not preserved -- confirm against nsReadableUtils.
 *
 * @return always NS_OK.
 */
nsresult
NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
{
LossyCopyUTF16toASCII(input, output);
return NS_OK;
}
1153 :
/**
 * Fallback startup hook: the fallback conversion functions keep no state,
 * so there is nothing to initialise.
 */
void
NS_StartupNativeCharsetUtils()
{
}
1158 :
/**
 * Fallback shutdown hook: nothing was set up by the fallback startup hook,
 * so there is nothing to release.
 */
void
NS_ShutdownNativeCharsetUtils()
{
}
1163 :
1164 : #endif
|