1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is Mozilla Communicator client code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 1998
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : *
24 : * Alternatively, the contents of this file may be used under the terms of
25 : * either of the GNU General Public License Version 2 or later (the "GPL"),
26 : * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 : * in which case the provisions of the GPL or the LGPL are applicable instead
28 : * of those above. If you wish to allow use of your version of this file only
29 : * under the terms of either the GPL or the LGPL, and not to allow others to
30 : * use your version of this file under the terms of the MPL, indicate your
31 : * decision by deleting the provisions above and replace them with the notice
32 : * and other provisions required by the GPL or the LGPL. If you do not delete
33 : * the provisions above, a recipient may use your version of this file under
34 : * the terms of any one of the MPL, the GPL or the LGPL.
35 : *
36 : * ***** END LICENSE BLOCK ***** */
37 : #include "nsJapaneseToUnicode.h"
38 :
39 : #include "nsUCSupport.h"
40 :
41 : #include "japanese.map"
42 :
43 : #include "nsICharsetConverterManager.h"
44 : #include "nsIServiceManager.h"
45 :
46 : #include "mozilla/Assertions.h"
47 :
// Class ID passed to do_GetService() below to obtain the
// nsICharsetConverterManager that creates the delegate decoders
// (GB2312, EUC-KR, ISO-8859-7) used by the ISO-2022-JP decoder.
static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID);

// SJIS_INDEX and JIS0208_INDEX are lead-byte lookup tables. The decoders
// index them with the low seven bits of the lead byte (`*src & 0x7F`) and
// store the result in mData, which is then either added to a trail-byte
// offset to index gJapaneseMap or compared against a sentinel value.
#ifdef XP_OS2
// HTML5-incompliant behavior for OS/2, see bug 108136
// This is bogus. The right fix would be working around the font problems
// in OS/2 gfx, since this "fix" introduces script-visible DOM differences
// between the platforms.
#define SJIS_INDEX gIBM943Index[0]
#define JIS0208_INDEX gIBM943Index[1]
#else
// HTML5 says to use Windows-31J instead of the real Shift_JIS for decoding
#define SJIS_INDEX gCP932Index[0]
#define JIS0208_INDEX gCP932Index[1]
#endif

// Lead-byte lookup table for JIS X 0212; an entry of 0xFFFD marks a byte
// that cannot start a JIS X 0212 character.
#define JIS0212_INDEX gJIS0212Index
// Code unit substituted by the Shift_JIS decoder when a well-formed two-byte
// sequence has no mapping (U+30FB, KATAKANA MIDDLE DOT).
#define SJIS_UNMAPPED 0x30fb
#define UNICODE_REPLACEMENT_CHARACTER 0xfffd
// True when byte b lies in 0xA1..0xFE inclusive.
// NOTE: b is evaluated more than once; pass only side-effect-free expressions.
#define IN_GR_RANGE(b) \
((PRUint8(0xa1) <= PRUint8(b)) && (PRUint8(b) <= PRUint8(0xfe)))
68 :
69 65155 : NS_IMETHODIMP nsShiftJISToUnicode::Convert(
70 : const char * aSrc, PRInt32 * aSrcLen,
71 : PRUnichar * aDest, PRInt32 * aDestLen)
72 : {
73 : static const PRUint8 sbIdx[256] =
74 : {
75 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x00 */
76 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x08 */
77 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x10 */
78 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x18 */
79 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x20 */
80 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x28 */
81 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x30 */
82 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* 0x38 */
83 : 0, 1, 2, 3, 4, 5, 6, 7, /* 0x40 */
84 : 8, 9, 10, 11, 12, 13, 14, 15, /* 0x48 */
85 : 16, 17, 18, 19, 20, 21, 22, 23, /* 0x50 */
86 : 24, 25, 26, 27, 28, 29, 30, 31, /* 0x58 */
87 : 32, 33, 34, 35, 36, 37, 38, 39, /* 0x60 */
88 : 40, 41, 42, 43, 44, 45, 46, 47, /* 0x68 */
89 : 48, 49, 50, 51, 52, 53, 54, 55, /* 0x70 */
90 : 56, 57, 58, 59, 60, 61, 62, 0xFF, /* 0x78 */
91 : 63, 64, 65, 66, 67, 68, 69, 70, /* 0x80 */
92 : 71, 72, 73, 74, 75, 76, 77, 78, /* 0x88 */
93 : 79, 80, 81, 82, 83, 84, 85, 86, /* 0x90 */
94 : 87, 88, 89, 90, 91, 92, 93, 94, /* 0x98 */
95 : 95, 96, 97, 98, 99, 100, 101, 102, /* 0xa0 */
96 : 103, 104, 105, 106, 107, 108, 109, 110, /* 0xa8 */
97 : 111, 112, 113, 114, 115, 116, 117, 118, /* 0xb0 */
98 : 119, 120, 121, 122, 123, 124, 125, 126, /* 0xb8 */
99 : 127, 128, 129, 130, 131, 132, 133, 134, /* 0xc0 */
100 : 135, 136, 137, 138, 139, 140, 141, 142, /* 0xc8 */
101 : 143, 144, 145, 146, 147, 148, 149, 150, /* 0xd0 */
102 : 151, 152, 153, 154, 155, 156, 157, 158, /* 0xd8 */
103 : 159, 160, 161, 162, 163, 164, 165, 166, /* 0xe0 */
104 : 167, 168, 169, 170, 171, 172, 173, 174, /* 0xe8 */
105 : 175, 176, 177, 178, 179, 180, 181, 182, /* 0xf0 */
106 : 183, 184, 185, 186, 187, 0xFF, 0xFF, 0xFF, /* 0xf8 */
107 : };
108 :
109 65155 : const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen;
110 65155 : const unsigned char* src =(unsigned char*) aSrc;
111 65155 : PRUnichar* destEnd = aDest + *aDestLen;
112 65155 : PRUnichar* dest = aDest;
113 287410 : while (src < srcEnd) {
114 199243 : switch (mState) {
115 : case 0:
116 172205 : if (*src <= 0x80) {
117 : // ASCII
118 114769 : *dest++ = (PRUnichar) *src;
119 114769 : if (dest >= destEnd) {
120 42047 : goto error1;
121 : }
122 : } else {
123 57436 : mData = SJIS_INDEX[*src & 0x7F];
124 57436 : if (mData < 0xE000) {
125 22533 : mState = 1; // two bytes
126 34903 : } else if (mData < 0xF000) {
127 4505 : mState = 2; // EUDC
128 : } else {
129 30398 : *dest++ = mData; // JIS 0201
130 30398 : if (dest >= destEnd) {
131 65 : goto error1;
132 : }
133 : }
134 : }
135 130093 : break;
136 :
137 : case 1: // Index to table
138 : {
139 22533 : MOZ_ASSERT(mData < 0xE000);
140 22533 : PRUint8 off = sbIdx[*src];
141 :
142 : // Error handling: in the case where the second octet is not in the
143 : // valid ranges 0x40-0x7E 0x80-0xFC, unconsume the invalid octet and
144 : // interpret it as the ASCII value. In the case where the second
145 : // octet is in the valid range but there is no mapping for the
146 : // 2-octet sequence, do not unconsume.
147 22533 : if(0xFF == off) {
148 3354 : src--;
149 3354 : if (mErrBehavior == kOnError_Signal)
150 0 : goto error_invalidchar;
151 3354 : *dest++ = UNICODE_REPLACEMENT_CHARACTER;
152 : } else {
153 19179 : PRUnichar ch = gJapaneseMap[mData+off];
154 19179 : if(ch == 0xfffd) {
155 3044 : if (mErrBehavior == kOnError_Signal)
156 0 : goto error_invalidchar;
157 3044 : ch = SJIS_UNMAPPED;
158 : }
159 19179 : *dest++ = ch;
160 : }
161 22533 : mState = 0;
162 22533 : if(dest >= destEnd)
163 26 : goto error1;
164 : }
165 22507 : break;
166 :
167 : case 2: // EUDC
168 : {
169 4505 : MOZ_ASSERT(0xE000 <= mData && mData < 0xF000);
170 4505 : PRUint8 off = sbIdx[*src];
171 :
172 : // Error handling as in case 1
173 4505 : if(0xFF == off) {
174 670 : src--;
175 670 : if (mErrBehavior == kOnError_Signal)
176 0 : goto error_invalidchar;
177 :
178 670 : *dest++ = UNICODE_REPLACEMENT_CHARACTER;
179 : } else {
180 3835 : *dest++ = mData + off;
181 : }
182 4505 : mState = 0;
183 4505 : if(dest >= destEnd)
184 5 : goto error1;
185 : }
186 4500 : break;
187 :
188 : }
189 157100 : src++;
190 : }
191 23012 : *aDestLen = dest - aDest;
192 23012 : return NS_OK;
193 : error_invalidchar:
194 0 : *aDestLen = dest - aDest;
195 0 : *aSrcLen = src - (const unsigned char*)aSrc;
196 0 : return NS_ERROR_ILLEGAL_INPUT;
197 : error1:
198 42143 : *aDestLen = dest - aDest;
199 42143 : src++;
200 42143 : if ((mState == 0) && (src == srcEnd)) {
201 42142 : return NS_OK;
202 : }
203 1 : *aSrcLen = src - (const unsigned char*)aSrc;
204 1 : return NS_OK_UDEC_MOREOUTPUT;
205 : }
206 :
207 : PRUnichar
208 0 : nsShiftJISToUnicode::GetCharacterForUnMapped()
209 : {
210 0 : return PRUnichar(SJIS_UNMAPPED);
211 : }
212 :
213 129987 : NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert(
214 : const char * aSrc, PRInt32 * aSrcLen,
215 : PRUnichar * aDest, PRInt32 * aDestLen)
216 : {
217 : static const PRUint8 sbIdx[256] =
218 : {
219 : /* 0x0X */
220 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
221 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
222 : /* 0x1X */
223 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
224 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
225 : /* 0x2X */
226 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
227 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
228 : /* 0x3X */
229 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
230 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
231 : /* 0x4X */
232 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
233 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
234 : /* 0x5X */
235 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
236 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
237 : /* 0x6X */
238 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
239 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
240 : /* 0x7X */
241 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
242 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
243 : /* 0x8X */
244 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
245 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
246 : /* 0x9X */
247 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
248 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
249 : /* 0xAX */
250 : 0xFF, 0, 1, 2, 3, 4, 5, 6,
251 : 7, 8 , 9, 10, 11, 12, 13, 14,
252 : /* 0xBX */
253 : 15, 16, 17, 18, 19, 20, 21, 22,
254 : 23, 24, 25, 26, 27, 28, 29, 30,
255 : /* 0xCX */
256 : 31, 32, 33, 34, 35, 36, 37, 38,
257 : 39, 40, 41, 42, 43, 44, 45, 46,
258 : /* 0xDX */
259 : 47, 48, 49, 50, 51, 52, 53, 54,
260 : 55, 56, 57, 58, 59, 60, 61, 62,
261 : /* 0xEX */
262 : 63, 64, 65, 66, 67, 68, 69, 70,
263 : 71, 72, 73, 74, 75, 76, 77, 78,
264 : /* 0xFX */
265 : 79, 80, 81, 82, 83, 84, 85, 86,
266 : 87, 88, 89, 90, 91, 92, 93, 0xFF,
267 : };
268 :
269 129987 : const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen;
270 129987 : const unsigned char* src =(unsigned char*) aSrc;
271 129987 : PRUnichar* destEnd = aDest + *aDestLen;
272 129987 : PRUnichar* dest = aDest;
273 1046027 : while((src < srcEnd))
274 : {
275 879500 : switch(mState)
276 : {
277 : case 0:
278 734878 : if(*src & 0x80 && *src != (unsigned char)0xa0)
279 : {
280 149202 : mData = JIS0208_INDEX[*src & 0x7F];
281 298373 : if(mData != 0xFFFD )
282 : {
283 53925 : mState = 1; // two byte JIS0208
284 : } else {
285 95277 : if( 0x8e == *src) {
286 : // JIS 0201
287 940 : mState = 2; // JIS0201
288 94337 : } else if(0x8f == *src) {
289 : // JIS 0212
290 65693 : mState = 3; // JIS0212
291 : } else {
292 : // others
293 28644 : if (mErrBehavior == kOnError_Signal)
294 0 : goto error_invalidchar;
295 28644 : *dest++ = 0xFFFD;
296 28644 : if(dest >= destEnd)
297 31 : goto error1;
298 : }
299 : }
300 : } else {
301 : // ASCII
302 585676 : *dest++ = (PRUnichar) *src;
303 585676 : if(dest >= destEnd)
304 93368 : goto error1;
305 : }
306 641479 : break;
307 :
308 : case 1: // Index to table
309 : {
310 53925 : PRUint8 off = sbIdx[*src];
311 53925 : if(0xFF == off) {
312 45042 : if (mErrBehavior == kOnError_Signal)
313 0 : goto error_invalidchar;
314 45042 : *dest++ = 0xFFFD;
315 : // if the first byte is valid for EUC-JP but the second
316 : // is not while being a valid US-ASCII, save it
317 : // instead of eating it up !
318 45042 : if ( (PRUint8)*src < (PRUint8)0x7f )
319 41744 : --src;
320 : } else {
321 8883 : *dest++ = gJapaneseMap[mData+off];
322 : }
323 53925 : mState = 0;
324 53925 : if(dest >= destEnd)
325 47 : goto error1;
326 : }
327 53878 : break;
328 :
329 : case 2: // JIS 0201
330 : {
331 940 : if((0xA1 <= *src) && (*src <= 0xDF)) {
332 126 : *dest++ = (0xFF61-0x00A1) + *src;
333 : } else {
334 814 : if (mErrBehavior == kOnError_Signal)
335 0 : goto error_invalidchar;
336 814 : *dest++ = 0xFFFD;
337 : // if 0x8e is not followed by a valid JIS X 0201 byte
338 : // but by a valid US-ASCII, save it instead of eating it up.
339 814 : if ( (PRUint8)*src < (PRUint8)0x7f )
340 673 : --src;
341 : }
342 940 : mState = 0;
343 940 : if(dest >= destEnd)
344 1 : goto error1;
345 : }
346 939 : break;
347 :
348 : case 3: // JIS 0212
349 : {
350 65693 : if (IN_GR_RANGE(*src))
351 : {
352 24064 : mData = JIS0212_INDEX[*src & 0x7F];
353 48128 : if(mData != 0xFFFD )
354 : {
355 24064 : mState = 4;
356 : } else {
357 0 : mState = 5; // error
358 : }
359 : } else {
360 : // First "JIS 0212" byte is not in the valid GR range: save it
361 41629 : if (mErrBehavior == kOnError_Signal)
362 0 : goto error_invalidchar;
363 41629 : *dest++ = 0xFFFD;
364 41629 : --src;
365 41629 : mState = 0;
366 41629 : if(dest >= destEnd)
367 0 : goto error1;
368 : }
369 : }
370 65693 : break;
371 : case 4:
372 : {
373 24064 : PRUint8 off = sbIdx[*src];
374 24064 : if(0xFF != off) {
375 8836 : *dest++ = gJapaneseMap[mData+off];
376 8836 : mState = 0;
377 8836 : if(dest >= destEnd)
378 0 : goto error1;
379 8836 : break;
380 : }
381 : // else fall through to error handler
382 : }
383 : case 5: // two bytes undefined
384 : {
385 15228 : if (mErrBehavior == kOnError_Signal)
386 0 : goto error_invalidchar;
387 15228 : *dest++ = 0xFFFD;
388 : // Undefined JIS 0212 two byte sequence. If the second byte is in
389 : // the valid range for a two byte sequence (0xa1 - 0xfe) consume
390 : // both bytes. Otherwise resynchronize on the second byte.
391 15228 : if (!IN_GR_RANGE(*src))
392 15228 : --src;
393 15228 : mState = 0;
394 15228 : if(dest >= destEnd)
395 0 : goto error1;
396 : }
397 15228 : break;
398 : }
399 786053 : src++;
400 : }
401 36540 : *aDestLen = dest - aDest;
402 36540 : return NS_OK;
403 : error_invalidchar:
404 0 : *aDestLen = dest - aDest;
405 0 : *aSrcLen = src - (const unsigned char*)aSrc;
406 0 : return NS_ERROR_ILLEGAL_INPUT;
407 : error1:
408 93447 : *aDestLen = dest - aDest;
409 93447 : src++;
410 93447 : if ((mState == 0) && (src == srcEnd)) {
411 93447 : return NS_OK;
412 : }
413 0 : *aSrcLen = src - (const unsigned char*)aSrc;
414 0 : return NS_OK_UDEC_MOREOUTPUT;
415 : }
416 :
417 :
418 :
419 128 : NS_IMETHODIMP nsISO2022JPToUnicodeV2::Convert(
420 : const char * aSrc, PRInt32 * aSrcLen,
421 : PRUnichar * aDest, PRInt32 * aDestLen)
422 : {
423 : static const PRUint16 fbIdx[128] =
424 : {
425 : /* 0x8X */
426 : 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
427 : 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
428 : /* 0x9X */
429 : 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
430 : 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
431 : /* 0xAX */
432 : 0xFFFD, 0, 94, 94* 2, 94* 3, 94* 4, 94* 5, 94* 6,
433 : 94* 7, 94* 8 , 94* 9, 94*10, 94*11, 94*12, 94*13, 94*14,
434 : /* 0xBX */
435 : 94*15, 94*16, 94*17, 94*18, 94*19, 94*20, 94*21, 94*22,
436 : 94*23, 94*24, 94*25, 94*26, 94*27, 94*28, 94*29, 94*30,
437 : /* 0xCX */
438 : 94*31, 94*32, 94*33, 94*34, 94*35, 94*36, 94*37, 94*38,
439 : 94*39, 94*40, 94*41, 94*42, 94*43, 94*44, 94*45, 94*46,
440 : /* 0xDX */
441 : 94*47, 94*48, 94*49, 94*50, 94*51, 94*52, 94*53, 94*54,
442 : 94*55, 94*56, 94*57, 94*58, 94*59, 94*60, 94*61, 94*62,
443 : /* 0xEX */
444 : 94*63, 94*64, 94*65, 94*66, 94*67, 94*68, 94*69, 94*70,
445 : 94*71, 94*72, 94*73, 94*74, 94*75, 94*76, 94*77, 94*78,
446 : /* 0xFX */
447 : 94*79, 94*80, 94*81, 94*82, 94*83, 94*84, 94*85, 94*86,
448 : 94*87, 94*88, 94*89, 94*90, 94*91, 94*92, 94*93, 0xFFFD,
449 : };
450 : static const PRUint8 sbIdx[256] =
451 : {
452 : /* 0x0X */
453 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
454 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
455 : /* 0x1X */
456 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
457 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
458 : /* 0x2X */
459 : 0xFF, 0, 1, 2, 3, 4, 5, 6,
460 : 7, 8 , 9, 10, 11, 12, 13, 14,
461 : /* 0x3X */
462 : 15, 16, 17, 18, 19, 20, 21, 22,
463 : 23, 24, 25, 26, 27, 28, 29, 30,
464 : /* 0x4X */
465 : 31, 32, 33, 34, 35, 36, 37, 38,
466 : 39, 40, 41, 42, 43, 44, 45, 46,
467 : /* 0x5X */
468 : 47, 48, 49, 50, 51, 52, 53, 54,
469 : 55, 56, 57, 58, 59, 60, 61, 62,
470 : /* 0x6X */
471 : 63, 64, 65, 66, 67, 68, 69, 70,
472 : 71, 72, 73, 74, 75, 76, 77, 78,
473 : /* 0x7X */
474 : 79, 80, 81, 82, 83, 84, 85, 86,
475 : 87, 88, 89, 90, 91, 92, 93, 0xFF,
476 : /* 0x8X */
477 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
478 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
479 : /* 0x9X */
480 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
481 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
482 : /* 0xAX */
483 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
484 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
485 : /* 0xBX */
486 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
487 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
488 : /* 0xCX */
489 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
490 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
491 : /* 0xDX */
492 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
493 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
494 : /* 0xEX */
495 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
496 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
497 : /* 0xFX */
498 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
499 : 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
500 : };
501 :
502 128 : const unsigned char* srcEnd = (unsigned char*)aSrc + *aSrcLen;
503 128 : const unsigned char* src =(unsigned char*) aSrc;
504 128 : PRUnichar* destEnd = aDest + *aDestLen;
505 128 : PRUnichar* dest = aDest;
506 256 : while((src < srcEnd))
507 : {
508 :
509 128 : switch(mState)
510 : {
511 : case mState_ASCII:
512 128 : if(0x1b == *src)
513 : {
514 0 : mLastLegalState = mState;
515 0 : mState = mState_ESC;
516 128 : } else if(*src & 0x80) {
517 128 : goto error2;
518 : } else {
519 0 : if (CHECK_OVERRUN(dest, destEnd, 1))
520 0 : goto error1;
521 0 : *dest++ = (PRUnichar) *src;
522 : }
523 0 : break;
524 :
525 : case mState_ESC:
526 0 : if( '(' == *src) {
527 0 : mState = mState_ESC_28;
528 0 : } else if ('$' == *src) {
529 0 : mState = mState_ESC_24;
530 0 : } else if ('.' == *src) { // for ISO-2022-JP-2
531 0 : mState = mState_ESC_2e;
532 0 : } else if ('N' == *src) { // for ISO-2022-JP-2
533 0 : mState = mState_ESC_4e;
534 : } else {
535 0 : if (CHECK_OVERRUN(dest, destEnd, 2))
536 0 : goto error1;
537 0 : *dest++ = (PRUnichar) 0x1b;
538 0 : if(0x80 & *src)
539 0 : goto error2;
540 0 : *dest++ = (PRUnichar) *src;
541 0 : mState = mLastLegalState;
542 : }
543 0 : break;
544 :
545 : case mState_ESC_28: // ESC (
546 0 : if( 'B' == *src) {
547 0 : mState = mState_ASCII;
548 0 : if (mRunLength == 0) {
549 0 : if (CHECK_OVERRUN(dest, destEnd, 1))
550 0 : goto error1;
551 0 : *dest++ = 0xFFFD;
552 : }
553 0 : mRunLength = 0;
554 0 : } else if ('J' == *src) {
555 0 : mState = mState_JISX0201_1976Roman;
556 0 : if (mRunLength == 0 && mLastLegalState != mState_ASCII) {
557 0 : if (CHECK_OVERRUN(dest, destEnd, 1))
558 0 : goto error1;
559 0 : if (mErrBehavior == kOnError_Signal)
560 0 : goto error2;
561 0 : *dest++ = 0xFFFD;
562 : }
563 0 : mRunLength = 0;
564 0 : } else if ('I' == *src) {
565 0 : mState = mState_JISX0201_1976Kana;
566 0 : mRunLength = 0;
567 : } else {
568 0 : if (CHECK_OVERRUN(dest, destEnd, 3))
569 0 : goto error1;
570 0 : *dest++ = (PRUnichar) 0x1b;
571 0 : *dest++ = (PRUnichar) '(';
572 0 : if(0x80 & *src)
573 0 : goto error2;
574 0 : *dest++ = (PRUnichar) *src;
575 0 : mState = mLastLegalState;
576 : }
577 0 : break;
578 :
579 : case mState_ESC_24: // ESC $
580 0 : if( '@' == *src) {
581 0 : mState = mState_JISX0208_1978;
582 0 : mRunLength = 0;
583 0 : } else if ('A' == *src) {
584 0 : mState = mState_GB2312_1980;
585 0 : mRunLength = 0;
586 0 : } else if ('B' == *src) {
587 0 : mState = mState_JISX0208_1983;
588 0 : mRunLength = 0;
589 0 : } else if ('(' == *src) {
590 0 : mState = mState_ESC_24_28;
591 : } else {
592 0 : if (CHECK_OVERRUN(dest, destEnd, 3))
593 0 : goto error1;
594 0 : *dest++ = (PRUnichar) 0x1b;
595 0 : *dest++ = (PRUnichar) '$';
596 0 : if(0x80 & *src)
597 0 : goto error2;
598 0 : *dest++ = (PRUnichar) *src;
599 0 : mState = mLastLegalState;
600 : }
601 0 : break;
602 :
603 : case mState_ESC_24_28: // ESC $ (
604 0 : if( 'C' == *src) {
605 0 : mState = mState_KSC5601_1987;
606 0 : mRunLength = 0;
607 0 : } else if ('D' == *src) {
608 0 : mState = mState_JISX0212_1990;
609 0 : mRunLength = 0;
610 : } else {
611 0 : if (CHECK_OVERRUN(dest, destEnd, 4))
612 0 : goto error1;
613 0 : *dest++ = (PRUnichar) 0x1b;
614 0 : *dest++ = (PRUnichar) '$';
615 0 : *dest++ = (PRUnichar) '(';
616 0 : if(0x80 & *src)
617 0 : goto error2;
618 0 : *dest++ = (PRUnichar) *src;
619 0 : mState = mLastLegalState;
620 : }
621 0 : break;
622 :
623 : case mState_JISX0201_1976Roman:
624 0 : if(0x1b == *src) {
625 0 : mLastLegalState = mState;
626 0 : mState = mState_ESC;
627 0 : } else if(*src & 0x80) {
628 0 : goto error2;
629 : } else {
630 : // XXX We need to decide how to handle \ and ~ here
631 : // we may need a if statement here for '\' and '~'
632 : // to map them to Yen and Overbar
633 0 : if (CHECK_OVERRUN(dest, destEnd, 1))
634 0 : goto error1;
635 0 : *dest++ = (PRUnichar) *src;
636 0 : ++mRunLength;
637 : }
638 0 : break;
639 :
640 : case mState_JISX0201_1976Kana:
641 0 : if(0x1b == *src) {
642 0 : mLastLegalState = mState;
643 0 : mState = mState_ESC;
644 : } else {
645 0 : if((0x21 <= *src) && (*src <= 0x5F)) {
646 0 : if (CHECK_OVERRUN(dest, destEnd, 1))
647 0 : goto error1;
648 0 : *dest++ = (0xFF61-0x0021) + *src;
649 0 : ++mRunLength;
650 : } else {
651 : goto error2;
652 : }
653 : }
654 0 : break;
655 :
656 : case mState_JISX0208_1978:
657 0 : if(0x1b == *src) {
658 0 : mLastLegalState = mState;
659 0 : mState = mState_ESC;
660 0 : } else if(*src & 0x80) {
661 0 : mLastLegalState = mState;
662 0 : mState = mState_ERROR;
663 : } else {
664 0 : mData = JIS0208_INDEX[*src & 0x7F];
665 0 : if(0xFFFD == mData)
666 0 : goto error2;
667 0 : mState = mState_JISX0208_1978_2ndbyte;
668 : }
669 0 : break;
670 :
671 : case mState_GB2312_1980:
672 0 : if(0x1b == *src) {
673 0 : mLastLegalState = mState;
674 0 : mState = mState_ESC;
675 0 : } else if(*src & 0x80) {
676 0 : mLastLegalState = mState;
677 0 : mState = mState_ERROR;
678 : } else {
679 0 : mData = fbIdx[*src & 0x7F];
680 0 : if(0xFFFD == mData)
681 0 : goto error2;
682 0 : mState = mState_GB2312_1980_2ndbyte;
683 : }
684 0 : break;
685 :
686 : case mState_JISX0208_1983:
687 0 : if(0x1b == *src) {
688 0 : mLastLegalState = mState;
689 0 : mState = mState_ESC;
690 0 : } else if(*src & 0x80) {
691 0 : mLastLegalState = mState;
692 0 : mState = mState_ERROR;
693 : } else {
694 0 : mData = JIS0208_INDEX[*src & 0x7F];
695 0 : if(0xFFFD == mData)
696 0 : goto error2;
697 0 : mState = mState_JISX0208_1983_2ndbyte;
698 : }
699 0 : break;
700 :
701 : case mState_KSC5601_1987:
702 0 : if(0x1b == *src) {
703 0 : mLastLegalState = mState;
704 0 : mState = mState_ESC;
705 0 : } else if(*src & 0x80) {
706 0 : mLastLegalState = mState;
707 0 : mState = mState_ERROR;
708 : } else {
709 0 : mData = fbIdx[*src & 0x7F];
710 0 : if(0xFFFD == mData)
711 0 : goto error2;
712 0 : mState = mState_KSC5601_1987_2ndbyte;
713 : }
714 0 : break;
715 :
716 : case mState_JISX0212_1990:
717 0 : if(0x1b == *src) {
718 0 : mLastLegalState = mState;
719 0 : mState = mState_ESC;
720 0 : } else if(*src & 0x80) {
721 0 : mLastLegalState = mState;
722 0 : mState = mState_ERROR;
723 : } else {
724 0 : mData = JIS0212_INDEX[*src & 0x7F];
725 0 : if(0xFFFD == mData)
726 0 : goto error2;
727 0 : mState = mState_JISX0212_1990_2ndbyte;
728 : }
729 0 : break;
730 :
731 : case mState_JISX0208_1978_2ndbyte:
732 : {
733 0 : PRUint8 off = sbIdx[*src];
734 0 : if(0xFF == off) {
735 0 : goto error2;
736 : } else {
737 : // XXX We need to map from JIS X 0208 1983 to 1987
738 : // in the next line before pass to *dest++
739 0 : if (CHECK_OVERRUN(dest, destEnd, 1))
740 0 : goto error1;
741 0 : *dest++ = gJapaneseMap[mData+off];
742 0 : ++mRunLength;
743 : }
744 0 : mState = mState_JISX0208_1978;
745 : }
746 0 : break;
747 :
748 : case mState_GB2312_1980_2ndbyte:
749 : {
750 0 : PRUint8 off = sbIdx[*src];
751 0 : if(0xFF == off) {
752 0 : goto error2;
753 : } else {
754 0 : if (!mGB2312Decoder) {
755 : // creating a delegate converter (GB2312)
756 : nsresult rv;
757 : nsCOMPtr<nsICharsetConverterManager> ccm =
758 0 : do_GetService(kCharsetConverterManagerCID, &rv);
759 0 : if (NS_SUCCEEDED(rv)) {
760 0 : rv = ccm->GetUnicodeDecoderRaw("GB2312", &mGB2312Decoder);
761 : }
762 : }
763 0 : if (!mGB2312Decoder) {// failed creating a delegate converter
764 0 : goto error2;
765 : } else {
766 0 : if (CHECK_OVERRUN(dest, destEnd, 1))
767 0 : goto error1;
768 : unsigned char gb[2];
769 : PRUnichar uni;
770 0 : PRInt32 gbLen = 2, uniLen = 1;
771 : // ((mData/94)+0x21) is the original 1st byte.
772 : // *src is the present 2nd byte.
773 : // Put 2 bytes (one character) to gb[] with GB2312 encoding.
774 0 : gb[0] = ((mData / 94) + 0x21) | 0x80;
775 0 : gb[1] = *src | 0x80;
776 : // Convert GB2312 to unicode.
777 : mGB2312Decoder->Convert((const char *)gb, &gbLen,
778 0 : &uni, &uniLen);
779 0 : *dest++ = uni;
780 0 : ++mRunLength;
781 : }
782 : }
783 0 : mState = mState_GB2312_1980;
784 : }
785 0 : break;
786 :
787 : case mState_JISX0208_1983_2ndbyte:
788 : {
789 0 : PRUint8 off = sbIdx[*src];
790 0 : if(0xFF == off) {
791 0 : goto error2;
792 : } else {
793 0 : if (CHECK_OVERRUN(dest, destEnd, 1))
794 0 : goto error1;
795 0 : *dest++ = gJapaneseMap[mData+off];
796 0 : ++mRunLength;
797 : }
798 0 : mState = mState_JISX0208_1983;
799 : }
800 0 : break;
801 :
802 : case mState_KSC5601_1987_2ndbyte:
803 : {
804 0 : PRUint8 off = sbIdx[*src];
805 0 : if(0xFF == off) {
806 0 : goto error2;
807 : } else {
808 0 : if (!mEUCKRDecoder) {
809 : // creating a delegate converter (EUC-KR)
810 : nsresult rv;
811 : nsCOMPtr<nsICharsetConverterManager> ccm =
812 0 : do_GetService(kCharsetConverterManagerCID, &rv);
813 0 : if (NS_SUCCEEDED(rv)) {
814 0 : rv = ccm->GetUnicodeDecoderRaw("EUC-KR", &mEUCKRDecoder);
815 : }
816 : }
817 0 : if (!mEUCKRDecoder) {// failed creating a delegate converter
818 0 : goto error2;
819 : } else {
820 0 : if (CHECK_OVERRUN(dest, destEnd, 1))
821 0 : goto error1;
822 : unsigned char ksc[2];
823 : PRUnichar uni;
824 0 : PRInt32 kscLen = 2, uniLen = 1;
825 : // ((mData/94)+0x21) is the original 1st byte.
826 : // *src is the present 2nd byte.
827 : // Put 2 bytes (one character) to ksc[] with EUC-KR encoding.
828 0 : ksc[0] = ((mData / 94) + 0x21) | 0x80;
829 0 : ksc[1] = *src | 0x80;
830 : // Convert EUC-KR to unicode.
831 : mEUCKRDecoder->Convert((const char *)ksc, &kscLen,
832 0 : &uni, &uniLen);
833 0 : *dest++ = uni;
834 0 : ++mRunLength;
835 : }
836 : }
837 0 : mState = mState_KSC5601_1987;
838 : }
839 0 : break;
840 :
841 : case mState_JISX0212_1990_2ndbyte:
842 : {
843 0 : PRUint8 off = sbIdx[*src];
844 0 : if(0xFF == off) {
845 0 : goto error2;
846 : } else {
847 0 : if (CHECK_OVERRUN(dest, destEnd, 1))
848 0 : goto error1;
849 0 : *dest++ = gJapaneseMap[mData+off];
850 0 : ++mRunLength;
851 : }
852 0 : mState = mState_JISX0212_1990;
853 : }
854 0 : break;
855 :
856 : case mState_ESC_2e: // ESC .
857 : // "ESC ." will designate 96 character set to G2.
858 0 : mState = mLastLegalState;
859 0 : if( 'A' == *src) {
860 0 : G2charset = G2_ISO88591;
861 0 : } else if ('F' == *src) {
862 0 : G2charset = G2_ISO88597;
863 : } else {
864 0 : if (CHECK_OVERRUN(dest, destEnd, 3))
865 0 : goto error1;
866 0 : *dest++ = (PRUnichar) 0x1b;
867 0 : *dest++ = (PRUnichar) '.';
868 0 : if(0x80 & *src)
869 0 : goto error2;
870 0 : *dest++ = (PRUnichar) *src;
871 : }
872 0 : break;
873 :
874 : case mState_ESC_4e: // ESC N
875 : // "ESC N" is the SS2 sequence, that invoke a G2 designated
876 : // character set. Since SS2 is effective only for next one
877 : // character, mState should be returned to the last status.
878 0 : mState = mLastLegalState;
879 0 : if((0x20 <= *src) && (*src <= 0x7F)) {
880 0 : if (G2_ISO88591 == G2charset) {
881 0 : if (CHECK_OVERRUN(dest, destEnd, 1))
882 0 : goto error1;
883 0 : *dest++ = *src | 0x80;
884 0 : ++mRunLength;
885 0 : } else if (G2_ISO88597 == G2charset) {
886 0 : if (!mISO88597Decoder) {
887 : // creating a delegate converter (ISO-8859-7)
888 : nsresult rv;
889 : nsCOMPtr<nsICharsetConverterManager> ccm =
890 0 : do_GetService(kCharsetConverterManagerCID, &rv);
891 0 : if (NS_SUCCEEDED(rv)) {
892 0 : rv = ccm->GetUnicodeDecoderRaw("ISO-8859-7", &mISO88597Decoder);
893 : }
894 : }
895 0 : if (!mISO88597Decoder) {// failed creating a delegate converter
896 0 : goto error2;
897 : } else {
898 0 : if (CHECK_OVERRUN(dest, destEnd, 1))
899 0 : goto error1;
900 : // Put one character with ISO-8859-7 encoding.
901 0 : unsigned char gr = *src | 0x80;
902 : PRUnichar uni;
903 0 : PRInt32 grLen = 1, uniLen = 1;
904 : // Convert ISO-8859-7 to unicode.
905 : mISO88597Decoder->Convert((const char *)&gr, &grLen,
906 0 : &uni, &uniLen);
907 0 : *dest++ = uni;
908 0 : ++mRunLength;
909 : }
910 : } else {// G2charset is G2_unknown (not designated yet)
911 0 : goto error2;
912 0 : }
913 : } else {
914 0 : if (CHECK_OVERRUN(dest, destEnd, 3))
915 0 : goto error1;
916 0 : *dest++ = (PRUnichar) 0x1b;
917 0 : *dest++ = (PRUnichar) 'N';
918 0 : if(0x80 & *src)
919 0 : goto error2;
920 0 : *dest++ = (PRUnichar) *src;
921 : }
922 0 : break;
923 :
924 : case mState_ERROR:
925 0 : mState = mLastLegalState;
926 0 : mRunLength = 0;
927 0 : goto error2;
928 : break;
929 :
930 : } // switch
931 0 : src++;
932 : }
933 0 : *aDestLen = dest - aDest;
934 0 : return NS_OK;
935 : error1:
936 0 : *aDestLen = dest - aDest;
937 0 : *aSrcLen = src - (const unsigned char*)aSrc;
938 0 : return NS_OK_UDEC_MOREOUTPUT;
939 : error2:
940 128 : *aSrcLen = src - (const unsigned char*)aSrc;
941 128 : *aDestLen = dest - aDest;
942 128 : return NS_ERROR_UNEXPECTED;
943 : }
|