1 : /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is mozilla.org code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 1998
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : *
24 : * Alternatively, the contents of this file may be used under the terms of
25 : * either of the GNU General Public License Version 2 or later (the "GPL"),
26 : * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 : * in which case the provisions of the GPL or the LGPL are applicable instead
28 : * of those above. If you wish to allow use of your version of this file only
29 : * under the terms of either the GPL or the LGPL, and not to allow others to
30 : * use your version of this file under the terms of the MPL, indicate your
31 : * decision by deleting the provisions above and replace them with the notice
32 : * and other provisions required by the GPL or the LGPL. If you do not delete
33 : * the provisions above, a recipient may use your version of this file under
34 : * the terms of any one of the MPL, the GPL or the LGPL.
35 : *
36 : * ***** END LICENSE BLOCK ***** */
37 : /**
38 : * A character set converter from GBK to Unicode.
39 : *
40 : *
41 : * @created 07/Sept/1999
42 : * @author Yueheng Xu, Yueheng.Xu@intel.com
43 : */
44 :
45 : #include "nsGBKToUnicode.h"
46 : #include "nsUCvCnDll.h"
47 : #include "gbku.h"
48 :
49 :
50 : //------------------------------------------------------------
51 : // nsGBKUnique2BytesToUnicode
52 : //------------------------------------------------------------
53 : class nsGBKUnique2BytesToUnicode : public nsTableDecoderSupport
54 : {
55 : public:
56 : nsGBKUnique2BytesToUnicode();
57 0 : virtual ~nsGBKUnique2BytesToUnicode()
58 0 : { }
59 : protected:
60 : };
61 :
62 : static const PRUint16 g_utGBKUnique2Bytes[] = {
63 : #include "gbkuniq2b.ut"
64 : };
65 0 : nsGBKUnique2BytesToUnicode::nsGBKUnique2BytesToUnicode()
66 : : nsTableDecoderSupport(u2BytesCharset, nsnull,
67 0 : (uMappingTable*) &g_utGBKUnique2Bytes, 1)
68 : {
69 0 : }
70 :
71 : //------------------------------------------------------------
72 : // nsGB18030Unique2BytesToUnicode
73 : //------------------------------------------------------------
74 : class nsGB18030Unique2BytesToUnicode : public nsTableDecoderSupport
75 : {
76 : public:
77 : nsGB18030Unique2BytesToUnicode();
78 2 : virtual ~nsGB18030Unique2BytesToUnicode()
79 4 : { }
80 : protected:
81 : };
82 :
83 : static const PRUint16 g_utGB18030Unique2Bytes[] = {
84 : #include "gb18030uniq2b.ut"
85 : };
86 1 : nsGB18030Unique2BytesToUnicode::nsGB18030Unique2BytesToUnicode()
87 : : nsTableDecoderSupport(u2BytesCharset, nsnull,
88 1 : (uMappingTable*) &g_utGB18030Unique2Bytes, 1)
89 : {
90 1 : }
91 :
92 : //------------------------------------------------------------
93 : // nsGB18030Unique4BytesToUnicode
94 : //------------------------------------------------------------
95 : class nsGB18030Unique4BytesToUnicode : public nsTableDecoderSupport
96 : {
97 : public:
98 : nsGB18030Unique4BytesToUnicode();
99 2 : virtual ~nsGB18030Unique4BytesToUnicode()
100 4 : { }
101 : protected:
102 : };
103 :
104 : static const PRUint16 g_utGB18030Unique4Bytes[] = {
105 : #include "gb180304bytes.ut"
106 : };
107 1 : nsGB18030Unique4BytesToUnicode::nsGB18030Unique4BytesToUnicode()
108 : : nsTableDecoderSupport(u4BytesGB18030Charset, nsnull,
109 1 : (uMappingTable*) &g_utGB18030Unique4Bytes, 1)
110 : {
111 1 : }
112 :
113 :
114 : //----------------------------------------------------------------------
115 : // Class nsGBKToUnicode [implementation]
116 :
117 : //----------------------------------------------------------------------
118 : // Subclassing of nsTablesDecoderSupport class [implementation]
119 :
// Byte-range predicates used while scanning GBK / GB18030 input.

// First byte of any multi-byte sequence: 0x81..0xFE.
#define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c) (UINT8_IN_RANGE(0x81, (c), 0xFE))

// First byte (0x90..0xFE) of a four-byte sequence that is decoded to a
// surrogate pair instead of going through the four-byte table decoder.
#define FIRST_BYTE_IS_SURROGATE(c) (UINT8_IN_RANGE(0x90, (c), 0xFE))

// Second byte of a two-byte sequence: 0x40..0x7E or 0x80..0xFE.
#define LEGAL_GBK_2BYTE_SECOND_BYTE(c) (UINT8_IN_RANGE(0x40, (c), 0x7E)|| UINT8_IN_RANGE(0x80, (c), 0xFE))

// Second byte of a four-byte sequence: 0x30..0x39.
#define LEGAL_GBK_4BYTE_SECOND_BYTE(c) (UINT8_IN_RANGE(0x30, (c), 0x39))

// Third byte of a four-byte sequence: 0x81..0xFE.
#define LEGAL_GBK_4BYTE_THIRD_BYTE(c) (UINT8_IN_RANGE(0x81, (c), 0xFE))

// Fourth byte of a four-byte sequence: 0x30..0x39.
#define LEGAL_GBK_4BYTE_FORTH_BYTE(c) (UINT8_IN_RANGE(0x30, (c), 0x39))
132 :
133 66231 : NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,
134 : PRInt32 * aSrcLength,
135 : PRUnichar *aDest,
136 : PRInt32 * aDestLength)
137 : {
138 66231 : PRInt32 i=0;
139 66231 : PRInt32 iSrcLength = (*aSrcLength);
140 66231 : PRInt32 iDestlen = 0;
141 66231 : nsresult rv=NS_OK;
142 66231 : *aSrcLength = 0;
143 :
144 219907 : for (i=0;i<iSrcLength;i++)
145 : {
146 155755 : if ( iDestlen >= (*aDestLength) )
147 : {
148 1260 : rv = NS_OK_UDEC_MOREOUTPUT;
149 1260 : break;
150 : }
151 : // The valid range for the 1st byte is [0x81,0xFE]
152 154495 : if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc))
153 : {
154 48967 : if(i+1 >= iSrcLength)
155 : {
156 189 : rv = NS_OK_UDEC_MOREINPUT;
157 189 : break;
158 : }
159 : // To make sure, the second byte has to be checked as well.
160 : // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE]
161 96926 : if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1]))
162 : {
163 : // Valid GBK code
164 40572 : *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
165 40572 : if(UCS2_NO_MAPPING == *aDest)
166 : {
167 : // We cannot map in the common mapping, let's call the
168 : // delegate 2 byte decoder to decode the gbk or gb18030 unique
169 : // 2 byte mapping
170 255 : if(! TryExtensionDecoder(aSrc, aDest))
171 : {
172 0 : *aDest = UCS2_NO_MAPPING;
173 : }
174 : }
175 40572 : aSrc += 2;
176 40572 : i++;
177 : }
178 8206 : else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
179 : {
180 : // from the first 2 bytes, it looks like a 4 byte GB18030
181 1276 : if(i+3 >= iSrcLength) // make sure we got 4 bytes
182 : {
183 630 : rv = NS_OK_UDEC_MOREINPUT;
184 630 : break;
185 : }
186 : // 4 bytes patten
187 : // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39]
188 : // preset the
189 :
190 1324 : if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) &&
191 32 : LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
192 : {
193 16 : if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
194 : {
195 : // let's call the delegated 4 byte gb18030 converter to convert it
196 16 : if(! Try4BytesDecoder(aSrc, aDest))
197 0 : *aDest = UCS2_NO_MAPPING;
198 : } else {
199 : // let's try supplement mapping
200 8 : if ( (iDestlen+1) < (*aDestLength) )
201 : {
202 8 : if(DecodeToSurrogate(aSrc, aDest))
203 : {
204 : // surrogte two PRUnichar
205 2 : iDestlen++;
206 2 : aDest++;
207 : } else {
208 6 : *aDest = UCS2_NO_MAPPING;
209 : }
210 : } else {
211 0 : if (*aDestLength < 2) {
212 0 : NS_ERROR("insufficient space in output buffer");
213 0 : *aDest = UCS2_NO_MAPPING;
214 : } else {
215 0 : rv = NS_OK_UDEC_MOREOUTPUT;
216 0 : break;
217 : }
218 : }
219 : }
220 16 : aSrc += 4;
221 16 : i += 3;
222 : } else {
223 630 : *aDest = UCS2_NO_MAPPING;
224 : // If the third and fourth bytes are not in the legal ranges for
225 : // a four-byte sequnce, resynchronize on the second byte
226 : // (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE,
227 : // 0x30-0x39)
228 630 : aSrc++;
229 : }
230 : }
231 6930 : else if ((PRUint8) aSrc[0] == (PRUint8)0xA0 )
232 : {
233 : // stand-alone (not followed by a valid second byte) 0xA0 !
234 : // treat it as valid a la Netscape 4.x
235 55 : *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
236 55 : aSrc++;
237 : } else {
238 : // Invalid GBK code point (second byte should be 0x40 or higher)
239 6875 : *aDest = UCS2_NO_MAPPING;
240 6875 : aSrc++;
241 : }
242 : } else {
243 105528 : if(IS_ASCII(*aSrc))
244 : {
245 : // The source is an ASCII
246 104628 : *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
247 104628 : aSrc++;
248 : } else {
249 900 : if(IS_GBK_EURO(*aSrc)) {
250 387 : *aDest = UCS2_EURO;
251 : } else {
252 513 : *aDest = UCS2_NO_MAPPING;
253 : }
254 900 : aSrc++;
255 : }
256 : }
257 153676 : iDestlen++;
258 153676 : aDest++;
259 153676 : *aSrcLength = i+1;
260 : }
261 66231 : *aDestLength = iDestlen;
262 66231 : return rv;
263 : }
264 :
265 :
266 0 : void nsGBKToUnicode::CreateExtensionDecoder()
267 : {
268 0 : mExtensionDecoder = new nsGBKUnique2BytesToUnicode();
269 0 : }
270 0 : void nsGBKToUnicode::Create4BytesDecoder()
271 : {
272 0 : m4BytesDecoder = nsnull;
273 0 : }
274 1 : void nsGB18030ToUnicode::CreateExtensionDecoder()
275 : {
276 1 : mExtensionDecoder = new nsGB18030Unique2BytesToUnicode();
277 1 : }
278 1 : void nsGB18030ToUnicode::Create4BytesDecoder()
279 : {
280 1 : m4BytesDecoder = new nsGB18030Unique4BytesToUnicode();
281 1 : }
282 8 : bool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, PRUnichar* aOut)
283 : {
284 8 : NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]), "illegal first byte");
285 8 : NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]), "illegal second byte");
286 8 : NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]), "illegal third byte");
287 8 : NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]), "illegal forth byte");
288 8 : if(! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
289 0 : return false;
290 8 : if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
291 0 : return false;
292 8 : if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]))
293 0 : return false;
294 8 : if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
295 0 : return false;
296 :
297 8 : PRUint8 a1 = (PRUint8) aSrc[0];
298 8 : PRUint8 a2 = (PRUint8) aSrc[1];
299 8 : PRUint8 a3 = (PRUint8) aSrc[2];
300 8 : PRUint8 a4 = (PRUint8) aSrc[3];
301 8 : a1 -= (PRUint8)0x90;
302 8 : a2 -= (PRUint8)0x30;
303 8 : a3 -= (PRUint8)0x81;
304 8 : a4 -= (PRUint8)0x30;
305 8 : PRUint32 idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4;
306 : // idx == ucs4Codepoint - 0x10000
307 8 : if (idx > 0x000FFFFF)
308 6 : return false;
309 :
310 2 : *aOut++ = 0xD800 | (idx >> 10);
311 2 : *aOut = 0xDC00 | (0x000003FF & idx);
312 :
313 2 : return true;
314 : }
315 255 : bool nsGBKToUnicode::TryExtensionDecoder(const char* aSrc, PRUnichar* aOut)
316 : {
317 255 : if(!mExtensionDecoder)
318 1 : CreateExtensionDecoder();
319 255 : NS_ASSERTION(mExtensionDecoder, "cannot creqte 2 bytes unique converter");
320 255 : if(mExtensionDecoder)
321 : {
322 255 : nsresult res = mExtensionDecoder->Reset();
323 255 : NS_ASSERTION(NS_SUCCEEDED(res), "2 bytes unique conversoin reset failed");
324 255 : PRInt32 len = 2;
325 255 : PRInt32 dstlen = 1;
326 255 : res = mExtensionDecoder->Convert(aSrc,&len, aOut, &dstlen);
327 255 : NS_ASSERTION(NS_FAILED(res) || ((len==2) && (dstlen == 1)),
328 : "some strange conversion result");
329 : // if we failed, we then just use the 0xfffd
330 : // therefore, we ignore the res here.
331 255 : if(NS_SUCCEEDED(res))
332 255 : return true;
333 : }
334 0 : return false;
335 : }
336 0 : bool nsGBKToUnicode::DecodeToSurrogate(const char* aSrc, PRUnichar* aOut)
337 : {
338 0 : return false;
339 : }
340 8 : bool nsGBKToUnicode::Try4BytesDecoder(const char* aSrc, PRUnichar* aOut)
341 : {
342 8 : if(!m4BytesDecoder)
343 1 : Create4BytesDecoder();
344 8 : if(m4BytesDecoder)
345 : {
346 8 : nsresult res = m4BytesDecoder->Reset();
347 8 : NS_ASSERTION(NS_SUCCEEDED(res), "4 bytes unique conversoin reset failed");
348 8 : PRInt32 len = 4;
349 8 : PRInt32 dstlen = 1;
350 8 : res = m4BytesDecoder->Convert(aSrc,&len, aOut, &dstlen);
351 8 : NS_ASSERTION(NS_FAILED(res) || ((len==4) && (dstlen == 1)),
352 : "some strange conversion result");
353 : // if we failed, we then just use the 0xfffd
354 : // therefore, we ignore the res here.
355 8 : if(NS_SUCCEEDED(res))
356 8 : return true;
357 : }
358 0 : return false;
359 : }
|