1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is Mozilla Communicator client code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 1998
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : *
24 : * Alternatively, the contents of this file may be used under the terms of
25 : * either of the GNU General Public License Version 2 or later (the "GPL"),
26 : * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 : * in which case the provisions of the GPL or the LGPL are applicable instead
28 : * of those above. If you wish to allow use of your version of this file only
29 : * under the terms of either the GPL or the LGPL, and not to allow others to
30 : * use your version of this file under the terms of the MPL, indicate your
31 : * decision by deleting the provisions above and replace them with the notice
32 : * and other provisions required by the GPL or the LGPL. If you do not delete
33 : * the provisions above, a recipient may use your version of this file under
34 : * the terms of any one of the MPL, the GPL or the LGPL.
35 : *
36 : * ***** END LICENSE BLOCK ***** */
37 :
38 : #include "nsUCConstructors.h"
39 : #include "nsUCS2BEToUnicode.h"
40 : #include "nsUCvLatinDll.h"
41 : #include "nsCharTraits.h"
42 : #include <string.h>
43 : #include "prtypes.h"
44 :
// Decoder states stored in the converters' |mState| and passed to
// |UTF16ConvertToUnicode| as |aState|.

// No BOM handling pending and no byte carried over from the previous call.
#define STATE_NORMAL 0
// The previous call ended on an odd byte; it is kept in |aOddByte| and is
// combined with the first byte of the next buffer.
#define STATE_HALF_CODE_POINT 1
// Nothing has been converted since the last Reset(); the start of the input
// still has to be examined for a BOM.
#define STATE_FIRST_CALL 2
// The calling |Convert| recognised a BOM at the start of the buffer; the
// shared conversion routine skips those two bytes.
#define STATE_FOUND_BOM 3
// A complete surrogate pair was read but the output buffer had no room for
// it; the pair is kept in |aOddHighSurrogate|/|aOddLowSurrogate| until the
// next call.
#define STATE_ODD_SURROGATE_PAIR 4
50 :
51 : static nsresult
52 351 : UTF16ConvertToUnicode(PRUint8& aState, PRUint8& aOddByte,
53 : PRUnichar& aOddHighSurrogate, PRUnichar& aOddLowSurrogate,
54 : const char * aSrc,
55 : PRInt32 * aSrcLength, PRUnichar * aDest,
56 : PRInt32 * aDestLength,
57 : bool aSwapBytes)
58 : {
59 351 : const char* src = aSrc;
60 351 : const char* srcEnd = aSrc + *aSrcLength;
61 351 : PRUnichar* dest = aDest;
62 351 : PRUnichar* destEnd = aDest + *aDestLength;
63 :
64 351 : switch(aState) {
65 : case STATE_FOUND_BOM:
66 19 : NS_ASSERTION(*aSrcLength > 1, "buffer too short");
67 19 : src+=2;
68 19 : aState = STATE_NORMAL;
69 19 : break;
70 :
71 : case STATE_FIRST_CALL: // first time called
72 60 : NS_ASSERTION(*aSrcLength > 1, "buffer too short");
73 : // Eliminate BOM (0xFEFF). Note that different endian case is taken care
74 : // of in |Convert| of LE and BE converters. Here, we only have to
75 : // deal with the same endian case. That is, 0xFFFE (byte-swapped BOM) is
76 : // illegal.
77 60 : if(0xFEFF == *((PRUnichar*)src)) {
78 2 : src+=2;
79 58 : } else if(0xFFFE == *((PRUnichar*)src)) {
80 0 : *aSrcLength=0;
81 0 : *aDestLength=0;
82 0 : return NS_ERROR_ILLEGAL_INPUT;
83 : }
84 60 : aState = STATE_NORMAL;
85 60 : break;
86 :
87 : case STATE_ODD_SURROGATE_PAIR:
88 0 : if (*aDestLength < 2)
89 0 : *dest++ = UCS2_REPLACEMENT_CHAR;
90 : else {
91 0 : *dest++ = aOddHighSurrogate;
92 0 : *dest++ = aOddLowSurrogate;
93 0 : aOddHighSurrogate = aOddLowSurrogate = 0;
94 0 : aState = STATE_NORMAL;
95 : }
96 0 : break;
97 :
98 : case STATE_NORMAL:
99 : case STATE_HALF_CODE_POINT:
100 : default:
101 272 : break;
102 : }
103 :
104 351 : if (src == srcEnd) {
105 0 : *aDestLength = dest - aDest;
106 0 : return NS_OK;
107 : }
108 :
109 351 : PRUnichar oddHighSurrogate = aOddHighSurrogate;
110 :
111 : const char* srcEvenEnd;
112 :
113 : PRUnichar u;
114 351 : if (aState == STATE_HALF_CODE_POINT) {
115 : // the 1st byte of a 16-bit code unit was stored in |aOddByte| in the
116 : // previous run while the 2nd byte has to come from |*src|.
117 56 : aState = STATE_NORMAL;
118 : #ifdef IS_BIG_ENDIAN
119 : u = (aOddByte << 8) | *src++; // safe, we know we have at least one byte.
120 : #else
121 56 : u = (*src++ << 8) | aOddByte; // safe, we know we have at least one byte.
122 : #endif
123 56 : srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
124 56 : goto have_codepoint;
125 : } else {
126 295 : srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
127 : }
128 :
129 39608 : while (src != srcEvenEnd) {
130 38962 : if (dest == destEnd)
131 0 : goto error;
132 :
133 : #if !defined(__sparc__) && !defined(__arm__)
134 38962 : u = *(const PRUnichar*)src;
135 : #else
136 : memcpy(&u, src, 2);
137 : #endif
138 38962 : src += 2;
139 :
140 : have_codepoint:
141 39018 : if (aSwapBytes)
142 13700 : u = u << 8 | u >> 8;
143 :
144 39018 : if (!IS_SURROGATE(u)) {
145 38642 : if (oddHighSurrogate) {
146 32 : *dest++ = UCS2_REPLACEMENT_CHAR;
147 32 : if (dest == destEnd)
148 0 : goto error;
149 32 : oddHighSurrogate = 0;
150 : }
151 38642 : *dest++ = u;
152 376 : } else if (NS_IS_HIGH_SURROGATE(u)) {
153 192 : if (oddHighSurrogate) {
154 16 : *dest++ = UCS2_REPLACEMENT_CHAR;
155 16 : if (dest == destEnd)
156 0 : goto error;
157 : }
158 192 : oddHighSurrogate = u;
159 : }
160 : else /* if (NS_IS_LOW_SURROGATE(u)) */ {
161 184 : if (oddHighSurrogate && *aDestLength > 1) {
162 136 : if (dest + 1 >= destEnd) {
163 0 : aOddLowSurrogate = u;
164 0 : aOddHighSurrogate = oddHighSurrogate;
165 0 : aState = STATE_ODD_SURROGATE_PAIR;
166 0 : goto error;
167 : }
168 136 : *dest++ = oddHighSurrogate;
169 136 : *dest++ = u;
170 : } else {
171 48 : *dest++ = UCS2_REPLACEMENT_CHAR;
172 : }
173 184 : oddHighSurrogate = 0;
174 : }
175 : }
176 351 : if (src != srcEnd) {
177 : // store the lead byte of a 16-bit unit for the next run.
178 72 : aOddByte = *src++;
179 72 : aState = STATE_HALF_CODE_POINT;
180 : }
181 :
182 351 : aOddHighSurrogate = oddHighSurrogate;
183 :
184 351 : *aDestLength = dest - aDest;
185 351 : *aSrcLength = src - aSrc;
186 351 : return NS_OK;
187 :
188 : error:
189 0 : *aDestLength = dest - aDest;
190 0 : *aSrcLength = src - aSrc;
191 0 : return NS_OK_UDEC_MOREOUTPUT;
192 : }
193 :
194 : NS_IMETHODIMP
195 167 : nsUTF16ToUnicodeBase::Reset()
196 : {
197 167 : mState = STATE_FIRST_CALL;
198 167 : mOddByte = 0;
199 167 : mOddHighSurrogate = 0;
200 167 : mOddLowSurrogate = 0;
201 167 : return NS_OK;
202 : }
203 :
204 : NS_IMETHODIMP
205 385 : nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, PRInt32 aSrcLength,
206 : PRInt32 * aDestLength)
207 : {
208 : // the left-over data of the previous run have to be taken into account.
209 385 : *aDestLength = (aSrcLength + ((STATE_HALF_CODE_POINT == mState) ? 1 : 0)) / 2;
210 385 : if (mOddHighSurrogate)
211 0 : (*aDestLength)++;
212 385 : if (mOddLowSurrogate)
213 0 : (*aDestLength)++;
214 385 : return NS_OK;
215 : }
216 :
217 :
218 : NS_IMETHODIMP
219 286 : nsUTF16BEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
220 : PRUnichar * aDest, PRInt32 * aDestLength)
221 : {
222 286 : if(STATE_FIRST_CALL == mState && *aSrcLength < 2)
223 : {
224 129 : nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
225 129 : *aSrcLength=0;
226 129 : *aDestLength=0;
227 129 : return res;
228 : }
229 : #ifdef IS_LITTLE_ENDIAN
230 : // Remove the BOM if we're little-endian. The 'same endian' case with the
231 : // leading BOM will be taken care of by |UTF16ConvertToUnicode|.
232 157 : if(STATE_FIRST_CALL == mState) // Called for the first time.
233 : {
234 60 : mState = STATE_NORMAL;
235 60 : if(0xFFFE == *((PRUnichar*)aSrc)) {
236 : // eliminate BOM (on LE machines, BE BOM is 0xFFFE)
237 2 : mState = STATE_FOUND_BOM;
238 58 : } else if(0xFEFF == *((PRUnichar*)aSrc)) {
239 0 : *aSrcLength=0;
240 0 : *aDestLength=0;
241 0 : return NS_ERROR_ILLEGAL_INPUT;
242 : }
243 : }
244 : #endif
245 :
246 : nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
247 : mOddLowSurrogate,
248 : aSrc, aSrcLength, aDest, aDestLength,
249 : #ifdef IS_LITTLE_ENDIAN
250 : true
251 : #else
252 : false
253 : #endif
254 157 : );
255 157 : return rv;
256 : }
257 :
258 : NS_IMETHODIMP
259 285 : nsUTF16LEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
260 : PRUnichar * aDest, PRInt32 * aDestLength)
261 : {
262 285 : if(STATE_FIRST_CALL == mState && *aSrcLength < 2)
263 : {
264 128 : nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
265 128 : *aSrcLength=0;
266 128 : *aDestLength=0;
267 128 : return res;
268 : }
269 : #ifdef IS_BIG_ENDIAN
270 : // Remove the BOM if we're big-endian. The 'same endian' case with the
271 : // leading BOM will be taken care of by |UTF16ConvertToUnicode|.
272 : if(STATE_FIRST_CALL == mState) // first time called
273 : {
274 : mState = STATE_NORMAL;
275 : if(0xFFFE == *((PRUnichar*)aSrc)) {
276 : // eliminate BOM (on BE machines, LE BOM is 0xFFFE)
277 : mState = STATE_FOUND_BOM;
278 : } else if(0xFEFF == *((PRUnichar*)aSrc)) {
279 : *aSrcLength=0;
280 : *aDestLength=0;
281 : return NS_ERROR_ILLEGAL_INPUT;
282 : }
283 : }
284 : #endif
285 :
286 : nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
287 : mOddLowSurrogate,
288 : aSrc, aSrcLength, aDest, aDestLength,
289 : #ifdef IS_BIG_ENDIAN
290 : true
291 : #else
292 : false
293 : #endif
294 157 : );
295 157 : return rv;
296 : }
297 :
298 : NS_IMETHODIMP
299 22 : nsUTF16ToUnicode::Reset()
300 : {
301 22 : mEndian = kUnknown;
302 22 : mFoundBOM = false;
303 22 : return nsUTF16ToUnicodeBase::Reset();
304 : }
305 :
306 : NS_IMETHODIMP
307 165 : nsUTF16ToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
308 : PRUnichar * aDest, PRInt32 * aDestLength)
309 : {
310 165 : if(STATE_FIRST_CALL == mState && *aSrcLength < 2)
311 : {
312 128 : nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
313 128 : *aSrcLength=0;
314 128 : *aDestLength=0;
315 128 : return res;
316 : }
317 37 : if(STATE_FIRST_CALL == mState) // first time called
318 : {
319 21 : mState = STATE_NORMAL;
320 : // check if BOM (0xFEFF) is at the beginning, remove it if found, and
321 : // set mEndian accordingly.
322 21 : if(0xFF == PRUint8(aSrc[0]) && 0xFE == PRUint8(aSrc[1])) {
323 15 : mState = STATE_FOUND_BOM;
324 15 : mEndian = kLittleEndian;
325 15 : mFoundBOM = true;
326 : }
327 6 : else if(0xFE == PRUint8(aSrc[0]) && 0xFF == PRUint8(aSrc[1])) {
328 2 : mState = STATE_FOUND_BOM;
329 2 : mEndian = kBigEndian;
330 2 : mFoundBOM = true;
331 : }
332 : // BOM is not found, but we can use a simple heuristic to determine
333 : // the endianness. Assume the first character is [U+0001, U+00FF].
334 : // Not always valid, but it's very likely to hold for html/xml/css.
335 4 : else if(!aSrc[0] && aSrc[1]) { // 0x00 0xhh (hh != 00)
336 2 : mEndian = kBigEndian;
337 : }
338 2 : else if(aSrc[0] && !aSrc[1]) { // 0xhh 0x00 (hh != 00)
339 2 : mEndian = kLittleEndian;
340 : }
341 : else { // Neither BOM nor 'plausible' byte patterns at the beginning.
342 : // Just assume it's BE (following Unicode standard)
343 : // and let the garbage show up in the browser. (security concern?)
344 : // (bug 246194)
345 0 : mEndian = kBigEndian;
346 : }
347 : }
348 :
349 : nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
350 : mOddLowSurrogate,
351 : aSrc, aSrcLength, aDest, aDestLength,
352 : #ifdef IS_BIG_ENDIAN
353 : (mEndian == kLittleEndian)
354 : #elif defined(IS_LITTLE_ENDIAN)
355 : (mEndian == kBigEndian)
356 : #else
357 : #error "Unknown endianness"
358 : #endif
359 37 : );
360 :
361 : // If BOM is not found and we're to return NS_OK, signal that BOM
362 : // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode|
363 37 : return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv;
364 : }
|