1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is Mozilla Communicator client code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 1998
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : *
24 : * Alternatively, the contents of this file may be used under the terms of
25 : * either of the GNU General Public License Version 2 or later (the "GPL"),
26 : * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 : * in which case the provisions of the GPL or the LGPL are applicable instead
28 : * of those above. If you wish to allow use of your version of this file only
29 : * under the terms of either the GPL or the LGPL, and not to allow others to
30 : * use your version of this file under the terms of the MPL, indicate your
31 : * decision by deleting the provisions above and replace them with the notice
32 : * and other provisions required by the GPL or the LGPL. If you do not delete
33 : * the provisions above, a recipient may use your version of this file under
34 : * the terms of any one of the MPL, the GPL or the LGPL.
35 : *
36 : * ***** END LICENSE BLOCK ***** */
37 :
38 : #include "nsAlgorithm.h"
39 : #include "nsUCSupport.h"
40 : #include "nsUTF8ToUnicode.h"
41 : #include "mozilla/SSE.h"
42 :
43 : #define UNICODE_BYTE_ORDER_MARK 0xFEFF
44 :
45 47 : static PRUnichar* EmitSurrogatePair(PRUint32 ucs4, PRUnichar* aDest)
46 : {
47 47 : NS_ASSERTION(ucs4 > 0xFFFF, "Should be a supplementary character");
48 47 : ucs4 -= 0x00010000;
49 47 : *aDest++ = 0xD800 | (0x000003FF & (ucs4 >> 10));
50 47 : *aDest++ = 0xDC00 | (0x000003FF & ucs4);
51 47 : return aDest;
52 : }
53 :
54 : //----------------------------------------------------------------------
55 : // Class nsUTF8ToUnicode [implementation]
56 :
57 5188 : nsUTF8ToUnicode::nsUTF8ToUnicode()
58 5188 : : nsBasicDecoderSupport()
59 : {
60 5188 : Reset();
61 5188 : }
62 :
63 : //----------------------------------------------------------------------
64 : // Subclassing of nsTableDecoderSupport class [implementation]
65 :
66 : /**
67 : * Normally the maximum length of the output of the UTF8 decoder in UTF16
68 : * code units is the same as the length of the input in UTF8 code units,
69 : * since 1-byte, 2-byte and 3-byte UTF-8 sequences decode to a single
70 : * UTF-16 character, and 4-byte UTF-8 sequences decode to a surrogate pair.
71 : *
72 : * However, there is an edge case where the output can be longer than the
73 : * input: if the previous buffer ended with an incomplete multi-byte
74 : * sequence and this buffer does not begin with a valid continuation
75 : * byte, we will return NS_ERROR_ILLEGAL_INPUT and the caller may insert a
76 : * replacement character in the output buffer which corresponds to no
77 : * character in the input buffer. So in the worst case the destination
78 : * will need to be one code unit longer than the source.
79 : * See bug 301797.
80 : */
81 16302 : NS_IMETHODIMP nsUTF8ToUnicode::GetMaxLength(const char * aSrc,
82 : PRInt32 aSrcLength,
83 : PRInt32 * aDestLength)
84 : {
85 16302 : *aDestLength = aSrcLength + 1;
86 16302 : return NS_OK;
87 : }
88 :
89 :
90 : //----------------------------------------------------------------------
91 : // Subclassing of nsBasicDecoderSupport class [implementation]
92 :
93 5512 : NS_IMETHODIMP nsUTF8ToUnicode::Reset()
94 : {
95 :
96 5512 : mUcs4 = 0; // cached Unicode character
97 5512 : mState = 0; // cached expected number of octets after the current octet
98 : // until the beginning of the next UTF8 character sequence
99 5512 : mBytes = 1; // cached expected number of octets in the current sequence
100 5512 : mFirst = true;
101 :
102 5512 : return NS_OK;
103 :
104 : }
105 :
106 : //----------------------------------------------------------------------
107 : // Subclassing of nsBasicDecoderSupport class [implementation]
108 :
109 : // Fast ASCII -> UTF16 inner loop implementations
110 : //
// Convert_ascii_run will update src and dst to the new values, and
// len must be the maximum number of ASCII chars that it would be valid
// to take from src and place into dst. (That is, the minimum of the
// number of bytes left in src and the number of unichars available in
// dst.)
116 :
117 : #if defined(__arm__) || defined(_M_ARM)
118 :
119 : // on ARM, do extra work to avoid byte/halfword reads/writes by
120 : // reading/writing a word at a time for as long as we can
121 : static inline void
122 : Convert_ascii_run (const char *&src,
123 : PRUnichar *&dst,
124 : PRInt32 len)
125 : {
126 : const PRUint32 *src32;
127 : PRUint32 *dst32;
128 :
129 : // with some alignments, we'd never actually break out of the slow loop, so
130 : // check and do the faster slow loop
131 : if ((((NS_PTR_TO_UINT32(dst) & 3) == 0) && ((NS_PTR_TO_UINT32(src) & 1) == 0)) ||
132 : (((NS_PTR_TO_UINT32(dst) & 3) == 2) && ((NS_PTR_TO_UINT32(src) & 1) == 1)))
133 : {
134 : while (((NS_PTR_TO_UINT32(src) & 3) ||
135 : (NS_PTR_TO_UINT32(dst) & 3)) &&
136 : len > 0)
137 : {
138 : if (*src & 0x80U)
139 : return;
140 : *dst++ = (PRUnichar) *src++;
141 : len--;
142 : }
143 : } else {
144 : goto finish;
145 : }
146 :
147 : // then go 4 bytes at a time
148 : src32 = (const PRUint32*) src;
149 : dst32 = (PRUint32*) dst;
150 :
151 : while (len > 4) {
152 : PRUint32 in = *src32++;
153 :
154 : if (in & 0x80808080U) {
155 : src32--;
156 : break;
157 : }
158 :
159 : *dst32++ = ((in & 0x000000ff) >> 0) | ((in & 0x0000ff00) << 8);
160 : *dst32++ = ((in & 0x00ff0000) >> 16) | ((in & 0xff000000) >> 8);
161 :
162 : len -= 4;
163 : }
164 :
165 : src = (const char *) src32;
166 : dst = (PRUnichar *) dst32;
167 :
168 : finish:
169 : while (len-- > 0 && (*src & 0x80U) == 0) {
170 : *dst++ = (PRUnichar) *src++;
171 : }
172 : }
173 :
174 : #else
175 :
176 : #ifdef MOZILLA_MAY_SUPPORT_SSE2
177 : namespace mozilla {
178 : namespace SSE2 {
179 :
180 : void Convert_ascii_run(const char *&src, PRUnichar *&dst, PRInt32 len);
181 :
182 : }
183 : }
184 : #endif
185 :
186 : static inline void
187 17715 : Convert_ascii_run (const char *&src,
188 : PRUnichar *&dst,
189 : PRInt32 len)
190 : {
191 : #ifdef MOZILLA_MAY_SUPPORT_SSE2
192 17715 : if (mozilla::supports_sse2()) {
193 17715 : mozilla::SSE2::Convert_ascii_run(src, dst, len);
194 17715 : return;
195 : }
196 : #endif
197 :
198 0 : while (len-- > 0 && (*src & 0x80U) == 0) {
199 0 : *dst++ = (PRUnichar) *src++;
200 : }
201 : }
202 :
203 : #endif
204 :
205 17210 : NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
206 : PRInt32 * aSrcLength,
207 : PRUnichar * aDest,
208 : PRInt32 * aDestLength)
209 : {
210 17210 : PRUint32 aSrcLen = (PRUint32) (*aSrcLength);
211 17210 : PRUint32 aDestLen = (PRUint32) (*aDestLength);
212 :
213 : const char *in, *inend;
214 17210 : inend = aSrc + aSrcLen;
215 :
216 : PRUnichar *out, *outend;
217 17210 : outend = aDest + aDestLen;
218 :
219 17210 : nsresult res = NS_OK; // conversion result
220 :
221 17210 : out = aDest;
222 17210 : if (mState == 0xFF) {
223 : // Emit supplementary character left over from previous iteration. If the
224 : // buffer size is insufficient, treat it as an illegal character.
225 0 : if (aDestLen < 2) {
226 0 : NS_ERROR("Output buffer insufficient to hold supplementary character");
227 0 : mState = 0;
228 0 : return NS_ERROR_ILLEGAL_INPUT;
229 : }
230 0 : out = EmitSurrogatePair(mUcs4, out);
231 0 : mUcs4 = 0;
232 0 : mState = 0;
233 0 : mBytes = 1;
234 0 : mFirst = false;
235 : }
236 :
237 : // alias these locally for speed
238 17210 : PRInt32 mUcs4 = this->mUcs4;
239 17210 : PRUint8 mState = this->mState;
240 17210 : PRUint8 mBytes = this->mBytes;
241 17210 : bool mFirst = this->mFirst;
242 :
243 : // Set mFirst to false now so we don't have to every time through the ASCII
244 : // branch within the loop.
245 17210 : if (mFirst && aSrcLen && (0 == (0x80 & (*aSrc))))
246 4360 : mFirst = false;
247 :
248 40911 : for (in = aSrc; ((in < inend) && (out < outend)); ++in) {
249 24165 : if (0 == mState) {
250 : // When mState is zero we expect either a US-ASCII character or a
251 : // multi-octet sequence.
252 20707 : if (0 == (0x80 & (*in))) {
253 17715 : PRInt32 max_loops = NS_MIN(inend - in, outend - out);
254 17715 : Convert_ascii_run(in, out, max_loops);
255 17715 : --in; // match the rest of the cases
256 17715 : mBytes = 1;
257 2992 : } else if (0xC0 == (0xE0 & (*in))) {
258 : // First octet of 2 octet sequence
259 2131 : mUcs4 = (PRUint32)(*in);
260 2131 : mUcs4 = (mUcs4 & 0x1F) << 6;
261 2131 : mState = 1;
262 2131 : mBytes = 2;
263 861 : } else if (0xE0 == (0xF0 & (*in))) {
264 : // First octet of 3 octet sequence
265 542 : mUcs4 = (PRUint32)(*in);
266 542 : mUcs4 = (mUcs4 & 0x0F) << 12;
267 542 : mState = 2;
268 542 : mBytes = 3;
269 319 : } else if (0xF0 == (0xF8 & (*in))) {
270 : // First octet of 4 octet sequence
271 82 : mUcs4 = (PRUint32)(*in);
272 82 : mUcs4 = (mUcs4 & 0x07) << 18;
273 82 : mState = 3;
274 82 : mBytes = 4;
275 237 : } else if (0xF8 == (0xFC & (*in))) {
276 : /* First octet of 5 octet sequence.
277 : *
278 : * This is illegal because the encoded codepoint must be either
279 : * (a) not the shortest form or
280 : * (b) outside the Unicode range of 0-0x10FFFF.
281 : * Rather than trying to resynchronize, we will carry on until the end
282 : * of the sequence and let the later error handling code catch it.
283 : */
284 11 : mUcs4 = (PRUint32)(*in);
285 11 : mUcs4 = (mUcs4 & 0x03) << 24;
286 11 : mState = 4;
287 11 : mBytes = 5;
288 226 : } else if (0xFC == (0xFE & (*in))) {
289 : // First octet of 6 octet sequence, see comments for 5 octet sequence.
290 1 : mUcs4 = (PRUint32)(*in);
291 1 : mUcs4 = (mUcs4 & 1) << 30;
292 1 : mState = 5;
293 1 : mBytes = 6;
294 : } else {
295 : /* Current octet is neither in the US-ASCII range nor a legal first
296 : * octet of a multi-octet sequence.
297 : *
298 : * Return an error condition. Caller is responsible for flushing and
299 : * refilling the buffer and resetting state.
300 : */
301 225 : res = NS_ERROR_ILLEGAL_INPUT;
302 225 : break;
303 : }
304 : } else {
305 : // When mState is non-zero, we expect a continuation of the multi-octet
306 : // sequence
307 3458 : if (0x80 == (0xC0 & (*in))) {
308 : // Legal continuation.
309 3247 : PRUint32 shift = (mState - 1) * 6;
310 3247 : PRUint32 tmp = *in;
311 3247 : tmp = (tmp & 0x0000003FL) << shift;
312 3247 : mUcs4 |= tmp;
313 :
314 3247 : if (0 == --mState) {
315 : /* End of the multi-octet sequence. mUcs4 now contains the final
316 : * Unicode codepoint to be output
317 : *
318 : * Check for illegal sequences and codepoints.
319 : */
320 :
321 : // From Unicode 3.1, non-shortest form is illegal
322 2618 : if (((2 == mBytes) && (mUcs4 < 0x0080)) ||
323 : ((3 == mBytes) && (mUcs4 < 0x0800)) ||
324 : ((4 == mBytes) && (mUcs4 < 0x10000)) ||
325 : (4 < mBytes) ||
326 : // From Unicode 3.2, surrogate characters are illegal
327 : ((mUcs4 & 0xFFFFF800) == 0xD800) ||
328 : // Codepoints outside the Unicode range are illegal
329 : (mUcs4 > 0x10FFFF)) {
330 28 : res = NS_ERROR_ILLEGAL_INPUT;
331 28 : break;
332 : }
333 2590 : if (mUcs4 > 0xFFFF) {
334 : // mUcs4 is in the range 0x10000 - 0x10FFFF. Output a UTF-16 pair
335 47 : if (out + 2 > outend) {
336 : // insufficient space left in the buffer. Keep mUcs4 for the
337 : // next iteration.
338 0 : mState = 0xFF;
339 0 : ++in;
340 0 : res = NS_OK_UDEC_MOREOUTPUT;
341 0 : break;
342 : }
343 47 : out = EmitSurrogatePair(mUcs4, out);
344 2543 : } else if (UNICODE_BYTE_ORDER_MARK != mUcs4 || !mFirst) {
345 : // Don't output the BOM only if it is the first character
346 2543 : *out++ = mUcs4;
347 : }
348 : //initialize UTF8 cache
349 2590 : mUcs4 = 0;
350 2590 : mState = 0;
351 2590 : mBytes = 1;
352 2590 : mFirst = false;
353 : }
354 : } else {
355 : /* ((0xC0 & (*in) != 0x80) && (mState != 0))
356 : *
357 : * Incomplete multi-octet sequence. Unconsume this
358 : * octet and return an error condition. Caller is responsible
359 : * for flushing and refilling the buffer and resetting state.
360 : */
361 211 : in--;
362 211 : res = NS_ERROR_ILLEGAL_INPUT;
363 211 : break;
364 : }
365 : }
366 : }
367 :
368 : // output not finished, output buffer too short
369 17210 : if ((NS_OK == res) && (in < inend) && (out >= outend))
370 0 : res = NS_OK_UDEC_MOREOUTPUT;
371 :
372 : // last UCS4 is incomplete, make sure the caller
373 : // returns with properly aligned continuation of the buffer
374 17210 : if ((NS_OK == res) && (mState != 0))
375 4 : res = NS_OK_UDEC_MOREINPUT;
376 :
377 17210 : *aSrcLength = in - aSrc;
378 17210 : *aDestLength = out - aDest;
379 :
380 17210 : this->mUcs4 = mUcs4;
381 17210 : this->mState = mState;
382 17210 : this->mBytes = mBytes;
383 17210 : this->mFirst = mFirst;
384 :
385 17210 : return(res);
386 : }
|