1 : /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is mozilla.org code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 1998
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : *
24 : * Alternatively, the contents of this file may be used under the terms of
25 : * either of the GNU General Public License Version 2 or later (the "GPL"),
26 : * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 : * in which case the provisions of the GPL or the LGPL are applicable instead
28 : * of those above. If you wish to allow use of your version of this file only
29 : * under the terms of either the GPL or the LGPL, and not to allow others to
30 : * use your version of this file under the terms of the MPL, indicate your
31 : * decision by deleting the provisions above and replace them with the notice
32 : * and other provisions required by the GPL or the LGPL. If you do not delete
33 : * the provisions above, a recipient may use your version of this file under
34 : * the terms of any one of the MPL, the GPL or the LGPL.
35 : *
36 : * ***** END LICENSE BLOCK ***** */
37 : /**
38 : * A character set converter from HZ to Unicode.
39 : *
40 : *
41 : * @created 08/Sept/1999
42 : * @author Yueheng Xu, Yueheng.Xu@intel.com
43 : *
44 : * Note: in this HZ-GB-2312 converter, we accept a string composed of 7-bit HZ
45 : * encoded Chinese chars, as defined in RFC1843, available at
46 : * http://www.cis.ohio-state.edu/htbin/rfc/rfc1843.html
47 : * and RFC1842 available at http://www.cis.ohio-state.edu/htbin/rfc/rfc1842.html.
48 : *
49 : * Earlier versions of the converter said:
50 : * "In an effort to match the similar extended capability of Microsoft
51 : * Internet Explorer 5.0. We also accept the 8-bit GB encoded chars
52 : * mixed in a HZ string.
53 : * But this should not be a recommended practice for HTML authors."
54 : * However, testing in current versions of IE shows that it only accepts
55 : * 8-bit characters when the converter is in GB state, and when in ASCII
56 : * state each single 8-bit character is converted to U+FFFD
57 : *
58 : * The conversion priority is as follows: first convert 8-bit GB code; then
59 : * consume HZ ESC sequences such as '~{', '~}', '~~'; then, depending on the current
60 : * state (defaulting to the ASCII state) of the string, each 7-bit char is converted as an
61 : * ASCII char, or two 7-bit chars are converted into one Chinese character.
62 : */
63 :
64 :
65 :
66 : #include "nsUCvCnDll.h"
67 : #include "nsHZToUnicode.h"
68 : #include "gbku.h"
69 :
70 : //----------------------------------------------------------------------
71 : // Class nsHZToUnicode [implementation]
72 :
73 : //----------------------------------------------------------------------
74 : // Subclassing of nsTablesDecoderSupport class [implementation]
75 :
76 : #define HZ_STATE_GB 1 // encoding state: two-byte GB text, entered on '~{'
77 : #define HZ_STATE_ASCII 2 // encoding state: single-byte ASCII text (initial state), entered on '~}'
78 : #define HZ_STATE_ODD_BYTE_FLAG 0x80 // OR-ed into mHZState while the first byte of a pair is held pending
79 : #define HZLEAD1 '~' // first byte of every HZ escape sequence
80 : #define HZLEAD2 '{' // second byte of '~{' (switch to GB)
81 : #define HZLEAD3 '}' // second byte of '~}' (switch to ASCII)
82 : #define HZ_ODD_BYTE_STATE (mHZState & (HZ_STATE_ODD_BYTE_FLAG)) // non-zero when a first byte is pending
83 : #define HZ_ENCODING_STATE (mHZState & ~(HZ_STATE_ODD_BYTE_FLAG)) // mHZState with the pending-byte flag masked off
84 :
85 8 : nsHZToUnicode::nsHZToUnicode() : nsBufferDecoderSupport(1)
86 : {
87 8 : mHZState = HZ_STATE_ASCII; // per HZ spec, default to ASCII state
88 8 : mRunLength = 0;
89 8 : mOddByte = 0;
90 8 : }
91 :
92 : //Overwriting the ConvertNoBuff() in nsUCvCnSupport.cpp.
93 65160 : NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
94 : const char* aSrc,
95 : PRInt32 * aSrcLength,
96 : PRUnichar *aDest,
97 : PRInt32 * aDestLength)
98 : {
99 65160 : PRInt32 i=0;
100 65160 : PRInt32 iSrcLength = *aSrcLength;
101 65160 : PRInt32 iDestlen = 0;
102 65160 : *aSrcLength=0;
103 65160 : nsresult res = NS_OK;
104 65160 : char oddByte = mOddByte;
105 :
106 260620 : for (i=0; i<iSrcLength; i++) {
107 195460 : if (iDestlen >= (*aDestLength)) {
108 0 : res = NS_OK_UDEC_MOREOUTPUT;
109 0 : break;
110 : }
111 :
112 195460 : char srcByte = *aSrc++;
113 195460 : (*aSrcLength)++;
114 :
115 195460 : if (!HZ_ODD_BYTE_STATE) {
116 194830 : if (srcByte == HZLEAD1 ||
117 : (HZ_ENCODING_STATE == HZ_STATE_GB &&
118 : (UINT8_IN_RANGE(0x21, srcByte, 0x7E) ||
119 : UINT8_IN_RANGE(0x81, srcByte, 0xFE)))) {
120 630 : oddByte = srcByte;
121 630 : mHZState |= HZ_STATE_ODD_BYTE_FLAG;
122 : } else {
123 : *aDest++ = (srcByte & 0x80) ? UCS2_NO_MAPPING :
124 194200 : CAST_CHAR_TO_UNICHAR(srcByte);
125 194200 : iDestlen++;
126 : }
127 : } else {
128 630 : if (oddByte & 0x80) {
129 : // Accept legal 8-bit GB 2312-80 sequences in GB mode only
130 0 : NS_ASSERTION(HZ_ENCODING_STATE == HZ_STATE_GB,
131 : "Invalid lead byte in ASCII mode");
132 : *aDest++ = (UINT8_IN_RANGE(0x81, oddByte, 0xFE) &&
133 : UINT8_IN_RANGE(0x40, srcByte, 0xFE)) ?
134 0 : mUtil.GBKCharToUnicode(oddByte, srcByte) : UCS2_NO_MAPPING;
135 0 : mRunLength++;
136 0 : iDestlen++;
137 : // otherwise, it is a 7-bit byte
138 : // The source will be an ASCII or a 7-bit HZ code depending on oddByte
139 630 : } else if (oddByte == HZLEAD1) { // if it is lead by '~'
140 538 : switch (srcByte) {
141 : case HZLEAD2:
142 : // we got a '~{'
143 : // we are switching to HZ state
144 16 : mHZState = HZ_STATE_GB;
145 16 : mRunLength = 0;
146 16 : break;
147 :
148 : case HZLEAD3:
149 : // we got a '~}'
150 : // we are switching to ASCII state
151 17 : mHZState = HZ_STATE_ASCII;
152 17 : if (mRunLength == 0) {
153 1 : *aDest++ = UCS2_NO_MAPPING;
154 1 : iDestlen++;
155 : }
156 17 : mRunLength = 0;
157 17 : break;
158 :
159 : case HZLEAD1:
160 : // we got a '~~', process like an ASCII, but no state change
161 0 : *aDest++ = CAST_CHAR_TO_UNICHAR(srcByte);
162 0 : iDestlen++;
163 0 : mRunLength++;
164 0 : break;
165 :
166 : default:
167 : // Undefined ESC sequence '~X': treat as an error if X is a
168 : // printable character or we are in ASCII mode, and resynchronize
169 : // on the second character.
170 : //
171 : // N.B. For compatibility with other implementations, we treat '~\n'
172 : // as an illegal sequence even though RFC1843 permits it, and for
173 : // the same reason we pass through control characters including '\n'
174 : // and ' ' even in GB mode.
175 505 : if (srcByte > 0x20 || HZ_ENCODING_STATE == HZ_STATE_ASCII) {
176 505 : *aDest++ = UCS2_NO_MAPPING;
177 : }
178 505 : aSrc--;
179 505 : (*aSrcLength)--;
180 505 : iDestlen++;
181 505 : break;
182 : }
183 92 : } else if (HZ_ENCODING_STATE == HZ_STATE_GB) {
184 : *aDest++ = (UINT8_IN_RANGE(0x21, oddByte, 0x7E) &&
185 : UINT8_IN_RANGE(0x21, srcByte, 0x7E)) ?
186 92 : mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80) :
187 184 : UCS2_NO_MAPPING;
188 92 : mRunLength++;
189 92 : iDestlen++;
190 : } else {
191 0 : NS_NOTREACHED("2-byte sequence that we don't know how to handle");
192 0 : *aDest++ = UCS2_NO_MAPPING;
193 0 : iDestlen++;
194 : }
195 630 : oddByte = 0;
196 630 : mHZState &= ~HZ_STATE_ODD_BYTE_FLAG;
197 : }
198 : } // for loop
199 65160 : mOddByte = HZ_ODD_BYTE_STATE ? oddByte : 0;
200 65160 : *aDestLength = iDestlen;
201 65160 : return res;
202 : }
203 :
204 :
|