1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* vim:expandtab:shiftwidth=2:tabstop=2:
3 : */
4 : /* ***** BEGIN LICENSE BLOCK *****
5 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 : *
7 : * The contents of this file are subject to the Mozilla Public License Version
8 : * 1.1 (the "License"); you may not use this file except in compliance with
9 : * the License. You may obtain a copy of the License at
10 : * http://www.mozilla.org/MPL/
11 : *
12 : * Software distributed under the License is distributed on an "AS IS" basis,
13 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
14 : * for the specific language governing rights and limitations under the
15 : * License.
16 : *
17 : * The Original Code is GNU C Library code (http://www.gnu.org)
18 : *
19 : * The Initial Developer of the Original Code is
20 : * Bruno Haible <bruno@clisp.org>.
21 : * Portions created by the Initial Developer are Copyright (C) 2002
22 : * the Free Software Foundation. All Rights Reserved.
23 : *
24 : * Contributor(s):
25 : * Jungshik Shin <jshin@mailaps.org>
26 : *
27 : * Alternatively, the contents of this file may be used under the terms of
28 : * either the GNU General Public License Version 2 or later (the "GPL"), or
29 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 : * in which case the provisions of the GPL or the LGPL are applicable instead
31 : * of those above. If you wish to allow use of your version of this file only
32 : * under the terms of either the GPL or the LGPL, and not to allow others to
33 : * use your version of this file under the terms of the MPL, indicate your
34 : * decision by deleting the provisions above and replace them with the notice
35 : * and other provisions required by the GPL or the LGPL. If you do not delete
36 : * the provisions above, a recipient may use your version of this file under
37 : * the terms of any one of the MPL, the GPL or the LGPL.
38 : *
39 : * ***** END LICENSE BLOCK ***** */
40 :
41 : #include "nsUnicodeToTSCII.h"
42 : #include "nsMemory.h"
43 : #include "tamil.h"
44 :
45 : /*
46 : * TSCII is an 8-bit encoding consisting of:
47 : * 0x00..0x7F: ASCII
48 : * 0x80..0x90, 0x95..0x9F, 0xAB..0xFE:
49 : * Tamil letters and glyphs
50 : * 0xA1..0xA5, 0xAA: Tamil combining letters (after the base character)
51 : * 0xA6..0xA8: Tamil combining letters (before the base character)
52 : * 0x91..0x94: Punctuation
53 : * 0xA9: Symbols
54 : */
55 :
56 : //----------------------------------------------------------------------
57 : // Class nsUnicodeToTSCII [implementation]
58 :
// Macro-generated interface boilerplate for nsUnicodeToTSCII, naming
// nsIUnicodeEncoder as the single interface (presumably the usual XPCOM
// AddRef/Release/QueryInterface set -- confirm against the macro definition).
59 0 : NS_IMPL_ISUPPORTS1(nsUnicodeToTSCII, nsIUnicodeEncoder)
60 :
61 : /*
62 : * During UTF-16 (PRUnichar) to TSCII conversion, mBuffer contains
63 : * the last byte (or sometimes the last two bytes) to be output.
64 : * This can be:
65 : * 0x00 Nothing pending.
66 : * 0xB8..0xC9, 0x83..0x86 A consonant.
67 : * 0xEC, 0x8A A consonant with VIRAMA sign (final or joining).
68 : * 0x87, 0xC38A Two consonants combined through a VIRAMA sign.
69 : */
70 :
71 : static const PRUint8 UnicharToTSCII[] =
72 : {
73 : 0, 0, 0, 0xb7, 0, 0xab, 0xac, 0xfe, // 0x0B80..0x0B87
74 : 0xae, 0xaf, 0xb0, 0, 0, 0, 0xb1, 0xb2, // 0x0B88..0x0B8F
75 : 0xb3, 0, 0xb4, 0xb5, 0xb6, 0xb8, 0, 0, // 0x0B90..0x0B97
76 : 0, 0xb9, 0xba, 0, 0x83, 0, 0xbb, 0xbc, // 0x0B98..0x0B9F
77 : 0, 0, 0, 0xbd, 0xbe, 0, 0, 0, // 0x0BA0..0x0BA7
78 : 0xbf, 0xc9, 0xc0, 0, 0, 0, 0xc1, 0xc2, // 0x0BA8..0x0BAF
79 : 0xc3, 0xc8, 0xc4, 0xc7, 0xc6, 0xc5, 0, 0x84, // 0x0BB0..0x0BB7
80 : 0x85, 0x86, 0, 0, 0, 0, 0xa1, 0xa2, // 0x0BB8..0x0BBF
81 : 0xa3, 0xa4, 0xa5, 0, 0, 0, 0xa6, 0xa7, // 0x0BC0..0x0BC7
82 : 0xa8, 0, 0, 0, 0, 0, 0, 0, // 0x0BC8..0x0BCF
83 : 0, 0, 0, 0, 0, 0, 0, 0xaa, // 0x0BD0..0x0BD7
84 : 0, 0, 0, 0, 0, 0, 0, 0, // 0x0BD8..0x0BDF
85 : 0, 0, 0, 0, 0, 0, 0x80, 0x81, // 0x0BE0..0x0BE7
86 : 0x8d, 0x8e, 0x8f, 0x90, 0x95, 0x96, 0x97, 0x98, // 0x0BE8..0x0BEF
87 : 0x9d, 0x9e, 0x9f, 0, 0, 0, 0, 0, // 0x0BF0..0x0BF7
88 : 0, 0, 0, 0, 0, 0, 0, 0 // 0x0BF8..0x0BFF
89 : };
90 :
91 : static const PRUint8 consonant_with_u[] =
92 : {
93 : 0xcc, 0x99, 0xcd, 0x9a, 0xce, 0xcf, 0xd0, 0xd1, 0xd2,
94 : 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb
95 : };
96 :
97 : static const PRUint8 consonant_with_uu[] =
98 : {
99 : 0xdc, 0x9b, 0xdd, 0x9c, 0xde, 0xdf, 0xe0, 0xe1, 0xe2,
100 : 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb
101 : };
102 :
103 : static const PRUint8 consonant_with_virama[18] =
104 : {
105 : 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4,
106 : 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd
107 : };
108 :
109 :
110 : // Modified implementation of Unicode to TSCII converter in glibc by
111 : // Bruno Haible. My modifications are based on Unicode 3.0 chap. 9 and
112 : // the code chart for Tamil.
113 : NS_IMETHODIMP
114 0 : nsUnicodeToTSCII::Convert(const PRUnichar * aSrc, PRInt32 * aSrcLength,
115 : char * aDest, PRInt32 * aDestLength)
116 : {
117 0 : const PRUnichar * src = aSrc;
118 0 : const PRUnichar * srcEnd = aSrc + *aSrcLength;
119 0 : char * dest = aDest;
120 0 : char * destEnd = dest + *aDestLength;
121 :
122 0 : nsresult rv = NS_OK;
123 :
124 0 : while (src < srcEnd && dest < destEnd) {
125 0 : PRUnichar ch = *src;
126 0 : if (mBuffer) {
127 : // Attempt to combine the last character with this one.
128 0 : PRUint32 last = mBuffer;
129 :
130 : // last : consonant
131 0 : if (IS_TSC_CONSONANT(last)) {
132 0 : if (ch == UNI_VOWELSIGN_U && IS_TSC_CONSONANT1(last)) {
133 0 : *dest++ = consonant_with_u[last - TSC_KA];
134 0 : mBuffer = 0;
135 0 : ++src;
136 0 : continue;
137 : }
138 :
139 0 : if (ch == UNI_VOWELSIGN_UU && IS_TSC_CONSONANT1(last)) {
140 0 : *dest++ = consonant_with_uu[last - TSC_KA];
141 0 : mBuffer = 0;
142 0 : ++src;
143 0 : continue;
144 : }
145 :
146 : // reorder. vowel sign goes to the left of consonant
147 0 : if (IS_UNI_LEFT_VOWELSIGN(ch)) {
148 0 : if (dest + 2 > destEnd)
149 0 : goto error_more_output;
150 0 : *dest++ = TSC_LEFT_VOWELSIGN(ch);
151 0 : *dest++ = last;
152 0 : mBuffer = 0;
153 0 : ++src;
154 0 : continue;
155 : }
156 :
157 : // split and reorder. consonant goes bet. two parts
158 0 : if (IS_UNI_2PARTS_VOWELSIGN(ch)) {
159 0 : if (dest + 3 > destEnd)
160 0 : goto error_more_output;
161 0 : *dest++ = TSC_LEFT_VOWEL_PART(ch);
162 0 : *dest++ = last;
163 0 : *dest++ = TSC_RIGHT_VOWEL_PART(ch);
164 0 : mBuffer = 0;
165 0 : ++src;
166 0 : continue;
167 : }
168 :
169 : // Virama
170 0 : if (ch == UNI_VIRAMA) {
171 : // consonant KA can form a conjunct with consonant SSA(SHA).
172 : // buffer dead consonant 'K' for the now.
173 0 : if (last == TSC_KA) {
174 0 : mBuffer = TSC_KA_DEAD;
175 : }
176 : // SA can form a conjunct when followed by 'RA'.
177 : // buffer dead consonant 'S' for the now.
178 0 : else if (last == TSC_SA) {
179 0 : mBuffer = TSC_SA_DEAD;
180 : }
181 : else {
182 : *dest++ = IS_TSC_CONSONANT1(last) ?
183 0 : consonant_with_virama[last - TSC_KA] : last + 5;
184 0 : mBuffer = 0;
185 : }
186 0 : ++src;
187 0 : continue;
188 : }
189 :
190 : // consonant TA forms a ligature with vowel 'I' or 'II'.
191 0 : if (last == TSC_TA && (ch == UNI_VOWELSIGN_I || ch == UNI_VOWELSIGN_II)) {
192 0 : *dest++ = ch - (UNI_VOWELSIGN_I - TSC_TI_LIGA);
193 0 : mBuffer = 0;
194 0 : ++src;
195 0 : continue;
196 : }
197 : }
198 0 : else if (last == TSC_KA_DEAD) {
199 : // Kd + SSA = K.SSA
200 0 : if (ch == UNI_SSA) {
201 0 : mBuffer = TSC_KSSA;
202 0 : ++src;
203 0 : continue;
204 : }
205 : }
206 0 : else if (last == TSC_SA_DEAD) {
207 : // Sd + RA = S.RA. Buffer RA + Sd.
208 0 : if (ch == UNI_RA) {
209 0 : mBuffer = 0xc38a;
210 0 : ++src;
211 0 : continue;
212 : }
213 : }
214 0 : else if (last == TSC_KSSA) {
215 0 : if (ch == UNI_VIRAMA) {
216 0 : *dest++ = (char) TSC_KSSA_DEAD;
217 0 : mBuffer = 0;
218 0 : ++src;
219 0 : continue;
220 : }
221 :
222 : // vowel splitting/reordering should be done around conjuncts as well.
223 : // reorder. vowel sign goes to the left of consonant
224 0 : if (IS_UNI_LEFT_VOWELSIGN(ch)) {
225 0 : if (dest + 2 > destEnd)
226 0 : goto error_more_output;
227 0 : *dest++ = TSC_LEFT_VOWELSIGN(ch);
228 0 : *dest++ = last;
229 0 : mBuffer = 0;
230 0 : ++src;
231 0 : continue;
232 : }
233 :
234 : // split and reorder. consonant goes bet. two parts
235 0 : if (IS_UNI_2PARTS_VOWELSIGN(ch)) {
236 0 : if (dest + 3 > destEnd)
237 0 : goto error_more_output;
238 0 : *dest++ = TSC_LEFT_VOWEL_PART(ch);
239 0 : *dest++ = last;
240 0 : *dest++ = TSC_RIGHT_VOWEL_PART(ch);
241 0 : mBuffer = 0;
242 0 : ++src;
243 0 : continue;
244 : }
245 : }
246 : else {
247 0 : NS_ASSERTION(last == 0xc38a, "No other value can be buffered");
248 0 : if (ch == UNI_VOWELSIGN_II) {
249 0 : *dest++ = (char) TSC_SRII_LIGA;
250 0 : mBuffer = 0;
251 0 : ++src;
252 0 : continue;
253 : }
254 : else {
255 : // put back TSC_SA_DEAD and TSC_RA
256 0 : *dest++ = (char) TSC_SA_DEAD;
257 0 : mBuffer = TSC_RA;
258 0 : ++src;
259 0 : continue;
260 : }
261 : }
262 :
263 : /* Output the buffered character. */
264 0 : if (last >> 8) {
265 0 : if (dest + 2 > destEnd)
266 0 : goto error_more_output;
267 0 : *dest++ = last & 0xff;
268 0 : *dest++ = (last >> 8) & 0xff;
269 : }
270 : else
271 0 : *dest++ = last & 0xff;
272 0 : mBuffer = 0;
273 0 : continue;
274 : }
275 :
276 0 : if (ch < 0x80) // Plain ASCII character.
277 0 : *dest++ = (char)ch;
278 0 : else if (IS_UNI_TAMIL(ch)) {
279 0 : PRUint8 t = UnicharToTSCII[ch - UNI_TAMIL_START];
280 :
281 0 : if (t != 0) {
282 0 : if (IS_TSC_CONSONANT(t))
283 0 : mBuffer = (PRUint32) t;
284 : else
285 0 : *dest++ = t;
286 : }
287 0 : else if (IS_UNI_2PARTS_VOWELSIGN(ch)) {
288 : // actually this is an illegal sequence.
289 0 : if (dest + 2 > destEnd)
290 0 : goto error_more_output;
291 :
292 0 : *dest++ = TSC_LEFT_VOWEL_PART(ch);
293 0 : *dest++ = TSC_RIGHT_VOWEL_PART(ch);
294 : }
295 : else {
296 0 : *aDestLength = dest - aDest;
297 0 : return NS_ERROR_UENC_NOMAPPING;
298 0 : }
299 : }
300 0 : else if (ch == 0x00A9)
301 0 : *dest++ = (char)ch;
302 0 : else if (IS_UNI_SINGLE_QUOTE(ch))
303 0 : *dest++ = ch - UNI_LEFT_SINGLE_QUOTE + TSC_LEFT_SINGLE_QUOTE;
304 0 : else if (IS_UNI_DOUBLE_QUOTE(ch))
305 0 : *dest++ = ch - UNI_LEFT_DOUBLE_QUOTE + TSC_LEFT_DOUBLE_QUOTE;
306 : else {
307 0 : *aDestLength = dest - aDest;
308 0 : return NS_ERROR_UENC_NOMAPPING;
309 : }
310 :
311 : /* Now that we wrote the output increment the input pointer. */
312 0 : ++src;
313 : }
314 :
315 : // flush the buffer
316 0 : if (mBuffer >> 8) {
317 : // Write out the last character, two bytes.
318 0 : if (dest + 2 > destEnd)
319 0 : goto error_more_output;
320 0 : *dest++ = (mBuffer >> 8) & 0xff;
321 0 : *dest++ = mBuffer & 0xff;
322 0 : mBuffer = 0;
323 : }
324 0 : else if (mBuffer) {
325 : // Write out the last character, a single byte.
326 0 : if (dest >= destEnd)
327 0 : goto error_more_output;
328 0 : *dest++ = mBuffer & 0xff;
329 0 : mBuffer = 0;
330 : }
331 :
332 0 : *aSrcLength = src - aSrc;
333 0 : *aDestLength = dest - aDest;
334 0 : return rv;
335 :
336 : error_more_output:
337 0 : *aSrcLength = src - aSrc;
338 0 : *aDestLength = dest - aDest;
339 0 : return NS_OK_UENC_MOREOUTPUT;
340 : }
341 :
342 : NS_IMETHODIMP
343 0 : nsUnicodeToTSCII::Finish(char* aDest, PRInt32* aDestLength)
344 : {
345 0 : if (!mBuffer) {
346 0 : *aDestLength = 0;
347 0 : return NS_OK;
348 : }
349 :
350 0 : if (mBuffer >> 8) {
351 : // Write out the last character, two bytes.
352 0 : if (*aDestLength < 2) {
353 0 : *aDestLength = 0;
354 0 : return NS_OK_UENC_MOREOUTPUT;
355 : }
356 0 : *aDest++ = (mBuffer >> 8) & 0xff;
357 0 : *aDest++ = mBuffer & 0xff;
358 0 : mBuffer = 0;
359 0 : *aDestLength = 2;
360 : }
361 : else {
362 : // Write out the last character, a single byte.
363 0 : if (*aDestLength < 1) {
364 0 : *aDestLength = 0;
365 0 : return NS_OK_UENC_MOREOUTPUT;
366 : }
367 0 : *aDest++ = mBuffer & 0xff;
368 0 : mBuffer = 0;
369 0 : *aDestLength = 1;
370 : }
371 0 : return NS_OK;
372 : }
373 :
374 : //================================================================
375 : NS_IMETHODIMP
376 0 : nsUnicodeToTSCII::Reset()
377 : {
378 0 : mBuffer = 0;
379 0 : return NS_OK;
380 : }
381 :
382 : NS_IMETHODIMP
383 0 : nsUnicodeToTSCII::GetMaxLength(const PRUnichar * aSrc, PRInt32 aSrcLength,
384 : PRInt32 * aDestLength)
385 : {
386 : // Some Tamil letters can be decomposed into 2 glyphs in TSCII.
387 0 : *aDestLength = aSrcLength * 2;
388 0 : return NS_OK;
389 : }
390 :
391 :
392 : NS_IMETHODIMP
393 0 : nsUnicodeToTSCII::SetOutputErrorBehavior(PRInt32 aBehavior,
394 : nsIUnicharEncoder *aEncoder,
395 : PRUnichar aChar)
396 : {
397 0 : return NS_OK;
398 : }
399 :
400 :
401 : // same as the mapping of the C1(0x80-0x9f) part of Windows-1252 to Unicode
402 : const static PRUnichar gTSCIIToTTF[] = {
403 : 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
404 : 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
405 : 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
406 : 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
407 : };
408 :
409 : //----------------------------------------------------------------------
410 : // Class nsUnicodeToTamilTTF [implementation]
411 : //
// Macro-generated interface boilerplate for nsUnicodeToTamilTTF, naming
// nsUnicodeToTSCII as its base class and adding no interfaces of its own
// (presumably it forwards to the base implementation -- confirm against
// the macro definition).
412 0 : NS_IMPL_ISUPPORTS_INHERITED0(nsUnicodeToTamilTTF, nsUnicodeToTSCII)
413 :
414 : NS_IMETHODIMP
415 0 : nsUnicodeToTamilTTF::Convert(const PRUnichar * aSrc,
416 : PRInt32 * aSrcLength, char * aDest,
417 : PRInt32 * aDestLength)
418 : {
419 :
420 : PRInt32 medLen, destLen;
421 : char *med;
422 :
423 0 : GetMaxLength(aSrc, *aSrcLength, &destLen);
424 0 : NS_ASSERTION(destLen <= *aDestLength, "insufficient dest. buffer size");
425 :
426 : // TSCII converter is a single byte encoder and takes half the space
427 : // taken by TamilTTF encoder.
428 0 : medLen = destLen / 2;
429 :
430 0 : if (medLen > CHAR_BUFFER_SIZE) {
431 0 : med = (char *) nsMemory::Alloc(medLen);
432 0 : if (!med)
433 0 : return NS_ERROR_OUT_OF_MEMORY;
434 : }
435 : else
436 0 : med = mStaticBuffer;
437 :
438 0 : nsresult rv = nsUnicodeToTSCII::Convert(aSrc, aSrcLength, med, &medLen);
439 :
440 0 : if (NS_FAILED(rv)) {
441 0 : if (med != mStaticBuffer)
442 0 : nsMemory::Free(med);
443 0 : return rv;
444 : }
445 :
446 : PRInt32 i, j;
447 :
448 : // widen 8bit TSCII to pseudo-Unicode font encoding of TSCII-Tamil font
449 0 : for (i = 0, j = 0; i < medLen; i++) {
450 : // Only C1 part(0x80-0x9f) needs to be mapped as if they're CP1251.
451 0 : PRUnichar ucs2 = (med[i] & 0xe0) == 0x80 ?
452 0 : gTSCIIToTTF[med[i] & 0x7f] : PRUint8(med[i]);
453 : // A lot of TSCII fonts are still based on TSCII 1.6 so that
454 : // they have Tamil vowel 'I' at 0xad instead of 0xfe.
455 0 : if (ucs2 == 0xfe) ucs2 = 0xad;
456 0 : aDest[j++] = PRUint8((ucs2 & 0xff00) >> 8);
457 0 : aDest[j++] = PRUint8(ucs2 & 0x00ff);
458 : }
459 :
460 0 : *aDestLength = j;
461 :
462 0 : if (med != mStaticBuffer)
463 0 : nsMemory::Free(med);
464 :
465 0 : return NS_OK;
466 : }
467 :
468 : NS_IMETHODIMP
469 0 : nsUnicodeToTamilTTF::GetMaxLength(const PRUnichar * aSrc, PRInt32 aSrcLength, PRInt32 * aDestLength)
470 : {
471 : // Each Tamil character can generate at most two presentation forms,
472 : // but we're 'extending' them to 16bit shorts, which accounts for
473 : // additional factor of 2.
474 0 : *aDestLength = (aSrcLength + 1) * 4;
475 :
476 0 : return NS_OK;
477 : }
478 :
479 : NS_IMETHODIMP
480 0 : nsUnicodeToTamilTTF::SetOutputErrorBehavior(PRInt32 aBehavior,
481 : nsIUnicharEncoder *aEncoder,
482 : PRUnichar aChar)
483 : {
484 0 : if (aBehavior == kOnError_CallBack && aEncoder == nsnull)
485 0 : return NS_ERROR_NULL_POINTER;
486 0 : mErrEncoder = aEncoder;
487 0 : mErrBehavior = aBehavior;
488 0 : mErrChar = aChar;
489 0 : return NS_OK;
490 : }
491 :
|