1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is mozilla.org code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 1998
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : *
24 : * Alternatively, the contents of this file may be used under the terms of
25 : * either of the GNU General Public License Version 2 or later (the "GPL"),
26 : * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 : * in which case the provisions of the GPL or the LGPL are applicable instead
28 : * of those above. If you wish to allow use of your version of this file only
29 : * under the terms of either the GPL or the LGPL, and not to allow others to
30 : * use your version of this file under the terms of the MPL, indicate your
31 : * decision by deleting the provisions above and replace them with the notice
32 : * and other provisions required by the GPL or the LGPL. If you do not delete
33 : * the provisions above, a recipient may use your version of this file under
34 : * the terms of any one of the MPL, the GPL or the LGPL.
35 : *
36 : * ***** END LICENSE BLOCK ***** */
37 :
38 :
39 :
40 : #include "nsJISx4501LineBreaker.h"
41 :
42 : #include "pratom.h"
43 : #include "nsLWBRKDll.h"
44 : #include "jisx4501class.h"
45 : #include "nsComplexBreaker.h"
46 : #include "nsTArray.h"
47 : #include "nsUnicharUtils.h"
48 :
49 : /*
50 :
51 : Simplification of Pair Table in JIS X 4051
52 :
53 : 1. The Original Table - in 4.1.3
54 :
55 : In JIS x 4051. The pair table is defined as below
56 :
57 : Class of
58 : Leading Class of Trailing Char Class
59 : Char
60 :
61 : 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20
62 : * # * #
63 : 1 X X X X X X X X X X X X X X X X X X X X X E
64 : 2 X X X X X X
65 : 3 X X X X X X
66 : 4 X X X X X X
67 : 5 X X X X X X
68 : 6 X X X X X X
69 : 7 X X X X X X X
70 : 8 X X X X X X E
71 : 9 X X X X X X
72 : 10 X X X X X X
73 : 11 X X X X X X
74 : 12 X X X X X X
75 : 13 X X X X X X X
76 : 14 X X X X X X X
77 : 15 X X X X X X X X X
78 : 16 X X X X X X X X
79 : 17 X X X X X E
80 : 18 X X X X X X X X X
81 : 19 X E E E E E X X X X X X X X X X X X E X E E
82 : 20 X X X X X E
83 :
84 : * Same Char
85 : # Other Char
86 :
87 : X Cannot Break
88 :
89 : The classes mean:
90 : 1: Open parenthesis
91 : 2: Close parenthesis
92 : 3: Prohibit a line break before
93 : 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")
94 : 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)
95 : 6: Full stop
96 : 7: Non-breakable between same characters
97 : 8: Prefix (e.g., "$", "NO.")
98 : 9: Postfix (e.g., "%")
99 : 10: Ideographic space
100 : 11: Hiragana
101 : 12: Japanese characters (except class 11)
102 : 13: Subscript
103 : 14: Ruby
104 : 15: Numeric
105 : 16: Alphabet
106 : 17: Space for Western language
107 : 18: Western characters (except class 17)
108 : 19: Split line note (Warichu) begin quote
109 : 20: Split line note (Warichu) end quote
110 :
111 : 2. Simplified by removing the classes which we do not care about
112 :
113 : However, since we do not care about class 13(Subscript), 14(Ruby),
114 : 16 (Alphabet), 19(split line note begin quote), and 20(split line note end
115 : quote) we can simplify this pair table into the following
116 :
117 : Class of
118 : Leading Class of Trailing Char Class
119 : Char
120 :
121 : 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18
122 :
123 : 1 X X X X X X X X X X X X X X X
124 : 2 X X X X X
125 : 3 X X X X X
126 : 4 X X X X X
127 : 5 X X X X X
128 : 6 X X X X X
129 : 7 X X X X X X
130 : 8 X X X X X X
131 : 9 X X X X X
132 : 10 X X X X X
133 : 11 X X X X X
134 : 12 X X X X X
135 : 15 X X X X X X X X
136 : 17 X X X X X
137 : 18 X X X X X X X
138 :
139 : 3. Simplified by merging classes
140 :
141 : After the 2nd simplification, the pair table has some duplication
142 : a. classes 2, 3, 4, 5, 6 are the same - we can merge them
143 : b. classes 10, 11, 12, 17 are the same - we can merge them
144 :
145 :
146 : Class of
147 : Leading Class of Trailing Char Class
148 : Char
149 :
150 : 1 [a] 7 8 9 [b]15 18
151 :
152 : 1 X X X X X X X X
153 : [a] X
154 : 7 X X
155 : 8 X X
156 : 9 X
157 : [b] X
158 : 15 X X X X
159 : 18 X X X
160 :
161 :
162 : 4. We add COMPLEX characters and make them breakable with all other classes
163 : except after class 1 and before class [a]
164 :
165 : Class of
166 : Leading Class of Trailing Char Class
167 : Char
168 :
169 : 1 [a] 7 8 9 [b]15 18 COMPLEX
170 :
171 : 1 X X X X X X X X X
172 : [a] X
173 : 7 X X
174 : 8 X X
175 : 9 X
176 : [b] X
177 : 15 X X X X
178 : 18 X X X
179 : COMPLEX X T
180 :
181 : T : need special handling
182 :
183 :
184 : 5. However, we need two special classes for some punctuation/parentheses
185 : whose breaking rules are like those of the character class (18), see bug 389056.
186 : We also need punctuation-like characters that behave the same as class 18
187 : but are not letters of any language (e.g., '_').
188 : [c]. Based on open parenthesis class (1), but it is not breakable after
189 : character class (18) or numeric class (15).
190 : [d]. Based on close parenthesis (or punctuation) class (2), but it is not
191 : breakable before character class (18) or numeric class (15).
192 :
193 : Class of
194 : Leading Class of Trailing Char Class
195 : Char
196 :
197 : 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d]
198 :
199 : 1 X X X X X X X X X X X
200 : [a] X X X
201 : 7 X X
202 : 8 X X
203 : 9 X
204 : [b] X X
205 : 15 X X X X X X
206 : 18 X X X X X
207 : COMPLEX X T
208 : [c] X X X X X X X X X X X
209 : [d] X X X X
210 :
211 :
212 : 6. And Unicode has "NON-BREAK" characters. Lines should not be broken around
213 : them. JIS X 4051 has no such class; therefore, we create [e].
214 :
215 : Class of
216 : Leading Class of Trailing Char Class
217 : Char
218 :
219 : 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e]
220 :
221 : 1 X X X X X X X X X X X X
222 : [a] X X X
223 : 7 X X X
224 : 8 X X X
225 : 9 X X
226 : [b] X X X
227 : 15 X X X X X X X
228 : 18 X X X X X X
229 : COMPLEX X T X
230 : [c] X X X X X X X X X X X X
231 : [d] X X X X X
232 : [e] X X X X X X X X X X X X
233 :
234 :
235 : 7. Now we use one bit to encode whether it is breakable, and use 2 bytes
236 : for one row; the bit table then looks like:
237 :
238 : 18 <- 1
239 :
240 : 1 0000 1111 1111 1111 = 0x0FFF
241 : [a] 0000 1100 0000 0010 = 0x0C02
242 : 7 0000 1000 0000 0110 = 0x0806
243 : 8 0000 1000 0100 0010 = 0x0842
244 : 9 0000 1000 0000 0010 = 0x0802
245 : [b] 0000 1100 0000 0010 = 0x0C02
246 : 15 0000 1110 1101 0010 = 0x0ED2
247 : 18 0000 1110 1100 0010 = 0x0EC2
248 : COMPLEX 0000 1001 0000 0010 = 0x0902
249 : [c] 0000 1111 1111 1111 = 0x0FFF
250 : [d] 0000 1100 1100 0010 = 0x0CC2
251 : [e] 0000 1111 1111 1111 = 0x0FFF
252 : */
253 :
254 : #define MAX_CLASSES 12
255 :
256 : static const PRUint16 gPair[MAX_CLASSES] = {
257 : 0x0FFF,
258 : 0x0C02,
259 : 0x0806,
260 : 0x0842,
261 : 0x0802,
262 : 0x0C02,
263 : 0x0ED2,
264 : 0x0EC2,
265 : 0x0902,
266 : 0x0FFF,
267 : 0x0CC2,
268 : 0x0FFF
269 : };
270 :
271 :
272 : /*
273 :
274 : 8. And if the character is not far enough from the word start, the word end
275 : and another break point, we should not break in non-CJK languages.
276 : I.e., don't break around 15, 18, [c] and [d], but don't change
277 : that if they are related to [b].
278 :
279 : Class of
280 : Leading Class of Trailing Char Class
281 : Char
282 :
283 : 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e]
284 :
285 : 1 X X X X X X X X X X X X
286 : [a] X X X X X X
287 : 7 X X X X X X X
288 : 8 X X X X X X
289 : 9 X X X X X X
290 : [b] X X X
291 : 15 X X X X X X X X X X X
292 : 18 X X X X X X X X X X X
293 : COMPLEX X X X T X X X
294 : [c] X X X X X X X X X X X X
295 : [d] X X X X X X X X X X X
296 : [e] X X X X X X X X X X X X
297 :
298 : 18 <- 1
299 :
300 : 1 0000 1111 1111 1111 = 0x0FFF
301 : [a] 0000 1110 1100 0010 = 0x0EC2
302 : 7 0000 1110 1100 0110 = 0x0EC6
303 : 8 0000 1110 1100 0010 = 0x0EC2
304 : 9 0000 1110 1100 0010 = 0x0EC2
305 : [b] 0000 1100 0000 0010 = 0x0C02
306 : 15 0000 1111 1101 1111 = 0x0FDF
307 : 18 0000 1111 1101 1111 = 0x0FDF
308 : COMPLEX 0000 1111 1100 0010 = 0x0FC2
309 : [c] 0000 1111 1111 1111 = 0x0FFF
310 : [d] 0000 1111 1101 1111 = 0x0FDF
311 : [e] 0000 1111 1111 1111 = 0x0FFF
312 : */
313 :
314 : static const PRUint16 gPairConservative[MAX_CLASSES] = {
315 : 0x0FFF,
316 : 0x0EC2,
317 : 0x0EC6,
318 : 0x0EC2,
319 : 0x0EC2,
320 : 0x0C02,
321 : 0x0FDF,
322 : 0x0FDF,
323 : 0x0FC2,
324 : 0x0FFF,
325 : 0x0FDF,
326 : 0x0FFF
327 : };
328 :
329 :
330 : /*
331 :
332 : 9. Now we map the class to number
333 :
334 : 0: 1
335 : 1: [a]- 2, 3, 4, 5, 6
336 : 2: 7
337 : 3: 8
338 : 4: 9
339 : 5: [b]- 10, 11, 12, 17
340 : 6: 15
341 : 7: 18
342 : 8: COMPLEX
343 : 9: [c]
344 : A: [d]
345 : B: [e]
346 :
347 : and they mean:
348 : 0: Open parenthesis
349 : 1: Punctuation that prohibits break before
350 : 2: Non-breakable between same classes
351 : 3: Prefix
352 : 4: Postfix
353 : 5: Breakable character (Spaces and Most Japanese characters)
354 : 6: Numeric
355 : 7: Characters
356 : 8: Need special handling characters (E.g., Thai)
357 : 9: Open parentheses like Character (See bug 389056)
358 : A: Close parenthese (or punctuations) like Character (See bug 389056)
359 : B: Non breakable (See bug 390920)
360 :
361 : */
362 :
// Sentinel meaning "no class yet"; it is larger than every real class value.
#define CLASS_NONE                             PR_INT8_MAX

// Line-break classes. The value of each class is its row index in gPair and
// gPairConservative, and also its bit position within a row.
#define CLASS_OPEN                             0x00
#define CLASS_CLOSE                            0x01
#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02
#define CLASS_PREFIX                           0x03
#define CLASS_POSTFIX                          0x04
// Historical misspelling, kept so that existing references still compile.
#define CLASS_POSTFFIX                         CLASS_POSTFIX
#define CLASS_BREAKABLE                        0x05
#define CLASS_NUMERIC                          0x06
#define CLASS_CHARACTER                        0x07
#define CLASS_COMPLEX                          0x08
#define CLASS_OPEN_LIKE_CHARACTER              0x09
#define CLASS_CLOSE_LIKE_CHARACTER             0x0A
#define CLASS_NON_BREAKABLE                    0x0B
377 :
// Named code units used by the contextual rules below.
#define U_NULL              PRUnichar(0x0000)
#define U_SLASH             PRUnichar('/')
#define U_SPACE             PRUnichar(' ')
#define U_HYPHEN            PRUnichar('-')
#define U_EQUAL             PRUnichar('=')
#define U_PERCENT           PRUnichar('%')
#define U_AMPERSAND         PRUnichar('&')
#define U_SEMICOLON         PRUnichar(';')
#define U_BACKSLASH         PRUnichar('\\')
#define U_OPEN_SINGLE_QUOTE PRUnichar(0x2018)
#define U_OPEN_DOUBLE_QUOTE PRUnichar(0x201C)
#define U_OPEN_GUILLEMET    PRUnichar(0x00AB)

// True for the characters whose class is decided by ContextualAnalysis()
// instead of a plain GetClass() lookup. The argument is evaluated more than
// once, so it must not have side effects.
#define NEED_CONTEXTUAL_ANALYSIS(c)   \
  (IS_HYPHEN(c)                    || \
   (c) == U_SLASH                  || \
   (c) == U_PERCENT                || \
   (c) == U_AMPERSAND              || \
   (c) == U_SEMICOLON              || \
   (c) == U_BACKSLASH              || \
   (c) == U_OPEN_SINGLE_QUOTE      || \
   (c) == U_OPEN_DOUBLE_QUOTE      || \
   (c) == U_OPEN_GUILLEMET)

// True for U+0030..U+0039. The argument is evaluated twice.
#define IS_ASCII_DIGIT(u) ((u) >= 0x0030 && (u) <= 0x0039)
402 :
403 : static inline int
404 0 : GETCLASSFROMTABLE(const PRUint32* t, PRUint16 l)
405 : {
406 0 : return ((((t)[(l>>3)]) >> ((l & 0x0007)<<2)) & 0x000f);
407 : }
408 :
409 : static inline int
410 0 : IS_HALFWIDTH_IN_JISx4051_CLASS3(PRUnichar u)
411 : {
412 0 : return ((0xff66 <= (u)) && ((u) <= 0xff70));
413 : }
414 :
415 : static inline int
416 850 : IS_CJK_CHAR(PRUnichar u)
417 : {
418 : return ((0x1100 <= (u) && (u) <= 0x11ff) ||
419 : (0x2e80 <= (u) && (u) <= 0xd7ff) ||
420 : (0xf900 <= (u) && (u) <= 0xfaff) ||
421 850 : (0xff00 <= (u) && (u) <= 0xffef) );
422 : }
423 :
424 : static inline bool
425 40 : IS_NONBREAKABLE_SPACE(PRUnichar u)
426 : {
427 40 : return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE
428 : }
429 :
430 : static inline bool
431 40 : IS_HYPHEN(PRUnichar u)
432 : {
433 : return (u == U_HYPHEN ||
434 : u == 0x058A || // ARMENIAN HYPHEN
435 : u == 0x2010 || // HYPHEN
436 : u == 0x2012 || // FIGURE DASH
437 40 : u == 0x2013); // EN DASH
438 : }
439 :
440 : static PRInt8
441 40 : GetClass(PRUnichar u)
442 : {
443 40 : PRUint16 h = u & 0xFF00;
444 40 : PRUint16 l = u & 0x00ff;
445 : PRInt8 c;
446 :
447 : // Handle 3 range table first
448 40 : if (0x0000 == h) {
449 0 : c = GETCLASSFROMTABLE(gLBClass00, l);
450 40 : } else if (NS_NeedsPlatformNativeHandling(u)) {
451 0 : c = CLASS_COMPLEX;
452 40 : } else if (0x0E00 == h) {
453 0 : c = GETCLASSFROMTABLE(gLBClass0E, l);
454 40 : } else if (0x2000 == h) {
455 0 : c = GETCLASSFROMTABLE(gLBClass20, l);
456 40 : } else if (0x2100 == h) {
457 0 : c = GETCLASSFROMTABLE(gLBClass21, l);
458 40 : } else if (0x3000 == h) {
459 0 : c = GETCLASSFROMTABLE(gLBClass30, l);
460 40 : } else if (((0x3200 <= u) && (u <= 0xA4CF)) || // CJK and Yi
461 : ((0xAC00 <= h) && (h <= 0xD7FF)) || // Hangul
462 : ((0xf900 <= h) && (h <= 0xfaff))) {
463 40 : c = CLASS_BREAKABLE; // CJK character, Han, and Han Compatibility
464 0 : } else if (0xff00 == h) {
465 0 : if (l < 0x0060) { // Fullwidth ASCII variant
466 0 : c = GETCLASSFROMTABLE(gLBClass00, (l+0x20));
467 0 : } else if (l < 0x00a0) {
468 0 : switch (l) {
469 0 : case 0x61: c = GetClass(0x3002); break;
470 0 : case 0x62: c = GetClass(0x300c); break;
471 0 : case 0x63: c = GetClass(0x300d); break;
472 0 : case 0x64: c = GetClass(0x3001); break;
473 0 : case 0x65: c = GetClass(0x30fb); break;
474 0 : case 0x9e: c = GetClass(0x309b); break;
475 0 : case 0x9f: c = GetClass(0x309c); break;
476 : default:
477 0 : if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u))
478 0 : c = CLASS_CLOSE; // jis x4051 class 3
479 : else
480 0 : c = CLASS_BREAKABLE; // jis x4051 class 11
481 0 : break;
482 : }
483 : // Halfwidth Katakana variants
484 0 : } else if (l < 0x00e0) {
485 0 : c = CLASS_CHARACTER; // Halfwidth Hangul variants
486 0 : } else if (l < 0x00f0) {
487 : static PRUnichar NarrowFFEx[16] = {
488 : 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
489 : 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000
490 : };
491 0 : c = GetClass(NarrowFFEx[l - 0x00e0]);
492 : } else {
493 0 : c = CLASS_CHARACTER;
494 : }
495 0 : } else if (0x3100 == h) {
496 0 : if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun
497 : // XXX: This is per UAX #14, but UAX #14 may change
498 : // the line breaking rules about Kanbun and Bopomofo.
499 0 : c = CLASS_BREAKABLE;
500 0 : } else if (l >= 0xf0) { // Katakana small letters for Ainu
501 0 : c = CLASS_CLOSE;
502 : } else { // unassigned
503 0 : c = CLASS_CHARACTER;
504 : }
505 0 : } else if (0x0300 == h) {
506 0 : if (0x4F == l || (0x5C <= l && l <= 0x62))
507 0 : c = CLASS_NON_BREAKABLE;
508 : else
509 0 : c = CLASS_CHARACTER;
510 0 : } else if (0x0500 == h) {
511 : // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)
512 0 : if (l == 0x8A)
513 0 : c = GETCLASSFROMTABLE(gLBClass00, PRUint16(U_HYPHEN));
514 : else
515 0 : c = CLASS_CHARACTER;
516 0 : } else if (0x0F00 == h) {
517 0 : if (0x08 == l || 0x0C == l || 0x12 == l)
518 0 : c = CLASS_NON_BREAKABLE;
519 : else
520 0 : c = CLASS_CHARACTER;
521 0 : } else if (0x1800 == h) {
522 0 : if (0x0E == l)
523 0 : c = CLASS_NON_BREAKABLE;
524 : else
525 0 : c = CLASS_CHARACTER;
526 : } else {
527 0 : c = CLASS_CHARACTER; // others
528 : }
529 40 : return c;
530 : }
531 :
532 : static bool
533 39 : GetPair(PRInt8 c1, PRInt8 c2)
534 : {
535 39 : NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
536 39 : NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
537 :
538 39 : return (0 == ((gPair[c1] >> c2) & 0x0001));
539 : }
540 :
541 : static bool
542 0 : GetPairConservative(PRInt8 c1, PRInt8 c2)
543 : {
544 0 : NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
545 0 : NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
546 :
547 0 : return (0 == ((gPairConservative[c1] >> c2) & 0x0001));
548 : }
549 :
550 1404 : nsJISx4051LineBreaker::nsJISx4051LineBreaker()
551 : {
552 1404 : }
553 :
554 2806 : nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
555 : {
556 5612 : }
557 :
558 12645 : NS_IMPL_ISUPPORTS1(nsJISx4051LineBreaker, nsILineBreaker)
559 :
560 : class ContextState {
561 : public:
562 1 : ContextState(const PRUnichar* aText, PRUint32 aLength) {
563 1 : mUniText = aText;
564 1 : mText = nsnull;
565 1 : mLength = aLength;
566 1 : Init();
567 1 : }
568 :
569 0 : ContextState(const PRUint8* aText, PRUint32 aLength) {
570 0 : mUniText = nsnull;
571 0 : mText = aText;
572 0 : mLength = aLength;
573 0 : Init();
574 0 : }
575 :
576 0 : PRUint32 Length() { return mLength; }
577 0 : PRUint32 Index() { return mIndex; }
578 :
579 40 : PRUnichar GetCharAt(PRUint32 aIndex) {
580 40 : NS_ASSERTION(0 <= aIndex && aIndex < mLength, "Out of range!");
581 40 : return mUniText ? mUniText[aIndex] : PRUnichar(mText[aIndex]);
582 : }
583 :
584 40 : void AdvanceIndex() {
585 40 : ++mIndex;
586 40 : }
587 :
588 39 : void NotifyBreakBefore() { mLastBreakIndex = mIndex; }
589 :
590 : // A word of western language should not be broken. But even if the word has
591 : // only ASCII characters, non-natural context words should be broken, e.g.,
592 : // URL and file path. For protecting the natural words, we should use
593 : // conservative breaking rules at following conditions:
594 : // 1. at near the start of word
595 : // 2. at near the end of word
596 : // 3. at near the latest broken point
597 : // CONSERVATIVE_BREAK_RANGE define the 'near' in characters.
598 : #define CONSERVATIVE_BREAK_RANGE 6
599 :
600 39 : bool UseConservativeBreaking(PRUint32 aOffset = 0) {
601 39 : if (mHasCJKChar)
602 39 : return false;
603 0 : PRUint32 index = mIndex + aOffset;
604 : bool result = (index < CONSERVATIVE_BREAK_RANGE ||
605 : mLength - index < CONSERVATIVE_BREAK_RANGE ||
606 0 : index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE);
607 0 : if (result || !mHasNonbreakableSpace)
608 0 : return result;
609 :
610 : // This text has no-breakable space, we need to check whether the index
611 : // is near it.
612 :
613 : // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here.
614 0 : for (PRUint32 i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) {
615 0 : if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1)))
616 0 : return true;
617 : }
618 : // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE.
619 0 : for (PRUint32 i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) {
620 0 : if (IS_NONBREAKABLE_SPACE(GetCharAt(i)))
621 0 : return true;
622 : }
623 0 : return false;
624 : }
625 :
626 0 : bool HasPreviousEqualsSign() const {
627 0 : return mHasPreviousEqualsSign;
628 : }
629 0 : void NotifySeenEqualsSign() {
630 0 : mHasPreviousEqualsSign = true;
631 0 : }
632 :
633 0 : bool HasPreviousSlash() const {
634 0 : return mHasPreviousSlash;
635 : }
636 0 : void NotifySeenSlash() {
637 0 : mHasPreviousSlash = true;
638 0 : }
639 :
640 0 : bool HasPreviousBackslash() const {
641 0 : return mHasPreviousBackslash;
642 : }
643 0 : void NotifySeenBackslash() {
644 0 : mHasPreviousBackslash = true;
645 0 : }
646 :
647 0 : PRUnichar GetPreviousNonHyphenCharacter() const {
648 0 : return mPreviousNonHyphenCharacter;
649 : }
650 40 : void NotifyNonHyphenCharacter(PRUnichar ch) {
651 40 : mPreviousNonHyphenCharacter = ch;
652 40 : }
653 :
654 : private:
655 1 : void Init() {
656 1 : mIndex = 0;
657 1 : mLastBreakIndex = 0;
658 1 : mPreviousNonHyphenCharacter = U_NULL;
659 1 : mHasCJKChar = 0;
660 1 : mHasNonbreakableSpace = 0;
661 1 : mHasPreviousEqualsSign = false;
662 1 : mHasPreviousSlash = false;
663 1 : mHasPreviousBackslash = false;
664 :
665 41 : for (PRUint32 i = 0; i < mLength; ++i) {
666 40 : PRUnichar u = GetCharAt(i);
667 40 : if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u))
668 0 : mHasNonbreakableSpace = 1;
669 40 : else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u))
670 1 : mHasCJKChar = 1;
671 : }
672 1 : }
673 :
674 : const PRUnichar* mUniText;
675 : const PRUint8* mText;
676 :
677 : PRUint32 mIndex;
678 : PRUint32 mLength; // length of text
679 : PRUint32 mLastBreakIndex;
680 : PRUnichar mPreviousNonHyphenCharacter; // The last character we have seen
681 : // which is not U_HYPHEN
682 : bool mHasCJKChar; // if the text has CJK character, this is true.
683 : bool mHasNonbreakableSpace; // if the text has no-breakable space,
684 : // this is true.
685 : bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL
686 : bool mHasPreviousSlash; // True if we have seen a U_SLASH
687 : bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH
688 : };
689 :
690 : static PRInt8
691 0 : ContextualAnalysis(PRUnichar prev, PRUnichar cur, PRUnichar next,
692 : ContextState &aState)
693 : {
694 : // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE.
695 :
696 0 : if (IS_HYPHEN(cur)) {
697 : // If next character is hyphen, we don't need to break between them.
698 0 : if (IS_HYPHEN(next))
699 0 : return CLASS_CHARACTER;
700 : // If prev and next characters are numeric, it may be in Math context.
701 : // So, we should not break here.
702 0 : bool prevIsNum = IS_ASCII_DIGIT(prev);
703 0 : bool nextIsNum = IS_ASCII_DIGIT(next);
704 0 : if (prevIsNum && nextIsNum)
705 0 : return CLASS_NUMERIC;
706 : // If one side is numeric and the other is a character, or if both sides are
707 : // characters, the hyphen should be breakable.
708 0 : if (!aState.UseConservativeBreaking(1)) {
709 0 : PRUnichar prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
710 0 : if (prevOfHyphen && next) {
711 0 : bool prevIsChar = !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen) &&
712 0 : GetClass(prevOfHyphen) == CLASS_CHARACTER;
713 0 : bool nextIsChar = !NEED_CONTEXTUAL_ANALYSIS(next) &&
714 0 : GetClass(next) == CLASS_CHARACTER;
715 0 : if ((prevIsNum || prevIsChar) && (nextIsNum || nextIsChar))
716 0 : return CLASS_CLOSE;
717 : }
718 : }
719 : } else {
720 0 : aState.NotifyNonHyphenCharacter(cur);
721 0 : if (cur == U_SLASH || cur == U_BACKSLASH) {
722 : // If this is immediately after same char, we should not break here.
723 0 : if (prev == cur)
724 0 : return CLASS_CHARACTER;
725 : // If this text has two or more (BACK)SLASHs, this may be file path or URL.
726 : // Make sure to compute shouldReturn before we notify on this slash.
727 0 : bool shouldReturn = !aState.UseConservativeBreaking() &&
728 : (cur == U_SLASH ?
729 0 : aState.HasPreviousSlash() : aState.HasPreviousBackslash());
730 :
731 0 : if (cur == U_SLASH) {
732 0 : aState.NotifySeenSlash();
733 : } else {
734 0 : aState.NotifySeenBackslash();
735 : }
736 :
737 0 : if (shouldReturn)
738 0 : return CLASS_OPEN;
739 0 : } else if (cur == U_PERCENT) {
740 : // If this is a part of the param of URL, we should break before.
741 0 : if (!aState.UseConservativeBreaking()) {
742 0 : if (aState.Index() >= 3 &&
743 0 : aState.GetCharAt(aState.Index() - 3) == U_PERCENT)
744 0 : return CLASS_OPEN;
745 0 : if (aState.Index() + 3 < aState.Length() &&
746 0 : aState.GetCharAt(aState.Index() + 3) == U_PERCENT)
747 0 : return CLASS_OPEN;
748 : }
749 0 : } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
750 : // If this may be a separator of params of URL, we should break after.
751 0 : if (!aState.UseConservativeBreaking(1) &&
752 0 : aState.HasPreviousEqualsSign())
753 0 : return CLASS_CLOSE;
754 0 : } else if (cur == U_OPEN_SINGLE_QUOTE ||
755 : cur == U_OPEN_DOUBLE_QUOTE ||
756 : cur == U_OPEN_GUILLEMET) {
757 : // for CJK usage, we treat these as openers to allow a break before them,
758 : // but otherwise treat them as normal characters because quote mark usage
759 : // in various Western languages varies too much; see bug #450088 discussion.
760 0 : if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
761 0 : return CLASS_OPEN;
762 : } else {
763 0 : NS_ERROR("Forgot to handle the current character!");
764 : }
765 : }
766 0 : return GetClass(cur);
767 : }
768 :
769 :
770 : PRInt32
771 22 : nsJISx4051LineBreaker::WordMove(const PRUnichar* aText, PRUint32 aLen,
772 : PRUint32 aPos, PRInt8 aDirection)
773 : {
774 22 : bool textNeedsJISx4051 = false;
775 : PRInt32 begin, end;
776 :
777 848 : for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
778 826 : if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) {
779 37 : textNeedsJISx4051 = true;
780 : }
781 : }
782 45 : for (end = aPos + 1; end < PRInt32(aLen) && !NS_IsSpace(aText[end]); ++end) {
783 23 : if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
784 2 : textNeedsJISx4051 = true;
785 : }
786 : }
787 :
788 : PRInt32 ret;
789 44 : nsAutoTArray<PRUint8, 2000> breakState;
790 22 : if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {
791 : // No complex text character, do not try to do complex line break.
792 : // (This is required for serializers. See Bug #344816.)
793 : // Also fall back to this when out of memory.
794 21 : if (aDirection < 0) {
795 11 : ret = (begin == PRInt32(aPos)) ? begin - 1 : begin;
796 : } else {
797 10 : ret = end;
798 : }
799 : } else {
800 1 : GetJISx4051Breaks(aText + begin, end - begin, breakState.Elements());
801 :
802 1 : ret = aPos;
803 2 : do {
804 1 : ret += aDirection;
805 1 : } while (begin < ret && ret < end && !breakState[ret - begin]);
806 : }
807 :
808 22 : return ret;
809 : }
810 :
811 : PRInt32
812 10 : nsJISx4051LineBreaker::Next(const PRUnichar* aText, PRUint32 aLen,
813 : PRUint32 aPos)
814 : {
815 10 : NS_ASSERTION(aText, "aText shouldn't be null");
816 10 : NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next");
817 :
818 10 : PRInt32 nextPos = WordMove(aText, aLen, aPos, 1);
819 10 : return nextPos < PRInt32(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT;
820 : }
821 :
822 : PRInt32
823 12 : nsJISx4051LineBreaker::Prev(const PRUnichar* aText, PRUint32 aLen,
824 : PRUint32 aPos)
825 : {
826 12 : NS_ASSERTION(aText, "aText shouldn't be null");
827 12 : NS_ASSERTION(aLen >= aPos && aPos > 0,
828 : "Bad position passed to nsJISx4051LineBreaker::Prev");
829 :
830 12 : PRInt32 prevPos = WordMove(aText, aLen, aPos, -1);
831 12 : return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT;
832 : }
833 :
834 : void
835 1 : nsJISx4051LineBreaker::GetJISx4051Breaks(const PRUnichar* aChars, PRUint32 aLength,
836 : PRUint8* aBreakBefore)
837 : {
838 : PRUint32 cur;
839 1 : PRInt8 lastClass = CLASS_NONE;
840 1 : ContextState state(aChars, aLength);
841 :
842 41 : for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
843 40 : PRUnichar ch = aChars[cur];
844 : PRInt8 cl;
845 :
846 40 : if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
847 0 : cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
848 : ch,
849 0 : cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
850 0 : state);
851 : } else {
852 40 : if (ch == U_EQUAL)
853 0 : state.NotifySeenEqualsSign();
854 40 : state.NotifyNonHyphenCharacter(ch);
855 40 : cl = GetClass(ch);
856 : }
857 :
858 : bool allowBreak;
859 40 : if (cur > 0) {
860 39 : NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
861 : "Loop should have prevented adjacent complex chars here");
862 39 : if (state.UseConservativeBreaking())
863 0 : allowBreak = GetPairConservative(lastClass, cl);
864 : else
865 39 : allowBreak = GetPair(lastClass, cl);
866 : } else {
867 1 : allowBreak = false;
868 : }
869 40 : aBreakBefore[cur] = allowBreak;
870 40 : if (allowBreak)
871 39 : state.NotifyBreakBefore();
872 40 : lastClass = cl;
873 40 : if (CLASS_COMPLEX == cl) {
874 0 : PRUint32 end = cur + 1;
875 :
876 0 : while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) {
877 0 : ++end;
878 : }
879 :
880 0 : NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);
881 :
882 : // restore breakability at chunk begin, which was always set to false
883 : // by the complex line breaker
884 0 : aBreakBefore[cur] = allowBreak;
885 :
886 0 : cur = end - 1;
887 : }
888 : }
889 1 : }
890 :
891 : void
892 0 : nsJISx4051LineBreaker::GetJISx4051Breaks(const PRUint8* aChars, PRUint32 aLength,
893 : PRUint8* aBreakBefore)
894 : {
895 : PRUint32 cur;
896 0 : PRInt8 lastClass = CLASS_NONE;
897 0 : ContextState state(aChars, aLength);
898 :
899 0 : for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
900 0 : PRUnichar ch = aChars[cur];
901 : PRInt8 cl;
902 :
903 0 : if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
904 0 : cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
905 : ch,
906 0 : cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
907 0 : state);
908 : } else {
909 0 : if (ch == U_EQUAL)
910 0 : state.NotifySeenEqualsSign();
911 0 : state.NotifyNonHyphenCharacter(ch);
912 0 : cl = GetClass(ch);
913 : }
914 :
915 : bool allowBreak;
916 0 : if (cur > 0) {
917 0 : if (state.UseConservativeBreaking())
918 0 : allowBreak = GetPairConservative(lastClass, cl);
919 : else
920 0 : allowBreak = GetPair(lastClass, cl);
921 : } else {
922 0 : allowBreak = false;
923 : }
924 0 : aBreakBefore[cur] = allowBreak;
925 0 : if (allowBreak)
926 0 : state.NotifyBreakBefore();
927 0 : lastClass = cl;
928 : }
929 0 : }
|