LCOV - code coverage report
Current view: directory - intl/lwbrk/src - nsJISx4501LineBreaker.cpp (source / functions) Found Hit Coverage
Test: app.info Lines: 284 110 38.7 %
Date: 2012-06-02 Functions: 37 22 59.5 %

       1                 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2                 : /* ***** BEGIN LICENSE BLOCK *****
       3                 :  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
       4                 :  *
       5                 :  * The contents of this file are subject to the Mozilla Public License Version
       6                 :  * 1.1 (the "License"); you may not use this file except in compliance with
       7                 :  * the License. You may obtain a copy of the License at
       8                 :  * http://www.mozilla.org/MPL/
       9                 :  *
      10                 :  * Software distributed under the License is distributed on an "AS IS" basis,
      11                 :  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
      12                 :  * for the specific language governing rights and limitations under the
      13                 :  * License.
      14                 :  *
      15                 :  * The Original Code is mozilla.org code.
      16                 :  *
      17                 :  * The Initial Developer of the Original Code is
      18                 :  * Netscape Communications Corporation.
      19                 :  * Portions created by the Initial Developer are Copyright (C) 1998
      20                 :  * the Initial Developer. All Rights Reserved.
      21                 :  *
      22                 :  * Contributor(s):
      23                 :  *
      24                 :  * Alternatively, the contents of this file may be used under the terms of
      25                 :  * either of the GNU General Public License Version 2 or later (the "GPL"),
      26                 :  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
      27                 :  * in which case the provisions of the GPL or the LGPL are applicable instead
      28                 :  * of those above. If you wish to allow use of your version of this file only
      29                 :  * under the terms of either the GPL or the LGPL, and not to allow others to
      30                 :  * use your version of this file under the terms of the MPL, indicate your
      31                 :  * decision by deleting the provisions above and replace them with the notice
      32                 :  * and other provisions required by the GPL or the LGPL. If you do not delete
      33                 :  * the provisions above, a recipient may use your version of this file under
      34                 :  * the terms of any one of the MPL, the GPL or the LGPL.
      35                 :  *
      36                 :  * ***** END LICENSE BLOCK ***** */
      37                 : 
      38                 : 
      39                 : 
      40                 : #include "nsJISx4501LineBreaker.h"
      41                 : 
      42                 : #include "pratom.h"
      43                 : #include "nsLWBRKDll.h"
      44                 : #include "jisx4501class.h"
      45                 : #include "nsComplexBreaker.h"
      46                 : #include "nsTArray.h"
      47                 : #include "nsUnicharUtils.h"
      48                 : 
      49                 : /* 
      50                 : 
      51                 :    Simplification of Pair Table in JIS X 4051
      52                 : 
      53                 :    1. The Original Table - in 4.1.3
      54                 : 
      55                 :    In JIS X 4051, the pair table is defined as below
      56                 : 
      57                 :    Class of
      58                 :    Leading    Class of Trailing Char Class
      59                 :    Char        
      60                 : 
      61                 :               1  2  3  4  5  6  7  8  9 10 11 12 13 13 14 14 15 16 17 18 19 20
      62                 :                                                  *  #  *  #
      63                 :         1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  E
      64                 :         2        X  X  X  X  X                                               X
      65                 :         3        X  X  X  X  X                                               X
      66                 :         4        X  X  X  X  X                                               X
      67                 :         5        X  X  X  X  X                                               X
      68                 :         6        X  X  X  X  X                                               X
      69                 :         7        X  X  X  X  X  X                                            X
      70                 :         8        X  X  X  X  X                                X              E
      71                 :         9        X  X  X  X  X                                               X
      72                 :        10        X  X  X  X  X                                               X
      73                 :        11        X  X  X  X  X                                               X
      74                 :        12        X  X  X  X  X                                               X
      75                 :        13        X  X  X  X  X                    X                          X
      76                 :        14        X  X  X  X  X                          X                    X
      77                 :        15        X  X  X  X  X        X                       X        X     X
      78                 :        16        X  X  X  X  X                                   X     X     X
      79                 :        17        X  X  X  X  X                                               E
      80                 :        18        X  X  X  X  X                                X  X     X     X
      81                 :        19     X  E  E  E  E  E  X  X  X  X  X  X  X  X  X  X  X  X  E  X  E  E
      82                 :        20        X  X  X  X  X                                               E
      83                 : 
      84                 :    * Same Char
      85                 :    # Other Char
      86                 : 
      87                 :    X Cannot Break
      88                 : 
      89                 :    The classes mean:
      90                 :       1: Open parenthesis
      91                 :       2: Close parenthesis
      92                 :       3: Prohibit a line break before
      93                 :       4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")
      94                 :       5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)
      95                 :       6: Full stop
      96                 :       7: Non-breakable between same characters
      97                 :       8: Prefix (e.g., "$", "NO.")
      98                 :       9: Postfix (e.g., "%")
      99                 :      10: Ideographic space
     100                 :      11: Hiragana
     101                 :      12: Japanese characters (except class 11)
     102                 :      13: Subscript
     103                 :      14: Ruby
     104                 :      15: Numeric
     105                 :      16: Alphabet
     106                 :      17: Space for Western language
     107                 :      18: Western characters (except class 17)
     108                 :      19: Split line note (Warichu) begin quote
     109                 :      20: Split line note (Warichu) end quote
     110                 : 
     111                 :    2. Simplified by removing the classes which we do not care about
     112                 : 
     113                 :    However, since we do not care about class 13 (Subscript), 14 (Ruby),
     114                 :    16 (Alphabet), 19 (split line note begin quote), and 20 (split line note end
     115                 :    quote), we can simplify this pair table into the following
     116                 : 
     117                 :    Class of
     118                 :    Leading    Class of Trailing Char Class
     119                 :    Char
     120                 : 
     121                 :               1  2  3  4  5  6  7  8  9 10 11 12 15 17 18
     122                 : 
     123                 :         1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X
     124                 :         2        X  X  X  X  X                           
     125                 :         3        X  X  X  X  X                           
     126                 :         4        X  X  X  X  X                           
     127                 :         5        X  X  X  X  X                           
     128                 :         6        X  X  X  X  X                           
     129                 :         7        X  X  X  X  X  X                        
     130                 :         8        X  X  X  X  X                    X      
     131                 :         9        X  X  X  X  X                           
     132                 :        10        X  X  X  X  X                           
     133                 :        11        X  X  X  X  X                           
     134                 :        12        X  X  X  X  X                           
     135                 :        15        X  X  X  X  X        X           X     X
     136                 :        17        X  X  X  X  X                           
     137                 :        18        X  X  X  X  X                    X     X
     138                 : 
     139                 :    3. Simplified by merging classes
     140                 : 
     141                 :    After simplification 2, the pair table has some duplication:
     142                 :    a. classes 2, 3, 4, 5, 6 are the same - we can merge them
     143                 :    b. classes 10, 11, 12, 17 are the same - we can merge them
     144                 : 
     145                 : 
     146                 :    Class of
     147                 :    Leading    Class of Trailing Char Class
     148                 :    Char
     149                 : 
     150                 :               1 [a] 7  8  9 [b]15 18
     151                 : 
     152                 :         1     X  X  X  X  X  X  X  X
     153                 :       [a]        X                  
     154                 :         7        X  X               
     155                 :         8        X              X   
     156                 :         9        X                  
     157                 :       [b]        X                  
     158                 :        15        X        X     X  X
     159                 :        18        X              X  X
     160                 : 
     161                 : 
     162                 :    4. We add COMPLEX characters and make them breakable with all other classes
     163                 :       except after class 1 and before class [a]
     164                 : 
     165                 :    Class of
     166                 :    Leading    Class of Trailing Char Class
     167                 :    Char
     168                 : 
     169                 :               1 [a] 7  8  9 [b]15 18 COMPLEX
     170                 : 
     171                 :         1     X  X  X  X  X  X  X  X  X
     172                 :       [a]        X                     
     173                 :         7        X  X                  
     174                 :         8        X              X      
     175                 :         9        X                     
     176                 :       [b]        X                     
     177                 :        15        X        X     X  X   
     178                 :        18        X              X  X   
     179                 :   COMPLEX        X                    T
     180                 : 
     181                 :      T : need special handling
     182                 : 
     183                 : 
     184                 :    5. However, we need two special classes for some punctuation/parentheses
     185                 :       whose breaking rules are like those of character class (18), see bug 389056.
     186                 :       We also need them for punctuation-like characters that behave the same as
     187                 :       18 but are not letters in any language (e.g., '_').
     188                 :       [c]. Based on open parenthesis class (1), but it is not breakable after
     189                 :            character class (18) or numeric class (15).
     190                 :       [d]. Based on close parenthesis (or punctuation) class (2), but it is not
     191                 :            breakable before character class (18) or numeric class (15).
     192                 : 
     193                 :    Class of
     194                 :    Leading    Class of Trailing Char Class
     195                 :    Char
     196                 : 
     197                 :               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d]
     198                 : 
     199                 :         1     X  X  X  X  X  X  X  X  X       X    X
     200                 :       [a]        X                            X    X
     201                 :         7        X  X                               
     202                 :         8        X              X                   
     203                 :         9        X                                  
     204                 :       [b]        X                                 X
     205                 :        15        X        X     X  X          X    X
     206                 :        18        X              X  X          X    X
     207                 :   COMPLEX        X                    T             
     208                 :       [c]     X  X  X  X  X  X  X  X  X       X    X
     209                 :       [d]        X              X  X               X
     210                 : 
     211                 : 
     212                 :    6. And Unicode has "NON-BREAK" characters. The lines should not be broken
     213                 :       around them. But JIS X 4051 has no such class; therefore, we create [e].
     214                 : 
     215                 :    Class of
     216                 :    Leading    Class of Trailing Char Class
     217                 :    Char
     218                 : 
     219                 :               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d] [e]
     220                 : 
     221                 :         1     X  X  X  X  X  X  X  X  X       X    X   X
     222                 :       [a]        X                                 X   X
     223                 :         7        X  X                                  X
     224                 :         8        X              X                      X
     225                 :         9        X                                     X
     226                 :       [b]        X                                 X   X
     227                 :        15        X        X     X  X          X    X   X
     228                 :        18        X              X  X          X    X   X
     229                 :   COMPLEX        X                    T                X
     230                 :       [c]     X  X  X  X  X  X  X  X  X       X    X   X
     231                 :       [d]        X              X  X               X   X
     232                 :       [e]     X  X  X  X  X  X  X  X  X       X    X   X
     233                 : 
     234                 : 
     235                 :    7. Now we use one bit to encode whether it is breakable (1 = X, cannot
     236                 :       break), and use 2 bytes for one row; then the bit table will look like:
     237                 : 
     238                 :                  18    <-   1
     239                 : 
     240                 :        1  0000 1111 1111 1111  = 0x0FFF
     241                 :       [a] 0000 1100 0000 0010  = 0x0C02
     242                 :        7  0000 1000 0000 0110  = 0x0806
     243                 :        8  0000 1000 0100 0010  = 0x0842
     244                 :        9  0000 1000 0000 0010  = 0x0802
     245                 :       [b] 0000 1100 0000 0010  = 0x0C02
     246                 :       15  0000 1110 1101 0010  = 0x0ED2
     247                 :       18  0000 1110 1100 0010  = 0x0EC2
     248                 :  COMPLEX  0000 1001 0000 0010  = 0x0902
     249                 :       [c] 0000 1111 1111 1111  = 0x0FFF
     250                 :       [d] 0000 1100 1100 0010  = 0x0CC2
     251                 :       [e] 0000 1111 1111 1111  = 0x0FFF
     252                 : */
     253                 : 
     254                 : #define MAX_CLASSES 12
     255                 : 
     256                 : static const PRUint16 gPair[MAX_CLASSES] = { // one row per leading class; bit n set = cannot break before trailing class n
     257                 :   0x0FFF, // 0: class 1 (open parenthesis)
     258                 :   0x0C02, // 1: [a] (classes 2-6)
     259                 :   0x0806, // 2: class 7
     260                 :   0x0842, // 3: class 8 (prefix)
     261                 :   0x0802, // 4: class 9 (postfix)
     262                 :   0x0C02, // 5: [b] (classes 10, 11, 12, 17)
     263                 :   0x0ED2, // 6: class 15 (numeric)
     264                 :   0x0EC2, // 7: class 18 (characters)
     265                 :   0x0902, // 8: COMPLEX
     266                 :   0x0FFF, // 9: [c] (open-parenthesis-like character)
     267                 :   0x0CC2, // A: [d] (close-parenthesis-like character)
     268                 :   0x0FFF  // B: [e] (non-breakable)
     269                 : };
     270                 : 
     271                 : 
     272                 : /*
     273                 : 
     274                 :    8. And if the character is not far enough from word start, word end and
     275                 :       another break point, we should not break in non-CJK languages.
     276                 :       I.e., Don't break around 15, 18, [c] and [d], but don't change
     277                 :       that if they are related to [b].
     278                 : 
     279                 :    Class of
     280                 :    Leading    Class of Trailing Char Class
     281                 :    Char
     282                 : 
     283                 :               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d] [e]
     284                 : 
     285                 :         1     X  X  X  X  X  X  X  X  X       X    X   X
     286                 :       [a]        X              X  X          X    X   X
     287                 :         7        X  X           X  X          X    X   X
     288                 :         8        X              X  X          X    X   X
     289                 :         9        X              X  X          X    X   X
     290                 :       [b]        X                                 X   X
     291                 :        15     X  X  X  X  X     X  X  X       X    X   X
     292                 :        18     X  X  X  X  X     X  X  X       X    X   X
     293                 :   COMPLEX        X              X  X  T       X    X   X
     294                 :       [c]     X  X  X  X  X  X  X  X  X       X    X   X
     295                 :       [d]     X  X  X  X  X     X  X  X       X    X   X
     296                 :       [e]     X  X  X  X  X  X  X  X  X       X    X   X
     297                 : 
     298                 :                  18    <-   1
     299                 : 
     300                 :        1  0000 1111 1111 1111  = 0x0FFF
     301                 :       [a] 0000 1110 1100 0010  = 0x0EC2
     302                 :        7  0000 1110 1100 0110  = 0x0EC6
     303                 :        8  0000 1110 1100 0010  = 0x0EC2
     304                 :        9  0000 1110 1100 0010  = 0x0EC2
     305                 :       [b] 0000 1100 0000 0010  = 0x0C02
     306                 :       15  0000 1111 1101 1111  = 0x0FDF
     307                 :       18  0000 1111 1101 1111  = 0x0FDF
     308                 :  COMPLEX  0000 1111 1100 0010  = 0x0FC2
     309                 :       [c] 0000 1111 1111 1111  = 0x0FFF
     310                 :       [d] 0000 1111 1101 1111  = 0x0FDF
     311                 :       [e] 0000 1111 1111 1111  = 0x0FFF
     312                 : */
     313                 : 
     314                 : static const PRUint16 gPairConservative[MAX_CLASSES] = { // same layout as gPair, using the stricter table from step 8 of the comment above
     315                 :   0x0FFF, // 0: class 1 (open parenthesis)
     316                 :   0x0EC2, // 1: [a] (classes 2-6)
     317                 :   0x0EC6, // 2: class 7
     318                 :   0x0EC2, // 3: class 8 (prefix)
     319                 :   0x0EC2, // 4: class 9 (postfix)
     320                 :   0x0C02, // 5: [b] (classes 10, 11, 12, 17) - same value as in gPair
     321                 :   0x0FDF, // 6: class 15 (numeric)
     322                 :   0x0FDF, // 7: class 18 (characters)
     323                 :   0x0FC2, // 8: COMPLEX
     324                 :   0x0FFF, // 9: [c] (open-parenthesis-like character)
     325                 :   0x0FDF, // A: [d] (close-parenthesis-like character)
     326                 :   0x0FFF  // B: [e] (non-breakable)
     327                 : };
     328                 : 
     329                 : 
     330                 : /*
     331                 : 
     332                 :    9. Now we map the class to number
     333                 : 
     334                 :       0: 1 
     335                 :       1: [a]- 2, 3, 4, 5, 6
     336                 :       2: 7
     337                 :       3: 8
     338                 :       4: 9
     339                 :       5: [b]- 10, 11, 12, 17
     340                 :       6: 15
     341                 :       7: 18
     342                 :       8: COMPLEX
     343                 :       9: [c]
     344                 :       A: [d]
     345                 :       B: [e]
     346                 : 
     347                 :     and they mean:
     348                 :       0: Open parenthesis
     349                 :       1: Punctuation that prohibits break before
     350                 :       2: Non-breakable between same classes
     351                 :       3: Prefix
     352                 :       4: Postfix
     353                 :       5: Breakable character (Spaces and Most Japanese characters)
     354                 :       6: Numeric
     355                 :       7: Characters
     356                 :       8: Need special handling characters (E.g., Thai)
     357                 :       9: Open parentheses like Character (See bug 389056)
     358                 :       A: Close parenthesis (or punctuation) like Character (See bug 389056)
     359                 :       B: Non breakable (See bug 390920)
     360                 : 
     361                 : */
     362                 : 
     363                 : #define CLASS_NONE                             PR_INT8_MAX // not a valid gPair row index; presumably a "no class" marker - confirm at call sites
     364                 : 
     365                 : #define CLASS_OPEN                             0x00 // JIS X 4051 class 1: open parenthesis
     366                 : #define CLASS_CLOSE                            0x01 // [a]: classes 2-6, punctuation that prohibits a break before
     367                 : #define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02 // class 7
     368                 : #define CLASS_PREFIX                           0x03 // class 8
     369                 : #define CLASS_POSTFFIX                         0x04 // class 9 (postfix); identifier is spelled "POSTFFIX" in this file
     370                 : #define CLASS_BREAKABLE                        0x05 // [b]: classes 10, 11, 12, 17
     371                 : #define CLASS_NUMERIC                          0x06 // class 15
     372                 : #define CLASS_CHARACTER                        0x07 // class 18
     373                 : #define CLASS_COMPLEX                          0x08 // needs special handling (e.g., Thai)
     374                 : #define CLASS_OPEN_LIKE_CHARACTER              0x09 // [c], see bug 389056
     375                 : #define CLASS_CLOSE_LIKE_CHARACTER             0x0A // [d], see bug 389056
     376                 : #define CLASS_NON_BREAKABLE                    0x0B // [e], see bug 390920
     377                 : 
     378                 : #define U_NULL      PRUnichar(0x0000) // named PRUnichar constants for individual characters
     379                 : #define U_SLASH     PRUnichar('/')
     380                 : #define U_SPACE     PRUnichar(' ')
     381                 : #define U_HYPHEN    PRUnichar('-')
     382                 : #define U_EQUAL     PRUnichar('=')
     383                 : #define U_PERCENT   PRUnichar('%')
     384                 : #define U_AMPERSAND PRUnichar('&')
     385                 : #define U_SEMICOLON PRUnichar(';')
     386                 : #define U_BACKSLASH PRUnichar('\\')
     387                 : #define U_OPEN_SINGLE_QUOTE PRUnichar(0x2018) // U+2018 LEFT SINGLE QUOTATION MARK
     388                 : #define U_OPEN_DOUBLE_QUOTE PRUnichar(0x201C) // U+201C LEFT DOUBLE QUOTATION MARK
     389                 : #define U_OPEN_GUILLEMET    PRUnichar(0x00AB) // U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
     390                 : 
     391                 : #define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c) || \
     392                 :                                      (c) == U_SLASH || \
     393                 :                                      (c) == U_PERCENT || \
     394                 :                                      (c) == U_AMPERSAND || \
     395                 :                                      (c) == U_SEMICOLON || \
     396                 :                                      (c) == U_BACKSLASH || \
     397                 :                                      (c) == U_OPEN_SINGLE_QUOTE || \
     398                 :                                      (c) == U_OPEN_DOUBLE_QUOTE || \
     399                 :                                      (c) == U_OPEN_GUILLEMET) // true for any hyphen accepted by IS_HYPHEN or one of the characters listed; evaluates c more than once
     400                 : 
     401                 : #define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039) // '0'..'9'; evaluates u twice
     402                 : 
     403                 : static inline int // Returns the 4-bit entry number l from the packed table t (used by GetClass to read class codes).
     404               0 : GETCLASSFROMTABLE(const PRUint32* t, PRUint16 l)
     405                 : {
     406               0 :   return ((((t)[(l>>3)]) >> ((l & 0x0007)<<2)) & 0x000f); // eight 4-bit entries per 32-bit word: word l/8, nibble l%8 counted from the low bits
     407                 : }
     408                 : 
     409                 : static inline int // Nonzero when u lies in U+FF66..U+FF70 (inclusive); GetClass maps these to CLASS_CLOSE as JIS X 4051 class 3.
     410               0 : IS_HALFWIDTH_IN_JISx4051_CLASS3(PRUnichar u)
     411                 : {
     412               0 :   return ((0xff66 <= (u)) && ((u) <= 0xff70));
     413                 : }
     414                 : 
     415                 : static inline int // Nonzero when u falls in one of four ranges this file treats as CJK.
     416             850 : IS_CJK_CHAR(PRUnichar u)
     417                 : {
     418                 :   return ((0x1100 <= (u) && (u) <= 0x11ff) || // U+1100..U+11FF
     419                 :           (0x2e80 <= (u) && (u) <= 0xd7ff) || // U+2E80..U+D7FF
     420                 :           (0xf900 <= (u) && (u) <= 0xfaff) || // U+F900..U+FAFF
     421             850 :           (0xff00 <= (u) && (u) <= 0xffef) ); // U+FF00..U+FFEF
     422                 : }
     423                 : 
     424                 : static inline bool // True only for the two no-break space code points tested below.
     425              40 : IS_NONBREAKABLE_SPACE(PRUnichar u)
     426                 : {
     427              40 :   return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE
     428                 : }
     429                 : 
     430                 : static inline bool // True when u is one of the five hyphen/dash code points listed below.
     431              40 : IS_HYPHEN(PRUnichar u)
     432                 : {
     433                 :   return (u == U_HYPHEN || // ASCII '-'
     434                 :           u == 0x058A || // ARMENIAN HYPHEN
     435                 :           u == 0x2010 || // HYPHEN
     436                 :           u == 0x2012 || // FIGURE DASH
     437              40 :           u == 0x2013);  // EN DASH
     438                 : }
     439                 : 
     440                 : static PRInt8 // Maps the UTF-16 code unit u to one of the CLASS_* line-break class values.
     441              40 : GetClass(PRUnichar u)
     442                 : {
     443              40 :    PRUint16 h = u & 0xFF00; // high byte of u: selects a 256-code-point block
     444              40 :    PRUint16 l = u & 0x00ff; // low byte of u: offset inside that block
     445                 :    PRInt8 c;
     446                 : 
     447                 :    // Handle the table-driven ranges first (U+00xx, U+0Exx, U+20xx, U+21xx, U+30xx)
     448              40 :    if (0x0000 == h) {
     449               0 :      c = GETCLASSFROMTABLE(gLBClass00, l);
     450              40 :    } else if (NS_NeedsPlatformNativeHandling(u)) { // tested before the U+0Exx table, so it wins for any u it accepts
     451               0 :      c = CLASS_COMPLEX;
     452              40 :    } else if (0x0E00 == h) {
     453               0 :      c = GETCLASSFROMTABLE(gLBClass0E, l);
     454              40 :    } else if (0x2000 == h) {
     455               0 :      c = GETCLASSFROMTABLE(gLBClass20, l);
     456              40 :    } else if (0x2100 == h) {
     457               0 :      c = GETCLASSFROMTABLE(gLBClass21, l);
     458              40 :    } else if (0x3000 == h) {
     459               0 :      c = GETCLASSFROMTABLE(gLBClass30, l);
     460              40 :    } else if (((0x3200 <= u) && (u <= 0xA4CF)) || // CJK and Yi
     461                 :               ((0xAC00 <= h) && (h <= 0xD7FF)) || // Hangul
     462                 :               ((0xf900 <= h) && (h <= 0xfaff))) {
     463              40 :      c = CLASS_BREAKABLE; // CJK character, Han, and Han Compatibility
     464               0 :    } else if (0xff00 == h) {
     465               0 :      if (l < 0x0060) { // Fullwidth ASCII variant
     466               0 :        c = GETCLASSFROMTABLE(gLBClass00, (l+0x20)); // U+FF00+l takes the class of U+0020+l
     467               0 :      } else if (l < 0x00a0) {
     468               0 :        switch (l) {
     469               0 :          case 0x61: c = GetClass(0x3002); break; // these cases reuse the class GetClass gives the listed U+30xx code point
     470               0 :          case 0x62: c = GetClass(0x300c); break;
     471               0 :          case 0x63: c = GetClass(0x300d); break;
     472               0 :          case 0x64: c = GetClass(0x3001); break;
     473               0 :          case 0x65: c = GetClass(0x30fb); break;
     474               0 :          case 0x9e: c = GetClass(0x309b); break;
     475               0 :          case 0x9f: c = GetClass(0x309c); break;
     476                 :          default:
     477               0 :            if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u))
     478               0 :               c = CLASS_CLOSE; // jis x4051 class 3
     479                 :            else
     480               0 :               c = CLASS_BREAKABLE; // jis x4051 class 11
     481               0 :            break;
     482                 :        }
     483                 :      // Halfwidth Katakana variants
     484               0 :      } else if (l < 0x00e0) {
     485               0 :        c = CLASS_CHARACTER; // Halfwidth Hangul variants
     486               0 :      } else if (l < 0x00f0) {
     487                 :        static PRUnichar NarrowFFEx[16] = { // substitute code points for U+FFE0..U+FFEF, indexed by l - 0xE0
     488                 :          0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
     489                 :          0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000
     490                 :        };
     491               0 :        c = GetClass(NarrowFFEx[l - 0x00e0]); // classify as the substitute; 0x0000 entries resolve through gLBClass00
     492                 :      } else {
     493               0 :        c = CLASS_CHARACTER;
     494                 :      }
     495               0 :    } else if (0x3100 == h) { 
     496               0 :      if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun
     497                 :                       // XXX: This is per UAX #14, but UAX #14 may change
     498                 :                       // the line breaking rules about Kanbun and Bopomofo.
     499               0 :        c = CLASS_BREAKABLE;
     500               0 :      } else if (l >= 0xf0) { // Katakana small letters for Ainu
     501               0 :        c = CLASS_CLOSE;
     502                 :      } else { // unassigned
     503               0 :        c = CLASS_CHARACTER;
     504                 :      }
     505               0 :    } else if (0x0300 == h) {
     506               0 :      if (0x4F == l || (0x5C <= l && l <= 0x62)) // U+034F and U+035C..U+0362
     507               0 :        c = CLASS_NON_BREAKABLE;
     508                 :      else
     509               0 :        c = CLASS_CHARACTER;
     510               0 :    } else if (0x0500 == h) {
     511                 :      // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)
     512               0 :      if (l == 0x8A)
     513               0 :        c = GETCLASSFROMTABLE(gLBClass00, PRUint16(U_HYPHEN)); // same class as ASCII '-'
     514                 :      else
     515               0 :        c = CLASS_CHARACTER;
     516               0 :    } else if (0x0F00 == h) {
     517               0 :      if (0x08 == l || 0x0C == l || 0x12 == l) // U+0F08, U+0F0C, U+0F12
     518               0 :        c = CLASS_NON_BREAKABLE;
     519                 :      else
     520               0 :        c = CLASS_CHARACTER;
     521               0 :    } else if (0x1800 == h) {
     522               0 :      if (0x0E == l) // U+180E
     523               0 :        c = CLASS_NON_BREAKABLE;
     524                 :      else
     525               0 :        c = CLASS_CHARACTER;
     526                 :    } else {
     527               0 :      c = CLASS_CHARACTER; // others
     528                 :    }
     529              40 :    return c;
     530                 : }
     531                 : 
     532                 : static bool
     533              39 : GetPair(PRInt8 c1, PRInt8 c2)
     534                 : {
                             // Looks up the class pair (c1, c2) in gPair: row gPair[c1], bit c2.
                             // Returns true when that bit is clear. GetJISx4051Breaks() stores the
                             // result as its allowBreak flag, where c1 is the class of the previous
                             // character and c2 the class of the current one.
                             // NOTE(review): only the upper bound is asserted. c1/c2 are PRInt8
                             // (presumably signed), so a negative class value would index/shift out
                             // of range -- confirm callers never pass one (the value of CLASS_NONE
                             // is not visible here).
     535              39 :   NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
     536              39 :   NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
     537                 : 
     538              39 :   return (0 == ((gPair[c1] >> c2) & 0x0001));
     539                 : }
     540                 : 
     541                 : static bool
     542               0 : GetPairConservative(PRInt8 c1, PRInt8 c2)
     543                 : {
                             // Same lookup as GetPair(), but against gPairConservative: returns
                             // true when bit c2 of gPairConservative[c1] is clear.
                             // GetJISx4051Breaks() uses this table instead of gPair whenever
                             // ContextState::UseConservativeBreaking() returns true.
                             // NOTE(review): as in GetPair(), only the upper bound of c1/c2 is
                             // asserted -- confirm negative class values cannot reach this point.
     544               0 :   NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
     545               0 :   NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
     546                 : 
     547               0 :   return (0 == ((gPairConservative[c1] >> c2) & 0x0001));
     548                 : }
     549                 : 
     550            1404 : nsJISx4051LineBreaker::nsJISx4051LineBreaker()
     551                 : {
                             // Empty body: the constructor performs no initialization here.
     552            1404 : }
     553                 : 
     554            2806 : nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
     555                 : {
                             // Empty body: the destructor releases nothing here.
     556            5612 : }
     557                 : 
                           // NS_IMPL_ISUPPORTS1 is a macro defined outside this file. It is
                           // presumably the XPCOM helper that supplies the nsISupports methods for
                           // this class with nsILineBreaker as its single interface -- confirm
                           // against the XPCOM headers.
     558           12645 : NS_IMPL_ISUPPORTS1(nsJISx4051LineBreaker, nsILineBreaker)
     559                 : 
     560                 : class ContextState {
                           // Bookkeeping for one pass over a piece of text in GetJISx4051Breaks().
                           // Wraps either 16-bit text (mUniText) or 8-bit text (mText); whichever
                           // constructor is used sets the other pointer to nsnull. Tracks the
                           // current index, the index of the most recent break, and "seen so far"
                           // flags that ContextualAnalysis() consults.
     561                 : public:
                             // Wraps 16-bit text of aLength characters. The pointer is stored, not
                             // copied, and the text is pre-scanned by Init().
     562               1 :   ContextState(const PRUnichar* aText, PRUint32 aLength) {
     563               1 :     mUniText = aText;
     564               1 :     mText = nsnull;
     565               1 :     mLength = aLength;
     566               1 :     Init();
     567               1 :   }
     568                 : 
                             // Wraps 8-bit text of aLength characters. The pointer is stored, not
                             // copied, and the text is pre-scanned by Init().
     569               0 :   ContextState(const PRUint8* aText, PRUint32 aLength) {
     570               0 :     mUniText = nsnull;
     571               0 :     mText = aText;
     572               0 :     mLength = aLength;
     573               0 :     Init();
     574               0 :   }
     575                 : 
                             // Length() is the number of characters given to the constructor.
                             // Index() starts at 0 and grows by one per AdvanceIndex() call.
     576               0 :   PRUint32 Length() { return mLength; }
     577               0 :   PRUint32 Index() { return mIndex; }
     578                 : 
                             // Returns the character at aIndex, widening 8-bit text to PRUnichar.
                             // The range check is an assertion only. aIndex is a PRUint32, so the
                             // |0 <= aIndex| half is presumably always true.
     579              40 :   PRUnichar GetCharAt(PRUint32 aIndex) {
     580              40 :     NS_ASSERTION(0 <= aIndex && aIndex < mLength, "Out of range!");
     581              40 :     return mUniText ? mUniText[aIndex] : PRUnichar(mText[aIndex]);
     582                 :   }
     583                 : 
                             // Moves the current index forward by exactly one character.
     584              40 :   void AdvanceIndex() {
     585              40 :     ++mIndex;
     586              40 :   }
     587                 : 
                             // Records the current index as the position of the latest break.
     588              39 :   void NotifyBreakBefore() { mLastBreakIndex = mIndex; }
     589                 : 
     590                 : // A word of western language should not be broken. But even if the word has
     591                 : // only ASCII characters, non-natural context words should be broken, e.g.,
     592                 : // URL and file path. For protecting the natural words, we should use
     593                 : // conservative breaking rules at following conditions:
     594                 : //   1. at near the start of word
     595                 : //   2. at near the end of word
     596                 : //   3. at near the latest broken point
     597                 : // CONSERVATIVE_BREAK_RANGE define the 'near' in characters.
     598                 : #define CONSERVATIVE_BREAK_RANGE 6
     599                 : 
                             // Decides whether the conservative pair table should be used at
                             // position mIndex + aOffset. Always false when the text contains a CJK
                             // character. Otherwise true when the position is within
                             // CONSERVATIVE_BREAK_RANGE of the text start, the text end or the last
                             // break, or when a non-breakable space lies within the
                             // CONSERVATIVE_BREAK_RANGE characters before the position or the
                             // CONSERVATIVE_BREAK_RANGE - 1 characters after it.
     600              39 :   bool UseConservativeBreaking(PRUint32 aOffset = 0) {
     601              39 :     if (mHasCJKChar)
     602              39 :       return false;
     603               0 :     PRUint32 index = mIndex + aOffset;
     604                 :     bool result = (index < CONSERVATIVE_BREAK_RANGE ||
     605                 :                      mLength - index < CONSERVATIVE_BREAK_RANGE ||
     606               0 :                      index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE);
     607               0 :     if (result || !mHasNonbreakableSpace)
     608               0 :       return result;
     609                 : 
     610                 :     // This text has a non-breakable space; we need to check whether the index
     611                 :     // is near it.
     612                 : 
     613                 :     // Note that index is always at least CONSERVATIVE_BREAK_RANGE here.
     614               0 :     for (PRUint32 i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) {
     615               0 :       if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1)))
     616               0 :         return true;
     617                 :     }
     618                 :     // Note that index is always at most mLength - CONSERVATIVE_BREAK_RANGE.
     619               0 :     for (PRUint32 i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) {
     620               0 :       if (IS_NONBREAKABLE_SPACE(GetCharAt(i)))
     621               0 :         return true;
     622                 :     }
     623               0 :     return false;
     624                 :   }
     625                 : 
                             // The three "seen" flags below are sticky: Init() clears them and the
                             // Notify*() methods only ever set them to true.
     626               0 :   bool HasPreviousEqualsSign() const {
     627               0 :     return mHasPreviousEqualsSign;
     628                 :   }
     629               0 :   void NotifySeenEqualsSign() {
     630               0 :     mHasPreviousEqualsSign = true;
     631               0 :   }
     632                 : 
     633               0 :   bool HasPreviousSlash() const {
     634               0 :     return mHasPreviousSlash;
     635                 :   }
     636               0 :   void NotifySeenSlash() {
     637               0 :     mHasPreviousSlash = true;
     638               0 :   }
     639                 : 
     640               0 :   bool HasPreviousBackslash() const {
     641               0 :     return mHasPreviousBackslash;
     642                 :   }
     643               0 :   void NotifySeenBackslash() {
     644               0 :     mHasPreviousBackslash = true;
     645               0 :   }
     646                 : 
                             // Returns the last character passed to NotifyNonHyphenCharacter(), or
                             // U_NULL if there has been none since Init().
     647               0 :   PRUnichar GetPreviousNonHyphenCharacter() const {
     648               0 :     return mPreviousNonHyphenCharacter;
     649                 :   }
     650              40 :   void NotifyNonHyphenCharacter(PRUnichar ch) {
     651              40 :     mPreviousNonHyphenCharacter = ch;
     652              40 :   }
     653                 : 
     654                 : private:
                             // Resets all counters and flags, then scans the whole text once to
                             // set mHasNonbreakableSpace and mHasCJKChar. The CJK test only runs
                             // for 16-bit text (mUniText non-null), so 8-bit text never sets
                             // mHasCJKChar. Requires mUniText/mText and mLength to be set already.
     655               1 :   void Init() {
     656               1 :     mIndex = 0;
     657               1 :     mLastBreakIndex = 0;
     658               1 :     mPreviousNonHyphenCharacter = U_NULL;
     659               1 :     mHasCJKChar = 0;
     660               1 :     mHasNonbreakableSpace = 0;
     661               1 :     mHasPreviousEqualsSign = false;
     662               1 :     mHasPreviousSlash = false;
     663               1 :     mHasPreviousBackslash = false;
     664                 : 
     665              41 :     for (PRUint32 i = 0; i < mLength; ++i) {
     666              40 :       PRUnichar u = GetCharAt(i);
     667              40 :       if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u))
     668               0 :         mHasNonbreakableSpace = 1;
     669              40 :       else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u))
     670               1 :         mHasCJKChar = 1;
     671                 :     }
     672               1 :   }
     673                 : 
                             // Exactly one of these two is non-null (set by the constructors).
     674                 :   const PRUnichar* mUniText;
     675                 :   const PRUint8* mText;
     676                 : 
     677                 :   PRUint32 mIndex;          // index of the current character
     678                 :   PRUint32 mLength;         // length of text
     679                 :   PRUint32 mLastBreakIndex; // value of mIndex at the last NotifyBreakBefore()
     680                 :   PRUnichar mPreviousNonHyphenCharacter; // The last character we have seen
     681                 :                                          // which is not U_HYPHEN
     682                 :   bool mHasCJKChar; // if the text has CJK character, this is true.
     683                 :   bool mHasNonbreakableSpace; // if the text has non-breakable space,
     684                 :                                      // this is true.
     685                 :   bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL
     686                 :   bool mHasPreviousSlash;      // True if we have seen a U_SLASH
     687                 :   bool mHasPreviousBackslash;  // True if we have seen a U_BACKSLASH
     688                 : };
     689                 : 
     690                 : static PRInt8
     691               0 : ContextualAnalysis(PRUnichar prev, PRUnichar cur, PRUnichar next,
     692                 :                    ContextState &aState)
     693                 : {
                             // Classifies |cur|, a character for which NEED_CONTEXTUAL_ANALYSIS()
                             // was true in the caller, using its neighbours and the running state.
                             //   prev / next - the adjacent characters; GetJISx4051Breaks() passes
                             //                 U_NULL at the start / end of the text.
                             //   aState      - updated as a side effect: the last non-hyphen
                             //                 character and the "seen slash / backslash" flags.
                             // Returns a CLASS_* value. Any branch that finds no special context
                             // falls through to the plain GetClass(cur) lookup at the bottom.
     694                 :   // This function only returns CLASS_OPEN/CLASS_CLOSE directly when
                             // aState.UseConservativeBreaking() is false.
     695                 : 
     696               0 :   if (IS_HYPHEN(cur)) {
     697                 :     // If next character is hyphen, we don't need to break between them.
     698               0 :     if (IS_HYPHEN(next))
     699               0 :       return CLASS_CHARACTER;
     700                 :     // If prev and next characters are numeric, it may be in Math context.
     701                 :     // So, we should not break here.
     702               0 :     bool prevIsNum = IS_ASCII_DIGIT(prev);
     703               0 :     bool nextIsNum = IS_ASCII_DIGIT(next);
     704               0 :     if (prevIsNum && nextIsNum)
     705               0 :       return CLASS_NUMERIC;
     706                 :     // If one side is numeric and the other is a character, or if both sides are
     707                 :     // characters, the hyphen should be breakable.
     708               0 :     if (!aState.UseConservativeBreaking(1)) {
                                 // Note: prevIsNum is computed from |prev| (the immediately
                                 // preceding character), while prevIsChar is computed from the last
                                 // non-hyphen character recorded in aState.
     709               0 :       PRUnichar prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
     710               0 :       if (prevOfHyphen && next) {
     711               0 :         bool prevIsChar = !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen) &&
     712               0 :                             GetClass(prevOfHyphen) == CLASS_CHARACTER;
     713               0 :         bool nextIsChar = !NEED_CONTEXTUAL_ANALYSIS(next) &&
     714               0 :                             GetClass(next) == CLASS_CHARACTER;
     715               0 :         if ((prevIsNum || prevIsChar) && (nextIsNum || nextIsChar))
     716               0 :           return CLASS_CLOSE;
     717                 :       }
     718                 :     }
     719                 :   } else {
     720               0 :     aState.NotifyNonHyphenCharacter(cur);
     721               0 :     if (cur == U_SLASH || cur == U_BACKSLASH) {
     722                 :       // If this is immediately after same char, we should not break here.
     723               0 :       if (prev == cur)
     724               0 :         return CLASS_CHARACTER;
     725                 :       // If this text has two or more (BACK)SLASHs, this may be file path or URL.
     726                 :       // Make sure to compute shouldReturn before we notify on this slash.
     727               0 :       bool shouldReturn = !aState.UseConservativeBreaking() &&
     728                 :         (cur == U_SLASH ?
     729               0 :          aState.HasPreviousSlash() : aState.HasPreviousBackslash());
     730                 : 
     731               0 :       if (cur == U_SLASH) {
     732               0 :         aState.NotifySeenSlash();
     733                 :       } else {
     734               0 :         aState.NotifySeenBackslash();
     735                 :       }
     736                 : 
     737               0 :       if (shouldReturn)
     738               0 :         return CLASS_OPEN;
     739               0 :     } else if (cur == U_PERCENT) {
     740                 :       // If this is a part of the param of URL, we should break before.
                               // "Part of a param" is detected by finding another U_PERCENT exactly
                               // three characters before or after the current index.
     741               0 :       if (!aState.UseConservativeBreaking()) {
     742               0 :         if (aState.Index() >= 3 &&
     743               0 :             aState.GetCharAt(aState.Index() - 3) == U_PERCENT)
     744               0 :           return CLASS_OPEN;
     745               0 :         if (aState.Index() + 3 < aState.Length() &&
     746               0 :             aState.GetCharAt(aState.Index() + 3) == U_PERCENT)
     747               0 :           return CLASS_OPEN;
     748                 :       }
     749               0 :     } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
     750                 :       // If this may be a separator of params of URL, we should break after.
     751               0 :       if (!aState.UseConservativeBreaking(1) &&
     752               0 :           aState.HasPreviousEqualsSign())
     753               0 :         return CLASS_CLOSE;
     754               0 :     } else if (cur == U_OPEN_SINGLE_QUOTE ||
     755                 :                cur == U_OPEN_DOUBLE_QUOTE ||
     756                 :                cur == U_OPEN_GUILLEMET) {
     757                 :       // for CJK usage, we treat these as openers to allow a break before them,
     758                 :       // but otherwise treat them as normal characters because quote mark usage
     759                 :       // in various Western languages varies too much; see bug #450088 discussion.
     760               0 :       if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
     761               0 :         return CLASS_OPEN;
     762                 :     } else {
                               // Reached only for a character that the caller routed here but that
                               // no branch above handles.
     763               0 :       NS_ERROR("Forgot to handle the current character!");
     764                 :     }
     765                 :   }
     766               0 :   return GetClass(cur);
     767                 : }
     768                 : 
     769                 : 
     770                 : PRInt32
     771              22 : nsJISx4051LineBreaker::WordMove(const PRUnichar* aText, PRUint32 aLen,
     772                 :                                 PRUint32 aPos, PRInt8 aDirection)
     773                 : {
                             // Finds the break position next to aPos in direction aDirection
                             // (in this file Next() passes 1 and Prev() passes -1).
                             // First the whitespace-delimited run [begin, end) around aPos is
                             // located; while scanning, textNeedsJISx4051 is set if any tested
                             // character is CJK or needs platform-native handling.
                             // NOTE(review): the first loop tests aText[begin] before the
                             // decrement, so it reads aText[aPos] and never tests the character at
                             // the final |begin|. Prev() permits aPos == aLen, in which case
                             // aText[aPos] is one past the aLen characters -- confirm the buffer is
                             // readable there. In that same case |end| starts at aLen + 1, so
                             // |end - begin| would exceed the text handed to GetJISx4051Breaks().
     774              22 :   bool    textNeedsJISx4051 = false;
     775                 :   PRInt32 begin, end;
     776                 : 
     777             848 :   for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
     778             826 :     if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) {
     779              37 :       textNeedsJISx4051 = true;
     780                 :     }
     781                 :   }
     782              45 :   for (end = aPos + 1; end < PRInt32(aLen) && !NS_IsSpace(aText[end]); ++end) {
     783              23 :     if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
     784               2 :       textNeedsJISx4051 = true;
     785                 :     }
     786                 :   }
     787                 : 
     788                 :   PRInt32 ret;
     789              44 :   nsAutoTArray<PRUint8, 2000> breakState;
     790              22 :   if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {
     791                 :     // No complex text character, do not try to do complex line break.
     792                 :     // (This is required for serializers. See Bug #344816.)
     793                 :     // Also fall back to this when out of memory.
                               // Backward: return the run start, or one before it when aPos is
                               // already at the run start. Forward: return the run end.
     794              21 :     if (aDirection < 0) {
     795              11 :       ret = (begin == PRInt32(aPos)) ? begin - 1 : begin;
     796                 :     } else {
     797              10 :       ret = end;
     798                 :     }
     799                 :   } else {
     800               1 :     GetJISx4051Breaks(aText + begin, end - begin, breakState.Elements());
     801                 : 
                               // Step from aPos in aDirection until a recorded break is hit or the
                               // position reaches the run boundary (ret <= begin or ret >= end).
     802               1 :     ret = aPos;
     803               2 :     do {
     804               1 :       ret += aDirection;
     805               1 :     } while (begin < ret && ret < end && !breakState[ret - begin]);
     806                 :   }
     807                 : 
     808              22 :   return ret;
     809                 : }
     810                 : 
     811                 : PRInt32
     812              10 : nsJISx4051LineBreaker::Next(const PRUnichar* aText, PRUint32 aLen,
     813                 :                             PRUint32 aPos) 
     814                 : {
                             // Returns the position WordMove() finds when moving forward
                             // (direction 1) from aPos, or NS_LINEBREAKER_NEED_MORE_TEXT when that
                             // position is not below aLen. aText must be non-null and aPos must be
                             // less than aLen; both are checked by assertion only.
     815              10 :   NS_ASSERTION(aText, "aText shouldn't be null");
     816              10 :   NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next");
     817                 : 
     818              10 :   PRInt32 nextPos = WordMove(aText, aLen, aPos, 1);
     819              10 :   return nextPos < PRInt32(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT;
     820                 : }
     821                 : 
     822                 : PRInt32
     823              12 : nsJISx4051LineBreaker::Prev(const PRUnichar* aText, PRUint32 aLen,
     824                 :                             PRUint32 aPos) 
     825                 : {
                             // Returns the position WordMove() finds when moving backward
                             // (direction -1) from aPos, or NS_LINEBREAKER_NEED_MORE_TEXT when that
                             // position is not above 0. aText must be non-null and aPos must lie in
                             // (0, aLen]; both are checked by assertion only. Note that
                             // aPos == aLen is accepted here.
     826              12 :   NS_ASSERTION(aText, "aText shouldn't be null");
     827              12 :   NS_ASSERTION(aLen >= aPos && aPos > 0,
     828                 :                "Bad position passed to nsJISx4051LineBreaker::Prev");
     829                 : 
     830              12 :   PRInt32 prevPos = WordMove(aText, aLen, aPos, -1);
     831              12 :   return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT;
     832                 : }
     833                 : 
     834                 : void
     835               1 : nsJISx4051LineBreaker::GetJISx4051Breaks(const PRUnichar* aChars, PRUint32 aLength,
     836                 :                                          PRUint8* aBreakBefore)
     837                 : {
                             // Fills aBreakBefore[0 .. aLength) for the 16-bit text aChars. Each
                             // entry written by this loop is the allowBreak flag for a break before
                             // the character at that index; entry 0 is always false. Runs of
                             // CLASS_COMPLEX characters are delegated to NS_GetComplexLineBreaks().
                             // aBreakBefore must have room for aLength entries.
                             // NOTE(review): after a complex run, |cur| jumps to end - 1 but the
                             // loop increment calls state.AdvanceIndex() only once, so
                             // state.Index() falls behind |cur| for the rest of the text. Index()
                             // feeds UseConservativeBreaking(), NotifyBreakBefore() and the
                             // U_PERCENT look-around in ContextualAnalysis() -- confirm whether the
                             // state should be advanced past the skipped characters as well.
     838                 :   PRUint32 cur;
     839               1 :   PRInt8 lastClass = CLASS_NONE;
     840               1 :   ContextState state(aChars, aLength);
     841                 : 
     842              41 :   for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
     843              40 :     PRUnichar ch = aChars[cur];
     844                 :     PRInt8 cl;
     845                 : 
                               // Step 1: determine the class of the current character.
     846              40 :     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
     847               0 :       cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
     848                 :                               ch,
     849               0 :                               cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
     850               0 :                               state);
     851                 :     } else {
     852              40 :       if (ch == U_EQUAL)
     853               0 :         state.NotifySeenEqualsSign();
     854              40 :       state.NotifyNonHyphenCharacter(ch);
     855              40 :       cl = GetClass(ch);
     856                 :     }
     857                 : 
                               // Step 2: look up whether a break is allowed between the previous
                               // class and this one. No break is ever allowed before index 0.
     858                 :     bool allowBreak;
     859              40 :     if (cur > 0) {
     860              39 :       NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
     861                 :                    "Loop should have prevented adjacent complex chars here");
     862              39 :       if (state.UseConservativeBreaking())
     863               0 :         allowBreak = GetPairConservative(lastClass, cl);
     864                 :       else
     865              39 :         allowBreak = GetPair(lastClass, cl);
     866                 :     } else {
     867               1 :       allowBreak = false;
     868                 :     }
     869              40 :     aBreakBefore[cur] = allowBreak;
     870              40 :     if (allowBreak)
     871              39 :       state.NotifyBreakBefore();
     872              40 :     lastClass = cl;
                               // Step 3: hand a whole run of consecutive CLASS_COMPLEX characters
                               // to the complex line breaker and skip past it.
     873              40 :     if (CLASS_COMPLEX == cl) {
     874               0 :       PRUint32 end = cur + 1;
     875                 : 
     876               0 :       while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) {
     877               0 :         ++end;
     878                 :       }
     879                 : 
     880               0 :       NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);
     881                 : 
     882                 :       // restore breakability at chunk begin, which was always set to false
     883                 :       // by the complex line breaker
     884               0 :       aBreakBefore[cur] = allowBreak;
     885                 : 
                                 // The loop's ++cur then lands on |end|, the first character after
                                 // the complex run.
     886               0 :       cur = end - 1;
     887                 :     }
     888                 :   }
     889               1 : }
     890                 : 
     891                 : void
     892               0 : nsJISx4051LineBreaker::GetJISx4051Breaks(const PRUint8* aChars, PRUint32 aLength,
     893                 :                                          PRUint8* aBreakBefore)
     894                 : {
                             // 8-bit overload: fills aBreakBefore[0 .. aLength) for the text in
                             // aChars. Each entry is the allowBreak flag for a break before the
                             // character at that index; entry 0 is always false. Unlike the 16-bit
                             // overload, this one has no CLASS_COMPLEX handling.
                             // aBreakBefore must have room for aLength entries.
     895                 :   PRUint32 cur;
     896               0 :   PRInt8 lastClass = CLASS_NONE;
     897               0 :   ContextState state(aChars, aLength);
     898                 : 
     899               0 :   for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
                               // Each 8-bit value is widened to PRUnichar before classification.
     900               0 :     PRUnichar ch = aChars[cur];
     901                 :     PRInt8 cl;
     902                 : 
     903               0 :     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
     904               0 :       cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
     905                 :                               ch,
     906               0 :                               cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
     907               0 :                               state);
     908                 :     } else {
     909               0 :       if (ch == U_EQUAL)
     910               0 :         state.NotifySeenEqualsSign();
     911               0 :       state.NotifyNonHyphenCharacter(ch);
     912               0 :       cl = GetClass(ch);
     913                 :     }
     914                 : 
                               // A break is never allowed before index 0; elsewhere it comes from
                               // the (conservative or normal) pair table.
     915                 :     bool allowBreak;
     916               0 :     if (cur > 0) {
     917               0 :       if (state.UseConservativeBreaking())
     918               0 :         allowBreak = GetPairConservative(lastClass, cl);
     919                 :       else
     920               0 :         allowBreak = GetPair(lastClass, cl);
     921                 :     } else {
     922               0 :       allowBreak = false;
     923                 :     }
     924               0 :     aBreakBefore[cur] = allowBreak;
     925               0 :     if (allowBreak)
     926               0 :       state.NotifyBreakBefore();
     927               0 :     lastClass = cl;
     928                 :   }
     929               0 : }

Generated by: LCOV version 1.7