LCOV - code coverage report
Current view: directory - intl/hyphenation/src - hyphen.c (source / functions) Found Hit Coverage
Test: app.info Lines: 538 0 0.0 %
Date: 2012-06-02 Functions: 22 0 0.0 %

       1                 : /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
       2                 :  * licenses follows.
       3                 :  */
       4                 : 
       5                 : /* LibHnj - a library for high quality hyphenation and justification
       6                 :  * Copyright (C) 1998 Raph Levien, 
       7                 :  *           (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), 
       8                 :  *           (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
       9                 :  *           (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
      10                 :  *
      11                 :  * This library is free software; you can redistribute it and/or
      12                 :  * modify it under the terms of the GNU Library General Public
      13                 :  * License as published by the Free Software Foundation; either
      14                 :  * version 2 of the License, or (at your option) any later version.
      15                 :  *
      16                 :  * This library is distributed in the hope that it will be useful,
      17                 :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      18                 :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      19                 :  * Library General Public License for more details.
      20                 :  *
      21                 :  * You should have received a copy of the GNU Library General Public
      22                 :  * License along with this library; if not, write to the 
      23                 :  * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 
      24                 :  * Boston, MA  02111-1307  USA.
      25                 : */
      26                 : 
      27                 : /*
      28                 :  * The contents of this file are subject to the Mozilla Public License
      29                 :  * Version 1.0 (the "MPL"); you may not use this file except in
      30                 :  * compliance with the MPL.  You may obtain a copy of the MPL at
      31                 :  * http://www.mozilla.org/MPL/
      32                 :  *
      33                 :  * Software distributed under the MPL is distributed on an "AS IS" basis,
      34                 :  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
      35                 :  * for the specific language governing rights and limitations under the
      36                 :  * MPL.
      37                 :  *
      38                 :  */
      39                 : #include <stdlib.h> /* for NULL, malloc */
      40                 : #include <stdio.h>  /* for fprintf */
      41                 : #include <string.h> /* for strdup */
      42                 : 
      43                 : #ifdef UNX
      44                 : #include <unistd.h> /* for exit */
      45                 : #endif
      46                 : 
      47                 : #define noVERBOSE
      48                 : 
      49                 : /* calculate hyphenmin values with long ligature length (2 or 3 characters
      50                 :  * instead of 1 or 2) for comparison with hyphenation without ligatures */
      51                 : #define noLONG_LIGATURE
      52                 : 
      53                 : #ifdef LONG_LIGATURE
      54                 : #define LIG_xx  1
      55                 : #define LIG_xxx 2
      56                 : #else
      57                 : #define LIG_xx  0
      58                 : #define LIG_xxx 1
      59                 : #endif
      60                 : 
      61                 : #include "hnjalloc.h"
      62                 : #include "hyphen.h"
      63                 : 
      64                 : static char *
      65               0 : hnj_strdup (const char *s)
      66                 : {
      67                 :   char *new;
      68                 :   int l;
      69                 : 
      70               0 :   l = strlen (s);
      71               0 :   new = hnj_malloc (l + 1);
      72               0 :   memcpy (new, s, l);
      73               0 :   new[l] = 0;
      74               0 :   return new;
      75                 : }
      76                 : 
      77                 : /* remove cross-platform text line end characters */
      78               0 : void hnj_strchomp(char * s)
      79                 : {
      80               0 :   int k = strlen(s);
      81               0 :   if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
      82               0 :   if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
      83               0 : }
      84                 : 
      85                 : /* a little bit of a hash table implementation. This simply maps strings
      86                 :    to state numbers */
      87                 : 
      88                 : typedef struct _HashTab HashTab;
      89                 : typedef struct _HashEntry HashEntry;
      90                 : 
      91                 : /* A cheap, but effective, hack. */
      92                 : #define HASH_SIZE 31627
      93                 : 
      94                 : struct _HashTab {
      95                 :   HashEntry *entries[HASH_SIZE];
      96                 : };
      97                 : 
      98                 : struct _HashEntry {
      99                 :   HashEntry *next;
     100                 :   char *key;
     101                 :   int val;
     102                 : };
     103                 : 
     104                 : /* a char* hash function from ASU - adapted from Gtk+ */
     105                 : static unsigned int
     106               0 : hnj_string_hash (const char *s)
     107                 : {
     108                 :   const char *p;
     109               0 :   unsigned int h=0, g;
     110               0 :   for(p = s; *p != '\0'; p += 1) {
     111               0 :     h = ( h << 4 ) + *p;
     112               0 :     if ( ( g = h & 0xf0000000 ) ) {
     113               0 :       h = h ^ (g >> 24);
     114               0 :       h = h ^ g;
     115                 :     }
     116                 :   }
     117               0 :   return h /* % M */;
     118                 : }
     119                 : 
     120                 : static HashTab *
     121               0 : hnj_hash_new (void)
     122                 : {
     123                 :   HashTab *hashtab;
     124                 :   int i;
     125                 : 
     126               0 :   hashtab = hnj_malloc (sizeof(HashTab));
     127               0 :   for (i = 0; i < HASH_SIZE; i++)
     128               0 :     hashtab->entries[i] = NULL;
     129                 : 
     130               0 :   return hashtab;
     131                 : }
     132                 : 
     133                 : static void
     134               0 : hnj_hash_free (HashTab *hashtab)
     135                 : {
     136                 :   int i;
     137                 :   HashEntry *e, *next;
     138                 : 
     139               0 :   for (i = 0; i < HASH_SIZE; i++)
     140               0 :     for (e = hashtab->entries[i]; e; e = next)
     141                 :       {
     142               0 :         next = e->next;
     143               0 :         hnj_free (e->key);
     144               0 :         hnj_free (e);
     145                 :       }
     146                 : 
     147               0 :   hnj_free (hashtab);
     148               0 : }
     149                 : 
     150                 : /* assumes that key is not already present! */
     151                 : static void
     152               0 : hnj_hash_insert (HashTab *hashtab, const char *key, int val)
     153                 : {
     154                 :   int i;
     155                 :   HashEntry *e;
     156                 : 
     157               0 :   i = hnj_string_hash (key) % HASH_SIZE;
     158               0 :   e = hnj_malloc (sizeof(HashEntry));
     159               0 :   e->next = hashtab->entries[i];
     160               0 :   e->key = hnj_strdup (key);
     161               0 :   e->val = val;
     162               0 :   hashtab->entries[i] = e;
     163               0 : }
     164                 : 
     165                 : /* return val if found, otherwise -1 */
     166                 : static int
     167               0 : hnj_hash_lookup (HashTab *hashtab, const char *key)
     168                 : {
     169                 :   int i;
     170                 :   HashEntry *e;
     171               0 :   i = hnj_string_hash (key) % HASH_SIZE;
     172               0 :   for (e = hashtab->entries[i]; e; e = e->next)
     173               0 :     if (!strcmp (key, e->key))
     174               0 :       return e->val;
     175               0 :   return -1;
     176                 : }
     177                 : 
     178                 : /* Get the state number, allocating a new state if necessary. */
     179                 : static int
     180               0 : hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
     181                 : {
     182                 :   int state_num;
     183                 : 
     184               0 :   state_num = hnj_hash_lookup (hashtab, string);
     185                 : 
     186               0 :   if (state_num >= 0)
     187               0 :     return state_num;
     188                 : 
     189               0 :   hnj_hash_insert (hashtab, string, dict->num_states);
     190                 :   /* predicate is true if dict->num_states is a power of two */
     191               0 :   if (!(dict->num_states & (dict->num_states - 1)))
     192                 :     {
     193               0 :       dict->states = hnj_realloc (dict->states,
     194                 :                                   (dict->num_states << 1) *
     195                 :                                   sizeof(HyphenState));
     196                 :     }
     197               0 :   dict->states[dict->num_states].match = NULL;
     198               0 :   dict->states[dict->num_states].repl = NULL;
     199               0 :   dict->states[dict->num_states].fallback_state = -1;
     200               0 :   dict->states[dict->num_states].num_trans = 0;
     201               0 :   dict->states[dict->num_states].trans = NULL;
     202               0 :   return dict->num_states++;
     203                 : }
     204                 : 
     205                 : /* add a transition from state1 to state2 through ch - assumes that the
     206                 :    transition does not already exist */
     207                 : static void
     208               0 : hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
     209                 : {
     210                 :   int num_trans;
     211                 : 
     212               0 :   num_trans = dict->states[state1].num_trans;
     213               0 :   if (num_trans == 0)
     214                 :     {
     215               0 :       dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
     216                 :     }
     217               0 :   else if (!(num_trans & (num_trans - 1)))
     218                 :     {
     219               0 :       dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
     220                 :                                                 (num_trans << 1) *
     221                 :                                                 sizeof(HyphenTrans));
     222                 :     }
     223               0 :   dict->states[state1].trans[num_trans].ch = ch;
     224               0 :   dict->states[state1].trans[num_trans].new_state = state2;
     225               0 :   dict->states[state1].num_trans++;
     226               0 : }
     227                 : 
     228                 : #ifdef VERBOSE
     229                 : HashTab *global[1];
     230                 : 
     231                 : static char *
     232                 : get_state_str (int state, int level)
     233                 : {
     234                 :   int i;
     235                 :   HashEntry *e;
     236                 : 
     237                 :   for (i = 0; i < HASH_SIZE; i++)
     238                 :     for (e = global[level]->entries[i]; e; e = e->next)
     239                 :       if (e->val == state)
     240                 :         return e->key;
     241                 :   return NULL;
     242                 : }
     243                 : #endif
     244                 : 
     245               0 : void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
     246                 :   int i, j;
     247                 :   char word[MAX_CHARS];
     248                 :   char pattern[MAX_CHARS];
     249                 :   char * repl;
     250                 :   signed char replindex;
     251                 :   signed char replcut;
     252               0 :   int state_num = 0;
     253                 :   int last_state;
     254                 :   char ch;
     255                 :   int found;
     256                 : 
     257               0 :           if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
     258               0 :             dict->lhmin = atoi(buf + 13);
     259               0 :             return;
     260               0 :           } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
     261               0 :             dict->rhmin = atoi(buf + 14);
     262               0 :             return;
     263               0 :           } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
     264               0 :             dict->clhmin = atoi(buf + 21);
     265               0 :             return;
     266               0 :           } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
     267               0 :             dict->crhmin = atoi(buf + 22);
     268               0 :             return;
     269               0 :           } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
     270               0 :             char * space = buf + 8;
     271               0 :             while (*space != '\0' && (*space == ' ' || *space == '\t')) space++;
     272               0 :             if (*buf != '\0') dict->nohyphen = hnj_strdup(space);
     273               0 :             if (dict->nohyphen) {
     274               0 :                 char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1;
     275               0 :                 *nhe = 0;
     276               0 :                 for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) {
     277               0 :                         if (*nhe == ',') {
     278               0 :                             dict->nohyphenl++;
     279               0 :                             *nhe = 0;
     280                 :                         }
     281                 :                 }
     282                 :             }
     283               0 :             return;
     284                 :           } 
     285               0 :           j = 0;
     286               0 :           pattern[j] = '0';
     287               0 :           repl = strchr(buf, '/');
     288               0 :           replindex = 0;
     289               0 :           replcut = 0;
     290               0 :           if (repl) {
     291               0 :             char * index = strchr(repl + 1, ',');
     292               0 :             *repl = '\0';
     293               0 :             if (index) {
     294               0 :                 char * index2 = strchr(index + 1, ',');
     295               0 :                 *index = '\0';
     296               0 :                 if (index2) {
     297               0 :                     *index2 = '\0';
     298               0 :                     replindex = (signed char) atoi(index + 1) - 1;
     299               0 :                     replcut = (signed char) atoi(index2 + 1);                
     300                 :                 }
     301                 :             } else {
     302               0 :                 hnj_strchomp(repl + 1);
     303               0 :                 replindex = 0;
     304               0 :                 replcut = (signed char) strlen(buf);
     305                 :             }
     306               0 :             repl = hnj_strdup(repl + 1);
     307                 :           }
     308               0 :           for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
     309                 :             {
     310               0 :               if (buf[i] >= '0' && buf[i] <= '9')
     311               0 :                 pattern[j] = buf[i];
     312                 :               else
     313                 :                 {
     314               0 :                   word[j] = buf[i];
     315               0 :                   pattern[++j] = '0';
     316                 :                 }
     317                 :             }
     318               0 :           word[j] = '\0';
     319               0 :           pattern[j + 1] = '\0';
     320                 : 
     321               0 :           i = 0;
     322               0 :           if (!repl) {
     323                 :             /* Optimize away leading zeroes */
     324               0 :             for (; pattern[i] == '0'; i++);
     325                 :           } else {
     326               0 :             if (*word == '.') i++;
     327                 :             /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
     328               0 :             if (dict->utf8) {
     329               0 :                 int pu = -1;        /* unicode character position */
     330               0 :                 int ps = -1;        /* unicode start position (original replindex) */
     331               0 :                 int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
     332               0 :                 for (; pc < (strlen(word) + 1); pc++) {
     333                 :                 /* beginning of an UTF-8 character (not '10' start bits) */
     334               0 :                     if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
     335               0 :                     if ((ps < 0) && (replindex == pu)) {
     336               0 :                         ps = replindex;
     337               0 :                         replindex = (signed char) pc;
     338                 :                     }
     339               0 :                     if ((ps >= 0) && ((pu - ps) == replcut)) {
     340               0 :                         replcut = (signed char) (pc - replindex);
     341               0 :                         break;
     342                 :                     }
     343                 :                 }
     344               0 :                 if (*word == '.') replindex--;
     345                 :             }
     346                 :           }
     347                 : 
     348                 : #ifdef VERBOSE
     349                 :           printf ("word %s pattern %s, j = %d  repl: %s\n", word, pattern + i, j, repl);
     350                 : #endif
     351               0 :           found = hnj_hash_lookup (hashtab, word);
     352               0 :           state_num = hnj_get_state (dict, hashtab, word);
     353               0 :           dict->states[state_num].match = hnj_strdup (pattern + i);
     354               0 :           dict->states[state_num].repl = repl;
     355               0 :           dict->states[state_num].replindex = replindex;
     356               0 :           if (!replcut) {
     357               0 :             dict->states[state_num].replcut = (signed char) strlen(word);
     358                 :           } else {
     359               0 :             dict->states[state_num].replcut = replcut;
     360                 :           }
     361                 : 
     362                 :           /* now, put in the prefix transitions */
     363               0 :           for (; found < 0 ;j--)
     364                 :             {
     365               0 :               last_state = state_num;
     366               0 :               ch = word[j - 1];
     367               0 :               word[j - 1] = '\0';
     368               0 :               found = hnj_hash_lookup (hashtab, word);
     369               0 :               state_num = hnj_get_state (dict, hashtab, word);
     370               0 :               hnj_add_trans (dict, state_num, last_state, ch);
     371                 :             }
     372                 : }
     373                 : 
     374                 : HyphenDict *
     375               0 : hnj_hyphen_load (const char *fn)
     376                 : {
     377                 :   HyphenDict *dict[2];
     378                 :   HashTab *hashtab;
     379                 :   FILE *f;
     380                 :   char buf[MAX_CHARS];
     381               0 :   int nextlevel = 0;
     382                 :   int i, j, k;
     383                 :   HashEntry *e;
     384               0 :   int state_num = 0;
     385                 : 
     386               0 :   f = fopen (fn, "r");
     387               0 :   if (f == NULL)
     388               0 :     return NULL;
     389                 : 
     390                 : // loading one or two dictionaries (separated by NEXTLEVEL keyword)
     391               0 : for (k = 0; k < 2; k++) { 
     392               0 :   hashtab = hnj_hash_new ();
     393                 : #ifdef VERBOSE
     394                 :   global[k] = hashtab;
     395                 : #endif
     396               0 :   hnj_hash_insert (hashtab, "", 0);
     397               0 :   dict[k] = hnj_malloc (sizeof(HyphenDict));
     398               0 :   dict[k]->num_states = 1;
     399               0 :   dict[k]->states = hnj_malloc (sizeof(HyphenState));
     400               0 :   dict[k]->states[0].match = NULL;
     401               0 :   dict[k]->states[0].repl = NULL;
     402               0 :   dict[k]->states[0].fallback_state = -1;
     403               0 :   dict[k]->states[0].num_trans = 0;
     404               0 :   dict[k]->states[0].trans = NULL;
     405               0 :   dict[k]->nextlevel = NULL;
     406               0 :   dict[k]->lhmin = 0;
     407               0 :   dict[k]->rhmin = 0;
     408               0 :   dict[k]->clhmin = 0;
     409               0 :   dict[k]->crhmin = 0;
     410               0 :   dict[k]->nohyphen = NULL;
     411               0 :   dict[k]->nohyphenl = 0;
     412                 : 
     413                 :   /* read in character set info */
     414               0 :   if (k == 0) {
     415               0 :     for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
     416               0 :     if (fgets(dict[k]->cset,  sizeof(dict[k]->cset),f) != NULL) {
     417               0 :       for (i=0;i<MAX_NAME;i++)
     418               0 :         if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
     419               0 :           dict[k]->cset[i] = 0;
     420                 :     } else {
     421               0 :       dict[k]->cset[0] = 0;
     422                 :     }
     423               0 :     dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
     424                 :   } else {
     425               0 :     strcpy(dict[k]->cset, dict[0]->cset);
     426               0 :     dict[k]->utf8 = dict[0]->utf8;
     427                 :   }
     428                 : 
     429               0 :   if (k == 0 || nextlevel) {
     430               0 :     while (fgets (buf, sizeof(buf), f) != NULL) {
     431               0 :       if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
     432               0 :         nextlevel = 1;
     433               0 :         break;
     434               0 :       } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
     435                 :     }
     436               0 :   } else if (k == 1) {
     437                 :     /* default first level: hyphen and ASCII apostrophe */
     438               0 :     if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN '\n", dict[k], hashtab);
     439               0 :     else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99\n", dict[k], hashtab);
     440               0 :     strcpy(buf, "1-1/=,1,1\n"); // buf rewritten by hnj_hyphen_load here
     441               0 :     hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
     442               0 :     hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
     443               0 :     if (dict[0]->utf8) {
     444               0 :       hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
     445               0 :       hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
     446                 :     }
     447                 :   }
     448                 : 
     449                 :   /* Could do unioning of matches here (instead of the preprocessor script).
     450                 :      If we did, the pseudocode would look something like this:
     451                 : 
     452                 :      foreach state in the hash table
     453                 :         foreach i = [1..length(state) - 1]
     454                 :            state to check is substr (state, i)
     455                 :            look it up
     456                 :            if found, and if there is a match, union the match in.
     457                 : 
     458                 :      It's also possible to avoid the quadratic blowup by doing the
     459                 :      search in order of increasing state string sizes - then you
     460                 :      can break the loop after finding the first match.
     461                 : 
     462                 :      This step should be optional in any case - if there is a
     463                 :      preprocessed rule table, it's always faster to use that.
     464                 : 
     465                 : */
     466                 : 
     467                 :   /* put in the fallback states */
     468               0 :   for (i = 0; i < HASH_SIZE; i++)
     469               0 :     for (e = hashtab->entries[i]; e; e = e->next)
     470                 :       {
     471               0 :         if (*(e->key)) for (j = 1; 1; j++)
     472                 :           {          
     473               0 :             state_num = hnj_hash_lookup (hashtab, e->key + j);
     474               0 :             if (state_num >= 0)
     475               0 :               break;
     476               0 :           }
     477                 :         /* KBH: FIXME state 0 fallback_state should always be -1? */
     478               0 :         if (e->val)
     479               0 :           dict[k]->states[e->val].fallback_state = state_num;
     480                 :       }
     481                 : #ifdef VERBOSE
     482                 :   for (i = 0; i < HASH_SIZE; i++)
     483                 :     for (e = hashtab->entries[i]; e; e = e->next)
     484                 :       {
     485                 :         printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
     486                 :                 dict[k]->states[e->val].fallback_state);
     487                 :         for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
     488                 :           printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
     489                 :                   dict[k]->states[e->val].trans[j].new_state);
     490                 :       }
     491                 : #endif
     492                 : 
     493                 : #ifndef VERBOSE
     494               0 :   hnj_hash_free (hashtab);
     495                 : #endif
     496               0 :   state_num = 0;
     497                 : }
     498               0 :   fclose(f);
     499               0 :   if (nextlevel) dict[0]->nextlevel = dict[1];
     500                 :   else {
     501               0 :     dict[1] -> nextlevel = dict[0];
     502               0 :     dict[1]->lhmin = dict[0]->lhmin;
     503               0 :     dict[1]->rhmin = dict[0]->rhmin;
     504               0 :     dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
     505               0 :     dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
     506                 : #ifdef VERBOSE
     507                 :     HashTab *r = global[0];
     508                 :     global[0] = global[1];
     509                 :     global[1] = r;
     510                 : #endif
     511               0 :     return dict[1];
     512                 :   }
     513               0 :   return dict[0];
     514                 : }
     515                 : 
     516               0 : void hnj_hyphen_free (HyphenDict *dict)
     517                 : {
     518                 :   int state_num;
     519                 :   HyphenState *hstate;
     520                 : 
     521               0 :   for (state_num = 0; state_num < dict->num_states; state_num++)
     522                 :     {
     523               0 :       hstate = &dict->states[state_num];
     524               0 :       if (hstate->match)
     525               0 :         hnj_free (hstate->match);
     526               0 :       if (hstate->repl)
     527               0 :         hnj_free (hstate->repl);
     528               0 :       if (hstate->trans)
     529               0 :         hnj_free (hstate->trans);
     530                 :     }
     531               0 :   if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
     532                 : 
     533               0 :   if (dict->nohyphen) hnj_free(dict->nohyphen);
     534                 : 
     535               0 :   hnj_free (dict->states);
     536                 : 
     537               0 :   hnj_free (dict);
     538               0 : }
     539                 : 
     540                 : #define MAX_WORD 256
     541                 : 
     542               0 : int hnj_hyphen_hyphenate (HyphenDict *dict,
     543                 :                            const char *word, int word_size,
     544                 :                            char *hyphens)
     545                 : {
     546                 :   char prep_word_buf[MAX_WORD];
     547                 :   char *prep_word;
     548                 :   int i, j, k;
     549                 :   int state;
     550                 :   char ch;
     551                 :   HyphenState *hstate;
     552                 :   char *match;
     553                 :   int offset;
     554                 : 
     555               0 :   if (word_size + 3 < MAX_WORD)
     556               0 :     prep_word = prep_word_buf;
     557                 :   else
     558               0 :     prep_word = hnj_malloc (word_size + 3);
     559                 : 
     560               0 :   j = 0;
     561               0 :   prep_word[j++] = '.';
     562                 : 
     563               0 :   for (i = 0; i < word_size; i++) {
     564               0 :     if (word[i] <= '9' && word[i] >= '0') {
     565               0 :       prep_word[j++] = '.';
     566                 :     } else {
     567               0 :       prep_word[j++] = word[i];
     568                 :     }
     569                 :   }
     570                 : 
     571               0 :   prep_word[j++] = '.';
     572               0 :   prep_word[j] = '\0';
     573                 : 
     574               0 :   for (i = 0; i < word_size + 5; i++)
     575               0 :     hyphens[i] = '0';
     576                 : 
     577                 : #ifdef VERBOSE
     578                 :   printf ("prep_word = %s\n", prep_word);
     579                 : #endif
     580                 : 
     581                 :   /* now, run the finite state machine */
     582               0 :   state = 0;
     583               0 :   for (i = 0; i < j; i++)
     584                 :     {
     585               0 :       ch = prep_word[i];
     586                 :       for (;;)
     587                 :         {
     588                 : 
     589               0 :           if (state == -1) {
     590                 :             /* return 1; */
     591                 :             /*  KBH: FIXME shouldn't this be as follows? */
     592               0 :             state = 0;
     593               0 :             goto try_next_letter;
     594                 :           }          
     595                 : 
     596                 : #ifdef VERBOSE
     597                 :           char *state_str;
     598                 :           state_str = get_state_str (state, 0);
     599                 : 
     600                 :           for (k = 0; k < i - strlen (state_str); k++)
     601                 :             putchar (' ');
     602                 :           printf ("%s", state_str);
     603                 : #endif
     604                 : 
     605               0 :           hstate = &dict->states[state];
     606               0 :           for (k = 0; k < hstate->num_trans; k++)
     607               0 :             if (hstate->trans[k].ch == ch)
     608                 :               {
     609               0 :                 state = hstate->trans[k].new_state;
     610               0 :                 goto found_state;
     611                 :               }
     612               0 :           state = hstate->fallback_state;
     613                 : #ifdef VERBOSE
     614                 :           printf (" falling back, fallback_state %d\n", state);
     615                 : #endif
     616               0 :         }
     617                 :     found_state:
     618                 : #ifdef VERBOSE
     619                 :       printf ("found state %d\n",state);
     620                 : #endif
     621                 :       /* Additional optimization is possible here - especially,
     622                 :          elimination of trailing zeroes from the match. Leading zeroes
     623                 :          have already been optimized. */
     624               0 :       match = dict->states[state].match;
     625                 :       /* replacing rules not handled by hyphen_hyphenate() */
     626               0 :       if (match && !dict->states[state].repl)
     627                 :         {
     628               0 :           offset = i + 1 - strlen (match);
     629                 : #ifdef VERBOSE
     630                 :           for (k = 0; k < offset; k++)
     631                 :             putchar (' ');
     632                 :           printf ("%s\n", match);
     633                 : #endif
     634                 :           /* This is a linear search because I tried a binary search and
     635                 :              found it to be just a teeny bit slower. */
     636               0 :           for (k = 0; match[k]; k++)
     637               0 :             if (hyphens[offset + k] < match[k])
     638               0 :               hyphens[offset + k] = match[k];
     639                 :         }
     640                 : 
     641                 :       /* KBH: we need this to make sure we keep looking in a word */
     642                 :       /* for patterns even if the current character is not known in state 0 */
     643                 :       /* since patterns for hyphenation may occur anywhere in the word */
     644                 :       try_next_letter: ;
     645                 : 
     646                 :     }
     647                 : #ifdef VERBOSE
     648                 :   for (i = 0; i < j; i++)
     649                 :     putchar (hyphens[i]);
     650                 :   putchar ('\n');
     651                 : #endif
     652                 : 
     653               0 :   for (i = 0; i < j - 4; i++)
     654                 : #if 0
     655                 :     if (hyphens[i + 1] & 1)
     656                 :       hyphens[i] = '-';
     657                 : #else
     658               0 :     hyphens[i] = hyphens[i + 1];
     659                 : #endif
     660               0 :   hyphens[0] = '0';
     661               0 :   for (; i < word_size; i++)
     662               0 :     hyphens[i] = '0';
     663               0 :   hyphens[word_size] = '\0';
     664                 : 
     665               0 :   if (prep_word != prep_word_buf)
     666               0 :     hnj_free (prep_word);
     667                 :     
     668               0 :   return 0;    
     669                 : }
     670                 : 
     671                 : /* Unicode ligature length */
     672               0 : int hnj_ligature(unsigned char c) {
     673               0 :     switch (c) {
     674                 :         case 0x80:                      /* ff */
     675                 :         case 0x81:                      /* fi */
     676               0 :         case 0x82: return LIG_xx;       /* fl */
     677                 :         case 0x83:                      /* ffi */
     678               0 :         case 0x84: return LIG_xxx;      /* ffl */
     679                 :         case 0x85:                      /* long st */
     680               0 :         case 0x86: return LIG_xx;       /* st */
     681                 :     }
     682               0 :     return 0;
     683                 : }
     684                 : 
     685                 : /* character length of the first n byte of the input word */
     686               0 : int hnj_hyphen_strnlen(const char * word, int n, int utf8)
     687                 : {
     688               0 :     int i = 0;
     689               0 :     int j = 0;
     690               0 :     while (j < n && word[j] != '\0') {
     691               0 :       i++;
     692                 :       // Unicode ligature support
     693               0 :       if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {
     694               0 :         i += hnj_ligature(word[j + 2]);
     695                 :       }
     696               0 :       for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
     697                 :     }
     698               0 :     return i;
     699                 : }
     700                 : 
     701               0 : int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
     702                 :         char *** rep, int ** pos, int ** cut, int lhmin)
     703                 : {
     704               0 :     int i = 1, j;
     705                 : 
     706                 :     // Unicode ligature support
     707               0 :     if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC))  {
     708               0 :       i += hnj_ligature(word[2]);
     709                 :     }
     710                 : 
     711                 :     // ignore numbers
     712               0 :     for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--;
     713                 : 
     714               0 :     for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
     715                 :       // check length of the non-standard part
     716               0 :       if (*rep && *pos && *cut && (*rep)[j]) {
     717               0 :         char * rh = strchr((*rep)[j], '=');
     718               0 :         if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
     719               0 :           hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
     720               0 :             free((*rep)[j]);
     721               0 :             (*rep)[j] = NULL;
     722               0 :             hyphens[j] = '0';
     723                 :           }
     724                 :        } else {
     725               0 :          hyphens[j] = '0';
     726                 :        }
     727               0 :        j++;
     728                 : 
     729                 :        // Unicode ligature support
     730               0 :        if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {
     731               0 :          i += hnj_ligature(word[j + 2]);
     732                 :        }
     733               0 :     } while (utf8 && (word[j] & 0xc0) == 0x80);
     734               0 :     return 0;
     735                 : }
     736                 : 
     737               0 : int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
     738                 :         char *** rep, int ** pos, int ** cut, int rhmin)
     739                 : {
     740               0 :     int i = 1;
     741                 :     int j;
     742                 : 
     743                 :     // ignore numbers
     744               0 :     for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--;
     745                 : 
     746               0 :     for (j = word_size - 2; i < rhmin && j > 0; j--) {
     747                 :       // check length of the non-standard part
     748               0 :       if (*rep && *pos && *cut && (*rep)[j]) {
     749               0 :         char * rh = strchr((*rep)[j], '=');
     750               0 :         if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
     751               0 :           hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
     752               0 :             free((*rep)[j]);
     753               0 :             (*rep)[j] = NULL;
     754               0 :             hyphens[j] = '0';
     755                 :           }
     756                 :        } else {
     757               0 :          hyphens[j] = '0';
     758                 :        }
     759               0 :        if (!utf8 || (word[j] & 0xc0) != 0xc0) i++;
     760                 :     }
     761               0 :     return 0;
     762                 : }
     763                 : 
     764                 : // recursive function for compound level hyphenation
     765               0 : int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
     766                 :     char * hyphens, char *** rep, int ** pos, int ** cut,
     767                 :     int clhmin, int crhmin, int lend, int rend)
     768                 : {
     769                 :   char prep_word_buf[MAX_WORD];
     770                 :   char *prep_word;
     771                 :   int i, j, k;
     772                 :   int state;
     773                 :   char ch;
     774                 :   HyphenState *hstate;
     775                 :   char *match;
     776                 :   char *repl;
     777                 :   signed char replindex;
     778                 :   signed char replcut;
     779                 :   int offset;
     780                 :   int matchlen_buf[MAX_CHARS];
     781                 :   int matchindex_buf[MAX_CHARS];
     782                 :   char * matchrepl_buf[MAX_CHARS];
     783                 :   int * matchlen;
     784                 :   int * matchindex;
     785                 :   char ** matchrepl;  
     786               0 :   int isrepl = 0;
     787                 :   int nHyphCount;
     788                 : 
     789               0 :   if (word_size + 3 < MAX_CHARS) {
     790               0 :     prep_word = prep_word_buf;
     791               0 :     matchlen = matchlen_buf;
     792               0 :     matchindex = matchindex_buf;
     793               0 :     matchrepl = matchrepl_buf;
     794                 :   } else {
     795               0 :     prep_word = hnj_malloc (word_size + 3);
     796               0 :     matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
     797               0 :     matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
     798               0 :     matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
     799                 :   }
     800                 : 
     801               0 :   j = 0;
     802               0 :   prep_word[j++] = '.';
     803                 :   
     804               0 :   for (i = 0; i < word_size; i++) {
     805               0 :     if (word[i] <= '9' && word[i] >= '0') {
     806               0 :       prep_word[j++] = '.';
     807                 :     } else {
     808               0 :       prep_word[j++] = word[i];
     809                 :     }
     810                 :   }
     811                 : 
     812                 : 
     813                 : 
     814               0 :   prep_word[j++] = '.';
     815               0 :   prep_word[j] = '\0';
     816                 : 
     817               0 :   for (i = 0; i < j; i++)
     818               0 :     hyphens[i] = '0';    
     819                 : 
     820                 : #ifdef VERBOSE
     821                 :   printf ("prep_word = %s\n", prep_word);
     822                 : #endif
     823                 : 
     824                 :   /* now, run the finite state machine */
     825               0 :   state = 0;
     826               0 :   for (i = 0; i < j; i++)
     827                 :     {
     828               0 :       ch = prep_word[i];
     829                 :       for (;;)
     830                 :         {
     831                 : 
     832               0 :           if (state == -1) {
     833                 :             /* return 1; */
     834                 :             /*  KBH: FIXME shouldn't this be as follows? */
     835               0 :             state = 0;
     836               0 :             goto try_next_letter;
     837                 :           }          
     838                 : 
     839                 : #ifdef VERBOSE
     840                 :           char *state_str;
     841                 :           state_str = get_state_str (state, 1);
     842                 : 
     843                 :           for (k = 0; k < i - strlen (state_str); k++)
     844                 :             putchar (' ');
     845                 :           printf ("%s", state_str);
     846                 : #endif
     847                 : 
     848               0 :           hstate = &dict->states[state];
     849               0 :           for (k = 0; k < hstate->num_trans; k++)
     850               0 :             if (hstate->trans[k].ch == ch)
     851                 :               {
     852               0 :                 state = hstate->trans[k].new_state;
     853               0 :                 goto found_state;
     854                 :               }
     855               0 :           state = hstate->fallback_state;
     856                 : #ifdef VERBOSE
     857                 :           printf (" falling back, fallback_state %d\n", state);
     858                 : #endif
     859               0 :         }
     860                 :     found_state:
     861                 : #ifdef VERBOSE
     862                 :       printf ("found state %d\n",state);
     863                 : #endif
     864                 :       /* Additional optimization is possible here - especially,
     865                 :          elimination of trailing zeroes from the match. Leading zeroes
     866                 :          have already been optimized. */
     867               0 :       match = dict->states[state].match;
     868               0 :       repl = dict->states[state].repl;
     869               0 :       replindex = dict->states[state].replindex;
     870               0 :       replcut = dict->states[state].replcut;
     871                 :       /* replacing rules not handled by hyphen_hyphenate() */
     872               0 :       if (match)
     873                 :         {
     874               0 :           offset = i + 1 - strlen (match);
     875                 : #ifdef VERBOSE
     876                 :           for (k = 0; k < offset; k++)
     877                 :             putchar (' ');
     878                 :           printf ("%s (%s)\n", match, repl);
     879                 : #endif
     880               0 :           if (repl) {
     881               0 :             if (!isrepl) for(; isrepl < word_size; isrepl++) {
     882               0 :                 matchrepl[isrepl] = NULL;
     883               0 :                 matchindex[isrepl] = -1;
     884                 :             }
     885               0 :             matchlen[offset + replindex] = replcut;
     886                 :           }
     887                 :           /* This is a linear search because I tried a binary search and
     888                 :              found it to be just a teeny bit slower. */
     889               0 :           for (k = 0; match[k]; k++) {
     890               0 :             if ((hyphens[offset + k] < match[k])) {
     891               0 :               hyphens[offset + k] = match[k];
     892               0 :               if (match[k]&1) {
     893               0 :                 matchrepl[offset + k] = repl;
     894               0 :                 if (repl && (k >= replindex) && (k <= replindex + replcut)) {
     895               0 :                     matchindex[offset + replindex] = offset + k;
     896                 :                 }
     897                 :               }
     898                 :             }
     899                 :           }
     900                 :           
     901                 :         }
     902                 : 
     903                 :       /* KBH: we need this to make sure we keep looking in a word */
     904                 :       /* for patterns even if the current character is not known in state 0 */
     905                 :       /* since patterns for hyphenation may occur anywhere in the word */
     906                 :       try_next_letter: ;
     907                 : 
     908                 :     }
     909                 : #ifdef VERBOSE
     910                 :   for (i = 0; i < j; i++)
     911                 :     putchar (hyphens[i]);
     912                 :   putchar ('\n');
     913                 : #endif
     914                 : 
     915               0 :   for (i = 0; i < j - 3; i++)
     916                 : #if 0
     917                 :     if (hyphens[i + 1] & 1)
     918                 :       hyphens[i] = '-';
     919                 : #else
     920               0 :     hyphens[i] = hyphens[i + 1];
     921                 : #endif
     922               0 :   for (; i < word_size; i++)
     923               0 :     hyphens[i] = '0';
     924               0 :   hyphens[word_size] = '\0';
     925                 : 
     926                 :        /* now create a new char string showing hyphenation positions */
     927                 :        /* count the hyphens and allocate space for the new hyphenated string */
     928               0 :        nHyphCount = 0;
     929               0 :        for (i = 0; i < word_size; i++)
     930               0 :           if (hyphens[i]&1)
     931               0 :              nHyphCount++;
     932               0 :        j = 0;
     933               0 :        for (i = 0; i < word_size; i++) {
     934               0 :            if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { 
     935               0 :                 if (rep && pos && cut) {
     936               0 :                     if (!*rep && !*pos && !*cut) {
     937                 :                         int k;
     938               0 :                         *rep = (char **) malloc(sizeof(char *) * word_size);
     939               0 :                         *pos = (int *) malloc(sizeof(int) * word_size);
     940               0 :                         *cut = (int *) malloc(sizeof(int) * word_size);
     941               0 :                         for (k = 0; k < word_size; k++) {
     942               0 :                             (*rep)[k] = NULL;
     943               0 :                             (*pos)[k] = 0;
     944               0 :                             (*cut)[k] = 0;
     945                 :                         }
     946                 :                     }
     947               0 :                     (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
     948               0 :                     (*pos)[matchindex[i] - 1] = matchindex[i] - i;
     949               0 :                     (*cut)[matchindex[i] - 1] = matchlen[i];
     950                 :                 }
     951               0 :                 j += strlen(matchrepl[matchindex[i]]);
     952               0 :                 i += matchlen[i] - 1;
     953                 :           }
     954                 :        }
     955                 : 
     956               0 :   if (matchrepl != matchrepl_buf) {
     957               0 :     hnj_free (matchrepl);
     958               0 :     hnj_free (matchlen);
     959               0 :     hnj_free (matchindex);
     960                 :   }
     961                 : 
     962                 :   // recursive hyphenation of the first (compound) level segments
     963               0 :   if (dict->nextlevel) {
     964                 :      char * rep2_buf[MAX_WORD];
     965                 :      int pos2_buf[MAX_WORD];
     966                 :      int cut2_buf[MAX_WORD];
     967                 :      char hyphens2_buf[MAX_WORD];
     968                 :      char ** rep2;
     969                 :      int * pos2;
     970                 :      int * cut2;
     971                 :      char * hyphens2;
     972               0 :      int begin = 0;
     973               0 :      if (word_size < MAX_CHARS) {
     974               0 :         rep2 = rep2_buf;
     975               0 :         pos2 = pos2_buf;
     976               0 :         cut2 = cut2_buf;
     977               0 :         hyphens2 = hyphens2_buf;
     978                 :      } else {
     979               0 :         rep2 = hnj_malloc (word_size * sizeof(char *));
     980               0 :         pos2 = hnj_malloc (word_size * sizeof(int));
     981               0 :         cut2 = hnj_malloc (word_size * sizeof(int));
     982               0 :         hyphens2 = hnj_malloc (word_size);
     983                 :      }
     984               0 :      for (i = 0; i < word_size; i++) rep2[i] = NULL;
     985               0 :      for (i = 0; i < word_size; i++) if 
     986               0 :         (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
     987               0 :         if (i - begin > 1) {
     988               0 :             int hyph = 0;
     989               0 :             prep_word[i + 2] = '\0';
     990                 :             /* non-standard hyphenation at compound boundary (Schiffahrt) */
     991               0 :             if (*rep && *pos && *cut && (*rep)[i]) {
     992               0 :                 char * l = strchr((*rep)[i], '=');
     993               0 :                 strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);
     994               0 :                 if (l) {
     995               0 :                     hyph = (l - (*rep)[i]) - (*pos)[i];
     996               0 :                     prep_word[2 + i + hyph] = '\0';
     997                 :                 }
     998                 :             }
     999               0 :             hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
    1000                 :                 hyphens2, &rep2, &pos2, &cut2, clhmin,
    1001               0 :                 crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
    1002               0 :             for (j = 0; j < i - begin - 1; j++) {
    1003               0 :                 hyphens[begin + j] = hyphens2[j];
    1004               0 :                 if (rep2[j] && rep && pos && cut) {
    1005               0 :                     if (!*rep && !*pos && !*cut) {
    1006                 :                         int k;
    1007               0 :                         *rep = (char **) malloc(sizeof(char *) * word_size);
    1008               0 :                         *pos = (int *) malloc(sizeof(int) * word_size);
    1009               0 :                         *cut = (int *) malloc(sizeof(int) * word_size);
    1010               0 :                         for (k = 0; k < word_size; k++) {
    1011               0 :                             (*rep)[k] = NULL;
    1012               0 :                             (*pos)[k] = 0;
    1013               0 :                             (*cut)[k] = 0;
    1014                 :                         }
    1015                 :                     }
    1016               0 :                     (*rep)[begin + j] = rep2[j];
    1017               0 :                     (*pos)[begin + j] = pos2[j];
    1018               0 :                     (*cut)[begin + j] = cut2[j];
    1019                 :                 }
    1020                 :             }
    1021               0 :             prep_word[i + 2] = word[i + 1];
    1022               0 :             if (*rep && *pos && *cut && (*rep)[i]) {
    1023               0 :                 strcpy(prep_word + 1, word);
    1024                 :             }
    1025                 :         }
    1026               0 :         begin = i + 1;
    1027               0 :         for (j = 0; j < word_size; j++) rep2[j] = NULL;
    1028                 :      }
    1029                 :      
    1030                 :      // non-compound
    1031               0 :      if (begin == 0) {
    1032               0 :         hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
    1033                 :             hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
    1034               0 :         if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
    1035                 :             rep, pos, cut, clhmin);
    1036               0 :         if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
    1037                 :             rep, pos, cut, crhmin);
    1038                 :      }
    1039                 :      
    1040               0 :      if (rep2 != rep2_buf) {
    1041               0 :         free(rep2);
    1042               0 :         free(cut2);
    1043               0 :         free(pos2);
    1044               0 :         free(hyphens2);
    1045                 :      }
    1046                 :   }
    1047                 : 
    1048               0 :   if (prep_word != prep_word_buf) hnj_free (prep_word);
    1049               0 :   return 0;
    1050                 : }
    1051                 : 
    1052                 : /* UTF-8 normalization of hyphen and non-standard positions */
    1053               0 : int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
    1054                 :         char *** rep, int ** pos, int ** cut)
    1055                 : {
    1056                 :   int i, j, k;
    1057               0 :   if ((((unsigned char) word[0]) >> 6) == 2) {
    1058               0 :     fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
    1059               0 :     return 1;
    1060                 :   }
    1061                 : 
    1062                 :   /* calculate UTF-8 character positions */
    1063               0 :   for (i = 0, j = -1; i < word_size; i++) {
    1064                 :     /* beginning of an UTF-8 character (not '10' start bits) */
    1065               0 :     if ((((unsigned char) word[i]) >> 6) != 2) j++;
    1066               0 :     hyphens[j] = hyphens[i];
    1067               0 :     if (rep && pos && cut && *rep && *pos && *cut) {
    1068               0 :         int l = (*pos)[i];
    1069               0 :         (*pos)[j] = 0;
    1070               0 :         for (k = 0; k < l; k++) {
    1071               0 :             if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
    1072                 :         }
    1073               0 :         k = i - l + 1;
    1074               0 :         l = k + (*cut)[i];
    1075               0 :         (*cut)[j] = 0;        
    1076               0 :         for (; k < l; k++) {
    1077               0 :             if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
    1078                 :         }
    1079               0 :         (*rep)[j] = (*rep)[i];
    1080               0 :         if (j < i) {
    1081               0 :             (*rep)[i] = NULL;
    1082               0 :             (*pos)[i] = 0;
    1083               0 :             (*cut)[i] = 0;
    1084                 :         }
    1085                 :     }
    1086                 :   }
    1087               0 :   hyphens[j + 1] = '\0';
    1088                 : #ifdef VERBOSE
    1089                 :   printf ("nums: %s\n", hyphens);
    1090                 : #endif
    1091               0 :   return 0;
    1092                 : }
    1093                 : 
    1094                 : /* get the word with all possible hyphenations (output: hyphword) */
    1095               0 : void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, 
    1096                 :     char * hyphword, char *** rep, int ** pos, int ** cut)
    1097                 : {
    1098                 :   int i, j;
    1099               0 :   for (i = 0, j = 0; i < l; i++, j++) {
    1100               0 :     if (hyphens[i]&1) {
    1101               0 :       hyphword[j] = word[i];
    1102               0 :       if (*rep && *pos && *cut && (*rep)[i]) {
    1103               0 :         strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]);
    1104               0 :         j += strlen((*rep)[i]) - (*pos)[i];
    1105               0 :         i += (*cut)[i] - (*pos)[i];
    1106               0 :       } else hyphword[++j] = '=';
    1107               0 :     } else hyphword[j] = word[i];
    1108                 :   }
    1109               0 :   hyphword[j] = '\0';
    1110               0 : }
    1111                 : 
    1112                 : 
    1113                 : /* main api function with default hyphenmin parameters */
    1114               0 : int hnj_hyphen_hyphenate2 (HyphenDict *dict,
    1115                 :                            const char *word, int word_size, char * hyphens,
    1116                 :                            char *hyphword, char *** rep, int ** pos, int ** cut)
    1117                 : {
    1118               0 :   hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
    1119               0 :     dict->clhmin, dict->crhmin, 1, 1);
    1120               0 :   hnj_hyphen_lhmin(dict->utf8, word, word_size,
    1121               0 :     hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
    1122               0 :   hnj_hyphen_rhmin(dict->utf8, word, word_size,
    1123               0 :     hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
    1124                 : 
    1125                 :   /* nohyphen */
    1126               0 :   if (dict->nohyphen) {
    1127               0 :     char * nh = dict->nohyphen;
    1128                 :     int nhi;
    1129               0 :     for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
    1130               0 :         char * nhy = (char *) strstr(word, nh);
    1131               0 :         while (nhy) {
    1132               0 :             hyphens[nhy - word + strlen(nh) - 1] = '0';
    1133               0 :             if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = '0';
    1134               0 :             nhy = (char *) strstr(nhy + 1, nh);
    1135                 :         }
    1136               0 :         nh = nh + strlen(nh) + 1;
    1137                 :     }
    1138                 :   }
    1139                 : 
    1140               0 :   if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
    1141               0 :   if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
    1142                 : #ifdef VERBOSE
    1143                 :   printf ("nums: %s\n", hyphens);
    1144                 : #endif
    1145               0 :   return 0;
    1146                 : }
    1147                 : 
    1148                 : /* previous main api function with hyphenmin parameters */
    1149               0 : int hnj_hyphen_hyphenate3 (HyphenDict *dict,
    1150                 :         const char *word, int word_size, char * hyphens,
    1151                 :         char *hyphword, char *** rep, int ** pos, int ** cut,
    1152                 :         int lhmin, int rhmin, int clhmin, int crhmin)
    1153                 : {
    1154               0 :   lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;
    1155               0 :   rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;
    1156               0 :   clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;
    1157               0 :   crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;
    1158               0 :   hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
    1159                 :     clhmin, crhmin, 1, 1);
    1160               0 :   hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
    1161                 :     rep, pos, cut, (lhmin > 0 ? lhmin : 2));
    1162               0 :   hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
    1163                 :     rep, pos, cut, (rhmin > 0 ? rhmin : 2));
    1164               0 :   if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
    1165                 : 
    1166                 :   /* nohyphen */
    1167               0 :   if (dict->nohyphen) {
    1168               0 :     char * nh = dict->nohyphen;
    1169                 :     int nhi;
    1170               0 :     for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
    1171               0 :         char * nhy = (char *) strstr(word, nh);
    1172               0 :         while (nhy) {
    1173               0 :             hyphens[nhy - word + strlen(nh) - 1] = 0;
    1174               0 :             if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = 0;
    1175               0 :             nhy = (char *) strstr(nhy + 1, nh);
    1176                 :         }
    1177               0 :         nh = nh + strlen(nh) + 1;
    1178                 :     }
    1179                 :   }
    1180                 : 
    1181               0 :   if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
    1182               0 :   return 0;
    1183                 : }

Generated by: LCOV version 1.7