1 : /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
2 : * licenses follows.
3 : */
4 :
5 : /* LibHnj - a library for high quality hyphenation and justification
6 : * Copyright (C) 1998 Raph Levien,
7 : * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
8 : * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
9 : * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
10 : *
11 : * This library is free software; you can redistribute it and/or
12 : * modify it under the terms of the GNU Library General Public
13 : * License as published by the Free Software Foundation; either
14 : * version 2 of the License, or (at your option) any later version.
15 : *
16 : * This library is distributed in the hope that it will be useful,
17 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 : * Library General Public License for more details.
20 : *
21 : * You should have received a copy of the GNU Library General Public
22 : * License along with this library; if not, write to the
23 : * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 : * Boston, MA 02111-1307 USA.
25 : */
26 :
27 : /*
28 : * The contents of this file are subject to the Mozilla Public License
29 : * Version 1.0 (the "MPL"); you may not use this file except in
30 : * compliance with the MPL. You may obtain a copy of the MPL at
31 : * http://www.mozilla.org/MPL/
32 : *
33 : * Software distributed under the MPL is distributed on an "AS IS" basis,
34 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
35 : * for the specific language governing rights and limitations under the
36 : * MPL.
37 : *
38 : */
39 : #include <stdlib.h> /* for NULL, malloc */
40 : #include <stdio.h> /* for fprintf */
41 : #include <string.h> /* for strdup */
42 :
43 : #ifdef UNX
44 : #include <unistd.h> /* for exit */
45 : #endif
46 :
47 : #define noVERBOSE
48 :
49 : /* calculate hyphenmin values with long ligature length (2 or 3 characters
50 : * instead of 1 or 2) for comparison with hyphenation without ligatures */
51 : #define noLONG_LIGATURE
52 :
53 : #ifdef LONG_LIGATURE
54 : #define LIG_xx 1
55 : #define LIG_xxx 2
56 : #else
57 : #define LIG_xx 0
58 : #define LIG_xxx 1
59 : #endif
60 :
61 : #include "hnjalloc.h"
62 : #include "hyphen.h"
63 :
/* Return a freshly allocated copy of the NUL-terminated string s.
 * The buffer comes from hnj_malloc(); other code in this file releases such
 * copies with hnj_free(). */
static char *
hnj_strdup (const char *s)
{
  /* strlen() yields a size_t; keeping the length in that type avoids the
   * truncation an int would cause for very long strings. */
  size_t len = strlen (s);
  char *copy = hnj_malloc (len + 1);

  memcpy (copy, s, len + 1);    /* len + 1 also copies the terminating NUL */
  return copy;
}
76 :
/* Remove one trailing line terminator from s, in place.
 *
 * A final '\n' or '\r' is cut off; if the character in front of it is a
 * '\r' (as in "\r\n") that one is cut off as well.  Strings that do not end
 * in a line terminator are left untouched: the '\r' check is only made
 * after a terminator has actually been removed, so text such as "x\ry" is
 * no longer truncated. */
void hnj_strchomp(char * s)
{
  size_t k = strlen(s);

  if (k > 0 && (s[k - 1] == '\r' || s[k - 1] == '\n')) {
    s[--k] = '\0';
    if (k > 0 && s[k - 1] == '\r') s[k - 1] = '\0';
  }
}
84 :
/* Minimal chained hash table used while a dictionary is being loaded.
   Keys are strings, values are state numbers. */

/* Number of buckets; the table has a fixed size and never grows. */
#define HASH_SIZE 31627

typedef struct _HashEntry HashEntry;
typedef struct _HashTab HashTab;

/* One key/value pair; entries that share a bucket are linked through next. */
struct _HashEntry {
  HashEntry *next;
  char *key;
  int val;
};

/* The table itself: one list head per bucket. */
struct _HashTab {
  HashEntry *entries[HASH_SIZE];
};
103 :
/* String hash (the ASU char* hash, adapted from Gtk+).
 * Returns the raw hash value; callers reduce it modulo the table size. */
static unsigned int
hnj_string_hash (const char *s)
{
  unsigned int hash = 0;

  while (*s != '\0')
    {
      unsigned int top;

      hash = (hash << 4) + *s++;
      /* fold the uppermost four bits back into the low byte, then clear them */
      top = hash & 0xf0000000;
      if (top != 0)
        hash ^= (top >> 24) ^ top;
    }
  return hash;
}
119 :
120 : static HashTab *
121 0 : hnj_hash_new (void)
122 : {
123 : HashTab *hashtab;
124 : int i;
125 :
126 0 : hashtab = hnj_malloc (sizeof(HashTab));
127 0 : for (i = 0; i < HASH_SIZE; i++)
128 0 : hashtab->entries[i] = NULL;
129 :
130 0 : return hashtab;
131 : }
132 :
133 : static void
134 0 : hnj_hash_free (HashTab *hashtab)
135 : {
136 : int i;
137 : HashEntry *e, *next;
138 :
139 0 : for (i = 0; i < HASH_SIZE; i++)
140 0 : for (e = hashtab->entries[i]; e; e = next)
141 : {
142 0 : next = e->next;
143 0 : hnj_free (e->key);
144 0 : hnj_free (e);
145 : }
146 :
147 0 : hnj_free (hashtab);
148 0 : }
149 :
150 : /* assumes that key is not already present! */
151 : static void
152 0 : hnj_hash_insert (HashTab *hashtab, const char *key, int val)
153 : {
154 : int i;
155 : HashEntry *e;
156 :
157 0 : i = hnj_string_hash (key) % HASH_SIZE;
158 0 : e = hnj_malloc (sizeof(HashEntry));
159 0 : e->next = hashtab->entries[i];
160 0 : e->key = hnj_strdup (key);
161 0 : e->val = val;
162 0 : hashtab->entries[i] = e;
163 0 : }
164 :
165 : /* return val if found, otherwise -1 */
166 : static int
167 0 : hnj_hash_lookup (HashTab *hashtab, const char *key)
168 : {
169 : int i;
170 : HashEntry *e;
171 0 : i = hnj_string_hash (key) % HASH_SIZE;
172 0 : for (e = hashtab->entries[i]; e; e = e->next)
173 0 : if (!strcmp (key, e->key))
174 0 : return e->val;
175 0 : return -1;
176 : }
177 :
178 : /* Get the state number, allocating a new state if necessary. */
179 : static int
180 0 : hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
181 : {
182 : int state_num;
183 :
184 0 : state_num = hnj_hash_lookup (hashtab, string);
185 :
186 0 : if (state_num >= 0)
187 0 : return state_num;
188 :
189 0 : hnj_hash_insert (hashtab, string, dict->num_states);
190 : /* predicate is true if dict->num_states is a power of two */
191 0 : if (!(dict->num_states & (dict->num_states - 1)))
192 : {
193 0 : dict->states = hnj_realloc (dict->states,
194 : (dict->num_states << 1) *
195 : sizeof(HyphenState));
196 : }
197 0 : dict->states[dict->num_states].match = NULL;
198 0 : dict->states[dict->num_states].repl = NULL;
199 0 : dict->states[dict->num_states].fallback_state = -1;
200 0 : dict->states[dict->num_states].num_trans = 0;
201 0 : dict->states[dict->num_states].trans = NULL;
202 0 : return dict->num_states++;
203 : }
204 :
205 : /* add a transition from state1 to state2 through ch - assumes that the
206 : transition does not already exist */
207 : static void
208 0 : hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
209 : {
210 : int num_trans;
211 :
212 0 : num_trans = dict->states[state1].num_trans;
213 0 : if (num_trans == 0)
214 : {
215 0 : dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
216 : }
217 0 : else if (!(num_trans & (num_trans - 1)))
218 : {
219 0 : dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
220 : (num_trans << 1) *
221 : sizeof(HyphenTrans));
222 : }
223 0 : dict->states[state1].trans[num_trans].ch = ch;
224 0 : dict->states[state1].trans[num_trans].new_state = state2;
225 0 : dict->states[state1].num_trans++;
226 0 : }
227 :
#ifdef VERBOSE
/* Debug builds only: the hash table of each dictionary level is kept so
   that state numbers can be printed as strings.  hnj_hyphen_load() stores
   into global[0] and global[1] (and may swap the two), so the array needs
   two slots; with a single slot the second store was out of bounds. */
HashTab *global[2];

/* Return the key whose value is state in the table of the given level, or
   NULL when no entry has that value.  This is a linear scan of the table. */
static char *
get_state_str (int state, int level)
{
  int i;
  HashEntry *e;

  for (i = 0; i < HASH_SIZE; i++)
    for (e = global[level]->entries[i]; e; e = e->next)
      if (e->val == state)
        return e->key;
  return NULL;
}
#endif
244 :
245 0 : void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
246 : int i, j;
247 : char word[MAX_CHARS];
248 : char pattern[MAX_CHARS];
249 : char * repl;
250 : signed char replindex;
251 : signed char replcut;
252 0 : int state_num = 0;
253 : int last_state;
254 : char ch;
255 : int found;
256 :
257 0 : if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
258 0 : dict->lhmin = atoi(buf + 13);
259 0 : return;
260 0 : } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
261 0 : dict->rhmin = atoi(buf + 14);
262 0 : return;
263 0 : } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
264 0 : dict->clhmin = atoi(buf + 21);
265 0 : return;
266 0 : } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
267 0 : dict->crhmin = atoi(buf + 22);
268 0 : return;
269 0 : } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
270 0 : char * space = buf + 8;
271 0 : while (*space != '\0' && (*space == ' ' || *space == '\t')) space++;
272 0 : if (*buf != '\0') dict->nohyphen = hnj_strdup(space);
273 0 : if (dict->nohyphen) {
274 0 : char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1;
275 0 : *nhe = 0;
276 0 : for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) {
277 0 : if (*nhe == ',') {
278 0 : dict->nohyphenl++;
279 0 : *nhe = 0;
280 : }
281 : }
282 : }
283 0 : return;
284 : }
285 0 : j = 0;
286 0 : pattern[j] = '0';
287 0 : repl = strchr(buf, '/');
288 0 : replindex = 0;
289 0 : replcut = 0;
290 0 : if (repl) {
291 0 : char * index = strchr(repl + 1, ',');
292 0 : *repl = '\0';
293 0 : if (index) {
294 0 : char * index2 = strchr(index + 1, ',');
295 0 : *index = '\0';
296 0 : if (index2) {
297 0 : *index2 = '\0';
298 0 : replindex = (signed char) atoi(index + 1) - 1;
299 0 : replcut = (signed char) atoi(index2 + 1);
300 : }
301 : } else {
302 0 : hnj_strchomp(repl + 1);
303 0 : replindex = 0;
304 0 : replcut = (signed char) strlen(buf);
305 : }
306 0 : repl = hnj_strdup(repl + 1);
307 : }
308 0 : for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
309 : {
310 0 : if (buf[i] >= '0' && buf[i] <= '9')
311 0 : pattern[j] = buf[i];
312 : else
313 : {
314 0 : word[j] = buf[i];
315 0 : pattern[++j] = '0';
316 : }
317 : }
318 0 : word[j] = '\0';
319 0 : pattern[j + 1] = '\0';
320 :
321 0 : i = 0;
322 0 : if (!repl) {
323 : /* Optimize away leading zeroes */
324 0 : for (; pattern[i] == '0'; i++);
325 : } else {
326 0 : if (*word == '.') i++;
327 : /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
328 0 : if (dict->utf8) {
329 0 : int pu = -1; /* unicode character position */
330 0 : int ps = -1; /* unicode start position (original replindex) */
331 0 : int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
332 0 : for (; pc < (strlen(word) + 1); pc++) {
333 : /* beginning of an UTF-8 character (not '10' start bits) */
334 0 : if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
335 0 : if ((ps < 0) && (replindex == pu)) {
336 0 : ps = replindex;
337 0 : replindex = (signed char) pc;
338 : }
339 0 : if ((ps >= 0) && ((pu - ps) == replcut)) {
340 0 : replcut = (signed char) (pc - replindex);
341 0 : break;
342 : }
343 : }
344 0 : if (*word == '.') replindex--;
345 : }
346 : }
347 :
348 : #ifdef VERBOSE
349 : printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl);
350 : #endif
351 0 : found = hnj_hash_lookup (hashtab, word);
352 0 : state_num = hnj_get_state (dict, hashtab, word);
353 0 : dict->states[state_num].match = hnj_strdup (pattern + i);
354 0 : dict->states[state_num].repl = repl;
355 0 : dict->states[state_num].replindex = replindex;
356 0 : if (!replcut) {
357 0 : dict->states[state_num].replcut = (signed char) strlen(word);
358 : } else {
359 0 : dict->states[state_num].replcut = replcut;
360 : }
361 :
362 : /* now, put in the prefix transitions */
363 0 : for (; found < 0 ;j--)
364 : {
365 0 : last_state = state_num;
366 0 : ch = word[j - 1];
367 0 : word[j - 1] = '\0';
368 0 : found = hnj_hash_lookup (hashtab, word);
369 0 : state_num = hnj_get_state (dict, hashtab, word);
370 0 : hnj_add_trans (dict, state_num, last_state, ch);
371 : }
372 : }
373 :
/* Load a hyphenation dictionary from the file named fn.
 *
 * The first line of the file is taken as the character set name; the
 * dictionary is flagged as UTF-8 when that line equals "UTF-8".  The
 * following lines are handed to hnj_hyphen_load_line(), except lines that
 * start with '%'.  A line starting with "NEXTLEVEL" ends the first level;
 * the rest of the file is then loaded into a second level.
 *
 * Returns NULL when fn cannot be opened.  Otherwise:
 *  - with NEXTLEVEL in the file: the first level, whose nextlevel member
 *    points to the second level;
 *  - without it: a built-in default level (hyphen and apostrophe rules)
 *    whose nextlevel member points to the level read from the file.
 * The result is released with hnj_hyphen_free().
 */
374 : HyphenDict *
375 0 : hnj_hyphen_load (const char *fn)
376 : {
377 : HyphenDict *dict[2];
378 : HashTab *hashtab;
379 : FILE *f;
380 : char buf[MAX_CHARS];
381 0 : int nextlevel = 0;
382 : int i, j, k;
383 : HashEntry *e;
384 0 : int state_num = 0;
385 :
386 0 : f = fopen (fn, "r");
387 0 : if (f == NULL)
388 0 : return NULL;
389 :
390 : // loading one or two dictionaries (separated by NEXTLEVEL keyword)
391 0 : for (k = 0; k < 2; k++) {
392 0 : hashtab = hnj_hash_new ();
393 : #ifdef VERBOSE
394 : global[k] = hashtab;
395 : #endif
/* state 0 is the empty string; it is registered before any pattern */
396 0 : hnj_hash_insert (hashtab, "", 0);
397 0 : dict[k] = hnj_malloc (sizeof(HyphenDict));
398 0 : dict[k]->num_states = 1;
399 0 : dict[k]->states = hnj_malloc (sizeof(HyphenState));
400 0 : dict[k]->states[0].match = NULL;
401 0 : dict[k]->states[0].repl = NULL;
402 0 : dict[k]->states[0].fallback_state = -1;
403 0 : dict[k]->states[0].num_trans = 0;
404 0 : dict[k]->states[0].trans = NULL;
405 0 : dict[k]->nextlevel = NULL;
406 0 : dict[k]->lhmin = 0;
407 0 : dict[k]->rhmin = 0;
408 0 : dict[k]->clhmin = 0;
409 0 : dict[k]->crhmin = 0;
410 0 : dict[k]->nohyphen = NULL;
411 0 : dict[k]->nohyphenl = 0;
412 :
413 : /* read in character set info */
/* only the first level reads the charset line; the second level copies it */
414 0 : if (k == 0) {
415 0 : for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
416 0 : if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) {
417 0 : for (i=0;i<MAX_NAME;i++)
418 0 : if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
419 0 : dict[k]->cset[i] = 0;
420 : } else {
421 0 : dict[k]->cset[0] = 0;
422 : }
423 0 : dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
424 : } else {
425 0 : strcpy(dict[k]->cset, dict[0]->cset);
426 0 : dict[k]->utf8 = dict[0]->utf8;
427 : }
428 :
/* k == 0 always reads from the file; k == 1 reads from it only when the
   first pass stopped at a NEXTLEVEL line */
429 0 : if (k == 0 || nextlevel) {
430 0 : while (fgets (buf, sizeof(buf), f) != NULL) {
431 0 : if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
432 0 : nextlevel = 1;
433 0 : break;
434 0 : } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
435 : }
436 0 : } else if (k == 1) {
437 : /* default first level: hyphen and ASCII apostrophe */
/* The "1-1/=,1,1" rule goes through buf because hnj_hyphen_load_line()
   writes into a line that contains '/'; the other rules are passed as
   string literals. */
438 0 : if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN '\n", dict[k], hashtab);
439 0 : else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99\n", dict[k], hashtab);
440 0 : strcpy(buf, "1-1/=,1,1\n"); // buf rewritten by hnj_hyphen_load here
441 0 : hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
442 0 : hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
443 0 : if (dict[0]->utf8) {
444 0 : hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
445 0 : hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
446 : }
447 : }
448 :
449 : /* Could do unioning of matches here (instead of the preprocessor script).
450 : If we did, the pseudocode would look something like this:
451 :
452 : foreach state in the hash table
453 : foreach i = [1..length(state) - 1]
454 : state to check is substr (state, i)
455 : look it up
456 : if found, and if there is a match, union the match in.
457 :
458 : It's also possible to avoid the quadratic blowup by doing the
459 : search in order of increasing state string sizes - then you
460 : can break the loop after finding the first match.
461 :
462 : This step should be optional in any case - if there is a
463 : preprocessed rule table, it's always faster to use that.
464 :
465 : */
466 :
467 : /* put in the fallback states */
/* For every non-empty key the inner loop tries its suffixes key+1, key+2,
   ... and stops at the first one that is a known state, i.e. the longest
   proper suffix in the table.  The empty string was inserted above, so the
   search ends at the latest when key + j is "". */
468 0 : for (i = 0; i < HASH_SIZE; i++)
469 0 : for (e = hashtab->entries[i]; e; e = e->next)
470 : {
471 0 : if (*(e->key)) for (j = 1; 1; j++)
472 : {
473 0 : state_num = hnj_hash_lookup (hashtab, e->key + j);
474 0 : if (state_num >= 0)
475 0 : break;
476 0 : }
477 : /* KBH: FIXME state 0 fallback_state should always be -1? */
478 0 : if (e->val)
479 0 : dict[k]->states[e->val].fallback_state = state_num;
480 : }
481 : #ifdef VERBOSE
482 : for (i = 0; i < HASH_SIZE; i++)
483 : for (e = hashtab->entries[i]; e; e = e->next)
484 : {
485 : printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
486 : dict[k]->states[e->val].fallback_state);
487 : for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
488 : printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
489 : dict[k]->states[e->val].trans[j].new_state);
490 : }
491 : #endif
492 :
/* the hash table is only needed while loading; VERBOSE builds keep it for
   get_state_str() */
493 : #ifndef VERBOSE
494 0 : hnj_hash_free (hashtab);
495 : #endif
496 0 : state_num = 0;
497 : }
498 0 : fclose(f);
/* Without NEXTLEVEL the built-in level dict[1] becomes the top level and the
   file's level hangs below it.  It inherits lhmin/rhmin; its compound
   minimums fall back to lhmin/rhmin and then to 3 when they are 0. */
499 0 : if (nextlevel) dict[0]->nextlevel = dict[1];
500 : else {
501 0 : dict[1] -> nextlevel = dict[0];
502 0 : dict[1]->lhmin = dict[0]->lhmin;
503 0 : dict[1]->rhmin = dict[0]->rhmin;
504 0 : dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
505 0 : dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
506 : #ifdef VERBOSE
507 : HashTab *r = global[0];
508 : global[0] = global[1];
509 : global[1] = r;
510 : #endif
511 0 : return dict[1];
512 : }
513 0 : return dict[0];
514 : }
515 :
516 0 : void hnj_hyphen_free (HyphenDict *dict)
517 : {
518 : int state_num;
519 : HyphenState *hstate;
520 :
521 0 : for (state_num = 0; state_num < dict->num_states; state_num++)
522 : {
523 0 : hstate = &dict->states[state_num];
524 0 : if (hstate->match)
525 0 : hnj_free (hstate->match);
526 0 : if (hstate->repl)
527 0 : hnj_free (hstate->repl);
528 0 : if (hstate->trans)
529 0 : hnj_free (hstate->trans);
530 : }
531 0 : if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
532 :
533 0 : if (dict->nohyphen) hnj_free(dict->nohyphen);
534 :
535 0 : hnj_free (dict->states);
536 :
537 0 : hnj_free (dict);
538 0 : }
539 :
/* Words whose length plus 3 stays below this size are processed in a stack
   buffer; longer ones use a heap buffer. */
540 : #define MAX_WORD 256
541 :
/* Compute hyphenation values for word with the patterns of dict.
 *
 * word       input text; word_size bytes are read.
 * hyphens    output buffer.  word_size + 5 bytes are written while working,
 *            so it must be at least that large.  On return it holds
 *            word_size digit characters followed by a NUL.
 *
 * Patterns that carry a replacement string are skipped here.
 * Returns 0 on every path.
 */
542 0 : int hnj_hyphen_hyphenate (HyphenDict *dict,
543 : const char *word, int word_size,
544 : char *hyphens)
545 : {
546 : char prep_word_buf[MAX_WORD];
547 : char *prep_word;
548 : int i, j, k;
549 : int state;
550 : char ch;
551 : HyphenState *hstate;
552 : char *match;
553 : int offset;
554 :
/* prep_word needs word_size + 3 bytes: the word, a '.' on each side and a
   terminating NUL */
555 0 : if (word_size + 3 < MAX_WORD)
556 0 : prep_word = prep_word_buf;
557 : else
558 0 : prep_word = hnj_malloc (word_size + 3);
559 :
560 0 : j = 0;
561 0 : prep_word[j++] = '.';
562 :
/* digits in the input are replaced by '.' */
563 0 : for (i = 0; i < word_size; i++) {
564 0 : if (word[i] <= '9' && word[i] >= '0') {
565 0 : prep_word[j++] = '.';
566 : } else {
567 0 : prep_word[j++] = word[i];
568 : }
569 : }
570 :
571 0 : prep_word[j++] = '.';
572 0 : prep_word[j] = '\0';
573 :
574 0 : for (i = 0; i < word_size + 5; i++)
575 0 : hyphens[i] = '0';
576 :
577 : #ifdef VERBOSE
578 : printf ("prep_word = %s\n", prep_word);
579 : #endif
580 :
581 : /* now, run the finite state machine */
/* For each character the inner loop follows fallback states until a state
   with a transition on ch is found; reaching fallback -1 resets to state 0
   and moves on to the next character. */
582 0 : state = 0;
583 0 : for (i = 0; i < j; i++)
584 : {
585 0 : ch = prep_word[i];
586 : for (;;)
587 : {
588 :
589 0 : if (state == -1) {
590 : /* return 1; */
591 : /* KBH: FIXME shouldn't this be as follows? */
592 0 : state = 0;
593 0 : goto try_next_letter;
594 : }
595 :
596 : #ifdef VERBOSE
597 : char *state_str;
598 : state_str = get_state_str (state, 0);
599 :
600 : for (k = 0; k < i - strlen (state_str); k++)
601 : putchar (' ');
602 : printf ("%s", state_str);
603 : #endif
604 :
605 0 : hstate = &dict->states[state];
606 0 : for (k = 0; k < hstate->num_trans; k++)
607 0 : if (hstate->trans[k].ch == ch)
608 : {
609 0 : state = hstate->trans[k].new_state;
610 0 : goto found_state;
611 : }
612 0 : state = hstate->fallback_state;
613 : #ifdef VERBOSE
614 : printf (" falling back, fallback_state %d\n", state);
615 : #endif
616 0 : }
617 : found_state:
618 : #ifdef VERBOSE
619 : printf ("found state %d\n",state);
620 : #endif
621 : /* Additional optimization is possible here - especially,
622 : elimination of trailing zeroes from the match. Leading zeroes
623 : have already been optimized. */
624 0 : match = dict->states[state].match;
625 : /* replacing rules not handled by hyphen_hyphenate() */
626 0 : if (match && !dict->states[state].repl)
627 : {
/* the match string is aligned so that its last digit lands on position i */
628 0 : offset = i + 1 - strlen (match);
629 : #ifdef VERBOSE
630 : for (k = 0; k < offset; k++)
631 : putchar (' ');
632 : printf ("%s\n", match);
633 : #endif
634 : /* This is a linear search because I tried a binary search and
635 : found it to be just a teeny bit slower. */
/* each position keeps the largest digit of all patterns that matched */
636 0 : for (k = 0; match[k]; k++)
637 0 : if (hyphens[offset + k] < match[k])
638 0 : hyphens[offset + k] = match[k];
639 : }
640 :
641 : /* KBH: we need this to make sure we keep looking in a word */
642 : /* for patterns even if the current character is not known in state 0 */
643 : /* since patterns for hyphenation may occur anywhere in the word */
644 : try_next_letter: ;
645 :
646 : }
647 : #ifdef VERBOSE
648 : for (i = 0; i < j; i++)
649 : putchar (hyphens[i]);
650 : putchar ('\n');
651 : #endif
652 :
/* Shift the values one position to the left (j is word_size + 2 here),
   force the first one to '0', fill the remaining positions with '0' and
   terminate the string at word_size. */
653 0 : for (i = 0; i < j - 4; i++)
654 : #if 0
655 : if (hyphens[i + 1] & 1)
656 : hyphens[i] = '-';
657 : #else
658 0 : hyphens[i] = hyphens[i + 1];
659 : #endif
660 0 : hyphens[0] = '0';
661 0 : for (; i < word_size; i++)
662 0 : hyphens[i] = '0';
663 0 : hyphens[word_size] = '\0';
664 :
665 0 : if (prep_word != prep_word_buf)
666 0 : hnj_free (prep_word);
667 :
668 0 : return 0;
669 : }
670 :
671 : /* Unicode ligature length */
672 0 : int hnj_ligature(unsigned char c) {
673 0 : switch (c) {
674 : case 0x80: /* ff */
675 : case 0x81: /* fi */
676 0 : case 0x82: return LIG_xx; /* fl */
677 : case 0x83: /* ffi */
678 0 : case 0x84: return LIG_xxx; /* ffl */
679 : case 0x85: /* long st */
680 0 : case 0x86: return LIG_xx; /* st */
681 : }
682 0 : return 0;
683 : }
684 :
/* Number of characters in the first n bytes of word.
 * Counting stops early at a NUL.  With utf8 set, continuation bytes
 * (10xxxxxx) are not counted, and a ligature introduced by 0xEF 0xAC adds
 * the value hnj_ligature() returns for its third byte. */
int hnj_hyphen_strnlen(const char * word, int n, int utf8)
{
  int chars = 0;
  int at = 0;

  while (at < n && word[at] != '\0') {
    chars++;
    /* Unicode ligature support */
    if (utf8 && (unsigned char) word[at] == 0xEF &&
        (unsigned char) word[at + 1] == 0xAC)
      chars += hnj_ligature(word[at + 2]);
    /* step over the lead byte, then over any continuation bytes */
    at++;
    if (utf8)
      while ((word[at] & 0xc0) == 0x80)
        at++;
  }
  return chars;
}
700 :
/* Enforce the minimum number of characters in front of the first hyphen.
 *
 * utf8       non-zero when word is UTF-8 (continuation bytes and ligatures
 *            are then taken into account).
 * word       NUL-terminated word; word_size is not used by this function.
 * hyphens    hyphenation values, cleared to '0' at positions that would
 *            leave fewer than lhmin characters on the left.
 * rep, pos, cut
 *            optional non-standard hyphenation data.  Any of the three
 *            pointers, or the arrays they point to, may be NULL; the word is
 *            then treated as having no replacements (the same test
 *            hnj_hyphen_norm() uses).  A replacement whose left part is
 *            shorter than lhmin is freed and its hyphen cleared.
 * Returns 0.
 */
int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
        char *** rep, int ** pos, int ** cut, int lhmin)
{
  int i = 1, j;
  int have_rep = (rep != NULL) && (pos != NULL) && (cut != NULL);

  (void) word_size;

  /* Unicode ligature support */
  if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) {
    i += hnj_ligature(word[2]);
  }

  /* ignore numbers: leading digits do not count as characters */
  for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--;

  /* One pass of the outer loop handles one character; the inner do-while
     walks over all bytes of that character when the text is UTF-8. */
  for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
    /* check length of the non-standard part */
    if (have_rep && *rep && *pos && *cut && (*rep)[j]) {
      char * rh = strchr((*rep)[j], '=');
      if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
                 hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
        free((*rep)[j]);
        (*rep)[j] = NULL;
        hyphens[j] = '0';
      }
    } else {
      hyphens[j] = '0';
    }
    j++;

    /* Unicode ligature support */
    if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) {
      i += hnj_ligature(word[j + 2]);
    }
  } while (utf8 && (word[j] & 0xc0) == 0x80);
  return 0;
}
736 :
/* Enforce the minimum number of characters behind the last hyphen.
 *
 * utf8       non-zero when word is UTF-8.
 * word       the word, word_size bytes long.
 * hyphens    hyphenation values, cleared to '0' at positions that would
 *            leave fewer than rhmin characters on the right.
 * rep, pos, cut
 *            optional non-standard hyphenation data.  Any of the three
 *            pointers, or the arrays they point to, may be NULL; the word is
 *            then treated as having no replacements.  A replacement whose
 *            right part is shorter than rhmin is freed and its hyphen
 *            cleared.
 * Returns 0.
 *
 * The scan runs from the last byte towards the front.  At byte j, i is the
 * number of characters that start behind j (minus the trailing digits).
 * The counter is advanced only when word[j] starts a character, i.e. is not
 * a UTF-8 continuation byte.  It used to be advanced for every byte that is
 * not a lead byte, which counted a three- or four-byte character as two or
 * three characters and let hyphens closer than rhmin to the end survive.
 */
int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
        char *** rep, int ** pos, int ** cut, int rhmin)
{
  int i = 0;
  int j;
  int have_rep = (rep != NULL) && (pos != NULL) && (cut != NULL);

  /* ignore numbers: trailing digits do not count as characters */
  for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--;

  for (j = word_size - 1; i < rhmin && j > 0; j--) {
    /* check length of the non-standard part */
    if (have_rep && *rep && *pos && *cut && (*rep)[j]) {
      char * rh = strchr((*rep)[j], '=');
      if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
                 hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
        free((*rep)[j]);
        (*rep)[j] = NULL;
        hyphens[j] = '0';
      }
    } else {
      hyphens[j] = '0';
    }
    /* count word[j] once the scan has passed the start of its character */
    if (!utf8 || (word[j] & 0xc0) != 0x80) i++;
  }
  return 0;
}
763 :
764 : // recursive function for compound level hyphenation
/* Hyphenate word with the patterns of dict, including patterns that carry a
 * replacement string, and descend into dict->nextlevel when there is one.
 *
 * word, word_size   input text and its length in bytes.
 * hyphens           output; on return word_size digit characters and a NUL.
 *                   word_size + 2 bytes are written while working.
 * rep, pos, cut     non-standard hyphenation output.  When the three
 *                   pointers are non-NULL and *rep, *pos, *cut are all NULL,
 *                   arrays of word_size elements are allocated with malloc()
 *                   the first time a replacement has to be stored.
 * clhmin, crhmin    minimums passed to hnj_hyphen_lhmin()/hnj_hyphen_rhmin()
 *                   after the next-level pass over an unsplit word.
 * lend, rend        when non-zero the corresponding minimum is not applied.
 * Returns 0 on every path.
 *
 * NOTE(review): several tests below read *rep, *pos and *cut without first
 * checking rep, pos and cut themselves; presumably callers always pass
 * non-NULL pointers when dict->nextlevel is set - confirm.
 */
765 0 : int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
766 : char * hyphens, char *** rep, int ** pos, int ** cut,
767 : int clhmin, int crhmin, int lend, int rend)
768 : {
769 : char prep_word_buf[MAX_WORD];
770 : char *prep_word;
771 : int i, j, k;
772 : int state;
773 : char ch;
774 : HyphenState *hstate;
775 : char *match;
776 : char *repl;
777 : signed char replindex;
778 : signed char replcut;
779 : int offset;
780 : int matchlen_buf[MAX_CHARS];
781 : int matchindex_buf[MAX_CHARS];
782 : char * matchrepl_buf[MAX_CHARS];
783 : int * matchlen;
784 : int * matchindex;
785 : char ** matchrepl;
786 0 : int isrepl = 0;
787 : int nHyphCount;
788 :
/* short words use the stack buffers, longer ones get heap buffers of
   word_size + 3 elements */
789 0 : if (word_size + 3 < MAX_CHARS) {
790 0 : prep_word = prep_word_buf;
791 0 : matchlen = matchlen_buf;
792 0 : matchindex = matchindex_buf;
793 0 : matchrepl = matchrepl_buf;
794 : } else {
795 0 : prep_word = hnj_malloc (word_size + 3);
796 0 : matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
797 0 : matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
798 0 : matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
799 : }
800 :
/* prep_word is '.' + word + '.', with every digit of word replaced by '.' */
801 0 : j = 0;
802 0 : prep_word[j++] = '.';
803 :
804 0 : for (i = 0; i < word_size; i++) {
805 0 : if (word[i] <= '9' && word[i] >= '0') {
806 0 : prep_word[j++] = '.';
807 : } else {
808 0 : prep_word[j++] = word[i];
809 : }
810 : }
811 :
812 :
813 :
814 0 : prep_word[j++] = '.';
815 0 : prep_word[j] = '\0';
816 :
817 0 : for (i = 0; i < j; i++)
818 0 : hyphens[i] = '0';
819 :
820 : #ifdef VERBOSE
821 : printf ("prep_word = %s\n", prep_word);
822 : #endif
823 :
824 : /* now, run the finite state machine */
/* For each character the inner loop follows fallback states until a state
   with a transition on ch is found; reaching fallback -1 resets to state 0
   and moves on to the next character. */
825 0 : state = 0;
826 0 : for (i = 0; i < j; i++)
827 : {
828 0 : ch = prep_word[i];
829 : for (;;)
830 : {
831 :
832 0 : if (state == -1) {
833 : /* return 1; */
834 : /* KBH: FIXME shouldn't this be as follows? */
835 0 : state = 0;
836 0 : goto try_next_letter;
837 : }
838 :
839 : #ifdef VERBOSE
840 : char *state_str;
841 : state_str = get_state_str (state, 1);
842 :
843 : for (k = 0; k < i - strlen (state_str); k++)
844 : putchar (' ');
845 : printf ("%s", state_str);
846 : #endif
847 :
848 0 : hstate = &dict->states[state];
849 0 : for (k = 0; k < hstate->num_trans; k++)
850 0 : if (hstate->trans[k].ch == ch)
851 : {
852 0 : state = hstate->trans[k].new_state;
853 0 : goto found_state;
854 : }
855 0 : state = hstate->fallback_state;
856 : #ifdef VERBOSE
857 : printf (" falling back, fallback_state %d\n", state);
858 : #endif
859 0 : }
860 : found_state:
861 : #ifdef VERBOSE
862 : printf ("found state %d\n",state);
863 : #endif
864 : /* Additional optimization is possible here - especially,
865 : elimination of trailing zeroes from the match. Leading zeroes
866 : have already been optimized. */
867 0 : match = dict->states[state].match;
868 0 : repl = dict->states[state].repl;
869 0 : replindex = dict->states[state].replindex;
870 0 : replcut = dict->states[state].replcut;
871 : /* replacing rules not handled by hyphen_hyphenate() */
872 0 : if (match)
873 : {
/* the match string is aligned so that its last digit lands on position i */
874 0 : offset = i + 1 - strlen (match);
875 : #ifdef VERBOSE
876 : for (k = 0; k < offset; k++)
877 : putchar (' ');
878 : printf ("%s (%s)\n", match, repl);
879 : #endif
880 0 : if (repl) {
/* the first pattern with a replacement initialises matchrepl and matchindex
   for word_size entries; isrepl is non-zero from then on */
881 0 : if (!isrepl) for(; isrepl < word_size; isrepl++) {
882 0 : matchrepl[isrepl] = NULL;
883 0 : matchindex[isrepl] = -1;
884 : }
885 0 : matchlen[offset + replindex] = replcut;
886 : }
887 : /* This is a linear search because I tried a binary search and
888 : found it to be just a teeny bit slower. */
/* A larger digit overrides a smaller one.  For an odd digit the pattern's
   replacement (possibly NULL) is recorded at that position, and when the
   position lies inside the replaced range the start of the range is linked
   to it through matchindex. */
889 0 : for (k = 0; match[k]; k++) {
890 0 : if ((hyphens[offset + k] < match[k])) {
891 0 : hyphens[offset + k] = match[k];
892 0 : if (match[k]&1) {
893 0 : matchrepl[offset + k] = repl;
894 0 : if (repl && (k >= replindex) && (k <= replindex + replcut)) {
895 0 : matchindex[offset + replindex] = offset + k;
896 : }
897 : }
898 : }
899 : }
900 :
901 : }
902 :
903 : /* KBH: we need this to make sure we keep looking in a word */
904 : /* for patterns even if the current character is not known in state 0 */
905 : /* since patterns for hyphenation may occur anywhere in the word */
906 : try_next_letter: ;
907 :
908 : }
909 : #ifdef VERBOSE
910 : for (i = 0; i < j; i++)
911 : putchar (hyphens[i]);
912 : putchar ('\n');
913 : #endif
914 :
/* shift the values one position to the left (j is word_size + 2 here), fill
   the remaining positions with '0' and terminate the string at word_size */
915 0 : for (i = 0; i < j - 3; i++)
916 : #if 0
917 : if (hyphens[i + 1] & 1)
918 : hyphens[i] = '-';
919 : #else
920 0 : hyphens[i] = hyphens[i + 1];
921 : #endif
922 0 : for (; i < word_size; i++)
923 0 : hyphens[i] = '0';
924 0 : hyphens[word_size] = '\0';
925 :
926 : /* now create a new char string showing hyphenation positions */
927 : /* count the hyphens and allocate space for the new hyphenated string */
/* nHyphCount, and the length summed into j in the loop below, are computed
   but not read again in this function */
928 0 : nHyphCount = 0;
929 0 : for (i = 0; i < word_size; i++)
930 0 : if (hyphens[i]&1)
931 0 : nHyphCount++;
932 0 : j = 0;
933 0 : for (i = 0; i < word_size; i++) {
934 0 : if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
935 0 : if (rep && pos && cut) {
/* the output arrays are created on first use */
936 0 : if (!*rep && !*pos && !*cut) {
937 : int k;
938 0 : *rep = (char **) malloc(sizeof(char *) * word_size);
939 0 : *pos = (int *) malloc(sizeof(int) * word_size);
940 0 : *cut = (int *) malloc(sizeof(int) * word_size);
941 0 : for (k = 0; k < word_size; k++) {
942 0 : (*rep)[k] = NULL;
943 0 : (*pos)[k] = 0;
944 0 : (*cut)[k] = 0;
945 : }
946 : }
/* the entry is stored one position to the left of matchindex[i], matching
   the shift applied to hyphens above; rep receives its own copy of the
   replacement string */
947 0 : (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
948 0 : (*pos)[matchindex[i] - 1] = matchindex[i] - i;
949 0 : (*cut)[matchindex[i] - 1] = matchlen[i];
950 : }
951 0 : j += strlen(matchrepl[matchindex[i]]);
952 0 : i += matchlen[i] - 1;
953 : }
954 : }
955 :
956 0 : if (matchrepl != matchrepl_buf) {
957 0 : hnj_free (matchrepl);
958 0 : hnj_free (matchlen);
959 0 : hnj_free (matchindex);
960 : }
961 :
962 : // recursive hyphenation of the first (compound) level segments
963 0 : if (dict->nextlevel) {
964 : char * rep2_buf[MAX_WORD];
965 : int pos2_buf[MAX_WORD];
966 : int cut2_buf[MAX_WORD];
967 : char hyphens2_buf[MAX_WORD];
968 : char ** rep2;
969 : int * pos2;
970 : int * cut2;
971 : char * hyphens2;
972 0 : int begin = 0;
/* NOTE(review): the stack buffers have MAX_WORD elements but are selected
   with a test against MAX_CHARS; this presumably relies on
   MAX_CHARS <= MAX_WORD - confirm against hyphen.h. */
973 0 : if (word_size < MAX_CHARS) {
974 0 : rep2 = rep2_buf;
975 0 : pos2 = pos2_buf;
976 0 : cut2 = cut2_buf;
977 0 : hyphens2 = hyphens2_buf;
978 : } else {
979 0 : rep2 = hnj_malloc (word_size * sizeof(char *));
980 0 : pos2 = hnj_malloc (word_size * sizeof(int));
981 0 : cut2 = hnj_malloc (word_size * sizeof(int));
982 0 : hyphens2 = hnj_malloc (word_size);
983 : }
984 0 : for (i = 0; i < word_size; i++) rep2[i] = NULL;
/* The word is cut at every odd value in hyphens; the end of the word closes
   the last segment once at least one cut was made.  Segments with
   i - begin > 1 are hyphenated by a recursive call with the same dict. */
985 0 : for (i = 0; i < word_size; i++) if
986 0 : (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
987 0 : if (i - begin > 1) {
988 0 : int hyph = 0;
/* terminate prep_word behind the segment; the overwritten byte is restored
   after the recursive call */
989 0 : prep_word[i + 2] = '\0';
990 : /* non-standard hyphenation at compound boundary (Schiffahrt) */
991 0 : if (*rep && *pos && *cut && (*rep)[i]) {
992 0 : char * l = strchr((*rep)[i], '=');
993 0 : strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);
994 0 : if (l) {
995 0 : hyph = (l - (*rep)[i]) - (*pos)[i];
996 0 : prep_word[2 + i + hyph] = '\0';
997 : }
998 : }
999 0 : hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
1000 : hyphens2, &rep2, &pos2, &cut2, clhmin,
1001 0 : crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
/* copy the segment's result back into the whole-word arrays; the rep2
   pointers are moved, not duplicated */
1002 0 : for (j = 0; j < i - begin - 1; j++) {
1003 0 : hyphens[begin + j] = hyphens2[j];
1004 0 : if (rep2[j] && rep && pos && cut) {
1005 0 : if (!*rep && !*pos && !*cut) {
1006 : int k;
1007 0 : *rep = (char **) malloc(sizeof(char *) * word_size);
1008 0 : *pos = (int *) malloc(sizeof(int) * word_size);
1009 0 : *cut = (int *) malloc(sizeof(int) * word_size);
1010 0 : for (k = 0; k < word_size; k++) {
1011 0 : (*rep)[k] = NULL;
1012 0 : (*pos)[k] = 0;
1013 0 : (*cut)[k] = 0;
1014 : }
1015 : }
1016 0 : (*rep)[begin + j] = rep2[j];
1017 0 : (*pos)[begin + j] = pos2[j];
1018 0 : (*cut)[begin + j] = cut2[j];
1019 : }
1020 : }
/* undo the temporary changes made to prep_word for this segment */
1021 0 : prep_word[i + 2] = word[i + 1];
1022 0 : if (*rep && *pos && *cut && (*rep)[i]) {
1023 0 : strcpy(prep_word + 1, word);
1024 : }
1025 : }
1026 0 : begin = i + 1;
1027 0 : for (j = 0; j < word_size; j++) rep2[j] = NULL;
1028 : }
1029 :
1030 : // non-compound
/* begin is still 0 when no cut was made: the whole word is hyphenated again
   with the next-level dictionary, and the compound minimums are applied
   unless lend/rend are set */
1031 0 : if (begin == 0) {
1032 0 : hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
1033 : hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
1034 0 : if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1035 : rep, pos, cut, clhmin);
1036 0 : if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1037 : rep, pos, cut, crhmin);
1038 : }
1039 :
/* NOTE(review): these buffers were obtained with hnj_malloc() but are
   released with free(); presumably the two allocators are compatible -
   confirm against hnjalloc. */
1040 0 : if (rep2 != rep2_buf) {
1041 0 : free(rep2);
1042 0 : free(cut2);
1043 0 : free(pos2);
1044 0 : free(hyphens2);
1045 : }
1046 : }
1047 :
1048 0 : if (prep_word != prep_word_buf) hnj_free (prep_word);
1049 0 : return 0;
1050 : }
1051 :
/*
 * UTF-8 normalization of hyphen and non-standard hyphenation positions.
 *
 * On entry hyphens[], (*rep)[], (*pos)[] and (*cut)[] are indexed by byte
 * offset in word, and the pos/cut values count bytes.  On return they are
 * indexed by UTF-8 character, and the pos/cut values count characters.
 * The conversion is done in place; slots that were moved to a lower index
 * are cleared.  hyphens is NUL-terminated after the last character slot.
 *
 * rep, pos and cut (or the arrays they point to) may be NULL, in which
 * case only hyphens is normalized.
 *
 * Returns 0 on success, or 1 (after printing a message to stderr) when
 * word starts with a UTF-8 continuation byte.
 */
int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
                    char *** rep, int ** pos, int ** cut)
{
  int byte_ix;
  int char_ix = -1;
  /* non-standard hyphenation data is present only if all three arrays exist */
  const int has_nonstd = rep && pos && cut && *rep && *pos && *cut;

  /* a word must not begin with a continuation byte ('10' start bits) */
  if ((((unsigned char) word[0]) & 0xC0) == 0x80) {
    fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
    return 1;
  }

  /* calculate UTF-8 character positions */
  for (byte_ix = 0; byte_ix < word_size; byte_ix++) {
    int back_bytes, first, last, count, k;

    /* every byte that is not a continuation byte starts a new character */
    if ((((unsigned char) word[byte_ix]) & 0xC0) != 0x80) {
      char_ix++;
    }
    hyphens[char_ix] = hyphens[byte_ix];

    if (!has_nonstd) {
      continue;
    }

    /* pos: characters among the back_bytes bytes ending at byte_ix */
    back_bytes = (*pos)[byte_ix];
    count = 0;
    for (k = 0; k < back_bytes; k++) {
      if ((((unsigned char) word[byte_ix - k]) & 0xC0) != 0x80) {
        count++;
      }
    }
    (*pos)[char_ix] = count;

    /* cut: characters in the cut range that starts where pos points */
    first = byte_ix - back_bytes + 1;
    last = first + (*cut)[byte_ix];
    count = 0;
    for (k = first; k < last; k++) {
      if ((((unsigned char) word[k]) & 0xC0) != 0x80) {
        count++;
      }
    }
    (*cut)[char_ix] = count;

    (*rep)[char_ix] = (*rep)[byte_ix];

    /* the entry moved to a lower index: clear the stale byte-indexed slot */
    if (char_ix < byte_ix) {
      (*rep)[byte_ix] = NULL;
      (*pos)[byte_ix] = 0;
      (*cut)[byte_ix] = 0;
    }
  }
  hyphens[char_ix + 1] = '\0';
#ifdef VERBOSE
  printf ("nums: %s\n", hyphens);
#endif
  return 0;
}
1093 :
/*
 * Get the word with all possible hyphenations (output: hyphword).
 *
 * Copies the first l bytes of word into hyphword.  After every position i
 * where hyphens[i] is odd it inserts '=', unless a non-standard
 * hyphenation (*rep)[i] exists for that position; in that case the
 * replacement text is written starting (*pos)[i] - 1 bytes before the
 * current output position and (*cut)[i] - (*pos)[i] following input bytes
 * are skipped.  hyphword is always NUL-terminated.
 *
 * rep, pos and cut may be NULL, or may point to NULL arrays; non-standard
 * hyphenation is then ignored and only '=' markers are produced.
 *
 * The caller must supply a hyphword buffer large enough for the word plus
 * all inserted markers and replacement text; no bounds are checked here.
 */
void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
                         char * hyphword, char *** rep, int ** pos, int ** cut)
{
  /* Guard the outer pointers as well as the arrays they point to, so a
   * caller that does not want non-standard hyphenation can pass NULL. */
  const int has_nonstd = rep && pos && cut && *rep && *pos && *cut;
  int i, j;

  for (i = 0, j = 0; i < l; i++, j++) {
    hyphword[j] = word[i];
    if (!(hyphens[i] & 1)) {
      continue;
    }
    if (has_nonstd && (*rep)[i]) {
      const char *replacement = (*rep)[i];
      int back = (*pos)[i];
      /* overwrite the bytes already copied with the replacement text */
      strcpy(hyphword + j - back + 1, replacement);
      j += (int) strlen(replacement) - back;
      /* skip the input bytes consumed by the replacement */
      i += (*cut)[i] - back;
    } else {
      hyphword[++j] = '=';
    }
  }
  hyphword[j] = '\0';
}
1111 :
1112 :
1113 : /* main api function with default hyphenmin parameters */
1114 0 : int hnj_hyphen_hyphenate2 (HyphenDict *dict,
1115 : const char *word, int word_size, char * hyphens,
1116 : char *hyphword, char *** rep, int ** pos, int ** cut)
1117 : {
1118 0 : hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1119 0 : dict->clhmin, dict->crhmin, 1, 1);
1120 0 : hnj_hyphen_lhmin(dict->utf8, word, word_size,
1121 0 : hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
1122 0 : hnj_hyphen_rhmin(dict->utf8, word, word_size,
1123 0 : hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
1124 :
1125 : /* nohyphen */
1126 0 : if (dict->nohyphen) {
1127 0 : char * nh = dict->nohyphen;
1128 : int nhi;
1129 0 : for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
1130 0 : char * nhy = (char *) strstr(word, nh);
1131 0 : while (nhy) {
1132 0 : hyphens[nhy - word + strlen(nh) - 1] = '0';
1133 0 : if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = '0';
1134 0 : nhy = (char *) strstr(nhy + 1, nh);
1135 : }
1136 0 : nh = nh + strlen(nh) + 1;
1137 : }
1138 : }
1139 :
1140 0 : if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1141 0 : if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1142 : #ifdef VERBOSE
1143 : printf ("nums: %s\n", hyphens);
1144 : #endif
1145 0 : return 0;
1146 : }
1147 :
1148 : /* previous main api function with hyphenmin parameters */
1149 0 : int hnj_hyphen_hyphenate3 (HyphenDict *dict,
1150 : const char *word, int word_size, char * hyphens,
1151 : char *hyphword, char *** rep, int ** pos, int ** cut,
1152 : int lhmin, int rhmin, int clhmin, int crhmin)
1153 : {
1154 0 : lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;
1155 0 : rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;
1156 0 : clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;
1157 0 : crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;
1158 0 : hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1159 : clhmin, crhmin, 1, 1);
1160 0 : hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1161 : rep, pos, cut, (lhmin > 0 ? lhmin : 2));
1162 0 : hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1163 : rep, pos, cut, (rhmin > 0 ? rhmin : 2));
1164 0 : if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1165 :
1166 : /* nohyphen */
1167 0 : if (dict->nohyphen) {
1168 0 : char * nh = dict->nohyphen;
1169 : int nhi;
1170 0 : for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
1171 0 : char * nhy = (char *) strstr(word, nh);
1172 0 : while (nhy) {
1173 0 : hyphens[nhy - word + strlen(nh) - 1] = 0;
1174 0 : if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = 0;
1175 0 : nhy = (char *) strstr(nhy + 1, nh);
1176 : }
1177 0 : nh = nh + strlen(nh) + 1;
1178 : }
1179 : }
1180 :
1181 0 : if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1182 0 : return 0;
1183 : }
|