1 : /******* BEGIN LICENSE BLOCK *******
2 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 : *
4 : * The contents of this file are subject to the Mozilla Public License Version
5 : * 1.1 (the "License"); you may not use this file except in compliance with
6 : * the License. You may obtain a copy of the License at
7 : * http://www.mozilla.org/MPL/
8 : *
9 : * Software distributed under the License is distributed on an "AS IS" basis,
10 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 : * for the specific language governing rights and limitations under the
12 : * License.
13 : *
14 : * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
15 : * and László Németh (Hunspell). Portions created by the Initial Developers
16 : * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
17 : *
18 : * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
19 : * David Einstein (deinst@world.std.com)
20 : * László Németh (nemethl@gyorsposta.hu)
21 : * Caolan McNamara (caolanm@redhat.com)
22 : * Davide Prina
23 : * Giuseppe Modugno
24 : * Gianluca Turconi
25 : * Simon Brouwer
26 : * Noll Janos
27 : * Biro Arpad
28 : * Goldman Eleonora
29 : * Sarlos Tamas
30 : * Bencsath Boldizsar
31 : * Halacsy Peter
32 : * Dvornik Laszlo
33 : * Gefferth Andras
34 : * Nagy Viktor
35 : * Varga Daniel
36 : * Chris Halls
37 : * Rene Engelhard
38 : * Bram Moolenaar
39 : * Dafydd Jones
40 : * Harri Pitkanen
41 : * Andras Timar
42 : * Tor Lillqvist
43 : *
44 : * Alternatively, the contents of this file may be used under the terms of
45 : * either the GNU General Public License Version 2 or later (the "GPL"), or
46 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
47 : * in which case the provisions of the GPL or the LGPL are applicable instead
48 : * of those above. If you wish to allow use of your version of this file only
49 : * under the terms of either the GPL or the LGPL, and not to allow others to
50 : * use your version of this file under the terms of the MPL, indicate your
51 : * decision by deleting the provisions above and replace them with the notice
52 : * and other provisions required by the GPL or the LGPL. If you do not delete
53 : * the provisions above, a recipient may use your version of this file under
54 : * the terms of any one of the MPL, the GPL or the LGPL.
55 : *
56 : ******* END LICENSE BLOCK *******/
57 :
58 : #include <stdlib.h>
59 : #include <string.h>
60 : #include <stdio.h>
61 : #include <ctype.h>
62 :
63 : #include "hashmgr.hxx"
64 : #include "csutil.hxx"
65 : #include "atypes.hxx"
66 :
67 : // build a hash table from a munched word list
68 :
69 110 : HashMgr::HashMgr(const char * tpath, const char * apath, const char * key)
70 : {
71 110 : tablesize = 0;
72 110 : tableptr = NULL;
73 110 : flag_mode = FLAG_CHAR;
74 110 : complexprefixes = 0;
75 110 : utf8 = 0;
76 110 : langnum = 0;
77 110 : lang = NULL;
78 110 : enc = NULL;
79 110 : csconv = 0;
80 110 : ignorechars = NULL;
81 110 : ignorechars_utf16 = NULL;
82 110 : ignorechars_utf16_len = 0;
83 110 : numaliasf = 0;
84 110 : aliasf = NULL;
85 110 : numaliasm = 0;
86 110 : aliasm = NULL;
87 110 : forbiddenword = FORBIDDENWORD; // forbidden word signing flag
88 110 : load_config(apath, key);
89 110 : int ec = load_tables(tpath, key);
90 110 : if (ec) {
91 : /* error condition - what should we do here */
92 0 : HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec);
93 0 : if (tableptr) {
94 0 : free(tableptr);
95 0 : tableptr = NULL;
96 : }
97 0 : tablesize = 0;
98 : }
99 110 : }
100 :
101 :
102 110 : HashMgr::~HashMgr()
103 : {
104 110 : if (tableptr) {
105 : // now pass through hash table freeing up everything
106 : // go through column by column of the table
107 111266 : for (int i=0; i < tablesize; i++) {
108 111156 : struct hentry * pt = tableptr[i];
109 111156 : struct hentry * nt = NULL;
110 222815 : while(pt) {
111 503 : nt = pt->next;
112 503 : if (pt->astr && (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))) free(pt->astr);
113 503 : free(pt);
114 503 : pt = nt;
115 : }
116 : }
117 110 : free(tableptr);
118 : }
119 110 : tablesize = 0;
120 :
121 110 : if (aliasf) {
122 2 : for (int j = 0; j < (numaliasf); j++) free(aliasf[j]);
123 2 : free(aliasf);
124 2 : aliasf = NULL;
125 2 : if (aliasflen) {
126 2 : free(aliasflen);
127 2 : aliasflen = NULL;
128 : }
129 : }
130 110 : if (aliasm) {
131 2 : for (int j = 0; j < (numaliasm); j++) free(aliasm[j]);
132 2 : free(aliasm);
133 2 : aliasm = NULL;
134 : }
135 :
136 : #ifndef OPENOFFICEORG
137 : #ifndef MOZILLA_CLIENT
138 : if (utf8) free_utf_tbl();
139 : #endif
140 : #endif
141 :
142 110 : if (enc) free(enc);
143 110 : if (lang) free(lang);
144 :
145 110 : if (ignorechars) free(ignorechars);
146 110 : if (ignorechars_utf16) free(ignorechars_utf16);
147 :
148 : #ifdef MOZILLA_CLIENT
149 110 : delete [] csconv;
150 : #endif
151 110 : }
152 :
153 : // lookup a root word in the hashtable
154 :
155 14675 : struct hentry * HashMgr::lookup(const char *word) const
156 : {
157 : struct hentry * dp;
158 14675 : if (tableptr) {
159 14675 : dp = tableptr[hash(word)];
160 14675 : if (!dp) return NULL;
161 2567 : for ( ; dp != NULL; dp = dp->next) {
162 2253 : if (strcmp(word, dp->word) == 0) return dp;
163 : }
164 : }
165 314 : return NULL;
166 : }
167 :
168 : // add a word to the hash table (private)
169 505 : int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff,
170 : int al, const char * desc, bool onlyupcase)
171 : {
172 505 : bool upcasehomonym = false;
173 505 : int descl = desc ? (aliasm ? sizeof(short) : strlen(desc) + 1) : 0;
174 : // variable-length hash record with word and optional fields
175 : struct hentry* hp =
176 505 : (struct hentry *) malloc (sizeof(struct hentry) + wbl + descl);
177 505 : if (!hp) return 1;
178 505 : char * hpw = hp->word;
179 505 : strcpy(hpw, word);
180 505 : if (ignorechars != NULL) {
181 14 : if (utf8) {
182 12 : remove_ignored_chars_utf(hpw, ignorechars_utf16, ignorechars_utf16_len);
183 : } else {
184 2 : remove_ignored_chars(hpw, ignorechars);
185 : }
186 : }
187 505 : if (complexprefixes) {
188 6 : if (utf8) reverseword_utf(hpw); else reverseword(hpw);
189 : }
190 :
191 505 : int i = hash(hpw);
192 :
193 505 : hp->blen = (unsigned char) wbl;
194 505 : hp->clen = (unsigned char) wcl;
195 505 : hp->alen = (short) al;
196 505 : hp->astr = aff;
197 505 : hp->next = NULL;
198 505 : hp->next_homonym = NULL;
199 :
200 : // store the description string or its pointer
201 505 : if (desc) {
202 43 : hp->var = H_OPT;
203 43 : if (aliasm) {
204 2 : hp->var += H_OPT_ALIASM;
205 2 : store_pointer(hpw + wbl + 1, get_aliasm(atoi(desc)));
206 : } else {
207 41 : strcpy(hpw + wbl + 1, desc);
208 41 : if (complexprefixes) {
209 1 : if (utf8) reverseword_utf(HENTRY_DATA(hp));
210 1 : else reverseword(HENTRY_DATA(hp));
211 : }
212 : }
213 43 : if (strstr(HENTRY_DATA(hp), MORPH_PHON)) hp->var += H_OPT_PHON;
214 462 : } else hp->var = 0;
215 :
216 505 : struct hentry * dp = tableptr[i];
217 505 : if (!dp) {
218 486 : tableptr[i] = hp;
219 486 : return 0;
220 : }
221 43 : while (dp->next != NULL) {
222 5 : if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) {
223 : // remove hidden onlyupcase homonym
224 0 : if (!onlyupcase) {
225 0 : if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
226 0 : free(dp->astr);
227 0 : dp->astr = hp->astr;
228 0 : dp->alen = hp->alen;
229 0 : free(hp);
230 0 : return 0;
231 : } else {
232 0 : dp->next_homonym = hp;
233 : }
234 : } else {
235 0 : upcasehomonym = true;
236 : }
237 : }
238 5 : dp=dp->next;
239 : }
240 19 : if (strcmp(hp->word, dp->word) == 0) {
241 : // remove hidden onlyupcase homonym
242 16 : if (!onlyupcase) {
243 15 : if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
244 1 : free(dp->astr);
245 1 : dp->astr = hp->astr;
246 1 : dp->alen = hp->alen;
247 1 : free(hp);
248 1 : return 0;
249 : } else {
250 14 : dp->next_homonym = hp;
251 : }
252 : } else {
253 1 : upcasehomonym = true;
254 : }
255 : }
256 18 : if (!upcasehomonym) {
257 17 : dp->next = hp;
258 : } else {
259 : // remove hidden onlyupcase homonym
260 1 : if (hp->astr) free(hp->astr);
261 1 : free(hp);
262 : }
263 18 : return 0;
264 : }
265 :
266 488 : int HashMgr::add_hidden_capitalized_word(char * word, int wbl, int wcl,
267 : unsigned short * flags, int al, char * dp, int captype)
268 : {
269 : // add inner capitalized forms to handle the following allcap forms:
270 : // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG
271 : // Allcaps with suffixes: CIA's -> CIA'S
272 505 : if (((captype == HUHCAP) || (captype == HUHINITCAP) ||
273 : ((captype == ALLCAP) && (flags != NULL))) &&
274 14 : !((flags != NULL) && TESTAFF(flags, forbiddenword, al))) {
275 17 : unsigned short * flags2 = (unsigned short *) malloc (sizeof(unsigned short) * (al+1));
276 17 : if (!flags2) return 1;
277 17 : if (al) memcpy(flags2, flags, al * sizeof(unsigned short));
278 17 : flags2[al] = ONLYUPCASEFLAG;
279 17 : if (utf8) {
280 : char st[BUFSIZE];
281 : w_char w[BUFSIZE];
282 4 : int wlen = u8_u16(w, BUFSIZE, word);
283 4 : mkallsmall_utf(w, wlen, langnum);
284 4 : mkallcap_utf(w, 1, langnum);
285 4 : u16_u8(st, BUFSIZE, w, wlen);
286 4 : return add_word(st,wbl,wcl,flags2,al+1,dp, true);
287 : } else {
288 13 : mkallsmall(word, csconv);
289 13 : mkinitcap(word, csconv);
290 13 : return add_word(word,wbl,wcl,flags2,al+1,dp, true);
291 : }
292 : }
293 471 : return 0;
294 : }
295 :
296 : // detect captype and modify word length for UTF-8 encoding
297 488 : int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) {
298 : int len;
299 488 : if (utf8) {
300 : w_char dest_utf[BUFSIZE];
301 131 : len = u8_u16(dest_utf, BUFSIZE, word);
302 131 : *captype = get_captype_utf8(dest_utf, len, langnum);
303 : } else {
304 357 : len = wbl;
305 357 : *captype = get_captype((char *) word, len, csconv);
306 : }
307 488 : return len;
308 : }
309 :
310 : // remove word (personal dictionary function for standalone applications)
311 0 : int HashMgr::remove(const char * word)
312 : {
313 0 : struct hentry * dp = lookup(word);
314 0 : while (dp) {
315 0 : if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) {
316 : unsigned short * flags =
317 0 : (unsigned short *) malloc(sizeof(short) * (dp->alen + 1));
318 0 : if (!flags) return 1;
319 0 : for (int i = 0; i < dp->alen; i++) flags[i] = dp->astr[i];
320 0 : flags[dp->alen] = forbiddenword;
321 0 : dp->astr = flags;
322 0 : dp->alen++;
323 0 : flag_qsort(flags, 0, dp->alen);
324 : }
325 0 : dp = dp->next_homonym;
326 : }
327 0 : return 0;
328 : }
329 :
330 : /* remove forbidden flag to add a personal word to the hash */
331 0 : int HashMgr::remove_forbidden_flag(const char * word) {
332 0 : struct hentry * dp = lookup(word);
333 0 : if (!dp) return 1;
334 0 : while (dp) {
335 0 : if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) {
336 0 : if (dp->alen == 1) dp->alen = 0; // XXX forbidden words of personal dic.
337 : else {
338 : unsigned short * flags2 =
339 0 : (unsigned short *) malloc(sizeof(short) * (dp->alen - 1));
340 0 : if (!flags2) return 1;
341 0 : int i, j = 0;
342 0 : for (i = 0; i < dp->alen; i++) {
343 0 : if (dp->astr[i] != forbiddenword) flags2[j++] = dp->astr[i];
344 : }
345 0 : dp->alen--;
346 0 : dp->astr = flags2; // XXX allowed forbidden words
347 : }
348 : }
349 0 : dp = dp->next_homonym;
350 : }
351 0 : return 0;
352 : }
353 :
354 : // add a custom dic. word to the hash table (public)
355 0 : int HashMgr::add(const char * word)
356 : {
357 0 : unsigned short * flags = NULL;
358 0 : int al = 0;
359 0 : if (remove_forbidden_flag(word)) {
360 : int captype;
361 0 : int wbl = strlen(word);
362 0 : int wcl = get_clen_and_captype(word, wbl, &captype);
363 0 : add_word(word, wbl, wcl, flags, al, NULL, false);
364 0 : return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, NULL, captype);
365 : }
366 0 : return 0;
367 : }
368 :
369 0 : int HashMgr::add_with_affix(const char * word, const char * example)
370 : {
371 : // detect captype and modify word length for UTF-8 encoding
372 0 : struct hentry * dp = lookup(example);
373 0 : remove_forbidden_flag(word);
374 0 : if (dp && dp->astr) {
375 : int captype;
376 0 : int wbl = strlen(word);
377 0 : int wcl = get_clen_and_captype(word, wbl, &captype);
378 0 : if (aliasf) {
379 0 : add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false);
380 : } else {
381 0 : unsigned short * flags = (unsigned short *) malloc (dp->alen * sizeof(short));
382 0 : if (flags) {
383 0 : memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short));
384 0 : add_word(word, wbl, wcl, flags, dp->alen, NULL, false);
385 0 : } else return 1;
386 : }
387 0 : return add_hidden_capitalized_word((char *) word, wbl, wcl, dp->astr, dp->alen, NULL, captype);
388 : }
389 0 : return 1;
390 : }
391 :
392 : // walk the hash table entry by entry - null at end
393 : // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);
394 0 : struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
395 : {
396 0 : if (hp && hp->next != NULL) return hp->next;
397 0 : for (col++; col < tablesize; col++) {
398 0 : if (tableptr[col]) return tableptr[col];
399 : }
400 : // null at end and reset to start
401 0 : col = -1;
402 0 : return NULL;
403 : }
404 :
405 : // load a munched word list and build a hash table on the fly
406 110 : int HashMgr::load_tables(const char * tpath, const char * key)
407 : {
408 : int al;
409 : char * ap;
410 : char * dp;
411 : char * dp2;
412 : unsigned short * flags;
413 : char * ts;
414 :
415 : // open dictionary file
416 110 : FileMgr * dict = new FileMgr(tpath, key);
417 110 : if (dict == NULL) return 1;
418 :
419 : // first read the first line of file to get hash table size */
420 110 : if (!(ts = dict->getline())) {
421 0 : HUNSPELL_WARNING(stderr, "error: empty dic file\n");
422 0 : delete dict;
423 0 : return 2;
424 : }
425 110 : mychomp(ts);
426 :
427 : /* remove byte order mark */
428 110 : if (strncmp(ts,"\xEF\xBB\xBF",3) == 0) {
429 1 : memmove(ts, ts+3, strlen(ts+3)+1);
430 : // warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions
431 : }
432 :
433 110 : tablesize = atoi(ts);
434 110 : if (tablesize == 0) {
435 0 : HUNSPELL_WARNING(stderr, "error: line 1: missing or bad word count in the dic file\n");
436 0 : delete dict;
437 0 : return 4;
438 : }
439 110 : tablesize = tablesize + 5 + USERWORD;
440 110 : if ((tablesize %2) == 0) tablesize++;
441 :
442 : // allocate the hash table
443 110 : tableptr = (struct hentry **) malloc(tablesize * sizeof(struct hentry *));
444 110 : if (! tableptr) {
445 0 : delete dict;
446 0 : return 3;
447 : }
448 110 : for (int i=0; i<tablesize; i++) tableptr[i] = NULL;
449 :
450 : // loop through all words on much list and add to hash
451 : // table and create word and affix strings
452 :
453 708 : while ((ts = dict->getline())) {
454 488 : mychomp(ts);
455 : // split each line into word and morphological description
456 488 : dp = ts;
457 977 : while ((dp = strchr(dp, ':'))) {
458 30 : if ((dp > ts + 3) && (*(dp - 3) == ' ' || *(dp - 3) == '\t')) {
459 29 : for (dp -= 4; dp >= ts && (*dp == ' ' || *dp == '\t'); dp--);
460 29 : if (dp < ts) { // missing word
461 0 : dp = NULL;
462 : } else {
463 29 : *(dp + 1) = '\0';
464 29 : dp = dp + 2;
465 : }
466 29 : break;
467 : }
468 1 : dp++;
469 : }
470 :
471 : // tabulator is the old morphological field separator
472 488 : dp2 = strchr(ts, '\t');
473 488 : if (dp2 && (!dp || dp2 < dp)) {
474 14 : *dp2 = '\0';
475 14 : dp = dp2 + 1;
476 : }
477 :
478 : // split each line into word and affix char strings
479 : // "\/" signs slash in words (not affix separator)
480 : // "/" at beginning of the line is word character (not affix separator)
481 488 : ap = strchr(ts,'/');
482 984 : while (ap) {
483 317 : if (ap == ts) {
484 1 : ap++;
485 1 : continue;
486 316 : } else if (*(ap - 1) != '\\') break;
487 : // replace "\/" with "/"
488 7 : for (char * sp = ap - 1; *sp; *sp = *(sp + 1), sp++);
489 7 : ap = strchr(ap,'/');
490 : }
491 :
492 488 : if (ap) {
493 309 : *ap = '\0';
494 309 : if (aliasf) {
495 2 : int index = atoi(ap + 1);
496 2 : al = get_aliasf(index, &flags, dict);
497 2 : if (!al) {
498 0 : HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n", dict->getlinenum());
499 0 : *ap = '\0';
500 : }
501 : } else {
502 307 : al = decode_flags(&flags, ap + 1, dict);
503 307 : if (al == -1) {
504 0 : HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");
505 0 : delete dict;
506 0 : return 6;
507 : }
508 307 : flag_qsort(flags, 0, al);
509 : }
510 : } else {
511 179 : al = 0;
512 179 : ap = NULL;
513 179 : flags = NULL;
514 : }
515 :
516 : int captype;
517 488 : int wbl = strlen(ts);
518 488 : int wcl = get_clen_and_captype(ts, wbl, &captype);
519 : // add the word and its index plus its capitalized form optionally
520 976 : if (add_word(ts,wbl,wcl,flags,al,dp, false) ||
521 488 : add_hidden_capitalized_word(ts, wbl, wcl, flags, al, dp, captype)) {
522 0 : delete dict;
523 0 : return 5;
524 : }
525 : }
526 :
527 110 : delete dict;
528 110 : return 0;
529 : }
530 :
531 : // the hash function is a simple load and rotate
532 : // algorithm borrowed
533 :
534 15180 : int HashMgr::hash(const char * word) const
535 : {
536 15180 : long hv = 0;
537 67671 : for (int i=0; i < 4 && *word != 0; i++)
538 52491 : hv = (hv << 8) | (*word++);
539 92159 : while (*word != 0) {
540 61799 : ROTATE(hv,ROTATE_LEN);
541 61799 : hv ^= (*word++);
542 : }
543 15180 : return (unsigned long) hv % tablesize;
544 : }
545 :
546 446 : int HashMgr::decode_flags(unsigned short ** result, char * flags, FileMgr * af) {
547 : int len;
548 446 : if (*flags == '\0') {
549 1 : *result = NULL;
550 1 : return 0;
551 : }
552 445 : switch (flag_mode) {
553 : case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
554 45 : len = strlen(flags);
555 45 : if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n", af->getlinenum());
556 45 : len /= 2;
557 45 : *result = (unsigned short *) malloc(len * sizeof(short));
558 45 : if (!*result) return -1;
559 136 : for (int i = 0; i < len; i++) {
560 91 : (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned short) flags[i * 2 + 1];
561 : }
562 45 : break;
563 : }
564 : case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233)
565 : int i;
566 32 : len = 1;
567 32 : char * src = flags;
568 : unsigned short * dest;
569 : char * p;
570 283 : for (p = flags; *p; p++) {
571 251 : if (*p == ',') len++;
572 : }
573 32 : *result = (unsigned short *) malloc(len * sizeof(short));
574 32 : if (!*result) return -1;
575 32 : dest = *result;
576 283 : for (p = flags; *p; p++) {
577 251 : if (*p == ',') {
578 25 : i = atoi(src);
579 25 : if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n",
580 0 : af->getlinenum(), i, DEFAULTFLAGS - 1);
581 25 : *dest = (unsigned short) i;
582 25 : if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum());
583 25 : src = p + 1;
584 25 : dest++;
585 : }
586 : }
587 32 : i = atoi(src);
588 32 : if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: line %d: flag id %d is too large (max: %d)\n",
589 0 : af->getlinenum(), i, DEFAULTFLAGS - 1);
590 32 : *dest = (unsigned short) i;
591 32 : if (*dest == 0) HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n", af->getlinenum());
592 32 : break;
593 : }
594 : case FLAG_UNI: { // UTF-8 characters
595 : w_char w[BUFSIZE/2];
596 2 : len = u8_u16(w, BUFSIZE/2, flags);
597 2 : *result = (unsigned short *) malloc(len * sizeof(short));
598 2 : if (!*result) return -1;
599 2 : memcpy(*result, w, len * sizeof(short));
600 2 : break;
601 : }
602 : default: { // Ispell's one-character flags (erfg -> e r f g)
603 : unsigned short * dest;
604 366 : len = strlen(flags);
605 366 : *result = (unsigned short *) malloc(len * sizeof(short));
606 366 : if (!*result) return -1;
607 366 : dest = *result;
608 1073 : for (unsigned char * p = (unsigned char *) flags; *p; p++) {
609 707 : *dest = (unsigned short) *p;
610 707 : dest++;
611 : }
612 : }
613 : }
614 445 : return len;
615 : }
616 :
617 725 : unsigned short HashMgr::decode_flag(const char * f) {
618 725 : unsigned short s = 0;
619 : int i;
620 725 : switch (flag_mode) {
621 : case FLAG_LONG:
622 37 : s = ((unsigned short) f[0] << 8) + (unsigned short) f[1];
623 37 : break;
624 : case FLAG_NUM:
625 22 : i = atoi(f);
626 22 : if (i >= DEFAULTFLAGS) HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n", i, DEFAULTFLAGS - 1);
627 22 : s = (unsigned short) i;
628 22 : break;
629 : case FLAG_UNI:
630 8 : u8_u16((w_char *) &s, 1, f);
631 8 : break;
632 : default:
633 658 : s = (unsigned short) *((unsigned char *)f);
634 : }
635 725 : if (s == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
636 725 : return s;
637 : }
638 :
639 0 : char * HashMgr::encode_flag(unsigned short f) {
640 : unsigned char ch[10];
641 0 : if (f==0) return mystrdup("(NULL)");
642 0 : if (flag_mode == FLAG_LONG) {
643 0 : ch[0] = (unsigned char) (f >> 8);
644 0 : ch[1] = (unsigned char) (f - ((f >> 8) << 8));
645 0 : ch[2] = '\0';
646 0 : } else if (flag_mode == FLAG_NUM) {
647 0 : sprintf((char *) ch, "%d", f);
648 0 : } else if (flag_mode == FLAG_UNI) {
649 0 : u16_u8((char *) &ch, 10, (w_char *) &f, 1);
650 : } else {
651 0 : ch[0] = (unsigned char) (f);
652 0 : ch[1] = '\0';
653 : }
654 0 : return mystrdup((char *) ch);
655 : }
656 :
657 : // read in aff file and set flag mode
658 110 : int HashMgr::load_config(const char * affpath, const char * key)
659 : {
660 : char * line; // io buffers
661 110 : int firstline = 1;
662 :
663 : // open the affix file
664 110 : FileMgr * afflst = new FileMgr(affpath, key);
665 110 : if (!afflst) {
666 0 : HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath);
667 0 : return 1;
668 : }
669 :
670 : // read in each line ignoring any that do not
671 : // start with a known line type indicator
672 :
673 1105 : while ((line = afflst->getline())) {
674 939 : mychomp(line);
675 :
676 : /* remove byte order mark */
677 939 : if (firstline) {
678 110 : firstline = 0;
679 110 : if (strncmp(line,"\xEF\xBB\xBF",3) == 0) memmove(line, line+3, strlen(line+3)+1);
680 : }
681 :
682 : /* parse in the try string */
683 939 : if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) {
684 8 : if (flag_mode != FLAG_CHAR) {
685 0 : HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions of the FLAG affix file parameter\n", afflst->getlinenum());
686 : }
687 8 : if (strstr(line, "long")) flag_mode = FLAG_LONG;
688 8 : if (strstr(line, "num")) flag_mode = FLAG_NUM;
689 8 : if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI;
690 8 : if (flag_mode == FLAG_CHAR) {
691 0 : HUNSPELL_WARNING(stderr, "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n", afflst->getlinenum());
692 : }
693 : }
694 939 : if (strncmp(line,"FORBIDDENWORD",13) == 0) {
695 7 : char * st = NULL;
696 7 : if (parse_string(line, &st, afflst->getlinenum())) {
697 0 : delete afflst;
698 0 : return 1;
699 : }
700 7 : forbiddenword = decode_flag(st);
701 7 : free(st);
702 : }
703 939 : if (strncmp(line, "SET", 3) == 0) {
704 37 : if (parse_string(line, &enc, afflst->getlinenum())) {
705 0 : delete afflst;
706 0 : return 1;
707 : }
708 37 : if (strcmp(enc, "UTF-8") == 0) {
709 28 : utf8 = 1;
710 : #ifndef OPENOFFICEORG
711 : #ifndef MOZILLA_CLIENT
712 : initialize_utf_tbl();
713 : #endif
714 : #endif
715 9 : } else csconv = get_current_cs(enc);
716 : }
717 939 : if (strncmp(line, "LANG", 4) == 0) {
718 0 : if (parse_string(line, &lang, afflst->getlinenum())) {
719 0 : delete afflst;
720 0 : return 1;
721 : }
722 0 : langnum = get_lang_num(lang);
723 : }
724 :
725 : /* parse in the ignored characters (for example, Arabic optional diacritics characters */
726 939 : if (strncmp(line,"IGNORE",6) == 0) {
727 4 : if (parse_array(line, &ignorechars, &ignorechars_utf16,
728 4 : &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
729 0 : delete afflst;
730 0 : return 1;
731 : }
732 : }
733 :
734 939 : if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) {
735 2 : if (parse_aliasf(line, afflst)) {
736 0 : delete afflst;
737 0 : return 1;
738 : }
739 : }
740 :
741 939 : if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) {
742 2 : if (parse_aliasm(line, afflst)) {
743 0 : delete afflst;
744 0 : return 1;
745 : }
746 : }
747 :
748 939 : if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1;
749 939 : if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break;
750 : }
751 110 : if (csconv == NULL) csconv = get_current_cs(SPELL_ENCODING);
752 110 : delete afflst;
753 110 : return 0;
754 : }
755 :
756 : /* parse in the ALIAS table */
757 2 : int HashMgr::parse_aliasf(char * line, FileMgr * af)
758 : {
759 2 : if (numaliasf != 0) {
760 0 : HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
761 0 : return 1;
762 : }
763 2 : char * tp = line;
764 : char * piece;
765 2 : int i = 0;
766 2 : int np = 0;
767 2 : piece = mystrsep(&tp, 0);
768 8 : while (piece) {
769 4 : if (*piece != '\0') {
770 4 : switch(i) {
771 2 : case 0: { np++; break; }
772 : case 1: {
773 2 : numaliasf = atoi(piece);
774 2 : if (numaliasf < 1) {
775 0 : numaliasf = 0;
776 0 : aliasf = NULL;
777 0 : aliasflen = NULL;
778 0 : HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
779 0 : return 1;
780 : }
781 2 : aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *));
782 2 : aliasflen = (unsigned short *) malloc(numaliasf * sizeof(short));
783 2 : if (!aliasf || !aliasflen) {
784 0 : numaliasf = 0;
785 0 : if (aliasf) free(aliasf);
786 0 : if (aliasflen) free(aliasflen);
787 0 : aliasf = NULL;
788 0 : aliasflen = NULL;
789 0 : return 1;
790 : }
791 2 : np++;
792 2 : break;
793 : }
794 0 : default: break;
795 : }
796 4 : i++;
797 : }
798 4 : piece = mystrsep(&tp, 0);
799 : }
800 2 : if (np != 2) {
801 0 : numaliasf = 0;
802 0 : free(aliasf);
803 0 : free(aliasflen);
804 0 : aliasf = NULL;
805 0 : aliasflen = NULL;
806 0 : HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
807 0 : return 1;
808 : }
809 :
810 : /* now parse the numaliasf lines to read in the remainder of the table */
811 : char * nl;
812 6 : for (int j=0; j < numaliasf; j++) {
813 4 : if (!(nl = af->getline())) return 1;
814 4 : mychomp(nl);
815 4 : tp = nl;
816 4 : i = 0;
817 4 : aliasf[j] = NULL;
818 4 : aliasflen[j] = 0;
819 4 : piece = mystrsep(&tp, 0);
820 16 : while (piece) {
821 8 : if (*piece != '\0') {
822 8 : switch(i) {
823 : case 0: {
824 4 : if (strncmp(piece,"AF",2) != 0) {
825 0 : numaliasf = 0;
826 0 : free(aliasf);
827 0 : free(aliasflen);
828 0 : aliasf = NULL;
829 0 : aliasflen = NULL;
830 0 : HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
831 0 : return 1;
832 : }
833 4 : break;
834 : }
835 : case 1: {
836 4 : aliasflen[j] = (unsigned short) decode_flags(&(aliasf[j]), piece, af);
837 4 : flag_qsort(aliasf[j], 0, aliasflen[j]);
838 4 : break;
839 : }
840 0 : default: break;
841 : }
842 8 : i++;
843 : }
844 8 : piece = mystrsep(&tp, 0);
845 : }
846 4 : if (!aliasf[j]) {
847 0 : free(aliasf);
848 0 : free(aliasflen);
849 0 : aliasf = NULL;
850 0 : aliasflen = NULL;
851 0 : numaliasf = 0;
852 0 : HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
853 0 : return 1;
854 : }
855 : }
856 2 : return 0;
857 : }
858 :
859 293 : int HashMgr::is_aliasf() {
860 293 : return (aliasf != NULL);
861 : }
862 :
863 4 : int HashMgr::get_aliasf(int index, unsigned short ** fvec, FileMgr * af) {
864 4 : if ((index > 0) && (index <= numaliasf)) {
865 4 : *fvec = aliasf[index - 1];
866 4 : return aliasflen[index - 1];
867 : }
868 0 : HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n", af->getlinenum(), index);
869 0 : *fvec = NULL;
870 0 : return 0;
871 : }
872 :
873 : /* parse morph alias definitions */
874 2 : int HashMgr::parse_aliasm(char * line, FileMgr * af)
875 : {
876 2 : if (numaliasm != 0) {
877 0 : HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n", af->getlinenum());
878 0 : return 1;
879 : }
880 2 : char * tp = line;
881 : char * piece;
882 2 : int i = 0;
883 2 : int np = 0;
884 2 : piece = mystrsep(&tp, 0);
885 8 : while (piece) {
886 4 : if (*piece != '\0') {
887 4 : switch(i) {
888 2 : case 0: { np++; break; }
889 : case 1: {
890 2 : numaliasm = atoi(piece);
891 2 : if (numaliasm < 1) {
892 0 : HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n", af->getlinenum());
893 0 : return 1;
894 : }
895 2 : aliasm = (char **) malloc(numaliasm * sizeof(char *));
896 2 : if (!aliasm) {
897 0 : numaliasm = 0;
898 0 : return 1;
899 : }
900 2 : np++;
901 2 : break;
902 : }
903 0 : default: break;
904 : }
905 4 : i++;
906 : }
907 4 : piece = mystrsep(&tp, 0);
908 : }
909 2 : if (np != 2) {
910 0 : numaliasm = 0;
911 0 : free(aliasm);
912 0 : aliasm = NULL;
913 0 : HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", af->getlinenum());
914 0 : return 1;
915 : }
916 :
917 : /* now parse the numaliasm lines to read in the remainder of the table */
918 2 : char * nl = line;
919 9 : for (int j=0; j < numaliasm; j++) {
920 7 : if (!(nl = af->getline())) return 1;
921 7 : mychomp(nl);
922 7 : tp = nl;
923 7 : i = 0;
924 7 : aliasm[j] = NULL;
925 7 : piece = mystrsep(&tp, ' ');
926 28 : while (piece) {
927 14 : if (*piece != '\0') {
928 14 : switch(i) {
929 : case 0: {
930 7 : if (strncmp(piece,"AM",2) != 0) {
931 0 : HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
932 0 : numaliasm = 0;
933 0 : free(aliasm);
934 0 : aliasm = NULL;
935 0 : return 1;
936 : }
937 7 : break;
938 : }
939 : case 1: {
940 : // add the remaining of the line
941 7 : if (*tp) {
942 1 : *(tp - 1) = ' ';
943 1 : tp = tp + strlen(tp);
944 : }
945 7 : if (complexprefixes) {
946 4 : if (utf8) reverseword_utf(piece);
947 4 : else reverseword(piece);
948 : }
949 7 : aliasm[j] = mystrdup(piece);
950 7 : if (!aliasm[j]) {
951 0 : numaliasm = 0;
952 0 : free(aliasm);
953 0 : aliasm = NULL;
954 0 : return 1;
955 : }
956 7 : break; }
957 0 : default: break;
958 : }
959 14 : i++;
960 : }
961 14 : piece = mystrsep(&tp, ' ');
962 : }
963 7 : if (!aliasm[j]) {
964 0 : numaliasm = 0;
965 0 : free(aliasm);
966 0 : aliasm = NULL;
967 0 : HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n", af->getlinenum());
968 0 : return 1;
969 : }
970 : }
971 2 : return 0;
972 : }
973 :
974 208 : int HashMgr::is_aliasm() {
975 208 : return (aliasm != NULL);
976 : }
977 :
978 7 : char * HashMgr::get_aliasm(int index) {
979 7 : if ((index > 0) && (index <= numaliasm)) return aliasm[index - 1];
980 0 : HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
981 0 : return NULL;
982 : }
|