1 : /******* BEGIN LICENSE BLOCK *******
2 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 : *
4 : * The contents of this file are subject to the Mozilla Public License Version
5 : * 1.1 (the "License"); you may not use this file except in compliance with
6 : * the License. You may obtain a copy of the License at
7 : * http://www.mozilla.org/MPL/
8 : *
9 : * Software distributed under the License is distributed on an "AS IS" basis,
10 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 : * for the specific language governing rights and limitations under the
12 : * License.
13 : *
14 : * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
15 : * and László Németh (Hunspell). Portions created by the Initial Developers
16 : * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
17 : *
18 : * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
19 : * David Einstein (deinst@world.std.com)
20 : * László Németh (nemethl@gyorsposta.hu)
21 : * Caolan McNamara (caolanm@redhat.com)
22 : * Davide Prina
23 : * Giuseppe Modugno
24 : * Gianluca Turconi
25 : * Simon Brouwer
26 : * Noll Janos
27 : * Biro Arpad
28 : * Goldman Eleonora
29 : * Sarlos Tamas
30 : * Bencsath Boldizsar
31 : * Halacsy Peter
32 : * Dvornik Laszlo
33 : * Gefferth Andras
34 : * Nagy Viktor
35 : * Varga Daniel
36 : * Chris Halls
37 : * Rene Engelhard
38 : * Bram Moolenaar
39 : * Dafydd Jones
40 : * Harri Pitkanen
41 : * Andras Timar
42 : * Tor Lillqvist
43 : *
44 : * Alternatively, the contents of this file may be used under the terms of
45 : * either the GNU General Public License Version 2 or later (the "GPL"), or
46 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
47 : * in which case the provisions of the GPL or the LGPL are applicable instead
48 : * of those above. If you wish to allow use of your version of this file only
49 : * under the terms of either the GPL or the LGPL, and not to allow others to
50 : * use your version of this file under the terms of the MPL, indicate your
51 : * decision by deleting the provisions above and replace them with the notice
52 : * and other provisions required by the GPL or the LGPL. If you do not delete
53 : * the provisions above, a recipient may use your version of this file under
54 : * the terms of any one of the MPL, the GPL or the LGPL.
55 : *
56 : ******* END LICENSE BLOCK *******/
57 :
58 : #include <stdlib.h>
59 : #include <string.h>
60 : #include <stdio.h>
61 : #include <ctype.h>
62 :
63 : #include "affentry.hxx"
64 : #include "csutil.hxx"
65 :
66 144 : PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
67 : {
68 : // register affix manager
69 144 : pmyMgr = pmgr;
70 :
71 : // set up its initial values
72 :
73 144 : aflag = dp->aflag; // flag
74 144 : strip = dp->strip; // string to strip
75 144 : appnd = dp->appnd; // string to append
76 144 : stripl = dp->stripl; // length of strip string
77 144 : appndl = dp->appndl; // length of append string
78 144 : numconds = dp->numconds; // length of the condition
79 144 : opts = dp->opts; // cross product flag
80 : // then copy over all of the conditions
81 144 : if (opts & aeLONGCOND) {
82 6 : memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
83 6 : c.l.conds2 = dp->c.l.conds2;
84 138 : } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
85 144 : next = NULL;
86 144 : nextne = NULL;
87 144 : nexteq = NULL;
88 144 : morphcode = dp->morphcode;
89 144 : contclass = dp->contclass;
90 144 : contclasslen = dp->contclasslen;
91 144 : }
92 :
93 :
94 144 : PfxEntry::~PfxEntry()
95 : {
96 144 : aflag = 0;
97 144 : if (appnd) free(appnd);
98 144 : if (strip) free(strip);
99 144 : pmyMgr = NULL;
100 144 : appnd = NULL;
101 144 : strip = NULL;
102 144 : if (opts & aeLONGCOND) free(c.l.conds2);
103 144 : if (morphcode && !(opts & aeALIASM)) free(morphcode);
104 144 : if (contclass && !(opts & aeALIASF)) free(contclass);
105 144 : }
106 :
107 : // add prefix to this word assuming conditions hold
108 0 : char * PfxEntry::add(const char * word, int len)
109 : {
110 : char tword[MAXWORDUTF8LEN + 4];
111 :
112 0 : if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
113 0 : (len >= numconds) && test_condition(word) &&
114 0 : (!stripl || (strncmp(word, strip, stripl) == 0)) &&
115 : ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
116 : /* we have a match so add prefix */
117 0 : char * pp = tword;
118 0 : if (appndl) {
119 0 : strcpy(tword,appnd);
120 0 : pp += appndl;
121 : }
122 0 : strcpy(pp, (word + stripl));
123 0 : return mystrdup(tword);
124 : }
125 0 : return NULL;
126 : }
127 :
128 341 : inline char * PfxEntry::nextchar(char * p) {
129 341 : if (p) {
130 341 : p++;
131 341 : if (opts & aeLONGCOND) {
132 : // jump to the 2nd part of the condition
133 119 : if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
134 : // end of the MAXCONDLEN length condition
135 222 : } else if (p == c.conds + MAXCONDLEN) return NULL;
136 335 : return *p ? p : NULL;
137 : }
138 0 : return NULL;
139 : }
140 :
// Test whether the beginning of `st` satisfies this entry's condition pattern.
// The pattern is read from c.conds through nextchar() while `st` is read from
// left to right.  Pattern elements handled below: '[' ... ']' character
// groups, '^' (negates the current group), '.' (any one character, only
// outside groups) and literal bytes.  With aeUTF8 set, one character may span
// several bytes (continuation bytes have the form 10xxxxxx).
// Returns 1 when the pattern is exhausted without a mismatch (or numconds is
// 0), and 0 on a mismatch or when the input ends while pattern remains.
141 1823 : inline int PfxEntry::test_condition(const char * st)
142 : {
143 1823 : const char * pos = NULL; // group with pos input position
144 1823 : bool neg = false; // complementer
145 1823 : bool ingroup = false; // character in the group
// no condition at all: everything matches
146 1823 : if (numconds == 0) return 1;
147 35 : char * p = c.conds;
148 244 : while (1) {
149 279 : switch (*p) {
150 0 : case '\0': return 1;
// group start: reset the group state and remember where the input stood
151 : case '[': {
152 40 : neg = false;
153 40 : ingroup = false;
154 40 : p = nextchar(p);
155 40 : pos = st; break;
156 : }
// NOTE(review): '^' sets the negation flag wherever it occurs in the
// pattern, not only directly after '[' - confirm this is intended
157 18 : case '^': { p = nextchar(p); neg = true; break; }
// group end: fail if a negated group contained the input character or a
// plain group did not contain it
158 : case ']': {
159 40 : if ((neg && ingroup) || (!neg && !ingroup)) return 0;
160 23 : pos = NULL;
161 23 : p = nextchar(p);
// when no member matched (negated group passed) the input character has not
// been consumed yet, so step over it here, including UTF-8 continuation bytes
162 : // skip the next character
163 23 : if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
164 23 : if (*st == '\0' && p) return 0; // word <= condition
165 23 : break;
166 : }
// inside a group (pos set) the if-body is skipped and control falls through
// to `default`, so the dot is compared as a literal
167 0 : case '.': if (!pos) { // dots are not metacharacters in groups: [.]
168 0 : p = nextchar(p);
// NOTE(review): st is advanced without first checking *st for the
// terminator - confirm callers never pass an already exhausted input here
169 : // skip the next character
170 0 : for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
171 0 : if (*st == '\0' && p) return 0; // word <= condition
172 0 : break;
173 : }
174 : default: {
// literal byte (or group member) equal to the current input byte
175 181 : if (*st == *p) {
176 59 : st++;
177 59 : p = nextchar(p);
// the byte just consumed has its high bit set: also compare the
// continuation bytes of the pattern character against the input
178 59 : if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
179 94 : while (p && (*p & 0xc0) == 0x80) { // character
180 36 : if (*p != *st) {
// mismatch outside a group is final; inside a group the input is rewound
// to the position recorded at '['
181 14 : if (!pos) return 0;
182 14 : st = pos;
183 14 : break;
184 : }
185 22 : p = nextchar(p);
186 22 : st++;
187 : }
// a whole multibyte member matched inside a group: mark the hit and
// move the pattern cursor forward to the closing ']'
188 72 : if (pos && st != pos) {
189 15 : ingroup = true;
190 15 : while (p && *p != ']' && (p = nextchar(p)));
191 : }
// single-byte member matched inside a group: same handling
192 23 : } else if (pos) {
193 16 : ingroup = true;
194 16 : while (p && *p != ']' && (p = nextchar(p)));
195 : }
// no match inside a group: try the next member of the group
196 122 : } else if (pos) { // group
197 119 : p = nextchar(p);
// no match outside a group: the condition fails
198 3 : } else return 0;
199 : }
200 : }
// pattern exhausted without a mismatch
201 259 : if (!p) return 1;
202 : }
203 : }
204 :
205 : // check if this prefix entry matches
206 1813 : struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
207 : {
208 : int tmpl; // length of tmpword
209 : struct hentry * he; // hash entry of root word or NULL
210 : char tmpword[MAXWORDUTF8LEN + 4];
211 :
212 : // on entry prefix is 0 length or already matches the beginning of the word.
213 : // So if the remaining root word has positive length
214 : // and if there are enough chars in root word and added back strip chars
215 : // to meet the number of characters conditions, then test it
216 :
217 1813 : tmpl = len - appndl;
218 :
219 1813 : if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
220 :
221 : // generate new root word by removing prefix and adding
222 : // back any characters that would have been stripped
223 :
224 1682 : if (stripl) strcpy (tmpword, strip);
225 1682 : strcpy ((tmpword + stripl), (word + appndl));
226 :
227 : // now make sure all of the conditions on characters
228 : // are met. Please see the appendix at the end of
229 : // this file for more info on exactly what is being
230 : // tested
231 :
232 : // if all conditions are met then check if resulting
233 : // root word in the dictionary
234 :
235 1682 : if (test_condition(tmpword)) {
236 1662 : tmpl += stripl;
237 1662 : if ((he = pmyMgr->lookup(tmpword)) != NULL) {
238 127 : do {
239 208 : if (TESTAFF(he->astr, aflag, he->alen) &&
240 : // forbid single prefixes with needaffix flag
241 38 : ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
242 : // needflag
243 6 : ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
244 2 : (contclass && TESTAFF(contclass, needflag, contclasslen))))
245 35 : return he;
246 127 : he = he->next_homonym; // check homonyms
247 : } while (he);
248 : }
249 :
250 : // prefix matched but no root word was found
251 : // if aeXPRODUCT is allowed, try again but now
252 : // ross checked combined with a suffix
253 :
254 : //if ((opts & aeXPRODUCT) && in_compound) {
255 1627 : if ((opts & aeXPRODUCT)) {
256 : he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL,
257 1611 : 0, NULL, FLAG_NULL, needflag, in_compound);
258 1611 : if (he) return he;
259 : }
260 : }
261 : }
262 1721 : return NULL;
263 : }
264 :
265 : // check if this prefix entry matches
266 141 : struct hentry * PfxEntry::check_twosfx(const char * word, int len,
267 : char in_compound, const FLAG needflag)
268 : {
269 : int tmpl; // length of tmpword
270 : struct hentry * he; // hash entry of root word or NULL
271 : char tmpword[MAXWORDUTF8LEN + 4];
272 :
273 : // on entry prefix is 0 length or already matches the beginning of the word.
274 : // So if the remaining root word has positive length
275 : // and if there are enough chars in root word and added back strip chars
276 : // to meet the number of characters conditions, then test it
277 :
278 141 : tmpl = len - appndl;
279 :
280 141 : if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
281 : (tmpl + stripl >= numconds)) {
282 :
283 : // generate new root word by removing prefix and adding
284 : // back any characters that would have been stripped
285 :
286 141 : if (stripl) strcpy (tmpword, strip);
287 141 : strcpy ((tmpword + stripl), (word + appndl));
288 :
289 : // now make sure all of the conditions on characters
290 : // are met. Please see the appendix at the end of
291 : // this file for more info on exactly what is being
292 : // tested
293 :
294 : // if all conditions are met then check if resulting
295 : // root word in the dictionary
296 :
297 141 : if (test_condition(tmpword)) {
298 141 : tmpl += stripl;
299 :
300 : // prefix matched but no root word was found
301 : // if aeXPRODUCT is allowed, try again but now
302 : // cross checked combined with a suffix
303 :
304 141 : if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
305 141 : he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag);
306 141 : if (he) return he;
307 : }
308 : }
309 : }
310 133 : return NULL;
311 : }
312 :
313 : // check if this prefix entry matches
314 0 : char * PfxEntry::check_twosfx_morph(const char * word, int len,
315 : char in_compound, const FLAG needflag)
316 : {
317 : int tmpl; // length of tmpword
318 : char tmpword[MAXWORDUTF8LEN + 4];
319 :
320 : // on entry prefix is 0 length or already matches the beginning of the word.
321 : // So if the remaining root word has positive length
322 : // and if there are enough chars in root word and added back strip chars
323 : // to meet the number of characters conditions, then test it
324 :
325 0 : tmpl = len - appndl;
326 :
327 0 : if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
328 : (tmpl + stripl >= numconds)) {
329 :
330 : // generate new root word by removing prefix and adding
331 : // back any characters that would have been stripped
332 :
333 0 : if (stripl) strcpy (tmpword, strip);
334 0 : strcpy ((tmpword + stripl), (word + appndl));
335 :
336 : // now make sure all of the conditions on characters
337 : // are met. Please see the appendix at the end of
338 : // this file for more info on exactly what is being
339 : // tested
340 :
341 : // if all conditions are met then check if resulting
342 : // root word in the dictionary
343 :
344 0 : if (test_condition(tmpword)) {
345 0 : tmpl += stripl;
346 :
347 : // prefix matched but no root word was found
348 : // if aeXPRODUCT is allowed, try again but now
349 : // ross checked combined with a suffix
350 :
351 0 : if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
352 : return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
353 0 : aeXPRODUCT, this, needflag);
354 : }
355 : }
356 : }
357 0 : return NULL;
358 : }
359 :
360 : // check if this prefix entry matches
361 0 : char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
362 : {
363 : int tmpl; // length of tmpword
364 : struct hentry * he; // hash entry of root word or NULL
365 : char tmpword[MAXWORDUTF8LEN + 4];
366 : char result[MAXLNLEN];
367 : char * st;
368 :
369 0 : *result = '\0';
370 :
371 : // on entry prefix is 0 length or already matches the beginning of the word.
372 : // So if the remaining root word has positive length
373 : // and if there are enough chars in root word and added back strip chars
374 : // to meet the number of characters conditions, then test it
375 :
376 0 : tmpl = len - appndl;
377 :
378 0 : if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
379 : (tmpl + stripl >= numconds)) {
380 :
381 : // generate new root word by removing prefix and adding
382 : // back any characters that would have been stripped
383 :
384 0 : if (stripl) strcpy (tmpword, strip);
385 0 : strcpy ((tmpword + stripl), (word + appndl));
386 :
387 : // now make sure all of the conditions on characters
388 : // are met. Please see the appendix at the end of
389 : // this file for more info on exactly what is being
390 : // tested
391 :
392 : // if all conditions are met then check if resulting
393 : // root word in the dictionary
394 :
395 0 : if (test_condition(tmpword)) {
396 0 : tmpl += stripl;
397 0 : if ((he = pmyMgr->lookup(tmpword)) != NULL) {
398 0 : do {
399 0 : if (TESTAFF(he->astr, aflag, he->alen) &&
400 : // forbid single prefixes with needaffix flag
401 0 : ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
402 : // needflag
403 0 : ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
404 0 : (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
405 0 : if (morphcode) {
406 0 : mystrcat(result, " ", MAXLNLEN);
407 0 : mystrcat(result, morphcode, MAXLNLEN);
408 0 : } else mystrcat(result,getKey(), MAXLNLEN);
409 0 : if (!HENTRY_FIND(he, MORPH_STEM)) {
410 0 : mystrcat(result, " ", MAXLNLEN);
411 0 : mystrcat(result, MORPH_STEM, MAXLNLEN);
412 0 : mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
413 : }
414 : // store the pointer of the hash entry
415 0 : if (HENTRY_DATA(he)) {
416 0 : mystrcat(result, " ", MAXLNLEN);
417 0 : mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
418 : } else {
419 : // return with debug information
420 0 : char * flag = pmyMgr->encode_flag(getFlag());
421 0 : mystrcat(result, " ", MAXLNLEN);
422 0 : mystrcat(result, MORPH_FLAG, MAXLNLEN);
423 0 : mystrcat(result, flag, MAXLNLEN);
424 0 : free(flag);
425 : }
426 0 : mystrcat(result, "\n", MAXLNLEN);
427 : }
428 0 : he = he->next_homonym;
429 : } while (he);
430 : }
431 :
432 : // prefix matched but no root word was found
433 : // if aeXPRODUCT is allowed, try again but now
434 : // ross checked combined with a suffix
435 :
436 0 : if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
437 : st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,
438 0 : FLAG_NULL, needflag);
439 0 : if (st) {
440 0 : mystrcat(result, st, MAXLNLEN);
441 0 : free(st);
442 : }
443 : }
444 : }
445 : }
446 :
447 0 : if (*result) return mystrdup(result);
448 0 : return NULL;
449 : }
450 :
451 283 : SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
452 : {
453 : // register affix manager
454 283 : pmyMgr = pmgr;
455 :
456 : // set up its initial values
457 283 : aflag = dp->aflag; // char flag
458 283 : strip = dp->strip; // string to strip
459 283 : appnd = dp->appnd; // string to append
460 283 : stripl = dp->stripl; // length of strip string
461 283 : appndl = dp->appndl; // length of append string
462 283 : numconds = dp->numconds; // length of the condition
463 283 : opts = dp->opts; // cross product flag
464 :
465 : // then copy over all of the conditions
466 283 : if (opts & aeLONGCOND) {
467 5 : memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
468 5 : c.l.conds2 = dp->c.l.conds2;
469 278 : } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
470 283 : next = NULL;
471 283 : nextne = NULL;
472 283 : nexteq = NULL;
473 283 : rappnd = myrevstrdup(appnd);
474 283 : morphcode = dp->morphcode;
475 283 : contclass = dp->contclass;
476 283 : contclasslen = dp->contclasslen;
477 283 : }
478 :
479 :
480 283 : SfxEntry::~SfxEntry()
481 : {
482 283 : aflag = 0;
483 283 : if (appnd) free(appnd);
484 283 : if (rappnd) free(rappnd);
485 283 : if (strip) free(strip);
486 283 : pmyMgr = NULL;
487 283 : appnd = NULL;
488 283 : strip = NULL;
489 283 : if (opts & aeLONGCOND) free(c.l.conds2);
490 283 : if (morphcode && !(opts & aeALIASM)) free(morphcode);
491 283 : if (contclass && !(opts & aeALIASF)) free(contclass);
492 283 : }
493 :
494 : // add suffix to this word assuming conditions hold
495 0 : char * SfxEntry::add(const char * word, int len)
496 : {
497 : char tword[MAXWORDUTF8LEN + 4];
498 :
499 : /* make sure all conditions match */
500 0 : if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
501 0 : (len >= numconds) && test_condition(word + len, word) &&
502 0 : (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
503 : ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
504 : /* we have a match so add suffix */
505 0 : strcpy(tword,word);
506 0 : if (appndl) {
507 0 : strcpy(tword + len - stripl, appnd);
508 : } else {
509 0 : *(tword + len - stripl) = '\0';
510 : }
511 0 : return mystrdup(tword);
512 : }
513 0 : return NULL;
514 : }
515 :
516 911 : inline char * SfxEntry::nextchar(char * p) {
517 911 : if (p) {
518 911 : p++;
519 911 : if (opts & aeLONGCOND) {
520 : // jump to the 2nd part of the condition
521 141 : if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
522 : // end of the MAXCONDLEN length condition
523 770 : } else if (p == c.conds + MAXCONDLEN) return NULL;
524 905 : return *p ? p : NULL;
525 : }
526 0 : return NULL;
527 : }
528 :
// Test whether the end of a word satisfies this entry's condition pattern.
// `st` points just past the last byte to be examined and `beg` to the first
// byte of the word; the input is read backwards from st - 1 down to beg while
// the pattern is read forwards from c.conds through nextchar().
// NOTE(review): this only works if the stored pattern is in reversed order -
// presumably the affix parser reverses suffix conditions; confirm there.
// Pattern elements handled below: '[' ... ']' character groups, '^' (negates
// the current group), '.' (any one character, only outside groups) and literal
// bytes.  With aeUTF8 set, one character may span several bytes.
// Returns 1 on a match (or when numconds is 0) and 0 otherwise.
529 6872 : inline int SfxEntry::test_condition(const char * st, const char * beg)
530 : {
531 6872 : const char * pos = NULL; // group with pos input position
532 6872 : bool neg = false; // complementer
533 6872 : bool ingroup = false; // character in the group
// no condition at all: everything matches
534 6872 : if (numconds == 0) return 1;
535 125 : char * p = c.conds;
// step back from the end position onto the last byte of the word
536 125 : st--;
// i is incremented once per finished group and once per literal matched
// outside a group; it is compared with numconds further down
537 125 : int i = 1;
538 778 : while (1) {
539 903 : switch (*p) {
540 0 : case '\0': return 1;
// group start: remember where the input stood
541 108 : case '[': { p = nextchar(p); pos = st; break; }
542 44 : case '^': { p = nextchar(p); neg = true; break; }
// group end: a plain group without a matching member fails (a negated group
// with a matching member has already returned 0 in the default branch)
543 82 : case ']': { if (!neg && !ingroup) return 0;
544 42 : i++;
// when no member matched, the input character has not been consumed yet:
// step back over its continuation bytes and then over its first byte
545 : // skip the next character
546 42 : if (!ingroup) {
547 29 : for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
548 29 : st--;
549 : }
// reset the group state for the next pattern element
550 42 : pos = NULL;
551 42 : neg = false;
552 42 : ingroup = false;
553 42 : p = nextchar(p);
554 42 : if (st < beg && p) return 0; // word <= condition
555 42 : break;
556 : }
// inside a group (pos set) the if-body is skipped and control falls through
// to `default`, so the dot is compared as a literal
557 37 : case '.': if (!pos) { // dots are not metacharacters in groups: [.]
558 37 : p = nextchar(p);
559 : // skip the next character
560 37 : for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
// input exhausted: a match only if the pattern is exhausted as well
561 37 : if (st < beg) { // word <= condition
562 1 : if (p) return 0; else return 1;
563 : }
564 36 : if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
565 0 : st--;
566 0 : if (st < beg) { // word <= condition
567 0 : if (p) return 0; else return 1;
568 : }
569 : }
570 36 : break;
571 : }
572 : default: {
// literal byte (or group member) equal to the current input byte
573 632 : if (*st == *p) {
574 123 : p = nextchar(p);
// high bit set: compare the remaining bytes of the multibyte character,
// walking the input backwards and the pattern forwards
575 123 : if ((opts & aeUTF8) && (*st & 0x80)) {
576 21 : st--;
577 42 : while (p && (st >= beg)) {
578 21 : if (*p != *st) {
// mismatch outside a group is final; inside a group the input is rewound
// to the position recorded at '['
579 0 : if (!pos) return 0;
580 0 : st = pos;
581 0 : break;
582 : }
583 : // first byte of the UTF-8 multibyte character
584 21 : if ((*p & 0xc0) != 0x80) break;
585 0 : p = nextchar(p);
586 0 : st--;
587 : }
// a multibyte member matched inside a group: a negated group fails; if the
// counter has reached numconds the whole condition matches; otherwise mark
// the hit and move the pattern cursor forward to the closing ']'
588 21 : if (pos && st != pos) {
589 12 : if (neg) return 0;
590 6 : else if (i == numconds) return 1;
591 3 : ingroup = true;
592 3 : while (p && *p != ']' && (p = nextchar(p)));
593 3 : st--;
594 : }
595 12 : if (p && *p != ']') p = nextchar(p);
// single-byte member matched inside a group: same handling
596 102 : } else if (pos) {
597 27 : if (neg) return 0;
598 18 : else if (i == numconds) return 1;
599 10 : ingroup = true;
600 10 : while (p && *p != ']' && (p = nextchar(p)));
601 : // if (p && *p != ']') p = nextchar(p);
602 10 : st--;
603 : }
// literal matched outside a group: count it and consume the input byte
604 97 : if (!pos) {
605 84 : i++;
606 84 : st--;
607 : }
608 97 : if (st < beg && p && *p != ']') return 0; // word <= condition
// no match inside a group: try the next member of the group
609 509 : } else if (pos) { // group
610 496 : p = nextchar(p);
// no match outside a group: the condition fails
611 13 : } else return 0;
612 : }
613 : }
// pattern exhausted without a mismatch
614 823 : if (!p) return 1;
615 : }
616 : }
617 :
618 : // see if this suffix is present in the word
619 6839 : struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
620 : PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
621 : const FLAG badflag)
622 : {
623 : int tmpl; // length of tmpword
624 : struct hentry * he; // hash entry pointer
625 : unsigned char * cp;
626 : char tmpword[MAXWORDUTF8LEN + 4];
627 6839 : PfxEntry* ep = ppfx;
628 :
629 : // if this suffix is being cross checked with a prefix
630 : // but it does not support cross products skip it
631 :
632 6839 : if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
633 0 : return NULL;
634 :
635 : // upon entry suffix is 0 length or already matches the end of the word.
636 : // So if the remaining root word has positive length
637 : // and if there are enough chars in root word and added back strip chars
638 : // to meet the number of characters conditions, then test it
639 :
640 6839 : tmpl = len - appndl;
641 : // the second condition is not enough for UTF-8 strings
642 : // it checked in test_condition()
643 :
644 6839 : if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
645 : (tmpl + stripl >= numconds)) {
646 :
647 : // generate new root word by removing suffix and adding
648 : // back any characters that would have been stripped or
649 : // or null terminating the shorter string
650 :
651 6837 : strcpy (tmpword, word);
652 6837 : cp = (unsigned char *)(tmpword + tmpl);
653 6837 : if (stripl) {
654 52 : strcpy ((char *)cp, strip);
655 52 : tmpl += stripl;
656 52 : cp = (unsigned char *)(tmpword + tmpl);
657 6785 : } else *cp = '\0';
658 :
659 : // now make sure all of the conditions on characters
660 : // are met. Please see the appendix at the end of
661 : // this file for more info on exactly what is being
662 : // tested
663 :
664 : // if all conditions are met then check if resulting
665 : // root word in the dictionary
666 :
667 6837 : if (test_condition((char *) cp, (char *) tmpword)) {
668 :
669 : #ifdef SZOSZABLYA_POSSIBLE_ROOTS
670 : fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
671 : #endif
672 6769 : if ((he = pmyMgr->lookup(tmpword)) != NULL) {
673 284 : do {
674 : // check conditional suffix (enabled by prefix)
675 1347 : if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
676 88 : TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
677 : (((optflags & aeXPRODUCT) == 0) ||
678 171 : (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
679 : // enabled by prefix
680 150 : ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))
681 : ) &&
682 : // handle cont. class
683 : ((!cclass) ||
684 21 : ((contclass) && TESTAFF(contclass, cclass, contclasslen))
685 : ) &&
686 : // check only in compound homonyms (bad flags)
687 2 : (!badflag || !TESTAFF(he->astr, badflag, he->alen)
688 : ) &&
689 : // handle required flag
690 : ((!needflag) ||
691 209 : (TESTAFF(he->astr, needflag, he->alen) ||
692 205 : ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
693 : )
694 217 : ) return he;
695 284 : he = he->next_homonym; // check homonyms
696 : } while (he);
697 :
698 : // obsolote stemming code (used only by the
699 : // experimental SuffixMgr:suggest_pos_stems)
700 : // store resulting root in wlst
701 6273 : } else if (wlst && (*ns < maxSug)) {
702 0 : int cwrd = 1;
703 0 : for (int k=0; k < *ns; k++)
704 0 : if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
705 0 : if (cwrd) {
706 0 : wlst[*ns] = mystrdup(tmpword);
707 0 : if (wlst[*ns] == NULL) {
708 0 : for (int j=0; j<*ns; j++) free(wlst[j]);
709 0 : *ns = -1;
710 0 : return NULL;
711 : }
712 0 : (*ns)++;
713 : }
714 : }
715 : }
716 : }
717 6622 : return NULL;
718 : }
719 :
720 : // see if two-level suffix is present in the word
721 35 : struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
722 : PfxEntry* ppfx, const FLAG needflag)
723 : {
724 : int tmpl; // length of tmpword
725 : struct hentry * he; // hash entry pointer
726 : unsigned char * cp;
727 : char tmpword[MAXWORDUTF8LEN + 4];
728 35 : PfxEntry* ep = ppfx;
729 :
730 :
731 : // if this suffix is being cross checked with a prefix
732 : // but it does not support cross products skip it
733 :
734 35 : if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
735 0 : return NULL;
736 :
737 : // upon entry suffix is 0 length or already matches the end of the word.
738 : // So if the remaining root word has positive length
739 : // and if there are enough chars in root word and added back strip chars
740 : // to meet the number of characters conditions, then test it
741 :
742 35 : tmpl = len - appndl;
743 :
744 35 : if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
745 : (tmpl + stripl >= numconds)) {
746 :
747 : // generate new root word by removing suffix and adding
748 : // back any characters that would have been stripped or
749 : // or null terminating the shorter string
750 :
751 35 : strcpy (tmpword, word);
752 35 : cp = (unsigned char *)(tmpword + tmpl);
753 35 : if (stripl) {
754 0 : strcpy ((char *)cp, strip);
755 0 : tmpl += stripl;
756 0 : cp = (unsigned char *)(tmpword + tmpl);
757 35 : } else *cp = '\0';
758 :
759 : // now make sure all of the conditions on characters
760 : // are met. Please see the appendix at the end of
761 : // this file for more info on exactly what is being
762 : // tested
763 :
764 : // if all conditions are met then recall suffix_check
765 :
766 35 : if (test_condition((char *) cp, (char *) tmpword)) {
767 35 : if (ppfx) {
768 : // handle conditional suffix
769 8 : if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
770 0 : he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
771 : else
772 8 : he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
773 : } else {
774 27 : he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
775 : }
776 35 : if (he) return he;
777 : }
778 : }
779 14 : return NULL;
780 : }
781 :
782 : // see if two-level suffix is present in the word
783 0 : char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
784 : PfxEntry* ppfx, const FLAG needflag)
785 : {
786 : int tmpl; // length of tmpword
787 : unsigned char * cp;
788 : char tmpword[MAXWORDUTF8LEN + 4];
789 0 : PfxEntry* ep = ppfx;
790 : char * st;
791 :
792 : char result[MAXLNLEN];
793 :
794 0 : *result = '\0';
795 :
796 : // if this suffix is being cross checked with a prefix
797 : // but it does not support cross products skip it
798 :
799 0 : if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
800 0 : return NULL;
801 :
802 : // upon entry suffix is 0 length or already matches the end of the word.
803 : // So if the remaining root word has positive length
804 : // and if there are enough chars in root word and added back strip chars
805 : // to meet the number of characters conditions, then test it
806 :
807 0 : tmpl = len - appndl;
808 :
809 0 : if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
810 : (tmpl + stripl >= numconds)) {
811 :
812 : // generate new root word by removing suffix and adding
813 : // back any characters that would have been stripped or
814 : // or null terminating the shorter string
815 :
816 0 : strcpy (tmpword, word);
817 0 : cp = (unsigned char *)(tmpword + tmpl);
818 0 : if (stripl) {
819 0 : strcpy ((char *)cp, strip);
820 0 : tmpl += stripl;
821 0 : cp = (unsigned char *)(tmpword + tmpl);
822 0 : } else *cp = '\0';
823 :
824 : // now make sure all of the conditions on characters
825 : // are met. Please see the appendix at the end of
826 : // this file for more info on exactly what is being
827 : // tested
828 :
829 : // if all conditions are met then recall suffix_check
830 :
831 0 : if (test_condition((char *) cp, (char *) tmpword)) {
832 0 : if (ppfx) {
833 : // handle conditional suffix
834 0 : if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
835 0 : st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
836 0 : if (st) {
837 0 : if (ppfx->getMorph()) {
838 0 : mystrcat(result, ppfx->getMorph(), MAXLNLEN);
839 0 : mystrcat(result, " ", MAXLNLEN);
840 : }
841 0 : mystrcat(result,st, MAXLNLEN);
842 0 : free(st);
843 0 : mychomp(result);
844 : }
845 : } else {
846 0 : st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
847 0 : if (st) {
848 0 : mystrcat(result, st, MAXLNLEN);
849 0 : free(st);
850 0 : mychomp(result);
851 : }
852 : }
853 : } else {
854 0 : st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
855 0 : if (st) {
856 0 : mystrcat(result, st, MAXLNLEN);
857 0 : free(st);
858 0 : mychomp(result);
859 : }
860 : }
861 0 : if (*result) return mystrdup(result);
862 : }
863 : }
864 0 : return NULL;
865 : }
866 :
867 : // get next homonym with same affix
868 0 : struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx,
869 : const FLAG cclass, const FLAG needflag)
870 : {
871 0 : PfxEntry* ep = ppfx;
872 0 : FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
873 :
874 0 : while (he->next_homonym) {
875 0 : he = he->next_homonym;
876 0 : if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
877 : ((optflags & aeXPRODUCT) == 0 ||
878 0 : TESTAFF(he->astr, eFlag, he->alen) ||
879 : // handle conditional suffix
880 0 : ((contclass) && TESTAFF(contclass, eFlag, contclasslen))
881 : ) &&
882 : // handle cont. class
883 : ((!cclass) ||
884 0 : ((contclass) && TESTAFF(contclass, cclass, contclasslen))
885 : ) &&
886 : // handle required flag
887 : ((!needflag) ||
888 0 : (TESTAFF(he->astr, needflag, he->alen) ||
889 0 : ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
890 : )
891 0 : ) return he;
892 : }
893 0 : return NULL;
894 : }
895 :
896 :
897 : #if 0
898 :
899 : Appendix: Understanding Affix Code
900 :
901 :
902 : An affix is either a prefix or a suffix attached to root words to make
903 : other words.
904 :
905 : Basically a Prefix or a Suffix is a set of AffEntry objects
906 : which store information about the prefix or suffix along
907 : with supporting routines to check if a word has a particular
908 : prefix or suffix or a combination.
909 :
910 : The structure affentry is defined as follows:
911 :
912 : struct affentry
913 : {
914 : unsigned short aflag; // ID used to represent the affix
915 : char * strip; // string to strip before adding affix
916 : char * appnd; // the affix string to add
917 : unsigned char stripl; // length of the strip string
918 : unsigned char appndl; // length of the affix string
919 : char numconds; // the number of conditions that must be met
920 : char opts; // flag: aeXPRODUCT- combine both prefix and suffix
921 : char conds[SETSIZE]; // array which encodes the conditions to be met
922 : };
923 :
924 :
925 : Here is a suffix borrowed from the en_US.aff file. This file
926 : is whitespace delimited.
927 :
928 : SFX D Y 4
929 : SFX D 0 d e
930 : SFX D y ied [^aeiou]y
931 : SFX D 0 ed [^ey]
932 : SFX D 0 ed [aeiou]y
933 :
934 : This information can be interpreted as follows:
935 :
936 : The first line has 4 fields
937 :
938 : Field
939 : -----
940 : 1 SFX - indicates this is a suffix
941 : 2 D - is the name of the character flag which represents this suffix
942 : 3 Y - indicates it can be combined with prefixes (cross product)
943 : 4 4 - indicates that a sequence of 4 affentry structures is needed to
944 : properly store the affix information
945 :
946 : The remaining lines describe the unique information for the 4 SfxEntry
947 : objects that make up this affix. Each line can be interpreted
948 : as follows: (note that fields 1 and 2 serve as a check against the line 1 info)
949 :
950 : Field
951 : -----
952 : 1 SFX - indicates this is a suffix
953 : 2 D - is the name of the character flag for this affix
954 : 3 y - the string of chars to strip off before adding affix
955 : (a 0 here indicates the NULL string)
956 : 4 ied - the string of affix characters to add
957 : 5 [^aeiou]y - the conditions which must be met before the affix
958 : can be applied
959 :
960 : Field 5 is interesting. Since this is a suffix, field 5 tells us that
961 : there are 2 conditions that must be met. The first condition is that
962 : the next to the last character in the word must *NOT* be any of the
963 : following "a", "e", "i", "o" or "u". The second condition is that
964 : the last character of the word must be "y".
965 :
966 : So how can we encode this information concisely and be able to
967 : test for both conditions in a fast manner? The answer is found
968 : by studying the wonderful ispell code of Geoff Kuenning, et al.
969 : (now available under a normal BSD license).
970 :
971 : If we set up a conds array of 256 bytes indexed (0 to 255) and access it
972 : using a character (cast to an unsigned char) of a string, we have 8 bits
973 : of information we can store about that character. Specifically we
974 : could use each bit to say if that character is allowed in any of the
975 : last (or first for prefixes) 8 characters of the word.
976 :
977 : Basically, each character at one end of the word (up to the number
978 : of conditions) is used to index into the conds array and the resulting
979 : value found there says whether that character is valid for a
980 : specific character position in the word.
981 :
982 : For prefixes, it does this by setting bit 0 if that char is valid
983 : in the first position, bit 1 if valid in the second position, and so on.
984 :
985 : If a bit is not set, then that char is not valid for that position in the
986 : word.
987 :
988 : If working with suffixes bit 0 is used for the character closest
989 : to the front, bit 1 for the next character towards the end, ...,
990 : with bit numconds-1 representing the last char at the end of the string.
991 :
992 : Note: since entries in the conds[] are 8 bits, only 8 conditions
993 : (read that only 8 character positions) can be examined at one
994 : end of a word (the beginning for prefixes and the end for suffixes).
995 :
996 : So to make this clearer, let's encode the conds array values for the
997 : first two affentries for the suffix D described earlier.
998 :
999 :
1000 : For the first affentry:
1001 : numconds = 1 (only examine the last character)
1002 :
1003 : conds['e'] = (1 << 0) (the word must end in an E)
1004 : all others are all 0
1005 :
1006 : For the second affentry:
1007 : numconds = 2 (only examine the last two characters)
1008 :
1009 : conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
1010 : where X is all characters *but* a, e, i, o, or u
1011 :
1012 :
1013 : conds['y'] = conds['y'] | (1 << 1) (the last char must be a y)
1014 : all other bits for all other entries in the conds array are zero
1015 :
1016 :
1017 : #endif
1018 :
|