1 : /******* BEGIN LICENSE BLOCK *******
2 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 : *
4 : * The contents of this file are subject to the Mozilla Public License Version
5 : * 1.1 (the "License"); you may not use this file except in compliance with
6 : * the License. You may obtain a copy of the License at
7 : * http://www.mozilla.org/MPL/
8 : *
9 : * Software distributed under the License is distributed on an "AS IS" basis,
10 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 : * for the specific language governing rights and limitations under the
12 : * License.
13 : *
14 : * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
15 : * and László Németh (Hunspell). Portions created by the Initial Developers
16 : * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
17 : *
18 : * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
19 : * David Einstein (deinst@world.std.com)
20 : * László Németh (nemethl@gyorsposta.hu)
21 : * Caolan McNamara (caolanm@redhat.com)
22 : * Davide Prina
23 : * Giuseppe Modugno
24 : * Gianluca Turconi
25 : * Simon Brouwer
26 : * Noll Janos
27 : * Biro Arpad
28 : * Goldman Eleonora
29 : * Sarlos Tamas
30 : * Bencsath Boldizsar
31 : * Halacsy Peter
32 : * Dvornik Laszlo
33 : * Gefferth Andras
34 : * Nagy Viktor
35 : * Varga Daniel
36 : * Chris Halls
37 : * Rene Engelhard
38 : * Bram Moolenaar
39 : * Dafydd Jones
40 : * Harri Pitkanen
41 : * Andras Timar
42 : * Tor Lillqvist
43 : *
44 : * Alternatively, the contents of this file may be used under the terms of
45 : * either the GNU General Public License Version 2 or later (the "GPL"), or
46 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
47 : * in which case the provisions of the GPL or the LGPL are applicable instead
48 : * of those above. If you wish to allow use of your version of this file only
49 : * under the terms of either the GPL or the LGPL, and not to allow others to
50 : * use your version of this file under the terms of the MPL, indicate your
51 : * decision by deleting the provisions above and replace them with the notice
52 : * and other provisions required by the GPL or the LGPL. If you do not delete
53 : * the provisions above, a recipient may use your version of this file under
54 : * the terms of any one of the MPL, the GPL or the LGPL.
55 : *
56 : ******* END LICENSE BLOCK *******/
57 :
58 : #include <stdlib.h>
59 : #include <string.h>
60 : #include <stdio.h>
61 : #include <ctype.h>
62 :
63 : #include "suggestmgr.hxx"
64 : #include "htypes.hxx"
65 : #include "csutil.hxx"
66 :
// Wide-character constant built from the bytes '\0' and '|'. In this file it
// is compared against the neighbours of a key inside the UTF keyboard string
// (badcharkey_utf) and used as a dummy initial value (extrachar_utf).
// NOTE(review): which w_char member receives '|' depends on the member order
// of w_char, which is declared elsewhere -- confirm it matches the value that
// u8_u16() produces for "|".
67 : const w_char W_VLINE = { '\0', '|' };
68 :
69 110 : SuggestMgr::SuggestMgr(const char * tryme, int maxn,
70 : AffixMgr * aptr)
71 : {
72 :
73 : // register affix manager and check in string of chars to
74 : // try when building candidate suggestions
75 110 : pAMgr = aptr;
76 :
77 110 : csconv = NULL;
78 :
79 110 : ckeyl = 0;
80 110 : ckey = NULL;
81 110 : ckey_utf = NULL;
82 :
83 110 : ctryl = 0;
84 110 : ctry = NULL;
85 110 : ctry_utf = NULL;
86 :
87 110 : utf8 = 0;
88 110 : langnum = 0;
89 110 : complexprefixes = 0;
90 :
91 110 : maxSug = maxn;
92 110 : nosplitsugs = 0;
93 110 : maxngramsugs = MAXNGRAMSUGS;
94 110 : maxcpdsugs = MAXCOMPOUNDSUGS;
95 :
96 110 : if (pAMgr) {
97 110 : langnum = pAMgr->get_langnum();
98 110 : ckey = pAMgr->get_key_string();
99 110 : nosplitsugs = pAMgr->get_nosplitsugs();
100 110 : if (pAMgr->get_maxngramsugs() >= 0)
101 13 : maxngramsugs = pAMgr->get_maxngramsugs();
102 110 : utf8 = pAMgr->get_utf8();
103 110 : if (pAMgr->get_maxcpdsugs() >= 0)
104 0 : maxcpdsugs = pAMgr->get_maxcpdsugs();
105 110 : if (!utf8)
106 : {
107 82 : char * enc = pAMgr->get_encoding();
108 82 : csconv = get_current_cs(enc);
109 82 : free(enc);
110 : }
111 110 : complexprefixes = pAMgr->get_complexprefixes();
112 : }
113 :
114 110 : if (ckey) {
115 110 : if (utf8) {
116 : w_char t[MAXSWL];
117 28 : ckeyl = u8_u16(t, MAXSWL, ckey);
118 28 : ckey_utf = (w_char *) malloc(ckeyl * sizeof(w_char));
119 28 : if (ckey_utf) memcpy(ckey_utf, t, ckeyl * sizeof(w_char));
120 0 : else ckeyl = 0;
121 : } else {
122 82 : ckeyl = strlen(ckey);
123 : }
124 : }
125 :
126 110 : if (tryme) {
127 14 : ctry = mystrdup(tryme);
128 14 : if (ctry) ctryl = strlen(ctry);
129 14 : if (ctry && utf8) {
130 : w_char t[MAXSWL];
131 4 : ctryl = u8_u16(t, MAXSWL, tryme);
132 4 : ctry_utf = (w_char *) malloc(ctryl * sizeof(w_char));
133 4 : if (ctry_utf) memcpy(ctry_utf, t, ctryl * sizeof(w_char));
134 0 : else ctryl = 0;
135 : }
136 : }
137 110 : }
138 :
139 :
140 110 : SuggestMgr::~SuggestMgr()
141 : {
142 110 : pAMgr = NULL;
143 110 : if (ckey) free(ckey);
144 110 : ckey = NULL;
145 110 : if (ckey_utf) free(ckey_utf);
146 110 : ckey_utf = NULL;
147 110 : ckeyl = 0;
148 110 : if (ctry) free(ctry);
149 110 : ctry = NULL;
150 110 : if (ctry_utf) free(ctry_utf);
151 110 : ctry_utf = NULL;
152 110 : ctryl = 0;
153 110 : maxSug = 0;
154 : #ifdef MOZILLA_CLIENT
155 110 : delete [] csconv;
156 : #endif
157 110 : }
158 :
159 0 : int SuggestMgr::testsug(char** wlst, const char * candidate, int wl, int ns, int cpdsuggest,
160 : int * timer, clock_t * timelimit) {
161 0 : int cwrd = 1;
162 0 : if (ns == maxSug) return maxSug;
163 0 : for (int k=0; k < ns; k++) {
164 0 : if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
165 : }
166 0 : if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) {
167 0 : wlst[ns] = mystrdup(candidate);
168 0 : if (wlst[ns] == NULL) {
169 0 : for (int j=0; j<ns; j++) free(wlst[j]);
170 0 : return -1;
171 : }
172 0 : ns++;
173 : }
174 0 : return ns;
175 : }
176 :
177 : // generate suggestions for a misspelled word
178 : // pass in address of array of char * pointers
179 : // onlycompoundsug: probably bad suggestions (need for ngram sugs, too)
180 :
181 0 : int SuggestMgr::suggest(char*** slst, const char * w, int nsug,
182 : int * onlycompoundsug)
183 : {
184 0 : int nocompoundtwowords = 0;
185 : char ** wlst;
186 : w_char word_utf[MAXSWL];
187 0 : int wl = 0;
188 0 : int nsugorig = nsug;
189 : char w2[MAXWORDUTF8LEN];
190 0 : const char * word = w;
191 0 : int oldSug = 0;
192 :
193 : // word reversing wrapper for complex prefixes
194 0 : if (complexprefixes) {
195 0 : strcpy(w2, w);
196 0 : if (utf8) reverseword_utf(w2); else reverseword(w2);
197 0 : word = w2;
198 : }
199 :
200 0 : if (*slst) {
201 0 : wlst = *slst;
202 : } else {
203 0 : wlst = (char **) malloc(maxSug * sizeof(char *));
204 0 : if (wlst == NULL) return -1;
205 0 : for (int i = 0; i < maxSug; i++) {
206 0 : wlst[i] = NULL;
207 : }
208 : }
209 :
210 0 : if (utf8) {
211 0 : wl = u8_u16(word_utf, MAXSWL, word);
212 0 : if (wl == -1) {
213 0 : *slst = wlst;
214 0 : return nsug;
215 : }
216 : }
217 :
218 0 : for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) {
219 :
220 : // limit compound suggestion
221 0 : if (cpdsuggest > 0) oldSug = nsug;
222 :
223 : // suggestions for an uppercase word (html -> HTML)
224 0 : if ((nsug < maxSug) && (nsug > -1)) {
225 : nsug = (utf8) ? capchars_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
226 0 : capchars(wlst, word, nsug, cpdsuggest);
227 : }
228 :
229 : // perhaps we made a typical fault of spelling
230 0 : if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
231 0 : nsug = replchars(wlst, word, nsug, cpdsuggest);
232 : }
233 :
234 : // perhaps we made chose the wrong char from a related set
235 0 : if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
236 0 : nsug = mapchars(wlst, word, nsug, cpdsuggest);
237 : }
238 :
239 : // only suggest compound words when no other suggestion
240 0 : if ((cpdsuggest == 0) && (nsug > nsugorig)) nocompoundtwowords=1;
241 :
242 : // did we swap the order of chars by mistake
243 0 : if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
244 : nsug = (utf8) ? swapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
245 0 : swapchar(wlst, word, nsug, cpdsuggest);
246 : }
247 :
248 : // did we swap the order of non adjacent chars by mistake
249 0 : if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
250 : nsug = (utf8) ? longswapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
251 0 : longswapchar(wlst, word, nsug, cpdsuggest);
252 : }
253 :
254 : // did we just hit the wrong key in place of a good char (case and keyboard)
255 0 : if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
256 : nsug = (utf8) ? badcharkey_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
257 0 : badcharkey(wlst, word, nsug, cpdsuggest);
258 : }
259 :
260 : // did we add a char that should not be there
261 0 : if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
262 : nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
263 0 : extrachar(wlst, word, nsug, cpdsuggest);
264 : }
265 :
266 :
267 : // did we forgot a char
268 0 : if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
269 : nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
270 0 : forgotchar(wlst, word, nsug, cpdsuggest);
271 : }
272 :
273 : // did we move a char
274 0 : if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
275 : nsug = (utf8) ? movechar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
276 0 : movechar(wlst, word, nsug, cpdsuggest);
277 : }
278 :
279 : // did we just hit the wrong key in place of a good char
280 0 : if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
281 : nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
282 0 : badchar(wlst, word, nsug, cpdsuggest);
283 : }
284 :
285 : // did we double two characters
286 0 : if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
287 : nsug = (utf8) ? doubletwochars_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
288 0 : doubletwochars(wlst, word, nsug, cpdsuggest);
289 : }
290 :
291 : // perhaps we forgot to hit space and two words ran together
292 0 : if (!nosplitsugs && (nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs))) {
293 0 : nsug = twowords(wlst, word, nsug, cpdsuggest);
294 : }
295 :
296 : } // repeating ``for'' statement compounding support
297 :
298 0 : if (nsug < 0) {
299 : // we ran out of memory - we should free up as much as possible
300 0 : for (int i = 0; i < maxSug; i++)
301 0 : if (wlst[i] != NULL) free(wlst[i]);
302 0 : free(wlst);
303 0 : wlst = NULL;
304 : }
305 :
306 0 : if (!nocompoundtwowords && (nsug > 0) && onlycompoundsug) *onlycompoundsug = 1;
307 :
308 0 : *slst = wlst;
309 0 : return nsug;
310 : }
311 :
312 : // generate suggestions for a word with typical mistake
313 : // pass in address of array of char * pointers
314 : #ifdef HUNSPELL_EXPERIMENTAL
315 : int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug)
316 : {
317 : int nocompoundtwowords = 0;
318 : char ** wlst;
319 : int oldSug;
320 :
321 : char w2[MAXWORDUTF8LEN];
322 : const char * word = w;
323 :
324 : // word reversing wrapper for complex prefixes
325 : if (complexprefixes) {
326 : strcpy(w2, w);
327 : if (utf8) reverseword_utf(w2); else reverseword(w2);
328 : word = w2;
329 : }
330 :
331 : if (*slst) {
332 : wlst = *slst;
333 : } else {
334 : wlst = (char **) malloc(maxSug * sizeof(char *));
335 : if (wlst == NULL) return -1;
336 : }
337 :
338 : for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) {
339 :
340 : // limit compound suggestion
341 : if (cpdsuggest > 0) oldSug = nsug;
342 :
343 : // perhaps we made a typical fault of spelling
344 : if ((nsug < maxSug) && (nsug > -1))
345 : nsug = replchars(wlst, word, nsug, cpdsuggest);
346 :
347 : // perhaps we made chose the wrong char from a related set
348 : if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs)))
349 : nsug = mapchars(wlst, word, nsug, cpdsuggest);
350 :
351 : if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;
352 :
353 : // perhaps we forgot to hit space and two words ran together
354 :
355 : if ((nsug < maxSug) && (nsug > -1) && (!cpdsuggest || (nsug < oldSug + maxcpdsugs)) && check_forbidden(word, strlen(word))) {
356 : nsug = twowords(wlst, word, nsug, cpdsuggest);
357 : }
358 :
359 : } // repeating ``for'' statement compounding support
360 :
361 : if (nsug < 0) {
362 : for (int i=0;i<maxSug; i++)
363 : if (wlst[i] != NULL) free(wlst[i]);
364 : free(wlst);
365 : return -1;
366 : }
367 :
368 : *slst = wlst;
369 : return nsug;
370 : }
371 : #endif // END OF HUNSPELL_EXPERIMENTAL CODE
372 :
373 : // suggestions for an uppercase word (html -> HTML)
374 0 : int SuggestMgr::capchars_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
375 : {
376 : char candidate[MAXSWUTF8L];
377 : w_char candidate_utf[MAXSWL];
378 0 : memcpy(candidate_utf, word, wl * sizeof(w_char));
379 0 : mkallcap_utf(candidate_utf, wl, langnum);
380 0 : u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
381 0 : return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
382 : }
383 :
384 : // suggestions for an uppercase word (html -> HTML)
385 0 : int SuggestMgr::capchars(char** wlst, const char * word, int ns, int cpdsuggest)
386 : {
387 : char candidate[MAXSWUTF8L];
388 0 : strcpy(candidate, word);
389 0 : mkallcap(candidate, csconv);
390 0 : return testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
391 : }
392 :
393 : // suggestions for when chose the wrong char out of a related set
394 0 : int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest)
395 : {
396 : char candidate[MAXSWUTF8L];
397 : clock_t timelimit;
398 : int timer;
399 0 : candidate[0] = '\0';
400 :
401 0 : int wl = strlen(word);
402 0 : if (wl < 2 || ! pAMgr) return ns;
403 :
404 0 : int nummap = pAMgr->get_nummap();
405 0 : struct mapentry* maptable = pAMgr->get_maptable();
406 0 : if (maptable==NULL) return ns;
407 :
408 0 : timelimit = clock();
409 0 : timer = MINTIMER;
410 0 : return map_related(word, (char *) &candidate, 0, 0, wlst, cpdsuggest, ns, maptable, nummap, &timer, &timelimit);
411 : }
412 :
413 0 : int SuggestMgr::map_related(const char * word, char * candidate, int wn, int cn,
414 : char** wlst, int cpdsuggest, int ns,
415 : const mapentry* maptable, int nummap, int * timer, clock_t * timelimit)
416 : {
417 0 : if (*(word + wn) == '\0') {
418 0 : int cwrd = 1;
419 0 : *(candidate + cn) = '\0';
420 0 : int wl = strlen(candidate);
421 0 : for (int m=0; m < ns; m++)
422 0 : if (strcmp(candidate, wlst[m]) == 0) cwrd = 0;
423 0 : if ((cwrd) && checkword(candidate, wl, cpdsuggest, timer, timelimit)) {
424 0 : if (ns < maxSug) {
425 0 : wlst[ns] = mystrdup(candidate);
426 0 : if (wlst[ns] == NULL) return -1;
427 0 : ns++;
428 : }
429 : }
430 0 : return ns;
431 : }
432 0 : int in_map = 0;
433 0 : for (int j = 0; j < nummap; j++) {
434 0 : for (int k = 0; k < maptable[j].len; k++) {
435 0 : int len = strlen(maptable[j].set[k]);
436 0 : if (strncmp(maptable[j].set[k], word + wn, len) == 0) {
437 0 : in_map = 1;
438 0 : for (int l = 0; l < maptable[j].len; l++) {
439 0 : strcpy(candidate + cn, maptable[j].set[l]);
440 0 : ns = map_related(word, candidate, wn + len, strlen(candidate), wlst,
441 0 : cpdsuggest, ns, maptable, nummap, timer, timelimit);
442 0 : if (!(*timer)) return ns;
443 : }
444 : }
445 : }
446 : }
447 0 : if (!in_map) {
448 0 : *(candidate + cn) = *(word + wn);
449 : ns = map_related(word, candidate, wn + 1, cn + 1, wlst, cpdsuggest,
450 0 : ns, maptable, nummap, timer, timelimit);
451 : }
452 0 : return ns;
453 : }
454 :
455 : // suggestions for a typical fault of spelling, that
456 : // differs with more, than 1 letter from the right form.
457 0 : int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest)
458 : {
459 : char candidate[MAXSWUTF8L];
460 : const char * r;
461 : int lenr, lenp;
462 0 : int wl = strlen(word);
463 0 : if (wl < 2 || ! pAMgr) return ns;
464 0 : int numrep = pAMgr->get_numrep();
465 0 : struct replentry* reptable = pAMgr->get_reptable();
466 0 : if (reptable==NULL) return ns;
467 0 : for (int i=0; i < numrep; i++ ) {
468 0 : r = word;
469 0 : lenr = strlen(reptable[i].pattern2);
470 0 : lenp = strlen(reptable[i].pattern);
471 : // search every occurence of the pattern in the word
472 0 : while ((r=strstr(r, reptable[i].pattern)) != NULL && (!reptable[i].end || strlen(r) == strlen(reptable[i].pattern)) &&
473 0 : (!reptable[i].start || r == word)) {
474 0 : strcpy(candidate, word);
475 0 : if (r-word + lenr + strlen(r+lenp) >= MAXSWUTF8L) break;
476 0 : strcpy(candidate+(r-word),reptable[i].pattern2);
477 0 : strcpy(candidate+(r-word)+lenr, r+lenp);
478 0 : ns = testsug(wlst, candidate, wl-lenp+lenr, ns, cpdsuggest, NULL, NULL);
479 0 : if (ns == -1) return -1;
480 : // check REP suggestions with space
481 0 : char * sp = strchr(candidate, ' ');
482 0 : if (sp) {
483 0 : char * prev = candidate;
484 0 : while (sp) {
485 0 : *sp = '\0';
486 0 : if (checkword(prev, strlen(prev), 0, NULL, NULL)) {
487 0 : int oldns = ns;
488 0 : *sp = ' ';
489 0 : ns = testsug(wlst, sp + 1, strlen(sp + 1), ns, cpdsuggest, NULL, NULL);
490 0 : if (ns == -1) return -1;
491 0 : if (oldns < ns) {
492 0 : free(wlst[ns - 1]);
493 0 : wlst[ns - 1] = mystrdup(candidate);
494 0 : if (!wlst[ns - 1]) return -1;
495 : }
496 : }
497 0 : *sp = ' ';
498 0 : prev = sp + 1;
499 0 : sp = strchr(prev, ' ');
500 : }
501 : }
502 0 : r++; // search for the next letter
503 : }
504 : }
505 0 : return ns;
506 : }
507 :
508 : // perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation)
509 0 : int SuggestMgr::doubletwochars(char** wlst, const char * word, int ns, int cpdsuggest)
510 : {
511 : char candidate[MAXSWUTF8L];
512 0 : int state=0;
513 0 : int wl = strlen(word);
514 0 : if (wl < 5 || ! pAMgr) return ns;
515 0 : for (int i=2; i < wl; i++ ) {
516 0 : if (word[i]==word[i-2]) {
517 0 : state++;
518 0 : if (state==3) {
519 0 : strcpy(candidate,word);
520 0 : strcpy(candidate+i-1,word+i+1);
521 0 : ns = testsug(wlst, candidate, wl-2, ns, cpdsuggest, NULL, NULL);
522 0 : if (ns == -1) return -1;
523 0 : state=0;
524 : }
525 : } else {
526 0 : state=0;
527 : }
528 : }
529 0 : return ns;
530 : }
531 :
532 : // perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation)
533 0 : int SuggestMgr::doubletwochars_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
534 : {
535 : w_char candidate_utf[MAXSWL];
536 : char candidate[MAXSWUTF8L];
537 0 : int state=0;
538 0 : if (wl < 5 || ! pAMgr) return ns;
539 0 : for (int i=2; i < wl; i++) {
540 0 : if (w_char_eq(word[i], word[i-2])) {
541 0 : state++;
542 0 : if (state==3) {
543 0 : memcpy(candidate_utf, word, (i - 1) * sizeof(w_char));
544 0 : memcpy(candidate_utf+i-1, word+i+1, (wl-i-1) * sizeof(w_char));
545 0 : u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl-2);
546 0 : ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
547 0 : if (ns == -1) return -1;
548 0 : state=0;
549 : }
550 : } else {
551 0 : state=0;
552 : }
553 : }
554 0 : return ns;
555 : }
556 :
557 : // error is wrong char in place of correct one (case and keyboard related version)
558 0 : int SuggestMgr::badcharkey(char ** wlst, const char * word, int ns, int cpdsuggest)
559 : {
560 : char tmpc;
561 : char candidate[MAXSWUTF8L];
562 0 : int wl = strlen(word);
563 0 : strcpy(candidate, word);
564 : // swap out each char one by one and try uppercase and neighbor
565 : // keyboard chars in its place to see if that makes a good word
566 :
567 0 : for (int i=0; i < wl; i++) {
568 0 : tmpc = candidate[i];
569 : // check with uppercase letters
570 0 : candidate[i] = csconv[((unsigned char)tmpc)].cupper;
571 0 : if (tmpc != candidate[i]) {
572 0 : ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
573 0 : if (ns == -1) return -1;
574 0 : candidate[i] = tmpc;
575 : }
576 : // check neighbor characters in keyboard string
577 0 : if (!ckey) continue;
578 0 : char * loc = strchr(ckey, tmpc);
579 0 : while (loc) {
580 0 : if ((loc > ckey) && (*(loc - 1) != '|')) {
581 0 : candidate[i] = *(loc - 1);
582 0 : ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
583 0 : if (ns == -1) return -1;
584 : }
585 0 : if ((*(loc + 1) != '|') && (*(loc + 1) != '\0')) {
586 0 : candidate[i] = *(loc + 1);
587 0 : ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
588 0 : if (ns == -1) return -1;
589 : }
590 0 : loc = strchr(loc + 1, tmpc);
591 : }
592 0 : candidate[i] = tmpc;
593 : }
594 0 : return ns;
595 : }
596 :
597 : // error is wrong char in place of correct one (case and keyboard related version)
598 0 : int SuggestMgr::badcharkey_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
599 : {
600 : w_char tmpc;
601 : w_char candidate_utf[MAXSWL];
602 : char candidate[MAXSWUTF8L];
603 0 : memcpy(candidate_utf, word, wl * sizeof(w_char));
604 : // swap out each char one by one and try all the tryme
605 : // chars in its place to see if that makes a good word
606 0 : for (int i=0; i < wl; i++) {
607 0 : tmpc = candidate_utf[i];
608 : // check with uppercase letters
609 0 : mkallcap_utf(candidate_utf + i, 1, langnum);
610 0 : if (!w_char_eq(tmpc, candidate_utf[i])) {
611 0 : u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
612 0 : ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
613 0 : if (ns == -1) return -1;
614 0 : candidate_utf[i] = tmpc;
615 : }
616 : // check neighbor characters in keyboard string
617 0 : if (!ckey) continue;
618 0 : w_char * loc = ckey_utf;
619 0 : while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc)) loc++;
620 0 : while (loc < (ckey_utf + ckeyl)) {
621 0 : if ((loc > ckey_utf) && !w_char_eq(*(loc - 1), W_VLINE)) {
622 0 : candidate_utf[i] = *(loc - 1);
623 0 : u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
624 0 : ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
625 0 : if (ns == -1) return -1;
626 : }
627 0 : if (((loc + 1) < (ckey_utf + ckeyl)) && !w_char_eq(*(loc + 1), W_VLINE)) {
628 0 : candidate_utf[i] = *(loc + 1);
629 0 : u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
630 0 : ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
631 0 : if (ns == -1) return -1;
632 : }
633 0 : do { loc++; } while ((loc < (ckey_utf + ckeyl)) && !w_char_eq(*loc, tmpc));
634 : }
635 0 : candidate_utf[i] = tmpc;
636 : }
637 0 : return ns;
638 : }
639 :
640 : // error is wrong char in place of correct one
641 0 : int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest)
642 : {
643 : char tmpc;
644 : char candidate[MAXSWUTF8L];
645 0 : clock_t timelimit = clock();
646 0 : int timer = MINTIMER;
647 0 : int wl = strlen(word);
648 0 : strcpy(candidate, word);
649 : // swap out each char one by one and try all the tryme
650 : // chars in its place to see if that makes a good word
651 0 : for (int j=0; j < ctryl; j++) {
652 0 : for (int i=wl-1; i >= 0; i--) {
653 0 : tmpc = candidate[i];
654 0 : if (ctry[j] == tmpc) continue;
655 0 : candidate[i] = ctry[j];
656 0 : ns = testsug(wlst, candidate, wl, ns, cpdsuggest, &timer, &timelimit);
657 0 : if (ns == -1) return -1;
658 0 : if (!timer) return ns;
659 0 : candidate[i] = tmpc;
660 : }
661 : }
662 0 : return ns;
663 : }
664 :
665 : // error is wrong char in place of correct one
666 0 : int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
667 : {
668 : w_char tmpc;
669 : w_char candidate_utf[MAXSWL];
670 : char candidate[MAXSWUTF8L];
671 0 : clock_t timelimit = clock();
672 0 : int timer = MINTIMER;
673 0 : memcpy(candidate_utf, word, wl * sizeof(w_char));
674 : // swap out each char one by one and try all the tryme
675 : // chars in its place to see if that makes a good word
676 0 : for (int j=0; j < ctryl; j++) {
677 0 : for (int i=wl-1; i >= 0; i--) {
678 0 : tmpc = candidate_utf[i];
679 0 : if (w_char_eq(tmpc, ctry_utf[j])) continue;
680 0 : candidate_utf[i] = ctry_utf[j];
681 0 : u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
682 0 : ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);
683 0 : if (ns == -1) return -1;
684 0 : if (!timer) return ns;
685 0 : candidate_utf[i] = tmpc;
686 : }
687 : }
688 0 : return ns;
689 : }
690 :
691 : // error is word has an extra letter it does not need
692 0 : int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
693 : {
694 : char candidate[MAXSWUTF8L];
695 : w_char candidate_utf[MAXSWL];
696 : w_char * p;
697 0 : w_char tmpc = W_VLINE; // not used value, only for VCC warning message
698 0 : if (wl < 2) return ns;
699 : // try omitting one char of word at a time
700 0 : memcpy(candidate_utf, word, wl * sizeof(w_char));
701 0 : for (p = candidate_utf + wl - 1; p >= candidate_utf; p--) {
702 0 : w_char tmpc2 = *p;
703 0 : if (p < candidate_utf + wl - 1) *p = tmpc;
704 0 : u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);
705 0 : ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
706 0 : if (ns == -1) return -1;
707 0 : tmpc = tmpc2;
708 : }
709 0 : return ns;
710 : }
711 :
712 : // error is word has an extra letter it does not need
713 0 : int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest)
714 : {
715 0 : char tmpc = '\0';
716 : char candidate[MAXSWUTF8L];
717 : char * p;
718 0 : int wl = strlen(word);
719 0 : if (wl < 2) return ns;
720 : // try omitting one char of word at a time
721 0 : strcpy (candidate, word);
722 0 : for (p = candidate + wl - 1; p >=candidate; p--) {
723 0 : char tmpc2 = *p;
724 0 : *p = tmpc;
725 0 : ns = testsug(wlst, candidate, wl-1, ns, cpdsuggest, NULL, NULL);
726 0 : if (ns == -1) return -1;
727 0 : tmpc = tmpc2;
728 : }
729 0 : return ns;
730 : }
731 :
732 : // error is missing a letter it needs
733 0 : int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest)
734 : {
735 : char candidate[MAXSWUTF8L];
736 : char * p;
737 0 : clock_t timelimit = clock();
738 0 : int timer = MINTIMER;
739 0 : int wl = strlen(word);
740 : // try inserting a tryme character before every letter (and the null terminator)
741 0 : for (int i = 0; i < ctryl; i++) {
742 0 : strcpy(candidate, word);
743 0 : for (p = candidate + wl; p >= candidate; p--) {
744 0 : *(p+1) = *p;
745 0 : *p = ctry[i];
746 0 : ns = testsug(wlst, candidate, wl+1, ns, cpdsuggest, &timer, &timelimit);
747 0 : if (ns == -1) return -1;
748 0 : if (!timer) return ns;
749 : }
750 : }
751 0 : return ns;
752 : }
753 :
754 : // error is missing a letter it needs
755 0 : int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
756 : {
757 : w_char candidate_utf[MAXSWL];
758 : char candidate[MAXSWUTF8L];
759 : w_char * p;
760 0 : clock_t timelimit = clock();
761 0 : int timer = MINTIMER;
762 : // try inserting a tryme character at the end of the word and before every letter
763 0 : for (int i = 0; i < ctryl; i++) {
764 0 : memcpy (candidate_utf, word, wl * sizeof(w_char));
765 0 : for (p = candidate_utf + wl; p >= candidate_utf; p--) {
766 0 : *(p + 1) = *p;
767 0 : *p = ctry_utf[i];
768 0 : u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
769 0 : ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, &timer, &timelimit);
770 0 : if (ns == -1) return -1;
771 0 : if (!timer) return ns;
772 : }
773 : }
774 0 : return ns;
775 : }
776 :
777 :
778 : /* error is should have been two words */
779 0 : int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest)
780 : {
781 : char candidate[MAXSWUTF8L];
782 : char * p;
783 : int c1, c2;
784 0 : int forbidden = 0;
785 : int cwrd;
786 :
787 0 : int wl=strlen(word);
788 0 : if (wl < 3) return ns;
789 :
790 0 : if (langnum == LANG_hu) forbidden = check_forbidden(word, wl);
791 :
792 0 : strcpy(candidate + 1, word);
793 : // split the string into two pieces after every char
794 : // if both pieces are good words make them a suggestion
795 0 : for (p = candidate + 1; p[1] != '\0'; p++) {
796 0 : p[-1] = *p;
797 : // go to end of the UTF-8 character
798 0 : while (utf8 && ((p[1] & 0xc0) == 0x80)) {
799 0 : *p = p[1];
800 0 : p++;
801 : }
802 0 : if (utf8 && p[1] == '\0') break; // last UTF-8 character
803 0 : *p = '\0';
804 0 : c1 = checkword(candidate,strlen(candidate), cpdsuggest, NULL, NULL);
805 0 : if (c1) {
806 0 : c2 = checkword((p+1),strlen(p+1), cpdsuggest, NULL, NULL);
807 0 : if (c2) {
808 0 : *p = ' ';
809 :
810 : // spec. Hungarian code (need a better compound word support)
811 0 : if ((langnum == LANG_hu) && !forbidden &&
812 : // if 3 repeating letter, use - instead of space
813 0 : (((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
814 : // or multiple compounding, with more, than 6 syllables
815 0 : ((c1 == 3) && (c2 >= 2)))) *p = '-';
816 :
817 0 : cwrd = 1;
818 0 : for (int k=0; k < ns; k++)
819 0 : if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
820 0 : if (ns < maxSug) {
821 0 : if (cwrd) {
822 0 : wlst[ns] = mystrdup(candidate);
823 0 : if (wlst[ns] == NULL) return -1;
824 0 : ns++;
825 : }
826 0 : } else return ns;
827 : // add two word suggestion with dash, if TRY string contains
828 : // "a" or "-"
829 : // NOTE: cwrd doesn't modified for REP twoword sugg.
830 0 : if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) &&
831 0 : mystrlen(p + 1) > 1 &&
832 0 : mystrlen(candidate) - mystrlen(p) > 1) {
833 0 : *p = '-';
834 0 : for (int k=0; k < ns; k++)
835 0 : if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
836 0 : if (ns < maxSug) {
837 0 : if (cwrd) {
838 0 : wlst[ns] = mystrdup(candidate);
839 0 : if (wlst[ns] == NULL) return -1;
840 0 : ns++;
841 : }
842 0 : } else return ns;
843 : }
844 : }
845 : }
846 : }
847 0 : return ns;
848 : }
849 :
850 :
851 : // error is adjacent letter were swapped
852 0 : int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest)
853 : {
854 : char candidate[MAXSWUTF8L];
855 : char * p;
856 : char tmpc;
857 0 : int wl=strlen(word);
858 : // try swapping adjacent chars one by one
859 0 : strcpy(candidate, word);
860 0 : for (p = candidate; p[1] != 0; p++) {
861 0 : tmpc = *p;
862 0 : *p = p[1];
863 0 : p[1] = tmpc;
864 0 : ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
865 0 : if (ns == -1) return -1;
866 0 : p[1] = *p;
867 0 : *p = tmpc;
868 : }
869 : // try double swaps for short words
870 : // ahev -> have, owudl -> would
871 0 : if (wl == 4 || wl == 5) {
872 0 : candidate[0] = word[1];
873 0 : candidate[1] = word[0];
874 0 : candidate[2] = word[2];
875 0 : candidate[wl - 2] = word[wl - 1];
876 0 : candidate[wl - 1] = word[wl - 2];
877 0 : ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
878 0 : if (ns == -1) return -1;
879 0 : if (wl == 5) {
880 0 : candidate[0] = word[0];
881 0 : candidate[1] = word[2];
882 0 : candidate[2] = word[1];
883 0 : ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
884 0 : if (ns == -1) return -1;
885 : }
886 : }
887 0 : return ns;
888 : }
889 :
890 : // error is adjacent letter were swapped
891 0 : int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
892 : {
893 : w_char candidate_utf[MAXSWL];
894 : char candidate[MAXSWUTF8L];
895 : w_char * p;
896 : w_char tmpc;
897 0 : int len = 0;
898 : // try swapping adjacent chars one by one
899 0 : memcpy (candidate_utf, word, wl * sizeof(w_char));
900 0 : for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) {
901 0 : tmpc = *p;
902 0 : *p = p[1];
903 0 : p[1] = tmpc;
904 0 : u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
905 0 : if (len == 0) len = strlen(candidate);
906 0 : ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
907 0 : if (ns == -1) return -1;
908 0 : p[1] = *p;
909 0 : *p = tmpc;
910 : }
911 : // try double swaps for short words
912 : // ahev -> have, owudl -> would, suodn -> sound
913 0 : if (wl == 4 || wl == 5) {
914 0 : candidate_utf[0] = word[1];
915 0 : candidate_utf[1] = word[0];
916 0 : candidate_utf[2] = word[2];
917 0 : candidate_utf[wl - 2] = word[wl - 1];
918 0 : candidate_utf[wl - 1] = word[wl - 2];
919 0 : u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
920 0 : ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
921 0 : if (ns == -1) return -1;
922 0 : if (wl == 5) {
923 0 : candidate_utf[0] = word[0];
924 0 : candidate_utf[1] = word[2];
925 0 : candidate_utf[2] = word[1];
926 0 : u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
927 0 : ns = testsug(wlst, candidate, len, ns, cpdsuggest, NULL, NULL);
928 0 : if (ns == -1) return -1;
929 : }
930 : }
931 0 : return ns;
932 : }
933 :
934 : // error is not adjacent letter were swapped
935 0 : int SuggestMgr::longswapchar(char ** wlst, const char * word, int ns, int cpdsuggest)
936 : {
937 : char candidate[MAXSWUTF8L];
938 : char * p;
939 : char * q;
940 : char tmpc;
941 0 : int wl=strlen(word);
942 : // try swapping not adjacent chars one by one
943 0 : strcpy(candidate, word);
944 0 : for (p = candidate; *p != 0; p++) {
945 0 : for (q = candidate; *q != 0; q++) {
946 0 : if (abs((int)(p-q)) > 1) {
947 0 : tmpc = *p;
948 0 : *p = *q;
949 0 : *q = tmpc;
950 0 : ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
951 0 : if (ns == -1) return -1;
952 0 : *q = *p;
953 0 : *p = tmpc;
954 : }
955 : }
956 : }
957 0 : return ns;
958 : }
959 :
960 :
961 : // error is adjacent letter were swapped
962 0 : int SuggestMgr::longswapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
963 : {
964 : w_char candidate_utf[MAXSWL];
965 : char candidate[MAXSWUTF8L];
966 : w_char * p;
967 : w_char * q;
968 : w_char tmpc;
969 : // try swapping not adjacent chars
970 0 : memcpy (candidate_utf, word, wl * sizeof(w_char));
971 0 : for (p = candidate_utf; p < (candidate_utf + wl); p++) {
972 0 : for (q = candidate_utf; q < (candidate_utf + wl); q++) {
973 0 : if (abs((int)(p-q)) > 1) {
974 0 : tmpc = *p;
975 0 : *p = *q;
976 0 : *q = tmpc;
977 0 : u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
978 0 : ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
979 0 : if (ns == -1) return -1;
980 0 : *q = *p;
981 0 : *p = tmpc;
982 : }
983 : }
984 : }
985 0 : return ns;
986 : }
987 :
988 : // error is a letter was moved
989 0 : int SuggestMgr::movechar(char ** wlst, const char * word, int ns, int cpdsuggest)
990 : {
991 : char candidate[MAXSWUTF8L];
992 : char * p;
993 : char * q;
994 : char tmpc;
995 :
996 0 : int wl=strlen(word);
997 : // try moving a char
998 0 : strcpy(candidate, word);
999 0 : for (p = candidate; *p != 0; p++) {
1000 0 : for (q = p + 1; (*q != 0) && ((q - p) < 10); q++) {
1001 0 : tmpc = *(q-1);
1002 0 : *(q-1) = *q;
1003 0 : *q = tmpc;
1004 0 : if ((q-p) < 2) continue; // omit swap char
1005 0 : ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
1006 0 : if (ns == -1) return -1;
1007 : }
1008 0 : strcpy(candidate, word);
1009 : }
1010 0 : for (p = candidate + wl - 1; p > candidate; p--) {
1011 0 : for (q = p - 1; (q >= candidate) && ((p - q) < 10); q--) {
1012 0 : tmpc = *(q+1);
1013 0 : *(q+1) = *q;
1014 0 : *q = tmpc;
1015 0 : if ((p-q) < 2) continue; // omit swap char
1016 0 : ns = testsug(wlst, candidate, wl, ns, cpdsuggest, NULL, NULL);
1017 0 : if (ns == -1) return -1;
1018 : }
1019 0 : strcpy(candidate, word);
1020 : }
1021 0 : return ns;
1022 : }
1023 :
1024 : // error is a letter was moved
1025 0 : int SuggestMgr::movechar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
1026 : {
1027 : w_char candidate_utf[MAXSWL];
1028 : char candidate[MAXSWUTF8L];
1029 : w_char * p;
1030 : w_char * q;
1031 : w_char tmpc;
1032 : // try moving a char
1033 0 : memcpy (candidate_utf, word, wl * sizeof(w_char));
1034 0 : for (p = candidate_utf; p < (candidate_utf + wl); p++) {
1035 0 : for (q = p + 1; (q < (candidate_utf + wl)) && ((q - p) < 10); q++) {
1036 0 : tmpc = *(q-1);
1037 0 : *(q-1) = *q;
1038 0 : *q = tmpc;
1039 0 : if ((q-p) < 2) continue; // omit swap char
1040 0 : u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
1041 0 : ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
1042 0 : if (ns == -1) return -1;
1043 : }
1044 0 : memcpy (candidate_utf, word, wl * sizeof(w_char));
1045 : }
1046 0 : for (p = candidate_utf + wl - 1; p > candidate_utf; p--) {
1047 0 : for (q = p - 1; (q >= candidate_utf) && ((p - q) < 10); q--) {
1048 0 : tmpc = *(q+1);
1049 0 : *(q+1) = *q;
1050 0 : *q = tmpc;
1051 0 : if ((p-q) < 2) continue; // omit swap char
1052 0 : u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
1053 0 : ns = testsug(wlst, candidate, strlen(candidate), ns, cpdsuggest, NULL, NULL);
1054 0 : if (ns == -1) return -1;
1055 : }
1056 0 : memcpy (candidate_utf, word, wl * sizeof(w_char));
1057 : }
1058 0 : return ns;
1059 : }
1060 :
1061 : // generate a set of suggestions for very poorly spelled words
1062 0 : int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr** pHMgr, int md)
1063 : {
1064 :
1065 : int i, j;
1066 : int lval;
1067 : int sc, scphon;
1068 : int lp, lpphon;
1069 0 : int nonbmp = 0;
1070 :
1071 : // exhaustively search through all root words
1072 : // keeping track of the MAX_ROOTS most similar root words
1073 : struct hentry * roots[MAX_ROOTS];
1074 : char * rootsphon[MAX_ROOTS];
1075 : int scores[MAX_ROOTS];
1076 : int scoresphon[MAX_ROOTS];
1077 0 : for (i = 0; i < MAX_ROOTS; i++) {
1078 0 : roots[i] = NULL;
1079 0 : scores[i] = -100 * i;
1080 0 : rootsphon[i] = NULL;
1081 0 : scoresphon[i] = -100 * i;
1082 : }
1083 0 : lp = MAX_ROOTS - 1;
1084 0 : lpphon = MAX_ROOTS - 1;
1085 0 : scphon = -20000;
1086 0 : int low = NGRAM_LOWERING;
1087 :
1088 : char w2[MAXWORDUTF8LEN];
1089 : char f[MAXSWUTF8L];
1090 0 : char * word = w;
1091 :
1092 : // word reversing wrapper for complex prefixes
1093 0 : if (complexprefixes) {
1094 0 : strcpy(w2, w);
1095 0 : if (utf8) reverseword_utf(w2); else reverseword(w2);
1096 0 : word = w2;
1097 : }
1098 :
1099 : char mw[MAXSWUTF8L];
1100 : w_char u8[MAXSWL];
1101 0 : int nc = strlen(word);
1102 0 : int n = (utf8) ? u8_u16(u8, MAXSWL, word) : nc;
1103 :
1104 : // set character based ngram suggestion for words with non-BMP Unicode characters
1105 0 : if (n == -1) {
1106 0 : utf8 = 0; // XXX not state-free
1107 0 : n = nc;
1108 0 : nonbmp = 1;
1109 0 : low = 0;
1110 : }
1111 :
1112 0 : struct hentry* hp = NULL;
1113 0 : int col = -1;
1114 0 : phonetable * ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;
1115 : char target[MAXSWUTF8L];
1116 : char candidate[MAXSWUTF8L];
1117 0 : if (ph) {
1118 0 : if (utf8) {
1119 : w_char _w[MAXSWL];
1120 0 : int _wl = u8_u16(_w, MAXSWL, word);
1121 0 : mkallcap_utf(_w, _wl, langnum);
1122 0 : u16_u8(candidate, MAXSWUTF8L, _w, _wl);
1123 : } else {
1124 0 : strcpy(candidate, word);
1125 0 : if (!nonbmp) mkallcap(candidate, csconv);
1126 : }
1127 0 : phonet(candidate, target, nc, *ph); // XXX phonet() is 8-bit (nc, not n)
1128 : }
1129 :
1130 0 : FLAG forbiddenword = pAMgr ? pAMgr->get_forbiddenword() : FLAG_NULL;
1131 0 : FLAG nosuggest = pAMgr ? pAMgr->get_nosuggest() : FLAG_NULL;
1132 0 : FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL;
1133 0 : FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL;
1134 :
1135 0 : for (i = 0; i < md; i++) {
1136 0 : while (0 != (hp = (pHMgr[i])->walk_hashtable(col, hp))) {
1137 0 : if ((hp->astr) && (pAMgr) &&
1138 0 : (TESTAFF(hp->astr, forbiddenword, hp->alen) ||
1139 0 : TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||
1140 0 : TESTAFF(hp->astr, nosuggest, hp->alen) ||
1141 0 : TESTAFF(hp->astr, nongramsuggest, hp->alen) ||
1142 0 : TESTAFF(hp->astr, onlyincompound, hp->alen))) continue;
1143 :
1144 0 : sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +
1145 0 : leftcommonsubstring(word, HENTRY_WORD(hp));
1146 :
1147 : // check special pronounciation
1148 0 : if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
1149 0 : int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
1150 0 : + leftcommonsubstring(word, f);
1151 0 : if (sc2 > sc) sc = sc2;
1152 : }
1153 :
1154 0 : scphon = -20000;
1155 0 : if (ph && (sc > 2) && (abs(n - (int) hp->clen) <= 3)) {
1156 : char target2[MAXSWUTF8L];
1157 0 : if (utf8) {
1158 : w_char _w[MAXSWL];
1159 0 : int _wl = u8_u16(_w, MAXSWL, HENTRY_WORD(hp));
1160 0 : mkallcap_utf(_w, _wl, langnum);
1161 0 : u16_u8(candidate, MAXSWUTF8L, _w, _wl);
1162 : } else {
1163 0 : strcpy(candidate, HENTRY_WORD(hp));
1164 0 : mkallcap(candidate, csconv);
1165 : }
1166 0 : phonet(candidate, target2, -1, *ph);
1167 0 : scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE);
1168 : }
1169 :
1170 0 : if (sc > scores[lp]) {
1171 0 : scores[lp] = sc;
1172 0 : roots[lp] = hp;
1173 0 : lval = sc;
1174 0 : for (j=0; j < MAX_ROOTS; j++)
1175 0 : if (scores[j] < lval) {
1176 0 : lp = j;
1177 0 : lval = scores[j];
1178 : }
1179 : }
1180 :
1181 :
1182 0 : if (scphon > scoresphon[lpphon]) {
1183 0 : scoresphon[lpphon] = scphon;
1184 0 : rootsphon[lpphon] = HENTRY_WORD(hp);
1185 0 : lval = scphon;
1186 0 : for (j=0; j < MAX_ROOTS; j++)
1187 0 : if (scoresphon[j] < lval) {
1188 0 : lpphon = j;
1189 0 : lval = scoresphon[j];
1190 : }
1191 : }
1192 : }}
1193 :
1194 : // find minimum threshold for a passable suggestion
1195 : // mangle original word three differnt ways
1196 : // and score them to generate a minimum acceptable score
1197 0 : int thresh = 0;
1198 0 : for (int sp = 1; sp < 4; sp++) {
1199 0 : if (utf8) {
1200 0 : for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*';
1201 0 : u16_u8(mw, MAXSWUTF8L, u8, n);
1202 0 : thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
1203 : } else {
1204 0 : strcpy(mw, word);
1205 0 : for (int k=sp; k < n; k+=4) *(mw + k) = '*';
1206 0 : thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
1207 : }
1208 : }
1209 0 : thresh = thresh / 3;
1210 0 : thresh--;
1211 :
1212 : // now expand affixes on each of these root words and
1213 : // and use length adjusted ngram scores to select
1214 : // possible suggestions
1215 : char * guess[MAX_GUESS];
1216 : char * guessorig[MAX_GUESS];
1217 : int gscore[MAX_GUESS];
1218 0 : for(i=0;i<MAX_GUESS;i++) {
1219 0 : guess[i] = NULL;
1220 0 : guessorig[i] = NULL;
1221 0 : gscore[i] = -100 * i;
1222 : }
1223 :
1224 0 : lp = MAX_GUESS - 1;
1225 :
1226 : struct guessword * glst;
1227 0 : glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword));
1228 0 : if (! glst) {
1229 0 : if (nonbmp) utf8 = 1;
1230 0 : return ns;
1231 : }
1232 :
1233 0 : for (i = 0; i < MAX_ROOTS; i++) {
1234 0 : if (roots[i]) {
1235 0 : struct hentry * rp = roots[i];
1236 : int nw = pAMgr->expand_rootword(glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen,
1237 : rp->astr, rp->alen, word, nc,
1238 0 : ((rp->var & H_OPT_PHON) ? copy_field(f, HENTRY_DATA(rp), MORPH_PHON) : NULL));
1239 :
1240 0 : for (int k = 0; k < nw ; k++) {
1241 0 : sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) +
1242 0 : leftcommonsubstring(word, glst[k].word);
1243 :
1244 0 : if (sc > thresh) {
1245 0 : if (sc > gscore[lp]) {
1246 0 : if (guess[lp]) {
1247 0 : free (guess[lp]);
1248 0 : if (guessorig[lp]) {
1249 0 : free(guessorig[lp]);
1250 0 : guessorig[lp] = NULL;
1251 : }
1252 : }
1253 0 : gscore[lp] = sc;
1254 0 : guess[lp] = glst[k].word;
1255 0 : guessorig[lp] = glst[k].orig;
1256 0 : lval = sc;
1257 0 : for (j=0; j < MAX_GUESS; j++)
1258 0 : if (gscore[j] < lval) {
1259 0 : lp = j;
1260 0 : lval = gscore[j];
1261 : }
1262 : } else {
1263 0 : free(glst[k].word);
1264 0 : if (glst[k].orig) free(glst[k].orig);
1265 : }
1266 : } else {
1267 0 : free(glst[k].word);
1268 0 : if (glst[k].orig) free(glst[k].orig);
1269 : }
1270 : }
1271 : }
1272 : }
1273 0 : free(glst);
1274 :
1275 : // now we are done generating guesses
1276 : // sort in order of decreasing score
1277 :
1278 :
1279 0 : bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
1280 0 : if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
1281 :
1282 : // weight suggestions with a similarity index, based on
1283 : // the longest common subsequent algorithm and resort
1284 :
1285 0 : int is_swap = 0;
1286 0 : int re = 0;
1287 0 : double fact = 1.0;
1288 0 : if (pAMgr) {
1289 0 : int maxd = pAMgr->get_maxdiff();
1290 0 : if (maxd >= 0) fact = (10.0 - maxd)/5.0;
1291 : }
1292 :
1293 0 : for (i=0; i < MAX_GUESS; i++) {
1294 0 : if (guess[i]) {
1295 : // lowering guess[i]
1296 : char gl[MAXSWUTF8L];
1297 : int len;
1298 0 : if (utf8) {
1299 : w_char _w[MAXSWL];
1300 0 : len = u8_u16(_w, MAXSWL, guess[i]);
1301 0 : mkallsmall_utf(_w, len, langnum);
1302 0 : u16_u8(gl, MAXSWUTF8L, _w, len);
1303 : } else {
1304 0 : strcpy(gl, guess[i]);
1305 0 : if (!nonbmp) mkallsmall(gl, csconv);
1306 0 : len = strlen(guess[i]);
1307 : }
1308 :
1309 0 : int _lcs = lcslen(word, gl);
1310 :
1311 : // same characters with different casing
1312 0 : if ((n == len) && (n == _lcs)) {
1313 0 : gscore[i] += 2000;
1314 0 : break;
1315 : }
1316 : // using 2-gram instead of 3, and other weightening
1317 :
1318 0 : re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
1319 0 : ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
1320 :
1321 : gscore[i] =
1322 : // length of longest common subsequent minus length difference
1323 0 : 2 * _lcs - abs((int) (n - len)) +
1324 : // weight length of the left common substring
1325 0 : leftcommonsubstring(word, gl) +
1326 : // weight equal character positions
1327 0 : (!nonbmp && commoncharacterpositions(word, gl, &is_swap) ? 1: 0) +
1328 : // swap character (not neighboring)
1329 : ((is_swap) ? 10 : 0) +
1330 : // ngram
1331 0 : ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) +
1332 : // weighted ngrams
1333 : re +
1334 : // different limit for dictionaries with PHONE rules
1335 0 : (ph ? (re < len * fact ? -1000 : 0) : (re < (n + len)*fact? -1000 : 0));
1336 : }
1337 : }
1338 :
1339 0 : bubblesort(&guess[0], &guessorig[0], &gscore[0], MAX_GUESS);
1340 :
1341 : // phonetic version
1342 0 : if (ph) for (i=0; i < MAX_ROOTS; i++) {
1343 0 : if (rootsphon[i]) {
1344 : // lowering rootphon[i]
1345 : char gl[MAXSWUTF8L];
1346 : int len;
1347 0 : if (utf8) {
1348 : w_char _w[MAXSWL];
1349 0 : len = u8_u16(_w, MAXSWL, rootsphon[i]);
1350 0 : mkallsmall_utf(_w, len, langnum);
1351 0 : u16_u8(gl, MAXSWUTF8L, _w, len);
1352 : } else {
1353 0 : strcpy(gl, rootsphon[i]);
1354 0 : if (!nonbmp) mkallsmall(gl, csconv);
1355 0 : len = strlen(rootsphon[i]);
1356 : }
1357 :
1358 : // heuristic weigthing of ngram scores
1359 0 : scoresphon[i] += 2 * lcslen(word, gl) - abs((int) (n - len)) +
1360 : // weight length of the left common substring
1361 0 : leftcommonsubstring(word, gl);
1362 : }
1363 : }
1364 :
1365 0 : if (ph) bubblesort(&rootsphon[0], NULL, &scoresphon[0], MAX_ROOTS);
1366 :
1367 : // copy over
1368 0 : int oldns = ns;
1369 :
1370 0 : int same = 0;
1371 0 : for (i=0; i < MAX_GUESS; i++) {
1372 0 : if (guess[i]) {
1373 0 : if ((ns < oldns + maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) {
1374 0 : int unique = 1;
1375 : // leave only excellent suggestions, if exists
1376 0 : if (gscore[i] > 1000) same = 1; else if (gscore[i] < -100) {
1377 0 : same = 1;
1378 : // keep the best ngram suggestions, unless in ONLYMAXDIFF mode
1379 0 : if (ns > oldns || (pAMgr && pAMgr->get_onlymaxdiff())) {
1380 0 : free(guess[i]);
1381 0 : if (guessorig[i]) free(guessorig[i]);
1382 0 : continue;
1383 : }
1384 : }
1385 0 : for (j = 0; j < ns; j++) {
1386 : // don't suggest previous suggestions or a previous suggestion with prefixes or affixes
1387 0 : if ((!guessorig[i] && strstr(guess[i], wlst[j])) ||
1388 0 : (guessorig[i] && strstr(guessorig[i], wlst[j])) ||
1389 : // check forbidden words
1390 0 : !checkword(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0;
1391 : }
1392 0 : if (unique) {
1393 0 : wlst[ns++] = guess[i];
1394 0 : if (guessorig[i]) {
1395 0 : free(guess[i]);
1396 0 : wlst[ns-1] = guessorig[i];
1397 : }
1398 : } else {
1399 0 : free(guess[i]);
1400 0 : if (guessorig[i]) free(guessorig[i]);
1401 0 : }
1402 : } else {
1403 0 : free(guess[i]);
1404 0 : if (guessorig[i]) free(guessorig[i]);
1405 : }
1406 : }
1407 : }
1408 :
1409 0 : oldns = ns;
1410 0 : if (ph) for (i=0; i < MAX_ROOTS; i++) {
1411 0 : if (rootsphon[i]) {
1412 0 : if ((ns < oldns + MAXPHONSUGS) && (ns < maxSug)) {
1413 0 : int unique = 1;
1414 0 : for (j = 0; j < ns; j++) {
1415 : // don't suggest previous suggestions or a previous suggestion with prefixes or affixes
1416 0 : if (strstr(rootsphon[i], wlst[j]) ||
1417 : // check forbidden words
1418 0 : !checkword(rootsphon[i], strlen(rootsphon[i]), 0, NULL, NULL)) unique = 0;
1419 : }
1420 0 : if (unique) {
1421 0 : wlst[ns++] = mystrdup(rootsphon[i]);
1422 0 : if (!wlst[ns - 1]) return ns - 1;
1423 : }
1424 : }
1425 : }
1426 : }
1427 :
1428 0 : if (nonbmp) utf8 = 1;
1429 0 : return ns;
1430 : }
1431 :
1432 :
1433 : // see if a candidate suggestion is spelled correctly
1434 : // needs to check both root words and words with affixes
1435 :
1436 : // obsolote MySpell-HU modifications:
1437 : // return value 2 and 3 marks compounding with hyphen (-)
1438 : // `3' marks roots without suffix
1439 0 : int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * timer, clock_t * timelimit)
1440 : {
1441 0 : struct hentry * rv=NULL;
1442 0 : struct hentry * rv2=NULL;
1443 0 : int nosuffix = 0;
1444 :
1445 : // check time limit
1446 0 : if (timer) {
1447 0 : (*timer)--;
1448 0 : if (!(*timer) && timelimit) {
1449 0 : if ((clock() - *timelimit) > TIMELIMIT) return 0;
1450 0 : *timer = MAXPLUSTIMER;
1451 : }
1452 : }
1453 :
1454 0 : if (pAMgr) {
1455 0 : if (cpdsuggest==1) {
1456 0 : if (pAMgr->get_compound()) {
1457 0 : rv = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 1, 0); //EXT
1458 0 : if (rv && (!(rv2 = pAMgr->lookup(word)) || !rv2->astr ||
1459 0 : !(TESTAFF(rv2->astr,pAMgr->get_forbiddenword(),rv2->alen) ||
1460 0 : TESTAFF(rv2->astr,pAMgr->get_nosuggest(),rv2->alen)))) return 3; // XXX obsolote categorisation + only ICONV needs affix flag check?
1461 : }
1462 0 : return 0;
1463 : }
1464 :
1465 0 : rv = pAMgr->lookup(word);
1466 :
1467 0 : if (rv) {
1468 0 : if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)
1469 0 : || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0;
1470 0 : while (rv) {
1471 0 : if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) ||
1472 0 : TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1473 0 : TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {
1474 0 : rv = rv->next_homonym;
1475 0 : } else break;
1476 : }
1477 0 : } else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX
1478 :
1479 0 : if (rv) {
1480 0 : nosuffix=1;
1481 : } else {
1482 0 : rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, NULL); // only suffix
1483 : }
1484 :
1485 0 : if (!rv && pAMgr->have_contclass()) {
1486 0 : rv = pAMgr->suffix_check_twosfx(word, len, 0, NULL, FLAG_NULL);
1487 0 : if (!rv) rv = pAMgr->prefix_check_twosfx(word, len, 1, FLAG_NULL);
1488 : }
1489 :
1490 : // check forbidden words
1491 0 : if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) ||
1492 0 : TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
1493 0 : TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) ||
1494 0 : TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0;
1495 :
1496 0 : if (rv) { // XXX obsolote
1497 0 : if ((pAMgr->get_compoundflag()) &&
1498 0 : TESTAFF(rv->astr, pAMgr->get_compoundflag(), rv->alen)) return 2 + nosuffix;
1499 0 : return 1;
1500 : }
1501 : }
1502 0 : return 0;
1503 : }
1504 :
1505 0 : int SuggestMgr::check_forbidden(const char * word, int len)
1506 : {
1507 0 : struct hentry * rv = NULL;
1508 :
1509 0 : if (pAMgr) {
1510 0 : rv = pAMgr->lookup(word);
1511 0 : if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) ||
1512 0 : TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;
1513 0 : if (!(pAMgr->prefix_check(word,len,1)))
1514 0 : rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix
1515 : // check forbidden words
1516 0 : if ((rv) && (rv->astr) && TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)) return 1;
1517 : }
1518 0 : return 0;
1519 : }
1520 :
1521 : #ifdef HUNSPELL_EXPERIMENTAL
1522 : // suggest possible stems
1523 : int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug)
1524 : {
1525 : char ** wlst;
1526 :
1527 : struct hentry * rv = NULL;
1528 :
1529 : char w2[MAXSWUTF8L];
1530 : const char * word = w;
1531 :
1532 : // word reversing wrapper for complex prefixes
1533 : if (complexprefixes) {
1534 : strcpy(w2, w);
1535 : if (utf8) reverseword_utf(w2); else reverseword(w2);
1536 : word = w2;
1537 : }
1538 :
1539 : int wl = strlen(word);
1540 :
1541 :
1542 : if (*slst) {
1543 : wlst = *slst;
1544 : } else {
1545 : wlst = (char **) calloc(maxSug, sizeof(char *));
1546 : if (wlst == NULL) return -1;
1547 : }
1548 :
1549 : rv = pAMgr->suffix_check(word, wl, 0, NULL, wlst, maxSug, &nsug);
1550 :
1551 : // delete dash from end of word
1552 : if (nsug > 0) {
1553 : for (int j=0; j < nsug; j++) {
1554 : if (wlst[j][strlen(wlst[j]) - 1] == '-') wlst[j][strlen(wlst[j]) - 1] = '\0';
1555 : }
1556 : }
1557 :
1558 : *slst = wlst;
1559 : return nsug;
1560 : }
1561 : #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1562 :
1563 :
1564 0 : char * SuggestMgr::suggest_morph(const char * w)
1565 : {
1566 : char result[MAXLNLEN];
1567 0 : char * r = (char *) result;
1568 : char * st;
1569 :
1570 0 : struct hentry * rv = NULL;
1571 :
1572 0 : *result = '\0';
1573 :
1574 0 : if (! pAMgr) return NULL;
1575 :
1576 : char w2[MAXSWUTF8L];
1577 0 : const char * word = w;
1578 :
1579 : // word reversing wrapper for complex prefixes
1580 0 : if (complexprefixes) {
1581 0 : strcpy(w2, w);
1582 0 : if (utf8) reverseword_utf(w2); else reverseword(w2);
1583 0 : word = w2;
1584 : }
1585 :
1586 0 : rv = pAMgr->lookup(word);
1587 :
1588 0 : while (rv) {
1589 0 : if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
1590 0 : TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) ||
1591 0 : TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) {
1592 0 : if (!HENTRY_FIND(rv, MORPH_STEM)) {
1593 0 : mystrcat(result, " ", MAXLNLEN);
1594 0 : mystrcat(result, MORPH_STEM, MAXLNLEN);
1595 0 : mystrcat(result, word, MAXLNLEN);
1596 : }
1597 0 : if (HENTRY_DATA(rv)) {
1598 0 : mystrcat(result, " ", MAXLNLEN);
1599 0 : mystrcat(result, HENTRY_DATA2(rv), MAXLNLEN);
1600 : }
1601 0 : mystrcat(result, "\n", MAXLNLEN);
1602 : }
1603 0 : rv = rv->next_homonym;
1604 : }
1605 :
1606 0 : st = pAMgr->affix_check_morph(word,strlen(word));
1607 0 : if (st) {
1608 0 : mystrcat(result, st, MAXLNLEN);
1609 0 : free(st);
1610 : }
1611 :
1612 0 : if (pAMgr->get_compound() && (*result == '\0'))
1613 0 : pAMgr->compound_check_morph(word, strlen(word),
1614 0 : 0, 0, 100, 0,NULL, 0, &r, NULL);
1615 :
1616 0 : return (*result) ? mystrdup(line_uniq(result, MSEP_REC)) : NULL;
1617 : }
1618 :
1619 : #ifdef HUNSPELL_EXPERIMENTAL
1620 : char * SuggestMgr::suggest_morph_for_spelling_error(const char * word)
1621 : {
1622 : char * p = NULL;
1623 : char ** wlst = (char **) calloc(maxSug, sizeof(char *));
1624 : if (!**wlst) return NULL;
1625 : // we will use only the first suggestion
1626 : for (int i = 0; i < maxSug - 1; i++) wlst[i] = "";
1627 : int ns = suggest(&wlst, word, maxSug - 1, NULL);
1628 : if (ns == maxSug) {
1629 : p = suggest_morph(wlst[maxSug - 1]);
1630 : free(wlst[maxSug - 1]);
1631 : }
1632 : if (wlst) free(wlst);
1633 : return p;
1634 : }
1635 : #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1636 :
1637 : /* affixation */
1638 0 : char * SuggestMgr::suggest_hentry_gen(hentry * rv, char * pattern)
1639 : {
1640 : char result[MAXLNLEN];
1641 0 : *result = '\0';
1642 0 : int sfxcount = get_sfxcount(pattern);
1643 :
1644 0 : if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL;
1645 :
1646 0 : if (HENTRY_DATA(rv)) {
1647 : char * aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen,
1648 0 : HENTRY_DATA(rv), pattern, 0);
1649 0 : if (aff) {
1650 0 : mystrcat(result, aff, MAXLNLEN);
1651 0 : mystrcat(result, "\n", MAXLNLEN);
1652 0 : free(aff);
1653 : }
1654 : }
1655 :
1656 : // check all allomorphs
1657 : char allomorph[MAXLNLEN];
1658 0 : char * p = NULL;
1659 0 : if (HENTRY_DATA(rv)) p = (char *) strstr(HENTRY_DATA2(rv), MORPH_ALLOMORPH);
1660 0 : while (p) {
1661 0 : struct hentry * rv2 = NULL;
1662 0 : p += MORPH_TAG_LEN;
1663 0 : int plen = fieldlen(p);
1664 0 : strncpy(allomorph, p, plen);
1665 0 : allomorph[plen] = '\0';
1666 0 : rv2 = pAMgr->lookup(allomorph);
1667 0 : while (rv2) {
1668 : // if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount) {
1669 0 : if (HENTRY_DATA(rv2)) {
1670 0 : char * st = (char *) strstr(HENTRY_DATA2(rv2), MORPH_STEM);
1671 0 : if (st && (strncmp(st + MORPH_TAG_LEN,
1672 0 : HENTRY_WORD(rv), fieldlen(st + MORPH_TAG_LEN)) == 0)) {
1673 : char * aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr, rv2->alen,
1674 0 : HENTRY_DATA(rv2), pattern, 0);
1675 0 : if (aff) {
1676 0 : mystrcat(result, aff, MAXLNLEN);
1677 0 : mystrcat(result, "\n", MAXLNLEN);
1678 0 : free(aff);
1679 : }
1680 : }
1681 : }
1682 0 : rv2 = rv2->next_homonym;
1683 : }
1684 0 : p = strstr(p + plen, MORPH_ALLOMORPH);
1685 : }
1686 :
1687 0 : return (*result) ? mystrdup(result) : NULL;
1688 : }
1689 :
1690 0 : char * SuggestMgr::suggest_gen(char ** desc, int n, char * pattern) {
1691 : char result[MAXLNLEN];
1692 : char result2[MAXLNLEN];
1693 : char newpattern[MAXLNLEN];
1694 0 : *newpattern = '\0';
1695 0 : if (n == 0) return 0;
1696 0 : *result2 = '\0';
1697 0 : struct hentry * rv = NULL;
1698 0 : if (!pAMgr) return NULL;
1699 :
1700 : // search affixed forms with and without derivational suffixes
1701 0 : while(1) {
1702 :
1703 0 : for (int k = 0; k < n; k++) {
1704 0 : *result = '\0';
1705 : // add compound word parts (except the last one)
1706 0 : char * s = (char *) desc[k];
1707 0 : char * part = strstr(s, MORPH_PART);
1708 0 : if (part) {
1709 0 : char * nextpart = strstr(part + 1, MORPH_PART);
1710 0 : while (nextpart) {
1711 0 : copy_field(result + strlen(result), part, MORPH_PART);
1712 0 : part = nextpart;
1713 0 : nextpart = strstr(part + 1, MORPH_PART);
1714 : }
1715 0 : s = part;
1716 : }
1717 :
1718 : char **pl;
1719 : char tok[MAXLNLEN];
1720 0 : strcpy(tok, s);
1721 0 : char * alt = strstr(tok, " | ");
1722 0 : while (alt) {
1723 0 : alt[1] = MSEP_ALT;
1724 0 : alt = strstr(alt, " | ");
1725 : }
1726 0 : int pln = line_tok(tok, &pl, MSEP_ALT);
1727 0 : for (int i = 0; i < pln; i++) {
1728 : // remove inflectional and terminal suffixes
1729 0 : char * is = strstr(pl[i], MORPH_INFL_SFX);
1730 0 : if (is) *is = '\0';
1731 0 : char * ts = strstr(pl[i], MORPH_TERM_SFX);
1732 0 : while (ts) {
1733 0 : *ts = '_';
1734 0 : ts = strstr(pl[i], MORPH_TERM_SFX);
1735 : }
1736 0 : char * st = strstr(s, MORPH_STEM);
1737 0 : if (st) {
1738 0 : copy_field(tok, st, MORPH_STEM);
1739 0 : rv = pAMgr->lookup(tok);
1740 0 : while (rv) {
1741 : char newpat[MAXLNLEN];
1742 0 : strcpy(newpat, pl[i]);
1743 0 : strcat(newpat, pattern);
1744 0 : char * sg = suggest_hentry_gen(rv, newpat);
1745 0 : if (!sg) sg = suggest_hentry_gen(rv, pattern);
1746 0 : if (sg) {
1747 : char ** gen;
1748 0 : int genl = line_tok(sg, &gen, MSEP_REC);
1749 0 : free(sg);
1750 0 : sg = NULL;
1751 0 : for (int j = 0; j < genl; j++) {
1752 0 : if (strstr(pl[i], MORPH_SURF_PFX)) {
1753 0 : int r2l = strlen(result2);
1754 0 : result2[r2l] = MSEP_REC;
1755 0 : strcpy(result2 + r2l + 1, result);
1756 0 : copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX);
1757 0 : mystrcat(result2, gen[j], MAXLNLEN);
1758 : } else {
1759 0 : sprintf(result2 + strlen(result2), "%c%s%s",
1760 0 : MSEP_REC, result, gen[j]);
1761 : }
1762 : }
1763 0 : freelist(&gen, genl);
1764 : }
1765 0 : rv = rv->next_homonym;
1766 : }
1767 : }
1768 : }
1769 0 : freelist(&pl, pln);
1770 : }
1771 :
1772 0 : if (*result2 || !strstr(pattern, MORPH_DERI_SFX)) break;
1773 0 : strcpy(newpattern, pattern);
1774 0 : pattern = newpattern;
1775 0 : char * ds = strstr(pattern, MORPH_DERI_SFX);
1776 0 : while (ds) {
1777 0 : strncpy(ds, MORPH_TERM_SFX, MORPH_TAG_LEN);
1778 0 : ds = strstr(pattern, MORPH_DERI_SFX);
1779 : }
1780 : }
1781 0 : return (*result2 ? mystrdup(result2) : NULL);
1782 : }
1783 :
1784 :
1785 : // generate an n-gram score comparing s1 and s2
1786 0 : int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt)
1787 : {
1788 0 : int nscore = 0;
1789 : int ns;
1790 : int l1;
1791 : int l2;
1792 0 : int test = 0;
1793 :
1794 0 : if (utf8) {
1795 : w_char su1[MAXSWL];
1796 : w_char su2[MAXSWL];
1797 0 : l1 = u8_u16(su1, MAXSWL, s1);
1798 0 : l2 = u8_u16(su2, MAXSWL, s2);
1799 0 : if ((l2 <= 0) || (l1 == -1)) return 0;
1800 : // lowering dictionary word
1801 0 : if (opt & NGRAM_LOWERING) mkallsmall_utf(su2, l2, langnum);
1802 0 : for (int j = 1; j <= n; j++) {
1803 0 : ns = 0;
1804 0 : for (int i = 0; i <= (l1-j); i++) {
1805 0 : int k = 0;
1806 0 : for (int l = 0; l <= (l2-j); l++) {
1807 0 : for (k = 0; k < j; k++) {
1808 0 : w_char * c1 = su1 + i + k;
1809 0 : w_char * c2 = su2 + l + k;
1810 0 : if ((c1->l != c2->l) || (c1->h != c2->h)) break;
1811 : }
1812 0 : if (k == j) {
1813 0 : ns++;
1814 0 : break;
1815 : }
1816 : }
1817 0 : if (k != j && opt & NGRAM_WEIGHTED) {
1818 0 : ns--;
1819 0 : test++;
1820 0 : if (i == 0 || i == l1-j) ns--; // side weight
1821 : }
1822 : }
1823 0 : nscore = nscore + ns;
1824 0 : if (ns < 2 && !(opt & NGRAM_WEIGHTED)) break;
1825 : }
1826 : } else {
1827 0 : l2 = strlen(s2);
1828 0 : if (l2 == 0) return 0;
1829 0 : l1 = strlen(s1);
1830 0 : char *t = mystrdup(s2);
1831 0 : if (opt & NGRAM_LOWERING) mkallsmall(t, csconv);
1832 0 : for (int j = 1; j <= n; j++) {
1833 0 : ns = 0;
1834 0 : for (int i = 0; i <= (l1-j); i++) {
1835 0 : char c = *(s1 + i + j);
1836 0 : *(s1 + i + j) = '\0';
1837 0 : if (strstr(t,(s1+i))) {
1838 0 : ns++;
1839 0 : } else if (opt & NGRAM_WEIGHTED) {
1840 0 : ns--;
1841 0 : test++;
1842 0 : if (i == 0 || i == l1-j) ns--; // side weight
1843 : }
1844 0 : *(s1 + i + j ) = c;
1845 : }
1846 0 : nscore = nscore + ns;
1847 0 : if (ns < 2 && !(opt & NGRAM_WEIGHTED)) break;
1848 : }
1849 0 : free(t);
1850 : }
1851 :
1852 0 : ns = 0;
1853 0 : if (opt & NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
1854 0 : if (opt & NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
1855 0 : ns = (nscore - ((ns > 0) ? ns : 0));
1856 0 : return ns;
1857 : }
1858 :
1859 : // length of the left common substring of s1 and (decapitalised) s2
1860 0 : int SuggestMgr::leftcommonsubstring(char * s1, const char * s2) {
1861 0 : if (utf8) {
1862 : w_char su1[MAXSWL];
1863 : w_char su2[MAXSWL];
1864 0 : su1[0].l = su2[0].l = su1[0].h = su2[0].h = 0;
1865 : // decapitalize dictionary word
1866 0 : if (complexprefixes) {
1867 0 : int l1 = u8_u16(su1, MAXSWL, s1);
1868 0 : int l2 = u8_u16(su2, MAXSWL, s2);
1869 0 : if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1;
1870 : } else {
1871 : int i;
1872 0 : u8_u16(su1, 1, s1);
1873 0 : u8_u16(su2, 1, s2);
1874 0 : unsigned short idx = (su2->h << 8) + su2->l;
1875 0 : unsigned short otheridx = (su1->h << 8) + su1->l;
1876 0 : if (otheridx != idx &&
1877 0 : (otheridx != unicodetolower(idx, langnum))) return 0;
1878 0 : int l1 = u8_u16(su1, MAXSWL, s1);
1879 0 : int l2 = u8_u16(su2, MAXSWL, s2);
1880 0 : for(i = 1; (i < l1) && (i < l2) &&
1881 : (su1[i].l == su2[i].l) && (su1[i].h == su2[i].h); i++);
1882 0 : return i;
1883 : }
1884 : } else {
1885 0 : if (complexprefixes) {
1886 0 : int l1 = strlen(s1);
1887 0 : int l2 = strlen(s2);
1888 0 : if (*(s2+l1-1) == *(s2+l2-1)) return 1;
1889 : } else {
1890 0 : char * olds = s1;
1891 : // decapitalise dictionary word
1892 0 : if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) return 0;
1893 0 : do {
1894 0 : s1++; s2++;
1895 : } while ((*s1 == *s2) && (*s1 != '\0'));
1896 0 : return (int)(s1 - olds);
1897 : }
1898 : }
1899 0 : return 0;
1900 : }
1901 :
1902 0 : int SuggestMgr::commoncharacterpositions(char * s1, const char * s2, int * is_swap) {
1903 0 : int num = 0;
1904 0 : int diff = 0;
1905 : int diffpos[2];
1906 0 : *is_swap = 0;
1907 0 : if (utf8) {
1908 : w_char su1[MAXSWL];
1909 : w_char su2[MAXSWL];
1910 0 : int l1 = u8_u16(su1, MAXSWL, s1);
1911 0 : int l2 = u8_u16(su2, MAXSWL, s2);
1912 : // decapitalize dictionary word
1913 0 : if (complexprefixes) {
1914 0 : mkallsmall_utf(su2+l2-1, 1, langnum);
1915 : } else {
1916 0 : mkallsmall_utf(su2, 1, langnum);
1917 : }
1918 0 : for (int i = 0; (i < l1) && (i < l2); i++) {
1919 0 : if (((short *) su1)[i] == ((short *) su2)[i]) {
1920 0 : num++;
1921 : } else {
1922 0 : if (diff < 2) diffpos[diff] = i;
1923 0 : diff++;
1924 : }
1925 : }
1926 0 : if ((diff == 2) && (l1 == l2) &&
1927 0 : (((short *) su1)[diffpos[0]] == ((short *) su2)[diffpos[1]]) &&
1928 0 : (((short *) su1)[diffpos[1]] == ((short *) su2)[diffpos[0]])) *is_swap = 1;
1929 : } else {
1930 : int i;
1931 : char t[MAXSWUTF8L];
1932 0 : strcpy(t, s2);
1933 : // decapitalize dictionary word
1934 0 : if (complexprefixes) {
1935 0 : int l2 = strlen(t);
1936 0 : *(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower;
1937 : } else {
1938 0 : mkallsmall(t, csconv);
1939 : }
1940 0 : for (i = 0; (*(s1+i) != 0) && (*(t+i) != 0); i++) {
1941 0 : if (*(s1+i) == *(t+i)) {
1942 0 : num++;
1943 : } else {
1944 0 : if (diff < 2) diffpos[diff] = i;
1945 0 : diff++;
1946 : }
1947 : }
1948 0 : if ((diff == 2) && (*(s1+i) == 0) && (*(t+i) == 0) &&
1949 0 : (*(s1+diffpos[0]) == *(t+diffpos[1])) &&
1950 0 : (*(s1+diffpos[1]) == *(t+diffpos[0]))) *is_swap = 1;
1951 : }
1952 0 : return num;
1953 : }
1954 :
1955 0 : int SuggestMgr::mystrlen(const char * word) {
1956 0 : if (utf8) {
1957 : w_char w[MAXSWL];
1958 0 : return u8_u16(w, MAXSWL, word);
1959 0 : } else return strlen(word);
1960 : }
1961 :
1962 : // sort in decreasing order of score
1963 0 : void SuggestMgr::bubblesort(char** rword, char** rword2, int* rsc, int n )
1964 : {
1965 0 : int m = 1;
1966 0 : while (m < n) {
1967 0 : int j = m;
1968 0 : while (j > 0) {
1969 0 : if (rsc[j-1] < rsc[j]) {
1970 0 : int sctmp = rsc[j-1];
1971 0 : char * wdtmp = rword[j-1];
1972 0 : rsc[j-1] = rsc[j];
1973 0 : rword[j-1] = rword[j];
1974 0 : rsc[j] = sctmp;
1975 0 : rword[j] = wdtmp;
1976 0 : if (rword2) {
1977 0 : wdtmp = rword2[j-1];
1978 0 : rword2[j-1] = rword2[j];
1979 0 : rword2[j] = wdtmp;
1980 : }
1981 0 : j--;
1982 0 : } else break;
1983 : }
1984 0 : m++;
1985 : }
1986 : return;
1987 : }
1988 :
1989 : // longest common subsequence
1990 0 : void SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char ** result) {
1991 : int n, m;
1992 : w_char su[MAXSWL];
1993 : w_char su2[MAXSWL];
1994 : char * b;
1995 : char * c;
1996 : int i;
1997 : int j;
1998 0 : if (utf8) {
1999 0 : m = u8_u16(su, MAXSWL, s);
2000 0 : n = u8_u16(su2, MAXSWL, s2);
2001 : } else {
2002 0 : m = strlen(s);
2003 0 : n = strlen(s2);
2004 : }
2005 0 : c = (char *) malloc((m + 1) * (n + 1));
2006 0 : b = (char *) malloc((m + 1) * (n + 1));
2007 0 : if (!c || !b) {
2008 0 : if (c) free(c);
2009 0 : if (b) free(b);
2010 0 : *result = NULL;
2011 0 : return;
2012 : }
2013 0 : for (i = 1; i <= m; i++) c[i*(n+1)] = 0;
2014 0 : for (j = 0; j <= n; j++) c[j] = 0;
2015 0 : for (i = 1; i <= m; i++) {
2016 0 : for (j = 1; j <= n; j++) {
2017 0 : if ( ((utf8) && (*((short *) su+i-1) == *((short *)su2+j-1)))
2018 0 : || ((!utf8) && ((*(s+i-1)) == (*(s2+j-1))))) {
2019 0 : c[i*(n+1) + j] = c[(i-1)*(n+1) + j-1]+1;
2020 0 : b[i*(n+1) + j] = LCS_UPLEFT;
2021 0 : } else if (c[(i-1)*(n+1) + j] >= c[i*(n+1) + j-1]) {
2022 0 : c[i*(n+1) + j] = c[(i-1)*(n+1) + j];
2023 0 : b[i*(n+1) + j] = LCS_UP;
2024 : } else {
2025 0 : c[i*(n+1) + j] = c[i*(n+1) + j-1];
2026 0 : b[i*(n+1) + j] = LCS_LEFT;
2027 : }
2028 : }
2029 : }
2030 0 : *result = b;
2031 0 : free(c);
2032 0 : *l1 = m;
2033 0 : *l2 = n;
2034 : }
2035 :
2036 0 : int SuggestMgr::lcslen(const char * s, const char* s2) {
2037 : int m;
2038 : int n;
2039 : int i;
2040 : int j;
2041 : char * result;
2042 0 : int len = 0;
2043 0 : lcs(s, s2, &m, &n, &result);
2044 0 : if (!result) return 0;
2045 0 : i = m;
2046 0 : j = n;
2047 0 : while ((i != 0) && (j != 0)) {
2048 0 : if (result[i*(n+1) + j] == LCS_UPLEFT) {
2049 0 : len++;
2050 0 : i--;
2051 0 : j--;
2052 0 : } else if (result[i*(n+1) + j] == LCS_UP) {
2053 0 : i--;
2054 0 : } else j--;
2055 : }
2056 0 : free(result);
2057 0 : return len;
2058 : }
|