LCOV - code coverage report
Current view: directory - extensions/spellcheck/hunspell/src - affentry.cpp (source / functions) Found Hit Coverage
Test: app.info Lines: 402 249 61.9 %
Date: 2012-06-02 Functions: 18 12 66.7 %

       1                 : /******* BEGIN LICENSE BLOCK *******
       2                 :  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
       3                 :  * 
       4                 :  * The contents of this file are subject to the Mozilla Public License Version
       5                 :  * 1.1 (the "License"); you may not use this file except in compliance with
       6                 :  * the License. You may obtain a copy of the License at
       7                 :  * http://www.mozilla.org/MPL/
       8                 :  * 
       9                 :  * Software distributed under the License is distributed on an "AS IS" basis,
      10                 :  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
      11                 :  * for the specific language governing rights and limitations under the
      12                 :  * License.
      13                 :  * 
      14                 :  * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
      15                 :  * and László Németh (Hunspell). Portions created by the Initial Developers
      16                 :  * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
      17                 :  * 
      18                 :  * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
      19                 :  *                 David Einstein (deinst@world.std.com)
      20                 :  *                 László Németh (nemethl@gyorsposta.hu)
      21                 :  *                 Caolan McNamara (caolanm@redhat.com)
      22                 :  *                 Davide Prina
      23                 :  *                 Giuseppe Modugno
      24                 :  *                 Gianluca Turconi
      25                 :  *                 Simon Brouwer
      26                 :  *                 Noll Janos
      27                 :  *                 Biro Arpad
      28                 :  *                 Goldman Eleonora
      29                 :  *                 Sarlos Tamas
      30                 :  *                 Bencsath Boldizsar
      31                 :  *                 Halacsy Peter
      32                 :  *                 Dvornik Laszlo
      33                 :  *                 Gefferth Andras
      34                 :  *                 Nagy Viktor
      35                 :  *                 Varga Daniel
      36                 :  *                 Chris Halls
      37                 :  *                 Rene Engelhard
      38                 :  *                 Bram Moolenaar
      39                 :  *                 Dafydd Jones
      40                 :  *                 Harri Pitkanen
      41                 :  *                 Andras Timar
      42                 :  *                 Tor Lillqvist
      43                 :  * 
      44                 :  * Alternatively, the contents of this file may be used under the terms of
      45                 :  * either the GNU General Public License Version 2 or later (the "GPL"), or
      46                 :  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
      47                 :  * in which case the provisions of the GPL or the LGPL are applicable instead
      48                 :  * of those above. If you wish to allow use of your version of this file only
      49                 :  * under the terms of either the GPL or the LGPL, and not to allow others to
      50                 :  * use your version of this file under the terms of the MPL, indicate your
      51                 :  * decision by deleting the provisions above and replace them with the notice
      52                 :  * and other provisions required by the GPL or the LGPL. If you do not delete
      53                 :  * the provisions above, a recipient may use your version of this file under
      54                 :  * the terms of any one of the MPL, the GPL or the LGPL.
      55                 :  *
      56                 :  ******* END LICENSE BLOCK *******/
      57                 : 
      58                 : #include <stdlib.h>
      59                 : #include <string.h>
      60                 : #include <stdio.h>
      61                 : #include <ctype.h>
      62                 : 
      63                 : #include "affentry.hxx"
      64                 : #include "csutil.hxx"
      65                 : 
      66             144 : PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
      67                 : {
      68                 :   // register affix manager
      69             144 :   pmyMgr = pmgr;
      70                 : 
      71                 :   // set up its initial values
      72                 : 
      73             144 :   aflag = dp->aflag;         // flag
      74             144 :   strip = dp->strip;         // string to strip
      75             144 :   appnd = dp->appnd;         // string to append
      76             144 :   stripl = dp->stripl;       // length of strip string
      77             144 :   appndl = dp->appndl;       // length of append string
      78             144 :   numconds = dp->numconds;   // length of the condition
      79             144 :   opts = dp->opts;           // cross product flag
      80                 :   // then copy over all of the conditions
      81             144 :   if (opts & aeLONGCOND) {
      82               6 :     memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
      83               6 :     c.l.conds2 = dp->c.l.conds2;
      84             138 :   } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
      85             144 :   next = NULL;
      86             144 :   nextne = NULL;
      87             144 :   nexteq = NULL;
      88             144 :   morphcode = dp->morphcode;
      89             144 :   contclass = dp->contclass;
      90             144 :   contclasslen = dp->contclasslen;
      91             144 : }
      92                 : 
      93                 : 
      94             144 : PfxEntry::~PfxEntry()
      95                 : {
      96             144 :     aflag = 0;
      97             144 :     if (appnd) free(appnd);
      98             144 :     if (strip) free(strip);
      99             144 :     pmyMgr = NULL;
     100             144 :     appnd = NULL;
     101             144 :     strip = NULL;
     102             144 :     if (opts & aeLONGCOND) free(c.l.conds2);
     103             144 :     if (morphcode && !(opts & aeALIASM)) free(morphcode);
     104             144 :     if (contclass && !(opts & aeALIASF)) free(contclass);
     105             144 : }
     106                 : 
     107                 : // add prefix to this word assuming conditions hold
     108               0 : char * PfxEntry::add(const char * word, int len)
     109                 : {
     110                 :     char tword[MAXWORDUTF8LEN + 4];
     111                 : 
     112               0 :     if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) && 
     113               0 :        (len >= numconds) && test_condition(word) &&
     114               0 :        (!stripl || (strncmp(word, strip, stripl) == 0)) &&
     115                 :        ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
     116                 :     /* we have a match so add prefix */
     117               0 :               char * pp = tword;
     118               0 :               if (appndl) {
     119               0 :                   strcpy(tword,appnd);
     120               0 :                   pp += appndl;
     121                 :                }
     122               0 :                strcpy(pp, (word + stripl));
     123               0 :                return mystrdup(tword);
     124                 :      }
     125               0 :      return NULL;
     126                 : }
     127                 : 
     128             341 : inline char * PfxEntry::nextchar(char * p) {
     129             341 :     if (p) {
     130             341 :         p++;
     131             341 :         if (opts & aeLONGCOND) {
     132                 :             // jump to the 2nd part of the condition
     133             119 :             if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
     134                 :         // end of the MAXCONDLEN length condition
     135             222 :         } else if (p == c.conds + MAXCONDLEN) return NULL;
     136             335 :         return *p ? p : NULL;
     137                 :     }
     138               0 :     return NULL;
     139                 : }
     140                 : 
     141            1823 : inline int PfxEntry::test_condition(const char * st)
     142                 : {
     143            1823 :     const char * pos = NULL; // group with pos input position
     144            1823 :     bool neg = false;        // complementer
     145            1823 :     bool ingroup = false;    // character in the group
     146            1823 :     if (numconds == 0) return 1;
     147              35 :     char * p = c.conds;
     148             244 :     while (1) {
     149             279 :       switch (*p) {
     150               0 :         case '\0': return 1;
     151                 :         case '[': { 
     152              40 :                 neg = false;
     153              40 :                 ingroup = false;
     154              40 :                 p = nextchar(p);
     155              40 :                 pos = st; break;
     156                 :             }
     157              18 :         case '^': { p = nextchar(p); neg = true; break; }
     158                 :         case ']': { 
     159              40 :                 if ((neg && ingroup) || (!neg && !ingroup)) return 0;
     160              23 :                 pos = NULL;
     161              23 :                 p = nextchar(p);
     162                 :                 // skip the next character
     163              23 :                 if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
     164              23 :                 if (*st == '\0' && p) return 0; // word <= condition
     165              23 :                 break;
     166                 :             }
     167               0 :          case '.': if (!pos) { // dots are not metacharacters in groups: [.]
     168               0 :                 p = nextchar(p);
     169                 :                 // skip the next character
     170               0 :                 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
     171               0 :                 if (*st == '\0' && p) return 0; // word <= condition
     172               0 :                 break;
     173                 :             }
     174                 :     default: {
     175             181 :                 if (*st == *p) {
     176              59 :                     st++;
     177              59 :                     p = nextchar(p);
     178              59 :                     if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
     179              94 :                         while (p && (*p & 0xc0) == 0x80) {       // character
     180              36 :                             if (*p != *st) {
     181              14 :                                 if (!pos) return 0;
     182              14 :                                 st = pos;
     183              14 :                                 break;
     184                 :                             }
     185              22 :                             p = nextchar(p);
     186              22 :                             st++;
     187                 :                         }
     188              72 :                         if (pos && st != pos) {
     189              15 :                             ingroup = true;
     190              15 :                             while (p && *p != ']' && (p = nextchar(p)));
     191                 :                         }
     192              23 :                     } else if (pos) {
     193              16 :                         ingroup = true;
     194              16 :                         while (p && *p != ']' && (p = nextchar(p)));
     195                 :                     }
     196             122 :                 } else if (pos) { // group
     197             119 :                     p = nextchar(p);
     198               3 :                 } else return 0;
     199                 :             }
     200                 :       }
     201             259 :       if (!p) return 1;
     202                 :     }
     203                 : }
     204                 : 
     205                 : // check if this prefix entry matches
     206            1813 : struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
     207                 : {
     208                 :     int                 tmpl;   // length of tmpword
     209                 :     struct hentry *     he;     // hash entry of root word or NULL
     210                 :     char                tmpword[MAXWORDUTF8LEN + 4];
     211                 : 
     212                 :     // on entry prefix is 0 length or already matches the beginning of the word.
     213                 :     // So if the remaining root word has positive length
     214                 :     // and if there are enough chars in root word and added back strip chars
     215                 :     // to meet the number of characters conditions, then test it
     216                 : 
     217            1813 :      tmpl = len - appndl;
     218                 : 
     219            1813 :      if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
     220                 : 
     221                 :             // generate new root word by removing prefix and adding
     222                 :             // back any characters that would have been stripped
     223                 : 
     224            1682 :             if (stripl) strcpy (tmpword, strip);
     225            1682 :             strcpy ((tmpword + stripl), (word + appndl));
     226                 : 
     227                 :             // now make sure all of the conditions on characters
     228                 :             // are met.  Please see the appendix at the end of
     229                 :             // this file for more info on exactly what is being
     230                 :             // tested
     231                 : 
     232                 :             // if all conditions are met then check if resulting
     233                 :             // root word in the dictionary
     234                 : 
     235            1682 :             if (test_condition(tmpword)) {
     236            1662 :                 tmpl += stripl;
     237            1662 :                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
     238             127 :                    do {
     239             208 :                       if (TESTAFF(he->astr, aflag, he->alen) &&
     240                 :                         // forbid single prefixes with needaffix flag
     241              38 :                         ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
     242                 :                         // needflag
     243               6 :                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
     244               2 :                          (contclass && TESTAFF(contclass, needflag, contclasslen))))
     245              35 :                             return he;
     246             127 :                       he = he->next_homonym; // check homonyms
     247                 :                    } while (he);
     248                 :                 }
     249                 : 
     250                 :                 // prefix matched but no root word was found
     251                 :                 // if aeXPRODUCT is allowed, try again but now
     252                 :                 // cross checked combined with a suffix
     253                 : 
     254                 :                 //if ((opts & aeXPRODUCT) && in_compound) {
     255            1627 :                 if ((opts & aeXPRODUCT)) {
     256                 :                    he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL,
     257            1611 :                         0, NULL, FLAG_NULL, needflag, in_compound);
     258            1611 :                    if (he) return he;
     259                 :                 }
     260                 :             }
     261                 :      }
     262            1721 :     return NULL;
     263                 : }
     264                 : 
     265                 : // check if this prefix entry matches
     266             141 : struct hentry * PfxEntry::check_twosfx(const char * word, int len,
     267                 :     char in_compound, const FLAG needflag)
     268                 : {
     269                 :     int                 tmpl;   // length of tmpword
     270                 :     struct hentry *     he;     // hash entry of root word or NULL
     271                 :     char                tmpword[MAXWORDUTF8LEN + 4];
     272                 : 
     273                 :     // on entry prefix is 0 length or already matches the beginning of the word.
     274                 :     // So if the remaining root word has positive length
     275                 :     // and if there are enough chars in root word and added back strip chars
     276                 :     // to meet the number of characters conditions, then test it
     277                 : 
     278             141 :      tmpl = len - appndl;
     279                 : 
     280             141 :      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
     281                 :         (tmpl + stripl >= numconds)) {
     282                 : 
     283                 :             // generate new root word by removing prefix and adding
     284                 :             // back any characters that would have been stripped
     285                 : 
     286             141 :             if (stripl) strcpy (tmpword, strip);
     287             141 :             strcpy ((tmpword + stripl), (word + appndl));
     288                 : 
     289                 :             // now make sure all of the conditions on characters
     290                 :             // are met.  Please see the appendix at the end of
     291                 :             // this file for more info on exactly what is being
     292                 :             // tested
     293                 : 
     294                 :             // if all conditions are met then check if resulting
     295                 :             // root word in the dictionary
     296                 : 
     297             141 :             if (test_condition(tmpword)) {
     298             141 :                 tmpl += stripl;
     299                 : 
     300                 :                 // prefix matched but no root word was found
     301                 :                 // if aeXPRODUCT is allowed, try again but now
     302                 :                 // cross checked combined with a suffix
     303                 : 
     304             141 :                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
     305             141 :                    he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag);
     306             141 :                    if (he) return he;
     307                 :                 }
     308                 :             }
     309                 :      }
     310             133 :     return NULL;
     311                 : }
     312                 : 
     313                 : // check if this prefix entry matches
     314               0 : char * PfxEntry::check_twosfx_morph(const char * word, int len,
     315                 :          char in_compound, const FLAG needflag)
     316                 : {
     317                 :     int                 tmpl;   // length of tmpword
     318                 :     char                tmpword[MAXWORDUTF8LEN + 4];
     319                 : 
     320                 :     // on entry prefix is 0 length or already matches the beginning of the word.
     321                 :     // So if the remaining root word has positive length
     322                 :     // and if there are enough chars in root word and added back strip chars
     323                 :     // to meet the number of characters conditions, then test it
     324                 : 
     325               0 :      tmpl = len - appndl;
     326                 : 
     327               0 :      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
     328                 :         (tmpl + stripl >= numconds)) {
     329                 : 
     330                 :             // generate new root word by removing prefix and adding
     331                 :             // back any characters that would have been stripped
     332                 : 
     333               0 :             if (stripl) strcpy (tmpword, strip);
     334               0 :             strcpy ((tmpword + stripl), (word + appndl));
     335                 : 
     336                 :             // now make sure all of the conditions on characters
     337                 :             // are met.  Please see the appendix at the end of
     338                 :             // this file for more info on exactly what is being
     339                 :             // tested
     340                 : 
     341                 :             // if all conditions are met then check if resulting
     342                 :             // root word in the dictionary
     343                 : 
     344               0 :             if (test_condition(tmpword)) {
     345               0 :                 tmpl += stripl;
     346                 : 
     347                 :                 // prefix matched but no root word was found
     348                 :                 // if aeXPRODUCT is allowed, try again but now
     349                 :                 // cross checked combined with a suffix
     350                 : 
     351               0 :                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
     352                 :                     return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
     353               0 :                              aeXPRODUCT, this, needflag);
     354                 :                 }
     355                 :             }
     356                 :      }
     357               0 :     return NULL;
     358                 : }
     359                 : 
     360                 : // check if this prefix entry matches
     361               0 : char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
     362                 : {
     363                 :     int                 tmpl;   // length of tmpword
     364                 :     struct hentry *     he;     // hash entry of root word or NULL
     365                 :     char                tmpword[MAXWORDUTF8LEN + 4];
     366                 :     char                result[MAXLNLEN];
     367                 :     char * st;
     368                 : 
     369               0 :     *result = '\0';
     370                 : 
     371                 :     // on entry prefix is 0 length or already matches the beginning of the word.
     372                 :     // So if the remaining root word has positive length
     373                 :     // and if there are enough chars in root word and added back strip chars
     374                 :     // to meet the number of characters conditions, then test it
     375                 : 
     376               0 :      tmpl = len - appndl;
     377                 : 
     378               0 :      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
     379                 :         (tmpl + stripl >= numconds)) {
     380                 : 
     381                 :             // generate new root word by removing prefix and adding
     382                 :             // back any characters that would have been stripped
     383                 : 
     384               0 :             if (stripl) strcpy (tmpword, strip);
     385               0 :             strcpy ((tmpword + stripl), (word + appndl));
     386                 : 
     387                 :             // now make sure all of the conditions on characters
     388                 :             // are met.  Please see the appendix at the end of
     389                 :             // this file for more info on exactly what is being
     390                 :             // tested
     391                 : 
     392                 :             // if all conditions are met then check if resulting
     393                 :             // root word in the dictionary
     394                 : 
     395               0 :             if (test_condition(tmpword)) {
     396               0 :                 tmpl += stripl;
     397               0 :                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
     398               0 :                     do {
     399               0 :                       if (TESTAFF(he->astr, aflag, he->alen) &&
     400                 :                         // forbid single prefixes with needaffix flag
     401               0 :                         ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
     402                 :                         // needflag
     403               0 :                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
     404               0 :                          (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
     405               0 :                             if (morphcode) {
     406               0 :                                 mystrcat(result, " ", MAXLNLEN);
     407               0 :                                 mystrcat(result, morphcode, MAXLNLEN);
     408               0 :                             } else mystrcat(result,getKey(), MAXLNLEN);
     409               0 :                             if (!HENTRY_FIND(he, MORPH_STEM)) {
     410               0 :                                 mystrcat(result, " ", MAXLNLEN);
     411               0 :                                 mystrcat(result, MORPH_STEM, MAXLNLEN);
     412               0 :                                 mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
     413                 :                             }
     414                 :                             // store the pointer of the hash entry
     415               0 :                             if (HENTRY_DATA(he)) {
     416               0 :                                 mystrcat(result, " ", MAXLNLEN);
     417               0 :                                 mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
     418                 :                             } else {
     419                 :                                 // return with debug information
     420               0 :                                 char * flag = pmyMgr->encode_flag(getFlag());
     421               0 :                                 mystrcat(result, " ", MAXLNLEN);
     422               0 :                                 mystrcat(result, MORPH_FLAG, MAXLNLEN);
     423               0 :                                 mystrcat(result, flag, MAXLNLEN);
     424               0 :                                 free(flag);
     425                 :                             }
     426               0 :                             mystrcat(result, "\n", MAXLNLEN);
     427                 :                       }
     428               0 :                       he = he->next_homonym;
     429                 :                     } while (he);
     430                 :                 }
     431                 : 
     432                 :                 // prefix matched but no root word was found
     433                 :                 // if aeXPRODUCT is allowed, try again but now
     434                 :                 // cross checked combined with a suffix
     435                 : 
     436               0 :                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
     437                 :                    st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,
     438               0 :                      FLAG_NULL, needflag);
     439               0 :                    if (st) {
     440               0 :                         mystrcat(result, st, MAXLNLEN);
     441               0 :                         free(st);
     442                 :                    }
     443                 :                 }
     444                 :             }
     445                 :      }
     446                 :     
     447               0 :     if (*result) return mystrdup(result);
     448               0 :     return NULL;
     449                 : }
     450                 : 
     451             283 : SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
     452                 : {
     453                 :   // register affix manager
     454             283 :   pmyMgr = pmgr;
     455                 : 
     456                 :   // set up its initial values
     457             283 :   aflag = dp->aflag;         // char flag
     458             283 :   strip = dp->strip;         // string to strip
     459             283 :   appnd = dp->appnd;         // string to append
     460             283 :   stripl = dp->stripl;       // length of strip string
     461             283 :   appndl = dp->appndl;       // length of append string
     462             283 :   numconds = dp->numconds;   // length of the condition
     463             283 :   opts = dp->opts;           // cross product flag
     464                 : 
     465                 :   // then copy over all of the conditions
     466             283 :   if (opts & aeLONGCOND) {
     467               5 :     memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
     468               5 :     c.l.conds2 = dp->c.l.conds2;
     469             278 :   } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
     470             283 :   next = NULL;
     471             283 :   nextne = NULL;
     472             283 :   nexteq = NULL;
     473             283 :   rappnd = myrevstrdup(appnd);
     474             283 :   morphcode = dp->morphcode;
     475             283 :   contclass = dp->contclass;
     476             283 :   contclasslen = dp->contclasslen;
     477             283 : }
     478                 : 
     479                 : // Destructor: releases the strings held by this suffix entry.
                           // appnd, rappnd and strip are passed to free() when non-NULL.  The second part of a
                           // long condition (c.l.conds2) is freed only when the aeLONGCOND option bit is set.
                           // morphcode and contclass are freed unless the aeALIASM / aeALIASF option bits are
                           // set (presumably aliased data is owned by a shared alias table -- confirm).
     480             283 : SfxEntry::~SfxEntry()
     481                 : {
     482             283 :     aflag = 0;
     483             283 :     if (appnd) free(appnd);
     484             283 :     if (rappnd) free(rappnd);  // copy produced by myrevstrdup(appnd) in the constructor
     485             283 :     if (strip) free(strip);
     486             283 :     pmyMgr = NULL;
     487             283 :     appnd = NULL;
     488             283 :     strip = NULL;
     489             283 :     if (opts & aeLONGCOND) free(c.l.conds2);  // c.l.conds2 is only meaningful for long conditions
     490             283 :     if (morphcode && !(opts & aeALIASM)) free(morphcode);
     491             283 :     if (contclass && !(opts & aeALIASF)) free(contclass);
     492             283 : }
     493                 : 
     494                 : // add suffix to this word assuming conditions hold
                           //   word - the base word; len - its length in bytes
                           // Returns a mystrdup() copy of the word with the last stripl bytes replaced by appnd,
                           // or NULL when a precondition fails.  The preconditions checked below are:
                           //   * len > stripl, or len == 0 while the affix manager reports get_fullstrip();
                           //   * len >= numconds and test_condition() accepts the word ending;
                           //   * when stripl != 0, the word ends with the strip string;
                           //   * the resulting length (len + appndl - stripl) fits in the local buffer.
                           // NOTE(review): strcpy() below copies the whole of `word`, but the size guard only
                           // bounds the final length; when stripl > appndl a word longer than the buffer could
                           // pass the guard -- confirm that callers bound len.
     495               0 : char * SfxEntry::add(const char * word, int len)
     496                 : {
     497                 :     char                tword[MAXWORDUTF8LEN + 4];
     498                 : 
     499                 :      /* make sure all conditions match */
     500               0 :      if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
     501               0 :         (len >= numconds) && test_condition(word + len, word) &&
     502               0 :         (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
     503                 :         ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
     504                 :               /* we have a match so add suffix */
     505               0 :               strcpy(tword,word);
     506               0 :               if (appndl) {
     507               0 :                   strcpy(tword + len - stripl, appnd);  // overwrite the stripped tail with the suffix
     508                 :               } else {
     509               0 :                   *(tword + len - stripl) = '\0';  // empty suffix: just cut off the stripped tail
     510                 :               }
     511               0 :               return mystrdup(tword);
     512                 :      }
     513               0 :      return NULL;
     514                 : }
     515                 : // Advance a pointer into this entry's condition pattern by one byte.
                           // Returns the advanced pointer, or NULL when p is NULL, when the end of the pattern
                           // storage is reached, or when the next byte is the terminating '\0'.
                           // For long conditions (aeLONGCOND) stepping off the end of c.l.conds1 continues at
                           // c.l.conds2; for short conditions stepping off the end of c.conds ends the pattern.
     516             911 : inline char * SfxEntry::nextchar(char * p) {
     517             911 :     if (p) {
     518             911 :         p++;
     519             911 :         if (opts & aeLONGCOND) {
     520                 :             // jump to the 2nd part of the condition
     521             141 :             if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
     522                 :         // end of the MAXCONDLEN length condition
     523             770 :         } else if (p == c.conds + MAXCONDLEN) return NULL;
     524             905 :         return *p ? p : NULL;
     525                 :     }
     526               0 :     return NULL;
     527                 : }
     528                 : // Test the end of a word against this entry's condition pattern.
                           //   st  - points just past the last byte to test (it is decremented before first use)
                           //   beg - start of the word; running past it while pattern remains is a mismatch
                           // Returns 1 when the conditions hold (always when numconds == 0) and 0 otherwise.
                           // The pattern is read forwards through nextchar() while the word is read backwards,
                           // so the stored condition is presumably in reversed order -- confirm against the
                           // affix-file parser.  Pattern elements handled: '[' ... ']' groups, '^' negation,
                           // '.' as a wildcard outside groups, and literal bytes.  With aeUTF8 set, bytes of
                           // the form 10xxxxxx are skipped so that one condition covers one multi-byte character.
     529            6872 : inline int SfxEntry::test_condition(const char * st, const char * beg)
     530                 : {
     531            6872 :     const char * pos = NULL;    // word position where the current [...] group began (NULL outside a group)
     532            6872 :     bool neg = false;           // true while inside a negated group
     533            6872 :     bool ingroup = false;       // true once a member of the current group has matched
     534            6872 :     if (numconds == 0) return 1;
     535             125 :     char * p = c.conds;  // NOTE(review): for long conditions this presumably aliases c.l.conds1 -- confirm
     536             125 :     st--;
     537             125 :     int i = 1;  // number of the condition currently being tested (compared against numconds)
     538             778 :     while (1) {
     539             903 :       switch (*p) {
     540               0 :         case '\0': return 1;
     541             108 :         case '[': { p = nextchar(p); pos = st; break; }
     542              44 :         case '^': { p = nextchar(p); neg = true; break; }
     543              82 :         case ']': { if (!neg && !ingroup) return 0;  // positive group closed without any member matching
     544              42 :                 i++;
     545                 :                 // skip the next character
     546              42 :                 if (!ingroup) {
     547              29 :                     for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
     548              29 :                     st--;
     549                 :                 }                    
     550              42 :                 pos = NULL;
     551              42 :                 neg = false;
     552              42 :                 ingroup = false;
     553              42 :                 p = nextchar(p);
     554              42 :                 if (st < beg && p) return 0; // word <= condition
     555              42 :                 break;
     556                 :             }
     557              37 :         case '.': if (!pos) { // dots are not metacharacters in groups: [.]
     558              37 :                 p = nextchar(p);
     559                 :                 // skip the next character
     560              37 :                 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
     561              37 :                 if (st < beg) { // word <= condition
     562               1 :                     if (p) return 0; else return 1;
     563                 :                 }
     564              36 :                 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
     565               0 :                     st--;
     566               0 :                     if (st < beg) { // word <= condition
     567               0 :                         if (p) return 0; else return 1;
     568                 :                     }
     569                 :                 }
     570              36 :                 break;
     571                 :             }
                           // a '.' inside a group falls through and is compared as a literal byte
     572                 :     default: {
     573             632 :                 if (*st == *p) {
     574             123 :                     p = nextchar(p);
     575             123 :                     if ((opts & aeUTF8) && (*st & 0x80)) {
     576              21 :                         st--;
     577              42 :                         while (p && (st >= beg)) {
     578              21 :                             if (*p != *st) {
     579               0 :                                 if (!pos) return 0;
     580               0 :                                 st = pos;  // inside a group: rewind the word to the group start and try the next member
     581               0 :                                 break;
     582                 :                             }
     583                 :                             // first byte of the UTF-8 multibyte character
     584              21 :                             if ((*p & 0xc0) != 0x80) break;
     585               0 :                             p = nextchar(p);
     586               0 :                             st--;
     587                 :                         }
     588              21 :                         if (pos && st != pos) {
     589              12 :                             if (neg) return 0;  // a member of a negated group matched
     590               6 :                             else if (i == numconds) return 1;  // matched the final condition
     591               3 :                             ingroup = true;
     592               3 :                             while (p && *p != ']' && (p = nextchar(p)));  // skip the remaining group members
     593               3 :                             st--;
     594                 :                         }
     595              12 :                         if (p && *p != ']') p = nextchar(p);
     596             102 :                     } else if (pos) {
     597              27 :                         if (neg) return 0;  // a member of a negated group matched
     598              18 :                         else if (i == numconds) return 1;  // matched the final condition
     599              10 :                         ingroup = true;
     600              10 :                         while (p && *p != ']' && (p = nextchar(p)));  // skip the remaining group members
     601                 : //                      if (p && *p != ']') p = nextchar(p);
     602              10 :                         st--;
     603                 :                     }
     604              97 :                     if (!pos) {
     605              84 :                         i++;
     606              84 :                         st--;
     607                 :                     }
     608              97 :                     if (st < beg && p && *p != ']') return 0; // word <= condition
     609             509 :                 } else if (pos) { // group
     610             496 :                     p = nextchar(p);
     611              13 :                 } else return 0;
     612                 :             }
     613                 :       }
     614             823 :       if (!p) return 1;  // pattern exhausted without a mismatch
     615                 :     }
     616                 : }
     617                 : 
     618                 : // see if this suffix is present in the word
                           //   word, len        - the word to analyse and its length in bytes
                           //   optflags         - aeXPRODUCT set when the suffix is being combined with a prefix
                           //   ppfx             - the prefix entry used for the cross product, or NULL
                           //   wlst, maxSug, ns - output list, its capacity and current size, used only by the
                           //                      obsolete stemming branch; wlst may be NULL
                           //   cclass           - when non-zero, must be listed in this suffix's contclass
                           //   needflag         - when non-zero, must be on the entry or in this suffix's contclass
                           //   badflag          - when non-zero, entries carrying it are rejected
                           // Returns the first dictionary entry (walking the homonym chain) whose flags permit
                           // this suffix, or NULL.  If mystrdup() fails in the stemming branch, the strings
                           // already in wlst are freed and *ns is set to -1.
                           // NOTE(review): the strcpy() calls into tmpword are unbounded; this assumes callers
                           // never pass a word longer than the buffer -- confirm.
     619            6839 : struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
     620                 :     PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
     621                 :     const FLAG badflag)
     622                 : {
     623                 :     int                 tmpl;            // length of tmpword
     624                 :     struct hentry *     he;              // hash entry pointer
     625                 :     unsigned char *     cp;
     626                 :     char                tmpword[MAXWORDUTF8LEN + 4];
     627            6839 :     PfxEntry* ep = ppfx;
     628                 : 
     629                 :     // if this suffix is being cross checked with a prefix
     630                 :     // but it does not support cross products skip it
     631                 : 
     632            6839 :     if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
     633               0 :         return NULL;
     634                 : 
     635                 :     // upon entry suffix is 0 length or already matches the end of the word.
     636                 :     // So if the remaining root word has positive length
     637                 :     // and if there are enough chars in root word and added back strip chars
     638                 :     // to meet the number of characters conditions, then test it
     639                 : 
     640            6839 :     tmpl = len - appndl;
     641                 :     // the second condition is not enough for UTF-8 strings;
     642                 :     // it is checked in test_condition()
     643                 : 
     644            6839 :     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
     645                 :         (tmpl + stripl >= numconds)) {
     646                 : 
     647                 :             // generate new root word by removing suffix and adding
     648                 :             // back any characters that would have been stripped,
     649                 :             // or null terminating the shorter string
     650                 : 
     651            6837 :             strcpy (tmpword, word);
     652            6837 :             cp = (unsigned char *)(tmpword + tmpl);
     653            6837 :             if (stripl) {
     654              52 :                 strcpy ((char *)cp, strip);
     655              52 :                 tmpl += stripl;
     656              52 :                 cp = (unsigned char *)(tmpword + tmpl);
     657            6785 :             } else *cp = '\0';
     658                 : 
     659                 :             // now make sure all of the conditions on characters
     660                 :             // are met.  Please see the appendix at the end of
     661                 :             // this file for more info on exactly what is being
     662                 :             // tested
     663                 : 
     664                 :             // if all conditions are met then check if resulting
     665                 :             // root word in the dictionary
     666                 : 
     667            6837 :             if (test_condition((char *) cp, (char *) tmpword)) {
     668                 : 
     669                 : #ifdef SZOSZABLYA_POSSIBLE_ROOTS
     670                 :                 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
     671                 : #endif
     672            6769 :                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
     673             284 :                     do {
     674                 :                         // check conditional suffix (enabled by prefix)
     675            1347 :                         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
     676              88 :                                     TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
     677                 :                             (((optflags & aeXPRODUCT) == 0) ||
     678             171 :                             (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
     679                 :                              // enabled by prefix
     680             150 :                             ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))
     681                 :                             ) &&
     682                 :                             // handle cont. class
     683                 :                             ((!cclass) ||
     684              21 :                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
     685                 :                             ) &&
     686                 :                             // check only in compound homonyms (bad flags)
     687               2 :                             (!badflag || !TESTAFF(he->astr, badflag, he->alen)
     688                 :                             ) &&
     689                 :                             // handle required flag
     690                 :                             ((!needflag) ||
     691             209 :                               (TESTAFF(he->astr, needflag, he->alen) ||
     692             205 :                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
     693                 :                             )
     694             217 :                         ) return he;
     695             284 :                         he = he->next_homonym; // check homonyms
     696                 :                     } while (he);
     697                 : 
     698                 :                 // obsolete stemming code (used only by the
     699                 :                 // experimental SuffixMgr:suggest_pos_stems)
     700                 :                 // store resulting root in wlst
     701            6273 :                 } else if (wlst && (*ns < maxSug)) {
     702               0 :                     int cwrd = 1;  // stays 1 unless tmpword is already present in wlst
     703               0 :                     for (int k=0; k < *ns; k++)
     704               0 :                         if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
     705               0 :                     if (cwrd) {
     706               0 :                         wlst[*ns] = mystrdup(tmpword);
     707               0 :                         if (wlst[*ns] == NULL) {
     708               0 :                             for (int j=0; j<*ns; j++) free(wlst[j]);
     709               0 :                             *ns = -1;
     710               0 :                             return NULL;
     711                 :                         }
     712               0 :                         (*ns)++;
     713                 :                     }
     714                 :                 }
     715                 :             }
     716                 :     }
     717            6622 :     return NULL;
     718                 : }
     719                 : 
     720                 : // see if two-level suffix is present in the word
                           //   word, len - the word to analyse and its length in bytes
                           //   optflags  - aeXPRODUCT set when the suffix is being combined with a prefix
                           //   ppfx      - the prefix entry used for the cross product, or NULL
                           //   needflag  - forwarded unchanged to the inner suffix check
                           // Removes this suffix, restores the strip string and, if the conditions hold, asks
                           // the affix manager's suffix_check() to analyse the remaining stem.  This entry's
                           // aflag is passed in the argument slot that matches checkword()'s cclass parameter,
                           // presumably so the inner suffix must list this suffix in its continuation class --
                           // confirm against AffixMgr::suffix_check.  Returns the entry found, or NULL.
                           // NOTE(review): the strcpy() calls into tmpword are unbounded; this assumes callers
                           // never pass a word longer than the buffer -- confirm.
     721              35 : struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
     722                 :     PfxEntry* ppfx, const FLAG needflag)
     723                 : {
     724                 :     int                 tmpl;            // length of tmpword
     725                 :     struct hentry *     he;              // hash entry pointer
     726                 :     unsigned char *     cp;
     727                 :     char                tmpword[MAXWORDUTF8LEN + 4];
     728              35 :     PfxEntry* ep = ppfx;
     729                 : 
     730                 : 
     731                 :     // if this suffix is being cross checked with a prefix
     732                 :     // but it does not support cross products skip it
     733                 : 
     734              35 :     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
     735               0 :         return NULL;
     736                 : 
     737                 :     // upon entry suffix is 0 length or already matches the end of the word.
     738                 :     // So if the remaining root word has positive length
     739                 :     // and if there are enough chars in root word and added back strip chars
     740                 :     // to meet the number of characters conditions, then test it
     741                 : 
     742              35 :     tmpl = len - appndl;
     743                 : 
     744              35 :     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
     745                 :        (tmpl + stripl >= numconds)) {
     746                 : 
     747                 :             // generate new root word by removing suffix and adding
     748                 :             // back any characters that would have been stripped,
     749                 :             // or null terminating the shorter string
     750                 : 
     751              35 :             strcpy (tmpword, word);
     752              35 :             cp = (unsigned char *)(tmpword + tmpl);
     753              35 :             if (stripl) {
     754               0 :                 strcpy ((char *)cp, strip);
     755               0 :                 tmpl += stripl;
     756               0 :                 cp = (unsigned char *)(tmpword + tmpl);
     757              35 :             } else *cp = '\0';
     758                 : 
     759                 :             // now make sure all of the conditions on characters
     760                 :             // are met.  Please see the appendix at the end of
     761                 :             // this file for more info on exactly what is being
     762                 :             // tested
     763                 : 
     764                 :             // if all conditions are met then recall suffix_check
     765                 : 
     766              35 :             if (test_condition((char *) cp, (char *) tmpword)) {
     767              35 :                 if (ppfx) {
     768                 :                     // handle conditional suffix
     769               8 :                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
     770               0 :                         he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);  // prefix is enabled by this suffix: inner check runs without the prefix
     771                 :                     else
     772               8 :                         he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
     773                 :                 } else {
     774              27 :                     he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
     775                 :                 }
     776              35 :                 if (he) return he;
     777                 :             }
     778                 :     }
     779              14 :     return NULL;
     780                 : }
     781                 : 
     782                 : // see if two-level suffix is present in the word
                           // Morphological-analysis counterpart of check_twosfx(): the same stem is built and
                           // tested, but the stem is handed to the affix manager's suffix_check_morph(), which
                           // returns a string that this function frees after copying it.
                           //   word, len - the word to analyse and its length in bytes
                           //   optflags  - aeXPRODUCT set when the suffix is being combined with a prefix
                           //   ppfx      - the prefix entry used for the cross product, or NULL
                           //   needflag  - forwarded unchanged to the inner suffix check
                           // Returns a mystrdup() copy of the collected description, or NULL when nothing was
                           // found.  When the prefix is enabled through this suffix's contclass, the prefix's
                           // own morphological field (if any) is put in front of the result.
                           // NOTE(review): the strcpy() calls into tmpword are unbounded; this assumes callers
                           // never pass a word longer than the buffer -- confirm.
     783               0 : char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
     784                 :     PfxEntry* ppfx, const FLAG needflag)
     785                 : {
     786                 :     int                 tmpl;            // length of tmpword
     787                 :     unsigned char *     cp;
     788                 :     char                tmpword[MAXWORDUTF8LEN + 4];
     789               0 :     PfxEntry* ep = ppfx;
     790                 :     char * st;
     791                 : 
     792                 :     char result[MAXLNLEN];
     793                 : 
     794               0 :     *result = '\0';
     795                 : 
     796                 :     // if this suffix is being cross checked with a prefix
     797                 :     // but it does not support cross products skip it
     798                 : 
     799               0 :     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
     800               0 :         return NULL;
     801                 : 
     802                 :     // upon entry suffix is 0 length or already matches the end of the word.
     803                 :     // So if the remaining root word has positive length
     804                 :     // and if there are enough chars in root word and added back strip chars
     805                 :     // to meet the number of characters conditions, then test it
     806                 : 
     807               0 :     tmpl = len - appndl;
     808                 : 
     809               0 :     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
     810                 :        (tmpl + stripl >= numconds)) {
     811                 : 
     812                 :             // generate new root word by removing suffix and adding
     813                 :             // back any characters that would have been stripped,
     814                 :             // or null terminating the shorter string
     815                 : 
     816               0 :             strcpy (tmpword, word);
     817               0 :             cp = (unsigned char *)(tmpword + tmpl);
     818               0 :             if (stripl) {
     819               0 :                 strcpy ((char *)cp, strip);
     820               0 :                 tmpl += stripl;
     821               0 :                 cp = (unsigned char *)(tmpword + tmpl);
     822               0 :             } else *cp = '\0';
     823                 : 
     824                 :             // now make sure all of the conditions on characters
     825                 :             // are met.  Please see the appendix at the end of
     826                 :             // this file for more info on exactly what is being
     827                 :             // tested
     828                 : 
     829                 :             // if all conditions are met then recall suffix_check
     830                 : 
     831               0 :             if (test_condition((char *) cp, (char *) tmpword)) {
     832               0 :                 if (ppfx) {
     833                 :                     // handle conditional suffix
     834               0 :                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
     835               0 :                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
     836               0 :                         if (st) {
     837               0 :                             if (ppfx->getMorph()) {
     838               0 :                                 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
     839               0 :                                 mystrcat(result, " ", MAXLNLEN);
     840                 :                             }
     841               0 :                             mystrcat(result,st, MAXLNLEN);
     842               0 :                             free(st);
     843               0 :                             mychomp(result);  // presumably trims a trailing line break -- confirm
     844                 :                         }
     845                 :                     } else {
     846               0 :                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
     847               0 :                         if (st) {
     848               0 :                             mystrcat(result, st, MAXLNLEN);
     849               0 :                             free(st);
     850               0 :                             mychomp(result);
     851                 :                         }
     852                 :                     }
     853                 :                 } else {
     854               0 :                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
     855               0 :                         if (st) {
     856               0 :                             mystrcat(result, st, MAXLNLEN);
     857               0 :                             free(st);
     858               0 :                             mychomp(result);
     859                 :                         }
     860                 :                 }
     861               0 :                 if (*result) return mystrdup(result);
     862                 :             }
     863                 :     }
     864               0 :     return NULL;
     865                 : }
     866                 : 
     867                 : // get next homonym with same affix
                           //   he       - entry to continue from; it is dereferenced unconditionally, so it
                           //              must not be NULL
                           //   optflags - aeXPRODUCT set when the suffix is being combined with a prefix
                           //   ppfx     - the prefix entry used for the cross product, or NULL
                           //   cclass   - when non-zero, must be listed in this suffix's contclass
                           //   needflag - when non-zero, must be on the entry or in this suffix's contclass
                           // Walks the next_homonym chain after `he` and returns the first entry that passes
                           // the flag tests below, or NULL when the chain ends.  Unlike checkword() there is
                           // no badflag test here, and a missing prefix is represented by FLAG_NULL.
     868               0 : struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx,
     869                 :     const FLAG cclass, const FLAG needflag)
     870                 : {
     871               0 :     PfxEntry* ep = ppfx;
     872               0 :     FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
     873                 : 
     874               0 :     while (he->next_homonym) {
     875               0 :         he = he->next_homonym;
     876               0 :         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
     877                 :                             ((optflags & aeXPRODUCT) == 0 ||
     878               0 :                             TESTAFF(he->astr, eFlag, he->alen) ||
     879                 :                              // handle conditional suffix
     880               0 :                             ((contclass) && TESTAFF(contclass, eFlag, contclasslen))
     881                 :                             ) &&
     882                 :                             // handle cont. class
     883                 :                             ((!cclass) ||
     884               0 :                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
     885                 :                             ) &&
     886                 :                             // handle required flag
     887                 :                             ((!needflag) ||
     888               0 :                               (TESTAFF(he->astr, needflag, he->alen) ||
     889               0 :                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
     890                 :                             )
     891               0 :                         ) return he;
     892                 :     }
     893               0 :     return NULL;
     894                 : }
     895                 : 
     896                 : 
     897                 : #if 0
     898                 : 
     899                 : Appendix:  Understanding Affix Code
     900                 : 
     901                 : 
     902                 : An affix is either a  prefix or a suffix attached to root words to make 
     903                 : other words.
     904                 : 
     905                 : Basically a Prefix or a Suffix is a set of AffEntry objects
     906                 : which store information about the prefix or suffix along 
     907                 : with supporting routines to check if a word has a particular 
     908                 : prefix or suffix or a combination.
     909                 : 
     910                 : The structure affentry is defined as follows:
     911                 : 
     912                 : struct affentry
     913                 : {
     914                 :    unsigned short aflag;    // ID used to represent the affix
     915                 :    char * strip;            // string to strip before adding affix
     916                 :    char * appnd;            // the affix string to add
     917                 :    unsigned char stripl;    // length of the strip string
     918                 :    unsigned char appndl;    // length of the affix string
     919                 :    char numconds;           // the number of conditions that must be met
     920                 :    char opts;               // flag: aeXPRODUCT- combine both prefix and suffix 
     921                 :    char   conds[SETSIZE];   // array which encodes the conditions to be met
     922                 : };
     923                 : 
     924                 : 
     925                 : Here is a suffix borrowed from the en_US.aff file.  This file 
     926                 : is whitespace delimited.
     927                 : 
     928                 : SFX D Y 4 
     929                 : SFX D   0     e          d
     930                 : SFX D   y     ied        [^aeiou]y
     931                 : SFX D   0     ed         [^ey]
     932                 : SFX D   0     ed         [aeiou]y
     933                 : 
     934                 : This information can be interpreted as follows:
     935                 : 
     936                 : The first line has 4 fields:
     937                 : 
     938                 : Field
     939                 : -----
     940                 : 1     SFX - indicates this is a suffix
     941                 : 2     D   - is the name of the character flag which represents this suffix
     942                 : 3     Y   - indicates it can be combined with prefixes (cross product)
     943                 : 4     4   - indicates that sequence of 4 affentry structures are needed to
     944                 :                properly store the affix information
     945                 : 
     946                 : The remaining lines describe the unique information for the 4 SfxEntry 
     947                 : objects that make up this affix.  Each line can be interpreted
     948                 : as follows (note that fields 1 and 2 serve as a check against the line 1 info):
     949                 : 
     950                 : Field
     951                 : -----
     952                 : 1     SFX         - indicates this is a suffix
     953                 : 2     D           - is the name of the character flag for this affix
     954                 : 3     y           - the string of chars to strip off before adding affix
     955                 :                          (a 0 here indicates the NULL string)
     956                 : 4     ied         - the string of affix characters to add
     957                 : 5     [^aeiou]y   - the conditions which must be met before the affix
     958                 :                     can be applied
     959                 : 
     960                 : Field 5 is interesting.  Since this is a suffix, field 5 tells us that
     961                 : there are 2 conditions that must be met.  The first condition is that 
     962                 : the next to the last character in the word must *NOT* be any of the 
     963                 : following "a", "e", "i", "o" or "u".  The second condition is that
     964                 : the last character of the word must end in "y".
     965                 : 
     966                 : So how can we encode this information concisely and be able to 
     967                 : test for both conditions in a fast manner?  The answer is found
     968                 : by studying the wonderful ispell code of Geoff Kuenning, et al.
     969                 : (now available under a normal BSD license).
     970                 : 
     971                 : If we set up a conds array of 256 bytes indexed (0 to 255) and access it
     972                 : using a character (cast to an unsigned char) of a string, we have 8 bits
     973                 : of information we can store about that character.  Specifically we
     974                 : could use each bit to say if that character is allowed in any of the 
     975                 : last (or first for prefixes) 8 characters of the word.
     976                 : 
     977                 : Basically, each character at one end of the word (up to the number 
     978                 : of conditions) is used to index into the conds array and the resulting 
     979                 : value found there says whether that character is valid for a 
     980                 : specific character position in the word.  
     981                 : 
     982                 : For prefixes, it does this by setting bit 0 if that char is valid 
     983                 : in the first position, bit 1 if valid in the second position, and so on. 
     984                 : 
     985                 : If a bit is not set, then that char is not valid for that position in the
     986                 : word.
     987                 : 
     988                 : If working with suffixes bit 0 is used for the character closest 
     989                 : to the front, bit 1 for the next character towards the end, ..., 
     990                 : with bit numconds-1 representing the last char at the end of the string. 
     991                 : 
     992                 : Note: since entries in the conds[] are 8 bits, only 8 conditions 
     993                 : (read that only 8 character positions) can be examined at one
     994                 : end of a word (the beginning for prefixes and the end for suffixes).
     995                 : 
     996                 : So to make this clearer, let's encode the conds array values for the 
     997                 : first two affentries for the suffix D described earlier.
     998                 : 
     999                 : 
    1000                 :   For the first affentry:    
    1001                 :      numconds = 1             (only examine the last character)
    1002                 : 
    1003                 :      conds['e'] =  (1 << 0)   (the word must end in an 'e')
    1004                 :      all others are all 0
    1005                 : 
    1006                 :   For the second affentry:
    1007                 :      numconds = 2             (only examine the last two characters)     
    1008                 : 
    1009                 :      conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
    1010                 :          where X is all characters *but* a, e, i, o, or u
    1011                 :          
    1012                 : 
    1013                 :      conds['y'] = (1 << 1)     (the last char must be a y)
    1014                 :      all other bits for all other entries in the conds array are zero
    1015                 : 
    1016                 : 
    1017                 : #endif
    1018                 : 

Generated by: LCOV version 1.7