1 : /******* BEGIN LICENSE BLOCK *******
2 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 : *
4 : * The contents of this file are subject to the Mozilla Public License Version
5 : * 1.1 (the "License"); you may not use this file except in compliance with
6 : * the License. You may obtain a copy of the License at
7 : * http://www.mozilla.org/MPL/
8 : *
9 : * Software distributed under the License is distributed on an "AS IS" basis,
10 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 : * for the specific language governing rights and limitations under the
12 : * License.
13 : *
14 : * The Initial Developer of the Original Code is Björn Jacke. Portions created
15 : * by the Initial Developers are Copyright (C) 2000-2007 the Initial
16 : * Developers. All Rights Reserved.
17 : *
18 : * Contributor(s): Björn Jacke (bjoern.jacke@gmx.de)
19 : * László Németh (nemethl@gyorsposta.hu)
20 : * Caolan McNamara (caolanm@redhat.com)
21 : *
22 : * Alternatively, the contents of this file may be used under the terms of
23 : * either the GNU General Public License Version 2 or later (the "GPL"), or
24 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
25 : * in which case the provisions of the GPL or the LGPL are applicable instead
26 : * of those above. If you wish to allow use of your version of this file only
27 : * under the terms of either the GPL or the LGPL, and not to allow others to
28 : * use your version of this file under the terms of the MPL, indicate your
29 : * decision by deleting the provisions above and replace them with the notice
30 : * and other provisions required by the GPL or the LGPL. If you do not delete
31 : * the provisions above, a recipient may use your version of this file under
32 : * the terms of any one of the MPL, the GPL or the LGPL.
33 : *
34 : * Changelog:
35 : * 2000-01-05 Björn Jacke <bjoern.jacke AT gmx.de>
36 : * Initial Release inspired by the article about phonetic
37 : * transformations out of c't 25/1999
38 : *
39 : * 2007-07-26 Björn Jacke <bjoern.jacke AT gmx.de>
40 : * Released under MPL/GPL/LGPL tri-license for Hunspell
41 : *
42 : * 2007-08-23 László Németh <nemeth at OOo>
43 : * Porting from Aspell to Hunspell using C-like structs
44 : *
45 : ******* END LICENSE BLOCK *******/
46 :
47 : #include <stdlib.h>
48 : #include <string.h>
49 : #include <stdio.h>
50 : #include <ctype.h>
51 :
52 : #include "csutil.hxx"
53 : #include "phonet.hxx"
54 :
55 1 : void init_phonet_hash(phonetable & parms)
56 : {
57 : int i, k;
58 :
59 257 : for (i = 0; i < HASHSIZE; i++) {
60 256 : parms.hash[i] = -1;
61 : }
62 :
63 106 : for (i = 0; parms.rules[i][0] != '\0'; i += 2) {
64 : /** set hash value **/
65 105 : k = (unsigned char) parms.rules[i][0];
66 :
67 105 : if (parms.hash[k] < 0) {
68 27 : parms.hash[k] = i;
69 : }
70 : }
71 1 : }
72 :
// Like strcpy, but safe when the two strings overlap.
// Copies the NUL-terminated string at src (terminator included) to dest.
// memmove is used so the copy is correct whether dest lies before or
// after src; a plain forward byte loop is only correct for dest < src.
static inline void strmove(char * dest, const char * src) {
    memmove(dest, src, strlen(src) + 1);
}
80 :
// Letter test used while matching phonetic rules.
// Bytes with value >= 128 (i.e. outside 7-bit ASCII) are always reported as
// letters (result 1); bytes below 128 are classified by isalpha().
static int myisalpha(char ch) {
    const unsigned char byte = (unsigned char) ch;
    return (byte >= 128) ? 1 : isalpha(byte);
}
85 :
86 : /* phonetic transcription algorithm */
87 : /* see: http://aspell.net/man-html/Phonetic-Code.html */
88 : /* convert string to uppercase before this call */
89 0 : int phonet (const char * inword, char * target,
90 : int len,
91 : phonetable & parms)
92 : {
93 : /** Do phonetic transformation. **/
94 : /** "len" = length of "inword" incl. '\0'. **/
95 :
96 : /** result: >= 0: length of "target" **/
97 : /** otherwise: error **/
98 :
99 0 : int i,j,k=0,n,p,z;
100 0 : int k0,n0,p0=-333,z0;
101 : char c, c0;
102 : const char * s;
103 : typedef unsigned char uchar;
104 : char word[MAXPHONETUTF8LEN + 1];
105 0 : if (len == -1) len = strlen(inword);
106 0 : if (len > MAXPHONETUTF8LEN) return 0;
107 0 : strcpy(word, inword);
108 :
109 : /** check word **/
110 0 : i = j = z = 0;
111 0 : while ((c = word[i]) != '\0') {
112 0 : n = parms.hash[(uchar) c];
113 0 : z0 = 0;
114 :
115 0 : if (n >= 0) {
116 : /** check all rules for the same letter **/
117 0 : while (parms.rules[n][0] == c) {
118 :
119 : /** check whole string **/
120 0 : k = 1; /** number of found letters **/
121 0 : p = 5; /** default priority **/
122 0 : s = parms.rules[n];
123 0 : s++; /** important for (see below) "*(s-1)" **/
124 :
125 0 : while (*s != '\0' && word[i+k] == *s
126 0 : && !isdigit ((unsigned char) *s) && strchr ("(-<^$", *s) == NULL) {
127 0 : k++;
128 0 : s++;
129 : }
130 0 : if (*s == '(') {
131 : /** check letters in "(..)" **/
132 0 : if (myisalpha(word[i+k]) // ...could be implied?
133 0 : && strchr(s+1, word[i+k]) != NULL) {
134 0 : k++;
135 0 : while (*s != ')')
136 0 : s++;
137 0 : s++;
138 : }
139 : }
140 0 : p0 = (int) *s;
141 0 : k0 = k;
142 0 : while (*s == '-' && k > 1) {
143 0 : k--;
144 0 : s++;
145 : }
146 0 : if (*s == '<')
147 0 : s++;
148 0 : if (isdigit ((unsigned char) *s)) {
149 : /** determine priority **/
150 0 : p = *s - '0';
151 0 : s++;
152 : }
153 0 : if (*s == '^' && *(s+1) == '^')
154 0 : s++;
155 :
156 0 : if (*s == '\0'
157 : || (*s == '^'
158 0 : && (i == 0 || ! myisalpha(word[i-1]))
159 0 : && (*(s+1) != '$'
160 0 : || (! myisalpha(word[i+k0]) )))
161 : || (*s == '$' && i > 0
162 0 : && myisalpha(word[i-1])
163 0 : && (! myisalpha(word[i+k0]) )))
164 : {
165 : /** search for followup rules, if: **/
166 : /** parms.followup and k > 1 and NO '-' in searchstring **/
167 0 : c0 = word[i+k-1];
168 0 : n0 = parms.hash[(uchar) c0];
169 :
170 : // if (parms.followup && k > 1 && n0 >= 0
171 0 : if (k > 1 && n0 >= 0
172 0 : && p0 != (int) '-' && word[i+k] != '\0') {
173 : /** test follow-up rule for "word[i+k]" **/
174 0 : while (parms.rules[n0][0] == c0) {
175 :
176 : /** check whole string **/
177 0 : k0 = k;
178 0 : p0 = 5;
179 0 : s = parms.rules[n0];
180 0 : s++;
181 0 : while (*s != '\0' && word[i+k0] == *s
182 0 : && ! isdigit((unsigned char) *s) && strchr("(-<^$",*s) == NULL) {
183 0 : k0++;
184 0 : s++;
185 : }
186 0 : if (*s == '(') {
187 : /** check letters **/
188 0 : if (myisalpha(word[i+k0])
189 0 : && strchr (s+1, word[i+k0]) != NULL) {
190 0 : k0++;
191 0 : while (*s != ')' && *s != '\0')
192 0 : s++;
193 0 : if (*s == ')')
194 0 : s++;
195 : }
196 : }
197 0 : while (*s == '-') {
198 : /** "k0" gets NOT reduced **/
199 : /** because "if (k0 == k)" **/
200 0 : s++;
201 : }
202 0 : if (*s == '<')
203 0 : s++;
204 0 : if (isdigit ((unsigned char) *s)) {
205 0 : p0 = *s - '0';
206 0 : s++;
207 : }
208 :
209 0 : if (*s == '\0'
210 : /** *s == '^' cuts **/
211 0 : || (*s == '$' && ! myisalpha(word[i+k0])))
212 : {
213 0 : if (k0 == k) {
214 : /** this is just a piece of the string **/
215 0 : n0 += 2;
216 0 : continue;
217 : }
218 :
219 0 : if (p0 < p) {
220 : /** priority too low **/
221 0 : n0 += 2;
222 0 : continue;
223 : }
224 : /** rule fits; stop search **/
225 0 : break;
226 : }
227 0 : n0 += 2;
228 : } /** End of "while (parms.rules[n0][0] == c0)" **/
229 :
230 0 : if (p0 >= p && parms.rules[n0][0] == c0) {
231 0 : n += 2;
232 0 : continue;
233 : }
234 : } /** end of follow-up stuff **/
235 :
236 : /** replace string **/
237 0 : s = parms.rules[n+1];
238 0 : p0 = (parms.rules[n][0] != '\0'
239 0 : && strchr (parms.rules[n]+1,'<') != NULL) ? 1:0;
240 0 : if (p0 == 1 && z == 0) {
241 : /** rule with '<' is used **/
242 0 : if (j > 0 && *s != '\0'
243 0 : && (target[j-1] == c || target[j-1] == *s)) {
244 0 : j--;
245 : }
246 0 : z0 = 1;
247 0 : z = 1;
248 0 : k0 = 0;
249 0 : while (*s != '\0' && word[i+k0] != '\0') {
250 0 : word[i+k0] = *s;
251 0 : k0++;
252 0 : s++;
253 : }
254 0 : if (k > k0)
255 0 : strmove (&word[0]+i+k0, &word[0]+i+k);
256 :
257 : /** new "actual letter" **/
258 0 : c = word[i];
259 : }
260 : else { /** no '<' rule used **/
261 0 : i += k - 1;
262 0 : z = 0;
263 0 : while (*s != '\0'
264 0 : && *(s+1) != '\0' && j < len) {
265 0 : if (j == 0 || target[j-1] != *s) {
266 0 : target[j] = *s;
267 0 : j++;
268 : }
269 0 : s++;
270 : }
271 : /** new "actual letter" **/
272 0 : c = *s;
273 0 : if (parms.rules[n][0] != '\0'
274 0 : && strstr (parms.rules[n]+1, "^^") != NULL) {
275 0 : if (c != '\0') {
276 0 : target[j] = c;
277 0 : j++;
278 : }
279 0 : strmove (&word[0], &word[0]+i+1);
280 0 : i = 0;
281 0 : z0 = 1;
282 : }
283 : }
284 0 : break;
285 : } /** end of follow-up stuff **/
286 0 : n += 2;
287 : } /** end of while (parms.rules[n][0] == c) **/
288 : } /** end of if (n >= 0) **/
289 0 : if (z0 == 0) {
290 : // if (k && (assert(p0!=-333),!p0) && j < len && c != '\0'
291 : // && (!parms.collapse_result || j == 0 || target[j-1] != c)){
292 0 : if (k && !p0 && j < len && c != '\0'
293 : && (1 || j == 0 || target[j-1] != c)){
294 : /** condense only double letters **/
295 0 : target[j] = c;
296 : ///printf("\n setting \n");
297 0 : j++;
298 : }
299 :
300 0 : i++;
301 0 : z = 0;
302 0 : k=0;
303 : }
304 : } /** end of while ((c = word[i]) != '\0') **/
305 :
306 0 : target[j] = '\0';
307 0 : return (j);
308 :
309 : } /** end of function "phonet" **/
|