1 : /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is Mozilla Universal charset detector code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 2001
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : * Shy Shalom <shooshX@gmail.com>
24 : *
25 : * Alternatively, the contents of this file may be used under the terms of
26 : * either the GNU General Public License Version 2 or later (the "GPL"), or
27 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 : * in which case the provisions of the GPL or the LGPL are applicable instead
29 : * of those above. If you wish to allow use of your version of this file only
30 : * under the terms of either the GPL or the LGPL, and not to allow others to
31 : * use your version of this file under the terms of the MPL, indicate your
32 : * decision by deleting the provisions above and replace them with the notice
33 : * and other provisions required by the GPL or the LGPL. If you do not delete
34 : * the provisions above, a recipient may use your version of this file under
35 : * the terms of any one of the MPL, the GPL or the LGPL.
36 : *
37 : * ***** END LICENSE BLOCK ***** */
38 :
39 : #include <stdio.h>
40 : #include "prmem.h"
41 :
42 : #include "nsSBCharSetProber.h"
43 : #include "nsSBCSGroupProber.h"
44 :
45 : #include "nsHebrewProber.h"
46 :
47 0 : nsSBCSGroupProber::nsSBCSGroupProber()
48 : {
49 0 : mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model);
50 0 : mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel);
51 0 : mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model);
52 0 : mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel);
53 0 : mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model);
54 0 : mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model);
55 0 : mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model);
56 0 : mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
57 0 : mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
58 0 : mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
59 0 : mProbers[10] = new nsSingleByteCharSetProber(&TIS620ThaiModel);
60 :
61 0 : nsHebrewProber *hebprober = new nsHebrewProber();
62 : // Notice: Any change in these indexes - 10,11,12 must be reflected
63 : // in the code below as well.
64 0 : mProbers[11] = hebprober;
65 0 : mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, false, hebprober); // Logical Hebrew
66 0 : mProbers[13] = new nsSingleByteCharSetProber(&Win1255Model, true, hebprober); // Visual Hebrew
67 : // Tell the Hebrew prober about the logical and visual probers
68 0 : if (mProbers[11] && mProbers[12] && mProbers[13]) // all are not null
69 : {
70 0 : hebprober->SetModelProbers(mProbers[12], mProbers[13]);
71 : }
72 : else // One or more is null. avoid any Hebrew probing, null them all
73 : {
74 0 : for (PRUint32 i = 11; i <= 13; ++i)
75 : {
76 0 : delete mProbers[i];
77 0 : mProbers[i] = 0;
78 : }
79 : }
80 :
81 : // disable latin2 before latin1 is available, otherwise all latin1
82 : // will be detected as latin2 because of their similarity.
83 : //mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
84 : //mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
85 :
86 0 : Reset();
87 0 : }
88 :
89 0 : nsSBCSGroupProber::~nsSBCSGroupProber()
90 : {
91 0 : for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
92 : {
93 0 : delete mProbers[i];
94 : }
95 0 : }
96 :
97 :
98 0 : const char* nsSBCSGroupProber::GetCharSetName()
99 : {
100 : //if we have no answer yet
101 0 : if (mBestGuess == -1)
102 : {
103 0 : GetConfidence();
104 : //no charset seems positive
105 0 : if (mBestGuess == -1)
106 : //we will use default.
107 0 : mBestGuess = 0;
108 : }
109 0 : return mProbers[mBestGuess]->GetCharSetName();
110 : }
111 :
112 0 : void nsSBCSGroupProber::Reset(void)
113 : {
114 0 : mActiveNum = 0;
115 0 : for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
116 : {
117 0 : if (mProbers[i]) // not null
118 : {
119 0 : mProbers[i]->Reset();
120 0 : mIsActive[i] = true;
121 0 : ++mActiveNum;
122 : }
123 : else
124 0 : mIsActive[i] = false;
125 : }
126 0 : mBestGuess = -1;
127 0 : mState = eDetecting;
128 0 : }
129 :
130 :
131 0 : nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
132 : {
133 : nsProbingState st;
134 : PRUint32 i;
135 0 : char *newBuf1 = 0;
136 0 : PRUint32 newLen1 = 0;
137 :
138 : //apply filter to original buffer, and we got new buffer back
139 : //depend on what script it is, we will feed them the new buffer
140 : //we got after applying proper filter
141 : //this is done without any consideration to KeepEnglishLetters
142 : //of each prober since as of now, there are no probers here which
143 : //recognize languages with English characters.
144 0 : if (!FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1))
145 0 : goto done;
146 :
147 0 : if (newLen1 == 0)
148 0 : goto done; // Nothing to see here, move on.
149 :
150 0 : for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
151 : {
152 0 : if (!mIsActive[i])
153 0 : continue;
154 0 : st = mProbers[i]->HandleData(newBuf1, newLen1);
155 0 : if (st == eFoundIt)
156 : {
157 0 : mBestGuess = i;
158 0 : mState = eFoundIt;
159 0 : break;
160 : }
161 0 : else if (st == eNotMe)
162 : {
163 0 : mIsActive[i] = false;
164 0 : mActiveNum--;
165 0 : if (mActiveNum <= 0)
166 : {
167 0 : mState = eNotMe;
168 0 : break;
169 : }
170 : }
171 : }
172 :
173 : done:
174 0 : PR_FREEIF(newBuf1);
175 :
176 0 : return mState;
177 : }
178 :
179 0 : float nsSBCSGroupProber::GetConfidence(void)
180 : {
181 : PRUint32 i;
182 0 : float bestConf = 0.0, cf;
183 :
184 0 : switch (mState)
185 : {
186 : case eFoundIt:
187 0 : return (float)0.99; //sure yes
188 : case eNotMe:
189 0 : return (float)0.01; //sure no
190 : default:
191 0 : for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
192 : {
193 0 : if (!mIsActive[i])
194 0 : continue;
195 0 : cf = mProbers[i]->GetConfidence();
196 0 : if (bestConf < cf)
197 : {
198 0 : bestConf = cf;
199 0 : mBestGuess = i;
200 : }
201 : }
202 : }
203 0 : return bestConf;
204 : }
205 :
206 : #ifdef DEBUG_chardet
207 : void nsSBCSGroupProber::DumpStatus()
208 : {
209 : PRUint32 i;
210 : float cf;
211 :
212 : cf = GetConfidence();
213 : printf(" SBCS Group Prober --------begin status \r\n");
214 : for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
215 : {
216 : if (!mIsActive[i])
217 : printf(" inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName());
218 : else
219 : mProbers[i]->DumpStatus();
220 : }
221 : printf(" SBCS Group found best match [%s] confidence %f.\r\n",
222 : mProbers[mBestGuess]->GetCharSetName(), cf);
223 : }
224 : #endif
|