1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is Mozilla Universal charset detector code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 2001
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : * Shy Shalom <shooshX@gmail.com>
24 : *
25 : * Alternatively, the contents of this file may be used under the terms of
26 : * either the GNU General Public License Version 2 or later (the "GPL"), or
27 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 : * in which case the provisions of the GPL or the LGPL are applicable instead
29 : * of those above. If you wish to allow use of your version of this file only
30 : * under the terms of either the GPL or the LGPL, and not to allow others to
31 : * use your version of this file under the terms of the MPL, indicate your
32 : * decision by deleting the provisions above and replace them with the notice
33 : * and other provisions required by the GPL or the LGPL. If you do not delete
34 : * the provisions above, a recipient may use your version of this file under
35 : * the terms of any one of the MPL, the GPL or the LGPL.
36 : *
37 : * ***** END LICENSE BLOCK ***** */
38 :
39 : #include "nscore.h"
40 :
41 : #include "nsUniversalDetector.h"
42 :
43 : #include "nsMBCSGroupProber.h"
44 : #include "nsSBCSGroupProber.h"
45 : #include "nsEscCharsetProber.h"
46 : #include "nsLatin1Prober.h"
47 :
48 0 : nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
49 : {
50 0 : mDone = false;
51 0 : mBestGuess = -1; //illegal value as signal
52 0 : mInTag = false;
53 0 : mEscCharSetProber = nsnull;
54 :
55 0 : mStart = true;
56 0 : mDetectedCharset = nsnull;
57 0 : mGotData = false;
58 0 : mInputState = ePureAscii;
59 0 : mLastChar = '\0';
60 0 : mLanguageFilter = aLanguageFilter;
61 :
62 : PRUint32 i;
63 0 : for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
64 0 : mCharSetProbers[i] = nsnull;
65 0 : }
66 :
67 0 : nsUniversalDetector::~nsUniversalDetector()
68 : {
69 0 : for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
70 0 : delete mCharSetProbers[i];
71 :
72 0 : delete mEscCharSetProber;
73 0 : }
74 :
75 : void
76 0 : nsUniversalDetector::Reset()
77 : {
78 0 : mDone = false;
79 0 : mBestGuess = -1; //illegal value as signal
80 0 : mInTag = false;
81 :
82 0 : mStart = true;
83 0 : mDetectedCharset = nsnull;
84 0 : mGotData = false;
85 0 : mInputState = ePureAscii;
86 0 : mLastChar = '\0';
87 :
88 0 : if (mEscCharSetProber)
89 0 : mEscCharSetProber->Reset();
90 :
91 : PRUint32 i;
92 0 : for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
93 0 : if (mCharSetProbers[i])
94 0 : mCharSetProbers[i]->Reset();
95 0 : }
96 :
97 : //---------------------------------------------------------------------
// Confidence thresholds, expressed as floats.
// MINIMUM_THRESHOLD is the confidence a prober must exceed in DataEnd()
// before its charset is reported.
#define SHORTCUT_THRESHOLD (static_cast<float>(0.95))
#define MINIMUM_THRESHOLD  (static_cast<float>(0.20))
100 :
101 0 : nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
102 : {
103 0 : if(mDone)
104 0 : return NS_OK;
105 :
106 0 : if (aLen > 0)
107 0 : mGotData = true;
108 :
109 : //If the data starts with BOM, we know it is UTF
110 0 : if (mStart)
111 : {
112 0 : mStart = false;
113 0 : if (aLen > 2)
114 0 : switch (aBuf[0])
115 : {
116 : case '\xEF':
117 0 : if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
118 : // EF BB BF UTF-8 encoded BOM
119 0 : mDetectedCharset = "UTF-8";
120 0 : break;
121 : case '\xFE':
122 0 : if ('\xFF' == aBuf[1])
123 : // FE FF UTF-16, big endian BOM
124 0 : mDetectedCharset = "UTF-16";
125 0 : break;
126 : case '\xFF':
127 0 : if ('\xFE' == aBuf[1])
128 : // FF FE UTF-16, little endian BOM
129 0 : mDetectedCharset = "UTF-16";
130 0 : break;
131 : } // switch
132 :
133 0 : if (mDetectedCharset)
134 : {
135 0 : mDone = true;
136 0 : return NS_OK;
137 : }
138 : }
139 :
140 : PRUint32 i;
141 0 : for (i = 0; i < aLen; i++)
142 : {
143 : //other than 0xa0, if every othe character is ascii, the page is ascii
144 0 : if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP
145 : {
146 : //we got a non-ascii byte (high-byte)
147 0 : if (mInputState != eHighbyte)
148 : {
149 : //adjust state
150 0 : mInputState = eHighbyte;
151 :
152 : //kill mEscCharSetProber if it is active
153 0 : if (mEscCharSetProber) {
154 0 : delete mEscCharSetProber;
155 0 : mEscCharSetProber = nsnull;
156 : }
157 :
158 : //start multibyte and singlebyte charset prober
159 0 : if (nsnull == mCharSetProbers[0])
160 : {
161 0 : mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
162 0 : if (nsnull == mCharSetProbers[0])
163 0 : return NS_ERROR_OUT_OF_MEMORY;
164 : }
165 0 : if (nsnull == mCharSetProbers[1] &&
166 : (mLanguageFilter & NS_FILTER_NON_CJK))
167 : {
168 0 : mCharSetProbers[1] = new nsSBCSGroupProber;
169 0 : if (nsnull == mCharSetProbers[1])
170 0 : return NS_ERROR_OUT_OF_MEMORY;
171 : }
172 0 : if (nsnull == mCharSetProbers[2])
173 : {
174 0 : mCharSetProbers[2] = new nsLatin1Prober;
175 0 : if (nsnull == mCharSetProbers[2])
176 0 : return NS_ERROR_OUT_OF_MEMORY;
177 : }
178 : }
179 : }
180 : else
181 : {
182 : //ok, just pure ascii so far
183 0 : if ( ePureAscii == mInputState &&
184 0 : (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
185 : {
186 : //found escape character or HZ "~{"
187 0 : mInputState = eEscAscii;
188 : }
189 0 : mLastChar = aBuf[i];
190 : }
191 : }
192 :
193 : nsProbingState st;
194 0 : switch (mInputState)
195 : {
196 : case eEscAscii:
197 0 : if (nsnull == mEscCharSetProber) {
198 0 : mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
199 0 : if (nsnull == mEscCharSetProber)
200 0 : return NS_ERROR_OUT_OF_MEMORY;
201 : }
202 0 : st = mEscCharSetProber->HandleData(aBuf, aLen);
203 0 : if (st == eFoundIt)
204 : {
205 0 : mDone = true;
206 0 : mDetectedCharset = mEscCharSetProber->GetCharSetName();
207 : }
208 0 : break;
209 : case eHighbyte:
210 0 : for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
211 : {
212 0 : if (mCharSetProbers[i])
213 : {
214 0 : st = mCharSetProbers[i]->HandleData(aBuf, aLen);
215 0 : if (st == eFoundIt)
216 : {
217 0 : mDone = true;
218 0 : mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
219 0 : return NS_OK;
220 : }
221 : }
222 : }
223 0 : break;
224 :
225 : default: //pure ascii
226 : ;//do nothing here
227 : }
228 0 : return NS_OK;
229 : }
230 :
231 :
232 : //---------------------------------------------------------------------
233 0 : void nsUniversalDetector::DataEnd()
234 : {
235 0 : if (!mGotData)
236 : {
237 : // we haven't got any data yet, return immediately
238 : // caller program sometimes call DataEnd before anything has been sent to detector
239 0 : return;
240 : }
241 :
242 0 : if (mDetectedCharset)
243 : {
244 0 : mDone = true;
245 0 : Report(mDetectedCharset);
246 0 : return;
247 : }
248 :
249 0 : switch (mInputState)
250 : {
251 : case eHighbyte:
252 : {
253 : float proberConfidence;
254 0 : float maxProberConfidence = (float)0.0;
255 0 : PRInt32 maxProber = 0;
256 :
257 0 : for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
258 : {
259 0 : if (mCharSetProbers[i])
260 : {
261 0 : proberConfidence = mCharSetProbers[i]->GetConfidence();
262 0 : if (proberConfidence > maxProberConfidence)
263 : {
264 0 : maxProberConfidence = proberConfidence;
265 0 : maxProber = i;
266 : }
267 : }
268 : }
269 : //do not report anything because we are not confident of it, that's in fact a negative answer
270 0 : if (maxProberConfidence > MINIMUM_THRESHOLD)
271 0 : Report(mCharSetProbers[maxProber]->GetCharSetName());
272 : }
273 0 : break;
274 : case eEscAscii:
275 0 : break;
276 : default:
277 : ;
278 : }
279 0 : return;
280 : }
|