1 : /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is Mozilla Universal charset detector code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 2001
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : * Shy Shalom <shooshX@gmail.com>
24 : * Proofpoint, Inc.
25 : *
26 : * Alternatively, the contents of this file may be used under the terms of
27 : * either the GNU General Public License Version 2 or later (the "GPL"), or
28 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 : * in which case the provisions of the GPL or the LGPL are applicable instead
30 : * of those above. If you wish to allow use of your version of this file only
31 : * under the terms of either the GPL or the LGPL, and not to allow others to
32 : * use your version of this file under the terms of the MPL, indicate your
33 : * decision by deleting the provisions above and replace them with the notice
34 : * and other provisions required by the GPL or the LGPL. If you do not delete
35 : * the provisions above, a recipient may use your version of this file under
36 : * the terms of any one of the MPL, the GPL or the LGPL.
37 : *
38 : * ***** END LICENSE BLOCK ***** */
39 : #include <stdio.h>
40 :
41 : #include "nsMBCSGroupProber.h"
42 : #include "nsUniversalDetector.h"
43 :
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
/*
 * Human-readable labels used by the debug-only reporting functions in this
 * file.  Entries are indexed by prober slot, i.e. ProberName[i] labels
 * mProbers[i] as populated by the nsMBCSGroupProber constructor.
 */
const char *ProberName[] =
{
  "UTF8",     /* slot 0 */
  "SJIS",     /* slot 1 */
  "EUCJP",    /* slot 2 */
  "GB18030",  /* slot 3 */
  "EUCKR",    /* slot 4 */
  "Big5",     /* slot 5 */
  "EUCTW",    /* slot 6 */
};

#endif
57 :
58 0 : nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
59 : {
60 0 : for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
61 0 : mProbers[i] = nsnull;
62 :
63 0 : mProbers[0] = new nsUTF8Prober();
64 0 : if (aLanguageFilter & NS_FILTER_JAPANESE)
65 : {
66 0 : mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
67 0 : mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
68 : }
69 0 : if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
70 0 : mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
71 0 : if (aLanguageFilter & NS_FILTER_KOREAN)
72 0 : mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
73 0 : if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL)
74 : {
75 0 : mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
76 0 : mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
77 : }
78 0 : Reset();
79 0 : }
80 :
81 0 : nsMBCSGroupProber::~nsMBCSGroupProber()
82 : {
83 0 : for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
84 : {
85 0 : delete mProbers[i];
86 : }
87 0 : }
88 :
89 0 : const char* nsMBCSGroupProber::GetCharSetName()
90 : {
91 0 : if (mBestGuess == -1)
92 : {
93 0 : GetConfidence();
94 0 : if (mBestGuess == -1)
95 0 : mBestGuess = 0;
96 : }
97 0 : return mProbers[mBestGuess]->GetCharSetName();
98 : }
99 :
100 0 : void nsMBCSGroupProber::Reset(void)
101 : {
102 0 : mActiveNum = 0;
103 0 : for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
104 : {
105 0 : if (mProbers[i])
106 : {
107 0 : mProbers[i]->Reset();
108 0 : mIsActive[i] = true;
109 0 : ++mActiveNum;
110 : }
111 : else
112 0 : mIsActive[i] = false;
113 : }
114 0 : mBestGuess = -1;
115 0 : mState = eDetecting;
116 0 : mKeepNext = 0;
117 0 : }
118 :
119 0 : nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
120 : {
121 : nsProbingState st;
122 0 : PRUint32 start = 0;
123 0 : PRUint32 keepNext = mKeepNext;
124 :
125 : //do filtering to reduce load to probers
126 0 : for (PRUint32 pos = 0; pos < aLen; ++pos)
127 : {
128 0 : if (aBuf[pos] & 0x80)
129 : {
130 0 : if (!keepNext)
131 0 : start = pos;
132 0 : keepNext = 2;
133 : }
134 0 : else if (keepNext)
135 : {
136 0 : if (--keepNext == 0)
137 : {
138 0 : for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
139 : {
140 0 : if (!mIsActive[i])
141 0 : continue;
142 0 : st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start);
143 0 : if (st == eFoundIt)
144 : {
145 0 : mBestGuess = i;
146 0 : mState = eFoundIt;
147 0 : return mState;
148 : }
149 : }
150 : }
151 : }
152 : }
153 :
154 0 : if (keepNext) {
155 0 : for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
156 : {
157 0 : if (!mIsActive[i])
158 0 : continue;
159 0 : st = mProbers[i]->HandleData(aBuf + start, aLen - start);
160 0 : if (st == eFoundIt)
161 : {
162 0 : mBestGuess = i;
163 0 : mState = eFoundIt;
164 0 : return mState;
165 : }
166 : }
167 : }
168 0 : mKeepNext = keepNext;
169 :
170 0 : return mState;
171 : }
172 :
173 0 : float nsMBCSGroupProber::GetConfidence(void)
174 : {
175 : PRUint32 i;
176 0 : float bestConf = 0.0, cf;
177 :
178 0 : switch (mState)
179 : {
180 : case eFoundIt:
181 0 : return (float)0.99;
182 : case eNotMe:
183 0 : return (float)0.01;
184 : default:
185 0 : for (i = 0; i < NUM_OF_PROBERS; i++)
186 : {
187 0 : if (!mIsActive[i])
188 0 : continue;
189 0 : cf = mProbers[i]->GetConfidence();
190 0 : if (bestConf < cf)
191 : {
192 0 : bestConf = cf;
193 0 : mBestGuess = i;
194 : }
195 : }
196 : }
197 0 : return bestConf;
198 : }
199 :
#ifdef DEBUG_chardet
/*
 * Debug helper: prints one line per prober slot to stdout.  Active probers
 * are listed with their current confidence, inactive ones with a fixed
 * message.  GetConfidence() is invoked first, which may update mBestGuess.
 */
void nsMBCSGroupProber::DumpStatus()
{
  GetConfidence();

  for (PRUint32 slot = 0; slot < NUM_OF_PROBERS; ++slot)
  {
    if (mIsActive[slot])
    {
      float confidence = mProbers[slot]->GetConfidence();
      printf(" MBCS %1.3f: [%s]\r\n", confidence, ProberName[slot]);
    }
    else
    {
      printf(" MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[slot]);
    }
  }
}
#endif
219 :
#ifdef DEBUG_jgmyers
/*
 * Debug helper: writes one DetectorState entry per prober slot into states,
 * starting at index offset, and advances offset past the entries written
 * (by NUM_OF_PROBERS in total).
 *
 * Each entry receives the slot's display name, its active flag, and its
 * current confidence (0.0 for inactive slots, whose prober is not queried).
 * No bounds check is made against the size of states.
 */
void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset)
{
  for (PRUint32 slot = 0; slot < NUM_OF_PROBERS; ++slot, ++offset) {
    nsUniversalDetector::DetectorState &entry = states[offset];
    entry.name = ProberName[slot];
    entry.isActive = mIsActive[slot];
    entry.confidence = mIsActive[slot] ? mProbers[slot]->GetConfidence() : 0.0;
  }
}
#endif /* DEBUG_jgmyers */
|