1 : /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is Mozilla Universal charset detector code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 2001
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : * Shy Shalom <shooshX@gmail.com>
24 : *
25 : * Alternatively, the contents of this file may be used under the terms of
26 : * either the GNU General Public License Version 2 or later (the "GPL"), or
27 : * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 : * in which case the provisions of the GPL or the LGPL are applicable instead
29 : * of those above. If you wish to allow use of your version of this file only
30 : * under the terms of either the GPL or the LGPL, and not to allow others to
31 : * use your version of this file under the terms of the MPL, indicate your
32 : * decision by deleting the provisions above and replace them with the notice
33 : * and other provisions required by the GPL or the LGPL. If you do not delete
34 : * the provisions above, a recipient may use your version of this file under
35 : * the terms of any one of the MPL, the GPL or the LGPL.
36 : *
37 : * ***** END LICENSE BLOCK ***** */
38 :
39 : #include "nsLatin1Prober.h"
40 : #include "prmem.h"
41 : #include <stdio.h>
42 :
// Character classes used by the Latin1 prober. Every byte value is mapped
// to exactly one of these by Latin1_CharToClass below.
#define UDF    0        // undefined
#define OTH    1        // other
#define ASC    2        // ascii capital letter
#define ASS    3        // ascii small letter
#define ACV    4        // accent capital vowel
#define ACO    5        // accent capital other
#define ASV    6        // accent small vowel
#define ASO    7        // accent small other
#define CLASS_NUM   8    // total classes

// Byte value -> character class. The table has one entry for each of the
// 256 possible byte values and is indexed with the byte converted to
// unsigned char. Each row below covers sixteen consecutive byte values.
static const unsigned char Latin1_CharToClass[] =
{
  /* 00 - 0F */ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
  /* 10 - 1F */ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
  /* 20 - 2F */ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
  /* 30 - 3F */ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
  /* 40 - 4F */ OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
  /* 50 - 5F */ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,
  /* 60 - 6F */ OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,
  /* 70 - 7F */ ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,
  /* 80 - 8F */ OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,
  /* 90 - 9F */ UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,
  /* A0 - AF */ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
  /* B0 - BF */ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
  /* C0 - CF */ ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,
  /* D0 - DF */ ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,
  /* E0 - EF */ ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,
  /* F0 - FF */ ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,
};
88 :
89 :
/* Likelihood of one character class being followed by another.
 *
 * The table is a flattened 8x8 matrix: the row is the class of the previous
 * character, the column is the class of the current character, and the
 * entry is looked up as [previous * 8 + current].
 *
 * Entry values:
 *   0 : illegal
 *   1 : very unlikely
 *   2 : normal
 *   3 : very likely
 */
static const unsigned char Latin1ClassModel[] =
{
  /* current:   UDF OTH ASC ASS ACV ACO ASV ASO */
  /* prev UDF */ 0,  0,  0,  0,  0,  0,  0,  0,
  /* prev OTH */ 0,  3,  3,  3,  3,  3,  3,  3,
  /* prev ASC */ 0,  3,  3,  3,  3,  3,  3,  3,
  /* prev ASS */ 0,  3,  3,  3,  1,  1,  3,  3,
  /* prev ACV */ 0,  3,  3,  3,  1,  2,  1,  2,
  /* prev ACO */ 0,  3,  3,  3,  3,  3,  3,  3,
  /* prev ASV */ 0,  3,  1,  3,  1,  1,  1,  3,
  /* prev ASO */ 0,  3,  1,  3,  1,  1,  3,  3,
};
107 :
108 0 : void nsLatin1Prober::Reset(void)
109 : {
110 0 : mState = eDetecting;
111 0 : mLastCharClass = OTH;
112 0 : for (int i = 0; i < FREQ_CAT_NUM; i++)
113 0 : mFreqCounter[i] = 0;
114 0 : }
115 :
116 :
117 0 : nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen)
118 : {
119 0 : char *newBuf1 = 0;
120 0 : PRUint32 newLen1 = 0;
121 :
122 0 : if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
123 0 : newBuf1 = (char*)aBuf;
124 0 : newLen1 = aLen;
125 : }
126 :
127 : unsigned char charClass;
128 : unsigned char freq;
129 0 : for (PRUint32 i = 0; i < newLen1; i++)
130 : {
131 0 : charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
132 0 : freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass];
133 0 : if (freq == 0) {
134 0 : mState = eNotMe;
135 0 : break;
136 : }
137 0 : mFreqCounter[freq]++;
138 0 : mLastCharClass = charClass;
139 : }
140 :
141 0 : if (newBuf1 != aBuf)
142 0 : PR_FREEIF(newBuf1);
143 :
144 0 : return mState;
145 : }
146 :
147 0 : float nsLatin1Prober::GetConfidence(void)
148 : {
149 0 : if (mState == eNotMe)
150 0 : return 0.01f;
151 :
152 : float confidence;
153 0 : PRUint32 total = 0;
154 0 : for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++)
155 0 : total += mFreqCounter[i];
156 :
157 0 : if(!total)
158 0 : confidence = 0.0f;
159 : else
160 : {
161 0 : confidence = mFreqCounter[3]*1.0f / total;
162 0 : confidence -= mFreqCounter[1]*20.0f/total;
163 : }
164 :
165 0 : if (confidence < 0.0f)
166 0 : confidence = 0.0f;
167 :
168 : // lower the confidence of latin1 so that other more accurate detector
169 : // can take priority.
170 0 : confidence *= 0.50f;
171 :
172 0 : return confidence;
173 : }
174 :
#ifdef DEBUG_chardet
// Debug-only helper: prints the current confidence and the charset name to
// standard output.
void nsLatin1Prober::DumpStatus()
{
  const float confidence = GetConfidence();
  printf(" Latin1Prober: %1.3f [%s]\r\n", confidence, GetCharSetName());
}
#endif
181 :
182 :
|