1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is mozilla.org code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 1998
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : *
24 : * Alternatively, the contents of this file may be used under the terms of
25 : * either of the GNU General Public License Version 2 or later (the "GPL"),
26 : * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 : * in which case the provisions of the GPL or the LGPL are applicable instead
28 : * of those above. If you wish to allow use of your version of this file only
29 : * under the terms of either the GPL or the LGPL, and not to allow others to
30 : * use your version of this file under the terms of the MPL, indicate your
31 : * decision by deleting the provisions above and replace them with the notice
32 : * and other provisions required by the GPL or the LGPL. If you do not delete
33 : * the provisions above, a recipient may use your version of this file under
34 : * the terms of any one of the MPL, the GPL or the LGPL.
35 : *
36 : * ***** END LICENSE BLOCK ***** */
37 :
38 :
39 : /**
40 : * MODULE NOTES:
41 : * @update gess 4/1/98
42 : *
43 : * The scanner is a low-level service class that knows
44 : * how to consume characters out of an (internal) stream.
45 : * This class also offers a series of utility methods
46 : * that most tokenizers want, such as ReadUntil()
47 : * and SkipWhitespace().
48 : */
49 :
50 :
51 : #ifndef SCANNER
52 : #define SCANNER
53 :
54 : #include "nsCOMPtr.h"
55 : #include "nsString.h"
56 : #include "nsIParser.h"
57 : #include "prtypes.h"
58 : #include "nsIUnicodeDecoder.h"
59 : #include "nsScannerString.h"
60 :
61 : class nsParser;
62 :
63 : class nsReadEndCondition {
64 : public:
65 : const PRUnichar *mChars;
66 : PRUnichar mFilter;
67 : explicit nsReadEndCondition(const PRUnichar* aTerminateChars);
68 : private:
69 : nsReadEndCondition(const nsReadEndCondition& aOther); // No copying
70 : void operator=(const nsReadEndCondition& aOther); // No assigning
71 : };
72 :
73 : class nsScanner {
74 : public:
75 :
76 : /**
77 : * Use this constructor if you want i/o to be based on
78 : * a single string you hand in during construction.
79 : * This short cut was added for Javascript.
80 : *
81 : * @update ftang 3/02/99
82 : * @param aCharset charset
83 : * @param aCharsetSource - where the charset info came from
84 : * @param aMode represents the parser mode (nav, other)
85 : * @return
86 : */
87 : nsScanner(const nsAString& anHTMLString, const nsACString& aCharset, PRInt32 aSource);
88 :
89 : /**
90 : * Use this constructor if you want i/o to be based on
91 : * a file (therefore a stream) or just data you provide via Append().
92 : *
93 : * @update ftang 3/02/99
94 : * @param aCharset charset
95 : * @param aCharsetSource - where the charset info came from
96 : * @param aMode represents the parser mode (nav, other)
97 : * @return
98 : */
99 : nsScanner(nsString& aFilename,bool aCreateStream, const nsACString& aCharset, PRInt32 aSource);
100 :
101 : ~nsScanner();
102 :
103 : /**
104 : * retrieve next char from internal input stream
105 : *
106 : * @update gess 3/25/98
107 : * @param ch is the char to accept new value
108 : * @return error code reflecting read status
109 : */
110 : nsresult GetChar(PRUnichar& ch);
111 :
112 : /**
113 : * peek ahead to consume next char from scanner's internal
114 : * input buffer
115 : *
116 : * @update gess 3/25/98
117 : * @param ch is the char to accept new value
118 : * @return error code reflecting read status
119 : */
120 : nsresult Peek(PRUnichar& ch, PRUint32 aOffset=0);
121 :
122 : nsresult Peek(nsAString& aStr, PRInt32 aNumChars, PRInt32 aOffset = 0);
123 :
124 : /**
125 : * Skip over chars as long as they equal given char
126 : *
127 : * @update gess 3/25/98
128 : * @param char to be skipped
129 : * @return error code
130 : */
131 : nsresult SkipOver(PRUnichar aSkipChar);
132 :
133 : /**
134 : * Skip whitespace on scanner input stream
135 : *
136 : * @update gess 3/25/98
137 : * @return error status
138 : */
139 : nsresult SkipWhitespace(PRInt32& aNewlinesSkipped);
140 :
141 : /**
142 : * Consume characters until you run into space, a '<', a '>', or a '/'.
143 : *
144 : * @param aString - receives new data from stream
145 : * @return error code
146 : */
147 : nsresult ReadTagIdentifier(nsScannerSharedSubstring& aString);
148 :
149 : /**
150 : * Consume characters until you run into a char that's not valid in an
151 : * entity name
152 : *
153 : * @param aString - receives new data from stream
154 : * @return error code
155 : */
156 : nsresult ReadEntityIdentifier(nsString& aString);
157 : nsresult ReadNumber(nsString& aString,PRInt32 aBase);
158 : nsresult ReadWhitespace(nsScannerSharedSubstring& aString,
159 : PRInt32& aNewlinesSkipped,
160 : bool& aHaveCR);
161 : nsresult ReadWhitespace(nsScannerIterator& aStart,
162 : nsScannerIterator& aEnd,
163 : PRInt32& aNewlinesSkipped);
164 :
165 : /**
166 : * Consume characters until you find the terminal char
167 : *
168 : * @update gess 3/25/98
169 : * @param aString receives new data from stream
170 : * @param aTerminal contains terminating char
171 : * @param addTerminal tells us whether to append terminal to aString
172 : * @return error code
173 : */
174 : nsresult ReadUntil(nsAString& aString,
175 : PRUnichar aTerminal,
176 : bool addTerminal);
177 :
178 : /**
179 : * Consume characters until you find one contained in given
180 : * terminal set.
181 : *
182 : * @update gess 3/25/98
183 : * @param aString receives new data from stream
184 : * @param aTermSet contains set of terminating chars
185 : * @param addTerminal tells us whether to append terminal to aString
186 : * @return error code
187 : */
188 : nsresult ReadUntil(nsAString& aString,
189 : const nsReadEndCondition& aEndCondition,
190 : bool addTerminal);
191 :
192 : nsresult ReadUntil(nsScannerSharedSubstring& aString,
193 : const nsReadEndCondition& aEndCondition,
194 : bool addTerminal);
195 :
196 : nsresult ReadUntil(nsScannerIterator& aStart,
197 : nsScannerIterator& aEnd,
198 : const nsReadEndCondition& aEndCondition,
199 : bool addTerminal);
200 :
201 : /**
202 : * Records current offset position in input stream. This allows us
203 : * to back up to this point if the need should arise, such as when
204 : * tokenization gets interrupted.
205 : *
206 : * @update gess 5/12/98
207 : * @param
208 : * @return
209 : */
210 : PRInt32 Mark(void);
211 :
212 : /**
213 : * Resets current offset position of input stream to marked position.
214 : * This allows us to back up to this point if the need should arise,
215 : * such as when tokenization gets interrupted.
216 : * NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST!
217 : *
218 : * @update gess 5/12/98
219 : * @param
220 : * @return
221 : */
222 : void RewindToMark(void);
223 :
224 :
225 : /**
226 : *
227 : *
228 : * @update harishd 01/12/99
229 : * @param
230 : * @return
231 : */
232 : bool UngetReadable(const nsAString& aBuffer);
233 :
234 : /**
235 : *
236 : *
237 : * @update gess 5/13/98
238 : * @param
239 : * @return
240 : */
241 : nsresult Append(const nsAString& aBuffer);
242 :
243 : /**
244 : *
245 : *
246 : * @update gess 5/21/98
247 : * @param
248 : * @return
249 : */
250 : nsresult Append(const char* aBuffer, PRUint32 aLen,
251 : nsIRequest *aRequest);
252 :
253 : /**
254 : * Call this to copy bytes out of the scanner that have not yet been consumed
255 : * by the tokenization process.
256 : *
257 : * @update gess 5/12/98
258 : * @param aCopyBuffer is where the scanner buffer will be copied to
259 : * @return nada
260 : */
261 : void CopyUnusedData(nsString& aCopyBuffer);
262 :
263 : /**
264 : * Retrieve the name of the file that the scanner is reading from.
265 : * In some cases, it's just a given name, because the scanner isn't
266 : * really reading from a file.
267 : *
268 : * @update gess 5/12/98
269 : * @return
270 : */
271 : nsString& GetFilename(void);
272 :
273 : static void SelfTest();
274 :
275 : /**
276 : * Use this setter to change the scanner's unicode decoder
277 : *
278 : * @update ftang 3/02/99
279 : * @param aCharset a normalized (alias resolved) charset name
280 : * @param aCharsetSource- where the charset info came from
281 : * @return
282 : */
283 : nsresult SetDocumentCharset(const nsACString& aCharset, PRInt32 aSource);
284 :
285 : void BindSubstring(nsScannerSubstring& aSubstring, const nsScannerIterator& aStart, const nsScannerIterator& aEnd);
286 : void CurrentPosition(nsScannerIterator& aPosition);
287 : void EndReading(nsScannerIterator& aPosition);
288 : void SetPosition(nsScannerIterator& aPosition,
289 : bool aTruncate = false,
290 : bool aReverse = false);
291 : void ReplaceCharacter(nsScannerIterator& aPosition,
292 : PRUnichar aChar);
293 :
294 : /**
295 : * Internal method used to cause the internal buffer to
296 : * be filled with data.
297 : *
298 : * @update gess4/3/98
299 : */
300 732 : bool IsIncremental(void) {return mIncremental;}
301 3332 : void SetIncremental(bool anIncrValue) {mIncremental=anIncrValue;}
302 :
303 : /**
304 : * Return the position of the first non-whitespace
305 : * character. This is only reliable before consumers start
306 : * reading from this scanner.
307 : */
308 6766 : PRInt32 FirstNonWhitespacePosition()
309 : {
310 6766 : return mFirstNonWhitespacePosition;
311 : }
312 :
313 : /**
314 : * Override replacement character used by nsIUnicodeDecoder.
315 : * Default behavior is that it uses nsIUnicodeDecoder's mapping.
316 : *
317 : * @param aReplacementCharacter the replacement character
318 : * XML (expat) parser uses 0xffff
319 : */
320 : void OverrideReplacementCharacter(PRUnichar aReplacementCharacter);
321 :
322 : protected:
323 :
324 : bool AppendToBuffer(nsScannerString::Buffer *, nsIRequest *aRequest, PRInt32 aErrorPos = -1);
325 4 : bool AppendToBuffer(const nsAString& aStr)
326 : {
327 4 : nsScannerString::Buffer* buf = nsScannerString::AllocBufferFromString(aStr);
328 4 : if (!buf)
329 0 : return false;
330 4 : AppendToBuffer(buf, nsnull);
331 4 : return true;
332 : }
333 :
334 : nsScannerString* mSlidingBuffer;
335 : nsScannerIterator mCurrentPosition; // The position we will next read from in the scanner buffer
336 : nsScannerIterator mMarkPosition; // The position last marked (we may rewind to here)
337 : nsScannerIterator mEndPosition; // The current end of the scanner buffer
338 : nsScannerIterator mFirstInvalidPosition; // The position of the first invalid character that was detected
339 : nsString mFilename;
340 : PRUint32 mCountRemaining; // The number of bytes still to be read
341 : // from the scanner buffer
342 : bool mIncremental;
343 : bool mHasInvalidCharacter;
344 : PRUnichar mReplacementCharacter;
345 : PRInt32 mFirstNonWhitespacePosition;
346 : PRInt32 mCharsetSource;
347 : nsCString mCharset;
348 : nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder;
349 :
350 : private:
351 : nsScanner &operator =(const nsScanner &); // Not implemented.
352 : };
353 :
354 : #endif
355 :
356 :
|