1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is mozilla.org code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 1998
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : *
24 : * Alternatively, the contents of this file may be used under the terms of
25 : * either of the GNU General Public License Version 2 or later (the "GPL"),
26 : * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 : * in which case the provisions of the GPL or the LGPL are applicable instead
28 : * of those above. If you wish to allow use of your version of this file only
29 : * under the terms of either the GPL or the LGPL, and not to allow others to
30 : * use your version of this file under the terms of the MPL, indicate your
31 : * decision by deleting the provisions above and replace them with the notice
32 : * and other provisions required by the GPL or the LGPL. If you do not delete
33 : * the provisions above, a recipient may use your version of this file under
34 : * the terms of any one of the MPL, the GPL or the LGPL.
35 : *
36 : * ***** END LICENSE BLOCK ***** */
37 :
38 : /**
39 : * MODULE NOTES:
40 : * @update gess 4/1/98
41 : *
42 : * This file contains the declarations for all the HTML specific token types that
43 : * our DTD's understand. In fact, the same set of token types are used for XML.
44 : * Currently we have tokens for text, comments, start and end tags, entities,
45 : * attributes, style, script and skipped content. Whitespace and newlines also
46 : * have their own token types, but don't count on them to stay forever.
47 : *
48 : * If you're looking for the html tags, they're in a file called nsHTMLTags.h/cpp.
49 : *
50 : * Most of the token types have a similar API. They have methods to get the type
51 : * of token (GetTokenType); those that represent HTML tags also have a method to
52 : * get the tag type (GetTypeID). In addition, most have a method (Consume) that
53 : * lets the token help in the parsing process. We've also thrown in a few
54 : * standard debugging methods.
55 : */
56 :
57 : #ifndef HTMLTOKENS_H
58 : #define HTMLTOKENS_H
59 :
60 : #include "nsToken.h"
61 : #include "nsHTMLTags.h"
62 : #include "nsString.h"
63 : #include "nsScannerString.h"
64 :
65 : class nsScanner;
66 :
67 : /*******************************************************************
68 : * This enum defines the set of token types that we currently support.
69 : *******************************************************************/
70 :
71 : enum eHTMLTokenTypes {
72 : eToken_unknown=0, // the only enumerator with value 0
73 : eToken_start=1, eToken_end, eToken_comment, eToken_entity, // explicit 1; each later enumerator is the previous one + 1
74 : eToken_whitespace, eToken_newline, eToken_text, eToken_attribute,
75 : eToken_instruction, eToken_cdatasection, eToken_doctypeDecl, eToken_markupDecl,
76 : eToken_last //sentinel: make sure this stays the last token (add new token types above it)
77 : };
78 :
79 : nsresult ConsumeQuotedString(PRUnichar aChar,nsString& aString,nsScanner& aScanner); // free helper, defined outside this header; presumably reads a quoted run from aScanner into aString -- TODO confirm
80 : nsresult ConsumeAttributeText(PRUnichar aChar,nsString& aString,nsScanner& aScanner); // same parameter list as ConsumeQuotedString; presumably the unquoted counterpart -- TODO confirm
81 : const PRUnichar* GetTagName(PRInt32 aTag); // takes the tag as a plain PRInt32; ownership/lifetime of the returned buffer is not visible here -- verify before freeing or caching
82 : //PRInt32 FindEntityIndex(nsString& aString,PRInt32 aCount=-1);
83 :
84 :
85 :
86 : /**
87 : * This declares the basic token type used in the HTML DTD's; the token classes declared below all derive from it.
88 : * @update gess 3/25/98
89 : */
90 : class CHTMLToken : public CToken {
91 : public:
92 : virtual ~CHTMLToken();
93 : CHTMLToken(eHTMLTags aTag); // no default argument and not marked explicit
94 :
95 0 : virtual eContainerInfo GetContainerInfo(void) const {return eFormUnknown;} // base default: always reports eFormUnknown
96 0 : virtual void SetContainerInfo(eContainerInfo aInfo) { } // base default: no-op, aInfo is discarded (CStartToken overrides both)
97 :
98 : protected:
99 : };
100 :
101 : /**
102 : * This declares start tokens, which always take the form <xxxx>.
103 : * This class also knows how to consume related attributes.
104 : * It additionally stores an "empty" flag and the container (well-formedness) info of the tag.
105 : * @update gess 3/25/98
106 : */
107 850 : class CStartToken: public CHTMLToken {
108 850 : CTOKEN_IMPL_SIZEOF // macro defined outside this header (presumably nsToken.h -- confirm)
109 :
110 : public:
111 : CStartToken(eHTMLTags aTag=eHTMLTag_unknown); // the default argument makes this the default constructor as well
112 : CStartToken(const nsAString& aString);
113 : CStartToken(const nsAString& aName,eHTMLTags aTag);
114 :
115 : virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
116 : virtual PRInt32 GetTypeID(void);
117 : virtual PRInt32 GetTokenType(void);
118 :
119 : virtual bool IsEmpty(void);
120 : virtual void SetEmpty(bool aValue);
121 :
122 : virtual const nsSubstring& GetStringValue();
123 : virtual void GetSource(nsString& anOutputString);
124 : virtual void AppendSourceTo(nsAString& anOutputString);
125 :
126 : // the following info is used to set well-formedness state on start tags...
127 0 : virtual eContainerInfo GetContainerInfo(void) const {return mContainerInfo;}
128 1214 : virtual void SetContainerInfo(eContainerInfo aContainerInfo) {
129 1214 : if (eFormUnknown==mContainerInfo) { // only a still-unknown value is replaced; once it holds anything else, later calls are ignored
130 719 : mContainerInfo=aContainerInfo;
131 : }
132 1214 : }
133 0 : virtual bool IsWellFormed(void) const {
134 0 : return eWellFormed == mContainerInfo; // true only for exactly eWellFormed
135 : }
136 :
137 : nsString mTextValue; // public: still under the public: label above (CEndToken declares its mTextValue protected)
138 : protected:
139 : eContainerInfo mContainerInfo;
140 : bool mEmpty;
141 : #ifdef DEBUG
142 : bool mAttributed; // exists only when DEBUG is defined, so the member list differs between debug and non-debug builds
143 : #endif
144 : };
145 :
146 :
147 : /**
148 : * This declares end tokens, which always take the
149 : * form </xxxx>. This class also knows how to consume
150 : * related attributes.
151 : * NOTE(review): the attribute sentence matches the CStartToken comment word for word -- confirm it applies to end tags.
152 : * @update gess 3/25/98
153 : */
154 344 : class CEndToken: public CHTMLToken {
155 344 : CTOKEN_IMPL_SIZEOF // macro defined outside this header (presumably nsToken.h -- confirm)
156 :
157 : public:
158 : CEndToken(eHTMLTags aTag); // unlike CStartToken, no default argument, so there is no default constructor
159 : CEndToken(const nsAString& aString);
160 : CEndToken(const nsAString& aName,eHTMLTags aTag);
161 : virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
162 : virtual PRInt32 GetTypeID(void);
163 : virtual PRInt32 GetTokenType(void);
164 :
165 : virtual const nsSubstring& GetStringValue();
166 : virtual void GetSource(nsString& anOutputString);
167 : virtual void AppendSourceTo(nsAString& anOutputString);
168 :
169 : protected:
170 : nsString mTextValue; // owned string; presumably what GetStringValue() returns -- confirm in the implementation
171 : };
172 :
173 :
174 : /**
175 : * This declares comment tokens. Comments are not usually
176 : * thought of as tokens, but we treat them that way
177 : * here so that the parser can have a consistent view
178 : * of all tokens.
179 : *
180 : * @update gess 3/25/98
181 : */
182 25 : class CCommentToken: public CHTMLToken {
183 25 : CTOKEN_IMPL_SIZEOF // macro defined outside this header (presumably nsToken.h -- confirm)
184 :
185 : public:
186 : CCommentToken();
187 : CCommentToken(const nsAString& aString);
188 : virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
189 : virtual PRInt32 GetTokenType(void);
190 : virtual const nsSubstring& GetStringValue(void);
191 : virtual void AppendSourceTo(nsAString& anOutputString);
192 :
193 : nsresult ConsumeStrictComment(nsScanner& aScanner); // non-virtual; presumably chosen by Consume() according to aMode -- TODO confirm
194 : nsresult ConsumeQuirksComment(nsScanner& aScanner); // non-virtual quirks-mode counterpart -- TODO confirm how it is selected
195 :
196 : protected:
197 : nsScannerSubstring mComment; // does not include MDO & MDC (the SGML markup-declaration open/close delimiters)
198 : nsScannerSubstring mCommentDecl; // includes MDO & MDC
199 : };
200 :
201 :
202 : /**
203 : * This class declares entity tokens, which always take
204 : * the form &xxxx;. This class also offers a few utility
205 : * methods that allow you to easily reduce entities.
206 : *
207 : * @update gess 3/25/98
208 : */
209 0 : class CEntityToken : public CHTMLToken {
210 0 : CTOKEN_IMPL_SIZEOF // macro defined outside this header (presumably nsToken.h -- confirm)
211 :
212 : public:
213 : CEntityToken();
214 : CEntityToken(const nsAString& aString);
215 : virtual PRInt32 GetTokenType(void);
216 : PRInt32 TranslateToUnicodeStr(nsString& aString); // non-virtual member overload; meaning of the PRInt32 result is not visible here -- check the implementation
217 : virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
218 : static nsresult ConsumeEntity(PRUnichar aChar, nsString& aString, // static: callable without a token instance
219 : nsScanner& aScanner);
220 : static PRInt32 TranslateToUnicodeStr(PRInt32 aValue,nsString& aString); // static overload taking a numeric aValue; presumably a character code -- TODO confirm
221 :
222 : virtual const nsSubstring& GetStringValue(void);
223 : virtual void GetSource(nsString& anOutputString);
224 : virtual void AppendSourceTo(nsAString& anOutputString);
225 :
226 : protected:
227 : nsString mTextValue; // owned string; whether it holds the '&' and ';' delimiters is not visible here -- verify
228 : };
229 :
230 :
231 : /**
232 : * Whitespace tokens are used where whitespace can be
233 : * detected as distinct from text. This allows us to
234 : * easily skip leading/trailing whitespace when desired.
235 : *
236 : * @update gess 3/25/98
237 : */
238 281 : class CWhitespaceToken: public CHTMLToken {
239 281 : CTOKEN_IMPL_SIZEOF // macro defined outside this header (presumably nsToken.h -- confirm)
240 :
241 : public:
242 : CWhitespaceToken();
243 : CWhitespaceToken(const nsAString& aString);
244 : virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
245 : virtual PRInt32 GetTokenType(void);
246 : virtual const nsSubstring& GetStringValue(void);
247 :
248 : protected:
249 : nsScannerSharedSubstring mTextValue; // scanner-backed substring type (from nsScannerString.h), not an owned nsString
250 : };
251 :
252 : /**
253 : * Text tokens contain the normalized form of html text.
254 : * These tokens are guaranteed not to contain entities,
255 : * start or end tags, or newlines.
256 : *
257 : * @update gess 3/25/98
258 : */
259 306 : class CTextToken: public CHTMLToken {
260 306 : CTOKEN_IMPL_SIZEOF // macro defined outside this header (presumably nsToken.h -- confirm)
261 :
262 : public:
263 : CTextToken();
264 : CTextToken(const nsAString& aString);
265 : virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
266 : virtual PRInt32 GetTokenType(void);
267 : virtual PRInt32 GetTextLength(void);
268 : virtual void CopyTo(nsAString& aStr);
269 : virtual const nsSubstring& GetStringValue(void);
270 : virtual void Bind(nsScanner* aScanner, nsScannerIterator& aStart, // overload 1: takes a scanner pointer plus a start/end iterator pair
271 : nsScannerIterator& aEnd);
272 : virtual void Bind(const nsAString& aStr); // overload 2: takes a plain string
273 :
274 : nsresult ConsumeCharacterData(bool aIgnoreComments, // non-virtual
275 : nsScanner& aScanner,
276 : const nsAString& aEndTagName,
277 : PRInt32 aFlag,
278 : bool& aFlushTokens); // non-const reference: the callee can write a result back to the caller
279 :
280 : nsresult ConsumeParsedCharacterData(bool aDiscardFirstNewline, // non-virtual
281 : bool aConservativeConsume,
282 : nsScanner& aScanner,
283 : const nsAString& aEndTagName,
284 : PRInt32 aFlag,
285 : bool& aFound); // non-const reference; presumably reports whether the end tag was found -- TODO confirm
286 :
287 : protected:
288 : nsScannerSubstring mTextValue; // scanner substring type rather than an owned nsString
289 : };
290 :
291 :
292 : /**
293 : * CDATASection tokens contain raw unescaped text content delimited by
294 : * a ![CDATA[ and ]].
295 : * XXX Not really a HTML construct - maybe we need a separation
296 : *
297 : * @update vidur 11/12/98
298 : */
299 0 : class CCDATASectionToken : public CHTMLToken {
300 0 : CTOKEN_IMPL_SIZEOF // macro defined outside this header (presumably nsToken.h -- confirm)
301 :
302 : public:
303 : CCDATASectionToken(eHTMLTags aTag = eHTMLTag_unknown); // the default argument makes this the default constructor as well
304 : CCDATASectionToken(const nsAString& aString);
305 : virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
306 : virtual PRInt32 GetTokenType(void);
307 : virtual const nsSubstring& GetStringValue(void);
308 :
309 : protected:
310 : nsString mTextValue; // owned string; presumably the section content returned by GetStringValue() -- confirm
311 : };
312 :
313 :
314 : /**
315 : * Declaration tokens contain raw unescaped text content (not really, but
316 : * right now we use this only for view source).
317 : * XXX Not really a HTML construct - maybe we need a separation
318 : *
319 : */
320 0 : class CMarkupDeclToken : public CHTMLToken {
321 0 : CTOKEN_IMPL_SIZEOF // macro defined outside this header (presumably nsToken.h -- confirm)
322 :
323 : public:
324 : CMarkupDeclToken();
325 : CMarkupDeclToken(const nsAString& aString);
326 : virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
327 : virtual PRInt32 GetTokenType(void);
328 : virtual const nsSubstring& GetStringValue(void);
329 :
330 : protected:
331 : nsScannerSubstring mTextValue; // scanner substring type rather than an owned nsString
332 : };
333 :
334 :
335 : /**
336 : * Attribute tokens are used to contain attribute key/value
337 : * pairs wherever they may occur. Typically, they should
338 : * occur only in start tokens. However, we may expand that
339 : * ability when XML tokens become commonplace.
340 : *
341 : * @update gess 3/25/98
342 : */
343 : class CAttributeToken: public CHTMLToken {
344 732 : CTOKEN_IMPL_SIZEOF // macro defined outside this header (presumably nsToken.h -- confirm)
345 :
346 : public:
347 : CAttributeToken();
348 : CAttributeToken(const nsAString& aString);
349 : CAttributeToken(const nsAString& aKey, const nsAString& aString);
350 732 : ~CAttributeToken() {} // empty body; virtual even without the keyword because ~CHTMLToken() is virtual
351 : virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
352 : virtual PRInt32 GetTokenType(void);
353 1799 : const nsSubstring& GetKey(void) { return mTextKey.AsString(); } // inline, non-virtual, non-const accessor for the key
354 : virtual void SetKey(const nsAString& aKey);
355 : virtual void BindKey(nsScanner* aScanner, nsScannerIterator& aStart,
356 : nsScannerIterator& aEnd);
357 621 : const nsSubstring& GetValue(void) {return mTextValue.str();} // inline, non-virtual, non-const accessor for the value
358 : virtual const nsSubstring& GetStringValue(void);
359 : virtual void GetSource(nsString& anOutputString);
360 : virtual void AppendSourceTo(nsAString& anOutputString);
361 :
362 : bool mHasEqualWithoutValue; // public flag; going by the name, set for input like key= with no value -- TODO confirm who writes it
363 : protected:
364 : nsScannerSharedSubstring mTextValue; // attribute value, read through GetValue()
365 : nsScannerSubstring mTextKey; // attribute key, read through GetKey()
366 : };
367 :
368 :
369 : /**
370 : * Newline tokens contain, you guessed it, newlines.
371 : * They consume newline (CR/LF) either alone or in pairs.
372 : *
373 : * @update gess 3/25/98
374 : */
375 530 : class CNewlineToken: public CHTMLToken {
376 530 : CTOKEN_IMPL_SIZEOF // macro defined outside this header (presumably nsToken.h -- confirm)
377 :
378 : public:
379 : CNewlineToken(); // the only constructor: there is no string-taking overload
380 : virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
381 : virtual PRInt32 GetTokenType(void);
382 : virtual const nsSubstring& GetStringValue(void); // the class declares no data member, so the returned string must live outside the token -- see AllocNewline/FreeNewline
383 :
384 : static void AllocNewline(); // static; presumably creates the shared newline string used by GetStringValue() -- TODO confirm
385 : static void FreeNewline(); // static; presumably releases what AllocNewline() created -- TODO confirm call order requirements
386 : };
387 :
388 :
389 : /**
390 : * Instruction tokens keep their text in an owned nsString (mTextValue).
391 : * Judging by the class name and eToken_instruction these are presumably
392 : * processing instructions -- confirm against the tokenizer.
393 : *
394 : * @update gess 3/25/98
395 : */
396 0 : class CInstructionToken: public CHTMLToken {
397 0 : CTOKEN_IMPL_SIZEOF // macro defined outside this header (presumably nsToken.h -- confirm)
398 :
399 : public:
400 : CInstructionToken();
401 : CInstructionToken(const nsAString& aString);
402 : virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
403 : virtual PRInt32 GetTokenType(void);
404 : virtual const nsSubstring& GetStringValue(void);
405 :
406 : protected:
407 : nsString mTextValue; // owned string; presumably what GetStringValue() returns -- confirm in the implementation
408 : };
409 :
410 :
411 : /**
412 : * This token is generated by the HTML and Expat tokenizers
413 : * when they see the doctype declaration ("<!DOCTYPE ... >")
414 : *
415 : */
416 :
417 25 : class CDoctypeDeclToken: public CHTMLToken {
418 25 : CTOKEN_IMPL_SIZEOF // macro defined outside this header (presumably nsToken.h -- confirm)
419 :
420 : public:
421 : CDoctypeDeclToken(eHTMLTags aTag=eHTMLTag_unknown); // the default argument makes this the default constructor as well
422 : CDoctypeDeclToken(const nsAString& aString,eHTMLTags aTag=eHTMLTag_unknown);
423 : virtual nsresult Consume(PRUnichar aChar,nsScanner& aScanner,PRInt32 aMode);
424 : virtual PRInt32 GetTokenType(void);
425 : virtual const nsSubstring& GetStringValue(void);
426 : virtual void SetStringValue(const nsAString& aStr); // the only token class in this header that declares a string setter
427 :
428 : protected:
429 : nsString mTextValue; // owned string; presumably read by GetStringValue() and written by SetStringValue() -- confirm
430 : };
431 :
432 : #endif
|