1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* vim: set ts=2 sw=2 et tw=78: */
3 : /* ***** BEGIN LICENSE BLOCK *****
4 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 : *
6 : * The contents of this file are subject to the Mozilla Public License Version
7 : * 1.1 (the "License"); you may not use this file except in compliance with
8 : * the License. You may obtain a copy of the License at
9 : * http://www.mozilla.org/MPL/
10 : *
11 : * Software distributed under the License is distributed on an "AS IS" basis,
12 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 : * for the specific language governing rights and limitations under the
14 : * License.
15 : *
16 : * The Original Code is mozilla.org code.
17 : *
18 : * The Initial Developer of the Original Code is
19 : * Netscape Communications Corporation.
20 : * Portions created by the Initial Developer are Copyright (C) 1998
21 : * the Initial Developer. All Rights Reserved.
22 : *
23 : * Contributor(s):
24 : * Blake Kaplan <mrbkap@gmail.com>
25 : *
26 : * Alternatively, the contents of this file may be used under the terms of
27 : * either of the GNU General Public License Version 2 or later (the "GPL"),
28 : * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 : * in which case the provisions of the GPL or the LGPL are applicable instead
30 : * of those above. If you wish to allow use of your version of this file only
31 : * under the terms of either the GPL or the LGPL, and not to allow others to
32 : * use your version of this file under the terms of the MPL, indicate your
33 : * decision by deleting the provisions above and replace them with the notice
34 : * and other provisions required by the GPL or the LGPL. If you do not delete
35 : * the provisions above, a recipient may use your version of this file under
36 : * the terms of any one of the MPL, the GPL or the LGPL.
37 : *
38 : * ***** END LICENSE BLOCK ***** */
39 :
40 : #include <ctype.h>
41 : #include <time.h>
42 : #include <stdio.h>
43 : #include "nsScanner.h"
44 : #include "nsToken.h"
45 : #include "nsHTMLTokens.h"
46 : #include "prtypes.h"
47 : #include "nsDebug.h"
48 : #include "nsHTMLTags.h"
49 : #include "nsHTMLEntities.h"
50 : #include "nsCRT.h"
51 : #include "nsReadableUtils.h"
52 : #include "nsUnicharUtils.h"
53 : #include "nsScanner.h"
54 : #include "nsParserConstants.h"
55 :
56 : static const PRUnichar sUserdefined[] = {'u', 's', 'e', 'r', 'd', 'e', 'f',
57 : 'i', 'n', 'e', 'd', 0};
58 :
59 : static const PRUnichar kAttributeTerminalChars[] = {
60 : PRUnichar('&'), PRUnichar('\t'), PRUnichar('\n'),
61 : PRUnichar('\r'), PRUnichar(' '), PRUnichar('>'),
62 : PRUnichar(0)
63 : };
64 :
65 : static void AppendNCR(nsSubstring& aString, PRInt32 aNCRValue);
66 : /**
67 : * Consumes an entity from aScanner and expands it into aString.
68 : *
69 : * @param aString The target string to append the entity to.
70 : * @param aScanner Controller of underlying input source
71 : * @param aIECompatible Controls whether we respect entities with values >
72 : * 255 and no terminating semicolon.
73 : * @param aFlag If NS_IPARSER_FLAG_VIEW_SOURCE do not reduce entities...
74 : * @return error result
75 : */
76 : static nsresult
77 11 : ConsumeEntity(nsScannerSharedSubstring& aString,
78 : nsScanner& aScanner,
79 : bool aIECompatible,
80 : PRInt32 aFlag)
81 : {
82 11 : nsresult result = NS_OK;
83 :
84 : PRUnichar ch;
85 11 : result = aScanner.Peek(ch, 1);
86 :
87 11 : if (NS_SUCCEEDED(result)) {
88 11 : PRUnichar amp = 0;
89 11 : PRInt32 theNCRValue = 0;
90 22 : nsAutoString entity;
91 :
92 11 : if (nsCRT::IsAsciiAlpha(ch) && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
93 11 : result = CEntityToken::ConsumeEntity(ch, entity, aScanner);
94 11 : if (NS_SUCCEEDED(result)) {
95 11 : theNCRValue = nsHTMLEntities::EntityToUnicode(entity);
96 11 : PRUnichar theTermChar = entity.Last();
97 : // If an entity value is greater than 255 then:
98 : // Nav 4.x does not treat it as an entity,
99 : // IE treats it as an entity if terminated with a semicolon.
100 : // Resembling IE!!
101 :
102 11 : nsSubstring &writable = aString.writable();
103 11 : if (theNCRValue < 0 ||
104 : (aIECompatible && theNCRValue > 255 && theTermChar != ';')) {
105 : // Looks like we're not dealing with an entity
106 0 : writable.Append(kAmpersand);
107 0 : writable.Append(entity);
108 : } else {
109 : // A valid entity so reduce it.
110 11 : writable.Append(PRUnichar(theNCRValue));
111 : }
112 : }
113 0 : } else if (ch == kHashsign && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
114 0 : result = CEntityToken::ConsumeEntity(ch, entity, aScanner);
115 0 : if (NS_SUCCEEDED(result)) {
116 0 : nsSubstring &writable = aString.writable();
117 0 : if (result == NS_HTMLTOKENS_NOT_AN_ENTITY) {
118 : // Looked like an entity but it's not
119 0 : aScanner.GetChar(amp);
120 0 : writable.Append(amp);
121 0 : result = NS_OK;
122 : } else {
123 : PRInt32 err;
124 0 : theNCRValue = entity.ToInteger(&err, kAutoDetect);
125 0 : AppendNCR(writable, theNCRValue);
126 : }
127 0 : }
128 : } else {
129 : // What we thought as entity is not really an entity...
130 0 : aScanner.GetChar(amp);
131 0 : aString.writable().Append(amp);
132 : }
133 : }
134 :
135 11 : return result;
136 : }
137 :
138 : /*
139 : * This general purpose method is used when you want to
140 : * consume attributed text value.
141 : * Note: It also reduces entities.
142 : *
143 : * @param aNewlineCount -- the newline count to increment when hitting newlines
144 : * @param aScanner -- controller of underlying input source
145 : * @param aTerminalChars -- characters that stop consuming attribute.
146 : * @param aAllowNewlines -- whether to allow newlines in the value.
147 : * XXX it would be nice to roll this info into
148 : * aTerminalChars somehow....
149 : * @param aIECompatEntities IE treats entities with values > 255 as
150 : * entities only if they're terminated with a
151 : * semicolon. This is true to follow that behavior
152 : * and false to treat all values as entities.
153 : * @param aFlag - contains information such as |dtd mode|view mode|doctype|etc...
154 : * @return error result
155 : */
156 : static nsresult
157 757 : ConsumeUntil(nsScannerSharedSubstring& aString,
158 : PRInt32& aNewlineCount,
159 : nsScanner& aScanner,
160 : const nsReadEndCondition& aEndCondition,
161 : bool aAllowNewlines,
162 : bool aIECompatEntities,
163 : PRInt32 aFlag)
164 : {
165 757 : nsresult result = NS_OK;
166 757 : bool done = false;
167 :
168 1536 : do {
169 768 : result = aScanner.ReadUntil(aString, aEndCondition, false);
170 768 : if (NS_SUCCEEDED(result)) {
171 : PRUnichar ch;
172 768 : aScanner.Peek(ch);
173 768 : if (ch == kAmpersand) {
174 11 : result = ConsumeEntity(aString, aScanner, aIECompatEntities, aFlag);
175 757 : } else if (ch == kCR && aAllowNewlines) {
176 0 : aScanner.GetChar(ch);
177 0 : result = aScanner.Peek(ch);
178 0 : if (NS_SUCCEEDED(result)) {
179 0 : nsSubstring &writable = aString.writable();
180 0 : if (ch == kNewLine) {
181 0 : writable.AppendLiteral("\r\n");
182 0 : aScanner.GetChar(ch);
183 : } else {
184 0 : writable.Append(PRUnichar('\r'));
185 : }
186 0 : ++aNewlineCount;
187 0 : }
188 757 : } else if (ch == kNewLine && aAllowNewlines) {
189 0 : aScanner.GetChar(ch);
190 0 : aString.writable().Append(PRUnichar('\n'));
191 0 : ++aNewlineCount;
192 : } else {
193 757 : done = true;
194 : }
195 : }
196 1536 : } while (NS_SUCCEEDED(result) && !done);
197 :
198 757 : return result;
199 : }
200 :
201 : /**************************************************************
202 : And now for the token classes...
203 : **************************************************************/
204 :
205 : /**
206 : * Constructor from tag id
207 : */
208 3093 : CHTMLToken::CHTMLToken(eHTMLTags aTag)
209 3093 : : CToken(aTag)
210 : {
211 3093 : }
212 :
213 :
214 3093 : CHTMLToken::~CHTMLToken()
215 : {
216 3093 : }
217 :
218 : /*
219 : * Constructor from tag id
220 : */
221 797 : CStartToken::CStartToken(eHTMLTags aTag)
222 797 : : CHTMLToken(aTag)
223 : {
224 797 : mEmpty = false;
225 797 : mContainerInfo = eFormUnknown;
226 : #ifdef DEBUG
227 797 : mAttributed = false;
228 : #endif
229 797 : }
230 :
231 0 : CStartToken::CStartToken(const nsAString& aName)
232 0 : : CHTMLToken(eHTMLTag_unknown)
233 : {
234 0 : mEmpty = false;
235 0 : mContainerInfo = eFormUnknown;
236 0 : mTextValue.Assign(aName);
237 : #ifdef DEBUG
238 0 : mAttributed = false;
239 : #endif
240 0 : }
241 :
242 53 : CStartToken::CStartToken(const nsAString& aName, eHTMLTags aTag)
243 53 : : CHTMLToken(aTag)
244 : {
245 53 : mEmpty = false;
246 53 : mContainerInfo = eFormUnknown;
247 53 : mTextValue.Assign(aName);
248 : #ifdef DEBUG
249 53 : mAttributed = false;
250 : #endif
251 53 : }
252 :
253 : /*
254 : * This method returns the typeid (the tag type) for this token.
255 : */
256 : PRInt32
257 10605 : CStartToken::GetTypeID()
258 : {
259 10605 : if (eHTMLTag_unknown == mTypeID) {
260 0 : mTypeID = nsHTMLTags::LookupTag(mTextValue);
261 : }
262 10605 : return mTypeID;
263 : }
264 :
265 : PRInt32
266 2519 : CStartToken::GetTokenType()
267 : {
268 2519 : return eToken_start;
269 : }
270 :
271 : void
272 0 : CStartToken::SetEmpty(bool aValue)
273 : {
274 0 : mEmpty = aValue;
275 0 : }
276 :
277 : bool
278 0 : CStartToken::IsEmpty()
279 : {
280 0 : return mEmpty;
281 : }
282 :
283 : /*
284 : * Consume the identifier portion of the start tag
285 : */
286 : nsresult
287 794 : CStartToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
288 : {
289 : // If you're here, we've already Consumed the < char, and are
290 : // ready to Consume the rest of the open tag identifier.
291 : // Stop consuming as soon as you see a space or a '>'.
292 : // NOTE: We don't Consume the tag attributes here, nor do we eat the ">"
293 :
294 794 : nsresult result = NS_OK;
295 1588 : nsScannerSharedSubstring tagIdent;
296 :
297 794 : if (aFlag & NS_IPARSER_FLAG_HTML) {
298 794 : result = aScanner.ReadTagIdentifier(tagIdent);
299 794 : mTypeID = (PRInt32)nsHTMLTags::LookupTag(tagIdent.str());
300 : // Save the original tag string if this is user-defined or if we
301 : // are viewing source
302 794 : if (eHTMLTag_userdefined == mTypeID ||
303 : (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
304 0 : mTextValue = tagIdent.str();
305 : }
306 : } else {
307 0 : result = aScanner.ReadTagIdentifier(tagIdent);
308 0 : mTextValue = tagIdent.str();
309 0 : mTypeID = nsHTMLTags::LookupTag(mTextValue);
310 : }
311 :
312 794 : if (NS_SUCCEEDED(result) && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
313 794 : result = aScanner.SkipWhitespace(mNewlineCount);
314 : }
315 :
316 794 : if (kEOF == result && !aScanner.IsIncremental()) {
317 : // Take what we can get.
318 0 : result = NS_OK;
319 : }
320 :
321 794 : return result;
322 : }
323 :
324 : const nsSubstring&
325 0 : CStartToken::GetStringValue()
326 : {
327 0 : if (eHTMLTag_unknown < mTypeID && mTypeID < eHTMLTag_text) {
328 0 : if (!mTextValue.Length()) {
329 0 : mTextValue.Assign(nsHTMLTags::GetStringValue((nsHTMLTag) mTypeID));
330 : }
331 : }
332 0 : return mTextValue;
333 : }
334 :
335 : void
336 0 : CStartToken::GetSource(nsString& anOutputString)
337 : {
338 0 : anOutputString.Truncate();
339 0 : AppendSourceTo(anOutputString);
340 0 : }
341 :
342 : void
343 0 : CStartToken::AppendSourceTo(nsAString& anOutputString)
344 : {
345 0 : anOutputString.Append(PRUnichar('<'));
346 : /*
347 : * Watch out for Bug 15204
348 : */
349 0 : if (!mTextValue.IsEmpty()) {
350 0 : anOutputString.Append(mTextValue);
351 : } else {
352 0 : anOutputString.Append(GetTagName(mTypeID));
353 : }
354 :
355 0 : anOutputString.Append(PRUnichar('>'));
356 0 : }
357 :
358 344 : CEndToken::CEndToken(eHTMLTags aTag)
359 344 : : CHTMLToken(aTag)
360 : {
361 344 : }
362 :
363 0 : CEndToken::CEndToken(const nsAString& aName)
364 0 : : CHTMLToken(eHTMLTag_unknown)
365 : {
366 0 : mTextValue.Assign(aName);
367 0 : }
368 :
369 0 : CEndToken::CEndToken(const nsAString& aName, eHTMLTags aTag)
370 0 : : CHTMLToken(aTag)
371 : {
372 0 : mTextValue.Assign(aName);
373 0 : }
374 :
375 : nsresult
376 344 : CEndToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
377 : {
378 344 : nsresult result = NS_OK;
379 688 : nsScannerSharedSubstring tagIdent;
380 :
381 344 : if (aFlag & NS_IPARSER_FLAG_HTML) {
382 344 : result = aScanner.ReadTagIdentifier(tagIdent);
383 :
384 344 : mTypeID = (PRInt32)nsHTMLTags::LookupTag(tagIdent.str());
385 : // Save the original tag string if this is user-defined or if we
386 : // are viewing source
387 344 : if (eHTMLTag_userdefined == mTypeID ||
388 : (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
389 0 : mTextValue = tagIdent.str();
390 : }
391 : } else {
392 0 : result = aScanner.ReadTagIdentifier(tagIdent);
393 0 : mTextValue = tagIdent.str();
394 0 : mTypeID = nsHTMLTags::LookupTag(mTextValue);
395 : }
396 :
397 344 : if (NS_SUCCEEDED(result) && !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
398 344 : result = aScanner.SkipWhitespace(mNewlineCount);
399 : }
400 :
401 344 : if (kEOF == result && !aScanner.IsIncremental()) {
402 : // Take what we can get.
403 0 : result = NS_OK;
404 : }
405 :
406 344 : return result;
407 : }
408 :
409 :
410 : /*
411 : * Asks the token to determine the <i>HTMLTag type</i> of
412 : * the token. This turns around and looks up the tag name
413 : * in the tag dictionary.
414 : */
415 : PRInt32
416 1032 : CEndToken::GetTypeID()
417 : {
418 1032 : if (eHTMLTag_unknown == mTypeID) {
419 0 : mTypeID = nsHTMLTags::LookupTag(mTextValue);
420 0 : switch (mTypeID) {
421 : case eHTMLTag_dir:
422 : case eHTMLTag_menu:
423 0 : mTypeID = eHTMLTag_ul;
424 0 : break;
425 :
426 : default:
427 0 : break;
428 : }
429 : }
430 :
431 1032 : return mTypeID;
432 : }
433 :
434 : PRInt32
435 688 : CEndToken::GetTokenType()
436 : {
437 688 : return eToken_end;
438 : }
439 :
440 : const nsSubstring&
441 0 : CEndToken::GetStringValue()
442 : {
443 0 : if (eHTMLTag_unknown < mTypeID && mTypeID < eHTMLTag_text) {
444 0 : if (!mTextValue.Length()) {
445 0 : mTextValue.Assign(nsHTMLTags::GetStringValue((nsHTMLTag) mTypeID));
446 : }
447 : }
448 0 : return mTextValue;
449 : }
450 :
451 : void
452 0 : CEndToken::GetSource(nsString& anOutputString)
453 : {
454 0 : anOutputString.Truncate();
455 0 : AppendSourceTo(anOutputString);
456 0 : }
457 :
458 : void
459 0 : CEndToken::AppendSourceTo(nsAString& anOutputString)
460 : {
461 0 : anOutputString.AppendLiteral("</");
462 0 : if (!mTextValue.IsEmpty()) {
463 0 : anOutputString.Append(mTextValue);
464 : } else {
465 0 : anOutputString.Append(GetTagName(mTypeID));
466 : }
467 :
468 0 : anOutputString.Append(PRUnichar('>'));
469 0 : }
470 :
471 306 : CTextToken::CTextToken()
472 306 : : CHTMLToken(eHTMLTag_text)
473 : {
474 306 : }
475 :
476 0 : CTextToken::CTextToken(const nsAString& aName)
477 0 : : CHTMLToken(eHTMLTag_text)
478 : {
479 0 : mTextValue.Rebind(aName);
480 0 : }
481 :
482 : PRInt32
483 943 : CTextToken::GetTokenType()
484 : {
485 943 : return eToken_text;
486 : }
487 :
488 : PRInt32
489 0 : CTextToken::GetTextLength()
490 : {
491 0 : return mTextValue.Length();
492 : }
493 :
494 : nsresult
495 281 : CTextToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
496 : {
497 : static const PRUnichar theTerminalsChars[] =
498 : { PRUnichar('\n'), PRUnichar('\r'), PRUnichar('&'), PRUnichar('<'),
499 : PRUnichar(0) };
500 281 : static const nsReadEndCondition theEndCondition(theTerminalsChars);
501 281 : nsresult result = NS_OK;
502 281 : bool done = false;
503 281 : nsScannerIterator origin, start, end;
504 :
505 : // Start scanning after the first character, because we know it to
506 : // be part of this text token (we wouldn't have come here if it weren't)
507 281 : aScanner.CurrentPosition(origin);
508 281 : start = origin;
509 281 : aScanner.EndReading(end);
510 :
511 281 : NS_ASSERTION(start != end, "Calling CTextToken::Consume when already at the "
512 : "end of a document is a bad idea.");
513 :
514 281 : aScanner.SetPosition(++start);
515 :
516 892 : while (NS_OK == result && !done) {
517 330 : result = aScanner.ReadUntil(start, end, theEndCondition, false);
518 330 : if (NS_OK == result) {
519 330 : result = aScanner.Peek(aChar);
520 :
521 330 : if (NS_OK == result && (kCR == aChar || kNewLine == aChar)) {
522 49 : switch (aChar) {
523 : case kCR:
524 : {
525 : // It's a carriage return. See if this is part of a CR-LF pair (in
526 : // which case we need to treat it as one newline). If we're at the
527 : // edge of a packet, then leave the CR on the scanner, since it
528 : // could still be part of a CR-LF pair. Otherwise, it isn't.
529 : PRUnichar theNextChar;
530 0 : result = aScanner.Peek(theNextChar, 1);
531 :
532 0 : if (result == kEOF && aScanner.IsIncremental()) {
533 0 : break;
534 : }
535 :
536 0 : if (NS_SUCCEEDED(result)) {
537 : // Actually get the carriage return.
538 0 : aScanner.GetChar(aChar);
539 : }
540 :
541 0 : if (kLF == theNextChar) {
542 : // If the "\r" is followed by a "\n", don't replace it and let
543 : // it be ignored by the layout system.
544 0 : end.advance(2);
545 0 : aScanner.GetChar(theNextChar);
546 : } else {
547 : // If it is standalone, replace the "\r" with a "\n" so that it
548 : // will be considered by the layout system.
549 0 : aScanner.ReplaceCharacter(end, kLF);
550 0 : ++end;
551 : }
552 0 : ++mNewlineCount;
553 0 : break;
554 : }
555 : case kLF:
556 49 : aScanner.GetChar(aChar);
557 49 : ++end;
558 49 : ++mNewlineCount;
559 49 : break;
560 49 : }
561 : } else {
562 281 : done = true;
563 : }
564 : }
565 : }
566 :
567 : // Note: This function is only called from nsHTMLTokenizer::ConsumeText. If
568 : // we return an error result from the final buffer, then it is responsible
569 : // for turning it into an NS_OK result.
570 281 : aScanner.BindSubstring(mTextValue, origin, end);
571 :
572 281 : return result;
573 : }
574 :
575 : /*
576 : * Consume raw character data up to the terminal string "</" + aEndTagName
577 : * (per the comments below, used for content such as <script> and <style>).
578 : * When the terminal string is found the scanner is left on its '<'.
579 : *
580 : * @param aIgnoreComments -- when false (and not in strict mode), "<!--" and
581 : * "-->" are taken into account in looking for the end tag.
582 : * @param aScanner -- controller of underlying input source
583 : * @param aEndTagName -- the terminal tag name.
584 : * @param aFlag -- dtd modes and such; NS_IPARSER_FLAG_STRICT_MODE is tested.
585 : * @param aFlushTokens -- set to true if we found the terminal tag; left
586 : * untouched otherwise.
587 : * @return NS_OK if the terminal tag was found, kFakeEndTag if a
588 : * non-incremental scanner ran out of data first, kEOF otherwise.
589 : */
590 : nsresult
591 0 : CTextToken::ConsumeCharacterData(bool aIgnoreComments,
592 : nsScanner& aScanner,
593 : const nsAString& aEndTagName,
594 : PRInt32 aFlag,
595 : bool& aFlushTokens)
596 : {
597 0 : nsresult result = NS_OK;
598 0 : nsScannerIterator theStartOffset, theCurrOffset, theTermStrPos,
599 0 : theStartCommentPos, theAltTermStrPos, endPos;
600 0 : bool done = false;
601 0 : bool theLastIteration = false;
602 :
603 0 : aScanner.CurrentPosition(theStartOffset);
604 0 : theCurrOffset = theStartOffset;
605 0 : aScanner.EndReading(endPos);
606 0 : theTermStrPos = theStartCommentPos = theAltTermStrPos = endPos;
607 :
608 : // ALGORITHM: *** The performance is based on correctness of the document ***
609 : // 1. Look for a '<' character. This could be
610 : // a) Start of a comment (<!--),
611 : // b) Start of the terminal string, or
612 : // c) a start of a tag.
613 : // We are interested in a) and b). c) is ignored because in CDATA we
614 : // don't care for tags.
615 : // NOTE: Technically speaking in CDATA we should ignore the comments too!
616 : // But for compatibility we don't.
617 : // 2. Having the offset, for '<', search for the terminal string from there
618 : // on and record its offset.
619 : // 3. From the same '<' offset also search for start of a comment '<!--'.
620 : // If found search for end comment '-->' between the terminal string and
621 : // '<!--'. If you did not find the end comment, then we have a malformed
622 : // document, i.e., this section has a prematured terminal string Ex.
623 : // <SCRIPT><!-- document.write('</SCRIPT>') //--> </SCRIPT>. But record
624 : // terminal string's offset if this is the first premature terminal
625 : // string, and update the current offset to the terminal string
626 : // (prematured) offset and goto step 1.
627 : // 4. Amen...If you found a terminal string and '-->'. Otherwise goto step 1.
628 : // 5. If the end of the document is reached and if we still don't have the
629 : // condition in step 4. then assume that the prematured terminal string
630 : // is the actual terminal string and goto step 1. This will be our last
631 : // iteration. If there is no premature terminal string, then on a
632 : // non-incremental scanner we consume all the way until the end and
633 : // return kFakeEndTag; on an incremental scanner we return kEOF and
634 : // leave the scanner position where it was.
635 :
636 0 : NS_NAMED_LITERAL_STRING(ltslash, "</");
637 0 : const nsString theTerminalString = ltslash + aEndTagName;
638 :
639 0 : PRUint32 termStrLen = theTerminalString.Length();
640 0 : while (result == NS_OK && !done) {
641 0 : bool found = false;
642 0 : nsScannerIterator gtOffset, ltOffset = theCurrOffset;
643 0 : while (FindCharInReadable(PRUnichar(kLessThan), ltOffset, endPos) &&
644 0 : ((PRUint32)ltOffset.size_forward() >= termStrLen ||
645 0 : Distance(ltOffset, endPos) >= termStrLen)) {
646 : // Make a copy of the (presumed) end tag and
647 : // do a case-insensitive comparison
648 :
649 0 : nsScannerIterator start(ltOffset), end(ltOffset);
650 0 : end.advance(termStrLen);
651 :
652 0 : if (CaseInsensitiveFindInReadable(theTerminalString, start, end) &&
653 0 : (end == endPos || (*end == '>' || *end == ' ' ||
654 0 : *end == '\t' || *end == '\n' ||
655 0 : *end == '\r'))) {
656 0 : gtOffset = end;
657 : // Note that aIgnoreComments is only not set for <script>. We don't
658 : // want to execute scripts that aren't in the form of: <script\s.*>
659 0 : if ((end == endPos && aIgnoreComments) ||
660 0 : FindCharInReadable(PRUnichar(kGreaterThan), gtOffset, endPos)) {
661 0 : found = true;
662 0 : theTermStrPos = start;
663 : }
664 0 : break;
665 : }
666 0 : ltOffset.advance(1);
667 : }
668 :
669 0 : if (found && theTermStrPos != endPos) {
670 0 : if (!(aFlag & NS_IPARSER_FLAG_STRICT_MODE) &&
671 0 : !theLastIteration && !aIgnoreComments) {
672 0 : nsScannerIterator endComment(ltOffset);
673 0 : endComment.advance(5);
674 :
675 0 : if ((theStartCommentPos == endPos) &&
676 0 : FindInReadable(NS_LITERAL_STRING("<!--"), theCurrOffset,
677 0 : endComment)) {
678 0 : theStartCommentPos = theCurrOffset;
679 : }
680 :
681 0 : if (theStartCommentPos != endPos) {
682 : // Search for --> between <!-- and </TERMINALSTRING>.
683 0 : theCurrOffset = theStartCommentPos;
684 0 : nsScannerIterator terminal(theTermStrPos);
685 0 : if (!RFindInReadable(NS_LITERAL_STRING("-->"),
686 0 : theCurrOffset, terminal)) {
687 : // If you're here it means that we have a bogus terminal string.
688 : // Even though it is bogus, the position of the terminal string
689 : // could be helpful in case we hit the rock bottom.
690 0 : if (theAltTermStrPos == endPos) {
691 : // But we only want to remember the first bogus terminal string.
692 0 : theAltTermStrPos = theTermStrPos;
693 : }
694 :
695 : // We did not find '-->' so keep searching for terminal string.
696 0 : theCurrOffset = theTermStrPos;
697 0 : theCurrOffset.advance(termStrLen);
698 0 : continue;
699 : }
700 : }
701 : }
702 :
703 0 : aScanner.BindSubstring(mTextValue, theStartOffset, theTermStrPos);
704 0 : aScanner.SetPosition(ltOffset);
705 :
706 : // We found </SCRIPT> or </STYLE>...permit flushing -> Ref: Bug 22485
707 0 : aFlushTokens = true;
708 0 : done = true;
709 : } else {
710 : // We end up here if:
711 : // a) when the buffer runs out of data.
712 : // b) when the terminal string is not found.
713 0 : if (!aScanner.IsIncremental()) {
714 0 : if (theAltTermStrPos != endPos) {
715 : // If you're here it means that we hit the rock bottom and therefore
716 : // switch to plan B, since we have an alternative terminating string.
717 0 : theCurrOffset = theAltTermStrPos;
718 0 : theLastIteration = true;
719 : } else {
720 : // Oops, We fell all the way down to the end of the document.
721 0 : done = true; // Do this to fix Bug. 35456
722 0 : result = kFakeEndTag;
723 0 : aScanner.BindSubstring(mTextValue, theStartOffset, endPos);
724 0 : aScanner.SetPosition(endPos);
725 : }
726 : } else {
727 0 : result = kEOF;
728 : }
729 : }
730 : }
731 :
732 0 : if (result == NS_OK) {
733 0 : mNewlineCount = mTextValue.CountChar(kNewLine);
734 : }
735 :
736 0 : return result;
737 : }
738 :
739 : /*
740 : * Consume as much clear text from scanner as possible. Reducing entities.
741 : * The scanner is left on the < of the perceived end tag.
742 : *
743 : * @param aDiscardFirstNewline -- drop a leading CR, LF or CRLF (not in view source)
744 : * @param aConservativeConsume -- with no terminating string, back up to the first '<'
745 : * @param aScanner -- controller of underlying input source
746 : * @param aEndTagName -- the terminal tag name.
747 : * @param aFlag -- dtd modes and such.
748 : * @param aFound -- set to true if we found the terminal tag or hit EOF on a
749 : * non-incremental scanner; set to false on other ConsumeUntil failures.
750 : * @return error result; kFakeEndTag when EOF ended a non-incremental scan.
751 : */
752 : nsresult
753 25 : CTextToken::ConsumeParsedCharacterData(bool aDiscardFirstNewline,
754 : bool aConservativeConsume,
755 : nsScanner& aScanner,
756 : const nsAString& aEndTagName,
757 : PRInt32 aFlag,
758 : bool& aFound)
759 : {
760 : // This function is fairly straightforward except if there is no terminating
761 : // string. If there is, we simply loop through all of the entities, reducing
762 : // them as necessary and skipping over non-terminal strings starting with <.
763 : // If there is *no* terminal string, then we examine aConservativeConsume.
764 : // If we want to be conservative, we backtrack to the first place in the
765 : // document that looked like the end of PCDATA (i.e., the first tag). This
766 : // is for compatibility and so we don't regress bug 42945. If we are not
767 : // conservative, then we consume everything, all the way up to the end of
768 : // the document.
769 :
770 : static const PRUnichar terminalChars[] = {
771 : PRUnichar('\r'), PRUnichar('\n'), PRUnichar('&'), PRUnichar('<'),
772 : PRUnichar(0)
773 : };
774 25 : static const nsReadEndCondition theEndCondition(terminalChars);
775 :
776 25 : nsScannerIterator currPos, endPos, altEndPos;
777 25 : PRUint32 truncPos = 0;
778 25 : PRInt32 truncNewlineCount = 0;
779 25 : aScanner.CurrentPosition(currPos);
780 25 : aScanner.EndReading(endPos);
781 :
782 25 : altEndPos = endPos;
783 :
784 50 : nsScannerSharedSubstring theContent;
785 25 : PRUnichar ch = 0;
786 :
787 50 : NS_NAMED_LITERAL_STRING(commentStart, "<!--");
788 50 : NS_NAMED_LITERAL_STRING(ltslash, "</");
789 50 : const nsString theTerminalString = ltslash + aEndTagName;
790 25 : PRUint32 termStrLen = theTerminalString.Length();
791 25 : PRUint32 commentStartLen = commentStart.Length();
792 :
793 25 : nsresult result = NS_OK;
794 :
795 : // Note that if we're already at the end of the document, the ConsumeUntil
796 : // will fail, and we'll do the right thing.
797 0 : do {
798 : result = ConsumeUntil(theContent, mNewlineCount, aScanner,
799 25 : theEndCondition, true, false, aFlag);
800 :
801 25 : if (aDiscardFirstNewline &&
802 0 : (NS_SUCCEEDED(result) || !aScanner.IsIncremental()) &&
803 0 : !(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
804 : // Check if the very first character is a newline, and if so discard it.
805 : // Note that we don't want to discard it in view source!
806 : // Also note that this has to happen here (as opposed to before the
807 : // ConsumeUntil) because we have to expand any entities.
808 : // XXX It would be nice to be able to do this without calling
809 : // writable()!
810 0 : const nsSubstring &firstChunk = theContent.str();
811 0 : if (!firstChunk.IsEmpty()) {
812 0 : PRUint32 where = 0;
813 0 : PRUnichar newline = firstChunk.First();
814 :
815 0 : if (newline == kCR || newline == kNewLine) {
816 0 : ++where;
817 :
818 0 : if (firstChunk.Length() > 1) {
819 0 : if (newline == kCR && firstChunk.CharAt(1) == kNewLine) {
820 : // Handle \r\n = 1 newline.
821 0 : ++where;
822 : }
823 : // Note: \n\r = 2 newlines.
824 : }
825 : }
826 :
827 0 : if (where != 0) {
828 0 : theContent.writable() = Substring(firstChunk, where);
829 : }
830 : }
831 : }
832 25 : aDiscardFirstNewline = false;
833 :
834 25 : if (NS_FAILED(result)) {
835 0 : if (kEOF == result && !aScanner.IsIncremental()) {
836 0 : aFound = true; // this is as good as it gets.
837 0 : result = kFakeEndTag;
838 :
839 0 : if (aConservativeConsume && altEndPos != endPos) {
840 : // We ran out of room looking for a </title>. Go back to the first
841 : // place that looked like a tag and use that as our stopping point.
842 0 : theContent.writable().Truncate(truncPos);
843 0 : mNewlineCount = truncNewlineCount;
844 0 : aScanner.SetPosition(altEndPos, false, true);
845 : }
846 : // else we take everything we consumed.
847 0 : mTextValue.Rebind(theContent.str());
848 : } else {
849 0 : aFound = false;
850 : }
851 :
852 0 : return result;
853 : }
854 :
855 25 : aScanner.CurrentPosition(currPos);
856 25 : aScanner.GetChar(ch); // this character must be '&' or '<'
857 :
858 25 : if (ch == kLessThan && altEndPos == endPos) {
859 : // Keep this position in case we need it for later.
860 25 : altEndPos = currPos;
861 25 : truncPos = theContent.str().Length();
862 25 : truncNewlineCount = mNewlineCount;
863 : }
864 :
865 25 : if (Distance(currPos, endPos) >= termStrLen) {
866 25 : nsScannerIterator start(currPos), end(currPos);
867 25 : end.advance(termStrLen);
868 :
869 25 : if (CaseInsensitiveFindInReadable(theTerminalString, start, end)) {
870 25 : if (end != endPos && (*end == '>' || *end == ' ' ||
871 0 : *end == '\t' || *end == '\n' ||
872 0 : *end == '\r')) {
873 25 : aFound = true;
874 25 : mTextValue.Rebind(theContent.str());
875 :
876 : // Note: This SetPosition() is actually going backwards from the
877 : // scanner's mCurrentPosition (so we pass aReverse == true). This
878 : // is because we call GetChar() above after we get the current
879 : // position.
880 25 : aScanner.SetPosition(currPos, false, true);
881 25 : break;
882 : }
883 : }
884 : }
885 : // IE only consumes <!-- --> as comments in PCDATA.
886 0 : if (Distance(currPos, endPos) >= commentStartLen) {
887 0 : nsScannerIterator start(currPos), end(currPos);
888 0 : end.advance(commentStartLen);
889 :
890 0 : if (CaseInsensitiveFindInReadable(commentStart, start, end)) {
891 0 : CCommentToken consumer; // stack allocated.
892 :
893 : // CCommentToken expects us to be on the '-'
894 0 : aScanner.SetPosition(currPos.advance(2));
895 :
896 : // In quirks mode we consume too many things as comments, so pretend
897 : // that we're not by modifying aFlag.
898 0 : result = consumer.Consume(*currPos, aScanner,
899 : (aFlag & ~NS_IPARSER_FLAG_QUIRKS_MODE) |
900 0 : NS_IPARSER_FLAG_STRICT_MODE);
901 0 : if (kEOF == result) {
902 : // This can only happen if we're really out of space.
903 0 : return kEOF;
904 0 : } else if (kNotAComment == result) {
905 : // Fall through and consume this as text.
906 0 : aScanner.CurrentPosition(currPos);
907 0 : aScanner.SetPosition(currPos.advance(1));
908 : } else {
909 0 : consumer.AppendSourceTo(theContent.writable());
910 0 : mNewlineCount += consumer.GetNewlineCount();
911 0 : continue;
912 : }
913 : }
914 : }
915 :
916 0 : result = kEOF;
917 : // We did not find the terminal string yet so
918 : // include the character that stopped consumption.
919 0 : theContent.writable().Append(ch);
920 : } while (currPos != endPos);
921 :
922 25 : return result;
923 : }
924 :
925 : void
926 0 : CTextToken::CopyTo(nsAString& aStr)
927 : {
928 0 : nsScannerIterator start, end;
929 0 : mTextValue.BeginReading(start);
930 0 : mTextValue.EndReading(end);
931 0 : CopyUnicodeTo(start, end, aStr);
932 0 : }
933 :
934 306 : const nsSubstring& CTextToken::GetStringValue()
935 : {
936 306 : return mTextValue.AsString();
937 : }
938 :
939 : void
940 0 : CTextToken::Bind(nsScanner* aScanner, nsScannerIterator& aStart,
941 : nsScannerIterator& aEnd)
942 : {
943 0 : aScanner->BindSubstring(mTextValue, aStart, aEnd);
944 0 : }
945 :
946 : void
947 0 : CTextToken::Bind(const nsAString& aStr)
948 : {
949 0 : mTextValue.Rebind(aStr);
950 0 : }
951 :
952 0 : CCDATASectionToken::CCDATASectionToken(eHTMLTags aTag)
953 0 : : CHTMLToken(aTag)
954 : {
955 0 : }
956 :
957 0 : CCDATASectionToken::CCDATASectionToken(const nsAString& aName)
958 0 : : CHTMLToken(eHTMLTag_unknown)
959 : {
960 0 : mTextValue.Assign(aName);
961 0 : }
962 :
963 : PRInt32
964 0 : CCDATASectionToken::GetTokenType()
965 : {
966 0 : return eToken_cdatasection;
967 : }
968 :
969 : /*
970 : * Consume as much marked test from scanner as possible.
971 : * Note: This has to handle case: "<![ ! IE 5]>", in addition to "<![..[..]]>"
972 : *
973 : * @param aChar -- last char consumed from stream
974 : * @param aScanner -- controller of underlying input source
975 : * @return error result
976 : */
977 : nsresult
978 0 : CCDATASectionToken::Consume(PRUnichar aChar, nsScanner& aScanner,
979 : PRInt32 aFlag)
980 : {
981 : static const PRUnichar theTerminalsChars[] =
982 : { PRUnichar('\r'), PRUnichar('\n'), PRUnichar(']'), PRUnichar(0) };
983 0 : static const nsReadEndCondition theEndCondition(theTerminalsChars);
984 0 : nsresult result = NS_OK;
985 0 : bool done = false;
986 :
987 0 : while (NS_OK == result && !done) {
988 0 : result = aScanner.ReadUntil(mTextValue, theEndCondition, false);
989 0 : if (NS_OK == result) {
990 0 : result = aScanner.Peek(aChar);
991 0 : if (kCR == aChar && NS_OK == result) {
992 0 : result = aScanner.GetChar(aChar); // Strip off the \r
993 0 : result = aScanner.Peek(aChar); // Then see what's next.
994 0 : if (NS_OK == result) {
995 0 : switch(aChar) {
996 : case kCR:
997 0 : result = aScanner.GetChar(aChar); // Strip off the \r
998 0 : mTextValue.AppendLiteral("\n\n");
999 0 : mNewlineCount += 2;
1000 0 : break;
1001 :
1002 : case kNewLine:
1003 : // Which means we saw \r\n, which becomes \n
1004 0 : result = aScanner.GetChar(aChar); // Strip off the \n
1005 :
1006 : // Fall through...
1007 : default:
1008 0 : mTextValue.AppendLiteral("\n");
1009 0 : mNewlineCount++;
1010 0 : break;
1011 : }
1012 : }
1013 0 : } else if (kNewLine == aChar) {
1014 0 : result = aScanner.GetChar(aChar);
1015 0 : mTextValue.Append(aChar);
1016 0 : ++mNewlineCount;
1017 0 : } else if (kRightSquareBracket == aChar) {
1018 0 : bool canClose = false;
1019 0 : result = aScanner.GetChar(aChar); // Strip off the ]
1020 0 : mTextValue.Append(aChar);
1021 0 : result = aScanner.Peek(aChar); // Then see what's next.
1022 0 : if (NS_OK == result && kRightSquareBracket == aChar) {
1023 0 : result = aScanner.GetChar(aChar); // Strip off the second ]
1024 0 : mTextValue.Append(aChar);
1025 0 : canClose = true;
1026 : }
1027 :
1028 : // The goal here is to not lose data from the page when encountering
1029 : // markup like: <![endif]-->. This means that in normal parsing, we
1030 : // allow ']' to end the marked section and just drop everything between
1031 : // it an the '>'. In view-source mode, we cannot drop things on the
1032 : // floor like that. In fact, to make view-source of XML with script in
1033 : // CDATA sections at all bearable, we need to somewhat enforce the ']]>'
1034 : // terminator for marked sections. So make the tokenization somewhat
1035 : // different when in view-source _and_ dealing with a CDATA section.
1036 : // XXX We should remember this StringBeginsWith test.
1037 : bool inCDATA = (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) &&
1038 0 : StringBeginsWith(mTextValue, NS_LITERAL_STRING("[CDATA["));
1039 0 : if (inCDATA) {
1040 : // Consume all right square brackets to catch cases such as:
1041 : // <![CDATA[foo]]]>
1042 0 : while (true) {
1043 0 : result = aScanner.Peek(aChar);
1044 0 : if (result != NS_OK || aChar != kRightSquareBracket) {
1045 : break;
1046 : }
1047 :
1048 0 : mTextValue.Append(aChar);
1049 0 : aScanner.GetChar(aChar);
1050 : }
1051 : } else {
1052 0 : nsAutoString dummy; // Skip any bad data
1053 0 : result = aScanner.ReadUntil(dummy, kGreaterThan, false);
1054 : }
1055 0 : if (NS_OK == result &&
1056 0 : (!inCDATA || (canClose && kGreaterThan == aChar))) {
1057 0 : result = aScanner.GetChar(aChar); // Strip off the >
1058 0 : done = true;
1059 : }
1060 : } else {
1061 0 : done = true;
1062 : }
1063 : }
1064 : }
1065 :
1066 0 : if (kEOF == result && !aScanner.IsIncremental()) {
1067 : // We ran out of space looking for the end of this CDATA section.
1068 : // In order to not completely lose the entire section, treat everything
1069 : // until the end of the document as part of the CDATA section and let
1070 : // the DTD handle it.
1071 0 : mInError = true;
1072 0 : result = NS_OK;
1073 : }
1074 :
1075 0 : return result;
1076 : }
1077 :
1078 : const nsSubstring&
1079 0 : CCDATASectionToken::GetStringValue()
1080 : {
1081 0 : return mTextValue;
1082 : }
1083 :
1084 :
1085 0 : CMarkupDeclToken::CMarkupDeclToken()
1086 0 : : CHTMLToken(eHTMLTag_markupDecl)
1087 : {
1088 0 : }
1089 :
1090 0 : CMarkupDeclToken::CMarkupDeclToken(const nsAString& aName)
1091 0 : : CHTMLToken(eHTMLTag_markupDecl)
1092 : {
1093 0 : mTextValue.Rebind(aName);
1094 0 : }
1095 :
1096 : PRInt32
1097 0 : CMarkupDeclToken::GetTokenType()
1098 : {
1099 0 : return eToken_markupDecl;
1100 : }
1101 :
1102 : /*
1103 : * Consume as much declaration from scanner as possible.
1104 : * Declaration is a markup declaration of ELEMENT, ATTLIST, ENTITY or
1105 : * NOTATION, which can span multiple lines and ends in >.
1106 : *
1107 : * @param aChar -- last char consumed from stream
1108 : * @param aScanner -- controller of underlying input source
1109 : * @return error result
1110 : */
1111 : nsresult
1112 0 : CMarkupDeclToken::Consume(PRUnichar aChar, nsScanner& aScanner,
1113 : PRInt32 aFlag)
1114 : {
1115 : static const PRUnichar theTerminalsChars[] =
1116 : { PRUnichar('\n'), PRUnichar('\r'), PRUnichar('\''), PRUnichar('"'),
1117 : PRUnichar('>'),
1118 : PRUnichar(0) };
1119 0 : static const nsReadEndCondition theEndCondition(theTerminalsChars);
1120 0 : nsresult result = NS_OK;
1121 0 : bool done = false;
1122 0 : PRUnichar quote = 0;
1123 :
1124 0 : nsScannerIterator origin, start, end;
1125 0 : aScanner.CurrentPosition(origin);
1126 0 : start = origin;
1127 :
1128 0 : while (NS_OK == result && !done) {
1129 0 : aScanner.SetPosition(start);
1130 0 : result = aScanner.ReadUntil(start, end, theEndCondition, false);
1131 0 : if (NS_OK == result) {
1132 0 : result = aScanner.Peek(aChar);
1133 :
1134 0 : if (NS_OK == result) {
1135 0 : PRUnichar theNextChar = 0;
1136 0 : if (kCR == aChar || kNewLine == aChar) {
1137 0 : result = aScanner.GetChar(aChar); // Strip off the char
1138 0 : result = aScanner.Peek(theNextChar); // Then see what's next.
1139 : }
1140 0 : switch(aChar) {
1141 : case kCR:
1142 : // result = aScanner.GetChar(aChar);
1143 0 : if (kLF == theNextChar) {
1144 : // If the "\r" is followed by a "\n", don't replace it and
1145 : // let it be ignored by the layout system
1146 0 : end.advance(2);
1147 0 : result = aScanner.GetChar(theNextChar);
1148 : } else {
1149 : // If it standalone, replace the "\r" with a "\n" so that
1150 : // it will be considered by the layout system
1151 0 : aScanner.ReplaceCharacter(end, kLF);
1152 0 : ++end;
1153 : }
1154 0 : ++mNewlineCount;
1155 0 : break;
1156 : case kLF:
1157 0 : ++end;
1158 0 : ++mNewlineCount;
1159 0 : break;
1160 : case '\'':
1161 : case '"':
1162 0 : ++end;
1163 0 : if (quote) {
1164 0 : if (quote == aChar) {
1165 0 : quote = 0;
1166 : }
1167 : } else {
1168 0 : quote = aChar;
1169 : }
1170 0 : break;
1171 : case kGreaterThan:
1172 0 : if (quote) {
1173 0 : ++end;
1174 : } else {
1175 0 : start = end;
1176 : // Note that start is wrong after this, we just avoid temp var
1177 0 : ++start;
1178 0 : aScanner.SetPosition(start); // Skip the >
1179 0 : done = true;
1180 : }
1181 0 : break;
1182 : default:
1183 0 : NS_ABORT_IF_FALSE(0, "should not happen, switch is missing cases?");
1184 0 : break;
1185 : }
1186 0 : start = end;
1187 : } else {
1188 0 : done = true;
1189 : }
1190 : }
1191 : }
1192 0 : aScanner.BindSubstring(mTextValue, origin, end);
1193 :
1194 0 : if (kEOF == result) {
1195 0 : mInError = true;
1196 0 : if (!aScanner.IsIncremental()) {
1197 : // Hide this EOF.
1198 0 : result = NS_OK;
1199 : }
1200 : }
1201 :
1202 0 : return result;
1203 : }
1204 :
1205 : const nsSubstring&
1206 0 : CMarkupDeclToken::GetStringValue()
1207 : {
1208 0 : return mTextValue.AsString();
1209 : }
1210 :
1211 :
1212 25 : CCommentToken::CCommentToken()
1213 25 : : CHTMLToken(eHTMLTag_comment)
1214 : {
1215 25 : }
1216 :
1217 0 : CCommentToken::CCommentToken(const nsAString& aName)
1218 0 : : CHTMLToken(eHTMLTag_comment)
1219 : {
1220 0 : mComment.Rebind(aName);
1221 0 : }
1222 :
1223 : void
1224 0 : CCommentToken::AppendSourceTo(nsAString& anOutputString)
1225 : {
1226 0 : AppendUnicodeTo(mCommentDecl, anOutputString);
1227 0 : }
1228 :
1229 : static bool
1230 0 : IsCommentEnd(const nsScannerIterator& aCurrent, const nsScannerIterator& aEnd,
1231 : nsScannerIterator& aGt)
1232 : {
1233 0 : nsScannerIterator current = aCurrent;
1234 0 : PRInt32 dashes = 0;
1235 :
1236 0 : while (current != aEnd && dashes != 2) {
1237 0 : if (*current == kGreaterThan) {
1238 0 : aGt = current;
1239 0 : return true;
1240 : }
1241 0 : if (*current == PRUnichar('-')) {
1242 0 : ++dashes;
1243 : } else {
1244 0 : dashes = 0;
1245 : }
1246 0 : ++current;
1247 : }
1248 :
1249 0 : return false;
1250 : }
1251 :
1252 : nsresult
1253 0 : CCommentToken::ConsumeStrictComment(nsScanner& aScanner)
1254 : {
1255 : // <!--[... -- ... -- ...]*-->
1256 : /*********************************************************
1257 : NOTE: This algorithm does a fine job of handling comments
1258 : when they're formatted per spec, but if they're not
1259 : we don't handle them well.
1260 : *********************************************************/
1261 0 : nsScannerIterator end, current, gt, lt;
1262 0 : aScanner.EndReading(end);
1263 0 : aScanner.CurrentPosition(current);
1264 :
1265 0 : nsScannerIterator beginData = end;
1266 :
1267 0 : lt = current;
1268 0 : lt.advance(-2); // <!
1269 :
1270 0 : current.advance(-1);
1271 :
1272 : // Regular comment must start with <!--
1273 0 : if (*current == kExclamation &&
1274 0 : ++current != end && *current == kMinus &&
1275 0 : ++current != end && *current == kMinus &&
1276 0 : ++current != end) {
1277 0 : nsScannerIterator currentEnd = end;
1278 0 : bool balancedComment = false;
1279 0 : NS_NAMED_LITERAL_STRING(dashes, "--");
1280 0 : beginData = current;
1281 :
1282 0 : while (FindInReadable(dashes, current, currentEnd)) {
1283 0 : current.advance(2);
1284 :
1285 0 : balancedComment = !balancedComment; // We need to match '--' with '--'
1286 :
1287 0 : if (balancedComment && IsCommentEnd(current, end, gt)) {
1288 : // done
1289 0 : current.advance(-2);
1290 : // Note: it's ok if beginData == current, (we'll copy an empty string)
1291 : // and we need to bind mComment anyway.
1292 0 : aScanner.BindSubstring(mComment, beginData, current);
1293 0 : aScanner.BindSubstring(mCommentDecl, lt, ++gt);
1294 0 : aScanner.SetPosition(gt);
1295 0 : return NS_OK;
1296 : }
1297 :
1298 : // Continue after the last '--'
1299 0 : currentEnd = end;
1300 : }
1301 : }
1302 :
1303 : // If beginData == end, we did not find opening '--'
1304 0 : if (beginData == end) {
1305 : // This might have been empty comment: <!>
1306 : // Or it could have been something completely bogus like: <!This is foobar>
1307 : // Handle both cases below
1308 0 : aScanner.CurrentPosition(current);
1309 0 : beginData = current;
1310 0 : if (FindCharInReadable('>', current, end)) {
1311 0 : aScanner.BindSubstring(mComment, beginData, current);
1312 0 : aScanner.BindSubstring(mCommentDecl, lt, ++current);
1313 0 : aScanner.SetPosition(current);
1314 0 : return NS_OK;
1315 : }
1316 : }
1317 :
1318 0 : if (aScanner.IsIncremental()) {
1319 : // We got here because we saw the beginning of a comment,
1320 : // but not yet the end, and we are still loading the page. In that
1321 : // case the return value here will cause us to unwind,
1322 : // wait for more content, and try again.
1323 : // XXX For performance reasons we should cache where we were, and
1324 : // continue from there for next call
1325 0 : return kEOF;
1326 : }
1327 :
1328 : // There was no terminating string, parse this comment as text.
1329 0 : aScanner.SetPosition(lt, false, true);
1330 0 : return kNotAComment;
1331 : }
1332 :
/*
 * Consume a comment using the lenient ("quirks") rules.
 *
 * Two shapes are recognized:
 *   - long form, "<!--" ... ">", where the '>' is accepted as the end when
 *     it is preceded by "--" or "--!", or when it directly follows the
 *     opening dashes;
 *   - short form, "<!" ... ">", which ends at the first '>'.
 *
 * On success mComment is bound to the comment data, mCommentDecl to the
 * whole declaration starting at "<!", and the scanner is positioned after
 * the consumed text.
 *
 * @param aScanner -- controller of underlying input source
 * @return NS_OK when a comment was consumed; kEOF when no end was found
 *         and the scanner is incremental; kNotAComment when a short-form
 *         comment has no '>' and no more data will arrive (the scanner is
 *         moved back to the '<' in that case)
 */
nsresult
CCommentToken::ConsumeQuirksComment(nsScanner& aScanner)
{
  // <![-[-]] ... [[-]-|--!]>
  /*********************************************************
    NOTE: This algorithm does a fine job of handling comments
          commonly used, but it doesn't really consume them
          per spec (But then, neither does IE or Nav).
   *********************************************************/
  nsScannerIterator end, current;
  aScanner.EndReading(end);
  aScanner.CurrentPosition(current);
  // beginData:         first character of the comment data
  // beginLastMinus:    last leading '-' seen (end if there was none); the
  //                    backwards scans below never step past it
  // bestAltCommentEnd: first '>' seen in a long-form comment (end if none)
  // lt:                position of the '<' that opened the declaration
  nsScannerIterator beginData = current,
                    beginLastMinus = end,
                    bestAltCommentEnd = end,
                    lt = current;
  lt.advance(-2); // <!

  // When we get here, we have always already consumed <!
  // Skip over possible leading minuses
  if (current != end && *current == kMinus) {
    beginLastMinus = current;
    ++current;
    ++beginData;
    if (current != end && *current == kMinus) { // <!--
      beginLastMinus = current;
      ++current;
      ++beginData;
      // Long form comment

      nsScannerIterator currentEnd = end, gt = end;

      // Find the end of the comment
      while (FindCharInReadable(kGreaterThan, current, currentEnd)) {
        gt = current;
        if (bestAltCommentEnd == end) {
          bestAltCommentEnd = gt;
        }
        // Walk backwards from the '>' to see what precedes it.
        --current;
        bool goodComment = false;
        if (current != beginLastMinus && *current == kMinus) { // ->
          --current;
          if (current != beginLastMinus && *current == kMinus) { // -->
            goodComment = true;
            --current;
          }
        } else if (current != beginLastMinus && *current == '!') {
          --current;
          if (current != beginLastMinus && *current == kMinus) {
            --current;
            if (current != beginLastMinus && *current == kMinus) { // --!>
              --current;
              goodComment = true;
            }
          }
        } else if (current == beginLastMinus) {
          // The '>' directly follows the opening dashes.
          goodComment = true;
        }

        if (goodComment) {
          // done
          aScanner.BindSubstring(mComment, beginData, ++current);
          aScanner.BindSubstring(mCommentDecl, lt, ++gt);
          aScanner.SetPosition(gt);
          return NS_OK;
        } else {
          // try again starting after the last '>'
          current = ++gt;
          currentEnd = end;
        }
      }

      if (aScanner.IsIncremental()) {
        // We got here because we saw the beginning of a comment,
        // but not yet the end, and we are still loading the page. In that
        // case the return value here will cause us to unwind,
        // wait for more content, and try again.
        // XXX For performance reasons we should cache where we were, and
        //     continue from there for next call
        return kEOF;
      }

      // If you're here, then we're in a special state.
      // The problem at hand is that we've hit the end of the document without
      // finding the normal endcomment delimiter "-->". In this case, the
      // first thing we try is to see if we found an alternate endcomment
      // delimiter ">". If so, rewind to just past that, and use everything
      // up to that point as your comment. If not, the document has no end
      // comment and should be treated as one big comment.
      gt = bestAltCommentEnd;
      aScanner.BindSubstring(mComment, beginData, gt);
      if (gt != end) {
        ++gt;
      }
      aScanner.BindSubstring(mCommentDecl, lt, gt);
      aScanner.SetPosition(gt);
      return NS_OK;
    }
  }

  // This could be short form of comment
  // Find the end of the comment
  current = beginData;
  if (FindCharInReadable(kGreaterThan, current, end)) {
    nsScannerIterator gt = current;
    // Step back over a trailing "-", "--", "!", "-!" or "--!" so that it is
    // left out of the comment data; never step back past beginData.
    if (current != beginData) {
      --current;
      if (current != beginData && *current == kMinus) { // ->
        --current;
        if (current != beginData && *current == kMinus) { // -->
          --current;
        }
      } else if (current != beginData && *current == '!') { // !>
        --current;
        if (current != beginData && *current == kMinus) { // -!>
          --current;
          if (current != beginData && *current == kMinus) { // --!>
            --current;
          }
        }
      }
    }

    if (current != gt) {
      aScanner.BindSubstring(mComment, beginData, ++current);
    } else {
      // Bind mComment to an empty string (note that if current == gt,
      // then current == beginData). We reach this for <!>
      aScanner.BindSubstring(mComment, beginData, current);
    }
    aScanner.BindSubstring(mCommentDecl, lt, ++gt);
    aScanner.SetPosition(gt);
    return NS_OK;
  }

  if (!aScanner.IsIncremental()) {
    // This isn't a comment at all, go back to the < and consume as text.
    aScanner.SetPosition(lt, false, true);
    return kNotAComment;
  }

  // Wait for more data...
  return kEOF;
}
1477 :
1478 : /*
1479 : * Consume the identifier portion of the comment.
1480 : * Note that we've already eaten the "<!" portion.
1481 : *
1482 : * @param aChar -- last char consumed from stream
1483 : * @param aScanner -- controller of underlying input source
1484 : * @return error result
1485 : */
1486 : nsresult
1487 25 : CCommentToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
1488 : {
1489 25 : nsresult result = true;
1490 :
1491 25 : if (aFlag & NS_IPARSER_FLAG_STRICT_MODE) {
1492 : // Enabling strict comment parsing for Bug 53011 and 2749 contradicts!
1493 0 : result = ConsumeStrictComment(aScanner);
1494 : } else {
1495 25 : result = ConsumeQuirksComment(aScanner);
1496 : }
1497 :
1498 25 : if (NS_SUCCEEDED(result)) {
1499 25 : mNewlineCount = mCommentDecl.CountChar(kNewLine);
1500 : }
1501 :
1502 25 : return result;
1503 : }
1504 :
1505 : const nsSubstring&
1506 0 : CCommentToken::GetStringValue()
1507 : {
1508 0 : return mComment.AsString();
1509 : }
1510 :
1511 : PRInt32
1512 50 : CCommentToken::GetTokenType()
1513 : {
1514 50 : return eToken_comment;
1515 : }
1516 :
1517 530 : CNewlineToken::CNewlineToken()
1518 530 : : CHTMLToken(eHTMLTag_newline)
1519 : {
1520 530 : }
1521 :
1522 : PRInt32
1523 1590 : CNewlineToken::GetTokenType()
1524 : {
1525 1590 : return eToken_newline;
1526 : }
1527 :
1528 : static nsScannerSubstring* gNewlineStr;
1529 : void
1530 263 : CNewlineToken::AllocNewline()
1531 : {
1532 526 : gNewlineStr = new nsScannerSubstring(NS_LITERAL_STRING("\n"));
1533 263 : }
1534 :
1535 : void
1536 263 : CNewlineToken::FreeNewline()
1537 : {
1538 263 : if (gNewlineStr) {
1539 263 : delete gNewlineStr;
1540 263 : gNewlineStr = nsnull;
1541 : }
1542 263 : }
1543 :
1544 : /**
1545 : * This method retrieves the value of this internal string.
1546 : *
1547 : * @return nsString reference to internal string value
1548 : */
1549 : const nsSubstring&
1550 0 : CNewlineToken::GetStringValue()
1551 : {
1552 0 : return gNewlineStr->AsString();
1553 : }
1554 :
1555 : /*
1556 : * Consume one newline (cr/lf pair).
1557 : *
1558 : * @param aChar -- last char consumed from stream
1559 : * @param aScanner -- controller of underlying input source
1560 : * @return error result
1561 : */
1562 : nsresult
1563 530 : CNewlineToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
1564 : {
1565 : /*
1566 : * Here's what the HTML spec says about newlines:
1567 : *
1568 : * "A line break is defined to be a carriage return (
),
1569 : * a line feed (
), or a carriage return/line feed pair.
1570 : * All line breaks constitute white space."
1571 : */
1572 :
1573 530 : nsresult rv = NS_OK;
1574 530 : if (aChar == kCR) {
1575 : PRUnichar theChar;
1576 0 : rv = aScanner.Peek(theChar);
1577 0 : if (theChar == kNewLine) {
1578 0 : rv = aScanner.GetChar(theChar);
1579 0 : } else if (rv == kEOF && !aScanner.IsIncremental()) {
1580 : // Make sure we don't lose information about this trailing newline.
1581 0 : rv = NS_OK;
1582 : }
1583 : }
1584 :
1585 530 : mNewlineCount = 1;
1586 530 : return rv;
1587 : }
1588 :
1589 732 : CAttributeToken::CAttributeToken()
1590 732 : : CHTMLToken(eHTMLTag_unknown)
1591 : {
1592 732 : mHasEqualWithoutValue = false;
1593 732 : }
1594 :
1595 : /*
1596 : * String based constructor
1597 : */
1598 0 : CAttributeToken::CAttributeToken(const nsAString& aName)
1599 0 : : CHTMLToken(eHTMLTag_unknown)
1600 : {
1601 0 : mTextValue.writable().Assign(aName);
1602 0 : mHasEqualWithoutValue = false;
1603 0 : }
1604 :
1605 : /*
1606 : * construct initializing data to key value pair
1607 : */
1608 0 : CAttributeToken::CAttributeToken(const nsAString& aKey, const nsAString& aName)
1609 0 : : CHTMLToken(eHTMLTag_unknown)
1610 : {
1611 0 : mTextValue.writable().Assign(aName);
1612 0 : mTextKey.Rebind(aKey);
1613 0 : mHasEqualWithoutValue = false;
1614 0 : }
1615 :
1616 : PRInt32
1617 1468 : CAttributeToken::GetTokenType()
1618 : {
1619 1468 : return eToken_attribute;
1620 : }
1621 :
1622 : const nsSubstring&
1623 0 : CAttributeToken::GetStringValue()
1624 : {
1625 0 : return mTextValue.str();
1626 : }
1627 :
1628 : void
1629 0 : CAttributeToken::GetSource(nsString& anOutputString)
1630 : {
1631 0 : anOutputString.Truncate();
1632 0 : AppendSourceTo(anOutputString);
1633 0 : }
1634 :
1635 : void
1636 0 : CAttributeToken::AppendSourceTo(nsAString& anOutputString)
1637 : {
1638 0 : AppendUnicodeTo(mTextKey, anOutputString);
1639 0 : if (mTextValue.str().Length() || mHasEqualWithoutValue) {
1640 0 : anOutputString.AppendLiteral("=");
1641 : }
1642 0 : anOutputString.Append(mTextValue.str());
1643 : // anOutputString.AppendLiteral(";");
1644 0 : }
1645 :
1646 : /*
1647 : * This general purpose method is used when you want to
1648 : * consume a known quoted string.
1649 : */
1650 : static nsresult
1651 732 : ConsumeQuotedString(PRUnichar aChar,
1652 : nsScannerSharedSubstring& aString,
1653 : PRInt32& aNewlineCount,
1654 : nsScanner& aScanner,
1655 : PRInt32 aFlag)
1656 : {
1657 732 : NS_ASSERTION(aChar == kQuote || aChar == kApostrophe,
1658 : "char is neither quote nor apostrophe");
1659 : // Hold onto this in case this is an unterminated string literal
1660 732 : PRUint32 origLen = aString.str().Length();
1661 :
1662 : static const PRUnichar theTerminalCharsQuote[] = {
1663 : PRUnichar(kQuote), PRUnichar('&'), PRUnichar(kCR),
1664 : PRUnichar(kNewLine), PRUnichar(0) };
1665 : static const PRUnichar theTerminalCharsApostrophe[] = {
1666 : PRUnichar(kApostrophe), PRUnichar('&'), PRUnichar(kCR),
1667 : PRUnichar(kNewLine), PRUnichar(0) };
1668 : static const nsReadEndCondition
1669 732 : theTerminateConditionQuote(theTerminalCharsQuote);
1670 : static const nsReadEndCondition
1671 732 : theTerminateConditionApostrophe(theTerminalCharsApostrophe);
1672 :
1673 : // Assume Quote to init to something
1674 732 : const nsReadEndCondition *terminateCondition = &theTerminateConditionQuote;
1675 732 : if (aChar == kApostrophe) {
1676 0 : terminateCondition = &theTerminateConditionApostrophe;
1677 : }
1678 :
1679 732 : nsresult result = NS_OK;
1680 732 : nsScannerIterator theOffset;
1681 732 : aScanner.CurrentPosition(theOffset);
1682 :
1683 : result = ConsumeUntil(aString, aNewlineCount, aScanner,
1684 732 : *terminateCondition, true, true, aFlag);
1685 :
1686 732 : if (NS_SUCCEEDED(result)) {
1687 732 : result = aScanner.GetChar(aChar); // aChar should be " or '
1688 : }
1689 :
1690 : // Ref: Bug 35806
1691 : // A back up measure when disaster strikes...
1692 : // Ex <table> <tr d="><td>hello</td></tr></table>
1693 1464 : if (!aString.str().IsEmpty() && aString.str().Last() != aChar &&
1694 732 : !aScanner.IsIncremental() && result == kEOF) {
1695 : static const nsReadEndCondition
1696 0 : theAttributeTerminator(kAttributeTerminalChars);
1697 0 : aString.writable().Truncate(origLen);
1698 0 : aScanner.SetPosition(theOffset, false, true);
1699 : result = ConsumeUntil(aString, aNewlineCount, aScanner,
1700 0 : theAttributeTerminator, false, true, aFlag);
1701 0 : if (NS_SUCCEEDED(result) && (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
1702 : // Remember that this string literal was unterminated.
1703 0 : result = NS_ERROR_HTMLPARSER_UNTERMINATEDSTRINGLITERAL;
1704 : }
1705 : }
1706 732 : return result;
1707 : }
1708 :
1709 : /*
1710 : * This method is meant to be used by view-source to consume invalid attributes.
1711 : * For the purposes of this method, an invalid attribute is an attribute that
1712 : * starts with either ', ", or /. We consume all ', ", or / and the following
1713 : * whitespace.
1714 : *
1715 : * @param aScanner -- the scanner we're reading our data from.
1716 : * @param aChar -- the character we're skipping
1717 : * @param aCurrent -- the current position that we're looking at.
1718 : * @param aNewlineCount -- a count of the newlines we've consumed.
1719 : * @return error result.
1720 : */
1721 : static nsresult
1722 0 : ConsumeInvalidAttribute(nsScanner& aScanner,
1723 : PRUnichar aChar,
1724 : nsScannerIterator& aCurrent,
1725 : PRInt32& aNewlineCount)
1726 : {
1727 0 : NS_ASSERTION(aChar == kApostrophe || aChar == kQuote || aChar == kForwardSlash,
1728 : "aChar must be a quote or apostrophe");
1729 0 : nsScannerIterator end, wsbeg;
1730 0 : aScanner.EndReading(end);
1731 :
1732 0 : while (aCurrent != end && *aCurrent == aChar) {
1733 0 : ++aCurrent;
1734 : }
1735 :
1736 0 : aScanner.SetPosition(aCurrent);
1737 0 : return aScanner.ReadWhitespace(wsbeg, aCurrent, aNewlineCount);
1738 : }
1739 :
1740 : /*
1741 : * Consume the key and value portions of the attribute.
1742 : */
1743 : nsresult
1744 732 : CAttributeToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
1745 : {
1746 : nsresult result;
1747 732 : nsScannerIterator wsstart, wsend;
1748 :
1749 732 : if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1750 0 : result = aScanner.ReadWhitespace(wsstart, wsend, mNewlineCount);
1751 0 : if (kEOF == result && wsstart != wsend) {
1752 : // Do this here so if this is the final token in the document, we don't
1753 : // lose the whitespace.
1754 0 : aScanner.BindSubstring(mTextKey, wsstart, wsend);
1755 : }
1756 : } else {
1757 732 : result = aScanner.SkipWhitespace(mNewlineCount);
1758 : }
1759 :
1760 732 : if (NS_OK == result) {
1761 : static const PRUnichar theTerminalsChars[] =
1762 : { PRUnichar(' '), PRUnichar('"'),
1763 : PRUnichar('='), PRUnichar('\n'),
1764 : PRUnichar('\r'), PRUnichar('\t'),
1765 : PRUnichar('>'), PRUnichar('<'),
1766 : PRUnichar('\''), PRUnichar('/'),
1767 : PRUnichar(0) };
1768 732 : static const nsReadEndCondition theEndCondition(theTerminalsChars);
1769 :
1770 732 : nsScannerIterator start, end;
1771 732 : result = aScanner.ReadUntil(start, end, theEndCondition, false);
1772 :
1773 732 : if (!(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
1774 732 : aScanner.BindSubstring(mTextKey, start, end);
1775 0 : } else if (kEOF == result && wsstart != end) {
1776 : // Capture all of the text (from the beginning of the whitespace to the
1777 : // end of the document).
1778 0 : aScanner.BindSubstring(mTextKey, wsstart, end);
1779 : }
1780 :
1781 : // Now it's time to Consume the (optional) value...
1782 732 : if (NS_OK == result) {
1783 732 : if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1784 0 : result = aScanner.ReadWhitespace(start, wsend, mNewlineCount);
1785 0 : aScanner.BindSubstring(mTextKey, wsstart, wsend);
1786 : } else {
1787 732 : result = aScanner.SkipWhitespace(mNewlineCount);
1788 : }
1789 :
1790 732 : if (NS_OK == result) {
1791 : // Skip ahead until you find an equal sign or a '>'...
1792 732 : result = aScanner.Peek(aChar);
1793 732 : if (NS_OK == result) {
1794 732 : if (kEqual == aChar) {
1795 732 : result = aScanner.GetChar(aChar); // Skip the equal sign...
1796 732 : if (NS_OK == result) {
1797 732 : if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1798 : bool haveCR;
1799 : result = aScanner.ReadWhitespace(mTextValue, mNewlineCount,
1800 0 : haveCR);
1801 : } else {
1802 732 : result = aScanner.SkipWhitespace(mNewlineCount);
1803 : }
1804 :
1805 732 : if (NS_OK == result) {
1806 732 : result = aScanner.Peek(aChar); // And grab the next char.
1807 732 : if (NS_OK == result) {
1808 732 : if (kQuote == aChar || kApostrophe == aChar) {
1809 732 : aScanner.GetChar(aChar);
1810 732 : if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1811 0 : mTextValue.writable().Append(aChar);
1812 : }
1813 :
1814 : result = ConsumeQuotedString(aChar, mTextValue,
1815 : mNewlineCount, aScanner,
1816 732 : aFlag);
1817 1464 : if (NS_SUCCEEDED(result) &&
1818 : (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
1819 0 : mTextValue.writable().Append(aChar);
1820 732 : } else if (result ==
1821 : NS_ERROR_HTMLPARSER_UNTERMINATEDSTRINGLITERAL) {
1822 0 : result = NS_OK;
1823 0 : mInError = true;
1824 : }
1825 : // According to spec. we ( who? ) should ignore linefeeds.
1826 : // But look, even the carriage return was getting stripped
1827 : // ( wonder why! ) - Ref. to bug 15204. Okay, so the
1828 : // spec. told us to ignore linefeeds, bug then what about
1829 : // bug 47535 ? Should we preserve everything then? Well,
1830 : // let's make it so!
1831 0 : } else if (kGreaterThan == aChar) {
1832 0 : mHasEqualWithoutValue = true;
1833 0 : mInError = true;
1834 : } else {
1835 : static const nsReadEndCondition
1836 0 : theAttributeTerminator(kAttributeTerminalChars);
1837 : result =
1838 : ConsumeUntil(mTextValue,
1839 : mNewlineCount,
1840 : aScanner,
1841 : theAttributeTerminator,
1842 : false,
1843 : true,
1844 0 : aFlag);
1845 : }
1846 : }
1847 732 : if (NS_OK == result) {
1848 732 : if (aFlag & NS_IPARSER_FLAG_VIEW_SOURCE) {
1849 : bool haveCR;
1850 : result = aScanner.ReadWhitespace(mTextValue, mNewlineCount,
1851 0 : haveCR);
1852 : } else {
1853 732 : result = aScanner.SkipWhitespace(mNewlineCount);
1854 : }
1855 : }
1856 : } else {
1857 : // We saw an equal sign but ran out of room looking for a value.
1858 0 : mHasEqualWithoutValue = true;
1859 0 : mInError = true;
1860 : }
1861 : }
1862 : } else {
1863 : // This is where we have to handle fairly busted content.
1864 : // If you're here, it means we saw an attribute name, but couldn't
1865 : // find the following equal sign. <tag NAME....
1866 :
1867 : // Doing this right in all cases is <i>REALLY</i> ugly.
1868 : // My best guess is to grab the next non-ws char. We know it's not
1869 : // '=', so let's see what it is. If it's a '"', then assume we're
1870 : // reading from the middle of the value. Try stripping the quote
1871 : // and continuing... Note that this code also strips forward
1872 : // slashes to handle cases like <tag NAME/>
1873 0 : if (kQuote == aChar || kApostrophe == aChar ||
1874 : kForwardSlash == aChar) {
1875 : // In XML, a trailing slash isn't an error.
1876 0 : if (kForwardSlash != aChar || !(aFlag & NS_IPARSER_FLAG_XML)) {
1877 0 : mInError = true;
1878 : }
1879 :
1880 0 : if (!(aFlag & NS_IPARSER_FLAG_VIEW_SOURCE)) {
1881 0 : result = aScanner.SkipOver(aChar); // Strip quote or slash.
1882 0 : if (NS_SUCCEEDED(result)) {
1883 0 : result = aScanner.SkipWhitespace(mNewlineCount);
1884 : }
1885 : } else {
1886 : // We want to collect whitespace here so that following
1887 : // attributes can have the right line number (and for
1888 : // parity with the non-view-source code above).
1889 : result = ConsumeInvalidAttribute(aScanner, aChar,
1890 0 : wsend, mNewlineCount);
1891 :
1892 0 : aScanner.BindSubstring(mTextKey, wsstart, wsend);
1893 0 : aScanner.SetPosition(wsend);
1894 : }
1895 : }
1896 : }
1897 : }
1898 : }
1899 : }
1900 :
1901 732 : if (NS_OK == result) {
1902 732 : if (mTextValue.str().Length() == 0 && mTextKey.Length() == 0 &&
1903 0 : mNewlineCount == 0 && !mHasEqualWithoutValue) {
1904 : // This attribute contains no useful information for us, so there is no
1905 : // use in keeping it around. Attributes that are otherwise empty, but
1906 : // have newlines in them are passed on to the DTD so it can get line
1907 : // numbering right.
1908 0 : return NS_ERROR_HTMLPARSER_BADATTRIBUTE;
1909 : }
1910 : }
1911 : }
1912 :
1913 732 : if (kEOF == result && !aScanner.IsIncremental()) {
1914 : // This is our run-of-the-mill "don't lose content at the end of a
1915 : // document" with a slight twist: we don't want to bother returning an
1916 : // empty attribute key, even if this is the end of the document.
1917 0 : if (mTextKey.Length() == 0) {
1918 0 : result = NS_ERROR_HTMLPARSER_BADATTRIBUTE;
1919 : } else {
1920 0 : result = NS_OK;
1921 : }
1922 : }
1923 :
1924 732 : return result;
1925 : }
1926 :
1927 : void
1928 0 : CAttributeToken::SetKey(const nsAString& aKey)
1929 : {
1930 0 : mTextKey.Rebind(aKey);
1931 0 : }
1932 :
1933 : void
1934 0 : CAttributeToken::BindKey(nsScanner* aScanner,
1935 : nsScannerIterator& aStart,
1936 : nsScannerIterator& aEnd)
1937 : {
1938 0 : aScanner->BindSubstring(mTextKey, aStart, aEnd);
1939 0 : }
1940 :
1941 281 : CWhitespaceToken::CWhitespaceToken()
1942 281 : : CHTMLToken(eHTMLTag_whitespace)
1943 : {
1944 281 : }
1945 :
1946 0 : CWhitespaceToken::CWhitespaceToken(const nsAString& aName)
1947 0 : : CHTMLToken(eHTMLTag_whitespace)
1948 : {
1949 0 : mTextValue.writable().Assign(aName);
1950 0 : }
1951 :
1952 843 : PRInt32 CWhitespaceToken::GetTokenType()
1953 : {
1954 843 : return eToken_whitespace;
1955 : }
1956 :
1957 : /*
1958 : * This general purpose method is used when you want to
1959 : * consume an aribrary sequence of whitespace.
1960 : *
1961 : * @param aChar -- last char consumed from stream
1962 : * @param aScanner -- controller of underlying input source
1963 : * @return error result
1964 : */
1965 : nsresult
1966 281 : CWhitespaceToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
1967 : {
1968 : // If possible, we'd like to just be a dependent substring starting at
1969 : // |aChar|. The scanner has already been advanced, so we need to
1970 : // back it up to facilitate this.
1971 :
1972 281 : nsScannerIterator start;
1973 281 : aScanner.CurrentPosition(start);
1974 281 : aScanner.SetPosition(--start, false, true);
1975 :
1976 : bool haveCR;
1977 :
1978 281 : nsresult result = aScanner.ReadWhitespace(mTextValue, mNewlineCount, haveCR);
1979 :
1980 281 : if (result == kEOF && !aScanner.IsIncremental()) {
1981 : // Oops, we ran off the end, make sure we don't lose the trailing
1982 : // whitespace!
1983 0 : result = NS_OK;
1984 : }
1985 :
1986 281 : if (NS_OK == result && haveCR) {
1987 0 : mTextValue.writable().StripChar(kCR);
1988 : }
1989 281 : return result;
1990 : }
1991 :
1992 : const nsSubstring&
1993 0 : CWhitespaceToken::GetStringValue()
1994 : {
1995 0 : return mTextValue.str();
1996 : }
1997 :
1998 0 : CEntityToken::CEntityToken()
1999 0 : : CHTMLToken(eHTMLTag_entity)
2000 : {
2001 0 : }
2002 :
2003 0 : CEntityToken::CEntityToken(const nsAString& aName)
2004 0 : : CHTMLToken(eHTMLTag_entity)
2005 : {
2006 0 : mTextValue.Assign(aName);
2007 0 : }
2008 :
2009 :
2010 : /*
2011 : * Consume the rest of the entity. We've already eaten the "&".
2012 : *
2013 : * @param aChar -- last char consumed from stream
2014 : * @param aScanner -- controller of underlying input source
2015 : * @return error result
2016 : */
2017 : nsresult
2018 0 : CEntityToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
2019 : {
2020 0 : nsresult result = ConsumeEntity(aChar, mTextValue, aScanner);
2021 0 : return result;
2022 : }
2023 :
2024 : PRInt32
2025 0 : CEntityToken::GetTokenType()
2026 : {
2027 0 : return eToken_entity;
2028 : }
2029 :
2030 : /*
2031 : * This general purpose method is used when you want to
2032 : * consume an entity &xxxx;. Keep in mind that entities
2033 : * are <i>not</i> reduced inline.
2034 : *
2035 : * @param aChar -- last char consumed from stream
2036 : * @param aScanner -- controller of underlying input source
2037 : * @return error result
2038 : */
2039 : nsresult
2040 11 : CEntityToken::ConsumeEntity(PRUnichar aChar,
2041 : nsString& aString,
2042 : nsScanner& aScanner)
2043 : {
2044 11 : nsresult result = NS_OK;
2045 11 : if (kLeftBrace == aChar) {
2046 : // You're consuming a script entity...
2047 0 : aScanner.GetChar(aChar); // Consume &
2048 :
2049 0 : PRInt32 rightBraceCount = 0;
2050 0 : PRInt32 leftBraceCount = 0;
2051 :
2052 0 : do {
2053 0 : result = aScanner.GetChar(aChar);
2054 :
2055 0 : if (NS_FAILED(result)) {
2056 0 : return result;
2057 : }
2058 :
2059 0 : aString.Append(aChar);
2060 0 : if (aChar == kRightBrace) {
2061 0 : ++rightBraceCount;
2062 0 : } else if (aChar == kLeftBrace) {
2063 0 : ++leftBraceCount;
2064 : }
2065 : } while (leftBraceCount != rightBraceCount);
2066 : } else {
2067 11 : PRUnichar theChar = 0;
2068 11 : if (kHashsign == aChar) {
2069 0 : result = aScanner.Peek(theChar, 2);
2070 :
2071 0 : if (NS_FAILED(result)) {
2072 0 : if (kEOF == result && !aScanner.IsIncremental()) {
2073 : // If this is the last buffer then we are certainly
2074 : // not dealing with an entity. That's, there are
2075 : // no more characters after &#. Bug 188278.
2076 0 : return NS_HTMLTOKENS_NOT_AN_ENTITY;
2077 : }
2078 0 : return result;
2079 : }
2080 :
2081 0 : if (nsCRT::IsAsciiDigit(theChar)) {
2082 0 : aScanner.GetChar(aChar); // Consume &
2083 0 : aScanner.GetChar(aChar); // Consume #
2084 0 : aString.Assign(aChar);
2085 0 : result = aScanner.ReadNumber(aString, 10);
2086 0 : } else if (theChar == 'x' || theChar == 'X') {
2087 0 : aScanner.GetChar(aChar); // Consume &
2088 0 : aScanner.GetChar(aChar); // Consume #
2089 0 : aScanner.GetChar(theChar); // Consume x
2090 0 : aString.Assign(aChar);
2091 0 : aString.Append(theChar);
2092 0 : result = aScanner.ReadNumber(aString, 16);
2093 : } else {
2094 0 : return NS_HTMLTOKENS_NOT_AN_ENTITY;
2095 : }
2096 : } else {
2097 11 : result = aScanner.Peek(theChar, 1);
2098 :
2099 11 : if (NS_FAILED(result)) {
2100 0 : return result;
2101 : }
2102 :
2103 11 : if (nsCRT::IsAsciiAlpha(theChar) ||
2104 : theChar == '_' ||
2105 : theChar == ':') {
2106 11 : aScanner.GetChar(aChar); // Consume &
2107 11 : result = aScanner.ReadEntityIdentifier(aString);
2108 : } else {
2109 0 : return NS_HTMLTOKENS_NOT_AN_ENTITY;
2110 : }
2111 : }
2112 : }
2113 :
2114 11 : if (NS_FAILED(result)) {
2115 0 : return result;
2116 : }
2117 :
2118 11 : result = aScanner.Peek(aChar);
2119 :
2120 11 : if (NS_FAILED(result)) {
2121 0 : return result;
2122 : }
2123 :
2124 11 : if (aChar == kSemicolon) {
2125 : // Consume semicolon that stopped the scan
2126 11 : aString.Append(aChar);
2127 11 : result = aScanner.GetChar(aChar);
2128 : }
2129 :
2130 11 : return result;
2131 : }
2132 :
2133 : /**
2134 : * Map some illegal but commonly used numeric entities into their
2135 : * appropriate unicode value.
2136 : */
2137 : #define NOT_USED 0xfffd
2138 :
2139 : static const PRUint16 PA_HackTable[] = {
2140 : 0x20ac, /* EURO SIGN */
2141 : NOT_USED,
2142 : 0x201a, /* SINGLE LOW-9 QUOTATION MARK */
2143 : 0x0192, /* LATIN SMALL LETTER F WITH HOOK */
2144 : 0x201e, /* DOUBLE LOW-9 QUOTATION MARK */
2145 : 0x2026, /* HORIZONTAL ELLIPSIS */
2146 : 0x2020, /* DAGGER */
2147 : 0x2021, /* DOUBLE DAGGER */
2148 : 0x02c6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
2149 : 0x2030, /* PER MILLE SIGN */
2150 : 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
2151 : 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
2152 : 0x0152, /* LATIN CAPITAL LIGATURE OE */
2153 : NOT_USED,
2154 : 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
2155 : NOT_USED,
2156 : NOT_USED,
2157 : 0x2018, /* LEFT SINGLE QUOTATION MARK */
2158 : 0x2019, /* RIGHT SINGLE QUOTATION MARK */
2159 : 0x201c, /* LEFT DOUBLE QUOTATION MARK */
2160 : 0x201d, /* RIGHT DOUBLE QUOTATION MARK */
2161 : 0x2022, /* BULLET */
2162 : 0x2013, /* EN DASH */
2163 : 0x2014, /* EM DASH */
2164 : 0x02dc, /* SMALL TILDE */
2165 : 0x2122, /* TRADE MARK SIGN */
2166 : 0x0161, /* LATIN SMALL LETTER S WITH CARON */
2167 : 0x203a, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
2168 : 0x0153, /* LATIN SMALL LIGATURE OE */
2169 : NOT_USED,
2170 : 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
2171 : 0x0178 /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
2172 : };
2173 :
2174 : static void
2175 0 : AppendNCR(nsSubstring& aString, PRInt32 aNCRValue)
2176 : {
2177 : /* For some illegal, but popular usage */
2178 0 : if (aNCRValue >= 0x0080 && aNCRValue <= 0x009f) {
2179 0 : aNCRValue = PA_HackTable[aNCRValue - 0x0080];
2180 : }
2181 :
2182 0 : AppendUCS4ToUTF16(ENSURE_VALID_CHAR(aNCRValue), aString);
2183 0 : }
2184 :
2185 : /*
2186 : * This method converts this entity into its underlying
2187 : * unicode equivalent.
2188 : *
2189 : * @param aString will hold the resulting string value
2190 : * @return numeric (unichar) value
2191 : */
2192 : PRInt32
2193 0 : CEntityToken::TranslateToUnicodeStr(nsString& aString)
2194 : {
2195 0 : PRInt32 value = 0;
2196 :
2197 0 : if (mTextValue.Length() > 1) {
2198 0 : PRUnichar theChar0 = mTextValue.CharAt(0);
2199 :
2200 0 : if (kHashsign == theChar0) {
2201 0 : PRInt32 err = 0;
2202 :
2203 0 : value = mTextValue.ToInteger(&err, kAutoDetect);
2204 :
2205 0 : if (0 == err) {
2206 0 : AppendNCR(aString, value);
2207 : }
2208 : } else {
2209 0 : value = nsHTMLEntities::EntityToUnicode(mTextValue);
2210 0 : if (-1 < value) {
2211 : // We found a named entity...
2212 0 : aString.Assign(PRUnichar(value));
2213 : }
2214 : }
2215 : }
2216 :
2217 0 : return value;
2218 : }
2219 :
2220 :
2221 : const
2222 0 : nsSubstring& CEntityToken::GetStringValue()
2223 : {
2224 0 : return mTextValue;
2225 : }
2226 :
2227 : void
2228 0 : CEntityToken::GetSource(nsString& anOutputString)
2229 : {
2230 0 : anOutputString.AppendLiteral("&");
2231 0 : anOutputString += mTextValue;
2232 : // Any possible ; is part of our text value.
2233 0 : }
2234 :
2235 : void
2236 0 : CEntityToken::AppendSourceTo(nsAString& anOutputString)
2237 : {
2238 0 : anOutputString.AppendLiteral("&");
2239 0 : anOutputString += mTextValue;
2240 : // Any possible ; is part of our text value.
2241 0 : }
2242 :
2243 : const PRUnichar*
2244 0 : GetTagName(PRInt32 aTag)
2245 : {
2246 0 : const PRUnichar *result = nsHTMLTags::GetStringValue((nsHTMLTag) aTag);
2247 :
2248 0 : if (result) {
2249 0 : return result;
2250 : }
2251 :
2252 0 : if (aTag >= eHTMLTag_userdefined) {
2253 0 : return sUserdefined;
2254 : }
2255 :
2256 0 : return 0;
2257 : }
2258 :
2259 :
2260 0 : CInstructionToken::CInstructionToken()
2261 0 : : CHTMLToken(eHTMLTag_instruction)
2262 : {
2263 0 : }
2264 :
2265 0 : CInstructionToken::CInstructionToken(const nsAString& aString)
2266 0 : : CHTMLToken(eHTMLTag_unknown)
2267 : {
2268 0 : mTextValue.Assign(aString);
2269 0 : }
2270 :
2271 : nsresult
2272 0 : CInstructionToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
2273 : {
2274 0 : mTextValue.AssignLiteral("<?");
2275 0 : nsresult result = NS_OK;
2276 0 : bool done = false;
2277 :
2278 0 : while (NS_OK == result && !done) {
2279 : // Note, this call does *not* consume the >.
2280 0 : result = aScanner.ReadUntil(mTextValue, kGreaterThan, false);
2281 0 : if (NS_SUCCEEDED(result)) {
2282 : // In HTML, PIs end with a '>', in XML, they end with a '?>'. Cover both
2283 : // cases here.
2284 0 : if (!(aFlag & NS_IPARSER_FLAG_XML) ||
2285 0 : kQuestionMark == mTextValue.Last()) {
2286 : // This really is the end of the PI.
2287 0 : done = true;
2288 : }
2289 : // Need to append this character no matter what.
2290 0 : aScanner.GetChar(aChar);
2291 0 : mTextValue.Append(aChar);
2292 : }
2293 : }
2294 :
2295 0 : if (kEOF == result && !aScanner.IsIncremental()) {
2296 : // Hide the EOF result because there is no more text coming.
2297 0 : mInError = true;
2298 0 : result = NS_OK;
2299 : }
2300 :
2301 0 : return result;
2302 : }
2303 :
2304 : PRInt32
2305 0 : CInstructionToken::GetTokenType()
2306 : {
2307 0 : return eToken_instruction;
2308 : }
2309 :
2310 : const nsSubstring&
2311 0 : CInstructionToken::GetStringValue()
2312 : {
2313 0 : return mTextValue;
2314 : }
2315 :
2316 : // Doctype decl token
2317 :
2318 25 : CDoctypeDeclToken::CDoctypeDeclToken(eHTMLTags aTag)
2319 25 : : CHTMLToken(aTag)
2320 : {
2321 25 : }
2322 :
2323 0 : CDoctypeDeclToken::CDoctypeDeclToken(const nsAString& aString, eHTMLTags aTag)
2324 0 : : CHTMLToken(aTag), mTextValue(aString)
2325 : {
2326 0 : }
2327 :
2328 : /**
2329 : * This method consumes a doctype element.
2330 : * Note: I'm rewriting this method to seek to the first <, since quotes can
2331 : * really screw us up.
2332 : * XXX Maybe this should do better in XML or strict mode?
2333 : */
2334 : nsresult
2335 25 : CDoctypeDeclToken::Consume(PRUnichar aChar, nsScanner& aScanner, PRInt32 aFlag)
2336 : {
2337 : static const PRUnichar terminalChars[] =
2338 : { PRUnichar('>'), PRUnichar('<'),
2339 : PRUnichar(0)
2340 : };
2341 25 : static const nsReadEndCondition theEndCondition(terminalChars);
2342 :
2343 25 : nsScannerIterator start, end;
2344 :
2345 25 : aScanner.CurrentPosition(start);
2346 25 : aScanner.EndReading(end);
2347 :
2348 25 : nsresult result = aScanner.ReadUntil(start, end, theEndCondition, false);
2349 :
2350 25 : if (NS_SUCCEEDED(result)) {
2351 : PRUnichar ch;
2352 25 : aScanner.Peek(ch);
2353 25 : if (ch == kGreaterThan) {
2354 : // Include '>' but not '<' since '<'
2355 : // could belong to another tag.
2356 25 : aScanner.GetChar(ch);
2357 25 : end.advance(1);
2358 : } else {
2359 0 : NS_ASSERTION(kLessThan == ch,
2360 : "Make sure this doctype decl. is really in error.");
2361 0 : mInError = true;
2362 : }
2363 0 : } else if (!aScanner.IsIncremental()) {
2364 : // We have reached the document end but haven't
2365 : // found either a '<' or a '>'. Therefore use
2366 : // whatever we have.
2367 0 : mInError = true;
2368 0 : result = NS_OK;
2369 : }
2370 :
2371 25 : if (NS_SUCCEEDED(result)) {
2372 25 : start.advance(-2); // Make sure to consume <!
2373 25 : CopyUnicodeTo(start, end, mTextValue);
2374 : }
2375 :
2376 25 : return result;
2377 : }
2378 :
2379 : PRInt32
2380 75 : CDoctypeDeclToken::GetTokenType()
2381 : {
2382 75 : return eToken_doctypeDecl;
2383 : }
2384 :
2385 : const nsSubstring&
2386 25 : CDoctypeDeclToken::GetStringValue()
2387 : {
2388 25 : return mTextValue;
2389 : }
2390 :
2391 : void
2392 25 : CDoctypeDeclToken::SetStringValue(const nsAString& aStr)
2393 : {
2394 25 : mTextValue.Assign(aStr);
2395 25 : }
|