1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* vim: set sw=2 ts=2 et tw=79: */
3 : /* ***** BEGIN LICENSE BLOCK *****
4 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 : *
6 : * The contents of this file are subject to the Mozilla Public License Version
7 : * 1.1 (the "License"); you may not use this file except in compliance with
8 : * the License. You may obtain a copy of the License at
9 : * http://www.mozilla.org/MPL/
10 : *
11 : * Software distributed under the License is distributed on an "AS IS" basis,
12 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 : * for the specific language governing rights and limitations under the
14 : * License.
15 : *
16 : * The Original Code is mozilla.org code.
17 : *
18 : * The Initial Developer of the Original Code is
19 : * Netscape Communications Corporation.
20 : * Portions created by the Initial Developer are Copyright (C) 1998
21 : * the Initial Developer. All Rights Reserved.
22 : *
23 : * Contributor(s):
24 : * Pierre Phaneuf <pp@ludusdesign.com>
25 : *
26 : * Alternatively, the contents of this file may be used under the terms of
27 : * either of the GNU General Public License Version 2 or later (the "GPL"),
28 : * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 : * in which case the provisions of the GPL or the LGPL are applicable instead
30 : * of those above. If you wish to allow use of your version of this file only
31 : * under the terms of either the GPL or the LGPL, and not to allow others to
32 : * use your version of this file under the terms of the MPL, indicate your
33 : * decision by deleting the provisions above and replace them with the notice
34 : * and other provisions required by the GPL or the LGPL. If you do not delete
35 : * the provisions above, a recipient may use your version of this file under
36 : * the terms of any one of the MPL, the GPL or the LGPL.
37 : *
38 : * ***** END LICENSE BLOCK ***** */
39 :
40 : #include "nsIAtom.h"
41 : #include "nsParser.h"
42 : #include "nsString.h"
43 : #include "nsCRT.h"
44 : #include "nsScanner.h"
45 : #include "plstr.h"
46 : #include "nsIStringStream.h"
47 : #include "nsIChannel.h"
48 : #include "nsICachingChannel.h"
49 : #include "nsICacheEntryDescriptor.h"
50 : #include "nsCharsetAlias.h"
51 : #include "nsICharsetConverterManager.h"
52 : #include "nsIInputStream.h"
53 : #include "CNavDTD.h"
54 : #include "prenv.h"
55 : #include "prlock.h"
56 : #include "prcvar.h"
57 : #include "nsParserCIID.h"
58 : #include "nsReadableUtils.h"
59 : #include "nsCOMPtr.h"
60 : #include "nsExpatDriver.h"
61 : #include "nsIServiceManager.h"
62 : #include "nsICategoryManager.h"
63 : #include "nsISupportsPrimitives.h"
64 : #include "nsIFragmentContentSink.h"
65 : #include "nsStreamUtils.h"
66 : #include "nsHTMLTokenizer.h"
67 : #include "nsIDocument.h"
68 : #include "nsNetUtil.h"
69 : #include "nsScriptLoader.h"
70 : #include "nsDataHashtable.h"
71 : #include "nsIThreadPool.h"
72 : #include "nsXPCOMCIDInternal.h"
73 : #include "nsMimeTypes.h"
74 : #include "mozilla/CondVar.h"
75 : #include "mozilla/Mutex.h"
76 : #include "nsParserConstants.h"
77 :
78 : using namespace mozilla;
79 :
// Bit flags kept in nsParser::mFlags. Initialize() turns on
// PARSER_ENABLED, OBSERVERS_ENABLED and CAN_TOKENIZE by default.
80 : #define NS_PARSER_FLAG_PARSER_ENABLED 0x00000002
81 : #define NS_PARSER_FLAG_OBSERVERS_ENABLED 0x00000004
// Set by PostContinueEvent() once an nsParserContinueEvent has been
// dispatched; Cleanup() asserts that it is clear.
82 : #define NS_PARSER_FLAG_PENDING_CONTINUE_EVENT 0x00000008
83 : #define NS_PARSER_FLAG_FLUSH_TOKENS 0x00000020
84 : #define NS_PARSER_FLAG_CAN_TOKENIZE 0x00000040
85 :
// File-local interface/class ID constants (nsISupports IID, parser CID and
// nsIParser IID) built from the corresponding NS_*_IID / NS_*_CID macros.
86 : static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID);
87 : static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID);
88 : static NS_DEFINE_IID(kIParserIID, NS_IPARSER_IID);
89 :
90 : //-------------- Begin ParseContinue Event Definition ------------------------
91 : /*
92 : The parser can be explicitly interrupted by passing a return value of
93 : NS_ERROR_HTMLPARSER_INTERRUPTED from BuildModel on the DTD. This will cause
94 : the parser to stop processing and allow the application to return to the event
95 : loop. The data which was left at the time of interruption will be processed
96 : the next time OnDataAvailable is called. If the parser has received its final
97 : chunk of data then OnDataAvailable will no longer be called by the networking
98 : module, so the parser will schedule a nsParserContinueEvent which will call
99 : the parser to process the remaining data after returning to the event loop.
100 : If the parser is interrupted while processing the remaining data it will
101 : schedule another ParseContinueEvent. The processing of data followed by
102 : scheduling of the continue events will proceed until either:
103 :
104 : 1) All of the remaining data can be processed without interrupting
105 : 2) The parser has been cancelled.
106 :
107 :
108 : This capability is currently used in CNavDTD and nsHTMLContentSink. The
109 : nsHTMLContentSink is notified by CNavDTD when a chunk of tokens is going to be
110 : processed and when each token is processed. The nsHTML content sink records
111 : the time when the chunk has started processing and will return
112 : NS_ERROR_HTMLPARSER_INTERRUPTED if the token processing time has exceeded a
113 : threshold called max tokenizing processing time. This allows the content sink
114 : to limit how much data is processed in a single chunk which in turn gates how
115 : much time is spent away from the event loop. Processing smaller chunks of data
116 : also reduces the time spent in subsequent reflows.
117 :
118 : This capability is most apparent when loading large documents. If the maximum
119 : token processing time is set small enough the application will remain
120 : responsive during document load.
121 :
122 : A side-effect of this capability is that document load is not complete when
123 : the last chunk of data is passed to OnDataAvailable since the parser may have
124 : been interrupted when the last chunk of data arrived. The document is complete
125 : when all of the document has been tokenized and there aren't any pending
126 : nsParserContinueEvents. This can cause problems if the application assumes
127 : that it can monitor the load requests to determine when the document load has
128 : been completed. This is what happens in Mozilla. The document is considered
129 : completely loaded when all of the load requests have been satisfied. To delay
130 : the document load until all of the parsing has been completed the
131 : nsHTMLContentSink adds a dummy parser load request which is not removed until
132 : the nsHTMLContentSink's DidBuildModel is called. The CNavDTD will not call
133 : DidBuildModel until the final chunk of data has been passed to the parser
134 : through the OnDataAvailable and there aren't any pending
135 : nsParserContinueEvents.
136 :
137 : Currently the parser ignores requests to be interrupted during the
138 : processing of script. This is because a document.write followed by JavaScript
139 : calls to manipulate the DOM may fail if the parser was interrupted during the
140 : document.write.
141 :
142 : For more details @see bugzilla bug 76722
143 : */
144 :
145 :
146 : class nsParserContinueEvent : public nsRunnable
147 0 : {
148 : public:
149 : nsRefPtr<nsParser> mParser;
150 :
151 0 : nsParserContinueEvent(nsParser* aParser)
152 0 : : mParser(aParser)
153 0 : {}
154 :
155 0 : NS_IMETHOD Run()
156 : {
157 0 : mParser->HandleParserContinueEvent(this);
158 0 : return NS_OK;
159 : }
160 : };
161 :
162 : //-------------- End ParseContinue Event Definition ------------------------
163 :
// Shared charset converter manager. Starts out null, is filled in by
// nsParser::Init() and released by nsParser::Shutdown().
164 : nsICharsetConverterManager* nsParser::sCharsetConverterManager = nsnull;
165 :
166 : /**
167 : * This gets called when the htmlparser module is initialized.
168 : */
169 : // static
170 : nsresult
171 263 : nsParser::Init()
172 : {
173 : nsresult rv;
174 :
175 : nsCOMPtr<nsICharsetConverterManager> charsetConverter =
176 526 : do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
177 263 : NS_ENSURE_SUCCESS(rv, rv);
178 :
179 263 : charsetConverter.swap(sCharsetConverterManager);
180 :
181 263 : return NS_OK;
182 : }
183 :
184 :
185 : /**
186 : * This gets called when the htmlparser module is shutdown.
187 : */
188 : // static
189 263 : void nsParser::Shutdown()
190 : {
191 263 : NS_IF_RELEASE(sCharsetConverterManager);
192 263 : }
193 :
// Debug-only switch. nsParser::Initialize() sets it when the
// PARSER_DUMP_CONTENT environment variable is present; nsParser::Cleanup()
// then asks the content sink to dump its content model.
194 : #ifdef DEBUG
195 : static bool gDumpContent=false;
196 : #endif
197 :
198 : /**
199 : * default constructor
200 : */
201 3345 : nsParser::nsParser()
202 : {
203 3345 : Initialize(true);
204 3345 : }
205 :
206 10035 : nsParser::~nsParser()
207 : {
208 3345 : Cleanup();
209 13380 : }
210 :
211 : void
212 3346 : nsParser::Initialize(bool aConstructor)
213 : {
214 : #ifdef NS_DEBUG
215 3346 : if (!gDumpContent) {
216 3346 : gDumpContent = PR_GetEnv("PARSER_DUMP_CONTENT") != nsnull;
217 : }
218 : #endif
219 :
220 3346 : if (aConstructor) {
221 : // Raw pointer
222 3345 : mParserContext = 0;
223 : }
224 : else {
225 : // nsCOMPtrs
226 1 : mObserver = nsnull;
227 1 : mUnusedInput.Truncate();
228 : }
229 :
230 3346 : mContinueEvent = nsnull;
231 3346 : mCharsetSource = kCharsetUninitialized;
232 3346 : mCharset.AssignLiteral("ISO-8859-1");
233 3346 : mInternalState = NS_OK;
234 3346 : mStreamStatus = 0;
235 3346 : mCommand = eViewNormal;
236 : mFlags = NS_PARSER_FLAG_OBSERVERS_ENABLED |
237 : NS_PARSER_FLAG_PARSER_ENABLED |
238 3346 : NS_PARSER_FLAG_CAN_TOKENIZE;
239 :
240 3346 : mProcessingNetworkData = false;
241 3346 : mIsAboutBlank = false;
242 3346 : }
243 :
244 : void
245 3346 : nsParser::Cleanup()
246 : {
247 : #ifdef NS_DEBUG
248 3346 : if (gDumpContent) {
249 0 : if (mSink) {
250 : // Sink (HTMLContentSink at this time) supports nsIDebugDumpContent
251 : // interface. We can get to the content model through the sink.
252 0 : nsresult result = NS_OK;
253 0 : nsCOMPtr<nsIDebugDumpContent> trigger = do_QueryInterface(mSink, &result);
254 0 : if (NS_SUCCEEDED(result)) {
255 0 : trigger->DumpContentModel();
256 : }
257 : }
258 : }
259 : #endif
260 :
261 : #ifdef DEBUG
262 3346 : if (mParserContext && mParserContext->mPrevContext) {
263 0 : NS_WARNING("Extra parser contexts still on the parser stack");
264 : }
265 : #endif
266 :
267 10036 : while (mParserContext) {
268 3344 : CParserContext *pc = mParserContext->mPrevContext;
269 3344 : delete mParserContext;
270 3344 : mParserContext = pc;
271 : }
272 :
273 : // It should not be possible for this flag to be set when we are getting
274 : // destroyed since this flag implies a pending nsParserContinueEvent, which
275 : // has an owning reference to |this|.
276 3346 : NS_ASSERTION(!(mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT), "bad");
277 3346 : }
278 :
// Cycle-collection and interface-map boilerplate for nsParser.
279 1464 : NS_IMPL_CYCLE_COLLECTION_CLASS(nsParser)
280 :
// Unlink: drops the DTD, sink and observer references.
281 0 : NS_IMPL_CYCLE_COLLECTION_UNLINK_BEGIN(nsParser)
282 0 : NS_IMPL_CYCLE_COLLECTION_UNLINK_NSCOMPTR(mDTD)
283 0 : NS_IMPL_CYCLE_COLLECTION_UNLINK_NSCOMPTR(mSink)
284 0 : NS_IMPL_CYCLE_COLLECTION_UNLINK_NSCOMPTR(mObserver)
285 0 : NS_IMPL_CYCLE_COLLECTION_UNLINK_END
286 :
// Traverse: reports the same three members, then the tokenizer of every
// context on the parser-context stack (walked through mPrevContext).
287 111 : NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN(nsParser)
288 111 : NS_IMPL_CYCLE_COLLECTION_TRAVERSE_NSCOMPTR(mDTD)
289 111 : NS_IMPL_CYCLE_COLLECTION_TRAVERSE_NSCOMPTR(mSink)
290 111 : NS_IMPL_CYCLE_COLLECTION_TRAVERSE_NSCOMPTR(mObserver)
291 111 : CParserContext *pc = tmp->mParserContext;
292 333 : while (pc) {
293 111 : cb.NoteXPCOMChild(pc->mTokenizer);
294 111 : pc = pc->mPrevContext;
295 : }
296 111 : NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END
297 :
// Reference counting plus the list of interfaces QueryInterface answers for.
// nsISupports is ambiguous here, so it is resolved through nsIParser.
298 19687 : NS_IMPL_CYCLE_COLLECTING_ADDREF(nsParser)
299 19687 : NS_IMPL_CYCLE_COLLECTING_RELEASE(nsParser)
300 39567 : NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(nsParser)
301 26630 : NS_INTERFACE_MAP_ENTRY(nsIStreamListener)
302 20431 : NS_INTERFACE_MAP_ENTRY(nsIParser)
303 13673 : NS_INTERFACE_MAP_ENTRY(nsIRequestObserver)
304 13673 : NS_INTERFACE_MAP_ENTRY(nsISupportsWeakReference)
305 13673 : NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsIParser)
306 11865 : NS_INTERFACE_MAP_END
307 :
308 : // The parser continue event is posted only if
309 : // all of the data to parse has been passed to ::OnDataAvailable
310 : // and the parser has been interrupted by the content sink
311 : // because the processing of tokens took too long.
312 :
313 : nsresult
314 0 : nsParser::PostContinueEvent()
315 : {
316 0 : if (!(mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT)) {
317 : // If this flag isn't set, then there shouldn't be a live continue event!
318 0 : NS_ASSERTION(!mContinueEvent, "bad");
319 :
320 : // This creates a reference cycle between this and the event that is
321 : // broken when the event fires.
322 0 : nsCOMPtr<nsIRunnable> event = new nsParserContinueEvent(this);
323 0 : if (NS_FAILED(NS_DispatchToCurrentThread(event))) {
324 0 : NS_WARNING("failed to dispatch parser continuation event");
325 : } else {
326 0 : mFlags |= NS_PARSER_FLAG_PENDING_CONTINUE_EVENT;
327 0 : mContinueEvent = event;
328 : }
329 : }
330 0 : return NS_OK;
331 : }
332 :
333 : NS_IMETHODIMP_(void)
334 0 : nsParser::GetCommand(nsCString& aCommand)
335 : {
336 0 : aCommand = mCommandStr;
337 0 : }
338 :
339 : /**
340 : * Call this method once you've created a parser, and want to instruct it
341 : * about the command which caused the parser to be constructed. For example,
342 : * this allows us to select a DTD which can do, say, view-source.
343 : *
344 : * @param aCommand the command string to set
345 : */
346 : NS_IMETHODIMP_(void)
347 1038 : nsParser::SetCommand(const char* aCommand)
348 : {
349 1038 : mCommandStr.Assign(aCommand);
350 1038 : if (mCommandStr.Equals("view-source")) {
351 0 : mCommand = eViewSource;
352 1038 : } else if (mCommandStr.Equals("view-fragment")) {
353 0 : mCommand = eViewFragment;
354 : } else {
355 1038 : mCommand = eViewNormal;
356 : }
357 1038 : }
358 :
359 : /**
360 : * Call this method once you've created a parser, and want to instruct it
361 : * about the command which caused the parser to be constructed. For example,
362 : * this allows us to select a DTD which can do, say, view-source.
363 : *
364 : * @param aParserCommand the command to set
365 : */
366 : NS_IMETHODIMP_(void)
367 0 : nsParser::SetCommand(eParserCommands aParserCommand)
368 : {
369 0 : mCommand = aParserCommand;
370 0 : }
371 :
372 : /**
373 : * Call this method once you've created a parser, and want to instruct it
374 : * about what charset to load
375 : *
376 : * @param aCharset- the charset of a document
377 : * @param aCharsetSource- the source of the charset
378 : */
379 : NS_IMETHODIMP_(void)
380 3906 : nsParser::SetDocumentCharset(const nsACString& aCharset, PRInt32 aCharsetSource)
381 : {
382 3906 : mCharset = aCharset;
383 3906 : mCharsetSource = aCharsetSource;
384 3906 : if (mParserContext && mParserContext->mScanner) {
385 591 : mParserContext->mScanner->SetDocumentCharset(aCharset, aCharsetSource);
386 : }
387 3906 : }
388 :
389 : void
390 591 : nsParser::SetSinkCharset(nsACString& aCharset)
391 : {
392 591 : if (mSink) {
393 591 : mSink->SetDocumentCharset(aCharset);
394 : }
395 591 : }
396 :
397 : /**
398 : * This method gets called in order to set the content
399 : * sink for this parser to dump nodes to.
400 : *
401 : * @param nsIContentSink interface for node receiver
402 : */
403 : NS_IMETHODIMP_(void)
404 3344 : nsParser::SetContentSink(nsIContentSink* aSink)
405 : {
406 3344 : NS_PRECONDITION(aSink, "sink cannot be null!");
407 3344 : mSink = aSink;
408 :
409 3344 : if (mSink) {
410 3344 : mSink->SetParser(this);
411 6688 : nsCOMPtr<nsIHTMLContentSink> htmlSink = do_QueryInterface(mSink);
412 3344 : if (htmlSink) {
413 28 : mIsAboutBlank = htmlSink->IsAboutBlank();
414 : }
415 : }
416 3344 : }
417 :
418 : /**
419 : * retrieve the sink set into the parser
420 : * @return current sink
421 : */
422 : NS_IMETHODIMP_(nsIContentSink*)
423 1038 : nsParser::GetContentSink()
424 : {
425 1038 : return mSink;
426 : }
427 :
428 : /**
429 : * Determine what DTD mode (and thus what layout nsCompatibility mode)
430 : * to use for this document based on the first chunk of data received
431 : * from the network (each parsercontext can have its own mode). (No,
432 : * this is not an optimal solution -- we really don't need to know until
433 : * after we've received the DOCTYPE, and this could easily be part of
434 : * the regular parsing process if the parser were designed in a way that
435 : * made such modifications easy.)
436 : */
437 :
438 : // Parse the PS production in the SGML spec (excluding the part dealing
439 : // with entity references) starting at theIndex into theBuffer, and
440 : // return the first index after the end of the production.
441 : static PRInt32
442 50 : ParsePS(const nsString& aBuffer, PRInt32 aIndex)
443 : {
444 25 : for (;;) {
445 50 : PRUnichar ch = aBuffer.CharAt(aIndex);
446 50 : if ((ch == PRUnichar(' ')) || (ch == PRUnichar('\t')) ||
447 : (ch == PRUnichar('\n')) || (ch == PRUnichar('\r'))) {
448 25 : ++aIndex;
449 25 : } else if (ch == PRUnichar('-')) {
450 : PRInt32 tmpIndex;
451 0 : if (aBuffer.CharAt(aIndex+1) == PRUnichar('-') &&
452 0 : kNotFound != (tmpIndex=aBuffer.Find("--",false,aIndex+2,-1))) {
453 0 : aIndex = tmpIndex + 2;
454 : } else {
455 0 : return aIndex;
456 : }
457 : } else {
458 25 : return aIndex;
459 : }
460 : }
461 : }
462 :
// Bits OR-ed into ParseDocTypeDecl()'s aResultFlags out-parameter, one per
// doctype component that was recognized.
463 : #define PARSE_DTD_HAVE_DOCTYPE (1<<0)
464 : #define PARSE_DTD_HAVE_PUBLIC_ID (1<<1)
465 : #define PARSE_DTD_HAVE_SYSTEM_ID (1<<2)
466 : #define PARSE_DTD_HAVE_INTERNAL_SUBSET (1<<3)
467 :
// Look for a <!DOCTYPE HTML ...> declaration in aBuffer and pull out its
// parts.
//
// aBuffer      - text to inspect
// aResultFlags - out: reset to 0, then PARSE_DTD_HAVE_* bits are OR-ed in
//                for each component that was recognized
// aPublicID    - out: public identifier with whitespace trimmed and
//                collapsed; only assigned when a PUBLIC id was parsed
// aSystemID    - out: system identifier; only assigned when one was found
//
// Note that flags and aSystemID may already have been updated when a later
// check fails and false is returned.
468 : // return true on success (includes not present), false on failure
469 : static bool
470 28 : ParseDocTypeDecl(const nsString &aBuffer,
471 : PRInt32 *aResultFlags,
472 : nsString &aPublicID,
473 : nsString &aSystemID)
474 : {
475 28 : bool haveDoctype = false;
476 28 : *aResultFlags = 0;
477 :
478 : // Skip through any comments and processing instructions
479 : // The PI-skipping is a bit of a hack.
480 28 : PRInt32 theIndex = 0;
481 0 : do {
482 28 : theIndex = aBuffer.FindChar('<', theIndex);
483 28 : if (theIndex == kNotFound) break;
484 25 : PRUnichar nextChar = aBuffer.CharAt(theIndex+1);
485 25 : if (nextChar == PRUnichar('!')) {
486 25 : PRInt32 tmpIndex = theIndex + 2;
// NOTE(review): the trailing 0 passed to Find() presumably limits the match
// to the given offset rather than searching ahead - confirm against
// nsString::Find.
487 25 : if (kNotFound !=
488 : (theIndex=aBuffer.Find("DOCTYPE", true, tmpIndex, 0))) {
489 25 : haveDoctype = true;
490 25 : theIndex += 7; // skip "DOCTYPE"
491 25 : break;
492 : }
// "<!" that is not a doctype: skip whitespace/comments, then jump to the
// next '>' and keep scanning from there.
493 0 : theIndex = ParsePS(aBuffer, tmpIndex);
494 0 : theIndex = aBuffer.FindChar('>', theIndex);
495 0 : } else if (nextChar == PRUnichar('?')) {
496 0 : theIndex = aBuffer.FindChar('>', theIndex);
497 : } else {
// Any other tag ends the search for a doctype.
498 0 : break;
499 : }
500 : } while (theIndex != kNotFound);
501 :
// No doctype at all counts as success with no flags set.
502 28 : if (!haveDoctype)
503 3 : return true;
504 25 : *aResultFlags |= PARSE_DTD_HAVE_DOCTYPE;
505 :
// The doctype name must be HTML; anything else is reported as a failure.
506 25 : theIndex = ParsePS(aBuffer, theIndex);
507 25 : theIndex = aBuffer.Find("HTML", true, theIndex, 0);
508 25 : if (kNotFound == theIndex)
509 25 : return false;
510 0 : theIndex = ParsePS(aBuffer, theIndex+4);
511 0 : PRInt32 tmpIndex = aBuffer.Find("PUBLIC", true, theIndex, 0);
512 :
513 0 : if (kNotFound != tmpIndex) {
514 0 : theIndex = ParsePS(aBuffer, tmpIndex+6);
515 :
516 : // We get here only if we've read <!DOCTYPE HTML PUBLIC
517 : // (not case sensitive) possibly with comments within.
518 :
519 : // Now find the beginning and end of the public identifier
520 : // and the system identifier (if present).
521 :
// |lit| is the quote character that opens the public identifier; the same
// character must close it.
522 0 : PRUnichar lit = aBuffer.CharAt(theIndex);
523 0 : if ((lit != PRUnichar('\"')) && (lit != PRUnichar('\'')))
524 0 : return false;
525 :
526 : // Start is the first character, excluding the quote, and End is
527 : // the final quote, so there are (end-start) characters.
528 :
529 0 : PRInt32 PublicIDStart = theIndex + 1;
530 0 : PRInt32 PublicIDEnd = aBuffer.FindChar(lit, PublicIDStart);
531 0 : if (kNotFound == PublicIDEnd)
532 0 : return false;
533 0 : theIndex = ParsePS(aBuffer, PublicIDEnd + 1);
534 0 : PRUnichar next = aBuffer.CharAt(theIndex);
535 0 : if (next == PRUnichar('>')) {
536 : // There was a public identifier, but no system
537 : // identifier,
538 : // so do nothing.
539 : // This is needed to avoid the else at the end, and it's
540 : // also the most common case.
541 0 : } else if ((next == PRUnichar('\"')) ||
542 : (next == PRUnichar('\''))) {
543 : // We found a system identifier.
544 0 : *aResultFlags |= PARSE_DTD_HAVE_SYSTEM_ID;
545 0 : PRInt32 SystemIDStart = theIndex + 1;
546 0 : PRInt32 SystemIDEnd = aBuffer.FindChar(next, SystemIDStart);
547 0 : if (kNotFound == SystemIDEnd)
548 0 : return false;
549 : aSystemID =
550 0 : Substring(aBuffer, SystemIDStart, SystemIDEnd - SystemIDStart);
551 0 : } else if (next == PRUnichar('[')) {
552 : // We found an internal subset.
553 0 : *aResultFlags |= PARSE_DTD_HAVE_INTERNAL_SUBSET;
554 : } else {
555 : // Something's wrong.
556 0 : return false;
557 : }
558 :
559 : // Since a public ID is a minimum literal, we must trim
560 : // and collapse whitespace
561 0 : aPublicID = Substring(aBuffer, PublicIDStart, PublicIDEnd - PublicIDStart);
562 0 : aPublicID.CompressWhitespace(true, true);
563 0 : *aResultFlags |= PARSE_DTD_HAVE_PUBLIC_ID;
564 : } else {
// No PUBLIC keyword: accept an optional SYSTEM identifier, after which the
// declaration must either end ('>') or open an internal subset ('[').
565 0 : tmpIndex=aBuffer.Find("SYSTEM", true, theIndex, 0);
566 0 : if (kNotFound != tmpIndex) {
567 : // DOCTYPES with system ID but no Public ID
568 0 : *aResultFlags |= PARSE_DTD_HAVE_SYSTEM_ID;
569 :
570 0 : theIndex = ParsePS(aBuffer, tmpIndex+6);
571 0 : PRUnichar next = aBuffer.CharAt(theIndex);
572 0 : if (next != PRUnichar('\"') && next != PRUnichar('\''))
573 0 : return false;
574 :
575 0 : PRInt32 SystemIDStart = theIndex + 1;
576 0 : PRInt32 SystemIDEnd = aBuffer.FindChar(next, SystemIDStart);
577 :
578 0 : if (kNotFound == SystemIDEnd)
579 0 : return false;
580 : aSystemID =
581 0 : Substring(aBuffer, SystemIDStart, SystemIDEnd - SystemIDStart);
582 0 : theIndex = ParsePS(aBuffer, SystemIDEnd + 1);
583 : }
584 :
585 0 : PRUnichar nextChar = aBuffer.CharAt(theIndex);
586 0 : if (nextChar == PRUnichar('['))
587 0 : *aResultFlags |= PARSE_DTD_HAVE_INTERNAL_SUBSET;
588 0 : else if (nextChar != PRUnichar('>'))
589 0 : return false;
590 : }
591 0 : return true;
592 : }
593 :
// One row of the kPublicIDs table: a public identifier together with the
// modes associated with it. Member order matters because the table rows are
// written as positional aggregate initializers.
594 : struct PubIDInfo
595 : {
596 : enum eMode {
597 : eQuirks, /* always quirks mode, unless there's an internal subset */
598 : eAlmostStandards,/* eCompatibility_AlmostStandards */
599 : eFullStandards /* eCompatibility_FullStandards */
600 : /*
601 : * public IDs that should trigger strict mode are not listed
602 : * since we want all future public IDs to trigger strict mode as
603 : * well
604 : */
605 : };
606 :
// Public identifier, stored in lower case in kPublicIDs.
607 : const char* name;
// NOTE(review): presumably the mode used when the doctype has no system
// identifier / has one, respectively - the lookup code is outside this view.
608 : eMode mode_if_no_sysid;
609 : eMode mode_if_sysid;
610 : };
611 :
// Number of elements in a true array (not a pointer). The argument is
// parenthesized before indexing so that expression arguments are measured
// as a whole instead of having [0] bind to their last operand.
#define ELEMENTS_OF(array_) (sizeof(array_)/sizeof((array_)[0]))
613 :
// Table of known public identifiers. Each row is
//   { name, mode_if_no_sysid, mode_if_sysid }
// in the member order of PubIDInfo.
614 : // These must be in nsCRT::strcmp order so binary-search can be used.
615 : // This is verified, |#ifdef DEBUG|, below.
616 :
617 : // Even though public identifiers should be case sensitive, we will do
618 : // all comparisons after converting to lower case in order to do
619 : // case-insensitive comparison since there are a number of existing web
620 : // sites that use the incorrect case. Therefore all of the public
621 : // identifiers below are in lower case (with the correct case following,
622 : // in comments). The case is verified, |#ifdef DEBUG|, below.
623 : static const PubIDInfo kPublicIDs[] = {
624 : {"+//silmaril//dtd html pro v0r11 19970101//en" /* "+//Silmaril//dtd html Pro v0r11 19970101//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
625 : {"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en" /* "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
626 : {"-//as//dtd html 3.0 aswedit + extensions//en" /* "-//AS//DTD HTML 3.0 asWedit + extensions//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
627 : {"-//ietf//dtd html 2.0 level 1//en" /* "-//IETF//DTD HTML 2.0 Level 1//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
628 : {"-//ietf//dtd html 2.0 level 2//en" /* "-//IETF//DTD HTML 2.0 Level 2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
629 : {"-//ietf//dtd html 2.0 strict level 1//en" /* "-//IETF//DTD HTML 2.0 Strict Level 1//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
630 : {"-//ietf//dtd html 2.0 strict level 2//en" /* "-//IETF//DTD HTML 2.0 Strict Level 2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
631 : {"-//ietf//dtd html 2.0 strict//en" /* "-//IETF//DTD HTML 2.0 Strict//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
632 : {"-//ietf//dtd html 2.0//en" /* "-//IETF//DTD HTML 2.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
633 : {"-//ietf//dtd html 2.1e//en" /* "-//IETF//DTD HTML 2.1E//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
634 : {"-//ietf//dtd html 3.0//en" /* "-//IETF//DTD HTML 3.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
635 : {"-//ietf//dtd html 3.0//en//" /* "-//IETF//DTD HTML 3.0//EN//" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
636 : {"-//ietf//dtd html 3.2 final//en" /* "-//IETF//DTD HTML 3.2 Final//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
637 : {"-//ietf//dtd html 3.2//en" /* "-//IETF//DTD HTML 3.2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
638 : {"-//ietf//dtd html 3//en" /* "-//IETF//DTD HTML 3//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
639 : {"-//ietf//dtd html level 0//en" /* "-//IETF//DTD HTML Level 0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
640 : {"-//ietf//dtd html level 0//en//2.0" /* "-//IETF//DTD HTML Level 0//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
641 : {"-//ietf//dtd html level 1//en" /* "-//IETF//DTD HTML Level 1//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
642 : {"-//ietf//dtd html level 1//en//2.0" /* "-//IETF//DTD HTML Level 1//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
643 : {"-//ietf//dtd html level 2//en" /* "-//IETF//DTD HTML Level 2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
644 : {"-//ietf//dtd html level 2//en//2.0" /* "-//IETF//DTD HTML Level 2//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
645 : {"-//ietf//dtd html level 3//en" /* "-//IETF//DTD HTML Level 3//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
646 : {"-//ietf//dtd html level 3//en//3.0" /* "-//IETF//DTD HTML Level 3//EN//3.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
647 : {"-//ietf//dtd html strict level 0//en" /* "-//IETF//DTD HTML Strict Level 0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
648 : {"-//ietf//dtd html strict level 0//en//2.0" /* "-//IETF//DTD HTML Strict Level 0//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
649 : {"-//ietf//dtd html strict level 1//en" /* "-//IETF//DTD HTML Strict Level 1//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
650 : {"-//ietf//dtd html strict level 1//en//2.0" /* "-//IETF//DTD HTML Strict Level 1//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
651 : {"-//ietf//dtd html strict level 2//en" /* "-//IETF//DTD HTML Strict Level 2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
652 : {"-//ietf//dtd html strict level 2//en//2.0" /* "-//IETF//DTD HTML Strict Level 2//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
653 : {"-//ietf//dtd html strict level 3//en" /* "-//IETF//DTD HTML Strict Level 3//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
654 : {"-//ietf//dtd html strict level 3//en//3.0" /* "-//IETF//DTD HTML Strict Level 3//EN//3.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
655 : {"-//ietf//dtd html strict//en" /* "-//IETF//DTD HTML Strict//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
656 : {"-//ietf//dtd html strict//en//2.0" /* "-//IETF//DTD HTML Strict//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
657 : {"-//ietf//dtd html strict//en//3.0" /* "-//IETF//DTD HTML Strict//EN//3.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
658 : {"-//ietf//dtd html//en" /* "-//IETF//DTD HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
659 : {"-//ietf//dtd html//en//2.0" /* "-//IETF//DTD HTML//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
660 : {"-//ietf//dtd html//en//3.0" /* "-//IETF//DTD HTML//EN//3.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
661 : {"-//metrius//dtd metrius presentational//en" /* "-//Metrius//DTD Metrius Presentational//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
662 : {"-//microsoft//dtd internet explorer 2.0 html strict//en" /* "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
663 : {"-//microsoft//dtd internet explorer 2.0 html//en" /* "-//Microsoft//DTD Internet Explorer 2.0 HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
664 : {"-//microsoft//dtd internet explorer 2.0 tables//en" /* "-//Microsoft//DTD Internet Explorer 2.0 Tables//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
665 : {"-//microsoft//dtd internet explorer 3.0 html strict//en" /* "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
666 : {"-//microsoft//dtd internet explorer 3.0 html//en" /* "-//Microsoft//DTD Internet Explorer 3.0 HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
667 : {"-//microsoft//dtd internet explorer 3.0 tables//en" /* "-//Microsoft//DTD Internet Explorer 3.0 Tables//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
668 : {"-//netscape comm. corp.//dtd html//en" /* "-//Netscape Comm. Corp.//DTD HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
669 : {"-//netscape comm. corp.//dtd strict html//en" /* "-//Netscape Comm. Corp.//DTD Strict HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
670 : {"-//o'reilly and associates//dtd html 2.0//en" /* "-//O'Reilly and Associates//DTD HTML 2.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
671 : {"-//o'reilly and associates//dtd html extended 1.0//en" /* "-//O'Reilly and Associates//DTD HTML Extended 1.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
672 : {"-//o'reilly and associates//dtd html extended relaxed 1.0//en" /* "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
673 : {"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//en" /* "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
674 : {"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//en" /* "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
675 : {"-//spyglass//dtd html 2.0 extended//en" /* "-//Spyglass//DTD HTML 2.0 Extended//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
676 : {"-//sq//dtd html 2.0 hotmetal + extensions//en" /* "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
677 : {"-//sun microsystems corp.//dtd hotjava html//en" /* "-//Sun Microsystems Corp.//DTD HotJava HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
678 : {"-//sun microsystems corp.//dtd hotjava strict html//en" /* "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
679 : {"-//w3c//dtd html 3 1995-03-24//en" /* "-//W3C//DTD HTML 3 1995-03-24//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
680 : {"-//w3c//dtd html 3.2 draft//en" /* "-//W3C//DTD HTML 3.2 Draft//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
681 : {"-//w3c//dtd html 3.2 final//en" /* "-//W3C//DTD HTML 3.2 Final//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
682 : {"-//w3c//dtd html 3.2//en" /* "-//W3C//DTD HTML 3.2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
683 : {"-//w3c//dtd html 3.2s draft//en" /* "-//W3C//DTD HTML 3.2S Draft//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
684 : {"-//w3c//dtd html 4.0 frameset//en" /* "-//W3C//DTD HTML 4.0 Frameset//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
685 : {"-//w3c//dtd html 4.0 transitional//en" /* "-//W3C//DTD HTML 4.0 Transitional//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
686 : {"-//w3c//dtd html 4.01 frameset//en" /* "-//W3C//DTD HTML 4.01 Frameset//EN" */, PubIDInfo::eQuirks, PubIDInfo::eAlmostStandards},
687 : {"-//w3c//dtd html 4.01 transitional//en" /* "-//W3C//DTD HTML 4.01 Transitional//EN" */, PubIDInfo::eQuirks, PubIDInfo::eAlmostStandards},
688 : {"-//w3c//dtd html experimental 19960712//en" /* "-//W3C//DTD HTML Experimental 19960712//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
689 : {"-//w3c//dtd html experimental 970421//en" /* "-//W3C//DTD HTML Experimental 970421//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
690 : {"-//w3c//dtd w3 html//en" /* "-//W3C//DTD W3 HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
691 : {"-//w3c//dtd xhtml 1.0 frameset//en" /* "-//W3C//DTD XHTML 1.0 Frameset//EN" */, PubIDInfo::eAlmostStandards, PubIDInfo::eAlmostStandards},
692 : {"-//w3c//dtd xhtml 1.0 transitional//en" /* "-//W3C//DTD XHTML 1.0 Transitional//EN" */, PubIDInfo::eAlmostStandards, PubIDInfo::eAlmostStandards},
693 : {"-//w3o//dtd w3 html 3.0//en" /* "-//W3O//DTD W3 HTML 3.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
694 : {"-//w3o//dtd w3 html 3.0//en//" /* "-//W3O//DTD W3 HTML 3.0//EN//" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
695 : {"-//w3o//dtd w3 html strict 3.0//en//" /* "-//W3O//DTD W3 HTML Strict 3.0//EN//" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
696 : {"-//webtechs//dtd mozilla html 2.0//en" /* "-//WebTechs//DTD Mozilla HTML 2.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
697 : {"-//webtechs//dtd mozilla html//en" /* "-//WebTechs//DTD Mozilla HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
698 : {"-/w3c/dtd html 4.0 transitional/en" /* "-/W3C/DTD HTML 4.0 Transitional/EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
699 : {"html" /* "HTML" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
700 : };
701 :
702 : #ifdef DEBUG
703 : static void
704 28 : VerifyPublicIDs()
705 : {
706 : static bool gVerified = false;
707 28 : if (!gVerified) {
708 10 : gVerified = true;
709 : PRUint32 i;
710 760 : for (i = 0; i < ELEMENTS_OF(kPublicIDs) - 1; ++i) {
711 750 : if (nsCRT::strcmp(kPublicIDs[i].name, kPublicIDs[i+1].name) >= 0) {
712 0 : NS_NOTREACHED("doctypes out of order");
713 : printf("Doctypes %s and %s out of order.\n",
714 0 : kPublicIDs[i].name, kPublicIDs[i+1].name);
715 : }
716 : }
717 770 : for (i = 0; i < ELEMENTS_OF(kPublicIDs); ++i) {
718 1520 : nsCAutoString lcPubID(kPublicIDs[i].name);
719 760 : ToLowerCase(lcPubID);
720 760 : if (nsCRT::strcmp(kPublicIDs[i].name, lcPubID.get()) != 0) {
721 0 : NS_NOTREACHED("doctype not lower case");
722 0 : printf("Doctype %s not lower case.\n", kPublicIDs[i].name);
723 : }
724 : }
725 : }
726 28 : }
727 : #endif
728 :
729 : static void
730 28 : DetermineHTMLParseMode(const nsString& aBuffer,
731 : nsDTDMode& aParseMode,
732 : eParserDocType& aDocType)
733 : {
734 : #ifdef DEBUG
735 28 : VerifyPublicIDs();
736 : #endif
737 : PRInt32 resultFlags;
738 56 : nsAutoString publicIDUCS2, sysIDUCS2;
739 28 : if (ParseDocTypeDecl(aBuffer, &resultFlags, publicIDUCS2, sysIDUCS2)) {
740 3 : if (!(resultFlags & PARSE_DTD_HAVE_DOCTYPE)) {
741 : // no DOCTYPE
742 3 : aParseMode = eDTDMode_quirks;
743 3 : aDocType = eHTML_Quirks;
744 0 : } else if ((resultFlags & PARSE_DTD_HAVE_INTERNAL_SUBSET) ||
745 0 : !(resultFlags & PARSE_DTD_HAVE_PUBLIC_ID)) {
746 : // A doctype with an internal subset is always full_standards.
747 : // A doctype without a public ID is always full_standards.
748 0 : aDocType = eHTML_Strict;
749 0 : aParseMode = eDTDMode_full_standards;
750 :
751 : // Special hack for IBM's custom DOCTYPE.
752 0 : if (!(resultFlags & PARSE_DTD_HAVE_INTERNAL_SUBSET) &&
753 0 : sysIDUCS2 == NS_LITERAL_STRING(
754 : "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")) {
755 0 : aParseMode = eDTDMode_quirks;
756 0 : aDocType = eHTML_Quirks;
757 : }
758 :
759 : } else {
760 : // We have to check our list of public IDs to see what to do.
761 : // Yes, we want UCS2 to ASCII lossy conversion.
762 0 : nsCAutoString publicID;
763 0 : publicID.AssignWithConversion(publicIDUCS2);
764 :
765 : // See comment above definition of kPublicIDs about case
766 : // sensitivity.
767 0 : ToLowerCase(publicID);
768 :
769 : // Binary search to see if we can find the correct public ID
770 : // These must be signed since maximum can go below zero and we'll
771 : // crash if it's unsigned.
772 0 : PRInt32 minimum = 0;
773 0 : PRInt32 maximum = ELEMENTS_OF(kPublicIDs) - 1;
774 : PRInt32 index;
775 0 : for (;;) {
776 0 : index = (minimum + maximum) / 2;
777 : PRInt32 comparison =
778 0 : nsCRT::strcmp(publicID.get(), kPublicIDs[index].name);
779 0 : if (comparison == 0)
780 : break;
781 0 : if (comparison < 0)
782 0 : maximum = index - 1;
783 : else
784 0 : minimum = index + 1;
785 :
786 0 : if (maximum < minimum) {
787 : // The DOCTYPE is not in our list, so it must be full_standards.
788 0 : aParseMode = eDTDMode_full_standards;
789 0 : aDocType = eHTML_Strict;
790 : return;
791 : }
792 : }
793 :
794 0 : switch ((resultFlags & PARSE_DTD_HAVE_SYSTEM_ID)
795 : ? kPublicIDs[index].mode_if_sysid
796 : : kPublicIDs[index].mode_if_no_sysid)
797 : {
798 : case PubIDInfo::eQuirks:
799 0 : aParseMode = eDTDMode_quirks;
800 0 : aDocType = eHTML_Quirks;
801 0 : break;
802 : case PubIDInfo::eAlmostStandards:
803 0 : aParseMode = eDTDMode_almost_standards;
804 0 : aDocType = eHTML_Strict;
805 0 : break;
806 : case PubIDInfo::eFullStandards:
807 0 : aParseMode = eDTDMode_full_standards;
808 0 : aDocType = eHTML_Strict;
809 0 : break;
810 : default:
811 0 : NS_NOTREACHED("no other cases!");
812 : }
813 : }
814 : } else {
815 : // badly formed DOCTYPE -> quirks
816 25 : aParseMode = eDTDMode_quirks;
817 25 : aDocType = eHTML_Quirks;
818 : }
819 : }
820 :
821 : static void
822 3341 : DetermineParseMode(const nsString& aBuffer, nsDTDMode& aParseMode,
823 : eParserDocType& aDocType, const nsACString& aMimeType)
824 : {
825 3341 : if (aMimeType.EqualsLiteral(TEXT_HTML)) {
826 28 : DetermineHTMLParseMode(aBuffer, aParseMode, aDocType);
827 26504 : } else if (aMimeType.EqualsLiteral(TEXT_PLAIN) ||
828 3313 : aMimeType.EqualsLiteral(TEXT_CSS) ||
829 3313 : aMimeType.EqualsLiteral(APPLICATION_JAVASCRIPT) ||
830 3313 : aMimeType.EqualsLiteral(APPLICATION_XJAVASCRIPT) ||
831 3313 : aMimeType.EqualsLiteral(APPLICATION_JSON) ||
832 3313 : aMimeType.EqualsLiteral(TEXT_ECMASCRIPT) ||
833 3313 : aMimeType.EqualsLiteral(APPLICATION_ECMASCRIPT) ||
834 3313 : aMimeType.EqualsLiteral(TEXT_JAVASCRIPT)) {
835 0 : aDocType = ePlainText;
836 0 : aParseMode = eDTDMode_quirks;
837 : } else { // Some form of XML
838 3313 : aDocType = eXML;
839 3313 : aParseMode = eDTDMode_full_standards;
840 : }
841 3341 : }
842 :
843 : static nsIDTD*
844 3342 : FindSuitableDTD(CParserContext& aParserContext)
845 : {
846 : // We always find a DTD.
847 3342 : aParserContext.mAutoDetectStatus = ePrimaryDetect;
848 :
849 : // Quick check for view source.
850 3342 : NS_ABORT_IF_FALSE(aParserContext.mParserCommand != eViewSource,
851 : "The old parser is not supposed to be used for View Source anymore.");
852 :
853 : // Now see if we're parsing HTML (which, as far as we're concerned, simply
854 : // means "not XML").
855 3342 : if (aParserContext.mDocType != eXML) {
856 28 : return new CNavDTD();
857 : }
858 :
859 : // If we're here, then we'd better be parsing XML.
860 3314 : NS_ASSERTION(aParserContext.mDocType == eXML, "What are you trying to send me, here?");
861 3314 : return new nsExpatDriver();
862 : }
863 :
864 : NS_IMETHODIMP
865 65 : nsParser::CancelParsingEvents()
866 : {
867 65 : if (mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT) {
868 0 : NS_ASSERTION(mContinueEvent, "mContinueEvent is null");
869 : // Revoke the pending continue parsing event
870 0 : mContinueEvent = nsnull;
871 0 : mFlags &= ~NS_PARSER_FLAG_PENDING_CONTINUE_EVENT;
872 : }
873 65 : return NS_OK;
874 : }
875 :
876 : ////////////////////////////////////////////////////////////////////////
877 :
/**
 * Evaluates EXPR1 and EXPR2 exactly once each, in that order.  Stores the
 * value of EXPR2 in RV if EXPR2 fails; otherwise RV contains the result of
 * EXPR1 (which could be success or failure).
 *
 * In other words, a failure of the latter expression takes precedence over
 * whatever the former expression returned.
 *
 * To understand the motivation for this construct, consider these example
 * methods:
 *
 *   nsresult nsSomething::DoThatThing(nsIWhatever* obj) {
 *     nsresult rv = NS_OK;
 *     ...
 *     rv = obj->DoThatThing();
 *     NS_ENSURE_SUCCESS(rv, rv);
 *     ...
 *     return rv;
 *   }
 *
 *   nsresult nsCaller::MakeThingsHappen() {
 *     return mSomething->DoThatThing(mWhatever);
 *   }
 *
 * Suppose, for whatever reason*, we want to shift responsibility for calling
 * mWhatever->DoThatThing() from nsSomething::DoThatThing up to
 * nsCaller::MakeThingsHappen.  We might rewrite the two methods as follows:
 *
 *   nsresult nsSomething::DoThatThing() {
 *     nsresult rv = NS_OK;
 *     ...
 *     ...
 *     return rv;
 *   }
 *
 *   nsresult nsCaller::MakeThingsHappen() {
 *     nsresult rv;
 *     PREFER_LATTER_ERROR_CODE(mSomething->DoThatThing(),
 *                              mWhatever->DoThatThing(),
 *                              rv);
 *     return rv;
 *   }
 *
 * *Possible reasons include: nsCaller doesn't want to give mSomething access
 * to mWhatever, nsCaller wants to guarantee that mWhatever->DoThatThing() will
 * be called regardless of how nsSomething::DoThatThing behaves, &c.
 */
#define PREFER_LATTER_ERROR_CODE(EXPR1, EXPR2, RV) {                          \
  nsresult RV##__temp = EXPR1;                                                \
  RV = EXPR2;                                                                 \
  /* Keep EXPR2's failure; fall back to EXPR1 only when EXPR2 succeeded. */   \
  if (!NS_FAILED(RV)) {                                                       \
    RV = RV##__temp;                                                          \
  }                                                                           \
}
929 :
930 : /**
931 : * This gets called just prior to the model actually
932 : * being constructed. It's important to make this the
933 : * last thing that happens right before parsing, so we
934 : * can delay until the last moment the resolution of
935 : * which DTD to use (unless of course we're assigned one).
936 : */
937 : nsresult
938 6757 : nsParser::WillBuildModel(nsString& aFilename)
939 : {
940 6757 : if (!mParserContext)
941 0 : return kInvalidParserContext;
942 :
943 6757 : if (eUnknownDetect != mParserContext->mAutoDetectStatus)
944 3415 : return NS_OK;
945 :
946 3342 : if (eDTDMode_unknown == mParserContext->mDTDMode ||
947 : eDTDMode_autodetect == mParserContext->mDTDMode) {
948 : PRUnichar buf[1025];
949 6682 : nsFixedString theBuffer(buf, 1024, 0);
950 :
951 : // Grab 1024 characters, starting at the first non-whitespace
952 : // character, to look for the doctype in.
953 3341 : mParserContext->mScanner->Peek(theBuffer, 1024, mParserContext->mScanner->FirstNonWhitespacePosition());
954 : DetermineParseMode(theBuffer, mParserContext->mDTDMode,
955 3341 : mParserContext->mDocType, mParserContext->mMimeType);
956 : }
957 :
958 3342 : NS_ASSERTION(!mDTD || !mParserContext->mPrevContext,
959 : "Clobbering DTD for non-root parser context!");
960 3342 : mDTD = FindSuitableDTD(*mParserContext);
961 3342 : NS_ENSURE_TRUE(mDTD, NS_ERROR_OUT_OF_MEMORY);
962 :
963 : nsITokenizer* tokenizer;
964 3342 : nsresult rv = mParserContext->GetTokenizer(mDTD, mSink, tokenizer);
965 3342 : NS_ENSURE_SUCCESS(rv, rv);
966 :
967 3342 : rv = mDTD->WillBuildModel(*mParserContext, tokenizer, mSink);
968 3342 : nsresult sinkResult = mSink->WillBuildModel(mDTD->GetMode());
969 : // nsIDTD::WillBuildModel used to be responsible for calling
970 : // nsIContentSink::WillBuildModel, but that obligation isn't expressible
971 : // in the nsIDTD interface itself, so it's sounder and simpler to give that
972 : // responsibility back to the parser. The former behavior of the DTD was to
973 : // NS_ENSURE_SUCCESS the sink WillBuildModel call, so if the sink returns
974 : // failure we should use sinkResult instead of rv, to preserve the old error
975 : // handling behavior of the DTD:
976 3342 : return NS_FAILED(sinkResult) ? sinkResult : rv;
977 : }
978 :
979 : /**
980 : * This gets called when the parser is done with its input.
981 : * Note that the parser may have been called recursively, so we
982 : * have to check for a prev. context before closing out the DTD/sink.
983 : */
984 : nsresult
985 3342 : nsParser::DidBuildModel(nsresult anErrorCode)
986 : {
987 3342 : nsresult result = anErrorCode;
988 :
989 3342 : if (IsComplete()) {
990 3342 : if (mParserContext && !mParserContext->mPrevContext) {
991 : // Let sink know if we're about to end load because we've been terminated.
992 : // In that case we don't want it to run deferred scripts.
993 3342 : bool terminated = mInternalState == NS_ERROR_HTMLPARSER_STOPPARSING;
994 3342 : if (mDTD && mSink) {
995 3342 : nsresult dtdResult = mDTD->DidBuildModel(anErrorCode),
996 3342 : sinkResult = mSink->DidBuildModel(terminated);
997 : // nsIDTD::DidBuildModel used to be responsible for calling
998 : // nsIContentSink::DidBuildModel, but that obligation isn't expressible
999 : // in the nsIDTD interface itself, so it's sounder and simpler to give
1000 : // that responsibility back to the parser. The former behavior of the
1001 : // DTD was to NS_ENSURE_SUCCESS the sink DidBuildModel call, so if the
1002 : // sink returns failure we should use sinkResult instead of dtdResult,
1003 : // to preserve the old error handling behavior of the DTD:
1004 3342 : result = NS_FAILED(sinkResult) ? sinkResult : dtdResult;
1005 : }
1006 :
1007 : //Ref. to bug 61462.
1008 3342 : mParserContext->mRequest = 0;
1009 : }
1010 : }
1011 :
1012 3342 : return result;
1013 : }
1014 :
1015 : /**
1016 : * This method adds a new parser context to the list,
1017 : * pushing the current one to the next position.
1018 : *
1019 : * @param ptr to new context
1020 : */
1021 : void
1022 3344 : nsParser::PushContext(CParserContext& aContext)
1023 : {
1024 3344 : NS_ASSERTION(aContext.mPrevContext == mParserContext,
1025 : "Trying to push a context whose previous context differs from "
1026 : "the current parser context.");
1027 3344 : mParserContext = &aContext;
1028 3344 : }
1029 :
1030 : /**
1031 : * This method pops the topmost context off the stack,
1032 : * returning it to the user. The next context (if any)
1033 : * becomes the current context.
1034 : * @update gess7/22/98
1035 : * @return prev. context
1036 : */
1037 : CParserContext*
1038 0 : nsParser::PopContext()
1039 : {
1040 0 : CParserContext* oldContext = mParserContext;
1041 0 : if (oldContext) {
1042 0 : mParserContext = oldContext->mPrevContext;
1043 0 : if (mParserContext) {
1044 : // If the old context was blocked, propagate the blocked state
1045 : // back to the new one. Also, propagate the stream listener state
1046 : // but don't override onStop state to guarantee the call to DidBuildModel().
1047 0 : if (mParserContext->mStreamListenerState != eOnStop) {
1048 0 : mParserContext->mStreamListenerState = oldContext->mStreamListenerState;
1049 : }
1050 : // Update the current context's tokenizer to any information gleaned
1051 : // while parsing document.write() calls (such as "a plaintext tag was
1052 : // found")
1053 0 : if (mParserContext->mTokenizer) {
1054 0 : mParserContext->mTokenizer->CopyState(oldContext->mTokenizer);
1055 : }
1056 : }
1057 : }
1058 0 : return oldContext;
1059 : }
1060 :
1061 : /**
1062 : * Call this when you want control whether or not the parser will parse
1063 : * and tokenize input (TRUE), or whether it just caches input to be
1064 : * parsed later (FALSE).
1065 : *
1066 : * @param aState determines whether we parse/tokenize or just cache.
1067 : * @return current state
1068 : */
1069 : void
1070 0 : nsParser::SetUnusedInput(nsString& aBuffer)
1071 : {
1072 0 : mUnusedInput = aBuffer;
1073 0 : }
1074 :
1075 : /**
1076 : * Call this when you want to *force* the parser to terminate the
1077 : * parsing process altogether. This is binary -- so once you terminate
1078 : * you can't resume without restarting altogether.
1079 : */
1080 : NS_IMETHODIMP
1081 65 : nsParser::Terminate(void)
1082 : {
1083 : // We should only call DidBuildModel once, so don't do anything if this is
1084 : // the second time that Terminate has been called.
1085 65 : if (mInternalState == NS_ERROR_HTMLPARSER_STOPPARSING) {
1086 0 : return NS_OK;
1087 : }
1088 :
1089 65 : nsresult result = NS_OK;
1090 : // XXX - [ until we figure out a way to break parser-sink circularity ]
1091 : // Hack - Hold a reference until we are completely done...
1092 130 : nsCOMPtr<nsIParser> kungFuDeathGrip(this);
1093 65 : mInternalState = result = NS_ERROR_HTMLPARSER_STOPPARSING;
1094 :
1095 : // CancelParsingEvents must be called to avoid leaking the nsParser object
1096 : // @see bug 108049
1097 : // If NS_PARSER_FLAG_PENDING_CONTINUE_EVENT is set then CancelParsingEvents
1098 : // will reset it so DidBuildModel will call DidBuildModel on the DTD. Note:
1099 : // The IsComplete() call inside of DidBuildModel looks at the pendingContinueEvents flag.
1100 65 : CancelParsingEvents();
1101 :
1102 : // If we got interrupted in the middle of a document.write, then we might
1103 : // have more than one parser context on our parsercontext stack. This has
1104 : // the effect of making DidBuildModel a no-op, meaning that we never call
1105 : // our sink's DidBuildModel and break the reference cycle, causing a leak.
1106 : // Since we're getting terminated, we manually clean up our context stack.
1107 130 : while (mParserContext && mParserContext->mPrevContext) {
1108 0 : CParserContext *prev = mParserContext->mPrevContext;
1109 0 : delete mParserContext;
1110 0 : mParserContext = prev;
1111 : }
1112 :
1113 65 : if (mDTD) {
1114 65 : mDTD->Terminate();
1115 65 : DidBuildModel(result);
1116 0 : } else if (mSink) {
1117 : // We have no parser context or no DTD yet (so we got terminated before we
1118 : // got any data). Manually break the reference cycle with the sink.
1119 0 : result = mSink->DidBuildModel(true);
1120 0 : NS_ENSURE_SUCCESS(result, result);
1121 : }
1122 :
1123 65 : return NS_OK;
1124 : }
1125 :
1126 : NS_IMETHODIMP
1127 0 : nsParser::ContinueInterruptedParsing()
1128 : {
1129 : // If there are scripts executing, then the content sink is jumping the gun
1130 : // (probably due to a synchronous XMLHttpRequest) and will re-enable us
1131 : // later, see bug 460706.
1132 0 : if (!IsOkToProcessNetworkData()) {
1133 0 : return NS_OK;
1134 : }
1135 :
1136 : // If the stream has already finished, there's a good chance
1137 : // that we might start closing things down when the parser
1138 : // is reenabled. To make sure that we're not deleted across
1139 : // the reenabling process, hold a reference to ourselves.
1140 0 : nsresult result=NS_OK;
1141 0 : nsCOMPtr<nsIParser> kungFuDeathGrip(this);
1142 :
1143 : #ifdef DEBUG
1144 0 : if (!(mFlags & NS_PARSER_FLAG_PARSER_ENABLED)) {
1145 0 : NS_WARNING("Don't call ContinueInterruptedParsing on a blocked parser.");
1146 : }
1147 : #endif
1148 :
1149 : bool isFinalChunk = mParserContext &&
1150 0 : mParserContext->mStreamListenerState == eOnStop;
1151 :
1152 0 : mProcessingNetworkData = true;
1153 0 : if (mSink) {
1154 0 : mSink->WillParse();
1155 : }
1156 0 : result = ResumeParse(true, isFinalChunk); // Ref. bug 57999
1157 0 : mProcessingNetworkData = false;
1158 :
1159 0 : if (result != NS_OK) {
1160 0 : result=mInternalState;
1161 : }
1162 :
1163 0 : return result;
1164 : }
1165 :
1166 : /**
1167 : * Stops parsing temporarily. That's it will prevent the
1168 : * parser from building up content model.
1169 : */
1170 : NS_IMETHODIMP_(void)
1171 0 : nsParser::BlockParser()
1172 : {
1173 0 : mFlags &= ~NS_PARSER_FLAG_PARSER_ENABLED;
1174 0 : }
1175 :
1176 : /**
1177 : * Open up the parser for tokenization, building up content
1178 : * model..etc. However, this method does not resume parsing
1179 : * automatically. It's the callers' responsibility to restart
1180 : * the parsing engine.
1181 : */
1182 : NS_IMETHODIMP_(void)
1183 0 : nsParser::UnblockParser()
1184 : {
1185 0 : if (!(mFlags & NS_PARSER_FLAG_PARSER_ENABLED)) {
1186 0 : mFlags |= NS_PARSER_FLAG_PARSER_ENABLED;
1187 : } else {
1188 0 : NS_WARNING("Trying to unblock an unblocked parser.");
1189 : }
1190 0 : }
1191 :
1192 : NS_IMETHODIMP_(void)
1193 0 : nsParser::ContinueInterruptedParsingAsync()
1194 : {
1195 0 : mSink->ContinueInterruptedParsingAsync();
1196 0 : }
1197 :
1198 : /**
1199 : * Call this to query whether the parser is enabled or not.
1200 : */
1201 : NS_IMETHODIMP_(bool)
1202 0 : nsParser::IsParserEnabled()
1203 : {
1204 0 : return (mFlags & NS_PARSER_FLAG_PARSER_ENABLED) != 0;
1205 : }
1206 :
1207 : /**
1208 : * Call this to query whether the parser thinks it's done with parsing.
1209 : */
1210 : NS_IMETHODIMP_(bool)
1211 3342 : nsParser::IsComplete()
1212 : {
1213 3342 : return !(mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT);
1214 : }
1215 :
1216 :
1217 0 : void nsParser::HandleParserContinueEvent(nsParserContinueEvent *ev)
1218 : {
1219 : // Ignore any revoked continue events...
1220 0 : if (mContinueEvent != ev)
1221 0 : return;
1222 :
1223 0 : mFlags &= ~NS_PARSER_FLAG_PENDING_CONTINUE_EVENT;
1224 0 : mContinueEvent = nsnull;
1225 :
1226 0 : NS_ASSERTION(IsOkToProcessNetworkData(),
1227 : "Interrupted in the middle of a script?");
1228 0 : ContinueInterruptedParsing();
1229 : }
1230 :
1231 : bool
1232 0 : nsParser::IsInsertionPointDefined()
1233 : {
1234 0 : return true;
1235 : }
1236 :
1237 : void
1238 0 : nsParser::BeginEvaluatingParserInsertedScript()
1239 : {
1240 0 : }
1241 :
1242 : void
1243 0 : nsParser::EndEvaluatingParserInsertedScript()
1244 : {
1245 0 : }
1246 :
1247 : void
1248 0 : nsParser::MarkAsNotScriptCreated(const char* aCommand)
1249 : {
1250 0 : }
1251 :
1252 : bool
1253 0 : nsParser::IsScriptCreated()
1254 : {
1255 0 : return false;
1256 : }
1257 :
1258 : /**
1259 : * This is the main controlling routine in the parsing process.
1260 : * Note that it may get called multiple times for the same scanner,
1261 : * since this is a pushed based system, and all the tokens may
1262 : * not have been consumed by the scanner during a given invocation
1263 : * of this method.
1264 : */
1265 : NS_IMETHODIMP
1266 3343 : nsParser::Parse(nsIURI* aURL,
1267 : nsIRequestObserver* aListener,
1268 : void* aKey,
1269 : nsDTDMode aMode)
1270 : {
1271 :
1272 3343 : NS_PRECONDITION(aURL, "Error: Null URL given");
1273 :
1274 3343 : nsresult result=kBadURL;
1275 3343 : mObserver = aListener;
1276 :
1277 3343 : if (aURL) {
1278 6686 : nsCAutoString spec;
1279 3343 : nsresult rv = aURL->GetSpec(spec);
1280 3343 : if (rv != NS_OK) {
1281 0 : return rv;
1282 : }
1283 10029 : NS_ConvertUTF8toUTF16 theName(spec);
1284 :
1285 : nsScanner* theScanner = new nsScanner(theName, false, mCharset,
1286 3343 : mCharsetSource);
1287 : CParserContext* pc = new CParserContext(mParserContext, theScanner, aKey,
1288 3343 : mCommand, aListener);
1289 3343 : if (pc && theScanner) {
1290 3343 : pc->mMultipart = true;
1291 3343 : pc->mContextType = CParserContext::eCTURL;
1292 3343 : pc->mDTDMode = aMode;
1293 3343 : PushContext(*pc);
1294 :
1295 3343 : result = NS_OK;
1296 : } else {
1297 0 : result = mInternalState = NS_ERROR_HTMLPARSER_BADCONTEXT;
1298 : }
1299 : }
1300 3343 : return result;
1301 : }
1302 :
1303 : /**
1304 : * Call this method if all you want to do is parse 1 string full of HTML text.
1305 : * In particular, this method should be called by the DOM when it has an HTML
1306 : * string to feed to the parser in real-time.
1307 : *
1308 : * @param aSourceBuffer contains a string-full of real content
1309 : * @param aMimeType tells us what type of content to expect in the given string
1310 : */
1311 : NS_IMETHODIMP
1312 3 : nsParser::Parse(const nsAString& aSourceBuffer,
1313 : void* aKey,
1314 : const nsACString& aMimeType,
1315 : bool aLastCall,
1316 : nsDTDMode aMode)
1317 : {
1318 3 : nsresult result = NS_OK;
1319 :
1320 : // Don't bother if we're never going to parse this.
1321 3 : if (mInternalState == NS_ERROR_HTMLPARSER_STOPPARSING) {
1322 0 : return result;
1323 : }
1324 :
1325 3 : if (!aLastCall && aSourceBuffer.IsEmpty()) {
1326 : // Nothing is being passed to the parser so return
1327 : // immediately. mUnusedInput will get processed when
1328 : // some data is actually passed in.
1329 : // But if this is the last call, make sure to finish up
1330 : // stuff correctly.
1331 0 : return result;
1332 : }
1333 :
1334 : // Hack to pass on to the dtd the caller's desire to
1335 : // parse a fragment without worrying about containment rules
1336 3 : if (aMode == eDTDMode_fragment)
1337 0 : mCommand = eViewFragment;
1338 :
1339 : // Maintain a reference to ourselves so we don't go away
1340 : // till we're completely done.
1341 6 : nsCOMPtr<nsIParser> kungFuDeathGrip(this);
1342 :
1343 3 : if (aLastCall || !aSourceBuffer.IsEmpty() || !mUnusedInput.IsEmpty()) {
1344 : // Note: The following code will always find the parser context associated
1345 : // with the given key, even if that context has been suspended (e.g., for
1346 : // another document.write call). This doesn't appear to be exactly what IE
1347 : // does in the case where this happens, but this makes more sense.
1348 3 : CParserContext* pc = mParserContext;
1349 6 : while (pc && pc->mKey != aKey) {
1350 0 : pc = pc->mPrevContext;
1351 : }
1352 :
1353 3 : if (!pc) {
1354 : // Only make a new context if we don't have one, OR if we do, but has a
1355 : // different context key.
1356 1 : nsScanner* theScanner = new nsScanner(mUnusedInput, mCharset, mCharsetSource);
1357 1 : NS_ENSURE_TRUE(theScanner, NS_ERROR_OUT_OF_MEMORY);
1358 :
1359 1 : eAutoDetectResult theStatus = eUnknownDetect;
1360 :
1361 1 : if (mParserContext && mParserContext->mMimeType == aMimeType) {
1362 : // Ref. Bug 90379
1363 0 : NS_ASSERTION(mDTD, "How come the DTD is null?");
1364 :
1365 0 : if (mParserContext) {
1366 0 : theStatus = mParserContext->mAutoDetectStatus;
1367 : // Added this to fix bug 32022.
1368 : }
1369 : }
1370 :
1371 : pc = new CParserContext(mParserContext, theScanner, aKey, mCommand,
1372 1 : 0, theStatus, aLastCall);
1373 1 : NS_ENSURE_TRUE(pc, NS_ERROR_OUT_OF_MEMORY);
1374 :
1375 1 : PushContext(*pc);
1376 :
1377 1 : pc->mMultipart = !aLastCall; // By default
1378 1 : if (pc->mPrevContext) {
1379 0 : pc->mMultipart |= pc->mPrevContext->mMultipart;
1380 : }
1381 :
1382 : // Start fix bug 40143
1383 1 : if (pc->mMultipart) {
1384 1 : pc->mStreamListenerState = eOnDataAvail;
1385 1 : if (pc->mScanner) {
1386 1 : pc->mScanner->SetIncremental(true);
1387 : }
1388 : } else {
1389 0 : pc->mStreamListenerState = eOnStop;
1390 0 : if (pc->mScanner) {
1391 0 : pc->mScanner->SetIncremental(false);
1392 : }
1393 : }
1394 : // end fix for 40143
1395 :
1396 1 : pc->mContextType=CParserContext::eCTString;
1397 1 : pc->SetMimeType(aMimeType);
1398 1 : if (pc->mPrevContext && aMode == eDTDMode_autodetect) {
1399 : // Preserve the DTD mode from the last context, bug 265814.
1400 0 : pc->mDTDMode = pc->mPrevContext->mDTDMode;
1401 : } else {
1402 1 : pc->mDTDMode = aMode;
1403 : }
1404 :
1405 1 : mUnusedInput.Truncate();
1406 :
1407 1 : pc->mScanner->Append(aSourceBuffer);
1408 : // Do not interrupt document.write() - bug 95487
1409 1 : result = ResumeParse(false, false, false);
1410 : } else {
1411 2 : pc->mScanner->Append(aSourceBuffer);
1412 2 : if (!pc->mPrevContext) {
1413 : // Set stream listener state to eOnStop, on the final context - Fix 68160,
1414 : // to guarantee DidBuildModel() call - Fix 36148
1415 2 : if (aLastCall) {
1416 1 : pc->mStreamListenerState = eOnStop;
1417 1 : pc->mScanner->SetIncremental(false);
1418 : }
1419 :
1420 2 : if (pc == mParserContext) {
1421 : // If pc is not mParserContext, then this call to ResumeParse would
1422 : // do the wrong thing and try to continue parsing using
1423 : // mParserContext. We need to wait to actually resume parsing on pc.
1424 2 : ResumeParse(false, false, false);
1425 : }
1426 : }
1427 : }
1428 : }
1429 :
1430 3 : return result;
1431 : }
1432 :
1433 : NS_IMETHODIMP
1434 1 : nsParser::ParseFragment(const nsAString& aSourceBuffer,
1435 : nsTArray<nsString>& aTagStack)
1436 : {
1437 1 : nsresult result = NS_OK;
1438 2 : nsAutoString theContext;
1439 1 : PRUint32 theCount = aTagStack.Length();
1440 1 : PRUint32 theIndex = 0;
1441 :
1442 : // Disable observers for fragments
1443 1 : mFlags &= ~NS_PARSER_FLAG_OBSERVERS_ENABLED;
1444 :
1445 2 : for (theIndex = 0; theIndex < theCount; theIndex++) {
1446 1 : theContext.AppendLiteral("<");
1447 1 : theContext.Append(aTagStack[theCount - theIndex - 1]);
1448 1 : theContext.AppendLiteral(">");
1449 : }
1450 :
1451 1 : if (theCount == 0) {
1452 : // Ensure that the buffer is not empty. Because none of the DTDs care
1453 : // about leading whitespace, this doesn't change the result.
1454 0 : theContext.AssignLiteral(" ");
1455 : }
1456 :
1457 : // First, parse the context to build up the DTD's tag stack. Note that we
1458 : // pass false for the aLastCall parameter.
1459 : result = Parse(theContext,
1460 : (void*)&theContext,
1461 1 : NS_LITERAL_CSTRING("application/xml"),
1462 : false,
1463 1 : eDTDMode_full_standards);
1464 1 : if (NS_FAILED(result)) {
1465 0 : mFlags |= NS_PARSER_FLAG_OBSERVERS_ENABLED;
1466 0 : return result;
1467 : }
1468 :
1469 1 : if (!mSink) {
1470 : // Parse must have failed in the XML case and so the sink was killed.
1471 0 : return NS_ERROR_HTMLPARSER_STOPPARSING;
1472 : }
1473 :
1474 2 : nsCOMPtr<nsIFragmentContentSink> fragSink = do_QueryInterface(mSink);
1475 1 : NS_ASSERTION(fragSink, "ParseFragment requires a fragment content sink");
1476 :
1477 1 : fragSink->WillBuildContent();
1478 : // Now, parse the actual content. Note that this is the last call
1479 : // for HTML content, but for XML, we will want to build and parse
1480 : // the end tags. However, if tagStack is empty, it's the last call
1481 : // for XML as well.
1482 1 : if (theCount == 0) {
1483 : result = Parse(aSourceBuffer,
1484 : &theContext,
1485 0 : NS_LITERAL_CSTRING("application/xml"),
1486 : true,
1487 0 : eDTDMode_full_standards);
1488 0 : fragSink->DidBuildContent();
1489 : } else {
1490 : // Add an end tag chunk, so expat will read the whole source buffer,
1491 : // and not worry about ']]' etc.
1492 2 : result = Parse(aSourceBuffer + NS_LITERAL_STRING("</"),
1493 : &theContext,
1494 1 : NS_LITERAL_CSTRING("application/xml"),
1495 : false,
1496 2 : eDTDMode_full_standards);
1497 1 : fragSink->DidBuildContent();
1498 :
1499 1 : if (NS_SUCCEEDED(result)) {
1500 2 : nsAutoString endContext;
1501 2 : for (theIndex = 0; theIndex < theCount; theIndex++) {
1502 : // we already added an end tag chunk above
1503 1 : if (theIndex > 0) {
1504 0 : endContext.AppendLiteral("</");
1505 : }
1506 :
1507 1 : nsString& thisTag = aTagStack[theIndex];
1508 : // was there an xmlns=?
1509 1 : PRInt32 endOfTag = thisTag.FindChar(PRUnichar(' '));
1510 1 : if (endOfTag == -1) {
1511 0 : endContext.Append(thisTag);
1512 : } else {
1513 1 : endContext.Append(Substring(thisTag,0,endOfTag));
1514 : }
1515 :
1516 1 : endContext.AppendLiteral(">");
1517 : }
1518 :
1519 : result = Parse(endContext,
1520 : &theContext,
1521 1 : NS_LITERAL_CSTRING("application/xml"),
1522 : true,
1523 1 : eDTDMode_full_standards);
1524 : }
1525 : }
1526 :
1527 1 : mFlags |= NS_PARSER_FLAG_OBSERVERS_ENABLED;
1528 :
1529 1 : return result;
1530 : }
1531 :
1532 : /**
1533 : * This routine is called to cause the parser to continue parsing its
1534 : * underlying stream. This call allows the parse process to happen in
1535 : * chunks, such as when the content is push based, and we need to parse in
1536 : * pieces.
1537 : *
1538 : * An interesting change in how the parser gets used has led us to add extra
1539 : * processing to this method. The case occurs when the parser is blocked in
1540 : * one context, and gets a parse(string) call in another context. In this
1541 : * case, the parserContexts are linked. No problem.
1542 : *
1543 : * The problem is that Parse(string) assumes that it can proceed unabated,
1544 : * but if the parser is already blocked that assumption is false. So we
1545 : * needed to add a mechanism here to allow the parser to continue to process
1546 : * (the pop and free) contexts until 1) it gets blocked again; 2) it runs
1547 : * out of contexts.
1548 : *
1549 : *
1550 : * @param allowIteration : set to true if non-script resumption is requested
1551 : * @param aIsFinalChunk : tells us when the last chunk of data is provided.
1552 : * @return error code -- 0 if ok, non-zero if error.
1553 : */
1554 : nsresult
1555 6768 : nsParser::ResumeParse(bool allowIteration, bool aIsFinalChunk,
1556 : bool aCanInterrupt)
1557 : {
1558 6768 : nsresult result = NS_OK;
1559 :
1560 6768 : if ((mFlags & NS_PARSER_FLAG_PARSER_ENABLED) &&
1561 : mInternalState != NS_ERROR_HTMLPARSER_STOPPARSING) {
1562 :
1563 6757 : result = WillBuildModel(mParserContext->mScanner->GetFilename());
1564 6757 : if (NS_FAILED(result)) {
1565 0 : mFlags &= ~NS_PARSER_FLAG_CAN_TOKENIZE;
1566 0 : return result;
1567 : }
1568 :
1569 6757 : if (mDTD) {
1570 6757 : mSink->WillResume();
1571 6757 : bool theIterationIsOk = true;
1572 :
1573 16929 : while (result == NS_OK && theIterationIsOk) {
1574 6757 : if (!mUnusedInput.IsEmpty() && mParserContext->mScanner) {
1575 : // -- Ref: Bug# 22485 --
1576 : // Insert the unused input into the source buffer
1577 : // as if it was read from the input stream.
1578 : // Adding UngetReadable() per vidur!!
1579 0 : mParserContext->mScanner->UngetReadable(mUnusedInput);
1580 0 : mUnusedInput.Truncate(0);
1581 : }
1582 :
1583 : // Only allow parsing to be interrupted in the subsequent call to
1584 : // build model.
1585 : nsresult theTokenizerResult = (mFlags & NS_PARSER_FLAG_CAN_TOKENIZE)
1586 6757 : ? Tokenize(aIsFinalChunk)
1587 13514 : : NS_OK;
1588 6757 : result = BuildModel();
1589 :
1590 6757 : if (result == NS_ERROR_HTMLPARSER_INTERRUPTED && aIsFinalChunk) {
1591 0 : PostContinueEvent();
1592 : }
1593 :
1594 : theIterationIsOk = theTokenizerResult != kEOF &&
1595 6757 : result != NS_ERROR_HTMLPARSER_INTERRUPTED;
1596 :
1597 : // Make sure not to stop parsing too early. Therefore, before shutting
1598 : // down the parser, it's important to check whether the input buffer
1599 : // has been scanned to completion (theTokenizerResult should be kEOF).
1600 : // kEOF -> End of buffer.
1601 :
1602 : // If we're told to block the parser, we disable all further parsing
1603 : // (and cache any data coming in) until the parser is re-enabled.
1604 6757 : if (NS_ERROR_HTMLPARSER_BLOCK == result) {
1605 0 : mSink->WillInterrupt();
1606 0 : if (mFlags & NS_PARSER_FLAG_PARSER_ENABLED) {
1607 : // If we were blocked by a recursive invocation, don't re-block.
1608 0 : BlockParser();
1609 : }
1610 0 : return NS_OK;
1611 : }
1612 6757 : if (NS_ERROR_HTMLPARSER_STOPPARSING == result) {
1613 : // Note: Parser Terminate() calls DidBuildModel.
1614 65 : if (mInternalState != NS_ERROR_HTMLPARSER_STOPPARSING) {
1615 0 : DidBuildModel(mStreamStatus);
1616 0 : mInternalState = result;
1617 : }
1618 :
1619 65 : return NS_OK;
1620 : }
1621 6692 : if ((NS_OK == result && theTokenizerResult == kEOF) ||
1622 : result == NS_ERROR_HTMLPARSER_INTERRUPTED) {
1623 : bool theContextIsStringBased =
1624 6692 : CParserContext::eCTString == mParserContext->mContextType;
1625 :
1626 10107 : if (mParserContext->mStreamListenerState == eOnStop ||
1627 3415 : !mParserContext->mMultipart || theContextIsStringBased) {
1628 3279 : if (!mParserContext->mPrevContext) {
1629 3279 : if (mParserContext->mStreamListenerState == eOnStop) {
1630 3277 : DidBuildModel(mStreamStatus);
1631 3277 : return NS_OK;
1632 : }
1633 : } else {
1634 0 : CParserContext* theContext = PopContext();
1635 0 : if (theContext) {
1636 0 : theIterationIsOk = allowIteration && theContextIsStringBased;
1637 0 : if (theContext->mCopyUnused) {
1638 0 : theContext->mScanner->CopyUnusedData(mUnusedInput);
1639 : }
1640 :
1641 0 : delete theContext;
1642 : }
1643 :
1644 0 : result = mInternalState;
1645 : aIsFinalChunk = mParserContext &&
1646 0 : mParserContext->mStreamListenerState == eOnStop;
1647 : // ...then intentionally fall through to mSink->WillInterrupt()...
1648 : }
1649 : }
1650 : }
1651 :
1652 3415 : if (theTokenizerResult == kEOF ||
1653 : result == NS_ERROR_HTMLPARSER_INTERRUPTED) {
1654 3415 : result = (result == NS_ERROR_HTMLPARSER_INTERRUPTED) ? NS_OK : result;
1655 3415 : mSink->WillInterrupt();
1656 : }
1657 : }
1658 : } else {
1659 0 : mInternalState = result = NS_ERROR_HTMLPARSER_UNRESOLVEDDTD;
1660 : }
1661 : }
1662 :
1663 3426 : return (result == NS_ERROR_HTMLPARSER_INTERRUPTED) ? NS_OK : result;
1664 : }
1665 :
1666 : /**
1667 : * This is where we loop over the tokens created in the
1668 : * tokenization phase, and try to make sense out of them.
1669 : */
1670 : nsresult
1671 6757 : nsParser::BuildModel()
1672 : {
1673 6757 : nsITokenizer* theTokenizer = nsnull;
1674 :
1675 6757 : nsresult result = NS_OK;
1676 6757 : if (mParserContext) {
1677 6757 : result = mParserContext->GetTokenizer(mDTD, mSink, theTokenizer);
1678 : }
1679 :
1680 6757 : if (NS_SUCCEEDED(result)) {
1681 6757 : if (mDTD) {
1682 6757 : bool inDocWrite = !!mParserContext->mPrevContext;
1683 6757 : result = mDTD->BuildModel(theTokenizer,
1684 : // ignore interruptions in document.write
1685 6757 : !inDocWrite, // don't count lines in document.write
1686 13514 : &mCharset);
1687 : }
1688 : } else {
1689 0 : mInternalState = result = NS_ERROR_HTMLPARSER_BADTOKENIZER;
1690 : }
1691 6757 : return result;
1692 : }
1693 :
1694 : /*******************************************************************
1695 : These methods are used to talk to the netlib system...
1696 : *******************************************************************/
1697 :
1698 : nsresult
1699 3341 : nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext)
1700 : {
1701 3341 : NS_PRECONDITION(eNone == mParserContext->mStreamListenerState,
1702 : "Parser's nsIStreamListener API was not setup "
1703 : "correctly in constructor.");
1704 3341 : if (mObserver) {
1705 0 : mObserver->OnStartRequest(request, aContext);
1706 : }
1707 3341 : mParserContext->mStreamListenerState = eOnStart;
1708 3341 : mParserContext->mAutoDetectStatus = eUnknownDetect;
1709 3341 : mParserContext->mRequest = request;
1710 :
1711 3341 : NS_ASSERTION(!mParserContext->mPrevContext,
1712 : "Clobbering DTD for non-root parser context!");
1713 3341 : mDTD = nsnull;
1714 :
1715 : nsresult rv;
1716 6682 : nsCAutoString contentType;
1717 6682 : nsCOMPtr<nsIChannel> channel = do_QueryInterface(request);
1718 3341 : if (channel) {
1719 3341 : rv = channel->GetContentType(contentType);
1720 3341 : if (NS_SUCCEEDED(rv)) {
1721 3341 : mParserContext->SetMimeType(contentType);
1722 : }
1723 : }
1724 :
1725 3341 : rv = NS_OK;
1726 :
1727 3341 : return rv;
1728 : }
1729 :
1730 :
// Charset labels assigned by DetectByteOrderMark().
#define UTF16_BOM "UTF-16"
#define UTF16_BE "UTF-16BE"
#define UTF16_LE "UTF-16LE"
#define UTF8 "UTF-8"

/**
 * Returns true when aChar is one of '!', '?', 'h' or 'H'.
 *
 * DetectByteOrderMark() uses this on the character that follows a leading
 * '<' when guessing UTF-16 in the absence of a byte order mark.
 */
static inline bool IsSecondMarker(unsigned char aChar)
{
  return aChar == '!' || aChar == '?' || aChar == 'h' || aChar == 'H';
}
1748 :
1749 : static bool
1750 2446 : DetectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen,
1751 : nsCString& oCharset, PRInt32& oCharsetSource)
1752 : {
1753 2446 : oCharsetSource= kCharsetFromAutoDetection;
1754 2446 : oCharset.Truncate();
1755 : // See http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing
1756 : // for details
1757 : // Also, MS Win2K notepad now generate 3 bytes BOM in UTF8 as UTF8 signature
1758 : // We need to check that
1759 : // UCS2 BOM FEFF = UTF8 EF BB BF
1760 2446 : switch(aBytes[0])
1761 : {
1762 : case 0x00:
1763 0 : if((0x3C==aBytes[1]) && (0x00==aBytes[2])) {
1764 : // 00 3C 00
1765 0 : if(IsSecondMarker(aBytes[3])) {
1766 : // 00 3C 00 SM UTF-16, big-endian, no Byte Order Mark
1767 0 : oCharset.Assign(UTF16_BE);
1768 0 : oCharsetSource = kCharsetFromByteOrderMark;
1769 : }
1770 : }
1771 0 : break;
1772 : case 0x3C:
1773 2446 : if(0x00==aBytes[1] && (0x00==aBytes[3])) {
1774 : // 3C 00 XX 00
1775 0 : if(IsSecondMarker(aBytes[2])) {
1776 : // 3C 00 SM 00 UTF-16, little-endian, no Byte Order Mark
1777 0 : oCharset.Assign(UTF16_LE);
1778 0 : oCharsetSource = kCharsetFromByteOrderMark;
1779 : }
1780 : // For html, meta tag detector is invoked before this so that we have
1781 : // to deal only with XML here.
1782 9634 : } else if( (0x3F==aBytes[1]) &&
1783 4792 : (0x78==aBytes[2]) && (0x6D==aBytes[3]) &&
1784 2396 : (0 == PL_strncmp("<?xml", (char*)aBytes, 5 ))) {
1785 : // 3C 3F 78 6D
1786 : // ASCII characters are in their normal positions, so we can safely
1787 : // deal with the XML declaration in the old C way
1788 : // The shortest string so far (strlen==5):
1789 : // <?xml
1790 : PRInt32 i;
1791 2396 : bool versionFound = false, encodingFound = false;
1792 24283 : for (i=6; i < aLen && !encodingFound; ++i) {
1793 : // end of XML declaration?
1794 25545 : if ((((char*)aBytes)[i] == '?') &&
1795 : ((i+1) < aLen) &&
1796 1829 : (((char*)aBytes)[i+1] == '>')) {
1797 1829 : break;
1798 : }
1799 : // Version is required.
1800 21887 : if (!versionFound) {
1801 : // Want to avoid string comparisons, hence looking for 'n'
1802 : // and only if found check the string leading to it. Not
1803 : // foolproof, but fast.
1804 : // The shortest string allowed before this is (strlen==13):
1805 : // <?xml version
1806 19168 : if ((((char*)aBytes)[i] == 'n') &&
1807 : (i >= 12) &&
1808 2396 : (0 == PL_strncmp("versio", (char*)(aBytes+i-6), 6 ))) {
1809 : // Fast forward through version
1810 2396 : char q = 0;
1811 14376 : for (++i; i < aLen; ++i) {
1812 14376 : char qi = ((char*)aBytes)[i];
1813 14376 : if (qi == '\'' || qi == '"') {
1814 4792 : if (q && q == qi) {
1815 : // ending quote
1816 2396 : versionFound = true;
1817 2396 : break;
1818 : } else {
1819 : // Starting quote
1820 2396 : q = qi;
1821 : }
1822 : }
1823 : }
1824 : }
1825 : } else {
1826 : // encoding must follow version
1827 : // Want to avoid string comparisons, hence looking for 'g'
1828 : // and only if found check the string leading to it. Not
1829 : // foolproof, but fast.
1830 : // The shortest allowed string before this (strlen==26):
1831 : // <?xml version="1" encoding
1832 5681 : if ((((char*)aBytes)[i] == 'g') &&
1833 : (i >= 25) &&
1834 566 : (0 == PL_strncmp("encodin", (char*)(aBytes+i-7), 7 ))) {
1835 566 : PRInt32 encStart = 0;
1836 566 : char q = 0;
1837 5298 : for (++i; i < aLen; ++i) {
1838 5298 : char qi = ((char*)aBytes)[i];
1839 5298 : if (qi == '\'' || qi == '"') {
1840 1132 : if (q && q == qi) {
1841 566 : PRInt32 count = i - encStart;
1842 : // encoding value is invalid if it is UTF-16
1843 1132 : if (count > 0 &&
1844 566 : (0 != PL_strcmp("UTF-16", (char*)(aBytes+encStart)))) {
1845 566 : oCharset.Assign((char*)(aBytes+encStart),count);
1846 566 : oCharsetSource = kCharsetFromMetaTag;
1847 : }
1848 566 : encodingFound = true;
1849 566 : break;
1850 : } else {
1851 566 : encStart = i+1;
1852 566 : q = qi;
1853 : }
1854 : }
1855 : }
1856 : }
1857 : } // if (!versionFound)
1858 : } // for
1859 : }
1860 2446 : break;
1861 : case 0xEF:
1862 0 : if((0xBB==aBytes[1]) && (0xBF==aBytes[2])) {
1863 : // EF BB BF
1864 : // Win2K UTF-8 BOM
1865 0 : oCharset.Assign(UTF8);
1866 0 : oCharsetSource= kCharsetFromByteOrderMark;
1867 : }
1868 0 : break;
1869 : case 0xFE:
1870 0 : if(0xFF==aBytes[1]) {
1871 : // FE FF UTF-16, big-endian
1872 0 : oCharset.Assign(UTF16_BOM);
1873 0 : oCharsetSource= kCharsetFromByteOrderMark;
1874 : }
1875 0 : break;
1876 : case 0xFF:
1877 0 : if(0xFE==aBytes[1]) {
1878 : // FF FE
1879 : // UTF-16, little-endian
1880 0 : oCharset.Assign(UTF16_BOM);
1881 0 : oCharsetSource= kCharsetFromByteOrderMark;
1882 : }
1883 0 : break;
1884 : // case 0x4C: if((0x6F==aBytes[1]) && ((0xA7==aBytes[2] && (0x94==aBytes[3])) {
1885 : // We do not care EBCIDIC here....
1886 : // }
1887 : // break;
1888 : } // switch
1889 2446 : return !oCharset.IsEmpty();
1890 : }
1891 :
1892 : inline const char
1893 2675 : GetNextChar(nsACString::const_iterator& aStart,
1894 : nsACString::const_iterator& aEnd)
1895 : {
1896 2675 : NS_ASSERTION(aStart != aEnd, "end of buffer");
1897 2675 : return (++aStart != aEnd) ? *aStart : '\0';
1898 : }
1899 :
1900 : bool
1901 2471 : nsParser::DetectMetaTag(const char* aBytes,
1902 : PRInt32 aLen,
1903 : nsCString& aCharset,
1904 : PRInt32& aCharsetSource)
1905 : {
1906 2471 : aCharsetSource= kCharsetFromMetaTag;
1907 2471 : aCharset.SetLength(0);
1908 :
1909 : // XXX Only look inside HTML documents for now. For XML
1910 : // documents we should be looking inside the XMLDecl.
1911 2471 : if (!mParserContext->mMimeType.EqualsLiteral(TEXT_HTML)) {
1912 2446 : return false;
1913 : }
1914 :
1915 : // Fast and loose parsing to determine if we have a complete
1916 : // META tag in this block, looking upto 2k into it.
1917 : const nsASingleFragmentCString& str =
1918 50 : Substring(aBytes, aBytes + NS_MIN(aLen, 2048));
1919 : // XXXldb Should be const_char_iterator when FindInReadable supports it.
1920 25 : nsACString::const_iterator begin, end;
1921 :
1922 25 : str.BeginReading(begin);
1923 25 : str.EndReading(end);
1924 25 : nsACString::const_iterator currPos(begin);
1925 25 : nsACString::const_iterator tokEnd;
1926 25 : nsACString::const_iterator tagEnd(begin);
1927 :
1928 100 : while (currPos != end) {
1929 75 : if (!FindCharInReadable('<', currPos, end))
1930 0 : break; // no tag found in this buffer
1931 :
1932 75 : if (GetNextChar(currPos, end) == '!') {
1933 75 : if (GetNextChar(currPos, end) != '-' ||
1934 25 : GetNextChar(currPos, end) != '-') {
1935 : // If we only see a <! not followed by --, just skip to the next >.
1936 25 : if (!FindCharInReadable('>', currPos, end)) {
1937 0 : return false; // No more tags to follow.
1938 : }
1939 :
1940 : // Continue searching for a meta tag following this "comment".
1941 25 : ++currPos;
1942 25 : continue;
1943 : }
1944 :
1945 : // Found MDO ( <!-- ). Now search for MDC ( --[*s]> )
1946 25 : bool foundMDC = false;
1947 25 : bool foundMatch = false;
1948 2550 : while (!foundMDC) {
1949 2525 : if (GetNextChar(currPos, end) == '-' &&
1950 25 : GetNextChar(currPos, end) == '-') {
1951 25 : foundMatch = !foundMatch; // toggle until we've matching "--"
1952 2475 : } else if (currPos == end) {
1953 0 : return false; // Couldn't find --[*s]> in this buffer
1954 2475 : } else if (foundMatch && *currPos == '>') {
1955 25 : foundMDC = true; // found comment end delimiter.
1956 25 : ++currPos;
1957 : }
1958 : }
1959 25 : continue; // continue searching for META tag.
1960 : }
1961 :
1962 : // Find the end of the tag, break if incomplete
1963 25 : tagEnd = currPos;
1964 25 : if (!FindCharInReadable('>', tagEnd, end))
1965 0 : break;
1966 :
1967 : // If this is not a META tag, continue to next loop
1968 200 : if ( (*currPos != 'm' && *currPos != 'M') ||
1969 50 : (*(++currPos) != 'e' && *currPos != 'E') ||
1970 50 : (*(++currPos) != 't' && *currPos != 'T') ||
1971 50 : (*(++currPos) != 'a' && *currPos != 'A') ||
1972 25 : !nsCRT::IsAsciiSpace(*(++currPos))) {
1973 0 : currPos = tagEnd;
1974 0 : continue;
1975 : }
1976 :
1977 : // If could not find "charset" in this tag, skip this tag and try next
1978 25 : tokEnd = tagEnd;
1979 25 : if (!CaseInsensitiveFindInReadable(NS_LITERAL_CSTRING("CHARSET"),
1980 25 : currPos, tokEnd)) {
1981 0 : currPos = tagEnd;
1982 0 : continue;
1983 : }
1984 25 : currPos = tokEnd;
1985 :
1986 : // skip spaces before '='
1987 100 : while (*currPos == kSpace || *currPos == kNewLine ||
1988 50 : *currPos == kCR || *currPos == kTab) {
1989 0 : ++currPos;
1990 : }
1991 : // skip '='
1992 25 : if (*currPos != '=') {
1993 0 : currPos = tagEnd;
1994 0 : continue;
1995 : }
1996 25 : ++currPos;
1997 : // skip spaces after '='
1998 100 : while (*currPos == kSpace || *currPos == kNewLine ||
1999 50 : *currPos == kCR || *currPos == kTab) {
2000 0 : ++currPos;
2001 : }
2002 :
2003 : // skip open quote
2004 25 : if (*currPos == '\'' || *currPos == '\"')
2005 0 : ++currPos;
2006 :
2007 : // find the end of charset string
2008 25 : tokEnd = currPos;
2009 175 : while (*tokEnd != '\'' && *tokEnd != '\"' && tokEnd != tagEnd)
2010 125 : ++tokEnd;
2011 :
2012 : // return true if we successfully got something for charset
2013 25 : if (currPos != tokEnd) {
2014 25 : aCharset.Assign(currPos.get(), tokEnd.get() - currPos.get());
2015 25 : return true;
2016 : }
2017 :
2018 : // Nothing specified as charset, continue next loop
2019 0 : currPos = tagEnd;
2020 : }
2021 :
2022 0 : return false;
2023 : }
2024 :
2025 : static NS_METHOD
2026 0 : NoOpParserWriteFunc(nsIInputStream* in,
2027 : void* closure,
2028 : const char* fromRawSegment,
2029 : PRUint32 toOffset,
2030 : PRUint32 count,
2031 : PRUint32 *writeCount)
2032 : {
2033 0 : *writeCount = count;
2034 0 : return NS_OK;
2035 : }
2036 :
2037 : typedef struct {
2038 : bool mNeedCharsetCheck;
2039 : nsParser* mParser;
2040 : nsScanner* mScanner;
2041 : nsIRequest* mRequest;
2042 : } ParserWriteStruct;
2043 :
2044 : /*
2045 : * This function is invoked as a result of a call to a stream's
2046 : * ReadSegments() method. It is called for each contiguous buffer
2047 : * of data in the underlying stream or pipe. Using ReadSegments
2048 : * allows us to avoid copying data to read out of the stream.
2049 : */
2050 : static NS_METHOD
2051 3491 : ParserWriteFunc(nsIInputStream* in,
2052 : void* closure,
2053 : const char* fromRawSegment,
2054 : PRUint32 toOffset,
2055 : PRUint32 count,
2056 : PRUint32 *writeCount)
2057 : {
2058 : nsresult result;
2059 3491 : ParserWriteStruct* pws = static_cast<ParserWriteStruct*>(closure);
2060 3491 : const char* buf = fromRawSegment;
2061 3491 : PRUint32 theNumRead = count;
2062 :
2063 3491 : if (!pws) {
2064 0 : return NS_ERROR_FAILURE;
2065 : }
2066 :
2067 3491 : if (pws->mNeedCharsetCheck) {
2068 : PRInt32 guessSource;
2069 4942 : nsCAutoString guess;
2070 4942 : nsCAutoString preferred;
2071 :
2072 2471 : pws->mNeedCharsetCheck = false;
2073 4917 : if (pws->mParser->DetectMetaTag(buf, theNumRead, guess, guessSource) ||
2074 : ((count >= 4) &&
2075 : DetectByteOrderMark((const unsigned char*)buf,
2076 2446 : theNumRead, guess, guessSource))) {
2077 591 : result = nsCharsetAlias::GetPreferred(guess, preferred);
2078 : // Only continue if it's a recognized charset and not
2079 : // one of a designated set that we ignore.
2080 2364 : if (NS_SUCCEEDED(result) &&
2081 : ((kCharsetFromByteOrderMark == guessSource) ||
2082 591 : (!preferred.EqualsLiteral("UTF-16") &&
2083 591 : !preferred.EqualsLiteral("UTF-16BE") &&
2084 591 : !preferred.EqualsLiteral("UTF-16LE")))) {
2085 591 : guess = preferred;
2086 591 : pws->mParser->SetDocumentCharset(guess, guessSource);
2087 591 : pws->mParser->SetSinkCharset(preferred);
2088 1182 : nsCOMPtr<nsICachingChannel> channel(do_QueryInterface(pws->mRequest));
2089 591 : if (channel) {
2090 722 : nsCOMPtr<nsISupports> cacheToken;
2091 361 : channel->GetCacheToken(getter_AddRefs(cacheToken));
2092 361 : if (cacheToken) {
2093 722 : nsCOMPtr<nsICacheEntryDescriptor> cacheDescriptor(do_QueryInterface(cacheToken));
2094 361 : if (cacheDescriptor) {
2095 : #ifdef DEBUG
2096 : nsresult rv =
2097 : #endif
2098 361 : cacheDescriptor->SetMetaDataElement("charset",
2099 361 : guess.get());
2100 361 : NS_ASSERTION(NS_SUCCEEDED(rv),"cannot SetMetaDataElement");
2101 : }
2102 : }
2103 : }
2104 : }
2105 : }
2106 : }
2107 :
2108 3491 : result = pws->mScanner->Append(buf, theNumRead, pws->mRequest);
2109 3491 : if (NS_SUCCEEDED(result)) {
2110 3491 : *writeCount = count;
2111 : }
2112 :
2113 3491 : return result;
2114 : }
2115 :
2116 : nsresult
2117 3425 : nsParser::OnDataAvailable(nsIRequest *request, nsISupports* aContext,
2118 : nsIInputStream *pIStream, PRUint32 sourceOffset,
2119 : PRUint32 aLength)
2120 : {
2121 3425 : NS_PRECONDITION((eOnStart == mParserContext->mStreamListenerState ||
2122 : eOnDataAvail == mParserContext->mStreamListenerState),
2123 : "Error: OnStartRequest() must be called before OnDataAvailable()");
2124 3425 : NS_PRECONDITION(NS_InputStreamIsBuffered(pIStream),
2125 : "Must have a buffered input stream");
2126 :
2127 3425 : nsresult rv = NS_OK;
2128 :
2129 3425 : if (mIsAboutBlank) {
2130 0 : MOZ_ASSERT(false, "Must not get OnDataAvailable for about:blank");
2131 : // ... but if an extension tries to feed us data for about:blank in a
2132 : // release build, silently ignore the data.
2133 : PRUint32 totalRead;
2134 : rv = pIStream->ReadSegments(NoOpParserWriteFunc,
2135 : nsnull,
2136 : aLength,
2137 0 : &totalRead);
2138 0 : return rv;
2139 : }
2140 :
2141 3425 : CParserContext *theContext = mParserContext;
2142 :
2143 6850 : while (theContext && theContext->mRequest != request) {
2144 0 : theContext = theContext->mPrevContext;
2145 : }
2146 :
2147 3425 : if (theContext) {
2148 3425 : theContext->mStreamListenerState = eOnDataAvail;
2149 :
2150 3425 : if (eInvalidDetect == theContext->mAutoDetectStatus) {
2151 0 : if (theContext->mScanner) {
2152 0 : nsScannerIterator iter;
2153 0 : theContext->mScanner->EndReading(iter);
2154 0 : theContext->mScanner->SetPosition(iter, true);
2155 : }
2156 : }
2157 :
2158 : PRUint32 totalRead;
2159 : ParserWriteStruct pws;
2160 : pws.mNeedCharsetCheck =
2161 3425 : (0 == sourceOffset) && (mCharsetSource < kCharsetFromMetaTag);
2162 3425 : pws.mParser = this;
2163 3425 : pws.mScanner = theContext->mScanner;
2164 3425 : pws.mRequest = request;
2165 :
2166 3425 : rv = pIStream->ReadSegments(ParserWriteFunc, &pws, aLength, &totalRead);
2167 3425 : if (NS_FAILED(rv)) {
2168 0 : return rv;
2169 : }
2170 :
2171 : // Don't bother to start parsing until we've seen some
2172 : // non-whitespace data
2173 6850 : if (IsOkToProcessNetworkData() &&
2174 3425 : theContext->mScanner->FirstNonWhitespacePosition() >= 0) {
2175 3424 : mProcessingNetworkData = true;
2176 3424 : if (mSink) {
2177 3424 : mSink->WillParse();
2178 : }
2179 3424 : rv = ResumeParse();
2180 3424 : mProcessingNetworkData = false;
2181 : }
2182 : } else {
2183 0 : rv = NS_ERROR_UNEXPECTED;
2184 : }
2185 :
2186 3425 : return rv;
2187 : }
2188 :
2189 : /**
2190 : * This is called by the networking library once the last block of data
2191 : * has been collected from the net.
2192 : */
2193 : nsresult
2194 3341 : nsParser::OnStopRequest(nsIRequest *request, nsISupports* aContext,
2195 : nsresult status)
2196 : {
2197 3341 : nsresult rv = NS_OK;
2198 :
2199 3341 : CParserContext *pc = mParserContext;
2200 6693 : while (pc) {
2201 3341 : if (pc->mRequest == request) {
2202 3330 : pc->mStreamListenerState = eOnStop;
2203 3330 : pc->mScanner->SetIncremental(false);
2204 3330 : break;
2205 : }
2206 :
2207 11 : pc = pc->mPrevContext;
2208 : }
2209 :
2210 3341 : mStreamStatus = status;
2211 :
2212 3341 : if (IsOkToProcessNetworkData() && NS_SUCCEEDED(rv)) {
2213 3341 : mProcessingNetworkData = true;
2214 3341 : if (mSink) {
2215 3330 : mSink->WillParse();
2216 : }
2217 3341 : rv = ResumeParse(true, true);
2218 3341 : mProcessingNetworkData = false;
2219 : }
2220 :
2221 : // If the parser isn't enabled, we don't finish parsing till
2222 : // it is reenabled.
2223 :
2224 :
2225 : // XXX Should we wait to notify our observers as well if the
2226 : // parser isn't yet enabled?
2227 3341 : if (mObserver) {
2228 0 : mObserver->OnStopRequest(request, aContext, status);
2229 : }
2230 :
2231 3341 : return rv;
2232 : }
2233 :
2234 :
2235 : /*******************************************************************
2236 : Here come the tokenization methods...
2237 : *******************************************************************/
2238 :
2239 :
2240 : /**
2241 : * Part of the code sandwich, this gets called right before
2242 : * the tokenization process begins. The main reason for
2243 : * this call is to allow the delegate to do initialization.
2244 : */
2245 : bool
2246 6757 : nsParser::WillTokenize(bool aIsFinalChunk)
2247 : {
2248 6757 : if (!mParserContext) {
2249 0 : return true;
2250 : }
2251 :
2252 : nsITokenizer* theTokenizer;
2253 6757 : nsresult result = mParserContext->GetTokenizer(mDTD, mSink, theTokenizer);
2254 6757 : NS_ENSURE_SUCCESS(result, false);
2255 6757 : return NS_SUCCEEDED(theTokenizer->WillTokenize(aIsFinalChunk,
2256 : &mTokenAllocator));
2257 : }
2258 :
2259 :
2260 : /**
2261 : * This is the primary control routine to consume tokens.
2262 : * It iteratively consumes tokens until an error occurs or
2263 : * you run out of data.
2264 : */
2265 6757 : nsresult nsParser::Tokenize(bool aIsFinalChunk)
2266 : {
2267 : nsITokenizer* theTokenizer;
2268 :
2269 6757 : nsresult result = NS_ERROR_NOT_AVAILABLE;
2270 6757 : if (mParserContext) {
2271 6757 : result = mParserContext->GetTokenizer(mDTD, mSink, theTokenizer);
2272 : }
2273 :
2274 6757 : if (NS_SUCCEEDED(result)) {
2275 6757 : if (mFlags & NS_PARSER_FLAG_FLUSH_TOKENS) {
2276 : // For some reason tokens didn't get flushed (probably
2277 : // the parser got blocked before all the tokens in the
2278 : // stack got handled). Flush 'em now. Ref. bug 104856
2279 0 : if (theTokenizer->GetCount() != 0) {
2280 0 : return result;
2281 : }
2282 :
2283 : // Reset since the tokens have been flushed.
2284 0 : mFlags &= ~NS_PARSER_FLAG_FLUSH_TOKENS;
2285 : }
2286 :
2287 6757 : bool flushTokens = false;
2288 :
2289 6757 : mParserContext->mNumConsumed = 0;
2290 :
2291 6757 : bool killSink = false;
2292 :
2293 6757 : WillTokenize(aIsFinalChunk);
2294 6757 : while (NS_SUCCEEDED(result)) {
2295 9012 : mParserContext->mNumConsumed += mParserContext->mScanner->Mark();
2296 9012 : result = theTokenizer->ConsumeToken(*mParserContext->mScanner,
2297 18024 : flushTokens);
2298 9012 : if (NS_FAILED(result)) {
2299 6757 : mParserContext->mScanner->RewindToMark();
2300 6757 : if (kEOF == result){
2301 6692 : break;
2302 : }
2303 65 : if (NS_ERROR_HTMLPARSER_STOPPARSING == result) {
2304 65 : killSink = true;
2305 65 : result = Terminate();
2306 65 : break;
2307 : }
2308 2255 : } else if (flushTokens && (mFlags & NS_PARSER_FLAG_OBSERVERS_ENABLED)) {
2309 : // I added the extra test of NS_PARSER_FLAG_OBSERVERS_ENABLED to fix Bug# 23931.
2310 : // Flush tokens on seeing </SCRIPT> -- Ref: Bug# 22485 --
2311 : // Also remember to update the marked position.
2312 0 : mFlags |= NS_PARSER_FLAG_FLUSH_TOKENS;
2313 0 : mParserContext->mNumConsumed += mParserContext->mScanner->Mark();
2314 0 : break;
2315 : }
2316 : }
2317 6757 : DidTokenize(aIsFinalChunk);
2318 :
2319 6757 : if (killSink) {
2320 65 : mSink = nsnull;
2321 : }
2322 : } else {
2323 0 : result = mInternalState = NS_ERROR_HTMLPARSER_BADTOKENIZER;
2324 : }
2325 :
2326 6757 : return result;
2327 : }
2328 :
2329 : /**
2330 : * This is the tail-end of the code sandwich for the
2331 : * tokenization process. It gets called once tokenziation
2332 : * has completed for each phase.
2333 : */
2334 : bool
2335 6757 : nsParser::DidTokenize(bool aIsFinalChunk)
2336 : {
2337 6757 : if (!mParserContext) {
2338 0 : return true;
2339 : }
2340 :
2341 : nsITokenizer* theTokenizer;
2342 6757 : nsresult rv = mParserContext->GetTokenizer(mDTD, mSink, theTokenizer);
2343 6757 : NS_ENSURE_SUCCESS(rv, false);
2344 :
2345 6757 : rv = theTokenizer->DidTokenize(aIsFinalChunk);
2346 6757 : return NS_SUCCEEDED(rv);
2347 : }
2348 :
2349 : /**
2350 : * Get the channel associated with this parser
2351 : *
2352 : * @param aChannel out param that will contain the result
2353 : * @return NS_OK if successful
2354 : */
2355 : NS_IMETHODIMP
2356 0 : nsParser::GetChannel(nsIChannel** aChannel)
2357 : {
2358 0 : nsresult result = NS_ERROR_NOT_AVAILABLE;
2359 0 : if (mParserContext && mParserContext->mRequest) {
2360 0 : result = CallQueryInterface(mParserContext->mRequest, aChannel);
2361 : }
2362 0 : return result;
2363 : }
2364 :
2365 : /**
2366 : * Get the DTD associated with this parser
2367 : */
2368 : NS_IMETHODIMP
2369 0 : nsParser::GetDTD(nsIDTD** aDTD)
2370 : {
2371 0 : if (mParserContext) {
2372 0 : NS_IF_ADDREF(*aDTD = mDTD);
2373 : }
2374 :
2375 0 : return NS_OK;
2376 : }
2377 :
2378 : /**
2379 : * Get this as nsIStreamListener
2380 : */
2381 : nsIStreamListener*
2382 0 : nsParser::GetStreamListener()
2383 : {
2384 0 : return this;
2385 4392 : }
|