1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /* ***** BEGIN LICENSE BLOCK *****
3 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Original Code is mozilla.org code.
16 : *
17 : * The Initial Developer of the Original Code is
18 : * Netscape Communications Corporation.
19 : * Portions created by the Initial Developer are Copyright (C) 1998
20 : * the Initial Developer. All Rights Reserved.
21 : *
22 : * Contributor(s):
23 : * Henri Sivonen <hsivonen@iki.fi>
24 : *
25 : * Alternatively, the contents of this file may be used under the terms of
26 : * either of the GNU General Public License Version 2 or later (the "GPL"),
27 : * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 : * in which case the provisions of the GPL or the LGPL are applicable instead
29 : * of those above. If you wish to allow use of your version of this file only
30 : * under the terms of either the GPL or the LGPL, and not to allow others to
31 : * use your version of this file under the terms of the MPL, indicate your
32 : * decision by deleting the provisions above and replace them with the notice
33 : * and other provisions required by the GPL or the LGPL. If you do not delete
34 : * the provisions above, a recipient may use your version of this file under
35 : * the terms of any one of the MPL, the GPL or the LGPL.
36 : *
37 : * ***** END LICENSE BLOCK ***** */
38 :
39 : #ifndef nsHtml5StreamParser_h__
40 : #define nsHtml5StreamParser_h__
41 :
42 : #include "nsAutoPtr.h"
43 : #include "nsCOMPtr.h"
44 : #include "nsIStreamListener.h"
45 : #include "nsICharsetDetectionObserver.h"
46 : #include "nsHtml5MetaScanner.h"
47 : #include "nsIUnicodeDecoder.h"
48 : #include "nsHtml5TreeOpExecutor.h"
49 : #include "nsHtml5OwningUTF16Buffer.h"
50 : #include "nsIInputStream.h"
51 : #include "mozilla/Mutex.h"
52 : #include "nsHtml5AtomTable.h"
53 : #include "nsHtml5Speculation.h"
54 : #include "nsITimer.h"
55 : #include "nsICharsetDetector.h"
56 :
57 : class nsHtml5Parser;
58 :
59 : #define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024
60 : #define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024
61 :
62 : enum eParserMode { // Parsing mode; nsHtml5StreamParser takes one in its constructor and has an mMode member of this type.
63 : /**
64 : * Parse a document normally as HTML.
65 : */
66 : NORMAL,
67 :
68 : /**
69 : * View document as HTML source.
70 : */
71 : VIEW_SOURCE_HTML,
72 :
73 : /**
74 : * View document as XML source.
75 : */
76 : VIEW_SOURCE_XML,
77 :
78 : /**
79 : * View document as plain text source.
80 : */
81 : VIEW_SOURCE_PLAIN,
82 :
83 : /**
84 : * View document as plain text.
85 : */
86 : PLAIN_TEXT,
87 :
88 : /**
89 : * Load as data (XHR).
90 : */
91 : LOAD_AS_DATA
92 : };
93 :
94 : enum eBomState { // Progress of byte order mark (BOM) sniffing; type of nsHtml5StreamParser::mBomState.
95 : /**
96 : * BOM sniffing hasn't started.
97 : */
98 : BOM_SNIFFING_NOT_STARTED = 0,
99 :
100 : /**
101 : * BOM sniffing is ongoing, and the first byte of a UTF-16LE BOM has been
102 : * seen.
103 : */
104 : SEEN_UTF_16_LE_FIRST_BYTE = 1,
105 :
106 : /**
107 : * BOM sniffing is ongoing, and the first byte of a UTF-16BE BOM has been
108 : * seen.
109 : */
110 : SEEN_UTF_16_BE_FIRST_BYTE = 2,
111 :
112 : /**
113 : * BOM sniffing is ongoing, and the first byte of a UTF-8 BOM has been
114 : * seen.
115 : */
116 : SEEN_UTF_8_FIRST_BYTE = 3,
117 :
118 : /**
119 : * BOM sniffing is ongoing, and the first and second bytes of a UTF-8 BOM
120 : * have been seen.
121 : */
122 : SEEN_UTF_8_SECOND_BYTE = 4,
123 :
124 : /**
125 : * BOM sniffing was started but is now over for whatever reason.
126 : */
127 : BOM_SNIFFING_OVER = 5
128 : };
129 :
130 : enum eHtml5StreamState { // Stream life cycle states; type of nsHtml5StreamParser::mStreamState.
131 : STREAM_NOT_STARTED = 0, // SetDocumentCharset() asserts this state as its precondition.
132 : STREAM_BEING_READ = 1,
133 : STREAM_ENDED = 2
134 : };
135 :
136 : class nsHtml5StreamParser : public nsIStreamListener,
137 : public nsICharsetDetectionObserver {
138 : // Stream listener that sniffs and decodes the character encoding of incoming bytes and owns the HTML5 tokenizer and tree builder.
139 : friend class nsHtml5RequestStopper;
140 : friend class nsHtml5DataAvailable;
141 : friend class nsHtml5StreamParserContinuation;
142 : friend class nsHtml5TimerKungFu;
143 :
144 : public:
145 0 : NS_DECL_AND_IMPL_ZEROING_OPERATOR_NEW
146 0 : NS_DECL_CYCLE_COLLECTING_ISUPPORTS
147 1464 : NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser, nsIStreamListener)
148 :
149 : static void InitializeStatics();
150 :
151 : nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
152 : nsHtml5Parser* aOwner,
153 : eParserMode aMode);
154 :
155 : virtual ~nsHtml5StreamParser();
156 :
157 : // nsIRequestObserver methods:
158 : NS_DECL_NSIREQUESTOBSERVER
159 : // nsIStreamListener methods:
160 : NS_DECL_NSISTREAMLISTENER
161 :
162 : // nsICharsetDetectionObserver
163 : /**
164 : * Chardet calls this to report the detection result
165 : */
166 : NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf);
167 :
168 : // EncodingDeclarationHandler
169 : // http://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
170 : /**
171 : * Tree builder uses this to report a late <meta charset>
172 : */
173 : bool internalEncodingDeclaration(nsString* aEncoding);
174 :
175 : // Not from an external interface
176 :
177 : /**
178 : * Call this method once you've created a parser, and want to instruct it
179 : * about what charset to load
180 : *
181 : * @param aCharset the charset of a document
182 : * @param aSource the source of the charset
183 : */
184 0 : inline void SetDocumentCharset(const nsACString& aCharset, PRInt32 aSource) {
185 0 : NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED,
186 : "SetDocumentCharset called too late.");
187 0 : NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
188 0 : mCharset = aCharset;
189 0 : mCharsetSource = aSource;
190 0 : }
191 : // Stores the request observer in mObserver; asserts it is called on the main thread.
192 0 : inline void SetObserver(nsIRequestObserver* aObserver) {
193 0 : NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
194 0 : mObserver = aObserver;
195 0 : }
196 :
197 : nsresult GetChannel(nsIChannel** aChannel);
198 :
199 : /**
200 : * The owner parser must call this after script execution
201 : * when no scripts are executing and the document.written
202 : * buffer has been exhausted.
203 : */
204 : void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
205 : nsHtml5TreeBuilder* aTreeBuilder,
206 : bool aLastWasCR);
207 :
208 : /**
209 : * Continues the stream parser if the charset switch failed.
210 : */
211 : void ContinueAfterFailedCharsetSwitch();
212 : // Sets mTerminated to true while holding mTerminatedMutex.
213 0 : void Terminate() {
214 0 : mozilla::MutexAutoLock autoLock(mTerminatedMutex);
215 0 : mTerminated = true;
216 0 : }
217 :
218 : void DropTimer();
219 :
220 : /**
221 : * Sets mCharset and mCharsetSource appropriately for the XML View Source
222 : * case if aEncoding names a supported rough ASCII superset and sets
223 : * the mCharset and mCharsetSource to the UTF-8 default otherwise.
224 : */
225 : void SetEncodingFromExpat(const PRUnichar* aEncoding);
226 :
227 : /**
228 : * Sets the URL for View Source title in case this parser ends up being
229 : * used for View Source. If aURL is a view-source: URL, takes the inner
230 : * URL. data: URLs are shown with an ellipsis instead of the actual data.
231 : */
232 : void SetViewSourceTitle(nsIURI* aURL);
233 :
234 : private:
235 : // Debug-only helper: reports whether the caller is running on mThread.
236 : #ifdef DEBUG
237 0 : bool IsParserThread() {
238 : bool ret = false; // stays false (assertions fire) if IsOnCurrentThread() fails
239 0 : mThread->IsOnCurrentThread(&ret);
240 0 : return ret;
241 : }
242 : #endif
243 :
244 : void MarkAsBroken();
245 :
246 : /**
247 : * Marks the stream parser as interrupted. If you ever add calls to this
248 : * method, be sure to review Uninterrupt usage very, very carefully to
249 : * avoid having a previous in-flight runnable cancel your Interrupt()
250 : * call on the other thread too soon.
251 : */
252 0 : void Interrupt() {
253 0 : mozilla::MutexAutoLock autoLock(mTerminatedMutex);
254 0 : mInterrupted = true;
255 0 : }
256 : // Clears mInterrupted; asserts the parser thread and that mTokenizerMutex is held.
257 0 : void Uninterrupt() {
258 0 : NS_ASSERTION(IsParserThread(), "Wrong thread!");
259 0 : mTokenizerMutex.AssertCurrentThreadOwns();
260 : // Not acquiring mTerminatedMutex because mTokenizerMutex is already
261 : // held at this point and is already stronger.
262 0 : mInterrupted = false;
263 0 : }
264 :
265 : /**
266 : * Flushes the tree ops from the tree builder and disarms the flush
267 : * timer.
268 : */
269 : void FlushTreeOpsAndDisarmTimer();
270 :
271 : void ParseAvailableData();
272 :
273 : void DoStopRequest();
274 :
275 : void DoDataAvailable(PRUint8* aBuffer, PRUint32 aLength);
276 : // Reads mTerminated and mInterrupted under mTerminatedMutex.
277 0 : bool IsTerminatedOrInterrupted() {
278 0 : mozilla::MutexAutoLock autoLock(mTerminatedMutex);
279 0 : return mTerminated || mInterrupted;
280 : }
281 : // Reads mTerminated under mTerminatedMutex.
282 0 : bool IsTerminated() {
283 0 : mozilla::MutexAutoLock autoLock(mTerminatedMutex);
284 0 : return mTerminated;
285 : }
286 :
287 : /**
288 : * True when there is a Unicode decoder already
289 : */
290 0 : inline bool HasDecoder() {
291 0 : return !!mUnicodeDecoder;
292 : }
293 :
294 : /**
295 : * Push bytes from network when there is no Unicode decoder yet
296 : */
297 : nsresult SniffStreamBytes(const PRUint8* aFromSegment,
298 : PRUint32 aCount,
299 : PRUint32* aWriteCount);
300 :
301 : /**
302 : * Push bytes from network when there is a Unicode decoder already
303 : */
304 : nsresult WriteStreamBytes(const PRUint8* aFromSegment,
305 : PRUint32 aCount,
306 : PRUint32* aWriteCount);
307 :
308 : /**
309 : * Check whether every other byte in the sniffing buffer is zero.
310 : */
311 : void SniffBOMlessUTF16BasicLatin(const PRUint8* aFromSegment,
312 : PRUint32 aCountToSniffingLimit);
313 :
314 : /**
315 : * <meta charset> scan failed. Try chardet if applicable. After this, the
316 : * parser will have some encoding even if only a last resort fallback.
317 : *
318 : * @param aFromSegment The current network buffer or null if the sniffing
319 : * buffer is being flushed due to network stream ending.
320 : * @param aCount The number of bytes in aFromSegment (ignored if
321 : * aFromSegment is null)
322 : * @param aWriteCount Return value for how many bytes got read from the
323 : * buffer.
324 : * @param aCountToSniffingLimit The number of unfilled slots in
325 : * mSniffingBuffer
326 : */
327 : nsresult FinalizeSniffing(const PRUint8* aFromSegment,
328 : PRUint32 aCount,
329 : PRUint32* aWriteCount,
330 : PRUint32 aCountToSniffingLimit);
331 :
332 : /**
333 : * Set up the Unicode decoder and write the sniffing buffer into it
334 : * followed by the current network buffer.
335 : *
336 : * @param aFromSegment The current network buffer or null if the sniffing
337 : * buffer is being flushed due to network stream ending.
338 : * @param aCount The number of bytes in aFromSegment (ignored if
339 : * aFromSegment is null)
340 : * @param aWriteCount Return value for how many bytes got read from the
341 : * buffer.
342 : */
343 : nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const PRUint8* aFromSegment,
344 : PRUint32 aCount,
345 : PRUint32* aWriteCount);
346 :
347 : /**
348 : * Write the sniffing buffer into the Unicode decoder followed by the
349 : * current network buffer.
350 : *
351 : * @param aFromSegment The current network buffer or null if the sniffing
352 : * buffer is being flushed due to network stream ending.
353 : * @param aCount The number of bytes in aFromSegment (ignored if
354 : * aFromSegment is null)
355 : * @param aWriteCount Return value for how many bytes got read from the
356 : * buffer.
357 : */
358 : nsresult WriteSniffingBufferAndCurrentSegment(const PRUint8* aFromSegment,
359 : PRUint32 aCount,
360 : PRUint32* aWriteCount);
361 :
362 : /**
363 : * Initialize the Unicode decoder, mark the BOM as the source and
364 : * drop the sniffer.
365 : *
366 : * @param aCharsetName The charset name to report to the outside (UTF-16
367 : * or UTF-8)
368 : * @param aDecoderCharsetName The actual name for the decoder's charset
369 : * (UTF-16BE, UTF-16LE or UTF-8; the BOM has
370 : * been swallowed)
371 : */
372 : nsresult SetupDecodingFromBom(const char* aCharsetName,
373 : const char* aDecoderCharsetName);
374 :
375 : /**
376 : * Become confident or resolve an encoding name to its preferred form.
377 : * @param aEncoding the value of an internal encoding decl. Acts as an
378 : * out param, too, when the method returns true.
379 : * @return true if the parser needs to start using the new value of
380 : * aEncoding and false if the parser became confident or if
381 : * the encoding name did not specify a usable encoding
382 : */
383 : bool PreferredForInternalEncodingDecl(nsACString& aEncoding);
384 :
385 : /**
386 : * Callback for mFlushTimer.
387 : */
388 : static void TimerCallback(nsITimer* aTimer, void* aClosure);
389 :
390 : /**
391 : * Parser thread entry point for (maybe) flushing the ops and posting
392 : * a flush runnable back on the main thread.
393 : */
394 : void TimerFlush();
395 :
396 : nsCOMPtr<nsIRequest> mRequest;
397 : nsCOMPtr<nsIRequestObserver> mObserver;
398 :
399 : /**
400 : * The document title to use if this turns out to be a View Source parser.
401 : */
402 : nsCString mViewSourceTitle;
403 :
404 : /**
405 : * The Unicode decoder
406 : */
407 : nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder;
408 :
409 : /**
410 : * The buffer for sniffing the character encoding
411 : */
412 : nsAutoArrayPtr<PRUint8> mSniffingBuffer;
413 :
414 : /**
415 : * The number of meaningful bytes in mSniffingBuffer
416 : */
417 : PRUint32 mSniffingLength;
418 :
419 : /**
420 : * BOM sniffing state
421 : */
422 : eBomState mBomState;
423 :
424 : /**
425 : * <meta> prescan implementation
426 : */
427 : nsAutoPtr<nsHtml5MetaScanner> mMetaScanner;
428 :
429 : // encoding-related stuff
430 : /**
431 : * The source (confidence) of the character encoding in use
432 : */
433 : PRInt32 mCharsetSource;
434 :
435 : /**
436 : * The character encoding in use
437 : */
438 : nsCString mCharset;
439 :
440 : /**
441 : * Whether reparse is forbidden
442 : */
443 : bool mReparseForbidden;
444 :
445 : // Portable parser objects
446 : /**
447 : * The first buffer in the pending UTF-16 buffer queue
448 : */
449 : nsRefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;
450 :
451 : /**
452 : * The last buffer in the pending UTF-16 buffer queue
453 : */
454 : nsHtml5OwningUTF16Buffer* mLastBuffer; // weak ref; always points to
455 : // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
456 :
457 : /**
458 : * The tree operation executor
459 : */
460 : nsHtml5TreeOpExecutor* mExecutor;
461 :
462 : /**
463 : * The HTML5 tree builder
464 : */
465 : nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder;
466 :
467 : /**
468 : * The HTML5 tokenizer
469 : */
470 : nsAutoPtr<nsHtml5Tokenizer> mTokenizer;
471 :
472 : /**
473 : * Makes sure the main thread can't mess the tokenizer state while it's
474 : * tokenizing. This mutex also protects the current speculation.
475 : */
476 : mozilla::Mutex mTokenizerMutex;
477 :
478 : /**
479 : * The scoped atom table
480 : */
481 : nsHtml5AtomTable mAtomTable;
482 :
483 : /**
484 : * The owner parser.
485 : */
486 : nsRefPtr<nsHtml5Parser> mOwner;
487 :
488 : /**
489 : * Whether the last character tokenized was a carriage return (for CRLF)
490 : */
491 : bool mLastWasCR;
492 :
493 : /**
494 : * For tracking stream life cycle
495 : */
496 : eHtml5StreamState mStreamState;
497 :
498 : /**
499 : * Whether we are speculating.
500 : */
501 : bool mSpeculating;
502 :
503 : /**
504 : * Whether the tokenizer has reached EOF. (Reset when stream rewound.)
505 : */
506 : bool mAtEOF;
507 :
508 : /**
509 : * The speculations. The mutex protects the nsTArray itself.
510 : * To access the queue of current speculation, mTokenizerMutex must be
511 : * obtained.
512 : * The current speculation is the last element
513 : */
514 : nsTArray<nsAutoPtr<nsHtml5Speculation> > mSpeculations;
515 : mozilla::Mutex mSpeculationMutex;
516 :
517 : /**
518 : * True to terminate early; protected by mTerminatedMutex
519 : */
520 : bool mTerminated;
521 : bool mInterrupted; // set by Interrupt() under mTerminatedMutex; cleared by Uninterrupt() with mTokenizerMutex held
522 : mozilla::Mutex mTerminatedMutex;
523 :
524 : /**
525 : * The thread this stream parser runs on.
526 : */
527 : nsCOMPtr<nsIThread> mThread;
528 :
529 : nsCOMPtr<nsIRunnable> mExecutorFlusher;
530 :
531 : nsCOMPtr<nsIRunnable> mLoadFlusher;
532 :
533 : /**
534 : * The chardet instance if chardet is enabled.
535 : */
536 : nsCOMPtr<nsICharsetDetector> mChardet;
537 :
538 : /**
539 : * If false, don't push data to chardet.
540 : */
541 : bool mFeedChardet;
542 :
543 : /**
544 : * Timer for flushing tree ops once in a while when not speculating.
545 : */
546 : nsCOMPtr<nsITimer> mFlushTimer;
547 :
548 : /**
549 : * Keeps track whether mFlushTimer has been armed. Unfortunately,
550 : * nsITimer doesn't enable querying this from the timer itself.
551 : */
552 : bool mFlushTimerArmed;
553 :
554 : /**
555 : * False initially and true after the timer has fired at least once.
556 : */
557 : bool mFlushTimerEverFired;
558 :
559 : /**
560 : * Whether the parser is doing a normal parse, view source or plain text.
561 : */
562 : eParserMode mMode;
563 :
564 : /**
565 : * The pref html5.flushtimer.initialdelay: Time in milliseconds between
566 : * the time a network buffer is seen and the timer firing when the
567 : * timer hasn't fired previously in this parse.
568 : */
569 : static PRInt32 sTimerInitialDelay;
570 :
571 : /**
572 : * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between
573 : * the time a network buffer is seen and the timer firing when the
574 : * timer has already fired previously in this parse.
575 : */
576 : static PRInt32 sTimerSubsequentDelay;
577 : };
578 :
579 : #endif // nsHtml5StreamParser_h__
|