1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* vim: set ts=2 sw=2 et tw=78: */
3 : /* ***** BEGIN LICENSE BLOCK *****
4 : * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 : *
6 : * The contents of this file are subject to the Mozilla Public License Version
7 : * 1.1 (the "License"); you may not use this file except in compliance with
8 : * the License. You may obtain a copy of the License at
9 : * http://www.mozilla.org/MPL/
10 : *
11 : * Software distributed under the License is distributed on an "AS IS" basis,
12 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 : * for the specific language governing rights and limitations under the
14 : * License.
15 : *
16 : * The Original Code is mozilla.org code.
17 : *
18 : * The Initial Developer of the Original Code is
19 : * Netscape Communications Corporation.
20 : * Portions created by the Initial Developer are Copyright (C) 1998
21 : * the Initial Developer. All Rights Reserved.
22 : *
23 : * Contributor(s):
24 : *
25 : * Alternatively, the contents of this file may be used under the terms of
26 : * either of the GNU General Public License Version 2 or later (the "GPL"),
27 : * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 : * in which case the provisions of the GPL or the LGPL are applicable instead
29 : * of those above. If you wish to allow use of your version of this file only
30 : * under the terms of either the GPL or the LGPL, and not to allow others to
31 : * use your version of this file under the terms of the MPL, indicate your
32 : * decision by deleting the provisions above and replace them with the notice
33 : * and other provisions required by the GPL or the LGPL. If you do not delete
34 : * the provisions above, a recipient may use your version of this file under
35 : * the terms of any one of the MPL, the GPL or the LGPL.
36 : *
37 : * ***** END LICENSE BLOCK ***** */
38 :
39 : //#define __INCREMENTAL 1
40 :
41 : #include "nsScanner.h"
42 : #include "nsDebug.h"
43 : #include "nsIServiceManager.h"
44 : #include "nsICharsetConverterManager.h"
45 : #include "nsCharsetAlias.h"
46 : #include "nsReadableUtils.h"
47 : #include "nsIInputStream.h"
48 : #include "nsILocalFile.h"
49 : #include "nsNetUtil.h"
50 : #include "nsUTF8Utils.h" // for LossyConvertEncoding
51 : #include "nsCRT.h"
52 : #include "nsParser.h"
53 :
54 : // We replace NUL characters with this character.
55 : static PRUnichar sInvalid = UCS2_REPLACEMENT_CHAR;
56 :
57 60 : nsReadEndCondition::nsReadEndCondition(const PRUnichar* aTerminateChars) :
58 60 : mChars(aTerminateChars), mFilter(PRUnichar(~0)) // All bits set
59 : {
60 : // Build filter that will be used to filter out characters with
61 : // bits that none of the terminal chars have. This works very well
62 : // because terminal chars often have only the last 4-6 bits set and
63 : // normal ascii letters have bit 7 set. Other letters have even higher
64 : // bits set.
65 :
66 : // Calculate filter
67 60 : const PRUnichar *current = aTerminateChars;
68 60 : PRUnichar terminalChar = *current;
69 400 : while (terminalChar) {
70 280 : mFilter &= ~terminalChar;
71 280 : ++current;
72 280 : terminalChar = *current;
73 : }
74 60 : }
75 :
// Buffer size constant: 64 normally, 1 when __INCREMENTAL is defined.
// NOTE(review): nothing in this file appears to read kBufsize - confirm
// whether it is still needed.
#ifndef __INCREMENTAL
const int kBufsize = 64;
#else
const int kBufsize = 1;
#endif
81 :
82 : /**
83 : * Use this constructor if you want i/o to be based on
84 : * a single string you hand in during construction.
85 : * This short cut was added for Javascript.
86 : *
87 : * @update gess 5/12/98
88 : * @param aMode represents the parser mode (nav, other)
89 : * @return
90 : */
91 1 : nsScanner::nsScanner(const nsAString& anHTMLString, const nsACString& aCharset,
92 1 : PRInt32 aSource)
93 : {
94 1 : MOZ_COUNT_CTOR(nsScanner);
95 :
96 1 : mSlidingBuffer = nsnull;
97 1 : mCountRemaining = 0;
98 1 : mFirstNonWhitespacePosition = -1;
99 1 : if (AppendToBuffer(anHTMLString)) {
100 1 : mSlidingBuffer->BeginReading(mCurrentPosition);
101 : } else {
102 : /* XXX see hack below, re: bug 182067 */
103 0 : memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
104 0 : mEndPosition = mCurrentPosition;
105 : }
106 1 : mMarkPosition = mCurrentPosition;
107 1 : mIncremental = false;
108 1 : mUnicodeDecoder = 0;
109 1 : mCharsetSource = kCharsetUninitialized;
110 1 : mHasInvalidCharacter = false;
111 1 : mReplacementCharacter = PRUnichar(0x0);
112 1 : }
113 :
114 : /**
115 : * Use this constructor if you want i/o to be based on strings
116 : * the scanner receives. If you pass a null filename, you
117 : * can still provide data to the scanner via append.
118 : *
119 : * @update gess 5/12/98
120 : * @param aFilename --
121 : * @return
122 : */
123 3343 : nsScanner::nsScanner(nsString& aFilename,bool aCreateStream,
124 : const nsACString& aCharset, PRInt32 aSource)
125 3343 : : mFilename(aFilename)
126 : {
127 3343 : MOZ_COUNT_CTOR(nsScanner);
128 3343 : NS_ASSERTION(!aCreateStream, "This is always true.");
129 :
130 3343 : mSlidingBuffer = nsnull;
131 :
132 : // XXX This is a big hack. We need to initialize the iterators to something.
133 : // What matters is that mCurrentPosition == mEndPosition, so that our methods
134 : // believe that we are at EOF (see bug 182067). We null out mCurrentPosition
135 : // so that we have some hope of catching null pointer dereferences associated
136 : // with this hack. --darin
137 3343 : memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
138 3343 : mMarkPosition = mCurrentPosition;
139 3343 : mEndPosition = mCurrentPosition;
140 :
141 3343 : mIncremental = true;
142 3343 : mFirstNonWhitespacePosition = -1;
143 3343 : mCountRemaining = 0;
144 :
145 3343 : mUnicodeDecoder = 0;
146 3343 : mCharsetSource = kCharsetUninitialized;
147 3343 : mHasInvalidCharacter = false;
148 3343 : mReplacementCharacter = PRUnichar(0x0);
149 3343 : SetDocumentCharset(aCharset, aSource);
150 3343 : }
151 :
152 3934 : nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , PRInt32 aSource)
153 : {
154 3934 : if (aSource < mCharsetSource) // priority is lower the the current one , just
155 0 : return NS_OK;
156 :
157 3934 : nsresult res = NS_OK;
158 3934 : if (!mCharset.IsEmpty())
159 : {
160 : bool same;
161 591 : res = nsCharsetAlias::Equals(aCharset, mCharset, &same);
162 591 : if(NS_SUCCEEDED(res) && same)
163 : {
164 412 : return NS_OK; // no difference, don't change it
165 : }
166 : }
167 :
168 : // different, need to change it
169 7044 : nsCString charsetName;
170 3522 : res = nsCharsetAlias::GetPreferred(aCharset, charsetName);
171 :
172 3522 : if(NS_FAILED(res) && (mCharsetSource == kCharsetUninitialized))
173 : {
174 : // failed - unknown alias , fallback to ISO-8859-1
175 0 : mCharset.AssignLiteral("ISO-8859-1");
176 : }
177 : else
178 : {
179 3522 : mCharset.Assign(charsetName);
180 : }
181 :
182 3522 : mCharsetSource = aSource;
183 :
184 3522 : NS_ASSERTION(nsParser::GetCharsetConverterManager(),
185 : "Must have the charset converter manager!");
186 :
187 3522 : res = nsParser::GetCharsetConverterManager()->
188 3522 : GetUnicodeDecoderRaw(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
189 3522 : if (NS_SUCCEEDED(res) && mUnicodeDecoder)
190 : {
191 : // We need to detect conversion error of character to support XML
192 : // encoding error.
193 3522 : mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal);
194 : }
195 :
196 3522 : return res;
197 : }
198 :
199 :
200 : /**
201 : * default destructor
202 : *
203 : * @update gess 3/25/98
204 : * @param
205 : * @return
206 : */
207 6688 : nsScanner::~nsScanner() {
208 :
209 3344 : delete mSlidingBuffer;
210 :
211 3344 : MOZ_COUNT_DTOR(nsScanner);
212 3344 : }
213 :
214 : /**
215 : * Resets current offset position of input stream to marked position.
216 : * This allows us to back up to this point if the need should arise,
217 : * such as when tokenization gets interrupted.
218 : * NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST!
219 : *
220 : * @update gess 5/12/98
221 : * @param
222 : * @return
223 : */
224 6757 : void nsScanner::RewindToMark(void){
225 6757 : if (mSlidingBuffer) {
226 6703 : mCountRemaining += (Distance(mMarkPosition, mCurrentPosition));
227 6703 : mCurrentPosition = mMarkPosition;
228 : }
229 6757 : }
230 :
231 :
232 : /**
233 : * Records current offset position in input stream. This allows us
234 : * to back up to this point if the need should arise, such as when
235 : * tokenization gets interrupted.
236 : *
237 : * @update gess 7/29/98
238 : * @param
239 : * @return
240 : */
241 15651 : PRInt32 nsScanner::Mark() {
242 15651 : PRInt32 distance = 0;
243 15651 : if (mSlidingBuffer) {
244 15597 : nsScannerIterator oldStart;
245 15597 : mSlidingBuffer->BeginReading(oldStart);
246 :
247 15597 : distance = Distance(oldStart, mCurrentPosition);
248 :
249 15597 : mSlidingBuffer->DiscardPrefix(mCurrentPosition);
250 15597 : mSlidingBuffer->BeginReading(mCurrentPosition);
251 15597 : mMarkPosition = mCurrentPosition;
252 : }
253 :
254 15651 : return distance;
255 : }
256 :
257 : /**
258 : * Insert data to our underlying input buffer as
259 : * if it were read from an input stream.
260 : *
261 : * @update harishd 01/12/99
262 : * @return error code
263 : */
264 0 : bool nsScanner::UngetReadable(const nsAString& aBuffer) {
265 0 : if (!mSlidingBuffer) {
266 0 : return false;
267 : }
268 :
269 0 : mSlidingBuffer->UngetReadable(aBuffer,mCurrentPosition);
270 0 : mSlidingBuffer->BeginReading(mCurrentPosition); // Insertion invalidated our iterators
271 0 : mSlidingBuffer->EndReading(mEndPosition);
272 :
273 0 : PRUint32 length = aBuffer.Length();
274 0 : mCountRemaining += length; // Ref. bug 117441
275 0 : return true;
276 : }
277 :
278 : /**
279 : * Append data to our underlying input buffer as
280 : * if it were read from an input stream.
281 : *
282 : * @update gess4/3/98
283 : * @return error code
284 : */
285 3 : nsresult nsScanner::Append(const nsAString& aBuffer) {
286 3 : if (!AppendToBuffer(aBuffer))
287 0 : return NS_ERROR_OUT_OF_MEMORY;
288 3 : return NS_OK;
289 : }
290 :
291 : /**
292 : *
293 : *
294 : * @update gess 5/21/98
295 : * @param
296 : * @return
297 : */
298 3491 : nsresult nsScanner::Append(const char* aBuffer, PRUint32 aLen,
299 : nsIRequest *aRequest)
300 : {
301 3491 : nsresult res = NS_OK;
302 3491 : if (mUnicodeDecoder) {
303 3491 : PRInt32 unicharBufLen = 0;
304 3491 : mUnicodeDecoder->GetMaxLength(aBuffer, aLen, &unicharBufLen);
305 3491 : nsScannerString::Buffer* buffer = nsScannerString::AllocBuffer(unicharBufLen + 1);
306 3491 : NS_ENSURE_TRUE(buffer,NS_ERROR_OUT_OF_MEMORY);
307 3491 : PRUnichar *unichars = buffer->DataStart();
308 :
309 3491 : PRInt32 totalChars = 0;
310 3491 : PRInt32 unicharLength = unicharBufLen;
311 3491 : PRInt32 errorPos = -1;
312 :
313 3491 : do {
314 3491 : PRInt32 srcLength = aLen;
315 3491 : res = mUnicodeDecoder->Convert(aBuffer, &srcLength, unichars, &unicharLength);
316 :
317 3491 : totalChars += unicharLength;
318 : // Continuation of failure case
319 3491 : if(NS_FAILED(res)) {
320 : // if we failed, we consume one byte, replace it with the replacement
321 : // character and try the conversion again.
322 :
323 : // This is only needed because some decoders don't follow the
324 : // nsIUnicodeDecoder contract: they return a failure when *aDestLength
325 : // is 0 rather than the correct NS_OK_UDEC_MOREOUTPUT. See bug 244177
326 0 : if ((unichars + unicharLength) >= buffer->DataEnd()) {
327 0 : NS_ERROR("Unexpected end of destination buffer");
328 0 : break;
329 : }
330 :
331 0 : if (mReplacementCharacter == 0x0 && errorPos == -1) {
332 0 : errorPos = totalChars;
333 : }
334 0 : unichars[unicharLength++] = mReplacementCharacter == 0x0 ?
335 0 : mUnicodeDecoder->GetCharacterForUnMapped() :
336 0 : mReplacementCharacter;
337 :
338 0 : unichars = unichars + unicharLength;
339 0 : unicharLength = unicharBufLen - (++totalChars);
340 :
341 0 : mUnicodeDecoder->Reset();
342 :
343 0 : if(((PRUint32) (srcLength + 1)) > aLen) {
344 0 : srcLength = aLen;
345 : }
346 : else {
347 0 : ++srcLength;
348 : }
349 :
350 0 : aBuffer += srcLength;
351 0 : aLen -= srcLength;
352 : }
353 3491 : } while (NS_FAILED(res) && (aLen > 0));
354 :
355 3491 : buffer->SetDataLength(totalChars);
356 : // Don't propagate return code of unicode decoder
357 : // since it doesn't reflect on our success or failure
358 : // - Ref. bug 87110
359 3491 : res = NS_OK;
360 3491 : if (!AppendToBuffer(buffer, aRequest, errorPos))
361 0 : res = NS_ERROR_OUT_OF_MEMORY;
362 : }
363 : else {
364 0 : NS_WARNING("No decoder found.");
365 0 : res = NS_ERROR_FAILURE;
366 : }
367 :
368 3491 : return res;
369 : }
370 :
371 : /**
372 : * retrieve next char from scanners internal input stream
373 : *
374 : * @update gess 3/25/98
375 : * @param
376 : * @return error code reflecting read status
377 : */
378 5848 : nsresult nsScanner::GetChar(PRUnichar& aChar) {
379 5848 : if (!mSlidingBuffer || mCurrentPosition == mEndPosition) {
380 0 : aChar = 0;
381 0 : return kEOF;
382 : }
383 :
384 5848 : aChar = *mCurrentPosition++;
385 5848 : --mCountRemaining;
386 :
387 5848 : return NS_OK;
388 : }
389 :
390 :
391 : /**
392 : * peek ahead to consume next char from scanner's internal
393 : * input buffer
394 : *
395 : * @update gess 3/25/98
396 : * @param
397 : * @return
398 : */
399 16438 : nsresult nsScanner::Peek(PRUnichar& aChar, PRUint32 aOffset) {
400 16438 : aChar = 0;
401 :
402 16438 : if (!mSlidingBuffer || mCurrentPosition == mEndPosition) {
403 53 : return kEOF;
404 : }
405 :
406 16385 : if (aOffset > 0) {
407 1554 : if (mCountRemaining <= aOffset)
408 0 : return kEOF;
409 :
410 1554 : nsScannerIterator pos = mCurrentPosition;
411 1554 : pos.advance(aOffset);
412 1554 : aChar=*pos;
413 : }
414 : else {
415 14831 : aChar=*mCurrentPosition;
416 : }
417 :
418 16385 : return NS_OK;
419 : }
420 :
421 3366 : nsresult nsScanner::Peek(nsAString& aStr, PRInt32 aNumChars, PRInt32 aOffset)
422 : {
423 3366 : if (!mSlidingBuffer || mCurrentPosition == mEndPosition) {
424 54 : return kEOF;
425 : }
426 :
427 3312 : nsScannerIterator start, end;
428 :
429 3312 : start = mCurrentPosition;
430 :
431 3312 : if ((PRInt32)mCountRemaining <= aOffset) {
432 0 : return kEOF;
433 : }
434 :
435 3312 : if (aOffset > 0) {
436 0 : start.advance(aOffset);
437 : }
438 :
439 3312 : if (mCountRemaining < PRUint32(aNumChars + aOffset)) {
440 2644 : end = mEndPosition;
441 : }
442 : else {
443 668 : end = start;
444 668 : end.advance(aNumChars);
445 : }
446 :
447 3312 : CopyUnicodeTo(start, end, aStr);
448 :
449 3312 : return NS_OK;
450 : }
451 :
452 :
453 : /**
454 : * Skip whitespace on scanner input stream
455 : *
456 : * @update gess 3/25/98
457 : * @param
458 : * @return error status
459 : */
460 4798 : nsresult nsScanner::SkipWhitespace(PRInt32& aNewlinesSkipped) {
461 :
462 4798 : if (!mSlidingBuffer) {
463 0 : return kEOF;
464 : }
465 :
466 4798 : PRUnichar theChar = 0;
467 4798 : nsresult result = Peek(theChar);
468 :
469 4798 : if (NS_FAILED(result)) {
470 0 : return result;
471 : }
472 :
473 4798 : nsScannerIterator current = mCurrentPosition;
474 4798 : bool done = false;
475 4798 : bool skipped = false;
476 :
477 15126 : while (!done && current != mEndPosition) {
478 5530 : switch(theChar) {
479 : case '\n':
480 0 : case '\r': ++aNewlinesSkipped;
481 : case ' ' :
482 : case '\t':
483 : {
484 732 : skipped = true;
485 732 : PRUnichar thePrevChar = theChar;
486 732 : theChar = (++current != mEndPosition) ? *current : '\0';
487 732 : if ((thePrevChar == '\r' && theChar == '\n') ||
488 : (thePrevChar == '\n' && theChar == '\r')) {
489 0 : theChar = (++current != mEndPosition) ? *current : '\0'; // CRLF == LFCR => LF
490 : }
491 : }
492 732 : break;
493 : default:
494 4798 : done = true;
495 4798 : break;
496 : }
497 : }
498 :
499 4798 : if (skipped) {
500 732 : SetPosition(current);
501 732 : if (current == mEndPosition) {
502 0 : result = kEOF;
503 : }
504 : }
505 :
506 4798 : return result;
507 : }
508 :
509 : /**
510 : * Skip over chars as long as they equal given char
511 : *
512 : * @update gess 3/25/98
513 : * @param
514 : * @return error code
515 : */
516 0 : nsresult nsScanner::SkipOver(PRUnichar aSkipChar){
517 :
518 0 : if (!mSlidingBuffer) {
519 0 : return kEOF;
520 : }
521 :
522 0 : PRUnichar ch=0;
523 0 : nsresult result=NS_OK;
524 :
525 0 : while(NS_OK==result) {
526 0 : result=Peek(ch);
527 0 : if(NS_OK == result) {
528 0 : if(ch!=aSkipChar) {
529 0 : break;
530 : }
531 0 : GetChar(ch);
532 : }
533 0 : else break;
534 : } //while
535 0 : return result;
536 :
537 : }
538 :
539 : #if 0
540 : void DoErrTest(nsString& aString) {
541 : PRInt32 pos=aString.FindChar(0);
542 : if(kNotFound<pos) {
543 : if(aString.Length()-1!=pos) {
544 : }
545 : }
546 : }
547 :
548 : void DoErrTest(nsCString& aString) {
549 : PRInt32 pos=aString.FindChar(0);
550 : if(kNotFound<pos) {
551 : if(aString.Length()-1!=pos) {
552 : }
553 : }
554 : }
555 : #endif
556 :
557 : /**
558 : * Consume characters until you run into space, a '<', a '>', or a '/'.
559 : *
560 : * @param aString - receives new data from stream
561 : * @return error code
562 : */
563 1138 : nsresult nsScanner::ReadTagIdentifier(nsScannerSharedSubstring& aString) {
564 :
565 1138 : if (!mSlidingBuffer) {
566 0 : return kEOF;
567 : }
568 :
569 1138 : PRUnichar theChar=0;
570 1138 : nsresult result=Peek(theChar);
571 1138 : nsScannerIterator current, end;
572 1138 : bool found=false;
573 :
574 1138 : current = mCurrentPosition;
575 1138 : end = mEndPosition;
576 :
577 : // Loop until we find an illegal character. Everything is then appended
578 : // later.
579 5428 : while(current != end && !found) {
580 3152 : theChar=*current;
581 :
582 3152 : switch(theChar) {
583 : case '\n':
584 : case '\r':
585 : case ' ' :
586 : case '\t':
587 : case '\v':
588 : case '\f':
589 : case '<':
590 : case '>':
591 : case '/':
592 1138 : found = true;
593 1138 : break;
594 :
595 : case '\0':
596 0 : ReplaceCharacter(current, sInvalid);
597 0 : break;
598 :
599 : default:
600 2014 : break;
601 : }
602 :
603 3152 : if (!found) {
604 2014 : ++current;
605 : }
606 : }
607 :
608 : // Don't bother appending nothing.
609 1138 : if (current != mCurrentPosition) {
610 1138 : AppendUnicodeTo(mCurrentPosition, current, aString);
611 : }
612 :
613 1138 : SetPosition(current);
614 1138 : if (current == end) {
615 0 : result = kEOF;
616 : }
617 :
618 : //DoErrTest(aString);
619 :
620 1138 : return result;
621 : }
622 :
623 : /**
624 : * Consume characters until you run into a char that's not valid in an
625 : * entity name
626 : *
627 : * @param aString - receives new data from stream
628 : * @return error code
629 : */
630 11 : nsresult nsScanner::ReadEntityIdentifier(nsString& aString) {
631 :
632 11 : if (!mSlidingBuffer) {
633 0 : return kEOF;
634 : }
635 :
636 11 : PRUnichar theChar=0;
637 11 : nsresult result=Peek(theChar);
638 11 : nsScannerIterator origin, current, end;
639 11 : bool found=false;
640 :
641 11 : origin = mCurrentPosition;
642 11 : current = mCurrentPosition;
643 11 : end = mEndPosition;
644 :
645 55 : while(current != end) {
646 :
647 44 : theChar=*current;
648 44 : if(theChar) {
649 44 : found=false;
650 44 : switch(theChar) {
651 : case '_':
652 : case '-':
653 : case '.':
654 : // Don't allow ':' in entity names. See bug 23791
655 0 : found = true;
656 0 : break;
657 : default:
658 : found = ('a'<=theChar && theChar<='z') ||
659 : ('A'<=theChar && theChar<='Z') ||
660 44 : ('0'<=theChar && theChar<='9');
661 44 : break;
662 : }
663 :
664 44 : if(!found) {
665 11 : AppendUnicodeTo(mCurrentPosition, current, aString);
666 11 : break;
667 : }
668 : }
669 33 : ++current;
670 : }
671 :
672 11 : SetPosition(current);
673 11 : if (current == end) {
674 0 : AppendUnicodeTo(origin, current, aString);
675 0 : return kEOF;
676 : }
677 :
678 : //DoErrTest(aString);
679 :
680 11 : return result;
681 : }
682 :
683 : /**
684 : * Consume digits
685 : *
686 : * @param aString - should contain digits
687 : * @return error code
688 : */
689 0 : nsresult nsScanner::ReadNumber(nsString& aString,PRInt32 aBase) {
690 :
691 0 : if (!mSlidingBuffer) {
692 0 : return kEOF;
693 : }
694 :
695 0 : NS_ASSERTION(aBase == 10 || aBase == 16,"base value not supported");
696 :
697 0 : PRUnichar theChar=0;
698 0 : nsresult result=Peek(theChar);
699 0 : nsScannerIterator origin, current, end;
700 :
701 0 : origin = mCurrentPosition;
702 0 : current = origin;
703 0 : end = mEndPosition;
704 :
705 0 : bool done = false;
706 0 : while(current != end) {
707 0 : theChar=*current;
708 0 : if(theChar) {
709 : done = (theChar < '0' || theChar > '9') &&
710 : ((aBase == 16)? (theChar < 'A' || theChar > 'F') &&
711 : (theChar < 'a' || theChar > 'f')
712 0 : :true);
713 0 : if(done) {
714 0 : AppendUnicodeTo(origin, current, aString);
715 0 : break;
716 : }
717 : }
718 0 : ++current;
719 : }
720 :
721 0 : SetPosition(current);
722 0 : if (current == end) {
723 0 : AppendUnicodeTo(origin, current, aString);
724 0 : return kEOF;
725 : }
726 :
727 : //DoErrTest(aString);
728 :
729 0 : return result;
730 : }
731 :
732 : /**
733 : * Consume characters until you find the terminal char
734 : *
735 : * @update gess 3/25/98
736 : * @param aString receives new data from stream
737 : * @param addTerminal tells us whether to append terminal to aString
738 : * @return error code
739 : */
740 281 : nsresult nsScanner::ReadWhitespace(nsScannerSharedSubstring& aString,
741 : PRInt32& aNewlinesSkipped,
742 : bool& aHaveCR) {
743 :
744 281 : aHaveCR = false;
745 :
746 281 : if (!mSlidingBuffer) {
747 0 : return kEOF;
748 : }
749 :
750 281 : PRUnichar theChar = 0;
751 281 : nsresult result = Peek(theChar);
752 :
753 281 : if (NS_FAILED(result)) {
754 0 : return result;
755 : }
756 :
757 281 : nsScannerIterator origin, current, end;
758 281 : bool done = false;
759 :
760 281 : origin = mCurrentPosition;
761 281 : current = origin;
762 281 : end = mEndPosition;
763 :
764 281 : bool haveCR = false;
765 :
766 2475 : while(!done && current != end) {
767 1913 : switch(theChar) {
768 : case '\n':
769 : case '\r':
770 : {
771 0 : ++aNewlinesSkipped;
772 0 : PRUnichar thePrevChar = theChar;
773 0 : theChar = (++current != end) ? *current : '\0';
774 0 : if ((thePrevChar == '\r' && theChar == '\n') ||
775 : (thePrevChar == '\n' && theChar == '\r')) {
776 0 : theChar = (++current != end) ? *current : '\0'; // CRLF == LFCR => LF
777 0 : haveCR = true;
778 0 : } else if (thePrevChar == '\r') {
779 : // Lone CR becomes CRLF; callers should know to remove extra CRs
780 0 : AppendUnicodeTo(origin, current, aString);
781 0 : aString.writable().Append(PRUnichar('\n'));
782 0 : origin = current;
783 0 : haveCR = true;
784 : }
785 : }
786 0 : break;
787 : case ' ' :
788 : case '\t':
789 1632 : theChar = (++current != end) ? *current : '\0';
790 1632 : break;
791 : default:
792 281 : done = true;
793 281 : AppendUnicodeTo(origin, current, aString);
794 281 : break;
795 : }
796 : }
797 :
798 281 : SetPosition(current);
799 281 : if (current == end) {
800 0 : AppendUnicodeTo(origin, current, aString);
801 0 : result = kEOF;
802 : }
803 :
804 281 : aHaveCR = haveCR;
805 281 : return result;
806 : }
807 :
808 : //XXXbz callers of this have to manage their lone '\r' themselves if they want
809 : //it to work. Good thing they're all in view-source and it deals.
810 0 : nsresult nsScanner::ReadWhitespace(nsScannerIterator& aStart,
811 : nsScannerIterator& aEnd,
812 : PRInt32& aNewlinesSkipped) {
813 :
814 0 : if (!mSlidingBuffer) {
815 0 : return kEOF;
816 : }
817 :
818 0 : PRUnichar theChar = 0;
819 0 : nsresult result = Peek(theChar);
820 :
821 0 : if (NS_FAILED(result)) {
822 0 : return result;
823 : }
824 :
825 0 : nsScannerIterator origin, current, end;
826 0 : bool done = false;
827 :
828 0 : origin = mCurrentPosition;
829 0 : current = origin;
830 0 : end = mEndPosition;
831 :
832 0 : while(!done && current != end) {
833 0 : switch(theChar) {
834 : case '\n':
835 0 : case '\r': ++aNewlinesSkipped;
836 : case ' ' :
837 : case '\t':
838 : {
839 0 : PRUnichar thePrevChar = theChar;
840 0 : theChar = (++current != end) ? *current : '\0';
841 0 : if ((thePrevChar == '\r' && theChar == '\n') ||
842 : (thePrevChar == '\n' && theChar == '\r')) {
843 0 : theChar = (++current != end) ? *current : '\0'; // CRLF == LFCR => LF
844 : }
845 : }
846 0 : break;
847 : default:
848 0 : done = true;
849 0 : aStart = origin;
850 0 : aEnd = current;
851 0 : break;
852 : }
853 : }
854 :
855 0 : SetPosition(current);
856 0 : if (current == end) {
857 0 : aStart = origin;
858 0 : aEnd = current;
859 0 : result = kEOF;
860 : }
861 :
862 0 : return result;
863 : }
864 :
865 : /**
866 : * Consume characters until you encounter one contained in given
867 : * input set.
868 : *
869 : * @update gess 3/25/98
870 : * @param aString will contain the result of this method
871 : * @param aTerminalSet is an ordered string that contains
872 : * the set of INVALID characters
873 : * @return error code
874 : */
875 0 : nsresult nsScanner::ReadUntil(nsAString& aString,
876 : const nsReadEndCondition& aEndCondition,
877 : bool addTerminal)
878 : {
879 0 : if (!mSlidingBuffer) {
880 0 : return kEOF;
881 : }
882 :
883 0 : nsScannerIterator origin, current;
884 0 : const PRUnichar* setstart = aEndCondition.mChars;
885 : const PRUnichar* setcurrent;
886 :
887 0 : origin = mCurrentPosition;
888 0 : current = origin;
889 :
890 0 : PRUnichar theChar=0;
891 0 : nsresult result=Peek(theChar);
892 :
893 0 : if (NS_FAILED(result)) {
894 0 : return result;
895 : }
896 :
897 0 : while (current != mEndPosition) {
898 0 : theChar = *current;
899 0 : if (theChar == '\0') {
900 0 : ReplaceCharacter(current, sInvalid);
901 0 : theChar = sInvalid;
902 : }
903 :
904 : // Filter out completely wrong characters
905 : // Check if all bits are in the required area
906 0 : if(!(theChar & aEndCondition.mFilter)) {
907 : // They were. Do a thorough check.
908 :
909 0 : setcurrent = setstart;
910 0 : while (*setcurrent) {
911 0 : if (*setcurrent == theChar) {
912 0 : if(addTerminal)
913 0 : ++current;
914 0 : AppendUnicodeTo(origin, current, aString);
915 0 : SetPosition(current);
916 :
917 : //DoErrTest(aString);
918 :
919 0 : return NS_OK;
920 : }
921 0 : ++setcurrent;
922 : }
923 : }
924 :
925 0 : ++current;
926 : }
927 :
928 : // If we are here, we didn't find any terminator in the string and
929 : // current = mEndPosition
930 0 : SetPosition(current);
931 0 : AppendUnicodeTo(origin, current, aString);
932 0 : return kEOF;
933 : }
934 :
935 768 : nsresult nsScanner::ReadUntil(nsScannerSharedSubstring& aString,
936 : const nsReadEndCondition& aEndCondition,
937 : bool addTerminal)
938 : {
939 768 : if (!mSlidingBuffer) {
940 0 : return kEOF;
941 : }
942 :
943 768 : nsScannerIterator origin, current;
944 768 : const PRUnichar* setstart = aEndCondition.mChars;
945 : const PRUnichar* setcurrent;
946 :
947 768 : origin = mCurrentPosition;
948 768 : current = origin;
949 :
950 768 : PRUnichar theChar=0;
951 768 : nsresult result=Peek(theChar);
952 :
953 768 : if (NS_FAILED(result)) {
954 0 : return result;
955 : }
956 :
957 79105 : while (current != mEndPosition) {
958 78337 : theChar = *current;
959 78337 : if (theChar == '\0') {
960 0 : ReplaceCharacter(current, sInvalid);
961 0 : theChar = sInvalid;
962 : }
963 :
964 : // Filter out completely wrong characters
965 : // Check if all bits are in the required area
966 78337 : if(!(theChar & aEndCondition.mFilter)) {
967 : // They were. Do a thorough check.
968 :
969 4421 : setcurrent = setstart;
970 23540 : while (*setcurrent) {
971 15466 : if (*setcurrent == theChar) {
972 768 : if(addTerminal)
973 0 : ++current;
974 768 : AppendUnicodeTo(origin, current, aString);
975 768 : SetPosition(current);
976 :
977 : //DoErrTest(aString);
978 :
979 768 : return NS_OK;
980 : }
981 14698 : ++setcurrent;
982 : }
983 : }
984 :
985 77569 : ++current;
986 : }
987 :
988 : // If we are here, we didn't find any terminator in the string and
989 : // current = mEndPosition
990 0 : SetPosition(current);
991 0 : AppendUnicodeTo(origin, current, aString);
992 0 : return kEOF;
993 : }
994 :
995 1087 : nsresult nsScanner::ReadUntil(nsScannerIterator& aStart,
996 : nsScannerIterator& aEnd,
997 : const nsReadEndCondition &aEndCondition,
998 : bool addTerminal)
999 : {
1000 1087 : if (!mSlidingBuffer) {
1001 0 : return kEOF;
1002 : }
1003 :
1004 1087 : nsScannerIterator origin, current;
1005 1087 : const PRUnichar* setstart = aEndCondition.mChars;
1006 : const PRUnichar* setcurrent;
1007 :
1008 1087 : origin = mCurrentPosition;
1009 1087 : current = origin;
1010 :
1011 1087 : PRUnichar theChar=0;
1012 1087 : nsresult result=Peek(theChar);
1013 :
1014 1087 : if (NS_FAILED(result)) {
1015 0 : aStart = aEnd = current;
1016 0 : return result;
1017 : }
1018 :
1019 14056 : while (current != mEndPosition) {
1020 12969 : theChar = *current;
1021 12969 : if (theChar == '\0') {
1022 0 : ReplaceCharacter(current, sInvalid);
1023 0 : theChar = sInvalid;
1024 : }
1025 :
1026 : // Filter out completely wrong characters
1027 : // Check if all bits are in the required area
1028 12969 : if(!(theChar & aEndCondition.mFilter)) {
1029 : // They were. Do a thorough check.
1030 1954 : setcurrent = setstart;
1031 9783 : while (*setcurrent) {
1032 6962 : if (*setcurrent == theChar) {
1033 1087 : if(addTerminal)
1034 0 : ++current;
1035 1087 : aStart = origin;
1036 1087 : aEnd = current;
1037 1087 : SetPosition(current);
1038 :
1039 1087 : return NS_OK;
1040 : }
1041 5875 : ++setcurrent;
1042 : }
1043 : }
1044 :
1045 11882 : ++current;
1046 : }
1047 :
1048 : // If we are here, we didn't find any terminator in the string and
1049 : // current = mEndPosition
1050 0 : SetPosition(current);
1051 0 : aStart = origin;
1052 0 : aEnd = current;
1053 0 : return kEOF;
1054 : }
1055 :
1056 : /**
1057 : * Consumes chars until you see the given terminalChar
1058 : *
1059 : * @update gess 3/25/98
1060 : * @param
1061 : * @return error code
1062 : */
1063 0 : nsresult nsScanner::ReadUntil(nsAString& aString,
1064 : PRUnichar aTerminalChar,
1065 : bool addTerminal)
1066 : {
1067 0 : if (!mSlidingBuffer) {
1068 0 : return kEOF;
1069 : }
1070 :
1071 0 : nsScannerIterator origin, current;
1072 :
1073 0 : origin = mCurrentPosition;
1074 0 : current = origin;
1075 :
1076 : PRUnichar theChar;
1077 0 : nsresult result = Peek(theChar);
1078 :
1079 0 : if (NS_FAILED(result)) {
1080 0 : return result;
1081 : }
1082 :
1083 0 : while (current != mEndPosition) {
1084 0 : theChar = *current;
1085 0 : if (theChar == '\0') {
1086 0 : ReplaceCharacter(current, sInvalid);
1087 0 : theChar = sInvalid;
1088 : }
1089 :
1090 0 : if (aTerminalChar == theChar) {
1091 0 : if(addTerminal)
1092 0 : ++current;
1093 0 : AppendUnicodeTo(origin, current, aString);
1094 0 : SetPosition(current);
1095 0 : return NS_OK;
1096 : }
1097 0 : ++current;
1098 : }
1099 :
1100 : // If we are here, we didn't find any terminator in the string and
1101 : // current = mEndPosition
1102 0 : AppendUnicodeTo(origin, current, aString);
1103 0 : SetPosition(current);
1104 0 : return kEOF;
1105 :
1106 : }
1107 :
1108 1063 : void nsScanner::BindSubstring(nsScannerSubstring& aSubstring, const nsScannerIterator& aStart, const nsScannerIterator& aEnd)
1109 : {
1110 1063 : aSubstring.Rebind(*mSlidingBuffer, aStart, aEnd);
1111 1063 : }
1112 :
1113 8098 : void nsScanner::CurrentPosition(nsScannerIterator& aPosition)
1114 : {
1115 8098 : aPosition = mCurrentPosition;
1116 8098 : }
1117 :
1118 13749 : void nsScanner::EndReading(nsScannerIterator& aPosition)
1119 : {
1120 13749 : aPosition = mEndPosition;
1121 13749 : }
1122 :
1123 11268 : void nsScanner::SetPosition(nsScannerIterator& aPosition, bool aTerminate, bool aReverse)
1124 : {
1125 11268 : if (mSlidingBuffer) {
1126 : #ifdef DEBUG
1127 11268 : PRUint32 origRemaining = mCountRemaining;
1128 : #endif
1129 :
1130 11268 : if (aReverse) {
1131 306 : mCountRemaining += (Distance(aPosition, mCurrentPosition));
1132 : }
1133 : else {
1134 10962 : mCountRemaining -= (Distance(mCurrentPosition, aPosition));
1135 : }
1136 :
1137 11268 : NS_ASSERTION((mCountRemaining >= origRemaining && aReverse) ||
1138 : (mCountRemaining <= origRemaining && !aReverse),
1139 : "Improper use of nsScanner::SetPosition. Make sure to set the"
1140 : " aReverse parameter correctly");
1141 :
1142 11268 : mCurrentPosition = aPosition;
1143 11268 : if (aTerminate && (mCurrentPosition == mEndPosition)) {
1144 6630 : mMarkPosition = mCurrentPosition;
1145 6630 : mSlidingBuffer->DiscardPrefix(mCurrentPosition);
1146 : }
1147 : }
1148 11268 : }
1149 :
1150 0 : void nsScanner::ReplaceCharacter(nsScannerIterator& aPosition,
1151 : PRUnichar aChar)
1152 : {
1153 0 : if (mSlidingBuffer) {
1154 0 : mSlidingBuffer->ReplaceCharacter(aPosition, aChar);
1155 : }
1156 0 : }
1157 :
1158 3495 : bool nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf,
1159 : nsIRequest *aRequest,
1160 : PRInt32 aErrorPos)
1161 : {
1162 3495 : PRUint32 countRemaining = mCountRemaining;
1163 3495 : if (!mSlidingBuffer) {
1164 3288 : mSlidingBuffer = new nsScannerString(aBuf);
1165 3288 : if (!mSlidingBuffer)
1166 0 : return false;
1167 3288 : mSlidingBuffer->BeginReading(mCurrentPosition);
1168 3288 : mMarkPosition = mCurrentPosition;
1169 3288 : mSlidingBuffer->EndReading(mEndPosition);
1170 3288 : mCountRemaining = aBuf->DataLength();
1171 : }
1172 : else {
1173 207 : mSlidingBuffer->AppendBuffer(aBuf);
1174 207 : if (mCurrentPosition == mEndPosition) {
1175 134 : mSlidingBuffer->BeginReading(mCurrentPosition);
1176 : }
1177 207 : mSlidingBuffer->EndReading(mEndPosition);
1178 207 : mCountRemaining += aBuf->DataLength();
1179 : }
1180 :
1181 3495 : if (aErrorPos != -1 && !mHasInvalidCharacter) {
1182 0 : mHasInvalidCharacter = true;
1183 0 : mFirstInvalidPosition = mCurrentPosition;
1184 0 : mFirstInvalidPosition.advance(countRemaining + aErrorPos);
1185 : }
1186 :
1187 3495 : if (mFirstNonWhitespacePosition == -1) {
1188 3289 : nsScannerIterator iter(mCurrentPosition);
1189 3289 : nsScannerIterator end(mEndPosition);
1190 :
1191 6579 : while (iter != end) {
1192 3288 : if (!nsCRT::IsAsciiSpace(*iter)) {
1193 3287 : mFirstNonWhitespacePosition = Distance(mCurrentPosition, iter);
1194 :
1195 3287 : break;
1196 : }
1197 :
1198 1 : ++iter;
1199 : }
1200 : }
1201 3495 : return true;
1202 : }
1203 :
1204 : /**
1205 : * call this to copy bytes out of the scanner that have not yet been consumed
1206 : * by the tokenization process.
1207 : *
1208 : * @update gess 5/12/98
1209 : * @param aCopyBuffer is where the scanner buffer will be copied to
1210 : * @return nada
1211 : */
1212 0 : void nsScanner::CopyUnusedData(nsString& aCopyBuffer) {
1213 0 : if (!mSlidingBuffer) {
1214 0 : aCopyBuffer.Truncate();
1215 0 : return;
1216 : }
1217 :
1218 0 : nsScannerIterator start, end;
1219 0 : start = mCurrentPosition;
1220 0 : end = mEndPosition;
1221 :
1222 0 : CopyUnicodeTo(start, end, aCopyBuffer);
1223 : }
1224 :
1225 : /**
1226 : * Retrieve the name of the file that the scanner is reading from.
1227 : * In some cases, it's just a given name, because the scanner isn't
1228 : * really reading from a file.
1229 : *
1230 : * @update gess 5/12/98
1231 : * @return
1232 : */
1233 10099 : nsString& nsScanner::GetFilename(void) {
1234 10099 : return mFilename;
1235 : }
1236 :
1237 : /**
1238 : * Conduct self test. Actually, selftesting for this class
1239 : * occurs in the parser selftest.
1240 : *
1241 : * @update gess 3/25/98
1242 : * @param
1243 : * @return
1244 : */
1245 :
1246 0 : void nsScanner::SelfTest(void) {
1247 : #ifdef _DEBUG
1248 : #endif
1249 0 : }
1250 :
1251 3314 : void nsScanner::OverrideReplacementCharacter(PRUnichar aReplacementCharacter)
1252 : {
1253 3314 : mReplacementCharacter = aReplacementCharacter;
1254 :
1255 3314 : if (mHasInvalidCharacter) {
1256 0 : ReplaceCharacter(mFirstInvalidPosition, mReplacementCharacter);
1257 : }
1258 3314 : }
1259 :
|