1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sw=4 et tw=99 ft=cpp:
3 : *
4 : * ***** BEGIN LICENSE BLOCK *****
5 : * Copyright (C) 2009 Apple Inc. All rights reserved.
6 : *
7 : * Redistribution and use in source and binary forms, with or without
8 : * modification, are permitted provided that the following conditions
9 : * are met:
10 : * 1. Redistributions of source code must retain the above copyright
11 : * notice, this list of conditions and the following disclaimer.
12 : * 2. Redistributions in binary form must reproduce the above copyright
13 : * notice, this list of conditions and the following disclaimer in the
14 : * documentation and/or other materials provided with the distribution.
15 : *
16 : * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
17 : * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 : * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 : * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
20 : * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 : * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 : * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 : * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 : * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 : *
28 : * ***** END LICENSE BLOCK ***** */
29 :
30 : #ifndef YarrParser_h
31 : #define YarrParser_h
32 :
33 : #include "Yarr.h"
34 :
35 : namespace JSC { namespace Yarr {
36 :
37 : #define REGEXP_ERROR_PREFIX "Invalid regular expression: "
38 :
// Identifies the predefined character classes that the parser reports to a
// delegate through atomBuiltInCharacterClass() / atomCharacterClassBuiltIn().
39 : enum BuiltInCharacterClassID {
40 : DigitClassID, // reported for the \d and \D escapes
41 : SpaceClassID, // reported for the \s and \S escapes
42 : WordClassID, // reported for the \w and \W escapes
43 : NewlineClassID // reported (inverted) for the '.' atom
44 : };
45 :
46 : // The Parser class should not be used directly - only via the Yarr::parse() method.
47 : template<class Delegate>
48 : class Parser {
49 : private:
50 : template<class FriendDelegate>
51 : friend ErrorCode parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
52 :
53 : /*
54 : * CharacterClassParserDelegate:
55 : *
56 : * The class CharacterClassParserDelegate is used in the parsing of character
57 : * classes. This class handles detection of character ranges. This class
58 : * implements enough of the delegate interface such that it can be passed to
59 : * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
60 : * to perform the parsing of escape characters in character sets.
61 : */
62 : class CharacterClassParserDelegate {
63 : public:
64 48926 : CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
65 : : m_delegate(delegate)
66 : , m_err(err)
67 : , m_state(Empty)
68 48926 : , m_character(0)
69 : {
70 48926 : }
71 :
72 : /*
73 : * begin():
74 : *
75 : * Called at beginning of construction.
76 : */
77 48926 : void begin(bool invert)
78 : {
79 48926 : m_delegate.atomCharacterClassBegin(invert);
80 48926 : }
81 :
82 : /*
83 : * atomPatternCharacter():
84 : *
85 : * This method is called either from parseCharacterClass() (for an unescaped
86 : * character in a character class), or from parseEscape(). In the former case
87 : * the value true will be passed for the argument 'hyphenIsRange', and in this
88 : * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
89 : * is different to /[a\-z]/).
90 : */
91 267603 : void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
92 : {
93 267603 : switch (m_state) {
94 : case AfterCharacterClass:
95 : // Following a builtin character class we need look out for a hyphen.
96 : // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
97 : // If we see a hyphen following a charater class then unlike usual
98 : // we'll report it to the delegate immediately, and put ourself into
99 : // a poisoned state. Any following calls to add another character or
100 : // character class will result in an error. (A hypen following a
101 : // character-class is itself valid, but only at the end of a regex).
102 1193 : if (hyphenIsRange && ch == '-') {
103 162 : m_delegate.atomCharacterClassAtom('-');
104 162 : m_state = AfterCharacterClassHyphen;
105 162 : return;
106 : }
107 : // Otherwise just fall through - cached character so treat this as Empty.
108 :
109 : case Empty:
110 89842 : m_character = ch;
111 89842 : m_state = CachedCharacter;
112 89842 : return;
113 :
114 : case CachedCharacter:
115 110560 : if (hyphenIsRange && ch == '-')
116 67211 : m_state = CachedCharacterHyphen;
117 : else {
118 43349 : m_delegate.atomCharacterClassAtom(m_character);
119 43349 : m_character = ch;
120 : }
121 110560 : return;
122 :
123 : case CachedCharacterHyphen:
124 66994 : if (ch < m_character) {
125 9 : m_err = CharacterClassOutOfOrder;
126 9 : return;
127 : }
128 66985 : m_delegate.atomCharacterClassRange(m_character, ch);
129 66985 : m_state = Empty;
130 66985 : return;
131 :
132 : case AfterCharacterClassHyphen:
133 45 : m_delegate.atomCharacterClassAtom(ch);
134 45 : m_state = Empty;
135 45 : return;
136 : }
137 : }
138 :
139 : /*
140 : * atomBuiltInCharacterClass():
141 : *
142 : * Adds a built-in character class, called by parseEscape().
143 : */
144 1686 : void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
145 : {
146 1686 : switch (m_state) {
147 : case CachedCharacter:
148 : // Flush the currently cached character, then fall through.
149 239 : m_delegate.atomCharacterClassAtom(m_character);
150 :
151 : case Empty:
152 : case AfterCharacterClass:
153 1614 : m_state = AfterCharacterClass;
154 1614 : m_delegate.atomCharacterClassBuiltIn(classID, invert);
155 1614 : return;
156 :
157 : case CachedCharacterHyphen:
158 : // Error! We have a range that looks like [x-\d]. We require
159 : // the end of the range to be a single character.
160 45 : m_err = CharacterClassInvalidRange;
161 45 : return;
162 :
163 : case AfterCharacterClassHyphen:
164 27 : m_delegate.atomCharacterClassBuiltIn(classID, invert);
165 27 : m_state = Empty;
166 27 : return;
167 : }
168 : }
169 :
170 : /*
171 : * end():
172 : *
173 : * Called at end of construction.
174 : */
175 48870 : void end()
176 : {
177 48870 : if (m_state == CachedCharacter)
178 22392 : m_delegate.atomCharacterClassAtom(m_character);
179 26478 : else if (m_state == CachedCharacterHyphen) {
180 172 : m_delegate.atomCharacterClassAtom(m_character);
181 172 : m_delegate.atomCharacterClassAtom('-');
182 : }
183 48870 : m_delegate.atomCharacterClassEnd();
184 48870 : }
185 :
186 : // parseEscape() should never call these delegate methods when
187 : // invoked with inCharacterClass set.
188 : void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
189 : void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
190 :
191 : private:
192 : Delegate& m_delegate;
193 : ErrorCode& m_err;
194 : enum CharacterClassConstructionState {
195 : Empty,
196 : CachedCharacter,
197 : CachedCharacterHyphen,
198 : AfterCharacterClass,
199 : AfterCharacterClassHyphen
200 : } m_state;
201 : UChar m_character;
202 : };
203 :
204 98052 : Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
205 : : m_delegate(delegate)
206 : , m_backReferenceLimit(backReferenceLimit)
207 : , m_err(NoError)
208 : , m_data(pattern.chars())
209 : , m_size(pattern.length())
210 : , m_index(0)
211 98052 : , m_parenthesesNestingDepth(0)
212 : {
213 98052 : }
214 :
215 : /*
216 : * parseEscape():
217 : *
218 : * Helper for parseTokens() AND parseCharacterClass().
219 : * Unlike the other parser methods, this function does not report tokens
220 : * directly to the member delegate (m_delegate), instead tokens are
221 : * emitted to the delegate provided as an argument. In the case of atom
222 : * escapes, parseTokens() will call parseEscape() passing m_delegate as
223 : * an argument, and as such the escape will be reported to the delegate.
224 : *
225 : * However this method may also be used by parseCharacterClass(), in which
226 : * case a CharacterClassParserDelegate will be passed as the delegate that
227 : * tokens should be added to. A boolean flag is also provided to indicate
228 : * whether an escape in a CharacterClass is being parsed (some parsing
229 : * rules change in this context).
230 : *
231 : * The boolean value returned by this method indicates whether the token
232 : * parsed was an atom (outside of a character class \b and \B will be
233 : * interpreted as assertions).
234 : */
// Parses one escape sequence, starting at the backslash under the cursor, and
// reports it to 'delegate'. Returns false for an unterminated escape (m_err is
// set to EscapeUnterminated) and for \b / \B outside a character class (these
// are reported as word-boundary assertions); returns true on every other path.
235 : template<bool inCharacterClass, class EscapeDelegate>
236 107498 : bool parseEscape(EscapeDelegate& delegate)
237 : {
238 107498 : ASSERT(!m_err);
239 107498 : ASSERT(peek() == '\\');
240 107498 : consume();
241 :
// A lone backslash at the very end of the pattern is an error.
242 107498 : if (atEndOfPattern()) {
243 18 : m_err = EscapeUnterminated;
244 18 : return false;
245 : }
246 :
247 107480 : switch (peek()) {
248 : // Assertions
249 : case 'b':
250 1026 : consume();
// Inside a character class \b is the backspace character ('\b'), not an assertion.
251 : if (inCharacterClass)
252 54 : delegate.atomPatternCharacter('\b');
253 : else {
254 972 : delegate.assertionWordBoundary(false);
255 972 : return false;
256 : }
257 54 : break;
258 : case 'B':
259 54 : consume();
// Inside a character class \B is reported as the literal letter 'B'.
260 : if (inCharacterClass)
261 54 : delegate.atomPatternCharacter('B');
262 : else {
263 0 : delegate.assertionWordBoundary(true);
264 0 : return false;
265 : }
266 54 : break;
267 :
268 : // CharacterClassEscape
269 : case 'd':
270 29741 : consume();
271 29741 : delegate.atomBuiltInCharacterClass(DigitClassID, false);
272 29741 : break;
273 : case 's':
274 2755 : consume();
275 2755 : delegate.atomBuiltInCharacterClass(SpaceClassID, false);
276 2755 : break;
277 : case 'w':
278 2251 : consume();
279 2251 : delegate.atomBuiltInCharacterClass(WordClassID, false);
280 2251 : break;
// The upper-case forms report the same class IDs with invert set to true.
281 : case 'D':
282 44 : consume();
283 44 : delegate.atomBuiltInCharacterClass(DigitClassID, true);
284 44 : break;
285 : case 'S':
286 146 : consume();
287 146 : delegate.atomBuiltInCharacterClass(SpaceClassID, true);
288 146 : break;
289 : case 'W':
290 72 : consume();
291 72 : delegate.atomBuiltInCharacterClass(WordClassID, true);
292 72 : break;
293 :
294 : // DecimalEscape
295 : case '1':
296 : case '2':
297 : case '3':
298 : case '4':
299 : case '5':
300 : case '6':
301 : case '7':
302 : case '8':
303 : case '9': {
304 : // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
305 : // First, try to parse this as backreference.
306 : if (!inCharacterClass) {
// Remember the position of the first digit so the number can be re-read below.
307 36 : ParseState state = saveState();
308 :
309 : unsigned backReference;
// consumeNumber() has already set m_err when it returns false.
310 36 : if (!consumeNumber(backReference))
311 0 : break;
312 36 : if (backReference <= m_backReferenceLimit) {
313 36 : delegate.atomBackReference(backReference);
314 36 : break;
315 : }
316 :
// The number exceeds the back-reference limit: rewind to the first digit.
317 0 : restoreState(state);
318 : }
319 :
320 : // Not a backreference, and not octal.
// Only the backslash is reported as a literal; the '8' or '9' is left
// unconsumed for the caller to parse as an ordinary character.
321 0 : if (peek() >= '8') {
322 0 : delegate.atomPatternCharacter('\\');
323 0 : break;
324 : }
325 :
326 : // Fall-through to handle this as an octal escape.
327 : }
328 :
329 : // Octal escape
// consumeOctal() consumes the digit(s) itself, so there is no consume() here.
330 : case '0':
331 22 : delegate.atomPatternCharacter(consumeOctal());
332 22 : break;
333 :
334 : // ControlEscape
335 : case 'f':
336 18 : consume();
337 18 : delegate.atomPatternCharacter('\f');
338 18 : break;
339 : case 'n':
340 5050 : consume();
341 5050 : delegate.atomPatternCharacter('\n');
342 5050 : break;
343 : case 'r':
344 3484 : consume();
345 3484 : delegate.atomPatternCharacter('\r');
346 3484 : break;
347 : case 't':
348 6105 : consume();
349 6105 : delegate.atomPatternCharacter('\t');
350 6105 : break;
351 : case 'v':
352 18 : consume();
353 18 : delegate.atomPatternCharacter('\v');
354 18 : break;
355 :
356 : // ControlLetter
357 : case 'c': {
// Saved with the cursor on the 'c', so a failed match can rewind to it.
358 0 : ParseState state = saveState();
359 0 : consume();
360 0 : if (!atEndOfPattern()) {
361 0 : int control = consume();
362 :
363 : // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
364 0 : if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
// The control character is the low five bits of the letter.
365 0 : delegate.atomPatternCharacter(control & 0x1f);
366 0 : break;
367 : }
368 : }
// No valid control letter followed: report just the backslash as a literal
// and leave the 'c' unconsumed for the caller to parse.
369 0 : restoreState(state);
370 0 : delegate.atomPatternCharacter('\\');
371 0 : break;
372 : }
373 :
374 : // HexEscape
375 : case 'x': {
376 455 : consume();
// tryConsumeHex() rewinds past any digits it read when it returns -1, so on
// failure only the 'x' has been consumed and it is reported as a literal.
377 455 : int x = tryConsumeHex(2);
378 455 : if (x == -1)
379 0 : delegate.atomPatternCharacter('x');
380 : else
381 455 : delegate.atomPatternCharacter(x);
382 455 : break;
383 : }
384 :
385 : // UnicodeEscape
386 : case 'u': {
387 4084 : consume();
// Same scheme as \x, but with four hex digits.
388 4084 : int u = tryConsumeHex(4);
389 4084 : if (u == -1)
390 0 : delegate.atomPatternCharacter('u');
391 : else
392 4084 : delegate.atomPatternCharacter(u);
393 4084 : break;
394 : }
395 :
396 : // IdentityEscape
// Any other escaped character is reported as itself.
397 : default:
398 52119 : delegate.atomPatternCharacter(consume());
399 : }
400 :
401 106508 : return true;
402 : }
403 :
404 : /*
405 : * parseAtomEscape(), parseCharacterClassEscape():
406 : *
407 : * These methods alias to parseEscape().
408 : */
409 87330 : bool parseAtomEscape()
410 : {
411 87330 : return parseEscape<false>(m_delegate);
412 : }
413 20168 : void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
414 : {
415 20168 : parseEscape<true>(delegate);
416 20168 : }
417 :
418 : /*
419 : * parseCharacterClass():
420 : *
421 : * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
422 : * to an instance of CharacterClassParserDelegate, to describe the character class to the
423 : * delegate.
424 : */
425 48926 : void parseCharacterClass()
426 : {
427 48926 : ASSERT(!m_err);
428 48926 : ASSERT(peek() == '[');
429 48926 : consume();
430 :
431 48926 : CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
432 :
433 48926 : characterClassConstructor.begin(tryConsume('^'));
434 :
435 48926 : while (!atEndOfPattern()) {
436 318159 : switch (peek()) {
437 : case ']':
438 48870 : consume();
439 48870 : characterClassConstructor.end();
440 48870 : return;
441 :
442 : case '\\':
443 20168 : parseCharacterClassEscape(characterClassConstructor);
444 20168 : break;
445 :
446 : default:
447 249121 : characterClassConstructor.atomPatternCharacter(consume(), true);
448 : }
449 :
450 269289 : if (m_err)
451 54 : return;
452 : }
453 :
454 2 : m_err = CharacterClassUnmatched;
455 : }
456 :
457 : /*
458 : * parseParenthesesBegin():
459 : *
460 : * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
461 : */
462 62028 : void parseParenthesesBegin()
463 : {
464 62028 : ASSERT(!m_err);
465 62028 : ASSERT(peek() == '(');
466 62028 : consume();
467 :
468 62028 : if (tryConsume('?')) {
469 22067 : if (atEndOfPattern()) {
470 0 : m_err = ParenthesesTypeInvalid;
471 0 : return;
472 : }
473 :
474 22067 : switch (consume()) {
475 : case ':':
476 21923 : m_delegate.atomParenthesesSubpatternBegin(false);
477 21923 : break;
478 :
479 : case '=':
480 72 : m_delegate.atomParentheticalAssertionBegin();
481 72 : break;
482 :
483 : case '!':
484 72 : m_delegate.atomParentheticalAssertionBegin(true);
485 72 : break;
486 :
487 : default:
488 0 : m_err = ParenthesesTypeInvalid;
489 : }
490 : } else
491 39961 : m_delegate.atomParenthesesSubpatternBegin();
492 :
493 62028 : ++m_parenthesesNestingDepth;
494 : }
495 :
496 : /*
497 : * parseParenthesesEnd():
498 : *
499 : * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
500 : */
501 62028 : void parseParenthesesEnd()
502 : {
503 62028 : ASSERT(!m_err);
504 62028 : ASSERT(peek() == ')');
505 62028 : consume();
506 :
507 62028 : if (m_parenthesesNestingDepth > 0)
508 62028 : m_delegate.atomParenthesesEnd();
509 : else
510 0 : m_err = ParenthesesUnmatched;
511 :
512 62028 : --m_parenthesesNestingDepth;
513 62028 : }
514 :
515 : /*
516 : * parseQuantifier():
517 : *
518 : * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
519 : */
520 96382 : void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
521 : {
522 96382 : ASSERT(!m_err);
523 96382 : ASSERT(min <= max);
524 :
525 96382 : if (min == unsigned(-1)) {
526 9 : m_err = QuantifierTooLarge;
527 9 : return;
528 : }
529 :
530 96373 : if (lastTokenWasAnAtom)
531 96364 : m_delegate.quantifyAtom(min, max, !tryConsume('?'));
532 : else
533 9 : m_err = QuantifierWithoutAtom;
534 : }
535 :
536 : /*
537 : * parseTokens():
538 : *
539 : * This method loops over the input pattern reporting tokens to the delegate.
540 : * The method returns when a parse error is detected, or the end of the pattern
541 : * is reached. One piece of state is tracked around the loop, which is whether
542 : * the last token passed to the delegate was an atom (this is necessary to detect
543 : * a parse error when a quantifier is provided without an atom to quantify).
544 : */
// Main tokenising loop: dispatches on the character under the cursor until the
// pattern is exhausted or m_err is set. If groups are still open when the end
// of the pattern is reached, m_err is set to MissingParentheses.
545 98052 : void parseTokens()
546 : {
// True when the most recently reported token is something a quantifier may follow.
547 98052 : bool lastTokenWasAnAtom = false;
548 :
549 1021475 : while (!atEndOfPattern()) {
550 825463 : switch (peek()) {
551 : case '|':
552 13041 : consume();
553 13041 : m_delegate.disjunction();
554 13041 : lastTokenWasAnAtom = false;
555 13041 : break;
556 :
// An opening parenthesis cannot itself be quantified...
557 : case '(':
558 62028 : parseParenthesesBegin();
559 62028 : lastTokenWasAnAtom = false;
560 62028 : break;
561 :
// ...but a closed group can.
562 : case ')':
563 62028 : parseParenthesesEnd();
564 62028 : lastTokenWasAnAtom = true;
565 62028 : break;
566 :
567 : case '^':
568 28371 : consume();
569 28371 : m_delegate.assertionBOL();
570 28371 : lastTokenWasAnAtom = false;
571 28371 : break;
572 :
573 : case '$':
574 27881 : consume();
575 27881 : m_delegate.assertionEOL();
576 27881 : lastTokenWasAnAtom = false;
577 27881 : break;
578 :
// '.' is reported as the inverted newline class.
579 : case '.':
580 8795 : consume();
581 8795 : m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
582 8795 : lastTokenWasAnAtom = true;
583 8795 : break;
584 :
585 : case '[':
586 48926 : parseCharacterClass();
587 48926 : lastTokenWasAnAtom = true;
588 48926 : break;
589 :
// parseAtomEscape() returns false for \b and \B, which are assertions rather than atoms.
590 : case '\\':
591 87330 : lastTokenWasAnAtom = parseAtomEscape();
592 87330 : break;
593 :
// After any quantifier the flag is cleared, so a second quantifier in a row is
// reported as QuantifierWithoutAtom by parseQuantifier().
594 : case '*':
595 19497 : consume();
596 19497 : parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
597 19497 : lastTokenWasAnAtom = false;
598 19497 : break;
599 :
600 : case '+':
601 52839 : consume();
602 52839 : parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
603 52839 : lastTokenWasAnAtom = false;
604 52839 : break;
605 :
606 : case '?':
607 14329 : consume();
608 14329 : parseQuantifier(lastTokenWasAnAtom, 0, 1);
609 14329 : lastTokenWasAnAtom = false;
610 14329 : break;
611 :
612 : case '{': {
// Saved with the cursor on the '{' so an incomplete quantifier can be re-read as a literal.
613 9717 : ParseState state = saveState();
614 :
615 9717 : consume();
616 9717 : if (peekIsDigit()) {
617 : unsigned min;
// consumeNumber() has already set m_err when it returns false; the check
// after the switch then ends the loop.
618 9717 : if (!consumeNumber(min))
619 0 : break;
// The {n} form: max equals min unless a comma follows.
620 9717 : unsigned max = min;
621 :
622 9717 : if (tryConsume(',')) {
623 874 : if (peekIsDigit()) {
624 302 : if (!consumeNumber(max))
625 0 : break;
626 : } else {
// The {n,} form: no upper bound.
627 572 : max = quantifyInfinite;
628 : }
629 : }
630 :
631 9717 : if (tryConsume('}')) {
632 9717 : if (min <= max)
633 9717 : parseQuantifier(lastTokenWasAnAtom, min, max);
634 : else
635 0 : m_err = QuantifierOutOfOrder;
636 9717 : lastTokenWasAnAtom = false;
637 9717 : break;
638 : }
639 : }
640 :
// Rewind to the '{' so the default case below consumes it as a literal character.
641 0 : restoreState(state);
642 : } // if we did not find a complete quantifier, fall through to the default case.
643 :
644 : default:
645 390681 : m_delegate.atomPatternCharacter(consume());
646 390681 : lastTokenWasAnAtom = true;
647 : }
648 :
// Stop at the first error recorded by any of the helpers above.
649 825463 : if (m_err)
650 92 : return;
651 : }
652 :
// End of pattern reached with at least one group still open.
653 97960 : if (m_parenthesesNestingDepth > 0)
654 0 : m_err = MissingParentheses;
655 : }
656 :
657 : /*
658 : * parse():
659 : *
660 : * This method calls parseTokens() to parse over the input and converts any
661 : * error code to a const char* for a result.
662 : */
663 98052 : ErrorCode parse()
664 : {
665 98052 : if (m_size > MAX_PATTERN_SIZE)
666 0 : m_err = PatternTooLarge;
667 : else
668 98052 : parseTokens();
669 98052 : ASSERT(atEndOfPattern() || m_err);
670 :
671 98052 : return m_err;
672 : }
673 :
674 :
675 : // Misc helper functions:
676 :
677 : typedef unsigned ParseState;
678 :
// Returns the current cursor position, for a later restoreState() call.
679 14292 : ParseState saveState()
680 : {
681 14292 : return m_index;
682 : }
683 :
// Moves the cursor back to a position previously obtained from saveState().
// Only m_index is restored; m_err and the nesting depth are left untouched.
684 0 : void restoreState(ParseState state)
685 : {
686 0 : m_index = state;
687 0 : }
688 :
689 1749846 : bool atEndOfPattern()
690 : {
691 1749846 : ASSERT(m_index <= m_size);
692 1749846 : return m_index == m_size;
693 : }
694 :
695 1588448 : int peek()
696 : {
697 1588448 : ASSERT(m_index < m_size);
698 1588448 : return m_data[m_index];
699 : }
700 :
701 36625 : bool peekIsDigit()
702 : {
703 36625 : return !atEndOfPattern() && WTF::isASCIIDigit(peek());
704 : }
705 :
706 2951 : unsigned peekDigit()
707 : {
708 2951 : ASSERT(peekIsDigit());
709 2951 : return peek() - '0';
710 : }
711 :
712 1303385 : int consume()
713 : {
714 1303385 : ASSERT(m_index < m_size);
715 1303385 : return m_data[m_index++];
716 : }
717 :
718 10077 : unsigned consumeDigit()
719 : {
720 10077 : ASSERT(peekIsDigit());
721 10077 : return consume() - '0';
722 : }
723 :
724 10055 : bool consumeNumber(unsigned &accum)
725 : {
726 10055 : accum = consumeDigit();
727 23061 : while (peekIsDigit()) {
728 2951 : unsigned newValue = accum * 10 + peekDigit();
729 2951 : if (newValue < accum) { /* Overflow check. */
730 0 : m_err = QuantifierTooLarge;
731 0 : return false;
732 : }
733 2951 : accum = newValue;
734 2951 : consume();
735 : }
736 10055 : return true;
737 : }
738 :
739 22 : unsigned consumeOctal()
740 : {
741 22 : ASSERT(WTF::isASCIIOctalDigit(peek()));
742 :
743 22 : unsigned n = consumeDigit();
744 44 : while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
745 0 : n = n * 8 + consumeDigit();
746 22 : return n;
747 : }
748 :
749 226752 : bool tryConsume(UChar ch)
750 : {
751 226752 : if (atEndOfPattern() || (m_data[m_index] != ch))
752 189889 : return false;
753 36863 : ++m_index;
754 36863 : return true;
755 : }
756 :
757 4539 : int tryConsumeHex(int count)
758 : {
759 4539 : ParseState state = saveState();
760 :
761 4539 : int n = 0;
762 26324 : while (count--) {
763 17246 : if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
764 0 : restoreState(state);
765 0 : return -1;
766 : }
767 17246 : n = (n << 4) | WTF::toASCIIHexValue(consume());
768 : }
769 4539 : return n;
770 : }
771 :
772 : Delegate& m_delegate;
773 : unsigned m_backReferenceLimit;
774 : ErrorCode m_err;
775 : const UChar* m_data;
776 : unsigned m_size;
777 : unsigned m_index;
778 : unsigned m_parenthesesNestingDepth;
779 :
780 : // Derived by empirical testing of compile time in PCRE and WREC.
781 : static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
782 : };
783 :
784 : /*
785 : * Yarr::parse():
786 : *
787 : * The parse method is passed a pattern to be parsed and a delegate upon which
788 : * callbacks will be made to record the parsed tokens forming the regex.
789 : * Yarr::parse() returns the ErrorCode produced by the parser: NoError on success,
790 : * or the code identifying the first parse error encountered.
791 : *
792 : * The Delegate must implement the following interface:
793 : *
794 : * void assertionBOL();
795 : * void assertionEOL();
796 : * void assertionWordBoundary(bool invert);
797 : *
798 : * void atomPatternCharacter(UChar ch);
799 : * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
800 : * void atomCharacterClassBegin(bool invert)
801 : * void atomCharacterClassAtom(UChar ch)
802 : * void atomCharacterClassRange(UChar begin, UChar end)
803 : * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
804 : * void atomCharacterClassEnd()
805 : * void atomParenthesesSubpatternBegin(bool capture = true);
806 : * void atomParentheticalAssertionBegin(bool invert = false);
807 : * void atomParenthesesEnd();
808 : * void atomBackReference(unsigned subpatternId);
809 : *
810 : * void quantifyAtom(unsigned min, unsigned max, bool greedy);
811 : *
812 : * void disjunction();
813 : *
814 : * The regular expression is described by a sequence of assertion*() and atom*()
815 : * callbacks to the delegate, describing the terms in the regular expression.
816 : * Following an atom a quantifyAtom() call may occur to indicate that the previous
817 : * atom should be quantified. In the case of atoms described across multiple
818 : * calls (parentheses and character classes) the call to quantifyAtom() will come
819 : * after the call to the atom*End() method, never after atom*Begin().
820 : *
821 : * Character classes may either be described by a single call to
822 : * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
823 : * In the latter case, ...Begin() will be called, followed by a sequence of
824 : * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
825 : *
826 : * Sequences of atoms and assertions are broken into alternatives via calls to
827 : * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
828 : * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
829 : * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
830 : * capturing subpattern, this will be the subpatternId associated with these
831 : * parentheses, and will also by definition be the lowest subpatternId of these
832 : * parentheses and of any nested parentheses. The atomParenthesesEnd() method
833 : * is passed the subpatternId of the last capturing subexpression nested within
834 : * these parentheses. In the case of a capturing subpattern with no nested
835 : * capturing subpatterns, the same subpatternId will be passed to the begin and
836 : * end functions. In the case of non-capturing subpatterns the subpatternId
837 : * passed to the begin method is also the first possible subpatternId that might
838 : * be nested within these parentheses. If a set of non-capturing parentheses does
839 : * not contain any capturing subpatterns, then the subpatternId passed to begin
840 : * will be greater than the subpatternId passed to end.
841 : */
842 :
843 : template<class Delegate>
844 98052 : ErrorCode parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite)
845 : {
846 98052 : return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
847 : }
848 :
849 : } } // namespace JSC::Yarr
850 :
851 : #endif // YarrParser_h
|