LCOV - code coverage report
Current view: directory - js/src/yarr - YarrParser.h (source / functions) Found Hit Coverage
Test: app.info Lines: 323 286 88.5 %
Date: 2012-06-02 Functions: 58 56 96.6 %

       1                 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
       2                 :  * vim: set ts=8 sw=4 et tw=99 ft=cpp:
       3                 :  *
       4                 :  * ***** BEGIN LICENSE BLOCK *****
       5                 :  * Copyright (C) 2009 Apple Inc. All rights reserved.
       6                 :  *
       7                 :  * Redistribution and use in source and binary forms, with or without
       8                 :  * modification, are permitted provided that the following conditions
       9                 :  * are met:
      10                 :  * 1. Redistributions of source code must retain the above copyright
      11                 :  *    notice, this list of conditions and the following disclaimer.
      12                 :  * 2. Redistributions in binary form must reproduce the above copyright
      13                 :  *    notice, this list of conditions and the following disclaimer in the
      14                 :  *    documentation and/or other materials provided with the distribution.
      15                 :  *
      16                 :  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
      17                 :  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
      18                 :  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
      19                 :  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
      20                 :  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
      21                 :  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
      22                 :  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
      23                 :  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
      24                 :  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
      25                 :  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
      26                 :  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
      27                 :  *
      28                 :  * ***** END LICENSE BLOCK ***** */
      29                 : 
      30                 : #ifndef YarrParser_h
      31                 : #define YarrParser_h
      32                 : 
      33                 : #include "Yarr.h"
      34                 : 
      35                 : namespace JSC { namespace Yarr {
      36                 : 
      37                 : #define REGEXP_ERROR_PREFIX "Invalid regular expression: "
      38                 : 
      39                 : enum BuiltInCharacterClassID {
      40                 :     DigitClassID,
      41                 :     SpaceClassID,
      42                 :     WordClassID,
      43                 :     NewlineClassID
      44                 : };
      45                 : 
      46                 : // The Parser class should not be used directly - only via the Yarr::parse() method.
      47                 : template<class Delegate>
      48                 : class Parser {
      49                 : private:
      50                 :     template<class FriendDelegate>
      51                 :     friend ErrorCode parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
      52                 : 
      53                 :     /*
      54                 :      * CharacterClassParserDelegate:
      55                 :      *
      56                 :      * The class CharacterClassParserDelegate is used in the parsing of character
      57                 :      * classes.  This class handles detection of character ranges.  This class
      58                 :      * implements enough of the delegate interface such that it can be passed to
      59                 :      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
      60                 :      * to perform the parsing of escape characters in character sets.
      61                 :      */
      62                 :     class CharacterClassParserDelegate {
      63                 :     public:
      64           48926 :         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
      65                 :             : m_delegate(delegate)
      66                 :             , m_err(err)
      67                 :             , m_state(Empty)
      68           48926 :             , m_character(0)
      69                 :         {
      70           48926 :         }
      71                 : 
      72                 :         /*
      73                 :          * begin():
      74                 :          *
      75                 :          * Called at beginning of construction.
      76                 :          */
      77           48926 :         void begin(bool invert)
      78                 :         {
      79           48926 :             m_delegate.atomCharacterClassBegin(invert);
      80           48926 :         }
      81                 : 
      82                 :         /*
      83                 :          * atomPatternCharacter():
      84                 :          *
      85                 :          * This method is called either from parseCharacterClass() (for an unescaped
      86                 :          * character in a character class), or from parseEscape(). In the former case
      87                 :          * the value true will be passed for the argument 'hyphenIsRange', and in this
      88                 :          * mode we will allow a hyphen to be treated as indicating a range (i.e. /[a-z]/
      89                 :          * is different to /[a\-z]/).
      90                 :          */
      91          267603 :         void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
      92                 :         {
      93          267603 :             switch (m_state) {
      94                 :             case AfterCharacterClass:
      95                 :                 // Following a builtin character class we need to look out for a hyphen.
      96                 :                 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
      97                 :                 // If we see a hyphen following a character class then unlike usual
      98                 :                 // we'll report it to the delegate immediately, and put ourselves into
      99                 :                 // a poisoned state. Any following calls to add another character or
     100                 :                 // character class will result in an error. (A hyphen following a
     101                 :                 // character-class is itself valid, but only at the end of a regex).
     102            1193 :                 if (hyphenIsRange && ch == '-') {
     103             162 :                     m_delegate.atomCharacterClassAtom('-');
     104             162 :                     m_state = AfterCharacterClassHyphen;
     105             162 :                     return;
     106                 :                 }
     107                 :                 // Otherwise just fall through - cached character so treat this as Empty.
     108                 : 
     109                 :             case Empty:
     110           89842 :                 m_character = ch;
     111           89842 :                 m_state = CachedCharacter;
     112           89842 :                 return;
     113                 : 
     114                 :             case CachedCharacter:
     115          110560 :                 if (hyphenIsRange && ch == '-')
     116           67211 :                     m_state = CachedCharacterHyphen;
     117                 :                 else {
     118           43349 :                     m_delegate.atomCharacterClassAtom(m_character);
     119           43349 :                     m_character = ch;
     120                 :                 }
     121          110560 :                 return;
     122                 : 
     123                 :             case CachedCharacterHyphen:
     124           66994 :                 if (ch < m_character) {
     125               9 :                     m_err = CharacterClassOutOfOrder;
     126               9 :                     return;
     127                 :                 }
     128           66985 :                 m_delegate.atomCharacterClassRange(m_character, ch);
     129           66985 :                 m_state = Empty;
     130           66985 :                 return;
     131                 : 
     132                 :             case AfterCharacterClassHyphen:
     133              45 :                 m_delegate.atomCharacterClassAtom(ch);
     134              45 :                 m_state = Empty;
     135              45 :                 return;
     136                 :             }
     137                 :         }
     138                 : 
     139                 :         /*
     140                 :          * atomBuiltInCharacterClass():
     141                 :          *
     142                 :          * Adds a built-in character class, called by parseEscape().
     143                 :          */
     144            1686 :         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
     145                 :         {
     146            1686 :             switch (m_state) {
     147                 :             case CachedCharacter:
     148                 :                 // Flush the currently cached character, then fall through.
     149             239 :                 m_delegate.atomCharacterClassAtom(m_character);
     150                 : 
     151                 :             case Empty:
     152                 :             case AfterCharacterClass:
     153            1614 :                 m_state = AfterCharacterClass;
     154            1614 :                 m_delegate.atomCharacterClassBuiltIn(classID, invert);
     155            1614 :                 return;
     156                 : 
     157                 :             case CachedCharacterHyphen:
     158                 :                 // Error! We have a range that looks like [x-\d]. We require
     159                 :                 // the end of the range to be a single character.
     160              45 :                 m_err = CharacterClassInvalidRange;
     161              45 :                 return;
     162                 : 
     163                 :             case AfterCharacterClassHyphen:
     164              27 :                 m_delegate.atomCharacterClassBuiltIn(classID, invert);
     165              27 :                 m_state = Empty;
     166              27 :                 return;
     167                 :             }
     168                 :         }
     169                 : 
     170                 :         /*
     171                 :          * end():
     172                 :          *
     173                 :          * Called at end of construction.
     174                 :          */
     175           48870 :         void end()
     176                 :         {
     177           48870 :             if (m_state == CachedCharacter)
     178           22392 :                 m_delegate.atomCharacterClassAtom(m_character);
     179           26478 :             else if (m_state == CachedCharacterHyphen) {
     180             172 :                 m_delegate.atomCharacterClassAtom(m_character);
     181             172 :                 m_delegate.atomCharacterClassAtom('-');
     182                 :             }
     183           48870 :             m_delegate.atomCharacterClassEnd();
     184           48870 :         }
     185                 : 
     186                 :         // parseEscape() should never call these delegate methods when
     187                 :         // invoked with inCharacterClass set.
     188                 :         void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
     189                 :         void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
     190                 : 
     191                 :     private:
     192                 :         Delegate& m_delegate;
     193                 :         ErrorCode& m_err;
     194                 :         enum CharacterClassConstructionState {
     195                 :             Empty,
     196                 :             CachedCharacter,
     197                 :             CachedCharacterHyphen,
     198                 :             AfterCharacterClass,
     199                 :             AfterCharacterClassHyphen
     200                 :         } m_state;
     201                 :         UChar m_character;
     202                 :     };
     203                 : 
     204           98052 :     Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
     205                 :         : m_delegate(delegate)
     206                 :         , m_backReferenceLimit(backReferenceLimit)
     207                 :         , m_err(NoError)
     208                 :         , m_data(pattern.chars())
     209                 :         , m_size(pattern.length())
     210                 :         , m_index(0)
     211           98052 :         , m_parenthesesNestingDepth(0)
     212                 :     {
     213           98052 :     }
     214                 :     
     215                 :     /*
     216                 :      * parseEscape():
     217                 :      *
     218                 :      * Helper for parseTokens() AND parseCharacterClass().
     219                 :      * Unlike the other parser methods, this function does not report tokens
     220                 :      * directly to the member delegate (m_delegate), instead tokens are
     221                 :      * emitted to the delegate provided as an argument.  In the case of atom
     222                 :      * escapes, parseTokens() will call parseEscape() passing m_delegate as
     223                 :      * an argument, and as such the escape will be reported to the delegate.
     224                 :      *
     225                 :      * However this method may also be used by parseCharacterClass(), in which
     226                 :      * case a CharacterClassParserDelegate will be passed as the delegate that
     227                 :      * tokens should be added to.  A boolean flag is also provided to indicate
     228                 :      * whether an escape in a CharacterClass is being parsed (some parsing
     229                 :      * rules change in this context).
     230                 :      *
     231                 :      * The boolean value returned by this method indicates whether the token
     232                 :      * parsed was an atom (outside of a character class \b and \B will be
     233                 :      * interpreted as assertions).
     234                 :      */
     235                 :     template<bool inCharacterClass, class EscapeDelegate>
     236          107498 :     bool parseEscape(EscapeDelegate& delegate)
     237                 :     {
     238          107498 :         ASSERT(!m_err);
     239          107498 :         ASSERT(peek() == '\\');
     240          107498 :         consume();
     241                 : 
     242          107498 :         if (atEndOfPattern()) {
     243              18 :             m_err = EscapeUnterminated;
     244              18 :             return false;
     245                 :         }
     246                 : 
     247          107480 :         switch (peek()) {
     248                 :         // Assertions
     249                 :         case 'b':
     250            1026 :             consume();
     251                 :             if (inCharacterClass)
     252              54 :                 delegate.atomPatternCharacter('\b');
     253                 :             else {
     254             972 :                 delegate.assertionWordBoundary(false);
     255             972 :                 return false;
     256                 :             }
     257              54 :             break;
     258                 :         case 'B':
     259              54 :             consume();
     260                 :             if (inCharacterClass)
     261              54 :                 delegate.atomPatternCharacter('B');
     262                 :             else {
     263               0 :                 delegate.assertionWordBoundary(true);
     264               0 :                 return false;
     265                 :             }
     266              54 :             break;
     267                 : 
     268                 :         // CharacterClassEscape
     269                 :         case 'd':
     270           29741 :             consume();
     271           29741 :             delegate.atomBuiltInCharacterClass(DigitClassID, false);
     272           29741 :             break;
     273                 :         case 's':
     274            2755 :             consume();
     275            2755 :             delegate.atomBuiltInCharacterClass(SpaceClassID, false);
     276            2755 :             break;
     277                 :         case 'w':
     278            2251 :             consume();
     279            2251 :             delegate.atomBuiltInCharacterClass(WordClassID, false);
     280            2251 :             break;
     281                 :         case 'D':
     282              44 :             consume();
     283              44 :             delegate.atomBuiltInCharacterClass(DigitClassID, true);
     284              44 :             break;
     285                 :         case 'S':
     286             146 :             consume();
     287             146 :             delegate.atomBuiltInCharacterClass(SpaceClassID, true);
     288             146 :             break;
     289                 :         case 'W':
     290              72 :             consume();
     291              72 :             delegate.atomBuiltInCharacterClass(WordClassID, true);
     292              72 :             break;
     293                 : 
     294                 :         // DecimalEscape
     295                 :         case '1':
     296                 :         case '2':
     297                 :         case '3':
     298                 :         case '4':
     299                 :         case '5':
     300                 :         case '6':
     301                 :         case '7':
     302                 :         case '8':
     303                 :         case '9': {
     304                 :             // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
     305                 :             // First, try to parse this as a backreference.
     306                 :             if (!inCharacterClass) {
     307              36 :                 ParseState state = saveState();
     308                 : 
     309                 :                 unsigned backReference;
     310              36 :                 if (!consumeNumber(backReference))
     311               0 :                     break;
     312              36 :                 if (backReference <= m_backReferenceLimit) {
     313              36 :                     delegate.atomBackReference(backReference);
     314              36 :                     break;
     315                 :                 }
     316                 : 
     317               0 :                 restoreState(state);
     318                 :             }
     319                 :             
     320                 :             // Not a backreference, and not octal.
     321               0 :             if (peek() >= '8') {
     322               0 :                 delegate.atomPatternCharacter('\\');
     323               0 :                 break;
     324                 :             }
     325                 : 
     326                 :             // Fall-through to handle this as an octal escape.
     327                 :         }
     328                 : 
     329                 :         // Octal escape
     330                 :         case '0':
     331              22 :             delegate.atomPatternCharacter(consumeOctal());
     332              22 :             break;
     333                 : 
     334                 :         // ControlEscape
     335                 :         case 'f':
     336              18 :             consume();
     337              18 :             delegate.atomPatternCharacter('\f');
     338              18 :             break;
     339                 :         case 'n':
     340            5050 :             consume();
     341            5050 :             delegate.atomPatternCharacter('\n');
     342            5050 :             break;
     343                 :         case 'r':
     344            3484 :             consume();
     345            3484 :             delegate.atomPatternCharacter('\r');
     346            3484 :             break;
     347                 :         case 't':
     348            6105 :             consume();
     349            6105 :             delegate.atomPatternCharacter('\t');
     350            6105 :             break;
     351                 :         case 'v':
     352              18 :             consume();
     353              18 :             delegate.atomPatternCharacter('\v');
     354              18 :             break;
     355                 : 
     356                 :         // ControlLetter
     357                 :         case 'c': {
     358               0 :             ParseState state = saveState();
     359               0 :             consume();
     360               0 :             if (!atEndOfPattern()) {
     361               0 :                 int control = consume();
     362                 : 
     363                 :                 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
     364               0 :                 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
     365               0 :                     delegate.atomPatternCharacter(control & 0x1f);
     366               0 :                     break;
     367                 :                 }
     368                 :             }
     369               0 :             restoreState(state);
     370               0 :             delegate.atomPatternCharacter('\\');
     371               0 :             break;
     372                 :         }
     373                 : 
     374                 :         // HexEscape
     375                 :         case 'x': {
     376             455 :             consume();
     377             455 :             int x = tryConsumeHex(2);
     378             455 :             if (x == -1)
     379               0 :                 delegate.atomPatternCharacter('x');
     380                 :             else
     381             455 :                 delegate.atomPatternCharacter(x);
     382             455 :             break;
     383                 :         }
     384                 : 
     385                 :         // UnicodeEscape
     386                 :         case 'u': {
     387            4084 :             consume();
     388            4084 :             int u = tryConsumeHex(4);
     389            4084 :             if (u == -1)
     390               0 :                 delegate.atomPatternCharacter('u');
     391                 :             else
     392            4084 :                 delegate.atomPatternCharacter(u);
     393            4084 :             break;
     394                 :         }
     395                 : 
     396                 :         // IdentityEscape
     397                 :         default:
     398           52119 :             delegate.atomPatternCharacter(consume());
     399                 :         }
     400                 :         
     401          106508 :         return true;
     402                 :     }
     403                 : 
     404                 :     /*
     405                 :      * parseAtomEscape(), parseCharacterClassEscape():
     406                 :      *
     407                 :      * These methods alias to parseEscape().
     408                 :      */
     409           87330 :     bool parseAtomEscape()
     410                 :     {
     411           87330 :         return parseEscape<false>(m_delegate);
     412                 :     }
     413           20168 :     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
     414                 :     {
     415           20168 :         parseEscape<true>(delegate);
     416           20168 :     }
     417                 : 
     418                 :     /*
     419                 :      * parseCharacterClass():
     420                 :      *
     421                 :      * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape)
     422                 :      * to an instance of CharacterClassParserDelegate, to describe the character class to the
     423                 :      * delegate.
     424                 :      */
     425           48926 :     void parseCharacterClass()
     426                 :     {
     427           48926 :         ASSERT(!m_err);
     428           48926 :         ASSERT(peek() == '[');
     429           48926 :         consume();
     430                 : 
     431           48926 :         CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
     432                 : 
     433           48926 :         characterClassConstructor.begin(tryConsume('^'));
     434                 : 
     435           48926 :         while (!atEndOfPattern()) {
     436          318159 :             switch (peek()) {
     437                 :             case ']':
     438           48870 :                 consume();
     439           48870 :                 characterClassConstructor.end();
     440           48870 :                 return;
     441                 : 
     442                 :             case '\\':
     443           20168 :                 parseCharacterClassEscape(characterClassConstructor);
     444           20168 :                 break;
     445                 : 
     446                 :             default:
     447          249121 :                 characterClassConstructor.atomPatternCharacter(consume(), true);
     448                 :             }
     449                 : 
     450          269289 :             if (m_err)
     451              54 :                 return;
     452                 :         }
     453                 : 
     454               2 :         m_err = CharacterClassUnmatched;
     455                 :     }
     456                 : 
     457                 :     /*
     458                 :      * parseParenthesesBegin():
     459                 :      *
     460                 :      * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
     461                 :      */
     462           62028 :     void parseParenthesesBegin()
     463                 :     {
     464           62028 :         ASSERT(!m_err);
     465           62028 :         ASSERT(peek() == '(');
     466           62028 :         consume();
     467                 : 
     468           62028 :         if (tryConsume('?')) {
     469           22067 :             if (atEndOfPattern()) {
     470               0 :                 m_err = ParenthesesTypeInvalid;
     471               0 :                 return;
     472                 :             }
     473                 : 
     474           22067 :             switch (consume()) {
     475                 :             case ':':
     476           21923 :                 m_delegate.atomParenthesesSubpatternBegin(false);
     477           21923 :                 break;
     478                 :             
     479                 :             case '=':
     480              72 :                 m_delegate.atomParentheticalAssertionBegin();
     481              72 :                 break;
     482                 : 
     483                 :             case '!':
     484              72 :                 m_delegate.atomParentheticalAssertionBegin(true);
     485              72 :                 break;
     486                 :             
     487                 :             default:
     488               0 :                 m_err = ParenthesesTypeInvalid;
     489                 :             }
     490                 :         } else
     491           39961 :             m_delegate.atomParenthesesSubpatternBegin();
     492                 : 
     493           62028 :         ++m_parenthesesNestingDepth;
     494                 :     }
     495                 : 
     496                 :     /*
     497                 :      * parseParenthesesEnd():
     498                 :      *
     499                 :      * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
     500                 :      */
     501           62028 :     void parseParenthesesEnd()
     502                 :     {
     503           62028 :         ASSERT(!m_err);
     504           62028 :         ASSERT(peek() == ')');
     505           62028 :         consume();
     506                 : 
     507           62028 :         if (m_parenthesesNestingDepth > 0)
     508           62028 :             m_delegate.atomParenthesesEnd();
     509                 :         else
     510               0 :             m_err = ParenthesesUnmatched;
     511                 : 
     512           62028 :         --m_parenthesesNestingDepth;
     513           62028 :     }
     514                 : 
     515                 :     /*
     516                 :      * parseQuantifier():
     517                 :      *
     518                 :      * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
     519                 :      */
     520           96382 :     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
     521                 :     {
     522           96382 :         ASSERT(!m_err);
     523           96382 :         ASSERT(min <= max);
     524                 : 
     525           96382 :         if (min == unsigned(-1)) {
     526               9 :             m_err = QuantifierTooLarge;
     527               9 :             return;
     528                 :         }
     529                 : 
     530           96373 :         if (lastTokenWasAnAtom)
     531           96364 :             m_delegate.quantifyAtom(min, max, !tryConsume('?'));
     532                 :         else
     533               9 :             m_err = QuantifierWithoutAtom;
     534                 :     }
     535                 : 
     536                 :     /*
     537                 :      * parseTokens():
     538                 :      *
     539                 :      * This method loops over the input pattern reporting tokens to the delegate.
     540                 :      * The method returns when a parse error is detected, or the end of the pattern
     541                 :      * is reached.  One piece of state is tracked around the loop, which is whether
     542                 :      * the last token passed to the delegate was an atom (this is necessary to detect
     543                 :      * a parse error when a quantifier is provided without an atom to quantify).
     544                 :      */
     545           98052 :     void parseTokens()
     546                 :     {
     547           98052 :         bool lastTokenWasAnAtom = false; // true only while the most recently reported token may be quantified
     548                 : 
     549         1021475 :         while (!atEndOfPattern()) {
     550          825463 :             switch (peek()) {
     551                 :             case '|':
     552           13041 :                 consume();
     553           13041 :                 m_delegate.disjunction();
     554           13041 :                 lastTokenWasAnAtom = false; // a new alternative starts with nothing to quantify
     555           13041 :                 break;
     556                 : 
     557                 :             case '(':
     558           62028 :                 parseParenthesesBegin();
     559           62028 :                 lastTokenWasAnAtom = false; // a quantifier directly after an opening parenthesis has no atom
     560           62028 :                 break;
     561                 : 
     562                 :             case ')':
     563           62028 :                 parseParenthesesEnd();
     564           62028 :                 lastTokenWasAnAtom = true; // a closed group may be quantified as a whole
     565           62028 :                 break;
     566                 : 
     567                 :             case '^':
     568           28371 :                 consume();
     569           28371 :                 m_delegate.assertionBOL();
     570           28371 :                 lastTokenWasAnAtom = false; // assertions are not treated as atoms
     571           28371 :                 break;
     572                 : 
     573                 :             case '$':
     574           27881 :                 consume();
     575           27881 :                 m_delegate.assertionEOL();
     576           27881 :                 lastTokenWasAnAtom = false; // assertions are not treated as atoms
     577           27881 :                 break;
     578                 : 
     579                 :             case '.':
     580            8795 :                 consume();
     581            8795 :                 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); // the dot is reported as the inverted newline class
     582            8795 :                 lastTokenWasAnAtom = true;
     583            8795 :                 break;
     584                 : 
     585                 :             case '[':
     586           48926 :                 parseCharacterClass();
     587           48926 :                 lastTokenWasAnAtom = true;
     588           48926 :                 break;
     589                 : 
     590                 :             case '\\':
     591           87330 :                 lastTokenWasAnAtom = parseAtomEscape(); // the escape parser reports whether it produced an atom
     592           87330 :                 break;
     593                 : 
     594                 :             case '*':
     595           19497 :                 consume();
     596           19497 :                 parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
     597           19497 :                 lastTokenWasAnAtom = false; // a quantifier cannot itself be quantified
     598           19497 :                 break;
     599                 : 
     600                 :             case '+':
     601           52839 :                 consume();
     602           52839 :                 parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
     603           52839 :                 lastTokenWasAnAtom = false;
     604           52839 :                 break;
     605                 : 
     606                 :             case '?':
     607           14329 :                 consume();
     608           14329 :                 parseQuantifier(lastTokenWasAnAtom, 0, 1);
     609           14329 :                 lastTokenWasAnAtom = false;
     610           14329 :                 break;
     611                 : 
     612                 :             case '{': {
     613            9717 :                 ParseState state = saveState(); // remember the brace position in case this is not a complete quantifier
     614                 : 
     615            9717 :                 consume();
     616            9717 :                 if (peekIsDigit()) {
     617                 :                     unsigned min;
     618            9717 :                     if (!consumeNumber(min))
     619               0 :                         break; // consumeNumber has set m_err; the check after the switch returns
     620            9717 :                     unsigned max = min; // a single number means exactly that many repetitions
     621                 :                     
     622            9717 :                     if (tryConsume(',')) {
     623             874 :                         if (peekIsDigit()) {
     624             302 :                             if (!consumeNumber(max))
     625               0 :                                 break; // consumeNumber has set m_err; the check after the switch returns
     626                 :                         } else {
     627             572 :                             max = quantifyInfinite; // a comma with no upper bound means unbounded
     628                 :                         }
     629                 :                     }
     630                 : 
     631            9717 :                     if (tryConsume('}')) {
     632            9717 :                         if (min <= max)
     633            9717 :                             parseQuantifier(lastTokenWasAnAtom, min, max);
     634                 :                         else
     635               0 :                             m_err = QuantifierOutOfOrder;
     636            9717 :                         lastTokenWasAnAtom = false;
     637            9717 :                         break;
     638                 :                     }
     639                 :                 }
     640                 : 
     641               0 :                 restoreState(state); // rewind to the opening brace so it is re-read below
     642                 :             } // If we did not find a complete quantifier, fall through to the default case, which reports the brace as a pattern character.
     643                 : 
     644                 :             default:
     645          390681 :                 m_delegate.atomPatternCharacter(consume());
     646          390681 :                 lastTokenWasAnAtom = true;
     647                 :             }
     648                 : 
     649          825463 :             if (m_err) // stop at the first error recorded by any case above
     650              92 :                 return;
     651                 :         }
     652                 : 
     653           97960 :         if (m_parenthesesNestingDepth > 0) // input ended while a group was still open
     654               0 :             m_err = MissingParentheses;
     655                 :     }
     656                 : 
     657                 :     /*
     658                 :      * parse():
     659                 :      *
     660                 :      * This method calls parseTokens() to parse over the input and returns the
     661                 :      * resulting ErrorCode (m_err); oversized patterns fail with PatternTooLarge.
     662                 :      */
     663           98052 :     ErrorCode parse()
     664                 :     {
     665           98052 :         if (m_size > MAX_PATTERN_SIZE) // oversized patterns are rejected without being parsed
     666               0 :             m_err = PatternTooLarge;
     667                 :         else
     668           98052 :             parseTokens();
     669           98052 :         ASSERT(atEndOfPattern() || m_err); // either all input was consumed or an error was recorded
     670                 : 
     671           98052 :         return m_err;
     672                 :     }
     673                 : 
     674                 : 
     675                 :     // Misc helper functions:
     676                 : 
     677                 :     typedef unsigned ParseState;
     678                 :     
     679           14292 :     ParseState saveState() // Returns the current read index so that a caller can rewind to it later.
     680                 :     {
     681           14292 :         return m_index;
     682                 :     }
     683                 : 
     684               0 :     void restoreState(ParseState state) // Rewinds the read index to a value obtained earlier from saveState().
     685                 :     {
     686               0 :         m_index = state;
     687               0 :     }
     688                 : 
     689         1749846 :     bool atEndOfPattern() // True once the read index has reached the end of the pattern.
     690                 :     {
     691         1749846 :         ASSERT(m_index <= m_size);
     692         1749846 :         return m_index == m_size;
     693                 :     }
     694                 : 
     695         1588448 :     int peek() // Returns the character at the read index without advancing; must not be called at end of pattern.
     696                 :     {
     697         1588448 :         ASSERT(m_index < m_size);
     698         1588448 :         return m_data[m_index];
     699                 :     }
     700                 : 
     701           36625 :     bool peekIsDigit() // True when input remains and the next character is an ASCII digit; safe to call at end of pattern.
     702                 :     {
     703           36625 :         return !atEndOfPattern() && WTF::isASCIIDigit(peek());
     704                 :     }
     705                 : 
     706            2951 :     unsigned peekDigit() // Returns the numeric value of the next digit character without advancing; a digit must be next.
     707                 :     {
     708            2951 :         ASSERT(peekIsDigit());
     709            2951 :         return peek() - '0';
     710                 :     }
     711                 : 
     712         1303385 :     int consume() // Returns the character at the read index and advances past it; must not be called at end of pattern.
     713                 :     {
     714         1303385 :         ASSERT(m_index < m_size);
     715         1303385 :         return m_data[m_index++];
     716                 :     }
     717                 : 
     718           10077 :     unsigned consumeDigit() // Consumes the next digit character and returns its numeric value; a digit must be next.
     719                 :     {
     720           10077 :         ASSERT(peekIsDigit());
     721           10077 :         return consume() - '0';
     722                 :     }
     723                 : 
     724           10055 :     bool consumeNumber(unsigned &accum) // Reads a run of decimal digits into accum; returns false with m_err set when overflow is detected.
     725                 :     {
     726           10055 :         accum = consumeDigit(); // a digit must be next (consumeDigit asserts it)
     727           23061 :         while (peekIsDigit()) {
     728            2951 :             unsigned newValue = accum * 10 + peekDigit();
     729            2951 :             if (newValue < accum) { /* Overflow check. */ // NOTE(review): a wrapped product can still be >= accum (e.g. accum == 0x20000000 with 32-bit unsigned), so some overflows may go undetected -- confirm
     730               0 :                 m_err = QuantifierTooLarge;
     731               0 :                 return false;
     732                 :             }
     733            2951 :             accum = newValue;
     734            2951 :             consume(); // the digit was only peeked above; advance past it now
     735                 :         }
     736           10055 :         return true;
     737                 :     }
     738                 : 
     739              22 :     unsigned consumeOctal() // Consumes a short run of octal digits and returns the accumulated value; an octal digit must be next.
     740                 :     {
     741              22 :         ASSERT(WTF::isASCIIOctalDigit(peek()));
     742                 : 
     743              22 :         unsigned n = consumeDigit();
     744              44 :         while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) // stopping once n >= 32 keeps the result at most 255 (31 * 8 + 7)
     745               0 :             n = n * 8 + consumeDigit();
     746              22 :         return n;
     747                 :     }
     748                 : 
     749          226752 :     bool tryConsume(UChar ch) // Advances past the next character only if it equals ch; returns whether it did.
     750                 :     {
     751          226752 :         if (atEndOfPattern() || (m_data[m_index] != ch))
     752          189889 :             return false; // nothing is consumed on a mismatch or at end of pattern
     753           36863 :         ++m_index;
     754           36863 :         return true;
     755                 :     }
     756                 : 
     757            4539 :     int tryConsumeHex(int count) // Reads exactly count hex digits and returns their value, or returns -1 with the read position unchanged.
     758                 :     {
     759            4539 :         ParseState state = saveState();
     760                 : 
     761            4539 :         int n = 0;
     762           26324 :         while (count--) {
     763           17246 :             if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
     764               0 :                 restoreState(state); // undo any digits already consumed in this call
     765               0 :                 return -1;
     766                 :             }
     767           17246 :             n = (n << 4) | WTF::toASCIIHexValue(consume()); // shift in four bits per digit
     768                 :         }
     769            4539 :         return n;
     770                 :     }
     771                 : 
     772                 :     Delegate& m_delegate; // receives the assertion, atom, quantifier and disjunction callbacks
     773                 :     unsigned m_backReferenceLimit; // NOTE(review): presumably the backReferenceLimit given to Yarr::parse(); its use is outside this chunk -- confirm
     774                 :     ErrorCode m_err; // error recorded by the parse; tests false while no error has been recorded
     775                 :     const UChar* m_data; // pattern characters, indexed by m_index
     776                 :     unsigned m_size; // number of characters in m_data
     777                 :     unsigned m_index; // current read position; asserted to stay <= m_size
     778                 :     unsigned m_parenthesesNestingDepth; // checked at end of input: a value above zero yields MissingParentheses
     779                 : 
     780                 :     // Derived by empirical testing of compile time in PCRE and WREC.
     781                 :     static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; // parse() rejects patterns whose m_size exceeds this
     782                 : };
     783                 : 
     784                 : /*
     785                 :  * Yarr::parse():
     786                 :  *
     787                 :  * The parse method is passed a pattern to be parsed and a delegate upon which
     788                 :  * callbacks will be made to record the parsed tokens forming the regex.
     789                 :  * Yarr::parse() returns an ErrorCode: a value that tests false when no error was
     790                 :  * recorded, or the code of the first parse error detected.
     791                 :  *
     792                 :  * The Delegate must implement the following interface:
     793                 :  *
     794                 :  *    void assertionBOL();
     795                 :  *    void assertionEOL();
     796                 :  *    void assertionWordBoundary(bool invert);
     797                 :  *
     798                 :  *    void atomPatternCharacter(UChar ch);
     799                 :  *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
     800                 :  *    void atomCharacterClassBegin(bool invert)
     801                 :  *    void atomCharacterClassAtom(UChar ch)
     802                 :  *    void atomCharacterClassRange(UChar begin, UChar end)
     803                 :  *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
     804                 :  *    void atomCharacterClassEnd()
     805                 :  *    void atomParenthesesSubpatternBegin(bool capture = true);
     806                 :  *    void atomParentheticalAssertionBegin(bool invert = false);
     807                 :  *    void atomParenthesesEnd();
     808                 :  *    void atomBackReference(unsigned subpatternId);
     809                 :  *
     810                 :  *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
     811                 :  *
     812                 :  *    void disjunction();
     813                 :  *
     814                 :  * The regular expression is described by a sequence of assertion*() and atom*()
     815                 :  * callbacks to the delegate, describing the terms in the regular expression.
     816                 :  * Following an atom a quantifyAtom() call may occur to indicate that the previous
     817                 :  * atom should be quantified.  In the case of atoms described across multiple
     818                 :  * calls (parentheses and character classes) the call to quantifyAtom() will come
     819                 :  * after the call to the atom*End() method, never after atom*Begin().
     820                 :  *
     821                 :  * Character classes may either be described by a single call to
     822                 :  * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
     823                 :  * In the latter case, ...Begin() will be called, followed by a sequence of
     824                 :  * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
     825                 :  *
     826                 :  * Sequences of atoms and assertions are broken into alternatives via calls to
     827                 :  * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
     828                 :  * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
     829                 :  * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
     830                 :  * capturing subpattern, this will be the subpatternId associated with these
     831                 :  * parentheses, and will also by definition be the lowest subpatternId of these
     832                 :  * parentheses and of any nested parentheses.  The atomParenthesesEnd() method
     833                 :  * is passed the subpatternId of the last capturing subexpression nested within
     834                 :  * these parentheses.  In the case of a capturing subpattern with no nested
     835                 :  * capturing subpatterns, the same subpatternId will be passed to the begin and
     836                 :  * end functions.  In the case of non-capturing subpatterns the subpatternId
     837                 :  * passed to the begin method is also the first possible subpatternId that might
     838                 :  * be nested within these parentheses.  If a set of non-capturing parentheses does
     839                 :  * not contain any capturing subpatterns, then the subpatternId passed to begin
     840                 :  * will be greater than the subpatternId passed to end.
     841                 :  */
     842                 : 
     843                 : template<class Delegate>
     844           98052 : ErrorCode parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite)
     845                 : {
     846           98052 :     return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse(); // parse with a temporary Parser and return its error code
     847                 : }
     848                 : 
     849                 : } } // namespace JSC::Yarr
     850                 : 
     851                 : #endif // YarrParser_h

Generated by: LCOV version 1.7