LCOV - code coverage report
Current view: directory - js/src/yarr - YarrPattern.cpp (source / functions) Found Hit Coverage
Test: app.info Lines: 395 339 85.8 %
Date: 2012-06-02 Functions: 39 36 92.3 %

       1                 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
       2                 :  * vim: set ts=8 sw=4 et tw=99 ft=cpp:
       3                 :  *
       4                 :  * ***** BEGIN LICENSE BLOCK *****
       5                 :  * Copyright (C) 2009 Apple Inc. All rights reserved.
       6                 :  * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
       7                 :  *
       8                 :  * Redistribution and use in source and binary forms, with or without
       9                 :  * modification, are permitted provided that the following conditions
      10                 :  * are met:
      11                 :  * 1. Redistributions of source code must retain the above copyright
      12                 :  *    notice, this list of conditions and the following disclaimer.
      13                 :  * 2. Redistributions in binary form must reproduce the above copyright
      14                 :  *    notice, this list of conditions and the following disclaimer in the
      15                 :  *    documentation and/or other materials provided with the distribution.
      16                 :  *
      17                 :  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
      18                 :  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
      19                 :  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
      20                 :  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
      21                 :  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
      22                 :  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
      23                 :  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
      24                 :  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
      25                 :  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
      26                 :  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
      27                 :  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
      28                 :  *
      29                 :  * ***** END LICENSE BLOCK ***** */
      30                 : 
      31                 : #include "YarrPattern.h"
      32                 : 
      33                 : #include "Yarr.h"
      34                 : #include "YarrParser.h"
      35                 : 
      36                 : using namespace WTF;
      37                 : 
      38                 : namespace JSC { namespace Yarr {
      39                 : 
      40                 : #include "RegExpJitTables.h"
      41                 : 
      42                 : #if WTF_CPU_SPARC
      43                 : #define BASE_FRAME_SIZE 24
      44                 : #else
      45                 : #define BASE_FRAME_SIZE 0
      46                 : #endif
      47                 : 
      48           58244 : class CharacterClassConstructor {
      49                 : public:
      50           58244 :     CharacterClassConstructor(bool isCaseInsensitive = false)
      51           58244 :         : m_isCaseInsensitive(isCaseInsensitive)
      52                 :     {
      53           58244 :     }
      54                 :     
      55           30537 :     void reset()
      56                 :     {
      57           30537 :         m_matches.clear();
      58           30537 :         m_ranges.clear();
      59           30537 :         m_matchesUnicode.clear();
      60           30537 :         m_rangesUnicode.clear();
      61           30537 :     }
      62                 : 
      63             759 :     void append(const CharacterClass* other)
      64                 :     {
      65            1481 :         for (size_t i = 0; i < other->m_matches.size(); ++i)
      66             722 :             addSorted(m_matches, other->m_matches[i]);
      67            2791 :         for (size_t i = 0; i < other->m_ranges.size(); ++i)
      68            2032 :             addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
      69            1551 :         for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
      70             792 :             addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
      71             948 :         for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
      72             189 :             addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
      73             759 :     }
      74                 : 
      75           35924 :     void putChar(UChar ch)
      76                 :     {
      77           35924 :         if (ch <= 0x7f) {
      78           35897 :             if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
      79            3578 :                 addSorted(m_matches, toASCIIUpper(ch));
      80            3578 :                 addSorted(m_matches, toASCIILower(ch));
      81                 :             } else
      82           32319 :                 addSorted(m_matches, ch);
      83                 :         } else {
      84                 :             UChar upper, lower;
      85              27 :             if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
      86               0 :                 addSorted(m_matchesUnicode, upper);
      87               0 :                 addSorted(m_matchesUnicode, lower);
      88                 :             } else
      89              27 :                 addSorted(m_matchesUnicode, ch);
      90                 :         }
      91           35924 :     }
      92                 : 
      93                 :     // returns true if this character has another case, and 'ch' is the upper case form.
      94               0 :     static inline bool isUnicodeUpper(UChar ch)
      95                 :     {
      96               0 :         return ch != Unicode::toLower(ch);
      97                 :     }
      98                 : 
      99                 :     // returns true if this character has another case, and 'ch' is the lower case form.
     100               0 :     static inline bool isUnicodeLower(UChar ch)
     101                 :     {
     102               0 :         return ch != Unicode::toUpper(ch);
     103                 :     }
     104                 : 
     105           41723 :     void putRange(UChar lo, UChar hi)
     106                 :     {
     107           41723 :         if (lo <= 0x7f) {
     108           41678 :             char asciiLo = lo;
     109           41678 :             char asciiHi = std::min(hi, (UChar)0x7f);
     110           41678 :             addSortedRange(m_ranges, lo, asciiHi);
     111                 :             
     112           41678 :             if (m_isCaseInsensitive) {
     113           38007 :                 if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
     114              12 :                     addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
     115           38007 :                 if ((asciiLo <= 'z') && (asciiHi >= 'a'))
     116           20441 :                     addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
     117                 :             }
     118                 :         }
     119           41723 :         if (hi >= 0x80) {
     120              48 :             uint32_t unicodeCurr = std::max(lo, (UChar)0x80);
     121              48 :             addSortedRange(m_rangesUnicode, unicodeCurr, hi);
     122                 :             
     123              48 :             if (m_isCaseInsensitive) {
     124               0 :                 while (unicodeCurr <= hi) {
     125                 :                     // If the upper bound of the range (hi) is 0xffff, the increments to
     126                 :                     // unicodeCurr in this loop may take it to 0x10000.  This is fine
     127                 :                     // (if so we won't re-enter the loop, since the loop condition above
     128                 :                     // will definitely fail) - but this does mean we cannot use a UChar
     129                 :                     // to represent unicodeCurr, we must use a 32-bit value instead.
     130               0 :                     ASSERT(unicodeCurr <= 0xffff);
     131                 : 
     132               0 :                     if (isUnicodeUpper(unicodeCurr)) {
     133               0 :                         UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
     134               0 :                         UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
     135               0 :                         while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
     136               0 :                             lowerCaseRangeEnd++;
     137               0 :                         addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
     138               0 :                     } else if (isUnicodeLower(unicodeCurr)) {
     139               0 :                         UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
     140               0 :                         UChar upperCaseRangeEnd = upperCaseRangeBegin;
     141               0 :                         while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
     142               0 :                             upperCaseRangeEnd++;
     143               0 :                         addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
     144                 :                     } else
     145               0 :                         ++unicodeCurr;
     146                 :                 }
     147                 :             }
     148                 :         }
     149           41723 :     }
     150                 : 
     151           30537 :     CharacterClass* charClass()
     152                 :     {
     153           30537 :         CharacterClass* characterClass = js::OffTheBooks::new_<CharacterClass>(PassRefPtr<CharacterClassTable>(0));
     154                 : 
     155           30537 :         characterClass->m_matches.append(m_matches);
     156           30537 :         characterClass->m_ranges.append(m_ranges);
     157           30537 :         characterClass->m_matchesUnicode.append(m_matchesUnicode);
     158           30537 :         characterClass->m_rangesUnicode.append(m_rangesUnicode);
     159                 : 
     160           30537 :         reset();
     161                 : 
     162           30537 :         return characterClass;
     163                 :     }
     164                 : 
     165                 : private:
     166           41016 :     void addSorted(Vector<UChar>& matches, UChar ch)
     167                 :     {
     168           41016 :         unsigned pos = 0;
     169           41016 :         unsigned range = matches.size();
     170                 : 
     171                 :         // binary chop, find position to insert char.
     172          153396 :         while (range) {
     173           71472 :             unsigned index = range >> 1;
     174                 : 
     175           71472 :             int val = matches[pos+index] - ch;
     176           71472 :             if (!val)
     177             108 :                 return;
     178           71364 :             else if (val > 0)
     179           30215 :                 range = index;
     180                 :             else {
     181           41149 :                 pos += (index+1);
     182           41149 :                 range -= (index+1);
     183                 :             }
     184                 :         }
     185                 :         
     186           40908 :         if (pos == matches.size())
     187           28236 :             matches.append(ch);
     188                 :         else
     189           12672 :             matches.insert(pos, ch);
     190                 :     }
     191                 : 
     192           64400 :     void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
     193                 :     {
     194           64400 :         unsigned end = ranges.size();
     195                 :         
     196                 :         // Simple linear scan - I doubt there are that many ranges anyway...
     197                 :         // feel free to fix this with something faster (eg binary chop).
     198           71196 :         for (unsigned i = 0; i < end; ++i) {
     199                 :             // does the new range fall before the current position in the array
     200           44125 :             if (hi < ranges[i].begin) {
     201                 :                 // optional optimization: concatenate appending ranges? - may not be worthwhile.
     202           37274 :                 if (hi == (ranges[i].begin - 1)) {
     203              27 :                     ranges[i].begin = lo;
     204              27 :                     return;
     205                 :                 }
     206           37247 :                 ranges.insert(i, CharacterRange(lo, hi));
     207           37247 :                 return;
     208                 :             }
     209                 :             // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
     210                 :             // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
     211                 :             // end of the last range they concatenate, which is just as good.
     212            6851 :             if (lo <= (ranges[i].end + 1)) {
     213                 :                 // found an intersect! we'll replace this entry in the array.
     214              55 :                 ranges[i].begin = std::min(ranges[i].begin, lo);
     215              55 :                 ranges[i].end = std::max(ranges[i].end, hi);
     216                 : 
     217                 :                 // now check if the new range can subsume any subsequent ranges.
     218              55 :                 unsigned next = i+1;
     219                 :                 // each iteration of the loop we will either remove something from the list, or break the loop.
     220             119 :                 while (next < ranges.size()) {
     221              37 :                     if (ranges[next].begin <= (ranges[i].end + 1)) {
     222                 :                         // the next entry now overlaps / concatenates this one.
     223               9 :                         ranges[i].end = std::max(ranges[i].end, ranges[next].end);
     224               9 :                         ranges.remove(next);
     225                 :                     } else
     226              28 :                         break;
     227                 :                 }
     228                 :                 
     229              55 :                 return;
     230                 :             }
     231                 :         }
     232                 : 
     233                 :         // CharacterRange comes after all existing ranges.
     234           27071 :         ranges.append(CharacterRange(lo, hi));
     235                 :     }
     236                 : 
     237                 :     bool m_isCaseInsensitive;
     238                 : 
     239                 :     Vector<UChar> m_matches;
     240                 :     Vector<CharacterRange> m_ranges;
     241                 :     Vector<UChar> m_matchesUnicode;
     242                 :     Vector<CharacterRange> m_rangesUnicode;
     243                 : };
     244                 : 
     245                 : class YarrPatternConstructor {
     246                 : public:
     247           58244 :     YarrPatternConstructor(YarrPattern& pattern)
     248                 :         : m_pattern(pattern)
     249                 :         , m_characterClassConstructor(pattern.m_ignoreCase)
     250           58244 :         , m_invertParentheticalAssertion(false)
     251                 :     {
     252           58244 :         m_pattern.m_body = js::OffTheBooks::new_<PatternDisjunction>();
     253           58244 :         m_alternative = m_pattern.m_body->addNewAlternative();
     254           58244 :         m_pattern.m_disjunctions.append(m_pattern.m_body);
     255           58244 :     }
     256                 : 
     257           58244 :     ~YarrPatternConstructor()
     258           58244 :     {
     259           58244 :     }
     260                 : 
     261               0 :     void reset()
     262                 :     {
     263               0 :         m_pattern.reset();
     264               0 :         m_characterClassConstructor.reset();
     265                 : 
     266               0 :         m_pattern.m_body = js::OffTheBooks::new_<PatternDisjunction>();
     267               0 :         m_alternative = m_pattern.m_body->addNewAlternative();
     268               0 :         m_pattern.m_disjunctions.append(m_pattern.m_body);
     269               0 :     }
     270                 :     
     271           19762 :     void assertionBOL()
     272                 :     {
     273           19762 :         if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) {
     274           19762 :             m_alternative->m_startsWithBOL = true;
     275           19762 :             m_alternative->m_containsBOL = true;
     276           19762 :             m_pattern.m_containsBOL = true;
     277                 :         }
     278           19762 :         m_alternative->m_terms.append(PatternTerm::BOL());
     279           19762 :     }
     280           21477 :     void assertionEOL()
     281                 :     {
     282           21477 :         m_alternative->m_terms.append(PatternTerm::EOL());
     283           21477 :     }
     284             423 :     void assertionWordBoundary(bool invert)
     285                 :     {
     286             423 :         m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
     287             423 :     }
     288                 : 
     289          201564 :     void atomPatternCharacter(UChar ch)
     290                 :     {
     291                 :         // We handle case-insensitive checking of unicode characters which do have both
     292                 :         // cases by handling them as if they were defined using a CharacterClass.
     293          201564 :         if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
     294               0 :             atomCharacterClassBegin();
     295               0 :             atomCharacterClassAtom(ch);
     296               0 :             atomCharacterClassEnd();
     297                 :         } else
     298          201564 :             m_alternative->m_terms.append(PatternTerm(ch));
     299          201564 :     }
     300                 : 
     301           31477 :     void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
     302                 :     {
     303           31477 :         switch (classID) {
     304                 :         case DigitClassID:
     305           25123 :             m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
     306           25123 :             break;
     307                 :         case SpaceClassID:
     308             809 :             m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
     309             809 :             break;
     310                 :         case WordClassID:
     311             372 :             m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
     312             372 :             break;
     313                 :         case NewlineClassID:
     314            5173 :             m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
     315            5173 :             break;
     316                 :         }
     317           31477 :     }
     318                 : 
     319           30537 :     void atomCharacterClassBegin(bool invert = false)
     320                 :     {
     321           30537 :         m_invertCharacterClass = invert;
     322           30537 :     }
     323                 : 
     324           35924 :     void atomCharacterClassAtom(UChar ch)
     325                 :     {
     326           35924 :         m_characterClassConstructor.putChar(ch);
     327           35924 :     }
     328                 : 
     329           41723 :     void atomCharacterClassRange(UChar begin, UChar end)
     330                 :     {
     331           41723 :         m_characterClassConstructor.putRange(begin, end);
     332           41723 :     }
     333                 : 
     334             759 :     void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
     335                 :     {
     336             759 :         ASSERT(classID != NewlineClassID);
     337                 : 
     338             759 :         switch (classID) {
     339                 :         case DigitClassID:
     340              28 :             m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
     341              28 :             break;
     342                 :         
     343                 :         case SpaceClassID:
     344             108 :             m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
     345             108 :             break;
     346                 :         
     347                 :         case WordClassID:
     348             623 :             m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
     349             623 :             break;
     350                 :         
     351                 :         default:
     352               0 :             ASSERT_NOT_REACHED();
     353                 :         }
     354             759 :     }
     355                 : 
     356           30537 :     void atomCharacterClassEnd()
     357                 :     {
     358           30537 :         CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
     359           30537 :         m_pattern.m_userCharacterClasses.append(newCharacterClass);
     360           30537 :         m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
     361           30537 :     }
     362                 : 
     363           49725 :     void atomParenthesesSubpatternBegin(bool capture = true)
     364                 :     {
     365           49725 :         unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
     366           49725 :         if (capture)
     367           31267 :             m_pattern.m_numSubpatterns++;
     368                 : 
     369           49725 :         PatternDisjunction* parenthesesDisjunction = js::OffTheBooks::new_<PatternDisjunction>(m_alternative);
     370           49725 :         m_pattern.m_disjunctions.append(parenthesesDisjunction);
     371           49725 :         m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false));
     372           49725 :         m_alternative = parenthesesDisjunction->addNewAlternative();
     373           49725 :     }
     374                 : 
     375              72 :     void atomParentheticalAssertionBegin(bool invert = false)
     376                 :     {
     377              72 :         PatternDisjunction* parenthesesDisjunction = js::OffTheBooks::new_<PatternDisjunction>(m_alternative);
     378              72 :         m_pattern.m_disjunctions.append(parenthesesDisjunction);
     379              72 :         m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert));
     380              72 :         m_alternative = parenthesesDisjunction->addNewAlternative();
     381              72 :         m_invertParentheticalAssertion = invert;
     382              72 :     }
     383                 : 
     384           49797 :     void atomParenthesesEnd()
     385                 :     {
     386           49797 :         ASSERT(m_alternative->m_parent);
     387           49797 :         ASSERT(m_alternative->m_parent->m_parent);
     388                 : 
     389           49797 :         PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
     390           49797 :         m_alternative = m_alternative->m_parent->m_parent;
     391                 : 
     392           49797 :         PatternTerm& lastTerm = m_alternative->lastTerm();
     393                 : 
     394           49797 :         unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
     395           49797 :         unsigned numBOLAnchoredAlts = 0;
     396                 : 
     397          105757 :         for (unsigned i = 0; i < numParenAlternatives; i++) {
     398                 :             // Bubble up BOL flags
     399           55960 :             if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
     400             126 :                 numBOLAnchoredAlts++;
     401                 :         }
     402                 : 
     403           49797 :         if (numBOLAnchoredAlts) {
     404             126 :             m_alternative->m_containsBOL = true;
     405                 :             // If all the alternatives in parens start with BOL, then so does this one
     406             126 :             if (numBOLAnchoredAlts == numParenAlternatives)
     407               0 :                 m_alternative->m_startsWithBOL = true;
     408                 :         }
     409                 : 
     410           49797 :         lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
     411           49797 :         m_invertParentheticalAssertion = false;
     412           49797 :     }
     413                 : 
     414              18 :     void atomBackReference(unsigned subpatternId)
     415                 :     {
     416              18 :         ASSERT(subpatternId);
     417              18 :         m_pattern.m_containsBackreferences = true;
     418              18 :         m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
     419                 : 
     420              18 :         if (subpatternId > m_pattern.m_numSubpatterns) {
     421               0 :             m_alternative->m_terms.append(PatternTerm::ForwardReference());
     422               0 :             return;
     423                 :         }
     424                 : 
     425              18 :         PatternAlternative* currentAlternative = m_alternative;
     426              18 :         ASSERT(currentAlternative);
     427                 : 
     428                 :         // Note to self: if we waited until the AST was baked, we could also remove forwards refs 
     429              54 :         while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
     430              18 :             PatternTerm& term = currentAlternative->lastTerm();
     431              18 :             ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
     432                 : 
     433              18 :             if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
     434               0 :                 m_alternative->m_terms.append(PatternTerm::ForwardReference());
     435               0 :                 return;
     436                 :             }
     437                 :         }
     438                 : 
     439              18 :         m_alternative->m_terms.append(PatternTerm(subpatternId));
     440                 :     }
     441                 : 
     442                 :     // deep copy the argument disjunction.  If filterStartsWithBOL is true, 
     443                 :     // skip alternatives with m_startsWithBOL set true.
     444           25416 :     PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
     445                 :     {
     446           25416 :         PatternDisjunction* newDisjunction = 0;
     447           51453 :         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
     448           26037 :             PatternAlternative* alternative = disjunction->m_alternatives[alt];
     449           26037 :             if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
     450            6385 :                 if (!newDisjunction) {
     451            5926 :                     newDisjunction = js::OffTheBooks::new_<PatternDisjunction>();
     452            5926 :                     newDisjunction->m_parent = disjunction->m_parent;
     453                 :                 }
     454            6385 :                 PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
     455           22886 :                 for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
     456           16501 :                     newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
     457                 :             }
     458                 :         }
     459                 :         
     460           25416 :         if (newDisjunction)
     461            5926 :             m_pattern.m_disjunctions.append(newDisjunction);
     462           25416 :         return newDisjunction;
     463                 :     }
     464                 :     
     465           57343 :     PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
     466                 :     {
     467           57343 :         if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
     468           51570 :             return PatternTerm(term);
     469                 :         
     470            5773 :         PatternTerm termCopy = term;
     471            5773 :         termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
     472            5773 :         return termCopy;
     473                 :     }
     474                 :     
     475           67672 :     void quantifyAtom(unsigned min, unsigned max, bool greedy) // Applies the quantifier {min,max} (greedy or non-greedy) to the last term of the current alternative.
     476                 :     {
     477           67672 :         ASSERT(min <= max);
     478           67672 :         ASSERT(m_alternative->m_terms.size()); // There must be a term to quantify.
     479                 :         // A maximum of zero means the term can never be matched, so it is simply dropped.
     480           67672 :         if (!max) {
     481               0 :             m_alternative->removeLastTerm();
     482               0 :             return;
     483                 :         }
     484                 : 
     485           67672 :         PatternTerm& term = m_alternative->lastTerm();
     486           67672 :         ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); // NOTE(review): relies on the ordering of the PatternTerm type enum; presumably rules out the simple assertions - confirm.
     487           67672 :         ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); // The term must not have been quantified already.
     488                 : 
     489                 :         // For any assertion with a zero minimum, not matching is valid and has no effect,
     490                 :         // remove it.  Otherwise, we need to match at least once, but there is no point
     491                 :         // matching more than once, so remove the quantifier.  It is not entirely clear
     492                 :         // from the spec whether or not this behavior is correct, but I believe this
     493                 :         // matches Firefox. :-/
     494           67672 :         if (term.type == PatternTerm::TypeParentheticalAssertion) {
     495               0 :             if (!min)
     496               0 :                 m_alternative->removeLastTerm();
     497               0 :             return;
     498                 :         }
     499                 :         // Three shapes: {0,max} -> one variable-count term; {n,n} -> one fixed-count term; {min,max} -> fixed-count term plus a variable-count copy.
     500           67672 :         if (min == 0)
     501           24144 :             term.quantify(max, greedy   ? QuantifierGreedy : QuantifierNonGreedy);
     502           43528 :         else if (min == max)
     503            2686 :             term.quantify(min, QuantifierFixedCount);
     504                 :         else {
     505           40842 :             term.quantify(min, QuantifierFixedCount); // Mandatory part: exactly |min| repetitions.
     506           40842 :             m_alternative->m_terms.append(copyTerm(term)); // Optional part: a copy of the term appended right after it; |term| is not used again after this append.
     507                 :             // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
     508           40842 :             m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); // An unbounded maximum stays unbounded; otherwise the copy covers the remaining max - min repetitions.
     509           40842 :             if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
     510            2784 :                 m_alternative->lastTerm().parentheses.isCopy = true; // Flag read by setupAlternativeOffsets() when choosing the frame layout for parentheses.
     511                 :         }
     512                 :     }
     513                 : 
     514            7702 :     void disjunction() // Starts a new alternative in the disjunction that owns the current alternative and makes it current.
     515                 :     {
     516            7702 :         m_alternative = m_alternative->m_parent->addNewAlternative(); // NOTE(review): presumably invoked when the parser sees '|' - confirm against the parser.
     517            7702 :     }
     518                 : 
     519          122128 :     ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned *callFrameSizeOut) // Assigns inputPosition/frameLocation to each term of |alternative|, sets its m_minimumSize and m_hasFixedSize, and writes the resulting frame size to *callFrameSizeOut.
     520                 :     { // Returns NoError, or the first error reported while laying out a nested disjunction.
     521          122128 :         alternative->m_hasFixedSize = true; // Cleared below by any term that is variable-count, a back reference or a parentheses subpattern.
     522          122128 :         unsigned currentInputPosition = initialInputPosition;
     523                 :         // currentInputPosition only advances for fixed-count terms; currentCallFrameSize grows for terms that reserve backtracking space.
     524          534526 :         for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
     525          412398 :             PatternTerm& term = alternative->m_terms[i];
     526                 : 
     527          412398 :             switch (term.type) {
     528                 :             case PatternTerm::TypeAssertionBOL:
     529                 :             case PatternTerm::TypeAssertionEOL:
     530                 :             case PatternTerm::TypeAssertionWordBoundary:
     531           41788 :                 term.inputPosition = currentInputPosition; // Simple assertions: no input advance and no frame space.
     532           41788 :                 break;
     533                 : 
     534                 :             case PatternTerm::TypeBackReference:
     535              18 :                 term.inputPosition = currentInputPosition;
     536              18 :                 term.frameLocation = currentCallFrameSize;
     537              18 :                 currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
     538              18 :                 alternative->m_hasFixedSize = false; // A back reference never counts as fixed size and does not advance the input position here.
     539              18 :                 break;
     540                 : 
     541                 :             case PatternTerm::TypeForwardReference:
     542               0 :                 break; // Nothing is assigned for forward references.
     543                 : 
     544                 :             case PatternTerm::TypePatternCharacter:
     545          214280 :                 term.inputPosition = currentInputPosition;
     546          214280 :                 if (term.quantityType != QuantifierFixedCount) {
     547            6041 :                     term.frameLocation = currentCallFrameSize; // Variable-count character: reserve frame space, input position is left unchanged.
     548            6041 :                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
     549            6041 :                     alternative->m_hasFixedSize = false;
     550                 :                 } else
     551          208239 :                     currentInputPosition += term.quantityCount; // Fixed count: advances the input position by exactly quantityCount.
     552          214280 :                 break;
     553                 : 
     554                 :             case PatternTerm::TypeCharacterClass:
     555          100742 :                 term.inputPosition = currentInputPosition;
     556          100742 :                 if (term.quantityType != QuantifierFixedCount) {
     557           46290 :                     term.frameLocation = currentCallFrameSize; // Same scheme as TypePatternCharacter, with the character-class frame constant.
     558           46290 :                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
     559           46290 :                     alternative->m_hasFixedSize = false;
     560                 :                 } else
     561           54452 :                     currentInputPosition += term.quantityCount;
     562          100742 :                 break;
     563                 : 
     564                 :             case PatternTerm::TypeParenthesesSubpattern:
     565                 :                 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
     566           55498 :                 term.frameLocation = currentCallFrameSize;
     567           55498 :                 if (term.quantityCount == 1 && !term.parentheses.isCopy) { // Case 1: count of one and not a copy made by quantifyAtom(); the nested disjunction extends this frame.
     568           49775 :                     if (term.quantityType != QuantifierFixedCount)
     569           12548 :                         currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
     570           49775 :                     if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition, &currentCallFrameSize))
     571               0 :                         return error;
     572                 :                     // If quantity is fixed, then pre-check its minimum size.
     573           49775 :                     if (term.quantityType == QuantifierFixedCount)
     574           37227 :                         currentInputPosition += term.parentheses.disjunction->m_minimumSize;
     575           49775 :                     term.inputPosition = currentInputPosition; // Recorded after the possible advance above, unlike the other branches.
     576            5723 :                 } else if (term.parentheses.isTerminal) { // Case 2: flagged by checkForTerminalParentheses(); the nested disjunction also extends this frame.
     577            2726 :                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
     578            2726 :                     if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition, &currentCallFrameSize))
     579               0 :                         return error;
     580            2726 :                     term.inputPosition = currentInputPosition;
     581                 :                 } else { // Case 3: everything else; the nested disjunction is laid out from BASE_FRAME_SIZE and its resulting size is discarded.
     582            2997 :                     term.inputPosition = currentInputPosition;
     583                 :                     unsigned dummy;
     584            2997 :                     if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, BASE_FRAME_SIZE, currentInputPosition, &dummy))
     585               0 :                         return error;
     586            2997 :                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses; // Only this fixed amount is added to the enclosing frame.
     587                 :                 }
     588                 :                 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
     589           55498 :                 alternative->m_hasFixedSize = false;
     590           55498 :                 break;
     591                 : 
     592                 :             case PatternTerm::TypeParentheticalAssertion:
     593              72 :                 term.inputPosition = currentInputPosition; // The assertion itself does not advance the input position.
     594              72 :                 term.frameLocation = currentCallFrameSize;
     595              72 :                 if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition, &currentCallFrameSize)) // The nested disjunction starts after the space reserved for the assertion.
     596               0 :                     return error;
     597              72 :                 break;
     598                 :             }
     599                 :         }
     600                 :         // The minimum size is the total input advance accumulated by the loop above.
     601          122128 :         alternative->m_minimumSize = currentInputPosition - initialInputPosition;
     602          122128 :         *callFrameSizeOut = currentCallFrameSize;
     603          122128 :         return NoError;
     604                 :     }
     605                 : 
     606          113814 :     ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned *maximumCallFrameSizeOut) // Lays out every alternative of |disjunction| and records the minimum input size, largest frame size and fixed-size flag on it.
     607                 :     { // Returns NoError, an error propagated from an alternative, or PatternTooLarge (see below).
     608          113814 :         if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
     609            5895 :             initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative; // Extra frame space only for nested disjunctions that have more than one alternative.
     610                 : 
     611          113814 :         unsigned minimumInputSize = UINT_MAX; // Sentinel; lowered to the smallest alternative minimum in the loop.
     612          113814 :         unsigned maximumCallFrameSize = 0;
     613          113814 :         bool hasFixedSize = true;
     614                 :         // All alternatives start from the same frame size and input position.
     615          235942 :         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
     616          122128 :             PatternAlternative* alternative = disjunction->m_alternatives[alt];
     617                 :             unsigned currentAlternativeCallFrameSize;
     618          122128 :             if (ErrorCode error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, &currentAlternativeCallFrameSize))
     619               0 :                 return error;
     620          122128 :             minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
     621          122128 :             maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
     622          122128 :             hasFixedSize &= alternative->m_hasFixedSize; // Fixed size only if every alternative is.
     623                 :         }
     624                 :         // Still at the sentinel: there were no alternatives, or every alternative reported a minimum size of UINT_MAX.
     625          113814 :         if (minimumInputSize == UINT_MAX)
     626               0 :             return PatternTooLarge;
     627                 : 
     628          113814 :         ASSERT(maximumCallFrameSize >= initialCallFrameSize);
     629                 : 
     630          113814 :         disjunction->m_hasFixedSize = hasFixedSize;
     631          113814 :         disjunction->m_minimumSize = minimumInputSize;
     632          113814 :         disjunction->m_callFrameSize = maximumCallFrameSize;
     633          113814 :         *maximumCallFrameSizeOut = maximumCallFrameSize;
     634          113814 :         return NoError;
     635                 :     }
     636                 : 
     637           58244 :     ErrorCode setupOffsets() // Lays out the whole pattern: the body disjunction, starting at BASE_FRAME_SIZE and input position 0.
     638                 :     {
     639                 :         unsigned dummy; // Receives the maximum frame size, which is not needed here (it is also stored on the disjunction).
     640           58244 :         return setupDisjunctionOffsets(m_pattern.m_body, BASE_FRAME_SIZE, 0, &dummy);
     641                 :     }
     642                 : 
     643                 :     // This optimization identifies sets of parentheses that we will never need to backtrack.
     644                 :     // In these cases we do not need to store state from prior iterations.
     645                 :     // We can presently avoid backtracking for:
     646                 :     //   * where the parens are at the end of the regular expression (last term in any of the
     647                 :     //     alternatives of the main body disjunction).
     648                 :     //   * where the parens are non-capturing, and quantified unbounded greedy (*).
     649                 :     //   * where the parens do not contain any capturing subpatterns.
     650           58244 :     void checkForTerminalParentheses() // Sets parentheses.isTerminal on qualifying last terms of the body's alternatives; does nothing if the pattern has any subpatterns.
     651                 :     {
     652                 :         // This check is much too crude; should be just checking whether the candidate
     653                 :         // node contains nested capturing subpatterns, not the whole expression!
     654           58244 :         if (m_pattern.m_numSubpatterns)
     655           26328 :             return;
     656                 :         // Only the last term of each top-level alternative is a candidate.
     657           31916 :         Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
     658           65092 :         for (size_t i = 0; i < alternatives.size(); ++i) {
     659           33176 :             Vector<PatternTerm>& terms = alternatives[i]->m_terms;
     660           33176 :             if (terms.size()) { // Alternatives without terms are skipped.
     661           33158 :                 PatternTerm& term = terms.last();
     662           35875 :                 if (term.type == PatternTerm::TypeParenthesesSubpattern
     663                 :                     && term.quantityType == QuantifierGreedy
     664                 :                     && term.quantityCount == quantifyInfinite
     665            2717 :                     && !term.capture())
     666            2717 :                     term.parentheses.isTerminal = true; // Read later by setupAlternativeOffsets() to pick the terminal frame layout.
     667                 :             }
     668                 :         }
     669                 :     }
     670                 : 
     671           58244 :     void optimizeBOL() // For non-multiline patterns containing a BOL assertion: marks the body's alternatives once-through and appends a filtered copy of them.
     672                 :     {
     673                 :         // Look for expressions containing beginning of line (^) anchoring and unroll them.
     674                 :         // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
     675                 :         // This code relies on the parsing code tagging alternatives with m_containsBOL and
     676                 :         // m_startsWithBOL and rolling those up to containing alternatives.
     677                 :         // At this point, this is only valid for non-multiline expressions.
     678           58244 :         PatternDisjunction* disjunction = m_pattern.m_body;
     679                 :         
     680           58244 :         if (!m_pattern.m_containsBOL || m_pattern.m_multiline)
     681           38601 :             return;
     682                 :         // The copy is made before the originals are marked below; it is null when copyDisjunction() copied no alternative.
     683           19643 :         PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
     684                 : 
     685                 :         // Set alternatives in disjunction to "onceThrough"
     686           39322 :         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
     687           19679 :             disjunction->m_alternatives[alt]->setOnceThrough();
     688                 : 
     689           19643 :         if (loopDisjunction) {
     690                 :             // Move alternatives from loopDisjunction to disjunction
     691             306 :             for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
     692             153 :                 disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]);
     693                 :             // Empty the copy's list so the moved alternatives are listed only in |disjunction|.
     694             153 :             loopDisjunction->m_alternatives.clear();
     695                 :         }
     696                 :     }
     697                 : 
     698                 : private:
     699                 :     YarrPattern& m_pattern; // The pattern being built; its m_body and m_disjunctions are read and updated by the methods above.
     700                 :     PatternAlternative* m_alternative; // Alternative whose last term quantifyAtom() modifies; replaced by disjunction().
     701                 :     CharacterClassConstructor m_characterClassConstructor; // NOTE(review): not used by the methods visible here; presumably accumulates character classes during parsing - confirm.
     702                 :     bool m_invertCharacterClass; // NOTE(review): not used by the methods visible here; presumably marks a negated class - confirm.
     703                 :     bool m_invertParentheticalAssertion; // NOTE(review): not used by the methods visible here; presumably marks a negated assertion - confirm.
     704                 : };
     705                 : 
     706           58244 : ErrorCode YarrPattern::compile(const UString& patternString) // Parses |patternString| into this pattern, runs the post-parse passes and computes offsets; returns the first error or NoError.
     707                 : {
     708          116488 :     YarrPatternConstructor constructor(*this);
     709                 : 
     710           58244 :     if (ErrorCode error = parse(constructor, patternString))
     711               0 :         return error;
     712                 :     
     713                 :     // If the pattern contains illegal backreferences reset & reparse.
     714                 :     // Quoting Netscape's "What's new in JavaScript 1.2",
     715                 :     //      "Note: if the number of left parentheses is less than the number specified
     716                 :     //       in \#, the \# is taken as an octal escape as described in the next row."
     717           58244 :     if (containsIllegalBackReference()) {
     718               0 :         unsigned numSubpatterns = m_numSubpatterns; // Saved before reset(); passed to the second parse and compared afterwards.
     719                 : 
     720               0 :         constructor.reset();
     721                 : #if !ASSERT_DISABLED
     722                 :         ErrorCode error =
     723                 : #endif
     724               0 :             parse(constructor, patternString, numSubpatterns); // The result is only captured when assertions are enabled; it is checked solely by the ASSERT below.
     725                 : 
     726               0 :         ASSERT(!error);
     727               0 :         ASSERT(numSubpatterns == m_numSubpatterns);
     728                 :     }
     729                 :     // Both passes run before setupOffsets(): isTerminal is read during offset setup, and alternatives appended by optimizeBOL() are laid out there too.
     730           58244 :     constructor.checkForTerminalParentheses();
     731           58244 :     constructor.optimizeBOL();
     732                 :         
     733           58244 :     if (ErrorCode error = constructor.setupOffsets())
     734               0 :         return error;
     735                 : 
     736           58244 :     return NoError;
     737                 : }
     738                 : 
     739           58244 : YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, ErrorCode* error) // Initializes the flags, counters and cached members, then compiles |pattern| and reports the result through |error|.
     740                 :     : m_ignoreCase(ignoreCase)
     741                 :     , m_multiline(multiline)
     742                 :     , m_containsBackreferences(false)
     743                 :     , m_containsBOL(false)
     744                 :     , m_numSubpatterns(0)
     745                 :     , m_maxBackReference(0)
     746                 :     , newlineCached(0)
     747                 :     , digitsCached(0)
     748                 :     , spacesCached(0)
     749                 :     , wordcharCached(0)
     750                 :     , nondigitsCached(0)
     751                 :     , nonspacesCached(0)
     752           58244 :     , nonwordcharCached(0)
     753                 : {
     754           58244 :     *error = compile(pattern); // |error| is dereferenced unconditionally, so callers must pass a valid pointer.
     755           58244 : }
     756                 : 
     757                 : } }

Generated by: LCOV version 1.7