1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sw=4 et tw=99 ft=cpp:
3 : *
4 : * ***** BEGIN LICENSE BLOCK *****
5 : * Copyright (C) 2009 Apple Inc. All rights reserved.
6 : * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
7 : *
8 : * Redistribution and use in source and binary forms, with or without
9 : * modification, are permitted provided that the following conditions
10 : * are met:
11 : * 1. Redistributions of source code must retain the above copyright
12 : * notice, this list of conditions and the following disclaimer.
13 : * 2. Redistributions in binary form must reproduce the above copyright
14 : * notice, this list of conditions and the following disclaimer in the
15 : * documentation and/or other materials provided with the distribution.
16 : *
17 : * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
18 : * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 : * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 : * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
21 : * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24 : * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
25 : * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 : * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 : * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 : *
29 : * ***** END LICENSE BLOCK ***** */
30 :
31 : #include "YarrPattern.h"
32 :
33 : #include "Yarr.h"
34 : #include "YarrParser.h"
35 :
36 : using namespace WTF;
37 :
38 : namespace JSC { namespace Yarr {
39 :
40 : #include "RegExpJitTables.h"
41 :
// Call-frame size that offset computation starts from: 24 when WTF_CPU_SPARC
// is set, 0 on every other configuration.
#if !WTF_CPU_SPARC
#define BASE_FRAME_SIZE 0
#else
#define BASE_FRAME_SIZE 24
#endif
47 :
48 58244 : class CharacterClassConstructor {
49 : public:
50 58244 : CharacterClassConstructor(bool isCaseInsensitive = false)
51 58244 : : m_isCaseInsensitive(isCaseInsensitive)
52 : {
53 58244 : }
54 :
55 30537 : void reset()
56 : {
57 30537 : m_matches.clear();
58 30537 : m_ranges.clear();
59 30537 : m_matchesUnicode.clear();
60 30537 : m_rangesUnicode.clear();
61 30537 : }
62 :
63 759 : void append(const CharacterClass* other)
64 : {
65 1481 : for (size_t i = 0; i < other->m_matches.size(); ++i)
66 722 : addSorted(m_matches, other->m_matches[i]);
67 2791 : for (size_t i = 0; i < other->m_ranges.size(); ++i)
68 2032 : addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
69 1551 : for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
70 792 : addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
71 948 : for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
72 189 : addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
73 759 : }
74 :
75 35924 : void putChar(UChar ch)
76 : {
77 35924 : if (ch <= 0x7f) {
78 35897 : if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
79 3578 : addSorted(m_matches, toASCIIUpper(ch));
80 3578 : addSorted(m_matches, toASCIILower(ch));
81 : } else
82 32319 : addSorted(m_matches, ch);
83 : } else {
84 : UChar upper, lower;
85 27 : if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
86 0 : addSorted(m_matchesUnicode, upper);
87 0 : addSorted(m_matchesUnicode, lower);
88 : } else
89 27 : addSorted(m_matchesUnicode, ch);
90 : }
91 35924 : }
92 :
93 : // returns true if this character has another case, and 'ch' is the upper case form.
94 0 : static inline bool isUnicodeUpper(UChar ch)
95 : {
96 0 : return ch != Unicode::toLower(ch);
97 : }
98 :
99 : // returns true if this character has another case, and 'ch' is the lower case form.
100 0 : static inline bool isUnicodeLower(UChar ch)
101 : {
102 0 : return ch != Unicode::toUpper(ch);
103 : }
104 :
105 41723 : void putRange(UChar lo, UChar hi)
106 : {
107 41723 : if (lo <= 0x7f) {
108 41678 : char asciiLo = lo;
109 41678 : char asciiHi = std::min(hi, (UChar)0x7f);
110 41678 : addSortedRange(m_ranges, lo, asciiHi);
111 :
112 41678 : if (m_isCaseInsensitive) {
113 38007 : if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
114 12 : addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
115 38007 : if ((asciiLo <= 'z') && (asciiHi >= 'a'))
116 20441 : addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
117 : }
118 : }
119 41723 : if (hi >= 0x80) {
120 48 : uint32_t unicodeCurr = std::max(lo, (UChar)0x80);
121 48 : addSortedRange(m_rangesUnicode, unicodeCurr, hi);
122 :
123 48 : if (m_isCaseInsensitive) {
124 0 : while (unicodeCurr <= hi) {
125 : // If the upper bound of the range (hi) is 0xffff, the increments to
126 : // unicodeCurr in this loop may take it to 0x10000. This is fine
127 : // (if so we won't re-enter the loop, since the loop condition above
128 : // will definitely fail) - but this does mean we cannot use a UChar
129 : // to represent unicodeCurr, we must use a 32-bit value instead.
130 0 : ASSERT(unicodeCurr <= 0xffff);
131 :
132 0 : if (isUnicodeUpper(unicodeCurr)) {
133 0 : UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
134 0 : UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
135 0 : while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
136 0 : lowerCaseRangeEnd++;
137 0 : addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
138 0 : } else if (isUnicodeLower(unicodeCurr)) {
139 0 : UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
140 0 : UChar upperCaseRangeEnd = upperCaseRangeBegin;
141 0 : while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
142 0 : upperCaseRangeEnd++;
143 0 : addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
144 : } else
145 0 : ++unicodeCurr;
146 : }
147 : }
148 : }
149 41723 : }
150 :
151 30537 : CharacterClass* charClass()
152 : {
153 30537 : CharacterClass* characterClass = js::OffTheBooks::new_<CharacterClass>(PassRefPtr<CharacterClassTable>(0));
154 :
155 30537 : characterClass->m_matches.append(m_matches);
156 30537 : characterClass->m_ranges.append(m_ranges);
157 30537 : characterClass->m_matchesUnicode.append(m_matchesUnicode);
158 30537 : characterClass->m_rangesUnicode.append(m_rangesUnicode);
159 :
160 30537 : reset();
161 :
162 30537 : return characterClass;
163 : }
164 :
165 : private:
166 41016 : void addSorted(Vector<UChar>& matches, UChar ch)
167 : {
168 41016 : unsigned pos = 0;
169 41016 : unsigned range = matches.size();
170 :
171 : // binary chop, find position to insert char.
172 153396 : while (range) {
173 71472 : unsigned index = range >> 1;
174 :
175 71472 : int val = matches[pos+index] - ch;
176 71472 : if (!val)
177 108 : return;
178 71364 : else if (val > 0)
179 30215 : range = index;
180 : else {
181 41149 : pos += (index+1);
182 41149 : range -= (index+1);
183 : }
184 : }
185 :
186 40908 : if (pos == matches.size())
187 28236 : matches.append(ch);
188 : else
189 12672 : matches.insert(pos, ch);
190 : }
191 :
192 64400 : void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
193 : {
194 64400 : unsigned end = ranges.size();
195 :
196 : // Simple linear scan - I doubt there are that many ranges anyway...
197 : // feel free to fix this with something faster (eg binary chop).
198 71196 : for (unsigned i = 0; i < end; ++i) {
199 : // does the new range fall before the current position in the array
200 44125 : if (hi < ranges[i].begin) {
201 : // optional optimization: concatenate appending ranges? - may not be worthwhile.
202 37274 : if (hi == (ranges[i].begin - 1)) {
203 27 : ranges[i].begin = lo;
204 27 : return;
205 : }
206 37247 : ranges.insert(i, CharacterRange(lo, hi));
207 37247 : return;
208 : }
209 : // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
210 : // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
211 : // end of the last range they concatenate, which is just as good.
212 6851 : if (lo <= (ranges[i].end + 1)) {
213 : // found an intersect! we'll replace this entry in the array.
214 55 : ranges[i].begin = std::min(ranges[i].begin, lo);
215 55 : ranges[i].end = std::max(ranges[i].end, hi);
216 :
217 : // now check if the new range can subsume any subsequent ranges.
218 55 : unsigned next = i+1;
219 : // each iteration of the loop we will either remove something from the list, or break the loop.
220 119 : while (next < ranges.size()) {
221 37 : if (ranges[next].begin <= (ranges[i].end + 1)) {
222 : // the next entry now overlaps / concatenates this one.
223 9 : ranges[i].end = std::max(ranges[i].end, ranges[next].end);
224 9 : ranges.remove(next);
225 : } else
226 28 : break;
227 : }
228 :
229 55 : return;
230 : }
231 : }
232 :
233 : // CharacterRange comes after all existing ranges.
234 27071 : ranges.append(CharacterRange(lo, hi));
235 : }
236 :
237 : bool m_isCaseInsensitive;
238 :
239 : Vector<UChar> m_matches;
240 : Vector<CharacterRange> m_ranges;
241 : Vector<UChar> m_matchesUnicode;
242 : Vector<CharacterRange> m_rangesUnicode;
243 : };
244 :
245 : class YarrPatternConstructor {
246 : public:
247 58244 : YarrPatternConstructor(YarrPattern& pattern)
248 : : m_pattern(pattern)
249 : , m_characterClassConstructor(pattern.m_ignoreCase)
250 58244 : , m_invertParentheticalAssertion(false)
251 : {
252 58244 : m_pattern.m_body = js::OffTheBooks::new_<PatternDisjunction>();
253 58244 : m_alternative = m_pattern.m_body->addNewAlternative();
254 58244 : m_pattern.m_disjunctions.append(m_pattern.m_body);
255 58244 : }
256 :
257 58244 : ~YarrPatternConstructor()
258 58244 : {
259 58244 : }
260 :
261 0 : void reset()
262 : {
263 0 : m_pattern.reset();
264 0 : m_characterClassConstructor.reset();
265 :
266 0 : m_pattern.m_body = js::OffTheBooks::new_<PatternDisjunction>();
267 0 : m_alternative = m_pattern.m_body->addNewAlternative();
268 0 : m_pattern.m_disjunctions.append(m_pattern.m_body);
269 0 : }
270 :
271 19762 : void assertionBOL()
272 : {
273 19762 : if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) {
274 19762 : m_alternative->m_startsWithBOL = true;
275 19762 : m_alternative->m_containsBOL = true;
276 19762 : m_pattern.m_containsBOL = true;
277 : }
278 19762 : m_alternative->m_terms.append(PatternTerm::BOL());
279 19762 : }
280 21477 : void assertionEOL()
281 : {
282 21477 : m_alternative->m_terms.append(PatternTerm::EOL());
283 21477 : }
284 423 : void assertionWordBoundary(bool invert)
285 : {
286 423 : m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
287 423 : }
288 :
289 201564 : void atomPatternCharacter(UChar ch)
290 : {
291 : // We handle case-insensitive checking of unicode characters which do have both
292 : // cases by handling them as if they were defined using a CharacterClass.
293 201564 : if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
294 0 : atomCharacterClassBegin();
295 0 : atomCharacterClassAtom(ch);
296 0 : atomCharacterClassEnd();
297 : } else
298 201564 : m_alternative->m_terms.append(PatternTerm(ch));
299 201564 : }
300 :
301 31477 : void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
302 : {
303 31477 : switch (classID) {
304 : case DigitClassID:
305 25123 : m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
306 25123 : break;
307 : case SpaceClassID:
308 809 : m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
309 809 : break;
310 : case WordClassID:
311 372 : m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
312 372 : break;
313 : case NewlineClassID:
314 5173 : m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
315 5173 : break;
316 : }
317 31477 : }
318 :
319 30537 : void atomCharacterClassBegin(bool invert = false)
320 : {
321 30537 : m_invertCharacterClass = invert;
322 30537 : }
323 :
324 35924 : void atomCharacterClassAtom(UChar ch)
325 : {
326 35924 : m_characterClassConstructor.putChar(ch);
327 35924 : }
328 :
329 41723 : void atomCharacterClassRange(UChar begin, UChar end)
330 : {
331 41723 : m_characterClassConstructor.putRange(begin, end);
332 41723 : }
333 :
334 759 : void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
335 : {
336 759 : ASSERT(classID != NewlineClassID);
337 :
338 759 : switch (classID) {
339 : case DigitClassID:
340 28 : m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
341 28 : break;
342 :
343 : case SpaceClassID:
344 108 : m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
345 108 : break;
346 :
347 : case WordClassID:
348 623 : m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
349 623 : break;
350 :
351 : default:
352 0 : ASSERT_NOT_REACHED();
353 : }
354 759 : }
355 :
356 30537 : void atomCharacterClassEnd()
357 : {
358 30537 : CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
359 30537 : m_pattern.m_userCharacterClasses.append(newCharacterClass);
360 30537 : m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
361 30537 : }
362 :
363 49725 : void atomParenthesesSubpatternBegin(bool capture = true)
364 : {
365 49725 : unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
366 49725 : if (capture)
367 31267 : m_pattern.m_numSubpatterns++;
368 :
369 49725 : PatternDisjunction* parenthesesDisjunction = js::OffTheBooks::new_<PatternDisjunction>(m_alternative);
370 49725 : m_pattern.m_disjunctions.append(parenthesesDisjunction);
371 49725 : m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false));
372 49725 : m_alternative = parenthesesDisjunction->addNewAlternative();
373 49725 : }
374 :
375 72 : void atomParentheticalAssertionBegin(bool invert = false)
376 : {
377 72 : PatternDisjunction* parenthesesDisjunction = js::OffTheBooks::new_<PatternDisjunction>(m_alternative);
378 72 : m_pattern.m_disjunctions.append(parenthesesDisjunction);
379 72 : m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert));
380 72 : m_alternative = parenthesesDisjunction->addNewAlternative();
381 72 : m_invertParentheticalAssertion = invert;
382 72 : }
383 :
384 49797 : void atomParenthesesEnd()
385 : {
386 49797 : ASSERT(m_alternative->m_parent);
387 49797 : ASSERT(m_alternative->m_parent->m_parent);
388 :
389 49797 : PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
390 49797 : m_alternative = m_alternative->m_parent->m_parent;
391 :
392 49797 : PatternTerm& lastTerm = m_alternative->lastTerm();
393 :
394 49797 : unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
395 49797 : unsigned numBOLAnchoredAlts = 0;
396 :
397 105757 : for (unsigned i = 0; i < numParenAlternatives; i++) {
398 : // Bubble up BOL flags
399 55960 : if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
400 126 : numBOLAnchoredAlts++;
401 : }
402 :
403 49797 : if (numBOLAnchoredAlts) {
404 126 : m_alternative->m_containsBOL = true;
405 : // If all the alternatives in parens start with BOL, then so does this one
406 126 : if (numBOLAnchoredAlts == numParenAlternatives)
407 0 : m_alternative->m_startsWithBOL = true;
408 : }
409 :
410 49797 : lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
411 49797 : m_invertParentheticalAssertion = false;
412 49797 : }
413 :
414 18 : void atomBackReference(unsigned subpatternId)
415 : {
416 18 : ASSERT(subpatternId);
417 18 : m_pattern.m_containsBackreferences = true;
418 18 : m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
419 :
420 18 : if (subpatternId > m_pattern.m_numSubpatterns) {
421 0 : m_alternative->m_terms.append(PatternTerm::ForwardReference());
422 0 : return;
423 : }
424 :
425 18 : PatternAlternative* currentAlternative = m_alternative;
426 18 : ASSERT(currentAlternative);
427 :
428 : // Note to self: if we waited until the AST was baked, we could also remove forwards refs
429 54 : while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
430 18 : PatternTerm& term = currentAlternative->lastTerm();
431 18 : ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
432 :
433 18 : if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
434 0 : m_alternative->m_terms.append(PatternTerm::ForwardReference());
435 0 : return;
436 : }
437 : }
438 :
439 18 : m_alternative->m_terms.append(PatternTerm(subpatternId));
440 : }
441 :
442 : // deep copy the argument disjunction. If filterStartsWithBOL is true,
443 : // skip alternatives with m_startsWithBOL set true.
444 25416 : PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
445 : {
446 25416 : PatternDisjunction* newDisjunction = 0;
447 51453 : for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
448 26037 : PatternAlternative* alternative = disjunction->m_alternatives[alt];
449 26037 : if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
450 6385 : if (!newDisjunction) {
451 5926 : newDisjunction = js::OffTheBooks::new_<PatternDisjunction>();
452 5926 : newDisjunction->m_parent = disjunction->m_parent;
453 : }
454 6385 : PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
455 22886 : for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
456 16501 : newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
457 : }
458 : }
459 :
460 25416 : if (newDisjunction)
461 5926 : m_pattern.m_disjunctions.append(newDisjunction);
462 25416 : return newDisjunction;
463 : }
464 :
465 57343 : PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
466 : {
467 57343 : if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
468 51570 : return PatternTerm(term);
469 :
470 5773 : PatternTerm termCopy = term;
471 5773 : termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
472 5773 : return termCopy;
473 : }
474 :
475 67672 : void quantifyAtom(unsigned min, unsigned max, bool greedy)
476 : {
477 67672 : ASSERT(min <= max);
478 67672 : ASSERT(m_alternative->m_terms.size());
479 :
480 67672 : if (!max) {
481 0 : m_alternative->removeLastTerm();
482 0 : return;
483 : }
484 :
485 67672 : PatternTerm& term = m_alternative->lastTerm();
486 67672 : ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
487 67672 : ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
488 :
489 : // For any assertion with a zero minimum, not matching is valid and has no effect,
490 : // remove it. Otherwise, we need to match as least once, but there is no point
491 : // matching more than once, so remove the quantifier. It is not entirely clear
492 : // from the spec whether or not this behavior is correct, but I believe this
493 : // matches Firefox. :-/
494 67672 : if (term.type == PatternTerm::TypeParentheticalAssertion) {
495 0 : if (!min)
496 0 : m_alternative->removeLastTerm();
497 0 : return;
498 : }
499 :
500 67672 : if (min == 0)
501 24144 : term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
502 43528 : else if (min == max)
503 2686 : term.quantify(min, QuantifierFixedCount);
504 : else {
505 40842 : term.quantify(min, QuantifierFixedCount);
506 40842 : m_alternative->m_terms.append(copyTerm(term));
507 : // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
508 40842 : m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
509 40842 : if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
510 2784 : m_alternative->lastTerm().parentheses.isCopy = true;
511 : }
512 : }
513 :
514 7702 : void disjunction()
515 : {
516 7702 : m_alternative = m_alternative->m_parent->addNewAlternative();
517 7702 : }
518 :
519 122128 : ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned *callFrameSizeOut)
520 : {
521 122128 : alternative->m_hasFixedSize = true;
522 122128 : unsigned currentInputPosition = initialInputPosition;
523 :
524 534526 : for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
525 412398 : PatternTerm& term = alternative->m_terms[i];
526 :
527 412398 : switch (term.type) {
528 : case PatternTerm::TypeAssertionBOL:
529 : case PatternTerm::TypeAssertionEOL:
530 : case PatternTerm::TypeAssertionWordBoundary:
531 41788 : term.inputPosition = currentInputPosition;
532 41788 : break;
533 :
534 : case PatternTerm::TypeBackReference:
535 18 : term.inputPosition = currentInputPosition;
536 18 : term.frameLocation = currentCallFrameSize;
537 18 : currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
538 18 : alternative->m_hasFixedSize = false;
539 18 : break;
540 :
541 : case PatternTerm::TypeForwardReference:
542 0 : break;
543 :
544 : case PatternTerm::TypePatternCharacter:
545 214280 : term.inputPosition = currentInputPosition;
546 214280 : if (term.quantityType != QuantifierFixedCount) {
547 6041 : term.frameLocation = currentCallFrameSize;
548 6041 : currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
549 6041 : alternative->m_hasFixedSize = false;
550 : } else
551 208239 : currentInputPosition += term.quantityCount;
552 214280 : break;
553 :
554 : case PatternTerm::TypeCharacterClass:
555 100742 : term.inputPosition = currentInputPosition;
556 100742 : if (term.quantityType != QuantifierFixedCount) {
557 46290 : term.frameLocation = currentCallFrameSize;
558 46290 : currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
559 46290 : alternative->m_hasFixedSize = false;
560 : } else
561 54452 : currentInputPosition += term.quantityCount;
562 100742 : break;
563 :
564 : case PatternTerm::TypeParenthesesSubpattern:
565 : // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
566 55498 : term.frameLocation = currentCallFrameSize;
567 55498 : if (term.quantityCount == 1 && !term.parentheses.isCopy) {
568 49775 : if (term.quantityType != QuantifierFixedCount)
569 12548 : currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
570 49775 : if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition, ¤tCallFrameSize))
571 0 : return error;
572 : // If quantity is fixed, then pre-check its minimum size.
573 49775 : if (term.quantityType == QuantifierFixedCount)
574 37227 : currentInputPosition += term.parentheses.disjunction->m_minimumSize;
575 49775 : term.inputPosition = currentInputPosition;
576 5723 : } else if (term.parentheses.isTerminal) {
577 2726 : currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
578 2726 : if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition, ¤tCallFrameSize))
579 0 : return error;
580 2726 : term.inputPosition = currentInputPosition;
581 : } else {
582 2997 : term.inputPosition = currentInputPosition;
583 : unsigned dummy;
584 2997 : if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, BASE_FRAME_SIZE, currentInputPosition, &dummy))
585 0 : return error;
586 2997 : currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
587 : }
588 : // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
589 55498 : alternative->m_hasFixedSize = false;
590 55498 : break;
591 :
592 : case PatternTerm::TypeParentheticalAssertion:
593 72 : term.inputPosition = currentInputPosition;
594 72 : term.frameLocation = currentCallFrameSize;
595 72 : if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition, ¤tCallFrameSize))
596 0 : return error;
597 72 : break;
598 : }
599 : }
600 :
601 122128 : alternative->m_minimumSize = currentInputPosition - initialInputPosition;
602 122128 : *callFrameSizeOut = currentCallFrameSize;
603 122128 : return NoError;
604 : }
605 :
606 113814 : ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned *maximumCallFrameSizeOut)
607 : {
608 113814 : if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
609 5895 : initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
610 :
611 113814 : unsigned minimumInputSize = UINT_MAX;
612 113814 : unsigned maximumCallFrameSize = 0;
613 113814 : bool hasFixedSize = true;
614 :
615 235942 : for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
616 122128 : PatternAlternative* alternative = disjunction->m_alternatives[alt];
617 : unsigned currentAlternativeCallFrameSize;
618 122128 : if (ErrorCode error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, ¤tAlternativeCallFrameSize))
619 0 : return error;
620 122128 : minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
621 122128 : maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
622 122128 : hasFixedSize &= alternative->m_hasFixedSize;
623 : }
624 :
625 113814 : if (minimumInputSize == UINT_MAX)
626 0 : return PatternTooLarge;
627 :
628 113814 : ASSERT(maximumCallFrameSize >= initialCallFrameSize);
629 :
630 113814 : disjunction->m_hasFixedSize = hasFixedSize;
631 113814 : disjunction->m_minimumSize = minimumInputSize;
632 113814 : disjunction->m_callFrameSize = maximumCallFrameSize;
633 113814 : *maximumCallFrameSizeOut = maximumCallFrameSize;
634 113814 : return NoError;
635 : }
636 :
637 58244 : ErrorCode setupOffsets()
638 : {
639 : unsigned dummy;
640 58244 : return setupDisjunctionOffsets(m_pattern.m_body, BASE_FRAME_SIZE, 0, &dummy);
641 : }
642 :
643 : // This optimization identifies sets of parentheses that we will never need to backtrack.
644 : // In these cases we do not need to store state from prior iterations.
645 : // We can presently avoid backtracking for:
646 : // * where the parens are at the end of the regular expression (last term in any of the
647 : // alternatives of the main body disjunction).
648 : // * where the parens are non-capturing, and quantified unbounded greedy (*).
649 : // * where the parens do not contain any capturing subpatterns.
650 58244 : void checkForTerminalParentheses()
651 : {
652 : // This check is much too crude; should be just checking whether the candidate
653 : // node contains nested capturing subpatterns, not the whole expression!
654 58244 : if (m_pattern.m_numSubpatterns)
655 26328 : return;
656 :
657 31916 : Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
658 65092 : for (size_t i = 0; i < alternatives.size(); ++i) {
659 33176 : Vector<PatternTerm>& terms = alternatives[i]->m_terms;
660 33176 : if (terms.size()) {
661 33158 : PatternTerm& term = terms.last();
662 35875 : if (term.type == PatternTerm::TypeParenthesesSubpattern
663 : && term.quantityType == QuantifierGreedy
664 : && term.quantityCount == quantifyInfinite
665 2717 : && !term.capture())
666 2717 : term.parentheses.isTerminal = true;
667 : }
668 : }
669 : }
670 :
671 58244 : void optimizeBOL()
672 : {
673 : // Look for expressions containing beginning of line (^) anchoring and unroll them.
674 : // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
675 : // This code relies on the parsing code tagging alternatives with m_containsBOL and
676 : // m_startsWithBOL and rolling those up to containing alternatives.
677 : // At this point, this is only valid for non-multiline expressions.
678 58244 : PatternDisjunction* disjunction = m_pattern.m_body;
679 :
680 58244 : if (!m_pattern.m_containsBOL || m_pattern.m_multiline)
681 38601 : return;
682 :
683 19643 : PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
684 :
685 : // Set alternatives in disjunction to "onceThrough"
686 39322 : for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
687 19679 : disjunction->m_alternatives[alt]->setOnceThrough();
688 :
689 19643 : if (loopDisjunction) {
690 : // Move alternatives from loopDisjunction to disjunction
691 306 : for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
692 153 : disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]);
693 :
694 153 : loopDisjunction->m_alternatives.clear();
695 : }
696 : }
697 :
698 : private:
699 : YarrPattern& m_pattern;
700 : PatternAlternative* m_alternative;
701 : CharacterClassConstructor m_characterClassConstructor;
702 : bool m_invertCharacterClass;
703 : bool m_invertParentheticalAssertion;
704 : };
705 :
706 58244 : ErrorCode YarrPattern::compile(const UString& patternString)
707 : {
708 116488 : YarrPatternConstructor constructor(*this);
709 :
710 58244 : if (ErrorCode error = parse(constructor, patternString))
711 0 : return error;
712 :
713 : // If the pattern contains illegal backreferences reset & reparse.
714 : // Quoting Netscape's "What's new in JavaScript 1.2",
715 : // "Note: if the number of left parentheses is less than the number specified
716 : // in \#, the \# is taken as an octal escape as described in the next row."
717 58244 : if (containsIllegalBackReference()) {
718 0 : unsigned numSubpatterns = m_numSubpatterns;
719 :
720 0 : constructor.reset();
721 : #if !ASSERT_DISABLED
722 : ErrorCode error =
723 : #endif
724 0 : parse(constructor, patternString, numSubpatterns);
725 :
726 0 : ASSERT(!error);
727 0 : ASSERT(numSubpatterns == m_numSubpatterns);
728 : }
729 :
730 58244 : constructor.checkForTerminalParentheses();
731 58244 : constructor.optimizeBOL();
732 :
733 58244 : if (ErrorCode error = constructor.setupOffsets())
734 0 : return error;
735 :
736 58244 : return NoError;
737 : }
738 :
739 58244 : YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, ErrorCode* error)
740 : : m_ignoreCase(ignoreCase)
741 : , m_multiline(multiline)
742 : , m_containsBackreferences(false)
743 : , m_containsBOL(false)
744 : , m_numSubpatterns(0)
745 : , m_maxBackReference(0)
746 : , newlineCached(0)
747 : , digitsCached(0)
748 : , spacesCached(0)
749 : , wordcharCached(0)
750 : , nondigitsCached(0)
751 : , nonspacesCached(0)
752 58244 : , nonwordcharCached(0)
753 : {
754 58244 : *error = compile(pattern);
755 58244 : }
756 :
757 : } }
|