1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sw=4 et tw=99 ft=cpp:
3 : *
4 : * ***** BEGIN LICENSE BLOCK *****
5 : * Copyright (C) 2009 Apple Inc. All rights reserved.
6 : * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
7 : *
8 : * Redistribution and use in source and binary forms, with or without
9 : * modification, are permitted provided that the following conditions
10 : * are met:
11 : * 1. Redistributions of source code must retain the above copyright
12 : * notice, this list of conditions and the following disclaimer.
13 : * 2. Redistributions in binary form must reproduce the above copyright
14 : * notice, this list of conditions and the following disclaimer in the
15 : * documentation and/or other materials provided with the distribution.
16 : *
17 : * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
18 : * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 : * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 : * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
21 : * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24 : * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
25 : * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 : * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 : * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 : *
29 : * ***** END LICENSE BLOCK ***** */
30 :
31 : #ifndef YarrPattern_h
32 : #define YarrPattern_h
33 :
34 : #include "wtfbridge.h"
35 : #include "ASCIICType.h"
36 :
37 : namespace JSC { namespace Yarr {
38 :
// Status codes for pattern compilation. YarrPattern's constructor takes an
// ErrorCode* and YarrPattern::compile() returns one of these values.
//
// The enumerators are numbered consecutively from zero, so the trailing
// NumberOfErrorCodes sentinel equals the count of codes listed before it.
// Keep it last when adding new codes.
enum ErrorCode {
    NoError = 0,

    PatternTooLarge,

    // Quantifier problems.
    QuantifierOutOfOrder,
    QuantifierWithoutAtom,

    // Parentheses problems.
    MissingParentheses,
    ParenthesesUnmatched,
    ParenthesesTypeInvalid,

    // Character class problems.
    CharacterClassUnmatched,
    CharacterClassInvalidRange,
    CharacterClassOutOfOrder,

    EscapeUnterminated,
    QuantifierTooLarge,

    // Sentinel: not an error, just the number of codes above.
    NumberOfErrorCodes
};
54 :
55 : struct PatternDisjunction;
56 :
57 180600 : struct CharacterRange {
58 : UChar begin;
59 : UChar end;
60 :
61 89400 : CharacterRange(UChar begin, UChar end)
62 : : begin(begin)
63 89400 : , end(end)
64 : {
65 89400 : }
66 : };
67 :
68 4820 : struct CharacterClassTable : RefCounted<CharacterClassTable> {
69 : friend class js::OffTheBooks;
70 : const char* m_table;
71 : bool m_inverted;
72 4820 : static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted)
73 : {
74 4820 : return adoptRef(js::OffTheBooks::new_<CharacterClassTable>(table, inverted));
75 : }
76 :
77 : private:
78 4820 : CharacterClassTable(const char* table, bool inverted)
79 : : m_table(table)
80 4820 : , m_inverted(inverted)
81 : {
82 4820 : }
83 : };
84 :
85 : struct CharacterClass {
86 : WTF_MAKE_FAST_ALLOCATED
87 : public:
88 : // All CharacterClass instances have to have the full set of matches and ranges,
89 : // they may have an optional table for faster lookups (which must match the
90 : // specified matches and ranges)
91 54364 : CharacterClass(PassRefPtr<CharacterClassTable> table)
92 54364 : : m_table(table)
93 : {
94 54364 : }
95 54364 : ~CharacterClass()
96 54364 : {
97 54364 : js::Foreground::delete_(m_table.get());
98 54364 : }
99 : Vector<UChar> m_matches;
100 : Vector<CharacterRange> m_ranges;
101 : Vector<UChar> m_matchesUnicode;
102 : Vector<CharacterRange> m_rangesUnicode;
103 : RefPtr<CharacterClassTable> m_table;
104 : };
105 :
// How a term's quantityCount is to be applied. QuantifierFixedCount is the
// zero value and is what every PatternTerm constructor starts with.
enum QuantifierType {
    QuantifierFixedCount = 0,
    QuantifierGreedy,
    QuantifierNonGreedy
};
111 :
112 829856 : struct PatternTerm {
113 : enum Type {
114 : TypeAssertionBOL,
115 : TypeAssertionEOL,
116 : TypeAssertionWordBoundary,
117 : TypePatternCharacter,
118 : TypeCharacterClass,
119 : TypeBackReference,
120 : TypeForwardReference,
121 : TypeParenthesesSubpattern,
122 : TypeParentheticalAssertion
123 : } type;
124 : bool m_capture :1;
125 : bool m_invert :1;
126 : union {
127 : UChar patternCharacter;
128 : CharacterClass* characterClass;
129 : unsigned backReferenceSubpatternId;
130 : struct {
131 : PatternDisjunction* disjunction;
132 : unsigned subpatternId;
133 : unsigned lastSubpatternId;
134 : bool isCopy;
135 : bool isTerminal;
136 : } parentheses;
137 : };
138 : QuantifierType quantityType;
139 : unsigned quantityCount;
140 : int inputPosition;
141 : unsigned frameLocation;
142 :
143 : // No-argument constructor for js::Vector.
144 0 : PatternTerm()
145 : : type(PatternTerm::TypePatternCharacter)
146 : , m_capture(false)
147 0 : , m_invert(false)
148 : {
149 0 : patternCharacter = 0;
150 0 : quantityType = QuantifierFixedCount;
151 0 : quantityCount = 1;
152 0 : }
153 :
154 201564 : PatternTerm(UChar ch)
155 : : type(PatternTerm::TypePatternCharacter)
156 : , m_capture(false)
157 201564 : , m_invert(false)
158 : {
159 201564 : patternCharacter = ch;
160 201564 : quantityType = QuantifierFixedCount;
161 201564 : quantityCount = 1;
162 201564 : }
163 :
164 62014 : PatternTerm(CharacterClass* charClass, bool invert)
165 : : type(PatternTerm::TypeCharacterClass)
166 : , m_capture(false)
167 62014 : , m_invert(invert)
168 : {
169 62014 : characterClass = charClass;
170 62014 : quantityType = QuantifierFixedCount;
171 62014 : quantityCount = 1;
172 62014 : }
173 :
174 49797 : PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false)
175 : : type(type)
176 : , m_capture(capture)
177 49797 : , m_invert(invert)
178 : {
179 49797 : parentheses.disjunction = disjunction;
180 49797 : parentheses.subpatternId = subpatternId;
181 49797 : parentheses.isCopy = false;
182 49797 : parentheses.isTerminal = false;
183 49797 : quantityType = QuantifierFixedCount;
184 49797 : quantityCount = 1;
185 49797 : }
186 :
187 41662 : PatternTerm(Type type, bool invert = false)
188 : : type(type)
189 : , m_capture(false)
190 41662 : , m_invert(invert)
191 : {
192 41662 : quantityType = QuantifierFixedCount;
193 41662 : quantityCount = 1;
194 41662 : }
195 :
196 18 : PatternTerm(unsigned spatternId)
197 : : type(TypeBackReference)
198 : , m_capture(false)
199 18 : , m_invert(false)
200 : {
201 18 : backReferenceSubpatternId = spatternId;
202 18 : quantityType = QuantifierFixedCount;
203 18 : quantityCount = 1;
204 18 : }
205 :
206 0 : static PatternTerm ForwardReference()
207 : {
208 0 : return PatternTerm(TypeForwardReference);
209 : }
210 :
211 19762 : static PatternTerm BOL()
212 : {
213 19762 : return PatternTerm(TypeAssertionBOL);
214 : }
215 :
216 21477 : static PatternTerm EOL()
217 : {
218 21477 : return PatternTerm(TypeAssertionEOL);
219 : }
220 :
221 423 : static PatternTerm WordBoundary(bool invert)
222 : {
223 423 : return PatternTerm(TypeAssertionWordBoundary, invert);
224 : }
225 :
226 101687 : bool invert()
227 : {
228 101687 : return m_invert;
229 : }
230 :
231 177975 : bool capture()
232 : {
233 177975 : return m_capture;
234 : }
235 :
236 108514 : void quantify(unsigned count, QuantifierType type)
237 : {
238 108514 : quantityCount = count;
239 108514 : quantityType = type;
240 108514 : }
241 : };
242 :
243 122128 : struct PatternAlternative {
244 : WTF_MAKE_FAST_ALLOCATED
245 : public:
246 122128 : PatternAlternative(PatternDisjunction* disjunction)
247 : : m_parent(disjunction)
248 : , m_onceThrough(false)
249 : , m_hasFixedSize(false)
250 : , m_startsWithBOL(false)
251 122128 : , m_containsBOL(false)
252 : {
253 122128 : }
254 :
255 201955 : PatternTerm& lastTerm()
256 : {
257 201955 : ASSERT(m_terms.size());
258 201955 : return m_terms[m_terms.size() - 1];
259 : }
260 :
261 0 : void removeLastTerm()
262 : {
263 0 : ASSERT(m_terms.size());
264 0 : m_terms.shrink(m_terms.size() - 1);
265 0 : }
266 :
267 19679 : void setOnceThrough()
268 : {
269 19679 : m_onceThrough = true;
270 19679 : }
271 :
272 101656 : bool onceThrough()
273 : {
274 101656 : return m_onceThrough;
275 : }
276 :
277 : Vector<PatternTerm> m_terms;
278 : PatternDisjunction* m_parent;
279 : unsigned m_minimumSize;
280 : bool m_onceThrough : 1;
281 : bool m_hasFixedSize : 1;
282 : bool m_startsWithBOL : 1;
283 : bool m_containsBOL : 1;
284 : };
285 :
286 : struct PatternDisjunction {
287 : WTF_MAKE_FAST_ALLOCATED
288 : public:
289 113967 : PatternDisjunction(PatternAlternative* parent = 0)
290 : : m_parent(parent)
291 113967 : , m_hasFixedSize(false)
292 : {
293 113967 : }
294 :
295 113967 : ~PatternDisjunction()
296 113967 : {
297 113967 : deleteAllValues(m_alternatives);
298 113967 : }
299 :
300 122128 : PatternAlternative* addNewAlternative()
301 : {
302 122128 : PatternAlternative* alternative = js::OffTheBooks::new_<PatternAlternative>(this);
303 122128 : m_alternatives.append(alternative);
304 122128 : return alternative;
305 : }
306 :
307 : Vector<PatternAlternative*> m_alternatives;
308 : PatternAlternative* m_parent;
309 : unsigned m_minimumSize;
310 : unsigned m_callFrameSize;
311 : bool m_hasFixedSize;
312 : };
313 :
// You probably do not want to call these functions directly: each call
// builds a new CharacterClass. Instead, call newlineCharacterClass() et al.
// on your YarrPattern instance, which creates each class on first use and
// returns the cached copy afterwards.
318 : CharacterClass* newlineCreate();
319 : CharacterClass* digitsCreate();
320 : CharacterClass* spacesCreate();
321 : CharacterClass* wordcharCreate();
322 : CharacterClass* nondigitsCreate();
323 : CharacterClass* nonspacesCreate();
324 : CharacterClass* nonwordcharCreate();
325 :
326 : struct TermChain {
327 : TermChain(PatternTerm term)
328 : : term(term)
329 : {}
330 :
331 : PatternTerm term;
332 : Vector<TermChain> hotTerms;
333 : };
334 :
335 : struct YarrPattern {
336 : YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, ErrorCode* error);
337 :
338 58244 : ~YarrPattern()
339 58244 : {
340 58244 : deleteAllValues(m_disjunctions);
341 58244 : deleteAllValues(m_userCharacterClasses);
342 58244 : }
343 :
344 0 : void reset()
345 : {
346 0 : m_numSubpatterns = 0;
347 0 : m_maxBackReference = 0;
348 :
349 0 : m_containsBackreferences = false;
350 0 : m_containsBOL = false;
351 :
352 0 : newlineCached = 0;
353 0 : digitsCached = 0;
354 0 : spacesCached = 0;
355 0 : wordcharCached = 0;
356 0 : nondigitsCached = 0;
357 0 : nonspacesCached = 0;
358 0 : nonwordcharCached = 0;
359 :
360 0 : deleteAllValues(m_disjunctions);
361 0 : m_disjunctions.clear();
362 0 : deleteAllValues(m_userCharacterClasses);
363 0 : m_userCharacterClasses.clear();
364 0 : }
365 :
366 58244 : bool containsIllegalBackReference()
367 : {
368 58244 : return m_maxBackReference > m_numSubpatterns;
369 : }
370 :
371 8352 : CharacterClass* newlineCharacterClass()
372 : {
373 8352 : if (!newlineCached)
374 7875 : m_userCharacterClasses.append(newlineCached = newlineCreate());
375 8352 : return newlineCached;
376 : }
377 25151 : CharacterClass* digitsCharacterClass()
378 : {
379 25151 : if (!digitsCached)
380 11132 : m_userCharacterClasses.append(digitsCached = digitsCreate());
381 25151 : return digitsCached;
382 : }
383 908 : CharacterClass* spacesCharacterClass()
384 : {
385 908 : if (!spacesCached)
386 609 : m_userCharacterClasses.append(spacesCached = spacesCreate());
387 908 : return spacesCached;
388 : }
389 5203 : CharacterClass* wordcharCharacterClass()
390 : {
391 5203 : if (!wordcharCached)
392 4193 : m_userCharacterClasses.append(wordcharCached = wordcharCreate());
393 5203 : return wordcharCached;
394 : }
395 0 : CharacterClass* nondigitsCharacterClass()
396 : {
397 0 : if (!nondigitsCached)
398 0 : m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
399 0 : return nondigitsCached;
400 : }
401 9 : CharacterClass* nonspacesCharacterClass()
402 : {
403 9 : if (!nonspacesCached)
404 9 : m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
405 9 : return nonspacesCached;
406 : }
407 9 : CharacterClass* nonwordcharCharacterClass()
408 : {
409 9 : if (!nonwordcharCached)
410 9 : m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
411 9 : return nonwordcharCached;
412 : }
413 :
414 : bool m_ignoreCase : 1;
415 : bool m_multiline : 1;
416 : bool m_containsBackreferences : 1;
417 : bool m_containsBOL : 1;
418 : unsigned m_numSubpatterns;
419 : unsigned m_maxBackReference;
420 : PatternDisjunction* m_body;
421 : Vector<PatternDisjunction*, 4> m_disjunctions;
422 : Vector<CharacterClass*> m_userCharacterClasses;
423 :
424 : private:
425 : ErrorCode compile(const UString& patternString);
426 :
427 : CharacterClass* newlineCached;
428 : CharacterClass* digitsCached;
429 : CharacterClass* spacesCached;
430 : CharacterClass* wordcharCached;
431 : CharacterClass* nondigitsCached;
432 : CharacterClass* nonspacesCached;
433 : CharacterClass* nonwordcharCached;
434 : };
435 :
436 : } } // namespace JSC::Yarr
437 :
438 : #endif // YarrPattern_h
|