]>
Commit | Line | Data |
---|---|---|
1 | // -*- C++ -*- | |
2 | // Copyright (C) 2012-2018 Red Hat Inc. | |
3 | // | |
4 | // This file is part of systemtap, and is free software. You can | |
5 | // redistribute it and/or modify it under the terms of the GNU General | |
6 | // Public License (GPL); either version 2, or (at your option) any | |
7 | // later version. | |
8 | // | |
9 | // --- | |
10 | // | |
11 | // This file incorporates code from the re2c project; please see | |
12 | // the file README.stapregex for details. | |
13 | ||
14 | #include "util.h" | |
15 | ||
16 | #include "stapregex-tree.h" | |
17 | #include "stapregex-parse.h" | |
18 | ||
19 | #include <cstdlib> | |
20 | #include <cstring> | |
21 | #include <string> | |
22 | ||
23 | using namespace std; | |
24 | ||
25 | namespace stapregex { | |
26 | ||
27 | void print_escaped(std::ostream& o, rchar c) | |
28 | { | |
29 | o << escaped_character((unsigned)c); | |
30 | } | |
31 | ||
32 | // ------------------------------------------------------------------------ | |
33 | ||
34 | cursor::cursor() : input(NULL), do_unescape(false), pos(~0), | |
35 | last_pos(~0), finished(false), next_c(0), last_c(0) {} | |
36 | ||
37 | cursor::cursor(const std::string *input, bool do_unescape) | |
38 | : input(input), do_unescape(do_unescape), pos(0), last_pos(0), finished(false) | |
39 | { | |
40 | next_c = 0; last_c = 0; | |
41 | finished = ( pos >= input->length() ); | |
42 | } | |
43 | ||
44 | rchar | |
45 | cursor::next () | |
46 | { | |
47 | if (! next_c && finished) | |
48 | throw regex_error(_("unexpected end of regex"), pos); | |
49 | if (! next_c) | |
50 | get_unescaped(); | |
51 | ||
52 | last_c = next_c; | |
53 | // advance by zeroing next_c | |
54 | next_c = 0; | |
55 | ||
56 | return last_c; | |
57 | } | |
58 | ||
59 | rchar | |
60 | cursor::peek () | |
61 | { | |
62 | if (! next_c && ! finished) | |
63 | get_unescaped(); | |
64 | ||
65 | // don't advance by zeroing next_c | |
66 | last_c = next_c; | |
67 | ||
68 | return next_c; | |
69 | } | |
70 | ||
71 | bool | |
72 | cursor::has (unsigned n) | |
73 | { | |
74 | return ( pos <= input->length() - n ); | |
75 | } | |
76 | ||
77 | /* Systemtap doesn't unescape string literals for us, presuming to | |
78 | pass the backslashes intact to a C compiler; hence we need to do | |
79 | our own unescaping here. | |
80 | ||
81 | This functionality needs to be handled as part of cursor, in order | |
82 | to correctly retain the original positions in the string when doing | |
83 | error reporting. */ | |
84 | void | |
85 | cursor::get_unescaped () | |
86 | { | |
87 | static const char *hex = "0123456789abcdef"; | |
88 | static const char *oct = "01234567"; | |
89 | ||
90 | last_pos = pos; | |
91 | char c = (*input)[pos]; | |
92 | ||
93 | if (c != '\\' || !do_unescape) | |
94 | { | |
95 | next_c = c; | |
96 | pos++; | |
97 | finished = ( pos >= input->length() ); | |
98 | return; | |
99 | } | |
100 | ||
101 | pos++; | |
102 | ||
103 | /* Check for improper string end: */ | |
104 | if (pos >= input->length()) | |
105 | throw regex_error(_("unexpected end of regex"), pos); | |
106 | ||
107 | /* The logic is based on re2c's Scanner::unescape() method; | |
108 | the set of accepted escape codes should correspond to | |
109 | lexer::scan() in parse.cxx. */ | |
110 | c = (*input)[pos]; | |
111 | switch (c) | |
112 | { | |
113 | case 'a': c = '\a'; break; | |
114 | case 'b': c = '\b'; break; | |
115 | case 't': c = '\t'; break; | |
116 | case 'n': c = '\n'; break; | |
117 | case 'v': c = '\v'; break; | |
118 | case 'f': c = '\f'; break; | |
119 | case 'r': c = '\r'; break; | |
120 | ||
121 | case 'x': | |
122 | { | |
123 | if (pos >= input->length() - 2) | |
124 | throw regex_error(_("two hex digits required in escape sequence"), pos); | |
125 | ||
126 | const char *d1 = strchr(hex, tolower((*input)[pos+1])); | |
127 | const char *d2 = strchr(hex, tolower((*input)[pos+2])); | |
128 | ||
129 | if (!d1 || !d2) | |
130 | throw regex_error(_("two hex digits required in escape sequence"), pos + (d1 ? 1 : 2)); | |
131 | ||
132 | c = (char)((d1-hex) << 4) + (char)(d2-hex); | |
133 | pos += 2; // skip two chars more than usual | |
134 | break; | |
135 | } | |
136 | case '4' ... '7': | |
137 | // XXX: perhaps perform error recovery (slurp 3 octal chars)? | |
138 | throw regex_error(_("octal escape sequence out of range"), pos); | |
139 | ||
140 | case '0' ... '3': | |
141 | { | |
142 | if (pos >= input->length() - 2) | |
143 | throw regex_error(_("three octal digits required in escape sequence"), pos); | |
144 | ||
145 | const char *d0 = strchr(oct, (*input)[pos]); | |
146 | const char *d1 = strchr(oct, (*input)[pos+1]); | |
147 | const char *d2 = strchr(oct, (*input)[pos+2]); | |
148 | ||
149 | if (!d0 || !d1 || !d2) | |
150 | throw regex_error(_("three octal digits required in escape sequence"), pos + (d1 ? 1 : 2)); | |
151 | ||
152 | c = (char)((d0-oct) << 6) + (char)((d1-oct) << 3) + (char)(d2-oct); | |
153 | pos += 2; // skip two chars more than usual | |
154 | break; | |
155 | } | |
156 | default: | |
157 | // do nothing; this removes the backslash from c | |
158 | ; | |
159 | } | |
160 | ||
161 | next_c = c; | |
162 | pos++; | |
163 | finished = ( pos >= input->length() ); | |
164 | } | |
165 | ||
166 | // ------------------------------------------------------------------------ | |
167 | ||
168 | regexp * | |
169 | regex_parser::parse (bool do_tag) | |
170 | { | |
171 | cur = cursor(&input, do_unescape); | |
172 | this->do_tag = do_tag; | |
173 | num_subexpressions = do_tag ? 1 : 0; // group 0 is guaranteed when using tag | |
174 | ||
175 | regexp *result = parse_expr (); | |
176 | ||
177 | // PR15065 glom appropriate tag_ops onto the expr (subexpression 0) | |
178 | if (do_tag) { | |
179 | result = new cat_op(new tag_op(TAG_START(0)), result); | |
180 | result = new cat_op(result, new tag_op(TAG_END(0))); | |
181 | } | |
182 | ||
183 | if (! cur.finished) | |
184 | { | |
185 | rchar c = cur.peek (); | |
186 | if (c == ')') | |
187 | parse_error (_("unbalanced ')'"), cur.pos); | |
188 | else | |
189 | // This should not be possible: | |
190 | parse_error ("BUG -- regex parse failed to finish for unknown reasons", cur.pos); | |
191 | } | |
192 | ||
193 | // PR15065 store num_tags in result | |
194 | result->num_tags = 2 * num_subexpressions; | |
195 | return result; | |
196 | } | |
197 | ||
198 | bool | |
199 | regex_parser::isspecial (rchar c) | |
200 | { | |
201 | return ( c == '.' || c == '[' || c == '{' || c == '(' || c == ')' | |
202 | || c == '\\' || c == '*' || c == '+' || c == '?' || c == '|' | |
203 | || c == '^' || c == '$' ); | |
204 | } | |
205 | ||
206 | void | |
207 | regex_parser::expect (rchar expected) | |
208 | { | |
209 | rchar c = 0; | |
210 | try { | |
211 | c = cur.next (); | |
212 | } catch (const regex_error &e) { | |
213 | parse_error (_F("expected %c, found end of regex", expected)); | |
214 | } | |
215 | ||
216 | if (c != expected) | |
217 | parse_error (_F("expected %c, found %c", expected, c)); | |
218 | } | |
219 | ||
220 | void | |
221 | regex_parser::parse_error (const string& msg, unsigned pos) | |
222 | { | |
223 | throw regex_error(msg, pos); | |
224 | } | |
225 | ||
226 | void | |
227 | regex_parser::parse_error (const string& msg) | |
228 | { | |
229 | parse_error (msg, cur.last_pos); | |
230 | } | |
231 | ||
232 | // ------------------------------------------------------------------------ | |
233 | ||
234 | regexp * | |
235 | regex_parser::parse_expr () | |
236 | { | |
237 | regexp *result = parse_term (); | |
238 | ||
239 | rchar c = cur.peek (); | |
240 | while (c && c == '|') | |
241 | { | |
242 | cur.next (); | |
243 | regexp *alt = parse_term (); | |
244 | result = make_alt (result, alt); | |
245 | c = cur.peek (); | |
246 | } | |
247 | ||
248 | return result; | |
249 | } | |
250 | ||
251 | regexp * | |
252 | regex_parser::parse_term () | |
253 | { | |
254 | regexp *result = parse_factor (); | |
255 | ||
256 | rchar c = cur.peek (); | |
257 | while (c && c != '|' && c != ')') | |
258 | { | |
259 | regexp *next = parse_factor (); | |
260 | result = new cat_op(result, next); | |
261 | c = cur.peek (); | |
262 | } | |
263 | ||
264 | return result; | |
265 | } | |
266 | ||
267 | regexp * | |
268 | regex_parser::parse_factor () | |
269 | { | |
270 | regexp *result; | |
271 | regexp *old_result = NULL; | |
272 | ||
273 | rchar c = cur.peek (); | |
274 | if (! c || c == '|' || c == ')') | |
275 | { | |
276 | result = new null_op; | |
277 | return result; | |
278 | } | |
279 | else if (c == '*' || c == '+' || c == '?' || c == '{') | |
280 | { | |
281 | parse_error(_F("unexpected '%c'", c)); | |
282 | } | |
283 | ||
284 | if (isspecial (c) && c != '\\') | |
285 | cur.next (); // c is guaranteed to be swallowed | |
286 | ||
287 | if (c == '.') | |
288 | { | |
289 | result = make_dot (); | |
290 | } | |
291 | else if (c == '[') | |
292 | { | |
293 | result = parse_char_range (); | |
294 | expect (']'); | |
295 | } | |
296 | else if (c == '(') | |
297 | { | |
298 | // Allow non-capturing group (?:...) | |
299 | bool do_tag_now = do_tag; | |
300 | rchar c2 = cur.peek(); | |
301 | if (c2 == '?') | |
302 | { | |
303 | cur.next(); | |
304 | rchar c3 = cur.peek(); | |
305 | if (c3 != ':') | |
306 | parse_error(_F("unexpected '(?%c'", c3)); | |
307 | do_tag_now = false; | |
308 | } | |
309 | ||
310 | // To number tags correctly, reserve a subexpression number here: | |
311 | unsigned curr_subexpression = 0; | |
312 | if (do_tag_now) | |
313 | curr_subexpression = num_subexpressions++; | |
314 | ||
315 | result = parse_expr (); | |
316 | ||
317 | // PR15065 glom appropriate tag_ops onto the expr | |
318 | if (do_tag_now) { | |
319 | result = new cat_op(new tag_op(TAG_START(curr_subexpression)), result); | |
320 | result = new cat_op(result, new tag_op(TAG_END(curr_subexpression))); | |
321 | } else { | |
322 | // XXX: workaround for certain error checking test cases which | |
323 | // would otherwise produce divergent behaviour without tag_ops | |
324 | // (e.g. "^*" vs "(^)*"). | |
325 | result = new cat_op(result, new null_op); | |
326 | } | |
327 | ||
328 | expect (')'); | |
329 | } | |
330 | else if (c == '^' || c == '$') | |
331 | { | |
332 | result = new anchor_op(c); | |
333 | } | |
334 | else // escaped or ordinary character -- not yet swallowed | |
335 | { | |
336 | string accumulate; | |
337 | rchar d = 0; | |
338 | ||
339 | while (c && ( ! isspecial (c) || c == '\\' )) | |
340 | { | |
341 | if (c == '\\') | |
342 | { | |
343 | cur.next (); | |
344 | c = cur.peek (); | |
345 | } | |
346 | ||
347 | cur.next (); | |
348 | d = cur.peek (); | |
349 | ||
350 | /* if we end in a closure, it should only govern the last character */ | |
351 | if (d == '*' || d == '+' || d == '?' || d == '{') | |
352 | { | |
353 | /* save the last character */ | |
354 | d = c; break; | |
355 | } | |
356 | ||
357 | accumulate.push_back (c); | |
358 | c = d; d = 0; | |
359 | } | |
360 | ||
361 | result = str_to_re (accumulate); | |
362 | ||
363 | /* separately deal with the last character before a closure */ | |
364 | if (d != 0) { | |
365 | old_result = result; /* will add it back outside closure at the end */ | |
366 | result = str_to_re (string(1,d)); | |
367 | } | |
368 | } | |
369 | ||
370 | /* parse closures or other postfix operators */ | |
371 | c = cur.peek (); | |
372 | while (c == '*' || c == '+' || c == '?' || c == '{') | |
373 | { | |
374 | cur.next (); | |
375 | ||
376 | /* closure-type operators applied to $^ are definitely not kosher */ | |
377 | if (result->type_of() == "anchor_op") | |
378 | { | |
379 | parse_error(_F("postfix closure '%c' applied to anchoring operator", c)); | |
380 | } | |
381 | ||
382 | if (c == '*') | |
383 | { | |
384 | result = make_alt (new close_op(result), new null_op); | |
385 | } | |
386 | else if (c == '+') | |
387 | { | |
388 | result = new close_op(result); | |
389 | } | |
390 | else if (c == '?') | |
391 | { | |
392 | result = make_alt (result, new null_op); | |
393 | } | |
394 | else if (c == '{') | |
395 | { | |
396 | int minsize = parse_number (); | |
397 | int maxsize = -1; | |
398 | ||
399 | c = cur.next (); | |
400 | if (c == ',') | |
401 | { | |
402 | c = cur.peek (); | |
403 | if (c == '}') | |
404 | { | |
405 | cur.next (); | |
406 | maxsize = -1; | |
407 | } | |
408 | else if (isdigit (c)) | |
409 | { | |
410 | maxsize = parse_number (); | |
411 | expect ('}'); | |
412 | } | |
413 | else | |
414 | parse_error(_("expected '}' or number"), cur.pos); | |
415 | } | |
416 | else if (c == '}') | |
417 | { | |
418 | maxsize = minsize; | |
419 | } | |
420 | else | |
421 | parse_error(_("expected ',' or '}'")); | |
422 | ||
423 | /* optimize {0,0}, {0,} and {1,} */ | |
424 | if (!do_tag && minsize == 0 && maxsize == 0) | |
425 | { | |
426 | // XXX: this optimization is only used when | |
427 | // subexpression-extraction is disabled | |
428 | delete result; | |
429 | result = new null_op; | |
430 | } | |
431 | else if (minsize == 0 && maxsize == -1) | |
432 | { | |
433 | result = make_alt (new close_op(result), new null_op); | |
434 | } | |
435 | else if (minsize == 1 && maxsize == -1) | |
436 | { | |
437 | result = new close_op(result); | |
438 | } | |
439 | else | |
440 | { | |
441 | result = new closev_op(result, minsize, maxsize); | |
442 | } | |
443 | } | |
444 | ||
445 | c = cur.peek (); | |
446 | } | |
447 | ||
448 | if (old_result) | |
449 | result = new cat_op(old_result, result); | |
450 | ||
451 | return result; | |
452 | } | |
453 | ||
454 | regexp * | |
455 | regex_parser::parse_char_range () | |
456 | { | |
457 | range *ran = NULL; | |
458 | ||
459 | // check for inversion | |
460 | bool inv = false; | |
461 | rchar c = cur.peek (); | |
462 | if (c == '^') | |
463 | { | |
464 | inv = true; | |
465 | cur.next (); | |
466 | } | |
467 | ||
468 | for (;;) | |
469 | { | |
470 | // break on string end whenever we encounter it | |
471 | if (cur.finished) parse_error(_("unclosed character class")); | |
472 | ||
473 | range *add = stapregex_getrange (cur); | |
474 | range *new_ran = ( ran != NULL ? range_union(ran, add) : add ); | |
475 | delete ran; if (new_ran != add) delete add; | |
476 | ran = new_ran; | |
477 | ||
478 | // break on ']' (except at the start of the class) | |
479 | c = cur.peek (); | |
480 | if (c == ']') | |
481 | break; | |
482 | } | |
483 | ||
484 | if (inv) | |
485 | { | |
486 | range *new_ran = range_invert(ran); | |
487 | delete ran; | |
488 | ran = new_ran; | |
489 | } | |
490 | ||
491 | if (ran == NULL) | |
492 | return new null_op; | |
493 | ||
494 | return new match_op(ran); | |
495 | } | |
496 | ||
497 | unsigned | |
498 | regex_parser::parse_number () | |
499 | { | |
500 | string digits; | |
501 | ||
502 | rchar c = cur.peek (); | |
503 | while (c && isdigit (c)) | |
504 | { | |
505 | cur.next (); | |
506 | digits.push_back (c); | |
507 | c = cur.peek (); | |
508 | } | |
509 | ||
510 | if (digits == "") parse_error(_("expected number"), cur.pos); | |
511 | ||
512 | char *endptr = NULL; | |
513 | int val = strtol (digits.c_str (), &endptr, 10); | |
514 | ||
515 | if (*endptr != '\0' || errno == ERANGE) // paranoid error checking | |
516 | parse_error(_F("could not parse number %s", digits.c_str()), cur.pos); | |
517 | #define MAX_DFA_REPETITIONS 12345 | |
518 | if (val >= MAX_DFA_REPETITIONS) // XXX: is there a more sensible max size? | |
519 | parse_error(_F("%s is too large", digits.c_str()), cur.pos); | |
520 | ||
521 | return atoi (digits.c_str ()); | |
522 | } | |
523 | ||
524 | // ------------------------------------------------------------------------ | |
525 | ||
526 | std::map<std::string, range *> named_char_classes; | |
527 | ||
528 | range * | |
529 | named_char_class (const string& name) | |
530 | { | |
531 | // static initialization of table | |
532 | if (named_char_classes.empty()) | |
533 | { | |
534 | // original source for these is http://www.regular-expressions.info/posixbrackets.html | |
535 | // also checked against (intended to match) the c stdlib isFOO() chr class functions | |
536 | named_char_classes["alpha"] = new range("A-Za-z"); | |
537 | named_char_classes["alnum"] = new range("A-Za-z0-9"); | |
538 | named_char_classes["blank"] = new range(" \t"); | |
539 | named_char_classes["cntrl"] = new range("\x01-\x1F\x7F"); // XXX: include \x00 in range? -- probably not! | |
540 | named_char_classes["d"] = named_char_classes["digit"] = new range("0-9"); | |
541 | named_char_classes["xdigit"] = new range("0-9a-fA-F"); | |
542 | named_char_classes["graph"] = new range("\x21-\x7E"); | |
543 | named_char_classes["l"] = named_char_classes["lower"] = new range("a-z"); | |
544 | named_char_classes["print"] = new range("\x20-\x7E"); | |
545 | named_char_classes["punct"] = new range("!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~-"); | |
546 | named_char_classes["s"] = named_char_classes["space"] = new range(" \t\r\n\v\f"); | |
547 | named_char_classes["u"] = named_char_classes["upper"] = new range("A-Z"); | |
548 | } | |
549 | ||
550 | if (named_char_classes.find(name) == named_char_classes.end()) | |
551 | { | |
552 | throw regex_error (_F("unknown character class '%s'", name.c_str())); // XXX: position unknown | |
553 | } | |
554 | ||
555 | return new range(*named_char_classes[name]); | |
556 | } | |
557 | ||
558 | range * | |
559 | stapregex_getrange (cursor& cur) | |
560 | { | |
561 | rchar c = cur.peek (); | |
562 | ||
563 | if (c == '\\') | |
564 | { | |
565 | // Grab escaped char regardless of what it is. | |
566 | cur.next (); c = cur.peek (); cur.next (); | |
567 | } | |
568 | else if (c == '[') | |
569 | { | |
570 | // Check for '[:' digraph. | |
571 | rchar old_c = c; cur.next (); c = cur.peek (); | |
572 | ||
573 | if (c == ':') | |
574 | { | |
575 | cur.next (); c = cur.peek (); // skip ':' | |
576 | string charclass; | |
577 | ||
578 | for (;;) | |
579 | { | |
580 | if (cur.finished) | |
581 | throw regex_error (_F("unclosed character class '[:%s'", charclass.c_str()), cur.pos); | |
582 | ||
583 | if (cur.has(2) && c == ':' && (*cur.input)[cur.pos] == ']') | |
584 | { | |
585 | cur.next (); cur.next (); // skip ':]' | |
586 | return named_char_class(charclass); | |
587 | } | |
588 | ||
589 | charclass.push_back(c); cur.next(); c = cur.peek(); | |
590 | } | |
591 | } | |
592 | else | |
593 | { | |
594 | // Backtrack; fall through to processing c. | |
595 | c = old_c; | |
596 | } | |
597 | } | |
598 | else | |
599 | cur.next (); | |
600 | ||
601 | rchar lb = c, ub; | |
602 | ||
603 | if (!cur.has(2) || cur.peek () != '-' || (*cur.input)[cur.pos] == ']') | |
604 | { | |
605 | ub = lb; | |
606 | } | |
607 | else | |
608 | { | |
609 | cur.next (); // skip '-' | |
610 | ub = cur.peek (); | |
611 | ||
612 | if (ub < lb) | |
613 | throw regex_error (_F("Inverted character range %c-%c", lb, ub), cur.pos); | |
614 | ||
615 | cur.next (); | |
616 | } | |
617 | ||
618 | return new range(lb, ub); | |
619 | } | |
620 | ||
621 | }; | |
622 | ||
623 | /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */ |