]> sourceware.org Git - systemtap.git/blob - stapregex-parse.cxx
d227faeb2693aa70766601144b2a0c45c27f538d
[systemtap.git] / stapregex-parse.cxx
1 // -*- C++ -*-
2 // Copyright (C) 2012-2018 Red Hat Inc.
3 //
4 // This file is part of systemtap, and is free software. You can
5 // redistribute it and/or modify it under the terms of the GNU General
6 // Public License (GPL); either version 2, or (at your option) any
7 // later version.
8 //
9 // ---
10 //
11 // This file incorporates code from the re2c project; please see
12 // the file README.stapregex for details.
13
14 #include "util.h"
15
16 #include "stapregex-tree.h"
17 #include "stapregex-parse.h"
18
19 #include <cstdlib>
20 #include <cstring>
21 #include <string>
22
23 using namespace std;
24
25 namespace stapregex {
26
27 void print_escaped(std::ostream& o, rchar c)
28 {
29 o << escaped_character((unsigned)c);
30 }
31
32 // ------------------------------------------------------------------------
33
34 cursor::cursor() : input(NULL), do_unescape(false), pos(~0),
35 last_pos(~0), finished(false), next_c(0), last_c(0) {}
36
37 cursor::cursor(const std::string *input, bool do_unescape)
38 : input(input), do_unescape(do_unescape), pos(0), last_pos(0), finished(false)
39 {
40 next_c = 0; last_c = 0;
41 finished = ( pos >= input->length() );
42 }
43
44 rchar
45 cursor::next ()
46 {
47 if (! next_c && finished)
48 throw regex_error(_("unexpected end of regex"), pos);
49 if (! next_c)
50 get_unescaped();
51
52 last_c = next_c;
53 // advance by zeroing next_c
54 next_c = 0;
55
56 return last_c;
57 }
58
59 rchar
60 cursor::peek ()
61 {
62 if (! next_c && ! finished)
63 get_unescaped();
64
65 // don't advance by zeroing next_c
66 last_c = next_c;
67
68 return next_c;
69 }
70
71 bool
72 cursor::has (unsigned n)
73 {
74 return ( pos <= input->length() - n );
75 }
76
77 /* Systemtap doesn't unescape string literals for us, presuming to
78 pass the backslashes intact to a C compiler; hence we need to do
79 our own unescaping here.
80
81 This functionality needs to be handled as part of cursor, in order
82 to correctly retain the original positions in the string when doing
83 error reporting. */
84 void
85 cursor::get_unescaped ()
86 {
87 static const char *hex = "0123456789abcdef";
88 static const char *oct = "01234567";
89
90 last_pos = pos;
91 char c = (*input)[pos];
92
93 if (c != '\\' || !do_unescape)
94 {
95 next_c = c;
96 pos++;
97 finished = ( pos >= input->length() );
98 return;
99 }
100
101 pos++;
102
103 /* Check for improper string end: */
104 if (pos >= input->length())
105 throw regex_error(_("unexpected end of regex"), pos);
106
107 /* The logic is based on re2c's Scanner::unescape() method;
108 the set of accepted escape codes should correspond to
109 lexer::scan() in parse.cxx. */
110 c = (*input)[pos];
111 switch (c)
112 {
113 case 'a': c = '\a'; break;
114 case 'b': c = '\b'; break;
115 case 't': c = '\t'; break;
116 case 'n': c = '\n'; break;
117 case 'v': c = '\v'; break;
118 case 'f': c = '\f'; break;
119 case 'r': c = '\r'; break;
120
121 case 'x':
122 {
123 if (pos >= input->length() - 2)
124 throw regex_error(_("two hex digits required in escape sequence"), pos);
125
126 const char *d1 = strchr(hex, tolower((*input)[pos+1]));
127 const char *d2 = strchr(hex, tolower((*input)[pos+2]));
128
129 if (!d1 || !d2)
130 throw regex_error(_("two hex digits required in escape sequence"), pos + (d1 ? 1 : 2));
131
132 c = (char)((d1-hex) << 4) + (char)(d2-hex);
133 pos += 2; // skip two chars more than usual
134 break;
135 }
136 case '4' ... '7':
137 // XXX: perhaps perform error recovery (slurp 3 octal chars)?
138 throw regex_error(_("octal escape sequence out of range"), pos);
139
140 case '0' ... '3':
141 {
142 if (pos >= input->length() - 2)
143 throw regex_error(_("three octal digits required in escape sequence"), pos);
144
145 const char *d0 = strchr(oct, (*input)[pos]);
146 const char *d1 = strchr(oct, (*input)[pos+1]);
147 const char *d2 = strchr(oct, (*input)[pos+2]);
148
149 if (!d0 || !d1 || !d2)
150 throw regex_error(_("three octal digits required in escape sequence"), pos + (d1 ? 1 : 2));
151
152 c = (char)((d0-oct) << 6) + (char)((d1-oct) << 3) + (char)(d2-oct);
153 pos += 2; // skip two chars more than usual
154 break;
155 }
156 default:
157 // do nothing; this removes the backslash from c
158 ;
159 }
160
161 next_c = c;
162 pos++;
163 finished = ( pos >= input->length() );
164 }
165
166 // ------------------------------------------------------------------------
167
168 regexp *
169 regex_parser::parse (bool do_tag)
170 {
171 cur = cursor(&input, do_unescape);
172 this->do_tag = do_tag;
173 num_subexpressions = do_tag ? 1 : 0; // group 0 is guaranteed when using tag
174
175 regexp *result = parse_expr ();
176
177 // PR15065 glom appropriate tag_ops onto the expr (subexpression 0)
178 if (do_tag) {
179 result = new cat_op(new tag_op(TAG_START(0)), result);
180 result = new cat_op(result, new tag_op(TAG_END(0)));
181 }
182
183 if (! cur.finished)
184 {
185 rchar c = cur.peek ();
186 if (c == ')')
187 parse_error (_("unbalanced ')'"), cur.pos);
188 else
189 // This should not be possible:
190 parse_error ("BUG -- regex parse failed to finish for unknown reasons", cur.pos);
191 }
192
193 // PR15065 store num_tags in result
194 result->num_tags = 2 * num_subexpressions;
195 return result;
196 }
197
198 bool
199 regex_parser::isspecial (rchar c)
200 {
201 return ( c == '.' || c == '[' || c == '{' || c == '(' || c == ')'
202 || c == '\\' || c == '*' || c == '+' || c == '?' || c == '|'
203 || c == '^' || c == '$' );
204 }
205
206 void
207 regex_parser::expect (rchar expected)
208 {
209 rchar c = 0;
210 try {
211 c = cur.next ();
212 } catch (const regex_error &e) {
213 parse_error (_F("expected %c, found end of regex", expected));
214 }
215
216 if (c != expected)
217 parse_error (_F("expected %c, found %c", expected, c));
218 }
219
220 void
221 regex_parser::parse_error (const string& msg, unsigned pos)
222 {
223 throw regex_error(msg, pos);
224 }
225
226 void
227 regex_parser::parse_error (const string& msg)
228 {
229 parse_error (msg, cur.last_pos);
230 }
231
232 // ------------------------------------------------------------------------
233
234 regexp *
235 regex_parser::parse_expr ()
236 {
237 regexp *result = parse_term ();
238
239 rchar c = cur.peek ();
240 while (c && c == '|')
241 {
242 cur.next ();
243 regexp *alt = parse_term ();
244 result = make_alt (result, alt);
245 c = cur.peek ();
246 }
247
248 return result;
249 }
250
251 regexp *
252 regex_parser::parse_term ()
253 {
254 regexp *result = parse_factor ();
255
256 rchar c = cur.peek ();
257 while (c && c != '|' && c != ')')
258 {
259 regexp *next = parse_factor ();
260 result = new cat_op(result, next);
261 c = cur.peek ();
262 }
263
264 return result;
265 }
266
267 regexp *
268 regex_parser::parse_factor ()
269 {
270 regexp *result;
271 regexp *old_result = NULL;
272
273 rchar c = cur.peek ();
274 if (! c || c == '|' || c == ')')
275 {
276 result = new null_op;
277 return result;
278 }
279 else if (c == '*' || c == '+' || c == '?' || c == '{')
280 {
281 parse_error(_F("unexpected '%c'", c));
282 }
283
284 if (isspecial (c) && c != '\\')
285 cur.next (); // c is guaranteed to be swallowed
286
287 if (c == '.')
288 {
289 result = make_dot ();
290 }
291 else if (c == '[')
292 {
293 result = parse_char_range ();
294 expect (']');
295 }
296 else if (c == '(')
297 {
298 // Allow non-capturing group (?:...)
299 bool do_tag_now = do_tag;
300 rchar c2 = cur.peek();
301 if (c2 == '?')
302 {
303 cur.next();
304 rchar c3 = cur.peek();
305 if (c3 != ':')
306 parse_error(_F("unexpected '(?%c'", c3));
307 do_tag_now = false;
308 }
309
310 // To number tags correctly, reserve a subexpression number here:
311 unsigned curr_subexpression = 0;
312 if (do_tag_now)
313 curr_subexpression = num_subexpressions++;
314
315 result = parse_expr ();
316
317 // PR15065 glom appropriate tag_ops onto the expr
318 if (do_tag_now) {
319 result = new cat_op(new tag_op(TAG_START(curr_subexpression)), result);
320 result = new cat_op(result, new tag_op(TAG_END(curr_subexpression)));
321 } else {
322 // XXX: workaround for certain error checking test cases which
323 // would otherwise produce divergent behaviour without tag_ops
324 // (e.g. "^*" vs "(^)*").
325 result = new cat_op(result, new null_op);
326 }
327
328 expect (')');
329 }
330 else if (c == '^' || c == '$')
331 {
332 result = new anchor_op(c);
333 }
334 else // escaped or ordinary character -- not yet swallowed
335 {
336 string accumulate;
337 rchar d = 0;
338
339 while (c && ( ! isspecial (c) || c == '\\' ))
340 {
341 if (c == '\\')
342 {
343 cur.next ();
344 c = cur.peek ();
345 }
346
347 cur.next ();
348 d = cur.peek ();
349
350 /* if we end in a closure, it should only govern the last character */
351 if (d == '*' || d == '+' || d == '?' || d == '{')
352 {
353 /* save the last character */
354 d = c; break;
355 }
356
357 accumulate.push_back (c);
358 c = d; d = 0;
359 }
360
361 result = str_to_re (accumulate);
362
363 /* separately deal with the last character before a closure */
364 if (d != 0) {
365 old_result = result; /* will add it back outside closure at the end */
366 result = str_to_re (string(1,d));
367 }
368 }
369
370 /* parse closures or other postfix operators */
371 c = cur.peek ();
372 while (c == '*' || c == '+' || c == '?' || c == '{')
373 {
374 cur.next ();
375
376 /* closure-type operators applied to $^ are definitely not kosher */
377 if (result->type_of() == "anchor_op")
378 {
379 parse_error(_F("postfix closure '%c' applied to anchoring operator", c));
380 }
381
382 if (c == '*')
383 {
384 result = make_alt (new close_op(result), new null_op);
385 }
386 else if (c == '+')
387 {
388 result = new close_op(result);
389 }
390 else if (c == '?')
391 {
392 result = make_alt (result, new null_op);
393 }
394 else if (c == '{')
395 {
396 int minsize = parse_number ();
397 int maxsize = -1;
398
399 c = cur.next ();
400 if (c == ',')
401 {
402 c = cur.peek ();
403 if (c == '}')
404 {
405 cur.next ();
406 maxsize = -1;
407 }
408 else if (isdigit (c))
409 {
410 maxsize = parse_number ();
411 expect ('}');
412 }
413 else
414 parse_error(_("expected '}' or number"), cur.pos);
415 }
416 else if (c == '}')
417 {
418 maxsize = minsize;
419 }
420 else
421 parse_error(_("expected ',' or '}'"));
422
423 /* optimize {0,0}, {0,} and {1,} */
424 if (!do_tag && minsize == 0 && maxsize == 0)
425 {
426 // XXX: this optimization is only used when
427 // subexpression-extraction is disabled
428 delete result;
429 result = new null_op;
430 }
431 else if (minsize == 0 && maxsize == -1)
432 {
433 result = make_alt (new close_op(result), new null_op);
434 }
435 else if (minsize == 1 && maxsize == -1)
436 {
437 result = new close_op(result);
438 }
439 else
440 {
441 result = new closev_op(result, minsize, maxsize);
442 }
443 }
444
445 c = cur.peek ();
446 }
447
448 if (old_result)
449 result = new cat_op(old_result, result);
450
451 return result;
452 }
453
454 regexp *
455 regex_parser::parse_char_range ()
456 {
457 range *ran = NULL;
458
459 // check for inversion
460 bool inv = false;
461 rchar c = cur.peek ();
462 if (c == '^')
463 {
464 inv = true;
465 cur.next ();
466 }
467
468 for (;;)
469 {
470 // break on string end whenever we encounter it
471 if (cur.finished) parse_error(_("unclosed character class"));
472
473 range *add = stapregex_getrange (cur);
474 range *new_ran = ( ran != NULL ? range_union(ran, add) : add );
475 delete ran; if (new_ran != add) delete add;
476 ran = new_ran;
477
478 // break on ']' (except at the start of the class)
479 c = cur.peek ();
480 if (c == ']')
481 break;
482 }
483
484 if (inv)
485 {
486 range *new_ran = range_invert(ran);
487 delete ran;
488 ran = new_ran;
489 }
490
491 if (ran == NULL)
492 return new null_op;
493
494 return new match_op(ran);
495 }
496
497 unsigned
498 regex_parser::parse_number ()
499 {
500 string digits;
501
502 rchar c = cur.peek ();
503 while (c && isdigit (c))
504 {
505 cur.next ();
506 digits.push_back (c);
507 c = cur.peek ();
508 }
509
510 if (digits == "") parse_error(_("expected number"), cur.pos);
511
512 char *endptr = NULL;
513 int val = strtol (digits.c_str (), &endptr, 10);
514
515 if (*endptr != '\0' || errno == ERANGE) // paranoid error checking
516 parse_error(_F("could not parse number %s", digits.c_str()), cur.pos);
517 #define MAX_DFA_REPETITIONS 12345
518 if (val >= MAX_DFA_REPETITIONS) // XXX: is there a more sensible max size?
519 parse_error(_F("%s is too large", digits.c_str()), cur.pos);
520
521 return atoi (digits.c_str ());
522 }
523
524 // ------------------------------------------------------------------------
525
526 std::map<std::string, range *> named_char_classes;
527
528 range *
529 named_char_class (const string& name)
530 {
531 // static initialization of table
532 if (named_char_classes.empty())
533 {
534 // original source for these is http://www.regular-expressions.info/posixbrackets.html
535 // also checked against (intended to match) the c stdlib isFOO() chr class functions
536 named_char_classes["alpha"] = new range("A-Za-z");
537 named_char_classes["alnum"] = new range("A-Za-z0-9");
538 named_char_classes["blank"] = new range(" \t");
539 named_char_classes["cntrl"] = new range("\x01-\x1F\x7F"); // XXX: include \x00 in range? -- probably not!
540 named_char_classes["d"] = named_char_classes["digit"] = new range("0-9");
541 named_char_classes["xdigit"] = new range("0-9a-fA-F");
542 named_char_classes["graph"] = new range("\x21-\x7E");
543 named_char_classes["l"] = named_char_classes["lower"] = new range("a-z");
544 named_char_classes["print"] = new range("\x20-\x7E");
545 named_char_classes["punct"] = new range("!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~-");
546 named_char_classes["s"] = named_char_classes["space"] = new range(" \t\r\n\v\f");
547 named_char_classes["u"] = named_char_classes["upper"] = new range("A-Z");
548 }
549
550 if (named_char_classes.find(name) == named_char_classes.end())
551 {
552 throw regex_error (_F("unknown character class '%s'", name.c_str())); // XXX: position unknown
553 }
554
555 return new range(*named_char_classes[name]);
556 }
557
558 range *
559 stapregex_getrange (cursor& cur)
560 {
561 rchar c = cur.peek ();
562
563 if (c == '\\')
564 {
565 // Grab escaped char regardless of what it is.
566 cur.next (); c = cur.peek (); cur.next ();
567 }
568 else if (c == '[')
569 {
570 // Check for '[:' digraph.
571 rchar old_c = c; cur.next (); c = cur.peek ();
572
573 if (c == ':')
574 {
575 cur.next (); c = cur.peek (); // skip ':'
576 string charclass;
577
578 for (;;)
579 {
580 if (cur.finished)
581 throw regex_error (_F("unclosed character class '[:%s'", charclass.c_str()), cur.pos);
582
583 if (cur.has(2) && c == ':' && (*cur.input)[cur.pos] == ']')
584 {
585 cur.next (); cur.next (); // skip ':]'
586 return named_char_class(charclass);
587 }
588
589 charclass.push_back(c); cur.next(); c = cur.peek();
590 }
591 }
592 else
593 {
594 // Backtrack; fall through to processing c.
595 c = old_c;
596 }
597 }
598 else
599 cur.next ();
600
601 rchar lb = c, ub;
602
603 if (!cur.has(2) || cur.peek () != '-' || (*cur.input)[cur.pos] == ']')
604 {
605 ub = lb;
606 }
607 else
608 {
609 cur.next (); // skip '-'
610 ub = cur.peek ();
611
612 if (ub < lb)
613 throw regex_error (_F("Inverted character range %c-%c", lb, ub), cur.pos);
614
615 cur.next ();
616 }
617
618 return new range(lb, ub);
619 }
620
621 };
622
623 /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */
This page took 0.064623 seconds and 6 git commands to generate.