]> sourceware.org Git - systemtap.git/blame - stapregex-parse.cxx
pre-release update-docs
[systemtap.git] / stapregex-parse.cxx
CommitLineData
cd4882d7 1// -*- C++ -*-
73fcca6f 2// Copyright (C) 2012-2018 Red Hat Inc.
cd4882d7
SM
3//
4// This file is part of systemtap, and is free software. You can
5// redistribute it and/or modify it under the terms of the GNU General
6// Public License (GPL); either version 2, or (at your option) any
7// later version.
8//
9// ---
10//
11// This file incorporates code from the re2c project; please see
12// the file README.stapregex for details.
13
14#include "util.h"
15
16#include "stapregex-tree.h"
17#include "stapregex-parse.h"
18
19#include <cstdlib>
20#include <cstring>
21#include <string>
22
23using namespace std;
24
25namespace stapregex {
26
6c9b8d26 27void print_escaped(std::ostream& o, rchar c)
07777235 28{
f44430b4 29 o << escaped_character((unsigned)c);
07777235
SM
30}
31
32// ------------------------------------------------------------------------
33
c92d3b42 34cursor::cursor() : input(NULL), do_unescape(false), pos(~0),
82c6d474 35 last_pos(~0), finished(false), next_c(0), last_c(0) {}
e5fcd199 36
28a27de2 37cursor::cursor(const std::string *input, bool do_unescape)
82c6d474 38 : input(input), do_unescape(do_unescape), pos(0), last_pos(0), finished(false)
cd4882d7
SM
39{
40 next_c = 0; last_c = 0;
41 finished = ( pos >= input->length() );
42}
43
6c9b8d26 44rchar
cd4882d7
SM
45cursor::next ()
46{
47 if (! next_c && finished)
e5fcd199 48 throw regex_error(_("unexpected end of regex"), pos);
cd4882d7 49 if (! next_c)
e5fcd199 50 get_unescaped();
cd4882d7
SM
51
52 last_c = next_c;
53 // advance by zeroing next_c
54 next_c = 0;
55
56 return last_c;
57}
58
6c9b8d26 59rchar
cd4882d7
SM
60cursor::peek ()
61{
62 if (! next_c && ! finished)
63 get_unescaped();
64
65 // don't advance by zeroing next_c
66 last_c = next_c;
67
68 return next_c;
69}
70
40fd16cf
SM
71bool
72cursor::has (unsigned n)
73{
e5fcd199 74 return ( pos <= input->length() - n );
40fd16cf
SM
75}
76
cd4882d7
SM
77/* Systemtap doesn't unescape string literals for us, presuming to
78 pass the backslashes intact to a C compiler; hence we need to do
79 our own unescaping here.
80
81 This functionality needs to be handled as part of cursor, in order
82 to correctly retain the original positions in the string when doing
83 error reporting. */
84void
85cursor::get_unescaped ()
86{
87 static const char *hex = "0123456789abcdef";
88 static const char *oct = "01234567";
89
90 last_pos = pos;
91 char c = (*input)[pos];
92
93 if (c != '\\' || !do_unescape)
94 {
95 next_c = c;
96 pos++;
248f3856 97 finished = ( pos >= input->length() );
cd4882d7
SM
98 return;
99 }
100
40fd16cf
SM
101 pos++;
102
103 /* Check for improper string end: */
104 if (pos >= input->length())
105 throw regex_error(_("unexpected end of regex"), pos);
106
cd4882d7
SM
107 /* The logic is based on re2c's Scanner::unescape() method;
108 the set of accepted escape codes should correspond to
109 lexer::scan() in parse.cxx. */
40fd16cf 110 c = (*input)[pos];
cd4882d7
SM
111 switch (c)
112 {
113 case 'a': c = '\a'; break;
114 case 'b': c = '\b'; break;
115 case 't': c = '\t'; break;
116 case 'n': c = '\n'; break;
117 case 'v': c = '\v'; break;
118 case 'f': c = '\f'; break;
119 case 'r': c = '\r'; break;
120
121 case 'x':
e5fcd199 122 {
cd4882d7
SM
123 if (pos >= input->length() - 2)
124 throw regex_error(_("two hex digits required in escape sequence"), pos);
125
126 const char *d1 = strchr(hex, tolower((*input)[pos+1]));
127 const char *d2 = strchr(hex, tolower((*input)[pos+2]));
128
129 if (!d1 || !d2)
130 throw regex_error(_("two hex digits required in escape sequence"), pos + (d1 ? 1 : 2));
131
e5fcd199 132 c = (char)((d1-hex) << 4) + (char)(d2-hex);
cd4882d7
SM
133 pos += 2; // skip two chars more than usual
134 break;
e5fcd199 135 }
cd4882d7
SM
136 case '4' ... '7':
137 // XXX: perhaps perform error recovery (slurp 3 octal chars)?
138 throw regex_error(_("octal escape sequence out of range"), pos);
139
140 case '0' ... '3':
e5fcd199 141 {
cd4882d7
SM
142 if (pos >= input->length() - 2)
143 throw regex_error(_("three octal digits required in escape sequence"), pos);
144
145 const char *d0 = strchr(oct, (*input)[pos]);
146 const char *d1 = strchr(oct, (*input)[pos+1]);
147 const char *d2 = strchr(oct, (*input)[pos+2]);
148
149 if (!d0 || !d1 || !d2)
150 throw regex_error(_("three octal digits required in escape sequence"), pos + (d1 ? 1 : 2));
151
152 c = (char)((d0-oct) << 6) + (char)((d1-oct) << 3) + (char)(d2-oct);
153 pos += 2; // skip two chars more than usual
154 break;
e5fcd199 155 }
cd4882d7
SM
156 default:
157 // do nothing; this removes the backslash from c
e5fcd199 158 ;
cd4882d7
SM
159 }
160
161 next_c = c;
162 pos++;
163 finished = ( pos >= input->length() );
164}
165
166// ------------------------------------------------------------------------
167
168regexp *
169regex_parser::parse (bool do_tag)
170{
112e9e2b 171 cur = cursor(&input, do_unescape);
3f822291
SM
172 this->do_tag = do_tag;
173 num_subexpressions = do_tag ? 1 : 0; // group 0 is guaranteed when using tag
cd4882d7 174
e5fcd199 175 regexp *result = parse_expr ();
cd4882d7
SM
176
177 // PR15065 glom appropriate tag_ops onto the expr (subexpression 0)
178 if (do_tag) {
3f822291
SM
179 result = new cat_op(new tag_op(TAG_START(0)), result);
180 result = new cat_op(result, new tag_op(TAG_END(0)));
cd4882d7
SM
181 }
182
183 if (! cur.finished)
184 {
6c9b8d26 185 rchar c = cur.peek ();
cd4882d7 186 if (c == ')')
e5fcd199 187 parse_error (_("unbalanced ')'"), cur.pos);
cd4882d7
SM
188 else
189 // This should not be possible:
e5fcd199 190 parse_error ("BUG -- regex parse failed to finish for unknown reasons", cur.pos);
cd4882d7
SM
191 }
192
193 // PR15065 store num_tags in result
3f822291 194 result->num_tags = 2 * num_subexpressions;
cd4882d7
SM
195 return result;
196}
197
198bool
6c9b8d26 199regex_parser::isspecial (rchar c)
cd4882d7
SM
200{
201 return ( c == '.' || c == '[' || c == '{' || c == '(' || c == ')'
202 || c == '\\' || c == '*' || c == '+' || c == '?' || c == '|'
203 || c == '^' || c == '$' );
204}
205
206void
6c9b8d26 207regex_parser::expect (rchar expected)
cd4882d7 208{
6c9b8d26 209 rchar c = 0;
cd4882d7
SM
210 try {
211 c = cur.next ();
212 } catch (const regex_error &e) {
213 parse_error (_F("expected %c, found end of regex", expected));
214 }
215
216 if (c != expected)
217 parse_error (_F("expected %c, found %c", expected, c));
218}
219
220void
221regex_parser::parse_error (const string& msg, unsigned pos)
222{
223 throw regex_error(msg, pos);
224}
225
226void
227regex_parser::parse_error (const string& msg)
228{
229 parse_error (msg, cur.last_pos);
230}
231
232// ------------------------------------------------------------------------
233
234regexp *
235regex_parser::parse_expr ()
236{
237 regexp *result = parse_term ();
238
6c9b8d26 239 rchar c = cur.peek ();
cd4882d7
SM
240 while (c && c == '|')
241 {
40fd16cf 242 cur.next ();
cd4882d7
SM
243 regexp *alt = parse_term ();
244 result = make_alt (result, alt);
245 c = cur.peek ();
246 }
247
248 return result;
249}
250
251regexp *
252regex_parser::parse_term ()
253{
254 regexp *result = parse_factor ();
255
6c9b8d26 256 rchar c = cur.peek ();
cd4882d7
SM
257 while (c && c != '|' && c != ')')
258 {
259 regexp *next = parse_factor ();
260 result = new cat_op(result, next);
261 c = cur.peek ();
262 }
263
264 return result;
265}
266
267regexp *
268regex_parser::parse_factor ()
269{
270 regexp *result;
271 regexp *old_result = NULL;
272
6c9b8d26 273 rchar c = cur.peek ();
cd4882d7
SM
274 if (! c || c == '|' || c == ')')
275 {
276 result = new null_op;
277 return result;
278 }
279 else if (c == '*' || c == '+' || c == '?' || c == '{')
280 {
281 parse_error(_F("unexpected '%c'", c));
282 }
283
284 if (isspecial (c) && c != '\\')
285 cur.next (); // c is guaranteed to be swallowed
286
287 if (c == '.')
288 {
289 result = make_dot ();
290 }
291 else if (c == '[')
292 {
293 result = parse_char_range ();
294 expect (']');
295 }
296 else if (c == '(')
297 {
9e265076
SM
298 // Allow non-capturing group (?:...)
299 bool do_tag_now = do_tag;
300 rchar c2 = cur.peek();
301 if (c2 == '?')
302 {
303 cur.next();
304 rchar c3 = cur.peek();
305 if (c3 != ':')
306 parse_error(_F("unexpected '(?%c'", c3));
307 do_tag_now = false;
308 }
309
3f822291
SM
310 // To number tags correctly, reserve a subexpression number here:
311 unsigned curr_subexpression = 0;
9e265076 312 if (do_tag_now)
3f822291
SM
313 curr_subexpression = num_subexpressions++;
314
cd4882d7
SM
315 result = parse_expr ();
316
317 // PR15065 glom appropriate tag_ops onto the expr
9e265076 318 if (do_tag_now) {
3f822291
SM
319 result = new cat_op(new tag_op(TAG_START(curr_subexpression)), result);
320 result = new cat_op(result, new tag_op(TAG_END(curr_subexpression)));
d6833508
SM
321 } else {
322 // XXX: workaround for certain error checking test cases which
3f822291 323 // would otherwise produce divergent behaviour without tag_ops
d6833508
SM
324 // (e.g. "^*" vs "(^)*").
325 result = new cat_op(result, new null_op);
cd4882d7
SM
326 }
327
328 expect (')');
329 }
330 else if (c == '^' || c == '$')
331 {
332 result = new anchor_op(c);
333 }
334 else // escaped or ordinary character -- not yet swallowed
335 {
336 string accumulate;
6c9b8d26 337 rchar d = 0;
cd4882d7
SM
338
339 while (c && ( ! isspecial (c) || c == '\\' ))
340 {
341 if (c == '\\')
342 {
343 cur.next ();
344 c = cur.peek ();
345 }
346
347 cur.next ();
348 d = cur.peek ();
349
350 /* if we end in a closure, it should only govern the last character */
351 if (d == '*' || d == '+' || d == '?' || d == '{')
352 {
353 /* save the last character */
354 d = c; break;
355 }
356
357 accumulate.push_back (c);
358 c = d; d = 0;
359 }
360
361 result = str_to_re (accumulate);
362
363 /* separately deal with the last character before a closure */
364 if (d != 0) {
365 old_result = result; /* will add it back outside closure at the end */
e5fcd199 366 result = str_to_re (string(1,d));
cd4882d7
SM
367 }
368 }
369
370 /* parse closures or other postfix operators */
371 c = cur.peek ();
372 while (c == '*' || c == '+' || c == '?' || c == '{')
373 {
374 cur.next ();
375
376 /* closure-type operators applied to $^ are definitely not kosher */
377 if (result->type_of() == "anchor_op")
378 {
379 parse_error(_F("postfix closure '%c' applied to anchoring operator", c));
380 }
381
382 if (c == '*')
383 {
384 result = make_alt (new close_op(result), new null_op);
385 }
386 else if (c == '+')
387 {
388 result = new close_op(result);
389 }
390 else if (c == '?')
391 {
392 result = make_alt (result, new null_op);
393 }
394 else if (c == '{')
395 {
396 int minsize = parse_number ();
397 int maxsize = -1;
398
399 c = cur.next ();
400 if (c == ',')
401 {
402 c = cur.peek ();
403 if (c == '}')
404 {
405 cur.next ();
406 maxsize = -1;
407 }
408 else if (isdigit (c))
409 {
410 maxsize = parse_number ();
411 expect ('}');
412 }
413 else
414 parse_error(_("expected '}' or number"), cur.pos);
415 }
416 else if (c == '}')
417 {
418 maxsize = minsize;
419 }
420 else
421 parse_error(_("expected ',' or '}'"));
422
423 /* optimize {0,0}, {0,} and {1,} */
e5fcd199 424 if (!do_tag && minsize == 0 && maxsize == 0)
cd4882d7 425 {
e5fcd199
SM
426 // XXX: this optimization is only used when
427 // subexpression-extraction is disabled
cd4882d7
SM
428 delete result;
429 result = new null_op;
430 }
431 else if (minsize == 0 && maxsize == -1)
432 {
433 result = make_alt (new close_op(result), new null_op);
434 }
435 else if (minsize == 1 && maxsize == -1)
436 {
437 result = new close_op(result);
438 }
439 else
440 {
441 result = new closev_op(result, minsize, maxsize);
442 }
443 }
444
445 c = cur.peek ();
446 }
447
448 if (old_result)
449 result = new cat_op(old_result, result);
450
451 return result;
452}
453
cd4882d7
SM
454regexp *
455regex_parser::parse_char_range ()
456{
40fd16cf 457 range *ran = NULL;
cd4882d7 458
40fd16cf 459 // check for inversion
cd4882d7 460 bool inv = false;
6c9b8d26 461 rchar c = cur.peek ();
cd4882d7
SM
462 if (c == '^')
463 {
464 inv = true;
465 cur.next ();
cd4882d7
SM
466 }
467
cd4882d7
SM
468 for (;;)
469 {
40fd16cf 470 // break on string end whenever we encounter it
3f822291 471 if (cur.finished) parse_error(_("unclosed character class"));
cd4882d7 472
e5fcd199 473 range *add = stapregex_getrange (cur);
28a27de2
SM
474 range *new_ran = ( ran != NULL ? range_union(ran, add) : add );
475 delete ran; if (new_ran != add) delete add;
476 ran = new_ran;
cd4882d7 477
40fd16cf 478 // break on ']' (except at the start of the class)
248f3856 479 c = cur.peek ();
40fd16cf 480 if (c == ']')
248f3856 481 break;
cd4882d7
SM
482 }
483
e5fcd199
SM
484 if (inv)
485 {
28a27de2
SM
486 range *new_ran = range_invert(ran);
487 delete ran;
488 ran = new_ran;
e5fcd199
SM
489 }
490
40fd16cf
SM
491 if (ran == NULL)
492 return new null_op;
493
494 return new match_op(ran);
cd4882d7
SM
495}
496
497unsigned
498regex_parser::parse_number ()
499{
500 string digits;
501
6c9b8d26 502 rchar c = cur.peek ();
cd4882d7
SM
503 while (c && isdigit (c))
504 {
505 cur.next ();
506 digits.push_back (c);
507 c = cur.peek ();
508 }
509
510 if (digits == "") parse_error(_("expected number"), cur.pos);
511
512 char *endptr = NULL;
513 int val = strtol (digits.c_str (), &endptr, 10);
514
515 if (*endptr != '\0' || errno == ERANGE) // paranoid error checking
516 parse_error(_F("could not parse number %s", digits.c_str()), cur.pos);
517#define MAX_DFA_REPETITIONS 12345
518 if (val >= MAX_DFA_REPETITIONS) // XXX: is there a more sensible max size?
519 parse_error(_F("%s is too large", digits.c_str()), cur.pos);
520
521 return atoi (digits.c_str ());
522}
523
cd4882d7
SM
524// ------------------------------------------------------------------------
525
e5fcd199 526std::map<std::string, range *> named_char_classes;
40fd16cf
SM
527
528range *
529named_char_class (const string& name)
cd4882d7 530{
40fd16cf
SM
531 // static initialization of table
532 if (named_char_classes.empty())
533 {
534 // original source for these is http://www.regular-expressions.info/posixbrackets.html
535 // also checked against (intended to match) the c stdlib isFOO() chr class functions
536 named_char_classes["alpha"] = new range("A-Za-z");
537 named_char_classes["alnum"] = new range("A-Za-z0-9");
538 named_char_classes["blank"] = new range(" \t");
539 named_char_classes["cntrl"] = new range("\x01-\x1F\x7F"); // XXX: include \x00 in range? -- probably not!
540 named_char_classes["d"] = named_char_classes["digit"] = new range("0-9");
541 named_char_classes["xdigit"] = new range("0-9a-fA-F");
542 named_char_classes["graph"] = new range("\x21-\x7E");
543 named_char_classes["l"] = named_char_classes["lower"] = new range("a-z");
544 named_char_classes["print"] = new range("\x20-\x7E");
d6833508 545 named_char_classes["punct"] = new range("!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~-");
40fd16cf
SM
546 named_char_classes["s"] = named_char_classes["space"] = new range(" \t\r\n\v\f");
547 named_char_classes["u"] = named_char_classes["upper"] = new range("A-Z");
548 }
cd4882d7 549
40fd16cf 550 if (named_char_classes.find(name) == named_char_classes.end())
cd4882d7 551 {
e5fcd199 552 throw regex_error (_F("unknown character class '%s'", name.c_str())); // XXX: position unknown
40fd16cf 553 }
cd4882d7 554
40fd16cf
SM
555 return new range(*named_char_classes[name]);
556}
557
558range *
559stapregex_getrange (cursor& cur)
560{
6c9b8d26 561 rchar c = cur.peek ();
40fd16cf
SM
562
563 if (c == '\\')
564 {
565 // Grab escaped char regardless of what it is.
566 cur.next (); c = cur.peek (); cur.next ();
567 }
568 else if (c == '[')
569 {
570 // Check for '[:' digraph.
6c9b8d26 571 rchar old_c = c; cur.next (); c = cur.peek ();
40fd16cf
SM
572
573 if (c == ':')
cd4882d7 574 {
248f3856 575 cur.next (); c = cur.peek (); // skip ':'
40fd16cf 576 string charclass;
cd4882d7 577
40fd16cf
SM
578 for (;;)
579 {
580 if (cur.finished)
581 throw regex_error (_F("unclosed character class '[:%s'", charclass.c_str()), cur.pos);
cd4882d7 582
e5fcd199 583 if (cur.has(2) && c == ':' && (*cur.input)[cur.pos] == ']')
40fd16cf
SM
584 {
585 cur.next (); cur.next (); // skip ':]'
586 return named_char_class(charclass);
587 }
588
589 charclass.push_back(c); cur.next(); c = cur.peek();
590 }
cd4882d7 591 }
40fd16cf
SM
592 else
593 {
594 // Backtrack; fall through to processing c.
595 c = old_c;
596 }
597 }
598 else
599 cur.next ();
600
6c9b8d26 601 rchar lb = c, ub;
40fd16cf 602
d6833508 603 if (!cur.has(2) || cur.peek () != '-' || (*cur.input)[cur.pos] == ']')
40fd16cf
SM
604 {
605 ub = lb;
cd4882d7 606 }
40fd16cf
SM
607 else
608 {
609 cur.next (); // skip '-'
610 ub = cur.peek ();
611
612 if (ub < lb)
613 throw regex_error (_F("Inverted character range %c-%c", lb, ub), cur.pos);
614
615 cur.next ();
616 }
617
618 return new range(lb, ub);
cd4882d7
SM
619}
620
40fd16cf
SM
621};
622
cd4882d7 623/* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */
This page took 0.184014 seconds and 6 git commands to generate.