]> sourceware.org Git - systemtap.git/blame - stapregex-parse.cxx
Ensure use of the function argument values from function entry in lwtools
[systemtap.git] / stapregex-parse.cxx
CommitLineData
cd4882d7
SM
1// -*- C++ -*-
2// Copyright (C) 2012-2013 Red Hat Inc.
3//
4// This file is part of systemtap, and is free software. You can
5// redistribute it and/or modify it under the terms of the GNU General
6// Public License (GPL); either version 2, or (at your option) any
7// later version.
8//
9// ---
10//
11// This file incorporates code from the re2c project; please see
12// the file README.stapregex for details.
13
14#include "util.h"
15
16#include "stapregex-tree.h"
17#include "stapregex-parse.h"
18
19#include <cstdlib>
20#include <cstring>
21#include <string>
22
23using namespace std;
24
25namespace stapregex {
26
07777235
SM
27// TODOXXX compress / eliminate / move to util
28
29// void prtChOrHex(std::ostream& o, unsigned c)
30// {
31// if (eFlag)
32// {
33// prtHex(o, c);
34// }
35// else if ((c < 256u) && (isprint(c) || isspace(c)))
36// {
37// prtCh(o, c);
38// }
39// else
40// {
41// prtHex(o, c);
42// }
43// }
44
45// void prtHex(std::ostream& o, unsigned c)
46// {
47// int oc = (int)(c);
48
49// if (re2c::uFlag)
50// {
51// o << "0x"
52// << hexCh(oc >> 28)
53// << hexCh(oc >> 24)
54// << hexCh(oc >> 20)
55// << hexCh(oc >> 16)
56// << hexCh(oc >> 12)
57// << hexCh(oc >> 8)
58// << hexCh(oc >> 4)
59// << hexCh(oc);
60// }
61// else if (re2c::wFlag)
62// {
63// o << "0x"
64// << hexCh(oc >> 12)
65// << hexCh(oc >> 8)
66// << hexCh(oc >> 4)
67// << hexCh(oc);
68// }
69// else
70// {
71// o << "0x"
72// << hexCh(oc >> 4)
73// << hexCh(oc);
74// }
75// }
76
// Return the ASCII octal digit ('0'..'7') for the low three bits of c.
char octCh(unsigned c)
{
  const unsigned low_bits = c & 7u;   // same value as c % 8 for unsigned c
  return static_cast<char>('0' + low_bits);
}
81
// Write character code c to o in C-style escaped form.
//
// Quotes, backslash and the usual control characters get their
// mnemonic escapes (\n, \t, ...).  Any other printable code below 256
// is written as-is.  Everything else is written as a backslash followed
// by three octal digits taken from the low nine bits of c.
//
// All tests are done on the unsigned value, so a code with the high
// bit set is never handed to isprint() as a negative int.
void prtCh(std::ostream& o, unsigned c)
{
  switch (c)
    {
    case '\'':
      o << "\\'";
      break;

    case '"':
      o << "\\\"";
      break;

    case '\n':
      o << "\\n";
      break;

    case '\t':
      o << "\\t";
      break;

    case '\v':
      o << "\\v";
      break;

    case '\b':
      o << "\\b";
      break;

    case '\r':
      o << "\\r";
      break;

    case '\f':
      o << "\\f";
      break;

    case '\a':
      o << "\\a";
      break;

    case '\\':
      o << "\\\\";
      break;

    default:
      if ((c < 256u) && isprint(static_cast<int>(c)))
        {
          o << static_cast<char>(c);
        }
      else
        {
          // Three octal digits, most significant first.
          o << '\\'
            << static_cast<char>('0' + (c / 64) % 8)
            << static_cast<char>('0' + (c / 8) % 8)
            << static_cast<char>('0' + c % 8);
        }
    }
}

// Write the single character c to o in escaped form (see prtCh).
//
// The conversion goes through unsigned char so that a byte >= 0x80 on a
// signed-char platform is printed as its real value (e.g. \377) rather
// than as a sign-extended code.
void print_escaped(std::ostream& o, char c)
{
  prtCh(o, static_cast<unsigned char>(c));
}
145
146// ------------------------------------------------------------------------
147
c92d3b42 148cursor::cursor() : input(NULL), do_unescape(false), pos(~0),
82c6d474 149 last_pos(~0), finished(false), next_c(0), last_c(0) {}
e5fcd199 150
28a27de2 151cursor::cursor(const std::string *input, bool do_unescape)
82c6d474 152 : input(input), do_unescape(do_unescape), pos(0), last_pos(0), finished(false)
cd4882d7
SM
153{
154 next_c = 0; last_c = 0;
155 finished = ( pos >= input->length() );
156}
157
158char
159cursor::next ()
160{
161 if (! next_c && finished)
e5fcd199 162 throw regex_error(_("unexpected end of regex"), pos);
cd4882d7 163 if (! next_c)
e5fcd199 164 get_unescaped();
cd4882d7
SM
165
166 last_c = next_c;
167 // advance by zeroing next_c
168 next_c = 0;
169
170 return last_c;
171}
172
173char
174cursor::peek ()
175{
176 if (! next_c && ! finished)
177 get_unescaped();
178
179 // don't advance by zeroing next_c
180 last_c = next_c;
181
182 return next_c;
183}
184
40fd16cf
SM
185bool
186cursor::has (unsigned n)
187{
e5fcd199 188 return ( pos <= input->length() - n );
40fd16cf
SM
189}
190
cd4882d7
SM
191/* Systemtap doesn't unescape string literals for us, presuming to
192 pass the backslashes intact to a C compiler; hence we need to do
193 our own unescaping here.
194
195 This functionality needs to be handled as part of cursor, in order
196 to correctly retain the original positions in the string when doing
197 error reporting. */
198void
199cursor::get_unescaped ()
200{
201 static const char *hex = "0123456789abcdef";
202 static const char *oct = "01234567";
203
204 last_pos = pos;
205 char c = (*input)[pos];
206
207 if (c != '\\' || !do_unescape)
208 {
209 next_c = c;
210 pos++;
248f3856 211 finished = ( pos >= input->length() );
cd4882d7
SM
212 return;
213 }
214
40fd16cf
SM
215 pos++;
216
217 /* Check for improper string end: */
218 if (pos >= input->length())
219 throw regex_error(_("unexpected end of regex"), pos);
220
cd4882d7
SM
221 /* The logic is based on re2c's Scanner::unescape() method;
222 the set of accepted escape codes should correspond to
223 lexer::scan() in parse.cxx. */
40fd16cf 224 c = (*input)[pos];
cd4882d7
SM
225 switch (c)
226 {
227 case 'a': c = '\a'; break;
228 case 'b': c = '\b'; break;
229 case 't': c = '\t'; break;
230 case 'n': c = '\n'; break;
231 case 'v': c = '\v'; break;
232 case 'f': c = '\f'; break;
233 case 'r': c = '\r'; break;
234
235 case 'x':
e5fcd199 236 {
cd4882d7
SM
237 if (pos >= input->length() - 2)
238 throw regex_error(_("two hex digits required in escape sequence"), pos);
239
240 const char *d1 = strchr(hex, tolower((*input)[pos+1]));
241 const char *d2 = strchr(hex, tolower((*input)[pos+2]));
242
243 if (!d1 || !d2)
244 throw regex_error(_("two hex digits required in escape sequence"), pos + (d1 ? 1 : 2));
245
e5fcd199 246 c = (char)((d1-hex) << 4) + (char)(d2-hex);
cd4882d7
SM
247 pos += 2; // skip two chars more than usual
248 break;
e5fcd199 249 }
cd4882d7
SM
250 case '4' ... '7':
251 // XXX: perhaps perform error recovery (slurp 3 octal chars)?
252 throw regex_error(_("octal escape sequence out of range"), pos);
253
254 case '0' ... '3':
e5fcd199 255 {
cd4882d7
SM
256 if (pos >= input->length() - 2)
257 throw regex_error(_("three octal digits required in escape sequence"), pos);
258
259 const char *d0 = strchr(oct, (*input)[pos]);
260 const char *d1 = strchr(oct, (*input)[pos+1]);
261 const char *d2 = strchr(oct, (*input)[pos+2]);
262
263 if (!d0 || !d1 || !d2)
264 throw regex_error(_("three octal digits required in escape sequence"), pos + (d1 ? 1 : 2));
265
266 c = (char)((d0-oct) << 6) + (char)((d1-oct) << 3) + (char)(d2-oct);
267 pos += 2; // skip two chars more than usual
268 break;
e5fcd199 269 }
cd4882d7
SM
270 default:
271 // do nothing; this removes the backslash from c
e5fcd199 272 ;
cd4882d7
SM
273 }
274
275 next_c = c;
276 pos++;
277 finished = ( pos >= input->length() );
278}
279
280// ------------------------------------------------------------------------
281
282regexp *
283regex_parser::parse (bool do_tag)
284{
112e9e2b 285 cur = cursor(&input, do_unescape);
e5fcd199 286 num_tags = 0; this->do_tag = do_tag;
cd4882d7 287
e5fcd199 288 regexp *result = parse_expr ();
cd4882d7
SM
289
290 // PR15065 glom appropriate tag_ops onto the expr (subexpression 0)
291 if (do_tag) {
292 result = new cat_op(new tag_op(num_tags++), result);
293 result = new cat_op(result, new tag_op(num_tags++));
294 }
295
296 if (! cur.finished)
297 {
298 char c = cur.peek ();
299 if (c == ')')
e5fcd199 300 parse_error (_("unbalanced ')'"), cur.pos);
cd4882d7
SM
301 else
302 // This should not be possible:
e5fcd199 303 parse_error ("BUG -- regex parse failed to finish for unknown reasons", cur.pos);
cd4882d7
SM
304 }
305
306 // PR15065 store num_tags in result
e5fcd199 307 result->num_tags = num_tags;
cd4882d7
SM
308 return result;
309}
310
311bool
312regex_parser::isspecial (char c)
313{
314 return ( c == '.' || c == '[' || c == '{' || c == '(' || c == ')'
315 || c == '\\' || c == '*' || c == '+' || c == '?' || c == '|'
316 || c == '^' || c == '$' );
317}
318
319void
320regex_parser::expect (char expected)
321{
322 char c = 0;
323 try {
324 c = cur.next ();
325 } catch (const regex_error &e) {
326 parse_error (_F("expected %c, found end of regex", expected));
327 }
328
329 if (c != expected)
330 parse_error (_F("expected %c, found %c", expected, c));
331}
332
333void
334regex_parser::parse_error (const string& msg, unsigned pos)
335{
336 throw regex_error(msg, pos);
337}
338
339void
340regex_parser::parse_error (const string& msg)
341{
342 parse_error (msg, cur.last_pos);
343}
344
345// ------------------------------------------------------------------------
346
347regexp *
348regex_parser::parse_expr ()
349{
350 regexp *result = parse_term ();
351
352 char c = cur.peek ();
353 while (c && c == '|')
354 {
40fd16cf 355 cur.next ();
cd4882d7
SM
356 regexp *alt = parse_term ();
357 result = make_alt (result, alt);
358 c = cur.peek ();
359 }
360
361 return result;
362}
363
364regexp *
365regex_parser::parse_term ()
366{
367 regexp *result = parse_factor ();
368
369 char c = cur.peek ();
370 while (c && c != '|' && c != ')')
371 {
372 regexp *next = parse_factor ();
373 result = new cat_op(result, next);
374 c = cur.peek ();
375 }
376
377 return result;
378}
379
380regexp *
381regex_parser::parse_factor ()
382{
383 regexp *result;
384 regexp *old_result = NULL;
385
386 char c = cur.peek ();
387 if (! c || c == '|' || c == ')')
388 {
389 result = new null_op;
390 return result;
391 }
392 else if (c == '*' || c == '+' || c == '?' || c == '{')
393 {
394 parse_error(_F("unexpected '%c'", c));
395 }
396
397 if (isspecial (c) && c != '\\')
398 cur.next (); // c is guaranteed to be swallowed
399
400 if (c == '.')
401 {
402 result = make_dot ();
403 }
404 else if (c == '[')
405 {
406 result = parse_char_range ();
407 expect (']');
408 }
409 else if (c == '(')
410 {
411 result = parse_expr ();
412
413 // PR15065 glom appropriate tag_ops onto the expr
414 if (do_tag) {
415 result = new cat_op(new tag_op(num_tags++), result);
416 result = new cat_op(result, new tag_op(num_tags++));
d6833508
SM
417 } else {
418 // XXX: workaround for certain error checking test cases which
419 // would otherwise produce divergent behaviour
420 // (e.g. "^*" vs "(^)*").
421 result = new cat_op(result, new null_op);
cd4882d7
SM
422 }
423
424 expect (')');
425 }
426 else if (c == '^' || c == '$')
427 {
428 result = new anchor_op(c);
429 }
430 else // escaped or ordinary character -- not yet swallowed
431 {
432 string accumulate;
433 char d = 0;
434
435 while (c && ( ! isspecial (c) || c == '\\' ))
436 {
437 if (c == '\\')
438 {
439 cur.next ();
440 c = cur.peek ();
441 }
442
443 cur.next ();
444 d = cur.peek ();
445
446 /* if we end in a closure, it should only govern the last character */
447 if (d == '*' || d == '+' || d == '?' || d == '{')
448 {
449 /* save the last character */
450 d = c; break;
451 }
452
453 accumulate.push_back (c);
454 c = d; d = 0;
455 }
456
457 result = str_to_re (accumulate);
458
459 /* separately deal with the last character before a closure */
460 if (d != 0) {
461 old_result = result; /* will add it back outside closure at the end */
e5fcd199 462 result = str_to_re (string(1,d));
cd4882d7
SM
463 }
464 }
465
466 /* parse closures or other postfix operators */
467 c = cur.peek ();
468 while (c == '*' || c == '+' || c == '?' || c == '{')
469 {
470 cur.next ();
471
472 /* closure-type operators applied to $^ are definitely not kosher */
473 if (result->type_of() == "anchor_op")
474 {
475 parse_error(_F("postfix closure '%c' applied to anchoring operator", c));
476 }
477
478 if (c == '*')
479 {
480 result = make_alt (new close_op(result), new null_op);
481 }
482 else if (c == '+')
483 {
484 result = new close_op(result);
485 }
486 else if (c == '?')
487 {
488 result = make_alt (result, new null_op);
489 }
490 else if (c == '{')
491 {
492 int minsize = parse_number ();
493 int maxsize = -1;
494
495 c = cur.next ();
496 if (c == ',')
497 {
498 c = cur.peek ();
499 if (c == '}')
500 {
501 cur.next ();
502 maxsize = -1;
503 }
504 else if (isdigit (c))
505 {
506 maxsize = parse_number ();
507 expect ('}');
508 }
509 else
510 parse_error(_("expected '}' or number"), cur.pos);
511 }
512 else if (c == '}')
513 {
514 maxsize = minsize;
515 }
516 else
517 parse_error(_("expected ',' or '}'"));
518
519 /* optimize {0,0}, {0,} and {1,} */
e5fcd199 520 if (!do_tag && minsize == 0 && maxsize == 0)
cd4882d7 521 {
e5fcd199
SM
522 // XXX: this optimization is only used when
523 // subexpression-extraction is disabled
cd4882d7
SM
524 delete result;
525 result = new null_op;
526 }
527 else if (minsize == 0 && maxsize == -1)
528 {
529 result = make_alt (new close_op(result), new null_op);
530 }
531 else if (minsize == 1 && maxsize == -1)
532 {
533 result = new close_op(result);
534 }
535 else
536 {
537 result = new closev_op(result, minsize, maxsize);
538 }
539 }
540
541 c = cur.peek ();
542 }
543
544 if (old_result)
545 result = new cat_op(old_result, result);
546
547 return result;
548}
549
cd4882d7
SM
550regexp *
551regex_parser::parse_char_range ()
552{
40fd16cf 553 range *ran = NULL;
cd4882d7 554
40fd16cf 555 // check for inversion
cd4882d7
SM
556 bool inv = false;
557 char c = cur.peek ();
558 if (c == '^')
559 {
560 inv = true;
561 cur.next ();
cd4882d7
SM
562 }
563
cd4882d7
SM
564 for (;;)
565 {
40fd16cf 566 // break on string end whenever we encounter it
e5fcd199 567 if (cur.finished) parse_error(_("unclosed character class")); // TODOXXX doublecheck that this is triggered correctly
cd4882d7 568
e5fcd199 569 range *add = stapregex_getrange (cur);
28a27de2
SM
570 range *new_ran = ( ran != NULL ? range_union(ran, add) : add );
571 delete ran; if (new_ran != add) delete add;
572 ran = new_ran;
cd4882d7 573
40fd16cf 574 // break on ']' (except at the start of the class)
248f3856 575 c = cur.peek ();
40fd16cf 576 if (c == ']')
248f3856 577 break;
cd4882d7
SM
578 }
579
e5fcd199
SM
580 if (inv)
581 {
28a27de2
SM
582 range *new_ran = range_invert(ran);
583 delete ran;
584 ran = new_ran;
e5fcd199
SM
585 }
586
40fd16cf
SM
587 if (ran == NULL)
588 return new null_op;
589
590 return new match_op(ran);
cd4882d7
SM
591}
592
593unsigned
594regex_parser::parse_number ()
595{
596 string digits;
597
598 char c = cur.peek ();
599 while (c && isdigit (c))
600 {
601 cur.next ();
602 digits.push_back (c);
603 c = cur.peek ();
604 }
605
606 if (digits == "") parse_error(_("expected number"), cur.pos);
607
608 char *endptr = NULL;
609 int val = strtol (digits.c_str (), &endptr, 10);
610
611 if (*endptr != '\0' || errno == ERANGE) // paranoid error checking
612 parse_error(_F("could not parse number %s", digits.c_str()), cur.pos);
613#define MAX_DFA_REPETITIONS 12345
614 if (val >= MAX_DFA_REPETITIONS) // XXX: is there a more sensible max size?
615 parse_error(_F("%s is too large", digits.c_str()), cur.pos);
616
617 return atoi (digits.c_str ());
618}
619
cd4882d7
SM
620// ------------------------------------------------------------------------
621
e5fcd199 622std::map<std::string, range *> named_char_classes;
40fd16cf
SM
623
624range *
625named_char_class (const string& name)
cd4882d7 626{
40fd16cf
SM
627 // static initialization of table
628 if (named_char_classes.empty())
629 {
630 // original source for these is http://www.regular-expressions.info/posixbrackets.html
631 // also checked against (intended to match) the c stdlib isFOO() chr class functions
632 named_char_classes["alpha"] = new range("A-Za-z");
633 named_char_classes["alnum"] = new range("A-Za-z0-9");
634 named_char_classes["blank"] = new range(" \t");
635 named_char_classes["cntrl"] = new range("\x01-\x1F\x7F"); // XXX: include \x00 in range? -- probably not!
636 named_char_classes["d"] = named_char_classes["digit"] = new range("0-9");
637 named_char_classes["xdigit"] = new range("0-9a-fA-F");
638 named_char_classes["graph"] = new range("\x21-\x7E");
639 named_char_classes["l"] = named_char_classes["lower"] = new range("a-z");
640 named_char_classes["print"] = new range("\x20-\x7E");
d6833508 641 named_char_classes["punct"] = new range("!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~-");
40fd16cf
SM
642 named_char_classes["s"] = named_char_classes["space"] = new range(" \t\r\n\v\f");
643 named_char_classes["u"] = named_char_classes["upper"] = new range("A-Z");
644 }
cd4882d7 645
40fd16cf 646 if (named_char_classes.find(name) == named_char_classes.end())
cd4882d7 647 {
e5fcd199 648 throw regex_error (_F("unknown character class '%s'", name.c_str())); // XXX: position unknown
40fd16cf 649 }
cd4882d7 650
40fd16cf
SM
651 return new range(*named_char_classes[name]);
652}
653
654range *
655stapregex_getrange (cursor& cur)
656{
657 char c = cur.peek ();
40fd16cf
SM
658
659 if (c == '\\')
660 {
661 // Grab escaped char regardless of what it is.
662 cur.next (); c = cur.peek (); cur.next ();
663 }
664 else if (c == '[')
665 {
666 // Check for '[:' digraph.
667 char old_c = c; cur.next (); c = cur.peek ();
668
669 if (c == ':')
cd4882d7 670 {
248f3856 671 cur.next (); c = cur.peek (); // skip ':'
40fd16cf 672 string charclass;
cd4882d7 673
40fd16cf
SM
674 for (;;)
675 {
676 if (cur.finished)
677 throw regex_error (_F("unclosed character class '[:%s'", charclass.c_str()), cur.pos);
cd4882d7 678
e5fcd199 679 if (cur.has(2) && c == ':' && (*cur.input)[cur.pos] == ']')
40fd16cf
SM
680 {
681 cur.next (); cur.next (); // skip ':]'
682 return named_char_class(charclass);
683 }
684
685 charclass.push_back(c); cur.next(); c = cur.peek();
686 }
cd4882d7 687 }
40fd16cf
SM
688 else
689 {
690 // Backtrack; fall through to processing c.
691 c = old_c;
692 }
693 }
694 else
695 cur.next ();
696
697 char lb = c, ub;
698
d6833508 699 if (!cur.has(2) || cur.peek () != '-' || (*cur.input)[cur.pos] == ']')
40fd16cf
SM
700 {
701 ub = lb;
cd4882d7 702 }
40fd16cf
SM
703 else
704 {
705 cur.next (); // skip '-'
706 ub = cur.peek ();
707
708 if (ub < lb)
709 throw regex_error (_F("Inverted character range %c-%c", lb, ub), cur.pos);
710
711 cur.next ();
712 }
713
714 return new range(lb, ub);
cd4882d7
SM
715}
716
40fd16cf
SM
717};
718
cd4882d7 719/* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */
This page took 0.125274 seconds and 5 git commands to generate.