]>
sourceware.org Git - systemtap.git/blob - stapregex-parse.cxx
d227faeb2693aa70766601144b2a0c45c27f538d
2 // Copyright (C) 2012-2018 Red Hat Inc.
4 // This file is part of systemtap, and is free software. You can
5 // redistribute it and/or modify it under the terms of the GNU General
6 // Public License (GPL); either version 2, or (at your option) any
11 // This file incorporates code from the re2c project; please see
12 // the file README.stapregex for details.
16 #include "stapregex-tree.h"
17 #include "stapregex-parse.h"
27 void print_escaped(std::ostream
& o
, rchar c
)
29 o
<< escaped_character((unsigned)c
);
32 // ------------------------------------------------------------------------
34 cursor::cursor() : input(NULL
), do_unescape(false), pos(~0),
35 last_pos(~0), finished(false), next_c(0), last_c(0) {}
37 cursor::cursor(const std::string
*input
, bool do_unescape
)
38 : input(input
), do_unescape(do_unescape
), pos(0), last_pos(0), finished(false)
40 next_c
= 0; last_c
= 0;
41 finished
= ( pos
>= input
->length() );
47 if (! next_c
&& finished
)
48 throw regex_error(_("unexpected end of regex"), pos
);
53 // advance by zeroing next_c
62 if (! next_c
&& ! finished
)
65 // don't advance by zeroing next_c
72 cursor::has (unsigned n
)
74 return ( pos
<= input
->length() - n
);
77 /* Systemtap doesn't unescape string literals for us, presuming to
78 pass the backslashes intact to a C compiler; hence we need to do
79 our own unescaping here.
81 This functionality needs to be handled as part of cursor, in order
82 to correctly retain the original positions in the string when doing
85 cursor::get_unescaped ()
87 static const char *hex
= "0123456789abcdef";
88 static const char *oct
= "01234567";
91 char c
= (*input
)[pos
];
93 if (c
!= '\\' || !do_unescape
)
97 finished
= ( pos
>= input
->length() );
103 /* Check for improper string end: */
104 if (pos
>= input
->length())
105 throw regex_error(_("unexpected end of regex"), pos
);
107 /* The logic is based on re2c's Scanner::unescape() method;
108 the set of accepted escape codes should correspond to
109 lexer::scan() in parse.cxx. */
113 case 'a': c
= '\a'; break;
114 case 'b': c
= '\b'; break;
115 case 't': c
= '\t'; break;
116 case 'n': c
= '\n'; break;
117 case 'v': c
= '\v'; break;
118 case 'f': c
= '\f'; break;
119 case 'r': c
= '\r'; break;
123 if (pos
>= input
->length() - 2)
124 throw regex_error(_("two hex digits required in escape sequence"), pos
);
126 const char *d1
= strchr(hex
, tolower((*input
)[pos
+1]));
127 const char *d2
= strchr(hex
, tolower((*input
)[pos
+2]));
130 throw regex_error(_("two hex digits required in escape sequence"), pos
+ (d1
? 1 : 2));
132 c
= (char)((d1
-hex
) << 4) + (char)(d2
-hex
);
133 pos
+= 2; // skip two chars more than usual
137 // XXX: perhaps perform error recovery (slurp 3 octal chars)?
138 throw regex_error(_("octal escape sequence out of range"), pos
);
142 if (pos
>= input
->length() - 2)
143 throw regex_error(_("three octal digits required in escape sequence"), pos
);
145 const char *d0
= strchr(oct
, (*input
)[pos
]);
146 const char *d1
= strchr(oct
, (*input
)[pos
+1]);
147 const char *d2
= strchr(oct
, (*input
)[pos
+2]);
149 if (!d0
|| !d1
|| !d2
)
150 throw regex_error(_("three octal digits required in escape sequence"), pos
+ (d1
? 1 : 2));
152 c
= (char)((d0
-oct
) << 6) + (char)((d1
-oct
) << 3) + (char)(d2
-oct
);
153 pos
+= 2; // skip two chars more than usual
157 // do nothing; this removes the backslash from c
163 finished
= ( pos
>= input
->length() );
166 // ------------------------------------------------------------------------
169 regex_parser::parse (bool do_tag
)
171 cur
= cursor(&input
, do_unescape
);
172 this->do_tag
= do_tag
;
173 num_subexpressions
= do_tag
? 1 : 0; // group 0 is guaranteed when using tag
175 regexp
*result
= parse_expr ();
177 // PR15065 glom appropriate tag_ops onto the expr (subexpression 0)
179 result
= new cat_op(new tag_op(TAG_START(0)), result
);
180 result
= new cat_op(result
, new tag_op(TAG_END(0)));
185 rchar c
= cur
.peek ();
187 parse_error (_("unbalanced ')'"), cur
.pos
);
189 // This should not be possible:
190 parse_error ("BUG -- regex parse failed to finish for unknown reasons", cur
.pos
);
193 // PR15065 store num_tags in result
194 result
->num_tags
= 2 * num_subexpressions
;
199 regex_parser::isspecial (rchar c
)
201 return ( c
== '.' || c
== '[' || c
== '{' || c
== '(' || c
== ')'
202 || c
== '\\' || c
== '*' || c
== '+' || c
== '?' || c
== '|'
203 || c
== '^' || c
== '$' );
207 regex_parser::expect (rchar expected
)
212 } catch (const regex_error
&e
) {
213 parse_error (_F("expected %c, found end of regex", expected
));
217 parse_error (_F("expected %c, found %c", expected
, c
));
221 regex_parser::parse_error (const string
& msg
, unsigned pos
)
223 throw regex_error(msg
, pos
);
227 regex_parser::parse_error (const string
& msg
)
229 parse_error (msg
, cur
.last_pos
);
232 // ------------------------------------------------------------------------
235 regex_parser::parse_expr ()
237 regexp
*result
= parse_term ();
239 rchar c
= cur
.peek ();
240 while (c
&& c
== '|')
243 regexp
*alt
= parse_term ();
244 result
= make_alt (result
, alt
);
252 regex_parser::parse_term ()
254 regexp
*result
= parse_factor ();
256 rchar c
= cur
.peek ();
257 while (c
&& c
!= '|' && c
!= ')')
259 regexp
*next
= parse_factor ();
260 result
= new cat_op(result
, next
);
268 regex_parser::parse_factor ()
271 regexp
*old_result
= NULL
;
273 rchar c
= cur
.peek ();
274 if (! c
|| c
== '|' || c
== ')')
276 result
= new null_op
;
279 else if (c
== '*' || c
== '+' || c
== '?' || c
== '{')
281 parse_error(_F("unexpected '%c'", c
));
284 if (isspecial (c
) && c
!= '\\')
285 cur
.next (); // c is guaranteed to be swallowed
289 result
= make_dot ();
293 result
= parse_char_range ();
298 // Allow non-capturing group (?:...)
299 bool do_tag_now
= do_tag
;
300 rchar c2
= cur
.peek();
304 rchar c3
= cur
.peek();
306 parse_error(_F("unexpected '(?%c'", c3
));
310 // To number tags correctly, reserve a subexpression number here:
311 unsigned curr_subexpression
= 0;
313 curr_subexpression
= num_subexpressions
++;
315 result
= parse_expr ();
317 // PR15065 glom appropriate tag_ops onto the expr
319 result
= new cat_op(new tag_op(TAG_START(curr_subexpression
)), result
);
320 result
= new cat_op(result
, new tag_op(TAG_END(curr_subexpression
)));
322 // XXX: workaround for certain error checking test cases which
323 // would otherwise produce divergent behaviour without tag_ops
324 // (e.g. "^*" vs "(^)*").
325 result
= new cat_op(result
, new null_op
);
330 else if (c
== '^' || c
== '$')
332 result
= new anchor_op(c
);
334 else // escaped or ordinary character -- not yet swallowed
339 while (c
&& ( ! isspecial (c
) || c
== '\\' ))
350 /* if we end in a closure, it should only govern the last character */
351 if (d
== '*' || d
== '+' || d
== '?' || d
== '{')
353 /* save the last character */
357 accumulate
.push_back (c
);
361 result
= str_to_re (accumulate
);
363 /* separately deal with the last character before a closure */
365 old_result
= result
; /* will add it back outside closure at the end */
366 result
= str_to_re (string(1,d
));
370 /* parse closures or other postfix operators */
372 while (c
== '*' || c
== '+' || c
== '?' || c
== '{')
376 /* closure-type operators applied to $^ are definitely not kosher */
377 if (result
->type_of() == "anchor_op")
379 parse_error(_F("postfix closure '%c' applied to anchoring operator", c
));
384 result
= make_alt (new close_op(result
), new null_op
);
388 result
= new close_op(result
);
392 result
= make_alt (result
, new null_op
);
396 int minsize
= parse_number ();
408 else if (isdigit (c
))
410 maxsize
= parse_number ();
414 parse_error(_("expected '}' or number"), cur
.pos
);
421 parse_error(_("expected ',' or '}'"));
423 /* optimize {0,0}, {0,} and {1,} */
424 if (!do_tag
&& minsize
== 0 && maxsize
== 0)
426 // XXX: this optimization is only used when
427 // subexpression-extraction is disabled
429 result
= new null_op
;
431 else if (minsize
== 0 && maxsize
== -1)
433 result
= make_alt (new close_op(result
), new null_op
);
435 else if (minsize
== 1 && maxsize
== -1)
437 result
= new close_op(result
);
441 result
= new closev_op(result
, minsize
, maxsize
);
449 result
= new cat_op(old_result
, result
);
455 regex_parser::parse_char_range ()
459 // check for inversion
461 rchar c
= cur
.peek ();
470 // break on string end whenever we encounter it
471 if (cur
.finished
) parse_error(_("unclosed character class"));
473 range
*add
= stapregex_getrange (cur
);
474 range
*new_ran
= ( ran
!= NULL
? range_union(ran
, add
) : add
);
475 delete ran
; if (new_ran
!= add
) delete add
;
478 // break on ']' (except at the start of the class)
486 range
*new_ran
= range_invert(ran
);
494 return new match_op(ran
);
498 regex_parser::parse_number ()
502 rchar c
= cur
.peek ();
503 while (c
&& isdigit (c
))
506 digits
.push_back (c
);
510 if (digits
== "") parse_error(_("expected number"), cur
.pos
);
513 int val
= strtol (digits
.c_str (), &endptr
, 10);
515 if (*endptr
!= '\0' || errno
== ERANGE
) // paranoid error checking
516 parse_error(_F("could not parse number %s", digits
.c_str()), cur
.pos
);
517 #define MAX_DFA_REPETITIONS 12345
518 if (val
>= MAX_DFA_REPETITIONS
) // XXX: is there a more sensible max size?
519 parse_error(_F("%s is too large", digits
.c_str()), cur
.pos
);
521 return atoi (digits
.c_str ());
524 // ------------------------------------------------------------------------
526 std::map
<std::string
, range
*> named_char_classes
;
529 named_char_class (const string
& name
)
531 // static initialization of table
532 if (named_char_classes
.empty())
534 // original source for these is http://www.regular-expressions.info/posixbrackets.html
535 // also checked against (intended to match) the c stdlib isFOO() chr class functions
536 named_char_classes
["alpha"] = new range("A-Za-z");
537 named_char_classes
["alnum"] = new range("A-Za-z0-9");
538 named_char_classes
["blank"] = new range(" \t");
539 named_char_classes
["cntrl"] = new range("\x01-\x1F\x7F"); // XXX: include \x00 in range? -- probably not!
540 named_char_classes
["d"] = named_char_classes
["digit"] = new range("0-9");
541 named_char_classes
["xdigit"] = new range("0-9a-fA-F");
542 named_char_classes
["graph"] = new range("\x21-\x7E");
543 named_char_classes
["l"] = named_char_classes
["lower"] = new range("a-z");
544 named_char_classes
["print"] = new range("\x20-\x7E");
545 named_char_classes
["punct"] = new range("!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~-");
546 named_char_classes
["s"] = named_char_classes
["space"] = new range(" \t\r\n\v\f");
547 named_char_classes
["u"] = named_char_classes
["upper"] = new range("A-Z");
550 if (named_char_classes
.find(name
) == named_char_classes
.end())
552 throw regex_error (_F("unknown character class '%s'", name
.c_str())); // XXX: position unknown
555 return new range(*named_char_classes
[name
]);
559 stapregex_getrange (cursor
& cur
)
561 rchar c
= cur
.peek ();
565 // Grab escaped char regardless of what it is.
566 cur
.next (); c
= cur
.peek (); cur
.next ();
570 // Check for '[:' digraph.
571 rchar old_c
= c
; cur
.next (); c
= cur
.peek ();
575 cur
.next (); c
= cur
.peek (); // skip ':'
581 throw regex_error (_F("unclosed character class '[:%s'", charclass
.c_str()), cur
.pos
);
583 if (cur
.has(2) && c
== ':' && (*cur
.input
)[cur
.pos
] == ']')
585 cur
.next (); cur
.next (); // skip ':]'
586 return named_char_class(charclass
);
589 charclass
.push_back(c
); cur
.next(); c
= cur
.peek();
594 // Backtrack; fall through to processing c.
603 if (!cur
.has(2) || cur
.peek () != '-' || (*cur
.input
)[cur
.pos
] == ']')
609 cur
.next (); // skip '-'
613 throw regex_error (_F("Inverted character range %c-%c", lb
, ub
), cur
.pos
);
618 return new range(lb
, ub
);
623 /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */
This page took 0.064623 seconds and 6 git commands to generate.