]>
sourceware.org Git - systemtap.git/blob - stapregex-parse.cxx
2 // Copyright (C) 2012-2018 Red Hat Inc.
4 // This file is part of systemtap, and is free software. You can
5 // redistribute it and/or modify it under the terms of the GNU General
6 // Public License (GPL); either version 2, or (at your option) any
11 // This file incorporates code from the re2c project; please see
12 // the file README.stapregex for details.
16 #include "stapregex-tree.h"
17 #include "stapregex-parse.h"
27 void print_escaped(std::ostream
& o
, rchar c
)
29 o
<< escaped_character((unsigned)c
);
32 // ------------------------------------------------------------------------
34 cursor::cursor() : input(NULL
), do_unescape(false), pos(~0),
35 last_pos(~0), finished(false), next_c(0), last_c(0) {}
37 cursor::cursor(const std::string
*input
, bool do_unescape
)
38 : input(input
), do_unescape(do_unescape
), pos(0), last_pos(0), finished(false)
40 next_c
= 0; last_c
= 0;
41 finished
= ( pos
>= input
->length() );
47 if (! next_c
&& finished
)
48 throw regex_error(_("unexpected end of regex"), pos
);
53 // advance by zeroing next_c
62 if (! next_c
&& ! finished
)
65 // don't advance by zeroing next_c
72 cursor::has (unsigned n
)
74 return ( pos
<= input
->length() - n
);
/* Systemtap doesn't unescape string literals for us, presuming to
   pass the backslashes intact to a C compiler; hence we need to do
   our own unescaping here.

   This functionality needs to be handled as part of cursor, in order
   to correctly retain the original positions in the string when doing
   error reporting. */
85 cursor::get_unescaped ()
87 static const char *hex
= "0123456789abcdef";
88 static const char *oct
= "01234567";
91 char c
= (*input
)[pos
];
93 if (c
!= '\\' || !do_unescape
)
97 finished
= ( pos
>= input
->length() );
103 /* Check for improper string end: */
104 if (pos
>= input
->length())
105 throw regex_error(_("unexpected end of regex"), pos
);
107 /* The logic is based on re2c's Scanner::unescape() method;
108 the set of accepted escape codes should correspond to
109 lexer::scan() in parse.cxx. */
113 case 'a': c
= '\a'; break;
114 case 'b': c
= '\b'; break;
115 case 't': c
= '\t'; break;
116 case 'n': c
= '\n'; break;
117 case 'v': c
= '\v'; break;
118 case 'f': c
= '\f'; break;
119 case 'r': c
= '\r'; break;
123 if (pos
>= input
->length() - 2)
124 throw regex_error(_("two hex digits required in escape sequence"), pos
);
126 const char *d1
= strchr(hex
, tolower((*input
)[pos
+1]));
127 const char *d2
= strchr(hex
, tolower((*input
)[pos
+2]));
130 throw regex_error(_("two hex digits required in escape sequence"), pos
+ (d1
? 1 : 2));
132 c
= (char)((d1
-hex
) << 4) + (char)(d2
-hex
);
133 pos
+= 2; // skip two chars more than usual
137 // XXX: perhaps perform error recovery (slurp 3 octal chars)?
138 throw regex_error(_("octal escape sequence out of range"), pos
);
142 if (pos
>= input
->length() - 2)
143 throw regex_error(_("three octal digits required in escape sequence"), pos
);
145 const char *d0
= strchr(oct
, (*input
)[pos
]);
146 const char *d1
= strchr(oct
, (*input
)[pos
+1]);
147 const char *d2
= strchr(oct
, (*input
)[pos
+2]);
149 if (!d0
|| !d1
|| !d2
)
150 throw regex_error(_("three octal digits required in escape sequence"), pos
+ (d1
? 1 : 2));
152 c
= (char)((d0
-oct
) << 6) + (char)((d1
-oct
) << 3) + (char)(d2
-oct
);
153 pos
+= 2; // skip two chars more than usual
157 // do nothing; this removes the backslash from c
163 finished
= ( pos
>= input
->length() );
166 // ------------------------------------------------------------------------
169 regex_parser::parse (bool do_tag
)
171 cur
= cursor(&input
, do_unescape
);
172 this->do_tag
= do_tag
;
173 num_subexpressions
= do_tag
? 1 : 0; // group 0 is guaranteed when using tag
175 regexp
*result
= parse_expr ();
177 // PR15065 glom appropriate tag_ops onto the expr (subexpression 0)
179 result
= new cat_op(new tag_op(TAG_START(0)), result
);
180 result
= new cat_op(result
, new tag_op(TAG_END(0)));
185 rchar c
= cur
.peek ();
187 parse_error (_("unbalanced ')'"), cur
.pos
);
189 // This should not be possible:
190 parse_error ("BUG -- regex parse failed to finish for unknown reasons", cur
.pos
);
193 // PR15065 store num_tags in result
194 result
->num_tags
= 2 * num_subexpressions
;
199 regex_parser::isspecial (rchar c
)
201 return ( c
== '.' || c
== '[' || c
== '{' || c
== '(' || c
== ')'
202 || c
== '\\' || c
== '*' || c
== '+' || c
== '?' || c
== '|'
203 || c
== '^' || c
== '$' );
207 regex_parser::expect (rchar expected
)
212 } catch (const regex_error
&e
) {
213 parse_error (_F("expected %c, found end of regex", expected
));
217 parse_error (_F("expected %c, found %c", expected
, c
));
221 regex_parser::parse_error (const string
& msg
, unsigned pos
)
223 throw regex_error(msg
, pos
);
227 regex_parser::parse_error (const string
& msg
)
229 parse_error (msg
, cur
.last_pos
);
232 // ------------------------------------------------------------------------
235 regex_parser::parse_expr ()
237 regexp
*result
= parse_term ();
239 rchar c
= cur
.peek ();
240 while (c
&& c
== '|')
243 regexp
*alt
= parse_term ();
244 result
= make_alt (result
, alt
);
252 regex_parser::parse_term ()
254 regexp
*result
= parse_factor ();
256 rchar c
= cur
.peek ();
257 while (c
&& c
!= '|' && c
!= ')')
259 regexp
*next
= parse_factor ();
260 result
= new cat_op(result
, next
);
268 regex_parser::parse_factor ()
271 regexp
*old_result
= NULL
;
273 rchar c
= cur
.peek ();
274 if (! c
|| c
== '|' || c
== ')')
276 result
= new null_op
;
279 else if (c
== '*' || c
== '+' || c
== '?' || c
== '{')
281 parse_error(_F("unexpected '%c'", c
));
284 if (isspecial (c
) && c
!= '\\')
285 cur
.next (); // c is guaranteed to be swallowed
289 result
= make_dot ();
293 result
= parse_char_range ();
298 // To number tags correctly, reserve a subexpression number here:
299 unsigned curr_subexpression
= 0;
301 curr_subexpression
= num_subexpressions
++;
303 result
= parse_expr ();
305 // PR15065 glom appropriate tag_ops onto the expr
307 result
= new cat_op(new tag_op(TAG_START(curr_subexpression
)), result
);
308 result
= new cat_op(result
, new tag_op(TAG_END(curr_subexpression
)));
310 // XXX: workaround for certain error checking test cases which
311 // would otherwise produce divergent behaviour without tag_ops
312 // (e.g. "^*" vs "(^)*").
313 result
= new cat_op(result
, new null_op
);
318 else if (c
== '^' || c
== '$')
320 result
= new anchor_op(c
);
322 else // escaped or ordinary character -- not yet swallowed
327 while (c
&& ( ! isspecial (c
) || c
== '\\' ))
338 /* if we end in a closure, it should only govern the last character */
339 if (d
== '*' || d
== '+' || d
== '?' || d
== '{')
341 /* save the last character */
345 accumulate
.push_back (c
);
349 result
= str_to_re (accumulate
);
351 /* separately deal with the last character before a closure */
353 old_result
= result
; /* will add it back outside closure at the end */
354 result
= str_to_re (string(1,d
));
358 /* parse closures or other postfix operators */
360 while (c
== '*' || c
== '+' || c
== '?' || c
== '{')
364 /* closure-type operators applied to $^ are definitely not kosher */
365 if (result
->type_of() == "anchor_op")
367 parse_error(_F("postfix closure '%c' applied to anchoring operator", c
));
372 result
= make_alt (new close_op(result
), new null_op
);
376 result
= new close_op(result
);
380 result
= make_alt (result
, new null_op
);
384 int minsize
= parse_number ();
396 else if (isdigit (c
))
398 maxsize
= parse_number ();
402 parse_error(_("expected '}' or number"), cur
.pos
);
409 parse_error(_("expected ',' or '}'"));
411 /* optimize {0,0}, {0,} and {1,} */
412 if (!do_tag
&& minsize
== 0 && maxsize
== 0)
414 // XXX: this optimization is only used when
415 // subexpression-extraction is disabled
417 result
= new null_op
;
419 else if (minsize
== 0 && maxsize
== -1)
421 result
= make_alt (new close_op(result
), new null_op
);
423 else if (minsize
== 1 && maxsize
== -1)
425 result
= new close_op(result
);
429 result
= new closev_op(result
, minsize
, maxsize
);
437 result
= new cat_op(old_result
, result
);
443 regex_parser::parse_char_range ()
447 // check for inversion
449 rchar c
= cur
.peek ();
458 // break on string end whenever we encounter it
459 if (cur
.finished
) parse_error(_("unclosed character class"));
461 range
*add
= stapregex_getrange (cur
);
462 range
*new_ran
= ( ran
!= NULL
? range_union(ran
, add
) : add
);
463 delete ran
; if (new_ran
!= add
) delete add
;
466 // break on ']' (except at the start of the class)
474 range
*new_ran
= range_invert(ran
);
482 return new match_op(ran
);
486 regex_parser::parse_number ()
490 rchar c
= cur
.peek ();
491 while (c
&& isdigit (c
))
494 digits
.push_back (c
);
498 if (digits
== "") parse_error(_("expected number"), cur
.pos
);
501 int val
= strtol (digits
.c_str (), &endptr
, 10);
503 if (*endptr
!= '\0' || errno
== ERANGE
) // paranoid error checking
504 parse_error(_F("could not parse number %s", digits
.c_str()), cur
.pos
);
505 #define MAX_DFA_REPETITIONS 12345
506 if (val
>= MAX_DFA_REPETITIONS
) // XXX: is there a more sensible max size?
507 parse_error(_F("%s is too large", digits
.c_str()), cur
.pos
);
509 return atoi (digits
.c_str ());
512 // ------------------------------------------------------------------------
514 std::map
<std::string
, range
*> named_char_classes
;
517 named_char_class (const string
& name
)
519 // static initialization of table
520 if (named_char_classes
.empty())
522 // original source for these is http://www.regular-expressions.info/posixbrackets.html
523 // also checked against (intended to match) the c stdlib isFOO() chr class functions
524 named_char_classes
["alpha"] = new range("A-Za-z");
525 named_char_classes
["alnum"] = new range("A-Za-z0-9");
526 named_char_classes
["blank"] = new range(" \t");
527 named_char_classes
["cntrl"] = new range("\x01-\x1F\x7F"); // XXX: include \x00 in range? -- probably not!
528 named_char_classes
["d"] = named_char_classes
["digit"] = new range("0-9");
529 named_char_classes
["xdigit"] = new range("0-9a-fA-F");
530 named_char_classes
["graph"] = new range("\x21-\x7E");
531 named_char_classes
["l"] = named_char_classes
["lower"] = new range("a-z");
532 named_char_classes
["print"] = new range("\x20-\x7E");
533 named_char_classes
["punct"] = new range("!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~-");
534 named_char_classes
["s"] = named_char_classes
["space"] = new range(" \t\r\n\v\f");
535 named_char_classes
["u"] = named_char_classes
["upper"] = new range("A-Z");
538 if (named_char_classes
.find(name
) == named_char_classes
.end())
540 throw regex_error (_F("unknown character class '%s'", name
.c_str())); // XXX: position unknown
543 return new range(*named_char_classes
[name
]);
547 stapregex_getrange (cursor
& cur
)
549 rchar c
= cur
.peek ();
553 // Grab escaped char regardless of what it is.
554 cur
.next (); c
= cur
.peek (); cur
.next ();
558 // Check for '[:' digraph.
559 rchar old_c
= c
; cur
.next (); c
= cur
.peek ();
563 cur
.next (); c
= cur
.peek (); // skip ':'
569 throw regex_error (_F("unclosed character class '[:%s'", charclass
.c_str()), cur
.pos
);
571 if (cur
.has(2) && c
== ':' && (*cur
.input
)[cur
.pos
] == ']')
573 cur
.next (); cur
.next (); // skip ':]'
574 return named_char_class(charclass
);
577 charclass
.push_back(c
); cur
.next(); c
= cur
.peek();
582 // Backtrack; fall through to processing c.
591 if (!cur
.has(2) || cur
.peek () != '-' || (*cur
.input
)[cur
.pos
] == ']')
597 cur
.next (); // skip '-'
601 throw regex_error (_F("Inverted character range %c-%c", lb
, ub
), cur
.pos
);
606 return new range(lb
, ub
);
611 /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */
This page took 0.065229 seconds and 5 git commands to generate.