]> sourceware.org Git - systemtap.git/blob - stapregex-parse.cxx
buildrun.cxx: adapt to kernel 5.4+
[systemtap.git] / stapregex-parse.cxx
1 // -*- C++ -*-
2 // Copyright (C) 2012-2018 Red Hat Inc.
3 //
4 // This file is part of systemtap, and is free software. You can
5 // redistribute it and/or modify it under the terms of the GNU General
6 // Public License (GPL); either version 2, or (at your option) any
7 // later version.
8 //
9 // ---
10 //
11 // This file incorporates code from the re2c project; please see
12 // the file README.stapregex for details.
13
14 #include "util.h"
15
16 #include "stapregex-tree.h"
17 #include "stapregex-parse.h"
18
19 #include <cstdlib>
20 #include <cstring>
21 #include <string>
22
23 using namespace std;
24
25 namespace stapregex {
26
27 void print_escaped(std::ostream& o, rchar c)
28 {
29 o << escaped_character((unsigned)c);
30 }
31
32 // ------------------------------------------------------------------------
33
34 cursor::cursor() : input(NULL), do_unescape(false), pos(~0),
35 last_pos(~0), finished(false), next_c(0), last_c(0) {}
36
37 cursor::cursor(const std::string *input, bool do_unescape)
38 : input(input), do_unescape(do_unescape), pos(0), last_pos(0), finished(false)
39 {
40 next_c = 0; last_c = 0;
41 finished = ( pos >= input->length() );
42 }
43
44 rchar
45 cursor::next ()
46 {
47 if (! next_c && finished)
48 throw regex_error(_("unexpected end of regex"), pos);
49 if (! next_c)
50 get_unescaped();
51
52 last_c = next_c;
53 // advance by zeroing next_c
54 next_c = 0;
55
56 return last_c;
57 }
58
59 rchar
60 cursor::peek ()
61 {
62 if (! next_c && ! finished)
63 get_unescaped();
64
65 // don't advance by zeroing next_c
66 last_c = next_c;
67
68 return next_c;
69 }
70
71 bool
72 cursor::has (unsigned n)
73 {
74 return ( pos <= input->length() - n );
75 }
76
77 /* Systemtap doesn't unescape string literals for us, presuming to
78 pass the backslashes intact to a C compiler; hence we need to do
79 our own unescaping here.
80
81 This functionality needs to be handled as part of cursor, in order
82 to correctly retain the original positions in the string when doing
83 error reporting. */
84 void
85 cursor::get_unescaped ()
86 {
87 static const char *hex = "0123456789abcdef";
88 static const char *oct = "01234567";
89
90 last_pos = pos;
91 char c = (*input)[pos];
92
93 if (c != '\\' || !do_unescape)
94 {
95 next_c = c;
96 pos++;
97 finished = ( pos >= input->length() );
98 return;
99 }
100
101 pos++;
102
103 /* Check for improper string end: */
104 if (pos >= input->length())
105 throw regex_error(_("unexpected end of regex"), pos);
106
107 /* The logic is based on re2c's Scanner::unescape() method;
108 the set of accepted escape codes should correspond to
109 lexer::scan() in parse.cxx. */
110 c = (*input)[pos];
111 switch (c)
112 {
113 case 'a': c = '\a'; break;
114 case 'b': c = '\b'; break;
115 case 't': c = '\t'; break;
116 case 'n': c = '\n'; break;
117 case 'v': c = '\v'; break;
118 case 'f': c = '\f'; break;
119 case 'r': c = '\r'; break;
120
121 case 'x':
122 {
123 if (pos >= input->length() - 2)
124 throw regex_error(_("two hex digits required in escape sequence"), pos);
125
126 const char *d1 = strchr(hex, tolower((*input)[pos+1]));
127 const char *d2 = strchr(hex, tolower((*input)[pos+2]));
128
129 if (!d1 || !d2)
130 throw regex_error(_("two hex digits required in escape sequence"), pos + (d1 ? 1 : 2));
131
132 c = (char)((d1-hex) << 4) + (char)(d2-hex);
133 pos += 2; // skip two chars more than usual
134 break;
135 }
136 case '4' ... '7':
137 // XXX: perhaps perform error recovery (slurp 3 octal chars)?
138 throw regex_error(_("octal escape sequence out of range"), pos);
139
140 case '0' ... '3':
141 {
142 if (pos >= input->length() - 2)
143 throw regex_error(_("three octal digits required in escape sequence"), pos);
144
145 const char *d0 = strchr(oct, (*input)[pos]);
146 const char *d1 = strchr(oct, (*input)[pos+1]);
147 const char *d2 = strchr(oct, (*input)[pos+2]);
148
149 if (!d0 || !d1 || !d2)
150 throw regex_error(_("three octal digits required in escape sequence"), pos + (d1 ? 1 : 2));
151
152 c = (char)((d0-oct) << 6) + (char)((d1-oct) << 3) + (char)(d2-oct);
153 pos += 2; // skip two chars more than usual
154 break;
155 }
156 default:
157 // do nothing; this removes the backslash from c
158 ;
159 }
160
161 next_c = c;
162 pos++;
163 finished = ( pos >= input->length() );
164 }
165
166 // ------------------------------------------------------------------------
167
168 regexp *
169 regex_parser::parse (bool do_tag)
170 {
171 cur = cursor(&input, do_unescape);
172 this->do_tag = do_tag;
173 num_subexpressions = do_tag ? 1 : 0; // group 0 is guaranteed when using tag
174
175 regexp *result = parse_expr ();
176
177 // PR15065 glom appropriate tag_ops onto the expr (subexpression 0)
178 if (do_tag) {
179 result = new cat_op(new tag_op(TAG_START(0)), result);
180 result = new cat_op(result, new tag_op(TAG_END(0)));
181 }
182
183 if (! cur.finished)
184 {
185 rchar c = cur.peek ();
186 if (c == ')')
187 parse_error (_("unbalanced ')'"), cur.pos);
188 else
189 // This should not be possible:
190 parse_error ("BUG -- regex parse failed to finish for unknown reasons", cur.pos);
191 }
192
193 // PR15065 store num_tags in result
194 result->num_tags = 2 * num_subexpressions;
195 return result;
196 }
197
198 bool
199 regex_parser::isspecial (rchar c)
200 {
201 return ( c == '.' || c == '[' || c == '{' || c == '(' || c == ')'
202 || c == '\\' || c == '*' || c == '+' || c == '?' || c == '|'
203 || c == '^' || c == '$' );
204 }
205
206 void
207 regex_parser::expect (rchar expected)
208 {
209 rchar c = 0;
210 try {
211 c = cur.next ();
212 } catch (const regex_error &e) {
213 parse_error (_F("expected %c, found end of regex", expected));
214 }
215
216 if (c != expected)
217 parse_error (_F("expected %c, found %c", expected, c));
218 }
219
220 void
221 regex_parser::parse_error (const string& msg, unsigned pos)
222 {
223 throw regex_error(msg, pos);
224 }
225
226 void
227 regex_parser::parse_error (const string& msg)
228 {
229 parse_error (msg, cur.last_pos);
230 }
231
232 // ------------------------------------------------------------------------
233
234 regexp *
235 regex_parser::parse_expr ()
236 {
237 regexp *result = parse_term ();
238
239 rchar c = cur.peek ();
240 while (c && c == '|')
241 {
242 cur.next ();
243 regexp *alt = parse_term ();
244 result = make_alt (result, alt);
245 c = cur.peek ();
246 }
247
248 return result;
249 }
250
251 regexp *
252 regex_parser::parse_term ()
253 {
254 regexp *result = parse_factor ();
255
256 rchar c = cur.peek ();
257 while (c && c != '|' && c != ')')
258 {
259 regexp *next = parse_factor ();
260 result = new cat_op(result, next);
261 c = cur.peek ();
262 }
263
264 return result;
265 }
266
267 regexp *
268 regex_parser::parse_factor ()
269 {
270 regexp *result;
271 regexp *old_result = NULL;
272
273 rchar c = cur.peek ();
274 if (! c || c == '|' || c == ')')
275 {
276 result = new null_op;
277 return result;
278 }
279 else if (c == '*' || c == '+' || c == '?' || c == '{')
280 {
281 parse_error(_F("unexpected '%c'", c));
282 }
283
284 if (isspecial (c) && c != '\\')
285 cur.next (); // c is guaranteed to be swallowed
286
287 if (c == '.')
288 {
289 result = make_dot ();
290 }
291 else if (c == '[')
292 {
293 result = parse_char_range ();
294 expect (']');
295 }
296 else if (c == '(')
297 {
298 // To number tags correctly, reserve a subexpression number here:
299 unsigned curr_subexpression = 0;
300 if (do_tag)
301 curr_subexpression = num_subexpressions++;
302
303 result = parse_expr ();
304
305 // PR15065 glom appropriate tag_ops onto the expr
306 if (do_tag) {
307 result = new cat_op(new tag_op(TAG_START(curr_subexpression)), result);
308 result = new cat_op(result, new tag_op(TAG_END(curr_subexpression)));
309 } else {
310 // XXX: workaround for certain error checking test cases which
311 // would otherwise produce divergent behaviour without tag_ops
312 // (e.g. "^*" vs "(^)*").
313 result = new cat_op(result, new null_op);
314 }
315
316 expect (')');
317 }
318 else if (c == '^' || c == '$')
319 {
320 result = new anchor_op(c);
321 }
322 else // escaped or ordinary character -- not yet swallowed
323 {
324 string accumulate;
325 rchar d = 0;
326
327 while (c && ( ! isspecial (c) || c == '\\' ))
328 {
329 if (c == '\\')
330 {
331 cur.next ();
332 c = cur.peek ();
333 }
334
335 cur.next ();
336 d = cur.peek ();
337
338 /* if we end in a closure, it should only govern the last character */
339 if (d == '*' || d == '+' || d == '?' || d == '{')
340 {
341 /* save the last character */
342 d = c; break;
343 }
344
345 accumulate.push_back (c);
346 c = d; d = 0;
347 }
348
349 result = str_to_re (accumulate);
350
351 /* separately deal with the last character before a closure */
352 if (d != 0) {
353 old_result = result; /* will add it back outside closure at the end */
354 result = str_to_re (string(1,d));
355 }
356 }
357
358 /* parse closures or other postfix operators */
359 c = cur.peek ();
360 while (c == '*' || c == '+' || c == '?' || c == '{')
361 {
362 cur.next ();
363
364 /* closure-type operators applied to $^ are definitely not kosher */
365 if (result->type_of() == "anchor_op")
366 {
367 parse_error(_F("postfix closure '%c' applied to anchoring operator", c));
368 }
369
370 if (c == '*')
371 {
372 result = make_alt (new close_op(result), new null_op);
373 }
374 else if (c == '+')
375 {
376 result = new close_op(result);
377 }
378 else if (c == '?')
379 {
380 result = make_alt (result, new null_op);
381 }
382 else if (c == '{')
383 {
384 int minsize = parse_number ();
385 int maxsize = -1;
386
387 c = cur.next ();
388 if (c == ',')
389 {
390 c = cur.peek ();
391 if (c == '}')
392 {
393 cur.next ();
394 maxsize = -1;
395 }
396 else if (isdigit (c))
397 {
398 maxsize = parse_number ();
399 expect ('}');
400 }
401 else
402 parse_error(_("expected '}' or number"), cur.pos);
403 }
404 else if (c == '}')
405 {
406 maxsize = minsize;
407 }
408 else
409 parse_error(_("expected ',' or '}'"));
410
411 /* optimize {0,0}, {0,} and {1,} */
412 if (!do_tag && minsize == 0 && maxsize == 0)
413 {
414 // XXX: this optimization is only used when
415 // subexpression-extraction is disabled
416 delete result;
417 result = new null_op;
418 }
419 else if (minsize == 0 && maxsize == -1)
420 {
421 result = make_alt (new close_op(result), new null_op);
422 }
423 else if (minsize == 1 && maxsize == -1)
424 {
425 result = new close_op(result);
426 }
427 else
428 {
429 result = new closev_op(result, minsize, maxsize);
430 }
431 }
432
433 c = cur.peek ();
434 }
435
436 if (old_result)
437 result = new cat_op(old_result, result);
438
439 return result;
440 }
441
442 regexp *
443 regex_parser::parse_char_range ()
444 {
445 range *ran = NULL;
446
447 // check for inversion
448 bool inv = false;
449 rchar c = cur.peek ();
450 if (c == '^')
451 {
452 inv = true;
453 cur.next ();
454 }
455
456 for (;;)
457 {
458 // break on string end whenever we encounter it
459 if (cur.finished) parse_error(_("unclosed character class"));
460
461 range *add = stapregex_getrange (cur);
462 range *new_ran = ( ran != NULL ? range_union(ran, add) : add );
463 delete ran; if (new_ran != add) delete add;
464 ran = new_ran;
465
466 // break on ']' (except at the start of the class)
467 c = cur.peek ();
468 if (c == ']')
469 break;
470 }
471
472 if (inv)
473 {
474 range *new_ran = range_invert(ran);
475 delete ran;
476 ran = new_ran;
477 }
478
479 if (ran == NULL)
480 return new null_op;
481
482 return new match_op(ran);
483 }
484
485 unsigned
486 regex_parser::parse_number ()
487 {
488 string digits;
489
490 rchar c = cur.peek ();
491 while (c && isdigit (c))
492 {
493 cur.next ();
494 digits.push_back (c);
495 c = cur.peek ();
496 }
497
498 if (digits == "") parse_error(_("expected number"), cur.pos);
499
500 char *endptr = NULL;
501 int val = strtol (digits.c_str (), &endptr, 10);
502
503 if (*endptr != '\0' || errno == ERANGE) // paranoid error checking
504 parse_error(_F("could not parse number %s", digits.c_str()), cur.pos);
505 #define MAX_DFA_REPETITIONS 12345
506 if (val >= MAX_DFA_REPETITIONS) // XXX: is there a more sensible max size?
507 parse_error(_F("%s is too large", digits.c_str()), cur.pos);
508
509 return atoi (digits.c_str ());
510 }
511
512 // ------------------------------------------------------------------------
513
514 std::map<std::string, range *> named_char_classes;
515
516 range *
517 named_char_class (const string& name)
518 {
519 // static initialization of table
520 if (named_char_classes.empty())
521 {
522 // original source for these is http://www.regular-expressions.info/posixbrackets.html
523 // also checked against (intended to match) the c stdlib isFOO() chr class functions
524 named_char_classes["alpha"] = new range("A-Za-z");
525 named_char_classes["alnum"] = new range("A-Za-z0-9");
526 named_char_classes["blank"] = new range(" \t");
527 named_char_classes["cntrl"] = new range("\x01-\x1F\x7F"); // XXX: include \x00 in range? -- probably not!
528 named_char_classes["d"] = named_char_classes["digit"] = new range("0-9");
529 named_char_classes["xdigit"] = new range("0-9a-fA-F");
530 named_char_classes["graph"] = new range("\x21-\x7E");
531 named_char_classes["l"] = named_char_classes["lower"] = new range("a-z");
532 named_char_classes["print"] = new range("\x20-\x7E");
533 named_char_classes["punct"] = new range("!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~-");
534 named_char_classes["s"] = named_char_classes["space"] = new range(" \t\r\n\v\f");
535 named_char_classes["u"] = named_char_classes["upper"] = new range("A-Z");
536 }
537
538 if (named_char_classes.find(name) == named_char_classes.end())
539 {
540 throw regex_error (_F("unknown character class '%s'", name.c_str())); // XXX: position unknown
541 }
542
543 return new range(*named_char_classes[name]);
544 }
545
546 range *
547 stapregex_getrange (cursor& cur)
548 {
549 rchar c = cur.peek ();
550
551 if (c == '\\')
552 {
553 // Grab escaped char regardless of what it is.
554 cur.next (); c = cur.peek (); cur.next ();
555 }
556 else if (c == '[')
557 {
558 // Check for '[:' digraph.
559 rchar old_c = c; cur.next (); c = cur.peek ();
560
561 if (c == ':')
562 {
563 cur.next (); c = cur.peek (); // skip ':'
564 string charclass;
565
566 for (;;)
567 {
568 if (cur.finished)
569 throw regex_error (_F("unclosed character class '[:%s'", charclass.c_str()), cur.pos);
570
571 if (cur.has(2) && c == ':' && (*cur.input)[cur.pos] == ']')
572 {
573 cur.next (); cur.next (); // skip ':]'
574 return named_char_class(charclass);
575 }
576
577 charclass.push_back(c); cur.next(); c = cur.peek();
578 }
579 }
580 else
581 {
582 // Backtrack; fall through to processing c.
583 c = old_c;
584 }
585 }
586 else
587 cur.next ();
588
589 rchar lb = c, ub;
590
591 if (!cur.has(2) || cur.peek () != '-' || (*cur.input)[cur.pos] == ']')
592 {
593 ub = lb;
594 }
595 else
596 {
597 cur.next (); // skip '-'
598 ub = cur.peek ();
599
600 if (ub < lb)
601 throw regex_error (_F("Inverted character range %c-%c", lb, ub), cur.pos);
602
603 cur.next ();
604 }
605
606 return new range(lb, ub);
607 }
608
609 };
610
611 /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */
This page took 0.065229 seconds and 5 git commands to generate.