]>
Commit | Line | Data |
---|---|---|
07777235 | 1 | // -*- C++ -*- |
73fcca6f | 2 | // Copyright (C) 2012-2018 Red Hat Inc. |
07777235 SM |
3 | // |
4 | // This file is part of systemtap, and is free software. You can | |
5 | // redistribute it and/or modify it under the terms of the GNU General | |
6 | // Public License (GPL); either version 2, or (at your option) any | |
7 | // later version. | |
8 | // | |
9 | // --- | |
10 | // | |
11 | // This file incorporates code from the re2c project; please see | |
12 | // the file README.stapregex for details. | |
13 | ||
14 | #include <string> | |
15 | #include <iostream> | |
54f92ee4 | 16 | #include <sstream> |
07777235 SM |
17 | #include <set> |
18 | #include <list> | |
19 | #include <map> | |
20 | #include <vector> | |
8873c23d | 21 | #include <stack> |
07777235 SM |
22 | #include <queue> |
23 | #include <utility> | |
8873c23d | 24 | #include <climits> |
07777235 SM |
25 | |
26 | #include "translator-output.h" | |
64e5e3cc | 27 | #include "util.h" |
07777235 SM |
28 | |
29 | #include "stapregex-parse.h" | |
30 | #include "stapregex-tree.h" | |
31 | #include "stapregex-dfa.h" | |
32 | ||
54f92ee4 SM |
33 | // Uncomment to show result of ins (NFA) compilation: |
34 | //#define STAPREGEX_DEBUG_INS | |
8873c23d | 35 | // Uncomment to emit DFA in a non-working compact format (use with -p3): |
54f92ee4 | 36 | //#define STAPREGEX_DEBUG_DFA |
20d6e21f SM |
37 | // Uncomment to have the generated engine do a trace of visited states |
38 | // (only when testing using the standalone regtest module): | |
112e9e2b | 39 | //#define STAPREGEX_DEBUG_MATCH |
54f92ee4 | 40 | |
8873c23d SM |
41 | // Uncomment for a detailed walkthrough of the tagged-NFA conversion: |
42 | //#define STAPREGEX_DEBUG_TNFA | |
43 | ||
07777235 SM |
44 | using namespace std; |
45 | ||
46 | namespace stapregex { | |
47 | ||
48 | regexp *pad_re = NULL; | |
49 | regexp *fail_re = NULL; | |
50 | ||
51 | dfa * | |
52 | stapregex_compile (regexp *re, const std::string& match_snippet, | |
53 | const std::string& fail_snippet) | |
54 | { | |
55 | if (pad_re == NULL) { | |
56 | // build regexp for ".*" | |
57 | pad_re = make_dot (); | |
248f3856 SM |
58 | pad_re = new close_op (pad_re, true); // -- prefer shorter match |
59 | pad_re = new alt_op (pad_re, new null_op, true); // -- prefer second match | |
07777235 SM |
60 | } |
61 | if (fail_re == NULL) { | |
9f232afb | 62 | // build regexp for ".*$", but allow '\0' and support fail outcome |
9b7e0f3d | 63 | fail_re = make_dot (true); // -- allow '\0' |
248f3856 SM |
64 | fail_re = new close_op (fail_re, true); // -- prefer shorter match |
65 | fail_re = new alt_op (fail_re, new null_op, true); // -- prefer second match | |
9f232afb | 66 | fail_re = new cat_op (fail_re, new anchor_op('$')); |
248f3856 | 67 | fail_re = new rule_op(fail_re, 0); |
9f232afb SM |
68 | // XXX: this approach creates one extra spurious-but-safe state |
69 | // (safe because the matching procedure stops after encountering '\0') | |
07777235 SM |
70 | } |
71 | ||
72 | vector<string> outcomes(2); | |
73 | outcomes[0] = fail_snippet; | |
74 | outcomes[1] = match_snippet; | |
75 | ||
76 | int num_tags = re->num_tags; | |
77 | ||
78 | // Pad & wrap re in appropriate rule_ops to control match behaviour: | |
8d2eb09e SM |
79 | bool anchored = re->anchored (); |
80 | if (!anchored) re = new cat_op(pad_re, re); // -- left-padding | |
07777235 SM |
81 | re = new rule_op(re, 1); |
82 | re = new alt_op(re, fail_re); | |
83 | ||
248f3856 SM |
84 | #ifdef STAPREGEX_DEBUG_INS |
85 | cerr << "RESULTING INS FROM REGEX " << re << ":" << endl; | |
86 | #endif | |
87 | ||
07777235 | 88 | ins *i = re->compile(); |
248f3856 SM |
89 | |
90 | #ifdef STAPREGEX_DEBUG_INS | |
223522d4 | 91 | for (const ins *j = i; (j - i) < (int)re->ins_size() + 1; ) |
248f3856 SM |
92 | { |
93 | j = show_ins(cerr, j, i); cerr << endl; | |
94 | } | |
95 | cerr << endl; | |
96 | #endif | |
97 | ||
20d6e21f | 98 | ins_optimize(i); |
223522d4 | 99 | for (ins *j = i; (j - i) < (int)re->ins_size() + 1; ) |
20d6e21f SM |
100 | { |
101 | unmark(j); | |
102 | if (j->i.tag == CHAR) | |
103 | j = (ins *) j->i.link; | |
104 | else | |
105 | j++; | |
106 | } | |
107 | ||
108 | #ifdef STAPREGEX_DEBUG_INS | |
109 | cerr << "OPTIMIZED INS FROM THE SAME REGEX" << endl; | |
223522d4 | 110 | for (const ins *j = i; (j - i) < (int)re->ins_size() + 1; ) |
20d6e21f SM |
111 | { |
112 | j = show_ins(cerr, j, i); cerr << endl; | |
113 | } | |
114 | cerr << endl; | |
115 | #endif | |
248f3856 | 116 | |
07777235 SM |
117 | dfa *d = new dfa(i, num_tags, outcomes); |
118 | ||
8d2eb09e SM |
119 | // Carefully deallocate temporary scaffolding: |
120 | if (!anchored) delete ((rule_op*) ((alt_op*) re)->a)->re; // -- new cat_op | |
07777235 SM |
121 | delete ((alt_op*) re)->a; // -- new rule_op |
122 | delete re; // -- new alt_op | |
123 | // NB: deleting a regular expression DOES NOT deallocate its | |
124 | // children. The original re parameter is presumed to be retained | |
125 | // indefinitely as part of a stapdfa table, or such.... | |
126 | ||
127 | return d; | |
128 | } | |
129 | ||
130 | // ------------------------------------------------------------------------ | |
131 | ||
132 | /* Now follows the heart of the tagged-DFA algorithm. This is a basic | |
133 | implementation of the algorithm described in Ville Laurikari's | |
134 | Masters thesis and summarized in the paper "NFAs with Tagged | |
135 | Transitions, their Conversion to Deterministic Automata and | |
136 | Application to Regular Expressions" | |
137 | (http://laurikari.net/ville/spire2000-tnfa.pdf). | |
138 | ||
139 | HERE BE DRAGONS (and not the friendly kind) */ | |
140 | ||
07777235 SM |
141 | /* Functions to deal with relative transition priorities: */ |
142 | ||
143 | arc_priority | |
144 | refine_higher(const arc_priority& a) | |
145 | { | |
64e5e3cc SM |
146 | if (a.first > ULLONG_MAX/4) // detect overflow |
147 | throw regex_error(_("arc_priority overflow due to excessive branching factor"), 0); | |
07777235 SM |
148 | return make_pair(2 * a.first + 1, a.second + 1); |
149 | } | |
150 | ||
151 | arc_priority | |
152 | refine_lower (const arc_priority& a) | |
153 | { | |
64e5e3cc SM |
154 | if (a.first > ULLONG_MAX/4) // detect overflow |
155 | throw regex_error(_("arc_priority overflow due to excessive branching factor"), 0); | |
07777235 SM |
156 | return make_pair(2 * a.first, a.second + 1); |
157 | } | |
158 | ||
159 | int | |
160 | arc_compare (const arc_priority& a, const arc_priority& b) | |
161 | { | |
162 | unsigned long x = a.first; | |
163 | unsigned long y = b.first; | |
164 | ||
165 | if (a.second > b.second) | |
8873c23d | 166 | y = y << (a.second - b.second); |
07777235 | 167 | else if (a.second < b.second) |
8873c23d SM |
168 | x = x << (b.second - a.second); |
169 | ||
170 | // Special case: 0/n </> 0/m iff m </> n. | |
171 | // This is because such priorities are obtained by refine_lower(). | |
172 | if (x == 0 && y == 0) | |
173 | return ( a.second == b.second ? 0 : a.second < b.second ? 1 : -1 ); | |
07777235 SM |
174 | |
175 | return ( x == y ? 0 : x < y ? -1 : 1 ); | |
176 | } | |
177 | ||
178 | /* Manage the linked list of states in a DFA: */ | |
179 | ||
8873c23d SM |
180 | state::state (dfa *owner, state_kernel *kernel) |
181 | : owner(owner), label(~0), next(NULL), kernel(kernel), | |
182 | accepts(false), accept_outcome(0) {} | |
183 | ||
184 | void | |
185 | dfa::add_map_item (const map_item &m) | |
186 | { | |
187 | // TODOXXX: later, compute a mapping into a single-level tag_states array | |
188 | // TODOXXX: could drop the +1 and instead subtract 1 in YYTAG macro | |
189 | nmapitems = max(nmapitems, m.second) + 1; | |
190 | } | |
07777235 SM |
191 | |
192 | state * | |
193 | dfa::add_state (state *s) | |
194 | { | |
195 | s->label = nstates++; | |
8873c23d | 196 | |
07777235 SM |
197 | if (last == NULL) |
198 | { | |
199 | last = s; | |
200 | first = last; | |
201 | } | |
202 | else | |
203 | { | |
204 | // append to the end | |
205 | last->next = s; | |
206 | last = last->next; | |
207 | } | |
208 | ||
209 | return last; | |
210 | } | |
211 | ||
212 | /* Operations to build a simple kernel prior to taking closure: */ | |
213 | ||
8873c23d | 214 | /* Create a new kernel_point in kernel with empty map items. */ |
9b7e0f3d SM |
215 | void |
216 | add_kernel (state_kernel *kernel, ins *i) | |
07777235 SM |
217 | { |
218 | kernel_point point; | |
219 | point.i = i; | |
3c18de01 | 220 | point.priority = MAKE_START_PRIORITY; |
07777235 SM |
221 | // NB: point->map_items is empty |
222 | ||
223 | kernel->push_back(point); | |
224 | } | |
225 | ||
226 | state_kernel * | |
227 | make_kernel (ins *i) | |
228 | { | |
229 | state_kernel *kernel = new state_kernel; | |
230 | add_kernel (kernel, i); | |
231 | return kernel; | |
232 | } | |
233 | ||
3c18de01 SM |
234 | struct sort_priorities { |
235 | bool operator ()(const arc_priority &a, const arc_priority &b) | |
236 | { | |
237 | return arc_compare(a,b) > 0; | |
238 | } | |
239 | }; | |
240 | ||
241 | struct sort_denominator { | |
242 | bool operator ()(const arc_priority &a, const arc_priority &b) | |
243 | { | |
244 | return a.second > b.second; | |
245 | } | |
246 | }; | |
247 | ||
248 | struct sort_kernel_points { | |
249 | bool operator ()(const kernel_point &k, const kernel_point &l) | |
250 | { | |
251 | return arc_compare(k.priority, l.priority) > 0; | |
252 | } | |
253 | }; | |
254 | ||
255 | // Move points from worklist to new_worklist while rebalancing priorities: | |
256 | void | |
257 | rebalance_priorities(stack<kernel_point> &worklist, stack<kernel_point> &new_worklist) | |
258 | { | |
259 | // Sort worklist in order of priority: | |
260 | priority_queue<kernel_point, vector<kernel_point>, sort_kernel_points> sorted_worklist; | |
261 | while (!worklist.empty()) | |
262 | { | |
263 | kernel_point point = worklist.top(); worklist.pop(); | |
264 | sorted_worklist.push(point); | |
265 | } | |
266 | ||
267 | // Generate a 'clean' set of priorities: | |
268 | priority_queue<arc_priority, vector<arc_priority>, sort_priorities> sorted_priorities; | |
269 | priority_queue<arc_priority, vector<arc_priority>, sort_denominator> new_priorities; | |
270 | new_priorities.push(MAKE_START_PRIORITY); | |
271 | while (new_priorities.size() < sorted_worklist.size()) | |
272 | { | |
273 | arc_priority new_priority = new_priorities.top(); new_priorities.pop(); | |
274 | new_priorities.push(refine_higher(new_priority)); | |
275 | new_priorities.push(refine_lower(new_priority)); | |
276 | } | |
277 | for (unsigned i = 0; i < sorted_worklist.size(); i++) | |
278 | { | |
279 | arc_priority new_priority = new_priorities.top(); new_priorities.pop(); | |
280 | sorted_priorities.push(new_priority); | |
281 | } | |
282 | ||
283 | while (!sorted_worklist.empty()) | |
284 | { | |
285 | kernel_point point = sorted_worklist.top(); sorted_worklist.pop(); | |
286 | arc_priority new_priority = sorted_priorities.top(); sorted_priorities.pop(); | |
287 | point.priority = new_priority; | |
288 | new_worklist.push(point); | |
289 | } | |
290 | } | |
291 | ||
07777235 SM |
292 | /* Compute the set of kernel_points that are 'tag-wise unambiguously |
293 | reachable' from a given initial set of points. Absent tagging, this | |
294 | becomes a bog-standard NFA e_closure construction. */ | |
295 | state_kernel * | |
8873c23d | 296 | te_closure (dfa *dfa, state_kernel *start, int ntags, bool is_initial = false) |
07777235 SM |
297 | { |
298 | state_kernel *closure = new state_kernel(*start); | |
3c18de01 SM |
299 | stack<kernel_point> base_worklist; // -- with old priorities |
300 | stack<kernel_point> worklist; // -- with rebalanced priorities | |
8873c23d SM |
301 | // XXX: state_kernel is a list<kernel_point> so we avoid iterator |
302 | // invalidation and make a new copy of each kernel_point from start | |
07777235 SM |
303 | |
304 | /* To avoid searching through closure incessantly when retrieving | |
305 | information about existing elements, the following caches are | |
306 | needed: */ | |
307 | vector<unsigned> max_tags (ntags, 0); | |
8873c23d | 308 | map<ins *, list<list<kernel_point>::iterator> > closure_map; |
07777235 | 309 | |
8873c23d | 310 | /* Cache initial elements of closure: */ |
07777235 SM |
311 | for (state_kernel::iterator it = closure->begin(); |
312 | it != closure->end(); it++) | |
313 | { | |
8873c23d SM |
314 | #if 0 |
315 | cerr << "**DEBUG** initial closure point "; | |
316 | it->print(cerr, dfa->orig_nfa); | |
317 | cerr << endl; | |
318 | #endif | |
319 | ||
3c18de01 | 320 | base_worklist.push(*it); // -- push with existing priority, rebalance later |
07777235 SM |
321 | |
322 | // Store the element in relevant caches: | |
323 | ||
324 | for (list<map_item>::const_iterator jt = it->map_items.begin(); | |
325 | jt != it->map_items.end(); jt++) | |
326 | max_tags[jt->first] = max(jt->second, max_tags[jt->first]); | |
327 | ||
8873c23d | 328 | closure_map[it->i].push_back(it); |
07777235 SM |
329 | } |
330 | ||
3c18de01 SM |
331 | // PR23608: Retaining the priority from the previous state has the |
332 | // potential to overflow the arc_priority representation with large | |
333 | // numbers when there are many distinct DFA states. This should | |
334 | // cause an explicit assertion failure if it occurs in practice (see | |
335 | // refine_*()), e.g. with long non-branching regexes such as | |
336 | // "aaaa...aaaaa". | |
337 | // | |
338 | // Fixed by adding an explicit step to rebalance priorities: | |
339 | rebalance_priorities(base_worklist, worklist); | |
340 | ||
07777235 SM |
341 | while (!worklist.empty()) |
342 | { | |
8873c23d | 343 | kernel_point point = worklist.top(); worklist.pop(); |
07777235 SM |
344 | |
345 | // Identify e-transitions depending on the opcode. | |
346 | // There are at most two e-transitions emerging from an insn. | |
347 | // If we have two e-transitions, the 'other' has higher priority. | |
348 | ||
349 | ins *target = NULL; int tag = -1; | |
350 | ins *other_target = NULL; int other_tag = -1; | |
351 | ||
352 | bool do_split = false; | |
353 | ||
354 | if (point.i->i.tag == TAG) | |
355 | { | |
356 | target = &point.i[1]; | |
357 | tag = (int) point.i->i.param; | |
358 | } | |
179688ce SM |
359 | else if (point.i->i.tag == FORK && point.i == (ins *) point.i->i.link) |
360 | { | |
361 | /* Workaround for a FORK that points to itself: */ | |
362 | target = &point.i[1]; | |
363 | } | |
07777235 SM |
364 | else if (point.i->i.tag == FORK) |
365 | { | |
366 | do_split = true; | |
367 | // Relative priority of two e-transitions depends on param: | |
368 | if (point.i->i.param) | |
369 | { | |
370 | // Prefer jumping to link. | |
371 | target = &point.i[1]; | |
372 | other_target = (ins *) point.i->i.link; | |
373 | } | |
374 | else | |
375 | { | |
376 | // Prefer stepping to next instruction. | |
377 | target = (ins *) point.i->i.link; | |
378 | other_target = &point.i[1]; | |
379 | } | |
380 | } | |
381 | else if (point.i->i.tag == GOTO) | |
382 | { | |
383 | target = (ins *) point.i->i.link; | |
384 | } | |
385 | else if (point.i->i.tag == INIT && is_initial) | |
386 | { | |
387 | target = &point.i[1]; | |
388 | } | |
389 | ||
179688ce SM |
390 | bool already_found; |
391 | ||
07777235 SM |
392 | // Data for the endpoint of the first transition: |
393 | kernel_point next; | |
394 | next.i = target; | |
395 | next.priority = do_split ? refine_lower(point.priority) : point.priority; | |
396 | next.map_items = point.map_items; | |
397 | ||
9b7e0f3d SM |
398 | // Date for the endpoint of the second transition: |
399 | kernel_point other_next; | |
400 | other_next.i = other_target; | |
401 | other_next.priority = do_split ? refine_higher(point.priority) : point.priority; | |
402 | other_next.map_items = point.map_items; | |
403 | ||
179688ce SM |
404 | // Do infinite-loop-check: |
405 | other_next.parents = point.parents; | |
406 | if (point.parents.find(other_next.i) != point.parents.end()) | |
407 | { | |
408 | other_target = NULL; | |
409 | other_tag = -1; | |
410 | } | |
411 | other_next.parents.insert(other_next.i); | |
412 | ||
413 | next.parents = point.parents; | |
414 | if (point.parents.find(next.i) != point.parents.end()) | |
415 | { | |
c92d3b42 FCE |
416 | // target = NULL; |
417 | // tag = -1; | |
8873c23d | 418 | // <- XXX will be overwritten by other_target / other_tag immediately |
179688ce SM |
419 | goto next_target; |
420 | } | |
421 | next.parents.insert(next.i); | |
422 | ||
07777235 | 423 | another_transition: |
9b7e0f3d SM |
424 | if (target == NULL) |
425 | continue; | |
07777235 SM |
426 | |
427 | // Deal with the current e-transition: | |
428 | ||
429 | if (tag >= 0) | |
430 | { | |
431 | /* Delete all existing next.map_items of the form m[tag,x]. */ | |
432 | for (list<map_item>::iterator it = next.map_items.begin(); | |
a6be9455 | 433 | it != next.map_items.end(); ) |
07777235 | 434 | if (it->first == (unsigned) tag) |
a6be9455 SM |
435 | { |
436 | list<map_item>::iterator next_it = it; | |
437 | next_it++; | |
438 | next.map_items.erase (it); | |
439 | it = next_it; | |
440 | } | |
441 | else it++; | |
07777235 SM |
442 | |
443 | /* Add m[tag,x] to next.map_items, where x is the smallest | |
444 | nonnegative integer such that m[tag,x] does not occur | |
445 | anywhere in closure. Then update the cache. */ | |
446 | unsigned x = max_tags[tag]; | |
447 | next.map_items.push_back(make_pair(tag, ++x)); | |
448 | max_tags[tag] = x; | |
449 | } | |
450 | ||
8873c23d | 451 | /* Deal with similar transitions that have a different priority: */ |
179688ce | 452 | already_found = false; |
8873c23d SM |
453 | for (list<list<kernel_point>::iterator>::iterator it |
454 | = closure_map[next.i].begin(); | |
248f3856 | 455 | it != closure_map[next.i].end(); ) |
07777235 | 456 | { |
8873c23d SM |
457 | // NB: it is an iterator into closure_map[next.i], |
458 | // while *it is an iterator into closure | |
459 | ||
460 | int result = arc_compare(next.priority, (*it)->priority); | |
461 | if (result == 0) | |
462 | { | |
463 | ins *base = dfa->orig_nfa; | |
464 | cerr << "stapregex **UNEXPECTED** -- identical arc_priorities for "; | |
465 | (*it)->print(cerr, base); | |
466 | cerr << " and "; | |
467 | next.print(cerr, base); | |
468 | cerr << endl; | |
469 | } | |
470 | #if 0 | |
471 | // XXX This is an experimental solution which did not work correctly. | |
472 | if (result == 0 && (*it)->i == next.i) | |
473 | { | |
474 | // Reached the same kernel_point via two alternate | |
475 | // (equal priority) paths. Merge map_items from next into *it: | |
476 | cerr << "**DEBUG** (merging paths for same ins)" << endl; | |
477 | for (list<map_item>::iterator jt = next.map_items.begin(); | |
478 | jt != next.map_items.end(); jt++) | |
479 | (*it)->map_items.push_back(*jt); | |
480 | } | |
481 | else | |
482 | #endif | |
483 | assert (result != 0); // Expect this to fail. | |
484 | ||
485 | if (result > 0) { // i.e. next.priority > (*it)->priority | |
486 | #if 0 | |
487 | ins *base = dfa->orig_nfa; | |
488 | cerr << "**DEBUG** erasing "; | |
489 | (*it)->print(cerr, base); | |
490 | cerr << " in favour of "; | |
491 | next.print(cerr, base); | |
492 | cerr << endl; | |
493 | #endif | |
494 | ||
495 | // next.priority is higher, delete existing element | |
496 | closure->erase(*it); | |
179688ce | 497 | |
248f3856 | 498 | // obnoxious shuffle to avoid iterator invalidation |
8873c23d | 499 | list<list<kernel_point>::iterator>::iterator old_it = it; |
248f3856 SM |
500 | it++; |
501 | closure_map[next.i].erase(old_it); | |
502 | continue; | |
8873c23d SM |
503 | } else { // result <= 0 |
504 | // next.priority is lower, skip adding next | |
07777235 SM |
505 | already_found = true; |
506 | } | |
8873c23d | 507 | |
248f3856 | 508 | it++; |
07777235 SM |
509 | } |
510 | ||
511 | if (!already_found) { | |
8873c23d SM |
512 | #if 0 |
513 | cerr << "**DEBUG** added to closure: "; | |
514 | next.print(cerr, dfa->orig_nfa); | |
515 | cerr << endl; | |
516 | #endif | |
517 | ||
518 | // Store the element in closure: | |
519 | closure->push_back(next); | |
520 | worklist.push(next); | |
521 | ||
07777235 SM |
522 | // Store the element in relevant caches: |
523 | ||
8873c23d SM |
524 | list<kernel_point>::iterator next_it = closure->end(); |
525 | next_it --; // XXX rewind to just-pushed element | |
526 | closure_map[next.i].push_back(next_it); | |
248f3856 | 527 | |
07777235 SM |
528 | for (list<map_item>::iterator jt = next.map_items.begin(); |
529 | jt != next.map_items.end(); jt++) | |
530 | max_tags[jt->first] = max(jt->second, max_tags[jt->first]); | |
531 | ||
07777235 SM |
532 | } |
533 | ||
179688ce | 534 | next_target: |
07777235 | 535 | // Now move to dealing with the second e-transition, if any. |
07777235 SM |
536 | target = other_target; other_target = NULL; |
537 | tag = other_tag; other_tag = -1; | |
9b7e0f3d | 538 | next = other_next; |
07777235 SM |
539 | |
540 | goto another_transition; | |
541 | } | |
542 | ||
543 | return closure; | |
544 | } | |
545 | ||
8873c23d SM |
546 | /* Helpers for constructing span table: */ |
547 | ||
548 | bool | |
549 | same_ins(list<kernel_point> &e1, list<kernel_point> &e2) | |
550 | { | |
551 | set<ins *> s1; | |
552 | for (list<kernel_point>::iterator it = e1.begin(); | |
553 | it != e1.end(); it++) | |
554 | s1.insert(it->i); | |
555 | set<ins *> s2; | |
556 | for (list<kernel_point>::iterator it = e2.begin(); | |
557 | it != e2.end(); it++) | |
558 | s2.insert(it->i); | |
559 | return s1 == s2; | |
560 | } | |
561 | ||
562 | /* Helpers for constructing TDFA actions: */ | |
563 | ||
07777235 SM |
564 | /* Find the set of reordering commands (if any) that will get us from |
565 | state s to some existing state in the dfa (returns the state in | |
566 | question, appends reordering commands to r). Returns NULL is no | |
567 | suitable state is found. */ | |
568 | state * | |
8873c23d | 569 | dfa::find_equivalent (state *s, tdfa_action &action) |
07777235 SM |
570 | { |
571 | state *answer = NULL; | |
572 | ||
573 | for (state_kernel::iterator it = s->kernel->begin(); | |
574 | it != s->kernel->end(); it++) | |
9b7e0f3d | 575 | mark(it->i); |
8d2eb09e | 576 | |
07777235 SM |
577 | /* Check kernels of existing states for size equivalence and for |
578 | unmarked items (similar to re2c's original algorithm): */ | |
579 | unsigned n = s->kernel->size(); | |
8873c23d SM |
580 | map<map_item, map_item> shift_map; |
581 | map<map_item, map_item> shift_back; | |
07777235 SM |
582 | for (state *t = first; t != NULL; t = t->next) |
583 | { | |
584 | if (t->kernel->size() == n) | |
585 | { | |
586 | for (state_kernel::iterator it = t->kernel->begin(); | |
587 | it != t->kernel->end(); it++) | |
8d2eb09e SM |
588 | if (!marked(it->i)) |
589 | goto next_state; | |
07777235 | 590 | |
8873c23d SM |
591 | // Check for existence of a reordering tdfa_action r that will |
592 | // produce identical kernel_points with identical map values. | |
593 | ||
594 | // XXX In the below code, we search for more-or-less an | |
595 | // arbitrary permutation of map values. | |
596 | // | |
597 | // To simplify the algorithm, we could instead only check | |
598 | // where lower-index map values are missing from s and | |
599 | // replace them with higher-index map values. The paper | |
600 | // claims this leads to only a slight penalty in number of | |
601 | // TDFA states. | |
602 | ||
603 | // Mapping must be one-to-one; check consistency in both directions: | |
604 | shift_map.clear(); // map item of s -> map item of t | |
605 | shift_back.clear(); // map item of t -> map item of s | |
606 | ||
607 | for (state_kernel::iterator it = s->kernel->begin(); | |
608 | it != s->kernel->end(); it++) | |
609 | { | |
610 | kernel_point *kp1 = &*it; | |
2a0e91d6 | 611 | kernel_point *kp2 = 0; |
8873c23d SM |
612 | |
613 | // Find matching kernel_point in t: | |
614 | bool found_kp = false; | |
615 | for (state_kernel::iterator jt = t->kernel->begin(); | |
616 | jt != t->kernel->end(); jt++) | |
617 | if (kp1->i == jt->i) | |
618 | { | |
619 | // XXX check that ins appears only once | |
620 | assert (!found_kp); | |
621 | kp2 = &*jt; // TODO found matching point | |
622 | found_kp = true; | |
623 | } | |
624 | assert(found_kp); | |
625 | ||
626 | set<int> seen_tags; | |
627 | for (list<map_item>::iterator jt = kp1->map_items.begin(); | |
628 | jt != kp1->map_items.end(); jt++) | |
629 | { | |
630 | map_item mt1 = *jt; | |
631 | map_item mt2; | |
632 | ||
633 | // XXX check that tag appears only once | |
634 | assert (seen_tags.count(mt1.first) == 0); | |
635 | seen_tags.insert(mt1.first); | |
636 | ||
637 | // Find matching map_item in kp2 | |
638 | bool found_tag = false; | |
639 | for (list<map_item>::iterator kt = kp2->map_items.begin(); | |
640 | kt != kp2->map_items.end(); kt++) | |
641 | if (mt1.first == kt->first) | |
642 | { | |
643 | // XXX check that tag appears only once | |
644 | assert (!found_tag); | |
645 | mt2 = *kt; | |
646 | found_tag = true; | |
647 | } | |
648 | ||
649 | if (!found_tag) // if no matching tag, can't use this state | |
650 | goto next_state; | |
651 | if (shift_map.count(mt1) != 0 | |
652 | && shift_map[mt1] != mt2) // if contradiction | |
653 | goto next_state; | |
654 | if (shift_back.count(mt2) != 0 | |
655 | && shift_back[mt2] != mt1) // if contradiction | |
656 | goto next_state; | |
657 | ||
658 | shift_map[mt1] = mt2; | |
659 | shift_back[mt2] = mt1; | |
660 | } | |
661 | ||
662 | // XXX check that every tag in kp2 appears in seen_tag | |
663 | for (list<map_item>::iterator jt = kp2->map_items.begin(); | |
664 | jt != kp2->map_items.end(); jt++) | |
665 | { | |
666 | int t2 = jt->first; | |
667 | if (seen_tags.count(t2) == 0) | |
668 | goto next_state; | |
669 | } | |
670 | } | |
671 | ||
672 | // #ifdef STAPREGEX_DEBUG_TNFA | |
673 | // cerr << " -*- PRE CYCLE CHECK DEBUG obtained valid reorder "; | |
674 | // for (map<map_item, map_item>::iterator it = shift_map.begin(); | |
675 | // it != shift_map.end(); it++) | |
676 | // if (it->first != it->second) | |
677 | // cerr << it->first << "=>" << it->second << " "; | |
678 | // cerr << "to existing state " << t->label << endl; | |
679 | // #endif | |
680 | ||
681 | #if 1 | |
682 | // Check for cyclical dependencies in the resulting reorder. | |
683 | // XXX: If we find a cycle, just create a new state. We could | |
684 | // also break the cycle with a temporary variable. | |
685 | set<map_item> cycle_okay; cycle_okay.clear(); | |
686 | set<map_item> cycle_seen; cycle_seen.clear(); | |
687 | for (map<map_item, map_item>::iterator it = shift_map.begin(); | |
688 | it != shift_map.end(); it++) | |
689 | { | |
690 | map_item m = it->first; | |
691 | if (cycle_okay.count(m) != 0) | |
692 | continue; // -- already checked for cycle | |
693 | ||
694 | while (shift_map.count(m) != 0 && shift_map[m] != m) | |
695 | { | |
696 | if (cycle_okay.count(shift_map[m]) != 0) | |
697 | break; // -- found not-cycle | |
698 | if (cycle_seen.count(shift_map[m]) != 0) | |
699 | goto next_state; // -- found cycle | |
700 | cycle_seen.insert(m); | |
701 | m = shift_map[m]; | |
702 | } | |
703 | ||
704 | // If we reach the end of the chain, or find a map item | |
705 | // where shift_map[m] == m, this is not considered a | |
706 | // cycle, and therefore none of the map items leading to | |
707 | // here are in cycles: | |
708 | cycle_okay.insert(m); | |
709 | for (set<map_item>::iterator jt = cycle_seen.begin(); | |
710 | jt != cycle_seen.end(); jt++) | |
711 | cycle_okay.insert(*jt); | |
712 | cycle_seen.clear(); | |
713 | } | |
714 | #endif | |
715 | ||
716 | #ifdef STAPREGEX_DEBUG_TNFA | |
717 | cerr << " -*- obtained valid reorder "; | |
718 | for (map<map_item, map_item>::iterator it = shift_map.begin(); | |
719 | it != shift_map.end(); it++) | |
720 | if (it->first != it->second) | |
721 | cerr << it->first << "=>" << it->second << " "; | |
722 | cerr << "to existing state " << t->label << endl; | |
723 | #endif | |
724 | ||
725 | // Generate reordering command based on the contents of shift_map: | |
726 | tdfa_action r; | |
727 | set<map_item> saved; saved.clear(); // <- elts safe to overwite | |
728 | queue<map_item> to_shift; | |
729 | for (map<map_item, map_item>::iterator it = shift_back.begin(); | |
730 | it != shift_back.end(); it++) | |
731 | if (it->first != it->second) // skip trivial shifts | |
732 | to_shift.push(it->first); | |
733 | while (!to_shift.empty()) | |
734 | { | |
735 | map_item elt = to_shift.front(); to_shift.pop(); | |
736 | if (shift_map.count(elt) != 0 && saved.count(elt) == 0) | |
737 | { | |
738 | // Need to save it first -- put back on queue: | |
739 | to_shift.push(elt); | |
740 | continue; | |
741 | } | |
742 | ||
743 | tdfa_insn insn; | |
744 | insn.to = elt; | |
745 | insn.from = shift_back[elt]; | |
746 | insn.save_tag = false; | |
747 | insn.save_pos = false; | |
748 | r.push_back(insn); | |
749 | ||
750 | // shift_back[elt] is now safe to overwrite | |
751 | saved.insert(shift_back[elt]); | |
752 | } | |
753 | ||
8d2eb09e | 754 | answer = t; |
8873c23d | 755 | action.insert(action.end(), r.begin(), r.end()); // XXX append |
8d2eb09e | 756 | goto cleanup; |
07777235 | 757 | } |
8d2eb09e SM |
758 | next_state: |
759 | ; | |
07777235 SM |
760 | } |
761 | ||
762 | cleanup: | |
763 | for (state_kernel::iterator it = s->kernel->begin(); | |
764 | it != s->kernel->end(); it++) | |
765 | unmark(it->i); | |
766 | ||
767 | return answer; | |
768 | } | |
769 | ||
8873c23d SM |
770 | /* Generate position-save commands for any map items in new_k that do |
771 | not appear in old_k (old_k can be NULL). */ | |
772 | tdfa_action | |
773 | dfa::compute_action (state_kernel *old_k, state_kernel *new_k) | |
774 | { | |
775 | tdfa_action c; | |
776 | ||
777 | set<map_item> old_items; | |
778 | if (old_k != NULL) | |
779 | for (state_kernel::const_iterator it = old_k->begin(); | |
780 | it != old_k->end(); it++) | |
781 | for (list<map_item>::const_iterator jt = it->map_items.begin(); | |
782 | jt != it->map_items.end(); jt++) | |
783 | old_items.insert(*jt); | |
784 | ||
785 | // XXX: use a set, since we only need one position-save per new map item | |
786 | set<map_item> store_items; | |
787 | for (state_kernel::const_iterator it = new_k->begin(); | |
788 | it != new_k->end(); it++) | |
789 | for (list<map_item>::const_iterator jt = it->map_items.begin(); | |
790 | jt != it->map_items.end(); jt++) | |
791 | if (old_items.find(*jt) == old_items.end()) | |
792 | store_items.insert(*jt); | |
793 | ||
794 | for (set<map_item>::iterator it = store_items.begin(); | |
795 | it != store_items.end(); it++) | |
796 | { | |
797 | // ensure room for m[i,n] is present in tag_states: | |
798 | add_map_item(*it); | |
799 | ||
800 | // append m[i,n] <- <curr position> to c | |
801 | tdfa_insn insn; | |
802 | insn.to = *it; | |
803 | insn.save_tag = false; | |
804 | insn.save_pos = true; | |
805 | c.push_back(insn); | |
806 | } | |
07777235 | 807 | |
8873c23d SM |
808 | return c; |
809 | } | |
810 | ||
811 | tdfa_action | |
812 | dfa::compute_finalizer (state *s) | |
813 | { | |
814 | // TODO VERIFY THAT THIS WORKS -- CAN THERE BE CONFLICTS? | |
815 | tdfa_action c; | |
816 | assert (s->accept_kp != NULL); | |
817 | ||
818 | // iterate map items m[i,j] | |
819 | for (list<map_item>::iterator it = s->accept_kp->map_items.begin(); | |
820 | it != s->accept_kp->map_items.end(); it++) | |
821 | { | |
822 | // append t[i] <- m[i,j] to c | |
823 | tdfa_insn insn; | |
824 | insn.from = *it; | |
825 | insn.save_tag = true; | |
826 | insn.save_pos = false; | |
827 | c.push_back(insn); | |
828 | } | |
829 | ||
830 | return c; | |
831 | } | |
832 | ||
833 | /* The main DFA-construction algorithm: */ | |
834 | ||
835 | dfa::dfa (ins *i, int ntags, vector<string>& outcome_snippets, | |
836 | int accept_outcome) | |
837 | : orig_nfa(i), nstates(0), nmapitems(0), ntags(ntags), | |
838 | outcome_snippets(outcome_snippets), success_outcome(accept_outcome) | |
07777235 | 839 | { |
8873c23d SM |
840 | #ifdef STAPREGEX_DEBUG_TNFA |
841 | cerr << "DFA CONSTRUCTION (ntags=" << ntags << "):" << endl; | |
842 | #endif | |
843 | ||
844 | // XXX: Longest-match priority requires one success and one failure outcome: | |
845 | if (ntags > 0) | |
846 | { | |
847 | assert(outcome_snippets.size() == 2); | |
848 | assert(success_outcome == 1); | |
849 | fail_outcome = 0; | |
850 | } | |
851 | ||
07777235 SM |
852 | /* Initialize empty linked list of states: */ |
853 | first = last = NULL; | |
854 | ||
855 | ins *start = &i[0]; | |
fb5669f0 | 856 | state_kernel *seed_kernel = make_kernel(start); |
8873c23d | 857 | state_kernel *initial_kernel = te_closure(this, seed_kernel, ntags, true); |
fb5669f0 | 858 | delete seed_kernel; |
8873c23d | 859 | state *initial = add_state(new state(this, initial_kernel)); |
248f3856 | 860 | queue<state *> worklist; worklist.push(initial); |
07777235 | 861 | |
8873c23d SM |
862 | initializer = compute_action(NULL, initial_kernel); |
863 | #ifdef STAPREGEX_DEBUG_TNFA | |
864 | cerr << " - constructed initializer " << initializer << endl << endl; | |
865 | #endif | |
866 | ||
07777235 SM |
867 | while (!worklist.empty()) |
868 | { | |
248f3856 | 869 | state *curr = worklist.front(); worklist.pop(); |
07777235 | 870 | |
8873c23d SM |
871 | // Kernel points before and after each edge: |
872 | vector<list<kernel_point> > edge_begin(NUM_REAL_CHARS); | |
873 | vector<list<kernel_point> > edge_end(NUM_REAL_CHARS); | |
07777235 SM |
874 | |
875 | /* Using the CHAR instructions in kernel, build the initial | |
876 | table of spans for curr. Also check for final states. */ | |
877 | ||
248f3856 SM |
878 | for (list<kernel_point>::iterator it = curr->kernel->begin(); |
879 | it != curr->kernel->end(); it++) | |
07777235 SM |
880 | { |
881 | if (it->i->i.tag == CHAR) | |
882 | { | |
8873c23d | 883 | // Add a new kernel_point for each targeted insn: |
07777235 | 884 | for (ins *j = &it->i[1]; j < (ins *) it->i->i.link; j++) |
8873c23d SM |
885 | { |
886 | // XXX: deallocate together with span table | |
887 | kernel_point point; | |
888 | point.i = (ins *) it->i->i.link; | |
889 | point.priority = it->priority; | |
890 | point.map_items = it->map_items; // copy map items | |
891 | ||
892 | edge_begin[j->c.value].push_back(*it); | |
893 | edge_end[j->c.value].push_back(point); | |
894 | } | |
07777235 SM |
895 | } |
896 | else if (it->i->i.tag == ACCEPT) | |
897 | { | |
8873c23d SM |
898 | /* In case of multiple accepting NFA states, |
899 | prefer the highest numbered outcome. | |
900 | ||
901 | XXX: A possible refinement (commented-out). | |
902 | In case of NFA states with identical outcomes | |
903 | pick the one with the highest arc_priority. */ | |
904 | if (!curr->accepts || it->i->i.param > curr->accept_outcome | |
905 | /* || arc_compare(it->priority, curr->accept_kp->priority) > 0 */) | |
906 | { | |
907 | curr->accept_kp = &*it; | |
908 | curr->accept_outcome = it->i->i.param; | |
909 | } | |
07777235 | 910 | curr->accepts = true; |
07777235 SM |
911 | } |
912 | } | |
913 | ||
8873c23d SM |
914 | /* If the state was marked as accepting, add a finalizer: */ |
915 | if (curr->accepts) | |
916 | { | |
917 | assert(curr->finalizer.empty()); // XXX: only process a state once | |
918 | curr->finalizer = compute_finalizer(curr); | |
919 | } | |
920 | ||
aaed700d | 921 | for (unsigned c = 0; c < NUM_REAL_CHARS; ) |
07777235 | 922 | { |
8873c23d SM |
923 | list<kernel_point> eb = edge_begin[c]; |
924 | list<kernel_point> ee = edge_end[c]; | |
925 | assert (!ee.empty()); // XXX: ensured by fail_re in stapregex_compile | |
07777235 SM |
926 | |
927 | span s; | |
928 | ||
929 | s.lb = c; | |
930 | ||
8873c23d | 931 | while (++c < NUM_REAL_CHARS && same_ins(edge_end[c], ee)) ; |
07777235 | 932 | |
248f3856 | 933 | s.ub = c - 1; |
07777235 SM |
934 | |
935 | s.reach_pairs = new state_kernel; | |
8873c23d | 936 | s.jump_pairs = new state_kernel; |
07777235 | 937 | |
8873c23d SM |
938 | for (list<kernel_point>::iterator it = eb.begin(); |
939 | it != eb.end(); it++) | |
940 | s.jump_pairs->push_back(*it); | |
941 | for (list<kernel_point>::iterator it = ee.begin(); | |
942 | it != ee.end(); it++) | |
943 | s.reach_pairs->push_back(*it); | |
07777235 SM |
944 | |
945 | curr->spans.push_back(s); | |
946 | } | |
947 | ||
948 | /* For each of the spans in curr, determine the reachable | |
949 | points assuming a character in the span. */ | |
8873c23d SM |
950 | #ifdef STAPREGEX_DEBUG_TNFA |
951 | cerr << "building transitions for state " << curr->label << ":" << endl; | |
952 | #endif | |
07777235 SM |
953 | for (list<span>::iterator it = curr->spans.begin(); |
954 | it != curr->spans.end(); it++) | |
955 | { | |
07777235 | 956 | /* Set up candidate target state: */ |
8873c23d SM |
957 | state_kernel *u_pairs = te_closure(this, it->reach_pairs, ntags); |
958 | state *target = new state(this, u_pairs); | |
07777235 SM |
959 | |
960 | /* Generate position-save commands for any map items | |
8873c23d SM |
961 | that do not appear in the edge: */ |
962 | tdfa_action c = compute_action(it->jump_pairs, u_pairs); | |
07777235 SM |
963 | |
964 | /* If there is a state t_prime in states such that some | |
965 | sequence of reordering commands r produces t_prime | |
966 | from target, use t_prime as the target state, | |
967 | appending the reordering commands to c. */ | |
968 | state *t_prime = find_equivalent(target, c); | |
9b7e0f3d | 969 | if (t_prime != NULL) |
248f3856 | 970 | { |
8873c23d | 971 | assert (t_prime != target); |
07777235 SM |
972 | delete target; |
973 | } | |
974 | else | |
975 | { | |
976 | /* We need to actually add target to the dfa: */ | |
977 | t_prime = target; | |
978 | add_state(t_prime); | |
248f3856 | 979 | worklist.push(t_prime); |
8873c23d SM |
980 | #ifdef STAPREGEX_DEBUG_TNFA |
981 | cerr << " -*- add new state " << t_prime->label << endl; | |
982 | #endif | |
07777235 SM |
983 | } |
984 | ||
985 | /* Set the transition: */ | |
986 | it->to = t_prime; | |
987 | it->action = c; | |
988 | } | |
8873c23d SM |
989 | #ifdef STAPREGEX_DEBUG_TNFA |
990 | cerr << " -> constructed " << curr << endl; | |
991 | #endif | |
07777235 | 992 | } |
8873c23d SM |
993 | #ifdef STAPREGEX_DEBUG_TNFA |
994 | cerr << endl; | |
995 | #endif | |
07777235 SM |
996 | } |
997 | ||
998 | dfa::~dfa () | |
999 | { | |
1000 | state * s; | |
1001 | while ((s = first)) | |
1002 | { | |
1003 | first = s->next; | |
1004 | delete s; | |
1005 | } | |
1006 | ||
1007 | delete orig_nfa; | |
1008 | } | |
1009 | ||
1010 | // ------------------------------------------------------------------------ | |
1011 | ||
54f92ee4 SM |
1012 | void |
1013 | span::emit_jump (translator_output *o, const dfa *d) const | |
1014 | { | |
112e9e2b | 1015 | #ifdef STAPREGEX_DEBUG_MATCH |
1cef69fd SM |
1016 | o->newline () << "_stp_printf(\" --> @%ld GOTO yystate%d\\n\", " |
1017 | << "YYLENGTH, " << to->label << ");"; | |
1018 | o->newline () << "_stp_print_flush();"; | |
112e9e2b | 1019 | #endif |
54f92ee4 | 1020 | |
54f92ee4 SM |
1021 | if (to->accepts) |
1022 | { | |
1023 | emit_final(o, d); | |
1cef69fd | 1024 | return; |
54f92ee4 | 1025 | } |
1cef69fd SM |
1026 | |
1027 | // We record map_items *after* consuming YYCURSOR: | |
1028 | o->newline () << "YYCURSOR++;"; | |
1029 | d->emit_action(o, action); | |
1030 | o->newline () << "goto yystate" << to->label << ";"; | |
54f92ee4 SM |
1031 | } |
1032 | ||
1cef69fd SM |
1033 | /* Assuming the target DFA state of the span is a final state, emit code to |
1034 | cleanup tags and (if appropriate) exit with a final answer. */ | |
54f92ee4 SM |
1035 | void |
1036 | span::emit_final (translator_output *o, const dfa *d) const | |
1037 | { | |
1038 | assert (to->accepts); // XXX: must guarantee correct usage of emit_final() | |
1cef69fd SM |
1039 | |
1040 | // We record map_items *after* consuming YYCURSOR: | |
1041 | o->newline () << "YYCURSOR++;"; | |
1042 | d->emit_action(o, action); | |
1043 | ||
1044 | // XXX: Note that condition to->finalizer.empty() is only | |
1045 | // appropriate for the two-outcome scheme with one outcome being a | |
1046 | // failure. | |
1047 | if (d->ntags == 0 || to->finalizer.empty()) // terminate immediately | |
1048 | { | |
1049 | d->emit_action(o, to->finalizer); | |
1050 | if (d->ntags == 0) | |
1051 | { | |
1052 | o->newline() << d->outcome_snippets[to->accept_outcome]; | |
1053 | o->newline() << "goto yyfinish;"; | |
1054 | } | |
1055 | else | |
1056 | { | |
1057 | // Need to return the outcome associated with the longest match: | |
1058 | o->newline() << "if ( YYFINAL(0) >= 0 ) {"; | |
1059 | o->newline(1) << d->outcome_snippets[d->success_outcome]; | |
1060 | o->newline(-1) << "} else {"; | |
1061 | o->newline(1) << d->outcome_snippets[d->fail_outcome]; | |
1062 | o->newline(-1) << "}"; | |
1063 | o->newline() << "goto yyfinish;"; | |
1064 | } | |
1065 | } | |
1066 | else | |
1067 | { | |
1068 | // Ensure longest-match priority by comparing length + start coord: | |
1069 | map_item new_tag_0; bool found = false; | |
1070 | for (tdfa_action::iterator it = to->finalizer.begin(); | |
1071 | it != to->finalizer.end(); it++) | |
1072 | // TODOXXX: Only works if finalizer only contains reordering commands | |
1073 | // (perhaps make that into an explicitly checked condition?) | |
1074 | if (it->save_tag && it->from.first == 0) | |
1075 | { | |
1076 | new_tag_0 = it->from; // the map_item saved to tag 0 | |
1077 | found = true; | |
1078 | } | |
1079 | assert(found); | |
1080 | #define NEW_TAG_0 "YYTAG(" << new_tag_0.first << "," << new_tag_0.second << ")" | |
1081 | // if (new_tag_0 == old_tag_0 && new_length > old_length) emit action; | |
1082 | o->newline() << "if ( YYFINAL(0) < 0 || " | |
1083 | << "(" << NEW_TAG_0 << " == YYFINAL(0) &&"; | |
1084 | o->newline() << " (YYLENGTH - " << NEW_TAG_0 << ")" | |
1085 | << " > (YYFINAL(1) - YYFINAL(0)))) {"; | |
1086 | o->newline(1); d->emit_action(o, to->finalizer); | |
1087 | o->indent(-1); | |
1088 | o->newline() << "}"; | |
1089 | ||
1090 | o->newline () << "goto yystate" << to->label << ";"; | |
1091 | } | |
54f92ee4 SM |
1092 | } |
1093 | ||
6c9b8d26 | 1094 | string c_char(rchar c) |
54f92ee4 SM |
1095 | { |
1096 | stringstream o; | |
1097 | o << "'"; | |
1098 | print_escaped(o, c); | |
1099 | o << "'"; | |
1100 | return o.str(); | |
1101 | } | |
1102 | ||
1103 | void | |
1104 | state::emit (translator_output *o, const dfa *d) const | |
1105 | { | |
1106 | o->newline() << "yystate" << label << ": "; | |
112e9e2b | 1107 | #ifdef STAPREGEX_DEBUG_MATCH |
1cef69fd SM |
1108 | o->newline () << "_stp_printf(\"@%ld READ '%s' %c\", " |
1109 | << "YYLENGTH, cur, *YYCURSOR);"; | |
1110 | o->newline () << "_stp_print_flush();"; | |
112e9e2b | 1111 | #endif |
54f92ee4 SM |
1112 | o->newline() << "switch (*YYCURSOR) {"; |
1113 | o->indent(1); | |
1cef69fd | 1114 | const span *default_span = NULL; |
54f92ee4 SM |
1115 | for (list<span>::const_iterator it = spans.begin(); |
1116 | it != spans.end(); it++) | |
1117 | { | |
1118 | // If we see a '\0', go immediately into an accept state: | |
1119 | if (it->lb == '\0') | |
1120 | { | |
1121 | o->newline() << "case " << c_char('\0') << ":"; | |
1cef69fd | 1122 | it->emit_final(o, d); |
54f92ee4 SM |
1123 | } |
1124 | ||
1125 | // Emit labels to handle all the other elements of the span: | |
6c9b8d26 SM |
1126 | for (unsigned c = max((rchar) '\1', it->lb); |
1127 | c <= (unsigned) it->ub; c++) { | |
1cef69fd SM |
1128 | if (c > 127) |
1129 | { | |
1130 | default_span = &(*it); | |
1131 | continue; // XXX: not an ASCII char, needs special handling | |
1132 | } | |
6c9b8d26 | 1133 | o->newline() << "case " << c_char((rchar) c) << ":"; |
54f92ee4 SM |
1134 | } |
1135 | it->emit_jump(o, d); | |
1136 | ||
1cef69fd SM |
1137 | // TODOXXX 'default' option should handle the largest span |
1138 | // TODOXXX optimize by accepting before end of string whenever possible | |
1139 | } | |
1140 | if (default_span) | |
1141 | { | |
1142 | // Handle a non-ASCII (unknown) char: | |
1143 | o->newline() << "default:"; | |
1144 | default_span->emit_jump(o, d); | |
54f92ee4 SM |
1145 | } |
1146 | o->newline(-1) << "}"; | |
1147 | } | |
1148 | ||
07777235 SM |
1149 | void |
1150 | dfa::emit (translator_output *o) const | |
1151 | { | |
54f92ee4 | 1152 | #ifdef STAPREGEX_DEBUG_DFA |
07777235 | 1153 | print(o); |
54f92ee4 SM |
1154 | #else |
1155 | o->newline() << "{"; | |
1156 | o->newline(1); | |
e5f8ae47 | 1157 | |
1cef69fd SM |
1158 | // Initialize tags: |
1159 | if (ntags > 0) | |
1160 | { | |
1161 | o->newline() << "unsigned int i;"; | |
b25b302a | 1162 | o->newline() << "for (i = 0; i < STAPREGEX_MAX_TAG; i++)"; |
1cef69fd SM |
1163 | o->newline(1) << "YYFINAL(i) = -1;"; |
1164 | o->indent(-1); | |
1165 | } | |
1166 | ||
1167 | emit_action(o, initializer); | |
1168 | ||
e5f8ae47 | 1169 | if (first->accepts) |
1cef69fd SM |
1170 | { |
1171 | emit_action(o, first->finalizer); | |
1172 | } | |
1173 | if (first->accepts && ntags == 0) // XXX workaround for empty regex | |
e5f8ae47 SM |
1174 | { |
1175 | o->newline() << outcome_snippets[first->accept_outcome]; | |
1176 | o->newline() << "goto yyfinish;"; | |
1177 | } | |
1178 | ||
54f92ee4 SM |
1179 | for (state *s = first; s; s = s->next) |
1180 | s->emit(o, this); | |
e5f8ae47 | 1181 | |
54f92ee4 SM |
1182 | o->newline() << "yyfinish: ;"; |
1183 | o->newline(-1) << "}"; | |
1184 | #endif | |
07777235 SM |
1185 | } |
1186 | ||
1187 | void | |
1cef69fd SM |
1188 | dfa::emit_action (translator_output *o, const tdfa_action &act) const |
1189 | { | |
1190 | #ifdef STAPREGEX_DEBUG_MATCH | |
1191 | o->newline () << "_stp_printf(\" --> @%ld, SET_TAG %s\\n\", " | |
1192 | << "YYLENGTH, \"" << act << "\");"; | |
1193 | o->newline () << "_stp_print_flush();"; | |
1194 | #endif | |
1195 | for (tdfa_action::const_iterator it = act.begin(); it != act.end(); it++) | |
1196 | { | |
1197 | if (it->save_tag) | |
1198 | o->newline() << "YYFINAL(" << it->from.first << ") = "; | |
1199 | else | |
1200 | o->newline() << "YYTAG(" << it->to.first | |
1201 | << "," << it->to.second << ") = "; | |
1202 | if (it->save_pos) | |
1203 | o->line() << "YYLENGTH"; | |
1204 | else | |
1205 | o->line() << "YYTAG(" << it->from.first | |
1206 | << "," << it->from.second << ")"; | |
1207 | o->line() << ";"; | |
1208 | } | |
1209 | } | |
1210 | ||
1211 | void | |
1212 | dfa::emit_tagsave (translator_output *o, std::string, | |
1213 | std::string, std::string num_final_tags) const | |
07777235 | 1214 | { |
1cef69fd SM |
1215 | // TODOXXX: ignoring other two snippets (tag_states and tag_vals), |
1216 | // which are handled by the earlier code in the actual matcher. | |
1217 | o->newline() << num_final_tags << " = " << ntags << ";"; | |
07777235 SM |
1218 | } |
1219 | ||
1220 | // ------------------------------------------------------------------------ | |
1221 | ||
8873c23d SM |
1222 | std::ostream& |
1223 | operator << (std::ostream &o, const map_item& m) | |
1224 | { | |
1225 | o << "m[" << m.first << "," << m.second << "]"; | |
1226 | return o; | |
1227 | } | |
1228 | ||
07777235 SM |
1229 | std::ostream& |
1230 | operator << (std::ostream &o, const tdfa_action& a) | |
1231 | { | |
1232 | for (list<tdfa_insn>::const_iterator it = a.begin(); | |
1233 | it != a.end(); it++) | |
1234 | { | |
1235 | if (it != a.begin()) o << "; "; | |
1236 | ||
8873c23d SM |
1237 | if (it->save_tag) |
1238 | o << "t[" << it->from.first << "] <- "; | |
1239 | else | |
1240 | o << it->to << " <- "; | |
07777235 SM |
1241 | |
1242 | if (it->save_pos) | |
1243 | o << "p"; | |
1244 | else | |
8873c23d | 1245 | o << it->from; |
07777235 SM |
1246 | } |
1247 | ||
1248 | return o; | |
1249 | } | |
1250 | ||
248f3856 SM |
1251 | std::ostream& |
1252 | operator << (std::ostream &o, const arc_priority& p) | |
1253 | { | |
1254 | o << p.first << "/" << (1 << p.second); | |
1255 | return o; | |
1256 | } | |
1257 | ||
8873c23d SM |
1258 | void |
1259 | kernel_point::print (std::ostream &o, ins *base) const | |
1260 | { | |
1261 | o << (i - base); | |
1262 | o << "[" << priority << "]"; | |
1263 | if (!map_items.empty()) | |
1264 | { | |
1265 | o << ":"; | |
1266 | for (list<map_item>::const_iterator it = map_items.begin(); | |
1267 | it != map_items.end(); it++) | |
1268 | { | |
1269 | if (it != map_items.begin()) o << ","; | |
1270 | o << *it; | |
1271 | } | |
1272 | } | |
1273 | } | |
1274 | ||
07777235 SM |
1275 | void |
1276 | state::print (translator_output *o) const | |
1277 | { | |
1278 | o->line() << "state " << label; | |
8873c23d SM |
1279 | |
1280 | #ifdef STAPREGEX_DEBUG_TNFA | |
1281 | // For debugging, also show the kernel: | |
1282 | ins *base = owner->orig_nfa; | |
1283 | o->line() << " w/kernel {"; | |
1284 | for (state_kernel::iterator it = kernel->begin(); | |
1285 | it != kernel->end(); it++) | |
1286 | { | |
1287 | if (it != kernel->begin()) o->line() << "; "; | |
1288 | it->print(o->line(), base); | |
1289 | } | |
1290 | o->line() << "}"; | |
1291 | ||
1292 | // Also print information for constructing reorderings: | |
1293 | set<map_item> all_items; | |
1294 | for (state_kernel::iterator it = kernel->begin(); | |
1295 | it != kernel->end(); it++) | |
1296 | for (list<map_item>::iterator jt = it->map_items.begin(); | |
1297 | jt != it->map_items.end(); jt++) | |
1298 | all_items.insert(*jt); | |
1299 | if (!all_items.empty()) | |
1300 | { | |
1301 | o->newline() << " "; | |
1302 | o->line() << " with map_items "; | |
1303 | for (set<map_item>::iterator it = all_items.begin(); | |
1304 | it != all_items.end(); it++) | |
1305 | o->line() << *it << " "; | |
1306 | } | |
1307 | ||
1308 | if (accepts || !finalizer.empty()) | |
1309 | o->newline() << " "; | |
1310 | #endif | |
1311 | ||
07777235 SM |
1312 | if (accepts) |
1313 | o->line() << " accepts " << accept_outcome; | |
1314 | if (!finalizer.empty()) | |
8873c23d | 1315 | o->line() << " with finalizer {" << finalizer << "}"; |
07777235 | 1316 | |
8873c23d | 1317 | // TODOXXX: factor this out to span::print() |
07777235 SM |
1318 | o->indent(1); |
1319 | for (list<span>::const_iterator it = spans.begin(); | |
1320 | it != spans.end(); it++) | |
1321 | { | |
248f3856 | 1322 | o->newline() << "'"; |
07777235 | 1323 | if (it->lb == it->ub) |
248f3856 SM |
1324 | { |
1325 | print_escaped (o->line(), it->lb); | |
1326 | o->line() << " "; | |
1327 | } | |
07777235 | 1328 | else |
248f3856 SM |
1329 | { |
1330 | print_escaped (o->line(), it->lb); | |
1331 | o->line() << "-"; | |
1332 | print_escaped (o->line(), it->ub); | |
1333 | } | |
07777235 | 1334 | |
8873c23d SM |
1335 | if (it->to != NULL) |
1336 | o->line() << "' -> " << it->to->label; | |
1337 | else | |
1338 | o->line() << "' -> <none>"; | |
07777235 SM |
1339 | |
1340 | if (!it->action.empty()) | |
8873c23d | 1341 | o->line() << " {" << it->action << "}"; |
07777235 SM |
1342 | } |
1343 | o->newline(-1); | |
1344 | } | |
1345 | ||
1346 | void | |
8873c23d | 1347 | state::print (std::ostream &o) const |
07777235 SM |
1348 | { |
1349 | translator_output to(o); print(&to); | |
1350 | } | |
1351 | ||
8873c23d SM |
1352 | std::ostream& |
1353 | operator << (std::ostream &o, const state *s) | |
1354 | { | |
1355 | s->print(o); | |
1356 | return o; | |
1357 | } | |
1358 | ||
07777235 SM |
1359 | void |
1360 | dfa::print (translator_output *o) const | |
1361 | { | |
248f3856 | 1362 | o->newline(); |
07777235 SM |
1363 | for (state *s = first; s; s = s->next) |
1364 | { | |
1365 | s->print(o); | |
1366 | o->newline(); | |
1367 | } | |
248f3856 | 1368 | o->newline(); |
07777235 SM |
1369 | } |
1370 | ||
8873c23d SM |
1371 | void |
1372 | dfa::print (std::ostream& o) const | |
1373 | { | |
1374 | translator_output to(o); print(&to); | |
1375 | } | |
1376 | ||
07777235 SM |
1377 | std::ostream& |
1378 | operator << (std::ostream& o, const dfa& d) | |
1379 | { | |
1380 | d.print(o); | |
1381 | return o; | |
1382 | } | |
1383 | ||
1384 | std::ostream& | |
1385 | operator << (std::ostream &o, const dfa *d) | |
1386 | { | |
1387 | o << *d; | |
1388 | return o; | |
1389 | } | |
1390 | ||
1391 | }; | |
1392 | ||
1393 | /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */ |